{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-}
module Data.SearchEngine.DocTermIds (
    DocTermIds,
    TermId,
    fieldLength,
    fieldTermCount,
    fieldElems,
    create,
    denseTable,
    vecIndexIx,
    vecCreateIx,
  ) where

import Data.SearchEngine.TermBag (TermBag, TermId)
import qualified Data.SearchEngine.TermBag as TermBag

import Data.Vector (Vector, (!))
import qualified Data.Vector as Vec
import qualified Data.Vector.Unboxed as UVec
import Data.Ix (Ix)
import qualified Data.Ix as Ix


-- | The 'TermId's for the 'Term's that occur in a document. Documents may have
-- multiple fields and the 'DocTermIds' type holds them separately for each
-- field: the underlying vector stores one 'TermBag' per field.
--
-- The @field@ type parameter is phantom; the accessors in this module use its
-- 'Ix' and 'Bounded' instances to map a field value to a vector position.
--
newtype DocTermIds field = DocTermIds (Vector TermBag)
  deriving (Show)

-- | Look up the 'TermBag' stored for one field of the document.
--
-- The field value is translated to a vector position by 'vecIndexIx', i.e. by
-- its 'Ix' index within @(minBound, maxBound)@.
--
getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag
getField (DocTermIds fieldVec) = vecIndexIx fieldVec

-- | Build a 'DocTermIds' from a function giving the term ids of each field.
--
-- The function is applied to every value of the @field@ type (the whole
-- @(minBound, maxBound)@ range, via 'vecCreateIx') and each resulting list is
-- converted into a 'TermBag' with 'TermBag.fromList'.
--
create :: (Ix field, Bounded field) =>
          (field -> [TermId]) -> DocTermIds field
create docTermIds =
    DocTermIds (vecCreateIx (TermBag.fromList . docTermIds))

-- | The number of terms in a field within the document, as reported by
-- 'TermBag.size' for that field's 'TermBag'.
--
fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int
fieldLength docterms field =
    TermBag.size (getField docterms field)

-- | /O(log n)/ The frequency of a particular term in a field within the document.
--
-- The count comes from 'TermBag.termCount' and is widened to 'Int' with
-- 'fromIntegral'.
--
fieldTermCount :: (Ix field, Bounded field) =>
                  DocTermIds field -> field -> TermId -> Int
fieldTermCount docterms field termid =
    fromIntegral (TermBag.termCount (getField docterms field) termid)

-- | The 'TermId's held in the 'TermBag' of a field within the document, as
-- returned by 'TermBag.elems'.
--
fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId]
fieldElems docterms field =
    TermBag.elems (getField docterms field)

-- | The 'DocTermIds' is really a sparse 2d array, and doing lookups with
-- 'fieldTermCount' has a O(log n) cost. This function converts to a dense
-- tabular representation which then enables linear scans.
--
-- The result is a triple of:
--
-- * the number of entries in the term id table;
--
-- * a lookup from a table position to the 'TermId' at that position;
--
-- * a lookup from a table position and a field to the count stored for that
--   term in that field.
--
-- Neither lookup function checks its position argument beyond the bounds
-- check done by 'UVec.!'.
--
denseTable :: (Ix field, Bounded field) => DocTermIds field ->
              (Int, Int -> TermId, Int -> field -> Int)
denseTable (DocTermIds fieldVec) =
    let (!termids, !termcounts) = TermBag.denseTable (Vec.toList fieldVec)
        !numTerms = UVec.length termids
     in ( numTerms
        , \i    -> termids UVec.! i
          -- The counts are read as a flat row-major table: one row of
          -- 'numTerms' entries per field, rows ordered by the field's Ix index.
        , \i ix -> let j = Ix.index (minBound, maxBound) ix
                    in fromIntegral (termcounts UVec.! (j * numTerms + i))
        )

---------------------------------
-- Vector indexed by Ix Bounded
--

-- | Index a vector by a value of a bounded 'Ix' type.
--
-- The position is the value's 'Ix.index' within @(minBound, maxBound)@. The
-- lookup uses the bounds-checked '!', so the vector must have an element at
-- that position; vectors built with 'vecCreateIx' for the same index type do.
--
vecIndexIx  :: (Ix ix, Bounded ix) => Vector a -> ix -> a
vecIndexIx vec ix = vec ! Ix.index (minBound, maxBound) ix

-- | Create a vector with one element for every value of a bounded 'Ix' type.
--
-- Elements appear in 'Ix.range' order over @(minBound, maxBound)@, so the
-- element for index value @ix@ can be read back with 'vecIndexIx'. Each
-- element is forced to weak head normal form as the vector is built.
--
vecCreateIx :: (Ix ix, Bounded ix) => (ix -> a) -> Vector a
vecCreateIx f = Vec.fromListN (Ix.rangeSize bounds)
                  [ y | ix <- Ix.range bounds, let !y = f ix ]
  where
    -- The full range of the index type.
    bounds = (minBound, maxBound)