module Hunt.Index.Schema.Analyze
( toDocAndWords
, toDocAndWords'
, normalize
, scanTextRE
)
where
import Data.DList (DList)
import qualified Data.DList as DL
import Data.Map (Map)
import qualified Data.Map as M
import Data.Maybe (fromJust, fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import Text.Regex.XMLSchema.String
import Hunt.Common.ApiDocument
import Hunt.Common.BasicTypes
import Hunt.Common.Document (Document (..),
DocumentWrapper (..))
import Hunt.Index.Schema
toDocAndWords :: DocumentWrapper e => Schema -> ApiDocument -> (e, Score, Words)
toDocAndWords s
= ( \ (d, dw, ws) -> (wrap d, dw, ws) )
. toDocAndWords' s
toDocAndWords' :: Schema -> ApiDocument -> (Document, Score, Words)
toDocAndWords' schema apiDoc
= (doc, weight, ws)
where
indexMap = adIndex apiDoc
descrMap = adDescr apiDoc
weight = adWght apiDoc
doc = Document
{ uri = adUri apiDoc
, desc = descrMap
, wght = toDefScore weight
}
ws = M.mapWithKey
( \ context content ->
let (ContextSchema rex normalizers _ _ cType)
= fromJust $ M.lookup context schema
(CType _ defRex validator _)
= cType
scan
= filter (validate validator) . scanTextRE (fromMaybe defRex rex)
in
toWordList scan (normalize' normalizers) content
)
indexMap
toWordList :: (Text -> [Word]) -> (Word -> Word) -> Text -> WordList
toWordList scan norm
= M.map DL.toList
. foldr insert M.empty
. zip [1..]
. map norm
. scan
where
insert :: (Position, Word) -> Map Word (DList Position) -> Map Word (DList Position)
insert (p, w)
= M.alter (return . maybe (DL.singleton p) (`DL.snoc` p)) w
scanTextRE :: RegEx -> Text -> [Word]
scanTextRE wRex
= map T.pack
. tokenize (T.unpack wRex)
. T.unpack