module Holumbus.Query.Processor
(
ProcessConfig (..)
, processQuery
, processPartial
, processQueryM
, processPartialM
)
where
import Control.Monad
import Control.Parallel.Strategies
import Data.Binary ( Binary (..) )
import Data.Function
import qualified Data.List as L
import Holumbus.Index.Common hiding (contexts)
import qualified Holumbus.Index.Common as IDX
import Holumbus.Query.Language.Grammar
import Holumbus.Query.Fuzzy ( FuzzyScore
, FuzzyConfig
)
import qualified Holumbus.Query.Fuzzy as F
import Holumbus.Query.Result ( Result )
import Holumbus.Query.Intermediate ( Intermediate )
import qualified Holumbus.Query.Intermediate as I
data ProcessConfig
= ProcessConfig
{ fuzzyConfig :: ! FuzzyConfig
, optimizeQuery :: ! Bool
, wordLimit :: ! Int
, docLimit :: ! Int
}
instance Binary ProcessConfig where
put (ProcessConfig fc o l d)
= put fc >> put o >> put l >> put d
get
= liftM4 ProcessConfig get get get get
data ProcessState i
= ProcessState
{ config :: ! ProcessConfig
, contexts :: ! [Context]
, index :: ! i
, total :: ! Int
}
getFuzzyConfig :: HolIndex i => ProcessState i -> FuzzyConfig
getFuzzyConfig = fuzzyConfig . config
getFuzzyConfigM :: HolIndexM m i => ProcessState i -> m FuzzyConfig
getFuzzyConfigM s = return $ fuzzyConfig $ config s
setContexts :: HolIndex i => [Context] -> ProcessState i -> ProcessState i
setContexts cs (ProcessState cfg _ i t) = ProcessState cfg cs i t
setContextsM :: HolIndexM m i => [Context] -> ProcessState i -> m (ProcessState i)
setContextsM cs (ProcessState cfg _ i t) = return $ ProcessState cfg cs i t
initState :: HolIndex i => ProcessConfig -> i -> Int -> ProcessState i
initState cfg i t = ProcessState cfg (IDX.contexts i) i t
initStateM :: HolIndexM m i => ProcessConfig -> i -> Int -> m (ProcessState i)
initStateM cfg i t = IDX.contextsM i >>= \cs -> return $ ProcessState cfg cs i t
forAllContexts :: (Context -> Intermediate) -> [Context] -> Intermediate
forAllContexts f cs = L.foldl' I.union I.emptyIntermediate $ parMap rdeepseq f cs
forAllContextsM :: Monad m => (Context -> m Intermediate) -> [Context] -> m Intermediate
forAllContextsM f cs = mapM f cs >>= \is -> return $ L.foldl' I.union I.emptyIntermediate is
allDocuments :: HolIndex i => ProcessState i -> Intermediate
allDocuments s = forAllContexts (\c -> I.fromList "" c $ IDX.allWords (index s) c) (contexts s)
allDocumentsM :: HolIndexM m i => ProcessState i -> m Intermediate
allDocumentsM s = forAllContextsM (\c -> IDX.allWordsM (index s) c >>= \r -> return $ I.fromList "" c r) (contexts s)
processPartial :: (HolIndex i) => ProcessConfig -> i -> Int -> Query -> Intermediate
processPartial cfg i t q = process (initState cfg i t) oq
where
oq = if optimizeQuery cfg then optimize q else q
processPartialM :: (HolIndexM m i) => ProcessConfig -> i -> Int -> Query -> m Intermediate
processPartialM cfg i t q = initStateM cfg i t >>= (flip processM) oq
where
oq = if optimizeQuery cfg then optimize q else q
processQuery :: (HolIndex i, HolDocuments d c) => ProcessConfig -> i -> d c -> Query -> Result c
processQuery cfg i d q = I.toResult d (processPartial cfg i (sizeDocs d) q)
processQueryM :: (HolIndexM m i, HolDocuments d c) => ProcessConfig -> i -> d c -> Query -> m (Result c)
processQueryM cfg i d q = processPartialM cfg i (sizeDocs d) q >>= \ir -> return $ I.toResult d ir
process :: HolIndex i => ProcessState i -> Query -> Intermediate
process s (Word w) = processWord s w
process s (Phrase w) = processPhrase s w
process s (CaseWord w) = processCaseWord s w
process s (CasePhrase w) = processCasePhrase s w
process s (FuzzyWord w) = processFuzzyWord s w
process s (Negation q) = processNegation s (process s q)
process s (Specifier c q) = process (setContexts c s) q
process s (BinQuery o q1 q2) = processBin o (process s q1) (process s q2)
processM :: HolIndexM m i => ProcessState i -> Query -> m Intermediate
processM s (Word w) = processWordM s w
processM _ (Phrase _) = return I.emptyIntermediate
processM s (CaseWord w) = processCaseWordM s w
processM _ (CasePhrase _) = return I.emptyIntermediate
processM s (FuzzyWord w) = processFuzzyWordM s w
processM s (Negation q) = processM s q >>= processNegationM s
processM s (Specifier c q) = setContextsM c s >>= \ns -> processM ns q
processM s (BinQuery o q1 q2) = do
ir1 <- processM s q1
ir2 <- processM s q2
return $ processBin o ir1 ir2
processWord :: HolIndex i => ProcessState i -> String -> Intermediate
processWord s q = forAllContexts wordNoCase (contexts s)
where
wordNoCase c = I.fromList q c $ limitWords s $ IDX.prefixNoCase (index s) c q
processWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processWordM s q = forAllContextsM wordNoCase (contexts s)
where
wordNoCase c = IDX.prefixNoCaseM (index s) c q >>= limitWordsM s >>= \r -> return $ I.fromList q c r
processCaseWord :: HolIndex i => ProcessState i -> String -> Intermediate
processCaseWord s q = forAllContexts wordCase (contexts s)
where
wordCase c = I.fromList q c $ limitWords s $ IDX.prefixCase (index s) c q
processCaseWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processCaseWordM s q = forAllContextsM wordCase (contexts s)
where
wordCase c = IDX.prefixCaseM (index s) c q >>= limitWordsM s >>= \r -> return $ I.fromList q c r
processPhrase :: HolIndex i => ProcessState i -> String -> Intermediate
processPhrase s q = forAllContexts phraseNoCase (contexts s)
where
phraseNoCase c = processPhraseInternal (IDX.lookupNoCase (index s) c) c q
processCasePhrase :: HolIndex i => ProcessState i -> String -> Intermediate
processCasePhrase s q = forAllContexts phraseCase (contexts s)
where
phraseCase c = processPhraseInternal (IDX.lookupCase (index s) c) c q
processPhraseInternal :: (String -> RawResult) -> Context -> String -> Intermediate
processPhraseInternal f c q = let
w = words q
m = mergeOccurrencesList $ map snd $ f (head w) in
if nullDocIdMap m
then I.emptyIntermediate
else I.fromList q c [(q, processPhrase' (tail w) 1 m)]
where
processPhrase' :: [String] -> Position -> Occurrences -> Occurrences
processPhrase' [] _ o = o
processPhrase' (x:xs) p o = processPhrase' xs (p+1) (filterWithKeyDocIdMap (nextWord $ map snd $ f x) o)
where
nextWord :: [Occurrences] -> DocId -> Positions -> Bool
nextWord [] _ _ = False
nextWord no d np = maybe False hasSuccessor (lookupDocIdMap d (mergeOccurrencesList no))
where
hasSuccessor :: Positions -> Bool
hasSuccessor w = foldPos (\cp r -> r || (memberPos (cp + p) w)) False np
processFuzzyWord :: HolIndex i => ProcessState i -> String -> Intermediate
processFuzzyWord s oq = processFuzzyWord' (F.toList $ F.fuzz (getFuzzyConfig s) oq) (processWord s oq)
where
processFuzzyWord' :: [(String, FuzzyScore)] -> Intermediate -> Intermediate
processFuzzyWord' [] r = r
processFuzzyWord' (q:qs) r = if I.null r then processFuzzyWord' qs (processWord s (fst q)) else r
processFuzzyWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processFuzzyWordM s oq = do
sr <- processWordM s oq
cfg <- getFuzzyConfigM s
processFuzzyWordM' (F.toList $ F.fuzz cfg oq) sr
where
processFuzzyWordM' [] r = return r
processFuzzyWordM' (q:qs) r = if I.null r
then processWordM s (fst q) >>= processFuzzyWordM' qs
else return r
processNegation :: HolIndex i => ProcessState i -> Intermediate -> Intermediate
processNegation s r = I.difference (allDocuments s) r
processNegationM :: HolIndexM m i => ProcessState i -> Intermediate -> m Intermediate
processNegationM s r1 = allDocumentsM s >>= \r2 -> return $ I.difference r2 r1
processBin :: BinOp -> Intermediate -> Intermediate -> Intermediate
processBin And r1 r2 = I.intersection r1 r2
processBin Or r1 r2 = I.union r1 r2
processBin But r1 r2 = I.difference r1 r2
limitWords :: ProcessState i -> RawResult -> RawResult
limitWords s r = cutW . cutD $ r
where
limitD = docLimit $ config s
cutD
| limitD > 0 = limitDocs limitD
| otherwise = id
limitW = wordLimit $ config s
cutW
| limitW > 0
&&
length r > limitW
= map snd . take limitW . L.sortBy (compare `on` fst) . map calcScore
| otherwise = id
calcScore :: (Word, Occurrences) -> (Double, (Word, Occurrences))
calcScore w@(_, o) = (log (fromIntegral (total s) / fromIntegral (sizeDocIdMap o)), w)
limitDocs :: Int -> RawResult -> RawResult
limitDocs _ [] = []
limitDocs limit _
| limit <= 0 = []
limitDocs limit (x:xs) = x : limitDocs (limit sizeDocIdMap (snd x)) xs
limitWordsM :: (Monad m) => ProcessState i -> RawResult -> m RawResult
limitWordsM s r = return $ limitWords s r
mergeOccurrencesList :: [Occurrences] -> Occurrences
mergeOccurrencesList = unionsWithDocIdMap unionPos