src/Holumbus/Query/Processor.hs

{-# OPTIONS #-}

-- ----------------------------------------------------------------------------

{- |
  Module     : Holumbus.Query.Processor
  Copyright  : Copyright (C) 2007, 2008 Timo B. Huebel
  License    : MIT

  Maintainer : Timo B. Huebel (tbh@holumbus.org)
  Stability  : experimental
  Portability: portable

  The Holumbus query processor. Supports exact word or phrase queries as well
  as fuzzy word and case-insensitive word and phrase queries. Boolean
  operators like AND, OR and NOT are supported. Context specifiers and
  priorities are supported, too.

-}

-- ----------------------------------------------------------------------------

module Holumbus.Query.Processor 
  (
  -- * Processor types
  ProcessConfig (..)
  
  -- * Processing
  , processQuery
  , processPartial
  , processQueryM
  , processPartialM
  )
where

-- import Data.Maybe

import           Control.Monad
import           Control.Parallel.Strategies

import           Data.Binary                    ( Binary (..) )
import           Data.Function
import qualified Data.List                      as L

import           Holumbus.Index.Common          hiding (contexts)
import qualified Holumbus.Index.Common          as IDX

import           Holumbus.Query.Language.Grammar

import           Holumbus.Query.Fuzzy           ( FuzzyScore
                                                , FuzzyConfig
                                                )
import qualified Holumbus.Query.Fuzzy           as F

import           Holumbus.Query.Result          ( Result )

import           Holumbus.Query.Intermediate    ( Intermediate )
import qualified Holumbus.Query.Intermediate    as I

-- ----------------------------------------------------------------------------

-- | The configuration for the query processor.

data ProcessConfig
    = ProcessConfig 
      { fuzzyConfig   :: ! FuzzyConfig -- ^ The configuration for fuzzy queries.
      , optimizeQuery :: ! Bool        -- ^ Optimize the query before processing.
      , wordLimit     :: ! Int         -- ^ The maximum number of words used from a prefix. Zero switches off limiting.
      , docLimit      :: ! Int         -- ^ The maximum number of documents taken into account. Zero switches off limiting.
      }

instance Binary ProcessConfig where
  put (ProcessConfig fc o l d)
      = put fc >> put o >> put l >> put d
  get
      = liftM4 ProcessConfig get get get get

-- | The internal state of the query processor.
data ProcessState i
    = ProcessState 
      { config   :: ! ProcessConfig   -- ^ The configuration for the query processor.
      , contexts :: ! [Context]       -- ^ The current list of contexts.
      , index    :: ! i               -- ^ The index to search.
      , total    :: ! Int             -- ^ The number of documents in the index.
      }

-- ----------------------------------------------------------------------------

-- | Get the fuzzy config out of the process state.

getFuzzyConfig :: HolIndex i => ProcessState i -> FuzzyConfig
getFuzzyConfig = fuzzyConfig . config

-- | Monadic version of 'getFuzzyConfig'.

getFuzzyConfigM :: HolIndexM m i => ProcessState i -> m FuzzyConfig
getFuzzyConfigM s = return $ fuzzyConfig $ config s

-- | Set the current context in the state.

setContexts :: HolIndex i => [Context] -> ProcessState i -> ProcessState i
setContexts cs (ProcessState cfg _ i t) = ProcessState cfg cs i t

-- | Monadic version of 'setContexts'.

setContextsM :: HolIndexM m i => [Context] -> ProcessState i -> m (ProcessState i)
setContextsM cs (ProcessState cfg _ i t) = return $ ProcessState cfg cs i t

-- | Initialize the state of the processor.

initState :: HolIndex i => ProcessConfig -> i -> Int -> ProcessState i
initState cfg i t = ProcessState cfg (IDX.contexts i) i t

-- | Monadic version of 'initState'.

initStateM :: HolIndexM m i => ProcessConfig -> i -> Int -> m (ProcessState i)
initStateM cfg i t = IDX.contextsM i >>= \cs -> return $ ProcessState cfg cs i t

-- | Try to evaluate the query for all contexts in parallel.

forAllContexts :: (Context -> Intermediate) -> [Context] -> Intermediate
forAllContexts f cs = L.foldl' I.union I.emptyIntermediate $ parMap rdeepseq f cs

-- | Monadic version of 'forAllContexts'.

forAllContextsM :: Monad m => (Context -> m Intermediate) -> [Context] -> m Intermediate
forAllContextsM f cs = mapM f cs >>= \is -> return $ L.foldl' I.union I.emptyIntermediate is

-- | Just everything.

allDocuments :: HolIndex i => ProcessState i -> Intermediate
allDocuments s = forAllContexts (\c -> I.fromList "" c $ IDX.allWords (index s) c) (contexts s)

allDocumentsM :: HolIndexM m i => ProcessState i -> m Intermediate
allDocumentsM s = forAllContextsM (\c -> IDX.allWordsM (index s) c >>= \r -> return $ I.fromList "" c r) (contexts s)

-- | Process a query only partially in terms of a distributed index. Only the intermediate 
-- result will be returned.

processPartial :: (HolIndex i) => ProcessConfig -> i -> Int -> Query -> Intermediate
processPartial cfg i t q = process (initState cfg i t) oq
  where
  oq = if optimizeQuery cfg then optimize q else q

-- | Monadic version of 'processPartial'.

processPartialM :: (HolIndexM m i) => ProcessConfig -> i -> Int -> Query -> m Intermediate
processPartialM cfg i t q = initStateM cfg i t >>= (flip processM) oq
  where
  oq = if optimizeQuery cfg then optimize q else q

-- | Process a query on a specific index with regard to the configuration.

processQuery :: (HolIndex i, HolDocuments d c) => ProcessConfig -> i -> d c -> Query -> Result c
processQuery cfg i d q = I.toResult d (processPartial cfg i (sizeDocs d) q)

-- | Monadic version of 'processQuery'.

processQueryM :: (HolIndexM m i, HolDocuments d c) => ProcessConfig -> i -> d c -> Query -> m (Result c)
processQueryM cfg i d q = processPartialM cfg i (sizeDocs d) q >>= \ir -> return $ I.toResult d ir

-- | Continue processing a query by deciding what to do depending on the current query element.

process :: HolIndex i => ProcessState i -> Query -> Intermediate
process s (Word w)           = processWord s w
process s (Phrase w)         = processPhrase s w
process s (CaseWord w)       = processCaseWord s w
process s (CasePhrase w)     = processCasePhrase s w
process s (FuzzyWord w)      = processFuzzyWord s w
process s (Negation q)       = processNegation s (process s q)
process s (Specifier c q)    = process (setContexts c s) q
process s (BinQuery o q1 q2) = processBin o (process s q1) (process s q2)

-- | Monadic version of 'process'.

processM :: HolIndexM m i => ProcessState i -> Query -> m Intermediate
processM s (Word w)           = processWordM s w
processM _ (Phrase _)         = return I.emptyIntermediate -- processPhraseM s w
processM s (CaseWord w)       = processCaseWordM s w
processM _ (CasePhrase _)     = return I.emptyIntermediate -- processCasePhraseM s w
processM s (FuzzyWord w)      = processFuzzyWordM s w
processM s (Negation q)       = processM s q >>= processNegationM s
processM s (Specifier c q)    = setContextsM c s >>= \ns -> processM ns q
processM s (BinQuery o q1 q2) = do
  ir1 <- processM s q1
  ir2 <- processM s q2
  return $ processBin o ir1 ir2

-- | Process a single, case-insensitive word by finding all documents whreturn I.emptyIntermediate -- ich contain the word as prefix.

processWord :: HolIndex i => ProcessState i -> String -> Intermediate
processWord s q = forAllContexts wordNoCase (contexts s)
  where
  wordNoCase c = I.fromList q c $ limitWords s $ IDX.prefixNoCase (index s) c q

-- | Monadic version of 'processWord'.

processWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processWordM s q = forAllContextsM wordNoCase (contexts s)
  where
  wordNoCase c = IDX.prefixNoCaseM (index s) c q >>= limitWordsM s >>= \r -> return $ I.fromList q c r

-- | Process a single, case-sensitive word by finding all documents which contain the word as prefix.

processCaseWord :: HolIndex i => ProcessState i -> String -> Intermediate
processCaseWord s q = forAllContexts wordCase (contexts s)
  where
  wordCase c = I.fromList q c $ limitWords s $ IDX.prefixCase (index s) c q

-- | Monadic version of 'processCaseWord'.

processCaseWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processCaseWordM s q = forAllContextsM wordCase (contexts s)
  where
  wordCase c = IDX.prefixCaseM (index s) c q >>= limitWordsM s >>= \r -> return $ I.fromList q c r

-- | Process a phrase case-insensitive.

processPhrase :: HolIndex i => ProcessState i -> String -> Intermediate
processPhrase s q = forAllContexts phraseNoCase (contexts s)
  where
  phraseNoCase c = processPhraseInternal (IDX.lookupNoCase (index s) c) c q

-- processPhraseM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
-- processPhraseM s q = forAllContextsM phraseNoCase (contexts s)
--   where
--   phraseNoCase c = 

-- | Process a phrase case-sensitive.

processCasePhrase :: HolIndex i => ProcessState i -> String -> Intermediate
processCasePhrase s q = forAllContexts phraseCase (contexts s)
  where
  phraseCase c = processPhraseInternal (IDX.lookupCase (index s) c) c q

-- | Process a phrase query by searching for every word of the phrase and comparing their positions.

processPhraseInternal :: (String -> RawResult) -> Context -> String -> Intermediate
processPhraseInternal f c q = let
  w = words q 
  m = mergeOccurrencesList $ map snd $ f (head w) in
  if nullDocIdMap m
  then I.emptyIntermediate
  else I.fromList q c [(q, processPhrase' (tail w) 1 m)]
  where
  processPhrase' :: [String] -> Position -> Occurrences -> Occurrences
  processPhrase' [] _ o = o
  processPhrase' (x:xs) p o = processPhrase' xs (p+1) (filterWithKeyDocIdMap (nextWord $ map snd $ f x) o)
    where
      nextWord :: [Occurrences] -> DocId -> Positions -> Bool
      nextWord [] _ _  = False
      nextWord no d np = maybe False hasSuccessor (lookupDocIdMap d (mergeOccurrencesList no))
          where
            hasSuccessor :: Positions -> Bool
            hasSuccessor w = foldPos (\cp r -> r || (memberPos (cp + p) w)) False np

-- | Process a single word and try some fuzzy alternatives if nothing was found.

processFuzzyWord :: HolIndex i => ProcessState i -> String -> Intermediate
processFuzzyWord s oq = processFuzzyWord' (F.toList $ F.fuzz (getFuzzyConfig s) oq) (processWord s oq)
  where
  processFuzzyWord' :: [(String, FuzzyScore)] -> Intermediate -> Intermediate
  processFuzzyWord' []     r = r
  processFuzzyWord' (q:qs) r = if I.null r then processFuzzyWord' qs (processWord s (fst q)) else r

-- | Monadic version of 'processFuzzyWord'.

processFuzzyWordM :: HolIndexM m i => ProcessState i -> String -> m Intermediate
processFuzzyWordM s oq = do
  sr <- processWordM s oq 
  cfg <- getFuzzyConfigM s
  processFuzzyWordM' (F.toList $ F.fuzz cfg oq) sr
    where
    processFuzzyWordM' []     r = return r
    processFuzzyWordM' (q:qs) r = if I.null r
                                  then processWordM s (fst q) >>= processFuzzyWordM' qs
                                  else return r

-- | Process a negation by getting all documents and substracting the result of the negated query.

processNegation :: HolIndex i => ProcessState i -> Intermediate -> Intermediate
processNegation s r = I.difference (allDocuments s) r

-- | Monadic version of 'processNegation'.

processNegationM :: HolIndexM m i => ProcessState i -> Intermediate -> m Intermediate
processNegationM s r1 = allDocumentsM s >>= \r2 -> return $ I.difference r2 r1

-- | Process a binary operator by caculating the union or the intersection of the two subqueries.

processBin :: BinOp -> Intermediate -> Intermediate -> Intermediate
processBin And r1 r2 = I.intersection r1 r2
processBin Or r1 r2  = I.union r1 r2
processBin But r1 r2 = I.difference r1 r2


-- | Limit a 'RawResult' to a fixed amount of the best words.
--
-- First heuristic applied is limiting the number of documents in the result,
-- assuming the short words come first in the result list
-- So the length of the result list depends on the number of documents found.
--
-- TODO: This is fixed to 2000, should be part of the config part of the state
--
-- A 2. simple heuristic is used to 
-- determine the quality of a word: The total number of occurrences divided by the number of 
-- documents in which the word appears.
--
-- The second heuristic isn't that expensive any more when the resul list is cut of by the heuristic
--
-- The limit 500 should be part of a configuration

limitWords              :: ProcessState i -> RawResult -> RawResult
limitWords s r          = cutW . cutD $ r
  where
  limitD                = docLimit $ config s
  cutD
      | limitD > 0      = limitDocs limitD
      | otherwise       = id

  limitW                = wordLimit $ config s
  cutW
      | limitW > 0
        &&
        length r > limitW
                        = map snd . take limitW . L.sortBy (compare `on` fst) . map calcScore
      | otherwise       = id

  calcScore             :: (Word, Occurrences) -> (Double, (Word, Occurrences))
  calcScore w@(_, o)    = (log (fromIntegral (total s) / fromIntegral (sizeDocIdMap o)), w)

-- ----------------------------------------------------------------------------

-- | Limit the number of docs in a raw result

limitDocs               :: Int -> RawResult -> RawResult
limitDocs _     []      = []
limitDocs limit _
    | limit <= 0        = []
limitDocs limit (x:xs)  = x : limitDocs (limit - sizeDocIdMap (snd x)) xs

-- ----------------------------------------------------------------------------

-- | Monadic version of 'limitWords'.
limitWordsM             :: (Monad m) => ProcessState i -> RawResult -> m RawResult
limitWordsM s r         = return $ limitWords s r

-- | Merge occurrences
mergeOccurrencesList    :: [Occurrences] -> Occurrences
mergeOccurrencesList    = unionsWithDocIdMap unionPos

-- ----------------------------------------------------------------------------