src/Hunt/Query/Processor.hs

{-# LANGUAGE ConstraintKinds            #-}
{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE FlexibleInstances          #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses      #-}
{-# LANGUAGE OverloadedStrings          #-}
{-# LANGUAGE RankNTypes                 #-}
{-# LANGUAGE TypeFamilies               #-}

-- ----------------------------------------------------------------------------
{- |
  The query processor to perform 'Query's.

  'processQuery' executes the query and generates the unranked result.
  The result can be ranked with the default 'Hunt.Query.Ranking.rank' function.
-}
-- ----------------------------------------------------------------------------

module Hunt.Query.Processor
    ( processQueryScoredDocs
    , processQueryUnScoredDocs
    , processQueryScoredWords
    , initProcessor

    , ProcessConfig (..)
    , ProcessEnv
    )

where

import           Control.Applicative
import           Control.Monad.Error
import           Control.Monad.Reader

import           Data.Binary                 (Binary)
import qualified Data.Binary                 as Bin
import           Data.Default
import qualified Data.List                   as L
import qualified Data.Map                    as M
import           Data.Maybe
import           Data.Text                   (Text)
import qualified Data.Text                   as T

import           Hunt.Common
import           Hunt.ContextIndex           (ContextMap)
import qualified Hunt.ContextIndex           as CIx
import           Hunt.Interpreter.Command    (CmdError (..))
import           Hunt.Query.Fuzzy            (FuzzyConfig)
import           Hunt.Query.Intermediate
import           Hunt.Query.Language.Grammar
import           Hunt.Utility                (showText)

import qualified System.Log.Logger           as Log

-- import           Debug.Trace

-- ------------------------------------------------------------
-- Logging
-- ------------------------------------------------------------

-- | Name of the module for logging purposes.
modName :: String
modName = "Hunt.Query.Processor"

-- | Log a message at 'DEBUG' priority.
debugM :: String -> IO ()
debugM = Log.debugM modName

-- ------------------------------------------------------------
-- Configuration and state for the query processor
-- ------------------------------------------------------------

-- | Query processor configuration.
data ProcessConfig
    = ProcessConfig
      { fuzzyConfig   :: ! FuzzyConfig -- ^ The configuration for fuzzy queries.
      , optimizeQuery :: ! Bool        -- ^ Optimize the query before processing (default: @False@).
      , wordLimit     :: ! Int         -- ^ The maximum number of words used from a prefix. @0@ = no limit (default: @100@).
      , docLimit      :: ! Int         -- ^ The maximum number of documents taken into account. @0@ = no limit (default: @500@).
      }

-- ------------------------------------------------------------

instance Default ProcessConfig where
    def = ProcessConfig def False 100 500

-- ------------------------------------------------------------

instance Binary ProcessConfig where
    put (ProcessConfig fc o l d)
        = Bin.put fc >> Bin.put o >> Bin.put l >> Bin.put d
    get
        = ProcessConfig <$> Bin.get <*> Bin.get <*> Bin.get <*> Bin.get

-- ------------------------------------------------------------

-- | The internal state of the query processor.
data ProcessEnv
    = ProcessEnv
      { psConfig   :: ! ProcessConfig  -- ^ The configuration for the query processor.
      , psContexts :: ! [Context]      -- ^ The current list of contexts.
      , psIndex    ::   ContextMap     -- ^ The index to search.
      }

-- ------------------------------------------------------------
-- Processor monad
-- ------------------------------------------------------------

type QueryIndex
    = ContextMap

-- | the processor monad
newtype ProcessorT m a
    = PT { runProcessor :: ReaderT ProcessEnv (ErrorT CmdError m) a
         }
      deriving (Applicative, Monad, MonadIO, Functor, MonadReader ProcessEnv, MonadError CmdError)

instance MonadTrans ProcessorT where
    lift = PT . lift . lift

type Processor
    = ProcessorT IO

-- ------------------------------------------------------------
-- Helper
-- ------------------------------------------------------------

queryError ::   Int -> Text -> Processor a
queryError n msg
    = throwError $ ResError n msg

getContexts ::   Processor [Context]
getContexts
    = asks psContexts

getIx :: Processor QueryIndex
getIx
    = asks psIndex

getSchema :: Processor Schema
getSchema
    = getIx >>= return . (M.map fst) . CIx.cxMap

-- | Get the schema associated with that context/index.
--
--   /Note/: This fails if the schema does not exist.

getContextSchema ::   Context -> Processor ContextSchema
getContextSchema c
    = do schema <- getSchema
         case M.lookup c schema of
           Just cs -> return cs
           _       -> queryError 420 ("Context does not exist in schema: " <> c)


-- | Normalizes the search value with respect to the schema context type.
--   First runs the validator that throws an error for invalid values,
--   then runs the normalizers associated with the context.

normQueryCx :: Context -> Text -> Processor (Maybe Text)
normQueryCx c t
    = do s <- getContextSchema c
         -- apply context type validator
         if (validate . ctValidate . cxType $ s) t
           then
               do liftIO . debugM . debugMsg $ s
                  -- apply context schema normalizer
                  return . Just . norm $ s
           else
               return Nothing
    where
      norm s
          = normalize' (cxNormalizer s) t

      debugMsg s
          = T.unpack $ T.concat [ "query normalizer: ", c, ": [", t, "=>", norm s, "]"]

-- | Initialize the state of the processor.
initProcessor :: ProcessConfig -> QueryIndex -> ProcessEnv
initProcessor cfg ix
    = ProcessEnv cfg cxs ix
    where
      s = CIx.mapToSchema ix
      cxs = filter (\c -> fromMaybe False $ M.lookup c s >>= return . cxDefault)
            $ CIx.contexts ix

-- ------------------------------------------------------------

processQueryUnScoredDocs :: ProcessEnv -> Query -> IO (Either CmdError UnScoredDocs)
processQueryUnScoredDocs = processQueryScoredResult evalUnScoredDocs

-- | evaluate a query into a UnScoredDocs result
--
-- all info about contexts, words and positions and the score of docs
-- are removed by the aggregation.
--
-- This evaluator is called by commands which need to compute just a set of documents,
-- e.g. DeleteByQuery. When calling evalUnScoredDocs the 'ProcessEnv' value
-- should be configured such that the limit for document to taken into account
-- is set to infinity (represented as @0@ in the config), else the result set
-- may not be complete.
--
-- In that case it becomes easy to build a query witch acts as a denial of service
-- attack, because the intermediate results become too large to be processed.
-- So these queries must be used in applications rather carefully, e.g. a user should
-- not be able to construct these types of queries by filling in some input fields in a web interface.

evalUnScoredDocs :: Query -> Processor UnScoredDocs
evalUnScoredDocs q
    | isPrimaryQuery q
        = do res <- forallCx (evalPrimary q)
             aggregateToScoredResult res

evalUnScoredDocs (QRange lb ub)
    = do res <- forallCx (evalRange lb ub)
         aggregateToScoredResult res

evalUnScoredDocs (QSeq op qs)
    | isLocalCxOp op
        = do res <- forallCxLocal
                    ( evalSeq' op                     -- do the position aware operation
                      <$> mapM evalScoredRawDocs qs ) -- switch to the raw evaluator due to positions
             aggregateToScoredResult res              -- and aggregate res to UnScoredDocs
    | otherwise
        = evalSeq op                                  -- do the result combination
          <$> mapM evalUnScoredDocs qs                  -- for the args stay in evaluator

evalUnScoredDocs (QContext cxs q)
    = withCxs cxs $ evalUnScoredDocs q

evalUnScoredDocs (QBoost _w q)
    = evalUnScoredDocs q

evalUnScoredDocs q@QPhrase{}              -- QPhrase is transformed into QFullWord or QSeq Phrase
    = normQuery q >>= evalUnScoredDocs

evalUnScoredDocs q@QBinary{}              -- QBin is transformed into QSeq
    = normQuery q >>= evalUnScoredDocs

evalUnScoredDocs q
    = queryEvalError q

-- ------------------------------------------------------------

processQueryScoredDocs :: ProcessEnv -> Query -> IO (Either CmdError ScoredDocs)
processQueryScoredDocs = processQueryScoredResult evalScoredDocs'

-- | evaluate a query into a ScoredDocs result
--
-- all info about contexts, words and positions is removed by the aggregation,
-- just a set of DocIds and associated scores is computed

evalScoredDocs :: Query -> Processor ScoredDocs
evalScoredDocs q
    | isPrimaryQuery q
        = do res <- forallCx (evalPrimary q)
             aggregateToScoredResult res

evalScoredDocs (QRange lb ub)
    = do res <- forallCx (evalRange lb ub)
         aggregateToScoredResult res

evalScoredDocs (QSeq op qs)
    | isLocalCxOp op
        = do res <- forallCxLocal
                    ( evalSeq' op                     -- do the position aware operation
                      <$> mapM evalScoredRawDocs' qs ) -- switch to the raw evaluator due to positions
             aggregateToScoredResult res                -- and aggregate res to ScoredDocs
    | otherwise
        = evalSeq op                                  -- do the result combination
          <$> mapM evalScoredDocs' qs                  -- for the args stay in evaluator

evalScoredDocs (QContext cxs q)
    = withCxs cxs $ evalScoredDocs' q

evalScoredDocs (QBoost w q)
    = boost w <$> evalScoredDocs' q

evalScoredDocs q@QPhrase{}              -- QPhrase is transformed into QFullWord or QSeq Phrase
    = normQuery q >>= evalScoredDocs

evalScoredDocs q@QBinary{}              -- QBin is transformed into QSeq
    = normQuery q >>= evalScoredDocs

evalScoredDocs q
    = queryEvalError q

-- --------------------
-- {- switch off trace

evalScoredDocs' :: Query -> Processor ScoredDocs
evalScoredDocs' = evalScoredDocs
{-# INLINE evalScoredDocs' #-}

-- -}
{- switch on trace the evaluation

evalScoredDocs' :: Query -> Processor ScoredDocs
evalScoredDocs' q = trc <$> evalScoredDocs q
    where
      trc res = traceShow ("evalScoredDocs: "::String,q  ) $
                traceShow ("evalScoredDocs: "::String,res) $ res
-- -}
-- ------------------------------------------------------------

processQueryScoredWords :: ProcessEnv -> Query -> IO (Either CmdError ScoredWords)
processQueryScoredWords = processQueryScoredResult evalScoredWords'

-- | evaluate a query into a ScoredWords result
--
-- all info about contexts, words and positions is removed by the aggregation,
-- just a set of words and associated scores is computed.
--
-- the words found are suggestions for the last primitive prefix in the query

evalScoredWords :: Query -> Processor ScoredWords
evalScoredWords q
    | isPrimaryQuery q
        = do res <- forallCx (evalPrimary q)
             aggregateToScoredResult res

evalScoredWords (QRange lb ub)
    = do res <- forallCx (evalRange lb ub)
         aggregateToScoredResult res

evalScoredWords (QSeq Or qs)                           -- for completions just search
    = evalScoredWords' (last qs)                       -- for rightmost subquery

evalScoredWords (QSeq AndNot qs)                       -- for completions just search
    = evalScoredWords' (last qs)                       -- for rightmost subquery

evalScoredWords (QSeq And qs)                          -- for completions
    = do docs <- evalUnScoredDocs  (mkQ $ init qs)     -- eval the set of docs for all but the last q
         res  <- evalScoredRawDocs (      last qs)     -- eval the last q as ScoredRawDos
         aggregateToScoredResult                       -- aggregate to ScoredWords
           $ fmap (filterByDocSet docs) res            -- restrict result to docs found
      where
        mkQ [q'] = q'
        mkQ qs'  = QSeq And qs'

evalScoredWords (QSeq op qs)
    | isLocalCxOp op
        = do res <- forallCxLocal
                    ( evalSeq' op                      -- do the position aware operation
                      <$> mapM evalScoredRawDocs' qs ) -- switch to the raw evaluator due to positions
             aggregateToScoredResult res               -- and aggregate res to ScoredWords

evalScoredWords (QContext cxs q)
    = withCxs cxs $ evalScoredWords' q

evalScoredWords (QBoost w q)
    = boost w <$> evalScoredWords' q

evalScoredWords q@QPhrase{}              -- QPhrase is transformed into QFullWord or QSeq Phrase
    = normPhraseQuery q >>= evalScoredWords

evalScoredWords q@QBinary{}              -- QBin is transformed into QSeq
    = normQuery q >>= evalScoredWords

evalScoredWords q
    = queryEvalError q

-- --------------------
-- {- switch off trace

evalScoredWords' :: Query -> Processor ScoredWords
evalScoredWords' = evalScoredWords
{-# INLINE evalScoredWords' #-}

-- -}
{- switch on trace of evaluation

evalScoredWords' :: Query -> Processor ScoredWords
evalScoredWords' q = trc <$> evalScoredWords q
    where
      trc res = traceShow ("evalScoredWords: "::String,q  ) $
                traceShow ("evalScoredWords: "::String,res) $ res
-- -}
-- ------------------------------------------------------------

-- | evaluate a query into a context aware raw docs result
--
-- This evaluator is called from 'evalScoredDocs', 'evalUnScoredDocs' and 'evalScoredWords'
-- in the case of Phrase-, Follow- and Near-queries.
--
-- No info about contexts, words and positions is removed by the aggregation.
-- 'evalScoredRawDocs' always runs in a single context at a time,
-- because the position information does not have any meaning across contexts
--
-- Therefore QContext subqueries become meaningless within Phrase-, Follow- and Near-queries

evalScoredRawDocs :: Query -> Processor (ScoredCx ScoredRawDocs)
evalScoredRawDocs q
    | isPrimaryQuery q
        = forallCx (evalPrimary q)

evalScoredRawDocs (QRange lb ub)
    = forallCx (evalRange lb ub)

evalScoredRawDocs (QSeq op qs)
    = evalSeq' op
      <$> mapM evalScoredRawDocs' qs

evalScoredRawDocs (QContext cxs q)
    = restrictCxs cxs $ evalScoredRawDocs' q

evalScoredRawDocs (QBoost w q)
    = boost w <$> evalScoredRawDocs' q

evalScoredRawDocs q@QPhrase{}              -- QPhrase is transformed into QFullWord or QSeq Phrase
    = normQuery q >>= evalScoredRawDocs

evalScoredRawDocs q@QBinary{}              -- QBin is transformed into QSeq
    = normQuery q >>= evalScoredRawDocs

evalScoredRawDocs q
    = queryEvalError q

-- --------------------
-- {- switch off trace

evalScoredRawDocs' :: Query -> Processor (ScoredCx ScoredRawDocs)
evalScoredRawDocs' = evalScoredRawDocs
{-# INLINE evalScoredRawDocs' #-}

-- -}
{- switch on trace of evaluation

evalScoredRawDocs' :: Query -> Processor (ScoredCx ScoredRawDocs)
evalScoredRawDocs' q = trc <$> evalScoredRawDocs q
    where
      trc res = traceShow ("evalScoredRawDocs: "::String,q  ) $
                traceShow ("evalScoredRawDocs: "::String,res) $ res
-- -}
-- ------------------------------------------------------------

-- run a query evaluation

processQueryScoredResult :: (q -> Processor r) -> ProcessEnv -> q -> IO (Either CmdError r)
processQueryScoredResult eval st q
    = runErrorT . runReaderT (runProcessor $ eval q) $ st


-- ------------------------------------------------------------
--
-- query normalization: transform "old" queries into generalized new form

normQuery :: Query -> Processor Query
normQuery (QPhrase op w)
    = return . QSeq Phrase . L.map (QFullWord op) $ T.words w

normQuery q@(QBinary op _q1 _q2)
    = return . QSeq op . collect op q $ []
      where
        collect
            | isAssocOp     op = collectAssoc
            | isLeftAssocOp op = collectLeftAssoc
            | otherwise        = \ _op q' -> (q':)

normQuery q
    = return q


normPhraseQuery :: Query -> Processor Query
normPhraseQuery (QPhrase op w)
    = return . QSeq Phrase $ normPhrase (T.words w) []
    where
      normPhrase (x:[]) r = r ++ [QWord op x]
      normPhrase (x:xs) r = normPhrase xs $
                            r ++ [QFullWord op x]
      normPhrase []     r = r
normPhraseQuery q
    = return q



collectAssoc :: BinOp -> Query -> [Query] -> [Query]
collectAssoc op (QBinary op' q1 q2)
    | op == op'
        = collectAssoc op q1 . collectAssoc op q2
collectAssoc _ q
    = (q :)

collectLeftAssoc :: BinOp -> Query -> [Query] -> [Query]
collectLeftAssoc op (QBinary op' q1 q2)
    | op == op'
        = collectAssoc op q1 . (q2 :)
collectLeftAssoc _ q
    = (q :)

isAssocOp :: BinOp -> Bool
isAssocOp AndNot = False
isAssocOp _      = True

isLeftAssocOp :: BinOp -> Bool          -- currently AndNot is left assoc
isLeftAssocOp = not . isAssocOp         -- all others are assoc ops

isLocalCxOp :: BinOp -> Bool
isLocalCxOp Phrase   = True
isLocalCxOp Follow{} = True
isLocalCxOp Near{}   = True
isLocalCxOp _        = False

-- ------------------------------------------------------------

-- eval query combinators

evalSeq :: (ScoredResult r) => BinOp -> [r] -> r
evalSeq Or     = evalOr
evalSeq And    = evalAnd
evalSeq AndNot = evalAndNot
evalSeq _op    = const mempty

evalSeq' :: BinOp -> [ScoredCx ScoredRawDocs] -> ScoredCx ScoredRawDocs
evalSeq' Phrase     = evalSequence id
evalSeq' (Follow d) = evalFollow   id d
evalSeq' (Near   d) = evalNear     id d
evalSeq' op         = evalSeq op

-- | restrict the current context for a query
--
-- used in evaluating phrase-, follow- and near- queries
-- there it's meaningless to use other than the current contexts

restrictCxs :: [Context] -> Processor r -> Processor r
restrictCxs cxs p
    = do cxs0 <- getContexts
         withCxs cxs $                                           -- check for legal contexts
                 local (setCx $ cxs0 `L.intersect` L.nub cxs) p  -- restrict contexts to cxs
    where
      setCx cxs' cfg
          = cfg {psContexts = cxs'}

-- | set the context for a part of a query evaluation

withCxs :: [Context] -> Processor r -> Processor r
withCxs cxs p
    = do icxs <- invalidContexts <$> getSchema
         if L.null icxs
            then local setCx $ p
            else queryError 404
                 $ "mentioned context(s) do not exist: "
                   <> showText icxs
    where
      invalidContexts sc
          = filter (\ c -> not (c `M.member` sc)) cxs
      setCx cfg
          = cfg {psContexts = L.nub cxs}

-- | execute a command (query) for all contexts and give the current context
-- as extra argument

forallCx :: (ScoredResult r) => (Context -> Processor r) -> Processor r
forallCx action
    = do cxs <- getContexts
         res <- mapM action cxs
         return $ mconcat res

-- | execute a command for all contexts, but restrict the set of contexts
-- to the current context before executing the command
--
-- So the loop over all contexts (forallCx) called in the evaluation of primary
-- queries is switched off

forallCxLocal :: (ScoredResult r) => Processor r -> Processor r
forallCxLocal action
    = do cxs <- getContexts
         res <- mapM f' cxs
         return $ mconcat res
    where
      f' cx
          = withCxs [cx] action

isPrimaryQuery :: Query -> Bool
isPrimaryQuery QWord{}     = True
isPrimaryQuery QFullWord{} = True
isPrimaryQuery _           = False

queryEvalError :: Query -> Processor r
queryEvalError q
    = queryError 501 $ "Hunt.Query.Processor: query can't be evaluated " <> showText q

-- ------------------------------------------------------------

aggregateToScoredResult :: (Aggregate a (ScoredCx b), ScoredResult b) =>
                           a -> ProcessorT IO b
aggregateToScoredResult res
    = do cxScores <- contextWeights <$> getSchema
         return $ boostAndAggregateCx cxScores (aggregate res)


-- eval basic queries

evalPrimary :: Query -> Context -> Processor (ScoredCx ScoredRawDocs)
evalPrimary (QWord QCase w) cx
    = searchCx PrefixCase w cx

evalPrimary (QWord QNoCase w) cx
    = searchCx PrefixNoCase w cx

evalPrimary (QWord QFuzzy w) cx         -- TODO: QFuzzy is processed like nocase search
    = searchCx PrefixNoCase w cx

evalPrimary (QFullWord QCase w) cx
    = searchCx Case w cx

evalPrimary (QFullWord QNoCase w) cx
    = searchCx NoCase w cx

evalPrimary (QFullWord QFuzzy w) cx     -- TODO: QFuzzy is processed like nocase search
    = searchCx NoCase w cx

evalPrimary q _cx
    = queryError 501 $ "evalPrimary: not a primary query: " <> showText q

searchCx :: TextSearchOp -> Word -> Context -> Processor (ScoredCx ScoredRawDocs)
searchCx op w' cx
    = do mw <- normQueryCx cx w'                         -- normalize the word with respect to context
         case mw of
           Nothing  -> return $ fromCxRawResults []      -- if normalization not possible, return empty result
           (Just w) -> searchCx' w
    where
      searchCx' w
        = do
          limit <- asks (docLimit . psConfig)            -- get the max. # of docs
          ix    <- getIx                                 -- get the context search index
          rawr  <- limitRawResult limit
                   <$> CIx.searchWithCxSc op cx w ix     -- do the real search and limit result
          return $ fromCxRawResults [(cx, rawr)]
                                                         -- convert the result to a ScoredResult
                                                         -- the score comes from a similarity test

evalRange :: Word -> Word -> Context -> Processor (ScoredCx ScoredRawDocs)
evalRange lb0 ub0 cx
    = do mlb    <- normQueryCx cx lb0                         -- normalize the word with respect to context
         mub    <- normQueryCx cx ub0                         -- normalize the word with respect to context
         case (mlb, mub) of
           (Just lb, Just ub) -> evalRange' lb ub
           _                  -> return $ fromCxRawResults [] -- if one of the words fails validation, return empty result
    where
      evalRange' lb ub
        = do
          limit <- asks (docLimit . psConfig)
          ix    <- getIx
          rawr  <- limitRawResult limit
                   <$> CIx.lookupRangeCxSc cx lb ub ix
          return $ fromCxRawResults [(cx, rawr)]

-- ------------------------------------------------------------

{-
-- | Log a message at 'WARNING' priority.
warningM :: String -> IO ()
warningM = Log.warningM modName

-- | Log a message at 'ERROR' priority.
errorM :: String -> IO ()
errorM = Log.errorM modName
-}

-- ------------------------------------------------------------