src/Hunt/Query/Intermediate.hs

{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE FlexibleInstances          #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses      #-}
{-# LANGUAGE OverloadedStrings          #-}
{-# LANGUAGE TypeFamilies               #-}

-- ----------------------------------------------------------------------------
{- |
  The intermediate query results which have to be merged for the various combinatorial operations.

  'toResult' creates the final result which includes the document (and word) hits.
-}
-- ----------------------------------------------------------------------------

module Hunt.Query.Intermediate
    ( ScoredResult(..)
    , Aggregate(..)

    , ScoredDocs
    , ScoredWords
    , ScoredContexts
    , ScoredOccs
    , ScoredRawDocs
    , ScoredCx
    , UnScoredDocs
    , RankedDoc (..)

    , toScoredDocs
    , boostAndAggregateCx
    , fromCxRawResults
    , fromRawResult
    , limitRawResult
    , limitCxRawResults
    , contextWeights
    , filterByDocSet
    , toDocIdSet
    , toDocsResult
    , toDocumentResultPage
    , toWordsResult

    , evalSequence
    , evalFollow
    , evalNear
    , evalOr
    , evalAnd
    , evalAndNot
    , evalBoost
    , evalPrim
    )
where

import           Prelude                   hiding (null)

import           Control.Applicative       hiding (empty)
import           Control.Arrow             (second, (***))

import qualified Data.LimitedPriorityQueue as Q
import qualified Data.List                 as L
import           Data.Map                  (Map)
import qualified Data.Map                  as M
import           Data.Maybe
import           Data.Aeson
import qualified Data.HashMap.Strict       as HM
import           Hunt.Query.Result         hiding (null)

import           Hunt.Common
import qualified Hunt.Common.DocIdMap      as DM
import qualified Hunt.Common.DocIdSet      as DS
import           Hunt.Common.Document      (DocumentWrapper (..))
import qualified Hunt.Common.Occurrences   as Occ
import qualified Hunt.Common.Positions     as Pos
import           Hunt.DocTable             (DocTable)
import qualified Hunt.DocTable             as Dt

-- import           Debug.Trace

-- ------------------------------------------------------------
--
-- The Monoid @mempty@ acts as empty result,
-- (<>) is the union of scored results

class Monoid a => ScoredResult a where
    boost            :: Score -> a -> a
    nullSC           :: a -> Bool
    differenceSC     :: a -> a -> a
    intersectSC      :: a -> a -> a
    intersectDisplSC :: Int -> a -> a -> a
    intersectFuzzySC :: Int -> Int -> a -> a -> a

-- ------------------------------------------------------------

-- The result type for document search, every doc is associated with a score

newtype ScoredDocs
    = SDS (DocIdMap Score)
      deriving (Show)

instance Monoid ScoredDocs where
    mempty
        = SDS DM.empty
    mappend (SDS m1) (SDS m2)
        = SDS $ DM.unionWith (<>) m1 m2

instance ScoredResult ScoredDocs where
    boost b (SDS m1)
        = SDS $ DM.map (b *) m1

    nullSC (SDS m1)
        = DM.null m1

    differenceSC (SDS m1) (SDS m2)
        = SDS $ DM.difference m1 m2

    intersectSC (SDS m1) (SDS m2)
        = SDS $ DM.intersectionWith (+) m1 m2

    intersectDisplSC _disp
        = intersectSC

    intersectFuzzySC _lb _ub
        = intersectSC

toScoredDocs :: Occurrences -> ScoredDocs
toScoredDocs os
    = SDS $ DM.map toScore os
      where
        toScore = mkScore . fromIntegral . Pos.size

-- ------------------------------------------------------------

-- | The result type for unscored search of documents
--
-- used when all matching docs must be processed,
-- e.g. in DeleteByQuery commands

newtype UnScoredDocs
    = UDS DS.DocIdSet
      deriving (Show, Monoid)

instance ScoredResult UnScoredDocs where
    boost _b uds
        = uds

    nullSC (UDS s)
        = DS.null s

    differenceSC (UDS s1) (UDS s2)
        = UDS $ DS.difference s1 s2

    intersectSC (UDS s1) (UDS s2)
        = UDS $ DS.intersection s1 s2

    intersectDisplSC _disp
        = intersectSC

    intersectFuzzySC _lb _ub
        = intersectSC

toUnScoredDocs :: Occurrences -> UnScoredDocs
toUnScoredDocs os
    = UDS . DS.fromList . DM.keys $ os

toDocIdSet :: UnScoredDocs -> DS.DocIdSet
toDocIdSet (UDS ds) = ds

-- ------------------------------------------------------------

-- The result type for word search,
-- this is the required result type for completions search

newtype ScoredWords
    = SWS (Map Word Score)
      deriving (Show)

type ScoredContexts = ScoredWords

instance Monoid ScoredWords where
    mempty
        = SWS M.empty

    mappend (SWS m1) (SWS m2)
        = SWS $ M.unionWith (<>) m1 m2

instance ScoredResult ScoredWords where
    boost b (SWS m1)
        = SWS $ M.map (b *) m1

    nullSC (SWS m1)
        = M.null m1

    differenceSC (SWS m1) (SWS m2)
        = SWS $ M.difference m1 m2

    intersectSC (SWS m1) (SWS m2)
        = SWS $ M.intersectionWith (+) m1 m2

    intersectDisplSC _disp
        = intersectSC

    intersectFuzzySC _lb _ub
        = intersectSC

-- ------------------------------------------------------------

data ScoredOccs
    = SCO Score Occurrences
      deriving (Show)

instance Monoid ScoredOccs where
    mempty
        = SCO defScore DM.empty
    mappend (SCO s1 d1) (SCO s2 d2)
        = SCO ((s1 + s2) / 2.0) (DM.unionWith Pos.union d1 d2)

instance ScoredResult ScoredOccs where
    boost b (SCO s d)
        = SCO (b * s) d

    nullSC (SCO _s d)
        = DM.null d

    differenceSC (SCO s1 d1) (SCO _s2 d2)
        = SCO s1 (DM.difference d1 d2)

    intersectSC (SCO s1 d1) (SCO s2 d2)
        = SCO (s1 + s2) (Occ.intersectOccurrences Pos.union d1 d2)

    intersectDisplSC disp (SCO s1 d1) (SCO s2 d2)
        = SCO (s1 + s2) (Occ.intersectOccurrences (Pos.intersectionWithDispl disp) d1 d2)

    intersectFuzzySC lb ub (SCO s1 d1) (SCO s2 d2)
        = SCO (s1 + s2) (Occ.intersectOccurrences (Pos.intersectionWithIntervall lb ub) d1 d2)

-- ------------------------------------------------------------

-- A result type for searching phrases and context search,
-- every word hit or word sequence hit is
-- associated with a score and a DocIdMap containing
-- the docs and position.
--
-- The positions are neccessary in phrase and context search,
-- afterwards the positions can be accumulated into a score,
-- e.g. by counting the occurences
--
-- A RawResult can easily be converted into this type

newtype ScoredRawDocs
    = SRD [([Word], ScoredOccs)]
      deriving (Show)

instance Monoid ScoredRawDocs where
    mempty
        = SRD []
    mappend (SRD xs1) (SRD xs2)
        = SRD $ xs1 ++ xs2

instance ScoredResult ScoredRawDocs where
    boost b (SRD xs)
        = SRD $ L.map (second (boost b)) xs

    nullSC (SRD xs)
        = L.null xs

    differenceSC (SRD xs1) (SRD xs2)
        = srd $ [(ws1, SCO sc1 (diff occ1 xs2)) | (ws1, SCO sc1 occ1) <- xs1]
          where
            diff occ xs
                = L.foldl op occ xs
                  where
                    op occ' (_ws2, SCO _sc2 occ2)
                        = DM.difference occ' occ2

    intersectSC
        = intersectSRD intersectSC

    intersectDisplSC d
        = intersectSRD $ intersectDisplSC d

    intersectFuzzySC lb ub
        = intersectSRD $ intersectFuzzySC lb ub

-- ------------------------------------------------------------
--
-- auxiliary functions for ScoredRawDocs

-- | generalized intersection operator for ScoredRawDocs

intersectSRD :: (ScoredOccs -> ScoredOccs -> ScoredOccs)
              -> ScoredRawDocs -> ScoredRawDocs -> ScoredRawDocs
intersectSRD interSRD (SRD xs1) (SRD xs2)
    = srd $ [ (ws1 ++ ws2, interSRD socc1 socc2)
            | (ws1, socc1) <- xs1
            , (ws2, socc2) <- xs2
            ]

-- | smart constructor for ScoredRawDoc removing empty entries

srd :: [([Word], ScoredOccs)] -> ScoredRawDocs
srd
    = SRD . L.filter (not . nullSC . snd)

filterByDocSet :: UnScoredDocs -> ScoredRawDocs -> ScoredRawDocs
filterByDocSet (UDS ds) (SRD xs)
    = SRD $ concatMap filterDocs xs
      where
        filterDocs (ws, SCO sc occ)
            | Occ.null occ'
                = []
            | otherwise
                = [(ws, SCO sc occ')]
            where
              occ' = DM.filterWithKey p occ
                  where
                    p k _v = k `DS.member` ds

-- ------------------------------------------------------------

-- | Add the Context dimension to a scored result

newtype ScoredCx a
    = SCX (Map Context a)
      deriving (Show)

instance Functor ScoredCx where
    fmap f (SCX m) = SCX $ M.map f m

instance Monoid a => Monoid (ScoredCx a) where
    mempty
        = SCX M.empty
    mappend (SCX m1) (SCX m2)
        = SCX $ M.unionWith (<>) m1 m2

instance ScoredResult a => ScoredResult (ScoredCx a) where
    boost b (SCX m)
        = SCX $ M.map (boost b) m

    nullSC (SCX m)
        = M.null m

    differenceSC
        = binopSCX differenceSC

    intersectSC
        = binopSCX intersectSC

    intersectDisplSC d
        = binopSCX $ intersectDisplSC d

    intersectFuzzySC lb ub
        = binopSCX $ intersectFuzzySC lb ub


-- | The final op for a search result: boost the partial results of the contexts
-- with the context weights from the schema and aggregate them
-- by throwing away the contexts

boostAndAggregateCx :: ScoredResult a => ScoredContexts -> ScoredCx a -> a
boostAndAggregateCx (SWS sm) (SCX m)
    = M.foldlWithKey op mempty m
      where
        op res k x
            = boost b x <> res
              where
                b = fromMaybe defScore $ M.lookup k sm

contextWeights :: Schema -> ScoredContexts
contextWeights s
    = SWS $ M.map cxWeight s

-- ------------------------------------------------------------
--
-- auxiliary functions for scored context results

binopSCX :: ScoredResult a => (a -> a -> a) -> (ScoredCx a -> ScoredCx a -> ScoredCx a)
binopSCX op (SCX m1) (SCX m2)
    = scx $ M.mapWithKey op' m1
          where
            op' cx1 sr1
                = sr1 `op` sr2
                  where
                    sr2 = fromMaybe mempty $ M.lookup cx1 m2


-- | smart constructore for ScoredCx removing empty contexts

scx :: ScoredResult a => Map Word a -> ScoredCx a
scx = SCX . M.filter (not . nullSC)

-- ------------------------------------------------------------
--
-- conversions from RawResult into scored results

type CxRawResults = [(Context, RawScoredResult)]

fromCxRawResults :: CxRawResults -> ScoredCx ScoredRawDocs
fromCxRawResults crs
    = -- traceShow ("fromCxRawResults:"::String, crs, res) $
      res
      where
        res = mconcat $ L.map (uncurry $ fromRawResult) crs

fromRawResult :: Context -> RawScoredResult -> ScoredCx ScoredRawDocs
fromRawResult cx rr
    = SCX $ M.singleton cx sr
      where
        sr = SRD $ L.map toScored rr
            where
              toScored (w, (sc, occ))
                  = ([w], SCO sc occ)

limitCxRawResults :: Int -> CxRawResults -> CxRawResults
limitCxRawResults mx
    = L.map (second $ limitRawResult mx)

limitRawResult :: Int -> RawScoredResult -> RawScoredResult
limitRawResult maxDocs rs
    | maxDocs <= 0 = rs
    | otherwise = takeDocs maxDocs rs
    where
      takeDocs _  xs@[]
          = xs
      takeDocs _  xs@[_]
          = xs
      takeDocs mx (r@(_w, (_sc, occ)) : xs)
          = case DM.sizeWithLimit mx occ of
              Nothing -> [r]
              Just s  -> let mx' = mx - s in
                         if mx' <= 0
                            then [r]
                            else r : takeDocs mx' xs

-- | convert a set of scored doc ids into a list of documents containing the score
--
-- This list may be further sorted by score, partitioned into pages or ...

toDocsResult :: (Applicative m, Monad m, DocTable dt) =>
                dt -> ScoredDocs -> m [RankedDoc]
toDocsResult dt (SDS m)
    = mapM toDoc (DM.toList m)
      where
        toDoc (did, sc)
            = (toD . fromJust') <$> Dt.lookup did dt
            where
              toD d'
                  = RD (wght d * sc, d)
                  where
                    d = unwrap d'
        fromJust' (Just x) = x
        fromJust' Nothing  = error "Intermediate.toDocsResult: undefined"
-- ----------------------------------------
-- ranking of documents

-- define a total ordering over documents by taking score and uri into account
-- this is used for ranking (only)
--
-- XXX TODO: maybe move to Common modules, since this type is now the regular
--           return type

newtype RankedDoc = RD {unRD :: (Score, Document)}
  deriving Show

instance Eq RankedDoc where
    (RD (sc1,d1)) == (RD (sc2,d2))
        = sc1 == sc2
          &&
          uri   d1 == uri   d2

instance Ord RankedDoc where
    (RD (sc1,d1)) `compare` (RD (sc2,d2))
        = case sc1 `compare` sc2 of
            EQ -> uri d2 `compare` uri d1
            r  -> r

instance ToJSON RankedDoc where
    toJSON (RD (c,d))
      = addScore $ toJSON d
      where
        addScore (Object hm)
          = Object $ HM.insert "score" (toJSON c) hm
        addScore _
          = error "toJSON rankedDoc: propably a bug introduced with #75"

toDocumentResultPage :: Int -> Int -> [RankedDoc] -> [RankedDoc]
toDocumentResultPage = Q.pageList

-- ----------------------------------------
-- ranking of completions

data RankedWord = RW Score Word
                   deriving Eq

instance Ord RankedWord where
    (RW s1 w1) `compare` (RW s2 w2)
        = case s1 `compare` s2 of
            EQ -> w2 `compare` w1
            r  -> r

toWordsResult :: Int -> ScoredWords -> [(Word, Score)]
toWordsResult len (SWS m)
    = map (\ (RW s w) -> (w, s))
      . Q.toList 0 len
      . M.foldWithKey (\ w s -> Q.insert (RW s w)) (Q.mkQueue len)
      $ m

-- ------------------------------------------------------------
--
-- aggregating (raw) results for various types of scored results

class Aggregate a b where
    aggregate :: a -> b

-- | allow no aggregation

instance Aggregate a a where
    aggregate = id

-- | aggregate scored docs to a single score by summing up the scores and throw away the DocIds

instance Aggregate ScoredDocs Score where
    aggregate (SDS m)
        = DM.foldr (<>) defScore m


-- | aggregate scored occurences by counting the positions per doc and boost the
-- result with the score.
--
-- The positions in a doc are thrown away, so all kinds of phrase search become impossible

instance Aggregate ScoredOccs ScoredDocs where
    aggregate (SCO sc occ)
        = SDS $ DM.map toScore occ
          where
            toScore = (sc *) . mkScore . fromIntegral . Pos.size

-- | aggregate scored occurrences to unscored docs by throwing away the score
instance Aggregate ScoredOccs UnScoredDocs where
    aggregate (SCO _sc occ)
        = toUnScoredDocs occ

-- | aggregate scored occurences to a score by aggregating first the positions and snd the doc ids
--
-- used in computing the score of word in completion search

instance Aggregate ScoredOccs Score where
    aggregate = agg2 . agg1
        where
          agg1 :: ScoredOccs -> ScoredDocs
          agg1 = aggregate
          agg2 :: ScoredDocs -> Score
          agg2 = aggregate

-- | aggregate scored raw results by throwing away the word hits and aggregating
-- all document hits by throwing away the positions.
--
-- The function for computing the scores of documents in a query

instance Aggregate ScoredRawDocs ScoredDocs where
    aggregate (SRD xs)
        = L.foldl (<>) mempty
          $ L.map (aggregate . snd) xs

-- | aggregate scored raw results into an unscored result by throwing away
-- the words, scores and occurrences

instance Aggregate ScoredRawDocs UnScoredDocs where
    aggregate (SRD xs)
        = L.foldl (<>) mempty
          $ L.map (aggregate . snd) xs

-- | aggregate the scored raw results by computing the score of all doc per word
--
-- Used in completion search.
-- The sequence of words (in a phrase) is cut to the last word (@L.last@). For this last one
-- the completions are computed.

instance Aggregate ScoredRawDocs ScoredWords where
    aggregate (SRD xs)
        = SWS
          $ L.foldl (\ res (w, sc) -> M.insertWith (+) w sc res) M.empty
          $ L.map (L.last *** aggregate) xs

-- | Lifting aggregation to scored context results

instance Aggregate a b => Aggregate (ScoredCx a) (ScoredCx b) where
    aggregate (SCX m)
        = SCX $ M.map aggregate m

-- ------------------------------------------------------------

-- query operator evaluation

-- | combine a sequence of results from a phrase query into a single result
-- and aggregate this result into a simpler structure,
-- e.g. a @ScoredDocs@ or @ScoredWords@ value
--
-- The arguments must be of this unaggregated from, still containing word
-- positions, else sequences of words are not detected

evalSequence :: (ScoredResult r, Aggregate ScoredRawDocs r) =>
                (ScoredCx ScoredRawDocs -> ScoredCx r) ->
                [ScoredCx ScoredRawDocs] -> ScoredCx r
evalSequence _aggr []
    = mempty
evalSequence aggr [r1]
    = aggr r1
evalSequence aggr (r1 : rs)
    = aggr $ L.foldl op r1 (L.zip [(1::Int)..] rs)
      where
        op acc (d, r2) = intersectDisplSC d acc r2

evalFollow :: (ScoredResult r, Aggregate ScoredRawDocs r) =>
              (ScoredCx ScoredRawDocs -> ScoredCx r) ->
              Int ->
              [ScoredCx ScoredRawDocs] -> ScoredCx r
evalFollow _aggr _d []
    = mempty
evalFollow aggr _d [r1]
    = aggr r1
evalFollow aggr dp (r1 : rs)
    = aggr $ L.foldl op r1 (L.zip [dp, (2 * dp) ..] rs)
      where
        op acc (d, r2) = intersectFuzzySC 1 d acc r2

evalNear :: (ScoredResult r, Aggregate ScoredRawDocs r) =>
            (ScoredCx ScoredRawDocs -> ScoredCx r) ->
            Int ->
           [ScoredCx ScoredRawDocs] -> ScoredCx r
evalNear _aggr _d []
    = mempty
evalNear aggr _d [r1]
    = aggr r1
evalNear aggr dp (r1 : rs)
    = aggr $ L.foldl op r1 (L.zip [dp, (2 * dp) ..] rs)
      where
        op acc (d, r2) = intersectFuzzySC (-d) d acc r2

evalOr, evalAnd, evalAndNot :: ScoredResult a => [a] -> a
evalOr     = evalBinary (<>)
evalAnd    = evalBinary intersectSC
evalAndNot = evalBinary differenceSC

evalBinary :: ScoredResult a => (a -> a -> a) -> [a] -> a
evalBinary _ []
    = mempty
evalBinary _ [r]
    = r
evalBinary op rs
    = L.foldl1 op rs

evalBoost :: ScoredResult a => Score -> a -> a
evalBoost = boost

evalPrim :: Aggregate (ScoredCx ScoredRawDocs) (ScoredCx r) => ScoredCx ScoredRawDocs -> ScoredCx r
evalPrim = aggregate

-- ------------------------------------------------------------