{-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards,
             ScopedTypeVariables #-}

module Data.SearchEngine.Autosuggest (

    -- * Query auto-completion \/ auto-suggestion
    queryAutosuggest,
    ResultsFilter(..),

    queryAutosuggestPredicate,
    queryAutosuggestMatchingDocuments

  ) where

import Data.SearchEngine.Types
import Data.SearchEngine.Query (ResultsFilter(..))
import qualified Data.SearchEngine.Query       as Query
import qualified Data.SearchEngine.SearchIndex as SI
import qualified Data.SearchEngine.DocIdSet    as DocIdSet
import qualified Data.SearchEngine.DocTermIds  as DocTermIds
import qualified Data.SearchEngine.BM25F       as BM25F

import Data.Ix
import Data.Ord
import Data.List
import Data.Maybe
import qualified Data.Map as Map
import qualified Data.IntSet as IntSet
import qualified Data.Vector.Unboxed as Vec


-- | Execute an \"auto-suggest\" query. This is where one of the search terms
-- is an incomplete prefix and we are looking for possible completions of that
-- search term, and result documents to go with the possible completions.
--
-- An auto-suggest query only gives useful results when the 'SearchEngine' is
-- configured to use a non-term feature score. That is, when we can give
-- documents an importance score independent of what terms we are looking for.
-- This is because an auto-suggest query is backwards from a normal query: we
-- are asking for relevant terms occurring in important or popular documents
-- so we need some notion of important or popular. Without this we would just
-- be ranking based on term frequency which while it makes sense for normal
-- \"forward\" queries is pretty meaningless for auto-suggest \"reverse\"
-- queries. Indeed for single-term auto-suggest queries the ranking function
-- we use will assign 0 for all documents and completions if there is no 
-- non-term feature scores.
--
queryAutosuggest :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
                    SearchEngine doc key field feature ->
                    ResultsFilter key ->
                    [Term] -> Term -> ([(Term, Float)], [(key, Float)])
queryAutosuggest :: forall field feature doc key.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> ResultsFilter key
-> [Term]
-> Term
-> ([(Term, Float)], [(key, Float)])
queryAutosuggest SearchEngine doc key field feature
se ResultsFilter key
resultsFilter [Term]
precedingTerms Term
partialTerm =

     forall {v}.
([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)])
step_external
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall {a} {b}.
([(a, Float)], [(b, Float)]) -> ([(a, Float)], [(b, Float)])
step_rank
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)],
 [(TermId, Float)])
-> ([(TermId, Float)], [(DocId, Float)])
step_scoreDs
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)])
-> (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)],
    [(TermId, Float)])
step_scoreTs
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall {b}.
([TermId], [(DocId, (key, DocTermIds field, DocFeatVals feature))],
 b, [TermId])
-> (Map DocId (Float, Map TermId Float), b)
step_cache
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall {a} {a} {a} {a}.
([a], [a], [a], [a]) -> ([a], [a], [a], [a])
step_postfilterlimit
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall {a} {c} {d}.
(a, DocIdSet, c, d)
-> (a, [(DocId, (key, DocTermIds field, DocFeatVals feature))], c,
    d)
step_filter
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall {a} {a} {a}.
([a], DocIdSet, [a], [a]) -> ([a], DocIdSet, [a], [a])
step_prefilterlimit
   forall b c a. (b -> c) -> (a -> b) -> a -> c
. AutosuggestQuery
-> ([TermId], DocIdSet, [(TermId, DocIdSet)], [TermId])
step_process
   forall a b. (a -> b) -> a -> b
$ [Term] -> Term -> AutosuggestQuery
step_prep
       [Term]
precedingTerms Term
partialTerm

  where
    -- Construct the auto-suggest query from the query terms
    step_prep :: [Term] -> Term -> AutosuggestQuery
step_prep [Term]
pre_ts Term
t = forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature
-> [Term] -> Term -> AutosuggestQuery
mkAutosuggestQuery SearchEngine doc key field feature
se [Term]
pre_ts Term
t

    -- Find the appropriate subset of ts and ds
    -- and an intermediate result that will be useful later:
    -- { (t, ds ∩ ds_t) | t ∈ ts, ds ∩ ds_t ≠ ∅ }
    step_process :: AutosuggestQuery
-> ([TermId], DocIdSet, [(TermId, DocIdSet)], [TermId])
step_process (Map TermId DocIdSet
ts, Maybe DocIdSet
ds, [TermId]
pre_ts) = ([TermId]
ts', DocIdSet
ds', [(TermId, DocIdSet)]
tdss', [TermId]
pre_ts)
      where
        ([(TermId, DocIdSet)]
tdss', [TermId]
ts', DocIdSet
ds') = AutosuggestQuery -> ([(TermId, DocIdSet)], [TermId], DocIdSet)
processAutosuggestQuery (Map TermId DocIdSet
ts, Maybe DocIdSet
ds, [TermId]
pre_ts)

    -- If the number of docs results is huge then we may not want to bother
    -- and just return no results. Even the filtering of a huge number of
    -- docs can be expensive.
    step_prefilterlimit :: ([a], DocIdSet, [a], [a]) -> ([a], DocIdSet, [a], [a])
step_prefilterlimit args :: ([a], DocIdSet, [a], [a])
args@([a]
_, DocIdSet
ds, [a]
_, [a]
_)
      | forall doc key field feature.
SearchEngine doc key field feature -> DocIdSet -> Bool
withinPrefilterLimit SearchEngine doc key field feature
se DocIdSet
ds = ([a], DocIdSet, [a], [a])
args
      | Bool
otherwise                  = ([], DocIdSet
DocIdSet.empty, [], [])

    -- Filter ds to those that are visible for this query
    -- and at the same time, do the docid -> docinfo lookup
    -- (needed at this step anyway to do the filter)
    step_filter :: (a, DocIdSet, c, d)
-> (a, [(DocId, (key, DocTermIds field, DocFeatVals feature))], c,
    d)
step_filter (a
ts, DocIdSet
ds, c
tdss, d
pre_ts) = (a
ts, [(DocId, (key, DocTermIds field, DocFeatVals feature))]
ds_info, c
tdss, d
pre_ts)
      where
        ds_info :: [(DocId, (key, DocTermIds field, DocFeatVals feature))]
ds_info = forall doc key field feature.
SearchEngine doc key field feature
-> ResultsFilter key
-> DocIdSet
-> [(DocId, (key, DocTermIds field, DocFeatVals feature))]
filterAutosuggestQuery SearchEngine doc key field feature
se ResultsFilter key
resultsFilter DocIdSet
ds

    -- If the number of docs results is huge then we may not want to bother
    -- and just return no results. Scoring a large number of docs is expensive.
    step_postfilterlimit :: ([a], [a], [a], [a]) -> ([a], [a], [a], [a])
step_postfilterlimit args :: ([a], [a], [a], [a])
args@([a]
_, [a]
ds_info, [a]
_, [a]
_)
      | forall doc key field feature a.
SearchEngine doc key field feature -> [a] -> Bool
withinPostfilterLimit SearchEngine doc key field feature
se [a]
ds_info = ([a], [a], [a], [a])
args
      | Bool
otherwise                        = ([], [], [], [])

    -- For all ds, calculate and cache a couple bits of info needed
    -- later for scoring completion terms and doc results
    step_cache :: ([TermId], [(DocId, (key, DocTermIds field, DocFeatVals feature))],
 b, [TermId])
-> (Map DocId (Float, Map TermId Float), b)
step_cache ([TermId]
ts, [(DocId, (key, DocTermIds field, DocFeatVals feature))]
ds_info, b
tdss, [TermId]
pre_ts) = (Map DocId (Float, Map TermId Float)
ds_info', b
tdss)
      where
        ds_info' :: Map DocId (Float, Map TermId Float)
ds_info' = forall field feature doc key.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> [TermId]
-> [(DocId, (key, DocTermIds field, DocFeatVals feature))]
-> [TermId]
-> Map DocId (Float, Map TermId Float)
cacheDocScoringInfo SearchEngine doc key field feature
se [TermId]
ts [(DocId, (key, DocTermIds field, DocFeatVals feature))]
ds_info [TermId]
pre_ts

    -- Score the completion terms
    step_scoreTs :: (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)])
-> (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)],
    [(TermId, Float)])
step_scoreTs (Map DocId (Float, Map TermId Float)
ds_info, [(TermId, DocIdSet)]
tdss) = (Map DocId (Float, Map TermId Float)
ds_info, [(TermId, DocIdSet)]
tdss, [(TermId, Float)]
ts_scored)
      where
        ts_scored :: [(TermId, Float)]
ts_scored = [(TermId, DocIdSet)]
-> Map DocId (Float, Map TermId Float) -> [(TermId, Float)]
scoreAutosuggestQueryCompletions [(TermId, DocIdSet)]
tdss Map DocId (Float, Map TermId Float)
ds_info

    -- Score the doc results (making use of the completion scores)
    step_scoreDs :: (Map DocId (Float, Map TermId Float), [(TermId, DocIdSet)],
 [(TermId, Float)])
-> ([(TermId, Float)], [(DocId, Float)])
step_scoreDs (Map DocId (Float, Map TermId Float)
ds_info, [(TermId, DocIdSet)]
tdss, [(TermId, Float)]
ts_scored) = ([(TermId, Float)]
ts_scored, [(DocId, Float)]
ds_scored)
      where
        ds_scored :: [(DocId, Float)]
ds_scored = [(TermId, DocIdSet)]
-> Map DocId (Float, Map TermId Float)
-> [(TermId, Float)]
-> [(DocId, Float)]
scoreAutosuggestQueryResults [(TermId, DocIdSet)]
tdss Map DocId (Float, Map TermId Float)
ds_info [(TermId, Float)]
ts_scored

    -- Rank the completions and results based on their scores
    step_rank :: ([(a, Float)], [(b, Float)]) -> ([(a, Float)], [(b, Float)])
step_rank = forall av bv a b.
(Ord av, Ord bv) =>
([(a, av)], [(b, bv)]) -> ([(a, av)], [(b, bv)])
sortResults

    -- Convert from internal Ids into external forms: Term and doc key
    step_external :: ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)])
step_external = forall doc key field feature v.
SearchEngine doc key field feature
-> ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)])
convertIdsToExternal SearchEngine doc key field feature
se


-- | Given an incomplete prefix query, find the set of documents that match
-- possible completions of that query.  This should be less computationally
-- expensive than 'queryAutosuggest' as it does not do any ranking of documents.
-- However, it does not apply the pre-filter or post-filter limits, and the list
-- may be large when the query terms occur in many documents.  The order of
-- returned keys is unspecified.
queryAutosuggestMatchingDocuments :: (Ix field, Bounded field, Ord key) =>
                                     SearchEngine doc key field feature ->
                                     [Term] -> Term -> [key]
queryAutosuggestMatchingDocuments :: forall field key doc feature.
(Ix field, Bounded field, Ord key) =>
SearchEngine doc key field feature -> [Term] -> Term -> [key]
queryAutosuggestMatchingDocuments se :: SearchEngine doc key field feature
se@SearchEngine{SearchIndex key field feature
searchIndex :: forall doc key field feature.
SearchEngine doc key field feature -> SearchIndex key field feature
searchIndex :: SearchIndex key field feature
searchIndex} [Term]
precedingTerms Term
partialTerm =
    let ([(TermId, DocIdSet)]
_, [TermId]
_, DocIdSet
ds) = AutosuggestQuery -> ([(TermId, DocIdSet)], [TermId], DocIdSet)
processAutosuggestQuery (forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature
-> [Term] -> Term -> AutosuggestQuery
mkAutosuggestQuery SearchEngine doc key field feature
se [Term]
precedingTerms Term
partialTerm)
    in forall a b. (a -> b) -> [a] -> [b]
map (forall key field feature.
SearchIndex key field feature -> DocId -> key
SI.getDocKey SearchIndex key field feature
searchIndex) (DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds)

-- | Given an incomplete prefix query, return a predicate that indicates whether
-- a key is in the set of documents that match possible completions of that
-- query.  This is equivalent to calling 'queryAutosuggestMatchingDocuments' and
-- testing whether the key is in the list, but should be more efficient.
--
-- This does not apply the pre-filter or post-filter limits.
queryAutosuggestPredicate :: (Ix field, Bounded field, Ord key) =>
                             SearchEngine doc key field feature ->
                             [Term] -> Term -> (key -> Bool)
queryAutosuggestPredicate :: forall field key doc feature.
(Ix field, Bounded field, Ord key) =>
SearchEngine doc key field feature -> [Term] -> Term -> key -> Bool
queryAutosuggestPredicate se :: SearchEngine doc key field feature
se@SearchEngine{SearchIndex key field feature
searchIndex :: SearchIndex key field feature
searchIndex :: forall doc key field feature.
SearchEngine doc key field feature -> SearchIndex key field feature
searchIndex} [Term]
precedingTerms Term
partialTerm =
    let ([(TermId, DocIdSet)]
_, [TermId]
_, DocIdSet
ds) = AutosuggestQuery -> ([(TermId, DocIdSet)], [TermId], DocIdSet)
processAutosuggestQuery (forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature
-> [Term] -> Term -> AutosuggestQuery
mkAutosuggestQuery SearchEngine doc key field feature
se [Term]
precedingTerms Term
partialTerm)
    in (\ key
key -> forall b a. b -> (a -> b) -> Maybe a -> b
maybe Bool
False (forall a b c. (a -> b -> c) -> b -> a -> c
flip DocId -> DocIdSet -> Bool
DocIdSet.member DocIdSet
ds) (forall key field feature.
Ord key =>
SearchIndex key field feature -> key -> Maybe DocId
SI.lookupDocKeyDocId SearchIndex key field feature
searchIndex key
key))


-- We apply hard limits both before and after filtering.
-- The post-filter limit is to avoid scoring 1000s of documents.
-- The pre-filter limit is to avoid filtering 1000s of docs (which in some
-- apps may be expensive itself)

withinPrefilterLimit :: SearchEngine doc key field feature ->
                        DocIdSet -> Bool
withinPrefilterLimit :: forall doc key field feature.
SearchEngine doc key field feature -> DocIdSet -> Bool
withinPrefilterLimit SearchEngine{SearchRankParameters field feature
searchRankParams :: forall doc key field feature.
SearchEngine doc key field feature
-> SearchRankParameters field feature
searchRankParams :: SearchRankParameters field feature
searchRankParams} DocIdSet
ds =
    DocIdSet -> Int
DocIdSet.size DocIdSet
ds forall a. Ord a => a -> a -> Bool
<= forall field feature. SearchRankParameters field feature -> Int
paramAutosuggestPrefilterLimit SearchRankParameters field feature
searchRankParams

withinPostfilterLimit :: SearchEngine doc key field feature ->
                         [a] -> Bool
withinPostfilterLimit :: forall doc key field feature a.
SearchEngine doc key field feature -> [a] -> Bool
withinPostfilterLimit SearchEngine{SearchRankParameters field feature
searchRankParams :: SearchRankParameters field feature
searchRankParams :: forall doc key field feature.
SearchEngine doc key field feature
-> SearchRankParameters field feature
searchRankParams} [a]
ds_info =
    forall (t :: * -> *) a. Foldable t => t a -> Int
length [a]
ds_info forall a. Ord a => a -> a -> Bool
<= forall field feature. SearchRankParameters field feature -> Int
paramAutosuggestPostfilterLimit SearchRankParameters field feature
searchRankParams


sortResults :: (Ord av, Ord bv) => ([(a,av)], [(b,bv)]) -> ([(a,av)], [(b,bv)])
sortResults :: forall av bv a b.
(Ord av, Ord bv) =>
([(a, av)], [(b, bv)]) -> ([(a, av)], [(b, bv)])
sortResults ([(a, av)]
xs, [(b, bv)]
ys) =
    ( forall v x. Ord v => [(x, v)] -> [(x, v)]
sortBySndDescending [(a, av)]
xs
    , forall v x. Ord v => [(x, v)] -> [(x, v)]
sortBySndDescending [(b, bv)]
ys )
  where
    sortBySndDescending :: Ord v => [(x,v)] -> [(x,v)]
    sortBySndDescending :: forall v x. Ord v => [(x, v)] -> [(x, v)]
sortBySndDescending = forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b c. (a -> b -> c) -> b -> a -> c
flip (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing forall a b. (a, b) -> b
snd))

convertIdsToExternal :: SearchEngine doc key field feature ->
                        ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)])
convertIdsToExternal :: forall doc key field feature v.
SearchEngine doc key field feature
-> ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)])
convertIdsToExternal SearchEngine{SearchIndex key field feature
searchIndex :: SearchIndex key field feature
searchIndex :: forall doc key field feature.
SearchEngine doc key field feature -> SearchIndex key field feature
searchIndex} ([(TermId, v)]
termids, [(DocId, v)]
docids) =
    ( [ (forall key field feature.
SearchIndex key field feature -> TermId -> Term
SI.getTerm   SearchIndex key field feature
searchIndex TermId
termid, v
s) | (TermId
termid, v
s) <- [(TermId, v)]
termids ]
    , [ (forall key field feature.
SearchIndex key field feature -> DocId -> key
SI.getDocKey SearchIndex key field feature
searchIndex DocId
docid,  v
s) | (DocId
docid,  v
s) <- [(DocId, v)]
docids  ]
    )


-- From Bast and Weber:
--
--   An autocompletion query is a pair (T, D), where T is a range of terms
--   (all possible completions of the last term which the user has started
--   typing) and D is a set of documents (the hits for the preceding part of
--   the query).
--
-- We augment this with the preceding terms because we will need these to
-- score the set of documents D.
--
-- Note that the set D will be the entire collection in the case that the
-- preceding part of the query is empty. For efficiency we represent that
-- case specially with Maybe.

type AutosuggestQuery = (Map.Map TermId DocIdSet, Maybe DocIdSet, [TermId])

mkAutosuggestQuery :: (Ix field, Bounded field) =>
                      SearchEngine doc key field feature ->
                      [Term] -> Term -> AutosuggestQuery
mkAutosuggestQuery :: forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature
-> [Term] -> Term -> AutosuggestQuery
mkAutosuggestQuery se :: SearchEngine doc key field feature
se@SearchEngine{ SearchIndex key field feature
searchIndex :: SearchIndex key field feature
searchIndex :: forall doc key field feature.
SearchEngine doc key field feature -> SearchIndex key field feature
searchIndex }
                   [Term]
precedingTerms Term
partialTerm =
    (Map TermId DocIdSet
completionTerms, Maybe DocIdSet
precedingDocHits, [TermId]
precedingTerms')
  where
    completionTerms :: Map TermId DocIdSet
completionTerms =
      forall (f :: * -> *) k a.
(Foldable f, Ord k) =>
f (Map k a) -> Map k a
Map.unions
        [ forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList (forall key field feature.
SearchIndex key field feature -> Term -> [(TermId, DocIdSet)]
SI.lookupTermsByPrefix SearchIndex key field feature
searchIndex Term
partialTerm')
        | Term
partialTerm' <- forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature -> Term -> [Term]
Query.expandTransformedQueryTerm SearchEngine doc key field feature
se Term
partialTerm
        ]

    ([TermId]
precedingTerms', Maybe DocIdSet
precedingDocHits)
      | forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Term]
precedingTerms = ([], forall a. Maybe a
Nothing)
      | Bool
otherwise           = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. a -> Maybe a
Just forall b c a. (b -> c) -> (a -> b) -> a -> c
. [DocIdSet] -> DocIdSet
DocIdSet.unions)
                                   ([Term] -> ([TermId], [DocIdSet])
lookupRawResults [Term]
precedingTerms)

    lookupRawResults :: [Term] -> ([TermId], [DocIdSet])
    lookupRawResults :: [Term] -> ([TermId], [DocIdSet])
lookupRawResults [Term]
ts =
      forall a b. [(a, b)] -> ([a], [b])
unzip forall a b. (a -> b) -> a -> b
$ forall a. [Maybe a] -> [a]
catMaybes
        [ forall key field feature.
SearchIndex key field feature -> Term -> Maybe (TermId, DocIdSet)
SI.lookupTerm SearchIndex key field feature
searchIndex Term
t'
        | Term
t  <- [Term]
ts
        , Term
t' <- forall field doc key feature.
(Ix field, Bounded field) =>
SearchEngine doc key field feature -> Term -> [Term]
Query.expandTransformedQueryTerm SearchEngine doc key field feature
se Term
t
        ]



-- From Bast and Weber:
--
--   To process the query means to compute the subset T' ⊆ T of terms that
--   occur in at least one document from D, as well as the subset D' ⊆ D of
--   documents that contain at least one of these words.
--
--   The obvious way to use an inverted index to process an autocompletion
--   query (T, D) is to compute, for each t ∈ T, the intersections D ∩ Dt.
--   Then, T' is simply the set of all t for which the intersection was
--   non-empty, and D' is the union of all (non-empty) intersections.
--
-- We will do this but additionally we will return all the non-empty
-- intersections because they will be useful when scoring.

processAutosuggestQuery :: AutosuggestQuery ->
                           ([(TermId, DocIdSet)], [TermId], DocIdSet)
processAutosuggestQuery :: AutosuggestQuery -> ([(TermId, DocIdSet)], [TermId], DocIdSet)
processAutosuggestQuery (Map TermId DocIdSet
completionTerms, Maybe DocIdSet
precedingDocHits, [TermId]
_) =
    ( [(TermId, DocIdSet)]
completionTermAndDocSets
    , [TermId]
completionTerms'
    , DocIdSet
allTermDocSet
    )
  where
    -- We look up each candidate completion to find the set of documents
    -- it appears in, and filtering (intersecting) down to just those
    -- appearing in the existing partial query results (if any).
    -- Candidate completions not appearing at all within the existing
    -- partial query results are excluded at this stage.
    --
    -- We have to keep these doc sets for the whole process, so we keep
    -- them as the compact DocIdSet type.
    --
    completionTermAndDocSets :: [(TermId, DocIdSet)]
    completionTermAndDocSets :: [(TermId, DocIdSet)]
completionTermAndDocSets =
      [ (TermId
t, DocIdSet
ds_t')
      | (TermId
t, DocIdSet
ds_t) <- forall k a. Map k a -> [(k, a)]
Map.toList Map TermId DocIdSet
completionTerms
      , let ds_t' :: DocIdSet
ds_t' = case Maybe DocIdSet
precedingDocHits of
                      Just DocIdSet
ds -> DocIdSet
ds DocIdSet -> DocIdSet -> DocIdSet
`DocIdSet.intersection` DocIdSet
ds_t
                      Maybe DocIdSet
Nothing -> DocIdSet
ds_t
      , Bool -> Bool
not (DocIdSet -> Bool
DocIdSet.null DocIdSet
ds_t')
      ]

    -- The remaining candidate completions
    completionTerms' :: [TermId]
completionTerms' = [ TermId
w | (TermId
w, DocIdSet
_ds_w) <- [(TermId, DocIdSet)]
completionTermAndDocSets ]

    -- The union of all these is this set of documents that form the results.
    allTermDocSet :: DocIdSet
    allTermDocSet :: DocIdSet
allTermDocSet =
      [DocIdSet] -> DocIdSet
DocIdSet.unions [ DocIdSet
ds_t | (TermId
_t, DocIdSet
ds_t) <- [(TermId, DocIdSet)]
completionTermAndDocSets ]


filterAutosuggestQuery :: SearchEngine doc key field feature ->
                          ResultsFilter key ->
                          DocIdSet ->
                          [(DocId, (key, DocTermIds field, DocFeatVals feature))]
filterAutosuggestQuery :: forall doc key field feature.
SearchEngine doc key field feature
-> ResultsFilter key
-> DocIdSet
-> [(DocId, (key, DocTermIds field, DocFeatVals feature))]
filterAutosuggestQuery SearchEngine{ SearchIndex key field feature
searchIndex :: SearchIndex key field feature
searchIndex :: forall doc key field feature.
SearchEngine doc key field feature -> SearchIndex key field feature
searchIndex } ResultsFilter key
resultsFilter DocIdSet
ds =
    case ResultsFilter key
resultsFilter of
      ResultsFilter key
NoFilter ->
        [ (DocId
docid, (key, DocTermIds field, DocFeatVals feature)
doc)
        | DocId
docid <- DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds
        , let doc :: (key, DocTermIds field, DocFeatVals feature)
doc = forall key field feature.
SearchIndex key field feature
-> DocId -> (key, DocTermIds field, DocFeatVals feature)
SI.lookupDocId SearchIndex key field feature
searchIndex DocId
docid ]

      FilterPredicate key -> Bool
predicate ->
        [ (DocId
docid, (key, DocTermIds field, DocFeatVals feature)
doc)
        | DocId
docid <- DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds
        , let doc :: (key, DocTermIds field, DocFeatVals feature)
doc@(key
k,DocTermIds field
_,DocFeatVals feature
_) = forall key field feature.
SearchIndex key field feature
-> DocId -> (key, DocTermIds field, DocFeatVals feature)
SI.lookupDocId SearchIndex key field feature
searchIndex DocId
docid
        , key -> Bool
predicate key
k ]

      FilterBulkPredicate [key] -> [Bool]
bulkPredicate ->
        [ (DocId
docid, (key, DocTermIds field, DocFeatVals feature)
doc)
        | let docids :: [DocId]
docids = DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds
              docinf :: [(key, DocTermIds field, DocFeatVals feature)]
docinf = forall a b. (a -> b) -> [a] -> [b]
map (forall key field feature.
SearchIndex key field feature
-> DocId -> (key, DocTermIds field, DocFeatVals feature)
SI.lookupDocId SearchIndex key field feature
searchIndex) [DocId]
docids
              keep :: [Bool]
keep   = [key] -> [Bool]
bulkPredicate [ key
k | (key
k,DocTermIds field
_,DocFeatVals feature
_) <- [(key, DocTermIds field, DocFeatVals feature)]
docinf ]
        , (DocId
docid, (key, DocTermIds field, DocFeatVals feature)
doc, Bool
True) <- forall a b c. [a] -> [b] -> [c] -> [(a, b, c)]
zip3 [DocId]
docids [(key, DocTermIds field, DocFeatVals feature)]
docinf [Bool]
keep ]


-- Scoring
-------------
--
-- From Bast and Weber:
--   In practice, only a selection of items from these lists can and will be
--   presented to the user, and it is of course crucial that the most relevant
--   completions and hits are selected.
--
--   A standard approach for this task in ad-hoc retrieval is to have a
--   precomputed score for each term-in-document pair, and when a query is
--   being processed, to aggregate these scores for each candidate document,
--   and return documents with the highest such aggregated scores.
--
--   Both INV and HYB can be easily adapted to implement any such scoring and
--   aggregation scheme: store by each term-in-document pair its precomputed
--   score, and when intersecting, aggregate the scores. A decision has to be
--   made on how to reconcile scores from different completions within the
--   same document. We suggest the following: when merging the intersections
--   (which gives the set D' according to Definition 1), compute for each
--   document in D' the maximal score achieved for some completion in T'
--   contained in that document, and compute for each completion in T' the
--   maximal score achieved for a hit from D' achieved for this completion.
--
-- So firstly let us explore what this means and then discuss why it does not
-- work for BM25.
--
-- The "precomputed score for each term-in-document pair" refers to the bm25
-- score for this term in this document (and obviously doesn't have to be
-- precomputed, though that'd be faster).
--
-- So the score for a document d ∈ D' is:
--   maximum of score for d ∈ D ∩ Dt, for any t ∈ T'
--
-- While the score for a completion t ∈ T' is:
--   maximum of score for d ∈ D ∩ Dt
--
-- So for documents we say their score is their best score for any of the
-- completion terms they contain. While for completions we say their score
-- is their best score for any of the documents they appear in.
--
-- For a scoring function like BM25 this appears to be not a good method, both
-- in principle and in practice. Consider what terms get high BM25 scores:
-- very rare ones. So this means we're going to score highly documents that
-- contain the least frequent terms, and completions that are themselves very
-- rare. This is silly.
--
-- Another important thing to note is that if we use this scoring method then
-- we are using the BM25 score in a way that makes no sense. The BM25 score
-- for different documents for the /same/ set of terms are comparable. The
-- score for the same for different document with different terms are simply
-- not comparable.
--
-- This also makes sense if you consider what question the BM25 score is
-- answering: "what is the likelihood that this document is relevant given that
-- I judge these terms to be relevant". However an auto-suggest query is
-- different: "what is the likelihood that this term is relevant given the
-- importance/popularity of the documents (and any preceding terms I've judged
-- to be relevant)". They are both conditional likelihood questions but with
-- different premises.
--
-- More generally, term frequency information simply isn't useful for
-- auto-suggest queries. We don't want results that have the most obscure terms
-- nor the most common terms, not even something in-between. Term frequency
-- just doesn't tell us anything unless we've already judged terms to be
-- relevant, and in an auto-suggest query we've not done that yet.
--
-- What we really need is information on the importance/popularity of the
-- documents. We can actually do something with that.
--
-- So, instead we follow a different strategy. We require that we have
-- importance/popularity info for the documents.
--
-- A first approximation would be to rank result documents by their importance
-- and completion terms by the sum of the importance of the documents each
-- term appears in.
--
-- Score for a document d ∈ D'
--   importance score for d
--
-- Score for a completion t ∈ T'
--   sum of importance score for d ∈ D ∩ Dt
--
-- The only problem with this is that just because a term appears in an
-- important document, doesn't mean that term is /about/ that document, or to
-- put it another way, that term may not be relevant for that document. For
-- example common words like "the" likely appear in all important documents
-- but this doesn't really tell us anything because "the" isn't an important
-- keyword.
--
-- So what we want to do is to weight the document importance by the relevance
-- of the keyword to the document. So now if we have an important document and
-- a relevant keyword for that document then we get a high score, but an
-- irrelevant term like "the" would get a very low weighting and so would not
-- contribute much to the score, even for very important documents.
--
-- The intuition is that we will score term completions by taking the
-- document importance weighted by the relevance of that term to that document
-- and summing over all the documents where the term occurs.
--
-- We define document importance (for the set D') to be the BM25F score for
-- the documents with any preceding terms. So this includes the non-term
-- feature score for the importance/popularity, and also takes account of
-- preceding terms if there were any.
--
-- We define term relevance (for terms in documents) to be the BM25F score for
-- that term in that document as a fraction of the total BM25F score for all
-- terms in the document. Thus the relevance of all terms in a document sums
-- to 1.
--
-- Now we can re-weight the document importance by the term relevance:
--
-- Score for a completion t ∈ T'
--   sum (for d ∈ D ∩ Dt) of ( importance for d * relevance for t in d )
--
-- And now for document result scores. We don't want to just stick with the
-- innate document importance. We want to re-weight by the completion term
-- scores:
--
-- Score for a document d ∈ D'
--   sum (for t ∈ T' ∩ d) (importance score for d * score for completion t)
--
-- Clear as mud?

type DocImportance = Float
type TermRelevanceBreakdown = Map.Map TermId Float

-- | Precompute the document importance and the term relevance breakdown for
-- all the documents. This will be used in scoring the term completions
-- and the result documents. They will all be used and some used many
-- times so it's better to compute up-front and share.
--
-- This is actually the expensive bit (which is why we've filtered already).
--
cacheDocScoringInfo :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
                       SearchEngine doc key field feature ->
                       [TermId] ->
                       [(DocId, (key, DocTermIds field, DocFeatVals feature))] ->
                       [TermId] ->
                       Map.Map DocId (DocImportance, TermRelevanceBreakdown)
cacheDocScoringInfo :: forall field feature doc key.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> [TermId]
-> [(DocId, (key, DocTermIds field, DocFeatVals feature))]
-> [TermId]
-> Map DocId (Float, Map TermId Float)
cacheDocScoringInfo SearchEngine doc key field feature
se [TermId]
completionTerms [(DocId, (key, DocTermIds field, DocFeatVals feature))]
allTermDocInfo [TermId]
precedingTerms =
    forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList
      [ (DocId
docid, (Float
docImportance, Map TermId Float
termRelevances))
      | (DocId
docid, (key
_dockey, DocTermIds field
doctermids, DocFeatVals feature
docfeatvals)) <- [(DocId, (key, DocTermIds field, DocFeatVals feature))]
allTermDocInfo
      , let docImportance :: Float
docImportance  = forall field feature doc key.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> [TermId] -> DocTermIds field -> DocFeatVals feature -> Float
Query.relevanceScore SearchEngine doc key field feature
se [TermId]
precedingTerms
                                                  DocTermIds field
doctermids DocFeatVals feature
docfeatvals
            termRelevances :: Map TermId Float
termRelevances = forall doc key field feature.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> DocTermIds field
-> DocFeatVals feature
-> [TermId]
-> Map TermId Float
relevanceBreakdown SearchEngine doc key field feature
se DocTermIds field
doctermids DocFeatVals feature
docfeatvals
                                                [TermId]
completionTerms
      ]

-- | Calculate the relevance of each of a given set of terms to the given
-- document.
--
-- We define the \"relevance\" of each term in a document to be its
-- term-in-document score as a fraction of the total of the scores for all
-- terms in the document. Thus the sum of all the relevance values in the
-- document is 1.
--
-- Note: we have to calculate the relevance for all terms in the document
-- but we only keep the relevance value for the terms of interest.
--
relevanceBreakdown :: forall doc key field feature.
                      (Ix field, Bounded field, Ix feature, Bounded feature) =>
                      SearchEngine doc key field feature ->
                      DocTermIds field -> DocFeatVals feature ->
                      [TermId] -> TermRelevanceBreakdown
relevanceBreakdown :: forall doc key field feature.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
SearchEngine doc key field feature
-> DocTermIds field
-> DocFeatVals feature
-> [TermId]
-> Map TermId Float
relevanceBreakdown SearchEngine{ Context TermId field feature
bm25Context :: forall doc key field feature.
SearchEngine doc key field feature -> Context TermId field feature
bm25Context :: Context TermId field feature
bm25Context } DocTermIds field
doctermids DocFeatVals feature
docfeatvals [TermId]
ts =
    let -- We'll calculate the bm25 score for each term in this document
        bm25Doc :: Doc TermId field feature
bm25Doc     = forall field feature.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
DocTermIds field -> DocFeatVals feature -> Doc TermId field feature
Query.indexDocToBM25Doc DocTermIds field
doctermids DocFeatVals feature
docfeatvals

        -- Cache the info that depends only on this doc, not the terms
        termScore   :: (TermId -> (field -> Int) -> Float)
        termScore :: TermId -> (field -> Int) -> Float
termScore   = forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> (field -> Int) -> Float
BM25F.scoreTermsBulk Context TermId field feature
bm25Context Doc TermId field feature
bm25Doc

        -- The DocTermIds has the info we need to do bulk scoring, but it's
        -- a sparse representation, so we first convert it to a dense table
        term        :: Int -> TermId
        count       :: Int -> field -> Int
        (!Int
numTerms, Int -> TermId
term, Int -> field -> Int
count) = forall field.
(Ix field, Bounded field) =>
DocTermIds field -> (Int, Int -> TermId, Int -> field -> Int)
DocTermIds.denseTable DocTermIds field
doctermids

        -- We generate the vector of scores for all terms, based on looking up
        -- the termid and the per-field counts in the dense table
        termScores  :: Vec.Vector Float
        !termScores :: Vector Float
termScores = forall a. Unbox a => Int -> (Int -> a) -> Vector a
Vec.generate Int
numTerms forall a b. (a -> b) -> a -> b
$ \Int
i ->
                       TermId -> (field -> Int) -> Float
termScore (Int -> TermId
term Int
i) (\field
f -> Int -> field -> Int
count Int
i field
f)

        -- We keep only the values for the terms we're interested in
        -- and normalise so we get the relevence fraction
        !scoreSum :: Float
scoreSum   = forall a. (Unbox a, Num a) => Vector a -> a
Vec.sum Vector Float
termScores
        !tset :: IntSet
tset       = [Int] -> IntSet
IntSet.fromList (forall a b. (a -> b) -> [a] -> [b]
map forall a. Enum a => a -> Int
fromEnum [TermId]
ts)
     in forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList
      forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => Vector a -> [a]
Vec.toList
      forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
Vec.map    (\(TermId
t,Float
s) -> (TermId
t, Float
sforall a. Fractional a => a -> a -> a
/Float
scoreSum))
      forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
Vec.filter (\(TermId
t,Float
_) -> forall a. Enum a => a -> Int
fromEnum TermId
t Int -> IntSet -> Bool
`IntSet.member` IntSet
tset)
      forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b.
(Unbox a, Unbox b) =>
(Int -> a -> b) -> Vector a -> Vector b
Vec.imap   (\Int
i Float
s   -> (Int -> TermId
term Int
i, Float
s))
      forall a b. (a -> b) -> a -> b
$ Vector Float
termScores


scoreAutosuggestQueryCompletions :: [(TermId, DocIdSet)]
                                 -> Map.Map DocId (Float, Map.Map TermId Float)
                                 -> [(TermId, Float)]
scoreAutosuggestQueryCompletions :: [(TermId, DocIdSet)]
-> Map DocId (Float, Map TermId Float) -> [(TermId, Float)]
scoreAutosuggestQueryCompletions [(TermId, DocIdSet)]
completionTermAndDocSets Map DocId (Float, Map TermId Float)
allTermDocInfo =
    [ (TermId
t, TermId -> DocIdSet -> Float
candidateScore TermId
t DocIdSet
ds_t)
    | (TermId
t, DocIdSet
ds_t) <- [(TermId, DocIdSet)]
completionTermAndDocSets ]
  where
    -- The score for a completion is the sum of the importance of the
    -- documents in which that completion occurs, weighted by the relevance
    -- of the term to each document. For example we can have a very
    -- important document and our completion term is highly relevant to it
    -- or we could have a large number of moderately important documents
    -- that our term is quite relevant to. In either example the completion
    -- term would score highly.
    candidateScore :: TermId -> DocIdSet -> Float
    candidateScore :: TermId -> DocIdSet -> Float
candidateScore TermId
t DocIdSet
ds_t =
      forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [ Float
docImportance forall a. Num a => a -> a -> a
* Float
termRelevance
          | Just (Float
docImportance, Map TermId Float
termRelevances) <-
               forall a b. (a -> b) -> [a] -> [b]
map (forall k a. Ord k => k -> Map k a -> Maybe a
`Map.lookup` Map DocId (Float, Map TermId Float)
allTermDocInfo) (DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds_t)
          , let termRelevance :: Float
termRelevance = Map TermId Float
termRelevances forall k a. Ord k => Map k a -> k -> a
Map.! TermId
t
          ]


scoreAutosuggestQueryResults :: [(TermId, DocIdSet)] ->
                                Map.Map DocId (Float, Map.Map TermId Float) ->
                                [(TermId, Float)] ->
                                [(DocId, Float)]
scoreAutosuggestQueryResults :: [(TermId, DocIdSet)]
-> Map DocId (Float, Map TermId Float)
-> [(TermId, Float)]
-> [(DocId, Float)]
scoreAutosuggestQueryResults [(TermId, DocIdSet)]
completionTermAndDocSets Map DocId (Float, Map TermId Float)
allTermDocInfo
                             [(TermId, Float)]
scoredCandidates =
  forall k a. Map k a -> [(k, a)]
Map.toList forall a b. (a -> b) -> a -> b
$ forall k a. Ord k => (a -> a -> a) -> [(k, a)] -> Map k a
Map.fromListWith forall a. Num a => a -> a -> a
(+)
    [ (DocId
docid, Float
docImportance forall a. Num a => a -> a -> a
* Float
score_t)
    | ((TermId
_, DocIdSet
ds_t), (TermId
_, Float
score_t)) <- forall a b. [a] -> [b] -> [(a, b)]
zip [(TermId, DocIdSet)]
completionTermAndDocSets [(TermId, Float)]
scoredCandidates
    , let docids :: [DocId]
docids  = DocIdSet -> [DocId]
DocIdSet.toList DocIdSet
ds_t
          docinfo :: [Maybe (Float, Map TermId Float)]
docinfo = forall a b. (a -> b) -> [a] -> [b]
map (forall k a. Ord k => k -> Map k a -> Maybe a
`Map.lookup` Map DocId (Float, Map TermId Float)
allTermDocInfo) [DocId]
docids
    , (DocId
docid, Just (Float
docImportance, Map TermId Float
_)) <- forall a b. [a] -> [b] -> [(a, b)]
zip [DocId]
docids [Maybe (Float, Map TermId Float)]
docinfo
    ]