{-# LANGUAGE RecordWildCards, BangPatterns, ScopedTypeVariables #-}

-- | An implementation of BM25F ranking. See:
--
-- * A quick overview: <http://en.wikipedia.org/wiki/Okapi_BM25>
--
-- * /The Probabilistic Relevance Framework: BM25 and Beyond/
--   <http://www.staff.city.ac.uk/~sbrp622/papers/foundations_bm25_review.pdf>
--
-- * /An Introduction to Information Retrieval/
--   <http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf>
--
module Data.SearchEngine.BM25F (
    -- * The ranking function
    score,
    Context(..),
    FeatureFunction(..),
    Doc(..),
    -- ** Specialised variants
    scoreTermsBulk,

    -- * Explaining the score
    Explanation(..),
    explain,
  ) where

import Data.Ix
import Data.Array.Unboxed

data Context term field feature = Context {
         forall term field feature. Context term field feature -> Int
numDocsTotal     :: !Int,
         forall term field feature.
Context term field feature -> field -> Float
avgFieldLength   :: field -> Float,
         forall term field feature.
Context term field feature -> term -> Int
numDocsWithTerm  :: term -> Int,
         forall term field feature. Context term field feature -> Float
paramK1          :: !Float,
         forall term field feature.
Context term field feature -> field -> Float
paramB           :: field -> Float,
         -- consider minimum length to prevent massive B bonus?
         forall term field feature.
Context term field feature -> field -> Float
fieldWeight      :: field -> Float,
         forall term field feature.
Context term field feature -> feature -> Float
featureWeight    :: feature -> Float,
         forall term field feature.
Context term field feature -> feature -> FeatureFunction
featureFunction  :: feature -> FeatureFunction
       }

data Doc term field feature = Doc {
         forall term field feature. Doc term field feature -> field -> Int
docFieldLength        :: field -> Int,
         forall term field feature.
Doc term field feature -> field -> term -> Int
docFieldTermFrequency :: field -> term -> Int,
         forall term field feature.
Doc term field feature -> feature -> Float
docFeatureValue       :: feature -> Float
       }


-- | The BM25F score for a document for a given set of terms.
--
score :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
         Context term field feature ->
         Doc term field feature -> [term] -> Float
score :: forall field feature term.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
Context term field feature
-> Doc term field feature -> [term] -> Float
score Context term field feature
ctx Doc term field feature
doc [term]
terms =
    forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (forall a b. (a -> b) -> [a] -> [b]
map (forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedTermScore    Context term field feature
ctx Doc term field feature
doc) [term]
terms)
  forall a. Num a => a -> a -> a
+ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (forall a b. (a -> b) -> [a] -> [b]
map (forall feature term field.
(Ix feature, Bounded feature) =>
Context term field feature
-> Doc term field feature -> feature -> Float
weightedNonTermScore Context term field feature
ctx Doc term field feature
doc) [feature]
features)

  where
    features :: [feature]
features = forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)


weightedTermScore :: (Ix field, Bounded field) =>
                     Context term field feature ->
                     Doc term field feature -> term -> Float
weightedTermScore :: forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedTermScore Context term field feature
ctx Doc term field feature
doc term
t =
    forall term field feature.
Context term field feature -> term -> Float
weightIDF Context term field feature
ctx term
t forall a. Num a => a -> a -> a
*     Float
tf'
                     forall a. Fractional a => a -> a -> a
/ (Float
k1 forall a. Num a => a -> a -> a
+ Float
tf')
  where
    tf' :: Float
tf' = forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedDocTermFrequency Context term field feature
ctx Doc term field feature
doc term
t
    k1 :: Float
k1  = forall term field feature. Context term field feature -> Float
paramK1 Context term field feature
ctx


weightIDF :: Context term field feature -> term -> Float
weightIDF :: forall term field feature.
Context term field feature -> term -> Float
weightIDF Context term field feature
ctx term
t =
    forall a. Floating a => a -> a
log ((Float
n forall a. Num a => a -> a -> a
- Float
n_t forall a. Num a => a -> a -> a
+ Float
0.5) forall a. Fractional a => a -> a -> a
/ (Float
n_t forall a. Num a => a -> a -> a
+ Float
0.5))
  where
    n :: Float
n   = forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall term field feature. Context term field feature -> Int
numDocsTotal Context term field feature
ctx)
    n_t :: Float
n_t = forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall term field feature.
Context term field feature -> term -> Int
numDocsWithTerm Context term field feature
ctx term
t)


weightedDocTermFrequency :: (Ix field, Bounded field) =>
                            Context term field feature ->
                            Doc term field feature -> term -> Float
weightedDocTermFrequency :: forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedDocTermFrequency Context term field feature
ctx Doc term field feature
doc term
t =
    forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [ Float
w_f forall a. Num a => a -> a -> a
* Float
tf_f forall a. Fractional a => a -> a -> a
/ Float
_B_f
        | field
field <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)
        , let w_f :: Float
w_f  = forall term field feature.
Context term field feature -> field -> Float
fieldWeight Context term field feature
ctx field
field
              tf_f :: Float
tf_f = forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall term field feature.
Doc term field feature -> field -> term -> Int
docFieldTermFrequency Doc term field feature
doc field
field term
t)
              _B_f :: Float
_B_f = forall term field feature.
Context term field feature
-> Doc term field feature -> field -> Float
lengthNorm Context term field feature
ctx Doc term field feature
doc field
field
        , Bool -> Bool
not (forall a. RealFloat a => a -> Bool
isNaN Float
_B_f)
        ]
    -- When the avgFieldLength is 0 we have a field which is empty for all
    -- documents. Unfortunately it leads to a NaN because the
    -- docFieldTermFrequency will also be 0 so we get 0/0. What we want to
    -- do in this situation is have that field contribute nothing to the
    -- score. The simplest way to achieve that is to skip if _B_f is NaN.
    -- So I think this is fine and not an ugly hack.

lengthNorm :: Context term field feature ->
              Doc term field feature -> field -> Float
lengthNorm :: forall term field feature.
Context term field feature
-> Doc term field feature -> field -> Float
lengthNorm Context term field feature
ctx Doc term field feature
doc field
field =
    (Float
1forall a. Num a => a -> a -> a
-Float
b_f) forall a. Num a => a -> a -> a
+ Float
b_f forall a. Num a => a -> a -> a
* Float
sl_f forall a. Fractional a => a -> a -> a
/ Float
avgsl_f
  where
    b_f :: Float
b_f     = forall term field feature.
Context term field feature -> field -> Float
paramB Context term field feature
ctx field
field
    sl_f :: Float
sl_f    = forall a b. (Integral a, Num b) => a -> b
fromIntegral (forall term field feature. Doc term field feature -> field -> Int
docFieldLength Doc term field feature
doc field
field)
    avgsl_f :: Float
avgsl_f = forall term field feature.
Context term field feature -> field -> Float
avgFieldLength Context term field feature
ctx field
field


weightedNonTermScore :: (Ix feature, Bounded feature) =>
                        Context term field feature ->
                        Doc term field feature -> feature -> Float
weightedNonTermScore :: forall feature term field.
(Ix feature, Bounded feature) =>
Context term field feature
-> Doc term field feature -> feature -> Float
weightedNonTermScore Context term field feature
ctx Doc term field feature
doc feature
feature =
    Float
w_f forall a. Num a => a -> a -> a
* Float -> Float
_V_f Float
f_f
  where
    w_f :: Float
w_f  = forall term field feature.
Context term field feature -> feature -> Float
featureWeight Context term field feature
ctx feature
feature
    _V_f :: Float -> Float
_V_f = FeatureFunction -> Float -> Float
applyFeatureFunction (forall term field feature.
Context term field feature -> feature -> FeatureFunction
featureFunction Context term field feature
ctx feature
feature)
    f_f :: Float
f_f  = forall term field feature.
Doc term field feature -> feature -> Float
docFeatureValue Doc term field feature
doc feature
feature


data FeatureFunction
   = LogarithmicFunction   Float -- ^ @log (\lambda_i + f_i)@
   | RationalFunction      Float -- ^ @f_i / (\lambda_i + f_i)@
   | SigmoidFunction Float Float -- ^ @1 / (\lambda + exp(-(\lambda' * f_i))@

applyFeatureFunction :: FeatureFunction -> (Float -> Float)
applyFeatureFunction :: FeatureFunction -> Float -> Float
applyFeatureFunction (LogarithmicFunction Float
p1) = \Float
fi -> forall a. Floating a => a -> a
log (Float
p1 forall a. Num a => a -> a -> a
+ Float
fi)
applyFeatureFunction (RationalFunction    Float
p1) = \Float
fi -> Float
fi forall a. Fractional a => a -> a -> a
/ (Float
p1 forall a. Num a => a -> a -> a
+ Float
fi)
applyFeatureFunction (SigmoidFunction  Float
p1 Float
p2) = \Float
fi -> Float
1 forall a. Fractional a => a -> a -> a
/ (Float
p1 forall a. Num a => a -> a -> a
+ forall a. Floating a => a -> a
exp (-Float
fi forall a. Num a => a -> a -> a
* Float
p2))


-----------------------------
-- Bulk scoring of many terms
--

-- | Most of the time we want to score several different documents for the same
-- set of terms, but sometimes we want to score one document for many terms
-- and in that case we can save a bit of work by doing it in bulk. It lets us
-- calculate once and share things that depend only on the document, and not
-- the term.
--
-- To take advantage of the sharing you must partially apply and name the
-- per-doc score functon, e.g.
--
-- > let score :: term -> (field -> Int) -> Float
-- >     score = BM25.bulkScorer ctx doc
-- >  in sum [ score t (\f -> counts ! (t, f)) | t <- ts ]
--
scoreTermsBulk :: forall field term feature. (Ix field, Bounded field) =>
                  Context term field feature ->
                  Doc term field feature ->
                  (term -> (field -> Int) -> Float)
scoreTermsBulk :: forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> (field -> Int) -> Float
scoreTermsBulk Context term field feature
ctx Doc term field feature
doc = 
    -- This is just a rearrangement of weightedTermScore and
    -- weightedDocTermFrequency above, with the doc-constant bits hoisted out.

    \term
t field -> Int
tFreq ->
    let !tf' :: Float
tf' = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [ UArray field Float
wforall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!field
f forall a. Num a => a -> a -> a
* Float
tf_f forall a. Fractional a => a -> a -> a
/ UArray field Float
_Bforall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!field
f
                   | field
f <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)
                   , let tf_f :: Float
tf_f = forall a b. (Integral a, Num b) => a -> b
fromIntegral (field -> Int
tFreq field
f)
                         _B_f :: Float
_B_f = UArray field Float
_Bforall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!field
f
                   , Bool -> Bool
not (forall a. RealFloat a => a -> Bool
isNaN Float
_B_f)
                   ]

     in forall term field feature.
Context term field feature -> term -> Float
weightIDF Context term field feature
ctx term
t forall a. Num a => a -> a -> a
*     Float
tf'
                         forall a. Fractional a => a -> a -> a
/ (Float
k1 forall a. Num a => a -> a -> a
+ Float
tf')
  where
    -- So long as the caller does the partial application thing then these
    -- values can all be shared between many calls with different terms.

    !k1 :: Float
k1 = forall term field feature. Context term field feature -> Float
paramK1 Context term field feature
ctx
    w, _B :: UArray field Float
    !w :: UArray field Float
w  = forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [(i, e)] -> a i e
array (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)
                [ (field
field, forall term field feature.
Context term field feature -> field -> Float
fieldWeight Context term field feature
ctx field
field)
                | field
field <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound) ]
    !_B :: UArray field Float
_B = forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [(i, e)] -> a i e
array (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)
                [ (field
field, forall term field feature.
Context term field feature
-> Doc term field feature -> field -> Float
lengthNorm Context term field feature
ctx Doc term field feature
doc field
field)
                | field
field <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound) ]


------------------
-- Explanation
--

-- | A breakdown of the BM25F score, to explain somewhat how it relates to
-- the inputs, and so you can compare the scores of different documents.
--
data Explanation field feature term = Explanation {
       -- | The overall score is the sum of the 'termScores', 'positionScore'
       -- and 'nonTermScore'
       forall field feature term. Explanation field feature term -> Float
overallScore  :: Float,

       -- | There is a score contribution from each query term. This is the
       -- score for the term across all fields in the document (but see
       -- 'termFieldScores').
       forall field feature term.
Explanation field feature term -> [(term, Float)]
termScores    :: [(term, Float)],
{-
       -- | There is a score contribution for positional information. Terms
       -- appearing in the document close together give a bonus.
       positionScore :: [(field, Float)],
-}
       -- | The document can have an inate bonus score independent of the terms
       -- in the query. For example this might be a popularity score.
       forall field feature term.
Explanation field feature term -> [(feature, Float)]
nonTermScores :: [(feature, Float)],

       -- | This does /not/ contribute to the 'overallScore'. It is an
       -- indication of how the 'termScores' relates to per-field scores.
       -- Note however that the term score for all fields is /not/ simply
       -- sum of the per-field scores. The point of the BM25F scoring function
       -- is that a linear combination of per-field scores is wrong, and BM25F
       -- does a more cunning non-linear combination.
       --
       -- However, it is still useful as an indication to see scores for each
       -- field for a term, to see how the compare.
       --
       forall field feature term.
Explanation field feature term -> [(term, [(field, Float)])]
termFieldScores :: [(term, [(field, Float)])]
     }
  deriving Int -> Explanation field feature term -> ShowS
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall field feature term.
(Show term, Show feature, Show field) =>
Int -> Explanation field feature term -> ShowS
forall field feature term.
(Show term, Show feature, Show field) =>
[Explanation field feature term] -> ShowS
forall field feature term.
(Show term, Show feature, Show field) =>
Explanation field feature term -> String
showList :: [Explanation field feature term] -> ShowS
$cshowList :: forall field feature term.
(Show term, Show feature, Show field) =>
[Explanation field feature term] -> ShowS
show :: Explanation field feature term -> String
$cshow :: forall field feature term.
(Show term, Show feature, Show field) =>
Explanation field feature term -> String
showsPrec :: Int -> Explanation field feature term -> ShowS
$cshowsPrec :: forall field feature term.
(Show term, Show feature, Show field) =>
Int -> Explanation field feature term -> ShowS
Show

instance Functor (Explanation field feature) where
  fmap :: forall a b.
(a -> b)
-> Explanation field feature a -> Explanation field feature b
fmap a -> b
f e :: Explanation field feature a
e@Explanation{Float
[(feature, Float)]
[(a, Float)]
[(a, [(field, Float)])]
termFieldScores :: [(a, [(field, Float)])]
nonTermScores :: [(feature, Float)]
termScores :: [(a, Float)]
overallScore :: Float
termFieldScores :: forall field feature term.
Explanation field feature term -> [(term, [(field, Float)])]
nonTermScores :: forall field feature term.
Explanation field feature term -> [(feature, Float)]
termScores :: forall field feature term.
Explanation field feature term -> [(term, Float)]
overallScore :: forall field feature term. Explanation field feature term -> Float
..} =
    Explanation field feature a
e {
      termScores :: [(b, Float)]
termScores      = [ (a -> b
f a
t, Float
s)  | (a
t, Float
s)  <- [(a, Float)]
termScores ],
      termFieldScores :: [(b, [(field, Float)])]
termFieldScores = [ (a -> b
f a
t, [(field, Float)]
fs) | (a
t, [(field, Float)]
fs) <- [(a, [(field, Float)])]
termFieldScores ]
    }

explain :: (Ix field, Bounded field, Ix feature, Bounded feature) =>
           Context term field feature ->
           Doc term field feature -> [term] -> Explanation field feature term
explain :: forall field feature term.
(Ix field, Bounded field, Ix feature, Bounded feature) =>
Context term field feature
-> Doc term field feature
-> [term]
-> Explanation field feature term
explain Context term field feature
ctx Doc term field feature
doc [term]
ts =
    Explanation {Float
[(feature, Float)]
[(term, Float)]
[(term, [(field, Float)])]
termFieldScores :: [(term, [(field, Float)])]
nonTermScores :: [(feature, Float)]
termScores :: [(term, Float)]
overallScore :: Float
termFieldScores :: [(term, [(field, Float)])]
nonTermScores :: [(feature, Float)]
termScores :: [(term, Float)]
overallScore :: Float
..}
  where
    overallScore :: Float
overallScore  = forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd [(term, Float)]
termScores)
--                  + sum (map snd positionScore)
                  forall a. Num a => a -> a -> a
+ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd [(feature, Float)]
nonTermScores)
    termScores :: [(term, Float)]
termScores    = [ (term
t, forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedTermScore Context term field feature
ctx Doc term field feature
doc term
t) | term
t <- [term]
ts ]
--    positionScore = [ (f, 0) | f <- range (minBound, maxBound) ]
    nonTermScores :: [(feature, Float)]
nonTermScores = [ (feature
feature, forall feature term field.
(Ix feature, Bounded feature) =>
Context term field feature
-> Doc term field feature -> feature -> Float
weightedNonTermScore Context term field feature
ctx Doc term field feature
doc feature
feature)
                    | feature
feature <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound) ]

    termFieldScores :: [(term, [(field, Float)])]
termFieldScores =
      [ (term
t, [(field, Float)]
fieldScores)
      | term
t <- [term]
ts
      , let fieldScores :: [(field, Float)]
fieldScores =
              [ (field
f, forall field term feature.
(Ix field, Bounded field) =>
Context term field feature
-> Doc term field feature -> term -> Float
weightedTermScore Context term field feature
ctx' Doc term field feature
doc term
t)
              | field
f <- forall a. Ix a => (a, a) -> [a]
range (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound)
              , let ctx' :: Context term field feature
ctx' = Context term field feature
ctx { fieldWeight :: field -> Float
fieldWeight = forall {a}. (Ix a, Bounded a) => a -> field -> Float
fieldWeightOnly field
f }
              ]
      ]
    fieldWeightOnly :: a -> field -> Float
fieldWeightOnly a
f field
f' | forall {a} {a}.
(Ix a, Ix a, Bounded a, Bounded a) =>
a -> a -> Bool
sameField a
f field
f' = forall term field feature.
Context term field feature -> field -> Float
fieldWeight Context term field feature
ctx field
f'
                         | Bool
otherwise      = Float
0

    sameField :: a -> a -> Bool
sameField a
f a
f' = forall a. Ix a => (a, a) -> a -> Int
index (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound) a
f
                  forall a. Eq a => a -> a -> Bool
== forall a. Ix a => (a, a) -> a -> Int
index (forall a. Bounded a => a
minBound, forall a. Bounded a => a
maxBound) a
f'