{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE UnicodeSyntax #-}
module Data.FuzzySet.Internal where
import Data.Function ( on )
import Data.FuzzySet.Lens
import Data.FuzzySet.Types
import Data.FuzzySet.Util
import Data.HashMap.Strict ( HashMap, alter, empty, elems, foldrWithKey )
import Data.Maybe ( fromMaybe )
import Data.List ( sortBy )
import Data.Text ( Text )
import Prelude.Unicode
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
import qualified Data.Vector as Vector
getMatch ∷ GetContext → Size → [(Double, Text)]
getMatch GetContext{..} size = match <$$> filtered
where
match α = set ^._exactSet.ix α
filtered = filter ((<) minScore ∘ fst) sorted
μ p = p & _1.~ distance (p ^._2) key
sorted = sortBy (flip compare `on` fst) $
let rs = results GetContext{..} size
in if set ^._useLevenshtein
then take 50 (μ <$> rs)
else rs
results ∷ GetContext → Size → [(Double, Text)]
results GetContext{..} size = ζ <$> HashMap.toList (matches set grams)
where
grams = gramMap key size
normal = norm (elems grams)
ζ (index, score) =
let FuzzySetItem{..} = Vector.unsafeIndex (set ^._items.ix size) index
in (fromIntegral score / (normal × vectorMagnitude), normalizedEntry)
matches ∷ FuzzySet → HashMap Text Int → HashMap Int Int
matches set = foldrWithKey ζ empty
where
ζ gram occ m = foldr (\GramInfo{..} →
alter (pure ∘ (+) (occ × gramCount) ∘ fromMaybe 0) itemIndex)
m (set ^._matchDict.ix gram)
gramMap ∷ Text
→ Size
→ HashMap Text Int
gramMap val size = foldr ζ ε (grams val size)
where
ζ = alter (pure ∘ succ ∘ fromMaybe 0)
grams ∷ Text
→ Size
→ [Text]
grams val size
| size < 2 = error "gram size must be >= 2"
| otherwise = ($ str) ∘ substr size <$> [0 .. Text.length str − size]
where
str = normalized val `enclosedIn` '-'