{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE UnicodeSyntax #-} module Data.FuzzySet.Internal where import Data.Function ( on ) import Data.FuzzySet.Lens import Data.FuzzySet.Types import Data.FuzzySet.Util import Data.HashMap.Strict ( HashMap, alter, empty, elems, foldrWithKey ) import Data.Maybe ( fromMaybe ) import Data.List ( sortBy ) import Data.Text ( Text ) import Prelude.Unicode import qualified Data.HashMap.Strict as HashMap import qualified Data.Text as Text import qualified Data.Vector as Vector getMatch ∷ GetContext → Size → [(Double, Text)] getMatch GetContext{..} size = match <$$> filtered where match α = set ^._exactSet.ix α filtered = filter ((<) minScore ∘ fst) sorted μ p = p & _1.~ distance (p ^._2) key sorted = sortBy (flip compare `on` fst) $ let rs = results GetContext{..} size in if set ^._useLevenshtein then take 50 (μ <$> rs) else rs results ∷ GetContext → Size → [(Double, Text)] results GetContext{..} size = ζ <$> HashMap.toList (matches set grams) where grams = gramMap key size normal = norm (elems grams) ζ (index, score) = let FuzzySetItem{..} = Vector.unsafeIndex (set ^._items.ix size) index in (fromIntegral score / (normal × vectorMagnitude), normalizedEntry) matches ∷ FuzzySet → HashMap Text Int → HashMap Int Int matches set = foldrWithKey ζ empty where ζ gram occ m = foldr (\GramInfo{..} → alter (pure ∘ (+) (occ × gramCount) ∘ fromMaybe 0) itemIndex) m (set ^._matchDict.ix gram) -- | Normalize the input string, call 'grams' on the normalized input, and then -- translate the result to a 'HashMap' with the /n/-grams as keys and 'Int' -- values corresponding to the number of occurences of the key in the -- generated gram list. -- -- >>> gramMap "xxxx" 2 -- fromList [("-x",1), ("xx",3), ("x-",1)] -- -- >>> Data.HashMap.Strict.lookup "nts" (gramMap "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3) -- Just 8 gramMap ∷ Text -- ^ An input string → Size -- ^ The gram size /n/, which must be at least /2/ → HashMap Text Int -- ^ A mapping from /n/-gram keys to the number of occurrences of the -- key in the list returned by 'grams' (i.e., the list of all -- /n/-length substrings of the input enclosed in hyphens). gramMap val size = foldr ζ ε (grams val size) where ζ = alter (pure ∘ succ ∘ fromMaybe 0) -- | Break apart the normalized input string into a list of /n/-grams. For -- instance, the string "Destroido Corp." is first normalized into the -- form "destroido corp", and then enclosed in hyphens, so that it becomes -- "-destroido corp-". The /3/-grams generated from this normalized string are -- -- > "-de", "des", "est", "str", "tro", "roi", "oid", "ido", "do ", "o c", " co", "cor", "orp", "rp-" -- -- Given a normalized string of length /s/, we take all substrings of length -- /n/, letting the offset range from \(0 \text{ to } s + 2 − n\). The number -- of /n/-grams for a normalized string of length /s/ is thus -- \(s + 2 − n + 1 = s − n + 3\), where \(0 < n < s − 2\). grams ∷ Text -- ^ An input string → Size -- ^ The variable /n/, which must be at least /2/ → [Text] -- ^ A /k/-length list of grams of size /n/, -- with \(k = s − n + 3\) grams val size | size < 2 = error "gram size must be >= 2" | otherwise = ($ str) ∘ substr size <$> [0 .. Text.length str − size] where str = normalized val `enclosedIn` '-'