{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE RecordWildCards #-} module Data.FuzzySet.Internal ( FuzzySet(..) , FuzzySetItem(..) , GramInfo(..) , FuzzyMatch , grams , gramVector , matches , getMatches , add_ , addMany_ , normalized , norm , distance ) where import Control.Monad.State (MonadState, get, modify) import Data.Bifunctor (second) import Data.Char (isAlphaNum, isSpace) import Data.Foldable (traverse_) import Data.Function ((&)) import Data.FuzzySet.Utils (enclosedIn, substr, (<$$>)) import Data.HashMap.Strict (HashMap, elems, foldrWithKey, insert, insertWith, lookup, lookupDefault) import qualified Data.HashMap.Strict as HashMap import Data.List (sortBy) import Data.Maybe (fromMaybe) import Data.Ord (Down(..), comparing) import Data.Text (Text) import qualified Data.Text as Text import Data.Text.Metrics (levenshteinNorm) import Data.Vector (Vector, (!?)) import qualified Data.Vector as Vector import Prelude hiding (lookup) data FuzzySetItem = FuzzySetItem { vectorMagnitude :: !Double , normalizedEntry :: !Text } deriving (Eq, Show) data GramInfo = GramInfo { itemIndex :: !Int , gramCount :: !Int } deriving (Eq, Show) -- | Main fuzzy string set data type. data FuzzySet = FuzzySet { exactSet :: !(HashMap Text Text) , matchDict :: !(HashMap Text [GramInfo]) , items :: !(HashMap Int (Vector FuzzySetItem)) , gramSizeLower :: !Int -- ^ Lower bound on gram sizes to use (inclusive) , gramSizeUpper :: !Int -- ^ Upper bound on gram sizes to use (inclusive) , useLevenshtein :: !Bool -- ^ Whether or not to use the Levenshtein distance to determine the score } deriving (Eq, Show) -- | An individual result when looking up a string in the set, consisting of -- -- * a similarity score in the range \([0, 1]\), and -- * the matching string. type FuzzyMatch = (Double, Text) matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int matches FuzzySet{..} = foldrWithKey go mempty where go gram count hashMap = HashMap.lookup gram matchDict & maybe hashMap (foldr (insert_ count) hashMap) insert_ count GramInfo{..} = insertWith (+) itemIndex (gramCount * count) getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch] getMatches FuzzySet{..} str minScore gramSize = results & filter ((>= minScore) . fst) & fmap (second (flip (lookupDefault mempty) exactSet)) where results = let sorted = matches FuzzySet{..} queryVector & foldrWithKey go [] & sortBy (comparing (Down . fst)) in if useLevenshtein then sorted & take 50 & fmap (\(_, entry) -> (distance str entry, entry)) & sortBy (comparing (Down . fst)) else sorted queryMagnitude = norm (elems queryVector) queryVector = gramVector str gramSize itemsVector = fromMaybe mempty (gramSize `lookup` items) go index score list = case itemsVector !? index of Nothing -> list Just FuzzySetItem{..} -> ( fromIntegral score / (queryMagnitude * vectorMagnitude) , normalizedEntry ) : list add_ :: (MonadState FuzzySet m) => Text -> m Bool add_ str = do FuzzySet{..} <- get if key `elem` exactSet then -- An entry already exists pure False else do traverse_ (modify . updateDict) [gramSizeLower .. gramSizeUpper] modify (updateExactSet key str) pure True where key = Text.toLower str updateDict size_ FuzzySet{..} = let itemVector = items & HashMap.lookup size_ & fromMaybe Vector.empty grams_ = gramVector key size_ insertInfo gram count = let info = GramInfo (Vector.length itemVector) count in HashMap.insertWith (<>) gram [info] item = FuzzySetItem (elems grams_ & norm) key in FuzzySet { items = insert size_ (itemVector `Vector.snoc` item) items , matchDict = foldrWithKey insertInfo matchDict grams_ , .. } addMany_ :: (MonadState FuzzySet m) => [Text] -> m [Text] addMany_ = concat <$$> traverse addOne where addOne str = do p <- add_ str pure [str | p] -- | Generate a list of /n/-grams (character substrings) from the normalized -- input and then translate this into a dictionary with the /n/-grams as keys -- mapping to the number of occurences of the substring in the list. -- -- >>> gramVector "xxxx" 2 -- fromList [("-x",1), ("xx",3), ("x-",1)] -- -- The substring @"xx"@ appears three times in the normalized string: -- -- >>> grams "xxxx" 2 -- ["-x","xx","xx","xx","x-"] -- -- >>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3) -- Just 8 gramVector :: Text -> Int -> HashMap Text Int gramVector = foldr insert_ HashMap.empty <$$> grams where insert_ key = HashMap.insertWith (+) key 1 -- | Break apart the input string into a list of /n/-grams. The string is first -- 'Data.FuzzySet.Util.normalized' and enclosed in hyphens. We then take all -- substrings of length /n/, letting the offset range from \(0 \text{ to } s + 2 − n\), -- where /s/ is the length of the normalized input. -- -- /Example:/ -- The string @"Destroido Corp."@ is first normalized to @"destroido corp"@, -- and then enclosed in hyphens, so that it becomes @"-destroido corp-"@. The -- trigrams generated from this normalized string are: -- -- > [ "-de" -- > , "des" -- > , "est" -- > , "str" -- > , "tro" -- > , "roi" -- > , "oid" -- > , "ido" -- > , "do " -- > , "o c" -- > , " co" -- > , "cor" -- > , "orp" -- > , "rp-" -- > ] grams :: Text -> Int -> [Text] grams input size_ | size_ < 2 = error "gram size_ must be at least 2" | otherwise = flip (substr size_) normalizedInput <$> offsets where normalizedInput = normalized input `enclosedIn` '-' offsets = [0 .. Text.length normalizedInput - size_] -- | Normalize the input by -- -- * removing non-word characters, except for spaces and commas; and -- * converting alphabetic characters to lowercase. -- normalized :: Text -> Text normalized = Text.filter word . Text.toLower where word char | isAlphaNum char = True | isSpace char = True | char == ',' = True | otherwise = False -- | Return the euclidean norm, or /magnitude/, of the input list interpreted -- as a vector. -- -- That is, -- -- \( \quad \sqrt{ \sum_{i=0}^n a_i^2 } \) -- -- for the input -- -- \( \quad \langle a_0, a_1, \dots, a_n \rangle \) -- -- where \( a_i \) is the element at position /i/ in the input list. norm :: [Int] -> Double norm = sqrt . fromIntegral . sum . fmap (^ (2 :: Int)) -- | Return the normalized Levenshtein distance between the two strings. -- -- See . distance :: Text -> Text -> Double distance = realToFrac <$$> levenshteinNorm updateExactSet :: Text -> Text -> FuzzySet -> FuzzySet updateExactSet key str FuzzySet{..} = FuzzySet { exactSet = insert key str exactSet , .. }