{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}

module Data.FuzzySet.Internal
  ( FuzzySet(..)
  , FuzzySetItem(..)
  , GramInfo(..)
  , FuzzyMatch
  , grams
  , gramVector
  , matches
  , getMatches
  , add_
  , addMany_
  , normalized
  , norm
  , distance
  ) where

import Control.Monad.State (MonadState, get, modify)
import Data.Bifunctor (second)
import Data.Char (isAlphaNum, isSpace)
import Data.Foldable (traverse_)
import Data.Function ((&))
import Data.FuzzySet.Utils (enclosedIn, substr, (<$$>))
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, insert, insertWith, lookup, lookupDefault)
import qualified Data.HashMap.Strict as HashMap
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Metrics (levenshteinNorm)
import Data.Vector (Vector, (!?))
import qualified Data.Vector as Vector
import Prelude hiding (lookup)

data FuzzySetItem = FuzzySetItem
  { FuzzySetItem -> Double
vectorMagnitude :: !Double
  , FuzzySetItem -> Text
normalizedEntry :: !Text
  } deriving (FuzzySetItem -> FuzzySetItem -> Bool
(FuzzySetItem -> FuzzySetItem -> Bool)
-> (FuzzySetItem -> FuzzySetItem -> Bool) -> Eq FuzzySetItem
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: FuzzySetItem -> FuzzySetItem -> Bool
== :: FuzzySetItem -> FuzzySetItem -> Bool
$c/= :: FuzzySetItem -> FuzzySetItem -> Bool
/= :: FuzzySetItem -> FuzzySetItem -> Bool
Eq, Int -> FuzzySetItem -> ShowS
[FuzzySetItem] -> ShowS
FuzzySetItem -> String
(Int -> FuzzySetItem -> ShowS)
-> (FuzzySetItem -> String)
-> ([FuzzySetItem] -> ShowS)
-> Show FuzzySetItem
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> FuzzySetItem -> ShowS
showsPrec :: Int -> FuzzySetItem -> ShowS
$cshow :: FuzzySetItem -> String
show :: FuzzySetItem -> String
$cshowList :: [FuzzySetItem] -> ShowS
showList :: [FuzzySetItem] -> ShowS
Show)

data GramInfo = GramInfo
  { GramInfo -> Int
itemIndex :: !Int
  , GramInfo -> Int
gramCount :: !Int
  } deriving (GramInfo -> GramInfo -> Bool
(GramInfo -> GramInfo -> Bool)
-> (GramInfo -> GramInfo -> Bool) -> Eq GramInfo
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: GramInfo -> GramInfo -> Bool
== :: GramInfo -> GramInfo -> Bool
$c/= :: GramInfo -> GramInfo -> Bool
/= :: GramInfo -> GramInfo -> Bool
Eq, Int -> GramInfo -> ShowS
[GramInfo] -> ShowS
GramInfo -> String
(Int -> GramInfo -> ShowS)
-> (GramInfo -> String) -> ([GramInfo] -> ShowS) -> Show GramInfo
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GramInfo -> ShowS
showsPrec :: Int -> GramInfo -> ShowS
$cshow :: GramInfo -> String
show :: GramInfo -> String
$cshowList :: [GramInfo] -> ShowS
showList :: [GramInfo] -> ShowS
Show)

-- | Main fuzzy string set data type.
data FuzzySet = FuzzySet
  { FuzzySet -> HashMap Text Text
exactSet       :: !(HashMap Text Text)
  , FuzzySet -> HashMap Text [GramInfo]
matchDict      :: !(HashMap Text [GramInfo])
  , FuzzySet -> HashMap Int (Vector FuzzySetItem)
items          :: !(HashMap Int (Vector FuzzySetItem))
  , FuzzySet -> Int
gramSizeLower  :: !Int
  -- ^ Lower bound on gram sizes to use (inclusive)
  , FuzzySet -> Int
gramSizeUpper  :: !Int
  -- ^ Upper bound on gram sizes to use (inclusive)
  , FuzzySet -> Bool
useLevenshtein :: !Bool
  -- ^ Whether or not to use the Levenshtein distance to determine the score
  } deriving (FuzzySet -> FuzzySet -> Bool
(FuzzySet -> FuzzySet -> Bool)
-> (FuzzySet -> FuzzySet -> Bool) -> Eq FuzzySet
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: FuzzySet -> FuzzySet -> Bool
== :: FuzzySet -> FuzzySet -> Bool
$c/= :: FuzzySet -> FuzzySet -> Bool
/= :: FuzzySet -> FuzzySet -> Bool
Eq, Int -> FuzzySet -> ShowS
[FuzzySet] -> ShowS
FuzzySet -> String
(Int -> FuzzySet -> ShowS)
-> (FuzzySet -> String) -> ([FuzzySet] -> ShowS) -> Show FuzzySet
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> FuzzySet -> ShowS
showsPrec :: Int -> FuzzySet -> ShowS
$cshow :: FuzzySet -> String
show :: FuzzySet -> String
$cshowList :: [FuzzySet] -> ShowS
showList :: [FuzzySet] -> ShowS
Show)

-- | An individual result when looking up a string in the set, consisting of
--
--     * a similarity score in the range \([0, 1]\), and
--     * the matching string.
type FuzzyMatch = (Double, Text)

matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: FuzzySet -> HashMap Text Text
matchDict :: FuzzySet -> HashMap Text [GramInfo]
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
gramSizeLower :: FuzzySet -> Int
gramSizeUpper :: FuzzySet -> Int
useLevenshtein :: FuzzySet -> Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} = (Text -> Int -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> HashMap Text Int -> HashMap Int Int
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
go HashMap Int Int
forall a. Monoid a => a
mempty
  where
    go :: Text -> Int -> HashMap Int Int -> HashMap Int Int
go Text
gram Int
count HashMap Int Int
hashMap =
      Text -> HashMap Text [GramInfo] -> Maybe [GramInfo]
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Text
gram HashMap Text [GramInfo]
matchDict
        Maybe [GramInfo]
-> (Maybe [GramInfo] -> HashMap Int Int) -> HashMap Int Int
forall a b. a -> (a -> b) -> b
& HashMap Int Int
-> ([GramInfo] -> HashMap Int Int)
-> Maybe [GramInfo]
-> HashMap Int Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
hashMap ((GramInfo -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> [GramInfo] -> HashMap Int Int
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Int -> GramInfo -> HashMap Int Int -> HashMap Int Int
insert_ Int
count) HashMap Int Int
hashMap)
    insert_ :: Int -> GramInfo -> HashMap Int Int -> HashMap Int Int
insert_ Int
count GramInfo{Int
itemIndex :: GramInfo -> Int
gramCount :: GramInfo -> Int
itemIndex :: Int
gramCount :: Int
..} =
      (Int -> Int -> Int)
-> Int -> Int -> HashMap Int Int -> HashMap Int Int
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
insertWith Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) Int
itemIndex (Int
gramCount Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
count)

getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
getMatches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: FuzzySet -> HashMap Text Text
matchDict :: FuzzySet -> HashMap Text [GramInfo]
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
gramSizeLower :: FuzzySet -> Int
gramSizeUpper :: FuzzySet -> Int
useLevenshtein :: FuzzySet -> Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} Text
str Double
minScore Int
gramSize =
  [FuzzyMatch]
results
    [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (FuzzyMatch -> Bool) -> [FuzzyMatch] -> [FuzzyMatch]
forall a. (a -> Bool) -> [a] -> [a]
filter ((Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
minScore) (Double -> Bool) -> (FuzzyMatch -> Double) -> FuzzyMatch -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzyMatch -> Double
forall a b. (a, b) -> a
fst)
    [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (FuzzyMatch -> FuzzyMatch) -> [FuzzyMatch] -> [FuzzyMatch]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Text -> Text) -> FuzzyMatch -> FuzzyMatch
forall b c a. (b -> c) -> (a, b) -> (a, c)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second ((Text -> HashMap Text Text -> Text)
-> HashMap Text Text -> Text -> Text
forall a b c. (a -> b -> c) -> b -> a -> c
flip (Text -> Text -> HashMap Text Text -> Text
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault Text
forall a. Monoid a => a
mempty) HashMap Text Text
exactSet))
  where
    results :: [FuzzyMatch]
results =
      let sorted :: [FuzzyMatch]
sorted =
            FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} HashMap Text Int
queryVector
              HashMap Int Int
-> (HashMap Int Int -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (Int -> Int -> [FuzzyMatch] -> [FuzzyMatch])
-> [FuzzyMatch] -> HashMap Int Int -> [FuzzyMatch]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Int -> Int -> [FuzzyMatch] -> [FuzzyMatch]
forall {a}. Integral a => Int -> a -> [FuzzyMatch] -> [FuzzyMatch]
go []
              [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (FuzzyMatch -> FuzzyMatch -> Ordering)
-> [FuzzyMatch] -> [FuzzyMatch]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy ((FuzzyMatch -> Down Double) -> FuzzyMatch -> FuzzyMatch -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> (FuzzyMatch -> Double) -> FuzzyMatch -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzyMatch -> Double
forall a b. (a, b) -> a
fst))
       in if Bool
useLevenshtein
            then
              [FuzzyMatch]
sorted
                [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& Int -> [FuzzyMatch] -> [FuzzyMatch]
forall a. Int -> [a] -> [a]
take Int
50
                [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (FuzzyMatch -> FuzzyMatch) -> [FuzzyMatch] -> [FuzzyMatch]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\(Double
_, Text
entry) -> (Text -> Text -> Double
distance Text
str Text
entry, Text
entry))
                [FuzzyMatch] -> ([FuzzyMatch] -> [FuzzyMatch]) -> [FuzzyMatch]
forall a b. a -> (a -> b) -> b
& (FuzzyMatch -> FuzzyMatch -> Ordering)
-> [FuzzyMatch] -> [FuzzyMatch]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy ((FuzzyMatch -> Down Double) -> FuzzyMatch -> FuzzyMatch -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> (FuzzyMatch -> Double) -> FuzzyMatch -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzyMatch -> Double
forall a b. (a, b) -> a
fst))
            else [FuzzyMatch]
sorted

    queryMagnitude :: Double
queryMagnitude = [Int] -> Double
norm (HashMap Text Int -> [Int]
forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
    queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
str Int
gramSize
    itemsVector :: Vector FuzzySetItem
itemsVector = Vector FuzzySetItem
-> Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem
forall a. a -> Maybe a -> a
fromMaybe Vector FuzzySetItem
forall a. Monoid a => a
mempty (Int
gramSize Int
-> HashMap Int (Vector FuzzySetItem) -> Maybe (Vector FuzzySetItem)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`lookup` HashMap Int (Vector FuzzySetItem)
items)

    go :: Int -> a -> [FuzzyMatch] -> [FuzzyMatch]
go Int
index a
score [FuzzyMatch]
list =
      case Vector FuzzySetItem
itemsVector Vector FuzzySetItem -> Int -> Maybe FuzzySetItem
forall a. Vector a -> Int -> Maybe a
!? Int
index of
        Maybe FuzzySetItem
Nothing ->
          [FuzzyMatch]
list
        Just FuzzySetItem{Double
Text
vectorMagnitude :: FuzzySetItem -> Double
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: Double
normalizedEntry :: Text
..} ->
          ( a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
          , Text
normalizedEntry
          ) FuzzyMatch -> [FuzzyMatch] -> [FuzzyMatch]
forall a. a -> [a] -> [a]
: [FuzzyMatch]
list

add_ :: (MonadState FuzzySet m) => Text -> m Bool
add_ :: forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool
add_ Text
str = do
  FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: FuzzySet -> HashMap Text Text
matchDict :: FuzzySet -> HashMap Text [GramInfo]
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
gramSizeLower :: FuzzySet -> Int
gramSizeUpper :: FuzzySet -> Int
useLevenshtein :: FuzzySet -> Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} <- m FuzzySet
forall s (m :: * -> *). MonadState s m => m s
get
  if Text
key Text -> HashMap Text Text -> Bool
forall a. Eq a => a -> HashMap Text a -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` HashMap Text Text
exactSet
    then -- An entry already exists
      Bool -> m Bool
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False
    else do
      (Int -> m ()) -> [Int] -> m ()
forall (t :: * -> *) (f :: * -> *) a b.
(Foldable t, Applicative f) =>
(a -> f b) -> t a -> f ()
traverse_ ((FuzzySet -> FuzzySet) -> m ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify ((FuzzySet -> FuzzySet) -> m ())
-> (Int -> FuzzySet -> FuzzySet) -> Int -> m ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> FuzzySet -> FuzzySet
updateDict) [Int
gramSizeLower .. Int
gramSizeUpper]
      (FuzzySet -> FuzzySet) -> m ()
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (Text -> Text -> FuzzySet -> FuzzySet
updateExactSet Text
key Text
str)
      Bool -> m Bool
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
True
  where
    key :: Text
key = Text -> Text
Text.toLower Text
str
    updateDict :: Int -> FuzzySet -> FuzzySet
updateDict Int
size_ FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: FuzzySet -> HashMap Text Text
matchDict :: FuzzySet -> HashMap Text [GramInfo]
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
gramSizeLower :: FuzzySet -> Int
gramSizeUpper :: FuzzySet -> Int
useLevenshtein :: FuzzySet -> Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} =
      let
        itemVector :: Vector FuzzySetItem
itemVector =
          HashMap Int (Vector FuzzySetItem)
items
            HashMap Int (Vector FuzzySetItem)
-> (HashMap Int (Vector FuzzySetItem)
    -> Maybe (Vector FuzzySetItem))
-> Maybe (Vector FuzzySetItem)
forall a b. a -> (a -> b) -> b
& Int
-> HashMap Int (Vector FuzzySetItem) -> Maybe (Vector FuzzySetItem)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Int
size_
            Maybe (Vector FuzzySetItem)
-> (Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem)
-> Vector FuzzySetItem
forall a b. a -> (a -> b) -> b
& Vector FuzzySetItem
-> Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem
forall a. a -> Maybe a -> a
fromMaybe Vector FuzzySetItem
forall a. Vector a
Vector.empty
        grams_ :: HashMap Text Int
grams_ =
          Text -> Int -> HashMap Text Int
gramVector Text
key Int
size_
        insertInfo :: k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
insertInfo k
gram Int
count =
          let info :: GramInfo
info = Int -> Int -> GramInfo
GramInfo (Vector FuzzySetItem -> Int
forall a. Vector a -> Int
Vector.length Vector FuzzySetItem
itemVector) Int
count
           in ([GramInfo] -> [GramInfo] -> [GramInfo])
-> k -> [GramInfo] -> HashMap k [GramInfo] -> HashMap k [GramInfo]
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HashMap.insertWith [GramInfo] -> [GramInfo] -> [GramInfo]
forall a. Semigroup a => a -> a -> a
(<>) k
gram [GramInfo
info]
        item :: FuzzySetItem
item =
          Double -> Text -> FuzzySetItem
FuzzySetItem (HashMap Text Int -> [Int]
forall k v. HashMap k v -> [v]
elems HashMap Text Int
grams_ [Int] -> ([Int] -> Double) -> Double
forall a b. a -> (a -> b) -> b
& [Int] -> Double
norm) Text
key
       in
        FuzzySet
          { items :: HashMap Int (Vector FuzzySetItem)
items = Int
-> Vector FuzzySetItem
-> HashMap Int (Vector FuzzySetItem)
-> HashMap Int (Vector FuzzySetItem)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Int
size_ (Vector FuzzySetItem
itemVector Vector FuzzySetItem -> FuzzySetItem -> Vector FuzzySetItem
forall a. Vector a -> a -> Vector a
`Vector.snoc` FuzzySetItem
item) HashMap Int (Vector FuzzySetItem)
items
          , matchDict :: HashMap Text [GramInfo]
matchDict = (Text -> Int -> HashMap Text [GramInfo] -> HashMap Text [GramInfo])
-> HashMap Text [GramInfo]
-> HashMap Text Int
-> HashMap Text [GramInfo]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Text [GramInfo] -> HashMap Text [GramInfo]
forall {k}.
Hashable k =>
k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
insertInfo HashMap Text [GramInfo]
matchDict HashMap Text Int
grams_
          , Bool
Int
HashMap Text Text
exactSet :: HashMap Text Text
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
exactSet :: HashMap Text Text
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..
          }

addMany_ :: (MonadState FuzzySet m) => [Text] -> m [Text]
addMany_ :: forall (m :: * -> *). MonadState FuzzySet m => [Text] -> m [Text]
addMany_ = [[Text]] -> [Text]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat ([[Text]] -> [Text])
-> ([Text] -> m [[Text]]) -> [Text] -> m [Text]
forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> (Text -> m [Text]) -> [Text] -> m [[Text]]
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> [a] -> f [b]
traverse Text -> m [Text]
forall {m :: * -> *}. MonadState FuzzySet m => Text -> m [Text]
addOne
  where
    addOne :: Text -> m [Text]
addOne Text
str = do
      Bool
p <- Text -> m Bool
forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool
add_ Text
str
      [Text] -> m [Text]
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Text
str | Bool
p]

-- | Generate a list of /n/-grams (character substrings) from the normalized
--   input and then translate this into a dictionary with the /n/-grams as keys
--   mapping to the number of occurences of the substring in the list.
--
-- >>> gramVector "xxxx" 2
-- fromList [("-x",1), ("xx",3), ("x-",1)]
--
-- The substring @"xx"@ appears three times in the normalized string:
--
-- >>> grams "xxxx" 2
-- ["-x","xx","xx","xx","x-"]
--
-- >>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
-- Just 8
gramVector :: Text -> Int -> HashMap Text Int
gramVector :: Text -> Int -> HashMap Text Int
gramVector = (Text -> HashMap Text Int -> HashMap Text Int)
-> HashMap Text Int -> [Text] -> HashMap Text Int
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> HashMap Text Int -> HashMap Text Int
forall {k} {v}.
(Hashable k, Num v) =>
k -> HashMap k v -> HashMap k v
insert_ HashMap Text Int
forall k v. HashMap k v
HashMap.empty ([Text] -> HashMap Text Int)
-> (Text -> Int -> [Text]) -> Text -> Int -> HashMap Text Int
forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> Int -> [Text]
grams
  where
    insert_ :: k -> HashMap k v -> HashMap k v
insert_ k
key = (v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HashMap.insertWith v -> v -> v
forall a. Num a => a -> a -> a
(+) k
key v
1

-- | Break apart the input string into a list of /n/-grams. The string is first
--   'Data.FuzzySet.Util.normalized' and enclosed in hyphens. We then take all
--   substrings of length /n/, letting the offset range from \(0 \text{ to } s + 2 − n\),
--   where /s/ is the length of the normalized input.
--
-- /Example:/
-- The string @"Destroido Corp."@ is first normalized to @"destroido corp"@,
-- and then enclosed in hyphens, so that it becomes @"-destroido corp-"@. The
-- trigrams generated from this normalized string are:
--
-- > [ "-de"
-- > , "des"
-- > , "est"
-- > , "str"
-- > , "tro"
-- > , "roi"
-- > , "oid"
-- > , "ido"
-- > , "do "
-- > , "o c"
-- > , " co"
-- > , "cor"
-- > , "orp"
-- > , "rp-"
-- > ]
grams :: Text -> Int -> [Text]
grams :: Text -> Int -> [Text]
grams Text
input Int
size_
  | Int
size_ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2 = String -> [Text]
forall a. HasCallStack => String -> a
error String
"gram size_ must be at least 2"
  | Bool
otherwise = (Int -> Text -> Text) -> Text -> Int -> Text
forall a b c. (a -> b -> c) -> b -> a -> c
flip (Int -> Int -> Text -> Text
substr Int
size_) Text
normalizedInput (Int -> Text) -> [Int] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
  where
    normalizedInput :: Text
normalizedInput = Text -> Text
normalized Text
input Text -> Char -> Text
`enclosedIn` Char
'-'
    offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
normalizedInput Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
size_]

-- | Normalize the input by
--
--   * removing non-word characters, except for spaces and commas; and
--   * converting alphabetic characters to lowercase.
--
normalized :: Text -> Text
normalized :: Text -> Text
normalized = (Char -> Bool) -> Text -> Text
Text.filter Char -> Bool
word (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
Text.toLower
  where
    word :: Char -> Bool
word Char
char
      | Char -> Bool
isAlphaNum Char
char = Bool
True
      | Char -> Bool
isSpace Char
char    = Bool
True
      | Char
char Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
','     = Bool
True
      | Bool
otherwise       = Bool
False

-- | Return the euclidean norm, or /magnitude/, of the input list interpreted
--   as a vector.
--
-- That is,
--
-- \( \quad \sqrt{ \sum_{i=0}^n a_i^2 } \)
--
-- for the input
--
-- \( \quad \langle a_0, a_1, \dots, a_n \rangle \)
--
-- where \( a_i \) is the element at position /i/ in the input list.
norm :: [Int] -> Double
norm :: [Int] -> Double
norm = Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> ([Int] -> Double) -> [Int] -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> ([Int] -> Int) -> [Int] -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Int] -> Int
forall a. Num a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum ([Int] -> Int) -> ([Int] -> [Int]) -> [Int] -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Int) -> [Int] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int -> Int -> Int
forall a b. (Num a, Integral b) => a -> b -> a
^ (Int
2 :: Int))

-- | Return the normalized Levenshtein distance between the two strings.
--
-- See <https://en.wikipedia.org/wiki/Levenshtein_distance>.
distance :: Text -> Text -> Double
distance :: Text -> Text -> Double
distance = Ratio Int -> Double
forall a b. (Real a, Fractional b) => a -> b
realToFrac (Ratio Int -> Double)
-> (Text -> Text -> Ratio Int) -> Text -> Text -> Double
forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> Text -> Ratio Int
levenshteinNorm

updateExactSet :: Text -> Text -> FuzzySet -> FuzzySet
updateExactSet :: Text -> Text -> FuzzySet -> FuzzySet
updateExactSet Text
key Text
str FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
exactSet :: FuzzySet -> HashMap Text Text
matchDict :: FuzzySet -> HashMap Text [GramInfo]
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
gramSizeLower :: FuzzySet -> Int
gramSizeUpper :: FuzzySet -> Int
useLevenshtein :: FuzzySet -> Bool
exactSet :: HashMap Text Text
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..} =
  FuzzySet
    { exactSet :: HashMap Text Text
exactSet = Text -> Text -> HashMap Text Text -> HashMap Text Text
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Text
key Text
str HashMap Text Text
exactSet
    , Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
matchDict :: HashMap Text [GramInfo]
items :: HashMap Int (Vector FuzzySetItem)
gramSizeLower :: Int
gramSizeUpper :: Int
useLevenshtein :: Bool
..
    }