{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
module Data.FuzzySet.Internal
( FuzzySet(..)
, FuzzySetItem(..)
, GramInfo(..)
, FuzzyMatch
, grams
, gramVector
, matches
, getMatches
, add_
, addMany_
, normalized
, norm
, distance
) where
import Control.Monad.State (MonadState, get, modify)
import Data.Bifunctor (second)
import Data.Char (isAlphaNum, isSpace)
import Data.Foldable (traverse_)
import Data.Function ((&))
import Data.FuzzySet.Utils (enclosedIn, substr, (<$$>))
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, insert, insertWith, lookup, lookupDefault)
import qualified Data.HashMap.Strict as HashMap
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Metrics (levenshteinNorm)
import Data.Vector (Vector, (!?))
import qualified Data.Vector as Vector
import Prelude hiding (lookup)
data FuzzySetItem = FuzzySetItem
{ FuzzySetItem -> Double
vectorMagnitude :: !Double
, FuzzySetItem -> Text
normalizedEntry :: !Text
} deriving (FuzzySetItem -> FuzzySetItem -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: FuzzySetItem -> FuzzySetItem -> Bool
$c/= :: FuzzySetItem -> FuzzySetItem -> Bool
== :: FuzzySetItem -> FuzzySetItem -> Bool
$c== :: FuzzySetItem -> FuzzySetItem -> Bool
Eq, Int -> FuzzySetItem -> ShowS
[FuzzySetItem] -> ShowS
FuzzySetItem -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [FuzzySetItem] -> ShowS
$cshowList :: [FuzzySetItem] -> ShowS
show :: FuzzySetItem -> String
$cshow :: FuzzySetItem -> String
showsPrec :: Int -> FuzzySetItem -> ShowS
$cshowsPrec :: Int -> FuzzySetItem -> ShowS
Show)
data GramInfo = GramInfo
{ GramInfo -> Int
itemIndex :: !Int
, GramInfo -> Int
gramCount :: !Int
} deriving (GramInfo -> GramInfo -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: GramInfo -> GramInfo -> Bool
$c/= :: GramInfo -> GramInfo -> Bool
== :: GramInfo -> GramInfo -> Bool
$c== :: GramInfo -> GramInfo -> Bool
Eq, Int -> GramInfo -> ShowS
[GramInfo] -> ShowS
GramInfo -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [GramInfo] -> ShowS
$cshowList :: [GramInfo] -> ShowS
show :: GramInfo -> String
$cshow :: GramInfo -> String
showsPrec :: Int -> GramInfo -> ShowS
$cshowsPrec :: Int -> GramInfo -> ShowS
Show)
data FuzzySet = FuzzySet
{ FuzzySet -> HashMap Text Text
exactSet :: !(HashMap Text Text)
, FuzzySet -> HashMap Text [GramInfo]
matchDict :: !(HashMap Text [GramInfo])
, FuzzySet -> HashMap Int (Vector FuzzySetItem)
items :: !(HashMap Int (Vector FuzzySetItem))
, FuzzySet -> Int
gramSizeLower :: !Int
, FuzzySet -> Int
gramSizeUpper :: !Int
, FuzzySet -> Bool
useLevenshtein :: !Bool
} deriving (FuzzySet -> FuzzySet -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: FuzzySet -> FuzzySet -> Bool
$c/= :: FuzzySet -> FuzzySet -> Bool
== :: FuzzySet -> FuzzySet -> Bool
$c== :: FuzzySet -> FuzzySet -> Bool
Eq, Int -> FuzzySet -> ShowS
[FuzzySet] -> ShowS
FuzzySet -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [FuzzySet] -> ShowS
$cshowList :: [FuzzySet] -> ShowS
show :: FuzzySet -> String
$cshow :: FuzzySet -> String
showsPrec :: Int -> FuzzySet -> ShowS
$cshowsPrec :: Int -> FuzzySet -> ShowS
Show)
type FuzzyMatch = (Double, Text)
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} = forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
go forall a. Monoid a => a
mempty
where
go :: Text -> Int -> HashMap Int Int -> HashMap Int Int
go Text
gram Int
count HashMap Int Int
hashMap =
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Text
gram HashMap Text [GramInfo]
matchDict
forall a b. a -> (a -> b) -> b
& forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
hashMap (forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Int -> GramInfo -> HashMap Int Int -> HashMap Int Int
insert_ Int
count) HashMap Int Int
hashMap)
insert_ :: Int -> GramInfo -> HashMap Int Int -> HashMap Int Int
insert_ Int
count GramInfo{Int
gramCount :: Int
itemIndex :: Int
gramCount :: GramInfo -> Int
itemIndex :: GramInfo -> Int
..} =
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
insertWith forall a. Num a => a -> a -> a
(+) Int
itemIndex (Int
gramCount forall a. Num a => a -> a -> a
* Int
count)
getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
getMatches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} Text
str Double
minScore Int
gramSize =
[FuzzyMatch]
results
forall a b. a -> (a -> b) -> b
& forall a. (a -> Bool) -> [a] -> [a]
filter ((forall a. Ord a => a -> a -> Bool
>= Double
minScore) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst)
forall a b. a -> (a -> b) -> b
& forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second (forall a b c. (a -> b -> c) -> b -> a -> c
flip (forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault forall a. Monoid a => a
mempty) HashMap Text Text
exactSet))
where
results :: [FuzzyMatch]
results =
let sorted :: [FuzzyMatch]
sorted =
FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} HashMap Text Int
queryVector
forall a b. a -> (a -> b) -> b
& forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey forall {a}. Integral a => Int -> a -> [FuzzyMatch] -> [FuzzyMatch]
go []
forall a b. a -> (a -> b) -> b
& forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
in if Bool
useLevenshtein
then
[FuzzyMatch]
sorted
forall a b. a -> (a -> b) -> b
& forall a. Int -> [a] -> [a]
take Int
50
forall a b. a -> (a -> b) -> b
& forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\(Double
_, Text
entry) -> (Text -> Text -> Double
distance Text
str Text
entry, Text
entry))
forall a b. a -> (a -> b) -> b
& forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
else [FuzzyMatch]
sorted
queryMagnitude :: Double
queryMagnitude = [Int] -> Double
norm (forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
str Int
gramSize
itemsVector :: Vector FuzzySetItem
itemsVector = forall a. a -> Maybe a -> a
fromMaybe forall a. Monoid a => a
mempty (Int
gramSize forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`lookup` HashMap Int (Vector FuzzySetItem)
items)
go :: Int -> a -> [FuzzyMatch] -> [FuzzyMatch]
go Int
index a
score [FuzzyMatch]
list =
case Vector FuzzySetItem
itemsVector forall a. Vector a -> Int -> Maybe a
!? Int
index of
Maybe FuzzySetItem
Nothing ->
[FuzzyMatch]
list
Just FuzzySetItem{Double
Text
normalizedEntry :: Text
vectorMagnitude :: Double
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: FuzzySetItem -> Double
..} ->
( forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
, Text
normalizedEntry
) forall a. a -> [a] -> [a]
: [FuzzyMatch]
list
add_ :: (MonadState FuzzySet m) => Text -> m Bool
add_ :: forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool
add_ Text
str = do
FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} <- forall s (m :: * -> *). MonadState s m => m s
get
if Text
key forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` HashMap Text Text
exactSet
then
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False
else do
forall (t :: * -> *) (f :: * -> *) a b.
(Foldable t, Applicative f) =>
(a -> f b) -> t a -> f ()
traverse_ (forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> FuzzySet -> FuzzySet
updateDict) [Int
gramSizeLower .. Int
gramSizeUpper]
forall s (m :: * -> *). MonadState s m => (s -> s) -> m ()
modify (Text -> Text -> FuzzySet -> FuzzySet
updateExactSet Text
key Text
str)
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
True
where
key :: Text
key = Text -> Text
Text.toLower Text
str
updateDict :: Int -> FuzzySet -> FuzzySet
updateDict Int
size_ FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} =
let
itemVector :: Vector FuzzySetItem
itemVector =
HashMap Int (Vector FuzzySetItem)
items
forall a b. a -> (a -> b) -> b
& forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Int
size_
forall a b. a -> (a -> b) -> b
& forall a. a -> Maybe a -> a
fromMaybe forall a. Vector a
Vector.empty
grams_ :: HashMap Text Int
grams_ =
Text -> Int -> HashMap Text Int
gramVector Text
key Int
size_
insertInfo :: k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
insertInfo k
gram Int
count =
let info :: GramInfo
info = Int -> Int -> GramInfo
GramInfo (forall a. Vector a -> Int
Vector.length Vector FuzzySetItem
itemVector) Int
count
in forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HashMap.insertWith forall a. Semigroup a => a -> a -> a
(<>) k
gram [GramInfo
info]
item :: FuzzySetItem
item =
Double -> Text -> FuzzySetItem
FuzzySetItem (forall k v. HashMap k v -> [v]
elems HashMap Text Int
grams_ forall a b. a -> (a -> b) -> b
& [Int] -> Double
norm) Text
key
in
FuzzySet
{ items :: HashMap Int (Vector FuzzySetItem)
items = forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Int
size_ (Vector FuzzySetItem
itemVector forall a. Vector a -> a -> Vector a
`Vector.snoc` FuzzySetItem
item) HashMap Int (Vector FuzzySetItem)
items
, matchDict :: HashMap Text [GramInfo]
matchDict = forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey forall {k}.
Hashable k =>
k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
insertInfo HashMap Text [GramInfo]
matchDict HashMap Text Int
grams_
, Bool
Int
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
exactSet :: HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
exactSet :: HashMap Text Text
..
}
addMany_ :: (MonadState FuzzySet m) => [Text] -> m [Text]
addMany_ :: forall (m :: * -> *). MonadState FuzzySet m => [Text] -> m [Text]
addMany_ = forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse forall {m :: * -> *}. MonadState FuzzySet m => Text -> m [Text]
addOne
where
addOne :: Text -> m [Text]
addOne Text
str = do
Bool
p <- forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool
add_ Text
str
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Text
str | Bool
p]
gramVector :: Text -> Int -> HashMap Text Int
gramVector :: Text -> Int -> HashMap Text Int
gramVector = forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr forall {k} {v}.
(Hashable k, Num v) =>
k -> HashMap k v -> HashMap k v
insert_ forall k v. HashMap k v
HashMap.empty forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> Int -> [Text]
grams
where
insert_ :: k -> HashMap k v -> HashMap k v
insert_ k
key = forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> k -> v -> HashMap k v -> HashMap k v
HashMap.insertWith forall a. Num a => a -> a -> a
(+) k
key v
1
grams :: Text -> Int -> [Text]
grams :: Text -> Int -> [Text]
grams Text
input Int
size_
| Int
size_ forall a. Ord a => a -> a -> Bool
< Int
2 = forall a. HasCallStack => String -> a
error String
"gram size_ must be at least 2"
| Bool
otherwise = forall a b c. (a -> b -> c) -> b -> a -> c
flip (Int -> Int -> Text -> Text
substr Int
size_) Text
normalizedInput forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
where
normalizedInput :: Text
normalizedInput = Text -> Text
normalized Text
input Text -> Char -> Text
`enclosedIn` Char
'-'
offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
normalizedInput forall a. Num a => a -> a -> a
- Int
size_]
normalized :: Text -> Text
normalized :: Text -> Text
normalized = (Char -> Bool) -> Text -> Text
Text.filter Char -> Bool
word forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
Text.toLower
where
word :: Char -> Bool
word Char
char
| Char -> Bool
isAlphaNum Char
char = Bool
True
| Char -> Bool
isSpace Char
char = Bool
True
| Char
char forall a. Eq a => a -> a -> Bool
== Char
',' = Bool
True
| Bool
otherwise = Bool
False
norm :: [Int] -> Double
norm :: [Int] -> Double
norm = forall a. Floating a => a -> a
sqrt forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a b. (Num a, Integral b) => a -> b -> a
^ (Int
2 :: Int))
distance :: Text -> Text -> Double
distance :: Text -> Text -> Double
distance = forall a b. (Real a, Fractional b) => a -> b
realToFrac forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> Text -> Ratio Int
levenshteinNorm
updateExactSet :: Text -> Text -> FuzzySet -> FuzzySet
updateExactSet :: Text -> Text -> FuzzySet -> FuzzySet
updateExactSet Text
key Text
str FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} =
FuzzySet
{ exactSet :: HashMap Text Text
exactSet = forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Text
key Text
str HashMap Text Text
exactSet
, Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
..
}