{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}
module Data.FuzzySet.Internal
( (|>)
, matches
, getMatches
, gramVector
, grams
) where
import Data.Function ((&))
import Data.FuzzySet.Types
import Data.FuzzySet.Util (distance)
import Data.FuzzySet.Util (norm)
import Data.FuzzySet.Util (normalized, substr, enclosedIn)
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, lookup, lookupDefault, alter)
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import Data.Vector ((!?))
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
(|>) :: a -> (a -> b) -> b
|> :: a -> (a -> b) -> b
(|>) = a -> (a -> b) -> b
forall a b. a -> (a -> b) -> b
(&)
infixl 1 |>
matches
:: FuzzySet
-> HashMap Text Int
-> HashMap Int Int
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} =
(Text -> Int -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> HashMap Text Int -> HashMap Int Int
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
fun HashMap Int Int
forall a. Monoid a => a
mempty
where
fun :: Text -> Int -> HashMap Int Int -> HashMap Int Int
fun Text
gram Int
count HashMap Int Int
map =
let
insScore :: Int -> Maybe Int -> Maybe Int
insScore Int
otherCount Maybe Int
entry =
Int -> Maybe Int
forall a. a -> Maybe a
Just (Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0 Maybe Int
entry Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
otherCount Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
count)
in
Text
gram Text -> HashMap Text [GramInfo] -> Maybe [GramInfo]
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Text [GramInfo]
matchDict
Maybe [GramInfo]
-> (Maybe [GramInfo] -> HashMap Int Int) -> HashMap Int Int
forall a b. a -> (a -> b) -> b
|> HashMap Int Int
-> ([GramInfo] -> HashMap Int Int)
-> Maybe [GramInfo]
-> HashMap Int Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
map ((GramInfo -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> [GramInfo] -> HashMap Int Int
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\GramInfo{Int
gramCount :: GramInfo -> Int
itemIndex :: GramInfo -> Int
gramCount :: Int
itemIndex :: Int
..} -> (Maybe Int -> Maybe Int)
-> Int -> HashMap Int Int -> HashMap Int Int
forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
alter (Int -> Maybe Int -> Maybe Int
insScore Int
gramCount) Int
itemIndex) HashMap Int Int
map)
getMatches
:: FuzzySet
-> Text
-> Double
-> Int
-> [( Double, Text )]
getMatches :: FuzzySet -> Text -> Double -> Int -> [(Double, Text)]
getMatches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} Text
query Double
minScore Int
gramSize =
[(Double, Text)]
results
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> Bool) -> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> Bool) -> [a] -> [a]
filter (\(Double, Text)
pair -> (Double, Text) -> Double
forall a b. (a, b) -> a
fst (Double, Text)
pair Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
minScore)
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text))
-> [(Double, Text)] -> [(Double, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
score, Text
entry ) -> ( Double
score, HashMap Text Text
exactSet HashMap Text Text -> (HashMap Text Text -> Text) -> Text
forall a b. a -> (a -> b) -> b
|> Text -> Text -> HashMap Text Text -> Text
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault Text
"" Text
entry ))
where
results :: [(Double, Text)]
results =
let sorted :: [(Double, Text)]
sorted =
FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet
set HashMap Text Int
queryVector
HashMap Int Int
-> (HashMap Int Int -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> (Int -> Int -> [(Double, Text)] -> [(Double, Text)])
-> [(Double, Text)] -> HashMap Int Int -> [(Double, Text)]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
HashMap.foldrWithKey Int -> Int -> [(Double, Text)] -> [(Double, Text)]
forall a.
Integral a =>
Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun []
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text) -> Ordering)
-> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (((Double, Text) -> Down Double)
-> (Double, Text) -> (Double, Text) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> ((Double, Text) -> Double) -> (Double, Text) -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double, Text) -> Double
forall a b. (a, b) -> a
fst))
in
if Bool
useLevenshtein then
[(Double, Text)]
sorted
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> Int -> [(Double, Text)] -> [(Double, Text)]
forall a. Int -> [a] -> [a]
take Int
50
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text))
-> [(Double, Text)] -> [(Double, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
_, Text
entry ) -> ( Text -> Text -> Double
distance Text
query Text
entry, Text
entry ))
[(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text) -> Ordering)
-> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (((Double, Text) -> Down Double)
-> (Double, Text) -> (Double, Text) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> ((Double, Text) -> Double) -> (Double, Text) -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double, Text) -> Double
forall a b. (a, b) -> a
fst))
else
[(Double, Text)]
sorted
queryMagnitude :: Double
queryMagnitude = [Int] -> Double
forall a b. (Integral a, Floating b) => [a] -> b
norm (HashMap Text Int -> [Int]
forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
query Int
gramSize
itemsVector :: Vector FuzzySetItem
itemsVector = Vector FuzzySetItem
-> Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem
forall a. a -> Maybe a -> a
fromMaybe Vector FuzzySetItem
forall a. Monoid a => a
mempty (Int
gramSize Int
-> HashMap Int (Vector FuzzySetItem) -> Maybe (Vector FuzzySetItem)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Int (Vector FuzzySetItem)
items)
fun :: Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun Int
index a
score [(Double, Text)]
list =
case Vector FuzzySetItem
itemsVector Vector FuzzySetItem -> Int -> Maybe FuzzySetItem
forall a. Vector a -> Int -> Maybe a
!? Int
index of
Maybe FuzzySetItem
Nothing ->
[(Double, Text)]
list
Just FuzzySetItem{Double
Text
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: FuzzySetItem -> Double
normalizedEntry :: Text
vectorMagnitude :: Double
..} ->
( a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
, Text
normalizedEntry
) (Double, Text) -> [(Double, Text)] -> [(Double, Text)]
forall a. a -> [a] -> [a]
: [(Double, Text)]
list
gramVector
:: Text
-> Int
-> HashMap Text Int
gramVector :: Text -> Int -> HashMap Text Int
gramVector Text
value Int
size =
(Text -> HashMap Text Int -> HashMap Text Int)
-> HashMap Text Int -> [Text] -> HashMap Text Int
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> HashMap Text Int -> HashMap Text Int
fun HashMap Text Int
forall k v. HashMap k v
HashMap.empty (Text -> Int -> [Text]
grams Text
value Int
size)
where
fun :: Text -> HashMap Text Int -> HashMap Text Int
fun = (Maybe Int -> Maybe Int)
-> Text -> HashMap Text Int -> HashMap Text Int
forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
HashMap.alter (Int -> Maybe Int
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int -> Maybe Int) -> (Maybe Int -> Int) -> Maybe Int -> Maybe Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int
forall a. Enum a => a -> a
succ (Int -> Int) -> (Maybe Int -> Int) -> Maybe Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0)
grams
:: Text
-> Int
-> [Text]
grams :: Text -> Int -> [Text]
grams Text
value Int
size
| Int
size Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2 = [Char] -> [Text]
forall a. HasCallStack => [Char] -> a
error [Char]
"gram size must be at least 2"
| Bool
otherwise =
(\Int
offs -> Int -> Int -> Text -> Text
substr Int
size Int
offs Text
str) (Int -> Text) -> [Int] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
where
str :: Text
str = Text -> Text
normalized Text
value Text -> Char -> Text
`enclosedIn` Char
'-'
offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
str Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
size]