{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}
module Data.FuzzySet.Internal
( (|>)
, matches
, getMatches
, gramVector
, grams
) where
import Data.Function ((&))
import Data.FuzzySet.Types
import Data.FuzzySet.Util (distance)
import Data.FuzzySet.Util (norm)
import Data.FuzzySet.Util (normalized, substr, enclosedIn)
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, lookup, lookupDefault, alter)
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import Data.Vector ((!?))
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
(|>) :: a -> (a -> b) -> b
|> :: forall a b. a -> (a -> b) -> b
(|>) = forall a b. a -> (a -> b) -> b
(&)
infixl 1 |>
matches
:: FuzzySet
-> HashMap Text Int
-> HashMap Int Int
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} =
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
fun forall a. Monoid a => a
mempty
where
fun :: Text -> Int -> HashMap Int Int -> HashMap Int Int
fun Text
gram Int
count HashMap Int Int
map =
let
insScore :: Int -> Maybe Int -> Maybe Int
insScore Int
otherCount Maybe Int
entry =
forall a. a -> Maybe a
Just (forall a. a -> Maybe a -> a
fromMaybe Int
0 Maybe Int
entry forall a. Num a => a -> a -> a
+ Int
otherCount forall a. Num a => a -> a -> a
* Int
count)
in
Text
gram forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Text [GramInfo]
matchDict
forall a b. a -> (a -> b) -> b
|> forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
map (forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\GramInfo{Int
gramCount :: GramInfo -> Int
itemIndex :: GramInfo -> Int
gramCount :: Int
itemIndex :: Int
..} -> forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
alter (Int -> Maybe Int -> Maybe Int
insScore Int
gramCount) Int
itemIndex) HashMap Int Int
map)
getMatches
:: FuzzySet
-> Text
-> Double
-> Int
-> [( Double, Text )]
getMatches :: FuzzySet -> Text -> Double -> Int -> [(Double, Text)]
getMatches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} Text
query Double
minScore Int
gramSize =
[(Double, Text)]
results
forall a b. a -> (a -> b) -> b
|> forall a. (a -> Bool) -> [a] -> [a]
filter (\(Double, Text)
pair -> forall a b. (a, b) -> a
fst (Double, Text)
pair forall a. Ord a => a -> a -> Bool
>= Double
minScore)
forall a b. a -> (a -> b) -> b
|> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
score, Text
entry ) -> ( Double
score, HashMap Text Text
exactSet forall a b. a -> (a -> b) -> b
|> forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault Text
"" Text
entry ))
where
results :: [(Double, Text)]
results =
let sorted :: [(Double, Text)]
sorted =
FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet
set HashMap Text Int
queryVector
forall a b. a -> (a -> b) -> b
|> forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
HashMap.foldrWithKey forall {a}.
Integral a =>
Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun []
forall a b. a -> (a -> b) -> b
|> forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
in
if Bool
useLevenshtein then
[(Double, Text)]
sorted
forall a b. a -> (a -> b) -> b
|> forall a. Int -> [a] -> [a]
take Int
50
forall a b. a -> (a -> b) -> b
|> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
_, Text
entry ) -> ( Text -> Text -> Double
distance Text
query Text
entry, Text
entry ))
forall a b. a -> (a -> b) -> b
|> forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
else
[(Double, Text)]
sorted
queryMagnitude :: Double
queryMagnitude = forall a b. (Integral a, Floating b) => [a] -> b
norm (forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
query Int
gramSize
itemsVector :: Vector FuzzySetItem
itemsVector = forall a. a -> Maybe a -> a
fromMaybe forall a. Monoid a => a
mempty (Int
gramSize forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Int (Vector FuzzySetItem)
items)
fun :: Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun Int
index a
score [(Double, Text)]
list =
case Vector FuzzySetItem
itemsVector forall a. Vector a -> Int -> Maybe a
!? Int
index of
Maybe FuzzySetItem
Nothing ->
[(Double, Text)]
list
Just FuzzySetItem{Double
Text
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: FuzzySetItem -> Double
normalizedEntry :: Text
vectorMagnitude :: Double
..} ->
( forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
, Text
normalizedEntry
) forall a. a -> [a] -> [a]
: [(Double, Text)]
list
gramVector
:: Text
-> Int
-> HashMap Text Int
gramVector :: Text -> Int -> HashMap Text Int
gramVector Text
value Int
size =
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> HashMap Text Int -> HashMap Text Int
fun forall k v. HashMap k v
HashMap.empty (Text -> Int -> [Text]
grams Text
value Int
size)
where
fun :: Text -> HashMap Text Int -> HashMap Text Int
fun = forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
HashMap.alter (forall (f :: * -> *) a. Applicative f => a -> f a
pure forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> a
succ forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. a -> Maybe a -> a
fromMaybe Int
0)
grams
:: Text
-> Int
-> [Text]
grams :: Text -> Int -> [Text]
grams Text
value Int
size
| Int
size forall a. Ord a => a -> a -> Bool
< Int
2 = forall a. HasCallStack => [Char] -> a
error [Char]
"gram size must be at least 2"
| Bool
otherwise =
(\Int
offs -> Int -> Int -> Text -> Text
substr Int
size Int
offs Text
str) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
where
str :: Text
str = Text -> Text
normalized Text
value Text -> Char -> Text
`enclosedIn` Char
'-'
offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
str forall a. Num a => a -> a -> a
- Int
size]