{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- The code in this module is responsible for querying a set for possible
-- matches and determining how similar the string is to each candidate match.
--

module Data.FuzzySet.Internal
    ( (|>)
    , matches
    , getMatches
    , gramVector
    , grams
    ) where

import Data.Function ((&))
import Data.FuzzySet.Types
import Data.FuzzySet.Util (distance)
import Data.FuzzySet.Util (norm)
import Data.FuzzySet.Util (normalized, substr, enclosedIn)
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, lookup, lookupDefault, alter)
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import Data.Vector ((!?))
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text


-- | Alternative syntax for the reverse function application operator @(&)@,
-- known also as the /pipe/ operator.
--
(|>) :: a -> (a -> b) -> b
|> :: a -> (a -> b) -> b
(|>) = a -> (a -> b) -> b
forall a b. a -> (a -> b) -> b
(&)
infixl 1 |>


-- | Dot products used to compute the cosine similarity, which is the similarity
-- score assigned to entries that match the search string in the fuzzy set.
--
matches
    :: FuzzySet
    -- ^ The string set
    -> HashMap Text Int
    -- ^ A sparse vector representation of the search string (generated by 'gramVector')
    -> HashMap Int Int
    -- ^ A mapping from item index to the dot product between the corresponding
    -- entry of the set and the search string
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} =
    (Text -> Int -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> HashMap Text Int -> HashMap Int Int
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
fun HashMap Int Int
forall a. Monoid a => a
mempty
  where
    fun :: Text -> Int -> HashMap Int Int -> HashMap Int Int
fun Text
gram Int
count HashMap Int Int
map =
        let
            insScore :: Int -> Maybe Int -> Maybe Int
insScore Int
otherCount Maybe Int
entry =
                Int -> Maybe Int
forall a. a -> Maybe a
Just (Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0 Maybe Int
entry Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
otherCount Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
count)
        in
        Text
gram Text -> HashMap Text [GramInfo] -> Maybe [GramInfo]
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Text [GramInfo]
matchDict
            Maybe [GramInfo]
-> (Maybe [GramInfo] -> HashMap Int Int) -> HashMap Int Int
forall a b. a -> (a -> b) -> b
|> HashMap Int Int
-> ([GramInfo] -> HashMap Int Int)
-> Maybe [GramInfo]
-> HashMap Int Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
map ((GramInfo -> HashMap Int Int -> HashMap Int Int)
-> HashMap Int Int -> [GramInfo] -> HashMap Int Int
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\GramInfo{Int
gramCount :: GramInfo -> Int
itemIndex :: GramInfo -> Int
gramCount :: Int
itemIndex :: Int
..} -> (Maybe Int -> Maybe Int)
-> Int -> HashMap Int Int -> HashMap Int Int
forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
alter (Int -> Maybe Int -> Maybe Int
insScore Int
gramCount) Int
itemIndex) HashMap Int Int
map)


-- | This function performs the actual task of querying a set for matches,
-- supported by the other functions in this module.
-- See [Implementation](Data-FuzzySet.html#g:8) for an explanation.
--
getMatches
    :: FuzzySet
    -- ^ The string set
    -> Text
    -- ^ A string to search for
    -> Double
    -- ^ Minimum score
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [( Double, Text )]
    -- ^ A list of results (score and matched value)
getMatches :: FuzzySet -> Text -> Double -> Int -> [(Double, Text)]
getMatches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} Text
query Double
minScore Int
gramSize =
    [(Double, Text)]
results
        [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> Bool) -> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> Bool) -> [a] -> [a]
filter (\(Double, Text)
pair -> (Double, Text) -> Double
forall a b. (a, b) -> a
fst (Double, Text)
pair Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
minScore)
        [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text))
-> [(Double, Text)] -> [(Double, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
score, Text
entry ) -> ( Double
score, HashMap Text Text
exactSet HashMap Text Text -> (HashMap Text Text -> Text) -> Text
forall a b. a -> (a -> b) -> b
|> Text -> Text -> HashMap Text Text -> Text
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault Text
"" Text
entry ))
  where
    results :: [(Double, Text)]
results =
        let sorted :: [(Double, Text)]
sorted =
                FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet
set HashMap Text Int
queryVector
                    HashMap Int Int
-> (HashMap Int Int -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> (Int -> Int -> [(Double, Text)] -> [(Double, Text)])
-> [(Double, Text)] -> HashMap Int Int -> [(Double, Text)]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
HashMap.foldrWithKey Int -> Int -> [(Double, Text)] -> [(Double, Text)]
forall a.
Integral a =>
Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun []
                    [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text) -> Ordering)
-> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (((Double, Text) -> Down Double)
-> (Double, Text) -> (Double, Text) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> ((Double, Text) -> Double) -> (Double, Text) -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double, Text) -> Double
forall a b. (a, b) -> a
fst))
        in
        if Bool
useLevenshtein then
            [(Double, Text)]
sorted
                [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> Int -> [(Double, Text)] -> [(Double, Text)]
forall a. Int -> [a] -> [a]
take Int
50
                [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text))
-> [(Double, Text)] -> [(Double, Text)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
_, Text
entry ) -> ( Text -> Text -> Double
distance Text
query Text
entry, Text
entry ))
                [(Double, Text)]
-> ([(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ((Double, Text) -> (Double, Text) -> Ordering)
-> [(Double, Text)] -> [(Double, Text)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (((Double, Text) -> Down Double)
-> (Double, Text) -> (Double, Text) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Double -> Down Double
forall a. a -> Down a
Down (Double -> Down Double)
-> ((Double, Text) -> Double) -> (Double, Text) -> Down Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Double, Text) -> Double
forall a b. (a, b) -> a
fst))
        else
            [(Double, Text)]
sorted

    queryMagnitude :: Double
queryMagnitude = [Int] -> Double
forall a b. (Integral a, Floating b) => [a] -> b
norm (HashMap Text Int -> [Int]
forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
    queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
query Int
gramSize
    itemsVector :: Vector FuzzySetItem
itemsVector = Vector FuzzySetItem
-> Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem
forall a. a -> Maybe a -> a
fromMaybe Vector FuzzySetItem
forall a. Monoid a => a
mempty (Int
gramSize Int
-> HashMap Int (Vector FuzzySetItem) -> Maybe (Vector FuzzySetItem)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Int (Vector FuzzySetItem)
items)

    fun :: Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun Int
index a
score [(Double, Text)]
list =
        case Vector FuzzySetItem
itemsVector Vector FuzzySetItem -> Int -> Maybe FuzzySetItem
forall a. Vector a -> Int -> Maybe a
!? Int
index of
            Maybe FuzzySetItem
Nothing ->
                [(Double, Text)]
list

            Just FuzzySetItem{Double
Text
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: FuzzySetItem -> Double
normalizedEntry :: Text
vectorMagnitude :: Double
..} ->
                ( a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
                , Text
normalizedEntry
                ) (Double, Text) -> [(Double, Text)] -> [(Double, Text)]
forall a. a -> [a] -> [a]
: [(Double, Text)]
list


-- | Generate a list of /n/-grams (character substrings) from the normalized
-- input and then translate this into a dictionary with the /n/-grams as keys
-- mapping to the number of occurences of the substring in the list.
--
-- >>> gramVector "xxxx" 2
-- fromList [("-x",1), ("xx",3), ("x-",1)]
--
-- The substring @"xx"@ appears three times in the normalized string:
--
-- >>> grams "xxxx" 2
-- ["-x","xx","xx","xx","x-"]
--
-- >>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
-- Just 8
--
gramVector
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> HashMap Text Int
    -- ^ A sparse vector with the number of times a substring occurs in the
    -- normalized input string
gramVector :: Text -> Int -> HashMap Text Int
gramVector Text
value Int
size =
    (Text -> HashMap Text Int -> HashMap Text Int)
-> HashMap Text Int -> [Text] -> HashMap Text Int
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> HashMap Text Int -> HashMap Text Int
fun HashMap Text Int
forall k v. HashMap k v
HashMap.empty (Text -> Int -> [Text]
grams Text
value Int
size)
  where
    fun :: Text -> HashMap Text Int -> HashMap Text Int
fun = (Maybe Int -> Maybe Int)
-> Text -> HashMap Text Int -> HashMap Text Int
forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
HashMap.alter (Int -> Maybe Int
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int -> Maybe Int) -> (Maybe Int -> Int) -> Maybe Int -> Maybe Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int
forall a. Enum a => a -> a
succ (Int -> Int) -> (Maybe Int -> Int) -> Maybe Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Maybe Int -> Int
forall a. a -> Maybe a -> a
fromMaybe Int
0)


-- | Break apart the input string into a list of /n/-grams. The string is
-- first 'Data.FuzzySet.Util.normalized' and enclosed in hyphens.  We then take
-- all substrings of length /n/, letting the offset range from
-- \(0 \text{ to } s + 2 − n\), where /s/ is the length of the normalized input.
--
-- /Example:/
-- The string @"Destroido Corp."@ is first normalized to @"destroido corp"@,
-- and then enclosed in hyphens, so that it becomes @"-destroido corp-"@. The
-- trigrams generated from this normalized string are:
--
-- > [ "-de"
-- > , "des"
-- > , "est"
-- > , "str"
-- > , "tro"
-- > , "roi"
-- > , "oid"
-- > , "ido"
-- > , "do "
-- > , "o c"
-- > , " co"
-- > , "cor"
-- > , "orp"
-- > , "rp-"
-- > ]
--
grams
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [Text]
    -- ^ A list of /n/-grams
grams :: Text -> Int -> [Text]
grams Text
value Int
size
    | Int
size Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
2 = [Char] -> [Text]
forall a. HasCallStack => [Char] -> a
error [Char]
"gram size must be at least 2"
    | Bool
otherwise =
        (\Int
offs -> Int -> Int -> Text -> Text
substr Int
size Int
offs Text
str) (Int -> Text) -> [Int] -> [Text]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
  where
    str :: Text
str = Text -> Text
normalized Text
value Text -> Char -> Text
`enclosedIn` Char
'-'
    offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
str Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
size]