{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- The code in this module is responsible for querying a set for possible
-- matches and determining how similar the string is to each candidate match.
--

module Data.FuzzySet.Internal
    ( (|>)
    , matches
    , getMatches
    , gramVector
    , grams
    ) where

import Data.Function ((&))
import Data.FuzzySet.Types
import Data.FuzzySet.Util (distance)
import Data.FuzzySet.Util (norm)
import Data.FuzzySet.Util (normalized, substr, enclosedIn)
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, lookup, lookupDefault, alter)
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import Data.Vector ((!?))
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text


-- | Alternative syntax for the reverse function application operator @(&)@,
-- known also as the /pipe/ operator.
--
(|>) :: a -> (a -> b) -> b
|> :: forall a b. a -> (a -> b) -> b
(|>) = forall a b. a -> (a -> b) -> b
(&)
infixl 1 |>


-- | Dot products used to compute the cosine similarity, which is the similarity
-- score assigned to entries that match the search string in the fuzzy set.
--
matches
    :: FuzzySet
    -- ^ The string set
    -> HashMap Text Int
    -- ^ A sparse vector representation of the search string (generated by 'gramVector')
    -> HashMap Int Int
    -- ^ A mapping from item index to the dot product between the corresponding
    -- entry of the set and the search string
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
matches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} =
    forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
foldrWithKey Text -> Int -> HashMap Int Int -> HashMap Int Int
fun forall a. Monoid a => a
mempty
  where
    fun :: Text -> Int -> HashMap Int Int -> HashMap Int Int
fun Text
gram Int
count HashMap Int Int
map =
        let
            insScore :: Int -> Maybe Int -> Maybe Int
insScore Int
otherCount Maybe Int
entry =
                forall a. a -> Maybe a
Just (forall a. a -> Maybe a -> a
fromMaybe Int
0 Maybe Int
entry forall a. Num a => a -> a -> a
+ Int
otherCount forall a. Num a => a -> a -> a
* Int
count)
        in
        Text
gram forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Text [GramInfo]
matchDict
            forall a b. a -> (a -> b) -> b
|> forall b a. b -> (a -> b) -> Maybe a -> b
maybe HashMap Int Int
map (forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\GramInfo{Int
gramCount :: GramInfo -> Int
itemIndex :: GramInfo -> Int
gramCount :: Int
itemIndex :: Int
..} -> forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
alter (Int -> Maybe Int -> Maybe Int
insScore Int
gramCount) Int
itemIndex) HashMap Int Int
map)


-- | This function performs the actual task of querying a set for matches,
-- supported by the other functions in this module.
-- See [Implementation](Data-FuzzySet.html#g:8) for an explanation.
--
getMatches
    :: FuzzySet
    -- ^ The string set
    -> Text
    -- ^ A string to search for
    -> Double
    -- ^ Minimum score
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [( Double, Text )]
    -- ^ A list of results (score and matched value)
getMatches :: FuzzySet -> Text -> Double -> Int -> [(Double, Text)]
getMatches set :: FuzzySet
set@FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
..} Text
query Double
minScore Int
gramSize =
    [(Double, Text)]
results
        forall a b. a -> (a -> b) -> b
|> forall a. (a -> Bool) -> [a] -> [a]
filter (\(Double, Text)
pair -> forall a b. (a, b) -> a
fst (Double, Text)
pair forall a. Ord a => a -> a -> Bool
>= Double
minScore)
        forall a b. a -> (a -> b) -> b
|> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
score, Text
entry ) -> ( Double
score, HashMap Text Text
exactSet forall a b. a -> (a -> b) -> b
|> forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
lookupDefault Text
"" Text
entry ))
  where
    results :: [(Double, Text)]
results =
        let sorted :: [(Double, Text)]
sorted =
                FuzzySet -> HashMap Text Int -> HashMap Int Int
matches FuzzySet
set HashMap Text Int
queryVector
                    forall a b. a -> (a -> b) -> b
|> forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
HashMap.foldrWithKey forall {a}.
Integral a =>
Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun []
                    forall a b. a -> (a -> b) -> b
|> forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
        in
        if Bool
useLevenshtein then
            [(Double, Text)]
sorted
                forall a b. a -> (a -> b) -> b
|> forall a. Int -> [a] -> [a]
take Int
50
                forall a b. a -> (a -> b) -> b
|> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (\( Double
_, Text
entry ) -> ( Text -> Text -> Double
distance Text
query Text
entry, Text
entry ))
                forall a b. a -> (a -> b) -> b
|> forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst))
        else
            [(Double, Text)]
sorted

    queryMagnitude :: Double
queryMagnitude = forall a b. (Integral a, Floating b) => [a] -> b
norm (forall k v. HashMap k v -> [v]
elems HashMap Text Int
queryVector)
    queryVector :: HashMap Text Int
queryVector = Text -> Int -> HashMap Text Int
gramVector Text
query Int
gramSize
    itemsVector :: Vector FuzzySetItem
itemsVector = forall a. a -> Maybe a -> a
fromMaybe forall a. Monoid a => a
mempty (Int
gramSize forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Int (Vector FuzzySetItem)
items)

    fun :: Int -> a -> [(Double, Text)] -> [(Double, Text)]
fun Int
index a
score [(Double, Text)]
list =
        case Vector FuzzySetItem
itemsVector forall a. Vector a -> Int -> Maybe a
!? Int
index of
            Maybe FuzzySetItem
Nothing ->
                [(Double, Text)]
list

            Just FuzzySetItem{Double
Text
normalizedEntry :: FuzzySetItem -> Text
vectorMagnitude :: FuzzySetItem -> Double
normalizedEntry :: Text
vectorMagnitude :: Double
..} ->
                ( forall a b. (Integral a, Num b) => a -> b
fromIntegral a
score forall a. Fractional a => a -> a -> a
/ (Double
queryMagnitude forall a. Num a => a -> a -> a
* Double
vectorMagnitude)
                , Text
normalizedEntry
                ) forall a. a -> [a] -> [a]
: [(Double, Text)]
list


-- | Generate a list of /n/-grams (character substrings) from the normalized
-- input and then translate this into a dictionary with the /n/-grams as keys
-- mapping to the number of occurences of the substring in the list.
--
-- >>> gramVector "xxxx" 2
-- fromList [("-x",1), ("xx",3), ("x-",1)]
--
-- The substring @"xx"@ appears three times in the normalized string:
--
-- >>> grams "xxxx" 2
-- ["-x","xx","xx","xx","x-"]
--
-- >>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
-- Just 8
--
gramVector
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> HashMap Text Int
    -- ^ A sparse vector with the number of times a substring occurs in the
    -- normalized input string
gramVector :: Text -> Int -> HashMap Text Int
gramVector Text
value Int
size =
    forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Text -> HashMap Text Int -> HashMap Text Int
fun forall k v. HashMap k v
HashMap.empty (Text -> Int -> [Text]
grams Text
value Int
size)
  where
    fun :: Text -> HashMap Text Int -> HashMap Text Int
fun = forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
HashMap.alter (forall (f :: * -> *) a. Applicative f => a -> f a
pure forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Enum a => a -> a
succ forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. a -> Maybe a -> a
fromMaybe Int
0)


-- | Break apart the input string into a list of /n/-grams. The string is
-- first 'Data.FuzzySet.Util.normalized' and enclosed in hyphens.  We then take
-- all substrings of length /n/, letting the offset range from
-- \(0 \text{ to } s + 2 − n\), where /s/ is the length of the normalized input.
--
-- /Example:/
-- The string @"Destroido Corp."@ is first normalized to @"destroido corp"@,
-- and then enclosed in hyphens, so that it becomes @"-destroido corp-"@. The
-- trigrams generated from this normalized string are:
--
-- > [ "-de"
-- > , "des"
-- > , "est"
-- > , "str"
-- > , "tro"
-- > , "roi"
-- > , "oid"
-- > , "ido"
-- > , "do "
-- > , "o c"
-- > , " co"
-- > , "cor"
-- > , "orp"
-- > , "rp-"
-- > ]
--
grams
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [Text]
    -- ^ A list of /n/-grams
grams :: Text -> Int -> [Text]
grams Text
value Int
size
    | Int
size forall a. Ord a => a -> a -> Bool
< Int
2 = forall a. HasCallStack => [Char] -> a
error [Char]
"gram size must be at least 2"
    | Bool
otherwise =
        (\Int
offs -> Int -> Int -> Text -> Text
substr Int
size Int
offs Text
str) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [Int]
offsets
  where
    str :: Text
str = Text -> Text
normalized Text
value Text -> Char -> Text
`enclosedIn` Char
'-'
    offsets :: [Int]
offsets = [Int
0 .. Text -> Int
Text.length Text
str forall a. Num a => a -> a -> a
- Int
size]