{-# LANGUAGE RecordWildCards #-}

-- |
--
-- Module      : Data.FuzzySet
-- Copyright   : (c) 2017-2019 Johannes Hildén
-- License     : BSD3
-- Maintainer  : hildenjohannes@gmail.com
-- Stability   : experimental
-- Portability : GHC
--
-- A fuzzy string set data structure for approximate string matching. This
-- implementation is based on the Python and JavaScript libraries with similar
-- names; [fuzzyset.js](http://glench.github.io/fuzzyset.js/), and the original
-- [fuzzyset](https://github.com/axiak/fuzzyset) Python library.

module Data.FuzzySet
    (
    -- * How to use this library
    -- $howto

    -- * Types
      FuzzySet

    -- * API

    -- ** Initializing
    , emptySet
    , defaultSet
    , fromList

    -- ** Adding
    , add
    , addToSet
    , addMany

    -- ** Retrieving
    , get
    , getWithMinScore
    , getOne
    , getOneWithMinScore

    -- ** Inspecting
    , size
    , isEmpty
    , values

    -- * Implementation
    -- $implementation

    ) where

import Data.Default (Default, def)
import Data.FuzzySet.Internal
import Data.FuzzySet.Types
import Data.FuzzySet.Util
import Data.HashMap.Strict (HashMap, elems, insert)
import Data.List (find)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Data.Vector (snoc)
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
import qualified Data.Vector as Vector

-- $howto
--
-- Make sure the @OverloadedStrings@ pragma is enabled. After that, three steps
-- are typically involved:
--
--   1. Create a set using one of 'defaultSet', 'emptySet', or 'fromList'.
--   2. To add entries, use 'add', 'addToSet', or 'addMany'.
--   3. Query the set with 'get', 'getOne', 'getWithMinScore', or 'getOneWithMinScore'.
--
-- >>> defaultSet `add` "Jurassic Park" `add` "Terminator" `add` "The Matrix" `getOne` "percolator"
-- Just "Terminator"
--
-- >>> defaultSet `add` "Shaggy Rogers" `add` "Fred Jones" `add` "Daphne Blake" `add` "Velma Dinkley" `get` "Shaggy Jones"
-- [(0.7692307692307693,"Shaggy Rogers"),(0.5,"Fred Jones")]
--
-- There are also a few functions to inspect sets: 'size', 'isEmpty', and 'values'.
--
-- == More examples
--
-- > {-# LANGUAGE OverloadedStrings #-}
-- > module Main where
-- >
-- > import Data.FuzzySet
-- >
-- > states = [ "Alabama"        , "Alaska"         , "American Samoa"            , "Arizona"       , "Arkansas"
-- >          , "California"     , "Colorado"       , "Connecticut"               , "Delaware"      , "District of Columbia"
-- >          , "Florida"        , "Georgia"        , "Guam"                      , "Hawaii"        , "Idaho"
-- >          , "Illinois"       , "Indiana"        , "Iowa"                      , "Kansas"        , "Kentucky"
-- >          , "Louisiana"      , "Maine"          , "Maryland"                  , "Massachusetts" , "Michigan"
-- >          , "Minnesota"      , "Mississippi"    , "Missouri"                  , "Montana"       , "Nebraska"
-- >          , "Nevada"         , "New Hampshire"  , "New Jersey"                , "New Mexico"    , "New York"
-- >          , "North Carolina" , "North Dakota"   , "Northern Marianas Islands" , "Ohio"          , "Oklahoma"
-- >          , "Oregon"         , "Pennsylvania"   , "Puerto Rico"               , "Rhode Island"  , "South Carolina"
-- >          , "South Dakota"   , "Tennessee"      , "Texas"                     , "Utah"          , "Vermont"
-- >          , "Virginia"       , "Virgin Islands" , "Washington"                , "West Virginia" , "Wisconsin"
-- >          , "Wyoming" ]
-- >
-- > statesSet = fromList states
-- >
-- > main = mapM_ print (get statesSet "Burger Islands")
--
-- The output of this program is:
--
-- > (0.7142857142857143,"Virgin Islands")
-- > (0.5714285714285714,"Rhode Island")
-- > (0.44,"Northern Marianas Islands")
-- > (0.35714285714285715,"Maryland")
--
--  Using the definition of @statesSet@ from previous example:
--
-- > >>> get statesSet "Why-oh-me-ing"
-- > [(0.5384615384615384,"Wyoming")]
--
-- > >>> get statesSet "Connect a cat"
-- > [(0.6923076923076923,"Connecticut")]
--
-- > >>> get statesSet "Transylvania"
-- > [(0.75,"Pennsylvania"),(0.3333333333333333,"California"),(0.3333333333333333,"Arkansas"),(0.3333333333333333,"Kansas")]
--
-- > >>> get statesSet "CanOfSauce"
-- > [(0.4,"Kansas")]
--
-- > >>> get statesSet "Alaska"
-- > [(1.0,"Alaska")]
--
-- > >>> get statesSet "Alaskanbraskansas"
-- > [(0.47058823529411764,"Arkansas"),(0.35294117647058826,"Kansas"),(0.35294117647058826,"Alaska"),(0.35294117647058826,"Alabama"),(0.35294117647058826,"Nebraska")]
--
-- $implementation
--
-- To determine the similarity between entries of the set and the search string,
-- the algorithm translates the strings to vectors and then calculates a metric
-- known as the /cosine similarity/ between these. A detailed explanation, with
-- interactive examples, can be found on the website for the
-- [JavaScript version](http://glench.github.io/fuzzyset.js/ui/) of this library.
-- A brief overview follows here.
--
-- == Cosine similarity
--
-- The cosine similarity of two vectors \(A\) and \(B\) is given by the formula
--
-- \[ \frac{A \cdot B}{||A||\ ||B||} \]
--
-- where \(A \cdot B\) is the dot product of the two vectors, and \(||A||\)
-- denotes the [euclidean norm](Data-FuzzySet-Util.html#v:norm), or /magnitude/,
-- of \(A\). The cosine similarity is a measure of the (cosine of the) angle
-- between two vectors. Since we will only deal with vectors with non-negative
-- components, the result of this operation is always in the range \([0, 1]\).
--
-- == Gram vectors
--
-- The vector we are interested in has as its components the number of times
-- a gram (substring) occurs in the (normalized version of the) string. The
-- function 'gramVector' takes an arbitrary string as input and returns this
-- vector, in dictionary form:
--
-- >>> gramVector "Mississippi" 3
-- fromList [("pi-",1),("ssi",2),("sis",1),("iss",2),("-mi",1),("mis",1),("sip",1),("ppi",1),("ipp",1)]
--
-- This dictionary maps each /n/-gram key to to the number of times it occurs
-- in the string. The below table makes it more evident that this can be thought
-- of as a sparse vector.
--
-- +---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
-- | /Gram/  | @-mi@ | @mis@ | @iss@ | @ssi@ | @sis@ | @sip@ | @ipp@ | @ppi@ | @pi-@ |
-- +---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
-- | /Count/ |   1   |   1   |   2   |   2   |   1   |   1   |   1   |   1   |   1   |
-- +---------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
--
-- == Lookup
--
-- The 'FuzzySet' data structure maintains a dictionary with all /n/-grams that
-- occur in the entries of the set for different sizes of grams.
--
-- > "mis" => [ { itemIndex = 4, gramCount = 1 }, { itemIndex = 11, gramCount = 2 } ]
--
-- To compute the cosine similarity score, the function 'Data.FuzzySet.Internal.getMatches'
-- queries the set for the grams that stem from the search string. Here is an
-- example: Let's say we have a set where the string @"coffee"@ appears, and
-- want to search for the string @"covfefe"@. The two strings translate to the
-- following /bigram/ vectors:
--
-- >>> gramVector "coffee" 2
-- fromList [("e-",1),("ff",1),("of",1),("co",1),("ee",1),("fe",1),("-c",1)]
-- >>> gramVector "covfefe" 2
-- fromList [("e-",1),("vf",1),("ef",1),("ov",1),("co",1),("fe",2),("-c",1)]
--
-- +------------------------------------------------+------------------------------------------------+
-- |                 /coffee/                       |            /covfefe/                           |
-- +------+------+------+------+------+------+------+------+------+------+------+------+------+------+
-- | @-c@ | @co@ | @of@ | @ff@ | @fe@ | @ee@ | @e-@ | @-c@ | @co@ | @ov@ | @vf@ | @fe@ | @e-@ | @e-@ |
-- +------+------+------+------+------+------+------+------+------+------+------+------+------+------+
-- |  1   |  1   |  1   |  1   |  1   |  1   |  1   |  1   |  1   |  1   |  1   |  2   |  1   |  1   |
-- +------+------+------+------+------+------+------+------+------+------+------+------+------+------+
--
-- The non-zero entries common to both vectors are then:
--
-- +---------+------+------+------+------+
-- |         | @-c@ | @co@ | @fe@ | @e-@ |
-- +---------+------+------+------+------+
-- | \(a_i\) |  1   |  1   |  1   |  1   |
-- +---------+------+------+------+------+
-- | \(b_i\) |  1   |  1   |  2   |  1   |
-- +---------+------+------+------+------+
--
-- Dotting these we get \(1 \times 1 + 1 \times 1 + 1 \times 2 + 1 \times 1 = 5 \).
-- The function 'matches' computes these dot products and returns a dictionary
-- with the matched indices as keys. If the entry appears at item index, say,
-- 3 in the set's internal list, this would yield a key-value pair @(3, 5)@ in
-- the map.
--
-- >>> matches (defaultSet `add` "tea" `add` "biscuits" `add` "cake" `add` "coffee") (gramVector "covfefe" 2)
-- fromList [(2,2),(3,5)]
--
-- We now have the numerators of the cosine similarity scores. The data
-- structure keeps track of the magnitudes of the set entries, so we just need
-- to look this quantity up using the index:
--
-- > {vectorMagnitude = 2.6457513110645907, normalizedEntry = "coffee"}
--
-- Multiplying this by the magnitude of the search string's vector,
--
-- >>> norm $ elems $ gramVector "covfefe" 2
-- 3.1622776601683795
--
-- … we get 8.366600265340756, which is the denominator of the expression.
-- So the similarity score for this entry of the set is
--
-- >>> 5/(3.1622776601683795 * 2.6457513110645907)
-- 0.5976143046671968
--
-- And indeed, this is the score we get if the 'FuzzySet' is initialized with
-- the Levenshtein option set to @False@:
--
-- >>> (emptySet 2 3 False `add` "tea" `add` "biscuits" `add` "cake" `add` "coffee") `get` "covfefe"
-- [(0.5976143046671968,"coffee")]
--
-- Note that the above procedure is repeated for each gram size (starting with
-- the highest) in the selected range, until we either get some results, or all
-- sizes have been exhausted.
--
-- Finally, if the set was initialized with the Levenshtein distance option
-- enabled (e.g., using 'defaultSet'), then only the first 50 results are kept
-- and a new score is computed based on the Levenshtein 'Data.FuzzySet.Util.distance'.
--


-- | Initialize an empty 'FuzzySet'.
--
emptySet
    :: Int
    -- ^ Lower bound on gram sizes to use (inclusive)
    -> Int
    -- ^ Upper bound on gram sizes to use (inclusive)
    -> Bool
    -- ^ Whether or not to use the [Levenshtein distance](https://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Spring2006/assignments/editdistance/Levenshtein%20Distance.htm)
    -- to determine the score
    -> FuzzySet
    -- ^ An empty fuzzy string set
emptySet :: Int -> Int -> Bool -> FuzzySet
emptySet =
    HashMap Text Text
-> HashMap Text [GramInfo]
-> HashMap Int (Vector FuzzySetItem)
-> Int
-> Int
-> Bool
-> FuzzySet
FuzzySet HashMap Text Text
forall a. Monoid a => a
mempty HashMap Text [GramInfo]
forall a. Monoid a => a
mempty HashMap Int (Vector FuzzySetItem)
forall a. Monoid a => a
mempty


-- | An empty 'FuzzySet' with the following defaults:
--
--   * Gram size lower: @2@
--   * Gram size upper: @3@
--   * Use Levenshtein distance: @True@
--
defaultSet :: FuzzySet
defaultSet :: FuzzySet
defaultSet =
    Int -> Int -> Bool -> FuzzySet
emptySet Int
2 Int
3 Bool
True


-- | See 'defaultSet'.
--
instance Default FuzzySet where
    def :: FuzzySet
def = FuzzySet
defaultSet


-- | Try to match a string against the entries in the set, and return a list of
-- all results with a score greater than or equal to the specified minimum score
-- (i.e., the first argument). The results are ordered by similarity score, with
-- the closest match first.
--
getWithMinScore
    :: Double
    -- ^ A minimum score
    -> FuzzySet
    -- ^ The fuzzy string set to compare the string against
    -> Text
    -- ^ The string to search for
    -> [( Double, Text )]
    -- ^ A list of results (score and matched value)
getWithMinScore :: Double -> FuzzySet -> Text -> [(Double, Text)]
getWithMinScore
      Double
minScore
      set :: FuzzySet
set@FuzzySet{ gramSizeLower :: FuzzySet -> Int
gramSizeLower = Int
lower, gramSizeUpper :: FuzzySet -> Int
gramSizeUpper = Int
upper, Bool
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
.. }
      Text
value =
    case Text
key Text -> HashMap Text Text -> Maybe Text
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`HashMap.lookup` HashMap Text Text
exactSet of
        Just Text
match ->
            [( Double
1, Text
match )]

        Maybe Text
Nothing ->
            [Int]
sizes
                [Int] -> ([Int] -> [[(Double, Text)]]) -> [[(Double, Text)]]
forall a b. a -> (a -> b) -> b
|> (Int -> [(Double, Text)]) -> [Int] -> [[(Double, Text)]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (FuzzySet -> Text -> Double -> Int -> [(Double, Text)]
getMatches FuzzySet
set Text
key Double
minScore)
                [[(Double, Text)]]
-> ([[(Double, Text)]] -> Maybe [(Double, Text)])
-> Maybe [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> ([(Double, Text)] -> Bool)
-> [[(Double, Text)]] -> Maybe [(Double, Text)]
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (Bool -> Bool
not (Bool -> Bool)
-> ([(Double, Text)] -> Bool) -> [(Double, Text)] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Double, Text)] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null)
                Maybe [(Double, Text)]
-> (Maybe [(Double, Text)] -> [(Double, Text)]) -> [(Double, Text)]
forall a b. a -> (a -> b) -> b
|> [(Double, Text)] -> Maybe [(Double, Text)] -> [(Double, Text)]
forall a. a -> Maybe a -> a
fromMaybe []
  where
    key :: Text
key = Text -> Text
Text.toLower Text
value
    sizes :: [Int]
sizes = [Int] -> [Int]
forall a. [a] -> [a]
reverse (Int -> Int -> [Int]
forall a. Enum a => a -> a -> [a]
enumFromTo Int
lower Int
upper)


-- | Try to match the given string against the entries in the set, using a
-- minimum score of 0.33. Return a list of results ordered by similarity score,
-- with the closest match first. Use 'getWithMinScore' to specify a different
-- threshold value.
--
get
    :: FuzzySet
    -- ^ The fuzzy string set to compare the string against
    -> Text
    -- ^ The string to search for
    -> [( Double, Text )]
    -- ^ A list of results (score and matched value)
get :: FuzzySet -> Text -> [(Double, Text)]
get =
    Double -> FuzzySet -> Text -> [(Double, Text)]
getWithMinScore Double
0.33


-- | Try to match the given string against the entries in the set using the
-- specified minimum score and return the closest match, if one is found.
--
getOneWithMinScore
    :: Double
    -- ^ A minimum score
    -> FuzzySet
    -- ^ The fuzzy string set to compare the string against
    -> Text
    -- ^ The string to search for
    -> Maybe Text
    -- ^ The closest match, if one is found
getOneWithMinScore :: Double -> FuzzySet -> Text -> Maybe Text
getOneWithMinScore Double
minScore FuzzySet
fuzzySet Text
value =
    case Double -> FuzzySet -> Text -> [(Double, Text)]
getWithMinScore Double
minScore FuzzySet
fuzzySet Text
value of
        [] ->
            Maybe Text
forall a. Maybe a
Nothing

        (Double, Text)
head : [(Double, Text)]
_ ->
            Text -> Maybe Text
forall a. a -> Maybe a
Just ((Double, Text) -> Text
forall a b. (a, b) -> b
snd (Double, Text)
head)


-- | Try to match the given string against the entries in the set, and return
-- the closest match, if one is found. A minimum score of 0.33 is used. To
-- specify a different threshold value, instead use 'getOneWithMinScore'.
--
getOne :: FuzzySet
    -- ^ The fuzzy string set to compare the string against
    -> Text
    -- ^ The string to search for
    -> Maybe Text
    -- ^ The closest match, if one is found
getOne :: FuzzySet -> Text -> Maybe Text
getOne =
    Double -> FuzzySet -> Text -> Maybe Text
getOneWithMinScore Double
0.33


-- | Add an entry to the set, or do nothing if a key that matches the string
-- already exists in the set.
--
add
    :: FuzzySet
    -- ^ Set to add the string to
    -> Text
    -- ^ The new entry
    -> FuzzySet
    -- ^ An updated set
add :: FuzzySet -> Text -> FuzzySet
add FuzzySet
fuzzySet =
    (FuzzySet, Bool) -> FuzzySet
forall a b. (a, b) -> a
fst ((FuzzySet, Bool) -> FuzzySet)
-> (Text -> (FuzzySet, Bool)) -> Text -> FuzzySet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzySet -> Text -> (FuzzySet, Bool)
addToSet FuzzySet
fuzzySet


-- | Add an entry, unless it is already present in the set. A pair is returned
-- with the new set and a boolean which denotes whether or not anything was
-- inserted.
--
addToSet
    :: FuzzySet
    -- ^ Fuzzy string set to add the entry to
    -> Text
    -- ^ The new entry
    -> ( FuzzySet, Bool )
    -- ^ The updated set and a boolean, which will be 'True' if, and only if,
    -- the value was not already in the set
addToSet :: FuzzySet -> Text -> (FuzzySet, Bool)
addToSet set :: FuzzySet
set@FuzzySet{ gramSizeLower :: FuzzySet -> Int
gramSizeLower = Int
lower, gramSizeUpper :: FuzzySet -> Int
gramSizeUpper = Int
upper, Bool
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
.. } Text
value
    | Text
key Text -> HashMap Text Text -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` HashMap Text Text
exactSet =
        ( FuzzySet
set, Bool
False )
    | Bool
otherwise =
        ( FuzzySet
newSet FuzzySet -> (FuzzySet -> FuzzySet) -> FuzzySet
forall a b. a -> (a -> b) -> b
|> Text -> FuzzySet -> FuzzySet
updateExactSet Text
value, Bool
True )
  where
    newSet :: FuzzySet
newSet = (Int -> FuzzySet -> FuzzySet) -> FuzzySet -> [Int] -> FuzzySet
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Int -> FuzzySet -> FuzzySet
addSize FuzzySet
set (Int -> Int -> [Int]
forall a. Enum a => a -> a -> [a]
enumFromTo Int
lower Int
upper)
    key :: Text
key = Text -> Text
Text.toLower Text
value

    addSize :: Int -> FuzzySet -> FuzzySet
    addSize :: Int -> FuzzySet -> FuzzySet
addSize Int
gramSize FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
..} =
        let
            item :: FuzzySetItem
item = Double -> Text -> FuzzySetItem
FuzzySetItem (HashMap Text Int -> [Int]
forall k v. HashMap k v -> [v]
elems HashMap Text Int
grams [Int] -> ([Int] -> Double) -> Double
forall a b. a -> (a -> b) -> b
|> [Int] -> Double
forall a b. (Integral a, Floating b) => [a] -> b
Util.norm) Text
key
        in
        FuzzySet :: HashMap Text Text
-> HashMap Text [GramInfo]
-> HashMap Int (Vector FuzzySetItem)
-> Int
-> Int
-> Bool
-> FuzzySet
FuzzySet{ items :: HashMap Int (Vector FuzzySetItem)
items = HashMap Int (Vector FuzzySetItem)
items HashMap Int (Vector FuzzySetItem)
-> (HashMap Int (Vector FuzzySetItem)
    -> HashMap Int (Vector FuzzySetItem))
-> HashMap Int (Vector FuzzySetItem)
forall a b. a -> (a -> b) -> b
|> Int
-> Vector FuzzySetItem
-> HashMap Int (Vector FuzzySetItem)
-> HashMap Int (Vector FuzzySetItem)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Int
gramSize (Vector FuzzySetItem
itemVector Vector FuzzySetItem -> FuzzySetItem -> Vector FuzzySetItem
forall a. Vector a -> a -> Vector a
`snoc` FuzzySetItem
item)
                , matchDict :: HashMap Text [GramInfo]
matchDict = HashMap Text Int
grams HashMap Text Int
-> (HashMap Text Int -> HashMap Text [GramInfo])
-> HashMap Text [GramInfo]
forall a b. a -> (a -> b) -> b
|> (Text -> Int -> HashMap Text [GramInfo] -> HashMap Text [GramInfo])
-> HashMap Text [GramInfo]
-> HashMap Text Int
-> HashMap Text [GramInfo]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
HashMap.foldrWithKey Text -> Int -> HashMap Text [GramInfo] -> HashMap Text [GramInfo]
forall k.
(Eq k, Hashable k) =>
k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
updateDict HashMap Text [GramInfo]
matchDict
                , Bool
Int
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
exactSet :: HashMap Text Text
useLevenshtein :: Bool
exactSet :: HashMap Text Text
gramSizeUpper :: Int
gramSizeLower :: Int
..  }
      where
        updateDict :: k -> Int -> HashMap k [GramInfo] -> HashMap k [GramInfo]
updateDict k
gram Int
count =
            let
                info :: GramInfo
info = Int -> Int -> GramInfo
GramInfo (Vector FuzzySetItem -> Int
forall a. Vector a -> Int
Vector.length Vector FuzzySetItem
itemVector) Int
count
            in
            (Maybe [GramInfo] -> Maybe [GramInfo])
-> k -> HashMap k [GramInfo] -> HashMap k [GramInfo]
forall k v.
(Eq k, Hashable k) =>
(Maybe v -> Maybe v) -> k -> HashMap k v -> HashMap k v
HashMap.alter (\Maybe [GramInfo]
maybeInfos -> [GramInfo] -> Maybe [GramInfo]
forall a. a -> Maybe a
Just ([GramInfo] -> Maybe [GramInfo]) -> [GramInfo] -> Maybe [GramInfo]
forall a b. (a -> b) -> a -> b
$ GramInfo
info GramInfo -> [GramInfo] -> [GramInfo]
forall a. a -> [a] -> [a]
: [GramInfo] -> Maybe [GramInfo] -> [GramInfo]
forall a. a -> Maybe a -> a
fromMaybe [] Maybe [GramInfo]
maybeInfos) k
gram

        itemVector :: Vector FuzzySetItem
itemVector =
            HashMap Int (Vector FuzzySetItem)
items
                HashMap Int (Vector FuzzySetItem)
-> (HashMap Int (Vector FuzzySetItem)
    -> Maybe (Vector FuzzySetItem))
-> Maybe (Vector FuzzySetItem)
forall a b. a -> (a -> b) -> b
|> Int
-> HashMap Int (Vector FuzzySetItem) -> Maybe (Vector FuzzySetItem)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
HashMap.lookup Int
gramSize
                Maybe (Vector FuzzySetItem)
-> (Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem)
-> Vector FuzzySetItem
forall a b. a -> (a -> b) -> b
|> Vector FuzzySetItem
-> Maybe (Vector FuzzySetItem) -> Vector FuzzySetItem
forall a. a -> Maybe a -> a
fromMaybe Vector FuzzySetItem
forall a. Vector a
Vector.empty
        grams :: HashMap Text Int
grams =
            Text -> Int -> HashMap Text Int
gramVector Text
key Int
gramSize

    updateExactSet :: Text -> FuzzySet -> FuzzySet
    updateExactSet :: Text -> FuzzySet -> FuzzySet
updateExactSet Text
value FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
..} =
        FuzzySet :: HashMap Text Text
-> HashMap Text [GramInfo]
-> HashMap Int (Vector FuzzySetItem)
-> Int
-> Int
-> Bool
-> FuzzySet
FuzzySet{ exactSet :: HashMap Text Text
exactSet = HashMap Text Text
exactSet HashMap Text Text
-> (HashMap Text Text -> HashMap Text Text) -> HashMap Text Text
forall a b. a -> (a -> b) -> b
|> Text -> Text -> HashMap Text Text -> HashMap Text Text
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
insert Text
key Text
value
                , Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
useLevenshtein :: Bool
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
gramSizeUpper :: Int
gramSizeLower :: Int
.. }


-- | Add a list of entries to the set, in one go.
--
-- @addMany = foldr (flip add)@
--
addMany :: FuzzySet -> [Text] -> FuzzySet
addMany :: FuzzySet -> [Text] -> FuzzySet
addMany =
    (Text -> FuzzySet -> FuzzySet) -> FuzzySet -> [Text] -> FuzzySet
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr ((FuzzySet -> Text -> FuzzySet) -> Text -> FuzzySet -> FuzzySet
forall a b c. (a -> b -> c) -> b -> a -> c
flip FuzzySet -> Text -> FuzzySet
add)


-- | Create a set from a list of entries, using the default settings.
--
-- @fromList = addMany defaultSet@
--
fromList :: [Text] -> FuzzySet
fromList :: [Text] -> FuzzySet
fromList =
    FuzzySet -> [Text] -> FuzzySet
addMany FuzzySet
defaultSet


-- | Return the number of entries in the set.
--
-- >>> size (defaultSet `add` "map" `add` "cap")
-- 2
-- >>> size (defaultSet `add` "bork" `add` "bork" `add` "bork")
-- 1
--
size :: FuzzySet -> Int
size :: FuzzySet -> Int
size =
    HashMap Text Text -> Int
forall k v. HashMap k v -> Int
HashMap.size (HashMap Text Text -> Int)
-> (FuzzySet -> HashMap Text Text) -> FuzzySet -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzySet -> HashMap Text Text
exactSet


-- | Return a boolean indicating whether the set is empty.
--
-- >>> isEmpty (fromList [])
-- True
-- >>> isEmpty $ fromList ["Aramis", "Porthos", "Athos"]
-- False
--
isEmpty :: FuzzySet -> Bool
isEmpty :: FuzzySet -> Bool
isEmpty =
    HashMap Text Text -> Bool
forall k v. HashMap k v -> Bool
HashMap.null (HashMap Text Text -> Bool)
-> (FuzzySet -> HashMap Text Text) -> FuzzySet -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzySet -> HashMap Text Text
exactSet


-- | Return the elements of the set. No particular order is guaranteed.
--
-- >>> values (fromList ["bass", "craze", "space", "lace", "daze", "haze", "ace", "maze"])
-- ["space","daze","bass","maze","ace","craze","lace","haze"]
--
values :: FuzzySet -> [Text]
values :: FuzzySet -> [Text]
values =
    HashMap Text Text -> [Text]
forall k v. HashMap k v -> [v]
elems (HashMap Text Text -> [Text])
-> (FuzzySet -> HashMap Text Text) -> FuzzySet -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzySet -> HashMap Text Text
exactSet