Safe Haskell	Safe-Inferred
Language	Haskell2010

Data.FuzzySet.Internal

Synopsis

data FuzzySet = FuzzySet {
- exactSet :: !(HashMap Text Text)
- matchDict :: !(HashMap Text [GramInfo])
- items :: !(HashMap Int (Vector FuzzySetItem))
- gramSizeLower :: !Int
- gramSizeUpper :: !Int
- useLevenshtein :: !Bool
}
data FuzzySetItem = FuzzySetItem {
- vectorMagnitude :: !Double
- normalizedEntry :: !Text
}
data GramInfo = GramInfo {
- itemIndex :: !Int
- gramCount :: !Int
}
type FuzzyMatch = (Double, Text)
grams :: Text -> Int -> [Text]
gramVector :: Text -> Int -> HashMap Text Int
matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int
getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
add_ :: MonadState FuzzySet m => Text -> m Bool
addMany_ :: MonadState FuzzySet m => [Text] -> m [Text]
normalized :: Text -> Text
norm :: [Int] -> Double
distance :: Text -> Text -> Double

Documentation

data FuzzySet Source #

Main fuzzy string set data type.

Constructors

FuzzySet
Fields
exactSet :: !(HashMap Text Text)
matchDict :: !(HashMap Text [GramInfo])
items :: !(HashMap Int (Vector FuzzySetItem))
gramSizeLower :: !Int — Lower bound on gram sizes to use (inclusive).
gramSizeUpper :: !Int — Upper bound on gram sizes to use (inclusive).
useLevenshtein :: !Bool — Whether or not to use the Levenshtein distance to determine the score.

Instances

Instances details

Show FuzzySet Source #
Instance details Defined in Data.FuzzySet.Internal Methods showsPrec :: Int -> FuzzySet -> ShowS # show :: FuzzySet -> String # showList :: [FuzzySet] -> ShowS #
Eq FuzzySet Source #
Instance details Defined in Data.FuzzySet.Internal Methods (==) :: FuzzySet -> FuzzySet -> Bool # (/=) :: FuzzySet -> FuzzySet -> Bool #
Monad m => MonadState FuzzySet (FuzzySearchT m) Source #
Instance details Defined in Data.FuzzySet.Monad Methods get :: FuzzySearchT m FuzzySet # put :: FuzzySet -> FuzzySearchT m () # state :: (FuzzySet -> (a, FuzzySet)) -> FuzzySearchT m a #
MonadFuzzySearch m => MonadFuzzySearch (StateT FuzzySet m) Source #
Instance details Defined in Data.FuzzySet.Monad Methods add :: Text -> StateT FuzzySet m Bool Source # findMin :: Double -> Text -> StateT FuzzySet m [FuzzyMatch] Source #

data FuzzySetItem Source #

Constructors

FuzzySetItem
Fields vectorMagnitude :: !Double normalizedEntry :: !Text

Instances

Instances details

Show FuzzySetItem Source #
Instance details Defined in Data.FuzzySet.Internal Methods showsPrec :: Int -> FuzzySetItem -> ShowS # show :: FuzzySetItem -> String # showList :: [FuzzySetItem] -> ShowS #
Eq FuzzySetItem Source #
Instance details Defined in Data.FuzzySet.Internal Methods (==) :: FuzzySetItem -> FuzzySetItem -> Bool # (/=) :: FuzzySetItem -> FuzzySetItem -> Bool #

data GramInfo Source #

Constructors

GramInfo
Fields itemIndex :: !Int gramCount :: !Int

Instances

Instances details

Show GramInfo Source #
Instance details Defined in Data.FuzzySet.Internal Methods showsPrec :: Int -> GramInfo -> ShowS # show :: GramInfo -> String # showList :: [GramInfo] -> ShowS #
Eq GramInfo Source #
Instance details Defined in Data.FuzzySet.Internal Methods (==) :: GramInfo -> GramInfo -> Bool # (/=) :: GramInfo -> GramInfo -> Bool #

type FuzzyMatch = (Double, Text) Source #

An individual result when looking up a string in the set, consisting of

- a similarity score in the range \([0, 1]\), and
- the matching string.

grams :: Text -> Int -> [Text] Source #

Break apart the input string into a list of n-grams. The string is first normalized and enclosed in hyphens. We then take all substrings of length n, letting the offset range from \(0 \text{ to } s + 2 - n\), where s is the length of the normalized input.

Example: The string "Destroido Corp." is first normalized to "destroido corp", and then enclosed in hyphens, so that it becomes "-destroido corp-". The trigrams generated from this normalized string are:

[ "-de"
, "des"
, "est"
, "str"
, "tro"
, "roi"
, "oid"
, "ido"
, "do "
, "o c"
, " co"
, "cor"
, "orp"
, "rp-"
]

gramVector :: Text -> Int -> HashMap Text Int Source #

Generate a list of n-grams (character substrings) from the normalized input and then translate this into a dictionary with the n-grams as keys mapping to the number of occurrences of the substring in the list.

>>> gramVector "xxxx" 2
fromList [("-x",1), ("xx",3), ("x-",1)]

The substring "xx" appears three times in the normalized string:

>>> grams "xxxx" 2
["-x","xx","xx","xx","x-"]

>>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
Just 8

matches :: FuzzySet -> HashMap Text Int -> HashMap Int Int Source #

getMatches :: FuzzySet -> Text -> Double -> Int -> [FuzzyMatch] Source #

add_ :: MonadState FuzzySet m => Text -> m Bool Source #

addMany_ :: MonadState FuzzySet m => [Text] -> m [Text] Source #

normalized :: Text -> Text Source #

Normalize the input by

- removing non-word characters, except for spaces and commas; and
- converting alphabetic characters to lowercase.

norm :: [Int] -> Double Source #

Return the Euclidean norm, or magnitude, of the input list interpreted as a vector.

That is,

\( \quad \sqrt{ \sum_{i=0}^n a_i^2 } \)

for the input

\( \quad \langle a_0, a_1, \dots, a_n \rangle \)

where \( a_i \) is the element at position i in the input list.

distance :: Text -> Text -> Double Source #

Return the normalized Levenshtein distance between the two strings.

See https://en.wikipedia.org/wiki/Levenshtein_distance.