{-# LANGUAGE RecordWildCards #-} -- | -- -- Module : Data.FuzzySet.Simple -- Copyright : (c) 2017-present Heikki Johannes Hildén -- License : BSD3 -- Maintainer : hildenjohannes@gmail.com -- Stability : experimental -- Portability : GHC -- module Data.FuzzySet.Simple ( -- * A note about the simple API -- -- | This module exposes a /pure/, simpler API for working with fuzzy sets. -- If you anticipate using the fuzzy search functionality in multiple -- places of your application, consider using the default monadic -- interface in 'Data.FuzzySet'. -- * How to use this module -- $howto -- * Types FuzzySet , FuzzyMatch -- * Initialization , emptySet , defaultSet , fromList -- * Insertion , addToSet , add , addManyToSet , addMany , (>+<) -- * Lookup , findMin , findOneMin , closestMatchMin , find , findOne , closestMatch -- * Inspection , values , size , isEmpty ) where import Data.FuzzySet.Internal ( FuzzySet(..) , FuzzyMatch , addMany_ , add_ , getMatches ) import Control.Monad.State (runState) import Data.Text (Text) import Data.Maybe (fromMaybe) import Data.HashMap.Strict (elems, lookup) import Data.FuzzySet.Utils (safeHead, (<$$>), (<$$$>)) import Data.Function ((&)) import qualified Data.Text as Text import qualified Data.Foldable as Foldable import Prelude hiding (lookup) -- $howto -- -- Make sure the @OverloadedStrings@ pragma is enabled and import the module: -- -- > import Data.FuzzySet.Simple -- -- After that, three steps are typically involved: -- -- 1. Create a set using one of 'defaultSet', 'emptySet', or 'fromList'. -- 2. To add entries, use 'add', 'addToSet', or 'addMany'. -- 3. Query the set with 'find', 'closestMatch', 'findMin', or 'closestMatchMin'. -- -- >>> closestMatch "percolator" (defaultSet >+< "Jurassic Park" >+< "Terminator" >+< "The Matrix") -- Just "Terminator" -- -- >>> find "Shaggy Jones" (defaultSet >+< "Shaggy Rogers" >+< "Fred Jones" >+< "Daphne Blake" >+< "Velma Dinkley") -- [(0.7692307692307693,"Shaggy Rogers"),(0.5,"Fred Jones")] -- -- There are also a few functions to inspect a set: 'size', 'isEmpty', and 'values'. -- -- == More examples -- -- > {-# LANGUAGE OverloadedStrings #-} -- > -- > import Data.FuzzySet.Simple -- > -- > states = [ "Alabama" , "Alaska" , "American Samoa" , "Arizona" , "Arkansas" -- > , "California" , "Colorado" , "Connecticut" , "Delaware" , "District of Columbia" -- > , "Florida" , "Georgia" , "Guam" , "Hawaii" , "Idaho" -- > , "Illinois" , "Indiana" , "Iowa" , "Kansas" , "Kentucky" -- > , "Louisiana" , "Maine" , "Maryland" , "Massachusetts" , "Michigan" -- > , "Minnesota" , "Mississippi" , "Missouri" , "Montana" , "Nebraska" -- > , "Nevada" , "New Hampshire" , "New Jersey" , "New Mexico" , "New York" -- > , "North Carolina" , "North Dakota" , "Northern Marianas Islands" , "Ohio" , "Oklahoma" -- > , "Oregon" , "Pennsylvania" , "Puerto Rico" , "Rhode Island" , "South Carolina" -- > , "South Dakota" , "Tennessee" , "Texas" , "Utah" , "Vermont" -- > , "Virginia" , "Virgin Islands" , "Washington" , "West Virginia" , "Wisconsin" -- > , "Wyoming" ] -- > -- > statesSet = fromList states -- > -- > main = mapM_ print (find "Burger Islands" statesSet) -- -- The output of this program is: -- -- > (0.7142857142857143,"Virgin Islands") -- > (0.5714285714285714,"Rhode Island") -- > (0.44,"Northern Marianas Islands") -- > (0.35714285714285715,"Maryland") -- -- Using the definition of @statesSet@ from previous example: -- -- > >>> find "Why-oh-me-ing" statesSet -- > [(0.5384615384615384,"Wyoming")] -- -- > >>> find "Connect a cat" statesSet -- > [(0.6923076923076923,"Connecticut")] -- -- > >>> find "Transylvania" statesSet -- > [(0.75,"Pennsylvania"),(0.3333333333333333,"California"),(0.3333333333333333,"Arkansas"),(0.3333333333333333,"Kansas")] -- -- > >>> find "CanOfSauce" statesSet -- > [(0.4,"Kansas")] -- -- > >>> find "Alaska" statesSet -- > [(1.0,"Alaska")] -- -- > >>> find "Alaskanbraskansas" statesSet -- > [(0.47058823529411764,"Arkansas"),(0.35294117647058826,"Kansas"),(0.35294117647058826,"Alaska"),(0.35294117647058826,"Alabama"),(0.35294117647058826,"Nebraska")] -- | Initialize an empty 'FuzzySet'. emptySet :: Int -- ^ Lower bound on gram sizes to use (inclusive) -> Int -- ^ Upper bound on gram sizes to use (inclusive) -> Bool -- ^ Whether or not to use the [Levenshtein distance](https://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Spring2006/assignments/editdistance/Levenshtein%20Distance.htm) -- to determine the score -> FuzzySet -- ^ An empty fuzzy string set emptySet :: Int -> Int -> Bool -> FuzzySet emptySet = HashMap Text Text -> HashMap Text [GramInfo] -> HashMap Int (Vector FuzzySetItem) -> Int -> Int -> Bool -> FuzzySet FuzzySet forall a. Monoid a => a mempty forall a. Monoid a => a mempty forall a. Monoid a => a mempty -- | An empty 'FuzzySet' with the following defaults: -- -- * Gram size lower: @2@ -- * Gram size upper: @3@ -- * Use Levenshtein distance: @True@ defaultSet :: FuzzySet defaultSet :: FuzzySet defaultSet = Int -> Int -> Bool -> FuzzySet emptySet Int 2 Int 3 Bool True -- | Try to match a string against the entries in the set, and return a list of -- all results with a score greater than or equal to the specified minimum -- score (i.e., the first argument). The results are ordered by similarity, -- with the closest match first. findMin :: Double -- ^ A minimum score -> Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> [FuzzyMatch] -- ^ A list of results (score and matched value) findMin :: Double -> Text -> FuzzySet -> [FuzzyMatch] findMin Double minScore Text str FuzzySet{Bool Int HashMap Int (Vector FuzzySetItem) HashMap Text [GramInfo] HashMap Text Text useLevenshtein :: FuzzySet -> Bool gramSizeUpper :: FuzzySet -> Int gramSizeLower :: FuzzySet -> Int items :: FuzzySet -> HashMap Int (Vector FuzzySetItem) matchDict :: FuzzySet -> HashMap Text [GramInfo] exactSet :: FuzzySet -> HashMap Text Text useLevenshtein :: Bool gramSizeUpper :: Int gramSizeLower :: Int items :: HashMap Int (Vector FuzzySetItem) matchDict :: HashMap Text [GramInfo] exactSet :: HashMap Text Text ..} = case Text key forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v `lookup` HashMap Text Text exactSet of Just Text exactMatch -> [(Double 1, Text exactMatch)] Maybe Text Nothing -> [Int gramSizeUpper, Int gramSizeUpper forall a. Num a => a -> a -> a - Int 1 .. Int gramSizeLower] forall a b. a -> (a -> b) -> b & forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b fmap (FuzzySet -> Text -> Double -> Int -> [FuzzyMatch] getMatches FuzzySet{Bool Int HashMap Int (Vector FuzzySetItem) HashMap Text [GramInfo] HashMap Text Text useLevenshtein :: Bool gramSizeUpper :: Int gramSizeLower :: Int items :: HashMap Int (Vector FuzzySetItem) matchDict :: HashMap Text [GramInfo] exactSet :: HashMap Text Text useLevenshtein :: Bool gramSizeUpper :: Int gramSizeLower :: Int items :: HashMap Int (Vector FuzzySetItem) matchDict :: HashMap Text [GramInfo] exactSet :: HashMap Text Text ..} Text key Double minScore) forall a b. a -> (a -> b) -> b & forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a Foldable.find (Bool -> Bool not forall b c a. (b -> c) -> (a -> b) -> a -> c . forall (t :: * -> *) a. Foldable t => t a -> Bool null) forall a b. a -> (a -> b) -> b & forall a. a -> Maybe a -> a fromMaybe [] where key :: Text key = Text -> Text Text.toLower Text str -- | Try to match the given string against the entries in the set using the -- specified minimum score and return the closest match, if one is found. findOneMin :: Double -- ^ A minimum score -> Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> Maybe FuzzyMatch -- ^ The closest match, if one is found findOneMin :: Double -> Text -> FuzzySet -> Maybe FuzzyMatch findOneMin = forall a. [a] -> Maybe a safeHead forall (f :: * -> *) (g :: * -> *) (h :: * -> *) a b. (Functor f, Functor g, Functor h) => (a -> b) -> f (g (h a)) -> f (g (h b)) <$$$> Double -> Text -> FuzzySet -> [FuzzyMatch] findMin -- | Try to match the given string against the entries in the set using the -- specified minimum score and return the string that most closely matches -- the input, if a match is found. closestMatchMin :: Double -- ^ A minimum score -> Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> Maybe Text -- ^ The string most closely matching the input, if a match is found closestMatchMin :: Double -> Text -> FuzzySet -> Maybe Text closestMatchMin = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b fmap forall a b. (a, b) -> b snd forall (f :: * -> *) (g :: * -> *) (h :: * -> *) a b. (Functor f, Functor g, Functor h) => (a -> b) -> f (g (h a)) -> f (g (h b)) <$$$> Double -> Text -> FuzzySet -> Maybe FuzzyMatch findOneMin -- | Try to match the given string against the entries in the set, using a -- minimum score of 0.33. Return a list of results ordered by similarity -- score, with the closest match first. Use 'findMin' if you need to specify -- a custom threshold value. find :: Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> [FuzzyMatch] -- ^ A list of results (score and matched value) find :: Text -> FuzzySet -> [FuzzyMatch] find = Double -> Text -> FuzzySet -> [FuzzyMatch] findMin Double 0.33 -- | Try to match the given string against the entries in the set, and return -- the closest match, if one is found. A minimum score of 0.33 is used. To -- specify a custom threshold value, instead use 'findOneMin'. findOne :: Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> Maybe FuzzyMatch -- ^ The closest match, if one is found findOne :: Text -> FuzzySet -> Maybe FuzzyMatch findOne = Double -> Text -> FuzzySet -> Maybe FuzzyMatch findOneMin Double 0.33 -- | Try to match the given string against the entries in the set, and return -- the string that most closely matches the input, if a match is found. A -- minimum score of 0.33 is used. To specify a custom threshold value, -- instead use 'closestMatchMin'. closestMatch :: Text -- ^ The string to search for -> FuzzySet -- ^ The fuzzy string set to compare the string against -> Maybe Text -- ^ The string most closely matching the input, if a match is found closestMatch :: Text -> FuzzySet -> Maybe Text closestMatch = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b fmap forall a b. (a, b) -> b snd forall (f :: * -> *) (g :: * -> *) a b. (Functor f, Functor g) => (a -> b) -> f (g a) -> f (g b) <$$> Text -> FuzzySet -> Maybe FuzzyMatch findOne -- | Add a string to the set, unless it is already present. A pair is returned -- consisting of a boolean which denotes whether or not anything was inserted, -- and the updated set. addToSet :: Text -- ^ The new entry -> FuzzySet -- ^ Fuzzy string set to add the entry to -> (Bool, FuzzySet) -- ^ A flag to indicate if the value was added (i.e., did not already exist -- in the set), and the updated set. addToSet :: Text -> FuzzySet -> (Bool, FuzzySet) addToSet = forall s a. State s a -> s -> (a, s) runState forall b c a. (b -> c) -> (a -> b) -> a -> c . forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool add_ -- | Add a string to the set, or do nothing if a key that matches the string -- already exists. add :: Text -- ^ The new entry -> FuzzySet -- ^ Set to add the string to -> FuzzySet -- ^ An updated set add :: Text -> FuzzySet -> FuzzySet add = forall a b. (a, b) -> b snd forall (f :: * -> *) (g :: * -> *) a b. (Functor f, Functor g) => (a -> b) -> f (g a) -> f (g b) <$$> Text -> FuzzySet -> (Bool, FuzzySet) addToSet -- | Infix operator to add entries to a 'FuzzySet', defined as @flip add@. (>+<) :: FuzzySet -- ^ Set to add the string to -> Text -- ^ The new entry -> FuzzySet -- ^ An updated set >+< :: FuzzySet -> Text -> FuzzySet (>+<) = forall a b c. (a -> b -> c) -> b -> a -> c flip Text -> FuzzySet -> FuzzySet add infixl 4 >+< -- | Add a list of strings to the set, all at once. -- -- Unless you need to know the subset of values that were actually inserted, -- use 'addMany' instead. addManyToSet :: [Text] -- ^ A list of strings to add to the set -> FuzzySet -- ^ The set to add the strings to -> ([Text], FuzzySet) -- ^ A pair where the first component is a list of all values that were -- inserted, and the second is the updated set. addManyToSet :: [Text] -> FuzzySet -> ([Text], FuzzySet) addManyToSet = forall s a. State s a -> s -> (a, s) runState forall b c a. (b -> c) -> (a -> b) -> a -> c . forall (m :: * -> *). MonadState FuzzySet m => [Text] -> m [Text] addMany_ -- | Add a list of strings to the set, all at once. -- -- This function is identical to 'addManyToSet', except that it only returns -- the set itself. If you need to know what values were inserted, then use the -- latter instead. addMany :: [Text] -- ^ A list of strings to add to the set -> FuzzySet -- ^ The set to add the strings to -> FuzzySet -- ^ The updated set addMany :: [Text] -> FuzzySet -> FuzzySet addMany = forall a b. (a, b) -> b snd forall (f :: * -> *) (g :: * -> *) a b. (Functor f, Functor g) => (a -> b) -> f (g a) -> f (g b) <$$> [Text] -> FuzzySet -> ([Text], FuzzySet) addManyToSet -- | Create a new set from a list of entries, using the default settings. fromList :: [Text] -- ^ A list of strings to insert into the new set -> FuzzySet -- ^ A new fuzzy string set fromList :: [Text] -> FuzzySet fromList = ([Text] -> FuzzySet -> FuzzySet `addMany` FuzzySet defaultSet) -- | Return the elements of the set. No particular order is guaranteed. -- -- >>> values (fromList ["bass", "craze", "space", "lace", "daze", "haze", "ace", "maze"]) -- ["space","daze","bass","maze","ace","craze","lace","haze"] values :: FuzzySet -> [Text] values :: FuzzySet -> [Text] values = forall k v. HashMap k v -> [v] elems forall b c a. (b -> c) -> (a -> b) -> a -> c . FuzzySet -> HashMap Text Text exactSet -- | Return the number of entries in the set. -- -- >>> size (defaultSet >+< "map" >+< "cap") -- 2 -- >>> size (defaultSet >+< "bork" >+< "bork" >+< "bork") -- 1 size :: FuzzySet -> Int size :: FuzzySet -> Int size = forall (t :: * -> *) a. Foldable t => t a -> Int length forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> FuzzySet -> [Text] values -- | Return a boolean indicating whether the set is empty. -- -- >>> isEmpty (fromList []) -- True -- >>> isEmpty $ fromList ["Aramis", "Porthos", "Athos"] -- False isEmpty :: FuzzySet -> Bool isEmpty :: FuzzySet -> Bool isEmpty = forall (t :: * -> *) a. Foldable t => t a -> Bool null forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b <$> FuzzySet -> [Text] values