{-# LANGUAGE RecordWildCards #-}

-- |
--
-- Module      : Data.FuzzySet.Simple
-- Copyright   : (c) 2017-present Heikki Johannes Hildén
-- License     : BSD3
-- Maintainer  : hildenjohannes@gmail.com
-- Stability   : experimental
-- Portability : GHC
--
module Data.FuzzySet.Simple
  (
    -- * A note about the simple API
    --
    -- | This module exposes a /pure/, simpler API for working with fuzzy sets.
    --   If you anticipate using the fuzzy search functionality in multiple
    --   places of your application, consider using the default monadic
    --   interface in 'Data.FuzzySet'.

    -- * How to use this module
    -- $howto

    -- * Types
    FuzzySet
  , FuzzyMatch

    -- * Initialization
  , emptySet
  , defaultSet
  , fromList

    -- * Insertion
  , addToSet
  , add
  , addManyToSet
  , addMany
  , (>+<)

    -- * Lookup
  , findMin
  , findOneMin
  , closestMatchMin
  , find
  , findOne
  , closestMatch

    -- * Inspection
  , values
  , size
  , isEmpty
  ) where

import Data.FuzzySet.Internal
  ( FuzzySet(..)
  , FuzzyMatch
  , addMany_
  , add_
  , getMatches
  )

import Control.Monad.State (runState)
import Data.Text (Text)
import Data.Maybe (fromMaybe)
import Data.HashMap.Strict (elems, lookup)
import Data.FuzzySet.Utils (safeHead, (<$$>), (<$$$>))
import Data.Function ((&))
import qualified Data.Text as Text
import qualified Data.Foldable as Foldable
import Prelude hiding (lookup)

-- $howto
--
-- Make sure the @OverloadedStrings@ pragma is enabled and import the module:
--
-- > import Data.FuzzySet.Simple
--
-- After that, three steps are typically involved:
--
--   1. Create a set using one of 'defaultSet', 'emptySet', or 'fromList'.
--   2. To add entries, use 'add', 'addToSet', or 'addMany'.
--   3. Query the set with 'find', 'closestMatch', 'findMin', or 'closestMatchMin'.
--
-- >>> closestMatch "percolator" (defaultSet >+< "Jurassic Park" >+< "Terminator" >+< "The Matrix")
-- Just "Terminator"
--
-- >>> find "Shaggy Jones" (defaultSet >+< "Shaggy Rogers" >+< "Fred Jones" >+< "Daphne Blake" >+< "Velma Dinkley")
-- [(0.7692307692307693,"Shaggy Rogers"),(0.5,"Fred Jones")]
--
-- There are also a few functions to inspect a set: 'size', 'isEmpty', and 'values'.
--
-- == More examples
--
-- > {-# LANGUAGE OverloadedStrings #-}
-- >
-- > import Data.FuzzySet.Simple
-- >
-- > states = [ "Alabama"        , "Alaska"         , "American Samoa"            , "Arizona"       , "Arkansas"
-- >          , "California"     , "Colorado"       , "Connecticut"               , "Delaware"      , "District of Columbia"
-- >          , "Florida"        , "Georgia"        , "Guam"                      , "Hawaii"        , "Idaho"
-- >          , "Illinois"       , "Indiana"        , "Iowa"                      , "Kansas"        , "Kentucky"
-- >          , "Louisiana"      , "Maine"          , "Maryland"                  , "Massachusetts" , "Michigan"
-- >          , "Minnesota"      , "Mississippi"    , "Missouri"                  , "Montana"       , "Nebraska"
-- >          , "Nevada"         , "New Hampshire"  , "New Jersey"                , "New Mexico"    , "New York"
-- >          , "North Carolina" , "North Dakota"   , "Northern Marianas Islands" , "Ohio"          , "Oklahoma"
-- >          , "Oregon"         , "Pennsylvania"   , "Puerto Rico"               , "Rhode Island"  , "South Carolina"
-- >          , "South Dakota"   , "Tennessee"      , "Texas"                     , "Utah"          , "Vermont"
-- >          , "Virginia"       , "Virgin Islands" , "Washington"                , "West Virginia" , "Wisconsin"
-- >          , "Wyoming" ]
-- >
-- > statesSet = fromList states
-- >
-- > main = mapM_ print (find "Burger Islands" statesSet)
--
-- The output of this program is:
--
-- > (0.7142857142857143,"Virgin Islands")
-- > (0.5714285714285714,"Rhode Island")
-- > (0.44,"Northern Marianas Islands")
-- > (0.35714285714285715,"Maryland")
--
--  Using the definition of @statesSet@ from previous example:
--
-- > >>> find "Why-oh-me-ing" statesSet
-- > [(0.5384615384615384,"Wyoming")]
--
-- > >>> find "Connect a cat" statesSet
-- > [(0.6923076923076923,"Connecticut")]
--
-- > >>> find "Transylvania" statesSet
-- > [(0.75,"Pennsylvania"),(0.3333333333333333,"California"),(0.3333333333333333,"Arkansas"),(0.3333333333333333,"Kansas")]
--
-- > >>> find "CanOfSauce" statesSet
-- > [(0.4,"Kansas")]
--
-- > >>> find "Alaska" statesSet
-- > [(1.0,"Alaska")]
--
-- > >>> find "Alaskanbraskansas" statesSet
-- > [(0.47058823529411764,"Arkansas"),(0.35294117647058826,"Kansas"),(0.35294117647058826,"Alaska"),(0.35294117647058826,"Alabama"),(0.35294117647058826,"Nebraska")]

-- | Initialize an empty 'FuzzySet'.
emptySet
  :: Int
  -- ^ Lower bound on gram sizes to use (inclusive)
  -> Int
  -- ^ Upper bound on gram sizes to use (inclusive)
  -> Bool
  -- ^ Whether or not to use the [Levenshtein distance](https://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Spring2006/assignments/editdistance/Levenshtein%20Distance.htm)
  -- to determine the score
  -> FuzzySet
  -- ^ An empty fuzzy string set
emptySet :: Int -> Int -> Bool -> FuzzySet
emptySet = HashMap Text Text
-> HashMap Text [GramInfo]
-> HashMap Int (Vector FuzzySetItem)
-> Int
-> Int
-> Bool
-> FuzzySet
FuzzySet forall a. Monoid a => a
mempty forall a. Monoid a => a
mempty forall a. Monoid a => a
mempty

-- | An empty 'FuzzySet' with the following defaults:
--
--   * Gram size lower: @2@
--   * Gram size upper: @3@
--   * Use Levenshtein distance: @True@
defaultSet :: FuzzySet
defaultSet :: FuzzySet
defaultSet = Int -> Int -> Bool -> FuzzySet
emptySet Int
2 Int
3 Bool
True

-- | Try to match a string against the entries in the set, and return a list of
--   all results with a score greater than or equal to the specified minimum
--   score (i.e., the first argument). The results are ordered by similarity,
--   with the closest match first.
findMin
  :: Double
  -- ^ A minimum score
  -> Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> [FuzzyMatch]
  -- ^ A list of results (score and matched value)
findMin :: Double -> Text -> FuzzySet -> [FuzzyMatch]
findMin Double
minScore Text
str FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: FuzzySet -> Bool
gramSizeUpper :: FuzzySet -> Int
gramSizeLower :: FuzzySet -> Int
items :: FuzzySet -> HashMap Int (Vector FuzzySetItem)
matchDict :: FuzzySet -> HashMap Text [GramInfo]
exactSet :: FuzzySet -> HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} =
  case Text
key forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
`lookup` HashMap Text Text
exactSet of
    Just Text
exactMatch ->
      [(Double
1, Text
exactMatch)]
    Maybe Text
Nothing ->
      [Int
gramSizeUpper, Int
gramSizeUpper forall a. Num a => a -> a -> a
- Int
1 .. Int
gramSizeLower]
        forall a b. a -> (a -> b) -> b
& forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (FuzzySet -> Text -> Double -> Int -> [FuzzyMatch]
getMatches FuzzySet{Bool
Int
HashMap Int (Vector FuzzySetItem)
HashMap Text [GramInfo]
HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
useLevenshtein :: Bool
gramSizeUpper :: Int
gramSizeLower :: Int
items :: HashMap Int (Vector FuzzySetItem)
matchDict :: HashMap Text [GramInfo]
exactSet :: HashMap Text Text
..} Text
key Double
minScore)
        forall a b. a -> (a -> b) -> b
& forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
Foldable.find (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Foldable t => t a -> Bool
null)
        forall a b. a -> (a -> b) -> b
& forall a. a -> Maybe a -> a
fromMaybe []
  where
    key :: Text
key = Text -> Text
Text.toLower Text
str

-- | Try to match the given string against the entries in the set using the
--   specified minimum score and return the closest match, if one is found.
findOneMin
  :: Double
  -- ^ A minimum score
  -> Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> Maybe FuzzyMatch
  -- ^ The closest match, if one is found
findOneMin :: Double -> Text -> FuzzySet -> Maybe FuzzyMatch
findOneMin = forall a. [a] -> Maybe a
safeHead forall (f :: * -> *) (g :: * -> *) (h :: * -> *) a b.
(Functor f, Functor g, Functor h) =>
(a -> b) -> f (g (h a)) -> f (g (h b))
<$$$> Double -> Text -> FuzzySet -> [FuzzyMatch]
findMin

-- | Try to match the given string against the entries in the set using the
--   specified minimum score and return the string that most closely matches
--   the input, if a match is found.
closestMatchMin
  :: Double
  -- ^ A minimum score
  -> Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> Maybe Text
  -- ^ The string most closely matching the input, if a match is found
closestMatchMin :: Double -> Text -> FuzzySet -> Maybe Text
closestMatchMin = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (a, b) -> b
snd forall (f :: * -> *) (g :: * -> *) (h :: * -> *) a b.
(Functor f, Functor g, Functor h) =>
(a -> b) -> f (g (h a)) -> f (g (h b))
<$$$> Double -> Text -> FuzzySet -> Maybe FuzzyMatch
findOneMin

-- | Try to match the given string against the entries in the set, using a
--   minimum score of 0.33. Return a list of results ordered by similarity
--   score, with the closest match first. Use 'findMin' if you need to specify
--   a custom threshold value.
find
  :: Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> [FuzzyMatch]
  -- ^ A list of results (score and matched value)
find :: Text -> FuzzySet -> [FuzzyMatch]
find = Double -> Text -> FuzzySet -> [FuzzyMatch]
findMin Double
0.33

-- | Try to match the given string against the entries in the set, and return
--   the closest match, if one is found. A minimum score of 0.33 is used. To
--   specify a custom threshold value, instead use 'findOneMin'.
findOne
  :: Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> Maybe FuzzyMatch
  -- ^ The closest match, if one is found
findOne :: Text -> FuzzySet -> Maybe FuzzyMatch
findOne = Double -> Text -> FuzzySet -> Maybe FuzzyMatch
findOneMin Double
0.33

-- | Try to match the given string against the entries in the set, and return
--   the string that most closely matches the input, if a match is found. A
--   minimum score of 0.33 is used. To specify a custom threshold value,
--   instead use 'closestMatchMin'.
closestMatch
  :: Text
  -- ^ The string to search for
  -> FuzzySet
  -- ^ The fuzzy string set to compare the string against
  -> Maybe Text
  -- ^ The string most closely matching the input, if a match is found
closestMatch :: Text -> FuzzySet -> Maybe Text
closestMatch = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (a, b) -> b
snd forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> FuzzySet -> Maybe FuzzyMatch
findOne

-- | Add a string to the set, unless it is already present. A pair is returned
--   consisting of a boolean which denotes whether or not anything was inserted,
--   and the updated set.
addToSet
  :: Text
  -- ^ The new entry
  -> FuzzySet
  -- ^ Fuzzy string set to add the entry to
  -> (Bool, FuzzySet)
  -- ^ A flag to indicate if the value was added (i.e., did not already exist
  --   in the set), and the updated set.
addToSet :: Text -> FuzzySet -> (Bool, FuzzySet)
addToSet = forall s a. State s a -> s -> (a, s)
runState forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (m :: * -> *). MonadState FuzzySet m => Text -> m Bool
add_

-- | Add a string to the set, or do nothing if a key that matches the string
--   already exists.
add
  :: Text
  -- ^ The new entry
  -> FuzzySet
  -- ^ Set to add the string to
  -> FuzzySet
  -- ^ An updated set
add :: Text -> FuzzySet -> FuzzySet
add = forall a b. (a, b) -> b
snd forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> Text -> FuzzySet -> (Bool, FuzzySet)
addToSet

-- | Infix operator to add entries to a 'FuzzySet', defined as @flip add@.
(>+<)
  :: FuzzySet
  -- ^ Set to add the string to
  -> Text
  -- ^ The new entry
  -> FuzzySet
  -- ^ An updated set
>+< :: FuzzySet -> Text -> FuzzySet
(>+<) = forall a b c. (a -> b -> c) -> b -> a -> c
flip Text -> FuzzySet -> FuzzySet
add

infixl 4 >+<

-- | Add a list of strings to the set, all at once.
--
-- Unless you need to know the subset of values that were actually inserted,
-- use 'addMany' instead.
addManyToSet
  :: [Text]
  -- ^ A list of strings to add to the set
  -> FuzzySet
  -- ^ The set to add the strings to
  -> ([Text], FuzzySet)
  -- ^ A pair where the first component is a list of all values that were
  --   inserted, and the second is the updated set.
addManyToSet :: [Text] -> FuzzySet -> ([Text], FuzzySet)
addManyToSet = forall s a. State s a -> s -> (a, s)
runState forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (m :: * -> *). MonadState FuzzySet m => [Text] -> m [Text]
addMany_

-- | Add a list of strings to the set, all at once.
--
-- This function is identical to 'addManyToSet', except that it only returns
-- the set itself. If you need to know what values were inserted, then use the
-- latter instead.
addMany
  :: [Text]
  -- ^ A list of strings to add to the set
  -> FuzzySet
  -- ^ The set to add the strings to
  -> FuzzySet
  -- ^ The updated set
addMany :: [Text] -> FuzzySet -> FuzzySet
addMany = forall a b. (a, b) -> b
snd forall (f :: * -> *) (g :: * -> *) a b.
(Functor f, Functor g) =>
(a -> b) -> f (g a) -> f (g b)
<$$> [Text] -> FuzzySet -> ([Text], FuzzySet)
addManyToSet

-- | Create a new set from a list of entries, using the default settings.
fromList
  :: [Text]
  -- ^ A list of strings to insert into the new set
  -> FuzzySet
  -- ^ A new fuzzy string set
fromList :: [Text] -> FuzzySet
fromList = ([Text] -> FuzzySet -> FuzzySet
`addMany` FuzzySet
defaultSet)

-- | Return the elements of the set. No particular order is guaranteed.
--
-- >>> values (fromList ["bass", "craze", "space", "lace", "daze", "haze", "ace", "maze"])
-- ["space","daze","bass","maze","ace","craze","lace","haze"]
values :: FuzzySet -> [Text]
values :: FuzzySet -> [Text]
values = forall k v. HashMap k v -> [v]
elems forall b c a. (b -> c) -> (a -> b) -> a -> c
. FuzzySet -> HashMap Text Text
exactSet

-- | Return the number of entries in the set.
--
-- >>> size (defaultSet >+< "map" >+< "cap")
-- 2
-- >>> size (defaultSet >+< "bork" >+< "bork" >+< "bork")
-- 1
size :: FuzzySet -> Int
size :: FuzzySet -> Int
size = forall (t :: * -> *) a. Foldable t => t a -> Int
length forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FuzzySet -> [Text]
values

-- | Return a boolean indicating whether the set is empty.
--
-- >>> isEmpty (fromList [])
-- True
-- >>> isEmpty $ fromList ["Aramis", "Porthos", "Athos"]
-- False
isEmpty :: FuzzySet -> Bool
isEmpty :: FuzzySet -> Bool
isEmpty = forall (t :: * -> *) a. Foldable t => t a -> Bool
null forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FuzzySet -> [Text]
values