module Data.FuzzySet.Util
    ( normalized
    , substr
    , enclosedIn
    , norm
    , distance
    ) where

import Data.Char (isAlphaNum, isSpace)
import Data.Text (Text, cons, snoc)
import Data.Text.Metrics (levenshteinNorm)
import qualified Data.Text as Text


-- | Normalize the input by
--
--   * removing non-word characters, except for spaces and commas; and
--   * converting alphabetic characters to lowercase.
--
normalized :: Text -> Text
{-# INLINE normalized #-}
normalized :: Text -> Text
normalized =
    (Char -> Bool) -> Text -> Text
Text.filter Char -> Bool
word (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
Text.toLower
  where
    word :: Char -> Bool
word Char
char
        | Char -> Bool
isAlphaNum Char
char = Bool
True
        | Char -> Bool
isSpace Char
char = Bool
True
        | Char
char Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
',' = Bool
True
        | Bool
otherwise = Bool
False


-- | Return /n/ characters starting from offset /m/ in the input string.
--
substr
    :: Int
    -- ^ Length of the substring
    -> Int
    -- ^ The character offset /m/
    -> Text
    -- ^ The input string
    -> Text
    -- ^ A substring of length /n/
{-# INLINE substr #-}
substr :: Int -> Int -> Text -> Text
substr Int
n Int
m =
    Int -> Text -> Text
Text.take Int
n (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
Text.drop Int
m


-- | Insert a character at the beginning and end of the given string.
--
enclosedIn :: Text -> Char -> Text
{-# INLINE enclosedIn #-}
enclosedIn :: Text -> Char -> Text
enclosedIn Text
str Char
char =
    Char
char Char -> Text -> Text
`cons` Text
str Text -> Char -> Text
`snoc` Char
char


-- | Returns the euclidean norm, or /magnitude/, of the input list interpreted
-- as a vector.
--
-- That is,
--
-- \( \quad \sqrt{ \sum_{i=0}^n a_i^2 } \)
--
-- for the input
--
-- \( \quad \langle a_0, a_1, \dots, a_n \rangle \)
--
-- where \( a_i \) is the element at position /i/ in the input list.
--
norm :: (Integral a, Floating b) => [a] -> b
norm :: [a] -> b
norm =
    b -> b
forall a. Floating a => a -> a
sqrt (b -> b) -> ([a] -> b) -> [a] -> b
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral (a -> b) -> ([a] -> a) -> [a] -> b
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum ([a] -> a) -> ([a] -> [a]) -> [a] -> a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> a) -> [a] -> [a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (a -> Integer -> a
forall a b. (Num a, Integral b) => a -> b -> a
^Integer
2)


-- | Return the normalized Levenshtein distance between the two strings.
-- See <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
distance :: Text -> Text -> Double
distance :: Text -> Text -> Double
distance Text
s Text
t =
    Rational -> Double
forall a. Fractional a => Rational -> a
fromRational (Ratio Int -> Rational
forall a. Real a => a -> Rational
toRational Ratio Int
dist)
  where
    dist :: Ratio Int
dist = Text -> Text -> Ratio Int
levenshteinNorm Text
s Text
t