-- | Text and numeric helper functions.
module Data.FuzzySet.Util
  ( normalized
  , substr
  , enclosedIn
  , norm
  , distance
  ) where

import Data.Char (isAlphaNum, isSpace)
import Data.Text (Text, cons, snoc)
import Data.Text.Metrics (levenshteinNorm)
import qualified Data.Text as Text

-- | Normalize the input by
--
--   * converting alphabetic characters to lowercase; and
--   * removing every character that is not alphanumeric, whitespace, or a
--     comma.
--
-- Lowercasing is applied first, then the filter runs over the result.
normalized :: Text -> Text
{-# INLINE normalized #-}
normalized = Text.filter keep . Text.toLower
  where
    -- Characters that survive normalization.
    keep char = isAlphaNum char || isSpace char || char == ','

-- | Return /n/ characters starting from offset /m/ in the input string.
--
-- The result is shorter than /n/ when fewer than /n/ characters remain
-- after skipping the first /m/.
substr
  :: Int   -- ^ Length of the substring /n/
  -> Int   -- ^ The character offset /m/
  -> Text  -- ^ The input string
  -> Text  -- ^ A substring of length at most /n/
{-# INLINE substr #-}
substr n m = Text.take n . Text.drop m

-- | Insert a character at the beginning and end of the given string.
enclosedIn
  :: Text  -- ^ The string to wrap
  -> Char  -- ^ The character to add on both sides
  -> Text
{-# INLINE enclosedIn #-}
-- Written with prefix application so the result does not depend on the
-- relative fixities of 'cons' and 'snoc'.
enclosedIn str char = cons char (snoc str char)

-- | Returns the euclidean norm, or /magnitude/, of the input list interpreted
--   as a vector.
--
-- That is,
--
-- \( \quad \sqrt{ \sum_{i=0}^n a_i^2 } \)
--
-- for the input
--
-- \( \quad \langle a_0, a_1, \dots, a_n \rangle \)
--
-- where \( a_i \) is the element at position /i/ in the input list.
--
-- The squares are summed in the integral type @a@ before the conversion to
-- the floating-point result type. The norm of the empty list is @0@.
norm :: (Integral a, Floating b) => [a] -> b
norm = sqrt . fromIntegral . sum . fmap square
  where
    -- Explicit multiplication avoids the defaulted 'Integer' exponent that
    -- @(^2)@ would introduce.
    square x = x * x

-- | Return the normalized Levenshtein distance between the two strings, as
--   computed by 'levenshteinNorm' and converted to a 'Double'.
--
-- According to the text-metrics documentation the result lies between @0@
-- (no similarity) and @1@ (exact match).
--
-- See https://en.wikipedia.org/wiki/Levenshtein_distance.
distance :: Text -> Text -> Double
distance s t = realToFrac (levenshteinNorm s t)