{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MultiWayIf #-}

-- |
-- Module      :  Data.Text.Metrics
-- Copyright   :  © 2016–present Mark Karpov
-- License     :  BSD 3 clause
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- The module provides efficient implementations of various strings metric
-- algorithms. It works with strict 'Text' values.
--
-- __Note__: before version /0.3.0/ the package used C implementations of
-- the algorithms under the hood. Beginning from version /0.3.0/, the
-- implementations are written in Haskell while staying almost as fast, see:
--
-- <https://markkarpov.com/post/migrating-text-metrics.html>
module Data.Text.Metrics
  ( -- * Levenshtein variants
    levenshtein,
    levenshteinNorm,
    damerauLevenshtein,
    damerauLevenshteinNorm,

    -- * Treating inputs like sets
    overlap,
    jaccard,

    -- * Other
    hamming,
    jaro,
    jaroWinkler,
  )
where

import Control.Monad
import Control.Monad.ST
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import Data.Ratio
import Data.Text
import qualified Data.Text as T
import qualified Data.Text.Unsafe as TU
import qualified Data.Vector.Unboxed.Mutable as VUM
import GHC.Exts (inline)

----------------------------------------------------------------------------
-- Levenshtein variants

-- | Return the Levenshtein distance between two 'Text' values. The
-- Levenshtein distance between two strings is the minimal number of
-- operations necessary to transform one string into another. For the
-- Levenshtein distance allowed operations are: deletion, insertion, and
-- substitution.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.
levenshtein :: Text -> Text -> Int
levenshtein :: Text -> Text -> Int
levenshtein Text
a Text
b = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (Text -> Text -> (Int, Int)
levenshtein_ Text
a Text
b)

-- | Return the normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm Text -> Text -> (Int, Int)
levenshtein_

-- | An internal helper, returns the Levenshtein distance as the first
-- element of the tuple and max length of the two inputs as the second
-- element of the tuple.
levenshtein_ :: Text -> Text -> (Int, Int)
levenshtein_ :: Text -> Text -> (Int, Int)
levenshtein_ Text
a Text
b
  | Text -> Bool
T.null Text
a = (Int
lenb, Int
lenm)
  | Text -> Bool
T.null Text
b = (Int
lena, Int
lenm)
  | Bool
otherwise = (forall s. ST s (Int, Int)) -> (Int, Int)
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Int, Int)) -> (Int, Int))
-> (forall s. ST s (Int, Int)) -> (Int, Int)
forall a b. (a -> b) -> a -> b
$ do
    let v_len :: Int
v_len = Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
    MVector s Int
v <- Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
2)
    let gov :: Int -> ST s ()
gov !Int
i =
          Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
v_len) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
            MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
i Int
i
            Int -> ST s ()
gov (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
        goi :: Int -> Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Int
v0 !Int
v1 = do
          let !(TU.Iter Char
ai Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
              goj :: Int -> Int -> ST s ()
goj !Int
j !Int
nb =
                Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenb) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                  let !(TU.Iter Char
bj Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                      cost :: Int
cost = if Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj then Int
0 else Int
1
                  Int
x <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                  Int
y <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                  Int
z <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                  MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
x (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
y Int
z))
                  Int -> Int -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db)
          Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
            MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
v1 (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
            Int -> Int -> ST s ()
goj Int
0 Int
0
            Int -> Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Int
v1 Int
v0
    Int -> ST s ()
gov Int
0
    Int -> Int -> Int -> Int -> ST s ()
goi Int
0 Int
0 Int
0 Int
v_len
    Int
ld <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ if Int -> Bool
forall a. Integral a => a -> Bool
even Int
lena then Int
0 else Int
v_len)
    (Int, Int) -> ST s (Int, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
ld, Int
lenm)
  where
    lena :: Int
lena = Text -> Int
T.length Text
a
    lenb :: Int
lenb = Text -> Int
T.length Text
b
    lenm :: Int
lenm = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb
{-# INLINE levenshtein_ #-}

-- | Return the Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned
-- 'Data.Numeric.Natural'.
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein Text
a Text
b = (Int, Int) -> Int
forall a b. (a, b) -> a
fst (Text -> Text -> (Int, Int)
damerauLevenshtein_ Text
a Text
b)

-- | Return the normalized Damerau-Levenshtein distance between two 'Text'
-- values. 0 signifies no similarity between the strings, while 1 means
-- exact match.
--
-- See also: <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm Text -> Text -> (Int, Int)
damerauLevenshtein_

-- | An internal helper, returns the Damerau-Levenshtein distance as the
-- first element of the tuple and max length of the two inputs as the second
-- element of the tuple.
damerauLevenshtein_ :: Text -> Text -> (Int, Int)
damerauLevenshtein_ :: Text -> Text -> (Int, Int)
damerauLevenshtein_ Text
a Text
b
  | Text -> Bool
T.null Text
a = (Int
lenb, Int
lenm)
  | Text -> Bool
T.null Text
b = (Int
lena, Int
lenm)
  | Bool
otherwise = (forall s. ST s (Int, Int)) -> (Int, Int)
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Int, Int)) -> (Int, Int))
-> (forall s. ST s (Int, Int)) -> (Int, Int)
forall a b. (a -> b) -> a -> b
$ do
    let v_len :: Int
v_len = Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1
    MVector s Int
v <- Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> m (MVector (PrimState m) a)
VUM.unsafeNew (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
3)
    let gov :: Int -> ST s ()
gov !Int
i =
          Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
v_len) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
            MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
i Int
i
            Int -> ST s ()
gov (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
        goi :: Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Char
ai_1 !Int
v0 !Int
v1 !Int
v2 = do
          let !(TU.Iter Char
ai Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
              goj :: Int -> Int -> Char -> ST s ()
goj !Int
j !Int
nb !Char
bj_1 =
                Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lenb) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                  let !(TU.Iter Char
bj Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                      cost :: Int
cost = if Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj then Int
0 else Int
1
                  Int
x <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                  Int
y <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
                  Int
z <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j)
                  let g :: Int
g = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
x (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
y Int
z)
                  Int
val <- (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
cost) (Int -> Int) -> ST s Int -> ST s Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v2 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
                  MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v (Int
v1 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int -> ST s ()) -> Int -> ST s ()
forall a b. (a -> b) -> a -> b
$
                    if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj_1 Bool -> Bool -> Bool
&& Char
ai_1 Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj Bool -> Bool -> Bool
&& Int
val Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
g
                      then Int
val
                      else Int
g
                  Int -> Int -> Char -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) Char
bj
          Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
            MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
v1 (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
            Int -> Int -> Char -> ST s ()
goj Int
0 Int
0 Char
'a'
            Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Char
ai Int
v1 Int
v2 Int
v0
    Int -> ST s ()
gov Int
0
    Int -> Int -> Char -> Int -> Int -> Int -> ST s ()
goi Int
0 Int
0 Char
'a' Int
0 Int
v_len (Int
v_len Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
2)
    Int
ld <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v (Int
lenb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
lena Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
3) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
v_len)
    (Int, Int) -> ST s (Int, Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Int
ld, Int
lenm)
  where
    lena :: Int
lena = Text -> Int
T.length Text
a
    lenb :: Int
lenb = Text -> Int
T.length Text
b
    lenm :: Int
lenm = Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb
{-# INLINE damerauLevenshtein_ #-}

----------------------------------------------------------------------------
-- Treating inputs like sets

-- | Return the overlap coefficient for two 'Text' values. Returned value is
-- in the range from 0 (no similarity) to 1 (exact match). Return 1 if both
-- 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
--
-- @since 0.3.0
overlap :: Text -> Text -> Ratio Int
overlap :: Text -> Text -> Ratio Int
overlap Text
a Text
b =
  if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
    then Int
1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1
    else Map Char Int -> Map Char Int -> Int
intersectionSize (Text -> Map Char Int
mkTextMap Text
a) (Text -> Map Char Int
mkTextMap Text
b) Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
d
  where
    d :: Int
d = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Text -> Int
T.length Text
a) (Text -> Int
T.length Text
b)

-- | Return the Jaccard similarity coefficient for two 'Text' values.
-- Returned value is in the range from 0 (no similarity) to 1 (exact match).
-- Return 1 if both
--
-- See also: <https://en.wikipedia.org/wiki/Jaccard_index>
--
-- @since 0.3.0
jaccard :: Text -> Text -> Ratio Int
jaccard :: Text -> Text -> Ratio Int
jaccard Text
a Text
b =
  if Int
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
    then Int
1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1
    else Map Char Int -> Map Char Int -> Int
intersectionSize Map Char Int
ma Map Char Int
mb Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
d
  where
    ma :: Map Char Int
ma = Text -> Map Char Int
mkTextMap Text
a
    mb :: Map Char Int
mb = Text -> Map Char Int
mkTextMap Text
b
    d :: Int
d = Map Char Int -> Map Char Int -> Int
unionSize Map Char Int
ma Map Char Int
mb

-- | Make a map from 'Char' to 'Int' representing how many times the 'Char'
-- appears in the input 'Text'.
mkTextMap :: Text -> Map Char Int
mkTextMap :: Text -> Map Char Int
mkTextMap = (Map Char Int -> Char -> Map Char Int)
-> Map Char Int -> Text -> Map Char Int
forall a. (a -> Char -> a) -> a -> Text -> a
T.foldl' Map Char Int -> Char -> Map Char Int
forall k a. (Ord k, Num a) => Map k a -> k -> Map k a
f Map Char Int
forall k a. Map k a
M.empty
  where
    f :: Map k a -> k -> Map k a
f Map k a
m k
ch = (a -> a -> a) -> k -> a -> Map k a -> Map k a
forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith a -> a -> a
forall a. Num a => a -> a -> a
(+) k
ch a
1 Map k a
m
{-# INLINE mkTextMap #-}

-- | Return intersection size between two 'Text'-maps.
intersectionSize :: Map Char Int -> Map Char Int -> Int
intersectionSize :: Map Char Int -> Map Char Int -> Int
intersectionSize Map Char Int
a Map Char Int
b = (Int -> Int -> Int) -> Int -> Map Char Int -> Int
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) Int
0 ((Int -> Int -> Int) -> Map Char Int -> Map Char Int -> Map Char Int
forall k a b c.
Ord k =>
(a -> b -> c) -> Map k a -> Map k b -> Map k c
M.intersectionWith Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Map Char Int
a Map Char Int
b)
{-# INLINE intersectionSize #-}

-- | Return union size between two 'Text'-maps.
unionSize :: Map Char Int -> Map Char Int -> Int
unionSize :: Map Char Int -> Map Char Int -> Int
unionSize Map Char Int
a Map Char Int
b = (Int -> Int -> Int) -> Int -> Map Char Int -> Int
forall a b k. (a -> b -> a) -> a -> Map k b -> a
M.foldl' Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) Int
0 ((Int -> Int -> Int) -> Map Char Int -> Map Char Int -> Map Char Int
forall k a. Ord k => (a -> a -> a) -> Map k a -> Map k a -> Map k a
M.unionWith Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Map Char Int
a Map Char Int
b)
{-# INLINE unionSize #-}

----------------------------------------------------------------------------
-- Other

-- | /O(n)/ Return the Hamming distance between two 'Text' values. Hamming
-- distance is defined as the number of positions at which the corresponding
-- symbols are different. The input 'Text' values should be of equal length
-- or 'Nothing' will be returned.
--
-- See also: <https://en.wikipedia.org/wiki/Hamming_distance>.
--
-- __Heads up__, before version /0.3.0/ this function returned @'Maybe'
-- 'Data.Numeric.Natural'@.
hamming :: Text -> Text -> Maybe Int
hamming :: Text -> Text -> Maybe Int
hamming Text
a Text
b =
  if Text -> Int
T.length Text
a Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Int
T.length Text
b
    then Int -> Maybe Int
forall a. a -> Maybe a
Just (Int -> Int -> Int -> Int
forall t. Num t => Int -> Int -> t -> t
go Int
0 Int
0 Int
0)
    else Maybe Int
forall a. Maybe a
Nothing
  where
    go :: Int -> Int -> t -> t
go !Int
na !Int
nb !t
r =
      let !(TU.Iter Char
cha Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
          !(TU.Iter Char
chb Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
       in if
              | Int
na Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len -> t
r
              | Char
cha Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
chb -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) (t
r t -> t -> t
forall a. Num a => a -> a -> a
+ t
1)
              | Bool
otherwise -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) t
r
    len :: Int
len = Text -> Int
TU.lengthWord16 Text
a

-- | Return the Jaro distance between two 'Text' values. Returned value is
-- in the range from 0 (no similarity) to 1 (exact match).
--
-- While the algorithm is pretty clear for artificial examples (like those
-- from the linked Wikipedia article), for /arbitrary/ strings, it may be
-- hard to decide which of two strings should be considered as one having
-- “reference” order of characters (order of matching characters in an
-- essential part of the definition of the algorithm). This makes us
-- consider the first string the “reference” string (with correct order of
-- characters). Thus generally,
--
-- > jaro a b ≠ jaro b a
--
-- This asymmetry can be found in all implementations of the algorithm on
-- the internet, AFAIK.
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
jaro :: Text -> Text -> Ratio Int
jaro :: Text -> Text -> Ratio Int
jaro Text
a Text
b =
  if Text -> Bool
T.null Text
a Bool -> Bool -> Bool
|| Text -> Bool
T.null Text
b
    then Int
0 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1
    else (forall s. ST s (Ratio Int)) -> Ratio Int
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s (Ratio Int)) -> Ratio Int)
-> (forall s. ST s (Ratio Int)) -> Ratio Int
forall a b. (a -> b) -> a -> b
$ do
      let lena :: Int
lena = Text -> Int
T.length Text
a
          lenb :: Int
lenb = Text -> Int
T.length Text
b
          d :: Int
d =
            if Int
lena Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
2 Bool -> Bool -> Bool
&& Int
lenb Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
2
              then Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
lena Int
lenb Int -> Int -> Int
forall a. Integral a => a -> a -> a
`quot` Int
2 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
              else Int
0
      MVector s Int
v <- Int -> Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> a -> m (MVector (PrimState m) a)
VUM.replicate Int
lenb (Int
0 :: Int)
      MVector s Int
r <- Int -> Int -> ST s (MVector (PrimState (ST s)) Int)
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
Int -> a -> m (MVector (PrimState m) a)
VUM.replicate Int
3 (Int
0 :: Int) -- tj, m, t
      let goi :: Int -> Int -> Int -> ST s ()
goi !Int
i !Int
na !Int
fromb = do
            let !(TU.Iter Char
ai Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
                (Int
from, Int
fromb') =
                  if Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
d
                    then (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
d, Int
fromb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Text -> Int -> Int
TU.iter_ Text
b Int
fromb)
                    else (Int
0, Int
0)
                to :: Int
to = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
d Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int
lenb
                goj :: Int -> Int -> ST s ()
goj !Int
j !Int
nb =
                  Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
to) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
                    let !(TU.Iter Char
bj Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
                    Bool
used <- (Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1) (Int -> Bool) -> ST s Int -> ST s Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
v Int
j
                    if Bool -> Bool
not Bool
used Bool -> Bool -> Bool
&& Char
ai Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
bj
                      then do
                        Int
tj <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r Int
0
                        if Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
tj
                          then MVector (PrimState (ST s)) Int -> (Int -> Int) -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VUM.unsafeModify MVector s Int
MVector (PrimState (ST s)) Int
r (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int
2
                          else MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
r Int
0 Int
j
                        MVector (PrimState (ST s)) Int -> Int -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
VUM.unsafeWrite MVector s Int
MVector (PrimState (ST s)) Int
v Int
j Int
1
                        MVector (PrimState (ST s)) Int -> (Int -> Int) -> Int -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> (a -> a) -> Int -> m ()
VUM.unsafeModify MVector s Int
MVector (PrimState (ST s)) Int
r (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int
1
                      else Int -> Int -> ST s ()
goj (Int
j Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db)
            Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lena) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ do
              Int -> Int -> ST s ()
goj Int
from Int
fromb
              Int -> Int -> Int -> ST s ()
goi (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) Int
fromb'
      Int -> Int -> Int -> ST s ()
goi Int
0 Int
0 Int
0
      Int
m <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r Int
1
      Int
t <- MVector (PrimState (ST s)) Int -> Int -> ST s Int
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
VUM.unsafeRead MVector s Int
MVector (PrimState (ST s)) Int
r Int
2
      Ratio Int -> ST s (Ratio Int)
forall (m :: * -> *) a. Monad m => a -> m a
return (Ratio Int -> ST s (Ratio Int)) -> Ratio Int -> ST s (Ratio Int)
forall a b. (a -> b) -> a -> b
$
        if Int
m Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
          then Int
0 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1
          else
            ( (Int
m Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
lena)
                Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+ (Int
m Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
lenb)
                Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+ ((Int
m Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
t) Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
m)
            )
              Ratio Int -> Ratio Int -> Ratio Int
forall a. Fractional a => a -> a -> a
/ Ratio Int
3

-- | Return the Jaro-Winkler distance between two 'Text' values. Returned
-- value is in range from 0 (no similarity) to 1 (exact match).
--
-- See also: <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance>
--
-- @since 0.2.0
--
-- __Heads up__, before version /0.3.0/ this function returned @'Ratio'
-- 'Data.Numeric.Natural'@.
jaroWinkler :: Text -> Text -> Ratio Int
jaroWinkler :: Text -> Text -> Ratio Int
jaroWinkler Text
a Text
b = Ratio Int
dj Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
+ (Int
1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
10) Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
* Ratio Int
l Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
* (Ratio Int
1 Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
- Ratio Int
dj)
  where
    dj :: Ratio Int
dj = Ratio Int -> Ratio Int
forall a. a -> a
inline (Text -> Text -> Ratio Int
jaro Text
a Text
b)
    l :: Ratio Int
l = Int -> Ratio Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
4 (Text -> Text -> Int
commonPrefix Text
a Text
b))

-- | Return the length of the common prefix two 'Text' values have.
commonPrefix :: Text -> Text -> Int
commonPrefix :: Text -> Text -> Int
commonPrefix Text
a Text
b = Int -> Int -> Int -> Int
forall t. Num t => Int -> Int -> t -> t
go Int
0 Int
0 Int
0
  where
    go :: Int -> Int -> t -> t
go !Int
na !Int
nb !t
r =
      let !(TU.Iter Char
cha Int
da) = Text -> Int -> Iter
TU.iter Text
a Int
na
          !(TU.Iter Char
chb Int
db) = Text -> Int -> Iter
TU.iter Text
b Int
nb
       in if
              | Int
na Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
lena -> t
r
              | Int
nb Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
lenb -> t
r
              | Char
cha Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
chb -> Int -> Int -> t -> t
go (Int
na Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
da) (Int
nb Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
db) (t
r t -> t -> t
forall a. Num a => a -> a -> a
+ t
1)
              | Bool
otherwise -> t
r
    lena :: Int
lena = Text -> Int
TU.lengthWord16 Text
a
    lenb :: Int
lenb = Text -> Int
TU.lengthWord16 Text
b
{-# INLINE commonPrefix #-}

----------------------------------------------------------------------------
-- Helpers

norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm :: (Text -> Text -> (Int, Int)) -> Text -> Text -> Ratio Int
norm Text -> Text -> (Int, Int)
f Text
a Text
b =
  let (Int
r, Int
l) = Text -> Text -> (Int, Int)
f Text
a Text
b
   in if Int
r Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
        then Int
1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1
        else Int
1 Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
1 Ratio Int -> Ratio Int -> Ratio Int
forall a. Num a => a -> a -> a
- Int
r Int -> Int -> Ratio Int
forall a. Integral a => a -> a -> Ratio a
% Int
l
{-# INLINE norm #-}