module Hunt.Query.Fuzzy
(
FuzzySet
, Replacements
, Replacement
, FuzzyScore
, FuzzyConfig (..)
, englishReplacements
, germanReplacements
, fuzz
, toList
)
where
import Data.Binary
import Data.Default
import Data.Function
import Data.List
import Data.Maybe (fromMaybe)
import Control.Applicative
import Data.Map (Map)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Binary ()
type FuzzySet = Map Text FuzzyScore
type Replacements = [Replacement]
type Replacement = ((Text,Text), FuzzyScore)
type FuzzyScore = Float
data FuzzyConfig
= FuzzyConfig
{ applyReplacements :: Bool
, applySwappings :: Bool
, maxFuzziness :: FuzzyScore
, customReplacements :: Replacements
}
deriving (Show)
instance Default FuzzyConfig where
def = FuzzyConfig True True 1.0 englishReplacements
instance Binary FuzzyConfig where
put (FuzzyConfig r s m f) = put r >> put s >> put m >> put f
get = FuzzyConfig <$> get <*> get <*> get <*> get
englishReplacements :: Replacements
englishReplacements =
[ (("l", "ll"), 0.2)
, (("t", "tt"), 0.2)
, (("r", "rr"), 0.2)
, (("e", "ee"), 0.2)
, (("o", "oo"), 0.2)
, (("s", "ss"), 0.2)
, (("g", "ck"), 0.4)
, (("k", "ck"), 0.4)
, (("ea", "ee"), 0.4)
, (("ou", "oo"), 0.4)
, (("ou", "au"), 0.4)
, (("ou", "ow"), 0.4)
, (("s", "c"), 0.6)
, (("uy", "ye"), 0.6)
, (("y", "ey"), 0.6)
, (("kn", "n"), 0.6)
]
germanReplacements :: Replacements
germanReplacements =
[ (("l", "ll"), 0.2)
, (("t", "tt"), 0.2)
, (("n", "nn"), 0.2)
, (("r", "rr"), 0.2)
, (("i", "ie"), 0.2)
, (("ei", "ie"), 0.2)
, (("k", "ck"), 0.2)
, (("d", "t"), 0.4)
, (("b", "p"), 0.4)
, (("g", "k"), 0.4)
, (("g", "ch"), 0.4)
, (("c", "k"), 0.4)
, (("s", "z"), 0.4)
, (("u", "ou"), 0.4)
, (("ü", "ue"), 0.1)
, (("ä", "ae"), 0.1)
, (("ö", "oe"), 0.1)
, (("ß", "ss"), 0.1)
]
fuzz :: FuzzyConfig -> Text -> FuzzySet
fuzz cfg s = M.delete s (fuzz' (fuzzLimit cfg 0.0 s))
where
fuzz' :: FuzzySet -> FuzzySet
fuzz' fs = if M.null more then fs else M.unionWith min fs (fuzz' more)
where
more = M.foldrWithKey (\sm sc res -> M.unionWith min res (fuzzLimit cfg (sc + sc) sm)) M.empty fs
fuzzLimit :: FuzzyConfig -> FuzzyScore -> Text -> FuzzySet
fuzzLimit cfg sc s = if sc <= th then M.filter (<= th) (fuzzInternal cfg sc s) else M.empty
where
th = maxFuzziness cfg
fuzzInternal :: FuzzyConfig -> FuzzyScore -> Text -> FuzzySet
fuzzInternal cfg sc s = M.unionWith min replaced swapped
where
replaced = let rs = customReplacements cfg in if applyReplacements cfg
then foldr (\r res -> M.unionWith min res (applyFuzz (replace rs r) sc s)) M.empty rs
else M.empty
swapped = if applySwappings cfg
then applyFuzz swap sc s
else M.empty
applyFuzz :: (Text -> Text -> [(Text, FuzzyScore)]) -> FuzzyScore -> Text -> FuzzySet
applyFuzz f sc s = apply (init $ T.inits s) (init $ T.tails s)
where
apply :: [Text] -> [Text] -> FuzzySet
apply [] _ = M.empty
apply _ [] = M.empty
apply (pr:prs) (su:sus) = M.unionsWith min $ apply prs sus:map create (f pr su)
where
create (fuzzed, score) = M.singleton fuzzed (sc + score * calcWeight (T.length pr) (T.length s))
replace :: Replacements -> Replacement -> Text -> Text -> [(Text, FuzzyScore)]
replace rs ((r1, r2), s) prefix suffix = replace' r1 r2 ++ replace' r2 r1
where
replace' tok sub = if replaced == suffix then [] else [(prefix `T.append` replaced, score)]
where
replaced = replaceFirst tok sub suffix
score = s / (snd $ maximumBy (compare `on` snd) rs)
swap :: Text -> Text -> [(Text, FuzzyScore)]
swap prefix str = fromMaybe [] $ do
(s1, suffix0) <- T.uncons str
(s2, suffix) <- T.uncons suffix0
return [(prefix `T.append` (s2 `T.cons` s1 `T.cons` suffix), 1.0)]
calcWeight :: Int -> Int -> FuzzyScore
calcWeight pos len = (l p) / l
where
p = fromIntegral pos
l = fromIntegral len
replaceFirst :: Text -> Text -> Text -> Text
replaceFirst xs' ys' zs'
= case () of
_ | T.null xs' -> ys' `T.append` zs'
| T.null zs' -> T.empty
| otherwise -> let (x, xs) = (T.head xs', T.tail xs')
(y, ys) = (T.head ys', T.tail ys')
(z, zs) = (T.head zs', T.tail zs')
in if x == z && xs' `T.isPrefixOf` zs' then
if T.null ys' then replaceFirst xs T.empty zs
else y `T.cons` replaceFirst xs ys zs
else zs'
toList :: FuzzySet -> [ (Text, FuzzyScore) ]
toList = sortBy (compare `on` snd) . M.toList