module Holumbus.Query.Fuzzy
(
FuzzySet
, Replacements
, Replacement
, FuzzyScore
, FuzzyConfig (..)
, englishReplacements
, germanReplacements
, fuzz
, toList
)
where
import Data.Binary
import Data.List
import Data.Function
import Control.Monad
import Data.Map (Map)
import qualified Data.Map as M
type FuzzySet = Map String FuzzyScore
type Replacements = [ Replacement ]
type Replacement = ((String, String), FuzzyScore)
type FuzzyScore = Float
data FuzzyConfig
= FuzzyConfig
{ applyReplacements :: Bool
, applySwappings :: Bool
, maxFuzziness :: FuzzyScore
, customReplacements :: Replacements
}
deriving (Show)
instance Binary FuzzyConfig where
put (FuzzyConfig r s m f)
= put r >> put s >> put m >> put f
get
= liftM4 FuzzyConfig get get get get
englishReplacements :: Replacements
englishReplacements =
[ (("l", "ll"), 0.2)
, (("t", "tt"), 0.2)
, (("r", "rr"), 0.2)
, (("e", "ee"), 0.2)
, (("o", "oo"), 0.2)
, (("s", "ss"), 0.2)
, (("g", "ck"), 0.4)
, (("k", "ck"), 0.4)
, (("ea", "ee"), 0.4)
, (("ou", "oo"), 0.4)
, (("ou", "au"), 0.4)
, (("ou", "ow"), 0.4)
, (("s", "c"), 0.6)
, (("uy", "ye"), 0.6)
, (("y", "ey"), 0.6)
, (("kn", "n"), 0.6)
]
germanReplacements :: Replacements
germanReplacements =
[ (("l", "ll"), 0.2)
, (("t", "tt"), 0.2)
, (("n", "nn"), 0.2)
, (("r", "rr"), 0.2)
, (("i", "ie"), 0.2)
, (("ei", "ie"), 0.2)
, (("k", "ck"), 0.2)
, (("d", "t"), 0.4)
, (("b", "p"), 0.4)
, (("g", "k"), 0.4)
, (("g", "ch"), 0.4)
, (("c", "k"), 0.4)
, (("s", "z"), 0.4)
, (("u", "ou"), 0.4)
, (("ü", "ue"), 0.1)
, (("ä", "ae"), 0.1)
, (("ö", "oe"), 0.1)
, (("ß", "ss"), 0.1)
]
fuzz :: FuzzyConfig -> String -> FuzzySet
fuzz cfg s = M.delete s (fuzz' (fuzzLimit cfg 0.0 s))
where
fuzz' :: FuzzySet -> FuzzySet
fuzz' fs = if M.null more then fs else M.unionWith min fs (fuzz' more)
where
more = M.foldrWithKey (\sm sc res -> M.unionWith min res (fuzzLimit cfg (sc + sc) sm)) M.empty fs
fuzzLimit :: FuzzyConfig -> FuzzyScore -> String -> FuzzySet
fuzzLimit cfg sc s = if sc <= th then M.filter (\ns -> ns <= th) (fuzzInternal cfg sc s) else M.empty
where
th = maxFuzziness cfg
fuzzInternal :: FuzzyConfig -> FuzzyScore -> String -> FuzzySet
fuzzInternal cfg sc s = M.unionWith min replaced swapped
where
replaced = let rs = customReplacements cfg in if (applyReplacements cfg)
then foldr (\r res -> M.unionWith min res (applyFuzz (replace rs r) sc s)) M.empty rs
else M.empty
swapped = if (applySwappings cfg)
then applyFuzz swap sc s
else M.empty
applyFuzz :: (String -> String -> [(String, FuzzyScore)]) -> FuzzyScore -> String -> FuzzySet
applyFuzz f sc s = apply (init $ inits s) (init $ tails s)
where
apply :: [String] -> [String] -> FuzzySet
apply [] _ = M.empty
apply _ [] = M.empty
apply (pr:prs) (su:sus) = M.unionsWith min $ (apply prs sus):(map create $ (f pr su))
where
create (fuzzed, score) = M.singleton fuzzed (sc + score * (calcWeight (length pr) (length s)))
replace :: Replacements -> Replacement -> String -> String -> [(String, FuzzyScore)]
replace rs ((r1, r2), s) prefix suffix = (replace' r1 r2) ++ (replace' r2 r1)
where
replace' tok sub = if replaced == suffix then [] else [(prefix ++ replaced, score)]
where
replaced = replaceFirst tok sub suffix
score = s / (snd $ maximumBy (compare `on` snd) rs)
swap :: String -> String -> [(String, FuzzyScore)]
swap prefix (s1:s2:suffix) = [(prefix ++ (s2:s1:suffix), 1.0)]
swap _ _ = []
calcWeight :: Int -> Int -> FuzzyScore
calcWeight pos len = (l p) / l
where
p = fromIntegral pos
l = fromIntegral len
replaceFirst :: Eq a => [a] -> [a] -> [a] -> [a]
replaceFirst [] ys zs = ys ++ zs
replaceFirst _ _ [] = []
replaceFirst t@(x:xs) ys s@(z:zs) = if x == z && t `isPrefixOf` s then
if null ys then replaceFirst xs [] zs
else (head ys) : replaceFirst xs (tail ys) zs
else s
toList :: FuzzySet -> [ (String, FuzzyScore) ]
toList = sortBy (compare `on` snd) . M.toList