{-# LANGUAGE OverloadedStrings #-} -- ---------------------------------------------------------------------------- {- | Module : Hunt.Query.Fuzzy Copyright : Copyright (C) 2007, 2008 Timo B. Huebel License : MIT Maintainer : Timo B. Huebel (tbh@holumbus.org) Stability : experimental Portability: portable Version : 0.2 The unique Holumbus mechanism for generating fuzzy sets. -} -- ---------------------------------------------------------------------------- module Hunt.Query.Fuzzy ( -- * Fuzzy types FuzzySet , Replacements , Replacement , FuzzyScore , FuzzyConfig (..) -- * Predefined replacements , englishReplacements , germanReplacements -- * Generation , fuzz -- * Conversion , toList ) where import Data.Binary import Data.Default import Data.Function import Data.List import Data.Maybe (fromMaybe) import Control.Applicative import Data.Map (Map) import qualified Data.Map as M import Data.Text (Text) import qualified Data.Text as T import Data.Text.Binary () -- ------------------------------------------------------------ -- | A set of string which have been "fuzzed" with an associated score. type FuzzySet = Map Text FuzzyScore -- | Some replacements which can be applied to a string to generate a 'FuzzySet'. The scores of -- the replacements will be normalized to a maximum of 1.0. type Replacements = [Replacement] -- | A single replacements, where the first will be replaced by the second and vice versa in -- the target string. The score indicates the amount of fuzzines that one single application -- of this replacement in just one direction will cause on the target string. type Replacement = ((Text,Text), FuzzyScore) -- | The score indicating an amount of fuzziness. type FuzzyScore = Float -- | The configuration of a fuzzy query. data FuzzyConfig = FuzzyConfig { applyReplacements :: Bool -- ^ Indicates whether the replacements should be applied. , applySwappings :: Bool -- ^ Indicates whether the swapping of adjacent characters should be applied. , maxFuzziness :: FuzzyScore -- ^ The maximum allowed fuzziness. , customReplacements :: Replacements -- ^ The replacements that should be applied. } deriving (Show) -- ------------------------------------------------------------ instance Default FuzzyConfig where def = FuzzyConfig True True 1.0 englishReplacements instance Binary FuzzyConfig where put (FuzzyConfig r s m f) = put r >> put s >> put m >> put f get = FuzzyConfig <$> get <*> get <*> get <*> get -- ------------------------------------------------------------ -- | Some default replacements for the english language. englishReplacements :: Replacements englishReplacements = [ (("l", "ll"), 0.2) , (("t", "tt"), 0.2) , (("r", "rr"), 0.2) , (("e", "ee"), 0.2) , (("o", "oo"), 0.2) , (("s", "ss"), 0.2) , (("g", "ck"), 0.4) , (("k", "ck"), 0.4) , (("ea", "ee"), 0.4) , (("ou", "oo"), 0.4) , (("ou", "au"), 0.4) , (("ou", "ow"), 0.4) , (("s", "c"), 0.6) , (("uy", "ye"), 0.6) , (("y", "ey"), 0.6) , (("kn", "n"), 0.6) ] -- | Some default replacements for the german language. germanReplacements :: Replacements germanReplacements = [ (("l", "ll"), 0.2) , (("t", "tt"), 0.2) , (("n", "nn"), 0.2) , (("r", "rr"), 0.2) , (("i", "ie"), 0.2) , (("ei", "ie"), 0.2) , (("k", "ck"), 0.2) , (("d", "t"), 0.4) , (("b", "p"), 0.4) , (("g", "k"), 0.4) , (("g", "ch"), 0.4) , (("c", "k"), 0.4) , (("s", "z"), 0.4) , (("u", "ou"), 0.4) , (("ü", "ue"), 0.1) , (("ä", "ae"), 0.1) , (("ö", "oe"), 0.1) , (("ß", "ss"), 0.1) ] -- | Continue fuzzing a string with the an explicitly specified list of replacements until -- a given score threshold is reached. fuzz :: FuzzyConfig -> Text -> FuzzySet fuzz cfg s = M.delete s (fuzz' (fuzzLimit cfg 0.0 s)) where fuzz' :: FuzzySet -> FuzzySet fuzz' fs = if M.null more then fs else M.unionWith min fs (fuzz' more) where -- The current score is doubled on every recursive call, because fuzziness increases exponentially. more = M.foldrWithKey (\sm sc res -> M.unionWith min res (fuzzLimit cfg (sc + sc) sm)) M.empty fs -- | Fuzz a string and limit the allowed score to a given threshold. fuzzLimit :: FuzzyConfig -> FuzzyScore -> Text -> FuzzySet fuzzLimit cfg sc s = if sc <= th then M.filter (<= th) (fuzzInternal cfg sc s) else M.empty where th = maxFuzziness cfg -- | Fuzz a string with an list of explicitly specified replacements and combine the scores -- with an initial score. fuzzInternal :: FuzzyConfig -> FuzzyScore -> Text -> FuzzySet fuzzInternal cfg sc s = M.unionWith min replaced swapped where replaced = let rs = customReplacements cfg in if applyReplacements cfg then foldr (\r res -> M.unionWith min res (applyFuzz (replace rs r) sc s)) M.empty rs else M.empty swapped = if applySwappings cfg then applyFuzz swap sc s else M.empty -- | Applies a fuzzy function to a string. An initial score is combined with the new score -- for the replacement. applyFuzz :: (Text -> Text -> [(Text, FuzzyScore)]) -> FuzzyScore -> Text -> FuzzySet applyFuzz f sc s = apply (init $ T.inits s) (init $ T.tails s) where apply :: [Text] -> [Text] -> FuzzySet apply [] _ = M.empty apply _ [] = M.empty apply (pr:prs) (su:sus) = M.unionsWith min $ apply prs sus:map create (f pr su) where create (fuzzed, score) = M.singleton fuzzed (sc + score * calcWeight (T.length pr) (T.length s)) -- | Apply a replacement in both directions to the suffix of a string and return the complete -- string with a score, calculated from the replacement itself and the list of replacements. replace :: Replacements -> Replacement -> Text -> Text -> [(Text, FuzzyScore)] replace rs ((r1, r2), s) prefix suffix = replace' r1 r2 ++ replace' r2 r1 where replace' tok sub = if replaced == suffix then [] else [(prefix `T.append` replaced, score)] where replaced = replaceFirst tok sub suffix score = s / (snd $ maximumBy (compare `on` snd) rs) -- | Swap the first two characters of the suffix and return the complete string with a score or -- Nothing if there are not enough characters to swap. swap :: Text -> Text -> [(Text, FuzzyScore)] swap prefix str = fromMaybe [] $ do (s1, suffix0) <- T.uncons str (s2, suffix) <- T.uncons suffix0 return [(prefix `T.append` (s2 `T.cons` s1 `T.cons` suffix), 1.0)] -- | Calculate the weighting factor depending on the position in the string and it's total length. calcWeight :: Int -> Int -> FuzzyScore calcWeight pos len = (l - p) / l where p = fromIntegral pos l = fromIntegral len -- | Searches a prefix and replaces it with a substitute in a list. replaceFirst :: Text -> Text -> Text -> Text replaceFirst xs' ys' zs' = case () of _ | T.null xs' -> ys' `T.append` zs' | T.null zs' -> T.empty | otherwise -> let (x, xs) = (T.head xs', T.tail xs') (y, ys) = (T.head ys', T.tail ys') (z, zs) = (T.head zs', T.tail zs') in if x == z && xs' `T.isPrefixOf` zs' then if T.null ys' then replaceFirst xs T.empty zs else y `T.cons` replaceFirst xs ys zs else zs' -- | Transform a fuzzy set into a list (ordered by score). toList :: FuzzySet -> [ (Text, FuzzyScore) ] toList = sortBy (compare `on` snd) . M.toList -- ------------------------------------------------------------