{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} -- | An implementation of 'Search' based on a ternary search tree -- ('Data.TSTSet'): . -- -- The searches are performed by manually generating the close word by -- deleting, transposing, or adding wildcards to match additional characters. -- -- This makes this structure fast for small distances with a small number of -- generated candidates, but impractical for even slightly larger distances - -- in my tests 'Language.Distance.Search.BK' outpeforms this module when the -- distance is greater than 2. -- -- The data structure has no knowledge of the distance and thus it does not -- need to be rebuilt if different edit distances are needed. However this -- means that it cannot work with arbitrary 'EditDistance' instances are -- functions need to be defined manually to generate the candidates. In this -- case 'levenshtein' uses 'deletions', 'replaces', and 'insertions' to -- generate the candidates; while 'damerauLevenshtein' also uses -- 'transpositions'. module Language.Distance.Search.TST ( -- * Type TSTSet -- * Operations , TSTSet.empty , insert , levenshtein , damerauLevenshtein -- * Candidates generation -- $Wildcard -- ** Types , WildCard (..) , WildList -- ** Operations , deletions , transpositions , replaces , insertions ) where import Control.Arrow (first) import Data.Word (Word8) import Data.ByteString (ByteString) import Data.Text (Text) import Data.ListLike (ListLike) import qualified Data.ListLike as ListLike import Data.ListLike.Text () import Data.TST (WildCard (..), WildList) import qualified Data.TST as TST import Data.TSTSet (TSTSet) import qualified Data.TSTSet as TSTSet import Language.Distance (Levenshtein, DamerauLevenshtein) import Language.Distance.Internal -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- ~~ Edits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- {- $Wildcard The following functions generate candidates distant 1 from the given word, and they are reexported for completeness. 'deletions' and 'transpositions' do not need to wildcard the word, while 'replaces' and 'insertions' do since we are adding characters. -} deletions, transpositions :: [a] -> [[a]] replaces, insertions :: WildList a -> [WildList a] deletions [] = [] deletions (c : s) = s : map (c :) (deletions s) transpositions [] = [] transpositions [x] = [[x]] transpositions (x : y : s) = (y : x : s) : map (x :) (transpositions (y : s)) replaces [] = [] replaces (c : s) = (WildCard : s) : map (c :) (replaces s) insertions [] = [[WildCard]] insertions (c : s) = (WildCard : c : s) : map (c :) (insertions s) -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- ~~ Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- levenshtein :: (Ord sym, ListLike full sym) => Int -> full -> TSTSet sym -> [(full, Distance Levenshtein)] levenshtein = queryTST (\s -> deletions s ++ replaces s ++ insertions s) damerauLevenshtein :: (Ord sym, ListLike full sym) => Int -> full -> TSTSet sym -> [(full, Distance DamerauLevenshtein)] damerauLevenshtein = queryTST (\s -> deletions s ++ replaces s ++ insertions s ++ transpositions s) insert :: (Ord sym, ListLike full sym) => full -> TSTSet sym -> TSTSet sym insert ll = TSTSet.insert (ListLike.toList ll) {-# SPECIALISE insert :: String -> TSTSet Char -> TSTSet Char #-} {-# SPECIALISE insert :: ByteString -> TSTSet Word8 -> TSTSet Word8 #-} {-# SPECIALISE insert :: Text -> TSTSet Char -> TSTSet Char #-} queryTST :: (Ord sym, ListLike full sym) => (WildList sym -> [WildList sym]) -> Int -> full -> TSTSet sym -> [(full, Distance algo)] queryTST f maxd s tst = map (first ListLike.fromList) $ TST.toList $ go 0 TSTSet.empty [wildList $ ListLike.toList s] TST.empty where go n visited edits matches | n <= maxd = let edits' = filter (\x -> not (TSTSet.member x visited)) edits newMatches = zip (concatMap (flip TSTSet.matchWL tst) edits') (map Distance (repeat n)) in go (n + 1) (foldr TSTSet.insert visited edits') (concatMap f edits') (update newMatches matches) | otherwise = matches update new matches = foldr (uncurry (TST.insertWith (flip const))) matches new {-# SPECIALISE queryTST :: (WildList Char -> [WildList Char]) -> Int -> String -> TSTSet Char -> [(String, Distance algo)] #-} {-# SPECIALISE queryTST :: (WildList Word8 -> [WildList Word8]) -> Int -> ByteString -> TSTSet Word8 -> [(ByteString, Distance algo)] #-} {-# SPECIALISE queryTST :: (WildList Char -> [WildList Char]) -> Int -> Text -> TSTSet Char -> [(Text, Distance algo)] #-} wildList :: ListLike full sym => full -> WildList sym wildList = map El . ListLike.toList