-- ./Levenshtein "\$(cat ../web/regression/test/blocks/input/input.xml)" "\$(cat ../web/regression/test/blocks/output/input.xml)" {-# Language DeriveFunctor #-} import Control.Applicative import Data.Monoid (Sum(..)) import Data.Semigroup (First(..), Option(Option, getOption), option) import System.Environment (getArgs) import qualified Data.IntMap.Strict as IntMap import Data.PriorityQueue data Edit a = Keep !a | Delete !a | Insert !a | Replace !a !a deriving (Eq, Show, Functor) totalCost :: [Edit a] -> Int totalCost = getSum . foldMap editCost editCost :: Edit a -> Sum Int editCost Keep{} = Sum 0 editCost _ = Sum 1 -- | A naïve Levenshtein distance function suffers from an exponential explosion problem. naïve :: Eq a => [a] -> [a] -> [Edit a] naïve [] goal = Insert <\$> goal naïve origin [] = Delete <\$> origin naïve (x:xs) (y:ys) | x == y = Keep x : naïve xs ys | deletionCost < insertionCost && deletionCost < replacementCost = deletion | insertionCost < deletionCost && insertionCost < replacementCost = insertion | otherwise = replacement where deletion = Delete x : naïve xs (y:ys) insertion = Insert y : naïve (x:xs) ys replacement = Replace x y : naïve xs ys deletionCost = totalCost deletion insertionCost = totalCost insertion replacementCost = totalCost replacement -- | We shall keep track of the partial solution of the problem in this data structure. data Partial a = Partial { -- | length of the matched prefixes of the two strings xlen, ylen :: !Int, -- | remaining unmatched suffixes of the two strings xrest, yrest :: [a], -- | calculated edits that convert the first prefix into the second delta :: [Edit a]} -- | Given a single partial solution, produce a priority queue of all possible solutions. step :: (Eq a, Show a) => Partial a -> PQueue Branching (Sum Int) (Partial a) step (Partial xl yl [] goal edits) = withCost (Sum yl') (pure \$ Partial xl (yl+yl') [] [] \$ reverse (Insert <\$> goal) <> edits) where yl' = length goal step (Partial xl yl origin [] edits) = withCost (Sum xl') (pure \$ Partial (xl+xl') yl [] [] \$ reverse (Delete <\$> origin) <> edits) where xl' = length origin step (Partial xl yl (x:xs) (y:ys) edits) | x == y = pure (Partial (xl+1) (yl+1) xs ys \$ Keep x : edits) | otherwise = {-# SCC "step.hard" #-} withCost 1 (pure (Partial (xl+1) yl xs (y:ys) \$ Delete x : edits) <|> pure (Partial xl (yl+1) (x:xs) ys \$ Insert y : edits) <|> pure (Partial (xl+1) (yl+1) xs ys \$ Replace x y : edits)) -- | Repeatedly 'step' through the two strings and 'optimize' after every step, collect all relevant solutions, and -- then extract the best one (i.e., the solution with the least cost). stepped :: (Eq a, Show a) => [a] -> [a] -> [Edit a] stepped xs ys = reverse (delta \$ getFirst \$ option undefined id \$ foldMap (Option . Just . First) \$ applyN (length xs + length ys) (optimize . (>>= step)) (pure \$ Partial 0 0 xs ys [])) -- | Optimize the queue by removing every partial solution that can be proven to be no better than another existing -- solution. The task is performed by 'pruneSubsets', which requires a function, 'unionDiff', that compares two -- partial solutions. optimize = pruneSubsets unionDiff mempty -- | A partial Levenshtein distance solution that has edited the first x items from the original string into the first -- y items of the target string cannot be any worse than any alternative partial solution that has edited x−n original -- items into y−n target items. -- -- The first argument is a map from @length x - length y@ into the longest @length x@ consumed by the solutions -- examined so far. Any 'Partial' solution worse than that will be eliminated. unionDiff :: IntMap.IntMap Int -> Partial a -> Maybe (IntMap.IntMap Int, Partial a) unionDiff state p@Partial{xlen= xl, ylen= yl} | Just reach <- IntMap.lookup distance state, reach >= xl = Nothing | otherwise = Just (IntMap.insert distance xl state, p) where distance = xl - yl main = do args <- getArgs solution <- case args of -- character-based distance of two command-line arguments ["-c", xs, ys] -> pure (fmap (:[]) <\$> stepped xs ys) -- line-based distance of two command-line arguments ["-l", xs, ys] -> pure (stepped (lines xs) (lines ys)) -- line-based distance of two named files' contents ["-f", xf, yf] -> stepped <\$> (lines <\$> readFile xf) <*> (lines <\$> readFile yf) _ -> error "Expecting -[clf] and two more arguments to diff." if head args == "-f" then mapM_ (putStrLn . toDiff) solution else print solution print (totalCost solution) -- | Convert the edits into diff-like output. toDiff (Keep line) = " " <> line toDiff (Delete line) = "< " <> line toDiff (Insert line) = "> " <> line toDiff (Replace old new) = "< " <> old <> "\n> " <> new -- | Apply a function N times. applyN :: Int -> (a -> a) -> a -> a applyN n f x | n > 0 = applyN (pred n) f (f x) | otherwise = x