{-# LANGUAGE FlexibleContexts, FlexibleInstances, GeneralizedNewtypeDeriving, ScopedTypeVariables, TypeFamilies #-} module Data.Module.String where import Control.Arrow import Control.Monad import Data.Algorithm.Diff import Data.Default import Data.List import Data.Maybe import Data.Module.Class import Data.Module.Container (ContainerAtom) import Data.Module.Product import Data.Module.Shape import Data.Monoid import Text.Regex.PCRE import qualified Data.Map as M import qualified Data.Module.Container as C class Module (M m) => StringModule m where type M m valid :: m -> String -- returns a regex telling what strings are valid parse :: m -> String -> Maybe (V (M m)) pprint :: m -> V (M m) -> String edit :: m -> V (M m) -> [Edit] -> M m data Edit = Insert Int String | Delete Int Int deriving (Show, Read) instance PartialEdit Edit where type V_0 Edit = String apply_0 (Insert n s') s = Just (take n s ++ s' ++ drop n s) apply_0 (Delete n n') s = Just (take n s ++ drop n' s) newtype Match = Match String deriving (Eq, Ord, Show, Read) maybeMatch re s = guard (s =~ re == (0 :: Int, length s)) >> return s instance StringModule Match where type M Match = [Edit] valid (Match re) = re parse = maybeMatch . valid pprint = const id edit = const (const id) instance (StringModule dX, StringModule dY) => StringModule (dX, dY) where type M (dX, dY) = (M dX, M dY) valid (mx, my) = valid mx ++ valid my parse (mx, my) s = do let (m , n ) = s =~ valid mx (m', n') = drop n s =~ valid my guard ([m, m', n + n'] == [0, 0, length s]) vx <- parse mx (take n s) vy <- parse my (drop n s) return (vx, vy) pprint (mx, my) (vx, vy) = pprint mx vx ++ pprint my vy edit = editProd editProd (mx, my) (vx, vy) es = edit mx vx *** edit my vy $ case splitMap of Nothing -> (replace sx sx', replace sy sy') Just ms -> (exs, eys) where die = error "The impossible happened! A [Edit] didn't successfully apply to a String in editProd." sx = pprint mx vx sy = pprint my vy sn = fromMaybe die (apply es (sx ++ sy)) oldSplit = length sx newSplit = snd (sn =~ valid mx :: (Int, Int)) (sx', sy') = splitAt newSplit sn easySplits = trackEdits [oldSplit] es splitMap = M.lookup newSplit easySplits -- this is dangerous, only use the variables it binds when splitMap is definitely a Just! (exs, eys) = splitEdits (fromJust splitMap) oldSplit es replace s s' = [Insert 0 s', Delete 0 (length s)] trackEdit :: Edit -> (Int, M.Map Int (M.Map Int Int)) -> (Int, M.Map Int (M.Map Int Int)) trackEdit (Insert n s) (i, m) = (i + 1, m') where (smaller, larger') = M.split n m len = length s larger = M.mapKeysMonotonic (+ len) larger' exact = case M.lookup n m of Nothing -> M.empty Just im -> M.fromList [(n + i', M.insert i i' im) | i' <- [0 .. len]] m' = smaller `M.union` larger `M.union` exact trackEdit (Delete n n') (i, m) | n > n' = trackEdit (Delete n' n) (i, m) | n == n' = (i + 1, m) | n < n' = (i + 1, m') where (smaller, notSmaller) = M.split (n +1) m (deleted, larger') = M.split (n'-1) notSmaller len = n' - n larger = M.mapKeysMonotonic (subtract len) larger' m' = smaller `M.union` larger -- NOTE: this arbitrarily prefers the edits on the left-hand side of the -- deletion boundary over the edits on the right-hand side of the deletion -- boundary when a chunk boundary happens to fall on a position that -- matches the deletion boundary -- input: the locations of chunk boundaries, plus some edits -- returns: -- The inner Map Int Int tells, for each index into the edits, whether to split the resulting edit and where. -- The outer Map tells, for each position, if a chunk boundary ends up landing there, how to split the edits. trackEdits :: [Int] -> [Edit] -> M.Map Int (M.Map Int Int) trackEdits boundaries = snd . foldr trackEdit (0, M.fromList (zip boundaries (repeat M.empty))) splitEdits :: M.Map Int Int -> Int -> [Edit] -> ([Edit], [Edit]) splitEdits m split = snd . foldr splitEdit ((0, split), ([], [])) where splitEdit :: Edit -> ((Int, Int), ([Edit], [Edit])) -> ((Int, Int), ([Edit], [Edit])) splitEdit e@(Delete n n') ((i, split), (els, ers)) | n > n' = splitEdit (Delete n' n) ((i, split), (els, ers)) | n' <= split = ((i+1, split-n'+n), (e:els, ers)) | n >= split = ((i+1, split), (els, Delete (n-split) (n'-split) : ers)) | otherwise = error "The impossible happened! A deletion crossed a chunk boundary." -- | otherwise = ((i+1, n), (Delete n split : els, Delete 0 (n'-split) : ers)) splitEdit e@(Insert n s) ((i, split), (els, ers)) = case M.lookup i m of Nothing | n < split -> ((i+1, split+length s), (e:els, ers)) | n > split -> ((i+1, split), (els, Insert (n-split) s : ers)) | otherwise -> error "The impossible happened! An insertion crossed a chunk boundary without being in the splitting map." Just n' -> ((i+1, split+n'), (consInsert n (take n' s) els, consInsert 0 (drop n' s) ers)) consInsert n "" es = es consInsert n s es = Insert n s : es data Asterisk m = Asterisk m instance StringModule m => StringModule (Asterisk m) where type M (Asterisk m) = [ContainerAtom Int (M m)] valid (Asterisk m) = "(" ++ valid m ++ ")*" pprint (Asterisk m) = containerToList >=> pprint m edit (Asterisk m) = editList m . containerToList parse (Asterisk m) = liftM listToContainer . parseList m parseList m = splitList (valid m) >=> mapM (parse m) splitList re "" = return [] splitList re s = do -- pattern match failure means the regex didn't match at the beginning, -- and results in a failed parse overall ("", sMatch, rest) <- return (s =~ re) liftM (sMatch:) (splitList re rest) -- Heuristic: if we can track where *all* the old splits went, then go ahead -- and do that. Nice! Otherwise, use diff to compare the entire old list with -- the entire new list. -- -- There's a lot of room for improvement: we could take the regions between -- successfully tracked splits and the regions with unsuccessfully tracked -- splits and run the diff only on the unsuccessful regions, for example. -- Additionally, note that when we are tracking splits, we assume that no -- splits are inserted or deleted. A more sophisticated heuristic might try to -- relax this assumption somehow (though the only way I could think of to relax -- it would result in an exponential-time algorithm). editList m vs es = fromMaybe diffy exact where exact = editListExact m vs oldss newss es 0 diffy = editListDiff m oldss newss oldss = map (pprint m) vs newss = fromJust (splitList (valid m) =<< apply es (concat oldss)) :: [String] editListExact m (v:vs) (olds:oldss) (news:newss) es i = case splitMap of Nothing -> Nothing Just ms -> liftM (C.Modify i (edit m v e) :) (editListExact m vs oldss newss erest (i+1)) where oldSplit = length olds newSplit = length news easySplits = trackEdits [oldSplit] es splitMap = M.lookup newSplit easySplits -- this is dangerous, only use the variables it binds when splitMap is definitely a Just! (e, erest) = splitEdits (fromJust splitMap) oldSplit es editListExact m [] [] [] [] i = Just [] editListExact _ _ _ _ _ _ = Nothing editListDiff m oldss newss = result where diff = getDiff oldss newss tags = map fst diff count place = length . filter (==place) $ tags match place B = [True] match place place' = [False | place == place'] reordered place = map fst . uncurry (++) . partition snd . zip [0..] $ (tags >>= match place) create s = edit m def [Insert 0 s, Delete 0 . length . pprint m $ def] result = C.Rearrange (Sum 0) (\_ i -> fromJust (findIndex (==i) (reordered S))) : zipWith C.Modify [count B .. count B + count S - 1] [create news | (S, news) <- diff] ++ [ C.Insert . Sum . count $ S , C.Delete . Sum . negate . count $ F , C.Rearrange (Sum 0) (\_ i -> reordered F !! i) ] -- a few utilities for defining string modules whose default value is not "" newtype NewDefault v = NewDefault String deriving (Eq, Ord) instance Show (NewDefault v) where show (NewDefault s) = show s instance Read (NewDefault v) where readsPrec n s = map (first NewDefault) (readsPrec n s) instance (Default v, Show v) => Default (NewDefault v) where def = NewDefault (show (def :: v)) newtype NewDefaultModule v = NewDefaultModule [Edit] deriving (Show, Read, Monoid) instance (Default v, Show v) => Module (NewDefaultModule v) where type V (NewDefaultModule v) = NewDefault v apply (NewDefaultModule es) (NewDefault v) = liftM NewDefault (apply es v) newtype NewDefaultMatch v = NewDefaultMatch String deriving (Eq, Ord, Show, Read) instance (Default v, Show v) => StringModule (NewDefaultMatch v) where type M (NewDefaultMatch v) = NewDefaultModule v valid (NewDefaultMatch re) = re parse m = liftM NewDefault . maybeMatch (valid m) pprint m (NewDefault s) = s edit = const (const NewDefaultModule)