{-# LANGUAGE CPP #-} ------------------------------------------------------------------------------------------ {-| Is one text file a subset of the other? Or is there some bit of new text that needs to be salvaged? The basic unix diff tool is sometimes incredibly unsatisfactory for this purpose -- for example when text has been moved around, or when there are widespread whitespace differences. This program compares two files by treating them as unstructured sets of word sequences. By default words are defined by `Data.Char.isAlpha`. Run `wordsetdiff` with no arguments to print the help information. -} -- TODO: can make the map of sets more efficient by defining a non-empty set type. ------------------------------------------------------------------------------------------ module Main where import System.Environment import System.FilePath import System.IO import System.Exit import System.Console.GetOpt import System.Console.ANSI import System.FilePath --import System.Console.GetOpt import Prelude hiding (null, readFile) import Debug.Trace import Numeric import Data.Int import Data.Char import Data.Function import Data.Word as W import qualified Data.Set as Set import qualified Data.List as List --import qualified Data.ByteString.Lazy as L --import Data.ByteString.Lazy.Char8 as B import Data.ByteString as B import Data.ByteString.Lazy.Char8 as BC import Data.ByteString.Internal (w2c,c2w) -- define TRIEVERSION #define HASHVERSION #ifdef TRIEVERSION import qualified Data.Trie as M import Data.Trie.Convenience as M type TupMap a = M.Trie a -- This may not be very efficient: difference a b = M.mergeBy (\ a b -> Nothing) (M.unionL a b) a -- INEFFICIENT AND SERIAL: intersection left right = -- trace ("LEFT "++show left++ "\n\n RIGHT "++ show right++ "\n\nINTERSECT " ++ show x ++"\n")$ x where x = M.fromList$ List.filter (\ (k,_) -> k `M.member` right) $ M.toList left pack_window ls = B.concat$ List.intersperse (B.singleton$ c2w ' ') ls -- A window of words is just represented as a single bytestring -- separated by single spaces. type Window = B.ByteString #else #ifdef HASHVERSION #warning "Using HashMap instead of Data.Map" import qualified Data.HashMap as M type TupMap a = M.HashMap [B.ByteString] a #else import qualified Data.Map as M type TupMap a = M.Map [B.ByteString] a #endif intersection = M.intersection difference = M.difference pack_window = id -- A window of words is represented as a list type Window = [B.ByteString] #endif pack_window :: [B.ByteString] -> Window mytoStrict :: BC.ByteString -> B.ByteString mytoStrict lazy = case BC.toChunks lazy of [] -> B.empty [a] -> a ls -> B.concat ls -------------------------------------------------------------------------------- -- CONFIGURATION VARIABLES: version = "0.0.3" -- How many consecutive words should we look at? default_word_sequence_size = 3 clump_distance = 10 use_color = True -------------------------------------------------------------------------------- -- | Command line option flags data CmdFlag = NoColor | NWords Int | UseLines | WithPunc | AlphaNumeric | AlphaOnly | CaseInsensitive options :: [OptDescr CmdFlag] options = [ Option ['n'] ["nwords"] (ReqArg (NWords . safeRead) "NUM") "use word tuples of length NUM" , Option ['l'] ["lines"] (NoArg UseLines) "compare using lines instead of a fixed size sliding window of words" , Option [] ["nocolor"] (NoArg NoColor) "disable ANSI color output" , Option [ ] ["alpha"] (NoArg AlphaOnly) "ignore all but A-Z characters when forming words [default]" , Option ['a'] ["alphanumeric"] (NoArg AlphaNumeric) "ignore all but A-Z and 0-9" , Option ['p'] ["withpunc"] (NoArg WithPunc) "use all non-whitespace chars as 'words'" , Option ['i'] ["ignore-case"] (NoArg CaseInsensitive) "ignore case distinctions" ] usage = "\nVersion "++version++"\n"++ "Usage: wordsetdiff [OPTION...] file1 file2 [file3 ...]\n\n"++ "The wordsetdiff program subtracts the contents of file2, file3, ... from file1.\n\n"++ "As output wordsetdiff will print any remaining regions of text in file1 after the\n"++ "other files' contents are subtracted. Files are compared by treating them as\n"++ "unordered sets of word sequences of length N-words.\n"++ "\nOptions include:\n" defaultErr errs = error $ "ERROR!\n" ++ (List.concat errs ++ usageInfo usage options) safeRead :: String -> Int safeRead s = case readDec s of [(n,"")] -> n _ -> error$ "Could not read '"++ s ++"' as an integer." -------------------------------------------------------------------------------- -- | Tracking simple source locations as (start,end) inclusive/exclusive character indices. data Loc = Loc Int64 Int64 deriving (Show, Ord, Eq) left (Loc l _) = l right (Loc _ r) = r -- | Returns words satisfying whose characters satisfy a predicate along with their ZERO BASED locations. words_wloc :: (Char -> Bool) -> BC.ByteString -> [(B.ByteString, Loc)] words_wloc isWordChar bs = -- Convert each point location into a start/end Loc structure: List.map withLoc filtered where withLoc :: (B.ByteString,Int64) -> (B.ByteString,Loc) withLoc (word,pos) = (word, Loc pos (pos + (fromIntegral$ B.length word))) filtered = List.filter (not . B.null . fst) withpos withpos = List.scanl scanner (B.empty,-1) split scanner (last,pos) chunk = (mytoStrict chunk, pos + (fromIntegral$ B.length last) + 1) split = BC.splitWith (not . isWordChar) bs -- | Cluster regions together if they are "almost touching". -- Any regions within clump_distance characters of one another are joined. -- The result should have no overlaps: clump_regions :: [Loc] -> [Loc] clump_regions [] = [] clump_regions [a] = [a] clump_regions (Loc a1 a2 : Loc b1 b2 :c) | b1-a2 <= clump_distance = clump_regions (Loc a1 (max a2 b2) : c) clump_regions (a:b) = a : clump_regions b -- | Take the "bounding box" of a list of locations. combine_locs :: [Loc] -> Loc combine_locs [] = error "No locs!" combine_locs [a] = a combine_locs ls = Loc (left$ List.head ls) (right$ List.last ls) -- | Form a map mapping words to a set of occurrence locations within the bytestring. -- | This version forms a map using consecutive sequences of -- | N words (represented as lists) as the keys instead of individual words. wordmapN :: (Char -> Bool) -> Int -> BC.ByteString -> TupMap (Set.Set Loc) wordmapN isWordChar n bs = M.fromListWith Set.union $ combined where loc_list ls = Loc (List.head ls) (List.last ls) -- TODO, could try not to use lists here and manually fuse all this stuff: combined :: [(Window, Set.Set Loc)] combined = List.map (combine_entries) $ sliding_win n $ words_wloc isWordChar $ bs combine_entries :: [(B.ByteString, Loc)] -> (Window, Set.Set Loc) combine_entries ls = (pack_window (List.map fst ls), Set.singleton (combine_locs (List.map snd ls))) -- Perhaps not the most efficient... -- | Similar to wordmapN but use lines rather than a sliding window of words. lineMap :: (Char -> Bool) -> BC.ByteString -> TupMap (Set.Set Loc) lineMap isWordChar bs = M.fromListWith Set.union $ words_only where -- Here's a little trick, we get labeled lines by forming them as "big words" separated by newlines: lns :: [(B.ByteString, Loc)] lns = words_wloc (not . (== '\n')) bs words_only :: [(Window, Set.Set Loc)] words_only = List.map (\ (bs,loc) -> (pack_window$ List.filter (not . B.null) $ B.splitWith (not . isWordChar . w2c) bs, Set.singleton loc)) lns --sliding_win :: Int -> [ByteString] -> [Window] sliding_win :: Int -> [(B.ByteString,Loc)] -> [[(B.ByteString,Loc)]] --sliding_win :: Int -> [a] -> [[a]] sliding_win n ls = List.take (List.length ls - n + 1) $ List.map (List.take n) $ List.tails ls -- | The region of interest will end up bloated with separator -- charactors around the edges. This will trim those down. trim_separators :: (Char -> Bool) -> BC.ByteString -> [Loc] -> [Loc] trim_separators isWordChar bs [] = [] trim_separators isWordChar bs diffs@(Loc start end : _) = -- Note: passing isWordChar as a function is probably less efficient than passing a boolean flag. -- (Unless GHC does MLton-style control flow analysis and transforming calling sites into switches.) loop 0 bs diffs where -- This look keeps the head of the bytestring bs lined up with the region we're looking at. -- Invariant next start == offset (could get rid of offset) loop offset bs [] = [] loop offset bs origd@(Loc start end : diffs) -- Need to scroll the tape. | offset < start = loop start (BC.drop (start - offset) bs) origd | isWordChar (BC.head bs) = -- Scroll past to the next (non-overlapping) segment: let tail = loop end (BC.drop (end-offset) bs) diffs in case trim_tail offset bs end of Nothing -> tail Just trimmed -> Loc start trimmed : tail -- Otherwise scroll forward one character: | otherwise = loop (offset+1) (BC.tail bs) (Loc (start+1) end : diffs) -- Trim from the other end. trim_tail offset bs end | end == offset = Nothing -- The whole section is nixed is trimmed | isWordChar (BC.index bs (end - offset - 1)) = Just end | otherwise = trim_tail offset bs (end - 1) --baseline = [SetColor Background Dull Black] withCol False viv col act = act withCol True viv col act = do setSGR [SetColor Foreground viv col] act setSGR [] -- | Print out results, i.e. the distinct regions of text within one file and not the other. print_diff_regions :: Bool -> BC.ByteString -> [Loc] -> IO () print_diff_regions color bs ls = loop 0 0 (BC.lines bs) ls where loop lnum pos lines [] = return () loop lnum pos [] diff = error$ "difference regions beyond end of file: " ++ show diff loop lnum pos (ln:lines) origd@(Loc start end : diffs) -- NOTE: This adds ONE character for the newline, won't work with carriage-return/newline: | pos + BC.length ln < start = loop (lnum+1) (pos + BC.length ln + 1) lines origd | otherwise = do -- Make all these locations one-based? For now I make just the lines one-based. --withCol Dull Red $ Prelude.putStr$ "\n==== line "++ show (lnum+1) ++": " withCol color Dull Red $ Prelude.putStr$ "\n==== " Prelude.putStr$ "Found distinct material, "++ show (end-start) ++" characters (chars "++ show start ++ " to " ++ show end ++ ") " -- starting on withCol color Dull Red $ Prelude.putStrLn$ "line "++ show (lnum+1) ++"\n" let snip = BC.take (end-start) $ BC.drop (start-pos) $ BC.unlines (ln:lines) header = BC.take (start-pos) $ BC.repeat ' ' withCol color Dull Green $ do BC.putStr header -- Not sure if this helps readability BC.putStrLn snip loop lnum pos (ln:lines) diffs -- Finished printing that diff, move to next data Config = Cfg { color :: Bool , word_sequence_size :: Int , case_insensitive :: Bool , is_wordchar :: Char -> Bool } default_cfg = Cfg { color = True , word_sequence_size = default_word_sequence_size , case_insensitive = False , is_wordchar = isAlpha } main = do args <- getArgs -- We could perhaps search for a value of word_sequence_size that -- gives us a "desirable" output according to some metric, which -- would probably be a combination of the number of distinct -- difference regions and the total length of characters in them. -- let (left, right, word_sequence_size) = -- case args of -- [l,r] -> (l,r, default_word_sequence_size) -- [l,r,w] -> (l,r, read w) -- _ -> error "Expects two arguments!" (opts,left,rights) <- case getOpt Permute options args of (o,l:r:rest,[]) -> return (o,l,r:rest) (_,_,errs) -> defaultErr errs let cfg = List.foldl process_opt default_cfg opts process_opt cfg opt = case opt of NoColor -> cfg { color = False } NWords n -> cfg { word_sequence_size = n } UseLines -> cfg { word_sequence_size = 0 } -- A little convention here, 0 means lines. WithPunc -> cfg { is_wordchar = not . isSpace } AlphaOnly -> cfg { is_wordchar = isAlpha } AlphaNumeric -> cfg { is_wordchar = isAlphaNum } CaseInsensitive -> cfg { case_insensitive = True } _bs_left <- BC.readFile left _bs_rights <- mapM BC.readFile rights let -- This is a sloppy way to implement case insesitivity. Do it at the outset: isWordChar = is_wordchar cfg lower = if case_insensitive cfg then BC.map toLower else id bs_left = lower _bs_left bs_rights = List.map lower _bs_rights get_snips bs = if 0 == word_sequence_size cfg then lineMap isWordChar bs else wordmapN isWordChar (word_sequence_size cfg) bs snips_left = get_snips bs_left sub_one_file (remain,common) bs_right = let snips_right = get_snips bs_right remain' = difference remain snips_right common' = intersection common snips_right in (remain',common') -- We go through each of the files to carve its contents out -- of the starting contents of file1 ('left') (remain,common) = List.foldl sub_one_file (snips_left, snips_left) bs_rights sorted_locs map = List.sort $ List.concat $ List.map Set.toList $ M.elems $ map diff_area = clump_regions $ sorted_locs remain common_area = clump_regions $ sorted_locs common -- Because of our N-word strategy we get sloppy edges, to -- prune the borders we subtract out the known common area: -- 'pruned' is the final, important differences. pruned = trim_separators isWordChar bs_left $ loop diff_area common_area inbetween a1 a2 b = (a1 <= b && b <= a2) loop a [] = a loop [] b = [] loop a@(Loc a1 a2 :ar) b@(Loc b1 b2 :br) -- Subtract b from a | a2 < b1 = Loc a1 a2 : loop ar b -- Already in the clear. | b2 < a1 = loop a br -- Keep looking for disqualification | inbetween a1 a2 b1 && inbetween a1 a2 b2 = -- It splits us in two: loop (add_loc a1 b1 $ add_loc b2 a2 ar) br | inbetween a1 a2 b1 = add_loc a1 b1 (loop ar b) -- Chop our right end | inbetween a1 a2 b2 = loop (add_loc b2 a2 ar) br -- Chop our left end -- In this case we instead fall entirely inside the prune range. | inbetween b1 b2 a1 && inbetween b1 b2 a2 = loop ar b | otherwise = error$ "Should not happen (a1 a2) (b1 b2): "++ show (a1,a2) ++" "++ show (b1,b2) add_loc start end ls | start == end = ls | otherwise = Loc start end : ls numpruned = List.length pruned withCol (color cfg) Dull Red $ Prelude.putStr$ "==== " Prelude.putStr$ "Subtracting contents of file(s) " withCol (color cfg) Dull Red$ Prelude.putStr$ (List.concat$ List.intersperse ", " $ List.map takeBaseName rights) Prelude.putStr$ " from " withCol (color cfg) Dull Red$ Prelude.putStr$ takeBaseName left if List.null pruned then do Prelude.putStrLn$ "\nNo contents remaining after difference."; exitSuccess else do print_diff_regions (color cfg) bs_left pruned withCol (color cfg) Dull Red$ Prelude.putStrLn$ "\n--------------------------------------------------------------------------------" Prelude.putStrLn$ " !!! Discovered "++ show numpruned ++" difference region(s) using word sequences of length " ++ show (word_sequence_size cfg) ++". Excerpts above." exitWith (ExitFailure numpruned) {- Performance notes [2010.10.13]: If I run this on a 378K file on my laptop it takes 1.7 seconds with under 50% productivity. Parallelism in the map/set operations could help a little here... not sure what else would other than using an imperative algorithm with different data structures. Comparing against some stray versions of my large amorphous computing notes file it does indeed prove useful. [2010.10.17] I subtracted a 16.5 meg file from a 566K file... this took about 54 seconds and went over 1.3 gb usage just eyeballing activity monitor. (Though RTS -s claims only 641mb max memory.) 55.55% productivity. 5.1 gb alloc... so I guess I can't really just turn it off. (There were NO remaining differences. Nice.) [2010.10.18] {Tried a Trie implementation} . It's not working yet... it gets slightly wrong answers. I tried to time it anyway for fun on that .56/16.5 mb file combo. Hmm.. running my VM so I don't have enough memory... going to turn that off. Well... in any case it seems to be taking a very very long time (5min have passed). Let's try a smaller file. What about just subtracting a 566K file from itself? 6.5 seconds with initial Trie implementation vs. 3.0 seconds for the data.map one. What about a hashmap? That speeds it up to 2.1 seconds. -}