{-# LANGUAGE CPP #-}

------------------------------------------------------------------------------------------
{-| 
   Is one text file a subset of the other?
   Or is there some bit of new text that needs to be salvaged?

   The basic unix diff tool is sometimes incredibly unsatisfactory for
   this purpose -- for example when text has been moved around, or when
   there are widespread whitespace differences.

   This program compares two files by treating them as unstructured
   sets of word sequences.  By default words are defined by `Data.Char.isAlpha`.

   Run `wordsetdiff` with no arguments to print the help information.
 -}

-- TODO: can make the map of sets more efficient by defining a non-empty set type.

------------------------------------------------------------------------------------------

module Main where

import System.Environment
import System.FilePath
import System.IO
import System.Exit

import System.Console.GetOpt
import System.Console.ANSI
import System.FilePath

--import System.Console.GetOpt

import Prelude hiding (null, readFile)

import Debug.Trace
import Numeric
import Data.Int
import Data.Char
import Data.Function
import Data.Word as W
import qualified Data.Set as Set
import qualified Data.List as List

--import qualified Data.ByteString.Lazy as L
--import Data.ByteString.Lazy.Char8 as B
import Data.ByteString as B
import Data.ByteString.Lazy.Char8 as BC

import Data.ByteString.Internal (w2c,c2w)

-- define TRIEVERSION
#define HASHVERSION

#ifdef TRIEVERSION
import qualified Data.Trie as M
import Data.Trie.Convenience as M
type TupMap a = M.Trie a
-- This may not be very efficient:
difference a b = M.mergeBy (\ a b -> Nothing) (M.unionL a b) a 
-- INEFFICIENT AND SERIAL:
intersection left right = 
--   trace ("LEFT "++show left++ "\n\n RIGHT "++ show right++ "\n\nINTERSECT " ++ show x ++"\n")$ 
   x
 where 
  x = M.fromList$
      List.filter (\ (k,_) -> k `M.member` right) $ 
      M.toList left
pack_window ls = B.concat$ List.intersperse (B.singleton$ c2w ' ') ls
-- A window of words is just represented as a single bytestring
-- separated by single spaces.
type Window = B.ByteString
#else
#ifdef HASHVERSION
#warning "Using HashMap instead of Data.Map"
import qualified Data.HashMap as M
type TupMap a = M.HashMap [B.ByteString] a
#else
import qualified Data.Map as M
type TupMap a = M.Map [B.ByteString] a
#endif
intersection = M.intersection
difference = M.difference
pack_window = id
-- A window of words is represented as a list
type Window = [B.ByteString]
#endif

pack_window :: [B.ByteString] -> Window
mytoStrict :: BC.ByteString -> B.ByteString
mytoStrict lazy = 
  case BC.toChunks lazy of 
    []  -> B.empty
    [a] -> a
    ls  -> B.concat ls

--------------------------------------------------------------------------------
-- CONFIGURATION VARIABLES:

version = "0.0.3"

-- How many consecutive words should we look at?
default_word_sequence_size = 3
clump_distance = 10
use_color = True

--------------------------------------------------------------------------------
-- | Command line option flags
data CmdFlag = 
      NoColor
    | NWords Int
    | UseLines
    | WithPunc
    | AlphaNumeric
    | AlphaOnly
    | CaseInsensitive

options :: [OptDescr CmdFlag]
options =
     [ Option ['n']  ["nwords"]   (ReqArg (NWords . safeRead) "NUM")  "use word tuples of length NUM"
     , Option ['l']  ["lines"]    (NoArg UseLines)  "compare using lines instead of a fixed size sliding window of words"
     , Option []     ["nocolor"]  (NoArg NoColor)                 "disable ANSI color output"

     , Option [   ]  ["alpha"]        (NoArg AlphaOnly)  "ignore all but A-Z characters when forming words [default]"
     , Option ['a']  ["alphanumeric"] (NoArg AlphaNumeric)  "ignore all but A-Z and 0-9"
     , Option ['p']  ["withpunc"]     (NoArg WithPunc)   "use all non-whitespace chars as 'words'"

     , Option ['i']  ["ignore-case"]  (NoArg CaseInsensitive) "ignore case distinctions"
     ]

usage = "\nVersion "++version++"\n"++
	 "Usage: wordsetdiff [OPTION...] file1 file2 [file3 ...]\n\n"++	

        "The wordsetdiff program subtracts the contents of file2, file3, ... from file1.\n\n"++

        "As output wordsetdiff will print any remaining regions of text in file1 after the\n"++
        "other files' contents are subtracted.  Files are compared by treating them as\n"++
	"unordered sets of word sequences of length N-words.\n"++

	"\nOptions include:\n"
defaultErr errs = error $ "ERROR!\n" ++ (List.concat errs ++ usageInfo usage options)

safeRead :: String -> Int 
safeRead s = 
  case readDec s of
   [(n,"")] -> n
   _ -> error$ "Could not read '"++ s ++"' as an integer."

--------------------------------------------------------------------------------

-- | Tracking simple source locations as (start,end) inclusive/exclusive character indices.
data Loc = Loc Int64 Int64
  deriving (Show, Ord, Eq)

left  (Loc l _) = l
right (Loc _ r) = r

-- | Returns words satisfying whose characters satisfy a predicate along with their ZERO BASED locations.
words_wloc :: (Char -> Bool) -> BC.ByteString -> [(B.ByteString, Loc)]
words_wloc isWordChar bs = 
  -- Convert each point location into a start/end Loc structure:
  List.map withLoc filtered 
 where 
  withLoc :: (B.ByteString,Int64) -> (B.ByteString,Loc)
  withLoc (word,pos) = (word, Loc pos (pos + (fromIntegral$ B.length word)))

  filtered = List.filter (not . B.null . fst) withpos
  withpos  = List.scanl scanner (B.empty,-1) split 
  scanner (last,pos) chunk = (mytoStrict chunk, pos + (fromIntegral$ B.length last) + 1)
  split    = BC.splitWith (not . isWordChar) bs


-- | Cluster regions together if they are "almost touching".
--   Any regions within clump_distance characters of one another are joined.
--   The result should have no overlaps:
clump_regions :: [Loc] -> [Loc]
clump_regions [] = []
clump_regions [a] = [a]
clump_regions (Loc a1 a2 : Loc b1 b2 :c) | b1-a2 <= clump_distance = clump_regions (Loc a1 (max a2 b2) : c)
clump_regions (a:b) = a : clump_regions b


-- | Take the "bounding box" of a list of locations.
combine_locs :: [Loc] -> Loc
combine_locs []  = error "No locs!"
combine_locs [a] = a
combine_locs ls  = Loc (left$ List.head ls) (right$ List.last ls)		   


-- | Form a map mapping words to a set of occurrence locations within the bytestring.
-- | This version forms a map using consecutive sequences of
-- | N words (represented as lists) as the keys instead of individual words.
wordmapN :: (Char -> Bool) -> Int -> BC.ByteString -> TupMap (Set.Set Loc)
wordmapN isWordChar n bs = 
    M.fromListWith Set.union $ 
    combined
  where 
   loc_list ls = Loc (List.head ls) (List.last ls)

   -- TODO, could try not to use lists here and manually fuse all this stuff:
   combined :: [(Window, Set.Set Loc)]
   combined = List.map (combine_entries) $ 
	      sliding_win n $ 
	      words_wloc isWordChar $ bs

   combine_entries :: [(B.ByteString, Loc)] -> (Window, Set.Set Loc)
   combine_entries ls = (pack_window (List.map fst ls), Set.singleton (combine_locs (List.map snd ls)))
   -- Perhaps not the most efficient...


-- | Similar to wordmapN but use lines rather than a sliding window of words.
lineMap :: (Char -> Bool) -> BC.ByteString -> TupMap (Set.Set Loc)
lineMap isWordChar bs = 
    M.fromListWith Set.union $ 
    words_only
  where 
   -- Here's a little trick, we get labeled lines by forming them as "big words" separated by newlines:
   lns :: [(B.ByteString, Loc)]
   lns = words_wloc (not . (== '\n')) bs

   words_only :: [(Window, Set.Set Loc)]
   words_only = List.map (\ (bs,loc) -> (pack_window$ 
					 List.filter (not . B.null) $
					 B.splitWith (not . isWordChar . w2c) bs,
					 Set.singleton loc)) 
         		 lns       
  

--sliding_win :: Int -> [ByteString] -> [Window]
sliding_win :: Int -> [(B.ByteString,Loc)] -> [[(B.ByteString,Loc)]]
--sliding_win :: Int -> [a] -> [[a]]
sliding_win n ls = List.take (List.length ls - n + 1) $
 	           List.map  (List.take n) $
	           List.tails ls


-- | The region of interest will end up bloated with separator
--   charactors around the edges.  This will trim those down.
trim_separators :: (Char -> Bool) -> BC.ByteString -> [Loc] -> [Loc]
trim_separators isWordChar bs [] = []
trim_separators isWordChar bs diffs@(Loc start end : _) =     
    -- Note: passing isWordChar as a function is probably less efficient than passing a boolean flag.
    -- (Unless GHC does MLton-style control flow analysis and transforming calling sites into switches.)
    loop 0 bs diffs
 where 
  -- This look keeps the head of the bytestring bs lined up with the region we're looking at.
  -- Invariant next start == offset (could get rid of offset)
  loop offset bs [] = []
  loop offset bs origd@(Loc start end : diffs) 

    -- Need to scroll the tape.
    | offset < start = loop start (BC.drop (start - offset) bs) origd

    | isWordChar (BC.head bs) = 
                            -- Scroll past to the next (non-overlapping) segment:
			    let tail = loop end (BC.drop (end-offset) bs) diffs in 
	                    case trim_tail offset bs end of 
                               Nothing      -> tail 
                               Just trimmed -> Loc start trimmed : tail

			    
    -- Otherwise scroll forward one character:
    | otherwise = loop (offset+1) (BC.tail bs) (Loc (start+1) end : diffs)
  -- Trim from the other end.
  trim_tail offset bs end 
      | end == offset = Nothing -- The whole section is nixed is trimmed
      | isWordChar (BC.index bs (end - offset - 1)) = Just end
      | otherwise = trim_tail offset bs (end - 1)

--baseline = [SetColor Background Dull Black]
withCol False viv col act = act
withCol True viv col act = 
    do setSGR [SetColor Foreground viv col]
       act 
       setSGR []

-- | Print out results, i.e. the distinct regions of text within one file and not the other.
print_diff_regions :: Bool -> BC.ByteString -> [Loc] -> IO ()
print_diff_regions color bs ls = 
    loop 0 0 (BC.lines bs) ls
 where 
  loop lnum pos lines [] = return ()
  loop lnum pos [] diff  = error$ "difference regions beyond end of file: " ++ show diff
  loop lnum pos (ln:lines) origd@(Loc start end : diffs)
      -- NOTE: This adds ONE character for the newline, won't work with carriage-return/newline:
      | pos + BC.length ln < start = 
	  loop (lnum+1) (pos + BC.length ln + 1) lines origd
      | otherwise = do 
          -- Make all these locations one-based?  For now I make just the lines one-based.
          --withCol Dull Red $ Prelude.putStr$ "\n==== line "++ show (lnum+1) ++":  " 
	  withCol color Dull Red $ Prelude.putStr$ "\n==== "
	  Prelude.putStr$ "Found distinct material, "++ show (end-start) ++" characters (chars "++ show start ++ " to " ++ show end 
		            ++ ") " -- starting on
	  withCol color Dull Red $ Prelude.putStrLn$ "line "++ show (lnum+1) ++"\n" 
          let snip   = BC.take (end-start) $ BC.drop (start-pos) $ BC.unlines (ln:lines)
	      header = BC.take (start-pos) $ BC.repeat ' '

	  withCol color Dull Green $ do
	    BC.putStr header -- Not sure if this helps readability
	    BC.putStrLn snip
          loop lnum pos (ln:lines) diffs -- Finished printing that diff, move to next

data Config = 
  Cfg { color :: Bool
      , word_sequence_size :: Int
      , case_insensitive :: Bool
      , is_wordchar :: Char -> Bool
      }

default_cfg = 
  Cfg { color = True
      , word_sequence_size = default_word_sequence_size
      , case_insensitive = False
      , is_wordchar = isAlpha
      }

main = 
 do args <- getArgs
    -- We could perhaps search for a value of word_sequence_size that
    -- gives us a "desirable" output according to some metric, which
    -- would probably be a combination of the number of distinct
    -- difference regions and the total length of characters in them.

    -- let (left, right, word_sequence_size) =
    -- 	    case args of 
    -- 	      [l,r]   -> (l,r, default_word_sequence_size)
    -- 	      [l,r,w] -> (l,r, read w)
    -- 	      _ -> error "Expects two arguments!"		   

    (opts,left,rights) <- 
       case getOpt Permute options args of
	 (o,l:r:rest,[]) -> return (o,l,r:rest)
         (_,_,errs)   -> defaultErr errs

    let 
	cfg = List.foldl process_opt default_cfg opts
        process_opt cfg opt = 
          case opt of 
	    NoColor   -> cfg { color = False }
            NWords n  -> cfg { word_sequence_size = n }
            UseLines  -> cfg { word_sequence_size = 0 } -- A little convention here, 0 means lines.
	    WithPunc        -> cfg { is_wordchar = not . isSpace }
	    AlphaOnly       -> cfg { is_wordchar = isAlpha }
	    AlphaNumeric    -> cfg { is_wordchar = isAlphaNum }
            CaseInsensitive -> cfg { case_insensitive = True }

    _bs_left   <- BC.readFile left	      
    _bs_rights <- mapM BC.readFile rights

    let 
        -- This is a sloppy way to implement case insesitivity.  Do it at the outset:
        isWordChar = is_wordchar cfg 
	lower      = if case_insensitive cfg then BC.map toLower else id
        bs_left    = lower _bs_left 
	bs_rights  = List.map lower _bs_rights
	get_snips bs = if 0 == word_sequence_size cfg
		       then lineMap isWordChar bs
		       else wordmapN isWordChar (word_sequence_size cfg) bs
        snips_left = get_snips bs_left

        sub_one_file (remain,common) bs_right = 
	  let
	      snips_right = get_snips bs_right
	      remain' = difference   remain snips_right
	      common' = intersection common snips_right
	  in (remain',common')

        -- We go through each of the files to carve its contents out
        -- of the starting contents of file1 ('left')
        (remain,common) = List.foldl sub_one_file (snips_left, snips_left) bs_rights

        sorted_locs map = List.sort $ List.concat $ 
		          List.map Set.toList $ M.elems $ map
	diff_area   = clump_regions $ sorted_locs remain
	common_area = clump_regions $ sorted_locs common

        -- Because of our N-word strategy we get sloppy edges, to
        -- prune the borders we subtract out the known common area:
	-- 'pruned' is the final, important differences.
        pruned = trim_separators isWordChar bs_left $
		 loop diff_area common_area
        inbetween a1 a2 b = (a1 <= b && b <= a2)			    
	loop a [] = a
	loop [] b = []
	loop a@(Loc a1 a2 :ar) b@(Loc b1 b2 :br) -- Subtract b from a
	    | a2 < b1 = Loc a1 a2 : loop ar b -- Already in the clear.
	    | b2 < a1 =             loop a br -- Keep looking for disqualification
	    | inbetween a1 a2 b1 && inbetween a1 a2 b2 = 
		-- It splits us in two:
		loop (add_loc a1 b1 $ add_loc b2 a2 ar) br
	    | inbetween a1 a2 b1 = add_loc a1 b1 (loop ar b)  -- Chop our right end
	    | inbetween a1 a2 b2 = loop (add_loc b2 a2 ar) br -- Chop our left end

            -- In this case we instead fall entirely inside the prune range.
	    | inbetween b1 b2 a1 && inbetween b1 b2 a2 = loop ar b

	    | otherwise = error$ "Should not happen (a1 a2) (b1 b2): "++ show (a1,a2) ++" "++ show (b1,b2)
	add_loc start end ls | start == end = ls
			     | otherwise = Loc start end : ls
        numpruned = List.length pruned
    
    withCol (color cfg) Dull Red $ Prelude.putStr$ "==== "
    Prelude.putStr$ "Subtracting contents of file(s) "
    withCol (color cfg) Dull Red$ Prelude.putStr$ (List.concat$ List.intersperse ", " $ List.map takeBaseName rights)
    Prelude.putStr$ " from "
    withCol (color cfg) Dull Red$ Prelude.putStr$ takeBaseName left
    if List.null pruned
     then do Prelude.putStrLn$ "\nNo contents remaining after difference."; exitSuccess
     else do 
	     print_diff_regions (color cfg) bs_left pruned
	     withCol (color cfg) Dull Red$ Prelude.putStrLn$ "\n--------------------------------------------------------------------------------"
	     Prelude.putStrLn$ " !!!  Discovered "++ show numpruned ++" difference region(s) using word sequences of length "
                                ++ show (word_sequence_size cfg) ++".  Excerpts above."
	     exitWith (ExitFailure numpruned)

{- 

 Performance notes [2010.10.13]:
    If I run this on a 378K file on my laptop it takes 1.7 seconds with under 50% productivity.

 Parallelism in the map/set operations could help a little here... not
 sure what else would other than using an imperative algorithm with
 different data structures.

Comparing against some stray versions of my large amorphous computing notes file it does indeed prove useful.

[2010.10.17] I subtracted a 16.5 meg file from a 566K file... this
took about 54 seconds and went over 1.3 gb usage just eyeballing
activity monitor. (Though RTS -s claims only 641mb max memory.)  55.55% productivity.
5.1 gb alloc... so I guess I can't really just turn it off.

(There were NO remaining differences.  Nice.)


[2010.10.18] {Tried a Trie implementation}
.
It's not working yet... it gets slightly wrong answers.
I tried to time it anyway for fun on that .56/16.5 mb file combo.
Hmm.. running my VM so I don't have enough memory... going to turn that off.
Well... in any case it seems to be taking a very very long time (5min have passed).

Let's try a smaller file.
What about just subtracting a 566K file from itself?  6.5 seconds with
initial Trie implementation vs. 3.0 seconds for the data.map one.

What about a hashmap?  That speeds it up to 2.1 seconds.

 -}