-- Copyright (C) 2009 Petr Rockai
--
-- Permission is hereby granted, free of charge, to any person
-- obtaining a copy of this software and associated documentation
-- files (the "Software"), to deal in the Software without
-- restriction, including without limitation the rights to use, copy,
-- modify, merge, publish, distribute, sublicense, and/or sell copies
-- of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be
-- included in all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-- SOFTWARE.

-- |
-- Module      : Darcs.Repository.Diff
-- Copyright   : 2009 Petr Rockai
-- License     : MIT
-- Maintainer  : darcs-devel@darcs.net
-- Stability   : experimental
-- Portability : portable

module Darcs.Repository.Diff
    (
      treeDiff
    ) where

import Darcs.Prelude

import qualified Data.ByteString.Lazy.Char8 as BLC
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL

import Data.List ( sortBy )

import Darcs.Util.Tree      ( diffTrees
                            , zipTrees
                            , TreeItem(..)
                            , Tree
                            , readBlob
                            , emptyBlob
                            )
import Darcs.Util.Path( AnchoredPath, anchorPath )


import Darcs.Util.ByteString ( isFunky )
import Darcs.Patch  ( PrimPatch
                    , hunk
                    , canonize
                    , binary
                    , addfile
                    , rmfile
                    , adddir
                    , rmdir
                    , invert
                    )
import Darcs.Repository.Prefs ( FileType(..) )
import Darcs.Patch.Witnesses.Ordered ( FL(..), (+>+) )
import Darcs.Patch.Witnesses.Sealed ( Gap(..) )
import Darcs.Repository.Flags ( DiffAlgorithm(..) )

data Diff m = Added (TreeItem m)
            | Removed (TreeItem m)
            | Changed (TreeItem m) (TreeItem m)


getDiff :: AnchoredPath
        -> Maybe (TreeItem m)
        -> Maybe (TreeItem m)
        -> (AnchoredPath, Diff m)
getDiff p Nothing (Just t) = (p, Added t)
getDiff p (Just from) (Just to) = (p, Changed from to)
getDiff p (Just t) Nothing = (p, Removed t)
getDiff _ Nothing Nothing = error "impossible case" -- zipTrees should never return this


treeDiff :: forall m w prim . (Monad m, Gap w, PrimPatch prim)
         => DiffAlgorithm
         -> (FilePath -> FileType)
         -> Tree m
         -> Tree m
         -> m (w (FL prim))
treeDiff da ft t1 t2 = do
    (from, to) <- diffTrees t1 t2
    diffs <- mapM (uncurry diff) $ sortBy organise $ zipTrees getDiff from to
    return $ foldr (joinGap (+>+)) (emptyGap NilFL) diffs
  where
    -- sort into removes, changes, adds, with removes in reverse-path order
    -- and everything else in forward order
    organise :: (AnchoredPath, Diff m) -> (AnchoredPath, Diff m) -> Ordering

    organise (p1, Changed _ _ ) (p2, Changed _ _) = compare p1 p2
    organise (p1, Added _)      (p2, Added _)   = compare p1 p2
    organise (p1, Removed _)    (p2, Removed _) = compare p2 p1

    organise (_, Removed _) _ = LT
    organise _ (_, Removed _) = GT

    organise (_, Changed _ _) _ = LT
    organise _ (_, Changed _ _) = GT

    diff :: AnchoredPath -> Diff m -> m (w (FL prim))
    diff _ (Changed (SubTree _) (SubTree _)) = return (emptyGap NilFL)
    diff p (Removed (SubTree _)) =
        -- Note: With files we first make the file empty before removing it.
        -- But for subtrees this has already been done in previous recursive calls.
        return $ freeGap (rmdir p :>: NilFL)
    diff p (Added (SubTree _)) =
        return $ freeGap (adddir p :>: NilFL)
    diff p (Added b'@(File _)) =
        do diff' <- diff p (Changed (File emptyBlob) b')
           return $ joinGap (:>:) (freeGap (addfile p)) diff'
    diff p (Removed a'@(File _)) =
        do diff' <- diff p (Changed a' (File emptyBlob))
           return $ joinGap (+>+) diff' (freeGap (rmfile p :>: NilFL))
    diff p (Changed (File a') (File b')) =
        do a <- readBlob a'
           b <- readBlob b'
           case ft (anchorPath "" p) of
             TextFile | no_bin a && no_bin b ->
                          return $ text_diff p a b
             _ -> return $ if a /= b
                              then freeGap (binary p (strict a) (strict b) :>: NilFL)
                              else emptyGap NilFL
    diff p (Changed a'@(File _) subtree@(SubTree _)) =
        do rmFileP <- diff p (Removed a')
           addDirP <- diff p (Added subtree)
           return $ joinGap (+>+) rmFileP addDirP
    diff p (Changed subtree@(SubTree _) b'@(File _)) =
        do rmDirP <- diff p (Removed subtree)
           addFileP <- diff p (Added b')
           return $ joinGap (+>+) rmDirP addFileP
    diff p _ = error $ "Missing case at path " ++ show (anchorPath "" p)

    text_diff p a b
        | BL.null a && BL.null b = emptyGap NilFL
        | BL.null a = freeGap (diff_from_empty p b)
        | BL.null b = freeGap (diff_to_empty p a)

        -- What is 'a line'? One view is that a line is something that is
        -- /terminated/ by either a newline or end of file. Another view is
        -- that lines are /separated/ by newline symbols.
        --
        -- The first view is the more "intuitive" one. The second is more
        -- "technical", it has the simpler definition and the highly desirable
        -- property that splitting a text into lines and joining them with
        -- newline symbols are inverse operations. The last point is the reason
        -- we never use the standard versions of 'unlines' for ByteString
        -- anywhere in darcs.
        --
        -- The two views differ mostly when enumerating the lines of a file
        -- that ends with a newline symbol: here, the technical view counts one
        -- more (empty) line. This leads to un-intuitive (though technically
        -- not incorrect) results when calculating the diff for a change that
        -- appends an empty line to a file that already has a newline at the
        -- end. For instance, for a file with a single, newline-terminated line
        -- of text, the LCS algorithm would tell us that a *third* (empty) line
        -- is being added.
        --
        -- To avoid this, we add a special case here: we strip off common
        -- newline symbols at the end. When we later split the result into
        -- lines for the diff algorithm, it never gets to see the empty
        -- last lines in both files and thus gives us the more intuitive result.

        | BLC.last a == '\n' && BLC.last b == '\n'
                    = freeGap (line_diff p (linesB $ BLC.init a) (linesB $ BLC.init b))
        | otherwise = freeGap (line_diff p (linesB a) (linesB b))

    line_diff p a b = canonize da (hunk p 1 a b)

    diff_to_empty p x | BLC.last x == '\n' = line_diff p (init $ linesB x) []
                      | otherwise = line_diff p (linesB x) [B.empty]

    diff_from_empty p x = invert (diff_to_empty p x)

    no_bin = not . isFunky . strict . BL.take 4096

    linesB = map strict . BLC.split '\n'

    strict = B.concat . BL.toChunks