src/Filediff.hs

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE InstanceSigs #-}

-- | The module exposing the functionality of this package
module Filediff
( -- * lists
  diffLists
, applyListDiff

  -- * files
, diffFiles
, applyFileDiff

  -- * directories
, diffDirectories
, diffDirectoriesWithIgnoredSubdirs
, applyDirectoryDiff
) where

import Debug.Trace
import qualified Data.HashMap as HMap

import Control.Concurrent (forkIO)
import Control.Concurrent.Thread as Thread (Result(..), result)
import Control.Concurrent.Thread.Group as ThreadGroup (new, forkIO, wait)

import qualified System.IO as IO
import qualified System.Directory as D

import qualified Data.Text as T
import qualified Data.Text.IO as TIO

import qualified Data.MemoCombinators as Memo

import Data.Either.Combinators

-- function imports

import Data.Maybe (isJust, fromJust, catMaybes)

import Data.List ((\\), intersect)

import Data.Monoid
import Control.Applicative
import Control.Monad
import Control.Monad.Trans.Either
import Control.Monad.IO.Class (liftIO)

-- Filediff imports

import Filediff.Types
import Filediff.Utils
    ( (</>)
    , (<.>)
    , getFileDirectory
    , removeDotDirs
    , createFileWithContents
    , dropUntil
    , removeFirstPathComponent
    , removePathComponents
    , getDirectoryContentsRecursiveSafe
    , isPrefix
    , dropPrefix
    , dropInitialSlash
    , dropTrailingSlash )

-- * basic operations

-- | /O(mn)/. Compute the difference between the two files (more
--   specifically, the minimal number of changes to make to transform the
--   file residing at the location specified by the first
--   parameter into the second). Throws an exception if either or both of
--   the parameters point to a directory, not a file.
--
--   Files are allowed to not exist at either or both of the parameters.
diffFiles :: FilePath -> FilePath -> IO Filediff
diffFiles a b = do
    aIsDir <- D.doesDirectoryExist a
    bIsDir <- D.doesDirectoryExist b
    when (aIsDir || bIsDir) $ error $ "One or both of " ++ a ++ " and " ++ b ++ "is not a file, but a directory."

    aExists <- D.doesFileExist a
    bExists <- D.doesFileExist b
    case (aExists, bExists) of
        (False, False) -> return $ Filediff a b mempty
        (False, True ) -> addCase
        (True , False) -> delCase
        (True , True ) -> modCase
    where
        addCase :: IO Filediff
        addCase = do
            let aLines = []
            bLines <- T.lines <$> TIO.readFile b
            return Filediff
                { base = a
                , comp = b
                , change = Add $ diffLists aLines bLines }

        delCase :: IO Filediff
        delCase = do
            aLines <- T.lines <$> TIO.readFile a
            let bLines = []
            return Filediff
                { base = a
                , comp = b
                , change = Del $ diffLists aLines bLines }

        modCase :: IO Filediff
        modCase = do
            aLines <- T.lines <$> TIO.readFile a
            bLines <- T.lines <$> TIO.readFile b
            return Filediff
                { base = a
                , comp = b
                , change = Mod $ diffLists aLines bLines }

-- | Compute the difference between the two directories (more
--   specifically, the minimal number of changes to make to transform the
--   directory residing at the location specified by the first
--   parameter into the second). Throws an exception if either or both of
--   the parameters point to a file, not a directory.
diffDirectories :: FilePath -> FilePath -> IO Diff
diffDirectories a b = diffDirectoriesWithIgnoredSubdirs a b [] []

-- | Diff two directories, ignoring some subdirectories. The first
-- `[FilePath]` parameter refers to the first `FilePath` parameter,
-- and same for the second, respectively.
diffDirectoriesWithIgnoredSubdirs :: FilePath -> FilePath -> [FilePath] -> [FilePath] -> IO Diff
diffDirectoriesWithIgnoredSubdirs a' b' aToIgnore bToIgnore = do
    aIsFile <- D.doesFileExist a
    bIsFile <- D.doesFileExist b
    when (aIsFile || bIsFile) $ error $ "One or both of " ++ a ++ " and " ++ b ++ "is not a directory, but a file."

    aContents' <- getDirectoryContentsRecursiveSafe a
    bContents' <- getDirectoryContentsRecursiveSafe b
    let aContents = filter (not . shouldIgnore aToIgnore) aContents'
    let bContents = filter (not . shouldIgnore bToIgnore) bContents'

    intersectionDiffs <- getDiffs $ intersect aContents bContents

    aOnlyDiffs <- getDiffs $ aContents \\ bContents

    bOnlyDiffs <- getDiffs $ bContents \\ aContents

    let allDiffs = map makeRelative $ intersectionDiffs ++ aOnlyDiffs ++ bOnlyDiffs

    return $ Diff allDiffs
    where
        a :: FilePath
        a = dropTrailingSlash a'

        b :: FilePath
        b = dropTrailingSlash b'

        -- | `x` is the prefix of the "base" of the diff; `y` is the
        -- | "compare".
        getDiffs :: [FilePath] -> IO [Filediff]
        getDiffs filepaths
            = filter (not . isIdentityFileDiff)
            <$> mapM (\fp -> diffFiles (a </> fp) (b </> fp)) filepaths

        isIdentityFileDiff :: Filediff -> Bool
        isIdentityFileDiff = (==) mempty . change

        makeRelative :: Filediff -> Filediff
        makeRelative (Filediff base comp change) =
            Filediff
                (dropInitialSlash $ dropPrefix a base)
                (dropInitialSlash $ dropPrefix b comp)
                change

        shouldIgnore :: [FilePath] -> FilePath -> Bool
        shouldIgnore toIgnore filepath = any (flip isPrefix $ filepath) toIgnore

-- | /O(n)/. Apply a diff to a file. Returns the fail state if
-- application fails. For more on how diff application can fail,
-- see 'applyListDiff'.
applyFileDiff :: Filediff -> FilePath -> EitherT Error IO [Line]
applyFileDiff (Filediff _ _ change) filepath = do
    case change of
        Del _       -> delCase
        Mod listdiff -> modCase listdiff
        Add listdiff -> addCase listdiff
    where
        delCase :: EitherT Error IO [Line]
        delCase = liftIO (D.removeFile filepath) >> right []

        addCase :: ListDiff Line -> EitherT Error IO [Line]
        addCase listDiff = liftIO (createFileWithContents filepath "") >> modCase listDiff

        modCase :: ListDiff Line -> EitherT Error IO [Line]
        modCase listDiff = do
            -- Data.Text.IO.readFile is strict, which is what we
            -- need, here (because of the write right after)
            file <- liftIO (TIO.readFile filepath)
            result <- hoistEither (applyListDiff listDiff . T.lines $ file)
            liftIO (TIO.writeFile filepath (safeInit . T.unlines $ result))-- `init` for trailing \n
            right result

        safeInit :: T.Text -> T.Text
        safeInit x = if T.null x then x else T.init x

-- | Applies a `Diff` to a directory. Returns the fail state if
-- any file application fails, but because of the parallelism in the
-- implementation, all file diffs will be attempted to be applied, so
-- if this fails, your directory will be left in an inconsistent state.
-- For more on how diff application can fail, see 'applyListDiff'.
applyDirectoryDiff :: Diff -> FilePath -> EitherT Error IO [[Line]]
applyDirectoryDiff (Diff filediffs) filepath
    = EitherT
    . fmap sequence
    $ mapMParallelWaitForAll (runEitherT . apply) filediffs >>= mapM Thread.result
    where
        apply :: Filediff -> EitherT Error IO [Line]
        apply diff@(Filediff base compare _)
            = applyFileDiff diff (filepath </> base)

-- | forks the given 'IO' action for each element in the given list,
-- but waits for all to finish before returning.
mapMParallelWaitForAll :: (a -> IO b) -> [a] -> IO [Thread.Result b]
mapMParallelWaitForAll f list = do
    group <- ThreadGroup.new
    ioResults <- mapM (fmap snd . ThreadGroup.forkIO group . f) list
    ThreadGroup.wait group
    sequence ioResults

-- | Computes the minimal number of additions and deletions needed to
--   transform the first parameter into the second.
--
--       > λ diffLists "abcdefg" "wabxyze"
--       > ListDiff {dels = [(2,'c'),(3,'d'),(5,'f'),(6,'g')], adds = [(0,'w'),(3,'x'),(4,'y'),(5,'z')]}
diffLists :: forall a. (Eq a) => [a] -> [a] -> ListDiff a
diffLists a b = ListDiff
    (map (\i -> (i, a !! i)) $ nonSubsequenceIndices common a)
    (getProgressiveIndicesToAdd common b)
    where
        common :: [a]
        common = longestCommonSubsequenceWrapper a b

        -- | > λ add
        --   > [(0,"w"),(3,"x"),(4,"y")]
        --   > λ common
        --   > ["a","b","e"]
        getProgressiveIndicesToAdd :: [a] -> [a] -> [(Int, a)]
        getProgressiveIndicesToAdd sub super =
            map (\i -> (i, super !! i)) $ nonSubsequenceIndices sub super

-- | Applies a list diff. For example,
--
--       > ListDiff {dels = [(2,'c'),(3,'d'),(5,'f'),(6,'g')], adds = [(0,'w'),(3,'x'),(4,'y'),(5,'z')]}
--       > λ applyListDiff it "abcdefg"
--       > Right "wabxyze"
--
-- Returns a fail state if the diff cannot be applied. This can happen
-- for two reasons: first, the diff calls for a deletion at an index
-- but the element at that index doesn't match the element believed by
-- the to be diff at that index. Second, it can happen if the diff calls
-- for an element to be added at an index too large for the given input.
-- Here are respective examples of inputs that would trigger this case:
--
--     > let base = "abcdefg"
--     > let faultyBase = "ab*defg"
--     > let comp = "wabxyze"
--     > let listDiff = F.diffLists base comp
--     > F.applyListDiff listDiff faultyBase -- fails
--
-- and
--
--     > let base = "abcdefg"
--     > let faultyBase = "abcde"
--     > let comp = "wabxyzefgq"
--     > let listDiff = F.diffLists base comp
--     > F.applyListDiff listDiff faultyBase -- fails
applyListDiff :: forall a. (Eq a) => ListDiff a -> [a] -> Either Error [a]
applyListDiff l@(ListDiff dels adds) list =
    removeAtIndices dels list >>= insertAtProgressiveIndices adds

-- | Best explained by example:
--
--       > λ insertAtProgressiveIndices [(1,'a'),(3,'b')] "def"
--       > Just "daebf"
insertAtProgressiveIndices :: [(Int, a)] -> [a] -> Either Error [a]
insertAtProgressiveIndices = insertAtProgressiveIndices' 0

insertAtProgressiveIndices' :: Int -> [(Int, a)] -> [a] -> Either Error [a]
insertAtProgressiveIndices' _ [] dest = Right dest
insertAtProgressiveIndices' curr src@((i,s):src') dest =
    if i == curr
        then (:) s <$> insertAtProgressiveIndices' (succ curr) src' dest
        else case dest of
            (d:dest') -> (:) d <$> insertAtProgressiveIndices' (succ curr) src dest'
            [] -> Left "Fatal: couldn't apply list diff (application requires inserting at an index larger than the length of the list to which to apply the diff)."

-- all functions below are not exposed

-- don't hit the memotable if not necessary
-- | A wrapper around `longestCommonSubsequence`. It gives a bit of a
-- performance boost; it avoids hitting the memo table to an extent
-- (exactly how much depends on the arguments).
longestCommonSubsequenceWrapper :: forall a. (Eq a) => [a] -> [a] -> [a]
longestCommonSubsequenceWrapper xs ys =
    if xs == ys
        then xs -- (WLOG) don't want to return xs ++ xs
        else commonPrefix
            ++ longestCommonSubsequence (getMiddle xs) (getMiddle ys)
            ++ commonSuffix
    where
        commonPrefix :: [a]
        commonPrefix = getCommonPrefix xs ys

        -- drop (length commonPrefix) to prevent the "abc" vs. "abc*abc" case
        commonSuffix :: [a]
        commonSuffix = reverse
            (getCommonPrefix
                (reverse (drop (length commonPrefix) xs))
                (reverse (drop (length commonPrefix) ys)))

        getCommonPrefix :: [a] -> [a] -> [a]
        getCommonPrefix as bs = map fst . takeWhile (uncurry (==)) $ zip as bs

        -- xs = abcd***efg
        -- ys = abcd???????efg
        -- getMiddle xs == ****
        -- getMiddle ys = ??????
        getMiddle :: [a] -> [a]
        getMiddle elems = take (length elems - length commonPrefix - length commonSuffix) . drop (length commonPrefix) $ elems

-- | Compute the longest common (potentially noncontiguous) subsequence
--   between two sequences. Element type is fixed because memoization
--   requires a static type.
longestCommonSubsequence :: forall a. (Eq a) => [a] -> [a] -> [a]
longestCommonSubsequence xs ys = longestCommonSubsequence' 0 0
    where
        -- TODO: UArray?
        xs' :: HMap.Map Int a
        xs' = foldl update HMap.empty (zip [0..] xs)

        ys' :: HMap.Map Int a
        ys' = foldl update HMap.empty (zip [0..] ys)

        update :: HMap.Map Int a -> (Int, a) -> HMap.Map Int a
        update hmap (i, a) = HMap.insert i a hmap

        xsLength :: Int
        xsLength = length xs

        ysLength :: Int
        ysLength = length ys

        longestCommonSubsequence' :: Int -> Int -> [a]
        longestCommonSubsequence' = Memo.memo2 Memo.integral Memo.integral longestCommonSubsequence''

        longestCommonSubsequence'' :: Int -> Int -> [a]
        longestCommonSubsequence'' i j
            | i == xsLength = []
            | j == ysLength = []
            | x == y = x : longestCommonSubsequence' (i + 1) (j + 1) -- WLOG
            | length caseX > length caseY = caseX
            | otherwise = caseY
            where
                x :: a
                x = xs' HMap.! i

                y :: a
                y = ys' HMap.! j

                caseX :: [a]
                caseX = longestCommonSubsequence' (i + 1) j

                caseY :: [a]
                caseY = longestCommonSubsequence' i (j + 1)

-- | When `sub` is a (not necessarily contiguous) subsequence of `super`,
--   get the index at which each element of `sub` appears. E.g.
--
--       > λ subsequenceIndices "abe" "abcdefg"
--       > [0,1,4]
subsequenceIndices :: (Eq a) => [a] -> [a] -> [Int]
subsequenceIndices [] _ = []
subsequenceIndices _ [] = error "`sub` was not a subsequence of `super`"
subsequenceIndices sub@(a:sub') super@(b:super') =
    if a == b
        then 0 : map succ (subsequenceIndices sub' super')
        else     map succ (subsequenceIndices sub super')

-- | When `sub` is a (not necessarily contiguous) subsequence of `super`,
--   get the indices at which elements of `sub` do *not* appear. E.g.
--
--       > λ nonSubsequenceIndices "abe" "abcdefg"
--       > [2,3,5,6]
nonSubsequenceIndices :: (Eq a) => [a] -> [a] -> [Int]
nonSubsequenceIndices sub super =
    [0..(length super - 1)] \\ (subsequenceIndices sub super)

-- | /O(n)/. `indices` parameter *must* be sorted in increasing order,
--   and indices must all exist.
removeAtIndices :: forall a. (Eq a) => [(Int, a)] -> [a] -> Either Error [a]
removeAtIndices dels list = if not matches
    then Left "Fatal: couldn't apply list diff (application requires removing an element where the diff calls for a different element residing at that index)."
    else Right (removeAtIndices' 0 (map fst dels) list)
    where
        matches :: Bool
        matches = all (< length list) (map fst dels)
            && all (\(i, ch) -> (list !! i) == ch) dels

        removeAtIndices' :: Int -> [Int] -> [a] -> [a]
        removeAtIndices' _ [] xs = xs
        removeAtIndices' curr (i:is) (x:xs) =
            if curr == i
                then     removeAtIndices' (succ curr) is xs
                else x : removeAtIndices' (succ curr) (i:is) xs