module Takedouble (findPossibleDuplicates, getFileNames, checkFullDuplicates, File (..)) where

import Control.Monad.Extra (filterM, forM, ifM)
import qualified Data.ByteString as BS
import Data.List (group, sort, sortOn)
import Data.List.Extra (groupOn)
import System.Directory (doesDirectoryExist, doesPathExist, listDirectory)
import System.FilePattern
import System.FilePath.Posix ((</>))
import System.IO
  ( Handle,
    IOMode (ReadMode),
    SeekMode (SeekFromEnd),
    hFileSize,
    hSeek,
    withFile,
  )
import System.Posix.Files
  ( getFileStatus,
    isDirectory,
    isRegularFile,
  )

data File = File
  { File -> FilePath
filepath :: FilePath,
    File -> Integer
filesize :: Integer,
    File -> ByteString
firstchunk :: BS.ByteString,
    File -> ByteString
lastchunk :: BS.ByteString
  }

-- | File needs a custom Eq instance so filepath is not part of the comparison
instance Eq File where
  (File FilePath
_ Integer
fs1 ByteString
firstchunk1 ByteString
lastchunk1) == :: File -> File -> Bool
== (File FilePath
_ Integer
fs2 ByteString
firstchunk2 ByteString
lastchunk2) =
    (Integer
fs1, ByteString
firstchunk1, ByteString
lastchunk1) (Integer, ByteString, ByteString)
-> (Integer, ByteString, ByteString) -> Bool
forall a. Eq a => a -> a -> Bool
== (Integer
fs2, ByteString
firstchunk2, ByteString
lastchunk2)

instance Ord File where
  compare :: File -> File -> Ordering
compare (File FilePath
_ Integer
fs1 ByteString
firstchunk1 ByteString
lastchunk1) (File FilePath
_ Integer
fs2 ByteString
firstchunk2 ByteString
lastchunk2) =
    (Integer, ByteString, ByteString)
-> (Integer, ByteString, ByteString) -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Integer
fs1, ByteString
firstchunk1, ByteString
lastchunk1) (Integer
fs2, ByteString
firstchunk2, ByteString
lastchunk2)

instance Show File where
  show :: File -> FilePath
show (File FilePath
fp Integer
_ ByteString
_ ByteString
_) = ShowS
forall a. Show a => a -> FilePath
show FilePath
fp

-- | (hopefully) lazy comparison of files by size, first, and last chunk.
findPossibleDuplicates :: [FilePath] -> Maybe String -> IO [[File]]
findPossibleDuplicates :: [FilePath] -> Maybe FilePath -> IO [[File]]
findPossibleDuplicates [FilePath]
filenames Maybe FilePath
glob = do
  [File]
files <- (FilePath -> IO File) -> [FilePath] -> IO [File]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM FilePath -> IO File
loadFile [FilePath]
filteredFilenames
  [[File]] -> IO [[File]]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([[File]] -> IO [[File]]) -> [[File]] -> IO [[File]]
forall a b. (a -> b) -> a -> b
$ ([File] -> Bool) -> [[File]] -> [[File]]
forall a. (a -> Bool) -> [a] -> [a]
filter (\[File]
x -> Int
1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [File] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [File]
x) ([[File]] -> [[File]]) -> [[File]] -> [[File]]
forall a b. (a -> b) -> a -> b
$ [File] -> [[File]]
forall a. Eq a => [a] -> [[a]]
group ([File] -> [File]
forall a. Ord a => [a] -> [a]
sort [File]
files)
  where filteredFilenames :: [FilePath]
filteredFilenames =
          case Maybe FilePath
glob of
            Maybe FilePath
Nothing -> [FilePath]
filenames
            Just FilePath
g -> (FilePath -> Bool) -> [FilePath] -> [FilePath]
forall a. (a -> Bool) -> [a] -> [a]
filter (\FilePath
fn -> Bool -> Bool
not (FilePath
g FilePath -> FilePath -> Bool
?== FilePath
fn)) [FilePath]
filenames

checkFullDuplicates :: [FilePath] -> IO [[FilePath]]
checkFullDuplicates :: [FilePath] -> IO [[FilePath]]
checkFullDuplicates [FilePath]
fps = do
  [ByteString]
allContents <- (FilePath -> IO ByteString) -> [FilePath] -> IO [ByteString]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM FilePath -> IO ByteString
BS.readFile [FilePath]
fps
  let pairs :: [(FilePath, ByteString)]
pairs = [FilePath] -> [ByteString] -> [(FilePath, ByteString)]
forall a b. [a] -> [b] -> [(a, b)]
zip [FilePath]
fps [ByteString]
allContents
      sorted :: [(FilePath, ByteString)]
sorted = ((FilePath, ByteString) -> ByteString)
-> [(FilePath, ByteString)] -> [(FilePath, ByteString)]
forall b a. Ord b => (a -> b) -> [a] -> [a]
sortOn (FilePath, ByteString) -> ByteString
forall a b. (a, b) -> b
snd [(FilePath, ByteString)]
pairs
      dups :: [[(FilePath, ByteString)]]
dups = ([(FilePath, ByteString)] -> Bool)
-> [[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]]
forall a. (a -> Bool) -> [a] -> [a]
filter (\[(FilePath, ByteString)]
x -> [(FilePath, ByteString)] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(FilePath, ByteString)]
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1) ([[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]])
-> [[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]]
forall a b. (a -> b) -> a -> b
$ ((FilePath, ByteString) -> ByteString)
-> [(FilePath, ByteString)] -> [[(FilePath, ByteString)]]
forall b a. Eq b => (a -> b) -> [a] -> [[a]]
groupOn (FilePath, ByteString) -> ByteString
forall a b. (a, b) -> b
snd [(FilePath, ByteString)]
sorted
      res :: [[FilePath]]
res = ((FilePath, ByteString) -> FilePath
forall a b. (a, b) -> a
fst ((FilePath, ByteString) -> FilePath)
-> [(FilePath, ByteString)] -> [FilePath]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>) ([(FilePath, ByteString)] -> [FilePath])
-> [[(FilePath, ByteString)]] -> [[FilePath]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` [[(FilePath, ByteString)]]
dups
  [[FilePath]] -> IO [[FilePath]]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [[FilePath]]
res

loadFile :: FilePath -> IO File
loadFile :: FilePath -> IO File
loadFile FilePath
fp = do
  (Integer
fsize, ByteString
firstchunk, ByteString
lastchunk) <- FilePath
-> IOMode
-> (Handle -> IO (Integer, ByteString, ByteString))
-> IO (Integer, ByteString, ByteString)
forall r. FilePath -> IOMode -> (Handle -> IO r) -> IO r
withFile FilePath
fp IOMode
ReadMode Handle -> IO (Integer, ByteString, ByteString)
getChunks
  File -> IO File
forall (f :: * -> *) a. Applicative f => a -> f a
pure (File -> IO File) -> File -> IO File
forall a b. (a -> b) -> a -> b
$ FilePath -> Integer -> ByteString -> ByteString -> File
File FilePath
fp Integer
fsize ByteString
firstchunk ByteString
lastchunk

-- | chunkSize is 4096 so NVMe drives will be especially happy
chunkSize :: Int
chunkSize :: Int
chunkSize = Int
4 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024

-- | fetch the file size and first and last 4k chunks of the file
getChunks :: Handle -> IO (Integer, BS.ByteString, BS.ByteString)
getChunks :: Handle -> IO (Integer, ByteString, ByteString)
getChunks Handle
h = do
  Integer
fsize <- Handle -> IO Integer
hFileSize Handle
h
  ByteString
begin <- Handle -> Int -> IO ByteString
BS.hGet Handle
h Int
chunkSize
  Handle -> SeekMode -> Integer -> IO ()
hSeek Handle
h SeekMode
SeekFromEnd (Int -> Integer
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
chunkSize) -- [TODO] needs to be read from filesize - filesize % 4096
  ByteString
end <- Handle -> Int -> IO ByteString
BS.hGet Handle
h Int
chunkSize
  (Integer, ByteString, ByteString)
-> IO (Integer, ByteString, ByteString)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Integer
fsize, ByteString
begin, ByteString
end)

-- | get all the FilePath values
getFileNames :: FilePath -> IO [FilePath]
getFileNames :: FilePath -> IO [FilePath]
getFileNames FilePath
curDir = do
  [FilePath]
names <- FilePath -> IO [FilePath]
listDirectory FilePath
curDir
  let names' :: [FilePath]
names' = (FilePath
curDir FilePath -> ShowS
</>) ShowS -> [FilePath] -> [FilePath]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [FilePath]
names
  [FilePath]
names'' <- (FilePath -> IO Bool) -> [FilePath] -> IO [FilePath]
forall (m :: * -> *) a.
Applicative m =>
(a -> m Bool) -> [a] -> m [a]
filterM FilePath -> IO Bool
saneFile [FilePath]
names'
  [[FilePath]]
files <- [FilePath] -> (FilePath -> IO [FilePath]) -> IO [[FilePath]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [FilePath]
names'' ((FilePath -> IO [FilePath]) -> IO [[FilePath]])
-> (FilePath -> IO [FilePath]) -> IO [[FilePath]]
forall a b. (a -> b) -> a -> b
$ \FilePath
path -> do
    let path' :: FilePath
path' = FilePath
curDir FilePath -> ShowS
</> FilePath
path
    Bool
exists <- FilePath -> IO Bool
doesDirectoryExist FilePath
path'
    if Bool
exists
      then FilePath -> IO [FilePath]
getFileNames FilePath
path'
      else [FilePath] -> IO [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([FilePath] -> IO [FilePath]) -> [FilePath] -> IO [FilePath]
forall a b. (a -> b) -> a -> b
$ FilePath -> [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure FilePath
path'
  [FilePath] -> IO [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([FilePath] -> IO [FilePath]) -> [FilePath] -> IO [FilePath]
forall a b. (a -> b) -> a -> b
$ [[FilePath]] -> [FilePath]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[FilePath]]
files

-- | Check if the file exists, and is not a fifo or broken symbolic link
saneFile :: FilePath -> IO Bool
saneFile :: FilePath -> IO Bool
saneFile FilePath
fp =
  IO Bool -> IO Bool -> IO Bool -> IO Bool
forall (m :: * -> *) a. Monad m => m Bool -> m a -> m a -> m a
ifM
    (FilePath -> IO Bool
doesPathExist FilePath
fp)
    ( do
        FileStatus
stat <- FilePath -> IO FileStatus
getFileStatus FilePath
fp
        Bool -> IO Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Bool -> IO Bool) -> Bool -> IO Bool
forall a b. (a -> b) -> a -> b
$ FileStatus -> Bool
isRegularFile FileStatus
stat Bool -> Bool -> Bool
|| FileStatus -> Bool
isDirectory FileStatus
stat
    )
    (Bool -> IO Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False)