module Takedouble (findPossibleDuplicates, getFileNames, checkFullDuplicates, File (..)) where
import Control.Monad.Extra (filterM, forM, ifM)
import qualified Data.ByteString as BS
import Data.List (group, sort, sortOn)
import Data.List.Extra (groupOn)
import System.Directory (doesDirectoryExist, doesPathExist, listDirectory)
import System.FilePattern
import System.FilePath.Posix ((</>))
import System.IO
( Handle,
IOMode (ReadMode),
SeekMode (SeekFromEnd),
hFileSize,
hSeek,
withFile,
)
import System.Posix.Files
( getFileStatus,
isDirectory,
isRegularFile,
)
data File = File
{ File -> FilePath
filepath :: FilePath,
File -> Integer
filesize :: Integer,
File -> ByteString
firstchunk :: BS.ByteString,
File -> ByteString
lastchunk :: BS.ByteString
}
instance Eq File where
(File FilePath
_ Integer
fs1 ByteString
firstchunk1 ByteString
lastchunk1) == :: File -> File -> Bool
== (File FilePath
_ Integer
fs2 ByteString
firstchunk2 ByteString
lastchunk2) =
(Integer
fs1, ByteString
firstchunk1, ByteString
lastchunk1) (Integer, ByteString, ByteString)
-> (Integer, ByteString, ByteString) -> Bool
forall a. Eq a => a -> a -> Bool
== (Integer
fs2, ByteString
firstchunk2, ByteString
lastchunk2)
instance Ord File where
compare :: File -> File -> Ordering
compare (File FilePath
_ Integer
fs1 ByteString
firstchunk1 ByteString
lastchunk1) (File FilePath
_ Integer
fs2 ByteString
firstchunk2 ByteString
lastchunk2) =
(Integer, ByteString, ByteString)
-> (Integer, ByteString, ByteString) -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Integer
fs1, ByteString
firstchunk1, ByteString
lastchunk1) (Integer
fs2, ByteString
firstchunk2, ByteString
lastchunk2)
instance Show File where
show :: File -> FilePath
show (File FilePath
fp Integer
_ ByteString
_ ByteString
_) = ShowS
forall a. Show a => a -> FilePath
show FilePath
fp
findPossibleDuplicates :: [FilePath] -> Maybe String -> IO [[File]]
findPossibleDuplicates :: [FilePath] -> Maybe FilePath -> IO [[File]]
findPossibleDuplicates [FilePath]
filenames Maybe FilePath
glob = do
[File]
files <- (FilePath -> IO File) -> [FilePath] -> IO [File]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM FilePath -> IO File
loadFile [FilePath]
filteredFilenames
[[File]] -> IO [[File]]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([[File]] -> IO [[File]]) -> [[File]] -> IO [[File]]
forall a b. (a -> b) -> a -> b
$ ([File] -> Bool) -> [[File]] -> [[File]]
forall a. (a -> Bool) -> [a] -> [a]
filter (\[File]
x -> Int
1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [File] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [File]
x) ([[File]] -> [[File]]) -> [[File]] -> [[File]]
forall a b. (a -> b) -> a -> b
$ [File] -> [[File]]
forall a. Eq a => [a] -> [[a]]
group ([File] -> [File]
forall a. Ord a => [a] -> [a]
sort [File]
files)
where filteredFilenames :: [FilePath]
filteredFilenames =
case Maybe FilePath
glob of
Maybe FilePath
Nothing -> [FilePath]
filenames
Just FilePath
g -> (FilePath -> Bool) -> [FilePath] -> [FilePath]
forall a. (a -> Bool) -> [a] -> [a]
filter (\FilePath
fn -> Bool -> Bool
not (FilePath
g FilePath -> FilePath -> Bool
?== FilePath
fn)) [FilePath]
filenames
checkFullDuplicates :: [FilePath] -> IO [[FilePath]]
checkFullDuplicates :: [FilePath] -> IO [[FilePath]]
checkFullDuplicates [FilePath]
fps = do
[ByteString]
allContents <- (FilePath -> IO ByteString) -> [FilePath] -> IO [ByteString]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM FilePath -> IO ByteString
BS.readFile [FilePath]
fps
let pairs :: [(FilePath, ByteString)]
pairs = [FilePath] -> [ByteString] -> [(FilePath, ByteString)]
forall a b. [a] -> [b] -> [(a, b)]
zip [FilePath]
fps [ByteString]
allContents
sorted :: [(FilePath, ByteString)]
sorted = ((FilePath, ByteString) -> ByteString)
-> [(FilePath, ByteString)] -> [(FilePath, ByteString)]
forall b a. Ord b => (a -> b) -> [a] -> [a]
sortOn (FilePath, ByteString) -> ByteString
forall a b. (a, b) -> b
snd [(FilePath, ByteString)]
pairs
dups :: [[(FilePath, ByteString)]]
dups = ([(FilePath, ByteString)] -> Bool)
-> [[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]]
forall a. (a -> Bool) -> [a] -> [a]
filter (\[(FilePath, ByteString)]
x -> [(FilePath, ByteString)] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(FilePath, ByteString)]
x Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1) ([[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]])
-> [[(FilePath, ByteString)]] -> [[(FilePath, ByteString)]]
forall a b. (a -> b) -> a -> b
$ ((FilePath, ByteString) -> ByteString)
-> [(FilePath, ByteString)] -> [[(FilePath, ByteString)]]
forall b a. Eq b => (a -> b) -> [a] -> [[a]]
groupOn (FilePath, ByteString) -> ByteString
forall a b. (a, b) -> b
snd [(FilePath, ByteString)]
sorted
res :: [[FilePath]]
res = ((FilePath, ByteString) -> FilePath
forall a b. (a, b) -> a
fst ((FilePath, ByteString) -> FilePath)
-> [(FilePath, ByteString)] -> [FilePath]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>) ([(FilePath, ByteString)] -> [FilePath])
-> [[(FilePath, ByteString)]] -> [[FilePath]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` [[(FilePath, ByteString)]]
dups
[[FilePath]] -> IO [[FilePath]]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [[FilePath]]
res
loadFile :: FilePath -> IO File
loadFile :: FilePath -> IO File
loadFile FilePath
fp = do
(Integer
fsize, ByteString
firstchunk, ByteString
lastchunk) <- FilePath
-> IOMode
-> (Handle -> IO (Integer, ByteString, ByteString))
-> IO (Integer, ByteString, ByteString)
forall r. FilePath -> IOMode -> (Handle -> IO r) -> IO r
withFile FilePath
fp IOMode
ReadMode Handle -> IO (Integer, ByteString, ByteString)
getChunks
File -> IO File
forall (f :: * -> *) a. Applicative f => a -> f a
pure (File -> IO File) -> File -> IO File
forall a b. (a -> b) -> a -> b
$ FilePath -> Integer -> ByteString -> ByteString -> File
File FilePath
fp Integer
fsize ByteString
firstchunk ByteString
lastchunk
chunkSize :: Int
chunkSize :: Int
chunkSize = Int
4 Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
1024
getChunks :: Handle -> IO (Integer, BS.ByteString, BS.ByteString)
getChunks :: Handle -> IO (Integer, ByteString, ByteString)
getChunks Handle
h = do
Integer
fsize <- Handle -> IO Integer
hFileSize Handle
h
ByteString
begin <- Handle -> Int -> IO ByteString
BS.hGet Handle
h Int
chunkSize
Handle -> SeekMode -> Integer -> IO ()
hSeek Handle
h SeekMode
SeekFromEnd (Int -> Integer
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
chunkSize)
ByteString
end <- Handle -> Int -> IO ByteString
BS.hGet Handle
h Int
chunkSize
(Integer, ByteString, ByteString)
-> IO (Integer, ByteString, ByteString)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Integer
fsize, ByteString
begin, ByteString
end)
getFileNames :: FilePath -> IO [FilePath]
getFileNames :: FilePath -> IO [FilePath]
getFileNames FilePath
curDir = do
[FilePath]
names <- FilePath -> IO [FilePath]
listDirectory FilePath
curDir
let names' :: [FilePath]
names' = (FilePath
curDir FilePath -> ShowS
</>) ShowS -> [FilePath] -> [FilePath]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [FilePath]
names
[FilePath]
names'' <- (FilePath -> IO Bool) -> [FilePath] -> IO [FilePath]
forall (m :: * -> *) a.
Applicative m =>
(a -> m Bool) -> [a] -> m [a]
filterM FilePath -> IO Bool
saneFile [FilePath]
names'
[[FilePath]]
files <- [FilePath] -> (FilePath -> IO [FilePath]) -> IO [[FilePath]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [FilePath]
names'' ((FilePath -> IO [FilePath]) -> IO [[FilePath]])
-> (FilePath -> IO [FilePath]) -> IO [[FilePath]]
forall a b. (a -> b) -> a -> b
$ \FilePath
path -> do
let path' :: FilePath
path' = FilePath
curDir FilePath -> ShowS
</> FilePath
path
Bool
exists <- FilePath -> IO Bool
doesDirectoryExist FilePath
path'
if Bool
exists
then FilePath -> IO [FilePath]
getFileNames FilePath
path'
else [FilePath] -> IO [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([FilePath] -> IO [FilePath]) -> [FilePath] -> IO [FilePath]
forall a b. (a -> b) -> a -> b
$ FilePath -> [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure FilePath
path'
[FilePath] -> IO [FilePath]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([FilePath] -> IO [FilePath]) -> [FilePath] -> IO [FilePath]
forall a b. (a -> b) -> a -> b
$ [[FilePath]] -> [FilePath]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[FilePath]]
files
saneFile :: FilePath -> IO Bool
saneFile :: FilePath -> IO Bool
saneFile FilePath
fp =
IO Bool -> IO Bool -> IO Bool -> IO Bool
forall (m :: * -> *) a. Monad m => m Bool -> m a -> m a -> m a
ifM
(FilePath -> IO Bool
doesPathExist FilePath
fp)
( do
FileStatus
stat <- FilePath -> IO FileStatus
getFileStatus FilePath
fp
Bool -> IO Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Bool -> IO Bool) -> Bool -> IO Bool
forall a b. (a -> b) -> a -> b
$ FileStatus -> Bool
isRegularFile FileStatus
stat Bool -> Bool -> Bool
|| FileStatus -> Bool
isDirectory FileStatus
stat
)
(Bool -> IO Bool
forall (f :: * -> *) a. Applicative f => a -> f a
pure Bool
False)