{-# LANGUAGE QuasiQuotes, TupleSections #-}

{- dupe.hs: find duplicate files

   This program finds duplicate files in the directory tree rooted at the
   current working directory.  It demonstrates the goal of the shqq library:
   to combine the easy command execution of traditional shell scripting with
   the strengths of Haskell, such as concurrency and non-trivial data
   structures.

   Depends on: shqq-0.1, spawn-0.3, split-0.1

   Run it with:

     $ ghc -O -threaded -rtsopts dupe.hs
     $ ./dupe +RTS -N
-}

import Control.Concurrent.Spawn
import Data.List.Split
import System.Posix.Files
import System.ShQQ
import qualified Data.Map as M

-- Find potentially duplicate files based on some key, either size
-- or a checksum.
dupes :: (Ord k) => [(FilePath,k)] -> [[FilePath]]
dupes = filter (not . null . drop 1) . M.elems
      . foldr (\(v,k) -> M.insertWith (++) k [v]) M.empty

-- We want to collect data in parallel, but the OS will complain if we
-- have too many open files.  We limit each pass to have at most 256
-- tests in progress at once.
inParallel :: (a -> IO b) -> [a] -> IO [b]
inParallel f xs = do p <- pool 256; parMapIO (p . f) xs

main :: IO ()
main = do
    files <- endBy "\0" `fmap` [sh| find -type f -print0 |]

    -- Get the size of every file, as a first pass.
    let getSize f = ((f,) . fileSize) `fmap` getFileStatus f
    sizeDupes <- dupes `fmap` inParallel getSize files

    -- Checksum the files which have duplicated sizes.
    let getSha f = ((f,) . head . words) `fmap` [sh| sha1sum $f |]
    shaDupes <- dupes `fmap` inParallel getSha (concat sizeDupes)

    -- Print duplicated file names, one per line, with a blank line
    -- after each group of duplicates.
    mapM_ (mapM_ putStrLn . (++[""])) shaDupes
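
{- A small illustration of how 'dupes' groups paths that share a key.  This is
   only a sketch on synthetic input (the file names and numbers are made up;
   the numbers stand in for file sizes or checksums), shown as a hypothetical
   GHCi session rather than as part of the program:

     ghci> dupes [("a.txt", 10), ("b.txt", 20), ("c.txt", 10)]
     [["a.txt","c.txt"]]

   Only groups whose key occurs more than once survive; singleton groups are
   discarded by the 'filter (not . null . drop 1)' step.
-}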