#!/usr/bin/env runhaskell

{-# LANGUAGE DeriveDataTypeable #-}
{-# OPTIONS_GHC -Wall
                -fno-warn-name-shadowing
                -fno-warn-missing-signatures
                -fno-warn-incomplete-patterns #-}

module Main where

-- una.hs, version 2.0 (2010-07-02)
--
-- by John Wiegley <johnw@newartisans.com>
--
-- A simple universal unarchiving utility.  Just point it at any archive or
-- compressed file, and it spits out a single file or directory in the current
-- directory with its contents.  Use -d to delete the original archive on
-- success.  Use -f to overwrite any existing file or directory which might be
-- in the way.
--
-- To handle all the supported formats on Mac OS X, you must first install:
--   sudo port install cabextract unarj unrar lha p7zip

import           Control.Applicative
import           Control.Concurrent
import qualified Control.Exception as C
import           Control.Monad
import qualified Data.ByteString.Lazy as B
import           Data.Char
import           Data.Function (fix)
import           Data.List
import           Data.Maybe
import           Data.Traversable hiding (sequence)
import           Prelude hiding (sequence)
import           System.Console.CmdArgs
import           System.Directory
import           System.Environment
import           System.Exit
import           System.FilePath
import           System.IO
import           System.IO.Storage
import           System.Process

-- This script takes a series of pathnames to compressed files and/or
-- archives, and uncompresses/decodes/unarchives them.
--
-- What's especially useful about this script is that it guarantees that the
-- result of this process is a single new entry in the current directory,
-- whether that be a file or a directory.  That is:
--
--   1. If it's simply a compressed file, it uncompresses in the current
--      directory.
--   2. If it's an archive containing a single file or directory, the result
--      is the same as if it had been compressed: the file or directory
--      ends up in the current directory.
--   3. If the archive contains multiple items, they are unarchived in a
--      directory named after the original file.

version :: String
version    = "2.0.1"

copyright :: String
copyright  = "2009-2012"

unaSummary :: String
unaSummary = "una v" ++ version ++ ", (C) John Wiegley " ++ copyright

data UnaOpts = UnaOpts
    { delete_   :: Bool
    , force     :: Bool
    , temp      :: FilePath
    , output    :: FilePath
    , sysTemp   :: Bool
    , test      :: Bool
    , files     :: [FilePath]
    }
    deriving (Data, Typeable, Show, Eq)

unaOpts :: UnaOpts
unaOpts = UnaOpts
    { delete_ = def &= help "Delete the original archive if successful"
    , force   = def &= name "f" &= help "Overwrite any existing file/dir"
    , temp    = def &= typDir &=
                help "Use DIR as a temp directory, instead of current"
    , output  = def &= typDir &= name "o" &=
                help "Unarchive to DIR instead of archive's directory"
    , sysTemp = def &= name "T" &=
                help "Use the system's temp directory (typically /tmp)"
    , test    = def &= explicit &= name "test" &=
                help "Extract, throw away resulting file(s), set error code"
    , files   = def &= args &= typ "FILE..."
    } &=
    summary unaSummary &=
    program "una" &=
    help "Universal recursive unarchiver/decoder/decompressor tool"


main :: IO ()
main = do
  mainArgs <- getArgs
  opts <- withArgs (if null mainArgs then ["--help"] else mainArgs)
          (cmdArgs unaOpts)

  --when (null (files opts)) $ cmdArgsApply cmdArgsHelp

  -- Extract each archive given on the command-line.  If it's not recognizable
  -- as an archive, the resulting pathname will be identical to the input, in
  -- which case nothing has been done.  If an error occurs for a given
  -- archive, stop then.
  forM_ (files opts) $ \path -> do
    result <- withStore "main" $ do putValue "main" "opts" opts
                                    extract path (force opts)
    if test opts
      then case result of
        ArchiveError _   -> exitWith $ ExitFailure 1
        FileName fp      -> if fp /= path
                            then removeFilePath fp
                            else exitWith $ ExitFailure 1
        DirectoryName dp -> removeFilePath dp
        DataStream _     -> error "Unexpected DataStream"

      else case result of
        ArchiveError err -> error err
        FileName fp      -> if fp /= path
                            then success path fp "file" (delete_ opts)
                            else putStrLn $ "Archive unrecognized: " ++ fp
        DirectoryName dp -> success path dp "directory" (delete_ opts)
        DataStream _     -> error "Unexpected DataStream"

  -- In case of success, print the final product's path and delete the
  -- original archive if -d was used.
  where success path f kind del = do
          rf <- makeRelativeToCurrentDirectory f
          putStrLn $ "Extracted " ++ kind ++ ": " ++ rf
          when del $ removeFile path


getOption :: Data a => (a -> b) -> IO b
getOption option = do
  opts <- getValue "main" "opts"
  return $ fromJust $ option <$> opts

-- Determine which "type" an archive is by examining its extension.  It may
-- not be an archive at all, but just a compressed file.  It could even be a
-- compressed archive containing just a single file!  There are too many
-- possible combinations to know at this point.

extractors :: [(String, [Extractor])]
extractors =
  [ (".tar",          [tarballExtractor])
  , (".taz",          [compressExtractor, tarballExtractor])
  , (".tgz",          [gzipExtractor, tarballExtractor])
  , (".tbz",          [bzip2Extractor, tarballExtractor])
  , (".tz2",          [bzip2Extractor, tarballExtractor])
  , (".txz",          [xzipExtractor, tarballExtractor])
  , (".shar",         [sharExtractor])
  , (".z",            [compressExtractor])
  , (".gz",           [gzipExtractor])
  , (".bz2",          [bzip2Extractor])
  , (".xz",           [xzipExtractor])
  , (".7z",           [p7zipExtractor])
  , (".zip",          [zipExtractor])
  , (".jar",          [zipExtractor])
  , (".arj",          [arjExtractor])
  , (".lha",          [lhaExtractor])
  , (".lzh",          [lhaExtractor])
  , (".rar",          [rarExtractor])
  , (".uu",           [uuExtractor])
  , (".a",            [arExtractor])
  , (".cab",          [cabExtractor])
  , (".cpio",         [cpioExtractor])

  , (".gpg",          [gpgExtractor])
  , (".asc",          [gpgExtractor])

  , (".dmg",          [diskImageExtractor])
  , (".iso",          [diskImageExtractor])
  , (".cdr",          [diskImageExtractor])
  , (".sparseimage",  [diskImageExtractor])
  , (".sparsebundle", [diskImageExtractor])

  , (".sit",          [stuffItExtractor True])
  , (".sea",          [stuffItExtractor True])
  , (".bin",          [stuffItExtractor False])
  , (".hqx",          [stuffItExtractor False])

  , (".sdk",          [shrinkItExtractor])
  , (".shk",          [shrinkItExtractor])
  , (".bxy",          [shrinkItExtractor])
  , (".bny",          [shrinkItExtractor])
  , (".bqy",          [shrinkItExtractor])
  ]

-- Gzip and family are very simple compressors that can handle streaming data
-- quite easily.

returnStream :: B.ByteString -> IO ExtractionResult
returnStream out = return (DataStream out, noCleanup)

simpleExtractor :: String -> [String] -> Extraction -> IO ExtractionResult
simpleExtractor cmd args item =
  case item of
    DataStream d -> performExtract cmd args d returnStream
    FileName f   -> performExtract cmd (args ++ [f]) B.empty returnStream

bestExe :: [String] -> IO String
bestExe xs = fromJust <$> msum <$> traverse findExecutable xs

gzipExtractor     = Extractor False $ \x -> do
                      exePath <- bestExe ["pigz", "gzip"]
                      simpleExtractor exePath ["-qdc"] x
compressExtractor = gzipExtractor
bzip2Extractor    = Extractor False $ \x -> do
                      exePath <- bestExe ["pbzip2", "bzip2"]
                      simpleExtractor exePath ["-qdc"] x
xzipExtractor     = Extractor False $ \x -> do
                      exePath <- bestExe ["pxz", "xz"]
                      simpleExtractor exePath ["-qdc"] x

uuExtractor       = Extractor False $ simpleExtractor "uudecode" []
gpgExtractor      = Extractor False $ simpleExtractor "gpg" ["-d"]

-- Tarballs and 7-zip are both archive formats that can accept their input on
-- stdin.  It's not likely that someone will compress a 7zip archive, but it's
-- handled anyway.

returnContents :: FilePath -> B.ByteString -> IO ExtractionResult
returnContents dir _ = examineContents dir True

tarballExtractor = Extractor True $ \item -> do
  dir <- createTempDirectory
  case item of
    DataStream d ->
      performExtract "tar" ["xCf", dir, "-"] d (returnContents dir)
    FileName f   ->
      performExtract "tar" ["xCf", dir, f] B.empty (returnContents dir)

sharExtractor = Extractor True $ \item -> do
  dir <- createTempDirectory
  case item of
    DataStream d ->
      performExtract "unshar" ["-d", dir] d (returnContents dir)
    FileName f   ->
      performExtract "unshar" ["-d", dir, f] B.empty (returnContents dir)

p7zipExtractor = Extractor True $ \item -> do
  dir <- createTempDirectory
  case item of
    DataStream d ->
      performExtract "7za" ["x", "-bd", "-o" ++ dir, "-si"] d
                     (returnContents dir)
    FileName f   ->
      performExtract "7za" ["x", f, "-bd", "-o" ++ dir] B.empty
                     (returnContents dir)

-- Zip and CAB are not quite as flexible as tar and 7-zip, in that they cannot
-- accept the archive via stdin.  If there is a stream from an earlier
-- decoding, it must be written to a temp file and then extracted from that.

zipExtractor = Extractor True $ fix $ \fn item ->
  case item of
    DataStream d -> extractByTemp d ".zip" fn
    FileName f   -> do
      dir <- createTempDirectory
      performExtract "unzip" ["-q", "-d", dir, f] B.empty (returnContents dir)

cabExtractor = Extractor True $ fix $ \fn item ->
  case item of
    DataStream d -> extractByTemp d ".cab" fn
    FileName f   -> do
      dir <- createTempDirectory
      performExtract "cabextract" ["-q", "-d", dir, f] B.empty
                     (returnContents dir)

-- cpio doesn't know how to extract its contents to a particular directory, so
-- a temporary must be created.  It can, however, read input from stream.

extractInTempDir :: String -> [String] -> B.ByteString -> IO ExtractionResult
extractInTempDir cmd args inp = do
  dir  <- createTempDirectory
  cdir <- canonicalizePath dir
  cwd  <- getCurrentDirectory
  ccwd <- canonicalizePath cwd

  C.bracket (setCurrentDirectory cdir)
            (\_ -> setCurrentDirectory ccwd)
            (\_ -> performExtract cmd args inp (returnContents cdir))

cpioExtractor = Extractor True $ \item ->
  case item of
    DataStream d -> extractInTempDir "cpio" ["-id"] d
    FileName f   -> extractInTempDir "cpio" ["-idF", f] B.empty

-- The next set of formats can't handle streaming input, nor can they extract
-- to a specified directory.  This is why I call them the "dumb" extractors.
-- Everything must be setup for them in advance.

dumbExtractor :: String -> String -> String -> Extraction -> IO ExtractionResult
dumbExtractor cmd arg ext item =
  case item of
    DataStream d -> extractByTemp d ext (dumbExtractor cmd ext arg)
    FileName f   -> extractInTempDir cmd [arg, f] B.empty

arjExtractor      = Extractor True $ dumbExtractor "unarj" "x" ".arj"
lhaExtractor      = Extractor True $ dumbExtractor "lha" "x" ".lha"
rarExtractor      = Extractor True $ dumbExtractor "unrar" "x" ".rar"
arExtractor       = Extractor True $ dumbExtractor "ar" "x" ".ar"
shrinkItExtractor = Extractor True $ dumbExtractor "nulib2" "-x" ".shk"

-- Disk images are mountable archives, which means the data must be copied out
-- in order to "extract" it.  jww (2010-07-09): We should handle Linux
-- loopbook mounts too.

diskImageExtractor = Extractor True $ fix $ \fn item ->
  case item of
    DataStream d -> extractByTemp d ".dmg" fn
    FileName f   -> do
      let args = ["attach", "-readonly", "-mountrandom", "/tmp",
                  "-noverify", "-noautofsck", f]

      loud <- isLoud
      when loud $ putStrLn $ "! hdiutil " ++ unwords args

      (exit, out, _) <- readProcessWithExitCode "hdiutil" args []
      case exit of
        ExitFailure _ ->
          return (ArchiveError "Failed to attach disk image", noCleanup)

        ExitSuccess -> do
          let mountPoint =
                case find (isInfixOf "/tmp/dmg") (lines out) of
                  Just line -> (Just . last . words) line
                  Nothing   -> Nothing
          case mountPoint of
            Nothing -> return (ArchiveError "Failed to attach disk image",
                               noCleanup)
            Just dir -> do
              tmpDir <- createTempDirectory

              when loud $
                putStrLn $ "! ditto " ++ unwords [dir, tmpDir]
              code <- readProcessWithExitCode "ditto" [dir, tmpDir] []
              _ <- case code of (ExitFailure _, _, _) -> error "ditto: failed"

              when loud $
                putStrLn $ "! hdiutil " ++ unwords ["detach", dir, "-force"]
              code <- readProcessWithExitCode "hdiutil"
                                              ["detach", dir, "-force"] []
              _ <- case code of (ExitFailure _, _, _) -> error "hdiutil: failed"

              examineContents tmpDir True

-- StuffIt Expander is its own creature.  We talk to it via Applescript, as I
-- know of no better way.

stuffItExtractor archivep = Extractor archivep $ fix $ \fn item ->
  case item of
    DataStream d -> extractByTemp d ".sit" fn
    FileName f   -> do
      tmpDir <- createTempDirectory
      canonTmp <- canonicalizePath tmpDir

      let script =    "  tell application \"StuffIt Expander\"\n"
                   ++ "    run\n"
                   ++ "    expand {POSIX file \"" ++ f ++ "\"}"
                   -- ++ " with delete originals"
                   ++ " to POSIX file \"" ++ canonTmp ++ "\"\n"
                   ++ "  end tell"

      loud <- isLoud
      when loud $ putStrLn "! invoke StuffIt Expander"

      (exit, _, err) <- readProcessWithExitCode "osascript" [] script
      case exit of
        ExitSuccess   ->
          if archivep
          then examineContents canonTmp True
          else do contents <- getDirectoryContents canonTmp
                  let elems = delete "." $ delete ".." contents
                      file  = C.assert (length elems == 1)
                                       (canonTmp </> head elems)
                  exists <- doesFileExist file
                  return (FileName $ C.assert exists file,
                          removeDirectoryRecursive canonTmp)
        ExitFailure _ -> return (ArchiveError $
                                 "Failed to invoke StuffIt Expander: " ++ err,
                                 removeDirectoryRecursive canonTmp)

-- Types used by this script.

data Extraction = DataStream    B.ByteString
                | FileName      FilePath
                | DirectoryName FilePath
                | ArchiveError  String
                deriving Show

type ExtractionResult = (Extraction, IO ())

data Extractor = Extractor { isArchive :: Bool
                           , extractor :: Extraction -> IO ExtractionResult }

noCleanup :: IO ()
noCleanup = return ()

-- Given a file path, determine its type and extract its contents.  Depending
-- on the type of the file, and what command-line options the extraction
-- command supports, the result may be one of several types, described by the
-- type Extraction.

extract :: FilePath -> Bool -> IO Extraction
extract rpath overwrite = do
  path    <- canonicalizePath rpath
  pexists <- doesFileExist path

  unless pexists $ error $ "File does not exist: " ++ path

  destination <- getDestination
  fexists     <- doesFileExist destination
  dexists     <- doesDirectoryExist destination

  when (fexists || dexists) $
    if overwrite
    then if fexists
         then removeFile destination
         else removeDirectoryRecursive destination
    else unless (null typs) $
         error $ "Destination already exists: " ++ destination

  -- Recursively perform all the extractions determined by typs, starting with
  -- the input Extraction, which simply identifies the data source path.  This
  -- same path was used to determine typs, rather than sniffing the data for
  -- identifying marks (jww (2012-09-06): sniff data to allow complex stream
  -- extractions in future).
  extract' typs (FileName path, return ())

  where
    (basename, typs) = findExtractors [] rpath

    getDestination = do
      destpath <- getOption output
      cwd      <- getCurrentDirectory
      return $ (if null destpath
                then cwd
                else destpath) </> takeFileName basename

    wrap :: IO () -> IO Extraction -> IO Extraction
    wrap m io = C.bracket (return ()) (const m) (const io)

    extract' :: [Extractor] -> (Extraction, IO ()) -> IO Extraction

    -- The variations of extract' receive a list of archive types yet to be
    -- "unwrapped" from the previous extraction, plus a cleanup action which
    -- must be executed before the final result is returned.  The end result
    -- is that only the final extraction remains, indicated by the return
    -- value, with all temporaries having been properly cleaned up.

    extract' _ (x@(ArchiveError _),m) = wrap m $ return x

    extract' [] (DataStream d,m) = wrap m $ do
      -- If we reach the final step of the unarchiving process, and the result
      -- is a data stream, write it to disk at the desired basename.
      destination <- getDestination
      B.writeFile destination d
      return $ FileName destination

    extract' [] (FileName f,m) = wrap m $ do
      destination <- getDestination
      -- Don't rename the file
      let realdest = dropFileName destination </> takeFileName f
      renameFile f realdest
      return $ FileName realdest

    extract' [] (DirectoryName f,m) = wrap m $ do
      destination <- getDestination
      renameDirectory f destination
      return $ DirectoryName destination

    extract' (t:ts) (x,m) = wrap m $ do
      -- Each extractor returns an ExtractionResult, which identifies the form
      -- of the extraction (is the result in a file, on a data stream, etc.),
      -- and a cleanup action in IO, which should be performed only after any
      -- sub-extractions have taken place.
      y <- extractor t x

      -- If t is an archive extractor, we'll let examineContents decide if it
      -- contains an item needing further extraction.  Otherwise, if there are
      -- successive compression or encoding stages, process them recursively.
      extract' (if isArchive t then [] else ts) y

findExtractors :: [Extractor] -> FilePath -> (FilePath, [Extractor])
findExtractors acc f =
  apply $ lookup (map toLower (takeExtension f)) extractors
  where
    apply (Just types) = findExtractors (acc ++ types) (dropExtension f)
    apply Nothing      = (f, acc)

performExtract :: String          -- command to execute
                  -> [String]     -- command arguments
                  -> B.ByteString -- standard input
                  -> (B.ByteString -> IO ExtractionResult)
                                  -- function to process output
                  -> IO ExtractionResult
performExtract cmd args ds fn = do
  (exit, out, err) <- bReadProcessWithExitCode cmd args ds
  if exit == ExitSuccess
    then fn out
    else return (ArchiveError err, noCleanup)

examineContents :: FilePath -> Bool -> IO ExtractionResult
examineContents dir cleanup = do
  -- Examine the contents of a populated directory
  --  if it's a single file, return the name and a cleanup action to
  --    remove temp
  --  if it's a single directory, recurse this step
  --  if it's many files and/or directories, return the name of the temp
  --    directory, and no cleanup action
  canon    <- canonicalizePath dir
  contents <- getDirectoryContents canon
  case delete "." $ delete ".." contents of
    [] -> return (ArchiveError "Empty archive", removeDirectoryRecursive canon)

    (x:[]) -> do
      let path = canon </> x
      isDir <- doesDirectoryExist path
      if isDir
        then do (x,m) <- examineContents path False
                return (x, do when cleanup $ removeDirectoryRecursive canon; m)
        else do x <- extract path False
                return (x, when cleanup $ removeDirectoryRecursive canon)

    _ -> return (DirectoryName canon, noCleanup)


extractByTemp :: B.ByteString   -- output to write to temp
                 -> String      -- the temporary file extension
                 -> (Extraction -> IO ExtractionResult)
                                -- function to handle the new temp file
                 -> IO ExtractionResult
extractByTemp ds ext fn = do
  dir <- workingDirectory
  (path, handle) <- openBinaryTempFile dir ("file" ++ ext)
  loud <- isLoud
  when loud $ putStrLn $ "> " ++ path
  B.hPut handle ds
  hFlush handle
  (x,m) <- fn (FileName path)
  hClose handle
  return (x, do removeFile path; m)

createTempDirectory :: IO FilePath
createTempDirectory = do
  dir <- workingDirectory
  (path, handle) <- openBinaryTempFile dir "dir.tmp"
  hClose handle
  removeFile path
  createDirectory path
  return path

workingDirectory :: IO FilePath
workingDirectory = do
  sysp <- getOption sysTemp
  if sysp
    then getTemporaryDirectory
    else do dir <- getOption temp
            if null dir
              then return "."
              else return dir

removeFilePath :: FilePath -> IO ()
removeFilePath path = do
  fexists <- doesFileExist path
  dexists <- doesDirectoryExist path
  when (fexists || dexists) $
    if fexists
    then removeFile path
    else removeDirectoryRecursive path

-- The following function was copied from System.Process, because I needed a
-- variant which operates on lazy ByteStrings.  The regular version attempts
-- to decode Unicode in the binary output from the decompressor.

bReadProcessWithExitCode
    :: FilePath                          -- ^ command to run
    -> [String]                          -- ^ any arguments
    -> B.ByteString                      -- ^ standard input
    -> IO (ExitCode,B.ByteString,String) -- ^ exitcode, stdout, stderr
bReadProcessWithExitCode cmd args input = do
    (Just inh, Just outh, Just errh, pid) <-
        createProcess (proc cmd args){ std_in  = CreatePipe
                                     , std_out = CreatePipe
                                     , std_err = CreatePipe }

    outMVar <- newEmptyMVar

    -- fork off a thread to start consuming stdout
    hSetBinaryMode outh True
    out <- B.hGetContents outh
    _   <- forkIO $ C.evaluate (B.length out) >> putMVar outMVar ()

    -- fork off a thread to start consuming stderr
    hSetBinaryMode errh False
    err <- hGetContents errh
    _   <- forkIO $ C.evaluate (length err) >> putMVar outMVar ()

    -- now write and flush any input
    loud <- isLoud
    if loud
      then if B.null input
           then putStrLn $ "! " ++ cmd ++ " " ++ unwords args
           else do putStrLn $ "| " ++ cmd ++ " " ++ unwords args
                   B.hPutStr inh input; hFlush inh
      else unless (B.null input) $ do B.hPutStr inh input; hFlush inh
    hClose inh -- done with stdin

    -- wait on the output
    takeMVar outMVar
    takeMVar outMVar
    hClose outh
    hClose errh

    -- wait on the process
    ex <- waitForProcess pid

    return (ex, out, err)

-- Main.hs (una) ends here