{- arch-tag: GZip file support in Haskell
Copyright (C) 2004 John Goerzen <jgoerzen@complete.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

{- |
   Module     : System.FileArchive.GZip
   Copyright  : Copyright (C) 2004 John Goerzen
   License    : GNU GPL, version 2 or above

   Maintainer : John Goerzen <jgoerzen@complete.org>
   Stability  : provisional
   Portability: portable

GZip file decompression

Copyright (c) 2004 John Goerzen, jgoerzen\@complete.org

The GZip format is described in RFC1952.
module System.FileArchive.GZip (
                                  -- * GZip Files
                                  -- $gzipfiles

                                  -- * Types
                                  Header(..), Section, GZipError(..),
                                  -- * Whole-File Processing
                                  -- * Section Processing

import Data.Compression.Inflate (inflate_string_remainder)
import Data.Hash.CRC32.GZip (update_crc)
import Data.Bits ((.&.))
import Control.Monad.Error -- (Error(), strMsg, throwError)
import Data.Char (ord)
import Data.Word (Word32())
import Data.Bits.Utils (fromBytes)
import System.IO (hGetContents, hPutStr, Handle())

data GZipError = CRCError               -- ^ CRC-32 check failed
               | NotGZIPFile            -- ^ Couldn't find a GZip header
               | UnknownMethod          -- ^ Compressed with something other than method 8 (deflate)
               | UnknownError String    -- ^ Other problem arose
               deriving (Eq, Show)

instance Error GZipError where
    noMsg = UnknownError ""
    strMsg = UnknownError

-- | First two bytes of file
magic :: String
magic = "\x1f\x8b"

-- | Flags
-- fFTEXT = 1 :: Int
fFHCRC = 2
fFNAME = 8

{- | The data structure representing the GZip header.  This occurs
at the beginning of each 'Section' on disk. -}
data Header = Header {
                      method :: Int,    -- ^ Compression method.  Only 8 is defined at present.
                      flags :: Int,
                      extra :: Maybe String,
                      filename :: Maybe String,
                      comment :: Maybe String,
                      mtime :: Word32,  -- ^ Modification time of the original file
                      xfl :: Int,       -- ^ Extra flags
                      os :: Int         -- ^ Creating operating system
                     } deriving (Eq, Show)

{- | Stored on-disk at the end of each section. -}
data Footer = Footer {
                      size :: Word32,   -- ^ The size of the original, decompressed data
                      crc32 :: Word32,  -- ^ The stored GZip CRC-32 of the original, decompressed data
                      crc32valid :: Bool -- ^ Whether or not the stored CRC-32 matches the calculated CRC-32 of the data

{- | A section represents a compressed component in a GZip file.
Every GZip file has at least one. -}
type Section = (Header, String, Footer)

split1 :: String -> (Char, String)
split1 s = (head s, tail s)

{- | Read a GZip file, decompressing all sections found.

Writes the decompressed data stream to the given output handle.

Returns Nothing if the action was successful, or Just GZipError if there
was a problem.  If there was a problem, the data written to the output
handle should be discarded.

hDecompress :: Handle                   -- ^ Input handle
            -> Handle                   -- ^ Output handle
            -> IO (Maybe GZipError)
hDecompress infd outfd =
    do inc <- hGetContents infd
       let (outstr, err) = decompress inc
       hPutStr outfd outstr
       return err

{- | Read a GZip file, decompressing all sections that are found.

Returns a decompresed data stream and Nothing, or an unreliable string
and Just (error).  If you get anything other than Nothing, the String
returned should be discarded.
decompress :: String -> (String, Maybe GZipError)
decompress s =
    do x <- read_header s
       let rem = snd x
       return $ inflate_string rem
decompress s =
    let procs :: [Section] -> (String, Bool)
        procs [] = ([], True)
        procs ((_, content, foot):xs) =
            let (nexth, nextb) = procs xs in
                (content ++ nexth, (crc32valid foot) && nextb)
        in case read_sections s of
           Left x -> ("", Just x)
           Right x -> let (decomp, iscrcok) = procs x
                          in (decomp, if iscrcok then Nothing else Just CRCError)

decompress s = do x <- read_sections s
                  return $ concatMap (\(_, x, _) -> x) x

-- | Read all sections.
read_sections :: String -> Either GZipError [Section]
read_sections [] = Right []
read_sections s =
    do x <- read_section s
       case x of
           (sect, remain) ->
               do next <- read_sections remain
                  return $ sect : next

parseword :: String -> Word32
parseword s = fromBytes $ map (fromIntegral . ord) $ reverse s

-- | Read one section, returning (ThisSection, Remainder)
read_section :: String -> Either GZipError (Section, String)
read_section s =
        do x <- read_header s
           let headerrem = snd x
           let (decompressed, crc, remainder) = read_data headerrem
           let (crc32str, rm) = splitAt 4 remainder
           let (sizestr, rem2) = splitAt 4 rm
           let filecrc32 = parseword crc32str
           let filesize = parseword sizestr
           return ((fst x, decompressed,
                   Footer {size = filesize, crc32 = filecrc32,
                           crc32valid = filecrc32 == crc})

-- | Read the file's compressed data, returning
-- (Decompressed, Calculated CRC32, Remainder)
read_data :: String -> (String, Word32, String)
read_data x =
    let (decompressed1, remainder) = inflate_string_remainder x
        (decompressed, crc32) = read_data_internal decompressed1 0
          (decompressed, crc32, remainder)
      read_data_internal [] ck = ([], ck)
      read_data_internal (y:ys) ck =
        let newcrc = update_crc ck y
            n = newcrc `seq` read_data_internal ys newcrc
            (y : fst n, snd n)

{- | Read the GZip header.  Return (Header, Remainder).
read_header :: String -> Either GZipError (Header, String)
read_header s =
    let ok = Right "ok" in
    do let (mag, rem) = splitAt 2 s
       if mag /= magic
          then throwError NotGZIPFile
          else ok
       let (method, rem2) = split1 rem
       if (ord(method) /= 8)
          then throwError UnknownMethod
          else ok
       let (flag_S, rem3) = split1 rem2
       let flag = ord flag_S
       let (mtimea, rem3a) = splitAt 4 rem3
       let mtime = parseword mtimea
       let (xfla, rem3b) = split1 rem3a
       let xfl = ord xfla
       let (osa, _) = split1 rem3b
       let os = ord osa
       -- skip modtime (4), extraflag (1), and os (1)
       let rem4 = drop 6 rem3

       let (extra, rem5) =
               if (flag .&. fFEXTRA /= 0)
               -- Skip past the extra field if we have it.
                  then let (xlen_S, _) = split1 rem4
                           (xlen2_S, rem4b) = split1 rem4
                           xlen = (ord xlen_S) + 256 * (ord xlen2_S)
                           (ex, rrem) = splitAt xlen rem4b
                           in (Just ex, rrem)
                  else (Nothing, rem4)

       let (filename, rem6) =
               if (flag .&. fFNAME /= 0)
               -- Skip past the null-terminated filename
                  then let fn = takeWhile (/= '\x00') rem5
                                in (Just fn, drop ((length fn) + 1) rem5)
                  else (Nothing, rem5)

       let (comment, rem7) =
               if (flag .&. fFCOMMENT /= 0)
                  -- Skip past the null-terminated comment
                  then let cm = takeWhile (/= '\x00') rem6
                           in (Just cm, drop ((length cm) + 1) rem6)
                  else (Nothing, rem6)

       rem8 <- if (flag .&. fFHCRC /= 0)
                  -- Skip past the header CRC
                  then return $ drop 2 rem7
                  else return rem7

       return (Header {method = ord method,
                      flags = flag,
                      extra = extra,
                      filename = filename,
                      comment = comment,
                      mtime = mtime,
                      xfl = xfl,
                      os = os}, rem8)

-- Documentation

{- $gzipfiles

GZip files contain one or more 'Section's.  Each 'Section', on disk, begins
with a GZip 'Header', then stores the compressed data itself, and finally
stores a GZip 'Footer'.

The 'Header' identifies the file as a GZip file, records the original
modification date and time, and, in some cases, also records the original
filename and comments.

The 'Footer' contains a GZip CRC32 checksum over the decompressed data as
well as a 32-bit length of the decompressed data.  The module
'Data.Hash.CRC32.GZip' is used to validate stored CRC32 values.

The vast majority of GZip files contain only one 'Section'.  Standard tools
that work with GZip files create single-section files by default.

Multi-section files can be created by simply concatenating two existing
GZip files together.  The standard gunzip and zcat tools will simply
concatenate the decompressed data when reading these files back.  The
'decompress' function in this module will do the same.

When reading data from this module, please use caution regarding how you access
it.  For instance, if you are wanting to write the decompressed stream
to disk and validate its CRC32 value, you could use the 'decompress'
function.  However, you should process the entire stream before you check
the value of the Bool it returns.  Otherwise, you will force Haskell to buffer
the entire file in memory just so it can check the CRC32.