-----------------------------------------------------------------------------
-- |
-- Module      :  Distribution.Client.Tar
-- Copyright   :  (c) 2007 Bjorn Bringert,
--                    2008 Andrea Vezzosi,
--                    2008 Duncan Coutts
-- License     :  BSD-like
--
-- Maintainer  :  duncan@haskell.org
-- Stability   :  provisional
-- Portability :  portable
--
-- TAR archive reading and writing
--
-----------------------------------------------------------------------------
module Distribution.Client.Tar (
  -- * High level all in one operations on files
  createTarGzFile,
  extractTarGzFile,

  -- * Reading and writing the tar format
  read,
  write,

  -- * Packing and unpacking files to\/from a tar archive
  pack,
  unpack,

  -- * Tar archive 'Entry'
  Entry(..),
  fileName,
  ExtendedHeader(..),
  FileType(..),
  UserId,
  GroupId,
  EpochTime,
  DevMajor,
  DevMinor,
  FileSize,

  -- ** Constructing entries
  emptyEntry,
  simpleFileEntry,
  simpleDirectoryEntry,

  -- ** 'TarPath's
  TarPath,
  toTarPath,
  fromTarPath,

  -- * Sequence of 'Entry' records with failures
  Entries(..),
  foldEntries,
  unfoldEntries,
  mapEntries,

  -- * Sanity checking tar contents
  checkEntryNames
  ) where

import Data.Char     (ord)
import Data.Int      (Int64)
import Data.List     (foldl')
import Control.Monad (MonadPlus(mplus))
import Numeric       (readOct, showOct)

import qualified Data.ByteString.Lazy as BS
import qualified Data.ByteString.Lazy.Char8 as BS.Char8
import Data.ByteString.Lazy (ByteString)
import qualified Codec.Compression.GZip as GZip

import System.FilePath
         ( (</>) )
import qualified System.FilePath as FilePath.Native
         ( (</>), joinPath, splitDirectories, takeDirectory
         , isAbsolute, isValid, makeRelative )
import qualified System.FilePath.Posix as FilePath.Posix
         ( joinPath, pathSeparator, splitPath, splitDirectories )
import System.Directory
         ( getDirectoryContents, doesDirectoryExist
         , getModificationTime, createDirectoryIfMissing, copyFile
         , Permissions(..), getPermissions )
import System.Posix.Types
         ( FileMode )
import System.Time
         ( ClockTime(..) )
import System.IO
         ( IOMode(ReadMode), openBinaryFile, hFileSize )
import System.IO.Unsafe (unsafeInterleaveIO)

import Distribution.Client.Utils
         ( writeFileAtomic )

import Prelude hiding (read)

--
-- * High level operations
--

-- | Pack a directory into a gzip-compressed tar file.
--
-- The entries come from 'pack', are serialised with 'write', compressed with
-- 'GZip.compress' and finally handed to 'writeFileAtomic'.
--
createTarGzFile :: FilePath  -- ^ Full Tarball path
                -> FilePath  -- ^ Base directory
                -> FilePath  -- ^ Directory to archive, relative to base dir
                -> IO ()
createTarGzFile tar base dir =
  writeFileAtomic tar . GZip.compress . write =<< pack base dir

-- | Extract a gzip-compressed tar file into a directory.
--
-- The entry names are passed through 'checkEntryNames' before 'unpack' sees
-- them, so absolute names and names with @\"..\"@ components are reported as
-- failures rather than extracted.
--
extractTarGzFile :: FilePath -- ^ Destination directory
                 -> FilePath -- ^ Tarball
                 -> IO ()
extractTarGzFile dir tar =
  unpack dir . checkEntryNames . read . GZip.decompress =<< BS.readFile tar

--
-- * Entry type
--

type UserId    = Int
type GroupId   = Int
type EpochTime = Int -- ^ The number of seconds since the UNIX epoch
type DevMajor  = Int
type DevMinor  = Int
type FileSize  = Int64

-- | TAR archive entry
data Entry = Entry {

    -- | Path of the file or directory. The path separator should be @/@ for
    -- portable TAR archives.
    filePath :: TarPath,

    -- | UNIX file mode.
    fileMode :: FileMode,

    -- | Numeric owner user id. Should be set to @0@ if unknown.
    ownerId :: UserId,

    -- | Numeric owner group id. Should be set to @0@ if unknown.
    groupId :: GroupId,

    -- | File size in bytes. Should be 0 for entries other than normal files.
    fileSize :: FileSize,

    -- | Last modification time.
    modTime :: EpochTime,

    -- | Type of this entry.
    fileType :: FileType,

    -- | If the entry is a hard link or a symbolic link, this is the path of
    -- the link target. For all other entry types this should be @\"\"@.
    linkTarget :: FilePath,

    -- | The remaining meta-data is in the V7, ustar/posix or gnu formats
    -- For V7 there is no extended info at all and for posix/ustar the
    -- information is the same though the kind affects the way the information
    -- is encoded.
    headerExt :: ExtendedHeader,

    -- | Entry contents. For entries other than normal
    -- files, this should be an empty string.
    fileContent :: ByteString
  }

-- | The native 'FilePath' of an entry, obtained from its 'TarPath' with
-- 'fromTarPath'.
fileName :: Entry -> FilePath
fileName = fromTarPath . filePath

-- | The format-specific part of a tar header.
data ExtendedHeader
   = V7
   | USTAR {
    -- | The owner user name. Should be set to @\"\"@ if unknown.
    ownerName :: String,

    -- | The owner group name. Should be set to @\"\"@ if unknown.
    groupName :: String,

    -- | For character and block device entries, this is the
    -- major number of the device. For all other entry types, it
    -- should be set to @0@.
    deviceMajor :: DevMajor,

    -- | For character and block device entries, this is the
    -- minor number of the device. For all other entry types, it
    -- should be set to @0@.
    deviceMinor :: DevMinor
   }
   | GNU {
    -- | The owner user name. Should be set to @\"\"@ if unknown.
    ownerName :: String,

    -- | The owner group name. Should be set to @\"\"@ if unknown.
    groupName :: String,

    -- | For character and block device entries, this is the
    -- major number of the device. For all other entry types, it
    -- should be set to @0@.
    deviceMajor :: DevMajor,

    -- | For character and block device entries, this is the
    -- minor number of the device. For all other entry types, it
    -- should be set to @0@.
    deviceMinor :: DevMinor
   }

-- | TAR archive entry types.
data FileType = NormalFile
              | HardLink
              | SymbolicLink
              | CharacterDevice
              | BlockDevice
              | Directory
              | FIFO
              | ExtendedHeader
              | GlobalHeader
              | Custom Char   -- 'A' .. 'Z'
              | Reserved Char -- other / reserved / unknown
  deriving (Eq, Show)

-- | The single-character type flag stored in the header for a 'FileType'.
-- 'Custom' and 'Reserved' carry their own character.
toFileTypeCode :: FileType -> Char
toFileTypeCode NormalFile      = '0'
toFileTypeCode HardLink        = '1'
toFileTypeCode SymbolicLink    = '2'
toFileTypeCode CharacterDevice = '3'
toFileTypeCode BlockDevice     = '4'
toFileTypeCode Directory       = '5'
toFileTypeCode FIFO            = '6'
toFileTypeCode ExtendedHeader  = 'x'
toFileTypeCode GlobalHeader    = 'g'
toFileTypeCode (Custom   c)    = c
toFileTypeCode (Reserved c)    = c

-- | Decode the header type flag. Note that @\'\\0\'@ and @\'7\'@ are both read
-- as 'NormalFile', so this is not an exact inverse of 'toFileTypeCode'.
fromFileTypeCode :: Char -> FileType
fromFileTypeCode '0'  = NormalFile
fromFileTypeCode '\0' = NormalFile
fromFileTypeCode '1'  = HardLink
fromFileTypeCode '2'  = SymbolicLink
fromFileTypeCode '3'  = CharacterDevice
fromFileTypeCode '4'  = BlockDevice
fromFileTypeCode '5'  = Directory
fromFileTypeCode '6'  = FIFO
fromFileTypeCode '7'  = NormalFile
fromFileTypeCode 'x'  = ExtendedHeader
fromFileTypeCode 'g'  = GlobalHeader
fromFileTypeCode  c   | c >= 'A' && c <= 'Z'
                      = Custom c
fromFileTypeCode  c   = Reserved c

-- | An entry of the given type and path with default meta-data: zero owner,
-- size and modification time, no link target, empty content and an empty
-- 'USTAR' extended header. Only the file mode depends on the type.
emptyEntry :: FileType -> TarPath -> Entry
emptyEntry ftype tarpath = Entry {
    filePath    = tarpath,
    fileMode    = case ftype of
                    Directory -> 0o0755 -- rwxr-xr-x for directories
                    _         -> 0o0644, -- rw-r--r-- for normal files
    ownerId     = 0,
    groupId     = 0,
    fileSize    = 0,
    modTime     = 0,
    fileType    = ftype,
    linkTarget  = "",
    headerExt   = USTAR {
      ownerName   = "",
      groupName   = "",
      deviceMajor = 0,
      deviceMinor = 0
    },
    fileContent = BS.empty
  }

-- | A 'NormalFile' entry with the given content; 'fileSize' is set to the
-- length of the content.
simpleFileEntry :: TarPath -> ByteString -> Entry
simpleFileEntry name content = (emptyEntry NormalFile name) {
    fileSize    = BS.length content,
    fileContent = content
  }

-- | A 'Directory' entry with default meta-data.
simpleDirectoryEntry :: TarPath -> Entry
simpleDirectoryEntry name = emptyEntry Directory name

--
-- * Tar paths
--

-- | The classic tar format allowed just 100 characters for the file name. The
-- USTAR format extended this with an extra 155 characters, however it uses a
-- complex method of splitting the name between the two sections.
--
-- Instead of just putting any overflow into the extended area, it uses the
-- extended area as a prefix. The aggravating insane bit however is that the
-- prefix (if any) must only contain a directory prefix. That is the split
-- between the two areas must be on a directory separator boundary. So there is
-- no simple calculation to work out if a file name is too long. Instead we
-- have to try to find a valid split that makes the name fit in the two areas.
--
-- The rationale presumably was to make it a bit more compatible with tar
-- programs that only understand the classic format. A classic tar would be
-- able to extract the file name and possibly some dir prefix, but not the
-- full dir prefix. So the files would end up in the wrong place, but that's
-- probably better than ending up with the wrong names too.
--
-- So it's understandable but rather annoying.
--
-- * Tar paths use posix format (ie @\'/\'@ directory separators), irrespective
--   of the local path conventions.
--
-- * The directory separator between the prefix and name is /not/ stored.
--
data TarPath = TarPath FilePath -- path name, 100 characters max.
                       FilePath -- path prefix, 155 characters max.

-- | Convert a 'TarPath' to a native 'FilePath'.
--
-- The native 'FilePath' will use the native directory separator but it is not
-- otherwise checked for validity or sanity. In particular:
--
-- * The tar path may be invalid as a native path, eg the filename @\"nul\"@ is
--   not valid on Windows.
--
-- * The tar path may be an absolute path or may contain @\"..\"@ components.
--   For security reasons this should not usually be allowed, but it is your
--   responsibility to check for these conditions.
--
fromTarPath :: TarPath -> FilePath
fromTarPath (TarPath name prefix) =
  FilePath.Native.joinPath $ FilePath.Posix.splitDirectories prefix
                          ++ FilePath.Posix.splitDirectories name

-- | Convert a native 'FilePath' to a 'TarPath'. The 'FileType' is needed
-- because for directories a 'TarPath' uses a trailing @\/@.
--
-- The conversion may fail if the 'FilePath' is too long. See 'TarPath' for a
-- description of the problem with splitting long 'FilePath's.
--
toTarPath :: FileType -> FilePath -> Either String TarPath
toTarPath ftype = splitLongPath
                . addTrailingSep ftype
                . FilePath.Posix.joinPath
                . FilePath.Native.splitDirectories
  where
    addTrailingSep Directory path = path ++ [FilePath.Posix.pathSeparator]
    addTrailingSep _         path = path

-- | Takes a sanitized path, split on directory separators and tries to pack it
-- into the 155 + 100 tar file name format.
--
-- The strategy is this: take the name-directory components in reverse order
-- and try to fit as many components into the 100 long name area as possible.
-- If all the remaining components fit in the 155 name area then we win.
--
splitLongPath :: FilePath -> Either String TarPath
splitLongPath path =
  case packName nameMax (reverse (FilePath.Posix.splitPath path)) of
    Left err                 -> Left err
    Right (name, [])         -> Right (TarPath name "")
    Right (name, first:rest) ->
      -- drop the '/' between the name and prefix:
      let remainder = init first : rest
       in case packName prefixMax remainder of
            Left err              -> Left err
            Right (_     , (_:_)) -> Left "File name too long (cannot split)"
            Right (prefix, [])    -> Right (TarPath name prefix)

  where
    nameMax, prefixMax :: Int
    nameMax   = 100
    prefixMax = 155

    -- Greedily take components (given last-first) while they fit in maxLen.
    -- Returns the packed path and the components that did not fit.
    packName _      []     = Left "File name empty"
    packName maxLen (c:cs)
      | n > maxLen         = Left "File name too long"
      | otherwise          = Right (packName' maxLen n [c] cs)
      where n = length c

    packName' maxLen n ok (c:cs)
      | n' <= maxLen             = packName' maxLen n' (c:ok) cs
      where n' = n + length c
    packName' _      _ ok    cs  = (FilePath.Posix.joinPath ok, cs)

--
-- * Entries type
--

-- | A tar archive is a sequence of entries.
data Entries = Next Entry Entries -- ^ An entry followed by the rest
             | Done               -- ^ Normal end of the archive
             | Fail String        -- ^ The archive could not be read further

-- | Build 'Entries' from a seed. The step function returns @Left@ to end with
-- 'Fail', @Right Nothing@ to end with 'Done', or the next entry and seed.
unfoldEntries :: (a -> Either String (Maybe (Entry, a))) -> a -> Entries
unfoldEntries f = unfold
  where
    unfold x = case f x of
      Left err             -> Fail err
      Right Nothing        -> Done
      Right (Just (e, x')) -> Next e (unfold x')

-- | Right fold over 'Entries', with one argument per constructor.
foldEntries :: (Entry -> a -> a) -> a -> (String -> a) -> Entries -> a
foldEntries next done fail' = fold
  where
    fold (Next e es) = next e (fold es)
    fold Done        = done
    fold (Fail err)  = fail' err

-- | Transform each entry. The first @Left@ result replaces the rest of the
-- sequence with 'Fail'.
mapEntries :: (Entry -> Either String Entry) -> Entries -> Entries
mapEntries f =
  foldEntries (\entry rest -> either Fail (flip Next rest) (f entry)) Done Fail

--
-- * Checking
--

-- | Turn the sequence into a 'Fail' at the first entry whose name (or link
-- target) is rejected by 'checkEntryName'.
checkEntryNames :: Entries -> Entries
checkEntryNames =
  mapEntries (\entry -> maybe (Right entry) Left (checkEntryName entry))

-- | Returns an error message if the entry's file name is absolute, invalid as
-- a native path, or contains a @\"..\"@ component. For hard and symbolic links
-- the link target is checked in the same way.
checkEntryName :: Entry -> Maybe String
checkEntryName entry = case fileType entry of
    HardLink     -> check (fileName entry) `mplus` check (linkTarget entry)
    SymbolicLink -> check (fileName entry) `mplus` check (linkTarget entry)
    _            -> check (fileName entry)

  where
    check name
      | FilePath.Native.isAbsolute name
      = Just $ "Absolute file name in tar archive: " ++ show name

      | not (FilePath.Native.isValid name)
      = Just $ "Invalid file name in tar archive: " ++ show name

      | any (=="..") (FilePath.Native.splitDirectories name)
      = Just $ "Invalid file name in tar archive: " ++ show name

      | otherwise = Nothing

--
-- * Reading
--

-- | Lazily parse an uncompressed tar archive.
read :: ByteString -> Entries
read = unfoldEntries getEntry

-- | Parse one 512-byte header block plus its content from the front of the
-- input.
--
-- Returns @Right Nothing@ when the block starts with a NUL byte (end of
-- archive), @Left@ on a short header, bad checksum or unrecognised magic,
-- otherwise the entry and the remaining input.
--
-- NOTE: malformed octal fields are not reported through @Left@; 'getOct'
-- calls 'error' for them.
getEntry :: ByteString -> Either String (Maybe (Entry, ByteString))
getEntry bs
  | BS.length header < 512 = Left "Truncated TAR archive"
  | endBlock = Right Nothing
    --FIXME: force last two blocks to close fds!

  | not (correctChecksum header chksum) = Left "TAR checksum error"

    -- Both magic values are exactly 8 characters, matching the 8 bytes read
    -- by getChars below. The GNU one contains two spaces.
  | magic /= "ustar\NUL00" && magic /= "ustar  \NUL"
  = Left $ "TAR entry not ustar format: " ++ show magic

  | otherwise = Right (Just (entry, bs'''))

  where
    (header,bs') = BS.splitAt 512 bs

    endBlock   = getByte 0 header == '\0'

    name       = getString   0 100 header
    mode       = getOct    100   8 header
    uid        = getOct    108   8 header
    gid        = getOct    116   8 header
    size       = getOct    124  12 header
    mtime      = getOct    136  12 header
    chksum     = getOct    148   8 header
    typecode   = getByte   156     header
    linkname   = getString 157 100 header
    magic      = getChars  257   8 header
    uname      = getString 265  32 header
    gname      = getString 297  32 header
    devmajor   = getOct    329   8 header
    devminor   = getOct    337   8 header
    prefix     = getString 345 155 header
    -- trailing = getBytes 500  12 header --TODO: check all \0's

    -- content is followed by padding up to the next multiple of 512 bytes
    padding    = (512 - size) `mod` 512
    (cnt,bs'') = BS.splitAt size bs'
    bs'''      = BS.drop padding bs''

    entry      = Entry {
      filePath    = TarPath name prefix,
      fileMode    = mode,
      ownerId     = uid,
      groupId     = gid,
      fileSize    = size,
      modTime     = mtime,
      fileType    = fromFileTypeCode typecode,
      linkTarget  = linkname,
      headerExt   = case magic of
        "\0\0\0\0\0\0\0\0" -> V7
        "ustar\NUL00" -> USTAR {
          ownerName   = uname,
          groupName   = gname,
          deviceMajor = devmajor,
          deviceMinor = devminor
        }
        "ustar  \NUL" -> GNU {
          ownerName   = uname,
          groupName   = gname,
          deviceMajor = devmajor,
          deviceMinor = devminor
        }
        _ -> V7, --FIXME: fail instead
      fileContent = cnt
    }

-- | Check the header checksum against the value stored in the header.
correctChecksum :: ByteString -> Int -> Bool
correctChecksum header checksum = checksum == checksum'
  where
    -- sum of all 512 bytes in the header block,
    -- treating each byte as an 8-bit unsigned value
    checksum' = BS.Char8.foldl' (\x y -> x + ord y) 0 header'
    -- treating the 8 bytes of chksum as blank characters.
    header'   = BS.concat [BS.take 148 header,
                           BS.Char8.replicate 8 ' ',
                           BS.drop 156 header]

-- * TAR format primitive input

-- | Read an octal number from the field at the given offset and length. The
-- field is cut at the first NUL or space; an empty field reads as 0.
--
-- Calls 'error' if what remains is not a valid octal number.
getOct :: Integral a => Int64 -> Int64 -> ByteString -> a
getOct off len = parseOct
               . BS.Char8.unpack
               . BS.Char8.takeWhile (\c -> c /= '\NUL' && c /= ' ')
               . getBytes off len
  where
    parseOct "" = 0
    parseOct s  = case readOct s of
      [(x,[])] -> x
      _        -> error $ "Number format error: " ++ show s

-- | The @len@ bytes starting at offset @off@.
getBytes :: Int64 -> Int64 -> ByteString -> ByteString
getBytes off len = BS.take len . BS.drop off

-- | The byte at the given offset, as a 'Char'.
getByte :: Int64 -> ByteString -> Char
getByte off bs = BS.Char8.index bs off

-- | The whole field as a 'String', including any NUL characters.
getChars :: Int64 -> Int64 -> ByteString -> String
getChars off len = BS.Char8.unpack . getBytes off len

-- | The field as a 'String', cut at the first NUL character.
getString :: Int64 -> Int64 -> ByteString -> String
getString off len = BS.Char8.unpack
                  . BS.Char8.takeWhile (/='\0')
                  . getBytes off len

--
-- * Writing
--

-- | Creates an uncompressed archive
write :: [Entry] -> ByteString
write es = BS.concat $ map putEntry es ++ [BS.replicate (512*2) 0]

-- | Serialise one entry: header block, content, then zero padding up to a
-- multiple of 512 bytes.
putEntry :: Entry -> ByteString
putEntry entry = BS.concat [ header, content, padding ]
  where
    header      = putHeader entry
    content     = fileContent entry
    padding     = BS.replicate paddingSize 0
    paddingSize = fromIntegral $ negate (fileSize entry) `mod` 512

-- | The 512-byte header block, with the checksum field filled in.
putHeader :: Entry -> BS.ByteString
putHeader entry =
     BS.Char8.pack $ take 148 block
  ++ putOct 7 checksum
  ++ ' ' : drop 156 block
  where
    -- the checksum is computed over the block with a blank checksum field
    block    = putHeaderNoChkSum entry
    checksum = foldl' (\x y -> x + ord y) 0 block

-- | The header block with the 8-byte checksum field filled with spaces.
putHeaderNoChkSum :: Entry -> String
putHeaderNoChkSum entry = concat
    [ putString  100 $ name
    , putOct       8 $ fileMode entry
    , putOct       8 $ ownerId entry
    , putOct       8 $ groupId entry
    , putOct      12 $ fileSize entry
    , putOct      12 $ modTime entry
    , fill         8 $ ' ' -- dummy checksum
    , putChar8       $ toFileTypeCode (fileType entry)
    , putString  100 $ linkTarget entry
    ] ++
  case headerExt entry of
    V7 ->
      fill 255 '\NUL'
    ext@USTAR {} -> concat
      [ putString    8 $ "ustar\NUL00"
      , putString   32 $ ownerName ext
      , putString   32 $ groupName ext
      , putOct       8 $ deviceMajor ext
      , putOct       8 $ deviceMinor ext
      , putString  155 $ prefix
      , fill        12 $ '\NUL'
      ]
    ext@GNU {} -> concat
      [ putString    8 $ "ustar  \NUL" -- two spaces: fills all 8 bytes
      , putString   32 $ ownerName ext
      , putString   32 $ groupName ext
      , putGnuDev    8 $ deviceMajor ext
      , putGnuDev    8 $ deviceMinor ext
      , putString  155 $ prefix
      , fill        12 $ '\NUL'
      ]
  where
    TarPath name prefix = filePath entry

    -- in the GNU variant the device fields are all NUL for non-device entries
    putGnuDev w n = case fileType entry of
      CharacterDevice -> putOct w n
      BlockDevice     -> putOct w n
      _               -> replicate w '\NUL'

-- * TAR format primitive output

type FieldWidth = Int

-- | A string field: truncated to the field width, or padded with NULs.
putString :: FieldWidth -> String -> String
putString n s = take n s ++ fill (n - length s) '\NUL'

-- | An octal field: zero-padded digits followed by a terminating NUL. Only the
-- first @n-1@ digits are kept if the number does not fit.
putOct :: Integral a => FieldWidth -> a -> String
putOct n x =
  let octStr = take (n-1) $ showOct x ""
   in fill (n - length octStr - 1) '0'
   ++ octStr
   ++ putChar8 '\NUL'

putChar8 :: Char -> String
putChar8 c = [c]

fill :: FieldWidth -> Char -> String
fill n c = replicate n c

--
-- * Unpacking
--

-- | Extract the entries below the given directory.
--
-- Normal files and directories are written as they are encountered; missing
-- parent directories are created, since an archive need not list a directory
-- before the files it contains. Hard and symbolic links are not created as
-- links: they are collected and, once all other entries are done, each one is
-- materialised with 'copyFile' from its target. Other entry types are
-- skipped. A 'Fail' in the sequence is raised with 'fail'.
--
unpack :: FilePath -> Entries -> IO ()
unpack dir entries = extractLinks =<< extractFiles [] entries
  where
    extractFiles _     (Fail err)            = Prelude.fail err
    extractFiles links Done                  = return links
    extractFiles links (Next entry entries') =
      case fileType entry of
        NormalFile   -> createDirectoryIfMissing True
                          (FilePath.Native.takeDirectory path)
                     >> BS.writeFile path (fileContent entry)
                     >> extractFiles links entries'
        HardLink     -> saveLink
        SymbolicLink -> saveLink
        Directory    -> createDirectoryIfMissing True path
                     >> extractFiles links entries'
        _            -> extractFiles links entries' -- FIXME: warning?
      where
        path = dir </> fileName entry

        -- Force both strings before moving on, so the saved pair does not
        -- hold on to the entry they were read from.
        saveLink = seq (length name)
                 $ seq (length target)
                 $ extractFiles (link:links) entries'
          where
            name   = fileName entry
            target = linkTarget entry
            link   = (name, target)

    -- the link target is resolved relative to the directory of the link
    extractLinks = mapM_ $ \(name, target) ->
      let path = dir </> name
       in copyFile (FilePath.Native.takeDirectory path </> target) path

--
-- * Packing
--

-- | Creates a tar archive from a directory of files, the paths in the archive
-- will be relative to the given base directory.
--
-- The entries are produced lazily via 'unsafeInterleaveIO'.
--
-- NOTE(review): the source path is handed straight to 'getDirectoryContents'
-- by 'recurseDirectories', so it appears it has to be a directory.
--
pack :: FilePath -- ^ Base directory
     -> FilePath -- ^ Directory or file to package, relative to the base dir
     -> IO [Entry]
pack baseDir sourceDir =
      mapM (unsafeInterleaveIO . uncurry (createFileEntry baseDir))
  =<< recurseDirectories [baseDir </> sourceDir]

-- | Lazily list the given directories and everything below them. Each
-- directory is followed by its non-directory contents, then by the listing of
-- its sub-directories.
recurseDirectories :: [FilePath] -> IO [(FileType, FilePath)]
recurseDirectories []         = return []
recurseDirectories (dir:dirs) = unsafeInterleaveIO $ do
  (files, dirs') <- collect [] [] =<< getDirectoryContents dir

  files' <- recurseDirectories (dirs' ++ dirs)
  return ((Directory, dir) : map ((,) NormalFile) files ++ files')

  where
    -- split the directory contents into files and sub-directories
    collect files dirs' []              = return (reverse files, reverse dirs')
    collect files dirs' (entry:entries) | ignore entry
                                        = collect files dirs' entries
    collect files dirs' (entry:entries) = do
      let dirEntry = dir </> entry
      isDirectory <- doesDirectoryExist dirEntry
      if isDirectory
        then collect files (dirEntry:dirs') entries
        else collect (dirEntry:files) dirs' entries

    ignore ['.']      = True
    ignore ['.', '.'] = True
    ignore _          = False

-- | Build the 'Entry' for one file or directory on disk.
--
-- Fails in 'IO' if the relative path cannot be converted by 'toTarPath'. For
-- a 'NormalFile' the file is opened and its content read with
-- 'BS.hGetContents'; any other type yields a 'Directory' entry carrying only
-- the modification time.
createFileEntry :: FilePath -- ^ base directory; the entry name is relative to it
                -> FileType
                -> FilePath -- ^ path of the file or directory on disk
                -> IO Entry
createFileEntry baseDir ftype absPath = do
  let relPath = FilePath.Native.makeRelative baseDir absPath
  tarpath <- either Prelude.fail return (toTarPath ftype relPath)
  mtime   <- getModTime absPath
  case ftype of
    NormalFile -> do
      file    <- openBinaryFile absPath ReadMode
      mode    <- getFileMode absPath
      size    <- hFileSize file
      content <- BS.hGetContents file
      return (emptyEntry NormalFile tarpath) {
          fileMode    = mode,
          modTime     = mtime,
          fileSize    = fromIntegral size,
          fileContent = content
        }
    _ -> return (emptyEntry Directory tarpath) {
          modTime = mtime
        }

-- | We can't be precise because of portability, so we default to rw-r--r-- for
-- normal files and rwxr-xr-x for executables.
getFileMode :: FilePath -> IO FileMode
getFileMode path = do
  perms <- getPermissions path
  if executable perms
    then return 0o0755
    else return 0o0644

-- | The modification time of a file, in whole seconds: the first field of the
-- 'TOD' value returned by 'getModificationTime'.
getModTime :: FilePath -> IO EpochTime
getModTime path = do
  (TOD s _) <- getModificationTime path
  return (fromIntegral s)