{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} -- | Pack file support. module Data.Git.Internal.Pack where import Codec.Compression.Zlib.Internal hiding (Raw) import Control.Applicative import Control.Concurrent.MVar import Control.Exception import Control.Monad import Control.Monad.Fail import Control.Monad.Reader import Control.Monad.RWS import Control.Monad.ST import Data.Attoparsec.ByteString as A import Data.Attoparsec.Combinator (lookAhead) import Data.Bits import qualified Data.ByteString as B import Data.ByteString.Builder (Builder) import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Lazy as BL import Data.Digest.CRC32 import Data.Map (Map) import qualified Data.Map as M import Data.STRef import qualified Data.Vector as V import qualified Data.Vector.Mutable as VM import qualified Data.Vector.Unboxed as UV import qualified Data.Vector.Unboxed.Mutable as UM import Data.Word import System.IO (SeekMode (..), hSeek, hTell) import System.Posix.FilePath import Data.Git.Hash import Data.Git.Internal.FileUtil import Data.Git.Internal.Object (parseBlob, parseCommit, parseTag, parseTree) import Data.Git.Internal.Parsers import Data.Git.Internal.Types (PackFile (..), PackIndex (..), Crc32) import Data.Git.Object data PackObject = PackObject ObjectType | DeltaOff Int | DeltaRef Sha1 deriving (Eq, Ord, Show) data Raw = Raw { rawType :: !PackObject , rawData :: !BL.ByteString } deriving (Eq, Ord, Show) parsePackedRaw' :: Parser Raw parsePackedRaw' = do (ty, sz) <- parseCompactHeader rest <- lookAhead takeLazyByteString let (_, dat) = decompressTo sz rest return $ Raw ty dat base128le :: Parser Word base128le = do b <- fromIntegral <$> anyWord8 if b `testBit` 7 then base128le >>= return . \n -> n `unsafeShiftL` 7 .|. (0x7f .&. b) else return b parseCompactHeader :: Parser (PackObject, Word) parseCompactHeader = do h <- anyWord8 sz <- if h `testBit` 7 then base128le else return 0 ty <- parseType $ 0x70 .&. h return (ty, (sz `unsafeShiftL` 4) .|. (0x0f .&. fromIntegral h)) where parseType 0x10 = pure $ PackObject CommitType parseType 0x20 = pure $ PackObject TreeType parseType 0x30 = pure $ PackObject BlobType parseType 0x40 = pure $ PackObject TagType parseType 0x60 = DeltaOff <$> parseOffset parseType 0x70 = DeltaRef . Sha1 <$> A.take 20 parseType n = error $ "no type for " ++ show (n `shiftR` 4) -- Parse according to the following (cargo-culted) recommended C: -- # byte = *data++; -- # number = byte & 0x7f; -- # while (byte & 0x80) { -- # byte = *data++; -- # number = ((number + 1) << 7) | (byte & 0x7f); -- # } parseOffset :: Parser Int parseOffset = do byte <- anyWord8 let number = byte .&. 0x7f loop byte (fromIntegral number) where loop byte number | byte `testBit` 7 = do byte' <- anyWord8 loop byte' $ (number + 1) `unsafeShiftL` 7 .|. (fromIntegral byte' .&. 0x7f) loop _ number | otherwise = return number decompressTo :: Word -> BL.ByteString -> (BL.ByteString, BL.ByteString) decompressTo sz = fmap BL.fromStrict . foldDecompressStreamWithInput ((<>) . ("",)) (,"") throw go where go = decompressST zlibFormat defaultDecompressParams { decompressBufferSize = fromIntegral sz } type Patch a = RWST BL.ByteString BB.Builder Word64 Parser a -- see: https://github.com/git/git/blob/master/patch-delta.c parsePatch :: Patch () parsePatch = do op <- lift anyWord8 b <- if op `testBit` 7 then copy op else BL.fromStrict <$> lift (A.take $ fromIntegral op) tell . BB.lazyByteString $ b where copy op = do put 0 when (op `testBit` 0) $ orShift 0 when (op `testBit` 1) $ orShift 8 when (op `testBit` 2) $ orShift 16 when (op `testBit` 3) $ orShift 24 cp_off <- fromIntegral <$> get put 0 when (op `testBit` 4) $ orShift 0 when (op `testBit` 5) $ orShift 8 when (op `testBit` 6) $ orShift 16 gets (==0) >>= flip when (put 0x10000) cp_size <- fromIntegral <$> get asks $ BL.take cp_size . BL.drop cp_off orShift n = do byte <- lift anyWord8 modify ((fromIntegral byte `unsafeShiftL` n) .|.) applyPatch :: BL.ByteString -> B.ByteString -> Either String BL.ByteString applyPatch base = fmap (BB.toLazyByteString . snd) . parseOnly (execRWST go base 0) where go = lift base128le >> lift base128le >> many parsePatch resolveDelta :: Int -> PackFile -> Raw -> Maybe Raw resolveDelta off pf@(PackFile pb _) (Raw (DeltaOff o) b) = do lu <- parseMaybe parsePackedRaw' $ BL.drop (fromIntegral $ off - o) pb mr <- resolveDelta (off - o) pf lu let patched = either error id (applyPatch (rawData mr) $ BL.toStrict b) return $ mr { rawData = patched } resolveDelta _ pf@(PackFile pb ind) (Raw (DeltaRef s) b) = do o <- getShaOffset ind s lu <- parseMaybe parsePackedRaw' $ BL.drop (fromIntegral o) pb mr <- resolveDelta (fromIntegral o) pf lu let patched = either error id (applyPatch (rawData mr) $ BL.toStrict b) return $ mr { rawData = patched } resolveDelta _ _ r = return r parseFanout :: Parser (UV.Vector Word32) parseFanout = UV.replicateM 256 word32 parseIndexShas :: Word32 -> Parser (V.Vector Sha1) parseIndexShas n = V.replicateM (fromIntegral n) (A.take 20 >>= return . Sha1) parseShaCrcs :: Word32 -> Parser (UV.Vector Crc32) parseShaCrcs n = UV.replicateM (fromIntegral n) word32 parseShaOffsets :: Word32 -> Parser (UV.Vector Word32) parseShaOffsets n = UV.replicateM (fromIntegral n) word32 parseBigOffsets :: UV.Vector Word32 -> Parser (UV.Vector Word64) parseBigOffsets os = UV.replicateM (fromIntegral n) word64 where n = UV.foldl' (\a b -> a + (b `unsafeShiftL` 63)) 0 os -- FIXME: Handle large packfile offsets parsePackIndex :: Parser PackIndex parsePackIndex = do void "\255tOc" 2 <- word32 fo <- parseFanout let size = fo UV.! 255 ss <- parseIndexShas size cs <- parseShaCrcs size os <- parseShaOffsets size bs <- parseBigOffsets os -- ps <- parseSha1 return $ PackIndex fo ss cs os bs -- I'm not proud of this function. getShaOffset :: PackIndex -> Sha1 -> Maybe Word64 getShaOffset pidx s = offset' where fb, ub, lb :: Int -- what the Sha starts with, which is our index into the fanout table. fb = fromIntegral . B.head . getSha1 $ s -- The last value position that can start with this byte. ub = fromIntegral $ (fanout pidx) UV.! fb -- Our search starts at the highest value in the value below our sha1 initial byte. lb | fb == 0 = 0 | otherwise = fromIntegral $ (fanout pidx) UV.! pred fb offset = ((shaOffsets pidx) UV.!) . (lb+) <$> V.elemIndex s (V.slice lb (ub - lb) (indexShas pidx)) offset' = case offset of Nothing -> Nothing Just off | off `testBit` 31 -> Just ((shaBigOffsets pidx) UV.! fromIntegral (clearBit off 31)) | otherwise -> Just $ fromIntegral off readIndexFile :: RawFilePath -> IO PackIndex readIndexFile p = (either error id . parseOnly parsePackIndex) <$> readRawFileS (p <.> "idx") isPackIndex :: RawFilePath -> Bool isPackIndex = (==".idx") . takeExtension -- | Read a 'PackFile' off the filesystem. readPackFile :: RawFilePath -> IO PackFile readPackFile p = PackFile <$> readRawFileL (p <.> "pack") <*> readIndexFile p -- | Try to get the 'Object' for a 'Sha1' in a 'PackFile'. findPackSha :: PackFile -> Sha1 -> Maybe Object findPackSha pf@(PackFile pb ix) s = do offset <- getShaOffset ix s raw <- parseMaybe parsePackedRaw' (BL.drop (fromIntegral offset) pb) Raw (PackObject t) d <- resolveDelta (fromIntegral offset) pf raw case t of -- FIXME: don't do the stupid "blob 1234\NUL" thing here BlobType -> BlobObj <$> parseMaybe parseBlob ("blob 1234\NUL" <> d) TreeType -> TreeObj <$> parseMaybe parseTree ("tree 1234\NUL" <> d) CommitType -> CommitObj <$> parseMaybe parseCommit ("commit 1234\NUL" <> d) TagType -> TagObj <$> parseMaybe parseTag ("tag 1234\NUL" <> d) buildPackedObject :: Object -> Builder buildPackedObject o = buildCompactHeader ty sz <> BB.lazyByteString (compress zlibFormat defaultCompressParams b) where b = BB.toLazyByteString $ buildObject o ty = compactTag o sz = fromIntegral . BL.length $ b -- TODO: suck less compactTag :: Object -> Word8 compactTag CommitObj {} = 0x10 compactTag TreeObj {} = 0x20 compactTag BlobObj {} = 0x30 compactTag TagObj {} = 0x40 buildCompactHeader :: Word8 -> Word64 -> Builder buildCompactHeader t sz = go (t .|. 0x0f .&. fromIntegral sz) (sz `unsafeShiftR` 4) where go c 0 = BB.word8 c go c n = BB.word8 (c .|. 0x80) <> go (fromIntegral $ 0x7f .&. n) (n `unsafeShiftR` 7) data IndexData = IndexData { idxCrc :: !Word32 , idxOffset :: !Word64 } deriving (Eq, Ord, Show) type PackIndexer = Map Sha1 IndexData data PackingState = PackingState { psTempFile :: TempFile , psCount :: Word32 , psIndexer :: PackIndexer , psOffset :: Word64 } makePackIndex :: PackIndexer -> PackIndex makePackIndex idx | sz == 0 = -- The empty PackIndex PackIndex (UV.replicate 256 0) mempty mempty mempty mempty | otherwise = runST $ do fan <- UM.replicate 256 0 shas <- VM.unsafeNew sz crcs <- UM.unsafeNew sz offs <- UM.unsafeNew sz bos <- newSTRef (0, []) forM_ (zip [0..] $ M.toAscList idx) $ \(i, (h@(Sha1 bs), IndexData crc off)) -> do UM.unsafeWrite fan (fromIntegral $ B.head bs) (fromIntegral $ i + 1) VM.unsafeWrite shas i h UM.unsafeWrite crcs i crc if off < bit 31 then UM.unsafeWrite offs i $ fromIntegral off else do (boc, bol) <- readSTRef bos UM.unsafeWrite offs i (boc `setBit` 31) writeSTRef bos (boc + 1, (boc, off) : bol) (boc, bol) <- readSTRef bos let fillFan lst fi = do cur <- UM.unsafeRead fan fi if cur < lst then UM.unsafeWrite fan fi lst >> return lst else return cur foldM_ fillFan 0 [0..255] PackIndex <$> UV.unsafeFreeze fan <*> V.unsafeFreeze shas <*> UV.unsafeFreeze crcs <*> UV.unsafeFreeze offs <*> pure (UV.create $ do v <- UM.unsafeNew (fromIntegral boc) sequence_ [UM.unsafeWrite v (fromIntegral bi) bo | (bi, bo) <- bol] return v) where sz = M.size idx buildPackIndex :: PackIndex -> Builder buildPackIndex (PackIndex fan shas crcs offs boffs) = BB.byteString (B.pack [0xff, 0x74, 0x4f, 0x63]) <> BB.word32BE 2 <> foldMap BB.word32BE (UV.toList fan) <> foldMap (BB.byteString . getSha1) shas <> foldMap BB.word32BE (UV.toList crcs) <> foldMap BB.word32BE (UV.toList offs) <> foldMap BB.word64BE (UV.toList boffs) -- | A monad transformer for writing packfiles. newtype PackingT m a = PackingT { unPackingT :: ReaderT (MVar PackingState) m a } deriving (Functor, Applicative, Monad, MonadIO, MonadReader (MVar PackingState) ,MonadTrans, MonadFail) -- | Run a 'PackingT' computation, using the given path as a template for the tempfile. The pack -- and the index will be written to the same directory as the tempfile. runPackingT :: MonadIO m => (PackFile -> m ()) -> RawFilePath -> PackingT m a -> m a runPackingT reg p pma = do mvar <- liftIO $ newPackFile p >>= newMVar runReaderT (unPackingT $ pma >>= \(!a) -> finishPacking >> return a) mvar where finishPacking = ask >>= liftIO . (`withMVar` finishPackFile) >>= lift . maybe (pure ()) reg -- | Write out the currently accumulating packfile, and start writing a new one. flushPackFile :: MonadIO m => (PackFile -> m ()) -> PackingT m () flushPackFile reg = ask >>= liftIO . (`modifyMVar` go) >>= lift . maybe (pure ()) reg where go ps = do p <- finishPackFile ps n <- newPackFile (tempTemplate (psTempFile ps)) pure (n, p) -- | Write an object to the packfile. packObject :: MonadIO m => Object -> PackingT m Sha1 packObject o = ask >>= liftIO . (`modifyMVar` go) where go !ps@(PackingState tmp n idx off) | hash `M.member` idx = pure (ps, hash) | otherwise = do let objd = BB.toLazyByteString $ buildPackedObject o off' = off + fromIntegral (BL.length objd) crc = crc32 objd idx' = M.insert hash (IndexData crc off) idx BL.hPut (tempHandle tmp) objd return $ (PackingState tmp (n + 1) idx' off', hash) hash = sha1 o -- | Write the given objects to a packfile using the given path as a template for the tempfile. The -- pack will be written to the same directory as the tempfile. Also writes the pack index. writePackFile :: MonadIO m => (PackFile -> m ()) -> RawFilePath -> [Object] -> m () writePackFile reg p os = runPackingT reg p $ mapM_ packObject os finishPackFile :: MonadIO m => PackingState -> m (Maybe PackFile) finishPackFile (PackingState tmp 0 _ _ ) = -- don't write empty files liftIO (closeTempFile tmp Nothing) >> pure Nothing finishPackFile (PackingState tmp n idx _) = liftIO $ do let h = tempHandle tmp len <- fromIntegral <$> hTell h hSeek h AbsoluteSeek 8 -- rewind to the length field... BB.hPutBuilder h $ BB.word32BE n -- ...and scribble it in hSeek h AbsoluteSeek 0 -- now we go back to the beginning... !hash <- sha1 <$> BL.hGet h len -- ...to hash the whole thing hSeek h SeekFromEnd 0 -- and be paranoid about appending... B.hPut h $ getSha1 hash -- ...the hash to the end of the pack let !baseFilename = takeDirectory (tempFileName tmp) ("pack-" <> getSha1Hex (toHex hash)) closeTempFile tmp (Just $ baseFilename <.> "pack") pnm <- withHandleAtomic (tempFileName tmp) $ \ih -> do let built = buildPackIndex $ makePackIndex idx idxbs = BB.toLazyByteString $ built <> (BB.byteString . getSha1 $ hash) idxhash = sha1 idxbs BL.hPut ih idxbs B.hPut ih $ getSha1 idxhash return (Just $ baseFilename <.> ".idx", baseFilename) Just <$> readPackFile pnm newPackFile :: RawFilePath -> IO PackingState newPackFile p = do !tmp <- tempFile p BB.hPutBuilder (tempHandle tmp) $ "PACK" <> BB.word32BE 2 <> BB.word32BE 0 return $ PackingState tmp 0 mempty 12