{-# LANGUAGE CPP, ScopedTypeVariables, NoMonomorphismRestriction #-} -- | This module contains plain tree indexing code. The index itself is a -- CACHE: you should only ever use it as an optimisation and never as a primary -- storage. In practice, this means that when we change index format, the -- application is expected to throw the old index away and build a fresh -- index. Please note that tracking index validity is out of scope for this -- library: this is responsibility of your application. It is advisable that in -- your validity tracking code, you also check for format validity (see -- "indexFormatValid") and scrap and re-create index when needed. -- -- The index is a binary file that overlays a hashed tree over the working -- copy. This means that every working file and directory has an entry in the -- index, that contains its path and hash and validity data. The validity data -- is a "last seen" timestamp plus the file size. The file hashes are sha256's -- of the file's content. -- -- There are two entry types, a file entry and a directory entry. Both have a -- common binary format (see 'Item'). The on-disk format is best described by -- 'peekItem'. -- -- For each file, the index has a copy of the timestamp taken at the instant -- when the hash has been computed. This means that when file size and -- timestamp of a file in working copy matches those in the index, we assume -- that the hash stored in the index for given file is valid. These hashes are -- then exposed in the resulting 'Tree' object, and can be leveraged by eg. -- 'diffTrees' to compare many files quickly. -- -- You may have noticed that we also keep hashes of directories. These are -- assumed to be valid whenever the complete subtree has had valid -- timestamps. At any point, as soon as a size or timestamp mismatch is found, -- the working file in question is opened, its hash (and timestamp and size) is -- recomputed and updated in-place in the index file (everything lives at a -- fixed offset and is fixed size, so this isn't an issue). This is also true -- of directories: when a file in a directory changes hash, this triggers -- recomputation of all of its parent directory hashes; moreover this is done -- efficiently -- each directory is updated at most once during a run. module Storage.Hashed.Index( readIndex, updateIndexFrom, readOrUpgradeIndex , indexFormatValid ) where import Prelude hiding ( lookup, readFile, writeFile, catch ) import Storage.Hashed.Utils import Storage.Hashed.Tree import Storage.Hashed.AnchoredPath import Data.Int( Int64, Int32 ) import qualified Data.Set as S import qualified Data.Map as M import Bundled.Posix( getFileStatusBS, modificationTime, getFileStatus, fileSize, fileExists, EpochTime ) import System.IO.MMap( mmapFileForeignPtr, Mode(..) ) import System.IO( openBinaryFile, hGetChar, hClose, IOMode(..) ) import System.Directory( removeFile, doesFileExist, renameFile ) import System.FilePath( (<.>) ) import Control.Monad( when ) import qualified Data.ByteString.Char8 as BS import Data.ByteString.Internal( toForeignPtr, fromForeignPtr, memcpy , nullForeignPtr ) import Data.IORef( newIORef, readIORef, modifyIORef, IORef ) import Data.Maybe( fromJust, isJust ) import Data.Bits( Bits ) import Foreign.Storable import Foreign.ForeignPtr import Foreign.Ptr -------------------------- -- Indexed trees -- -- | Description of a a single indexed item. The structure itself does not -- contain any data, just pointers to the underlying mmap (bytestring is a -- pointer + offset + length). -- -- The structure is recursive-ish (as opposed to flat-ish structure, which is -- used by git...) It turns out that it's hard to efficiently read a flat index -- with our internal data structures -- we need to turn the flat index into a -- recursive Tree object, which is rather expensive... As a bonus, we can also -- efficiently implement subtree queries this way (cf. 'readIndex'). data Item = Item { iPath :: BS.ByteString , iName :: BS.ByteString , iHash :: BS.ByteString , iSize :: Ptr Int64 , iAux :: Ptr Int64 -- end-offset for dirs, mtime for files } deriving Show itemSize :: Item -> Int itemSize i = 4 + (BS.length $ iPath i) + 1 + 64 + 16 itemSizeI :: (Num a) => Item -> a itemSizeI = fromIntegral . itemSize itemIsDir :: Item -> Bool itemIsDir i = BS.last (iPath i) == '/' noslashpath :: Item -> FilePath noslashpath i = BS.unpack $ itemIsDir i ? (BS.init $ iPath i, iPath i) -- xlatePeek32 = fmap xlate32 . peek xlatePeek64 :: (Storable a, Bits a) => Ptr a -> IO a xlatePeek64 = fmap xlate64 . peek -- xlatePoke32 ptr v = poke ptr (xlate32 v) xlatePoke64 :: (Storable a, Bits a) => Ptr a -> a -> IO () xlatePoke64 ptr v = poke ptr (xlate64 v) -- | Lay out the basic index item structure in memory. The memory location is -- given by a ForeignPointer () and an offset. The path and type given are -- written out, and a corresponding Item is given back. The remaining bits of -- the item can be filled out using 'update'. createItem :: ItemType -> AnchoredPath -> ForeignPtr () -> Int -> IO Item createItem typ path fp off = do let name = BS.concat [ flatten path, (typ == TreeType) ? (BS.singleton '/', BS.empty), BS.singleton '\0' ] (namefp, nameoff, namel) = toForeignPtr name withForeignPtr fp $ \p -> withForeignPtr namefp $ \namep -> do pokeByteOff p off (xlate32 $ fromIntegral namel :: Int32) memcpy (plusPtr p $ off + 4) (plusPtr namep nameoff) (fromIntegral namel) peekItem fp off Nothing -- | Read the on-disk representation into internal data structure. The Index is -- organised into "lines" where each line describes a single indexed -- item. Cf. 'Item'. -- -- The first word on the index "line" is the length of the file path (which is -- the only variable-length part of the line). Then comes the path itself, then -- fixed-length hash (sha256) of the file in question, then two words, one for -- size and one "aux", which is used differently for directories and for files. -- -- With directories, this aux holds the offset of the next sibling line in the -- index, so we can efficiently skip reading the whole subtree starting at a -- given directory (by just seeking aux bytes forward). The lines are -- pre-ordered with respect to directory structure -- the directory comes first -- and after it come all its items. Cf. 'readIndex''. -- -- For files, the aux field holds a timestamp. peekItem :: ForeignPtr () -> Int -> Maybe Int -> IO Item peekItem fp off dirlen = withForeignPtr fp $ \p -> do nl' :: Int32 <- xlate32 `fmap` peekByteOff p off let nl = fromIntegral nl' path = fromForeignPtr (castForeignPtr fp) (off + 4) (nl - 1) path_noslash = (BS.last path == '/') ? (BS.init path, path) hash = fromForeignPtr (castForeignPtr fp) (off + 4 + nl) 64 name = snd $ case dirlen of Just split -> BS.splitAt split path_noslash Nothing -> BS.spanEnd (/= '/') path_noslash return $! Item { iName = name , iPath = path , iHash = hash , iSize = plusPtr p (off + 4 + nl + 64) , iAux = plusPtr p (off + 4 + nl + 64 + 8) } -- | Update an existing item with new hash and optionally mtime (give Nothing -- when updating directory entries). update :: Item -> Maybe EpochTime -> Hash -> IO () update item mtime (Hash (Just size, hash)) = do xlatePoke64 (iSize item) size pokeBS (iHash item) hash when (isJust mtime) $ xlatePoke64 (iAux item) (fromIntegral $ fromEnum $ fromJust mtime) update _ _ _ = fail "Index.update requires a hash with size included." iHash' :: Item -> IO Hash iHash' i = do size <- xlatePeek64 $ iSize i return $ hashSetSize (Hash (undefined, iHash i)) size -- | Gives a ForeignPtr to mmapped index, which can be used for reading and -- updates. mmapIndex :: forall a. FilePath -> Int -> IO (ForeignPtr a, Int) mmapIndex indexpath req_size = do exist <- doesFileExist indexpath act_size <- if exist then fileSize `fmap` getFileStatus indexpath else return 0 let size :: Int size = fromIntegral $ if req_size > 0 then fromIntegral req_size else act_size case size of 0 -> return (castForeignPtr nullForeignPtr, size) _ -> do (x, _) <- mmapFileForeignPtr indexpath ReadWrite (Just (0, size + 4)) return (x, size) -- | See 'readIndex'. This version also gives a map from paths to items, so the -- extra per-item data can be used (hash and mtime) directly. The map is in a -- form of 'IORef', since the data is not available until the tree is expanded. readIndex' :: FilePath -> (Tree -> Hash) -> IO (Tree, IORef (M.Map AnchoredPath Item)) readIndex' indexpath hashtree = do (mmap, mmap_size) <- mmapIndex indexpath 0 dirs_changed <- newIORef S.empty item_map <- newIORef M.empty let readItem :: AnchoredPath -> Int -> Int -> IO (Item, Maybe TreeItem) readItem parent_path off dl = do item <- peekItem mmap off (Just dl) x <- if itemIsDir item then readDir parent_path item off dl else readFile parent_path item when (isJust x) $ modifyIORef item_map $ \m -> M.insert (parent_path `appendPath` (Name $ iName item)) item m return (item, x) readDir :: AnchoredPath -> Item -> Int -> Int -> IO (Maybe TreeItem) readDir parent_path item off dl = do dirend <- xlatePeek64 $ iAux item st <- getFileStatus (noslashpath item) let this_path = parent_path `appendPath` (Name $ iName item) nl = BS.length (iName item) dl' = dl + (nl == 0) ? (0, 1 + nl) subs coff | coff < dirend = do (idx_item, tree_item) <- readItem this_path (fromIntegral coff) dl' next <- if itemIsDir idx_item then xlatePeek64 $ iAux idx_item else return $ coff + itemSizeI idx_item rest <- subs next case tree_item of Nothing -> return $! rest Just ti -> return $! (Name $ iName idx_item, ti) : rest subs coff | coff == dirend = return [] | otherwise = fail "Offset mismatch." updateHash path tree = do changed <- S.member path `fmap` readIORef dirs_changed let hash = hashtree tree tree' = tree { treeHash = Just hash } if changed then do update item Nothing hash return tree' else return tree treehash <- iHash' item let rt = Stub (do s <- subs $ fromIntegral (off + itemSize item) return $ (makeTree s) { finish = updateHash this_path , treeHash = Just treehash }) (Just treehash) return $ if fileExists st then Just rt else Nothing readFile parent_path item = do st <- getFileStatusBS (iPath item) mtime <- fromIntegral `fmap` (xlatePeek64 $ iAux item) size <- xlatePeek64 $ iSize item let mtime' = modificationTime st size' = fileSize st readblob = readSegment (BS.unpack $ iPath item, Nothing) when ( mtime /= mtime' || size /= fromIntegral size' ) $ do hash_' <- sha256 `fmap` readblob let hash' = hashSetSize hash_' (fromIntegral size') update item (Just mtime') hash' modifyIORef dirs_changed $ \s -> S.union (S.fromList $ parent_path : parents parent_path) s hash <- iHash' item if fileExists st then return $ Just $ File (Blob readblob $ Just hash) else return Nothing if mmap_size > 0 then do (_, Just (Stub root h)) <- readItem (AnchoredPath []) 4 (-2) tree <- root return (tree { treeHash = h }, item_map) else return (emptyTree, item_map) -- | Read an index and build up a 'Tree' object from it, referring to current -- working directory. Any parts of the index that are out of date are updated -- in-place. The result is always an up-to-date index. Also, the 'Tree' is -- stubby and only the pieces of the index that are expanded will be actually -- updated! To implement a subtree query, you can use 'Tree.filter' and then -- expand the result. Otherwise just expand the whole tree to avoid unexpected -- problems. readIndex :: FilePath -> (Tree -> Hash) -> IO Tree readIndex x y = fst `fmap` readIndex' x y -- pointfree is uglier -- | Will add and remove files in index to make it match the 'Tree' object -- given (it is an error for the 'Tree' to contain a file or directory that -- does not exist in a plain form in current working directory). updateIndexFrom :: FilePath -> (Tree -> Hash) -> Tree -> IO Tree updateIndexFrom indexpath hashtree ref = do (oldidx', item_map') <- readIndex' indexpath hashtree expand oldidx' item_map <- readIORef item_map' reference <- expand ref let typeLen TreeType = 1 typeLen BlobType = 0 paths = [ (p, itemType i) | (p, i) <- list reference ] len = 87 + sum [ typeLen typ + 84 + 1 + length (anchorPath "" p) | (p, typ) <- paths ] exist <- doesFileExist indexpath #if mingw32_HOST_OS when exist $ renameFile indexpath (indexpath <.> "old") #else when exist $ removeFile indexpath -- to avoid clobbering oldidx #endif (mmap, _) <- mmapIndex indexpath len let magic = fromForeignPtr (castForeignPtr mmap) 0 4 create (File _) path off = do i <- createItem BlobType path mmap off case M.lookup path item_map of Nothing -> return () Just item -> do mtime <- xlatePeek64 $ iAux item hash <- iHash' item update i (Just $ fromIntegral mtime) hash return $ off + itemSize i create (SubTree s) path off = do i <- createItem TreeType path mmap off case M.lookup path item_map of Nothing -> return () Just item -> iHash' item >>= update i Nothing let subs [] = return $ off + itemSize i subs ((name,x):xs) = do let path' = path `appendPath` name noff <- subs xs create x path' noff lastOff <- subs (listImmediate s) xlatePoke64 (iAux i) (fromIntegral lastOff) return lastOff create (Stub _ _) path _ = fail $ "Cannot create index from stubbed Tree at " ++ show path pokeBS magic (BS.pack "HSI1") create (SubTree reference) (AnchoredPath []) 4 readIndex indexpath hashtree -- | Check that a given file is an index file with a format we can handle. You -- should remove and re-create the index whenever this is not true. indexFormatValid :: FilePath -> IO Bool indexFormatValid path = do fd <- openBinaryFile path ReadMode magic <- sequence [ hGetChar fd | _ <- [1..4] :: [Int] ] hClose fd return $ case magic of "HSI1" -> True _ -> False -- | DEPRECATED! Read index (just like readIndex). However, also check that the -- index version matches our expectations and if not, rebuild it from the -- reference (which is provided in form of un-executed action; we will only -- execute it when needed). readOrUpgradeIndex :: FilePath -> (Tree -> Hash) -> IO Tree -> IO Tree readOrUpgradeIndex path hashtree getref = do valid <- indexFormatValid path if valid then readIndex path hashtree else do ref <- getref >>= expand removeFile path updateIndexFrom path hashtree ref