Haskell Code by HsColour

{-# LANGUAGE ScopedTypeVariables, NoMonomorphismRestriction #-}

-- | This module contains plain tree indexing code. The index itself is a
-- CACHE: you should only ever use it as an optimisation and never as a primary
-- storage. In practice, this means that when we change index format, the
-- application is expected to throw the old index away and build a fresh
-- index. Please note that tracking index validity is out of scope for this
-- library: this is responsibility of your application. It is advisable that in
-- your validity tracking code, you also check for format validity (see
-- "indexFormatValid") and scrap and re-create index when needed.
--
-- The index is a binary file, that overlays a hashed tree over the working
-- copy. This means that every working file and directory has an entry in the
-- index, that contains its path and hash and validity data. The validity data
-- is a "last seen" timestamp plus the file size. The file hashes are sha256's
-- of the file's content.
--
-- There are two entry types, a file entry and a directory entry. Both have a
-- common binary format (see 'Item'). The on-disk format is best described by
-- 'peekItem'.
--
-- For each file, the index has a copy of the timestamp taken at the instant
-- when the hash has been computed. This means that when file size and
-- timestamp of a file in working copy matches those in the index, we assume
-- that the hash stored in the index for given file is valid. These hashes are
-- then exposed in the resulting 'Tree' object, and can be leveraged by eg.
-- 'diffTrees' to compare many files quickly.
--
-- You may have noticed that we also keep hashes of directories. These are
-- assumed to be valid whenever the complete subtree has had valid
-- timestamps. At any point, as soon as a size or timestamp mismatch is found,
-- the working file in question is opened, its hash (and timestamp and size) is
-- recomputed and updated in-place in the index file (everything lives at a
-- fixed offset and is fixed size, so this isn't an issue). This is also true
-- of directories: when a file in a directory changes hash, this triggers
-- recomputation of all of its parent directory hashes; moreover this is done
-- efficiently -- each directory is updated at most once during a run.

module Storage.Hashed.Index( readIndex, updateIndexFrom, readOrUpgradeIndex
                           , indexFormatValid )
    where

import Prelude hiding ( lookup, readFile, writeFile, catch )
import Storage.Hashed.Utils
import Storage.Hashed.Tree
import Storage.Hashed.AnchoredPath
import Data.Int( Int64, Int32 )

import qualified Data.Set as S
import qualified Data.Map as M

import Bundled.Posix( getFileStatusBS, modificationTime,
                      getFileStatus, fileSize, fileExists, EpochTime )
import System.IO.MMap( mmapFileForeignPtr, Mode(..) )
import System.IO( openBinaryFile, hGetChar, hClose, IOMode(..) )
import System.Directory( removeFile, doesFileExist )
import Control.Monad( when )

import qualified Data.ByteString.Char8 as BS
import Data.ByteString.Internal( toForeignPtr, fromForeignPtr, memcpy
                               , nullForeignPtr )

import Data.IORef( newIORef, readIORef, modifyIORef, IORef )
import Data.Maybe( fromJust, isJust )
import Data.Bits( Bits )

import Foreign.Storable
import Foreign.ForeignPtr
import Foreign.Ptr

--------------------------
-- Indexed trees
--

-- | Description of a a single indexed item. The structure itself does not
-- contain any data, just pointers to the underlying mmap (bytestring is a
-- pointer + offset + length).
--
-- The structure is recursive-ish (as opposed to flat-ish structure, which is
-- used by git...) It turns out that it's hard to efficiently read a flat index
-- with our internal data structures -- we need to turn the flat index into a
-- recursive Tree object, which is rather expensive... As a bonus, we can also
-- efficiently implement subtree queries this way (cf. 'readIndex').
data Item = Item { iPath :: BS.ByteString
                 , iName :: BS.ByteString
                 , iHash :: BS.ByteString
                 , iSize :: Ptr Int64
                 , iAux :: Ptr Int64 -- end-offset for dirs, mtime for files
                 } deriving Show

itemSize :: Item -> Int
itemSize i = 4 + (BS.length $ iPath i) + 1 + 64 + 16

itemSizeI :: (Num a) => Item -> a
itemSizeI = fromIntegral . itemSize

itemIsDir :: Item -> Bool
itemIsDir i = BS.last (iPath i) == '/'

noslashpath :: Item -> FilePath
noslashpath i = BS.unpack $ itemIsDir i ? (BS.init $ iPath i, iPath i)

-- xlatePeek32 = fmap xlate32 . peek
xlatePeek64 :: (Storable a, Bits a) => Ptr a -> IO a
xlatePeek64 = fmap xlate64 . peek

-- xlatePoke32 ptr v = poke ptr (xlate32 v)
xlatePoke64 :: (Storable a, Bits a) => Ptr a -> a -> IO ()
xlatePoke64 ptr v = poke ptr (xlate64 v)

-- | Lay out the basic index item structure in memory. The memory location is
-- given by a ForeignPointer () and an offset. The path and type given are
-- written out, and a corresponding Item is given back. The remaining bits of
-- the item can be filled out using 'update'.
createItem :: ItemType -> AnchoredPath -> ForeignPtr () -> Int -> IO Item
createItem typ path fp off =
 do let name = BS.concat [ flatten path,
                           (typ == TreeType) ? (BS.singleton '/', BS.empty),
                           BS.singleton '\0' ]
        (namefp, nameoff, namel) = toForeignPtr name
    withForeignPtr fp $ \p ->
        withForeignPtr namefp $ \namep ->
            do pokeByteOff p off namel
               memcpy (plusPtr p $ off + 4)
                      (plusPtr namep nameoff)
                      (fromIntegral namel)
               peekItem fp off Nothing

-- | Read the on-disk representation into internal data structure. The Index is
-- organised into "lines" where each line describes a single indexed
-- item. Cf. 'Item'.
--
-- The first word on the index "line" is the length of the file path (which is
-- the only variable-length part of the line). Then comes the path itself, then
-- fixed-length hash (sha256) of the file in question, then two words, one for
-- size and one "aux", which is used differently for directories and for files.
--
-- With directories, this aux holds the offset of the next sibling line in the
-- index, so we can efficiently skip reading the whole subtree starting at a
-- given directory (by just seeking aux bytes forward). The lines are
-- pre-ordered with respect to directory structure -- the directory comes first
-- and after it come all its items. Cf. 'readIndex''.
--
-- For files, the aux field holds a timestamp.
peekItem :: ForeignPtr () -> Int -> Maybe Int -> IO Item
peekItem fp off dirlen =
    withForeignPtr fp $ \p -> do
      nl' :: Int32 <- xlate32 `fmap` peekByteOff p off
      let nl = fromIntegral nl'
          path = fromForeignPtr (castForeignPtr fp) (off + 4) (nl - 1)
          path_noslash = (BS.last path == '/') ? (BS.init path, path)
          hash = fromForeignPtr (castForeignPtr fp) (off + 4 + nl) 64
          name = snd $ case dirlen of
                         Just split -> BS.splitAt split path_noslash
                         Nothing -> BS.spanEnd (/= '/') path_noslash
      return $! Item { iName = name
                     , iPath = path
                     , iHash = hash
                     , iSize = plusPtr p (off + 4 + nl + 64)
                     , iAux = plusPtr p (off + 4 + nl + 64 + 8)
                     }

-- | Update an existing item with new hash and optionally mtime (give Nothing
-- when updating directory entries).
update :: Item -> Maybe EpochTime -> Hash -> IO ()
update item mtime (Hash (Just size, hash)) =
    do xlatePoke64 (iSize item) size
       pokeBS (iHash item) hash
       when (isJust mtime) $ xlatePoke64 (iAux item)
                                  (fromIntegral $ fromEnum $ fromJust mtime)
update _ _ _ = fail "Index.update requires a hash with size included."

iHash' :: Item -> IO Hash
iHash' i = do size <- xlatePeek64 $ iSize i
              return $ hashSetSize (Hash (undefined, iHash i)) size

-- | Gives a ForeignPtr to mmapped index, which can be used for reading and
-- updates.
mmapIndex :: forall a. FilePath -> Int -> IO (ForeignPtr a, Int)
mmapIndex indexpath req_size = do
  exist <- doesFileExist indexpath
  act_size <- if exist then fileSize `fmap` getFileStatus indexpath
                       else return 0
  let size :: Int
      size = fromIntegral $
                 if req_size > 0 then fromIntegral req_size else act_size
  case size of
    0 -> return (castForeignPtr nullForeignPtr, size)
    _ -> do (x, _) <- mmapFileForeignPtr indexpath
                                         ReadWrite (Just (0, size + 4))
            return (x, size)

-- | See 'readIndex'. This version also gives a map from paths to items, so the
-- extra per-item data can be used (hash and mtime) directly. The map is in a
-- form of 'IORef', since the data is not available until the tree is expanded.
readIndex' :: FilePath -> (Tree -> Hash)
           -> IO (Tree, IORef (M.Map AnchoredPath Item))
readIndex' indexpath hashtree = do
  (mmap, mmap_size) <- mmapIndex indexpath 0
  dirs_changed <- newIORef S.empty
  item_map <- newIORef M.empty
  let readItem :: AnchoredPath -> Int -> Int -> IO (Item, Maybe TreeItem)
      readItem parent_path off dl =
          do item <- peekItem mmap off (Just dl)
             x <- if itemIsDir item then readDir parent_path item off dl
                                    else readFile parent_path item
             when (isJust x) $ modifyIORef item_map $ \m ->
                 M.insert (parent_path `appendPath` (Name $ iName item)) item m
             return (item, x)
      readDir :: AnchoredPath -> Item -> Int -> Int -> IO (Maybe TreeItem)
      readDir parent_path item off dl =
          do dirend <- xlatePeek64 $ iAux item
             st <- getFileStatus (noslashpath item)
             let this_path = parent_path `appendPath` (Name $ iName item)
                 nl = BS.length (iName item)
                 dl' = dl + (nl == 0) ? (0, 1 + nl)
                 subs coff | coff < dirend = do
                   (idx_item, tree_item) <- readItem this_path
                                                     (fromIntegral coff) dl'
                   next <- if itemIsDir idx_item
                              then xlatePeek64 $ iAux idx_item
                              else return $ coff + itemSizeI idx_item
                   rest <- subs next
                   case tree_item of
                     Nothing -> return $! rest
                     Just ti -> return $! (Name $ iName idx_item, ti) : rest
                 subs coff | coff == dirend = return []
                           | otherwise = fail "Offset mismatch."
                 updateHash path tree =
                     do changed <- S.member path `fmap` readIORef dirs_changed
                        let hash = hashtree tree
                            tree' = tree { treeHash = Just hash }
                        if changed
                           then do update item Nothing hash
                                   return tree'
                           else return tree
             treehash <- iHash' item
             let rt = Stub (do s <- subs $ fromIntegral (off + itemSize item)
                               return $ (makeTree s)
                                        { finish = updateHash this_path
                                        , treeHash = Just treehash })
                           (Just treehash)
             return $ if fileExists st then Just rt else Nothing
      readFile parent_path item =
          do st <- getFileStatusBS (iPath item)
             mtime <- fromIntegral `fmap` (xlatePeek64 $ iAux item)
             size <- xlatePeek64 $ iSize item
             let mtime' = modificationTime st
                 size' = fileSize st
                 readblob = readSegment (BS.unpack $ iPath item, Nothing)
             when ( mtime /= mtime' || size /= fromIntegral size' ) $
                  do hash_' <- sha256 `fmap` readblob
                     let hash' = hashSetSize hash_' (fromIntegral size')
                     update item (Just mtime') hash'
                     modifyIORef dirs_changed $ \s ->
                         S.union (S.fromList $ parent_path : parents parent_path) s
             hash <- iHash' item
             if fileExists st
                then return $ Just $ File (Blob readblob $ Just hash)
                else return Nothing
  if mmap_size > 0 then
      do (_, Just (Stub root h)) <- readItem (AnchoredPath []) 4 (-2)
         tree <- root
         return (tree { treeHash = h }, item_map)
    else return (emptyTree, item_map)

-- | Read an index and build up a 'Tree' object from it, referring to current
-- working directory. Any parts of the index that are out of date are updated
-- in-place. The result is always an up-to-date index. Also, the 'Tree' is
-- stubby and only the pieces of the index that are expanded will be actually
-- updated!  To implement a subtree query, you can use 'Tree.filter' and then
-- expand the result. Otherwise just expand the whole tree to avoid unexpected
-- problems.
readIndex :: FilePath -> (Tree -> Hash) -> IO Tree
readIndex x y = fst `fmap` readIndex' x y -- pointfree is uglier

-- | Will add and remove files in index to make it match the 'Tree' object
-- given (it is an error for the 'Tree' to contain a file or directory that
-- does not exist in a plain form in current working directory).
updateIndexFrom :: FilePath -> (Tree -> Hash) -> Tree -> IO Tree
updateIndexFrom indexpath hashtree ref =
    do (oldidx', item_map') <- readIndex' indexpath hashtree
       expand oldidx'
       item_map <- readIORef item_map'
       reference <- expand ref
       let typeLen TreeType = 1
           typeLen BlobType = 0
           paths = [ (p, itemType i) | (p, i) <- list reference ]
           len = 87 + sum [ typeLen typ + 84 + 1 + length (anchorPath "" p)
                                | (p, typ) <- paths ]
       exist <- doesFileExist indexpath
       when exist $ removeFile indexpath -- to avoid clobbering oldidx
       (mmap, _) <- mmapIndex indexpath len
       let magic = fromForeignPtr (castForeignPtr mmap) 0 4
           create (File _) path off =
               do i <- createItem BlobType path mmap off
                  case M.lookup path item_map of
                    Nothing -> return ()
                    Just item -> do mtime <- xlatePeek64 $ iAux item
                                    hash <- iHash' item
                                    update i (Just $ fromIntegral mtime) hash
                  return $ off + itemSize i
           create (SubTree s) path off =
               do i <- createItem TreeType path mmap off
                  case M.lookup path item_map of
                    Nothing -> return ()
                    Just item -> iHash' item >>= update i Nothing
                  let subs [] = return $ off + itemSize i
                      subs ((name,x):xs) = do
                        let path' = path `appendPath` name
                        noff <- subs xs
                        create x path' noff
                  lastOff <- subs (listImmediate s)
                  xlatePoke64 (iAux i) (fromIntegral lastOff)
                  return lastOff
           create (Stub _ _) path _ =
               fail $ "Cannot create index from stubbed Tree at " ++ show path
       pokeBS magic (BS.pack "HSI1")
       create (SubTree reference) (AnchoredPath []) 4
       readIndex indexpath hashtree

-- | Check that a given file is an index file with a format we can handle. You
-- should remove and re-create the index whenever this is not true.
indexFormatValid :: FilePath -> IO Bool
indexFormatValid path = do
  fd <- openBinaryFile path ReadMode
  magic <- sequence [ hGetChar fd | _ <- [1..4] :: [Int] ]
  hClose fd
  return $ case magic of
             "HSI1" -> True
             _ -> False

-- | DEPRECATED! Read index (just like readIndex). However, also check that the
-- index version matches our expectations and if not, rebuild it from the
-- reference (which is provided in form of un-executed action; we will only
-- execute it when needed).
readOrUpgradeIndex :: FilePath -> (Tree -> Hash) -> IO Tree -> IO Tree
readOrUpgradeIndex path hashtree getref = do
  valid <- indexFormatValid path
  if valid then readIndex path hashtree
           else do ref <- getref >>= expand
                   removeFile path
                   updateIndexFrom path hashtree ref