{-# LANGUAGE ParallelListComp #-} -- | This module implements an "object storage". This is a directory on disk -- containing a content-addressed storage. This is useful for storing all kinds -- of things, particularly filesystem trees, or darcs pristine caches and patch -- objects. However, this is an abstract, flat storage: no tree semantics are -- provided. You just need to provide a reference-collecting functionality, -- computing a list of references for any given object. The system provides -- transparent garbage collection and packing. module Storage.Hashed.Packed ( Format(..), Block, OS -- * Basic operations. , hatch, compact, repack, lookup -- * Creating and loading. , create, load -- * Low-level. , format, blockLookup, live, hatchery, mature, roots, references, rootdir ) where import Prelude hiding ( lookup, read ) import Storage.Hashed.AnchoredPath( ) import Storage.Hashed.Tree ( ) import Storage.Hashed.Utils import Storage.Hashed.Hash import Control.Monad( forM, forM_, unless ) import Control.Applicative( (<$>) ) import System.FilePath( (), (<.>) ) import System.Directory( createDirectoryIfMissing, removeFile , getDirectoryContents ) import Bundled.Posix( fileExists, isDirectory, getFileStatus ) import qualified Data.ByteString.Lazy.Char8 as BL import qualified Data.ByteString.Char8 as BS import Data.Maybe( listToMaybe, catMaybes, isNothing ) import Data.Binary( encode, decode ) import qualified Data.Set as S import qualified Data.Map as M import Data.List( sort ) import Data.Int( Int64 ) -- | On-disk format for object storage: we implement a completely loose format -- (one file per object), a compact format stored in a single append-only file -- and an immutable \"pack\" format. data Format = Loose | Compact | Pack deriving (Show, Eq) loose_dirs :: [[Char]] loose_dirs = let chars = ['0'..'9'] ++ ['a'..'f'] in [ [a,b] | a <- chars, b <- chars ] loosePath :: OS -> Hash -> FilePath loosePath _ NoHash = error "No path for NoHash!" loosePath os hash = let hash' = BS.unpack (encodeBase16 hash) in rootdir os "hatchery" take 2 hash' drop 2 hash' looseLookup :: OS -> Hash -> IO (Maybe FileSegment) looseLookup _ NoHash = return Nothing looseLookup os hash = do let path = loosePath os hash exist <- fileExists <$> getFileStatus path return $ if exist then Just (path, Nothing) else Nothing -- | Object storage block. When used as a hatchery, the loose or compact format -- are preferable, while for mature space, the pack format is more useful. data Block = Block { blockLookup :: Hash -> IO (Maybe FileSegment) , size :: Int64 , format :: Format } -- | Object storage. Contains a single \"hatchery\" and possibly a number of -- mature space blocks, usually in form of packs. It also keeps a list of root -- pointers and has a way to extract pointers from objects (externally -- supplied). These last two things are used to implement a simple GC. data OS = OS { hatchery :: Block , mature :: [Block] , roots :: [Hash] , references :: FileSegment -> IO [Hash] , rootdir :: FilePath } -- | Reduce number of packs in the object storage. This may both recombine -- packs to eliminate dead objects and join some packs to form bigger packs. repack :: OS -> IO OS repack _ = error "repack undefined" -- | Add new objects to the object storage (i.e. put them into hatchery). It is -- safe to call this even on objects that are already present in the storage: -- such objects will be skipped. hatch :: OS -> [BL.ByteString] -> IO OS hatch os blobs = do processed <- mapM sieve blobs write [ (h, b) | (True, h, b) <- processed ] where write bits = case format (hatchery os) of Loose -> do forM bits $ \(hash, blob) -> do BL.writeFile (loosePath os hash) blob return os Compact -> error "hatch/compact undefined" _ -> fail "Hatchery must be either Loose or Compact." sieve blob = do let hash = sha256 blob absent <- isNothing <$> lookup os hash return (absent, hash, blob) -- | Move things from hatchery into a (new) pack. compact :: OS -> IO OS compact os = do objects <- live os [hatchery os] block <- createPack os (M.toList objects) cleanup return $ os { mature = block:mature os } where cleanup = case format (hatchery os) of Loose -> forM_ loose_dirs $ nuke . ((rootdir os "hatchery") ) Compact -> removeFile (rootdir os "hatchery") >> return () _ -> fail "Hatchery must be either Loose or Compact." nuke dir = mapM (removeFile . (dir )) =<< (Prelude.filter (`notElem` [".", ".."]) `fmap` getDirectoryContents dir) blocksLookup :: [Block] -> Hash -> IO (Maybe (Hash, FileSegment)) blocksLookup blocks hash = do segment <- cat `fmap` mapM (flip blockLookup hash) blocks return $ case segment of Nothing -> Nothing Just seg -> Just (hash, seg) where cat = listToMaybe . catMaybes lookup :: OS -> Hash -> IO (Maybe FileSegment) lookup os hash = do res <- blocksLookup (hatchery os : mature os) hash return $ case res of Nothing -> Nothing Just (_, seg) -> Just seg -- | Create an empty object storage in given directory, with a hatchery of -- given format. The directory is created if needed, but is assumed to be -- empty. create :: FilePath -> Format -> IO OS create path fmt = do createDirectoryIfMissing True path initHatchery load path where initHatchery | fmt == Loose = do mkdir hatchpath forM loose_dirs $ mkdir . (hatchpath ) | fmt == Compact = error "create/mkHatchery Compact undefined" mkdir = createDirectoryIfMissing False hatchpath = path "hatchery" load :: FilePath -> IO OS load path = do hatch_stat <- getFileStatus $ path "hatchery" let is_os = fileExists hatch_stat is_dir = isDirectory hatch_stat unless is_os $ fail $ path ++ " is not an object storage!" let _hatchery = Block { blockLookup = look os , format = if is_dir then Loose else Compact , size = undefined } os = OS { hatchery = _hatchery , rootdir = path , mature = packs , roots = _roots , references = undefined } look | format _hatchery == Loose = looseLookup | otherwise = undefined packs = [] -- FIXME read packs _roots = [] -- FIXME read root pointers return os readPack :: FilePath -> IO Block readPack file = do bits <- readSegment (file, Nothing) let count = decode (BL.take 8 $ bits) _lookup NoHash _ _ = return Nothing _lookup hash@(SHA256 rawhash) first final = do let middle = first + ((final - first) `div` 2) res <- case ( compare rawhash (hashof first) , compare rawhash (hashof middle) , compare rawhash (hashof final) ) of (LT, _, _) -> return Nothing ( _, _, GT) -> return Nothing (EQ, _, _) -> return $ Just (segof first) ( _, _, EQ) -> return $ Just (segof final) (GT, EQ, LT) -> return $ Just (segof middle) (GT, GT, LT) | middle /= final -> _lookup hash middle final (GT, LT, LT) | first /= middle -> _lookup hash first middle ( _, _, _) -> return Nothing return res headerof i = BL.take 51 $ BL.drop (8 + i * 51) bits hashof i = BS.concat $ BL.toChunks $ BL.take 32 $ headerof i segof i = (file, Just (count * 51 + 8 + from, sz)) where from = decode (BL.take 8 $ BL.drop 33 $ headerof i) sz = decode (BL.take 8 $ BL.drop 42 $ headerof i) return $ Block { size = BL.length bits , format = Pack , blockLookup = \h -> _lookup h 0 (count - 1) } createPack :: OS -> [(Hash, FileSegment)] -> IO Block createPack os bits = do contents <- mapM readSegment (map snd bits) let offsets = scanl (+) 0 $ map BL.length contents headerbits = [ BL.concat [ BL.fromChunks [rawhash] , BL.pack "@" , encode offset , BL.pack "!" , encode $ BL.length string , BL.pack "\n" ] | (SHA256 rawhash, _) <- bits | string <- contents | offset <- offsets ] header = BL.concat $ (encode $ length bits) : sort headerbits blob = BL.concat $ header:contents hash = sha256 blob path = rootdir os BS.unpack (encodeBase16 hash) <.> "bin" BL.writeFile path blob readPack path -- | Build a map of live objects (i.e. those reachable from the given roots) in -- a given list of Blocks. live :: OS -> [Block] -> IO (M.Map Hash FileSegment) live os blocks = reachable (references os) (blocksLookup blocks) (S.fromList $ roots os)