-- | -- Module : Database.CDB -- Copyright : (c) Boris Sukholitko 2013 -- -- License : BSD3 -- -- A library for reading and writing CDB (Constant Database) files. -- -- CDB files are immutable key-value stores, designed for extremely fast and -- memory-efficient construction and lookup. They can be as large as 4GB, and -- at no point in their construction or use must all data be loaded into -- memory. CDB files can contain multiple values for a given key. -- -- For more information on the CDB file format, please see: -- -- -- Here's how you make new CDB file: -- -- > import qualified Data.ByteString.Char8 as B -- > import Database.PureCDB -- > -- > makeIt :: IO () -- > makeIt = makeCDB (do -- > addBS (B.pack "foo") (B.pack "bar") -- > addBS (B.pack "foo") (B.pack "baz")) "foo.cdb" -- -- You can later use it as in: -- -- > getIt :: IO [ByteString] -- > getIt = do -- > f <- openCDB "foo.cdb" -- > getBS f (B.pack "foo") -- > closeCDB "foo.cdb" -- > -- -- @getIt@ returns [ \"bar\", \"baz\" ] in unspecified order. -- -- Note that @pure-cdb@ works on strict 'ByteString''s only for now. {-# LANGUAGE DeriveFunctor, GeneralizedNewtypeDeriving #-} module Database.PureCDB ( -- * Writing interface WriteCDB, makeCDB, addBS -- * Reading interface , ReadCDB, openCDB, closeCDB, getBS) where import Data.Word import System.IO import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString as B import Data.Binary.Get import Data.Binary.Put import qualified Data.Vector as V import qualified Data.Vector.Generic.Mutable as MV import Control.Applicative import Control.Monad.State import System.Directory import Control.Monad.ST import Database.PureCDB.Internal data HashState = HashState { hsCount :: !Word32, hsPairs :: ![(Word32, Word32)] } data WriteState = WriteState { wsHandle :: Handle, wsTOC :: !(V.Vector HashState) } -- | Write context monad transformer. newtype WriteCDB m a = WriteCDB (StateT WriteState m a) deriving (Functor, Monad, Applicative, MonadTrans, MonadIO) readWordPairs :: Handle -> Int -> IO [(Word32, Word32)] readWordPairs ioh sz = do bs <- BL.hGet ioh sz return $ runGet (mapM (const go) [ 1 .. sz `div` 8 ]) bs where go = (,) <$> getWord32le <*> getWord32le -- | Opens CDB database. openCDB :: FilePath -> IO ReadCDB openCDB fp = do ioh <- openBinaryFile fp ReadMode hSetBuffering ioh NoBuffering wps <- readWordPairs ioh 2048 let v = V.fromList $ map (uncurry TOCHash) wps return $ ReadCDB ioh v -- | Closes the database. closeCDB :: ReadCDB -> IO () closeCDB (ReadCDB ioh _) = hClose ioh getRecord :: Handle -> Word32 -> IO (B.ByteString, B.ByteString) getRecord ioh sk = do hSeek ioh AbsoluteSeek (fromIntegral sk) [(ksz, vsz)] <- readWordPairs ioh 8 k <- B.hGet ioh $ fromIntegral ksz v <- B.hGet ioh $ fromIntegral vsz return (k, v) -- | Fetches key from the database. getBS :: ReadCDB -> B.ByteString -> IO [B.ByteString] getBS r@(ReadCDB ioh _) bs = do hSeek ioh AbsoluteSeek (fromIntegral $ hpos + slot * 8) wps <- readWordPairs ioh (fromIntegral $ (hlen - slot) * 8) let pairs = filter ((== h) . fst) $ takeWhile ((/= 0) . snd) wps kvs <- mapM (getRecord ioh . snd) pairs return $ map snd $ filter ((bs ==) . fst) kvs where (TOCHash hpos hlen, h) = tocFind r bs slot = hashSlot h hlen updateTOC :: V.Vector HashState -> B.ByteString -> Word32 -> V.Vector HashState updateTOC vec key cur = runST $ do v <- V.unsafeThaw vec hs <- MV.read v i MV.write v i (HashState (cnt hs) (pairs hs)) V.unsafeFreeze v where hsh = cdbHash key cnt hs = hsCount hs + 1 i = tocIndex hsh pairs hs = (hsh, cur):hsPairs hs -- | Adds key and value to the CDB database. addBS :: MonadIO m => B.ByteString -> B.ByteString -> WriteCDB m () addBS key val = WriteCDB $ do st <- get cur <- liftIO $ hTell (wsHandle st) liftIO $ BL.hPut (wsHandle st) buf put $ st { wsTOC = updateTOC (wsTOC st) key (fromIntegral cur) } where buf = runPut $ do putWord32le $ fromIntegral $ B.length key putWord32le $ fromIntegral $ B.length val putByteString key putByteString val writePairs :: Handle -> [(Word32, Word32)] -> IO () writePairs ioh pairs = BL.hPut ioh buf where buf = runPut $ mapM_ one pairs one (a, b) = putWord32le a >> putWord32le b writeOneHash :: Handle -> HashState -> IO (Word32, Word32) writeOneHash ioh (HashState cnt pairs) = do cur <- hTell ioh writePairs ioh $ V.toList vec return (fromIntegral cur, fromIntegral $ V.length vec) where vec = createHashVector (1 + cnt * 2) (0, 0) pairs -- | Runs WriteCDB monad transformer to make the database. makeCDB :: MonadIO m => WriteCDB m a -> FilePath -> m a makeCDB (WriteCDB m) fp = do ioh <- liftIO $ openBinaryFile (fp ++ ".tmp") WriteMode liftIO $ hSeek ioh AbsoluteSeek 2048 (res, st) <- runStateT m $ WriteState ioh $ V.replicate 256 (HashState 0 []) tocs <- liftIO $ mapM (writeOneHash ioh) $ V.toList $ wsTOC st liftIO $ hSeek ioh AbsoluteSeek 0 liftIO $ writePairs ioh tocs liftIO $ hClose ioh liftIO $ renameFile (fp ++ ".tmp") fp return res