#ifdef __GLASGOW_HASKELL__
#if __GLASGOW_HASKELL__ < 708
#endif
#endif
module Algorithm.OptimalBlocks
( Blocks(..)
, ChunkConfig(..)
, OptimalBlock(..)
, Algorithm(..)
, chop
, split
, defaultConfig
, sizedBitmask
) where
import qualified Data.Vector.Unboxed as V
import qualified Data.ByteString as ByteString
import Data.ByteString ( ByteString, length, splitAt)
import Data.Word ( Word64 )
import Data.Bits ( (.&.), shiftL )
import Control.DeepSeq ( NFData(..) )
import Algorithm.OptimalBlocks.BuzzHash ( hashes, split, slowSplit )
import Prelude hiding ( length, splitAt )
newtype OptimalBlock = OptimalBlock
{ fromOptimal :: ByteString
} deriving ( Eq, Ord, Show )
data Blocks = Blocks
{ blocksOptimal :: [OptimalBlock]
, blocksRemain :: ByteString
} deriving ( Eq, Show )
data Algorithm = Reference | Old | New deriving ( Eq, Ord, Show )
data ChunkConfig = ChunkConfig
{ windowSize :: Int
, blockSize :: Int
, chunkAlg :: Algorithm
} deriving ( Show )
defaultConfig :: ChunkConfig
defaultConfig = ChunkConfig 128 (256*kb) New
where
kb = 1024
chop :: ChunkConfig
-> ByteString
-> Blocks
chop (ChunkConfig win bksz alg) bs | alg == New || alg == Reference =
let target = toEnum bksz :: Float
bits = fromEnum $ 0.5 + logBase 2 target
in go bits [] bs
where
go bits accum rest =
let fn = case alg of New -> split
Reference -> slowSplit
_ -> error "I already checked this above"
in case fn win bits rest of
(complete, remain) | ByteString.null complete ->
Blocks (map OptimalBlock $ reverse accum) remain
(complete, remain) ->
go bits (complete:accum) remain
chop cfg bs
| length bs < winSz = Blocks [] bs
| otherwise = go
where
go =
let hashed = hashes winSz bs
locs = V.map (+winSz) $ V.findIndices (\h -> mask == (mask .&. h))
hashed
lens = V.zipWith () locs (V.cons 0 locs)
(end, rlist, _) = V.foldl' doSplit (bs, [], 0) lens
in Blocks (map OptimalBlock $ reverse rlist) end
mask :: Word64
mask = sizedBitmask desiredSz
doSplit :: (ByteString, [ByteString], Int)
-> Int
-> (ByteString, [ByteString], Int)
doSplit (b, ls, add) loc
| add+loc < winSz = (b, ls, add+loc)
| otherwise =
let (h, t) = splitAt (add+loc) b
in (t, h:ls, 0)
winSz = windowSize cfg
desiredSz = blockSize cfg
sizedBitmask :: Int -> Word64
sizedBitmask desiredSz =
let target = toEnum desiredSz :: Float
bits = fromEnum $ 0.5 + logBase 2 target
in 1 `shiftL` bits 1
instance NFData Blocks where
rnf (Blocks lst b) =
b `seq` examine lst
where
examine [] = ()
examine (hd:tl) = hd `seq` examine tl