-- | Vector of (small) words which adapt their representation 
-- to make them more compact when the elements are small.
--
-- This is data structure engineered to store large amount of 
-- small vectors of small elements compactly on memory.
-- 
-- For example the list @[1..14] :: [Int]@ consumes 560 bytes (14x5=70 words) on 
-- a 64 bit machine, while the corresponding 'WordVec' takes only
-- 16 bytes (2 words), and the one corresponding to @[101..115]@ still only 
-- 24 bytes (3 words).
--
-- Unboxed arrays or unboxed vectors are better, as they only have a constant
-- overhead, but those constants are big: 13 words (104 bytes on 64 bit)
-- for unboxed arrays, and 6 words (48 bytes) for unboxed vectors. And you
-- still have to select the number of bits per element in advance.
--
-- Some operations may be a bit slower, but hopefully the cache-friendlyness 
-- will somewhat balance that (a simple microbenchmark with 'Data.Map'-s
-- indexed by @[Int]@ vs. @WordVec@ showed a 2x improvement in speed and
-- 20x improvement in memory usage). In any case the primary goal
-- here is optimized memory usage.
--
-- This module should be imported qualified (to avoid name clashes with Prelude).
--
-- TODO: ability to add user-defined (fixed-length) header, it can be 
-- potentially useful for some applications 
--

{-# LANGUAGE CPP, BangPatterns, ForeignFunctionInterface #-}
module Data.Vector.Compact.WordVec 
  ( -- * The dynamic Word vector type
    WordVec(..)
  , Shape(..)
  , vecShape , vecShape'
  , vecLen , vecBits , vecIsSmall
    -- * Show instance
  , showWordVec , showsPrecWordVec
    -- * Empty vector, singleton
  , null , empty
  , singleton , isSingleton
    -- * Conversion to\/from lists
  , fromList , fromListN , fromList'
  , toList , toRevList
    -- * Indexing
  , unsafeIndex , safeIndex 
    -- * Head, tail, etc
  , head , tail , cons , uncons
  , last ,        snoc                   -- init, unsnoc
  , concat
    -- * Specialized operations 
    --
    -- $spec
    --
    -- ** Specialized folds 
  , sum , maximum
    -- ** Specialized \"zipping folds\" 
  , eqStrict  , eqExtZero
  , cmpStrict , cmpExtZero
  , lessOrEqual , partialSumsLessOrEqual
    -- ** Specialized zips
  , add , subtract
    -- ** Specialized maps
  , scale 
    -- ** Specialized scans
  , partialSums
    -- * Generic operations
  , fold
  , naiveMap , boundedMap
  , naiveZipWith , boundedZipWith , listZipWith
    -- * Number of bits needed
  , bitsNeededFor , bitsNeededFor'
  , roundBits
  )
  where

--------------------------------------------------------------------------------

import Prelude hiding ( head , tail , init , last , null , concat , subtract , sum , maximum ) 
import qualified Data.List as L

import Data.Bits
import Data.Word

import Foreign.C

import Data.Vector.Compact.Blob hiding ( head , tail , last )
import qualified Data.Vector.Compact.Blob as Blob

--------------------------------------------------------------------------------

-- ???? how to determine this properly... 
-- why on earth isn't this stuff properly documented?!?!?!?!? 
#ifdef x86_64_HOST_ARCH
#define MACHINE_WORD_BITS 64 
#elif i386_HOST_ARCH
#define MACHINE_WORD_BITS 32
#elif i686_HOST_ARCH
#define MACHINE_WORD_BITS 32
#elif aarch64_HOST_ARCH
#define MACHINE_WORD_BITS 64 
#else
#define MACHINE_WORD_BITS 32
#endif

--------------------------------------------------------------------------------
-- * The dynamic Word vector type

-- | Dynamic word vectors are internally 'Blob'-s, which the first few bits
-- encoding their shape, and after that their content.
--
-- * small vectors has 2 bits of \"resolution\" and  5 bits of length
-- * big   vectors has 4 bits of \"resolution\" and 27 bits of length
--
-- Resolution encodes the number of bits per element. The latter is always a multiple
-- of 4 (that is: 4 bits per element, or 8, or 12, etc. up to 64 bits per element).
--
-- We use the very first bit to decide which of these two encoding we use.
-- (if we would make a sum type instead, it would take 2 extra words...)
--
-- About the instances:
-- 
-- * the @Eq@ instance is strict: @x == y@ iff @toList x == toList y@.
--   For an equality which disregards trailing zeros, see 'eqExtZero'
-- 
-- * the @Ord@ instance first compares the length, 
--   then if the lengths are equal, compares the content lexicographically.
--   For a comparison which disregards the length, and lexicographically
--   compares the sequences extended with zeros, see 'cmpExtZero'
--
newtype WordVec 
  = WordVec Blob

-- | The \"shape\" of a dynamic word vector
data Shape = Shape
  { shapeLen  :: !Int      -- ^ length of the vector
  , shapeBits :: !Int      -- ^ bits per element (quantized to multiples of 4)
  }
  deriving (Eq,Show)

vecShape :: WordVec -> Shape
vecShape = snd . vecShape'
  
-- | @vecShape' vec == (vecIsSmall vec , vecShape vec)@
vecShape' :: WordVec -> (Bool,Shape)
vecShape' (WordVec blob) = (isSmall,shape) where
  !h      = Blob.head blob
  !h2     = shiftR h 1
  !isSmall = (h .&. 1) == 0
  shape   = if isSmall
    then mkShape (shiftR h 3 .&. 31        ) (shiftL ((h2.&. 3)+1) 2)
    else mkShape (shiftR h 5 .&. 0x07ffffff) (shiftL ((h2.&.15)+1) 2)
  mkShape :: Word64 -> Word64 -> Shape
  mkShape !x !y = Shape (fromIntegral x) (fromIntegral y)

-- | @True@ if the internal representation is the \"small\" one
vecIsSmall :: WordVec -> Bool
vecIsSmall (WordVec !blob) = (Blob.head blob .&. 1) == 0  

-- | The length of the vector
vecLen :: WordVec -> Int
vecLen  = shapeLen  . vecShape

-- | The number of bits per element used to encode the vector
vecBits :: WordVec -> Int
vecBits = shapeBits . vecShape

--------------------------------------------------------------------------------
-- * Instances

instance Show WordVec where
  showsPrec = showsPrecWordVec

showWordVec :: WordVec -> String
showWordVec dynvec = showsPrecWordVec 0 dynvec []

showsPrecWordVec :: Int -> WordVec -> ShowS
showsPrecWordVec prec dynvec
  = showParen (prec > 10) 
  $ showString "fromList' "
  . showsPrec 11 (vecShape dynvec)
  . showChar ' ' 
  . shows (toList dynvec)
    
-- | The Eq instance is strict: @x == y@ iff @toList x == toList y@.
-- For an equality which disregards trailing zeros, see 'eqExtZero'.
instance Eq WordVec where
  (==) x y  = eqStrict x y 
  -- (==) x y  = (vecLen x == vecLen y) && (toList x == toList y)

-- | The Ord instance first compares the length, then if the lengths are equal, 
-- compares the content lexicographically. For a different ordering, see 'cmpExtZero'.
instance Ord WordVec where
  compare x y = cmpStrict x y
{-
  compare x y = case compare (vecLen x) (vecLen y) of 
    LT -> LT
    GT -> GT
    EQ -> compare (toList x) (toList y)
-}

--------------------------------------------------------------------------------
-- * Empty vector, singleton

empty :: WordVec
empty = fromList []

null :: WordVec -> Bool
null (WordVec !blob) = 
  -- null v = (vecLen v == 0)
  let !h = Blob.head blob 
  in  (h .&. 0xf9 == 0) || (h .&. 0xffffffe1 == 1)
  -- 0xf9       = 000 ... 00|11111001
  -- 0xffffffe1 = 111 ... 11|11100001
 
{-  
null_naive :: WordVec -> Bool
null_naive v = (vecLen v == 0)
-}

singleton :: Word -> WordVec
singleton !x = fromListN 1 x [x] where

isSingleton :: WordVec -> Maybe Word
isSingleton !v = case (vecLen v) of
  1 -> Just (head v)
  _ -> Nothing

--------------------------------------------------------------------------------
-- * Indexing

-- | No boundary check is done. Indexing starts from 0.
unsafeIndex :: Int -> WordVec -> Word
unsafeIndex idx dynvec@(WordVec blob) = 
  case isSmall of
    True  -> extractSmallWord bits blob ( 8 + bits*idx)
    False -> extractSmallWord bits blob (32 + bits*idx)
  where
    (isSmall, Shape _ bits) = vecShape' dynvec

safeIndex :: Int -> WordVec -> Maybe Word
safeIndex idx dynvec@(WordVec blob)
  | idx < 0    = Nothing
  | idx >= len = Nothing
  | otherwise  = Just $ case isSmall of
      True  -> extractSmallWord bits blob ( 8 + bits*idx)
      False -> extractSmallWord bits blob (32 + bits*idx)
  where
    (isSmall, Shape len bits) = vecShape' dynvec

--------------------------------------------------------------------------------
-- * Head, tail, etc
    
-- | Note: For the empty vector, @head@ returns 0
head :: WordVec -> Word
head dynvec@(WordVec blob) 
  | null dynvec  = 0
  | otherwise    = case vecIsSmall dynvec of
      True  -> extractSmallWord bits blob  8
      False -> extractSmallWord bits blob 32
  where
    bits = vecBits dynvec

-- | Note: For the empty vector, @last@ returns 0
last :: WordVec -> Word
last dynvec@(WordVec blob) 
  | len == 0   = 0
  | otherwise  = case isSmall of
    True  -> extractSmallWord bits blob ( 8 + bits*(len-1))
    False -> extractSmallWord bits blob (32 + bits*(len-1))
  where
    (isSmall, Shape len bits) = vecShape' dynvec

--------------------------------------------------------------------------------

-- | Note: For the empty vector, @tail@ returns (another) empty vector
tail :: WordVec -> WordVec
tail = tail_v2

-- | Prepends an element
cons :: Word -> WordVec -> WordVec
cons = cons_v2

-- | Appends an element
snoc :: WordVec -> Word -> WordVec
snoc = snoc_v2

uncons :: WordVec -> Maybe (Word, WordVec)
uncons = uncons_v2

concat :: WordVec -> WordVec -> WordVec
concat u v = fromList' (Shape (lu+lv) (max bu bv)) (toList u ++ toList v) where
  Shape lu bu = vecShape u
  Shape lv bv = vecShape v

--------------------------------------------------------------------------------

foreign import ccall unsafe "vec_identity"  c_vec_identity  :: CFun11_       -- for testing
foreign import ccall unsafe "vec_tail"      c_vec_tail      :: CFun11_
foreign import ccall unsafe "vec_head_tail" c_vec_head_tail :: CFun11 Word64
foreign import ccall unsafe "vec_cons"      c_vec_cons      :: Word64 -> CFun11_
foreign import ccall unsafe "vec_snoc"      c_vec_snoc      :: Word64 -> CFun11_

tail_v2 :: WordVec -> WordVec
tail_v2 (WordVec blob) = WordVec $ wrapCFun11_ c_vec_tail id blob

cons_v2 :: Word -> WordVec -> WordVec
cons_v2 y vec@(WordVec blob) = WordVec $ wrapCFun11_ (c_vec_cons (fromIntegral y)) f blob where
  f !n = max (n+2) worstcase
  len  = vecLen vec
  worstcase = shiftR (32 + bitsNeededFor y * (len+1) + 63) 6
  -- it can happen that we cons (2^64-1) to a long vector of 4 bit numbers...
  -- now it either fits in the old bits, in which case we need at most 1 new word
  -- (maybe two, if we also switch from small header to big header at the same time???)
  -- or does not, which is computed by @worstcase@

snoc_v2 :: WordVec -> Word -> WordVec
snoc_v2 vec@(WordVec blob) y = WordVec $ wrapCFun11_ (c_vec_snoc (fromIntegral y)) f blob where
  f !n = max (n+2) worstcase
  len  = vecLen vec
  worstcase = shiftR (32 + bitsNeededFor y * (len+1) + 63) 6
 
uncons_v2 :: WordVec -> Maybe (Word,WordVec)
uncons_v2 vec@(WordVec blob) = if null vec 
  then Nothing
  else let (hd,tl) = wrapCFun11 c_vec_head_tail id blob
       in  Just (fromIntegral hd , WordVec tl)
       
--------------------------------------------------------------------------------
-- * Conversion to\/from lists

toList :: WordVec -> [Word]
toList dynvec@(WordVec blob) =
  case isSmall of
    True  -> worker  8 len (shiftR header  8 : restOfWords)
    False -> worker 32 len (shiftR header 32 : restOfWords)
  
  where
    isSmall = (header .&. 1) == 0
    (header:restOfWords) = blobToWordList blob
      
    Shape len bits = vecShape dynvec
    
    the_mask = shiftL 1 bits - 1 :: Word64

    mask :: Word64 -> Word
    mask w = fromIntegral (w .&. the_mask)

    worker !bitOfs !0 _  = []
    worker !bitOfs !k [] = replicate k 0     -- this shouldn't happen btw 
    worker !bitOfs !k (this:rest) = 
      let newOfs = bitOfs + bits 
      in  case compare newOfs 64 of
        LT -> (mask this) : worker newOfs (k-1) (shiftR this bits : rest)
        EQ -> (mask this) : worker 0      (k-1)                     rest
        GT -> case rest of 
                (that:rest') -> 
                  let !newOfs' = newOfs - 64
                      !elem = mask (this .|. shiftL that (64-bitOfs)) 
                  in  elem : worker newOfs' (k-1) (shiftR that newOfs' : rest') 
                [] -> error "WordVec/toList: FATAL ERROR! this should not happen"

-- | @toRevList vec == reverse (toList vec)@, but should be faster (?)
toRevList :: WordVec -> [Word] 
toRevList dynvec@(WordVec blob)  = 
  case isSmall of
    True  -> [ extractSmallWord bits blob ( 8 + bits*(len-i)) | i<-[1..len] ]
    False -> [ extractSmallWord bits blob (32 + bits*(len-i)) | i<-[1..len] ]
  where
    (isSmall, Shape len bits) = vecShape' dynvec

--------------------------------------------------------------------------------
    
fromList :: [Word] -> WordVec
fromList [] = fromList' (Shape 0 4) []
fromList xs = fromList' (Shape l b) xs where
  l = length xs
  b = bitsNeededFor (L.maximum xs)

-- | This is faster than 'fromList'
fromListN
 :: Int       -- ^ length
 -> Word      -- ^ maximum (or just an upper bound)
 -> [Word]    -- ^ elements
 -> WordVec
fromListN len max = fromList' (Shape len (bitsNeededFor max))
 
-- | If you know the shape in advance, it\'s faster to use this function 
fromList' :: Shape -> [Word] -> WordVec
fromList' (Shape len bits0) words
  | bits <= 16 && len <= 31  = WordVec $ mkBlob (mkHeader 0 2)  8 words
  | otherwise                = WordVec $ mkBlob (mkHeader 1 4) 32 words
  
  where
    !bits    = max 4 $ min 64 $ (bits0 + 3) .&. 0xfc
    !bitsEnc = shiftR bits 2 - 1 :: Int
    !content = bits*len          :: Int
    !mask    = shiftL 1 bits - 1 :: Word64

    mkHeader :: Word64 -> Int -> Word64
    mkHeader !isSmall !resoBits = isSmall + fromIntegral (shiftL (bitsEnc + shiftL len resoBits) 1)
     
    mkBlob !header !ofs words = blobFromWordListN (shiftR (ofs+content+63) 6) 
                              $ worker len header ofs words
    
    worker :: Int -> Word64 -> Int -> [Word] -> [Word64]
    worker  0 !current !bitOfs _           = if bitOfs == 0 then [] else [current] 
    worker !k !current !bitOfs []          = worker k current bitOfs [0]   
    worker !k !current !bitOfs (this0:rest) = 
      let !this     = (fromIntegral this0) .&. mask
          !newOfs   = bitOfs + bits 
          !current' = (shiftL this bitOfs) .|. current 
      in  case compare newOfs 64 of
        LT ->            worker (k-1) current' newOfs rest
        EQ -> current' : worker (k-1) 0        0      rest 
        GT -> let !newOfs' = newOfs - 64
              in   current' : worker (k-1) (shiftR this (64-bitOfs)) newOfs' rest

--------------------------------------------------------------------------------
-- * Specialized operations 
--
-- $spec
--
-- These are are faster than the generic operations below, and should be preferred
-- to those.
--

--------------------------------------------------------------------------------
-- ** Specialized folds 

-- | Sum of the elements of the vector
sum :: WordVec -> Word
sum (WordVec blob) = fromIntegral $ wrapCFun10 c_vec_sum blob

-- | Maximum of the elements of the vector
maximum :: WordVec -> Word
maximum (WordVec blob) = fromIntegral $ wrapCFun10 c_vec_max blob

foreign import ccall unsafe "vec_sum" c_vec_sum :: CFun10 Word64
foreign import ccall unsafe "vec_max" c_vec_max :: CFun10 Word64

--------------------------------------------------------------------------------
-- ** Specialized \"zipping folds\" 
--

foreign import ccall unsafe "vec_equal_strict"    c_equal_strict  :: CFun20 CInt
foreign import ccall unsafe "vec_equal_extzero"   c_equal_extzero :: CFun20 CInt
foreign import ccall unsafe "vec_compare_strict"  c_compare_strict  :: CFun20 CInt
foreign import ccall unsafe "vec_compare_extzero" c_compare_extzero :: CFun20 CInt
foreign import ccall unsafe "vec_less_or_equal"              c_less_or_equal :: CFun20 CInt
foreign import ccall unsafe "vec_partial_sums_less_or_equal" c_partial_sums_less_or_equal :: CFun20 CInt

-- | Strict equality of vectors (same length, same content)
eqStrict :: WordVec -> WordVec -> Bool
eqStrict (WordVec blob1) (WordVec blob2) = (0 /= wrapCFun20 c_equal_strict blob1 blob2)

-- | Equality of vectors extended with zeros to infinity
eqExtZero :: WordVec -> WordVec -> Bool
eqExtZero (WordVec blob1) (WordVec blob2) = (0 /= wrapCFun20 c_equal_extzero blob1 blob2)

cintToOrdering :: CInt -> Ordering
cintToOrdering !k
  | k < 0     = LT
  | k > 0     = GT
  | otherwise = EQ
  
-- | Strict comparison of vectors (first compare the lengths; if the lengths are the same then compare lexicographically)
cmpStrict :: WordVec -> WordVec -> Ordering
cmpStrict (WordVec blob1) (WordVec blob2) = cintToOrdering $ wrapCFun20 c_compare_strict blob1 blob2

-- | Lexicographic ordering of vectors extended with zeros to infinity
cmpExtZero :: WordVec -> WordVec -> Ordering
cmpExtZero (WordVec blob1) (WordVec blob2) = cintToOrdering $ wrapCFun20 c_compare_extzero blob1 blob2

-- | Pointwise comparison of vectors extended with zeros to infinity
lessOrEqual :: WordVec -> WordVec -> Bool
lessOrEqual (WordVec blob1) (WordVec blob2) = (0 /= wrapCFun20 c_less_or_equal blob1 blob2)

-- | Pointwise comparison of partial sums of vectors extended with zeros to infinity
-- 
-- For example @[x1,x2,x3] <= [y1,y2,y3]@ iff (@x1 <=y1 && x1+x2 <= y1+y2 && x1+x2+x3 <= y1+y2+y3@).
--
partialSumsLessOrEqual :: WordVec -> WordVec -> Bool
partialSumsLessOrEqual (WordVec blob1) (WordVec blob2) =
  (0 /= wrapCFun20 c_partial_sums_less_or_equal blob1 blob2)

--------------------------------------------------------------------------------
-- ** Specialized zips
--

foreign import ccall unsafe "vec_add"           c_vec_add          :: CFun21_
foreign import ccall unsafe "vec_sub_overflow"  c_vec_sub_overflow :: CFun21 CInt

-- | Pointwise addition of vectors. The shorter one is extended by zeros.
add :: WordVec -> WordVec -> WordVec
add vec1@(WordVec blob1) vec2@(WordVec blob2) = WordVec $ wrapCFun21_ c_vec_add f blob1 blob2 where
  -- WARNING! memory allocation is _very_ tricky here!
  -- worst case: we have a very long vector with 4 bits/elem,
  -- and a very short vector with 64 bits/elem!
  -- even @max b1 b2@ is not enough, because it can overflow...
  f _ _ = 1 + shiftR ( (max b1 b2 + 4)*(max l1 l2) + 63 ) 6
  Shape !l1 !b1 = vecShape vec1
  Shape !l2 !b2 = vecShape vec2

-- | Pointwise subtraction of vectors. The shorter one is extended by zeros.
-- If any element would become negative, we return Nothing
subtract :: WordVec -> WordVec -> Maybe WordVec
subtract vec1@(WordVec blob1) vec2@(WordVec blob2) = 
  case (wrapCFun21 c_vec_sub_overflow f blob1 blob2) of
    (0 , blob3) -> Just (WordVec blob3)
    (_ , _    ) -> Nothing
  where
    f _ _ = 1 + shiftR ( (max b1 b2 + 4)*(max l1 l2) + 63 ) 6
    Shape !l1 !b1 = vecShape vec1
    Shape !l2 !b2 = vecShape vec2

--------------------------------------------------------------------------------
-- ** Specialized maps

foreign import ccall unsafe "vec_scale" c_vec_scale :: Word64 -> CFun11_

-- | Pointwise multiplication by a constant.
scale :: Word -> WordVec -> WordVec
scale s vec@(WordVec blob) = WordVec $ wrapCFun11_ (c_vec_scale (fromIntegral s)) f blob where
  f _ = shiftR (32 + len*newbits + 63) 6   
  Shape !len !bits = vecShape vec
  bound = if s <= shiftL 1 (64-bits)
    then (2^bits - 1) * s
    else (2^64   - 1)
  newbits = bitsNeededFor bound  

--------------------------------------------------------------------------------
-- ** Specialized scans

foreign import ccall unsafe "vec_partial_sums" c_vec_partial_sums :: CFun11 Word64

-- | @toList (partialSums vec) == tail (scanl (+) 0 $ toList vec)@
partialSums :: WordVec -> WordVec
partialSums vec@(WordVec blob) = WordVec $ snd $ wrapCFun11 c_vec_partial_sums f blob where
  f _ = shiftR (32 + len*newbits + 63) 6   
  Shape !len !bits = vecShape vec
  bound = if len <= shiftL 1 (64-bits)
    then (2^bits - 1) * (fromIntegral len :: Word)        -- worst case: @replicate N (2^bits-1)@
    else (2^64   - 1)
  newbits = bitsNeededFor bound  
    
--------------------------------------------------------------------------------
-- * Some generic operations

-- | Left fold
fold :: (a -> Word -> a) -> a -> WordVec -> a
fold f x v = L.foldl' f x (toList v)  

naiveMap :: (Word -> Word) -> WordVec -> WordVec
naiveMap f u = fromList (map f $ toList u)

-- | If you have a (nearly sharp) upper bound to the result of your of function
-- on your vector, mapping can be more efficient 
boundedMap :: Word -> (Word -> Word) -> WordVec -> WordVec
boundedMap bound f vec = fromList' (Shape l bits) (toList vec) where
  l    = vecLen vec
  bits = bitsNeededFor bound

naiveZipWith :: (Word -> Word -> Word) -> WordVec -> WordVec -> WordVec
naiveZipWith f u v = fromList $ L.zipWith f (toList u) (toList v)

-- | If you have a (nearly sharp) upper bound to the result of your of function
-- on your vector, zipping can be more efficient 
boundedZipWith :: Word -> (Word -> Word -> Word) -> WordVec -> WordVec -> WordVec
boundedZipWith bound f vec1 vec2  = fromList' (Shape l bits) $ L.zipWith f (toList vec1) (toList vec2) where
  l    = min (vecLen vec1) (vecLen vec2)
  bits = bitsNeededFor bound

listZipWith :: (Word -> Word -> a) -> WordVec -> WordVec -> [a]
listZipWith f u v = L.zipWith f (toList u) (toList v)
              
--------------------------------------------------------------------------------
-- * Misc helpers

-- | Number of bits needed to encode a given number, rounded up to multiples of four
bitsNeededFor :: Word -> Int
bitsNeededFor = bitsNeededForHs

-- | Number of bits needed to encode a given number
bitsNeededFor' :: Word -> Int
bitsNeededFor' = bitsNeededForHs'

bitsNeededForHs :: Word -> Int
bitsNeededForHs = roundBits . bitsNeededForHs'

bitsNeededForHs' :: Word -> Int
bitsNeededForHs' bound 
  | bound   == 0  = 1                             -- this is handled incorrectly by the formula below
  | bound+1 == 0  = MACHINE_WORD_BITS             -- and this handled incorrectly because of overflow
  | otherwise     = ceilingLog2 (bound + 1)       -- for example, if maximum is 16, log2 = 4 but we need 5 bits 
  where    
    -- | Smallest integer @k@ such that @2^k@ is larger or equal to @n@
    ceilingLog2 :: Word -> Int
    ceilingLog2 0 = 0
    ceilingLog2 n = 1 + go (n-1) where
      go 0 = -1
      go k = 1 + go (shiftR k 1)

{-

-- apparently, the C implementation is _not_ faster...

foreign import ccall unsafe "export_required_bits_not_rounded" export_required_bits_not_rounded :: Word64 -> CInt
foreign import ccall unsafe "export_required_bits"             export_required_bits             :: Word64 -> CInt

bitsNeededForC :: Word -> Int
bitsNeededForC = fromIntegral . export_required_bits . fromIntegral

bitsNeededForC' :: Word -> Int
bitsNeededForC' = fromIntegral . export_required_bits_not_rounded . fromIntegral
-}

-- | We only allow multiples of 4.
roundBits :: Int -> Int
roundBits 0 = 4
roundBits k = shiftL (shiftR (k+3) 2) 2

--------------------------------------------------------------------------------