src/Algorithm/OptimalBlocks/BuzzHash.hs

{-# LANGUAGE BangPatterns #-}
{- | This is a rolling hash used to calculate the hashes of fixed-length
 sequences of characters within a given ByteString. BuzzHash is nice in that it
 makes this calculation very efficient.
 -}

module Algorithm.OptimalBlocks.BuzzHash
( hashes
, split
, slowSplit
, Hash(..)
, init
, roll
, h
) where

import qualified Data.ByteString as ByteString
import qualified Data.Vector.Unboxed as VU
import Data.Bits              ( (.&.), rotateL, shiftL, xor )
import Data.ByteString        ( ByteString, pack, length )
import Data.ByteString.Unsafe ( unsafeTake, unsafeIndex )
import Data.Vector.Unboxed    ( Vector, generate, (!), empty, constructN, unsafeLast, null )
import Data.Word              ( Word8, Word64 )

import Algorithm.OptimalBlocks.SipHash ( hashByteString )

import Prelude hiding ( init, length, null, rem )

type WindowSize = Int
type BlockShift = Int

-- | Split up a ByteString into a complete block and a remainder. If no break
-- point is found in the given string, the first element of the resulting pair
-- will be empty.
split :: WindowSize -- ^ How many bytes to use when generating the pattern hash
      -> BlockShift -- ^ log2 of the desired block size
      -> ByteString -- ^ The ByteString to split
      -> (ByteString, ByteString)
split win _shift bytes | ByteString.length bytes <= win =
  (ByteString.empty, bytes)
split win shift bytes = go 0 (hash0 $ unsafeTake win bytes)
  where
  go !start !hash | mask == (hash .&. mask) =
    ByteString.splitAt (start + win) bytes
  go !start !_hash | start == end =
    (ByteString.empty, bytes)
  go !start !hash =
    -- 8-byte window:
    -- hash of 0 1 2 3 4 5 6 7
    -- rolls into
    -- hash of   1 2 3 4 5 6 7 8
    -- by removing the value at "start" and adding in the value at "start+win"
    let !old = unsafeIndex bytes start
        !new = unsafeIndex bytes (start + win)
        !hh' = roll win hash old new
    in go (start+1) hh'

  -- 8-byte window, 10 bytes of data, rolls like
  -- [01234567]89  (start = 0)
  -- 0[12345678]9  (start = 1)
  -- 01[23456789]  (start = 2)
  -- we are done when start = (length bytes) - win (2, here)
  end = ByteString.length bytes - win

  mask = (1 `shiftL` shift) - 1

-- | Same thing as 'split', but actually calculates the hash of every window
-- explicitly. This is really slow, and is only used to validate the results of
-- 'split'
slowSplit :: WindowSize -- ^ How many bytes to use for pattern search
          -> BlockShift -- ^ log2 of the desired block size
          -> ByteString -- ^ The ByteString to split
          -> (ByteString, ByteString)
slowSplit win _shift bytes | ByteString.length bytes <= win =
  (ByteString.empty, bytes)
slowSplit win shift bytes = go 0
  where
  go start =
    let tl   = ByteString.drop start bytes
        bs   = ByteString.take win tl
        hash = hash0 bs
    in if ByteString.length bs < win
        then (ByteString.empty, bytes)
        else if mask == (mask .&. hash)
              then ByteString.splitAt (start + win) bytes
              else go (start+1)

  mask = (1 `shiftL` shift) - 1

{-| Determine the hash of every 'len' sequence of bytes in the given
 'ByteString'. Because this uses BuzzHash, a rolling hash, this runs in @O(n)@
 time dependent on the length of 'bs', and independent of 'len'.

 This will generate @(length bs - len + 1)@ 64-bit hashes.

 This function really doesn't serve any purpose anymore, and is just kept
 around as a sanity check for the much faster 'split' function.
 -}
hashes :: WindowSize  -- ^ How many bytes to put into each hash
       -> ByteString  -- ^ The 'ByteString' to calculate hashes of.
       -> Vector Word64
hashes len bs 
  | length bs < len = empty
  | otherwise = constructN (length bs - len + 1) upd
  where
  upd :: Vector Word64 -> Word64
  upd v
    | null v    = currentVal $ init $ unsafeTake len bs
    | otherwise =
        let !prevH = {-# SCC hash_last #-} Hash len $ unsafeLast v
            !vlen  = VU.length v - 1
            !old8  = unsafeIndex bs vlen
            !new8  = unsafeIndex bs (len + vlen)
            !rollV = roll (windowLength prevH) (currentVal prevH) old8 new8
        in rollV

{-| A representation of a hash that allows rolling hashes to be easily
 calculated.
 -}
data Hash = Hash { windowLength :: !WindowSize
                 , currentVal   :: !Word64
                 }
            deriving ( Show )

{-| Create a 'Hash' instance using an entire 'ByteString'. This doesn't have
 any sort of length argument to do partial 'ByteString's because 'ByteString'
 supports efficient slices on its own.
 -}
init :: ByteString -> Hash
init bs = Hash (length bs) (hash0 bs)

{-| Calculate the actual hash of the given ByteString.
 -}
hash0 :: ByteString -> Word64
hash0 bs = ByteString.foldl upd 0 bs
  where
  upd :: Word64 -> Word8 -> Word64
  upd hsh w8 = (hsh `rotateL` 1) `xor` (h w8)

{-| Roll the 'Hash' to the next byte over in the 'ByteString' being hashed.
 This doesn't do any sort of checking to ensure that 'old' and 'new' are
 actually correct, so this is probably easy to mess up when using it manually.
 The expected usage is that one would initialize a hash using 'init' on the
 beginning of some 'ByteString', and then to calculate the hash of each
 sequence of bytes one would manually track the first and last byte of each
 window on the 'ByteString'. 'hashes' does this for you...
 -}
roll :: WindowSize -> Word64 -> Word8 -> Word8 -> Word64
roll !winsz !hsh !old !new =
  let !rem = {- {-# SCC roll_rem #-} -} (h old) `rotateL` winsz
      !add = {- {-# SCC roll_add #-} -} h new
      !skh = {- {-# SCC roll_skh #-} -} hsh `rotateL` 1
  in {-# SCC roll_hsh #-} rem `xor` add `xor` skh
{-# INLINE roll #-}

{-| Upgrade an 8-bit word to a 64-bit one that is very "different" from all
 the other possible 8-bit words. This library uses SipHash to do this.
 -}
h :: Word8 -> Word64
h = (hs !) . fromEnum
{-# INLINE h #-}

{- |A pre-calculated array of hashes of bytes.
 -}
hs :: Vector Word64
hs = generate 256 fn
  where
  fn n = hashByteString 2 4 k0 k1 $ pack [toEnum n]
  k0 = 0x4a7330fae70f52e8
  k1 = 0x919ea5953a9a1ec9
{-# NOINLINE hs #-}