{-# LANGUAGE BangPatterns #-} {- | This is a rolling hash used to calculate the hashes of fixed-length sequences of characters within a given ByteString. BuzzHash is nice in that it makes this calculation very efficient. -} module Algorithm.OptimalBlocks.BuzzHash ( hashes , Hash(..) , init , roll , h ) where import qualified Data.ByteString as BS import qualified Data.Vector.Unboxed as VU import Data.Vector.Unboxed ( Vector, generate, (!), empty, constructN, unsafeLast, null ) import Data.ByteString ( ByteString, pack, length ) import Data.ByteString.Unsafe ( unsafeTake, unsafeIndex ) import Data.Word ( Word8, Word64 ) import Data.Bits ( rotateL, xor ) import Algorithm.OptimalBlocks.SipHash ( hashByteString ) import Prelude hiding ( init, length, null, rem ) {-| Determine the hash of every 'len' sequence of bytes in the given 'ByteString'. Because this uses BuzzHash, a rolling hash, this runs in @O(n)@ time dependent on the length of 'bs', and independent of 'len'. This will generate @(length bs - len + 1)@ 64-bit hashes. -} hashes :: Int -- ^ How many bytes to put into each hash -> ByteString -- ^ The 'ByteString' to calculate hashes of. -> Vector Word64 hashes len bs | length bs < len = empty | otherwise = constructN (length bs - len + 1) upd where upd :: Vector Word64 -> Word64 upd v | null v = currentVal $ init $ unsafeTake len bs | otherwise = let prevH = {-# SCC hash_last #-} Hash len $ unsafeLast v old8 = unsafeIndex bs (VU.length v - 1) new8 = unsafeIndex bs (len + VU.length v - 1) in {-# SCC current_roll #-} currentVal $ roll prevH old8 new8 {-| A representation of a hash that allows rolling hashes to be easily calculated. -} data Hash = Hash { windowLength :: !Int , currentVal :: !Word64 } deriving ( Show ) {-| Create a 'Hash' instance using an entire 'ByteString'. This doesn't have any sort of length argument to do partial 'ByteString's because 'ByteString' supports efficient slices on its own. -} init :: ByteString -> Hash init bs = let hash = BS.foldl upd 0 bs in Hash (length bs) hash where upd :: Word64 -> Word8 -> Word64 upd hsh w8 = (hsh `rotateL` 1) `xor` (h w8) {-| Roll the 'Hash' to the next byte over in the 'ByteString' being hashed. This doesn't do any sort of checking to ensure that 'old' and 'new' are actually correct, so this is probably easy to mess up when using it manually. The expected usage is that one would initialize a hash using 'init' on the beginning of some 'ByteString', and then to calculate the hash of each sequence of bytes one would manually track the first and last byte of each window on the 'ByteString'. 'hashes' does this for you... -} roll :: Hash -> Word8 -> Word8 -> Hash roll hsh old new = let rem = {-# SCC roll_rem #-} (h old) `rotateL` (windowLength hsh) add = {-# SCC roll_add #-} h new skh = {-# SCC roll_skh #-} (currentVal hsh) `rotateL` 1 in {-# SCC roll_hsh #-} hsh { currentVal = rem `xor` add `xor` skh } {-# INLINE roll #-} {-| Upgrade an 8-bit word to a 64-bit one that is very "different" from all the other possible 8-bit words. This library uses SipHash to do this. -} h :: Word8 -> Word64 h w = let a = {-# SCC h_a #-} hs b = {-# SCC h_fromEnum #-} fromEnum w c = {-# SCC h_lookup #-} a ! b in c -- h = (hs !) . fromEnum {-# INLINE h #-} hs :: Vector Word64 hs = generate 256 fn where fn n = hashByteString 2 4 k0 k1 $ pack [toEnum n] k0 = 0x4a7330fae70f52e8 k1 = 0x919ea5953a9a1ec9 {-# NOINLINE hs #-}