-- | This module defines the central type class `LargeHashable` of this package. {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DefaultSignatures #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE FlexibleContexts #-} module Data.LargeHashable.Class ( LargeHashable(..), largeHash, LargeHashable'(..) ) where -- keep imports in alphabetic order (in Emacs, use "M-x sort-lines") import Data.Bits import Data.Fixed import Data.Foldable import Data.Int import Data.LargeHashable.Intern import Data.Ratio import Data.Time import Data.Time.Clock.TAI import Data.Void (Void) import Data.Word import Foreign.C.Types import Foreign.Ptr import GHC.Generics import qualified Codec.Binary.UTF8.Light as Utf8 import qualified Data.Aeson as J import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy.Internal as BLI import qualified Data.ByteString.Short as BS import qualified Data.Foldable as F import qualified Data.HashMap.Lazy as HashMap import qualified Data.HashSet as HashSet import qualified Data.IntMap as IntMap import qualified Data.IntSet as IntSet import qualified Data.Map as M import qualified Data.Scientific as Sci import qualified Data.Sequence as Seq import qualified Data.Set as S import qualified Data.Strict.Tuple as Tuple import qualified Data.Text as T import qualified Data.Text.Foreign as TF import qualified Data.Text.Internal.Lazy as TLI import qualified Data.Text.Lazy as TL import qualified Data.Vector as V -- | A type class for computing hashes (i.e. MD5, SHA256, ...) from -- haskell values. -- -- The laws of this typeclass are the following: -- -- (1) If two values are equal -- according to '==', then the finally computed hashes must also be equal -- according to '=='. However it is not required that the hashes of inequal -- values have to be inequal. Also note that an instance of 'LargeHashable' -- does not require a instance of 'Eq'. Using any sane algorithm the chance -- of a collision should be 1 / n where n is the number of different hashes -- possible. -- -- (2) If two values are inequal -- according to '==', then the probability of a hash collision is 1/n, -- where n is the number of possible hashes produced by the -- underlying hash algorithm. -- -- A rule of thumb: hash all information that you would also need for -- serializing/deserializing values of your datatype. For instance, when -- hashing lists, you would not only hash the list elements but also the -- length of the list. Consider the following datatype -- -- > data Foo = Foo [Int] [Int] -- -- We now write an instance for LargeHashable like this -- -- > instance LargeHashable Foo where -- > updateHash (Foo l1 l2) = updateHash l1 >> updateHash l2 -- -- If we did not hash the length of a list, then the following two values -- of @Foo@ would produce identical hashes: -- -- > Foo [1,2,3] [] -- > Foo [1] [2,3] -- class LargeHashable a where updateHash :: a -> LH () default updateHash :: (GenericLargeHashable (Rep a), Generic a) => a -> LH () updateHash = updateHashGeneric . from class LargeHashable' t where updateHash' :: LargeHashable a => t a -> LH () -- | 'largeHash' is the central function of this package. -- For a given value it computes a 'Hash' using the given -- 'HashAlgorithm'. largeHash :: LargeHashable a => HashAlgorithm h -> a -> h largeHash algo x = runLH algo (updateHash x) {-# INLINE updateHashTextData #-} updateHashTextData :: T.Text -> LH () updateHashTextData !t = do updates <- hashUpdates ioInLH $ do TF.useAsPtr t $ \valPtr units -> hu_updatePtr updates (castPtr valPtr) (fromIntegral (2 * units)) return () {-# INLINE updateHashText #-} updateHashText :: T.Text -> LH () updateHashText !t = do updateHashTextData t updates <- hashUpdates ioInLH $ hu_updateULong updates (fromIntegral (T.length t)) instance LargeHashable T.Text where updateHash = updateHashText {-# INLINE updateHashLazyText #-} updateHashLazyText :: Int -> TL.Text -> LH () updateHashLazyText !len (TLI.Chunk !t !next) = do updateHashTextData t updateHashLazyText (len + T.length t) next updateHashLazyText !len TLI.Empty = updateHash len instance LargeHashable TL.Text where updateHash = updateHashLazyText 0 {-# INLINE updateHashByteStringData #-} updateHashByteStringData :: B.ByteString -> LH () updateHashByteStringData !b = do updates <- hashUpdates ioInLH $ do ptr <- B.useAsCString b return hu_updatePtr updates (castPtr ptr) (B.length b) {-# INLINE updateHashByteString #-} updateHashByteString :: B.ByteString -> LH () updateHashByteString !b = do updateHashByteStringData b updates <- hashUpdates ioInLH $ hu_updateULong updates (fromIntegral (B.length b)) instance LargeHashable B.ByteString where updateHash = updateHashByteString {-# INLINE updateHashLazyByteString #-} updateHashLazyByteString :: Int -> BL.ByteString -> LH () updateHashLazyByteString !len (BLI.Chunk !bs !next) = do updateHashByteStringData bs updateHashLazyByteString (len + B.length bs) next updateHashLazyByteString !len BLI.Empty = updateHash len instance LargeHashable BL.ByteString where updateHash = updateHashLazyByteString 0 instance LargeHashable BS.ShortByteString where updateHash = updateHash . BS.fromShort {-# INLINE updateHashWithFun #-} updateHashWithFun :: (HashUpdates -> a -> IO ()) -> a -> LH () updateHashWithFun f x = do updates <- hashUpdates ioInLH $ f updates x instance LargeHashable Int where updateHash i = updateHashWithFun hu_updateLong (fromIntegral i) instance LargeHashable Int8 where updateHash = updateHashWithFun hu_updateChar . CChar instance LargeHashable Int16 where updateHash = updateHashWithFun hu_updateShort . CShort instance LargeHashable Int32 where updateHash = updateHashWithFun hu_updateInt . CInt instance LargeHashable Int64 where updateHash = updateHashWithFun hu_updateLong . CLong instance LargeHashable Word where updateHash i = updateHashWithFun hu_updateLong (fromIntegral i) instance LargeHashable Word8 where updateHash = updateHashWithFun hu_updateUChar . CUChar instance LargeHashable Word16 where updateHash = updateHashWithFun hu_updateUShort . CUShort instance LargeHashable Word32 where updateHash = updateHashWithFun hu_updateUInt . CUInt instance LargeHashable Word64 where updateHash = updateHashWithFun hu_updateULong . CULong instance LargeHashable CChar where updateHash = updateHashWithFun hu_updateChar instance LargeHashable CShort where updateHash = updateHashWithFun hu_updateShort instance LargeHashable CInt where updateHash = updateHashWithFun hu_updateInt instance LargeHashable CLong where updateHash = updateHashWithFun hu_updateLong instance LargeHashable CUChar where updateHash = updateHashWithFun hu_updateUChar instance LargeHashable CUShort where updateHash = updateHashWithFun hu_updateUShort instance LargeHashable CUInt where updateHash = updateHashWithFun hu_updateUInt instance LargeHashable CULong where updateHash = updateHashWithFun hu_updateULong instance LargeHashable Char where updateHash = updateHashWithFun hu_updateUInt . CUInt . Utf8.c2w {-# INLINE updateHashInteger #-} updateHashInteger :: Integer -> LH () updateHashInteger !i | i == 0 = updateHash (0 :: CUChar) | i > 0 = do updateHash (fromIntegral (i .&. 0xffffffffffffffff) :: CULong) updateHashInteger (shift i (-64)) | otherwise = do updateHash (0 :: CUChar) -- prepend 0 to show it is negative updateHashInteger (abs i) instance LargeHashable Integer where updateHash = updateHashInteger foreign import ccall doubleToWord64 :: Double -> Word64 instance LargeHashable Double where updateHash = updateHash . doubleToWord64 foreign import ccall floatToWord32 :: Float -> Word32 instance LargeHashable Float where updateHash = updateHash . floatToWord32 {-# INLINE updateHashFixed #-} updateHashFixed :: HasResolution a => Fixed a -> LH () updateHashFixed f = updateHash (truncate . (* f) . fromInteger $ resolution f :: Integer) instance HasResolution a => LargeHashable (Fixed a) where updateHash = updateHashFixed {-# INLINE updateHashBool #-} updateHashBool :: Bool -> LH () updateHashBool True = updateHash (1 :: CUChar) updateHashBool False = updateHash (0 :: CUChar) instance LargeHashable Bool where updateHash = updateHashBool {-# INLINE updateHashList #-} updateHashList :: LargeHashable a => [a] -> LH () updateHashList = loop 0 where loop :: LargeHashable a => Int -> [a] -> LH () loop !i [] = updateHash i loop !i (x:xs) = do updateHash x loop (i + 1) xs instance LargeHashable a => LargeHashable [a] where updateHash = updateHashList {-# INLINE setFoldFun #-} setFoldFun :: LargeHashable a => LH () -> a -> LH () setFoldFun action value = action >> updateHash value {-# INLINE updateHashSet #-} updateHashSet :: LargeHashable a => S.Set a -> LH () updateHashSet !set = do foldl' setFoldFun (return ()) set -- Note: foldl' for sets traverses the elements in asc order updateHash (S.size set) instance LargeHashable a => LargeHashable (S.Set a) where updateHash = updateHashSet {-# INLINE updateHashIntSet #-} updateHashIntSet :: IntSet.IntSet -> LH () updateHashIntSet !set = do IntSet.foldl' setFoldFun (return ()) set updateHash (IntSet.size set) -- Lazy and Strict IntSet share the same definition instance LargeHashable IntSet.IntSet where updateHash = updateHashIntSet {-# INLINE updateHashHashSet #-} updateHashHashSet :: LargeHashable a => HashSet.HashSet a -> LH () updateHashHashSet !set = -- The ordering of elements in a set does not matter. A HashSet does not -- offer an efficient way of exctracting its elements in some specific -- ordering. So we use the auxiliary function 'hashListModuloOrdering'. hashListModuloOrdering (HashSet.size set) (HashSet.toList set) -- | Hashes a list of values such the two permutations of the same list -- yields the same hash. hashListModuloOrdering :: LargeHashable a => Int -> [a] -> LH () hashListModuloOrdering len list = do updateXorHash (map updateHash list) updateHash len -- Lazy and Strict HashSet share the same definition instance LargeHashable a => LargeHashable (HashSet.HashSet a) where updateHash = updateHashHashSet {-# INLINE mapFoldFun #-} mapFoldFun :: (LargeHashable k, LargeHashable a) => LH () -> k -> a -> LH () mapFoldFun action key value = action >> updateHash key >> updateHash value {-# INLINE updateHashMap #-} updateHashMap :: (LargeHashable k, LargeHashable a) => M.Map k a -> LH () updateHashMap !m = do M.foldlWithKey' mapFoldFun (return ()) m updateHash (M.size m) -- Lazy and Strict Map share the same definition instance (LargeHashable k, LargeHashable a) => LargeHashable (M.Map k a) where updateHash = updateHashMap {-# INLINE updateHashIntMap #-} updateHashIntMap :: LargeHashable a => IntMap.IntMap a -> LH () updateHashIntMap !m = do IntMap.foldlWithKey' mapFoldFun (return ()) m updateHash (IntMap.size m) -- Lazy and Strict IntMap share the same definition instance LargeHashable a => LargeHashable (IntMap.IntMap a) where updateHash = updateHashIntMap updateHashHashMap :: (LargeHashable k, LargeHashable v) => HashMap.HashMap k v -> LH () updateHashHashMap !m = -- The ordering of elements in a map do not matter. A HashMap does not -- offer an efficient way of exctracting its elements in some specific -- ordering. So we use the auxiliary function 'hashListModuloOrdering'. hashListModuloOrdering (HashMap.size m) (HashMap.toList m) -- Lazy and Strict HashMap share the same definition instance (LargeHashable k, LargeHashable v) => LargeHashable (HashMap.HashMap k v) where updateHash = updateHashHashMap {-# INLINE updateHashTuple #-} updateHashTuple :: (LargeHashable a, LargeHashable b) => (a, b) -> LH () updateHashTuple (!a, !b) = updateHash a >> updateHash b instance (LargeHashable a, LargeHashable b) => LargeHashable (a, b) where updateHash = updateHashTuple {-# INLINE updateHashTriple #-} updateHashTriple :: (LargeHashable a, LargeHashable b, LargeHashable c) => (a, b, c) -> LH () updateHashTriple (a, b, c) = updateHash a >> updateHash b >> updateHash c instance (LargeHashable a, LargeHashable b, LargeHashable c) => LargeHashable (a, b, c) where updateHash = updateHashTriple {-# INLINE updateHashQuadruple #-} updateHashQuadruple :: (LargeHashable a, LargeHashable b, LargeHashable c, LargeHashable d) => (a, b, c, d) -> LH () updateHashQuadruple (a, b, c, d) = updateHash a >> updateHash b >> updateHash c >> updateHash d instance (LargeHashable a, LargeHashable b, LargeHashable c, LargeHashable d) => LargeHashable (a, b, c, d) where updateHash = updateHashQuadruple {-# INLINE updateHashQuintuple #-} updateHashQuintuple :: (LargeHashable a, LargeHashable b, LargeHashable c, LargeHashable d, LargeHashable e) => (a, b, c, d, e) -> LH () updateHashQuintuple (a, b, c, d, e) = updateHash a >> updateHash b >> updateHash c >> updateHash d >> updateHash e instance (LargeHashable a, LargeHashable b, LargeHashable c, LargeHashable d, LargeHashable e) => LargeHashable (a, b, c, d, e) where updateHash = updateHashQuintuple updateHashMaybe :: LargeHashable a => Maybe a -> LH () updateHashMaybe Nothing = updateHash (0 :: CULong) updateHashMaybe (Just !x) = updateHash (1 :: CULong) >> updateHash x instance LargeHashable a => LargeHashable (Maybe a) where updateHash = updateHashMaybe instance (LargeHashable a, LargeHashable b) => LargeHashable (Either a b) where updateHash (Left !l) = updateHash (0 :: CULong) >> updateHash l updateHash (Right !r) = updateHash (1 :: CULong) >> updateHash r instance LargeHashable () where updateHash () = updateHash (0 :: CULong) instance LargeHashable Ordering where updateHash EQ = updateHash (0 :: CULong) updateHash GT = updateHash (-1 :: CULong) updateHash LT = updateHash (1 :: CULong) instance (Integral a, LargeHashable a) => LargeHashable (Ratio a) where updateHash !i = do updateHash $ numerator i updateHash $ denominator i instance LargeHashable AbsoluteTime where updateHash t = updateHash $ diffAbsoluteTime t taiEpoch instance LargeHashable DiffTime where -- could be replaced by diffTimeToPicoseconds as soon as -- time 1.6 becomes more common updateHash = updateHash . (fromRational . toRational :: DiffTime -> Pico) instance LargeHashable NominalDiffTime where updateHash = updateHash . (fromRational . toRational :: NominalDiffTime -> Pico) instance LargeHashable LocalTime where updateHash (LocalTime d tod) = updateHash d >> updateHash tod instance LargeHashable ZonedTime where updateHash (ZonedTime lt tz) = updateHash lt >> updateHash tz instance LargeHashable TimeOfDay where updateHash (TimeOfDay h m s) = updateHash h >> updateHash m >> updateHash s instance LargeHashable TimeZone where updateHash (TimeZone mintz summerOnly name) = updateHash mintz >> updateHash summerOnly >> updateHash name instance LargeHashable UTCTime where updateHash (UTCTime d dt) = updateHash d >> updateHash dt instance LargeHashable Day where updateHash (ModifiedJulianDay d) = updateHash d instance LargeHashable UniversalTime where updateHash (ModJulianDate d) = updateHash d instance LargeHashable a => LargeHashable (V.Vector a) where updateHash = updateHash . V.toList instance (LargeHashable a, LargeHashable b) => LargeHashable (Tuple.Pair a b) where updateHash (x Tuple.:!: y) = do updateHash x updateHash y instance LargeHashable Sci.Scientific where updateHash notNormalized = do let n = Sci.normalize notNormalized updateHash (Sci.coefficient n) updateHash (Sci.base10Exponent n) instance LargeHashable J.Value where updateHash v = case v of J.Object obj -> do updateHash (0::Int) updateHash obj J.Array arr -> do updateHash (1::Int) updateHash arr J.String t -> do updateHash (2::Int) updateHash t J.Number n -> do updateHash (3::Int) updateHash n J.Bool b -> do updateHash (4::Int) updateHash b J.Null -> updateHash (5::Int) instance LargeHashable Void where updateHash _ = error "I'm void" instance LargeHashable a => LargeHashable (Seq.Seq a) where updateHash = updateHash . F.toList -- | Support for generically deriving 'LargeHashable' instances. -- Any instance of the type class 'GHC.Generics.Generic' can be made -- an instance of 'LargeHashable' by an empty instance declaration. class GenericLargeHashable f where updateHashGeneric :: f p -> LH () instance GenericLargeHashable V1 where updateHashGeneric = undefined instance GenericLargeHashable U1 where updateHashGeneric _ = updateHash () instance (GenericLargeHashable f, GenericLargeHashable g) => GenericLargeHashable (f :+: g) where updateHashGeneric (L1 x) = do updateHash (0 :: CULong) -- is left updateHashGeneric x updateHashGeneric (R1 x) = do updateHash (1 :: CULong) -- is right updateHashGeneric x instance (GenericLargeHashable f, GenericLargeHashable g) => GenericLargeHashable (f :*: g) where updateHashGeneric (x :*: y) = updateHashGeneric x >> updateHashGeneric y instance LargeHashable c => GenericLargeHashable (K1 i c) where updateHashGeneric (K1 x) = updateHash x -- ignore meta-info (for now) instance (GenericLargeHashable f) => GenericLargeHashable (M1 i t f) where updateHashGeneric (M1 x) = updateHashGeneric x