#if __GLASGOW_HASKELL__ >= 706
#endif
#if __GLASGOW_HASKELL__ >= 707
#endif
module Data.HyperLogLog.Type
(
HyperLogLog(..)
, HasHyperLogLog(..)
, size
, insert
, insertHash
, intersectionSize
, cast
#if __GLASGOW_HASKELL__ >= 708
, coerceConfig
#endif
) where
#if __GLASGOW_HASKELL__ < 710
import Control.Applicative
#endif
import Control.DeepSeq (NFData (..))
import Control.Lens
import Control.Monad
import Crypto.MAC.SipHash
import Data.Approximate.Type
import Data.Bits.Extras
import qualified Data.Binary as Binary
import Data.Binary
import Data.Bytes.Put (runPutS)
import Data.Bytes.Serial
import Data.HyperLogLog.Config
import Data.Proxy
import Data.Reflection
import Data.Semigroup
import Data.Serialize as Serialize
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector.Unboxed.Mutable as MV
#if __GLASGOW_HASKELL__ < 710
import Data.Word
#endif
import GHC.Generics hiding (D, to)
import GHC.Int
#if __GLASGOW_HASKELL__ >= 708
import Data.Type.Coercion (Coercion(..))
#endif
newtype HyperLogLog p = HyperLogLog { runHyperLogLog :: V.Vector Rank }
deriving (Eq, Show, Generic, NFData)
#if __GLASGOW_HASKELL__ >= 708
coerceConfig :: forall p q . (Reifies p Integer, Reifies q Integer) => Maybe (Coercion (HyperLogLog p) (HyperLogLog q))
coerceConfig | reflect (Proxy :: Proxy p) == reflect (Proxy :: Proxy q) = Just Coercion
| otherwise = Nothing
#endif
#if __GLASGOW_HASKELL__ >= 707
type role HyperLogLog nominal
#endif
instance Serialize (HyperLogLog p)
instance Serial (HyperLogLog p) where
serialize (HyperLogLog v) = serialize (V.toList v)
deserialize = liftM (HyperLogLog . V.fromList) deserialize
instance Binary (HyperLogLog p) where
put (HyperLogLog v) = Binary.put (V.toList v)
get = fmap (HyperLogLog . V.fromList) Binary.get
class HasHyperLogLog a p | a -> p where
hyperLogLog :: Lens' a (HyperLogLog p)
instance HasHyperLogLog (HyperLogLog p) p where
hyperLogLog = id
_HyperLogLog :: Iso' (HyperLogLog p) (V.Vector Rank)
_HyperLogLog = iso runHyperLogLog HyperLogLog
instance Semigroup (HyperLogLog p) where
HyperLogLog a <> HyperLogLog b = HyperLogLog (V.zipWith max a b)
instance Reifies p Integer => Monoid (HyperLogLog p) where
mempty = HyperLogLog $ V.replicate (numBuckets (reflect (Proxy :: Proxy p))) 0
mappend = (<>)
sipKey :: SipKey
sipKey = SipKey 4 7
siphash :: Serial a => a -> Word64
siphash a = h
where !bs = runPutS (serialize a)
(SipHash !h) = hash sipKey bs
insert :: (Reifies s Integer, Serial a) => a -> HyperLogLog s -> HyperLogLog s
insert = insertHash . w32 . siphash
insertHash :: Reifies s Integer => Word32 -> HyperLogLog s -> HyperLogLog s
insertHash h m@(HyperLogLog v) = HyperLogLog $ V.modify (\x -> do
old <- MV.read x bk
when (rnk > old) $ MV.write x bk rnk
) v where
!n = reflect m
!bk = calcBucket n h
!rnk = calcRank n h
size :: Reifies p Integer => HyperLogLog p -> Approximate Int64
size m@(HyperLogLog bs) = Approximate 0.9972 l expected h where
n = reflect m
m' = fromIntegral (numBuckets n)
numZeros = fromIntegral . V.length . V.filter (== 0) $ bs
res = case raw < smallRange n of
True | numZeros > 0 -> m' * log (m' / numZeros)
| otherwise -> raw
False | raw <= interRange -> raw
| otherwise -> raw + (raw / lim32) * raw
raw = rawFact n * (1 / sm)
sm = V.sum $ V.map (\x -> 1 / (2 ^^ x)) bs
expected = round res
sd = 1.04 / sqrt m'
l = floor $ max (res*(13*sd)) 0
h = ceiling $ res*(1+3*sd)
#ifdef HERBIE
#endif
intersectionSize :: Reifies p Integer => [HyperLogLog p] -> Approximate Int64
intersectionSize [] = 0
intersectionSize (x:xs) = withMin 0 $ size x + intersectionSize xs intersectionSize (mappend x <$> xs)
cast :: forall p q. (Reifies p Integer, Reifies q Integer) => HyperLogLog p -> Maybe (HyperLogLog q)
cast old
| newBuckets <= oldBuckets = Just $ over _HyperLogLog ?? mempty $ V.modify $ \m ->
V.forM_ (V.indexed $ old^._HyperLogLog) $ \ (i,o) -> do
let j = mod i newBuckets
a <- MV.read m j
MV.write m j (max o a)
| otherwise = Nothing
where
newBuckets = numBuckets (reflect (Proxy :: Proxy q))
oldBuckets = numBuckets (reflect old)