{-# LANGUAGE BangPatterns              #-}
{-# LANGUAGE CPP                       #-}
{-# LANGUAGE DeriveGeneric             #-}
{-# LANGUAGE FlexibleContexts          #-}
{-# LANGUAGE FlexibleInstances         #-}
{-# LANGUAGE FunctionalDependencies    #-}
{-# LANGUAGE GeneralizedNewtypeDeriving#-}
{-# LANGUAGE MultiParamTypeClasses     #-}
{-# LANGUAGE PolyKinds                 #-}
{-# LANGUAGE RankNTypes                #-}
{-# LANGUAGE RoleAnnotations           #-}
{-# LANGUAGE ScopedTypeVariables       #-}
{-# LANGUAGE TypeApplications          #-}
{-# LANGUAGE UndecidableInstances      #-}
{-# LANGUAGE MonoLocalBinds            #-}
{-# OPTIONS_GHC -fno-cse #-}
{-# OPTIONS_GHC -fno-full-laziness #-}
{-# OPTIONS_GHC -fno-float-in #-}
{-# OPTIONS_GHC -Wno-unused-binds #-}

--------------------------------------------------------------------

-- |

-- Copyright :  (c) Edward Kmett 2013-2025

-- License   :  BSD3

-- Maintainer:  Edward Kmett <ekmett@gmail.com>

-- Stability :  experimental

-- Portability: non-portable

--

-- This package provides an approximate streaming (constant space)

-- unique object counter.

--

-- See the original paper for details:

-- <http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf>

--------------------------------------------------------------------

module Data.HyperLogLog.Type
  (
  -- * HyperLogLog

    DefaultSipKey
  , DefaultHyperLogLog
  , SipKey
  , reifySipKey
  , HyperLogLog(..)
  , generateHyperLogLog
  , HasHyperLogLog(..)
  , size
  , insert
  , insertHash
  , intersectionSize
  , cast
  , coerceConfig
  ) where

import Control.DeepSeq (NFData (..))
import Control.Lens
import Control.Monad
import Crypto.MAC.SipHash
import Data.Approximate.Type
import Data.Bits.Extras
import qualified Data.Binary as Binary
import Data.Binary
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Bytes.Get as Bytes (getWord64host, runGetL)
import Data.Bytes.Put (runPutS)
import Data.Bytes.Serial
import Data.HyperLogLog.Config
import Data.Proxy
import Data.Reflection
import Data.Serialize as Serialize
import Data.Type.Coercion (Coercion(..))
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector.Unboxed.Mutable as MV
import GHC.Generics hiding (D, to)
import GHC.Int
import GHC.Types (Type)
import System.Entropy (getEntropy)

#if !(MIN_VERSION_base(4,11,0))
import Data.Semigroup (Semigroup(..))
#endif

-- $setup

-- >>> :set -XScopedTypeVariables

-- >>> :set -XTemplateHaskell

-- >>> :set -XDataKinds

-- >>> import Data.HyperLogLog

-- >>> import Control.Lens

-- >>> import Data.Reflection

-- >>> import Data.Monoid

-- >>> import qualified Data.Vector.Unboxed as V


------------------------------------------------------------------------------

-- HyperLogLog

------------------------------------------------------------------------------


-- |

--

-- Initialize a new counter:

--

-- >>> runHyperLogLog (mempty :: DefaultHyperLogLog 3) == V.fromList [0,0,0,0,0,0,0,0]

-- True

--

-- Please note how you specify a counter size with the @n@

-- invocation. Sizes of up to 16 are valid, with 7 being a

-- likely good minimum for decent accuracy.

--

-- Let's count a list of unique items and get the latest estimate:

--

-- >>> size (foldr insert mempty [1..10] :: DefaultHyperLogLog 4)

-- Approximate {_confidence = 0.9972, _lo = 2, _estimate = 9, _hi = 17}

--

-- Note how 'insert' can be used to add new observations to the

-- approximate counter.

--

-- The @s@ type parameter configures the 'SipKey' that is passed to the hash

-- function when 'insert'ing a new value. Note that if cryptographic security is

-- a primary consideration, it is recommended that you create 'HyperLogLog'

-- values using 'generateHyperLogLog' so that the 'SipKey' is randomly

-- generated using system entropy. In contrast, the 'HyperLogLog' data

-- constructor and the 'mempty' method allow constructing 'HyperLogLog' values

-- with fixed 'SipKey's, which can result in exponentially inaccurate estimates

-- if exploited by an adversary. (See <https://eprint.iacr.org/2021/1139>.)

newtype HyperLogLog s p =
  -- | Construct a 'HyperLogLog' value directly from a 'V.Vector'.

  --

  -- Note that using this data constructor directly permits the @s@ type

  -- parameter to be a fixed 'SipKey', which can have cryptographic security

  -- implications. See the Haddocks for 'HyperLogLog' for more details.

  HyperLogLog { forall {k} {k} (s :: k) (p :: k). HyperLogLog s p -> Vector Rank
runHyperLogLog :: V.Vector Rank }
    deriving (HyperLogLog s p -> HyperLogLog s p -> Bool
(HyperLogLog s p -> HyperLogLog s p -> Bool)
-> (HyperLogLog s p -> HyperLogLog s p -> Bool)
-> Eq (HyperLogLog s p)
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
forall k (s :: k) k (p :: k).
HyperLogLog s p -> HyperLogLog s p -> Bool
$c== :: forall k (s :: k) k (p :: k).
HyperLogLog s p -> HyperLogLog s p -> Bool
== :: HyperLogLog s p -> HyperLogLog s p -> Bool
$c/= :: forall k (s :: k) k (p :: k).
HyperLogLog s p -> HyperLogLog s p -> Bool
/= :: HyperLogLog s p -> HyperLogLog s p -> Bool
Eq, Int -> HyperLogLog s p -> ShowS
[HyperLogLog s p] -> ShowS
HyperLogLog s p -> String
(Int -> HyperLogLog s p -> ShowS)
-> (HyperLogLog s p -> String)
-> ([HyperLogLog s p] -> ShowS)
-> Show (HyperLogLog s p)
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall k (s :: k) k (p :: k). Int -> HyperLogLog s p -> ShowS
forall k (s :: k) k (p :: k). [HyperLogLog s p] -> ShowS
forall k (s :: k) k (p :: k). HyperLogLog s p -> String
$cshowsPrec :: forall k (s :: k) k (p :: k). Int -> HyperLogLog s p -> ShowS
showsPrec :: Int -> HyperLogLog s p -> ShowS
$cshow :: forall k (s :: k) k (p :: k). HyperLogLog s p -> String
show :: HyperLogLog s p -> String
$cshowList :: forall k (s :: k) k (p :: k). [HyperLogLog s p] -> ShowS
showList :: [HyperLogLog s p] -> ShowS
Show, (forall x. HyperLogLog s p -> Rep (HyperLogLog s p) x)
-> (forall x. Rep (HyperLogLog s p) x -> HyperLogLog s p)
-> Generic (HyperLogLog s p)
forall x. Rep (HyperLogLog s p) x -> HyperLogLog s p
forall x. HyperLogLog s p -> Rep (HyperLogLog s p) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall k (s :: k) k (p :: k) x.
Rep (HyperLogLog s p) x -> HyperLogLog s p
forall k (s :: k) k (p :: k) x.
HyperLogLog s p -> Rep (HyperLogLog s p) x
$cfrom :: forall k (s :: k) k (p :: k) x.
HyperLogLog s p -> Rep (HyperLogLog s p) x
from :: forall x. HyperLogLog s p -> Rep (HyperLogLog s p) x
$cto :: forall k (s :: k) k (p :: k) x.
Rep (HyperLogLog s p) x -> HyperLogLog s p
to :: forall x. Rep (HyperLogLog s p) x -> HyperLogLog s p
Generic, HyperLogLog s p -> ()
(HyperLogLog s p -> ()) -> NFData (HyperLogLog s p)
forall a. (a -> ()) -> NFData a
forall k (s :: k) k (p :: k). HyperLogLog s p -> ()
$crnf :: forall k (s :: k) k (p :: k). HyperLogLog s p -> ()
rnf :: HyperLogLog s p -> ()
NFData)
type role HyperLogLog nominal nominal

data DefaultSipKey = DefaultSipKey

instance Reifies DefaultSipKey SipKey where
  reflect :: forall (proxy :: * -> *). proxy DefaultSipKey -> SipKey
reflect proxy DefaultSipKey
_ = Word64 -> Word64 -> SipKey
SipKey Word64
4 Word64
7

type DefaultHyperLogLog = HyperLogLog DefaultSipKey

-- | Promote a 'SipKey' to the type level for use as part of a 'HyperLogLog'

-- type.

reifySipKey :: Word64 -> Word64 -> (forall (s :: Type). Reifies s SipKey => Proxy s -> r) -> r
reifySipKey :: forall r.
Word64
-> Word64 -> (forall s. Reifies s SipKey => Proxy s -> r) -> r
reifySipKey Word64
m Word64
n forall s. Reifies s SipKey => Proxy s -> r
k = SipKey -> (forall s. Reifies s SipKey => Proxy s -> r) -> r
forall a r. a -> (forall s. Reifies s a => Proxy s -> r) -> r
reify (Word64 -> Word64 -> SipKey
SipKey Word64
m Word64
n) Proxy s -> r
forall s. Reifies s SipKey => Proxy s -> r
k

-- | Generate a fresh 'HyperLogLog' value using a randomly generated 'SipKey':

--

-- >>> generateHyperLogLog $ \(m :: HyperLogLog s 3) -> pure (runHyperLogLog m == V.fromList [0,0,0,0,0,0,0,0])

-- True

--

--

-- The 'SipKey' is generated using system entropy, so if cryptographic security

-- is a primary consideration, use this function to create a 'HyperLogLog'

-- value instead of manually building one (e.g., by using the 'HyperLogLog'

-- data constructor or by using 'mempty').

generateHyperLogLog :: Reifies p Integer => (forall (s :: Type). HyperLogLog s p -> IO r) -> IO r
generateHyperLogLog :: forall {k} (p :: k) r.
Reifies p Integer =>
(forall s. HyperLogLog s p -> IO r) -> IO r
generateHyperLogLog forall s. HyperLogLog s p -> IO r
k = do
  Word64
m <- IO Word64
generateWord64
  Word64
n <- IO Word64
generateWord64
  Word64
-> Word64
-> (forall {s}. Reifies s SipKey => Proxy s -> IO r)
-> IO r
forall r.
Word64
-> Word64 -> (forall s. Reifies s SipKey => Proxy s -> r) -> r
reifySipKey Word64
m Word64
n ((forall {s}. Reifies s SipKey => Proxy s -> IO r) -> IO r)
-> (forall {s}. Reifies s SipKey => Proxy s -> IO r) -> IO r
forall a b. (a -> b) -> a -> b
$ \(Proxy s
_ :: Proxy s) ->
    forall s. HyperLogLog s p -> IO r
k @s HyperLogLog s p
forall a. Monoid a => a
mempty
  where
    generateWord64 :: IO Word64
    generateWord64 :: IO Word64
generateWord64 = do
      ByteString
bs <- ByteString -> ByteString
BSL.fromStrict (ByteString -> ByteString) -> IO ByteString -> IO ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Int -> IO ByteString
getEntropy Int
8
      Word64 -> IO Word64
forall a. a -> IO a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word64 -> IO Word64) -> Word64 -> IO Word64
forall a b. (a -> b) -> a -> b
$ Get Word64 -> ByteString -> Word64
forall a. Get a -> ByteString -> a
Bytes.runGetL Get Word64
forall (m :: * -> *). MonadGet m => m Word64
Bytes.getWord64host ByteString
bs

-- | If the two types @p@ and @q@ reify the same configuration, and if the two

-- types @r@ and @s@ reify the same 'SipKey', then we can coerce between

-- @'HyperLogLog' r p@ and @'HyperLogLog' s q@. We do this by building a hole in

-- the @nominal@ role for the configuration parameter.

coerceConfig :: forall p q r s. (Reifies p Integer, Reifies q Integer, Reifies r SipKey, Reifies s SipKey) => Maybe (Coercion (HyperLogLog r p) (HyperLogLog s q))
coerceConfig :: forall {k} {k} {k} {k} (p :: k) (q :: k) (r :: k) (s :: k).
(Reifies p Integer, Reifies q Integer, Reifies r SipKey,
 Reifies s SipKey) =>
Maybe (Coercion (HyperLogLog r p) (HyperLogLog s q))
coerceConfig | Proxy p -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy p -> Integer
reflect (Proxy p
forall {k} (t :: k). Proxy t
Proxy :: Proxy p) Integer -> Integer -> Bool
forall a. Eq a => a -> a -> Bool
== Proxy q -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy q -> Integer
reflect (Proxy q
forall {k} (t :: k). Proxy t
Proxy :: Proxy q)
             , Proxy r -> SipKey
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy r -> SipKey
reflect (Proxy r
forall {k} (t :: k). Proxy t
Proxy :: Proxy r) SipKey -> SipKey -> Bool
forall a. Eq a => a -> a -> Bool
== Proxy s -> SipKey
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy s -> SipKey
reflect (Proxy s
forall {k} (t :: k). Proxy t
Proxy :: Proxy s) = Coercion (HyperLogLog r p) (HyperLogLog s q)
-> Maybe (Coercion (HyperLogLog r p) (HyperLogLog s q))
forall a. a -> Maybe a
Just Coercion (HyperLogLog r p) (HyperLogLog s q)
forall {k} (a :: k) (b :: k). Coercible a b => Coercion a b
Coercion
             | Bool
otherwise = Maybe (Coercion (HyperLogLog r p) (HyperLogLog s q))
forall a. Maybe a
Nothing

instance Serialize (HyperLogLog s p)

instance Serial (HyperLogLog s p) where
  serialize :: forall (m :: * -> *). MonadPut m => HyperLogLog s p -> m ()
serialize (HyperLogLog Vector Rank
v) = [Rank] -> m ()
forall a (m :: * -> *). (Serial a, MonadPut m) => a -> m ()
forall (m :: * -> *). MonadPut m => [Rank] -> m ()
serialize (Vector Rank -> [Rank]
forall a. Unbox a => Vector a -> [a]
V.toList Vector Rank
v)
  deserialize :: forall (m :: * -> *). MonadGet m => m (HyperLogLog s p)
deserialize = ([Rank] -> HyperLogLog s p) -> m [Rank] -> m (HyperLogLog s p)
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM (Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog (Vector Rank -> HyperLogLog s p)
-> ([Rank] -> Vector Rank) -> [Rank] -> HyperLogLog s p
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Rank] -> Vector Rank
forall a. Unbox a => [a] -> Vector a
V.fromList) m [Rank]
forall a (m :: * -> *). (Serial a, MonadGet m) => m a
forall (m :: * -> *). MonadGet m => m [Rank]
deserialize

instance Binary (HyperLogLog s p) where
  put :: HyperLogLog s p -> Put
put (HyperLogLog Vector Rank
v) = [Rank] -> Put
forall t. Binary t => t -> Put
Binary.put (Vector Rank -> [Rank]
forall a. Unbox a => Vector a -> [a]
V.toList Vector Rank
v)
  get :: Get (HyperLogLog s p)
get = ([Rank] -> HyperLogLog s p) -> Get [Rank] -> Get (HyperLogLog s p)
forall a b. (a -> b) -> Get a -> Get b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog (Vector Rank -> HyperLogLog s p)
-> ([Rank] -> Vector Rank) -> [Rank] -> HyperLogLog s p
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Rank] -> Vector Rank
forall a. Unbox a => [a] -> Vector a
V.fromList) Get [Rank]
forall t. Binary t => Get t
Binary.get

class HasHyperLogLog a s p | a -> s p where
  hyperLogLog :: Lens' a (HyperLogLog s p)

instance HasHyperLogLog (HyperLogLog s p) s p where
  hyperLogLog :: Lens' (HyperLogLog s p) (HyperLogLog s p)
hyperLogLog = (HyperLogLog s p -> f (HyperLogLog s p))
-> HyperLogLog s p -> f (HyperLogLog s p)
forall a. a -> a
id

-- TODO: prism to ensure the sizes are right

_HyperLogLog :: Iso' (HyperLogLog s p) (V.Vector Rank)
_HyperLogLog :: forall {k} {k} (s :: k) (p :: k) (p :: * -> * -> *) (f :: * -> *).
(Profunctor p, Functor f) =>
p (Vector Rank) (f (Vector Rank))
-> p (HyperLogLog s p) (f (HyperLogLog s p))
_HyperLogLog = (HyperLogLog s p -> Vector Rank)
-> (Vector Rank -> HyperLogLog s p)
-> Iso
     (HyperLogLog s p) (HyperLogLog s p) (Vector Rank) (Vector Rank)
forall s a b t. (s -> a) -> (b -> t) -> Iso s t a b
iso HyperLogLog s p -> Vector Rank
forall {k} {k} (s :: k) (p :: k). HyperLogLog s p -> Vector Rank
runHyperLogLog Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog
{-# INLINE _HyperLogLog #-}

instance Semigroup (HyperLogLog s p) where
  HyperLogLog Vector Rank
a <> :: HyperLogLog s p -> HyperLogLog s p -> HyperLogLog s p
<> HyperLogLog Vector Rank
b = Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog ((Rank -> Rank -> Rank) -> Vector Rank -> Vector Rank -> Vector Rank
forall a b c.
(Unbox a, Unbox b, Unbox c) =>
(a -> b -> c) -> Vector a -> Vector b -> Vector c
V.zipWith Rank -> Rank -> Rank
forall a. Ord a => a -> a -> a
max Vector Rank
a Vector Rank
b)
  {-# INLINE (<>) #-}

-- | The 'Monoid' instance \"should\" just work. Give me two estimators and I

-- can give you an estimator for the union set of the two.

--

-- Note that using 'mempty' permits the @s@ type parameter to be a fixed

-- 'SipKey', which can have cryptographic security implications. See the

-- Haddocks for 'HyperLogLog' for more details.

instance Reifies p Integer => Monoid (HyperLogLog s p) where
  mempty :: HyperLogLog s p
mempty = Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog (Vector Rank -> HyperLogLog s p) -> Vector Rank -> HyperLogLog s p
forall a b. (a -> b) -> a -> b
$ Int -> Rank -> Vector Rank
forall a. Unbox a => Int -> a -> Vector a
V.replicate (Integer -> Int
numBuckets (Proxy p -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy p -> Integer
reflect (Proxy p
forall {k} (t :: k). Proxy t
Proxy :: Proxy p))) Rank
0
  {-# INLINE mempty #-}
  mappend :: HyperLogLog s p -> HyperLogLog s p -> HyperLogLog s p
mappend = HyperLogLog s p -> HyperLogLog s p -> HyperLogLog s p
forall a. Semigroup a => a -> a -> a
(<>)
  {-# INLINE mappend #-}

siphash :: Serial a => SipKey -> a -> Word64
siphash :: forall a. Serial a => SipKey -> a -> Word64
siphash SipKey
k a
a = Word64
h
  where !bs :: ByteString
bs = Put -> ByteString
runPutS (a -> Put
forall a (m :: * -> *). (Serial a, MonadPut m) => a -> m ()
forall (m :: * -> *). MonadPut m => a -> m ()
serialize a
a)
        (SipHash !Word64
h) = SipKey -> ByteString -> SipHash
hash SipKey
k ByteString
bs
{-# INLINE siphash #-}

insert :: forall s p a. (Reifies s SipKey, Reifies p Integer, Serial a) => a -> HyperLogLog s p -> HyperLogLog s p
insert :: forall {k} {k} (s :: k) (p :: k) a.
(Reifies s SipKey, Reifies p Integer, Serial a) =>
a -> HyperLogLog s p -> HyperLogLog s p
insert = Word32 -> HyperLogLog s p -> HyperLogLog s p
forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
Word32 -> HyperLogLog s p -> HyperLogLog s p
insertHash (Word32 -> HyperLogLog s p -> HyperLogLog s p)
-> (a -> Word32) -> a -> HyperLogLog s p -> HyperLogLog s p
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word64 -> Word32
forall a. Integral a => a -> Word32
w32 (Word64 -> Word32) -> (a -> Word64) -> a -> Word32
forall b c a. (b -> c) -> (a -> b) -> a -> c
. SipKey -> a -> Word64
forall a. Serial a => SipKey -> a -> Word64
siphash (Proxy s -> SipKey
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy s -> SipKey
reflect (Proxy s
forall {k} (t :: k). Proxy t
Proxy :: Proxy s))
{-# INLINE insert #-}

-- | Insert a value that has already been hashed by whatever user defined hash function you want.

insertHash :: Reifies p Integer => Word32 -> HyperLogLog s p -> HyperLogLog s p
insertHash :: forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
Word32 -> HyperLogLog s p -> HyperLogLog s p
insertHash Word32
h m :: HyperLogLog s p
m@(HyperLogLog Vector Rank
v) = Vector Rank -> HyperLogLog s p
forall {k} {k} (s :: k) (p :: k). Vector Rank -> HyperLogLog s p
HyperLogLog (Vector Rank -> HyperLogLog s p) -> Vector Rank -> HyperLogLog s p
forall a b. (a -> b) -> a -> b
$ (forall s. MVector s Rank -> ST s ()) -> Vector Rank -> Vector Rank
forall a.
Unbox a =>
(forall s. MVector s a -> ST s ()) -> Vector a -> Vector a
V.modify (\MVector s Rank
x -> do
    Rank
old <- MVector (PrimState (ST s)) Rank -> Int -> ST s Rank
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
MV.read MVector s Rank
MVector (PrimState (ST s)) Rank
x Int
bk
    Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Rank
rnk Rank -> Rank -> Bool
forall a. Ord a => a -> a -> Bool
> Rank
old) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ MVector (PrimState (ST s)) Rank -> Int -> Rank -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
MV.write MVector s Rank
MVector (PrimState (ST s)) Rank
x Int
bk Rank
rnk
  ) Vector Rank
v where
  !n :: Integer
n = HyperLogLog s p -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy p -> Integer
reflect HyperLogLog s p
m
  !bk :: Int
bk = Integer -> Word32 -> Int
calcBucket Integer
n Word32
h
  !rnk :: Rank
rnk = Integer -> Word32 -> Rank
calcRank Integer
n Word32
h
{-# INLINE insertHash #-}

-- | Approximate size of our set

size :: Reifies p Integer => HyperLogLog s p -> Approximate Int64
size :: forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
HyperLogLog s p -> Approximate Int64
size m :: HyperLogLog s p
m@(HyperLogLog Vector Rank
bs) = Log Double -> Int64 -> Int64 -> Int64 -> Approximate Int64
forall a. Log Double -> a -> a -> a -> Approximate a
Approximate Log Double
0.9972 Int64
l Int64
expected Int64
h where
  n :: Integer
n = HyperLogLog s p -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy p -> Integer
reflect HyperLogLog s p
m
  m' :: Double
m' = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Integer -> Int
numBuckets Integer
n)
  numZeros :: Double
numZeros = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> (Vector Rank -> Int) -> Vector Rank -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Vector Rank -> Int
forall a. Unbox a => Vector a -> Int
V.length (Vector Rank -> Int)
-> (Vector Rank -> Vector Rank) -> Vector Rank -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Rank -> Bool) -> Vector Rank -> Vector Rank
forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Rank -> Rank -> Bool
forall a. Eq a => a -> a -> Bool
== Rank
0) (Vector Rank -> Double) -> Vector Rank -> Double
forall a b. (a -> b) -> a -> b
$ Vector Rank
bs
  res :: Double
res = case Double
raw Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Integer -> Double
smallRange Integer
n of
    Bool
True | Double
numZeros Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
0 -> Double
m' Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double -> Double
forall a. Floating a => a -> a
log (Double
m' Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
numZeros) -- 13.47 bits max error

         -- numZeros > 0 -> m' / 1 / (log m' - log numZeros) -- 6.47 bits max error

         | Bool
otherwise -> Double
raw
    Bool
False | Double
raw Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
<= Double
interRange -> Double
raw
          -- otherwise -> -1 * lim32 * log (1 - raw / lim32) -- 44 bits max error

          -- raw / lim32 < -1.7563532969399233e-6 -> - log (1 - (raw / lim32)) * lim32 -- 5.39 bits max error

          | Bool
otherwise -> Double
raw Double -> Double -> Double
forall a. Num a => a -> a -> a
+ (Double
raw Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
lim32) Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
raw

  raw :: Double
raw = Integer -> Double
rawFact Integer
n Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Double
1 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
sm)
  sm :: Double
sm = Vector Double -> Double
forall a. (Unbox a, Num a) => Vector a -> a
V.sum (Vector Double -> Double) -> Vector Double -> Double
forall a b. (a -> b) -> a -> b
$ (Rank -> Double) -> Vector Rank -> Vector Double
forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b
V.map (\Rank
x -> Double
1 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (Double
2 Double -> Rank -> Double
forall a b. (Fractional a, Integral b) => a -> b -> a
^^ Rank
x)) Vector Rank
bs
  expected :: Int64
expected = Double -> Int64
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round Double
res
  sd :: Double
sd = Double
1.04 Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double -> Double
forall a. Floating a => a -> a
sqrt Double
m'
  l :: Int64
l = Double -> Int64
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
floor (Double -> Int64) -> Double -> Int64
forall a b. (a -> b) -> a -> b
$ Double -> Double -> Double
forall a. Ord a => a -> a -> a
max (Double
resDouble -> Double -> Double
forall a. Num a => a -> a -> a
*(Double
1Double -> Double -> Double
forall a. Num a => a -> a -> a
-Double
3Double -> Double -> Double
forall a. Num a => a -> a -> a
*Double
sd)) Double
0
  h :: Int64
h = Double -> Int64
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling (Double -> Int64) -> Double -> Int64
forall a b. (a -> b) -> a -> b
$ Double
resDouble -> Double -> Double
forall a. Num a => a -> a -> a
*(Double
1Double -> Double -> Double
forall a. Num a => a -> a -> a
+Double
3Double -> Double -> Double
forall a. Num a => a -> a -> a
*Double
sd)
{-# INLINE size #-}
#ifdef HERBIE
{-# ANN size "NoHerbie" #-}
#endif

intersectionSize :: Reifies p Integer => [HyperLogLog s p] -> Approximate Int64
intersectionSize :: forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
[HyperLogLog s p] -> Approximate Int64
intersectionSize [] = Approximate Int64
0
intersectionSize (HyperLogLog s p
x:[HyperLogLog s p]
xs) = Int64 -> Approximate Int64 -> Approximate Int64
forall a. Ord a => a -> Approximate a -> Approximate a
withMin Int64
0 (Approximate Int64 -> Approximate Int64)
-> Approximate Int64 -> Approximate Int64
forall a b. (a -> b) -> a -> b
$ HyperLogLog s p -> Approximate Int64
forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
HyperLogLog s p -> Approximate Int64
size HyperLogLog s p
x Approximate Int64 -> Approximate Int64 -> Approximate Int64
forall a. Num a => a -> a -> a
+ [HyperLogLog s p] -> Approximate Int64
forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
[HyperLogLog s p] -> Approximate Int64
intersectionSize [HyperLogLog s p]
xs Approximate Int64 -> Approximate Int64 -> Approximate Int64
forall a. Num a => a -> a -> a
- [HyperLogLog s p] -> Approximate Int64
forall {k} {k} (p :: k) (s :: k).
Reifies p Integer =>
[HyperLogLog s p] -> Approximate Int64
intersectionSize (HyperLogLog s p -> HyperLogLog s p -> HyperLogLog s p
forall a. Monoid a => a -> a -> a
mappend HyperLogLog s p
x (HyperLogLog s p -> HyperLogLog s p)
-> [HyperLogLog s p] -> [HyperLogLog s p]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [HyperLogLog s p]
xs)
{-# INLINE intersectionSize #-}

cast :: forall p q s. (Reifies p Integer, Reifies q Integer) => HyperLogLog s p -> Maybe (HyperLogLog s q)
cast :: forall {k} {k} {k} (p :: k) (q :: k) (s :: k).
(Reifies p Integer, Reifies q Integer) =>
HyperLogLog s p -> Maybe (HyperLogLog s q)
cast HyperLogLog s p
old
  | Int
newBuckets Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
oldBuckets = HyperLogLog s q -> Maybe (HyperLogLog s q)
forall a. a -> Maybe a
Just (HyperLogLog s q -> Maybe (HyperLogLog s q))
-> HyperLogLog s q -> Maybe (HyperLogLog s q)
forall a b. (a -> b) -> a -> b
$ ASetter
  (HyperLogLog s q) (HyperLogLog s q) (Vector Rank) (Vector Rank)
-> (Vector Rank -> Vector Rank)
-> HyperLogLog s q
-> HyperLogLog s q
forall s t a b. ASetter s t a b -> (a -> b) -> s -> t
over ASetter
  (HyperLogLog s q) (HyperLogLog s q) (Vector Rank) (Vector Rank)
forall {k} {k} (s :: k) (p :: k) (p :: * -> * -> *) (f :: * -> *).
(Profunctor p, Functor f) =>
p (Vector Rank) (f (Vector Rank))
-> p (HyperLogLog s p) (f (HyperLogLog s p))
_HyperLogLog ((Vector Rank -> Vector Rank)
 -> HyperLogLog s q -> HyperLogLog s q)
-> HyperLogLog s q
-> (Vector Rank -> Vector Rank)
-> HyperLogLog s q
forall (f :: * -> *) a b. Functor f => f (a -> b) -> a -> f b
?? HyperLogLog s q
forall a. Monoid a => a
mempty ((Vector Rank -> Vector Rank) -> HyperLogLog s q)
-> (Vector Rank -> Vector Rank) -> HyperLogLog s q
forall a b. (a -> b) -> a -> b
$ (forall s. MVector s Rank -> ST s ()) -> Vector Rank -> Vector Rank
forall a.
Unbox a =>
(forall s. MVector s a -> ST s ()) -> Vector a -> Vector a
V.modify ((forall s. MVector s Rank -> ST s ())
 -> Vector Rank -> Vector Rank)
-> (forall s. MVector s Rank -> ST s ())
-> Vector Rank
-> Vector Rank
forall a b. (a -> b) -> a -> b
$ \MVector s Rank
m ->
    Vector (Int, Rank) -> ((Int, Rank) -> ST s ()) -> ST s ()
forall (m :: * -> *) a b.
(Monad m, Unbox a) =>
Vector a -> (a -> m b) -> m ()
V.forM_ (Vector Rank -> Vector (Int, Rank)
forall a. Unbox a => Vector a -> Vector (Int, a)
V.indexed (Vector Rank -> Vector (Int, Rank))
-> Vector Rank -> Vector (Int, Rank)
forall a b. (a -> b) -> a -> b
$ HyperLogLog s p
oldHyperLogLog s p
-> Getting (Vector Rank) (HyperLogLog s p) (Vector Rank)
-> Vector Rank
forall s a. s -> Getting a s a -> a
^.Getting (Vector Rank) (HyperLogLog s p) (Vector Rank)
forall {k} {k} (s :: k) (p :: k) (p :: * -> * -> *) (f :: * -> *).
(Profunctor p, Functor f) =>
p (Vector Rank) (f (Vector Rank))
-> p (HyperLogLog s p) (f (HyperLogLog s p))
_HyperLogLog) (((Int, Rank) -> ST s ()) -> ST s ())
-> ((Int, Rank) -> ST s ()) -> ST s ()
forall a b. (a -> b) -> a -> b
$ \ (Int
i,Rank
o) -> do
      let j :: Int
j = Int -> Int -> Int
forall a. Integral a => a -> a -> a
mod Int
i Int
newBuckets
      Rank
a <- MVector (PrimState (ST s)) Rank -> Int -> ST s Rank
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> m a
MV.read MVector s Rank
MVector (PrimState (ST s)) Rank
m Int
j
      MVector (PrimState (ST s)) Rank -> Int -> Rank -> ST s ()
forall (m :: * -> *) a.
(PrimMonad m, Unbox a) =>
MVector (PrimState m) a -> Int -> a -> m ()
MV.write MVector s Rank
MVector (PrimState (ST s)) Rank
m Int
j (Rank -> Rank -> Rank
forall a. Ord a => a -> a -> a
max Rank
o Rank
a)
  | Bool
otherwise = Maybe (HyperLogLog s q)
forall a. Maybe a
Nothing -- TODO?

  where
  newBuckets :: Int
newBuckets = Integer -> Int
numBuckets (Proxy q -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy q -> Integer
reflect (Proxy q
forall {k} (t :: k). Proxy t
Proxy :: Proxy q))
  oldBuckets :: Int
oldBuckets = Integer -> Int
numBuckets (HyperLogLog s p -> Integer
forall {k} (s :: k) a (proxy :: k -> *).
Reifies s a =>
proxy s -> a
forall (proxy :: k -> *). proxy p -> Integer
reflect HyperLogLog s p
old)
{-# INLINE cast #-}