{-# LANGUAGE BangPatterns, CPP, Rank2Types,
    TypeOperators,FlexibleContexts #-}

-- |
-- Module: Data.BloomFilter.Mutable
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--
-- This module provides low-level control.  For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.

module Data.BloomFilter.Mutable
    (
    -- * Overview
    -- $overview

    -- ** Ease of use
    -- $ease

    -- ** Performance
    -- $performance

    -- * Types
      Hash
    , MBloom
    -- * Mutable Bloom filters

    -- ** Creation
    , new

    -- ** Accessors
    , length
    , elem

    -- ** Mutation
    , insert

    -- * The underlying representation
    -- | If you serialize the raw bit arrays below to disk, do not
    -- expect them to be portable to systems with different
    -- conventions for endianness or word size.

    -- | The raw bit array used by the mutable 'MBloom' type.
    , bitArray
    ) where

#include "MachDeps.h"

import Control.Monad (liftM, forM_)
import Control.Monad.ST (ST)
import Data.Array.Base (unsafeRead, unsafeWrite)
import Data.Bits ((.&.), (.|.), unsafeShiftL, unsafeShiftR)
import Data.BloomFilter.Array (newArray)
import Data.BloomFilter.Util ((:*)(..), nextPowerOfTwo)
import Data.Word (Word32)
import Data.BloomFilter.Mutable.Internal

import Prelude hiding (elem, length, notElem,
                       (/), (*), div, divMod, mod, rem)

-- | Create a new mutable Bloom filter.  For efficiency, the number of
-- bits used may be larger than the number requested.  It is always
-- rounded up to the nearest higher power of two, but will be clamped
-- at a maximum of 4 gigabits, since hashes are 32 bits in size.
new :: (a -> [Hash])          -- ^ family of hash functions to use
    -> Int                    -- ^ number of bits in filter
    -> ST s (MBloom s a)
new :: forall a s. (a -> [Hash]) -> Int -> ST s (MBloom s a)
new a -> [Hash]
hash Int
numBits = forall s a.
(a -> [Hash]) -> Int -> Int -> STUArray s Int Hash -> MBloom s a
MB a -> [Hash]
hash Int
shft Int
msk forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM` forall e s.
MArray (STUArray s) e (ST s) =>
Int -> Int -> ST s (STUArray s Int e)
newArray Int
numElems Int
numBytes
  where twoBits :: Int
twoBits | Int
numBits forall a. Ord a => a -> a -> Bool
< Int
1 = Int
1
                | Int
numBits forall a. Ord a => a -> a -> Bool
> Int
maxHash = Int
maxHash
                | forall {a}. (Bits a, Num a) => a -> Bool
isPowerOfTwo Int
numBits = Int
numBits
                | Bool
otherwise = Int -> Int
nextPowerOfTwo Int
numBits
        numElems :: Int
numElems = forall a. Ord a => a -> a -> a
max Int
2 (Int
twoBits forall a. Bits a => a -> Int -> a
`unsafeShiftR` Int
logBitsInHash)
        numBytes :: Int
numBytes = Int
numElems forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
logBytesInHash
        trueBits :: Int
trueBits = Int
numElems forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
logBitsInHash
        shft :: Int
shft     = Int -> Int
logPower2 Int
trueBits
        msk :: Int
msk      = Int
trueBits forall a. Num a => a -> a -> a
- Int
1
        isPowerOfTwo :: a -> Bool
isPowerOfTwo a
n = a
n forall a. Bits a => a -> a -> a
.&. (a
n forall a. Num a => a -> a -> a
- a
1) forall a. Eq a => a -> a -> Bool
== a
0

maxHash :: Int
#if WORD_SIZE_IN_BITS == 64
maxHash :: Int
maxHash = Int
4294967296
#else
maxHash = 1073741824
#endif

logBitsInHash :: Int
logBitsInHash :: Int
logBitsInHash = Int
5 -- logPower2 bitsInHash

logBytesInHash :: Int
logBytesInHash :: Int
logBytesInHash = Int
2 -- logPower2 (sizeOf (undefined :: Hash))

-- | Given a filter's mask and a hash value, compute an offset into
-- a word array and a bit offset within that word.
hashIdx :: Int -> Word32 -> (Int :* Int)
hashIdx :: Int -> Hash -> Int :* Int
hashIdx Int
msk Hash
x = (Int
y forall a. Bits a => a -> Int -> a
`unsafeShiftR` Int
logBitsInHash) forall a b. a -> b -> a :* b
:* (Int
y forall a. Bits a => a -> a -> a
.&. Int
hashMask)
  where hashMask :: Int
hashMask = Int
31 -- bitsInHash - 1
        y :: Int
y = forall a b. (Integral a, Num b) => a -> b
fromIntegral Hash
x forall a. Bits a => a -> a -> a
.&. Int
msk

-- | Hash the given value, returning a list of (word offset, bit
-- offset) pairs, one per hash value.
hashesM :: MBloom s a -> a -> [Int :* Int]
hashesM :: forall s a. MBloom s a -> a -> [Int :* Int]
hashesM MBloom s a
mb a
elt = Int -> Hash -> Int :* Int
hashIdx (forall s a. MBloom s a -> Int
mask MBloom s a
mb) forall a b. (a -> b) -> [a] -> [b]
`map` forall s a. MBloom s a -> a -> [Hash]
hashes MBloom s a
mb a
elt

-- | Insert a value into a mutable Bloom filter.  Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: MBloom s a -> a -> ST s ()
insert :: forall s a. MBloom s a -> a -> ST s ()
insert MBloom s a
mb a
elt = do
  let mu :: STUArray s Int Hash
mu = forall s a. MBloom s a -> STUArray s Int Hash
bitArray MBloom s a
mb
  forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (forall s a. MBloom s a -> a -> [Int :* Int]
hashesM MBloom s a
mb a
elt) forall a b. (a -> b) -> a -> b
$ \(Int
word :* Int
bit) -> do
      Hash
old <- forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> Int -> m e
unsafeRead STUArray s Int Hash
mu Int
word
      forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> Int -> e -> m ()
unsafeWrite STUArray s Int Hash
mu Int
word (Hash
old forall a. Bits a => a -> a -> a
.|. (Hash
1 forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
bit))

-- | Query a mutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: a -> MBloom s a -> ST s Bool
elem :: forall a s. a -> MBloom s a -> ST s Bool
elem a
elt MBloom s a
mb = forall {m :: * -> *}.
MArray (STUArray s) Hash m =>
[Int :* Int] -> m Bool
loop (forall s a. MBloom s a -> a -> [Int :* Int]
hashesM MBloom s a
mb a
elt)
  where mu :: STUArray s Int Hash
mu = forall s a. MBloom s a -> STUArray s Int Hash
bitArray MBloom s a
mb
        loop :: [Int :* Int] -> m Bool
loop ((Int
word :* Int
bit):[Int :* Int]
wbs) = do
          Hash
i <- forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> Int -> m e
unsafeRead STUArray s Int Hash
mu Int
word
          if Hash
i forall a. Bits a => a -> a -> a
.&. (Hash
1 forall a. Bits a => a -> Int -> a
`unsafeShiftL` Int
bit) forall a. Eq a => a -> a -> Bool
== Hash
0
            then forall (m :: * -> *) a. Monad m => a -> m a
return Bool
False
            else [Int :* Int] -> m Bool
loop [Int :* Int]
wbs
        loop [Int :* Int]
_ = forall (m :: * -> *) a. Monad m => a -> m a
return Bool
True

-- bitsInHash :: Int
-- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3

-- | Return the size of a mutable Bloom filter, in bits.
length :: MBloom s a -> Int
length :: forall s a. MBloom s a -> Int
length = forall a. Bits a => a -> Int -> a
unsafeShiftL Int
1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall s a. MBloom s a -> Int
shift


-- | Slow, crummy way of computing the integer log of an integer known
-- to be a power of two.
logPower2 :: Int -> Int
logPower2 :: Int -> Int
logPower2 Int
k = forall {t} {t}. (Num t, Num t, Bits t) => t -> t -> t
go Int
0 Int
k
    where go :: t -> t -> t
go t
j t
1 = t
j
          go t
j t
n = t -> t -> t
go (t
jforall a. Num a => a -> a -> a
+t
1) (t
n forall a. Bits a => a -> Int -> a
`unsafeShiftR` Int
1)

-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter.  Note that
--   a filter is fixed in size; it cannot be resized after creation.
--
-- * A function that accepts a value, and should return a fixed-size
--   list of hashes of that value.  To keep the false positive rate
--   low, the hashes computes should, as far as possible, be
--   independent.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate.  The @suggestSizing@ function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.

-- $ease
--
-- This module provides both mutable interfaces for creating and
-- querying a Bloom filter.  It is most useful as a low-level way to
-- manage a Bloom filter with a custom set of characteristics.

-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.
--
-- For efficiency, the number of bits requested when creating a Bloom
-- filter is rounded up to the nearest power of two.  This lets the
-- implementation use bitwise operations internally, instead of much
-- more expensive multiplication, division, and modulus operations.