{-# LANGUAGE CPP , OverloadedStrings #-} module Main where # ifdef ASSERTIONS_ON # error "Sorry, please reconfigure without -finstrumented so that we turn off assertions in library code." # endif import Criterion.Main import Control.DeepSeq import Control.Monad import Control.Concurrent import qualified Data.Text as T import Control.Concurrent.BloomFilter.Internal import qualified Control.Concurrent.BloomFilter as Bloom import Data.Hashabler import qualified Data.Set as Set import qualified Data.HashSet as HashSet -- import System.IO.Unsafe(unsafePerformIO) import System.Random -- TODO comparisons with: -- - pure Set -- - best in class Int (or other specialized) hash map or trie -- - general hashmap (of Hashable things) -- - the above, wrapped in an IORef or MVar main :: IO () main = do assertionsOn <- assertionCanary when assertionsOn $ putStrLn $ "!!! WARNING !!! assertions are enabled in library code and may result in " ++"slower than realistic benchmarks. Try configuring without -finstrumented" procs <- getNumCapabilities if procs < 2 then putStrLn "!!! WARNING !!!: Some benchmarks are only valid if more than 1 core is available" else return () b_5_20 <- Bloom.new (SipKey 1 1) 5 20 b_13_20 <- Bloom.new (SipKey 1 1) 13 20 -- needs 128 -- This has 0.3% fpr for 10000 elements, so I think can be fairly compared b_text <- Bloom.new (SipKey 11 22) 3 12 let txt = "orange" :: T.Text let g = mkStdGen 8973459 chars = randoms g :: [Char] fakeWords = go chars go :: [Char] -> [String] go [] = error "noninfinite list" go (s:ss) = let (a,as) = splitAt 3 ss (b,bs) = splitAt 5 as (c,cs) = splitAt 5 bs (d,ds) = splitAt 6 cs (e,es) = splitAt 8 ds in [s]:a:b:c:d:e:(go es) let textWords10k = map T.pack $ take 10000 fakeWords (wds5k_0, wds5k_1) = splitAt 5000 textWords10k deepseq textWords10k $ return () let hashset10 = HashSet.fromList $ take 10 textWords10k let hashset100 = HashSet.fromList $ take 100 textWords10k let hashset10000 = HashSet.fromList $ take 10000 textWords10k let set10 = Set.fromList $ take 10 textWords10k let set100 = Set.fromList $ take 100 textWords10k let set10000 = Set.fromList $ take 10000 textWords10k defaultMain [ bgroup "internals" [ bench "membershipWordAndBits64" $ nf (membershipWordAndBits64 (Hash64 1)) b_5_20 , bench "membershipWordAndBits128" $ nf (membershipWordAndBits128 (Hash128 1 1)) b_13_20 ], -- For comparing cache behavior with perf, against below: bgroup "HashSet" $ [ bench "10K insert" $ whnf (HashSet.fromList) textWords10k ], bgroup "Set" $ [ bench "10K insert" $ whnf (Set.fromList) textWords10k ], bgroup "different sizes" $ let benches b = [ bench "10K inserts" $ whnfIO $ manyInserts b textWords10k , bench "10K lookups" $ whnfIO $ manyLookups b textWords10k ] in [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bgroup "4096" (benches b) , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> bgroup "16384" (benches b) , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> bgroup "65536" (benches b) , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> bgroup "1MB" (benches b) , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> bgroup "8MB" (benches b) , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> bgroup "64MB" (benches b) ] , bgroup "different sizes (concurrency)" $ {- -- TODO factor out cost of 'new' in some better way: [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bench "bigInsertLookup 15k ops" $ whnfIO (largeInsertQueryBench b wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (4096)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (16384)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (65536)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (1MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (8MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> bench "bigInsertLookup 15k ops across two threads (64MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1) -} let benches b = [ bench "10K inserts, across 2 threads" $ whnfIO $ manyInsertsTwoThreads b wds5k_0 wds5k_1 , bench "10K lookups, across 2 threads" $ whnfIO $ manyLookupsTwoThreads b wds5k_0 wds5k_1 ] in [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bgroup "4096" (benches b) , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> bgroup "16384" (benches b) , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> bgroup "65536" (benches b) , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> bgroup "1MB" (benches b) , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> bgroup "8MB" (benches b) , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> bgroup "64MB" (benches b) ] , bgroup "lookup insert" [ bench "siphash64_1_3 for comparison" $ whnf (siphash64_1_3 (SipKey 1 1)) (1::Int) -- best case, with no cache effects (I think): , bench "lookup (64)" $ whnfIO (Bloom.lookup b_5_20 (1::Int)) , bench "lookup x10 (64)" $ nfIO (mapM_ (Bloom.lookup b_5_20) [1..10]) , bench "lookup x100 (64)" $ nfIO (mapM_ (Bloom.lookup b_5_20) [1..100]) , bench "lookup (128)" $ whnfIO (Bloom.lookup b_13_20 (1::Int)) , bench "lookup x10 (128)" $ nfIO (mapM_ (Bloom.lookup b_13_20) [1..10]) , bench "lookup x100 (128)" $ nfIO (mapM_ (Bloom.lookup b_13_20) [1..100]) , bench "insert (64)" $ whnfIO (Bloom.insert b_5_20 (1::Int)) , bench "insert x10 (64)" $ nfIO (mapM_ (Bloom.insert b_5_20) [1..10]) , bench "insert x100 (64)" $ nfIO (mapM_ (Bloom.insert b_5_20) [1..100]) , bench "insert (128)" $ whnfIO (Bloom.insert b_13_20 (1::Int)) , bench "insert x10 (128)" $ nfIO (mapM_ (Bloom.insert b_13_20) [1..10]) , bench "insert x100 (128)" $ nfIO (mapM_ (Bloom.insert b_13_20) [1..100]) ], bgroup "comparisons micro" [ bench "(just siphash64_1_3 on txt for below)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text) , bench "Bloom.insert (64)" $ whnfIO (Bloom.insert b_text txt) {- I was concerned that the above might not be valid (perhaps the - hashing of the Text value was getting reused?), but the following - convinced me it's all right; we can see differences in size of input - string reflected in all these benchmarks. I believe bloomInsertPure1 - reflects the inability to inline Hashable instance machinery (since - it must remain polymorphic. , bench "Bloom.insert (64)(validation1)" $ whnf (bloomInsertPure1 b_text) txt , bench "Bloom.insert (64)(validation2)" $ whnf (bloomInsertPure2 b_text) txt , bench "Bloom.insert (64)(validation3)" $ whnfIO (Bloom.insert b_text "ora") , bench "Bloom.insert (64)(validation4)" $ whnf (bloomInsertPure1 b_text) "ora" , bench "Bloom.insert (64)(validation5)" $ whnf (bloomInsertPure2 b_text) "ora" , bench "(validation orange)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text) , bench "(validation ora)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("ora"::T.Text) -} , bench "Set.insert into 10" $ whnf (\t-> Set.insert t set10) txt , bench "Set.insert into 100" $ whnf (\t-> Set.insert t set100) txt , bench "Set.insert into 10000" $ whnf (\t-> Set.insert t set10000) txt , bench "HashSet.insert into 10" $ whnf (\t-> HashSet.insert t hashset10) txt , bench "HashSet.insert into 100" $ whnf (\t-> HashSet.insert t hashset100) txt , bench "HashSet.insert into 10000" $ whnf (\t-> HashSet.insert t hashset10000) txt , bench "Bloom.lookup (64)" $ whnfIO (Bloom.lookup b_text txt) , bench "Set.member of 10" $ whnf (\t-> Set.member t set10) txt , bench "Set.member of 100" $ whnf (\t-> Set.member t set100) txt , bench "Set.member of 10000" $ whnf (\t-> Set.member t set10000) txt , bench "HashSet.member of 10" $ whnf (\t-> HashSet.member t hashset10) txt , bench "HashSet.member of 100" $ whnf (\t-> HashSet.member t hashset100) txt , bench "HashSet.member of 10000" $ whnf (\t-> HashSet.member t hashset10000) txt ], bgroup "comparisons big" [ ], -- TODO large random lookup and insert benchmark, comparing with single-thread and then with work split. -- make this how we compare as well? -- Do this for various types of elements bgroup "combining and creation" [ -- These timings can be subtracted from union timings: bench "new 14" $ whnfIO $ Bloom.new (SipKey 1 1) 3 14 , bench "new 20" $ whnfIO $ Bloom.new (SipKey 1 1) 3 20 , bench "unionInto (14 -> 14)" $ whnfIO $ unionBench 14 14 , bench "unionInto (20 -> 14)" $ whnfIO $ unionBench 20 14 -- 20 is 6x , bench "unionInto (20 -> 20)" $ whnfIO $ unionBench 20 20 ] ] unionBench :: Int -> Int -> IO () unionBench bigl littlel = do b1 <- Bloom.new (SipKey 1 1) 3 bigl b2 <- Bloom.new (SipKey 1 1) 3 littlel b1 `Bloom.unionInto` b2 instance NFData (BloomFilter a) where rnf _ = () {- -- TODO fix both of these and compare with Set/HashSet (wrapped in IORef or MVar for second) largeInsertQueryBench :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO () largeInsertQueryBench b payload antipayload = do forM_ payload $ Bloom.insert b forM_ (zip payload antipayload) $ \(x,y)-> do --- can't test, since we're re-using bloom: _xOk <- Bloom.lookup b x _yOk <- Bloom.lookup b y -- usually False -- unless (xOk) $ error "largeInsertQueryBench" return () largeInsertQueryBenchTwoThreads :: Bloom.BloomFilter T.Text -> Int -> [T.Text] -> [T.Text] -> IO () largeInsertQueryBenchTwoThreads b length_payload payload antipayload = do t0 <- newEmptyMVar t1 <- newEmptyMVar let (payload0,payload1) = splitAt (length_payload `div` 2) payload let (antipayload0,antipayload1) = splitAt (length_payload `div` 2) antipayload let go pld antpld v = do forM_ pld $ Bloom.insert b forM_ (zip pld antpld) $ \(x,y)-> do _xOk <- Bloom.lookup b x _yOk <- Bloom.lookup b y -- usually False -- unless (xOk) $ error "largeInsertQueryBench" return () putMVar v () void $ forkIO $ go payload0 antipayload0 t0 void $ forkIO $ go payload1 antipayload1 t1 takeMVar t0 >> takeMVar t1 -} -- These are mostly to check cache behavior, and I don't expect it to matter -- whether a bloom filter was already "filled with elements" or not. manyInserts :: Bloom.BloomFilter T.Text -> [T.Text] -> IO () manyInserts b payload = do forM_ payload (void . Bloom.insert b) manyLookups :: Bloom.BloomFilter T.Text -> [T.Text] -> IO () manyLookups b payload = do forM_ payload (void . Bloom.lookup b) manyInsertsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO () manyInsertsTwoThreads b payload0 payload1 = do t0 <- newEmptyMVar t1 <- newEmptyMVar let go pld v = manyInserts b pld >> putMVar v () void $ forkIO $ go payload0 t0 void $ forkIO $ go payload1 t1 takeMVar t0 >> takeMVar t1 manyLookupsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO () manyLookupsTwoThreads b payload0 payload1 = do t0 <- newEmptyMVar t1 <- newEmptyMVar let go pld v = manyLookups b pld >> putMVar v () void $ forkIO $ go payload0 t0 void $ forkIO $ go payload1 t1 takeMVar t0 >> takeMVar t1 {- -- So we can use whnf, and make sure hashes aren't being cached {-# NOINLINE bloomInsertPure1 #-} bloomInsertPure1 :: Hashable a => BloomFilter a -> a -> Bool bloomInsertPure1 b = unsafePerformIO . Bloom.insert b bloomInsertPure2 :: Hashable a => BloomFilter a -> a -> Bool bloomInsertPure2 b = unsafePerformIO . Bloom.insert b -}