{-# LANGUAGE CPP , OverloadedStrings #-}
-- | Criterion benchmark suite for unagi-bloomfilter: micro-benchmarks of the
-- filter internals, insert/lookup at several filter sizes (single- and
-- two-threaded), and comparisons against 'Data.Set' and 'Data.HashSet'.
module Main where

# ifdef ASSERTIONS_ON
# error "Sorry, please reconfigure without -finstrumented so that we turn off assertions in library code."
# endif

import Criterion.Main
import Control.DeepSeq
import Control.Monad
import Control.Concurrent
import qualified Data.Text as T
import Data.List

import Control.Concurrent.BloomFilter.Internal
import qualified Control.Concurrent.BloomFilter as Bloom
import Data.Hashabler
import qualified Data.Set as Set
import qualified Data.HashSet as HashSet
-- import System.IO.Unsafe(unsafePerformIO)

import System.Random

-- TODO comparisons with:
--   - pure Set
--   - best in class Int (or other specialized) hash map or trie
--   - general hashmap (of Hashable things)
--   - the above, wrapped in an IORef or MVar

main :: IO ()
main = do
    -- Warn (rather than abort) if library assertions are compiled in, since
    -- they would skew timings.
    assertionsOn <- assertionCanary
    when assertionsOn $
        putStrLn $ "!!! WARNING !!! assertions are enabled in library code and may result in "
            ++"slower than realistic benchmarks. Try configuring without -finstrumented"
    procs <- getNumCapabilities
    if procs < 2
        then putStrLn "!!! WARNING !!!: Some benchmarks are only valid if more than 1 core is available"
        else return ()

    -- TODO make this a function will call in 'env'
    -- Fixed seed so every run benchmarks the same pseudo-random word list.
    let g = mkStdGen 8973459
        chars = randoms g :: [Char]
        fakeWords = go chars
        -- Chop the infinite Char stream into pseudo-words of lengths
        -- 1,3,5,5,6,8 repeating, to get a mix of short string sizes.
        go :: [Char] -> [String]
        go [] = error "noninfinite list"
        go (s:ss) =
            let (a,as) = splitAt 3 ss
                (b,bs) = splitAt 5 as
                (c,cs) = splitAt 5 bs
                (d,ds) = splitAt 6 cs
                (e,es) = splitAt 8 ds
             in [s]:a:b:c:d:e:(go es)
    let textWords10k = map T.pack $ take 10000 fakeWords
        (wds5k_0, wds5k_1) = splitAt 5000 textWords10k
    -- Force the whole word list up front so list generation isn't measured.
    deepseq textWords10k $ return ()

    let txt = "orange" :: T.Text
        -- so half are in set and half are not:
        txt10New, txt10Mix :: [T.Text]
        txt10New = take 10 $ reverse textWords10k
        txt10Mix = concatMap (\(x,y)->[x,y]) $ zip textWords10k (take 5 txt10New)

    defaultMain
      [ bgroup "internals"
          [ env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
              bench "membershipWordAndBits64" $ nf (membershipWordAndBits64 (Hash64 1)) b
          , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
              bench "membershipWordAndBits128" $ nf (membershipWordAndBits128 (Hash128 1 1)) b
          ],
        -- For comparing cache behavior with perf, against below:
        bgroup "HashSet" $
          [ -- NOTE(review): label says "10K insert" but this builds from
            -- wds5k_0 (5000 elements) — confirm which was intended.
            bench "10K insert" $ whnf (HashSet.fromList) wds5k_0
          , env (return $ HashSet.fromList textWords10k) $ \ ~hs ->
              -- NOTE(review): label says "5k elems" but the set holds all
              -- 10k words and all 10k are looked up — verify label.
              bench "10K lookups on 5k elems" $ whnf (foldl1' (==) . map (\t->HashSet.member t hs)) textWords10k
          ],
        bgroup "Set" $
          [ bench "10K insert" $ whnf (Set.fromList) textWords10k
          , env (return $ Set.fromList textWords10k) $ \ ~hs ->
              bench "10K lookups on 5k elems" $ whnf (foldl1' (==) . map (\t->Set.member t hs)) textWords10k
          ],
        -- Same insert/lookup workload over filters of increasing size, to
        -- expose cache effects as the bit array grows.
        bgroup "different sizes" $
          let benches b = [ bench "10K inserts" $ whnfIO $ manyInserts b textWords10k
                          , bench "10K lookups" $ whnfIO $ manyLookups b textWords10k
                          ]
           in [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bgroup "4096" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> bgroup "16384" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> bgroup "65536" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> bgroup "1MB" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> bgroup "8MB" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> bgroup "64MB" (benches b)
              ]
      , bgroup "different sizes (concurrency)" $
          {-
          -- TODO factor out cost of 'new' in some better way:
          [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b ->
              bench "bigInsertLookup 15k ops" $ whnfIO (largeInsertQueryBench b wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (4096)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (16384)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (65536)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (1MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (8MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b ->
              bench "bigInsertLookup 15k ops across two threads (64MB)" $ whnfIO (largeInsertQueryBenchTwoThreads b 5000 wds5k_0 wds5k_1)
          -}
          let benches b = [ bench "10K inserts, across 2 threads" $ whnfIO $ manyInsertsTwoThreads b wds5k_0 wds5k_1
                          , bench "10K lookups, across 2 threads" $ whnfIO $ manyLookupsTwoThreads b wds5k_0 wds5k_1
                          ]
           in [ env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b -> bgroup "4096" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 14) $ \ ~b -> bgroup "16384" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 16) $ \ ~b -> bgroup "65536" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 20) $ \ ~b -> bgroup "1MB" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 24) $ \ ~b -> bgroup "8MB" (benches b)
              , env (Bloom.new (SipKey 11 22) 3 27) $ \ ~b -> bgroup "64MB" (benches b)
              ]
      , -- Single-op latency for lookup/insert, at various k/log2l parameter
        -- pairs; the bare siphash benches give a lower-bound baseline.
        bgroup "lookup insert"
          [ bgroup "Int"
              [ bench "siphash64_1_3 for comparison" $ whnf (siphash64_1_3 (SipKey 1 1)) (1::Int)
              , bench "siphash128 for comparison" $ whnf (siphash128 (SipKey 1 1)) (1::Int)
              , env (Bloom.new (SipKey 1 1) 3 12) $ \ ~b->
                  bgroup "3 12 (64-bit hash)"
                    [ -- best case, with no cache effects (I think):
                      bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
                    , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
                    ]
              , env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
                  bgroup "5 20 (64-bit hash)"
                    [ -- best case, with no cache effects (I think):
                      bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
                    , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
                    ]
              , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
                  bgroup "13 20 (128-bit hash)"
                    [ bench "lookup x1" $ whnfIO (Bloom.lookup b (1::Int))
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) [1..10])
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) [1..100])
                    , bench "insert x1" $ whnfIO (Bloom.insert b (1::Int))
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) [1..10])
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) [1..100])
                    ]
              ],
            bgroup "Text"
              [ bench "siphash64_1_3 for comparison" $ whnf (siphash64_1_3 (SipKey 1 1)) txt
              , bench "siphash128 for comparison" $ whnf (siphash128 (SipKey 1 1)) txt
              , env (Bloom.new (SipKey 1 1) 3 12) $ \ ~b->
                  bgroup "3 12 (64-bit hash)"
                    [ -- best case, with no cache effects (I think):
                      bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
                    , bench "insert x1" $ whnfIO (Bloom.insert b txt)
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
                    ]
              , env (Bloom.new (SipKey 1 1) 5 20) $ \ ~b->
                  bgroup "5 20 (64-bit hash)"
                    [ -- best case, with no cache effects (I think):
                      bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
                    , bench "insert x1" $ whnfIO (Bloom.insert b txt)
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
                    ]
              , env (Bloom.new (SipKey 1 1) 13 20) $ \ ~b->
                  bgroup "13 20 (128-bit hash)"
                    [ bench "lookup x1" $ whnfIO (Bloom.lookup b txt)
                    , bench "lookup x10" $ nfIO (mapM_ (Bloom.lookup b) (take 10 textWords10k))
                    , bench "lookup x100" $ nfIO (mapM_ (Bloom.lookup b) (take 100 textWords10k))
                    , bench "insert x1" $ whnfIO (Bloom.insert b txt)
                    , bench "insert x10" $ nfIO (mapM_ (Bloom.insert b) (take 10 textWords10k))
                    , bench "insert x100" $ nfIO (mapM_ (Bloom.insert b) (take 100 textWords10k))
                    ]
              ]
          ],
        --
        -- TODO check TO SEE HOW THINGS LOOK BEFORE AND AFTER UNFOLDING CHANGE,
        --      MAYBE TRY DOING inserts/lookups x10 here.
        -- 3x12 insert went from 51.8 to 49 (below)
        -- 5x20 insert went from 59.1 to 47.6 (in "lookup insert")
        bgroup "comparisons micro x1 "
          [ bench "(just siphash64_1_3 on txt for below)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text)
            -- This has 0.3% fpr for 10000 elements, so I think can be fairly compared
          , env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b_text->
              bgroup "unagi-bloomfilter 3 12"
                [ bench "insert" $ whnfIO (Bloom.insert b_text txt)
                  {- I was concerned that the above might not be valid (perhaps the
                   - hashing of the Text value was getting reused?), but the following
                   - convinced me it's all right; we can see differences in size of input
                   - string reflected in all these benchmarks. I believe bloomInsertPure1
                   - reflects the inability to inline Hashable instance machinery (since
                   - it must remain polymorphic.
                  , bench "Bloom.insert (64)(validation1)" $ whnf (bloomInsertPure1 b_text) txt
                  , bench "Bloom.insert (64)(validation2)" $ whnf (bloomInsertPure2 b_text) txt
                  , bench "Bloom.insert (64)(validation3)" $ whnfIO (Bloom.insert b_text "ora")
                  , bench "Bloom.insert (64)(validation4)" $ whnf (bloomInsertPure1 b_text) "ora"
                  , bench "Bloom.insert (64)(validation5)" $ whnf (bloomInsertPure2 b_text) "ora"
                  , bench "(validation orange)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("orange"::T.Text)
                  , bench "(validation ora)" $ whnf (siphash64_1_3 (SipKey 1 1)) ("ora"::T.Text)
                  -}
                , bench "lookup" $ nfIO (Bloom.lookup b_text txt)
                ]
          , env (return $ HashSet.fromList $ take 10 textWords10k) $ \ ~hashset10->
              bgroup "HashSet Text (10)"
                [ bench "insert" $ whnf (\t-> HashSet.insert t hashset10) txt
                , bench "member" $ nf (\t-> HashSet.member t hashset10) txt
                ]
          , env (return $ HashSet.fromList $ take 100 textWords10k) $ \ ~hashset100->
              bgroup "HashSet Text (100)"
                [ bench "insert" $ whnf (\t-> HashSet.insert t hashset100) txt
                , bench "member" $ nf (\t-> HashSet.member t hashset100) txt
                ]
          , env (return $ HashSet.fromList $ take 10000 textWords10k) $ \ ~hashset10000->
              bgroup "HashSet Text (10000)"
                [ bench "insert" $ whnf (\t-> HashSet.insert t hashset10000) txt
                , bench "member" $ nf (\t-> HashSet.member t hashset10000) txt
                ]
          , env (return $ Set.fromList $ take 10 textWords10k) $ \ ~set10->
              bgroup "Set Text (10)"
                [ bench "insert" $ whnf (\t-> Set.insert t set10) txt
                , bench "member" $ nf (\t-> Set.member t set10) txt
                ]
          , env (return $ Set.fromList $ take 100 textWords10k) $ \ ~set100->
              bgroup "Set Text (100)"
                [ bench "insert" $ whnf (\t-> Set.insert t set100) txt
                , bench "member" $ nf (\t-> Set.member t set100) txt
                ]
          , env (return $ Set.fromList $ take 10000 textWords10k) $ \ ~set10000->
              bgroup "Set Text (10000)"
                [ bench "insert" $ whnf (\t-> Set.insert t set10000) txt
                , bench "member" $ nf (\t-> Set.member t set10000) txt
                ]
          ],
        bgroup "comparisons micro x10"
          [ -- This has 0.3% fpr for 10000 elements, so I think can be fairly compared
            env (Bloom.new (SipKey 11 22) 3 12) $ \ ~b_text->
              bgroup "unagi-bloomfilter 3 12"
                [ bench "insert" $ whnfIO (mapM (Bloom.insert b_text) txt10New)
                , bench "lookup" $ nfIO (mapM (Bloom.lookup b_text) txt10Mix)
                ]
          , env (return $ HashSet.fromList $ take 100 textWords10k) $ \ ~hashset100->
              bgroup "HashSet Text (100)"
                [ bench "insert" $ whnf (foldr (\t s-> HashSet.insert t s) hashset100) txt10New
                , bench "member" $ nf (map $ \t-> HashSet.member t hashset100) txt10Mix
                ]
          , env (return $ HashSet.fromList $ take 10000 textWords10k) $ \ ~hashset10000->
              bgroup "HashSet Text (10000)"
                [ bench "insert" $ whnf (foldr (\t s-> HashSet.insert t s) hashset10000) txt10New
                , bench "member" $ nf (map $ \t-> HashSet.member t hashset10000) txt10Mix
                ]
          , env (return $ Set.fromList $ take 100 textWords10k) $ \ ~set100->
              bgroup "Set Text (100)"
                [ bench "insert" $ whnf (foldr (\t s-> Set.insert t s) set100) txt10New
                , bench "member" $ nf (map $ \t-> Set.member t set100) txt10Mix
                ]
          , env (return $ Set.fromList $ take 10000 textWords10k) $ \ ~set10000->
              bgroup "Set Text (10000)"
                [ bench "insert" $ whnf (foldr (\t s-> Set.insert t s) set10000) txt10New
                , bench "member" $ nf (map $ \t-> Set.member t set10000) txt10Mix
                ]
          ],
        bgroup "comparisons big"
          [ -- TODO large random lookup and insert benchmark, comparing with single-thread and then with work split.
            --      make this how we compare as well?
            --      Do this for various types of elements
          ],
        bgroup "combining and creation"
          [ -- These timings can be subtracted from union timings:
            bench "new 14" $ whnfIO $ Bloom.new (SipKey 1 1) 3 14
          , bench "new 20" $ whnfIO $ Bloom.new (SipKey 1 1) 3 20
          , bench "unionInto (14 -> 14)" $ whnfIO $ unionBench 14 14
          , bench "unionInto (20 -> 14)" $ whnfIO $ unionBench 20 14 -- 20 is 6x
          , bench "unionInto (20 -> 20)" $ whnfIO $ unionBench 20 20
          ]
      ]

-- | Create two fresh filters (log2 lengths @bigl@ and @littlel@, both with
-- k=3) and union the first into the second.  Creation cost is included;
-- subtract the "new" benchmarks above to isolate the union itself.
unionBench :: Int -> Int -> IO ()
unionBench bigl littlel = do
    b1 <- Bloom.new (SipKey 1 1) 3 bigl
    b2 <- Bloom.new (SipKey 1 1) 3 littlel
    b1 `Bloom.unionInto` b2

-- Filters are mutable; WHNF is all criterion's 'nf' needs to force here.
instance NFData (BloomFilter a) where rnf _ = ()

{-
-- TODO fix both of these and compare with Set/HashSet (wrapped in IORef or MVar for second)
largeInsertQueryBench :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO ()
largeInsertQueryBench b payload antipayload = do
    forM_ payload $ Bloom.insert b
    forM_ (zip payload antipayload) $ \(x,y)-> do
        --- can't test, since we're re-using bloom:
        _xOk <- Bloom.lookup b x
        _yOk <- Bloom.lookup b y -- usually False
        -- unless (xOk) $ error "largeInsertQueryBench"
        return ()

largeInsertQueryBenchTwoThreads :: Bloom.BloomFilter T.Text -> Int -> [T.Text] -> [T.Text] -> IO ()
largeInsertQueryBenchTwoThreads b length_payload payload antipayload = do
    t0 <- newEmptyMVar
    t1 <- newEmptyMVar
    let (payload0,payload1) = splitAt (length_payload `div` 2) payload
    let (antipayload0,antipayload1) = splitAt (length_payload `div` 2) antipayload
    let go pld antpld v = do
            forM_ pld $ Bloom.insert b
            forM_ (zip pld antpld) $ \(x,y)-> do
                _xOk <- Bloom.lookup b x
                _yOk <- Bloom.lookup b y -- usually False
                -- unless (xOk) $ error "largeInsertQueryBench"
                return ()
            putMVar v ()
    void $ forkIO $ go payload0
antipayload0 t0 void $ forkIO $ go payload1 antipayload1 t1 takeMVar t0 >> takeMVar t1 -} -- These are mostly to check cache behavior, and I don't expect it to matter -- whether a bloom filter was already "filled with elements" or not. manyInserts :: Bloom.BloomFilter T.Text -> [T.Text] -> IO () manyInserts b payload = do forM_ payload (void . Bloom.insert b) manyLookups :: Bloom.BloomFilter T.Text -> [T.Text] -> IO () manyLookups b payload = do forM_ payload (void . Bloom.lookup b) manyInsertsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO () manyInsertsTwoThreads b payload0 payload1 = do t0 <- newEmptyMVar t1 <- newEmptyMVar let go pld v = manyInserts b pld >> putMVar v () void $ forkIO $ go payload0 t0 void $ forkIO $ go payload1 t1 takeMVar t0 >> takeMVar t1 manyLookupsTwoThreads :: Bloom.BloomFilter T.Text -> [T.Text] -> [T.Text] -> IO () manyLookupsTwoThreads b payload0 payload1 = do t0 <- newEmptyMVar t1 <- newEmptyMVar let go pld v = manyLookups b pld >> putMVar v () void $ forkIO $ go payload0 t0 void $ forkIO $ go payload1 t1 takeMVar t0 >> takeMVar t1 {- -- So we can use whnf, and make sure hashes aren't being cached {-# NOINLINE bloomInsertPure1 #-} bloomInsertPure1 :: Hashable a => BloomFilter a -> a -> Bool bloomInsertPure1 b = unsafePerformIO . Bloom.insert b bloomInsertPure2 :: Hashable a => BloomFilter a -> a -> Bool bloomInsertPure2 b = unsafePerformIO . Bloom.insert b {-# NOINLINE bloomInsertPure3 #-} bloomInsertPure3 :: BloomFilter Text -> Text -> Bool bloomInsertPure3 b = unsafePerformIO . Bloom.insert b -}