{-# OPTIONS_GHC -Wall -fwarn-tabs #-}
{-# LANGUAGE CPP, BangPatterns #-}

----------------------------------------------------------------
--                                                  ~ 2022.03.04
-- |
-- Module      :  Bench.Foldable
-- Copyright   :  2008--2022 wren romano
-- License     :  BSD-3-Clause
-- Maintainer  :  wren@cpan.org
-- Stability   :  provisional
-- Portability :  portable (with bang-patterns)
--
-- Benchmarking definitions for 'Foldable'
----------------------------------------------------------------

module Bench.Foldable (main, realTrie_to_benchTrie, bgroup_Foldable) where

import           Shared.BaseCompat
import qualified Data.Trie           as T
import qualified Data.Trie.Internal  as TI

import qualified Data.ByteString     as S
#if MIN_VERSION_base(4,7,0)
-- [GHC 7.8.1]: "Data.Coerce" added to base.
import           Data.Coerce         (Coercible, coerce)
#endif
#if MIN_VERSION_base(4,9,0)
-- [GHC 8.0.1]: "Data.Semigroup" added to base.
import           Data.Semigroup      (Endo(..))
#endif
-- TODO: version limits for 'Dual'?
import           Data.Monoid         (Dual(..))
import           Data.Word           (Word8)
import           Control.DeepSeq     (NFData(rnf))
import qualified Test.QuickCheck     as QC
import qualified Criterion.Main      as C
-- TODO: consider using the @gauge@ library instead of @criterion@.
-- It's a clone of criterion, also by BOS; with the intention of
-- having reduced dependencies.  It doesn't do the HTML output, but
-- otherwise should be fine for our needs.  It's what the @containers@
-- library is using these days.
-- BUG: @gauge@ depends on @basement@ which as of version 0.0.12
-- doesn't support GHC 9.2; so we'll have to revisit this later.
----------------------------------------------------------------

-- | Reference definition, to avoid needing to expose constructors
-- of the real definition.
--
-- TODO: should maybe have a CPP macro to expose them for benchmarking,
-- to ensure this file stays in sync!
data Trie a
    = Branch {-# UNPACK #-} !Word8 -- Prefix
             {-# UNPACK #-} !Word8 -- Mask
                            !(Trie a)
                            !(Trie a)
    | Arc    {-# UNPACK #-} !S.ByteString
                            !(Maybe a)
                            !(Trie a)
    | Empty
    deriving Show

-- Needed for 'C.env'
instance NFData a => NFData (Trie a) where
    rnf Empty            = ()
    rnf (Arc _ mv t)     = rnf mv `seq` rnf t
    rnf (Branch _ _ l r) = rnf l `seq` rnf r

----------------------------------------------------------------
-- | From "Data.Functor.Utils", but isn't exported.  Is used heavily
-- by the default implementations, since they use so many newtype
-- wrappers.
(#.) :: Coercible b c => (b -> c) -> (a -> b) -> (a -> c)
(#.) _f = coerce
{-# INLINE (#.) #-}

----------------------------------------------------------------
fold_foldMap, fold_foldrCompose, fold_foldrEta, fold_v027
    :: Monoid m => Trie m -> m

-- | The default 'fold' definition as of base-4.16.0.0
fold_foldMap = foldMap_v027 id

-- | Default 'fold' via default 'foldMap' (using 'foldr_compose'),
-- but inlining the 'id' away.
-- Far worse than 'fold_foldMap', both for runtime and allocation.
fold_foldrCompose = foldr_compose mappend mempty

-- | Default 'fold' via default 'foldMap' (using 'foldr_eta'),
-- but inlining the 'id' away.
-- This is somewhat worse than 'fold_foldrCompose' even.
fold_foldrEta = foldr_eta mappend mempty

-- | bytestring-trie-0.2.7 definition.
-- This is a clear win over 'fold_foldMap': about half the runtime,
-- and one eighth the allocations.
fold_v027 = go
    where
    go Empty              = mempty
    go (Arc _ Nothing  t) = go t
    go (Arc _ (Just v) t) = v `mappend` go t
    go (Branch _ _ l r)   = go l `mappend` go r

----------------------------------------------------------------
foldMap_foldrCompose, foldMap_foldrEta, foldMap_v027
    :: Monoid m => (a -> m) -> Trie a -> m

-- | The default 'fold' definition as of base-4.16.0.0 (using 'foldr_compose').
foldMap_foldrCompose f = foldr_compose (mappend . f) mempty
-- | The default 'fold' definition as of base-4.16.0.0 (using 'foldr_eta').
foldMap_foldrEta     f = foldr_eta     (mappend . f) mempty

-- | bytestring-trie-0.2.7 definition, also used elsewhere for defaults.
-- (This was also the bytestring-trie-0.2.6 definition; the only
-- thing defined for that version.)
-- Again, this is the clear winner over the default implementation.
foldMap_v027 f = go
    where
    go Empty              = mempty
    go (Arc _ Nothing  t) = go t
    go (Arc _ (Just v) t) = f v `mappend` go t
    go (Branch _ _ l r)   = go l `mappend` go r

----------------------------------------------------------------
foldMap'_default, foldMap'_v027, foldMap'_v027_cps, foldMap'_v027_flop, foldMap'_v027_bang, foldMap'_v027_flop_bang
    :: Monoid m => (a -> m) -> Trie a -> m

-- | default definition.
foldMap'_default f = foldl'_v027 (\ acc a -> acc <> f a) mempty

-- | bytestring-trie-0.2.7 definition.
-- Again, this is the clear winner over the default implementation.
foldMap'_v027 f = go mempty
    where
    go !m Empty              = m
    go  m (Arc _ Nothing  t) = go m t
    go  m (Arc _ (Just v) t) = go (m `mappend` f v) t
    go  m (Branch _ _ l r)   = go (go m l) r

-- | CPS variant, to ensure proper tail-recursion.
-- In between 'foldMap'_v027' and 'foldMap'_default'.
foldMap'_v027_cps f = \t -> go t mempty id
    where
    go Empty              !m c = c m
    go (Arc _ Nothing  t)  m c = go t m c
    go (Arc _ (Just v) t)  m c = go t (m `mappend` f v) c
    go (Branch _ _ l r)    m c = go l m (\m' -> go r m' c)

-- | Based on our experience with 'foldl_v027_flop', trying the flop here.
-- On the small tries tested here, is slightly slower than
-- 'foldMap'_v027'; identical allocation.  On larger tries this
-- gets to be slightly better than 'foldMap'_v027'.
foldMap'_v027_flop f = \t -> go t mempty
    where
    go Empty              !m = m
    go (Arc _ Nothing  t)  m = go t m
    go (Arc _ (Just v) t)  m = go t (m `mappend` f v)
    go (Branch _ _ l r)    m = go r (go l m)

-- | This really shouldn't matter, but given our experience with 'foldl'_v027_flop_bang'...
-- And yet, is slightly better than 'foldMap'_v027_flop', slightly
-- worse than 'foldMap'_v027_flop_bang'; identical allocation for
-- all these variants.
foldMap'_v027_bang f = go mempty
    where
    go !m Empty              = m
    go  m (Arc _ Nothing  t) = go m t
    go  m (Arc _ (Just v) t) = (go $! mappend m (f v)) t
    go  m (Branch _ _ l r)   = (go $! go m l) r

-- | This really shouldn't matter, but given our experience with 'foldl'_v027_flop_bang'...
-- And yet, is slightly better than 'foldMap'_v027_bang', slightly
-- worse than 'foldMap'_v027'; identical allocation for all these
-- variants.  For larger tries, is slightly slower than 'foldMap'_v027_flop'.
foldMap'_v027_flop_bang f = \t -> go t mempty
    where
    go Empty              !m = m
    go (Arc _ Nothing  t)  m = go t m
    go (Arc _ (Just v) t)  m = go t $! mappend m (f v)
    go (Branch _ _ l r)    m = go r $! go l m

----------------------------------------------------------------
foldr_default, foldr_compose, foldr_eta, foldr_cps_eta, foldr_cps, foldr_noClosure
    :: (a -> b -> b) -> b -> Trie a -> b

-- | The default definition as of base-4.16.0.0
-- Actually a pretty solid baseline, for small tries.  And for
-- larger tries it's far and away the clear winner.
foldr_default f z t =
    appEndo (foldMap_v027 (Endo #. f) t) z

-- | bytestring-trie-0.2.7 definition
-- Identical allocation as EndoDefault; about the same speed, or a
-- slightly slower; but only for the small tries tested here.  For
-- larger tries it's far worse than 'foldr_default'.
foldr_compose f z0 = \t -> go t z0 -- eta for better inlining.
    where
    go Empty              = id
    go (Arc _ Nothing  t) =       go t
    go (Arc _ (Just v) t) = f v . go t
    go (Branch _ _ l r)   = go l . go r

-- |
-- Usually little faster than EndoDefault, occasionally about the
-- same, rarely slower.  However, this allocates (consistently) ~43% more.
-- [I'm thinking because (a) larger thunks to store the @z@, but
-- (b) fully-saturated thunks so faster apply?]
-- For larger tries, is also far worse than 'foldr_default'.
foldr_eta f z0 = \t -> go t z0 -- eta for better inlining.
    where
    go Empty              z = z
    go (Arc _ Nothing  t) z =      go t z
    go (Arc _ (Just v) t) z = f v (go t z)
    go (Branch _ _ l r)   z = go l (go r z)

-- | Much slower; also allocates much more. (For larger tries too)
foldr_cps_eta f z0 = \t -> go t id z0 -- eta for better inlining.
    where
    go Empty              c z = c z
    go (Arc _ Nothing  t) c z = go t c z
    go (Arc _ (Just v) t) c z = go t (c . f v) z
    go (Branch _ _ l r)   c z = go r (go l c) z

-- | Even slower; and even more allocation. (For larger tries too)
foldr_cps f z0 = \t -> go t id z0 -- eta for better inlining.
    where
    go Empty              c = c
    go (Arc _ Nothing  t) c = go t c
    go (Arc _ (Just v) t) c = go t (c . f v)
    go (Branch _ _ l r)   c = go r (go l c)

-- | Faster than 'foldr_cps_eta', but still much slower than
-- EndoDefault/compose/eta; Also allocates about as much as
-- 'foldr_cps_eta'.
foldr_noClosure _ z Empty              = z
foldr_noClosure f z (Arc _ Nothing  t) =      foldr_noClosure f z t
foldr_noClosure f z (Arc _ (Just v) t) = f v (foldr_noClosure f z t)
foldr_noClosure f z (Branch _ _ l r)   =
    foldr_noClosure f (foldr_noClosure f z r) l

----------------------------------------------------------------
foldr'_default, foldr'_v027, foldr'_v027_cps, foldr'_cps, foldr'_v027_flop, foldr'_v027_bang, foldr'_v027_flop_bang
    :: (a -> b -> b) -> b -> Trie a -> b

-- | The default definition as of base-4.16.0.0 (including the phrasing
-- of using a where-clause rather than a lambda).
-- The worst of the lot.
foldr'_default f z0 xs =
    foldl_v027 f' id xs z0
    where f' k x z = k $! f x z

-- | bytestring-trie-0.2.7 definition.
-- This one is the clear winner, both for time and allocation.
foldr'_v027 f z0 = go z0 -- eta for better inlining
    where
    go !z Empty              = z
    go  z (Arc _ Nothing  t) = go z t
    go  z (Arc _ (Just v) t) = f v $! go z t
    go  z (Branch _ _ l r)   = go (go z r) l

-- TODO: (2022.03.05) Try a variant of 'foldr'_v027' which doesn't
-- closure over @f@.

-- | Worse than 'foldr'_v027' but still better than the rest.
foldr'_v027_cps f z0 = \t -> go t z0 id -- eta for better inlining
    where
    go Empty              !z c = c z
    go (Arc _ Nothing  t)  z c = go t z c
    go (Arc _ (Just v) t)  z c = go t z (\ !z' -> c $! f v z')
    go (Branch _ _ l r)    z c = go r z (\ !z' -> go l z' c)

-- | Worse than 'foldr'_v027_cps' but still better than 'foldr'_default'
foldr'_cps f z0 = \t -> go t id z0 -- eta for better inlining
    where
    go Empty              c = c
    go (Arc _ Nothing  t) c = go t c
    go (Arc _ (Just v) t) c = go t (\ !z -> c $! f v z)
    go (Branch _ _ l r)   c = go r (go l c)

-- | Based on our experience with 'foldl_v027_flop', trying the flop here.
-- Actually faster than 'foldr'_v027' (but only for small tries);
-- identical allocation. For larger tries, this is a bit slower
-- than 'foldr'_v027'.
foldr'_v027_flop f z0 = \t -> go t z0 -- eta for better inlining
    where
    go Empty              !z = z
    go (Arc _ Nothing  t)  z = go t z
    go (Arc _ (Just v) t)  z = f v $! go t z
    go (Branch _ _ l r)    z = go l (go r z)

-- | This really shouldn't matter, but given our experience with 'foldl'_v027_flop_bang'...
-- And yet, faster than 'foldr'_cps', slower than 'foldr'_v027_flop'.
foldr'_v027_bang f z0 = go z0 -- eta for better inlining
    where
    go !z Empty              = z
    go  z (Arc _ Nothing  t) = go z t
    go  z (Arc _ (Just v) t) = f v $! go z t
    go  z (Branch _ _ l r)   = (go $! go z r) l

-- | This really shouldn't matter, but given our experience with 'foldl'_v027_flop_bang'...
-- And yet, marginally faster than 'foldr'_v027_flop', more
-- significantly slower than 'foldr'_v027'.
foldr'_v027_flop_bang f z0 = \t -> go t z0 -- eta for better inlining
    where
    go Empty              !z = z
    go (Arc _ Nothing  t)  z = go t z
    go (Arc _ (Just v) t)  z = f v $! go t z
    go (Branch _ _ l r)    z = go l $! go r z

----------------------------------------------------------------
foldl_default, foldl_default_Coerce, foldl_v027, foldl_v027_flop
    :: (b -> a -> b) -> b -> Trie a -> b

-- | The default definition as of base-4.16.0.0.
foldl_default f z t =
    appEndo (getDual (foldMap_v027 (Dual . Endo . flip f) t)) z

-- | Variant of the default that uses @('#.')@ in lieu of @('.')@.
-- This one performs massively better than 'foldl_default'.
-- TODO: send a patch to @base@ for this.
-- Hrm, for larger tries it looks like the benefit is only minimal...
foldl_default_Coerce f z t =
    appEndo (getDual (foldMap_v027 (Dual #. Endo #. flip f) t)) z

-- | bytestring-trie-0.2.7 definition, also used for defaults.
-- This one is faster than 'foldl_default_Coerce'; though it allocates
-- ~56% more than 'foldl_default_Coerce' (albeit ~11% /less/ than
-- 'foldl_default').
foldl_v027 f z0 = go z0 -- eta for better inlining
    where
    go z Empty              = z
    go z (Arc _ Nothing  t) = go z t
    go z (Arc _ (Just v) t) = go (f z v) t
    go z (Branch _ _ l r)   = go (go z l) r

-- | Since 'foldr_eta' allocates ~43% more than 'foldr_default',
-- yet 'foldl_v027' allocates ~56% more: see if swapping the order
-- of arguments in the recursion changes that.
-- For small tries:
-- * Marginally slower than 'foldl_v027' (<1%).
-- * Allocates ~8% less than 'foldl_v027'; ~19% less than 'foldl_default';
-- * albeit still ~42% more than 'foldl_default_Coerce'.
-- For larger tries: far and away the best of the lot.
foldl_v027_flop f z0 = \t -> go t z0 -- eta for better inlining
    where
    go Empty              z = z
    go (Arc _ Nothing  t) z = go t z
    go (Arc _ (Just v) t) z = go t (f z v)
    go (Branch _ _ l r)   z = go r (go l z)

-- We could try CPSing to restore the tail-recursion; but since
-- that's worse everywhere else, there seems little point.

----------------------------------------------------------------
foldl'_defaultCompose, foldl'_defaultEta, foldl'_v027, foldl'_v027_flop, foldl'_v027_flop_bang
    :: (b -> a -> b) -> b -> Trie a -> b

-- | The default  definition as of base-4.16.0.0 (using 'foldr_compose').
foldl'_defaultCompose f z0 xs =
    foldr_compose f' id xs z0
    where f' x k z = k $! f z x

-- | The default  definition as of base-4.16.0.0 (using 'foldr_eta').
-- Much worse than 'foldl'_defaultCompose'.
foldl'_defaultEta f z0 xs =
    foldr_eta f' id xs z0
    where f' x k z = k $! f z x

-- | bytestring-trie-0.2.7 definition, also used for defaults.
-- Clear winner, for small tries. For large tries it's about halfway
-- between the two defaults above, and the two flops below.
foldl'_v027 f z0 = go z0 -- eta for better inlining
    where
    go !z Empty              = z
    go  z (Arc _ Nothing  t) = go z t
    go  z (Arc _ (Just v) t) = go (f z v) t
    go  z (Branch _ _ l r)   = go (go z l) r

-- | For small tries: worse than 'foldl'_v027': ~10% slower, ~119%
-- more allocation.  But for large tries it's much faster.
foldl'_v027_flop f z0 = \t -> go t z0 -- eta for better inlining
    where
    go Empty              !z = z
    go (Arc _ Nothing  t)  z = go t z
    go (Arc _ (Just v) t)  z = go t (f z v)
    go (Branch _ _ l r)    z = go r (go l z)

-- FIXME: why on earth does this perform better?! Why aren't bang
-- patterns giving me the same thing?!
--
-- | For small tries: ~4.4% faster than 'foldl'_v027'; but allocates
-- ~32% more still.  For large tries it's only marginally faster
-- than 'foldl'_v027_flop' (which is to say massively faster than
-- 'foldl'_v027')
foldl'_v027_flop_bang f z0 = \t -> go t z0 -- eta for better inlining
    where
    go Empty              !z = z
    go (Arc _ Nothing  t)  z = go t z
    go (Arc _ (Just v) t)  z = go t $! f z v
    go (Branch _ _ l r)    z = go r $! go l z

-- TODO: CPS to restore the purely tail-call for @Branch@?

----------------------------------------------------------------
{-
-- TODO: the base-4.16.0.0 defaults are shown
{-# INLINE toList #-} toList t = build (\ c n -> foldr c n t)
null = foldr (\_ _ -> False) True
length = foldl' (\c _ -> c+1) 0
elem = any . (==)
-- With standalone definition: any p = getAny #. foldMap (Any #. p)

{-# INLINEABLE maximum #-}
maximum = fromMaybe (errorWithoutStackTrace "maximum: empty structure") .
    getMax . foldMap' (Max #. (Just :: a -> Maybe a)) -- -XScopedTypeVariables
{-# INLINEABLE minimum #-}
minimum = fromMaybe (errorWithoutStackTrace "minimum: empty structure") .
    getMin . foldMap' (Min #. (Just :: a -> Maybe a)) -- -XScopedTypeVariables
{-# INLINEABLE sum #-}
sum = getSum #. foldMap' Sum
{-# INLINEABLE product #-}
product = getProduct #. foldMap' Product
-}


----------------------------------------------------------------
----------------------------------------------------------------
-- Definitely not the most efficient thing...
-- TODO: define an anamorphism function in "TI" to use here...
arbitraryTrie :: Int -> Int -> QC.Gen (Trie Int)
arbitraryTrie maxK maxL = do
    k    <- QC.chooseInt (0, maxK)
    keys <- QC.vectorOf k $ do
        l  <- QC.chooseInt (0, maxL)
        xs <- QC.vector l
        return $ S.pack xs
    return . realTrie_to_benchTrie . T.fromList $ zip keys [0..]

-- TODO: still gotta reconstruct the 'Prefix' and 'Mask'.
-- (Even though none of the folds depend on them...)
realTrie_to_benchTrie :: T.Trie a -> Trie a
realTrie_to_benchTrie = TI.cata_ Arc (Branch 0 0) Empty

----------------------------------------------------------------
intToSum :: (Trie (Sum Int) -> a) -> [Trie Int] -> [a]
intToSum f = fmap f . coerce

-- BUG: since we started generating a collection of tries, we've
-- started to see different results: each test progressively slower
-- than the last.  Seems bogus, but not sure what's up since criterion
-- ought to be forcing all of @ts@ early and doing GC between each
-- benchmark...
main :: IO ()
main = C.defaultMain
  [ C.env (QC.generate $ QC.vectorOf 10 $ arbitraryTrie 30 10) $ \ ts ->
    bgroup_Foldable ts
  ]

bgroup_Foldable :: [Trie Int] -> C.Benchmark
bgroup_Foldable ts =
  C.bgroup "Foldable"
  [ C.bgroup "fold"
    [ C.bench "default (foldMap)"       $ C.nf (intToSum fold_foldMap     ) ts
    , C.bench "default (foldr_compose)" $ C.nf (intToSum fold_foldrCompose) ts
    , C.bench "default (foldr_eta)"     $ C.nf (intToSum fold_foldrEta    ) ts
    , C.bench "v0.2.7"                  $ C.nf (intToSum fold_v027        ) ts
    ]
  , C.bgroup "foldMap"
    [ C.bench "default (foldr_compose)" $ C.nf (intToSum (foldMap_foldrCompose id)) ts
    , C.bench "default (foldr_eta)"     $ C.nf (intToSum (foldMap_foldrEta     id)) ts
    , C.bench "v0.2.7"                  $ C.nf (intToSum (foldMap_v027         id)) ts
    ]
  , C.bgroup "foldMap'"
    [ C.bench "default (foldl')"  $ C.nf (intToSum (foldMap'_default   id)) ts
    , C.bench "v0.2.7"            $ C.nf (intToSum (foldMap'_v027      id)) ts
    , C.bench "v0.2.7 +cps"       $ C.nf (intToSum (foldMap'_v027_cps  id)) ts
    , C.bench "v0.2.7 +flopped"   $ C.nf (intToSum (foldMap'_v027_flop id)) ts
    , C.bench "v0.2.7 +($!)"      $ C.nf (intToSum (foldMap'_v027_bang id)) ts
    , C.bench "v0.2.7 +flopped +($!)" $ C.nf (intToSum (foldMap'_v027_flop_bang id)) ts
    ]
  , C.bgroup "foldr"
    [ C.bench "default (foldMap)" $ C.nf (foldr_default     (+) 0 <$>) ts
    , C.bench "compose"           $ C.nf (foldr_compose     (+) 0 <$>) ts
    , C.bench "eta"               $ C.nf (foldr_eta         (+) 0 <$>) ts
    , C.bench "cps_eta"           $ C.nf (foldr_cps_eta     (+) 0 <$>) ts
    , C.bench "cps"               $ C.nf (foldr_cps         (+) 0 <$>) ts
    , C.bench "noClosure"         $ C.nf (foldr_noClosure   (+) 0 <$>) ts
    ]
  , C.bgroup "foldr'"
    [ C.bench "default"           $ C.nf (foldr'_default     (+) 0 <$>) ts
    , C.bench "v0.2.7"            $ C.nf (foldr'_v027        (+) 0 <$>) ts
    , C.bench "v0.2.7 +cps"       $ C.nf (foldr'_v027_cps    (+) 0 <$>) ts
    , C.bench "cps, no eta"       $ C.nf (foldr'_cps         (+) 0 <$>) ts
    , C.bench "v0.2.7 +flopped"   $ C.nf (foldr'_v027_flop   (+) 0 <$>) ts
    , C.bench "v0.2.7 +($!)"      $ C.nf (foldr'_v027_bang   (+) 0 <$>) ts
    , C.bench "v0.2.7 +flopped +($!)" $ C.nf (foldr'_v027_flop_bang (+) 0 <$>) ts
    ]
  , C.bgroup "foldl"
    [ C.bench "default (foldMap)" $ C.nf (foldl_default        (+) 0 <$>) ts
    , C.bench "default +Coerce"   $ C.nf (foldl_default_Coerce (+) 0 <$>) ts
    , C.bench "v0.2.7"            $ C.nf (foldl_v027           (+) 0 <$>) ts
    , C.bench "v0.2.7 +flopped"   $ C.nf (foldl_v027_flop      (+) 0 <$>) ts
    ]
  , C.bgroup "foldl'"
    [ C.bench "default (foldr_compose)" $ C.nf (foldl'_defaultCompose (+) 0 <$>) ts
    , C.bench "default (foldr_eta)"     $ C.nf (foldl'_defaultEta     (+) 0 <$>) ts
    , C.bench "v0.2.7"                  $ C.nf (foldl'_v027           (+) 0 <$>) ts
    , C.bench "v0.2.7 +flopped"         $ C.nf (foldl'_v027_flop      (+) 0 <$>) ts
    , C.bench "v0.2.7 +flopped +($!)"   $ C.nf (foldl'_v027_flop_bang (+) 0 <$>) ts
    ]
  ]

----------------------------------------------------------------
----------------------------------------------------------- fin.