{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}

{-|
Module      : Std.Data.PrimArray.BitTwiddle
Description : Primitive bits twiddling
Copyright   : (c) Dong Han, 2017-2018
License     : BSD
Maintainer  : winterland1989@gmail.com
Stability   : experimental
Portability : non-portable

This module implement some bit twiddling with ghc primitives.

We currently didn't use all functions from this module though: the performance is not
catching up c version yet. But this module and relevant benchmarks are kept in hope
that once we have fully SIMD support in GHC, we might optimize these functions further
to compete with c.

Reference:

    * https://graphics.stanford.edu/~seander/bithacks.html
    * https://jameshfisher.github.io/2017/01/24/bitwise-check-for-zero-byte.html

-}

module Std.Data.PrimArray.BitTwiddle where

import GHC.Prim
import GHC.Types
import GHC.Word
import Data.Primitive.PrimArray

-- we need to know word size
#include "MachDeps.h"

#if SIZEOF_HSWORD == 4
# define CAST_OFFSET_WORD_TO_BYTE(x) (x `uncheckedIShiftL#` 2#)
# define CAST_OFFSET_BYTE_TO_WORD(x) (x `uncheckedIShiftRA#` 2#)
#else
# define CAST_OFFSET_WORD_TO_BYTE(x) (x `uncheckedIShiftL#` 3#)
# define CAST_OFFSET_BYTE_TO_WORD(x) (x `uncheckedIShiftRA#` 3#)
#endif

isOffsetAligned# :: Int# -> Bool
{-# INLINE isOffsetAligned# #-}
isOffsetAligned# s# = isTrue# ((SIZEOF_HSWORD# -# 1#) `andI#` s# ==# 0#)

mkMask# :: Word# -> Word#
{-# INLINE mkMask# #-}
mkMask# w8# =
#if SIZEOF_HSWORD == 4
    let w16# = w8# `or#` (w8# `uncheckedShiftL#` 8#)
    in w16# `or#` (w16# `uncheckedShiftL#` 16#)
#else
    let w16# = w8# `or#` (w8# `uncheckedShiftL#` 8#)
        w32# = w16# `or#` (w16# `uncheckedShiftL#` 16#)
    in w32# `or#` (w32# `uncheckedShiftL#` 32#)
#endif

-- https://jameshfisher.github.io/2017/01/24/bitwise-check-for-zero-byte.html
--
nullByteMagic# :: Word# -> Word#
{-# INLINE nullByteMagic# #-}
nullByteMagic# w# =
#if SIZEOF_HSWORD == 4
    (w# `minusWord#` 0x01010101##) `and#` (not# w#) `and#` 0x80808080##
#else
    (w# `minusWord#` 0x0101010101010101##) `and#` (not# w#) `and#` 0x8080808080808080##
#endif

-- | Search a word8 in array.
--
-- Currently this function is ~4 times slow than c version, so we didn't use it.
--
memchr :: PrimArray Word8 -- array
       -> Word8           -- target
       -> Int             -- start offset
       -> Int             -- search length
       -> Int
{-# INLINE memchr #-}
memchr (PrimArray ba#) (W8# c#) (I# s#) (I# siz#) =
    I# (memchr# ba# c# s# siz#)

-- | The unboxed version of 'memchr'
--
memchr# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
{-# NOINLINE memchr# #-}
memchr# ba# c# s# siz# = beforeAlignedLoop# ba# c# s# (s# +# siz#)
  where
    beforeAlignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
    beforeAlignedLoop# ba# c# s# end#
        | isTrue# (s# >=# end#) = -1#
        | isTrue# (c# `eqWord#` indexWord8Array# ba# s#) = s#
        | isOffsetAligned# s# = alignedLoop# ba# (mkMask# c#)
                                           CAST_OFFSET_BYTE_TO_WORD(s#)
                                           CAST_OFFSET_BYTE_TO_WORD(end#)
                                           end#
        | otherwise = beforeAlignedLoop# ba# c# (s# +# 1#) end#

    alignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int# -> Int#
    alignedLoop# ba# mask# s# end# end_#
        | isTrue# (s# >=# end#) = afterAlignedLoop# ba# (mask# `and#` 0xFF##)
                                                    CAST_OFFSET_WORD_TO_BYTE(s#)
                                                    end_#
        | otherwise = case indexWordArray# ba# s# of
            w# ->
                case nullByteMagic# (mask# `xor#` w#) of
                    0## -> alignedLoop# ba# mask# (s# +# 1#) end# end_#
                    _   -> afterAlignedLoop# ba# (mask# `and#` 0xFF##)
                                             CAST_OFFSET_WORD_TO_BYTE(s#)
                                             end_#

    afterAlignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
    afterAlignedLoop# ba# c# s# end#
        | isTrue# (s# >=# end#) = -1#
        | isTrue# (c# `eqWord#` indexWord8Array# ba# s#) = s#
        | otherwise = afterAlignedLoop# ba# c# (s# +# 1#) end#

-- | Search a word8 array in reverse order.
--
-- This function is used in @elemIndexEnd@, since there's no c equivalent (memrchr) on OSX.
--
memchrReverse :: PrimArray Word8  -- array
              -> Word8            -- target
              -> Int              -- start offset
              -> Int              -- search length
              -> Int
{-# INLINE memchrReverse #-}
memchrReverse (PrimArray ba#) (W8# c#) (I# s#) (I# siz#) =
    I# (memchr# ba# c# s# siz#)

-- | The unboxed version of 'memchrReverse'
--
memchrReverse# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
{-# NOINLINE memchrReverse# #-}
memchrReverse# ba# c# s# siz# = beforeAlignedLoop# ba# c# s# (s# -# siz#)
  where
    beforeAlignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
    beforeAlignedLoop# ba# c# s# end#
        | isTrue# (s# <# end#) = -1#
        | isTrue# (c# `eqWord#` indexWord8Array# ba# s#) = s#
        | isOffsetAligned# s# = alignedLoop# ba# (mkMask# c#)
                                           CAST_OFFSET_BYTE_TO_WORD(s#)
                                           CAST_OFFSET_BYTE_TO_WORD(end#)
                                           end#
        | otherwise = beforeAlignedLoop# ba# c# (s# -# 1#) end#

    alignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int# -> Int#
    alignedLoop# ba# mask# s# end# end_#
        | isTrue# (s# <# end#) = afterAlignedLoop# ba# (mask# `and#` 0xFF##)
                                                   CAST_OFFSET_WORD_TO_BYTE(s#)
                                                   end_#
        | otherwise = case indexWordArray# ba# s# of
            w# ->
                case nullByteMagic# (mask# `xor#` w#) of
                    0## -> alignedLoop# ba# mask# (s# -# 1#) end# end_#
                    _   -> afterAlignedLoop# ba# (mask# `and#` 0xFF##)
                                             CAST_OFFSET_WORD_TO_BYTE(s#)
                                             end_#

    afterAlignedLoop# :: ByteArray# -> Word# -> Int# -> Int# -> Int#
    afterAlignedLoop# ba# c# s# end#
        | isTrue# (s# <# end#) = -1#
        | isTrue# (c# `eqWord#` indexWord8Array# ba# s#) = s#
        | otherwise = afterAlignedLoop# ba# c# (s# -# 1#) end#


--------------------------------------------------------------------------------

-- HsInt hs_memchr(uint8_t *a, HsInt aoff, uint8_t b, HsInt n);
foreign import ccall unsafe "hs_memchr" c_memchr ::
    ByteArray# -> Int -> Word8 -> Int -> Int