{-# language BangPatterns #-}
{-# language BinaryLiterals #-}
{-# language DataKinds #-}
{-# language KindSignatures #-}
{-# language LambdaCase #-}
{-# language MagicHash #-}
{-# language RankNTypes #-}
{-# language ScopedTypeVariables #-}
{-# language TypeApplications #-}
{-# language TypeOperators #-}
{-# language UnboxedTuples #-}

-- | The functions in this module are explict about the maximum number
-- of bytes they require.
module Data.ByteArray.Builder.Bounded
  ( -- * Builder
    Builder
    -- * Execute
  , run
  , pasteGrowST
    -- * Combine
  , empty
  , append
    -- * Bounds Manipulation
  , weaken
  , substitute
    -- * Encode Integral Types
    -- ** Human-Readable
  , word64Dec
  , word32Dec
  , word16Dec
  , word8Dec
  , wordDec
  , int64Dec
  , int32Dec
  , int16Dec
  , int8Dec
  , intDec
    -- * Unsigned Words
    -- ** 64-bit
  , word64PaddedUpperHex
    -- ** 32-bit
  , word32PaddedUpperHex
    -- ** 16-bit
  , word16PaddedLowerHex
  , word16PaddedUpperHex
  , word16LowerHex
  , word16UpperHex
    -- ** 8-bit
  , word8PaddedUpperHex
  , word8LowerHex
  , ascii
  , char
    -- ** Machine-Readable
    -- *** One
  , word8
    -- **** Big Endian
  , word64BE
  , word32BE
  , word16BE
  , int64BE
  , int32BE
  , int16BE
    -- **** Little Endian
  , word64LE
  , word32LE
  , word16LE
  , int64LE
  , int32LE
  , int16LE
    -- * Encode Floating-Point Types
  , doubleDec
  ) where

import Arithmetic.Types (type (<=), type (:=:))
import Control.Monad.Primitive
import Control.Monad.ST (ST)
import Control.Monad.ST.Run (runByteArrayST)
import Data.Bits
import Data.ByteArray.Builder.Bounded.Unsafe (Builder(..))
import Data.Char (ord)
import Data.Primitive
import Data.Primitive.ByteArray.Offset (MutableByteArrayOffset(..))
import GHC.Exts
import GHC.Int (Int64(I64#),Int32(I32#),Int16(I16#),Int8(I8#))
import GHC.ST (ST(ST))
import GHC.TypeLits (type (+))
import GHC.Word (Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#))

import qualified Arithmetic.Types as Arithmetic
import qualified Arithmetic.Nat as Nat
import qualified Arithmetic.Lte as Lte
import qualified Data.ByteArray.Builder.Bounded.Unsafe as Unsafe
import qualified Data.Primitive as PM

-- | Execute the bounded builder. If the size is a constant,
-- use @Arithmetic.Nat.constant@ as the first argument to let
-- GHC conjure up this value for you.
run ::
     Arithmetic.Nat n
  -> Builder n -- ^ Builder
  -> ByteArray
{-# inline run #-}
run n b = runByteArrayST $ do
  arr <- newByteArray (Nat.demote n)
  len <- Unsafe.pasteST b arr 0
  shrinkMutableByteArray arr len
  unsafeFreezeByteArray arr

-- | Paste the builder into the byte array starting at offset zero.
-- This reallocates the byte array if it cannot accomodate the builder,
-- growing it by the minimum amount necessary.
pasteGrowST ::
     Arithmetic.Nat n
  -> Builder n
  -> MutableByteArrayOffset s
     -- ^ Initial buffer, used linearly. Do not reuse this argument.
  -> ST s (MutableByteArrayOffset s)
     -- ^ Final buffer that accomodated the builder.
{-# inline pasteGrowST #-}
pasteGrowST n b !(MutableByteArrayOffset{array=arr0,offset=off0}) = do
  sz0 <- PM.getSizeofMutableByteArray arr0
  let req = Nat.demote n
  let sz1 = off0 + req
  if sz1 <= sz0
    then do
      off1 <- Unsafe.pasteST b arr0 off0
      pure (MutableByteArrayOffset arr0 off1)
    else do
      arr1 <- PM.resizeMutableByteArray arr0 sz1
      off1 <- Unsafe.pasteST b arr1 off0
      pure (MutableByteArrayOffset arr1 off1)

-- | The monoidal unit of `append`
empty :: Builder 0
empty = Builder $ \_ off0 s0 -> (# s0, off0 #)

infixr 9 `append`

-- | Concatenate two builders.
append :: Builder m -> Builder n -> Builder (m + n)
append (Builder f) (Builder g) =
  Builder $ \arr off0 s0 -> case f arr off0 s0 of
    (# s1, r #) -> g arr r s1

-- | Weaken the bound on the maximum number of bytes required. For example,
-- to use two builders with unequal bounds in a disjunctive setting:
--
-- > import qualified Arithmetic.Lte as Lte
-- >
-- > buildNumber :: Either Double Word64 -> Builder 32
-- > buildNumber = \case
-- >   Left d  -> doubleDec d
-- >   Right w -> weaken (Lte.constant @19 @32) (word64Dec w)
weaken :: forall m n. (m <= n) -> Builder m -> Builder n
weaken !_ (Builder f) = Builder f

-- | Replace the upper bound on size with an equal number.
substitute :: forall m n. (m :=: n) -> Builder m -> Builder n
substitute !_ (Builder f) = Builder f

-- | Encode a double-floating-point number, using decimal notation or
-- scientific notation depending on the magnitude. This has undefined
-- behavior when representing @+inf@, @-inf@, and @NaN@. It will not
-- crash, but the generated numbers will be nonsense.
doubleDec :: Double -> Builder 32
doubleDec (D# d) = Builder (\arr off0 s0 -> doubleDec# d arr off0 s0)

-- | Requires up to 19 bytes. Encodes an unsigned 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word64Dec :: Word64 -> Builder 19
word64Dec (W64# w) = wordCommonDec# w

-- | Requires up to 10 bytes. Encodes an unsigned 32-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word32Dec :: Word32 -> Builder 10
word32Dec (W32# w) = wordCommonDec# w

-- | Requires up to 5 bytes. Encodes an unsigned 16-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word16Dec :: Word16 -> Builder 5
word16Dec (W16# w) = wordCommonDec# w

-- | Requires up to 3 bytes. Encodes an unsigned 8-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
word8Dec :: Word8 -> Builder 3
word8Dec (W8# w) =
  -- We unroll the loop when encoding Word8s. This speeds things
  -- up IPv4 encoding by about 10% in the @ip@ library. We can
  -- encode Word8s at twice this speed by using a lookup table.
  -- However, I (Andrew Martin) am concerned that although lookup
  -- table perform very well in microbenchmarks, they can thrash
  -- L1 cache in real applications.
  word8Dec# w

-- | Requires up to 19 bytes. Encodes an unsigned machine-sized integer
-- as decimal. This encoding never starts with a zero unless the argument
-- was zero.
wordDec :: Word -> Builder 19
wordDec (W# w) = wordCommonDec# w

-- | Requires up to 20 bytes. Encodes a signed 64-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int64Dec :: Int64 -> Builder 20
int64Dec (I64# w) = intCommonDec# w

-- | Requires up to 11 bytes. Encodes a signed 32-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int32Dec :: Int32 -> Builder 11
int32Dec (I32# w) = intCommonDec# w

-- | Requires up to 6 bytes. Encodes a signed 16-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int16Dec :: Int16 -> Builder 6
int16Dec (I16# w) = intCommonDec# w

-- | Requires up to 4 bytes. Encodes a signed 8-bit integer as decimal.
-- This encoding never starts with a zero unless the argument was zero.
-- Negative numbers are preceded by a minus sign. Positive numbers
-- are not preceded by anything.
int8Dec :: Int8 -> Builder 4
int8Dec (I8# w) = intCommonDec# w

-- | Requires up to 20 bytes. Encodes a signed machine-sized integer
-- as decimal. This encoding never starts with a zero unless the
-- argument was zero. Negative numbers are preceded by a minus sign.
-- Positive numbers are not preceded by anything.
intDec :: Int -> Builder 20
intDec (I# w) = intCommonDec# w

word8Dec# :: Word# -> Builder 3
{-# noinline word8Dec# #-}
word8Dec# w# = Unsafe.construct $ \arr off0 -> do
  let !(I# off0# ) = off0
      !(!x,!ones) = quotRem w 10
      !(hundreds@(W# hundreds# ),tens@(W# tens# )) = quotRem x 10
  writeByteArray arr off0 (fromIntegral (hundreds + 0x30) :: Word8)
  let !hasHundreds = gtWord# hundreds# 0##
      !off1@(I# off1# ) = I# (off0# +# hasHundreds)
  writeByteArray arr off1 (fromIntegral (tens + 0x30) :: Word8)
  let !off2 = I# (off1# +# (orI# hasHundreds (gtWord# tens# 0## )))
  writeByteArray arr off2 (fromIntegral (ones + 0x30) :: Word8)
  pure (off2 + 1)
  where
  w = W# w#

-- Requires a number of bytes that is bounded by the size of
-- the word. This is only used internally.
wordCommonDec# :: Word# -> Builder n
{-# noinline wordCommonDec# #-}
wordCommonDec# w# = Unsafe.construct $ \arr off0 -> if w /= 0
  then internalWordLoop arr off0 (W# w#)
  else do
    writeByteArray arr off0 (c2w '0')
    pure (off0 + 1)
  where
  w = W64# w#

internalWordLoop :: MutableByteArray s -> Int -> Word -> ST s Int
{-# inline internalWordLoop #-}
internalWordLoop arr off0 x0 = go off0 x0 where
  go !off !(x :: Word) = if x > 0
    then do
      let (y,z) = quotRem x 10
      writeByteArray arr off (fromIntegral (z + 0x30) :: Word8)
      go (off + 1) y
    else do
      reverseBytes arr off0 (off - 1)
      pure off

-- Requires up to 20 bytes. Can be less depending on what the
-- size of the argument is known to be. Unsafe.
intCommonDec# :: Int# -> Builder n
{-# noinline intCommonDec# #-}
intCommonDec# w# = Unsafe.construct $ \arr off0 -> case compare w 0 of
  GT -> internalWordLoop arr off0 (fromIntegral w)
  EQ -> do
    writeByteArray arr off0 (c2w '0')
    pure (off0 + 1)
  LT -> do
    writeByteArray arr off0 (c2w '-')
    internalWordLoop arr (off0 + 1) (fromIntegral (negate w))
  where
  w = I64# w#

-- Convert a number between 0 and 16 to the ASCII
-- representation of its hexadecimal character.
-- The use of fromIntegral causes us to incur an
-- unneeded bitmask. This actually needs a Word64
-- argument.
toHexUpper :: Word -> Word8
toHexUpper w' = fromIntegral
    $ (complement theMask .&. loSolved)
  .|. (theMask .&. hiSolved)
  where
  w = w' .&. 0xF
  -- This is all ones if the value was >= 10
  theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1
  loSolved = w + 48
  hiSolved = w + 55

toHexLower :: Word -> Word8
toHexLower w' = fromIntegral
    $ (complement theMask .&. loSolved)
  .|. (theMask .&. hiSolved)
  where
  w = w' .&. 0xF
  -- This is all ones if the value was >= 10
  theMask = (1 .&. unsafeShiftR (w - 10) 63) - 1
  loSolved = w + 48
  hiSolved = w + 87

-- | Requires exactly 16 bytes. Encodes a 64-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 16 digits. This uses
-- uppercase for the alphabetical digits. For example, this encodes the
-- number 1022 as @00000000000003FE@.
word64PaddedUpperHex :: Word64 -> Builder 16
word64PaddedUpperHex (W64# w) = word64PaddedUpperHex# w

-- | Requires exactly 8 bytes. Encodes a 32-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 8 digits. This uses
-- uppercase for the alphabetical digits.
word32PaddedUpperHex :: Word32 -> Builder 8
word32PaddedUpperHex (W32# w) = word32PaddedUpperHex# w

-- | Requires exactly 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 4 digits. This uses
-- uppercase for the alphabetical digits.
--
-- >>> word16PaddedUpperHex 0xab0
-- 0AB0
word16PaddedUpperHex :: Word16 -> Builder 4
word16PaddedUpperHex (W16# w) = word16PaddedUpperHex# w

-- | Requires exactly 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 4 digits. This uses
-- lowercase for the alphabetical digits.
--
-- >>> word16PaddedLowerHex 0xab0
-- 0ab0
word16PaddedLowerHex :: Word16 -> Builder 4
word16PaddedLowerHex (W16# w) = word16PaddedLowerHex# w

-- | Requires at most 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. Letters are presented
-- in lowercase. If the number is zero, a single zero digit is used.
--
-- >>> word16LowerHex 0xab0
-- ab0
word16LowerHex :: Word16 -> Builder 4
word16LowerHex (W16# w) = word16LowerHex# w

-- | Requires at most 4 bytes. Encodes a 16-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. Letters are presented
-- in uppercase. If the number is zero, a single zero digit is used.
--
-- >>> word16UpperHex 0xab0
-- AB0
word16UpperHex :: Word16 -> Builder 4
word16UpperHex (W16# w) = word16UpperHex# w

-- | Requires at most 2 bytes. Encodes a 8-bit unsigned integer as
-- hexadecimal. No leading zeroes are displayed. If the number is zero,
-- a single zero digit is used.
word8LowerHex :: Word8 -> Builder 2
word8LowerHex (W8# w) = word8LowerHex# w

-- | Requires exactly 2 bytes. Encodes a 8-bit unsigned integer as
-- hexadecimal, zero-padding the encoding to 2 digits. This uses
-- uppercase for the alphabetical digits.
word8PaddedUpperHex :: Word8 -> Builder 2
word8PaddedUpperHex (W8# w) = word8PaddedUpperHex# w

-- TODO: Is it actually worth unrolling this loop. I suspect that it
-- might not be. Benchmark this.
word64PaddedUpperHex# :: Word# -> Builder 16
{-# noinline word64PaddedUpperHex# #-}
word64PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper (unsafeShiftR w 60))
  writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 56))
  writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 52))
  writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 48))
  writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 44))
  writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 40))
  writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 36))
  writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 32))
  writeByteArray arr (off + 8) (toHexUpper (unsafeShiftR w 28))
  writeByteArray arr (off + 9) (toHexUpper (unsafeShiftR w 24))
  writeByteArray arr (off + 10) (toHexUpper (unsafeShiftR w 20))
  writeByteArray arr (off + 11) (toHexUpper (unsafeShiftR w 16))
  writeByteArray arr (off + 12) (toHexUpper (unsafeShiftR w 12))
  writeByteArray arr (off + 13) (toHexUpper (unsafeShiftR w 8))
  writeByteArray arr (off + 14) (toHexUpper (unsafeShiftR w 4))
  writeByteArray arr (off + 15) (toHexUpper (unsafeShiftR w 0))
  pure (off + 16)
  where
  w = W# w#

word32PaddedUpperHex# :: Word# -> Builder 8
{-# noinline word32PaddedUpperHex# #-}
word32PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper (unsafeShiftR w 28))
  writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 24))
  writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 20))
  writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 16))
  writeByteArray arr (off + 4) (toHexUpper (unsafeShiftR w 12))
  writeByteArray arr (off + 5) (toHexUpper (unsafeShiftR w 8))
  writeByteArray arr (off + 6) (toHexUpper (unsafeShiftR w 4))
  writeByteArray arr (off + 7) (toHexUpper (unsafeShiftR w 0))
  pure (off + 8)
  where
  w = W# w#

-- Not sure if it is beneficial to inline this. We just let
-- GHC make the decision. Open an issue on github if this is
-- a problem.
word16PaddedUpperHex# :: Word# -> Builder 4
word16PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper (unsafeShiftR w 12))
  writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 8))
  writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 4))
  writeByteArray arr (off + 3) (toHexUpper (unsafeShiftR w 0))
  pure (off + 4)
  where
  w = W# w#

word16PaddedLowerHex# :: Word# -> Builder 4
word16PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexLower (unsafeShiftR w 12))
  writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 8))
  writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 4))
  writeByteArray arr (off + 3) (toHexLower (unsafeShiftR w 0))
  pure (off + 4)
  where
  w = W# w#

word12PaddedLowerHex# :: Word# -> Builder 3
word12PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexLower (unsafeShiftR w 8))
  writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 4))
  writeByteArray arr (off + 2) (toHexLower (unsafeShiftR w 0))
  pure (off + 3)
  where
  w = W# w#

word12PaddedUpperHex# :: Word# -> Builder 3
word12PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper (unsafeShiftR w 8))
  writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 4))
  writeByteArray arr (off + 2) (toHexUpper (unsafeShiftR w 0))
  pure (off + 3)
  where
  w = W# w#

-- Definitely want this to inline. It's maybe a dozen instructions total.
word8PaddedUpperHex# :: Word# -> Builder 2
{-# inline word8PaddedUpperHex# #-}
word8PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper (unsafeShiftR w 4))
  writeByteArray arr (off + 1) (toHexUpper (unsafeShiftR w 0))
  pure (off + 2)
  where
  w = W# w#

word8PaddedLowerHex# :: Word# -> Builder 2
{-# inline word8PaddedLowerHex# #-}
word8PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexLower (unsafeShiftR w 4))
  writeByteArray arr (off + 1) (toHexLower (unsafeShiftR w 0))
  pure (off + 2)
  where
  w = W# w#

word4PaddedLowerHex# :: Word# -> Builder 1
{-# inline word4PaddedLowerHex# #-}
word4PaddedLowerHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexLower w)
  pure (off + 1)
  where
  w = W# w#

word4PaddedUpperHex# :: Word# -> Builder 1
{-# inline word4PaddedUpperHex# #-}
word4PaddedUpperHex# w# = Unsafe.construct $ \arr off -> do
  writeByteArray arr off (toHexUpper w)
  pure (off + 1)
  where
  w = W# w#

word16UpperHex# :: Word# -> Builder 4
word16UpperHex# w#
  | w <= 0xF = weaken Lte.constant (word4PaddedUpperHex# w#)
  | w <= 0xFF = weaken Lte.constant (word8PaddedUpperHex# w#)
  | w <= 0xFFF = weaken Lte.constant (word12PaddedUpperHex# w#)
  | otherwise = word16PaddedUpperHex# w#
  where
  w = W# w#

word16LowerHex# :: Word# -> Builder 4
word16LowerHex# w#
  | w <= 0xF = weaken Lte.constant (word4PaddedLowerHex# w#)
  | w <= 0xFF = weaken Lte.constant (word8PaddedLowerHex# w#)
  | w <= 0xFFF = weaken Lte.constant (word12PaddedLowerHex# w#)
  | otherwise = word16PaddedLowerHex# w#
  where
  w = W# w#

-- Precondition: argument less than 256
word8LowerHex# :: Word# -> Builder 2
word8LowerHex# w#
  | w <= 0xF = weaken Lte.constant (word4PaddedLowerHex# w#)
  | otherwise = weaken Lte.constant (word8PaddedLowerHex# w#)
  where
  w = W# w#

-- | Encode an ASCII character.
-- Precondition: Input must be an ASCII character. This is not checked.
ascii :: Char -> Builder 1
ascii (C# c) = Unsafe.construct $ \(MutableByteArray arr) (I# off) -> do
  primitive_ (writeCharArray# arr off c)
  pure (I# (off +# 1# ))

-- | Encode a character as UTF-8. This only uses as much space as is required.
char :: Char -> Builder 4
char c
  | codepoint < 0x80 = Unsafe.construct $ \arr off -> do
      writeByteArray arr off (unsafeWordToWord8 codepoint)
      pure (off + 1)
  | codepoint < 0x800 = Unsafe.construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteTwoOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteTwoTwo codepoint))
      return (off + 2)
  | codepoint >= 0xD800 && codepoint < 0xE000 = Unsafe.construct $ \arr off -> do
      -- Codepoint U+FFFD
      writeByteArray arr off       (0xEF :: Word8)
      writeByteArray arr (off + 1) (0xBF :: Word8)
      writeByteArray arr (off + 2) (0xBD :: Word8)
      return (off + 3)
  | codepoint < 0x10000 = Unsafe.construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteThreeOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteThreeTwo codepoint))
      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteThreeThree codepoint))
      return (off + 3)
  | otherwise = Unsafe.construct $ \arr off -> do
      writeByteArray arr off       (unsafeWordToWord8 (byteFourOne codepoint))
      writeByteArray arr (off + 1) (unsafeWordToWord8 (byteFourTwo codepoint))
      writeByteArray arr (off + 2) (unsafeWordToWord8 (byteFourThree codepoint))
      writeByteArray arr (off + 3) (unsafeWordToWord8 (byteFourFour codepoint))
      return (off + 4)

  where
    codepoint :: Word
    codepoint = fromIntegral (ord c)

    unsafeWordToWord8 :: Word -> Word8
    unsafeWordToWord8 (W# w) = W8# w

    -- precondition: codepoint is less than 0x800
    byteTwoOne :: Word -> Word
    byteTwoOne w = unsafeShiftR w 6 .|. 0b11000000

    byteTwoTwo :: Word -> Word
    byteTwoTwo w = (w .&. 0b00111111) .|. 0b10000000

    -- precondition: codepoint is less than 0x1000
    byteThreeOne :: Word -> Word
    byteThreeOne w = unsafeShiftR w 12 .|. 0b11100000

    byteThreeTwo :: Word -> Word
    byteThreeTwo w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000

    byteThreeThree :: Word -> Word
    byteThreeThree w = (w .&. 0b00111111) .|. 0b10000000

    -- precondition: codepoint is less than 0x110000
    byteFourOne :: Word -> Word
    byteFourOne w = unsafeShiftR w 18 .|. 0b11110000

    byteFourTwo :: Word -> Word
    byteFourTwo w = (0b00111111 .&. unsafeShiftR w 12) .|. 0b10000000

    byteFourThree :: Word -> Word
    byteFourThree w = (0b00111111 .&. unsafeShiftR w 6) .|. 0b10000000

    byteFourFour :: Word -> Word
    byteFourFour w = (0b00111111 .&. w) .|. 0b10000000

int64BE :: Int64 -> Builder 8
int64BE (I64# i) = word64BE (W64# (int2Word# i))

int32BE :: Int32 -> Builder 4
int32BE (I32# i) = word32BE (W32# (int2Word# i))

int16BE :: Int16 -> Builder 2
int16BE (I16# i) = word16BE (W16# (int2Word# i))

int64LE :: Int64 -> Builder 8
int64LE (I64# i) = word64LE (W64# (int2Word# i))

int32LE :: Int32 -> Builder 4
int32LE (I32# i) = word32LE (W32# (int2Word# i))

int16LE :: Int16 -> Builder 2
int16LE (I16# i) = word16LE (W16# (int2Word# i))

-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a little-endian fashion.
word64LE :: Word64 -> Builder 8
word64LE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off + 7) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
  writeByteArray arr (off + 6) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
  writeByteArray arr (off + 5) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
  writeByteArray arr (off + 4) (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
  writeByteArray arr (off + 3) (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
  writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
  writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off    ) (fromIntegral @Word64 @Word8 w)
  pure (off + 8)

-- | Requires exactly 8 bytes. Dump the octets of a 64-bit
-- word in a big-endian fashion.
word64BE :: Word64 -> Builder 8
word64BE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off    ) (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
  writeByteArray arr (off + 1) (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
  writeByteArray arr (off + 2) (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
  writeByteArray arr (off + 3) (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
  writeByteArray arr (off + 4) (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
  writeByteArray arr (off + 5) (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
  writeByteArray arr (off + 6) (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off + 7) (fromIntegral @Word64 @Word8 w)
  pure (off + 8)

-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a little-endian fashion.
word32LE :: Word32 -> Builder 4
word32LE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off + 3) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
  writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
  writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off    ) (fromIntegral @Word32 @Word8 w)
  pure (off + 4)

-- | Requires exactly 4 bytes. Dump the octets of a 32-bit
-- word in a big-endian fashion.
word32BE :: Word32 -> Builder 4
word32BE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off    ) (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
  writeByteArray arr (off + 1) (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
  writeByteArray arr (off + 2) (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off + 3) (fromIntegral @Word32 @Word8 w)
  pure (off + 4)

-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a little-endian fashion.
word16LE :: Word16 -> Builder 2
word16LE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off    ) (fromIntegral @Word16 @Word8 w)
  pure (off + 2)

-- | Requires exactly 2 bytes. Dump the octets of a 16-bit
-- word in a big-endian fashion.
word16BE :: Word16 -> Builder 2
word16BE w = Unsafe.construct $ \arr off -> do
  writeByteArray arr (off    ) (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
  writeByteArray arr (off + 1) (fromIntegral @Word16 @Word8 w)
  pure (off + 2)

word8 :: Word8 -> Builder 1
word8 w = Unsafe.construct $ \arr off -> do
  writeByteArray arr off w
  pure (off + 1)

-- Reverse the bytes in the designated slice. This takes
-- an inclusive start offset and an inclusive end offset.
reverseBytes :: MutableByteArray s -> Int -> Int -> ST s ()
{-# inline reverseBytes #-}
reverseBytes arr begin end = go begin end where
  go ixA ixB = if ixA < ixB
    then do
      a :: Word8 <- readByteArray arr ixA
      b :: Word8 <- readByteArray arr ixB
      writeByteArray arr ixA b
      writeByteArray arr ixB a
      go (ixA + 1) (ixB - 1)
    else pure ()

c2w :: Char -> Word8
c2w = fromIntegral . ord

shrinkMutableByteArray :: MutableByteArray s -> Int -> ST s ()
shrinkMutableByteArray (MutableByteArray arr) (I# sz) =
  primitive_ (shrinkMutableByteArray# arr sz)

-- This is adapted from androider's code in https://stackoverflow.com/a/7097567
-- The checks for infinity and NaN have been removed. Note that this is a little
-- inaccurate. This is very visible when encoding a number like 2.25, which
-- is perfectly represented as an IEEE 754 floating point number but is goofed
-- up by this function.
-- If you modify this function, please take a took at the resulting core.
-- It currently performs no boxing at all, and it would be nice to keep
-- it that way.
doubleDec# :: forall s.
  Double# -> MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)
{-# noinline doubleDec# #-}
doubleDec# d# marr# off# s0 = unIntST s0 $ do
  let marr = MutableByteArray marr#
  let d0 = D# d#
  let off0 = I# off#
  if d0 == 0
    then do
      writeByteArray marr off0 (c2w '0')
      pure (off0 + 1)
    else do
      let neg = d0 < 0
      off1 <- if neg
        then do
          writeByteArray marr off0 (c2w '-')
          pure (off0 + 1)
        else pure off0
      let d1 = abs d0
      let mag0 = floor (logBase10 d1) :: Int
      let useExp = (mag0 >= 14 || (neg && mag0 >= 9) || mag0 <= (-9))
      -- This straightforward adaptation of the C code is awkward
      -- in Haskell. Binding the triple where mag1 might not even
      -- get used is strange.
      let !(!d2,!mag1,!mag0A) = if useExp
            then
              let mag0' = if mag0 < 0 then mag0 - 1 else mag0
               in (d1 / (10.0 ** fromIntegral @Int @Double mag0'), mag0', 0)
            else (d1,0,mag0)
      let mag0B = if mag0A < 1 then 0 else mag0A
      let goNum :: Double -> Int -> Int -> ST s Int
          goNum !dA0 !mag !offA0 = if (dA0 > doublePrecision || mag >= 0)
            then do
              let weight = 10.0 ** (fromIntegral @Int @Double mag)
              -- We should actually check weight with isinf here,
              -- but we do not.
              (dA1,offA1) <- if weight > 0
                then do
                  -- TODO: use a better floor function
                  let digit = ((floor :: Double -> Int) (dA0 / weight))
                  let discard = fromIntegral @Int @Double digit * weight
                  writeByteArray marr offA0
                    (fromIntegral @Int @Word8 (digit + ord '0'))
                  pure (dA0 - discard,offA0 + 1)
                else pure (dA0,offA0)
              offA2 <- if mag == 0 && dA1 > 0
                then do
                  writeByteArray marr offA1 (c2w '.')
                  pure (offA1 + 1)
                else pure offA1
              goNum dA1 (mag - 1) offA2
            else pure offA0
      !off2 <- goNum d2 mag0B off1
      off3 <- if useExp
        then do
          writeByteArray marr off2 (c2w 'e')
          !mag2 <- if mag1 > 0
            then do
              writeByteArray marr (off2 + 1) (c2w '+')
              pure mag1
            else do
              writeByteArray marr (off2 + 1) (c2w '-')
              pure (-mag1)
          let goMag !mag !off = if mag > 0
                then do
                  let (q,r) = quotRem mag 10
                  writeByteArray marr off (fromIntegral @Int @Word8 (ord '0' + r))
                  goMag q (off + 1)
                else pure off
          !off3 <- goMag mag2 (off2 + 2)
          reverseBytes marr (off2 + 2) (off3 - 1)
          pure off3
        else pure off2
      pure off3

doublePrecision :: Double
doublePrecision = 0.00000000000001

unIntST :: State# s -> ST s Int -> (# State# s, Int# #)
{-# inline unIntST #-}
unIntST s0 (ST f) = case f s0 of
  (# s1, I# i #) -> (# s1, i #)

-- This is slightly inaccurate. I think this can actually cause
-- problems in some situations. The log10 function from C would
-- be better. The inaccuracy here cause the logarithm to be slightly
-- larger than it should be. There might actually be a simple way to
-- fix this by just using recursion to compute it. We just floor the
-- result anyway. Hmm...
logBase10 :: Double -> Double
logBase10 d = log d / 2.30258509299