{-|
Description:    Character translation functions to and from standardized binary streams.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

This module and the internal branch it heads implement the
__[Encoding](https://encoding.spec.whatwg.org/)__ specification for translating
text to and from UTF-8 and a selection of less-favoured but grandfathered
encoding schemes.  As the standard authors' primary goal has been security
followed closely by compatibility with existing web pages, the algorithms
described and the names associated with them do not perfectly match the
descriptions originally given by the various original encoding specifications
themselves.
-}
module Web.Willow.Common.Encoding
    ( -- * Types
      Encoding ( .. )
    , DecoderState
    , decoderEncoding
    , decoderRemainder
    , ReparseData
    , EncoderState
      -- * Initialization
      -- ** Decoding
    , initialDecoderState
    , setEncodingCertain
    , setRemainder
      -- ** Encoding
    , initialEncoderState
      -- * Transformations
      -- ** Decoding
      -- $decode-bom
    , decode
    , decode'
    , byteOrderMark
    , finalizeDecode
    , finalizeDecode'
      -- *** UTF-8
    , decodeUtf8
    , decodeUtf8NoBom
    , decodeUtf8'
    , decodeUtf8NoBom'
      -- ** Encoding
    , encode
    , encode'
    , encodeUtf8
      -- ** Continuations
    , decodeStep
    , encodeStep
    , decodeStep'
    , encodeStep'
      -- * Internal
      -- $internal
    , InnerDecoderState
    , InnerEncoderState
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString.Builder as BS.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BS.L
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Either as E
import qualified Data.HashMap.Strict as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as T.L
import qualified Data.Text.Lazy.Builder as T.L.B
import qualified Data.Tuple.HT as U.HT

import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch
import Web.Willow.Common.Parser.Util

import qualified Web.Willow.Common.Encoding.Big5 as Big5
import qualified Web.Willow.Common.Encoding.EucJp as EucJp
import qualified Web.Willow.Common.Encoding.EucKr as EucKr
import qualified Web.Willow.Common.Encoding.GB as GB
import qualified Web.Willow.Common.Encoding.Iso2022Jp as Iso2022Jp
import qualified Web.Willow.Common.Encoding.ShiftJis as ShiftJis
import qualified Web.Willow.Common.Encoding.SingleByte as Single
import qualified Web.Willow.Common.Encoding.Utf8 as Utf8
import qualified Web.Willow.Common.Encoding.Utf16 as Utf16


-- | __Encoding:__
--      @[BOM sniff]
--      (https://encoding.spec.whatwg.org/#bom-sniff)@
-- 
-- Checks for a "byte-order mark" signature character in various encodings.  If
-- present, returns both the encoding found and the remainder of the stream,
-- otherwise returns the input unchanged.
byteOrderMark :: BS.ByteString -> (Maybe Encoding, BS.ByteString)
byteOrderMark input = maybe (Nothing, input) pack $ runParser marks input
  where pack = F.B.first Just
        marks = choice
            [ Utf8.byteOrderMark
            , Utf16.byteOrderMarkBigEndian
            , Utf16.byteOrderMarkLittleEndian
            ]


-- $decode-bom
-- The standard 'decode' and 'decode'' functions (and therefore the similar but
-- higher-level functions which build on it) defer to a byte-order mark over
-- the argument encoding.  If this behaviour isn't desired (i.e., you want to
-- force the parser to use the encoding, even if it's not appropriate), 
-- try to explicitly parse 'byteOrderMark' first:
-- 
-- @
-- (_, input') = 'byteOrderMark' input
-- Just text = 'decode' enc input'
-- @


-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into portable 'Char's.  If any byte sequences are meaningless or illegal,
-- they are returned verbatim for error reporting; a 'Left' should not be
-- parsed further.
-- 
-- See 'decodeStep' to decode only a minimal section, or 'decode'' for simple
-- error replacement.  Call 'finalizeDecode' on the returned 'DecoderState' if
-- no further bytes will be added to the document stream.
decode :: DecoderState -> BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decode state stream = case decodeStep stateBom' streamBom of
    (Nothing, state', _) -> ([], state')
    (Just out, state', stream') ->
        let (trail, state'') = decode state' stream'
        in  (out : trail, state'')
  where (stateBom, streamBom) = case byteOrderMark stream of
            (Just enc, trail) | useBom state == Just True ->
                (reparseBom enc $ initialDecoderState enc, trail)
            (Just enc, trail) | useBom state == Just False -> case decoderConfidence_ state of
                Tentative enc' _ | enc == enc' -> (reparseBom enc state, trail)
                Certain enc' | enc == enc' -> (reparseBom enc state, trail)
                _ -> (state, stream)
            _ -> (state, stream)
        reparseBom Utf8 state' = state'
            { decoderConfidence_ = recordBom [0xEF, 0xBB, 0xBF] $ decoderConfidence_ state'
            }
        reparseBom Utf16be state' = state'
            { decoderConfidence_ = recordBom [0xFE, 0xFF] $ decoderConfidence_ state'
            }
        reparseBom Utf16le state' = state'
            { decoderConfidence_ = recordBom [0xFF, 0xFE] $ decoderConfidence_ state'
            }
        reparseBom _ state' = state'
        recordBom bs (Tentative enc d') = Tentative enc $ d'
            { parsedChars = M.insert (BS.SH.pack bs) '\xFEFF' $ parsedChars d'
            }
        recordBom _ conf = conf
        stateBom' = stateBom
            { decoderConfidence_ = case decoderConfidence_ stateBom of
                Tentative enc d' -> Tentative enc $ d'
                    { streamStart = streamStart d' <> BS.L.fromStrict stream
                    }
                conf -> conf
            , useBom = Nothing
            }

-- | __Encoding:__
--      @[decode]
--      (https://encoding.spec.whatwg.org/#decode)@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into a portable 'T.Text'.  If any byte sequences are meaningless or
-- illegal, they are replaced with the Unicode replacement character @\\xFFFD@.
-- 
-- See 'decodeStep'' to decode only a minimal section, or 'decode' if the
-- original data should be retained for custom error reporting.  Call
-- 'finalizeDecode'' on the returned 'DecoderState' if no further bytes will be
-- added to the document stream.
decode' :: DecoderState -> BS.ByteString -> (T.Text, DecoderState)
decode' state stream = F.B.first
    (T.L.toStrict . T.L.B.toLazyText . mconcat . map (T.L.B.fromString . E.fromRight [replacementChar])) $
    decode state stream

-- | Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but
-- remains in the output).  Fails (returning a 'Left') if the stream contains
-- byte sequences which don't represent any character, or which encode a
-- surrogate character.
-- 
-- See 'decodeUtf8'' for simple error replacement, or 'decodeUtf8NoBom' if the
-- BOM should always be retained.
decodeUtf8 :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decodeUtf8 = decode . ignoreSameBom $ initialDecoderState Utf8
  where ignoreSameBom state = state
            { useBom = Just False
            }

-- | __Encoding:__
--      @[UTF-8 decode without BOM or fail]
--      (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- byte-order mark, it is kept as the first character of the output.  Fails
-- (returning a 'Left') if the stream contains byte sequences which don't
-- represent any character, or which encode a surrogate character.
-- 
-- See 'decodeUtf8NoBom'' for simple error replacement, or 'decodeUtf8'' if a
-- redundant UTF-8 BOM should be dropped.
decodeUtf8NoBom :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decodeUtf8NoBom = decode . ignoreBom $ initialDecoderState Utf8
  where ignoreBom state = state
            { useBom = Nothing
            }

-- | __Encoding:__
--      @[UTF-8 decode]
--      (https://encoding.spec.whatwg.org/#utf-8-decode)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but
-- remains in the output).  Any surrogate characters or invalid byte sequences
-- are replaced with the Unicode replacement character @\\xFFFD@.
-- 
-- See 'decodeUtf8' if the original data should be retained for custom error
-- reporting, or 'decodeUtf8NoBom'' if the BOM should always be retained.
decodeUtf8' :: BS.ByteString -> (T.Text, DecoderState)
decodeUtf8' = decode' . ignoreSameBom $ initialDecoderState Utf8
  where ignoreSameBom state = state
            { useBom = Just False
            }

-- | __Encoding:__
--      @[UTF-8 decode without BOM]
--      (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- byte-order mark, it is kept as the first character of the output.  Any
-- surrogate characters or invalid byte sequences are replaced with the Unicode
-- replacement character @\\xFFFD@.
-- 
-- See 'decodeUtf8NoBom' if the original data should be retained for custom
-- error reporting, or 'decodeUtf8'' if a redundant UTF-8 BOM should be
-- dropped.
decodeUtf8NoBom' :: BS.ByteString -> (T.Text, DecoderState)
decodeUtf8NoBom' = decode' . ignoreBom $ initialDecoderState Utf8
  where ignoreBom state = state
            { useBom = Nothing
            }


-- | The collection of data which, for any given encoding scheme, results in
-- behaviour according to the vanilla decoder before any bytes have been read.
initialDecoderState :: Encoding -> DecoderState
initialDecoderState enc = DecoderState
    { decoderConfidence_ = Certain enc
    , useBom = Just True
    , innerDecoderState = case enc of
        Iso2022Jp -> Iso2022DecoderState Iso2022Jp.defaultIso2022JpDecoderState
        Replacement -> ReplacementDecoderState defaultReplacementDecoderState
        _ -> SimpleDecoderState
    , remainderBytes = BS.SH.empty
    }


-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
-- 
-- See 'finalizeDecode'' for simple error replacement.
finalizeDecode :: DecoderState -> [Either BS.SH.ShortByteString String]
finalizeDecode state
    | BS.SH.null $ remainderBytes state = []
    | otherwise = [Left $ remainderBytes state]

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
-- 
-- See 'finalizeDecode' if the original data should be retained for custom
-- error reporting.
finalizeDecode' :: DecoderState -> T.Text
finalizeDecode' state
    | BS.SH.null $ remainderBytes state = T.empty
    | otherwise = T.singleton replacementChar


-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  If any byte
-- sequences are meaningless or illegal, they are returned verbatim for error
-- reporting; a 'Left' should not be parsed further.
-- 
-- See 'decode' to decode the entire string at once, or 'decodeStep'' for
-- simple error replacement.
decodeStep
    :: DecoderState
    -> BS.ByteString
    -> (Maybe (Either BS.SH.ShortByteString String), DecoderState, BS.ByteString)
decodeStep state stream = wrapOuter $ case innerDecoderState state of
    Iso2022DecoderState inner -> wrapInner Iso2022DecoderState <$>
        runParser' streamFull (decoderConfidence_ state, inner) Iso2022Jp.decoder
    ReplacementDecoderState inner -> wrapInner ReplacementDecoderState <$>
        runParser' streamFull (decoderConfidence_ state, inner) decoderReplacement
    SimpleDecoderState -> do
        let confidence = decoderConfidence_ state
        p <- M.lookup (confidenceEncoding confidence) decoders
        wrapInner (const SimpleDecoderState) <$>
            runParser' streamFull (confidence, ()) p
  where wrapInner f = U.HT.mapSnd3 $ F.B.second f
        wrapOuter (Just (out, (enc, innerState'), stream')) = (Just out, state', stream')
          where state' = state
                    { decoderConfidence_ = enc
                    , innerDecoderState = innerState'
                    , remainderBytes = BS.SH.empty
                    }
        wrapOuter Nothing = (Nothing, state', BS.empty)
          where state' = state
                    { remainderBytes = BS.SH.toShort stream
                    }
        streamFull = BS.SH.fromShort (remainderBytes state) <> stream

-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @replacement@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  Any byte
-- sequences which are meaningless or illegal are replaced with the Unicode
-- replacement character @\\xFFFD@.
-- 
-- See 'decode'' to decode the entire string at once, or 'decodeStep' if the
-- original data should be retained for custom error reporting.
decodeStep' :: DecoderState -> BS.ByteString -> (Maybe String, DecoderState, BS.ByteString)
decodeStep' state = U.HT.mapFst3 (fmap $ E.fromRight [replacementChar]) . decodeStep state


-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Given a character encoding scheme, transform a portable 'T.Text' into a
-- sequence of bytes representing those characters.  If the encoding scheme
-- does not define a binary representation for a character in the input, the
-- original 'Char' is returned unchanged for custom error reporting.
-- 
-- See 'encodeStep' to encode only a minimal section, or 'encode'' for escaping
-- with HTML-style character codes.
encode :: EncoderState -> T.Text -> ([Either Char BS.SH.ShortByteString], EncoderState)
encode state stream = case encodeStep state stream of
    Nothing -> ([], state)
    Just (out, state', stream') ->
        let (trail, state'') = encode state' stream'
        in  (out : trail, state'')

-- | __Encoding:__
--      @[encode]
--      (https://encoding.spec.whatwg.org/#encode)@
-- 
-- Given a character encoding scheme, transform a portable 'T.Text' into a
-- sequence of bytes representing those characters.  If the encoding scheme
-- does not define a binary representation for a character in the input, they
-- are replaced with an HTML-style escape (e.g. @"&#38;#0000;"@).
-- 
-- See 'encodeStep'' to encode only a minimal section, or 'encode' if the
-- original data should be retained for custom error reporting.
encode' :: EncoderState -> T.Text -> (BS.ByteString, EncoderState)
encode' state stream = F.B.first
    (BS.L.toStrict . BS.B.toLazyByteString . mconcat . map (BS.B.shortByteString . encodeReplacement)) $
    encode state stream

-- | __Encoding:__
--      @[UTF-8 encode]
--      (https://encoding.spec.whatwg.org/#utf-8-encode)@
-- 
-- Transform a portable 'T.Text' into a sequence of bytes according to the
-- UTF-8 encoding scheme.
encodeUtf8 :: T.Text -> (BS.ByteString, EncoderState)
encodeUtf8 = encode' $ initialEncoderState Utf8


-- | The collection of data which, for any given encoding scheme, results in
-- behaviour according to the vanilla decoder before any bytes have been read.
initialEncoderState :: Encoding -> EncoderState
initialEncoderState enc = EncoderState
    { encoderScheme = enc
    , innerEncoderState = if enc == Iso2022Jp
        then Iso2022EncoderState Iso2022Jp.defaultIso2022JpEncoderState
        else SimpleEncoderState
    }

-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Read the smallest number of characters from the head of the 'T.Text' which
-- would leave the encoder in a re-enterable state.  If the encoding scheme
-- does not define a binary representation for a character in the input, the
-- original 'Char' is returned unchanged for custom error reporting.
-- 
-- See 'encode' to decode the entire string at once, or 'encodeStep'' for
-- simple error replacement.
encodeStep
    :: EncoderState
    -> T.Text
    -> Maybe (Either Char BS.SH.ShortByteString, EncoderState, T.Text)
encodeStep state stream = wrapOuter <$> case innerEncoderState state of
    Iso2022EncoderState inner -> wrapInner Iso2022EncoderState <$>
        runParser' stream inner Iso2022Jp.encoder
    SimpleEncoderState -> do
        p <- M.lookup (encoderScheme state) encoders
        (out, (), t) <- runParser' stream () p
        return (out, SimpleEncoderState, t)
  where wrapInner = U.HT.mapSnd3
        wrapOuter (out, innerState', stream') = (out, state', stream')
          where state' = state
                    { innerEncoderState = innerState'
                    }

-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @html@
-- 
-- Read the smallest number of characters from the head of the 'T.Text' which
-- would leave the encoder in a re-enterable state.  If the encoding scheme
-- does not define a binary representation for a character in the input, they
-- are replaced with an HTML-style escape (e.g. @"&#38;#0000;"@).
-- 
-- See 'encode'' to encode the entire string at once, or 'encodeStep' if the
-- original data should be retained for custom error reporting.
encodeStep' :: EncoderState -> T.Text -> Maybe (BS.SH.ShortByteString, EncoderState, T.Text)
encodeStep' state stream = U.HT.mapFst3 encodeReplacement <$> encodeStep state stream


-- | Abstract the common core to both the decoder and the encoder: end the
-- parser if the input has been exhausted, optionally recover if the input is
-- meaningless, and otherwise flatten the nested tuples in the result.
runParser'
    :: stream
    -> state
    -> N.S.StateT state (ParserT stream Maybe) out
    -> Maybe (out, state, stream)
runParser' stream state p' = do
    ((out, state'), stream') <- runParserT (N.S.runStateT p' state) stream
    return (out, state', stream')


-- | Convert a 'Char' which fails to parse into an HTML-style numeric escape,
-- and try parsing that string instead.
encodeReplacement :: Either Char BS.SH.ShortByteString -> BS.SH.ShortByteString
encodeReplacement = either recovery id
  where recovery err = BS.SH.pack . map (fromIntegral . fromEnum) $ "&#" ++ show (fromEnum err) ++ ";"


-- $internal
-- These types will almost certainly not be useful for anyone using the
-- library, and are exported purely for internal usage.  They can be safely
-- ignored.  Note, however, that they may be removed without warning.


-- | The union of all state variables tracked by the bytes-to-'Char' decoding
-- algorithm of a single encoding scheme.
data InnerDecoderState
    = SimpleDecoderState
        -- ^ Null constructor for encodings which don't require persistant
        -- state in the decoding algorithm.
    | Iso2022DecoderState Iso2022Jp.Iso2022JpDecoderState
        -- ^ The data used by the 'Iso2022Jp' encoding scheme.
    | ReplacementDecoderState ReplacementDecoderState
        -- ^ The data used by the 'Replacement' encoding scheme.
  deriving ( Eq, Show, Read )

-- | The union of all state variables tracked by the 'Char'-to-bytes encoding
-- algorithm of a single encoding scheme.
data InnerEncoderState
    = SimpleEncoderState
        -- ^ Null constructor for encodings which don't require persistant
        -- state in the decoding algorithm.
    | Iso2022EncoderState Iso2022Jp.Iso2022JpEncoderState
        -- ^ The data used by the 'Iso2022Jp' encoding scheme.
  deriving ( Eq, Show, Read )


-- | The registry of 'Encoding' schemes to their byte-to-character
-- algorithms.
decoders :: M.HashMap Encoding TextBuilder
decoders = M.fromList $
    [ (Utf8, Utf8.decoder)
    , (Utf16be, Utf16.decoderBigEndian)
    , (Utf16le, Utf16.decoderLittleEndian)
    , (Big5, Big5.decoder)
    , (EucJp, EucJp.decoder)
    , (EucKr, EucKr.decoder)
    , (Gb18030, GB.decoder)
    , (Gbk, GB.decoder)
    , (ShiftJis, ShiftJis.decoder)
    , (UserDefined, decoderUserDefined)
    ] ++
    [ (enc, Single.decoder enc)
    | enc <- Single.encodings
    ]

-- | The registry of 'Encoding' schemes to their character-to-byte
-- algorithms.
encoders :: M.HashMap Encoding BinaryBuilder
encoders = M.fromList $
    -- 'Replacement', 'Utf16be', and 'Utf16le' are deliberately not included as
    -- per the specification.
    [ (Utf8, Utf8.encoder)
    , (Big5, Big5.encoder)
    , (EucJp, EucJp.encoder)
    , (EucKr, EucKr.encoder)
    , (Gb18030, GB.encoderGb18030)
    , (Gbk, GB.encoderGbk)
    , (ShiftJis, ShiftJis.encoder)
    , (UserDefined, encoderUserDefined)
    ] ++
    [ (enc, Single.encoder enc)
    | enc <- Single.encodings
    ]


-- | __Encoding:__
--      @[replacement decoder]
--      (https://encoding.spec.whatwg.org/#replacement-decoder)@
-- 
-- Return a single @\\xFFFD@ replacement character for the entire input stream.
decoderReplacement :: StateTextBuilder ReplacementDecoderState
decoderReplacement = do
    state <- getDecoderState
    if state
        then A.empty
        else modifyDecoderState (const True) *> next >>= \b -> emit [b] replacementChar

-- | Track whether the single allowed replacement character has already been
-- emitted.
type ReplacementDecoderState = Bool

-- | The default initial state to kickstart the 'Replacement' decoder.
defaultReplacementDecoderState :: ReplacementDecoderState
defaultReplacementDecoderState = False


-- | __Encoding:__
--      @[x-user-defined decoder]
--      (https://encoding.spec.whatwg.org/#x-user-defined-decoder)@
-- 
-- Remap bytes above the ASCII range to characters in the Private Use Area.
decoderUserDefined :: TextBuilder
decoderUserDefined = next >>= switch
    [ If (<= 0x7F) toUnicode1
    , Else $ \b -> emit [b] . toEnum . (+ 0xF700) $ fromIntegral b
    ]

-- | __Encoding:__
--      @[x-user-defined encoder]
--      (https://encoding.spec.whatwg.org/#x-user-defined-encoder)@
-- 
-- Remap a few characters in the Private Use Area to single-byte values just
-- above the ASCII characters.  This is of very limited value except as an
-- inverse for @decoderUserDefined@.
encoderUserDefined :: BinaryBuilder
encoderUserDefined = next >>= switch
    [ If C.isAscii fromAscii
    , If (range '\xF780' '\xF7FF') $
        return . pure . BS.SH.pack . (: []) . fromIntegral . subtract 0xF700 . fromEnum
    , Else encoderFailure
    ]


-- | Store the given binary sequence as unparsable without further input, to be
-- prepended to the beginning of stream on the next 'decode' or 'decode'' call.
setRemainder :: BS.SH.ShortByteString -> DecoderState -> DecoderState
setRemainder bs state = state
    { remainderBytes = bs
    }

-- | Retrieve the encoding scheme currently used by the decoder to decode the
-- binary document stream.
decoderEncoding :: DecoderState -> Encoding
decoderEncoding = confidenceEncoding . decoderConfidence_

-- | Instruct the decoder that the binary document stream is /known/ to be in
-- the certain encoding.
setEncodingCertain :: Encoding -> DecoderState -> DecoderState
setEncodingCertain enc state = state
    { decoderConfidence_ = Certain enc
    }

-- | Any leftover bytes at the end of the binary stream, which require further
-- input to be processed in order to correctly map to a character or error
-- value.
decoderRemainder :: DecoderState -> BS.SH.ShortByteString
decoderRemainder = remainderBytes