{-|
Description:    Character translation functions to and from standardized binary streams.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

This module and the internal branch it heads implement the
__[Encoding](https://encoding.spec.whatwg.org/)__ specification for translating
text to and from UTF-8 and a selection of less-favoured but grandfathered
encoding schemes.  As the standard authors' primary goal has been security
followed closely by compatibility with existing web pages, the algorithms
described and the names associated with them do not perfectly match the
descriptions originally given by the various original encoding specifications
themselves.
-}
module Web.Willow.Common.Encoding
    ( -- * Types
      Encoding ( .. )
    , DecoderState
    , decoderEncoding
    , decoderRemainder
    , ReparseData
    , EncoderState
      -- * Initialization
      -- ** Decoding
    , initialDecoderState
    , setEncodingCertain
    , setRemainder
      -- ** Encoding
    , initialEncoderState
      -- * Transformations
      -- ** Decoding
      -- $decode-bom
    , decode
    , decode'
    , byteOrderMark
    , finalizeDecode
    , finalizeDecode'
      -- *** UTF-8
    , decodeUtf8
    , decodeUtf8NoBom
    , decodeUtf8'
    , decodeUtf8NoBom'
      -- ** Encoding
    , encode
    , encode'
    , encodeUtf8
      -- ** Continuations
    , decodeStep
    , encodeStep
    , decodeStep'
    , encodeStep'
      -- * Internal
      -- $internal
    , InnerDecoderState
    , InnerEncoderState
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString.Builder as BS.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BS.L
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Either as E
import qualified Data.HashMap.Strict as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as T.L
import qualified Data.Text.Lazy.Builder as T.L.B
import qualified Data.Tuple.HT as U.HT

import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch
import Web.Willow.Common.Parser.Util

import qualified Web.Willow.Common.Encoding.Big5 as Big5
import qualified Web.Willow.Common.Encoding.EucJp as EucJp
import qualified Web.Willow.Common.Encoding.EucKr as EucKr
import qualified Web.Willow.Common.Encoding.GB as GB
import qualified Web.Willow.Common.Encoding.Iso2022Jp as Iso2022Jp
import qualified Web.Willow.Common.Encoding.ShiftJis as ShiftJis
import qualified Web.Willow.Common.Encoding.SingleByte as Single
import qualified Web.Willow.Common.Encoding.Utf8 as Utf8
import qualified Web.Willow.Common.Encoding.Utf16 as Utf16


-- | __Encoding:__
--      @[BOM sniff]
--      (https://encoding.spec.whatwg.org/#bom-sniff)@
-- 
-- Checks for a "byte-order mark" signature character in various encodings.  If
-- present, returns both the encoding found and the remainder of the stream,
-- otherwise returns the input unchanged.
byteOrderMark :: BS.ByteString -> (Maybe Encoding, BS.ByteString)
byteOrderMark :: ByteString -> (Maybe Encoding, ByteString)
byteOrderMark ByteString
input = (Maybe Encoding, ByteString)
-> ((Encoding, ByteString) -> (Maybe Encoding, ByteString))
-> Maybe (Encoding, ByteString)
-> (Maybe Encoding, ByteString)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Maybe Encoding
forall a. Maybe a
Nothing, ByteString
input) (Encoding, ByteString) -> (Maybe Encoding, ByteString)
forall a c. (a, c) -> (Maybe a, c)
pack (Maybe (Encoding, ByteString) -> (Maybe Encoding, ByteString))
-> Maybe (Encoding, ByteString) -> (Maybe Encoding, ByteString)
forall a b. (a -> b) -> a -> b
$ Parser ByteString Encoding
-> ByteString -> Maybe (Encoding, ByteString)
forall stream out.
Parser stream out -> stream -> Maybe (out, stream)
runParser Parser ByteString Encoding
marks ByteString
input
  where pack :: (a, c) -> (Maybe a, c)
pack = (a -> Maybe a) -> (a, c) -> (Maybe a, c)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first a -> Maybe a
forall a. a -> Maybe a
Just
        marks :: Parser ByteString Encoding
marks = [Parser ByteString Encoding] -> Parser ByteString Encoding
forall (m :: * -> *) a. Alternative m => [m a] -> m a
choice
            [ Parser ByteString Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf8.byteOrderMark
            , Parser ByteString Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf16.byteOrderMarkBigEndian
            , Parser ByteString Encoding
forall (gather :: * -> *).
(Alternative gather, Monad gather) =>
ParserT ByteString gather Encoding
Utf16.byteOrderMarkLittleEndian
            ]


-- $decode-bom
-- The standard 'decode' and 'decode'' functions (and therefore the similar but
-- higher-level functions which build on it) defer to a byte-order mark over
-- the argument encoding.  If this behaviour isn't desired (i.e., you want to
-- force the parser to use the encoding, even if it's not appropriate), 
-- try to explicitly parse 'byteOrderMark' first:
-- 
-- @
-- (_, input') = 'byteOrderMark' input
-- Just text = 'decode' enc input'
-- @


-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into portable 'Char's.  If any byte sequences are meaningless or illegal,
-- they are returned verbatim for error reporting; a 'Left' should not be
-- parsed further.
-- 
-- See 'decodeStep' to decode only a minimal section, or 'decode'' for simple
-- error replacement.  Call 'finalizeDecode' on the returned 'DecoderState' if
-- no further bytes will be added to the document stream.
decode :: DecoderState -> BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decode :: DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode DecoderState
state ByteString
stream = case DecoderState
-> ByteString
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
decodeStep DecoderState
stateBom' ByteString
streamBom of
    (Maybe (Either ShortByteString String)
Nothing, DecoderState
state', ByteString
_) -> ([], DecoderState
state')
    (Just Either ShortByteString String
out, DecoderState
state', ByteString
stream') ->
        let ([Either ShortByteString String]
trail, DecoderState
state'') = DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode DecoderState
state' ByteString
stream'
        in  (Either ShortByteString String
out Either ShortByteString String
-> [Either ShortByteString String]
-> [Either ShortByteString String]
forall a. a -> [a] -> [a]
: [Either ShortByteString String]
trail, DecoderState
state'')
  where (DecoderState
stateBom, ByteString
streamBom) = case ByteString -> (Maybe Encoding, ByteString)
byteOrderMark ByteString
stream of
            (Just Encoding
enc, ByteString
trail) | DecoderState -> Maybe Bool
useBom DecoderState
state Maybe Bool -> Maybe Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
True ->
                (Encoding -> DecoderState -> DecoderState
reparseBom Encoding
enc (DecoderState -> DecoderState) -> DecoderState -> DecoderState
forall a b. (a -> b) -> a -> b
$ Encoding -> DecoderState
initialDecoderState Encoding
enc, ByteString
trail)
            (Just Encoding
enc, ByteString
trail) | DecoderState -> Maybe Bool
useBom DecoderState
state Maybe Bool -> Maybe Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False -> case DecoderState -> Confidence
decoderConfidence_ DecoderState
state of
                Tentative Encoding
enc' ReparseData
_ | Encoding
enc Encoding -> Encoding -> Bool
forall a. Eq a => a -> a -> Bool
== Encoding
enc' -> (Encoding -> DecoderState -> DecoderState
reparseBom Encoding
enc DecoderState
state, ByteString
trail)
                Certain Encoding
enc' | Encoding
enc Encoding -> Encoding -> Bool
forall a. Eq a => a -> a -> Bool
== Encoding
enc' -> (Encoding -> DecoderState -> DecoderState
reparseBom Encoding
enc DecoderState
state, ByteString
trail)
                Confidence
_ -> (DecoderState
state, ByteString
stream)
            (Maybe Encoding, ByteString)
_ -> (DecoderState
state, ByteString
stream)
        reparseBom :: Encoding -> DecoderState -> DecoderState
reparseBom Encoding
Utf8 DecoderState
state' = DecoderState
state'
            { decoderConfidence_ :: Confidence
decoderConfidence_ = [Word8] -> Confidence -> Confidence
recordBom [Word8
0xEF, Word8
0xBB, Word8
0xBF] (Confidence -> Confidence) -> Confidence -> Confidence
forall a b. (a -> b) -> a -> b
$ DecoderState -> Confidence
decoderConfidence_ DecoderState
state'
            }
        reparseBom Encoding
Utf16be DecoderState
state' = DecoderState
state'
            { decoderConfidence_ :: Confidence
decoderConfidence_ = [Word8] -> Confidence -> Confidence
recordBom [Word8
0xFE, Word8
0xFF] (Confidence -> Confidence) -> Confidence -> Confidence
forall a b. (a -> b) -> a -> b
$ DecoderState -> Confidence
decoderConfidence_ DecoderState
state'
            }
        reparseBom Encoding
Utf16le DecoderState
state' = DecoderState
state'
            { decoderConfidence_ :: Confidence
decoderConfidence_ = [Word8] -> Confidence -> Confidence
recordBom [Word8
0xFF, Word8
0xFE] (Confidence -> Confidence) -> Confidence -> Confidence
forall a b. (a -> b) -> a -> b
$ DecoderState -> Confidence
decoderConfidence_ DecoderState
state'
            }
        reparseBom Encoding
_ DecoderState
state' = DecoderState
state'
        recordBom :: [Word8] -> Confidence -> Confidence
recordBom [Word8]
bs (Tentative Encoding
enc ReparseData
d') = Encoding -> ReparseData -> Confidence
Tentative Encoding
enc (ReparseData -> Confidence) -> ReparseData -> Confidence
forall a b. (a -> b) -> a -> b
$ ReparseData
d'
            { parsedChars :: HashMap ShortByteString Char
parsedChars = ShortByteString
-> Char
-> HashMap ShortByteString Char
-> HashMap ShortByteString Char
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert ([Word8] -> ShortByteString
BS.SH.pack [Word8]
bs) Char
'\xFEFF' (HashMap ShortByteString Char -> HashMap ShortByteString Char)
-> HashMap ShortByteString Char -> HashMap ShortByteString Char
forall a b. (a -> b) -> a -> b
$ ReparseData -> HashMap ShortByteString Char
parsedChars ReparseData
d'
            }
        recordBom [Word8]
_ Confidence
conf = Confidence
conf
        stateBom' :: DecoderState
stateBom' = DecoderState
stateBom
            { decoderConfidence_ :: Confidence
decoderConfidence_ = case DecoderState -> Confidence
decoderConfidence_ DecoderState
stateBom of
                Tentative Encoding
enc ReparseData
d' -> Encoding -> ReparseData -> Confidence
Tentative Encoding
enc (ReparseData -> Confidence) -> ReparseData -> Confidence
forall a b. (a -> b) -> a -> b
$ ReparseData
d'
                    { streamStart :: ByteString
streamStart = ReparseData -> ByteString
streamStart ReparseData
d' ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString -> ByteString
BS.L.fromStrict ByteString
stream
                    }
                Confidence
conf -> Confidence
conf
            , useBom :: Maybe Bool
useBom = Maybe Bool
forall a. Maybe a
Nothing
            }

-- | __Encoding:__
--      @[decode]
--      (https://encoding.spec.whatwg.org/#decode)@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into a portable 'T.Text'.  If any byte sequences are meaningless or
-- illegal, they are replaced with the Unicode replacement character @\\xFFFD@.
-- 
-- See 'decodeStep'' to decode only a minimal section, or 'decode' if the
-- original data should be retained for custom error reporting.  Call
-- 'finalizeDecode'' on the returned 'DecoderState' if no further bytes will be
-- added to the document stream.
decode' :: DecoderState -> BS.ByteString -> (T.Text, DecoderState)
decode' :: DecoderState -> ByteString -> (Text, DecoderState)
decode' DecoderState
state ByteString
stream = ([Either ShortByteString String] -> Text)
-> ([Either ShortByteString String], DecoderState)
-> (Text, DecoderState)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first
    (Text -> Text
T.L.toStrict (Text -> Text)
-> ([Either ShortByteString String] -> Text)
-> [Either ShortByteString String]
-> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> Text
T.L.B.toLazyText (Builder -> Text)
-> ([Either ShortByteString String] -> Builder)
-> [Either ShortByteString String]
-> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat ([Builder] -> Builder)
-> ([Either ShortByteString String] -> [Builder])
-> [Either ShortByteString String]
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either ShortByteString String -> Builder)
-> [Either ShortByteString String] -> [Builder]
forall a b. (a -> b) -> [a] -> [b]
map (String -> Builder
T.L.B.fromString (String -> Builder)
-> (Either ShortByteString String -> String)
-> Either ShortByteString String
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Either ShortByteString String -> String
forall b a. b -> Either a b -> b
E.fromRight [Char
replacementChar])) (([Either ShortByteString String], DecoderState)
 -> (Text, DecoderState))
-> ([Either ShortByteString String], DecoderState)
-> (Text, DecoderState)
forall a b. (a -> b) -> a -> b
$
    DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode DecoderState
state ByteString
stream

-- | Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but
-- remains in the output).  Fails (returning a 'Left') if the stream contains
-- byte sequences which don't represent any character, or which encode a
-- surrogate character.
-- 
-- See 'decodeUtf8'' for simple error replacement, or 'decodeUtf8NoBom' if the
-- BOM should always be retained.
decodeUtf8 :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decodeUtf8 :: ByteString -> ([Either ShortByteString String], DecoderState)
decodeUtf8 = DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode (DecoderState
 -> ByteString -> ([Either ShortByteString String], DecoderState))
-> (DecoderState -> DecoderState)
-> DecoderState
-> ByteString
-> ([Either ShortByteString String], DecoderState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState -> DecoderState
ignoreSameBom (DecoderState
 -> ByteString -> ([Either ShortByteString String], DecoderState))
-> DecoderState
-> ByteString
-> ([Either ShortByteString String], DecoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> DecoderState
initialDecoderState Encoding
Utf8
  where ignoreSameBom :: DecoderState -> DecoderState
ignoreSameBom DecoderState
state = DecoderState
state
            { useBom :: Maybe Bool
useBom = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
            }

-- | __Encoding:__
--      @[UTF-8 decode without BOM or fail]
--      (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- byte-order mark, it is kept as the first character of the output.  Fails
-- (returning a 'Left') if the stream contains byte sequences which don't
-- represent any character, or which encode a surrogate character.
-- 
-- See 'decodeUtf8NoBom'' for simple error replacement, or 'decodeUtf8'' if a
-- redundant UTF-8 BOM should be dropped.
decodeUtf8NoBom :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState)
decodeUtf8NoBom :: ByteString -> ([Either ShortByteString String], DecoderState)
decodeUtf8NoBom = DecoderState
-> ByteString -> ([Either ShortByteString String], DecoderState)
decode (DecoderState
 -> ByteString -> ([Either ShortByteString String], DecoderState))
-> (DecoderState -> DecoderState)
-> DecoderState
-> ByteString
-> ([Either ShortByteString String], DecoderState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState -> DecoderState
ignoreBom (DecoderState
 -> ByteString -> ([Either ShortByteString String], DecoderState))
-> DecoderState
-> ByteString
-> ([Either ShortByteString String], DecoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> DecoderState
initialDecoderState Encoding
Utf8
  where ignoreBom :: DecoderState -> DecoderState
ignoreBom DecoderState
state = DecoderState
state
            { useBom :: Maybe Bool
useBom = Maybe Bool
forall a. Maybe a
Nothing
            }

-- | __Encoding:__
--      @[UTF-8 decode]
--      (https://encoding.spec.whatwg.org/#utf-8-decode)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but
-- remains in the output).  Any surrogate characters or invalid byte sequences
-- are replaced with the Unicode replacement character @\\xFFFD@.
-- 
-- See 'decodeUtf8' if the original data should be retained for custom error
-- reporting, or 'decodeUtf8NoBom'' if the BOM should always be retained.
decodeUtf8' :: BS.ByteString -> (T.Text, DecoderState)
decodeUtf8' :: ByteString -> (Text, DecoderState)
decodeUtf8' = DecoderState -> ByteString -> (Text, DecoderState)
decode' (DecoderState -> ByteString -> (Text, DecoderState))
-> (DecoderState -> DecoderState)
-> DecoderState
-> ByteString
-> (Text, DecoderState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState -> DecoderState
ignoreSameBom (DecoderState -> ByteString -> (Text, DecoderState))
-> DecoderState -> ByteString -> (Text, DecoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> DecoderState
initialDecoderState Encoding
Utf8
  where ignoreSameBom :: DecoderState -> DecoderState
ignoreSameBom DecoderState
state = DecoderState
state
            { useBom :: Maybe Bool
useBom = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False
            }

-- | __Encoding:__
--      @[UTF-8 decode without BOM]
--      (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)@
-- 
-- Read a binary stream of UTF-8 encoded text.  If the stream begins with a
-- byte-order mark, it is kept as the first character of the output.  Any
-- surrogate characters or invalid byte sequences are replaced with the Unicode
-- replacement character @\\xFFFD@.
-- 
-- See 'decodeUtf8NoBom' if the original data should be retained for custom
-- error reporting, or 'decodeUtf8'' if a redundant UTF-8 BOM should be
-- dropped.
decodeUtf8NoBom' :: BS.ByteString -> (T.Text, DecoderState)
decodeUtf8NoBom' :: ByteString -> (Text, DecoderState)
decodeUtf8NoBom' = DecoderState -> ByteString -> (Text, DecoderState)
decode' (DecoderState -> ByteString -> (Text, DecoderState))
-> (DecoderState -> DecoderState)
-> DecoderState
-> ByteString
-> (Text, DecoderState)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState -> DecoderState
ignoreBom (DecoderState -> ByteString -> (Text, DecoderState))
-> DecoderState -> ByteString -> (Text, DecoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> DecoderState
initialDecoderState Encoding
Utf8
  where ignoreBom :: DecoderState -> DecoderState
ignoreBom DecoderState
state = DecoderState
state
            { useBom :: Maybe Bool
useBom = Maybe Bool
forall a. Maybe a
Nothing
            }


-- | The collection of data which, for any given encoding scheme, results in
-- behaviour according to the vanilla decoder before any bytes have been read.
initialDecoderState :: Encoding -> DecoderState
initialDecoderState :: Encoding -> DecoderState
initialDecoderState Encoding
enc = DecoderState :: Confidence
-> Maybe Bool
-> InnerDecoderState
-> ShortByteString
-> DecoderState
DecoderState
    { decoderConfidence_ :: Confidence
decoderConfidence_ = Encoding -> Confidence
Certain Encoding
enc
    , useBom :: Maybe Bool
useBom = Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
True
    , innerDecoderState :: InnerDecoderState
innerDecoderState = case Encoding
enc of
        Encoding
Iso2022Jp -> Iso2022JpDecoderState -> InnerDecoderState
Iso2022DecoderState Iso2022JpDecoderState
Iso2022Jp.defaultIso2022JpDecoderState
        Encoding
Replacement -> Bool -> InnerDecoderState
ReplacementDecoderState Bool
defaultReplacementDecoderState
        Encoding
_ -> InnerDecoderState
SimpleDecoderState
    , remainderBytes :: ShortByteString
remainderBytes = ShortByteString
BS.SH.empty
    }


-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
-- 
-- See 'finalizeDecode'' for simple error replacement.
finalizeDecode :: DecoderState -> [Either BS.SH.ShortByteString String]
finalizeDecode :: DecoderState -> [Either ShortByteString String]
finalizeDecode DecoderState
state
    | ShortByteString -> Bool
BS.SH.null (ShortByteString -> Bool) -> ShortByteString -> Bool
forall a b. (a -> b) -> a -> b
$ DecoderState -> ShortByteString
remainderBytes DecoderState
state = []
    | Bool
otherwise = [ShortByteString -> Either ShortByteString String
forall a b. a -> Either a b
Left (ShortByteString -> Either ShortByteString String)
-> ShortByteString -> Either ShortByteString String
forall a b. (a -> b) -> a -> b
$ DecoderState -> ShortByteString
remainderBytes DecoderState
state]

-- | Explicitly indicate that the input stream will not contain any further
-- bytes, and perform any finalization processing based on that.
-- 
-- See 'finalizeDecode' if the original data should be retained for custom
-- error reporting.
finalizeDecode' :: DecoderState -> T.Text
finalizeDecode' :: DecoderState -> Text
finalizeDecode' DecoderState
state
    | ShortByteString -> Bool
BS.SH.null (ShortByteString -> Bool) -> ShortByteString -> Bool
forall a b. (a -> b) -> a -> b
$ DecoderState -> ShortByteString
remainderBytes DecoderState
state = Text
T.empty
    | Bool
otherwise = Char -> Text
T.singleton Char
replacementChar


-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  If any byte
-- sequences are meaningless or illegal, they are returned verbatim for error
-- reporting; a 'Left' should not be parsed further.
-- 
-- See 'decode' to decode the entire string at once, or 'decodeStep'' for
-- simple error replacement.
decodeStep
    :: DecoderState
    -> BS.ByteString
    -> (Maybe (Either BS.SH.ShortByteString String), DecoderState, BS.ByteString)
decodeStep :: DecoderState
-> ByteString
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
decodeStep DecoderState
state ByteString
stream = Maybe
  (Either ShortByteString String, (Confidence, InnerDecoderState),
   ByteString)
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
forall a.
Maybe (a, (Confidence, InnerDecoderState), ByteString)
-> (Maybe a, DecoderState, ByteString)
wrapOuter (Maybe
   (Either ShortByteString String, (Confidence, InnerDecoderState),
    ByteString)
 -> (Maybe (Either ShortByteString String), DecoderState,
     ByteString))
-> Maybe
     (Either ShortByteString String, (Confidence, InnerDecoderState),
      ByteString)
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
forall a b. (a -> b) -> a -> b
$ case DecoderState -> InnerDecoderState
innerDecoderState DecoderState
state of
    Iso2022DecoderState Iso2022JpDecoderState
inner -> (Iso2022JpDecoderState -> InnerDecoderState)
-> (Either ShortByteString String,
    (Confidence, Iso2022JpDecoderState), ByteString)
-> (Either ShortByteString String, (Confidence, InnerDecoderState),
    ByteString)
forall (p :: * -> * -> *) b c a a c.
Bifunctor p =>
(b -> c) -> (a, p a b, c) -> (a, p a c, c)
wrapInner Iso2022JpDecoderState -> InnerDecoderState
Iso2022DecoderState ((Either ShortByteString String,
  (Confidence, Iso2022JpDecoderState), ByteString)
 -> (Either ShortByteString String, (Confidence, InnerDecoderState),
     ByteString))
-> Maybe
     (Either ShortByteString String,
      (Confidence, Iso2022JpDecoderState), ByteString)
-> Maybe
     (Either ShortByteString String, (Confidence, InnerDecoderState),
      ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
        ByteString
-> (Confidence, Iso2022JpDecoderState)
-> StateT
     (Confidence, Iso2022JpDecoderState)
     (ParserT ByteString Maybe)
     (Either ShortByteString String)
-> Maybe
     (Either ShortByteString String,
      (Confidence, Iso2022JpDecoderState), ByteString)
forall stream state out.
stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' ByteString
streamFull (DecoderState -> Confidence
decoderConfidence_ DecoderState
state, Iso2022JpDecoderState
inner) StateT
  (Confidence, Iso2022JpDecoderState)
  (ParserT ByteString Maybe)
  (Either ShortByteString String)
Iso2022Jp.decoder
    ReplacementDecoderState Bool
inner -> (Bool -> InnerDecoderState)
-> (Either ShortByteString String, (Confidence, Bool), ByteString)
-> (Either ShortByteString String, (Confidence, InnerDecoderState),
    ByteString)
forall (p :: * -> * -> *) b c a a c.
Bifunctor p =>
(b -> c) -> (a, p a b, c) -> (a, p a c, c)
wrapInner Bool -> InnerDecoderState
ReplacementDecoderState ((Either ShortByteString String, (Confidence, Bool), ByteString)
 -> (Either ShortByteString String, (Confidence, InnerDecoderState),
     ByteString))
-> Maybe
     (Either ShortByteString String, (Confidence, Bool), ByteString)
-> Maybe
     (Either ShortByteString String, (Confidence, InnerDecoderState),
      ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
        ByteString
-> (Confidence, Bool)
-> StateT
     (Confidence, Bool)
     (ParserT ByteString Maybe)
     (Either ShortByteString String)
-> Maybe
     (Either ShortByteString String, (Confidence, Bool), ByteString)
forall stream state out.
stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' ByteString
streamFull (DecoderState -> Confidence
decoderConfidence_ DecoderState
state, Bool
inner) StateT
  (Confidence, Bool)
  (ParserT ByteString Maybe)
  (Either ShortByteString String)
decoderReplacement
    InnerDecoderState
SimpleDecoderState -> do
        let confidence :: Confidence
confidence = DecoderState -> Confidence
decoderConfidence_ DecoderState
state
        TextBuilder
p <- Encoding -> HashMap Encoding TextBuilder -> Maybe TextBuilder
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup (Confidence -> Encoding
confidenceEncoding Confidence
confidence) HashMap Encoding TextBuilder
decoders
        (() -> InnerDecoderState)
-> (Either ShortByteString String, (Confidence, ()), ByteString)
-> (Either ShortByteString String, (Confidence, InnerDecoderState),
    ByteString)
forall (p :: * -> * -> *) b c a a c.
Bifunctor p =>
(b -> c) -> (a, p a b, c) -> (a, p a c, c)
wrapInner (InnerDecoderState -> () -> InnerDecoderState
forall a b. a -> b -> a
const InnerDecoderState
SimpleDecoderState) ((Either ShortByteString String, (Confidence, ()), ByteString)
 -> (Either ShortByteString String, (Confidence, InnerDecoderState),
     ByteString))
-> Maybe
     (Either ShortByteString String, (Confidence, ()), ByteString)
-> Maybe
     (Either ShortByteString String, (Confidence, InnerDecoderState),
      ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
            ByteString
-> (Confidence, ())
-> TextBuilder
-> Maybe
     (Either ShortByteString String, (Confidence, ()), ByteString)
forall stream state out.
stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' ByteString
streamFull (Confidence
confidence, ()) TextBuilder
p
  where wrapInner :: (b -> c) -> (a, p a b, c) -> (a, p a c, c)
wrapInner b -> c
f = (p a b -> p a c) -> (a, p a b, c) -> (a, p a c, c)
forall b d a c. (b -> d) -> (a, b, c) -> (a, d, c)
U.HT.mapSnd3 ((p a b -> p a c) -> (a, p a b, c) -> (a, p a c, c))
-> (p a b -> p a c) -> (a, p a b, c) -> (a, p a c, c)
forall a b. (a -> b) -> a -> b
$ (b -> c) -> p a b -> p a c
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
F.B.second b -> c
f
        wrapOuter :: Maybe (a, (Confidence, InnerDecoderState), ByteString)
-> (Maybe a, DecoderState, ByteString)
wrapOuter (Just (a
out, (Confidence
enc, InnerDecoderState
innerState'), ByteString
stream')) = (a -> Maybe a
forall a. a -> Maybe a
Just a
out, DecoderState
state', ByteString
stream')
          where state' :: DecoderState
state' = DecoderState
state
                    { decoderConfidence_ :: Confidence
decoderConfidence_ = Confidence
enc
                    , innerDecoderState :: InnerDecoderState
innerDecoderState = InnerDecoderState
innerState'
                    , remainderBytes :: ShortByteString
remainderBytes = ShortByteString
BS.SH.empty
                    }
        wrapOuter Maybe (a, (Confidence, InnerDecoderState), ByteString)
Nothing = (Maybe a
forall a. Maybe a
Nothing, DecoderState
state', ByteString
BS.empty)
          where state' :: DecoderState
state' = DecoderState
state
                    { remainderBytes :: ShortByteString
remainderBytes = ByteString -> ShortByteString
BS.SH.toShort ByteString
stream
                    }
        streamFull :: ByteString
streamFull = ShortByteString -> ByteString
BS.SH.fromShort (DecoderState -> ShortByteString
remainderBytes DecoderState
state) ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString
stream

-- | __Encoding:__
--      @[run an encoding's decoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @replacement@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  Any byte
-- sequences which are meaningless or illegal are replaced with the Unicode
-- replacement character @\\xFFFD@.
-- 
-- See 'decode'' to decode the entire string at once, or 'decodeStep' if the
-- original data should be retained for custom error reporting.
decodeStep' :: DecoderState -> BS.ByteString -> (Maybe String, DecoderState, BS.ByteString)
decodeStep' :: DecoderState
-> ByteString -> (Maybe String, DecoderState, ByteString)
decodeStep' DecoderState
state = (Maybe (Either ShortByteString String) -> Maybe String)
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
-> (Maybe String, DecoderState, ByteString)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 ((Either ShortByteString String -> String)
-> Maybe (Either ShortByteString String) -> Maybe String
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either ShortByteString String -> String)
 -> Maybe (Either ShortByteString String) -> Maybe String)
-> (Either ShortByteString String -> String)
-> Maybe (Either ShortByteString String)
-> Maybe String
forall a b. (a -> b) -> a -> b
$ String -> Either ShortByteString String -> String
forall b a. b -> Either a b -> b
E.fromRight [Char
replacementChar]) ((Maybe (Either ShortByteString String), DecoderState, ByteString)
 -> (Maybe String, DecoderState, ByteString))
-> (ByteString
    -> (Maybe (Either ShortByteString String), DecoderState,
        ByteString))
-> ByteString
-> (Maybe String, DecoderState, ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState
-> ByteString
-> (Maybe (Either ShortByteString String), DecoderState,
    ByteString)
decodeStep DecoderState
state


-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Given a character encoding scheme, transform a portable 'T.Text' into a
-- sequence of bytes representing those characters.  If the encoding scheme
-- does not define a binary representation for a character in the input, the
-- original 'Char' is returned unchanged for custom error reporting.
-- 
-- See 'encodeStep' to encode only a minimal section, or 'encode'' for escaping
-- with HTML-style character codes.
encode :: EncoderState -> T.Text -> ([Either Char BS.SH.ShortByteString], EncoderState)
encode :: EncoderState
-> Text -> ([Either Char ShortByteString], EncoderState)
encode EncoderState
state Text
stream = case EncoderState
-> Text -> Maybe (Either Char ShortByteString, EncoderState, Text)
encodeStep EncoderState
state Text
stream of
    Maybe (Either Char ShortByteString, EncoderState, Text)
Nothing -> ([], EncoderState
state)
    Just (Either Char ShortByteString
out, EncoderState
state', Text
stream') ->
        let ([Either Char ShortByteString]
trail, EncoderState
state'') = EncoderState
-> Text -> ([Either Char ShortByteString], EncoderState)
encode EncoderState
state' Text
stream'
        in  (Either Char ShortByteString
out Either Char ShortByteString
-> [Either Char ShortByteString] -> [Either Char ShortByteString]
forall a. a -> [a] -> [a]
: [Either Char ShortByteString]
trail, EncoderState
state'')

-- | __Encoding:__
--      @[encode]
--      (https://encoding.spec.whatwg.org/#encode)@
-- 
-- Given a character encoding scheme, transform a portable 'T.Text' into a
-- sequence of bytes representing those characters.  If the encoding scheme
-- does not define a binary representation for a character in the input, they
-- are replaced with an HTML-style escape (e.g. @"&#38;#0000;"@).
-- 
-- See 'encodeStep'' to encode only a minimal section, or 'encode' if the
-- original data should be retained for custom error reporting.
encode' :: EncoderState -> T.Text -> (BS.ByteString, EncoderState)
encode' :: EncoderState -> Text -> (ByteString, EncoderState)
encode' EncoderState
state Text
stream = ([Either Char ShortByteString] -> ByteString)
-> ([Either Char ShortByteString], EncoderState)
-> (ByteString, EncoderState)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
F.B.first
    (ByteString -> ByteString
BS.L.toStrict (ByteString -> ByteString)
-> ([Either Char ShortByteString] -> ByteString)
-> [Either Char ShortByteString]
-> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> ByteString
BS.B.toLazyByteString (Builder -> ByteString)
-> ([Either Char ShortByteString] -> Builder)
-> [Either Char ShortByteString]
-> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat ([Builder] -> Builder)
-> ([Either Char ShortByteString] -> [Builder])
-> [Either Char ShortByteString]
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Either Char ShortByteString -> Builder)
-> [Either Char ShortByteString] -> [Builder]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString -> Builder
BS.B.shortByteString (ShortByteString -> Builder)
-> (Either Char ShortByteString -> ShortByteString)
-> Either Char ShortByteString
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either Char ShortByteString -> ShortByteString
encodeReplacement)) (([Either Char ShortByteString], EncoderState)
 -> (ByteString, EncoderState))
-> ([Either Char ShortByteString], EncoderState)
-> (ByteString, EncoderState)
forall a b. (a -> b) -> a -> b
$
    EncoderState
-> Text -> ([Either Char ShortByteString], EncoderState)
encode EncoderState
state Text
stream

-- | __Encoding:__
--      @[UTF-8 encode]
--      (https://encoding.spec.whatwg.org/#utf-8-encode)@
-- 
-- Transform a portable 'T.Text' into a sequence of bytes according to the
-- UTF-8 encoding scheme.
encodeUtf8 :: T.Text -> (BS.ByteString, EncoderState)
encodeUtf8 :: Text -> (ByteString, EncoderState)
encodeUtf8 = EncoderState -> Text -> (ByteString, EncoderState)
encode' (EncoderState -> Text -> (ByteString, EncoderState))
-> EncoderState -> Text -> (ByteString, EncoderState)
forall a b. (a -> b) -> a -> b
$ Encoding -> EncoderState
initialEncoderState Encoding
Utf8


-- | The collection of data which, for any given encoding scheme, results in
-- behaviour according to the vanilla decoder before any bytes have been read.
initialEncoderState :: Encoding -> EncoderState
initialEncoderState :: Encoding -> EncoderState
initialEncoderState Encoding
enc = EncoderState :: Encoding -> InnerEncoderState -> EncoderState
EncoderState
    { encoderScheme :: Encoding
encoderScheme = Encoding
enc
    , innerEncoderState :: InnerEncoderState
innerEncoderState = if Encoding
enc Encoding -> Encoding -> Bool
forall a. Eq a => a -> a -> Bool
== Encoding
Iso2022Jp
        then Iso2022JpEncoderState -> InnerEncoderState
Iso2022EncoderState Iso2022JpEncoderState
Iso2022Jp.defaultIso2022JpEncoderState
        else InnerEncoderState
SimpleEncoderState
    }

-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @fatal@
-- 
-- Read the smallest number of characters from the head of the 'T.Text' which
-- would leave the encoder in a re-enterable state.  If the encoding scheme
-- does not define a binary representation for a character in the input, the
-- original 'Char' is returned unchanged for custom error reporting.
-- 
-- See 'encode' to decode the entire string at once, or 'encodeStep'' for
-- simple error replacement.
encodeStep
    :: EncoderState
    -> T.Text
    -> Maybe (Either Char BS.SH.ShortByteString, EncoderState, T.Text)
encodeStep :: EncoderState
-> Text -> Maybe (Either Char ShortByteString, EncoderState, Text)
encodeStep EncoderState
state Text
stream = (Either Char ShortByteString, InnerEncoderState, Text)
-> (Either Char ShortByteString, EncoderState, Text)
forall a c. (a, InnerEncoderState, c) -> (a, EncoderState, c)
wrapOuter ((Either Char ShortByteString, InnerEncoderState, Text)
 -> (Either Char ShortByteString, EncoderState, Text))
-> Maybe (Either Char ShortByteString, InnerEncoderState, Text)
-> Maybe (Either Char ShortByteString, EncoderState, Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> case EncoderState -> InnerEncoderState
innerEncoderState EncoderState
state of
    Iso2022EncoderState Iso2022JpEncoderState
inner -> (Iso2022JpEncoderState -> InnerEncoderState)
-> (Either Char ShortByteString, Iso2022JpEncoderState, Text)
-> (Either Char ShortByteString, InnerEncoderState, Text)
forall b d a c. (b -> d) -> (a, b, c) -> (a, d, c)
wrapInner Iso2022JpEncoderState -> InnerEncoderState
Iso2022EncoderState ((Either Char ShortByteString, Iso2022JpEncoderState, Text)
 -> (Either Char ShortByteString, InnerEncoderState, Text))
-> Maybe (Either Char ShortByteString, Iso2022JpEncoderState, Text)
-> Maybe (Either Char ShortByteString, InnerEncoderState, Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
        Text
-> Iso2022JpEncoderState
-> StateT
     Iso2022JpEncoderState
     (ParserT Text Maybe)
     (Either Char ShortByteString)
-> Maybe (Either Char ShortByteString, Iso2022JpEncoderState, Text)
forall stream state out.
stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' Text
stream Iso2022JpEncoderState
inner StateT
  Iso2022JpEncoderState
  (ParserT Text Maybe)
  (Either Char ShortByteString)
Iso2022Jp.encoder
    InnerEncoderState
SimpleEncoderState -> do
        BinaryBuilder
p <- Encoding -> HashMap Encoding BinaryBuilder -> Maybe BinaryBuilder
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup (EncoderState -> Encoding
encoderScheme EncoderState
state) HashMap Encoding BinaryBuilder
encoders
        (Either Char ShortByteString
out, (), Text
t) <- Text
-> ()
-> BinaryBuilder
-> Maybe (Either Char ShortByteString, (), Text)
forall stream state out.
stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' Text
stream () BinaryBuilder
p
        (Either Char ShortByteString, InnerEncoderState, Text)
-> Maybe (Either Char ShortByteString, InnerEncoderState, Text)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char ShortByteString
out, InnerEncoderState
SimpleEncoderState, Text
t)
  where wrapInner :: (b -> d) -> (a, b, c) -> (a, d, c)
wrapInner = (b -> d) -> (a, b, c) -> (a, d, c)
forall b d a c. (b -> d) -> (a, b, c) -> (a, d, c)
U.HT.mapSnd3
        wrapOuter :: (a, InnerEncoderState, c) -> (a, EncoderState, c)
wrapOuter (a
out, InnerEncoderState
innerState', c
stream') = (a
out, EncoderState
state', c
stream')
          where state' :: EncoderState
state' = EncoderState
state
                    { innerEncoderState :: InnerEncoderState
innerEncoderState = InnerEncoderState
innerState'
                    }

-- | __Encoding:__
--      @[run an encoding's encoder]
--      (https://encoding.spec.whatwg.org/#concept-encoding-run)@
--      with error mode @html@
-- 
-- Read the smallest number of characters from the head of the 'T.Text' which
-- would leave the encoder in a re-enterable state.  If the encoding scheme
-- does not define a binary representation for a character in the input, they
-- are replaced with an HTML-style escape (e.g. @"&#38;#0000;"@).
-- 
-- See 'encode'' to encode the entire string at once, or 'encodeStep' if the
-- original data should be retained for custom error reporting.
encodeStep' :: EncoderState -> T.Text -> Maybe (BS.SH.ShortByteString, EncoderState, T.Text)
encodeStep' :: EncoderState -> Text -> Maybe (ShortByteString, EncoderState, Text)
encodeStep' EncoderState
state Text
stream = (Either Char ShortByteString -> ShortByteString)
-> (Either Char ShortByteString, EncoderState, Text)
-> (ShortByteString, EncoderState, Text)
forall a d b c. (a -> d) -> (a, b, c) -> (d, b, c)
U.HT.mapFst3 Either Char ShortByteString -> ShortByteString
encodeReplacement ((Either Char ShortByteString, EncoderState, Text)
 -> (ShortByteString, EncoderState, Text))
-> Maybe (Either Char ShortByteString, EncoderState, Text)
-> Maybe (ShortByteString, EncoderState, Text)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> EncoderState
-> Text -> Maybe (Either Char ShortByteString, EncoderState, Text)
encodeStep EncoderState
state Text
stream


-- | Abstract the common core to both the decoder and the encoder: end the
-- parser if the input has been exhausted, optionally recover if the input is
-- meaningless, and otherwise flatten the nested tuples in the result.
runParser'
    :: stream
    -> state
    -> N.S.StateT state (ParserT stream Maybe) out
    -> Maybe (out, state, stream)
runParser' :: stream
-> state
-> StateT state (ParserT stream Maybe) out
-> Maybe (out, state, stream)
runParser' stream
stream state
state StateT state (ParserT stream Maybe) out
p' = do
    ((out
out, state
state'), stream
stream') <- ParserT stream Maybe (out, state)
-> stream -> Maybe ((out, state), stream)
forall stream (gather :: * -> *) out.
ParserT stream gather out -> stream -> gather (out, stream)
runParserT (StateT state (ParserT stream Maybe) out
-> state -> ParserT stream Maybe (out, state)
forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
N.S.runStateT StateT state (ParserT stream Maybe) out
p' state
state) stream
stream
    (out, state, stream) -> Maybe (out, state, stream)
forall (m :: * -> *) a. Monad m => a -> m a
return (out
out, state
state', stream
stream')


-- | Convert a 'Char' which fails to parse into an HTML-style numeric escape,
-- and try parsing that string instead.
encodeReplacement :: Either Char BS.SH.ShortByteString -> BS.SH.ShortByteString
encodeReplacement :: Either Char ShortByteString -> ShortByteString
encodeReplacement = (Char -> ShortByteString)
-> (ShortByteString -> ShortByteString)
-> Either Char ShortByteString
-> ShortByteString
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either Char -> ShortByteString
forall a. Enum a => a -> ShortByteString
recovery ShortByteString -> ShortByteString
forall a. a -> a
id
  where recovery :: a -> ShortByteString
recovery a
err = [Word8] -> ShortByteString
BS.SH.pack ([Word8] -> ShortByteString)
-> (String -> [Word8]) -> String -> ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Word8) -> String -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum) (String -> ShortByteString) -> String -> ShortByteString
forall a b. (a -> b) -> a -> b
$ String
"&#" String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show (a -> Int
forall a. Enum a => a -> Int
fromEnum a
err) String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
";"


-- $internal
-- These types will almost certainly not be useful for anyone using the
-- library, and are exported purely for internal usage.  They can be safely
-- ignored.  Note, however, that they may be removed without warning.


-- | The union of all state variables tracked by the bytes-to-'Char' decoding
-- algorithm of a single encoding scheme.
data InnerDecoderState
    = SimpleDecoderState
        -- ^ Null constructor for encodings which don't require persistant
        -- state in the decoding algorithm.
    | Iso2022DecoderState Iso2022Jp.Iso2022JpDecoderState
        -- ^ The data used by the 'Iso2022Jp' encoding scheme.
    | ReplacementDecoderState ReplacementDecoderState
        -- ^ The data used by the 'Replacement' encoding scheme.
  deriving ( InnerDecoderState -> InnerDecoderState -> Bool
(InnerDecoderState -> InnerDecoderState -> Bool)
-> (InnerDecoderState -> InnerDecoderState -> Bool)
-> Eq InnerDecoderState
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: InnerDecoderState -> InnerDecoderState -> Bool
$c/= :: InnerDecoderState -> InnerDecoderState -> Bool
== :: InnerDecoderState -> InnerDecoderState -> Bool
$c== :: InnerDecoderState -> InnerDecoderState -> Bool
Eq, Int -> InnerDecoderState -> String -> String
[InnerDecoderState] -> String -> String
InnerDecoderState -> String
(Int -> InnerDecoderState -> String -> String)
-> (InnerDecoderState -> String)
-> ([InnerDecoderState] -> String -> String)
-> Show InnerDecoderState
forall a.
(Int -> a -> String -> String)
-> (a -> String) -> ([a] -> String -> String) -> Show a
showList :: [InnerDecoderState] -> String -> String
$cshowList :: [InnerDecoderState] -> String -> String
show :: InnerDecoderState -> String
$cshow :: InnerDecoderState -> String
showsPrec :: Int -> InnerDecoderState -> String -> String
$cshowsPrec :: Int -> InnerDecoderState -> String -> String
Show, ReadPrec [InnerDecoderState]
ReadPrec InnerDecoderState
Int -> ReadS InnerDecoderState
ReadS [InnerDecoderState]
(Int -> ReadS InnerDecoderState)
-> ReadS [InnerDecoderState]
-> ReadPrec InnerDecoderState
-> ReadPrec [InnerDecoderState]
-> Read InnerDecoderState
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [InnerDecoderState]
$creadListPrec :: ReadPrec [InnerDecoderState]
readPrec :: ReadPrec InnerDecoderState
$creadPrec :: ReadPrec InnerDecoderState
readList :: ReadS [InnerDecoderState]
$creadList :: ReadS [InnerDecoderState]
readsPrec :: Int -> ReadS InnerDecoderState
$creadsPrec :: Int -> ReadS InnerDecoderState
Read )

-- | The union of all state variables tracked by the 'Char'-to-bytes encoding
-- algorithm of a single encoding scheme.
data InnerEncoderState
    = SimpleEncoderState
        -- ^ Null constructor for encodings which don't require persistant
        -- state in the decoding algorithm.
    | Iso2022EncoderState Iso2022Jp.Iso2022JpEncoderState
        -- ^ The data used by the 'Iso2022Jp' encoding scheme.
  deriving ( InnerEncoderState -> InnerEncoderState -> Bool
(InnerEncoderState -> InnerEncoderState -> Bool)
-> (InnerEncoderState -> InnerEncoderState -> Bool)
-> Eq InnerEncoderState
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: InnerEncoderState -> InnerEncoderState -> Bool
$c/= :: InnerEncoderState -> InnerEncoderState -> Bool
== :: InnerEncoderState -> InnerEncoderState -> Bool
$c== :: InnerEncoderState -> InnerEncoderState -> Bool
Eq, Int -> InnerEncoderState -> String -> String
[InnerEncoderState] -> String -> String
InnerEncoderState -> String
(Int -> InnerEncoderState -> String -> String)
-> (InnerEncoderState -> String)
-> ([InnerEncoderState] -> String -> String)
-> Show InnerEncoderState
forall a.
(Int -> a -> String -> String)
-> (a -> String) -> ([a] -> String -> String) -> Show a
showList :: [InnerEncoderState] -> String -> String
$cshowList :: [InnerEncoderState] -> String -> String
show :: InnerEncoderState -> String
$cshow :: InnerEncoderState -> String
showsPrec :: Int -> InnerEncoderState -> String -> String
$cshowsPrec :: Int -> InnerEncoderState -> String -> String
Show, ReadPrec [InnerEncoderState]
ReadPrec InnerEncoderState
Int -> ReadS InnerEncoderState
ReadS [InnerEncoderState]
(Int -> ReadS InnerEncoderState)
-> ReadS [InnerEncoderState]
-> ReadPrec InnerEncoderState
-> ReadPrec [InnerEncoderState]
-> Read InnerEncoderState
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [InnerEncoderState]
$creadListPrec :: ReadPrec [InnerEncoderState]
readPrec :: ReadPrec InnerEncoderState
$creadPrec :: ReadPrec InnerEncoderState
readList :: ReadS [InnerEncoderState]
$creadList :: ReadS [InnerEncoderState]
readsPrec :: Int -> ReadS InnerEncoderState
$creadsPrec :: Int -> ReadS InnerEncoderState
Read )


-- | The registry of 'Encoding' schemes to their byte-to-character
-- algorithms.
decoders :: M.HashMap Encoding TextBuilder
decoders :: HashMap Encoding TextBuilder
decoders = [(Encoding, TextBuilder)] -> HashMap Encoding TextBuilder
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(Encoding, TextBuilder)] -> HashMap Encoding TextBuilder)
-> [(Encoding, TextBuilder)] -> HashMap Encoding TextBuilder
forall a b. (a -> b) -> a -> b
$
    [ (Encoding
Utf8, TextBuilder
Utf8.decoder)
    , (Encoding
Utf16be, TextBuilder
Utf16.decoderBigEndian)
    , (Encoding
Utf16le, TextBuilder
Utf16.decoderLittleEndian)
    , (Encoding
Big5, TextBuilder
Big5.decoder)
    , (Encoding
EucJp, TextBuilder
EucJp.decoder)
    , (Encoding
EucKr, TextBuilder
EucKr.decoder)
    , (Encoding
Gb18030, TextBuilder
GB.decoder)
    , (Encoding
Gbk, TextBuilder
GB.decoder)
    , (Encoding
ShiftJis, TextBuilder
ShiftJis.decoder)
    , (Encoding
UserDefined, TextBuilder
decoderUserDefined)
    ] [(Encoding, TextBuilder)]
-> [(Encoding, TextBuilder)] -> [(Encoding, TextBuilder)]
forall a. [a] -> [a] -> [a]
++
    [ (Encoding
enc, Encoding -> TextBuilder
Single.decoder Encoding
enc)
    | Encoding
enc <- [Encoding]
Single.encodings
    ]

-- | The registry of 'Encoding' schemes to their character-to-byte
-- algorithms.
encoders :: M.HashMap Encoding BinaryBuilder
encoders :: HashMap Encoding BinaryBuilder
encoders = [(Encoding, BinaryBuilder)] -> HashMap Encoding BinaryBuilder
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(Encoding, BinaryBuilder)] -> HashMap Encoding BinaryBuilder)
-> [(Encoding, BinaryBuilder)] -> HashMap Encoding BinaryBuilder
forall a b. (a -> b) -> a -> b
$
    -- 'Replacement', 'Utf16be', and 'Utf16le' are deliberately not included as
    -- per the specification.
    [ (Encoding
Utf8, BinaryBuilder
Utf8.encoder)
    , (Encoding
Big5, BinaryBuilder
Big5.encoder)
    , (Encoding
EucJp, BinaryBuilder
EucJp.encoder)
    , (Encoding
EucKr, BinaryBuilder
EucKr.encoder)
    , (Encoding
Gb18030, BinaryBuilder
GB.encoderGb18030)
    , (Encoding
Gbk, BinaryBuilder
GB.encoderGbk)
    , (Encoding
ShiftJis, BinaryBuilder
ShiftJis.encoder)
    , (Encoding
UserDefined, BinaryBuilder
encoderUserDefined)
    ] [(Encoding, BinaryBuilder)]
-> [(Encoding, BinaryBuilder)] -> [(Encoding, BinaryBuilder)]
forall a. [a] -> [a] -> [a]
++
    [ (Encoding
enc, Encoding -> BinaryBuilder
Single.encoder Encoding
enc)
    | Encoding
enc <- [Encoding]
Single.encodings
    ]


-- | __Encoding:__
--      @[replacement decoder]
--      (https://encoding.spec.whatwg.org/#replacement-decoder)@
-- 
-- Return a single @\\xFFFD@ replacement character for the entire input stream.
decoderReplacement :: StateTextBuilder ReplacementDecoderState
decoderReplacement :: StateT
  (Confidence, Bool)
  (ParserT ByteString Maybe)
  (Either ShortByteString String)
decoderReplacement = do
    Bool
state <- StateDecoder Bool Bool
forall state. StateDecoder state state
getDecoderState
    if Bool
state
        then StateT
  (Confidence, Bool)
  (ParserT ByteString Maybe)
  (Either ShortByteString String)
forall (f :: * -> *) a. Alternative f => f a
A.empty
        else (Bool -> Bool) -> StateDecoder Bool ()
forall state. (state -> state) -> StateDecoder state ()
modifyDecoderState (Bool -> Bool -> Bool
forall a b. a -> b -> a
const Bool
True) StateDecoder Bool ()
-> StateT (Confidence, Bool) (ParserT ByteString Maybe) Word8
-> StateT (Confidence, Bool) (ParserT ByteString Maybe) Word8
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> StateT (Confidence, Bool) (ParserT ByteString Maybe) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, Bool) (ParserT ByteString Maybe) Word8
-> (Word8
    -> StateT
         (Confidence, Bool)
         (ParserT ByteString Maybe)
         (Either ShortByteString String))
-> StateT
     (Confidence, Bool)
     (ParserT ByteString Maybe)
     (Either ShortByteString String)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word8
b -> [Word8]
-> Char
-> StateT
     (Confidence, Bool)
     (ParserT ByteString Maybe)
     (Either ShortByteString String)
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
b] Char
replacementChar

-- | Track whether the single allowed replacement character has already been
-- emitted.
type ReplacementDecoderState = Bool

-- | The default initial state to kickstart the 'Replacement' decoder.
defaultReplacementDecoderState :: ReplacementDecoderState
defaultReplacementDecoderState :: Bool
defaultReplacementDecoderState = Bool
False


-- | __Encoding:__
--      @[x-user-defined decoder]
--      (https://encoding.spec.whatwg.org/#x-user-defined-decoder)@
-- 
-- Remap bytes above the ASCII range to characters in the Private Use Area.
decoderUserDefined :: TextBuilder
decoderUserDefined :: TextBuilder
decoderUserDefined = StateT (Confidence, ()) (ParserT ByteString Maybe) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (ParserT ByteString Maybe) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Word8
   (StateT (Confidence, ()) (ParserT ByteString Maybe))
   (Either ShortByteString String)]
-> Word8 -> TextBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (ParserT ByteString Maybe))
     (Either ShortByteString String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7F) Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
toUnicode1
    , (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (ParserT ByteString Maybe))
     (Either ShortByteString String)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (ParserT ByteString Maybe))
      (Either ShortByteString String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (ParserT ByteString Maybe))
     (Either ShortByteString String)
forall a b. (a -> b) -> a -> b
$ \Word8
b -> [Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
b] (Char -> TextBuilder) -> (Int -> Char) -> Int -> TextBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> (Int -> Int) -> Int -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
0xF700) (Int -> TextBuilder) -> Int -> TextBuilder
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b
    ]

-- | __Encoding:__
--      @[x-user-defined encoder]
--      (https://encoding.spec.whatwg.org/#x-user-defined-encoder)@
-- 
-- Remap a few characters in the Private Use Area to single-byte values just
-- above the ASCII characters.  This is of very limited value except as an
-- inverse for @decoderUserDefined@.
encoderUserDefined :: BinaryBuilder
encoderUserDefined :: BinaryBuilder
encoderUserDefined = StateT () (ParserT Text Maybe) Char
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT () (ParserT Text Maybe) Char
-> (Char -> BinaryBuilder) -> BinaryBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Char
   (StateT () (ParserT Text Maybe))
   (Either Char ShortByteString)]
-> Char -> BinaryBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (ParserT Text Maybe)) (Either Char ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Char -> Bool
C.isAscii Char -> BinaryBuilder
forall state. Char -> StateBinaryBuilder state
fromAscii
    , (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (ParserT Text Maybe)) (Either Char ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Char -> Char -> Char -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Char
'\xF780' Char
'\xF7FF') ((Char -> BinaryBuilder)
 -> SwitchCase
      Char
      (StateT () (ParserT Text Maybe))
      (Either Char ShortByteString))
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (ParserT Text Maybe)) (Either Char ShortByteString)
forall a b. (a -> b) -> a -> b
$
        Either Char ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char ShortByteString -> BinaryBuilder)
-> (Char -> Either Char ShortByteString) -> Char -> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> Either Char ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> Either Char ShortByteString)
-> (Char -> ShortByteString) -> Char -> Either Char ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ShortByteString
BS.SH.pack ([Word8] -> ShortByteString)
-> (Char -> [Word8]) -> Char -> ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: []) (Word8 -> [Word8]) -> (Char -> Word8) -> Char -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int -> Int
forall a. Num a => a -> a -> a
subtract Int
0xF700 (Int -> Int) -> (Char -> Int) -> Char -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum
    , (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (ParserT Text Maybe)) (Either Char ShortByteString)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else Char -> BinaryBuilder
forall state out. Char -> StateEncoder state (EncoderError out)
encoderFailure
    ]


-- | Store the given binary sequence as unparsable without further input, to be
-- prepended to the beginning of stream on the next 'decode' or 'decode'' call.
setRemainder :: BS.SH.ShortByteString -> DecoderState -> DecoderState
setRemainder :: ShortByteString -> DecoderState -> DecoderState
setRemainder ShortByteString
bs DecoderState
state = DecoderState
state
    { remainderBytes :: ShortByteString
remainderBytes = ShortByteString
bs
    }

-- | Retrieve the encoding scheme currently used by the decoder to decode the
-- binary document stream.
decoderEncoding :: DecoderState -> Encoding
decoderEncoding :: DecoderState -> Encoding
decoderEncoding = Confidence -> Encoding
confidenceEncoding (Confidence -> Encoding)
-> (DecoderState -> Confidence) -> DecoderState -> Encoding
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DecoderState -> Confidence
decoderConfidence_

-- | Instruct the decoder that the binary document stream is /known/ to be in
-- the certain encoding.
setEncodingCertain :: Encoding -> DecoderState -> DecoderState
setEncodingCertain :: Encoding -> DecoderState -> DecoderState
setEncodingCertain Encoding
enc DecoderState
state = DecoderState
state
    { decoderConfidence_ :: Confidence
decoderConfidence_ = Encoding -> Confidence
Certain Encoding
enc
    }

-- | Any leftover bytes at the end of the binary stream, which require further
-- input to be processed in order to correctly map to a character or error
-- value.
decoderRemainder :: DecoderState -> BS.SH.ShortByteString
decoderRemainder :: DecoderState -> ShortByteString
decoderRemainder = DecoderState -> ShortByteString
remainderBytes