{-| Description: Character translation functions to and from standardized binary streams. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable This module and the internal branch it heads implement the __[Encoding](https://encoding.spec.whatwg.org/)__ specification for translating text to and from UTF-8 and a selection of less-favoured but grandfathered encoding schemes. As the standard authors' primary goal has been security followed closely by compatibility with existing web pages, the algorithms described and the names associated with them do not perfectly match the descriptions originally given by the various original encoding specifications themselves. -} module Web.Willow.Common.Encoding ( -- * Types Encoding ( .. ) , DecoderState , decoderEncoding , decoderRemainder , ReparseData , EncoderState -- * Initialization -- ** Decoding , initialDecoderState , setEncodingCertain , setRemainder -- ** Encoding , initialEncoderState -- * Transformations -- ** Decoding -- $decode-bom , decode , decode' , byteOrderMark , finalizeDecode , finalizeDecode' -- *** UTF-8 , decodeUtf8 , decodeUtf8NoBom , decodeUtf8' , decodeUtf8NoBom' -- ** Encoding , encode , encode' , encodeUtf8 -- ** Continuations , decodeStep , encodeStep , decodeStep' , encodeStep' -- * Internal -- $internal , InnerDecoderState , InnerEncoderState ) where import qualified Control.Applicative as A import qualified Control.Monad.Trans.State as N.S import qualified Data.Bifunctor as F.B import qualified Data.ByteString.Builder as BS.B import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as BS.L import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Either as E import qualified Data.HashMap.Strict as M import qualified Data.Text as T import qualified Data.Text.Lazy as T.L import qualified Data.Text.Lazy.Builder as T.L.B import qualified Data.Tuple.HT as U.HT import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch import Web.Willow.Common.Parser.Util import qualified Web.Willow.Common.Encoding.Big5 as Big5 import qualified Web.Willow.Common.Encoding.EucJp as EucJp import qualified Web.Willow.Common.Encoding.EucKr as EucKr import qualified Web.Willow.Common.Encoding.GB as GB import qualified Web.Willow.Common.Encoding.Iso2022Jp as Iso2022Jp import qualified Web.Willow.Common.Encoding.ShiftJis as ShiftJis import qualified Web.Willow.Common.Encoding.SingleByte as Single import qualified Web.Willow.Common.Encoding.Utf8 as Utf8 import qualified Web.Willow.Common.Encoding.Utf16 as Utf16 -- | __Encoding:__ -- @[BOM sniff] -- (https://encoding.spec.whatwg.org/#bom-sniff)@ -- -- Checks for a "byte-order mark" signature character in various encodings. If -- present, returns both the encoding found and the remainder of the stream, -- otherwise returns the input unchanged. byteOrderMark :: BS.ByteString -> (Maybe Encoding, BS.ByteString) byteOrderMark input = maybe (Nothing, input) pack $ runParser marks input where pack = F.B.first Just marks = choice [ Utf8.byteOrderMark , Utf16.byteOrderMarkBigEndian , Utf16.byteOrderMarkLittleEndian ] -- $decode-bom -- The standard 'decode' and 'decode'' functions (and therefore the similar but -- higher-level functions which build on it) defer to a byte-order mark over -- the argument encoding. If this behaviour isn't desired (i.e., you want to -- force the parser to use the encoding, even if it's not appropriate), -- try to explicitly parse 'byteOrderMark' first: -- -- @ -- (_, input') = 'byteOrderMark' input -- Just text = 'decode' enc input' -- @ -- | __Encoding:__ -- @[run an encoding's decoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @fatal@ -- -- Given a character encoding scheme, transform a dependant 'BS.ByteString' -- into portable 'Char's. If any byte sequences are meaningless or illegal, -- they are returned verbatim for error reporting; a 'Left' should not be -- parsed further. -- -- See 'decodeStep' to decode only a minimal section, or 'decode'' for simple -- error replacement. Call 'finalizeDecode' on the returned 'DecoderState' if -- no further bytes will be added to the document stream. decode :: DecoderState -> BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState) decode state stream = case decodeStep stateBom' streamBom of (Nothing, state', _) -> ([], state') (Just out, state', stream') -> let (trail, state'') = decode state' stream' in (out : trail, state'') where (stateBom, streamBom) = case byteOrderMark stream of (Just enc, trail) | useBom state == Just True -> (reparseBom enc $ initialDecoderState enc, trail) (Just enc, trail) | useBom state == Just False -> case decoderConfidence_ state of Tentative enc' _ | enc == enc' -> (reparseBom enc state, trail) Certain enc' | enc == enc' -> (reparseBom enc state, trail) _ -> (state, stream) _ -> (state, stream) reparseBom Utf8 state' = state' { decoderConfidence_ = recordBom [0xEF, 0xBB, 0xBF] $ decoderConfidence_ state' } reparseBom Utf16be state' = state' { decoderConfidence_ = recordBom [0xFE, 0xFF] $ decoderConfidence_ state' } reparseBom Utf16le state' = state' { decoderConfidence_ = recordBom [0xFF, 0xFE] $ decoderConfidence_ state' } reparseBom _ state' = state' recordBom bs (Tentative enc d') = Tentative enc $ d' { parsedChars = M.insert (BS.SH.pack bs) '\xFEFF' $ parsedChars d' } recordBom _ conf = conf stateBom' = stateBom { decoderConfidence_ = case decoderConfidence_ stateBom of Tentative enc d' -> Tentative enc $ d' { streamStart = streamStart d' <> BS.L.fromStrict stream } conf -> conf , useBom = Nothing } -- | __Encoding:__ -- @[decode] -- (https://encoding.spec.whatwg.org/#decode)@ -- -- Given a character encoding scheme, transform a dependant 'BS.ByteString' -- into a portable 'T.Text'. If any byte sequences are meaningless or -- illegal, they are replaced with the Unicode replacement character @\\xFFFD@. -- -- See 'decodeStep'' to decode only a minimal section, or 'decode' if the -- original data should be retained for custom error reporting. Call -- 'finalizeDecode'' on the returned 'DecoderState' if no further bytes will be -- added to the document stream. decode' :: DecoderState -> BS.ByteString -> (T.Text, DecoderState) decode' state stream = F.B.first (T.L.toStrict . T.L.B.toLazyText . mconcat . map (T.L.B.fromString . E.fromRight [replacementChar])) $ decode state stream -- | Read a binary stream of UTF-8 encoded text. If the stream begins with a -- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but -- remains in the output). Fails (returning a 'Left') if the stream contains -- byte sequences which don't represent any character, or which encode a -- surrogate character. -- -- See 'decodeUtf8'' for simple error replacement, or 'decodeUtf8NoBom' if the -- BOM should always be retained. decodeUtf8 :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState) decodeUtf8 = decode . ignoreSameBom $ initialDecoderState Utf8 where ignoreSameBom state = state { useBom = Just False } -- | __Encoding:__ -- @[UTF-8 decode without BOM or fail] -- (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)@ -- -- Read a binary stream of UTF-8 encoded text. If the stream begins with a -- byte-order mark, it is kept as the first character of the output. Fails -- (returning a 'Left') if the stream contains byte sequences which don't -- represent any character, or which encode a surrogate character. -- -- See 'decodeUtf8NoBom'' for simple error replacement, or 'decodeUtf8'' if a -- redundant UTF-8 BOM should be dropped. decodeUtf8NoBom :: BS.ByteString -> ([Either BS.SH.ShortByteString String], DecoderState) decodeUtf8NoBom = decode . ignoreBom $ initialDecoderState Utf8 where ignoreBom state = state { useBom = Nothing } -- | __Encoding:__ -- @[UTF-8 decode] -- (https://encoding.spec.whatwg.org/#utf-8-decode)@ -- -- Read a binary stream of UTF-8 encoded text. If the stream begins with a -- UTF-8 byte-order mark, it's silently dropped (any other BOM is ignored but -- remains in the output). Any surrogate characters or invalid byte sequences -- are replaced with the Unicode replacement character @\\xFFFD@. -- -- See 'decodeUtf8' if the original data should be retained for custom error -- reporting, or 'decodeUtf8NoBom'' if the BOM should always be retained. decodeUtf8' :: BS.ByteString -> (T.Text, DecoderState) decodeUtf8' = decode' . ignoreSameBom $ initialDecoderState Utf8 where ignoreSameBom state = state { useBom = Just False } -- | __Encoding:__ -- @[UTF-8 decode without BOM] -- (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)@ -- -- Read a binary stream of UTF-8 encoded text. If the stream begins with a -- byte-order mark, it is kept as the first character of the output. Any -- surrogate characters or invalid byte sequences are replaced with the Unicode -- replacement character @\\xFFFD@. -- -- See 'decodeUtf8NoBom' if the original data should be retained for custom -- error reporting, or 'decodeUtf8'' if a redundant UTF-8 BOM should be -- dropped. decodeUtf8NoBom' :: BS.ByteString -> (T.Text, DecoderState) decodeUtf8NoBom' = decode' . ignoreBom $ initialDecoderState Utf8 where ignoreBom state = state { useBom = Nothing } -- | The collection of data which, for any given encoding scheme, results in -- behaviour according to the vanilla decoder before any bytes have been read. initialDecoderState :: Encoding -> DecoderState initialDecoderState enc = DecoderState { decoderConfidence_ = Certain enc , useBom = Just True , innerDecoderState = case enc of Iso2022Jp -> Iso2022DecoderState Iso2022Jp.defaultIso2022JpDecoderState Replacement -> ReplacementDecoderState defaultReplacementDecoderState _ -> SimpleDecoderState , remainderBytes = BS.SH.empty } -- | Explicitly indicate that the input stream will not contain any further -- bytes, and perform any finalization processing based on that. -- -- See 'finalizeDecode'' for simple error replacement. finalizeDecode :: DecoderState -> [Either BS.SH.ShortByteString String] finalizeDecode state | BS.SH.null $ remainderBytes state = [] | otherwise = [Left $ remainderBytes state] -- | Explicitly indicate that the input stream will not contain any further -- bytes, and perform any finalization processing based on that. -- -- See 'finalizeDecode' if the original data should be retained for custom -- error reporting. finalizeDecode' :: DecoderState -> T.Text finalizeDecode' state | BS.SH.null $ remainderBytes state = T.empty | otherwise = T.singleton replacementChar -- | __Encoding:__ -- @[run an encoding's decoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @fatal@ -- -- Read the smallest number of bytes from the head of the 'BS.ByteString' -- which would leave the decoder in a re-enterable state. If any byte -- sequences are meaningless or illegal, they are returned verbatim for error -- reporting; a 'Left' should not be parsed further. -- -- See 'decode' to decode the entire string at once, or 'decodeStep'' for -- simple error replacement. decodeStep :: DecoderState -> BS.ByteString -> (Maybe (Either BS.SH.ShortByteString String), DecoderState, BS.ByteString) decodeStep state stream = wrapOuter $ case innerDecoderState state of Iso2022DecoderState inner -> wrapInner Iso2022DecoderState <$> runParser' streamFull (decoderConfidence_ state, inner) Iso2022Jp.decoder ReplacementDecoderState inner -> wrapInner ReplacementDecoderState <$> runParser' streamFull (decoderConfidence_ state, inner) decoderReplacement SimpleDecoderState -> do let confidence = decoderConfidence_ state p <- M.lookup (confidenceEncoding confidence) decoders wrapInner (const SimpleDecoderState) <$> runParser' streamFull (confidence, ()) p where wrapInner f = U.HT.mapSnd3 $ F.B.second f wrapOuter (Just (out, (enc, innerState'), stream')) = (Just out, state', stream') where state' = state { decoderConfidence_ = enc , innerDecoderState = innerState' , remainderBytes = BS.SH.empty } wrapOuter Nothing = (Nothing, state', BS.empty) where state' = state { remainderBytes = BS.SH.toShort stream } streamFull = BS.SH.fromShort (remainderBytes state) <> stream -- | __Encoding:__ -- @[run an encoding's decoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @replacement@ -- -- Read the smallest number of bytes from the head of the 'BS.ByteString' -- which would leave the decoder in a re-enterable state. Any byte -- sequences which are meaningless or illegal are replaced with the Unicode -- replacement character @\\xFFFD@. -- -- See 'decode'' to decode the entire string at once, or 'decodeStep' if the -- original data should be retained for custom error reporting. decodeStep' :: DecoderState -> BS.ByteString -> (Maybe String, DecoderState, BS.ByteString) decodeStep' state = U.HT.mapFst3 (fmap $ E.fromRight [replacementChar]) . decodeStep state -- | __Encoding:__ -- @[run an encoding's encoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @fatal@ -- -- Given a character encoding scheme, transform a portable 'T.Text' into a -- sequence of bytes representing those characters. If the encoding scheme -- does not define a binary representation for a character in the input, the -- original 'Char' is returned unchanged for custom error reporting. -- -- See 'encodeStep' to encode only a minimal section, or 'encode'' for escaping -- with HTML-style character codes. encode :: EncoderState -> T.Text -> ([Either Char BS.SH.ShortByteString], EncoderState) encode state stream = case encodeStep state stream of Nothing -> ([], state) Just (out, state', stream') -> let (trail, state'') = encode state' stream' in (out : trail, state'') -- | __Encoding:__ -- @[encode] -- (https://encoding.spec.whatwg.org/#encode)@ -- -- Given a character encoding scheme, transform a portable 'T.Text' into a -- sequence of bytes representing those characters. If the encoding scheme -- does not define a binary representation for a character in the input, they -- are replaced with an HTML-style escape (e.g. @"&#0000;"@). -- -- See 'encodeStep'' to encode only a minimal section, or 'encode' if the -- original data should be retained for custom error reporting. encode' :: EncoderState -> T.Text -> (BS.ByteString, EncoderState) encode' state stream = F.B.first (BS.L.toStrict . BS.B.toLazyByteString . mconcat . map (BS.B.shortByteString . encodeReplacement)) $ encode state stream -- | __Encoding:__ -- @[UTF-8 encode] -- (https://encoding.spec.whatwg.org/#utf-8-encode)@ -- -- Transform a portable 'T.Text' into a sequence of bytes according to the -- UTF-8 encoding scheme. encodeUtf8 :: T.Text -> (BS.ByteString, EncoderState) encodeUtf8 = encode' $ initialEncoderState Utf8 -- | The collection of data which, for any given encoding scheme, results in -- behaviour according to the vanilla decoder before any bytes have been read. initialEncoderState :: Encoding -> EncoderState initialEncoderState enc = EncoderState { encoderScheme = enc , innerEncoderState = if enc == Iso2022Jp then Iso2022EncoderState Iso2022Jp.defaultIso2022JpEncoderState else SimpleEncoderState } -- | __Encoding:__ -- @[run an encoding's encoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @fatal@ -- -- Read the smallest number of characters from the head of the 'T.Text' which -- would leave the encoder in a re-enterable state. If the encoding scheme -- does not define a binary representation for a character in the input, the -- original 'Char' is returned unchanged for custom error reporting. -- -- See 'encode' to decode the entire string at once, or 'encodeStep'' for -- simple error replacement. encodeStep :: EncoderState -> T.Text -> Maybe (Either Char BS.SH.ShortByteString, EncoderState, T.Text) encodeStep state stream = wrapOuter <$> case innerEncoderState state of Iso2022EncoderState inner -> wrapInner Iso2022EncoderState <$> runParser' stream inner Iso2022Jp.encoder SimpleEncoderState -> do p <- M.lookup (encoderScheme state) encoders (out, (), t) <- runParser' stream () p return (out, SimpleEncoderState, t) where wrapInner = U.HT.mapSnd3 wrapOuter (out, innerState', stream') = (out, state', stream') where state' = state { innerEncoderState = innerState' } -- | __Encoding:__ -- @[run an encoding's encoder] -- (https://encoding.spec.whatwg.org/#concept-encoding-run)@ -- with error mode @html@ -- -- Read the smallest number of characters from the head of the 'T.Text' which -- would leave the encoder in a re-enterable state. If the encoding scheme -- does not define a binary representation for a character in the input, they -- are replaced with an HTML-style escape (e.g. @"&#0000;"@). -- -- See 'encode'' to encode the entire string at once, or 'encodeStep' if the -- original data should be retained for custom error reporting. encodeStep' :: EncoderState -> T.Text -> Maybe (BS.SH.ShortByteString, EncoderState, T.Text) encodeStep' state stream = U.HT.mapFst3 encodeReplacement <$> encodeStep state stream -- | Abstract the common core to both the decoder and the encoder: end the -- parser if the input has been exhausted, optionally recover if the input is -- meaningless, and otherwise flatten the nested tuples in the result. runParser' :: stream -> state -> N.S.StateT state (ParserT stream Maybe) out -> Maybe (out, state, stream) runParser' stream state p' = do ((out, state'), stream') <- runParserT (N.S.runStateT p' state) stream return (out, state', stream') -- | Convert a 'Char' which fails to parse into an HTML-style numeric escape, -- and try parsing that string instead. encodeReplacement :: Either Char BS.SH.ShortByteString -> BS.SH.ShortByteString encodeReplacement = either recovery id where recovery err = BS.SH.pack . map (fromIntegral . fromEnum) $ "&#" ++ show (fromEnum err) ++ ";" -- $internal -- These types will almost certainly not be useful for anyone using the -- library, and are exported purely for internal usage. They can be safely -- ignored. Note, however, that they may be removed without warning. -- | The union of all state variables tracked by the bytes-to-'Char' decoding -- algorithm of a single encoding scheme. data InnerDecoderState = SimpleDecoderState -- ^ Null constructor for encodings which don't require persistant -- state in the decoding algorithm. | Iso2022DecoderState Iso2022Jp.Iso2022JpDecoderState -- ^ The data used by the 'Iso2022Jp' encoding scheme. | ReplacementDecoderState ReplacementDecoderState -- ^ The data used by the 'Replacement' encoding scheme. deriving ( Eq, Show, Read ) -- | The union of all state variables tracked by the 'Char'-to-bytes encoding -- algorithm of a single encoding scheme. data InnerEncoderState = SimpleEncoderState -- ^ Null constructor for encodings which don't require persistant -- state in the decoding algorithm. | Iso2022EncoderState Iso2022Jp.Iso2022JpEncoderState -- ^ The data used by the 'Iso2022Jp' encoding scheme. deriving ( Eq, Show, Read ) -- | The registry of 'Encoding' schemes to their byte-to-character -- algorithms. decoders :: M.HashMap Encoding TextBuilder decoders = M.fromList $ [ (Utf8, Utf8.decoder) , (Utf16be, Utf16.decoderBigEndian) , (Utf16le, Utf16.decoderLittleEndian) , (Big5, Big5.decoder) , (EucJp, EucJp.decoder) , (EucKr, EucKr.decoder) , (Gb18030, GB.decoder) , (Gbk, GB.decoder) , (ShiftJis, ShiftJis.decoder) , (UserDefined, decoderUserDefined) ] ++ [ (enc, Single.decoder enc) | enc <- Single.encodings ] -- | The registry of 'Encoding' schemes to their character-to-byte -- algorithms. encoders :: M.HashMap Encoding BinaryBuilder encoders = M.fromList $ -- 'Replacement', 'Utf16be', and 'Utf16le' are deliberately not included as -- per the specification. [ (Utf8, Utf8.encoder) , (Big5, Big5.encoder) , (EucJp, EucJp.encoder) , (EucKr, EucKr.encoder) , (Gb18030, GB.encoderGb18030) , (Gbk, GB.encoderGbk) , (ShiftJis, ShiftJis.encoder) , (UserDefined, encoderUserDefined) ] ++ [ (enc, Single.encoder enc) | enc <- Single.encodings ] -- | __Encoding:__ -- @[replacement decoder] -- (https://encoding.spec.whatwg.org/#replacement-decoder)@ -- -- Return a single @\\xFFFD@ replacement character for the entire input stream. decoderReplacement :: StateTextBuilder ReplacementDecoderState decoderReplacement = do state <- getDecoderState if state then A.empty else modifyDecoderState (const True) *> next >>= \b -> emit [b] replacementChar -- | Track whether the single allowed replacement character has already been -- emitted. type ReplacementDecoderState = Bool -- | The default initial state to kickstart the 'Replacement' decoder. defaultReplacementDecoderState :: ReplacementDecoderState defaultReplacementDecoderState = False -- | __Encoding:__ -- @[x-user-defined decoder] -- (https://encoding.spec.whatwg.org/#x-user-defined-decoder)@ -- -- Remap bytes above the ASCII range to characters in the Private Use Area. decoderUserDefined :: TextBuilder decoderUserDefined = next >>= switch [ If (<= 0x7F) toUnicode1 , Else $ \b -> emit [b] . toEnum . (+ 0xF700) $ fromIntegral b ] -- | __Encoding:__ -- @[x-user-defined encoder] -- (https://encoding.spec.whatwg.org/#x-user-defined-encoder)@ -- -- Remap a few characters in the Private Use Area to single-byte values just -- above the ASCII characters. This is of very limited value except as an -- inverse for @decoderUserDefined@. encoderUserDefined :: BinaryBuilder encoderUserDefined = next >>= switch [ If C.isAscii fromAscii , If (range '\xF780' '\xF7FF') $ return . pure . BS.SH.pack . (: []) . fromIntegral . subtract 0xF700 . fromEnum , Else encoderFailure ] -- | Store the given binary sequence as unparsable without further input, to be -- prepended to the beginning of stream on the next 'decode' or 'decode'' call. setRemainder :: BS.SH.ShortByteString -> DecoderState -> DecoderState setRemainder bs state = state { remainderBytes = bs } -- | Retrieve the encoding scheme currently used by the decoder to decode the -- binary document stream. decoderEncoding :: DecoderState -> Encoding decoderEncoding = confidenceEncoding . decoderConfidence_ -- | Instruct the decoder that the binary document stream is /known/ to be in -- the certain encoding. setEncodingCertain :: Encoding -> DecoderState -> DecoderState setEncodingCertain enc state = state { decoderConfidence_ = Certain enc } -- | Any leftover bytes at the end of the binary stream, which require further -- input to be processed in order to correctly map to a character or error -- value. decoderRemainder :: DecoderState -> BS.SH.ShortByteString decoderRemainder = remainderBytes