{-# LANGUAGE CPP #-} {-| Description: Clean a 'Char' stream by normalizing newlines and warning about control characters. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable To simplify the tokenization parsers, the many representations of line breaks are unified into a single, Unix-style @\\n@. While we're iterating over the input, and before some of the special characters are replaced, it's also a good time to trigger the warnings for unexpected characters ('ControlCharacterInInputStream', 'SurrogateInInputStream', and 'NoncharacterInInputStream'). -} module Web.Mangrove.Parse.Encoding.Preprocess ( preprocess , preprocessStep -- * Initialization , Encoding ( .. ) , DecoderState , initialDecoderState ) where import qualified Data.Bifunctor as F.B import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.Tuple.HT as U.HT #if MIN_VERSION_base(4,11,0) #else import Data.Semigroup ( (<>) ) #endif import Web.Mangrove.Parse.Common.Error import Web.Willow.Common.Encoding import Web.Willow.Common.Encoding.Character import Web.Willow.Common.Parser.Util -- | __Encoding:__ -- @[preprocessing the input stream] -- (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@ -- -- Given a character encoding scheme, transform a dependant 'BS.ByteString' -- into portable 'Char's. If any byte sequences are meaningless or illegal, -- they are replaced with the Unicode replacement character @\\xFFFD@. All -- newlines are normallized to a single @\\n@ 'Char', and Unicode control -- characters, surrogate characters, and non-characters are marked with the -- proper errors. -- -- See 'preprocessStep' to operate over only a minimal section. preprocess :: DecoderState -> BS.ByteString -> ([([ParseError], Char)], DecoderState) preprocess state = F.B.first (normalize . map charError . concatMap flatten) . decode state where normalize ((err1, '\r'):(err2, '\n'):cs) = (err1 <> err2, '\n') : normalize cs normalize ((err, '\r'):cs) = (err, '\n') : normalize cs normalize (c:cs) = c : normalize cs normalize [] = [] -- | __Encoding:__ -- @[preprocessing the input stream] -- (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@ -- -- Read the smallest number of bytes from the head of the 'BS.ByteString' -- which would leave the decoder in a re-enterable state. Any byte -- sequences which are meaningless or illegal are replaced with the Unicode -- replacement character @\\xFFFD@. All newlines are normallized to a single -- @\\n@ 'Char', and Unicode control characters, surrogate characters, and -- non-characters are marked with the proper errors. -- -- See 'preprocess' to operate over the entire string at once. preprocessStep :: DecoderState -> BS.ByteString -> ([([ParseError], Char)], DecoderState, BS.ByteString) preprocessStep state stream = normalize' . flatten' $ decodeStep state stream where flatten' = U.HT.mapFst3 (maybe [] $ map charError . flatten) normalize' c'@([], _, _) = c' normalize' ([(errs, '\r')], state', stream') = case flatten' $ decodeStep state' stream' of ((errs', '\n'):cs, state'', stream'') -> normalize' ((errs ++ errs', '\n') : cs, state'', stream'') _ -> ([(errs, '\n')], state', stream') normalize' ((errs, '\r'):(errs', '\n'):cs, state', stream') = U.HT.mapFst3 ((errs ++ errs', '\n') :) $ normalize' (cs, state', stream') normalize' ((errs, '\r'):cs, state', stream') = U.HT.mapFst3 ((errs, '\n') :) $ normalize' (cs, state', stream') normalize' (c:cs, state', stream') = U.HT.mapFst3 (c :) $ normalize' (cs, state', stream') -- | Add a 'ControlCharacterInInputStream', 'SurrogateInInputStream', or -- 'NoncharacterInInputStream' error to the relevant characters. charError :: ([ParseError], Char) -> ([ParseError], Char) charError c'@(_, c) | range '\xD800' '\xDFFF' c = addErr SurrogateInInputStream | range '\xFDD0' '\xFDEF' c = addErr NoncharacterInInputStream | noncharacter = addErr NoncharacterInInputStream | range '\SOH' '\US' c && notElem c "\t\n\f\r" = addErr ControlCharacterInInputStream | range '\DEL' '\x9F' c = addErr ControlCharacterInInputStream | otherwise = c' where noncharacter = case mod (fromEnum c) 0x10000 of 0xFFFE -> True 0xFFFF -> True _ -> False -- If this function could ever be applied twice, any errors would be -- duplicated. As it's not exported from the module, that shouldn't be -- an issue. addErr err = F.B.first (err :) c' -- | Rewrap the split type containing a list into a list at the top level. Any -- 'Left' errors are replaced by @[('InvalidByteSequence' /bytes/, '\\xFFFD')]@. flatten :: Either BS.SH.ShortByteString String -> [([ParseError], Char)] flatten = either (\err -> [([InvalidByteSequence err], replacementChar)]) (map $ \c -> ([], c))