{-# LANGUAGE CPP #-}

{-|
Description:    Clean a 'Char' stream by normalizing newlines and warning about control characters.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable

To simplify the tokenization parsers, the many representations of line breaks
are unified into a single, Unix-style @\\n@.  While we're iterating over the
input, and before some of the special characters are replaced, it's also a good
time to trigger the warnings for unexpected characters
('ControlCharacterInInputStream', 'SurrogateInInputStream', and
'NoncharacterInInputStream').
-}
module Web.Mangrove.Parse.Encoding.Preprocess
    ( preprocess
    , preprocessStep
      -- * Initialization
    , Encoding ( .. )
    , DecoderState
    , initialDecoderState
    ) where


import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Tuple.HT as U.HT

#if MIN_VERSION_base(4,11,0)
#else
import Data.Semigroup ( (<>) )
#endif

import Web.Mangrove.Parse.Common.Error
import Web.Willow.Common.Encoding
import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser.Util


-- | __Encoding:__
--      @[preprocessing the input stream]
--      (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@
-- 
-- Given a character encoding scheme, transform a dependant 'BS.ByteString'
-- into portable 'Char's.  If any byte sequences are meaningless or illegal,
-- they are replaced with the Unicode replacement character @\\xFFFD@.  All
-- newlines are normallized to a single @\\n@ 'Char', and Unicode control
-- characters, surrogate characters, and non-characters are marked with the
-- proper errors.
-- 
-- See 'preprocessStep' to operate over only a minimal section.
preprocess :: DecoderState -> BS.ByteString -> ([([ParseError], Char)], DecoderState)
preprocess state = F.B.first (normalize . map charError . concatMap flatten) . decode state
  where normalize ((err1, '\r'):(err2, '\n'):cs) = (err1 <> err2, '\n') : normalize cs
        normalize ((err, '\r'):cs) = (err, '\n') : normalize cs
        normalize (c:cs) = c : normalize cs
        normalize [] = []

-- | __Encoding:__
--      @[preprocessing the input stream]
--      (https://encoding.spec.whatwg.org/#preprocessing-the-input-stream)@
-- 
-- Read the smallest number of bytes from the head of the 'BS.ByteString'
-- which would leave the decoder in a re-enterable state.  Any byte
-- sequences which are meaningless or illegal are replaced with the Unicode
-- replacement character @\\xFFFD@.  All newlines are normallized to a single
-- @\\n@ 'Char', and Unicode control characters, surrogate characters, and
-- non-characters are marked with the proper errors.
-- 
-- See 'preprocess' to operate over the entire string at once.
preprocessStep
    :: DecoderState
    -> BS.ByteString
    -> ([([ParseError], Char)], DecoderState, BS.ByteString)
preprocessStep state stream = normalize' . flatten' $ decodeStep state stream
  where flatten' = U.HT.mapFst3 (maybe [] $ map charError . flatten)
        normalize' c'@([], _, _) = c'
        normalize' ([(errs, '\r')], state', stream') = case flatten' $ decodeStep state' stream' of
            ((errs', '\n'):cs, state'', stream'') ->
                normalize' ((errs ++ errs', '\n') : cs, state'', stream'')
            _ -> ([(errs, '\n')], state', stream')
        normalize' ((errs, '\r'):(errs', '\n'):cs, state', stream') =
            U.HT.mapFst3 ((errs ++ errs', '\n') :) $ normalize' (cs, state', stream')
        normalize' ((errs, '\r'):cs, state', stream') =
            U.HT.mapFst3 ((errs, '\n') :) $ normalize' (cs, state', stream')
        normalize' (c:cs, state', stream') =
            U.HT.mapFst3 (c :) $ normalize' (cs, state', stream')


-- | Add a 'ControlCharacterInInputStream', 'SurrogateInInputStream', or
-- 'NoncharacterInInputStream' error to the relevant characters.
charError :: ([ParseError], Char) -> ([ParseError], Char)
charError c'@(_, c)
    | range '\xD800' '\xDFFF' c = addErr SurrogateInInputStream
    | range '\xFDD0' '\xFDEF' c = addErr NoncharacterInInputStream
    | noncharacter = addErr NoncharacterInInputStream
    | range '\SOH' '\US' c && notElem c "\t\n\f\r" = addErr ControlCharacterInInputStream
    | range '\DEL' '\x9F' c = addErr ControlCharacterInInputStream
    | otherwise = c'
  where noncharacter = case mod (fromEnum c) 0x10000 of
            0xFFFE -> True
            0xFFFF -> True
            _ -> False
        -- If this function could ever be applied twice, any errors would be
        -- duplicated.  As it's not exported from the module, that shouldn't be
        -- an issue.
        addErr err = F.B.first (err :) c'

-- | Rewrap the split type containing a list into a list at the top level.  Any
-- 'Left' errors are replaced by @[('InvalidByteSequence' /bytes/, '\\xFFFD')]@.
flatten :: Either BS.SH.ShortByteString String -> [([ParseError], Char)]
flatten = either (\err -> [([InvalidByteSequence err], replacementChar)]) (map $ \c -> ([], c))