{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Trustworthy #-}

{-|
Description:    Character translation functions to and from the ISO-2022-JP encoding scheme.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.Iso2022Jp
    ( Iso2022JpMode ( .. )
      -- * Decoder
    , decoder
    , Iso2022JpDecoderState ( .. )
    , defaultIso2022JpDecoderState
    , Iso2022TextBuilder
      -- * Encoder
    , encoder
    , Iso2022JpEncoderState ( .. )
    , defaultIso2022JpEncoderState
    , Iso2022BinaryBuilder
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad as N

import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Maybe as Y
import qualified Data.Vector as V

import Control.Applicative ( (<|>) )
import Data.Functor ( ($>) )
import Data.Vector ( (!?) )

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch

import Web.Willow.Common.Encoding.EucJp
    ( decodeIndex0208, encodeIndex0208 )


-- | 'Iso2022Jp' is a "sticky-shift" encoding, where the escape sequences
-- change the behaviour of every following character until the next escape; it
-- therefore needs to track which character mode it is currently in.
-- Additionally, escape sequences need to be separated by at least one real
-- character, and while that could be ensured with pure code, we may as well
-- add it to the state data being tracked anyway.
data Iso2022JpDecoderState = Iso2022JpDecoderState
    { decoderMode :: Iso2022JpMode
        -- ^ The set of character mappings currently "loaded" into the
        -- algorithm.
    , decoderErrorOnEscape :: Bool
        -- ^ Whether an output character is /required/ ('True') or if an escape
        -- sequence is allowed ('False').
    }
  deriving ( Eq, Show, Read )

-- | 'Iso2022Jp' is a "sticky-shift" encoding, where the escape sequences
-- change the behaviour of every following character until the next escape; it
-- therefore needs to track which character mode it is currently in.
newtype Iso2022JpEncoderState = Iso2022JpEncoderState
    { encoderMode :: Iso2022JpMode
        -- ^ The set of character mappings currently "loaded" into the
        -- algorithm.
    }
  deriving ( Eq, Show, Read )

-- | The various 'Char'-byte mappings which may be loaded by the 'Iso2022Jp'
-- algorithms.
data Iso2022JpMode
    = Ascii
        -- ^ The basic, seven-bit ASCII set.
    | Roman
        -- ^ As 'Ascii', but replacing backslash (@\'\\'@) with yen (@\'¥'@)
        -- and tilde (@'~'@) with overline (@\'‾'@).
    | Katakana
        -- ^ The standard block of halfwidth katakana located in Unicode
        -- between @\\xFF61@ and @\\xFF9F@.
        -- 
        -- Note that this has identical behaviour to 'Multibyte' when passed to
        -- 'encoder'; halfwidth katakana are instead encoded as fullwidth.
    | Multibyte
        -- ^ An encoding of the full jis0208 set into a seven-bit-compatible
        -- transmission scheme.
  deriving ( Eq, Ord, Bounded, Enum, Show, Read )

-- | The default initial state to kickstart the 'Iso2022Jp' decoder.
defaultIso2022JpDecoderState :: Iso2022JpDecoderState
defaultIso2022JpDecoderState = Iso2022JpDecoderState
    { decoderMode = Ascii
    , decoderErrorOnEscape = False
    }

-- | The default initial state to kickstart the 'Iso2022Jp' encoder.
defaultIso2022JpEncoderState :: Iso2022JpEncoderState
defaultIso2022JpEncoderState = Iso2022JpEncoderState
    { encoderMode = Ascii
    }


-- | Shorthand type for parser combinators written for the 'Iso2022Jp' decoding
-- algorithm.
type Iso2022TextBuilder = StateTextBuilder Iso2022JpDecoderState

-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with the 'Iso2022Jp' encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoder :: Iso2022TextBuilder
decoder = do
    state <- getDecoderState
    char <- case decoderMode state of
        Ascii -> decoderAscii
        Roman -> decoderRoman
        Katakana -> decoderKatakana
        Multibyte -> decoderMultibyte
    N.unless (char == Right "") clearDecoderOutput
    return char

-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
--      @ASCII@ case
-- 
-- Process the head of a binary stream according to the 'Ascii' character set.
decoderAscii :: Iso2022TextBuilder
decoderAscii = next >>= switch
    [ If (`elem` [0x0E, 0x0F]) decoderFailure1
    , If_ (== toByte '\ESC') decoderEscape
    , If isAsciiByte toUnicode1
    , Else decoderFailure1
    ]

-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
--      @Roman@ case
-- 
-- Process the head of a binary stream according to the 'Roman' character set.
decoderRoman :: Iso2022TextBuilder
decoderRoman = next >>= switch
    [ If (`elem` [0x0E, 0x0F]) decoderFailure1
    , If_ (== toByte '\ESC') decoderEscape
    , If_ (== toByte '\\') $ emit [toByte '\\'] '\x00A5'
    , If_ (== toByte '~') $ emit [toByte '~'] '\x203E'
    , If isAsciiByte toUnicode1
    , Else decoderFailure1
    ]


-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
--      @katakana@ case
-- 
-- Process the head of a binary stream according to the 'Katakana' character
-- set.
decoderKatakana :: Iso2022TextBuilder
decoderKatakana = next >>= switch
    [ If_ (== toByte '\ESC') decoderEscape
    , If (range 0x21 0x5F) $ \b -> emit [b] . toEnum . (+ 0xFF40) $ fromIntegral b
    , Else decoderFailure1
    ]

-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
--      @lead byte@ and @trail byte@ cases
-- 
-- Process the head of a binary stream according to the 'Multibyte' character
-- set.
decoderMultibyte :: Iso2022TextBuilder
decoderMultibyte = next >>= switch
    [ If_ (== toByte '\ESC') decoderEscape
    , If (range 0x21 0x7E) $ \b -> decoderMultibyte' b <|> decoderFailure1 b
    , Else decoderFailure1
    ]
  where decoderMultibyte' lead = next >>= switch
            [ If_ (== toByte '\ESC') $ push (toByte '\ESC') *> decoderFailure1 lead
            , If (range 0x21 0x7E) $ \second ->
                maybe (decoderFailure [lead, second]) (emit [lead, second]) . decodeIndex0208 $
                (fromIntegral lead - 0x21) * 94 + fromIntegral second - 0x21
            , Else $ decoderFailure2 lead
            ]

-- | __Encoding:__
--      @[ISO-2022-JP decoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@
--      @Escape start@ and @Escape@ cases
-- 
-- Process the head of a binary stream as an escape sequence transitioning from
-- one character set to another.
decoderEscape :: Iso2022TextBuilder
decoderEscape = do
    err <- decoderErrorOnEscape <$> getDecoderState
    nextChunk 2 >>= switch
        [ If (== "$@") $ switchMode err Multibyte
        , If (== "$B") $ switchMode err Multibyte
        , If (== "(B") $ switchMode err Ascii
        , If (== "(J") $ switchMode err Roman
        , If (== "(I") $ switchMode err Katakana
        , Else $ \bs -> do
            pushChunk bs
            clearDecoderOutput
            decoderFailure1 $ toByte '\ESC'
        ]
  where switchMode err mode bs = do
            modifyDecoderState $ \state -> state
                { decoderMode = mode
                , decoderErrorOnEscape = True
                }
            if err
                then decoderFailure $ BS.unpack bs
                else return $ pure []

-- | Mark that some character or error output has occurred, and so allow an
-- escape sequence without complaint.
clearDecoderOutput :: StateDecoder Iso2022JpDecoderState ()
clearDecoderOutput = modifyDecoderState $ \s -> s
    { decoderErrorOnEscape = False
    }


-- | Shorthand type for parser combinators written for the 'Iso2022Jp' encoding
-- algorithm.
type Iso2022BinaryBuilder = StateBinaryBuilder Iso2022JpEncoderState

-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
-- 
-- Encode the first 'Char' in a string according to the 'Iso2022Jp' encoding
-- scheme, or return that same character if that scheme doesn't define a binary
-- representation for it.
encoder :: Iso2022BinaryBuilder
encoder = do
    state <- getEncoderState
    bs' <- A.optional $ case encoderMode state of
        Ascii -> encoderAscii
        Roman -> encoderRoman
        Katakana -> encoderMultibyte True
        Multibyte -> encoderMultibyte True
    case bs' of
        Nothing | encoderMode state == Ascii -> A.empty
        Nothing -> setEncoderMode Ascii
        Just bs -> return bs

-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
--      steps 3-4, 7-12
-- 
-- Process the head of a 'Data.Text.Lazy.Text' string starting in the 'Ascii'
-- character set, insert the appropriate escape sequence if a mode transition
-- is required, or return 'Left' if all else fails.
encoderAscii :: Iso2022BinaryBuilder
encoderAscii = next >>= \c -> switch
    [ If_ (`elem` ['\SO', '\SI', '\ESC']) $ encoderFailure '\xFFFD'
    , If C.isAscii fromAscii
    , If_ (`elem` ['\x00A5', '\x203E']) $ push c *> setEncoderMode Roman
    ] c <|> encoderFallback c

-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
--      steps 5-6, 8-12
-- 
-- Process the head of a 'Data.Text.Lazy.Text' string starting in the 'Roman'
-- character set, insert the appropriate escape sequence if a mode transition
-- is required, or return 'Left' if all else fails.
encoderRoman :: Iso2022BinaryBuilder
encoderRoman = next >>= \c -> switch
    [ If_ (`elem` ['\SO', '\SI', '\ESC']) $ encoderFailure '\xFFFD'
    , If_ (`elem` ['\\', '~']) $ push c *> setEncoderMode Ascii
    , If C.isAscii fromAscii
    , If_ (== '\x00A5') $ return (pure "\\")
    , If_ (== '\x203E') $ return (pure "~")
    ] c <|> encoderFallback c

-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
--      steps 8-12
-- 
-- Attempt to parse the character in (specifically) the 'Multibyte' mode, and
-- return 'Left' if that fails.
encoderFallback :: Char -> Iso2022BinaryBuilder
encoderFallback c = A.optional encoderFallback' >>= maybe (encoderFailure c) return
  where encoderFallback' = do
            push c
            escape <- setEncoderMode Multibyte
            _ <- lookAhead $ encoderMultibyte False
            return escape

-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
--      steps 6-15
-- 
-- Process the head of a 'Data.Text.Lazy.Text' string starting in the
-- 'Multibyte' character set, insert the appropriate escape sequence if a mode
-- transition is required, or return 'Left' if all else fails.
encoderMultibyte
    :: Bool
        -- ^ Whether the encoder is allowed to switch to another mode, or
        -- whether it's being used as a fallback and so must remain in
        -- 'Multibyte'.
    -> Iso2022BinaryBuilder
encoderMultibyte allowRecovery = next >>= \c -> switch
    [ If_ C.isAscii $ push c *> setEncoderMode' Ascii
    , If_ (`elem` ['\x00A5', '\x203E']) $ push c *> setEncoderMode' Roman
    , If_ (== '\x2212') $ indexBytes c (encodeIndex0208 '\xFF0D')
    , If (range '\xFF61' '\xFF9F') $ indexBytes c . encodeIndex0208 . remapKatakana
    , Else $ indexBytes c . encodeIndex0208
    ] c
  where setEncoderMode' s
            | allowRecovery = setEncoderMode s
            | otherwise = A.empty
        indexBytes c Nothing = do
            state <- getEncoderState
            if encoderMode state `elem` [Katakana, Multibyte]
                then push c *> setEncoderMode' Ascii
                else encoderFailure c
        indexBytes _ (Just code) = return . pure $
            BS.SH.pack [fromIntegral lead + 0x21, fromIntegral trail + 0x21]
          where (lead, trail) = divMod code 94


-- | Transition the encoder from one character set to another, emitting the
-- appropriate binary escape sequence.
setEncoderMode :: Iso2022JpMode -> Iso2022BinaryBuilder
setEncoderMode mode = modifyEncoderState setEncoderMode' $> pure esc
  where esc = case mode of
            Ascii -> "\ESC(B"
            Roman -> "\ESC(J"
            Katakana -> "\ESC$B"
            Multibyte -> "\ESC$B"
        setEncoderMode' state = state
            { encoderMode = mode
            }


-- | __Encoding:__
--      @[ISO-2022-JP encoder]
--      (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@
--      step 9
-- 
-- Expand halfwidth katakana into their (more appropriate) fullwidth forms.
remapKatakana :: Char -> Char
remapKatakana c = Y.fromMaybe '\xFFFD' $ indexKatakana !? (fromEnum c - 0xFF61)

-- | The mapping between halfwidth and fullwidth katakana, in an easily-indexed
-- structure.
indexKatakana :: V.Vector Char
indexKatakana = V.fromList . map snd $ loadIndex "iso-2022-jp-katakana"