{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Trustworthy #-} {-| Description: Character translation functions to and from the ISO-2022-JP encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.Iso2022Jp ( Iso2022JpMode ( .. ) -- * Decoder , decoder , Iso2022JpDecoderState ( .. ) , defaultIso2022JpDecoderState , Iso2022TextBuilder -- * Encoder , encoder , Iso2022JpEncoderState ( .. ) , defaultIso2022JpEncoderState , Iso2022BinaryBuilder ) where import qualified Control.Applicative as A import qualified Control.Monad as N import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Maybe as Y import qualified Data.Vector as V import Control.Applicative ( (<|>) ) import Data.Functor ( ($>) ) import Data.Vector ( (!?) ) import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch import Web.Willow.Common.Encoding.EucJp ( decodeIndex0208, encodeIndex0208 ) -- | 'Iso2022Jp' is a "sticky-shift" encoding, where the escape sequences -- change the behaviour of every following character until the next escape; it -- therefore needs to track which character mode it is currently in. -- Additionally, escape sequences need to be separated by at least one real -- character, and while that could be ensured with pure code, we may as well -- add it to the state data being tracked anyway. data Iso2022JpDecoderState = Iso2022JpDecoderState { decoderMode :: Iso2022JpMode -- ^ The set of character mappings currently "loaded" into the -- algorithm. , decoderErrorOnEscape :: Bool -- ^ Whether an output character is /required/ ('True') or if an escape -- sequence is allowed ('False'). } deriving ( Eq, Show, Read ) -- | 'Iso2022Jp' is a "sticky-shift" encoding, where the escape sequences -- change the behaviour of every following character until the next escape; it -- therefore needs to track which character mode it is currently in. newtype Iso2022JpEncoderState = Iso2022JpEncoderState { encoderMode :: Iso2022JpMode -- ^ The set of character mappings currently "loaded" into the -- algorithm. } deriving ( Eq, Show, Read ) -- | The various 'Char'-byte mappings which may be loaded by the 'Iso2022Jp' -- algorithms. data Iso2022JpMode = Ascii -- ^ The basic, seven-bit ASCII set. | Roman -- ^ As 'Ascii', but replacing backslash (@\'\\'@) with yen (@\'¥'@) -- and tilde (@'~'@) with overline (@\'‾'@). | Katakana -- ^ The standard block of halfwidth katakana located in Unicode -- between @\\xFF61@ and @\\xFF9F@. -- -- Note that this has identical behaviour to 'Multibyte' when passed to -- 'encoder'; halfwidth katakana are instead encoded as fullwidth. | Multibyte -- ^ An encoding of the full jis0208 set into a seven-bit-compatible -- transmission scheme. deriving ( Eq, Ord, Bounded, Enum, Show, Read ) -- | The default initial state to kickstart the 'Iso2022Jp' decoder. defaultIso2022JpDecoderState :: Iso2022JpDecoderState defaultIso2022JpDecoderState = Iso2022JpDecoderState { decoderMode = Ascii , decoderErrorOnEscape = False } -- | The default initial state to kickstart the 'Iso2022Jp' encoder. defaultIso2022JpEncoderState :: Iso2022JpEncoderState defaultIso2022JpEncoderState = Iso2022JpEncoderState { encoderMode = Ascii } -- | Shorthand type for parser combinators written for the 'Iso2022Jp' decoding -- algorithm. type Iso2022TextBuilder = StateTextBuilder Iso2022JpDecoderState -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'Iso2022Jp' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: Iso2022TextBuilder decoder = do state <- getDecoderState char <- case decoderMode state of Ascii -> decoderAscii Roman -> decoderRoman Katakana -> decoderKatakana Multibyte -> decoderMultibyte N.unless (char == Right "") clearDecoderOutput return char -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- @ASCII@ case -- -- Process the head of a binary stream according to the 'Ascii' character set. decoderAscii :: Iso2022TextBuilder decoderAscii = next >>= switch [ If (`elem` [0x0E, 0x0F]) decoderFailure1 , If_ (== toByte '\ESC') decoderEscape , If isAsciiByte toUnicode1 , Else decoderFailure1 ] -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- @Roman@ case -- -- Process the head of a binary stream according to the 'Roman' character set. decoderRoman :: Iso2022TextBuilder decoderRoman = next >>= switch [ If (`elem` [0x0E, 0x0F]) decoderFailure1 , If_ (== toByte '\ESC') decoderEscape , If_ (== toByte '\\') $ emit [toByte '\\'] '\x00A5' , If_ (== toByte '~') $ emit [toByte '~'] '\x203E' , If isAsciiByte toUnicode1 , Else decoderFailure1 ] -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- @katakana@ case -- -- Process the head of a binary stream according to the 'Katakana' character -- set. decoderKatakana :: Iso2022TextBuilder decoderKatakana = next >>= switch [ If_ (== toByte '\ESC') decoderEscape , If (range 0x21 0x5F) $ \b -> emit [b] . toEnum . (+ 0xFF40) $ fromIntegral b , Else decoderFailure1 ] -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- @lead byte@ and @trail byte@ cases -- -- Process the head of a binary stream according to the 'Multibyte' character -- set. decoderMultibyte :: Iso2022TextBuilder decoderMultibyte = next >>= switch [ If_ (== toByte '\ESC') decoderEscape , If (range 0x21 0x7E) $ \b -> decoderMultibyte' b <|> decoderFailure1 b , Else decoderFailure1 ] where decoderMultibyte' lead = next >>= switch [ If_ (== toByte '\ESC') $ push (toByte '\ESC') *> decoderFailure1 lead , If (range 0x21 0x7E) $ \second -> maybe (decoderFailure [lead, second]) (emit [lead, second]) . decodeIndex0208 $ (fromIntegral lead - 0x21) * 94 + fromIntegral second - 0x21 , Else $ decoderFailure2 lead ] -- | __Encoding:__ -- @[ISO-2022-JP decoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-decoder)@ -- @Escape start@ and @Escape@ cases -- -- Process the head of a binary stream as an escape sequence transitioning from -- one character set to another. decoderEscape :: Iso2022TextBuilder decoderEscape = do err <- decoderErrorOnEscape <$> getDecoderState nextChunk 2 >>= switch [ If (== "$@") $ switchMode err Multibyte , If (== "$B") $ switchMode err Multibyte , If (== "(B") $ switchMode err Ascii , If (== "(J") $ switchMode err Roman , If (== "(I") $ switchMode err Katakana , Else $ \bs -> do pushChunk bs clearDecoderOutput decoderFailure1 $ toByte '\ESC' ] where switchMode err mode bs = do modifyDecoderState $ \state -> state { decoderMode = mode , decoderErrorOnEscape = True } if err then decoderFailure $ BS.unpack bs else return $ pure [] -- | Mark that some character or error output has occurred, and so allow an -- escape sequence without complaint. clearDecoderOutput :: StateDecoder Iso2022JpDecoderState () clearDecoderOutput = modifyDecoderState $ \s -> s { decoderErrorOnEscape = False } -- | Shorthand type for parser combinators written for the 'Iso2022Jp' encoding -- algorithm. type Iso2022BinaryBuilder = StateBinaryBuilder Iso2022JpEncoderState -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- -- Encode the first 'Char' in a string according to the 'Iso2022Jp' encoding -- scheme, or return that same character if that scheme doesn't define a binary -- representation for it. encoder :: Iso2022BinaryBuilder encoder = do state <- getEncoderState bs' <- A.optional $ case encoderMode state of Ascii -> encoderAscii Roman -> encoderRoman Katakana -> encoderMultibyte True Multibyte -> encoderMultibyte True case bs' of Nothing | encoderMode state == Ascii -> A.empty Nothing -> setEncoderMode Ascii Just bs -> return bs -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- steps 3-4, 7-12 -- -- Process the head of a 'Data.Text.Lazy.Text' string starting in the 'Ascii' -- character set, insert the appropriate escape sequence if a mode transition -- is required, or return 'Left' if all else fails. encoderAscii :: Iso2022BinaryBuilder encoderAscii = next >>= \c -> switch [ If_ (`elem` ['\SO', '\SI', '\ESC']) $ encoderFailure '\xFFFD' , If C.isAscii fromAscii , If_ (`elem` ['\x00A5', '\x203E']) $ push c *> setEncoderMode Roman ] c <|> encoderFallback c -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- steps 5-6, 8-12 -- -- Process the head of a 'Data.Text.Lazy.Text' string starting in the 'Roman' -- character set, insert the appropriate escape sequence if a mode transition -- is required, or return 'Left' if all else fails. encoderRoman :: Iso2022BinaryBuilder encoderRoman = next >>= \c -> switch [ If_ (`elem` ['\SO', '\SI', '\ESC']) $ encoderFailure '\xFFFD' , If_ (`elem` ['\\', '~']) $ push c *> setEncoderMode Ascii , If C.isAscii fromAscii , If_ (== '\x00A5') $ return (pure "\\") , If_ (== '\x203E') $ return (pure "~") ] c <|> encoderFallback c -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- steps 8-12 -- -- Attempt to parse the character in (specifically) the 'Multibyte' mode, and -- return 'Left' if that fails. encoderFallback :: Char -> Iso2022BinaryBuilder encoderFallback c = A.optional encoderFallback' >>= maybe (encoderFailure c) return where encoderFallback' = do push c escape <- setEncoderMode Multibyte _ <- lookAhead $ encoderMultibyte False return escape -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- steps 6-15 -- -- Process the head of a 'Data.Text.Lazy.Text' string starting in the -- 'Multibyte' character set, insert the appropriate escape sequence if a mode -- transition is required, or return 'Left' if all else fails. encoderMultibyte :: Bool -- ^ Whether the encoder is allowed to switch to another mode, or -- whether it's being used as a fallback and so must remain in -- 'Multibyte'. -> Iso2022BinaryBuilder encoderMultibyte allowRecovery = next >>= \c -> switch [ If_ C.isAscii $ push c *> setEncoderMode' Ascii , If_ (`elem` ['\x00A5', '\x203E']) $ push c *> setEncoderMode' Roman , If_ (== '\x2212') $ indexBytes c (encodeIndex0208 '\xFF0D') , If (range '\xFF61' '\xFF9F') $ indexBytes c . encodeIndex0208 . remapKatakana , Else $ indexBytes c . encodeIndex0208 ] c where setEncoderMode' s | allowRecovery = setEncoderMode s | otherwise = A.empty indexBytes c Nothing = do state <- getEncoderState if encoderMode state `elem` [Katakana, Multibyte] then push c *> setEncoderMode' Ascii else encoderFailure c indexBytes _ (Just code) = return . pure $ BS.SH.pack [fromIntegral lead + 0x21, fromIntegral trail + 0x21] where (lead, trail) = divMod code 94 -- | Transition the encoder from one character set to another, emitting the -- appropriate binary escape sequence. setEncoderMode :: Iso2022JpMode -> Iso2022BinaryBuilder setEncoderMode mode = modifyEncoderState setEncoderMode' $> pure esc where esc = case mode of Ascii -> "\ESC(B" Roman -> "\ESC(J" Katakana -> "\ESC$B" Multibyte -> "\ESC$B" setEncoderMode' state = state { encoderMode = mode } -- | __Encoding:__ -- @[ISO-2022-JP encoder] -- (https://encoding.spec.whatwg.org/#iso-2022-jp-encoder)@ -- step 9 -- -- Expand halfwidth katakana into their (more appropriate) fullwidth forms. remapKatakana :: Char -> Char remapKatakana c = Y.fromMaybe '\xFFFD' $ indexKatakana !? (fromEnum c - 0xFF61) -- | The mapping between halfwidth and fullwidth katakana, in an easily-indexed -- structure. indexKatakana :: V.Vector Char indexKatakana = V.fromList . map snd $ loadIndex "iso-2022-jp-katakana"