{-| Description: Character translation functions to and from the EUC-JP encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.EucJp ( -- * Decoder decoder , decodeIndex0208 -- * Encoder , encoder , encodeIndex0208 ) where import qualified Control.Applicative as A import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Maybe as Y import qualified Data.Tuple as U import qualified Data.Word as W import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch -- | __Encoding:__ -- @[EUC-JP decoder] -- (https://encoding.spec.whatwg.org/#euc-jp-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'EucJp' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: TextBuilder decoder = next >>= switch [ If_ (== 0x8E) $ do second <- next if range 0xA1 0xDF second then emit [0x8E, second] . toEnum . (+ 0xFEC0) $ fromIntegral second else if isAsciiByte second then push second *> decoderFailure1 0x8E else decoderFailure [0x8E, second] , If_ (== 0x8F) $ do second <- next if range 0xA1 0xFE second then decodeSecond True second else if isAsciiByte second then push second *> decoderFailure1 0x8F else decoderFailure [0x8F, second] , If isAsciiByte toUnicode1 , If (range 0xA1 0xFE) $ decodeSecond False , Else decoderFailure1 ] -- | __Encoding:__ -- @[EUC-JP decoder] -- (https://encoding.spec.whatwg.org/#euc-jp-decoder)@ -- step 5 -- -- Process a multi-byte sequence according to either the @jis0208@ ('False') or -- the @jis0212@ ('True') encoding scheme. -- -- Fails on an ASCII byte, which should be read on its own. decodeSecond :: Bool -> W.Word8 -> TextBuilder decodeSecond is0212 lead = next >>= switch [ If (range 0xA1 0xFE) $ \trail -> maybe (decoderFailure $ bs ++ [trail]) (emit $ bs ++ [trail]) . decodeIndex $ (fromIntegral lead - 0xA1) * 94 + fromIntegral trail - 0xA1 , If_ (not . isAsciiByte) A.empty ] where (decodeIndex, bs) | is0212 = (decodeIndex0212, [0x8F, lead]) | otherwise = (decodeIndex0208, [lead]) -- | __Encoding:__ -- @[EUC-JP encoder] -- (https://encoding.spec.whatwg.org/#euc-jp-encoder)@ -- -- Encode the first 'Char' in a string according to the 'EucJp'/@jis02080@ -- encoding scheme, or return that same character if that scheme doesn't define -- a binary representation for it. encoder :: BinaryBuilder encoder = next >>= switch [ If C.isAscii fromAscii , If_ (== '\xA5') $ return (pure $ BS.SH.pack [0x5C]) , If_ (== '\x203E') $ return (pure $ BS.SH.pack [0x7E]) , If_ (== '\x2212') $ encodeTwoByte (Just '\x2212') '\xFF0D' , If (range '\xFF61' '\xFF9F') $ \char -> return . pure $ BS.SH.pack [0x8E, fromIntegral $ fromEnum char - 0xFEC0] , Else $ encodeTwoByte Nothing ] -- | __Encoding:__ -- @[EUC-JP encoder] -- (https://encoding.spec.whatwg.org/#euc-jp-encoder)@ -- steps 7-11 -- -- Encode a multi-byte character according to the 'EucJp'/@jis02080@ encoding, -- or return that same character that scheme doesn't define a binary -- representation for it. -- -- Never fails or consumes input. encodeTwoByte :: Maybe Char -> Char -> BinaryBuilder encodeTwoByte err char = case encodeIndex0208 char of Just index -> let (lead, trail) = divMod index 94 in return . pure $ BS.SH.pack [fromIntegral lead + 0xA1, fromIntegral trail + 0xA1] Nothing -> encoderFailure $ Y.fromMaybe char err -- | Look for a character in the jis0208 encoding at the given index. decodeIndex0208 :: Word -> Maybe Char decodeIndex0208 index = lookupMemoizedIndex decodeIndex0208M (Just encodeIndex0208M) index readDecodeIndex0208 -- | Memoization table to save lookup time in the over-large jis0208 index. decodeIndex0208M :: DecoderMemoTable decodeIndex0208M = newMemoizationTable {-# NOINLINE decodeIndex0208M #-} -- | Read the character at a given offset from the jis0208 index. Note that -- this is a heavy function, and should be cached whenever possible. readDecodeIndex0208 :: Word -> Maybe Char readDecodeIndex0208 index = search index $ loadIndex "jis0208" -- | Look for a character in the jis0212 encoding at the given index. decodeIndex0212 :: Word -> Maybe Char decodeIndex0212 index = lookupMemoizedIndex decodeIndex0212M Nothing index readDecodeIndex0212 -- | Memoization table to save lookup time in the over-large jis0212 index. decodeIndex0212M :: DecoderMemoTable decodeIndex0212M = newMemoizationTable {-# NOINLINE decodeIndex0212M #-} -- | Read the character at a given offset from the jis0212 index. Note that -- this is a heavy function, and should be cached whenever possible. readDecodeIndex0212 :: Word -> Maybe Char readDecodeIndex0212 index = search index $ loadIndex "jis0212" -- | Look for the index of a given character in the jis0208 encoding. encodeIndex0208 :: Char -> Maybe Word encodeIndex0208 char = lookupMemoizedIndex encodeIndex0208M (Just decodeIndex0208M) char readEncodeIndex0208 -- | Memoization table to save lookup time in the over-large jis0208 index. encodeIndex0208M :: EncoderMemoTable encodeIndex0208M = newMemoizationTable {-# NOINLINE encodeIndex0208M #-} -- | Find the offset of a given character in the jis0208 index. Note that this -- is a heavy function, and should be cached whenever possible. readEncodeIndex0208 :: Char -> Maybe Word readEncodeIndex0208 char = search char . map U.swap $ loadIndex "jis0208"