{-| Description: Character translation functions to and from the Shift_JIS encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.ShiftJis ( decoder , encoder ) where import qualified Control.Applicative as A import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Tuple as U import qualified Data.Word as W import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Encoding.EucJp ( decodeIndex0208, encodeIndex0208 ) -- | __Encoding:__ -- @[Shift_JIS decoder] -- (https://encoding.spec.whatwg.org/#shift_jis-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'ShiftJis' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: TextBuilder decoder = next >>= switch [ If isAsciiByte toUnicode1 , If_ (== 0x80) $ emit [0x80] '\x80' , If (<= 0x9F) decoder' , If (range 0xA1 0xDF) $ \b -> emit [b] . toEnum $ 0xFEC0 + fromIntegral b , If (range 0xE0 0xFC) decoder' , Else decoderFailure1 ] -- | __Encoding:__ -- @[Shift_JIS decoder] -- (https://encoding.spec.whatwg.org/#shift_jis-decoder)@ -- step 3 -- -- Process a double-byte sequence according to the 'ShiftJis' encoding scheme. decoder' :: W.Word8 -> TextBuilder decoder' lead = next >>= switch [ If isAsciiByte $ decodeIndex lead , Else $ decodeIndex lead ] -- | __Encoding:__ -- @[Shift_JIS decoder] -- (https://encoding.spec.whatwg.org/#shift_jis-decoder)@ -- step 3, substeps 3-6 -- -- Given a two-byte sequence, calculate and return its associated character -- representation in the 'ShiftJis' encoding scheme. -- -- Fails if no such sequence is defined. decodeIndex :: W.Word8 -> W.Word8 -> TextBuilder decodeIndex lead trail = case lead' * 188 + trail' of educ | range 8836 10715 educ -> emit [lead, trail] . toEnum $ 0xBD7C + fromIntegral educ pointer -> maybe failure (emit [lead, trail]) $ decodeIndex0208 pointer where lead' | lead <= 0xA0 = fromIntegral lead - 0x81 | otherwise = fromIntegral lead - 0xC1 trail' | trail <= 0x7F = fromIntegral trail - 0x40 | otherwise = fromIntegral trail - 0x41 failure | trail <= 0x7F = A.empty | otherwise = decoderFailure [lead, trail] -- | __Encoding:__ -- @[Shift_JIS encoder] -- (https://encoding.spec.whatwg.org/#shift_jis-encoder)@ -- -- Encode the first 'Char' in a string according to the 'ShiftJis' encoding -- scheme, or return that same character if that scheme doesn't define a binary -- representation for it. encoder :: BinaryBuilder encoder = next >>= switch [ If C.isAscii fromAscii , If_ (== '\x80') $ return (pure $ BS.SH.pack [0x80]) , If_ (== '\xA5') $ return (pure $ BS.SH.pack [0x5C]) , If_ (== '\x203E') $ return (pure $ BS.SH.pack [0x7E]) , If_ (== '\x2212') $ encodeIndex '\xFF0D' , If (range '\xFF61' '\xFF9F') $ return . pure . BS.SH.pack . (: []) . fromIntegral . subtract 0xFEC0 . fromEnum , Else encodeIndex ] -- | __Encoding:__ -- @[Shift_JIS encoder] -- (https://encoding.spec.whatwg.org/#shift_jis-encoder)@ -- steps 7-13 -- -- Look for the binary representation of a given character in the 'ShiftJis' -- encoding. encodeIndex :: Char -> BinaryBuilder encodeIndex char = case encodeIndexShiftJis char of Just i -> let (lead, trail) = divMod i 188 lead' = lead + if lead < 0x1F then 0x81 else 0xC1 trail' = trail + if trail < 0x3F then 0x40 else 0x41 in return . pure $ BS.SH.pack [fromIntegral lead', fromIntegral trail'] Nothing -> encoderFailure char -- | Given a character, try to find the index value corresponding to it in the -- 'ShiftJis' encoding scheme. encodeIndexShiftJis :: Char -> Maybe Word encodeIndexShiftJis char = encodeIndex0208 char >>= redirect where redirect i | i >= 8272 && i <= 8835 = lookupMemoizedIndex encodeIndexShiftJisM Nothing char readEncodeIndexShiftJis | otherwise = Just i -- | Memoization table to save lookup time in the over-large 'ShiftJis' index. encodeIndexShiftJisM :: EncoderMemoTable encodeIndexShiftJisM = newMemoizationTable {-# NOINLINE encodeIndexShiftJisM #-} -- | __Encoding:__ -- @[index Shift_JIS pointer] -- (https://encoding.spec.whatwg.org/#index-shift_jis-pointer)@ -- -- Find the offset of a given character in the 'ShiftJis' index. Note that -- this is a heavy function, and should be cached whenever possible. readEncodeIndexShiftJis :: Char -> Maybe Word readEncodeIndexShiftJis char = search char . map U.swap $ loadIndex' filterIndex "jis0208" where filterIndex (i, _) = i < 8272 || i > 8835