{-| Description: Character translation functions to and from the EUC-KR encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.EucKr ( decoder , encoder ) where import qualified Control.Applicative as A import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Tuple as U import qualified Data.Word as W import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch -- | __Encoding:__ -- @[EUC-KR decoder] -- (https://encoding.spec.whatwg.org/#euc-kr-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'EucKr' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: TextBuilder decoder = next >>= switch [ If isAsciiByte toUnicode1 , If (range 0x81 0xFE) decoder' , Else decoderFailure1 ] -- | __Encoding:__ -- @[EUC-KR decoder] -- (https://encoding.spec.whatwg.org/#euc-kr-decoder)@ -- step 3 -- -- Process a multi-byte sequence according to either the @jis0208@ ('False') or -- the @jis0212@ ('True') encoding scheme. decoder' :: W.Word8 -> TextBuilder decoder' lead = next >>= switch [ If (range 0x41 0xFE) $ lookupDecode lead , If isAsciiByte $ \b -> push b *> decoderFailure1 lead , Else $ decoderFailure2 lead ] -- | __Encoding:__ -- @[EUC-KR decoder] -- (https://encoding.spec.whatwg.org/#euc-kr-decoder)@ -- step 3, substeps 1-2 -- -- Calculate the character index value from a two-byte sequence, then retrieve -- the 'Char' associated with that index in the 'EucKr' encoding scheme. lookupDecode :: W.Word8 -> W.Word8 -> TextBuilder lookupDecode lead trail = maybe failure (emit bs) . decodeIndex $ (fromIntegral lead - 0x81) * 190 + fromIntegral trail - 0x41 where failure | isAsciiByte trail = push trail *> decoderFailure1 lead | otherwise = decoderFailure bs bs = [lead, trail] -- | __Encoding:__ -- @[EUC-KR encoder] -- (https://encoding.spec.whatwg.org/#euc-kr-encoder)@ -- -- Encode the first 'Char' in a string according to the 'EucKr' encoding -- scheme, or return that same character if that scheme doesn't define a binary -- representation for it. encoder :: BinaryBuilder encoder = next >>= switch [ If C.isAscii fromAscii , Else encodeIndex ] -- | Look for a character in the 'EucKr' encoding at the given index. decodeIndex :: Word -> Maybe Char decodeIndex index = lookupMemoizedIndex decodeIndexM (Just encodeIndexM) index readDecodeIndex -- | Memoization table to save lookup time in the over-large 'EucKr' index. decodeIndexM :: DecoderMemoTable decodeIndexM = newMemoizationTable {-# NOINLINE decodeIndexM #-} -- | Read the character at a given offset from the 'EucKr' index. Note that -- this is a heavy function, and should be cached whenever possible. readDecodeIndex :: Word -> Maybe Char readDecodeIndex index = search index $ loadIndex "euc-kr" -- | Look for the binary representation of a given character in the 'EucKr' -- encoding. encodeIndex :: Char -> BinaryBuilder encodeIndex char = case lookupMemoizedIndex encodeIndexM (Just decodeIndexM) char readEncodeIndex of Just i -> let (lead, trail) = divMod i 188 lead' = lead + if lead < 0x1F then 0x81 else 0xC1 trail' = trail + if trail < 0x3F then 0x40 else 0x41 in return . pure $ BS.SH.pack [fromIntegral lead', fromIntegral trail'] Nothing -> A.empty -- | Memoization table to save lookup time in the over-large 'EucKr' index. encodeIndexM :: EncoderMemoTable encodeIndexM = newMemoizationTable {-# NOINLINE encodeIndexM #-} -- | Find the offset of a given character in the 'EucKr' index. Note that this -- is a heavy function, and should be cached whenever possible. readEncodeIndex :: Char -> Maybe Word readEncodeIndex char = search char . map U.swap $ loadIndex "euc-kr"