{-# LANGUAGE Trustworthy #-} {-| Description: Character translation functions to and from the gb18030 and GBK encoding schemes. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.GB ( decoder , encoderGb18030 , encoderGbk ) where import qualified Control.Applicative as A import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Maybe as Y import qualified Data.Tuple as U import qualified Data.Vector as V import qualified Data.Word as W import Control.Applicative ( (<|>) ) import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser import Web.Willow.Common.Parser.Switch -- | __Encoding:__ -- @[gp18030 decoder] -- (https://encoding.spec.whatwg.org/#gb18030-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with either the 'Gb18030' or -- the 'Gbk' encoding schemes, or returns 'Left' if the stream starts with an -- invalid byte sequence. decoder :: TextBuilder decoder = next >>= switch [ If isAsciiByte toUnicode1 , If_ (== 0x80) $ emit [0x80] '\x20AC' , If (<= 0xFE) $ \first -> decoderFourBytes first <|> decoderTwoBytes first <|> do bs <- lookAhead $ nextChunk 3 if BS.length bs < 3 then A.empty else decoderFailure1 first , Else decoderFailure1 ] -- | __Encoding:__ -- @[gp18030 decoder] -- (https://encoding.spec.whatwg.org/#gb18030-decoder)@ -- step 5 -- -- Parse a two-byte sequence given the offset of the second byte. decoderTwoBytes :: W.Word8 -> TextBuilder decoderTwoBytes first = next >>= switch [ If (range 0x40 0x7E) $ decoderTwoBytes' 0x40 , If (range 0x80 0xFE) $ decoderTwoBytes' 0x41 , If (== 0xFF) $ decoderFailure2 first , Else $ \b -> push b *> decoderFailure1 first ] where decoderTwoBytes' offset second = maybe (recovery second) (emit [first, second]) . decodeIndex $ (fromIntegral first - 0x81) * 190 + fromIntegral (second - offset) -- The index is a complete mapping, so this recovery will never be used. recovery second | isAsciiByte second = push second *> decoderFailure1 first | otherwise = decoderFailure [first, second] -- | __Encoding:__ -- @[gp18030 decoder] -- (https://encoding.spec.whatwg.org/#gb18030-decoder)@ -- steps 2-4 -- -- Parse a four-byte-long encoded sequence. This is so much easier with a -- stateful, byte-at-a-time parser as used in the spec rather than a pure, -- character-at-a-time one. -- -- The joint constraints which make this hard to code are: -- -- * Fail if the second byte is outside the allowed range in order to continue -- to the other choices. -- * Return 'Left' without consuming input if all of the four are. -- * Fail again (this time consuming input) if the end of the stream is reached -- before the fourth byte. -- * Succeed consuming input if everything's good. -- -- This handles all of them. -- -- As above, fails without consuming input if the byte at the head of the -- stream is outside the allowed bounds. Returns 'Left' consuming input if -- there are less than three bytes left in the stream. decoderFourBytes :: W.Word8 -> TextBuilder decoderFourBytes first = do second <- lookAhead next >>= satisfying (range 0x30 0x39) trail <- A.optional $ do _ <- next third <- next >>= satisfying (range 0x81 0xFE) fourth <- next >>= satisfying (range 0x30 0x39) return (third, fourth) decoderFourBytes' second trail where decoderFourBytes' second (Just (third, fourth)) = maybe (decoderFailure [first, second, third, fourth]) (emit [first, second, third, fourth]) . decodeRange $ (fromIntegral first - 0x81) * 12600 + (fromIntegral second - 0x30) * 1260 + (fromIntegral third - 0x81) * 10 + fromIntegral fourth - 0x30 decoderFourBytes' _ Nothing = decoderFailure1 first -- | __Encoding:__ -- @[gb18030 encoder] -- (https://encoding.spec.whatwg.org/#gb18030-encoder)@ -- without @is GBK@ -- -- Encode the first 'Char' in a string according to the 'Gb18030' encoding -- scheme, or return that same character if that scheme doesn't define a binary -- representation for it. -- -- Does not round-trip on @\\xE5E5@. encoderGb18030 :: BinaryBuilder encoderGb18030 = encoder False -- | __Encoding:__ -- @[gb18030 encoder] -- (https://encoding.spec.whatwg.org/#gb18030-encoder)@ -- with @is GBK@ -- -- Encode the first 'Char' in a string according to the 'Gbk' encoding scheme, -- or return that same character if that scheme doesn't define a binary -- representation for it. -- -- Does not round-trip on @\\xE5E5@. encoderGbk :: BinaryBuilder encoderGbk = encoder True -- | __Encoding:__ -- @[gb18030 encoder] -- (https://encoding.spec.whatwg.org/#gb18030-encoder)@ -- -- Encode the first 'Char' in a string according to either the 'Gbk' ('True') -- or 'Gb18030' ('False') encoding scheme, or return that same character if -- that scheme doesn't define a binary representation for it. -- -- Does not round-trip on @\\xE5E5@. encoder :: Bool -> BinaryBuilder encoder isGbk = next >>= switch [ If C.isAscii fromAscii , If (== '\xE5E5') encoderFailure , If_ (\c -> c == '\x20AC' && isGbk) $ return (pure $ BS.SH.pack [0x80]) , If (Y.isJust . encodeIndex) $ \c -> let (lead', trail) = divMod (Y.fromJust $ encodeIndex c) 190 lead = fromIntegral lead' + 0x81 in return . pure . BS.SH.pack $ if trail < 0x3F then [lead, fromIntegral trail + 0x40] else [lead, fromIntegral trail + 0x41] , If (const isGbk) encoderFailure , Else $ \c -> let (b1, p1) = divMod (encodeRange c) 12600 (b2, p2) = divMod p1 1260 (b3, b4) = divMod p2 10 in return . pure $ BS.SH.pack [ fromIntegral b1 + 0x81 , fromIntegral b2 + 0x30 , fromIntegral b3 + 0x81 , fromIntegral b4 + 0x30 ] ] -- | Look for a character in the 'Gb18030' encoding at the given index. decodeIndex :: Word -> Maybe Char decodeIndex index = lookupMemoizedIndex decodeIndexM (Just encodeIndexM) index readDecodeIndex -- | Memoization table to save lookup time in the over-large 'Gb18030' index. decodeIndexM :: DecoderMemoTable decodeIndexM = newMemoizationTable {-# NOINLINE decodeIndexM #-} -- | Read the character at a given offset from the 'Gb18030' index. Note that -- this is a heavy function, and should be cached whenever possible. readDecodeIndex :: Word -> Maybe Char readDecodeIndex index = search index $ loadIndex "gb18030" -- | Look for the index of a given character in the 'Gb18030' encoding. encodeIndex :: Char -> Maybe Word encodeIndex char = lookupMemoizedIndex encodeIndexM (Just decodeIndexM) char readEncodeIndex -- | Memoization table to save lookup time in the over-large 'Gb18030' index. encodeIndexM :: EncoderMemoTable encodeIndexM = newMemoizationTable {-# NOINLINE encodeIndexM #-} -- | Find the offset of a given character in the 'Gb18030' index. Note that -- this is a heavy function, and should be cached whenever possible. readEncodeIndex :: Char -> Maybe Word readEncodeIndex char = i1 <|> lookup char is2 where (is1, is2) = splitAt 6176 . map U.swap $ loadIndex "gb18030" i1 = if char >= '\x4E02' && char <= '\xE525' then search char is1 else Nothing -- | __Encoding:__ -- @[index gb18030 ranges code point] -- (https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point)@ -- -- Retrieve a character reference from the compressed list of four-byte -- sequences, or 'Nothing' if the character is not included in 'Gb18030'. -- -- Note that this only applies to characters which are part of a four-byte, -- sequential range. Offsets to sections with no relation to the Unicode order -- will not return the proper 'Char'. decodeRange :: Word -> Maybe Char decodeRange index | index > 39419 && index < 189000 = Nothing | index > 1237575 = Nothing | index == 7457 = Just '\xE7C7' | otherwise = Just . toEnum . fromIntegral $ co + index - po where (po, co) = findRange $ \(i, _) -> i <= index -- | __Encoding:__ -- @[index gb18030 ranges pointer] -- (https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer)@ -- -- Retrieve an index offset from the compressed list of characters encoded in -- four-byte sequences. Returns the numeric value of the input unchanged if it -- is an ASCII character. -- -- Note that this only applies to characters which are part of a four-byte, -- sequential range. Characters in sections with no relation to the Unicode -- order will not return any meaningful offset. encodeRange :: Char -> Word encodeRange '\xE7C7' = 7457 encodeRange char = po + fromIntegral (fromEnum char) - co where (po, co) = findRange $ \(_, c) -> c <= char -- | Look up the offset-character pair at the start of the sequential range -- indicated by the predicate; normally, you would test if the value you -- already have is greater than or equal to the one passed to the argument -- function. -- -- Note that there is no length information about the ranges included, so even -- if the predicate applies it may not be any help in finding the true -- 'Gb18030' 'Char' or byte sequence. -- -- Returns @(0, 0)@ for any sequence over ASCII, which is actually correct if -- not really applicable to the four-byte sequences. findRange :: ((Word, Char) -> Bool) -> (Word, Word) findRange f = fmap (fromIntegral . fromEnum) . Y.fromMaybe (0, '\NUL') $ V.find f gbRanges -- | Unlike the unordered indices, the range table is more than small enough to -- keep resident in memory. While it does not provide good keys for random -- access, it is intended to be iterated over---which still perfectly fits the -- data model of a 'V.Vector'. -- -- Note that, as the values in the index indicate the lower bounds of each -- range, the memory storage is reversed (highest offsets first) to allow -- testing for "greatest less-than-or-equal-to" without needing lookahead or -- backtracking. gbRanges :: V.Vector (Word, Char) gbRanges = V.fromList . reverse $ loadIndex "gb18030-ranges"