{-# LANGUAGE Trustworthy #-}

{-|
Description:    Character translation functions to and from the gb18030 and GBK encoding schemes.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.GB
    ( decoder
    , encoderGb18030
    , encoderGbk
    ) where


import qualified Control.Applicative as A

import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Maybe as Y
import qualified Data.Tuple as U
import qualified Data.Vector as V
import qualified Data.Word as W

import Control.Applicative ( (<|>) )

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch


-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with either the 'Gb18030' or
-- the 'Gbk' encoding schemes, or returns 'Left' if the stream starts with an
-- invalid byte sequence.
decoder :: TextBuilder
decoder :: TextBuilder
decoder = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Word8
   (StateT (Confidence, ()) (Parser ByteString))
   (DecoderError String)]
-> Word8 -> TextBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Word8 -> Bool
isAsciiByte Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
toUnicode1
    , (Word8 -> Bool)
-> TextBuilder
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> m out -> SwitchCase test m out
If_ (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x80) (TextBuilder
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> TextBuilder
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ [Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
0x80] Char
'\x20AC'
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xFE) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ \Word8
first -> Word8 -> TextBuilder
decoderFourBytes Word8
first TextBuilder -> TextBuilder -> TextBuilder
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Word8 -> TextBuilder
decoderTwoBytes Word8
first TextBuilder -> TextBuilder -> TextBuilder
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> do
        ByteString
bs <- StateT (Confidence, ()) (Parser ByteString) ByteString
-> StateT (Confidence, ()) (Parser ByteString) ByteString
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m out
lookAhead (StateT (Confidence, ()) (Parser ByteString) ByteString
 -> StateT (Confidence, ()) (Parser ByteString) ByteString)
-> StateT (Confidence, ()) (Parser ByteString) ByteString
-> StateT (Confidence, ()) (Parser ByteString) ByteString
forall a b. (a -> b) -> a -> b
$ Word -> StateT (Confidence, ()) (Parser ByteString) ByteString
forall (m :: * -> *) stream token.
MonadParser m stream token =>
Word -> m stream
nextChunk Word
3
        if ByteString -> Int
BS.length ByteString
bs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
3
            then TextBuilder
forall (f :: * -> *) a. Alternative f => f a
A.empty
            else Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first
    , (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
decoderFailure1
    ]

-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
--      step 5
-- 
-- Parse a two-byte sequence given the offset of the second byte.
decoderTwoBytes :: W.Word8 -> TextBuilder
decoderTwoBytes :: Word8 -> TextBuilder
decoderTwoBytes Word8
first = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Word8
   (StateT (Confidence, ()) (Parser ByteString))
   (DecoderError String)]
-> Word8 -> TextBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x40 Word8
0x7E) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> TextBuilder
forall state.
Word8
-> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
decoderTwoBytes' Word8
0x40
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x80 Word8
0xFE) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> TextBuilder
forall state.
Word8
-> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
decoderTwoBytes' Word8
0x41
    , (Word8 -> Bool)
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0xFF) ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> TextBuilder
forall state.
Word8
-> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
decoderFailure2 Word8
first
    , (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else ((Word8 -> TextBuilder)
 -> SwitchCase
      Word8
      (StateT (Confidence, ()) (Parser ByteString))
      (DecoderError String))
-> (Word8 -> TextBuilder)
-> SwitchCase
     Word8
     (StateT (Confidence, ()) (Parser ByteString))
     (DecoderError String)
forall a b. (a -> b) -> a -> b
$ \Word8
b -> Word8 -> StateT (Confidence, ()) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
b StateT (Confidence, ()) (Parser ByteString) ()
-> TextBuilder -> TextBuilder
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> TextBuilder
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first
    ]
  where decoderTwoBytes' :: Word8
-> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
decoderTwoBytes' Word8
offset Word8
second = StateT
  (Confidence, state) (Parser ByteString) (DecoderError String)
-> (Char
    -> StateT
         (Confidence, state) (Parser ByteString) (DecoderError String))
-> Maybe Char
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. Word8 -> StateTextBuilder state
recovery Word8
second) ([Word8]
-> Char
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
first, Word8
second]) (Maybe Char
 -> StateT
      (Confidence, state) (Parser ByteString) (DecoderError String))
-> (Word -> Maybe Char)
-> Word
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
            Word -> Maybe Char
decodeIndex (Word
 -> StateT
      (Confidence, state) (Parser ByteString) (DecoderError String))
-> Word
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall a b. (a -> b) -> a -> b
$ (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
first Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x81) Word -> Word -> Word
forall a. Num a => a -> a -> a
* Word
190 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
second Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- Word8
offset)
        -- The index is a complete mapping, so this recovery will never be used.
        recovery :: Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
recovery Word8
second
            | Word8 -> Bool
isAsciiByte Word8
second = Word8 -> StateT (Confidence, state) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
second StateT (Confidence, state) (Parser ByteString) ()
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first
            | Bool
otherwise = [Word8]
-> StateT
     (Confidence, state) (Parser ByteString) (DecoderError String)
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
first, Word8
second]

-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
--      steps 2-4
-- 
-- Parse a four-byte-long encoded sequence.  This is so much easier with a
-- stateful, byte-at-a-time parser as used in the spec rather than a pure,
-- character-at-a-time one.
-- 
-- The joint constraints which make this hard to code are:
-- 
-- * Fail if the second byte is outside the allowed range in order to continue
--   to the other choices.
-- * Return 'Left' without consuming input if all of the four are.
-- * Fail again (this time consuming input) if the end of the stream is reached
--   before the fourth byte.
-- * Succeed consuming input if everything's good.
-- 
-- This handles all of them.
-- 
-- As above, fails without consuming input if the byte at the head of the
-- stream is outside the allowed bounds.  Returns 'Left' consuming input if
-- there are less than three bytes left in the stream.
decoderFourBytes :: W.Word8 -> TextBuilder
decoderFourBytes :: Word8 -> TextBuilder
decoderFourBytes Word8
first = do
    Word8
second <- StateT (Confidence, ()) (Parser ByteString) Word8
-> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token out.
MonadParser m stream token =>
m out -> m out
lookAhead StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8)
-> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool)
-> Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x30 Word8
0x39)
    Maybe (Word8, Word8)
trail <- StateT (Confidence, ()) (Parser ByteString) (Word8, Word8)
-> StateT
     (Confidence, ()) (Parser ByteString) (Maybe (Word8, Word8))
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
A.optional (StateT (Confidence, ()) (Parser ByteString) (Word8, Word8)
 -> StateT
      (Confidence, ()) (Parser ByteString) (Maybe (Word8, Word8)))
-> StateT (Confidence, ()) (Parser ByteString) (Word8, Word8)
-> StateT
     (Confidence, ()) (Parser ByteString) (Maybe (Word8, Word8))
forall a b. (a -> b) -> a -> b
$ do
        Word8
_ <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
        Word8
third <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8)
-> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool)
-> Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x81 Word8
0xFE)
        Word8
fourth <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8)
-> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Word8 -> Bool)
-> Word8 -> StateT (Confidence, ()) (Parser ByteString) Word8
forall (trans :: * -> *) stream token out.
MonadParser trans stream token =>
(out -> Bool) -> out -> trans out
satisfying (Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word8
0x30 Word8
0x39)
        (Word8, Word8)
-> StateT (Confidence, ()) (Parser ByteString) (Word8, Word8)
forall (m :: * -> *) a. Monad m => a -> m a
return (Word8
third, Word8
fourth)
    Word8 -> Maybe (Word8, Word8) -> TextBuilder
forall state.
Word8 -> Maybe (Word8, Word8) -> StateTextBuilder state
decoderFourBytes' Word8
second Maybe (Word8, Word8)
trail
  where decoderFourBytes' :: Word8 -> Maybe (Word8, Word8) -> StateTextBuilder state
decoderFourBytes' Word8
second (Just (Word8
third, Word8
fourth)) = StateTextBuilder state
-> (Char -> StateTextBuilder state)
-> Maybe Char
-> StateTextBuilder state
forall b a. b -> (a -> b) -> Maybe a -> b
maybe
            ([Word8] -> StateTextBuilder state
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
first, Word8
second, Word8
third, Word8
fourth])
            ([Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
first, Word8
second, Word8
third, Word8
fourth]) (Maybe Char -> StateTextBuilder state)
-> (Word -> Maybe Char) -> Word -> StateTextBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
            Word -> Maybe Char
decodeRange (Word -> StateTextBuilder state) -> Word -> StateTextBuilder state
forall a b. (a -> b) -> a -> b
$
                (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
first Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x81) Word -> Word -> Word
forall a. Num a => a -> a -> a
* Word
12600
                Word -> Word -> Word
forall a. Num a => a -> a -> a
+ (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
second Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x30) Word -> Word -> Word
forall a. Num a => a -> a -> a
* Word
1260
                Word -> Word -> Word
forall a. Num a => a -> a -> a
+ (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
third Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x81) Word -> Word -> Word
forall a. Num a => a -> a -> a
* Word
10
                Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
fourth Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0x30
        decoderFourBytes' Word8
_ Maybe (Word8, Word8)
Nothing = Word8 -> StateTextBuilder state
forall state. Word8 -> StateTextBuilder state
decoderFailure1 Word8
first


-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
--      without @is GBK@
-- 
-- Encode the first 'Char' in a string according to the 'Gb18030' encoding
-- scheme, or return that same character if that scheme doesn't define a binary
-- representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoderGb18030 :: BinaryBuilder
encoderGb18030 :: BinaryBuilder
encoderGb18030 = Bool -> BinaryBuilder
encoder Bool
False

-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
--      with @is GBK@
-- 
-- Encode the first 'Char' in a string according to the 'Gbk' encoding scheme,
-- or return that same character if that scheme doesn't define a binary
-- representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoderGbk :: BinaryBuilder
encoderGbk :: BinaryBuilder
encoderGbk = Bool -> BinaryBuilder
encoder Bool
True

-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
-- 
-- Encode the first 'Char' in a string according to either the 'Gbk' ('True')
-- or 'Gb18030' ('False') encoding scheme, or return that same character if
-- that scheme doesn't define a binary representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoder :: Bool -> BinaryBuilder
encoder :: Bool -> BinaryBuilder
encoder Bool
isGbk = StateT () (Parser Text) Char
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT () (Parser Text) Char
-> (Char -> BinaryBuilder) -> BinaryBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [SwitchCase
   Char (StateT () (Parser Text)) (EncoderError ShortByteString)]
-> Char -> BinaryBuilder
forall (m :: * -> *) test out.
Alternative m =>
[SwitchCase test m out] -> test -> m out
switch
    [ (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If Char -> Bool
C.isAscii Char -> BinaryBuilder
forall state. Char -> StateBinaryBuilder state
fromAscii
    , (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\xE5E5') Char -> BinaryBuilder
forall state out. Char -> StateEncoder state (EncoderError out)
encoderFailure
    , (Char -> Bool)
-> BinaryBuilder
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> m out -> SwitchCase test m out
If_ (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\x20AC' Bool -> Bool -> Bool
&& Bool
isGbk) (BinaryBuilder
 -> SwitchCase
      Char (StateT () (Parser Text)) (EncoderError ShortByteString))
-> BinaryBuilder
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall a b. (a -> b) -> a -> b
$ EncoderError ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (ShortByteString -> EncoderError ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> EncoderError ShortByteString)
-> ShortByteString -> EncoderError ShortByteString
forall a b. (a -> b) -> a -> b
$ [Word8] -> ShortByteString
BS.SH.pack [Word8
0x80])
    , (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Maybe Word -> Bool
forall a. Maybe a -> Bool
Y.isJust (Maybe Word -> Bool) -> (Char -> Maybe Word) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Maybe Word
encodeIndex) ((Char -> BinaryBuilder)
 -> SwitchCase
      Char (StateT () (Parser Text)) (EncoderError ShortByteString))
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall a b. (a -> b) -> a -> b
$ \Char
c ->
        let (Word
lead', Word
trail) = Word -> Word -> (Word, Word)
forall a. Integral a => a -> a -> (a, a)
divMod (Maybe Word -> Word
forall a. HasCallStack => Maybe a -> a
Y.fromJust (Maybe Word -> Word) -> Maybe Word -> Word
forall a b. (a -> b) -> a -> b
$ Char -> Maybe Word
encodeIndex Char
c) Word
190
            lead :: Word8
lead = Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
lead' Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x81
        in  EncoderError ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (EncoderError ShortByteString -> BinaryBuilder)
-> ([Word8] -> EncoderError ShortByteString)
-> [Word8]
-> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> EncoderError ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> EncoderError ShortByteString)
-> ([Word8] -> ShortByteString)
-> [Word8]
-> EncoderError ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ShortByteString
BS.SH.pack ([Word8] -> BinaryBuilder) -> [Word8] -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ if Word
trail Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
< Word
0x3F
                then [Word8
lead, Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
trail Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x40]
                else [Word8
lead, Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
trail Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x41]
    , (Char -> Bool)
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> Bool) -> (test -> m out) -> SwitchCase test m out
If (Bool -> Char -> Bool
forall a b. a -> b -> a
const Bool
isGbk) Char -> BinaryBuilder
forall state out. Char -> StateEncoder state (EncoderError out)
encoderFailure
    , (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall test (m :: * -> *) out.
(test -> m out) -> SwitchCase test m out
Else ((Char -> BinaryBuilder)
 -> SwitchCase
      Char (StateT () (Parser Text)) (EncoderError ShortByteString))
-> (Char -> BinaryBuilder)
-> SwitchCase
     Char (StateT () (Parser Text)) (EncoderError ShortByteString)
forall a b. (a -> b) -> a -> b
$ \Char
c ->
        let (Word
b1, Word
p1) = Word -> Word -> (Word, Word)
forall a. Integral a => a -> a -> (a, a)
divMod (Char -> Word
encodeRange Char
c) Word
12600
            (Word
b2, Word
p2) = Word -> Word -> (Word, Word)
forall a. Integral a => a -> a -> (a, a)
divMod Word
p1 Word
1260
            (Word
b3, Word
b4) = Word -> Word -> (Word, Word)
forall a. Integral a => a -> a -> (a, a)
divMod Word
p2 Word
10
        in  EncoderError ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (EncoderError ShortByteString -> BinaryBuilder)
-> (ShortByteString -> EncoderError ShortByteString)
-> ShortByteString
-> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> EncoderError ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> BinaryBuilder)
-> ShortByteString -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ [Word8] -> ShortByteString
BS.SH.pack
                [ Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
b1 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x81
                , Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
b2 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x30
                , Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
b3 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x81
                , Word -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
b4 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x30
                ]
    ]


-- | Look for a character in the 'Gb18030' encoding at the given index.
decodeIndex :: Word -> Maybe Char
decodeIndex :: Word -> Maybe Char
decodeIndex Word
index = IO (MemoizationTable Word (Maybe Char))
-> Maybe (IO (MemoizationTable Char (Maybe Word)))
-> Word
-> (Word -> Maybe Char)
-> Maybe Char
forall k v.
(Eq k, Hashable k, Eq v, Hashable v) =>
IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> k
-> (k -> Maybe v)
-> Maybe v
lookupMemoizedIndex IO (MemoizationTable Word (Maybe Char))
decodeIndexM (IO (MemoizationTable Char (Maybe Word))
-> Maybe (IO (MemoizationTable Char (Maybe Word)))
forall a. a -> Maybe a
Just IO (MemoizationTable Char (Maybe Word))
encodeIndexM) Word
index Word -> Maybe Char
readDecodeIndex

-- | Memoization table to save lookup time in the over-large 'Gb18030' index.
decodeIndexM :: DecoderMemoTable
decodeIndexM :: IO (MemoizationTable Word (Maybe Char))
decodeIndexM = IO (MemoizationTable Word (Maybe Char))
forall k v. IO (MemoizationTable k v)
newMemoizationTable
{-# NOINLINE decodeIndexM #-}

-- | Read the character at a given offset from the 'Gb18030' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex Word
index = Word -> [(Word, Char)] -> Maybe Char
forall k v. Ord k => k -> [(k, v)] -> Maybe v
search Word
index ([(Word, Char)] -> Maybe Char) -> [(Word, Char)] -> Maybe Char
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
"gb18030"


-- | Look for the index of a given character in the 'Gb18030' encoding.
encodeIndex :: Char -> Maybe Word
encodeIndex :: Char -> Maybe Word
encodeIndex Char
char = IO (MemoizationTable Char (Maybe Word))
-> Maybe (IO (MemoizationTable Word (Maybe Char)))
-> Char
-> (Char -> Maybe Word)
-> Maybe Word
forall k v.
(Eq k, Hashable k, Eq v, Hashable v) =>
IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> k
-> (k -> Maybe v)
-> Maybe v
lookupMemoizedIndex IO (MemoizationTable Char (Maybe Word))
encodeIndexM (IO (MemoizationTable Word (Maybe Char))
-> Maybe (IO (MemoizationTable Word (Maybe Char)))
forall a. a -> Maybe a
Just IO (MemoizationTable Word (Maybe Char))
decodeIndexM) Char
char Char -> Maybe Word
readEncodeIndex

-- | Memoization table to save lookup time in the over-large 'Gb18030' index.
encodeIndexM :: EncoderMemoTable
encodeIndexM :: IO (MemoizationTable Char (Maybe Word))
encodeIndexM = IO (MemoizationTable Char (Maybe Word))
forall k v. IO (MemoizationTable k v)
newMemoizationTable
{-# NOINLINE encodeIndexM #-}

-- | Find the offset of a given character in the 'Gb18030' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readEncodeIndex :: Char -> Maybe Word
readEncodeIndex :: Char -> Maybe Word
readEncodeIndex Char
char = Maybe Word
i1 Maybe Word -> Maybe Word -> Maybe Word
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Char -> [(Char, Word)] -> Maybe Word
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup Char
char [(Char, Word)]
is2
  where ([(Char, Word)]
is1, [(Char, Word)]
is2) = Int -> [(Char, Word)] -> ([(Char, Word)], [(Char, Word)])
forall a. Int -> [a] -> ([a], [a])
splitAt Int
6176 ([(Char, Word)] -> ([(Char, Word)], [(Char, Word)]))
-> ([(Word, Char)] -> [(Char, Word)])
-> [(Word, Char)]
-> ([(Char, Word)], [(Char, Word)])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Word, Char) -> (Char, Word)) -> [(Word, Char)] -> [(Char, Word)]
forall a b. (a -> b) -> [a] -> [b]
map (Word, Char) -> (Char, Word)
forall a b. (a, b) -> (b, a)
U.swap ([(Word, Char)] -> ([(Char, Word)], [(Char, Word)]))
-> [(Word, Char)] -> ([(Char, Word)], [(Char, Word)])
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
"gb18030"
        i1 :: Maybe Word
i1 = if Char
char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= Char
'\x4E02' Bool -> Bool -> Bool
&& Char
char Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\xE525'
            then Char -> [(Char, Word)] -> Maybe Word
forall k v. Ord k => k -> [(k, v)] -> Maybe v
search Char
char [(Char, Word)]
is1
            else Maybe Word
forall a. Maybe a
Nothing


-- | __Encoding:__
--      @[index gb18030 ranges code point]
--      (https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point)@
-- 
-- Retrieve a character reference from the compressed list of four-byte
-- sequences, or 'Nothing' if the character is not included in 'Gb18030'.
-- 
-- Note that this only applies to characters which are part of a four-byte,
-- sequential range.  Offsets to sections with no relation to the Unicode order
-- will not return the proper 'Char'.
decodeRange :: Word -> Maybe Char
decodeRange :: Word -> Maybe Char
decodeRange Word
index
    | Word
index Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
> Word
39419 Bool -> Bool -> Bool
&& Word
index Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
< Word
189000 = Maybe Char
forall a. Maybe a
Nothing
    | Word
index Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
> Word
1237575 = Maybe Char
forall a. Maybe a
Nothing
    | Word
index Word -> Word -> Bool
forall a. Eq a => a -> a -> Bool
== Word
7457 = Char -> Maybe Char
forall a. a -> Maybe a
Just Char
'\xE7C7'
    | Bool
otherwise = Char -> Maybe Char
forall a. a -> Maybe a
Just (Char -> Maybe Char) -> (Word -> Char) -> Word -> Maybe Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> (Word -> Int) -> Word -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word -> Maybe Char) -> Word -> Maybe Char
forall a b. (a -> b) -> a -> b
$ Word
co Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
index Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
po
  where (Word
po, Word
co) = ((Word, Char) -> Bool) -> (Word, Word)
findRange (((Word, Char) -> Bool) -> (Word, Word))
-> ((Word, Char) -> Bool) -> (Word, Word)
forall a b. (a -> b) -> a -> b
$ \(Word
i, Char
_) -> Word
i Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
index

-- | __Encoding:__
--      @[index gb18030 ranges pointer]
--      (https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer)@
-- 
-- Retrieve an index offset from the compressed list of characters encoded in
-- four-byte sequences.  Returns the numeric value of the input unchanged if it
-- is an ASCII character.
-- 
-- Note that this only applies to characters which are part of a four-byte,
-- sequential range.  Characters in sections with no relation to the Unicode
-- order will not return any meaningful offset.
encodeRange :: Char -> Word
encodeRange :: Char -> Word
encodeRange Char
'\xE7C7' = Word
7457
encodeRange Char
char = Word
po Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Int -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
char) Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
co
  where (Word
po, Word
co) = ((Word, Char) -> Bool) -> (Word, Word)
findRange (((Word, Char) -> Bool) -> (Word, Word))
-> ((Word, Char) -> Bool) -> (Word, Word)
forall a b. (a -> b) -> a -> b
$ \(Word
_, Char
c) -> Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
char

-- | Look up the offset-character pair at the start of the sequential range
-- indicated by the predicate; normally, you would test if the value you
-- already have is greater than or equal to the one passed to the argument
-- function.
-- 
-- Note that there is no length information about the ranges included, so even
-- if the predicate applies it may not be any help in finding the true
-- 'Gb18030' 'Char' or byte sequence.
-- 
-- Returns @(0, 0)@ for any sequence over ASCII, which is actually correct if
-- not really applicable to the four-byte sequences.
findRange :: ((Word, Char) -> Bool) -> (Word, Word)
findRange :: ((Word, Char) -> Bool) -> (Word, Word)
findRange (Word, Char) -> Bool
f = (Char -> Word) -> (Word, Char) -> (Word, Word)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word) -> (Char -> Int) -> Char -> Word
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum) ((Word, Char) -> (Word, Word))
-> (Maybe (Word, Char) -> (Word, Char))
-> Maybe (Word, Char)
-> (Word, Word)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word, Char) -> Maybe (Word, Char) -> (Word, Char)
forall a. a -> Maybe a -> a
Y.fromMaybe (Word
0, Char
'\NUL') (Maybe (Word, Char) -> (Word, Word))
-> Maybe (Word, Char) -> (Word, Word)
forall a b. (a -> b) -> a -> b
$ ((Word, Char) -> Bool) -> Vector (Word, Char) -> Maybe (Word, Char)
forall a. (a -> Bool) -> Vector a -> Maybe a
V.find (Word, Char) -> Bool
f Vector (Word, Char)
gbRanges

-- | Unlike the unordered indices, the range table is more than small enough to
-- keep resident in memory.  While it does not provide good keys for random
-- access, it is intended to be iterated over---which still perfectly fits the
-- data model of a 'V.Vector'.
-- 
-- Note that, as the values in the index indicate the lower bounds of each
-- range, the memory storage is reversed (highest offsets first) to allow
-- testing for "greatest less-than-or-equal-to" without needing lookahead or
-- backtracking.
gbRanges :: V.Vector (Word, Char)
gbRanges :: Vector (Word, Char)
gbRanges = [(Word, Char)] -> Vector (Word, Char)
forall a. [a] -> Vector a
V.fromList ([(Word, Char)] -> Vector (Word, Char))
-> ([(Word, Char)] -> [(Word, Char)])
-> [(Word, Char)]
-> Vector (Word, Char)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [(Word, Char)] -> [(Word, Char)]
forall a. [a] -> [a]
reverse ([(Word, Char)] -> Vector (Word, Char))
-> [(Word, Char)] -> Vector (Word, Char)
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
"gb18030-ranges"