{-# LANGUAGE Trustworthy #-}

{-|
Description:    Character translation functions to and from the gb18030 and GBK encoding schemes.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.GB
    ( decoder
    , encoderGb18030
    , encoderGbk
    ) where


import qualified Control.Applicative as A

import qualified Data.ByteString as BS
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Maybe as Y
import qualified Data.Tuple as U
import qualified Data.Vector as V
import qualified Data.Word as W

import Control.Applicative ( (<|>) )

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch


-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with either the 'Gb18030' or
-- the 'Gbk' encoding schemes, or returns 'Left' if the stream starts with an
-- invalid byte sequence.
decoder :: TextBuilder
decoder = next >>= switch
    [ If isAsciiByte toUnicode1
    , If_ (== 0x80) $ emit [0x80] '\x20AC'
    , If (<= 0xFE) $ \first -> decoderFourBytes first <|> decoderTwoBytes first <|> do
        bs <- lookAhead $ nextChunk 3
        if BS.length bs < 3
            then A.empty
            else decoderFailure1 first
    , Else decoderFailure1
    ]

-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
--      step 5
-- 
-- Parse a two-byte sequence given the offset of the second byte.
decoderTwoBytes :: W.Word8 -> TextBuilder
decoderTwoBytes first = next >>= switch
    [ If (range 0x40 0x7E) $ decoderTwoBytes' 0x40
    , If (range 0x80 0xFE) $ decoderTwoBytes' 0x41
    , If (== 0xFF) $ decoderFailure2 first
    , Else $ \b -> push b *> decoderFailure1 first
    ]
  where decoderTwoBytes' offset second = maybe (recovery second) (emit [first, second]) .
            decodeIndex $ (fromIntegral first - 0x81) * 190 + fromIntegral (second - offset)
        -- The index is a complete mapping, so this recovery will never be used.
        recovery second
            | isAsciiByte second = push second *> decoderFailure1 first
            | otherwise = decoderFailure [first, second]

-- | __Encoding:__
--      @[gp18030 decoder]
--      (https://encoding.spec.whatwg.org/#gb18030-decoder)@
--      steps 2-4
-- 
-- Parse a four-byte-long encoded sequence.  This is so much easier with a
-- stateful, byte-at-a-time parser as used in the spec rather than a pure,
-- character-at-a-time one.
-- 
-- The joint constraints which make this hard to code are:
-- 
-- * Fail if the second byte is outside the allowed range in order to continue
--   to the other choices.
-- * Return 'Left' without consuming input if all of the four are.
-- * Fail again (this time consuming input) if the end of the stream is reached
--   before the fourth byte.
-- * Succeed consuming input if everything's good.
-- 
-- This handles all of them.
-- 
-- As above, fails without consuming input if the byte at the head of the
-- stream is outside the allowed bounds.  Returns 'Left' consuming input if
-- there are less than three bytes left in the stream.
decoderFourBytes :: W.Word8 -> TextBuilder
decoderFourBytes first = do
    second <- lookAhead next >>= satisfying (range 0x30 0x39)
    trail <- A.optional $ do
        _ <- next
        third <- next >>= satisfying (range 0x81 0xFE)
        fourth <- next >>= satisfying (range 0x30 0x39)
        return (third, fourth)
    decoderFourBytes' second trail
  where decoderFourBytes' second (Just (third, fourth)) = maybe
            (decoderFailure [first, second, third, fourth])
            (emit [first, second, third, fourth]) .
            decodeRange $
                (fromIntegral first - 0x81) * 12600
                + (fromIntegral second - 0x30) * 1260
                + (fromIntegral third - 0x81) * 10
                + fromIntegral fourth - 0x30
        decoderFourBytes' _ Nothing = decoderFailure1 first


-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
--      without @is GBK@
-- 
-- Encode the first 'Char' in a string according to the 'Gb18030' encoding
-- scheme, or return that same character if that scheme doesn't define a binary
-- representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoderGb18030 :: BinaryBuilder
encoderGb18030 = encoder False

-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
--      with @is GBK@
-- 
-- Encode the first 'Char' in a string according to the 'Gbk' encoding scheme,
-- or return that same character if that scheme doesn't define a binary
-- representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoderGbk :: BinaryBuilder
encoderGbk = encoder True

-- | __Encoding:__
--      @[gb18030 encoder]
--      (https://encoding.spec.whatwg.org/#gb18030-encoder)@
-- 
-- Encode the first 'Char' in a string according to either the 'Gbk' ('True')
-- or 'Gb18030' ('False') encoding scheme, or return that same character if
-- that scheme doesn't define a binary representation for it.
-- 
-- Does not round-trip on @\\xE5E5@.
encoder :: Bool -> BinaryBuilder
encoder isGbk = next >>= switch
    [ If C.isAscii fromAscii
    , If (== '\xE5E5') encoderFailure
    , If_ (\c -> c == '\x20AC' && isGbk) $ return (pure $ BS.SH.pack [0x80])
    , If (Y.isJust . encodeIndex) $ \c ->
        let (lead', trail) = divMod (Y.fromJust $ encodeIndex c) 190
            lead = fromIntegral lead' + 0x81
        in  return . pure . BS.SH.pack $ if trail < 0x3F
                then [lead, fromIntegral trail + 0x40]
                else [lead, fromIntegral trail + 0x41]
    , If (const isGbk) encoderFailure
    , Else $ \c ->
        let (b1, p1) = divMod (encodeRange c) 12600
            (b2, p2) = divMod p1 1260
            (b3, b4) = divMod p2 10
        in  return . pure $ BS.SH.pack
                [ fromIntegral b1 + 0x81
                , fromIntegral b2 + 0x30
                , fromIntegral b3 + 0x81
                , fromIntegral b4 + 0x30
                ]
    ]


-- | Look for a character in the 'Gb18030' encoding at the given index.
decodeIndex :: Word -> Maybe Char
decodeIndex index = lookupMemoizedIndex decodeIndexM (Just encodeIndexM) index readDecodeIndex

-- | Memoization table to save lookup time in the over-large 'Gb18030' index.
decodeIndexM :: DecoderMemoTable
decodeIndexM = newMemoizationTable
{-# NOINLINE decodeIndexM #-}

-- | Read the character at a given offset from the 'Gb18030' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex index = search index $ loadIndex "gb18030"


-- | Look for the index of a given character in the 'Gb18030' encoding.
encodeIndex :: Char -> Maybe Word
encodeIndex char = lookupMemoizedIndex encodeIndexM (Just decodeIndexM) char readEncodeIndex

-- | Memoization table to save lookup time in the over-large 'Gb18030' index.
encodeIndexM :: EncoderMemoTable
encodeIndexM = newMemoizationTable
{-# NOINLINE encodeIndexM #-}

-- | Find the offset of a given character in the 'Gb18030' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readEncodeIndex :: Char -> Maybe Word
readEncodeIndex char = i1 <|> lookup char is2
  where (is1, is2) = splitAt 6176 . map U.swap $ loadIndex "gb18030"
        i1 = if char >= '\x4E02' && char <= '\xE525'
            then search char is1
            else Nothing


-- | __Encoding:__
--      @[index gb18030 ranges code point]
--      (https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point)@
-- 
-- Retrieve a character reference from the compressed list of four-byte
-- sequences, or 'Nothing' if the character is not included in 'Gb18030'.
-- 
-- Note that this only applies to characters which are part of a four-byte,
-- sequential range.  Offsets to sections with no relation to the Unicode order
-- will not return the proper 'Char'.
decodeRange :: Word -> Maybe Char
decodeRange index
    | index > 39419 && index < 189000 = Nothing
    | index > 1237575 = Nothing
    | index == 7457 = Just '\xE7C7'
    | otherwise = Just . toEnum . fromIntegral $ co + index - po
  where (po, co) = findRange $ \(i, _) -> i <= index

-- | __Encoding:__
--      @[index gb18030 ranges pointer]
--      (https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer)@
-- 
-- Retrieve an index offset from the compressed list of characters encoded in
-- four-byte sequences.  Returns the numeric value of the input unchanged if it
-- is an ASCII character.
-- 
-- Note that this only applies to characters which are part of a four-byte,
-- sequential range.  Characters in sections with no relation to the Unicode
-- order will not return any meaningful offset.
encodeRange :: Char -> Word
encodeRange '\xE7C7' = 7457
encodeRange char = po + fromIntegral (fromEnum char) - co
  where (po, co) = findRange $ \(_, c) -> c <= char

-- | Look up the offset-character pair at the start of the sequential range
-- indicated by the predicate; normally, you would test if the value you
-- already have is greater than or equal to the one passed to the argument
-- function.
-- 
-- Note that there is no length information about the ranges included, so even
-- if the predicate applies it may not be any help in finding the true
-- 'Gb18030' 'Char' or byte sequence.
-- 
-- Returns @(0, 0)@ for any sequence over ASCII, which is actually correct if
-- not really applicable to the four-byte sequences.
findRange :: ((Word, Char) -> Bool) -> (Word, Word)
findRange f = fmap (fromIntegral . fromEnum) . Y.fromMaybe (0, '\NUL') $ V.find f gbRanges

-- | Unlike the unordered indices, the range table is more than small enough to
-- keep resident in memory.  While it does not provide good keys for random
-- access, it is intended to be iterated over---which still perfectly fits the
-- data model of a 'V.Vector'.
-- 
-- Note that, as the values in the index indicate the lower bounds of each
-- range, the memory storage is reversed (highest offsets first) to allow
-- testing for "greatest less-than-or-equal-to" without needing lookahead or
-- backtracking.
gbRanges :: V.Vector (Word, Char)
gbRanges = V.fromList . reverse $ loadIndex "gb18030-ranges"