{-|
Description:    Character translation functions to and from the EUC-KR encoding scheme.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.EucKr
    ( decoder
    , encoder
    ) where


import qualified Control.Applicative as A

import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Tuple as U
import qualified Data.Word as W

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Switch


-- | __Encoding:__
--      @[EUC-KR decoder]
--      (https://encoding.spec.whatwg.org/#euc-kr-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with the 'EucKr' encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoder :: TextBuilder
decoder = next >>= switch
    [ If isAsciiByte toUnicode1
    , If (range 0x81 0xFE) decoder'
    , Else decoderFailure1
    ]

-- | __Encoding:__
--      @[EUC-KR decoder]
--      (https://encoding.spec.whatwg.org/#euc-kr-decoder)@
--      step 3
-- 
-- Process a multi-byte sequence according to either the @jis0208@ ('False') or
-- the @jis0212@ ('True') encoding scheme.
decoder' :: W.Word8 -> TextBuilder
decoder' lead = next >>= switch
    [ If (range 0x41 0xFE) $ lookupDecode lead
    , If isAsciiByte $ \b -> push b *> decoderFailure1 lead
    , Else $ decoderFailure2 lead
    ]

-- | __Encoding:__
--      @[EUC-KR decoder]
--      (https://encoding.spec.whatwg.org/#euc-kr-decoder)@
--      step 3, substeps 1-2
-- 
-- Calculate the character index value from a two-byte sequence, then retrieve
-- the 'Char' associated with that index in the 'EucKr' encoding scheme.
lookupDecode :: W.Word8 -> W.Word8 -> TextBuilder
lookupDecode lead trail = maybe failure (emit bs) . decodeIndex $
    (fromIntegral lead - 0x81) * 190 + fromIntegral trail - 0x41
  where failure
            | isAsciiByte trail = push trail *> decoderFailure1 lead
            | otherwise = decoderFailure bs
        bs = [lead, trail]


-- | __Encoding:__
--      @[EUC-KR encoder]
--      (https://encoding.spec.whatwg.org/#euc-kr-encoder)@
-- 
-- Encode the first 'Char' in a string according to the 'EucKr' encoding
-- scheme, or return that same character if that scheme doesn't define a binary
-- representation for it.
encoder :: BinaryBuilder
encoder = next >>= switch
    [ If C.isAscii fromAscii
    , Else encodeIndex
    ]


-- | Look for a character in the 'EucKr' encoding at the given index.
decodeIndex :: Word -> Maybe Char
decodeIndex index = lookupMemoizedIndex decodeIndexM (Just encodeIndexM) index readDecodeIndex

-- | Memoization table to save lookup time in the over-large 'EucKr' index.
decodeIndexM :: DecoderMemoTable
decodeIndexM = newMemoizationTable
{-# NOINLINE decodeIndexM #-}

-- | Read the character at a given offset from the 'EucKr' index.  Note that
-- this is a heavy function, and should be cached whenever possible.
readDecodeIndex :: Word -> Maybe Char
readDecodeIndex index = search index $ loadIndex "euc-kr"


-- | Look for the binary representation of a given character in the 'EucKr'
-- encoding.
encodeIndex :: Char -> BinaryBuilder
encodeIndex char = case lookupMemoizedIndex encodeIndexM (Just decodeIndexM) char readEncodeIndex of
    Just i ->
        let (lead, trail) = divMod i 188
            lead' = lead + if lead < 0x1F then 0x81 else 0xC1
            trail' = trail + if trail < 0x3F then 0x40 else 0x41
        in  return . pure $ BS.SH.pack [fromIntegral lead', fromIntegral trail']
    Nothing -> A.empty

-- | Memoization table to save lookup time in the over-large 'EucKr' index.
encodeIndexM :: EncoderMemoTable
encodeIndexM = newMemoizationTable
{-# NOINLINE encodeIndexM #-}

-- | Find the offset of a given character in the 'EucKr' index.  Note that this
-- is a heavy function, and should be cached whenever possible.
readEncodeIndex :: Char -> Maybe Word
readEncodeIndex char = search char . map U.swap $ loadIndex "euc-kr"