{-|
Description:    Character translation functions to and from the UTF-16 encoding scheme.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.Utf16
    ( -- * Decoders
      decoderBigEndian
    , decoderLittleEndian
      -- * BOM Detection
    , byteOrderMarkBigEndian
    , byteOrderMarkLittleEndian
    ) where


import qualified Control.Applicative as A

import qualified Data.Bits as B
import qualified Data.ByteString as BS
import qualified Data.Word as W

import Data.Functor ( ($>) )

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser


-- | The binary encoding of a byte-order mark character in UTF-16 with
-- big-endian layout.
-- 
-- Fails if the stream does not start with the exact sequence @[0xFE, 0xFF]@.
-- If successful, always returns 'Utf16be'.
byteOrderMarkBigEndian
    :: (A.Alternative gather, Monad gather)
    => ParserT BS.ByteString gather Encoding
byteOrderMarkBigEndian :: ParserT ByteString gather Encoding
byteOrderMarkBigEndian = ByteString -> ParserT ByteString gather ByteString
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq stream) =>
stream -> trans stream
chunk ([Word8] -> ByteString
BS.pack [Word8
0xFE, Word8
0xFF]) ParserT ByteString gather ByteString
-> Encoding -> ParserT ByteString gather Encoding
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Encoding
Utf16be

-- | The binary encoding of a byte-order mark character in UTF-16 with
-- little-endian layout.
-- 
-- Fails if the stream does not start with the exact sequence @[0xFF, 0xFE]@.
-- If successful, always returns 'Utf16le'.
byteOrderMarkLittleEndian
    :: (A.Alternative gather, Monad gather)
    => ParserT BS.ByteString gather Encoding
byteOrderMarkLittleEndian :: ParserT ByteString gather Encoding
byteOrderMarkLittleEndian = ByteString -> ParserT ByteString gather ByteString
forall (trans :: * -> *) stream token.
(MonadParser trans stream token, Eq stream) =>
stream -> trans stream
chunk ([Word8] -> ByteString
BS.pack [Word8
0xFF, Word8
0xFE]) ParserT ByteString gather ByteString
-> Encoding -> ParserT ByteString gather Encoding
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Encoding
Utf16le


-- | __Encoding:__
--      @[UTF-16BE decoder]
--      (https://encoding.spec.whatwg.org/#utf-16be-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with the 'Utf16be' encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoderBigEndian :: TextBuilder
decoderBigEndian :: TextBuilder
decoderBigEndian = Bool -> TextBuilder
decoder Bool
True

-- | __Encoding:__
--      @[UTF-16LE decoder]
--      (https://encoding.spec.whatwg.org/#utf-16le-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with the 'Utf16le' encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoderLittleEndian :: TextBuilder
decoderLittleEndian :: TextBuilder
decoderLittleEndian = Bool -> TextBuilder
decoder Bool
False


-- | __Encoding:__
--      @[shared UTF-16 decoder]
--      (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with either UTF-16 encoding
-- scheme, or returns 'Left' if the stream starts with an invalid byte
-- sequence.
decoder
    :: Bool
        -- ^ Endianness of the stream; 'True' is 'Utf16be', 'False' is 'Utf16le'.
    -> TextBuilder
decoder :: Bool -> TextBuilder
decoder Bool
bigEndian = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word8
b1 -> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word8
b2 ->
    case Bool -> Word8 -> Word8 -> Word
decodeUnit Bool
bigEndian Word8
b1 Word8
b2 of
        Word
unit | Word -> Word -> Word -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word
0xDC00 Word
0xDFFF Word
unit -> [Word8] -> TextBuilder
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
b1, Word8
b2]
        Word
unit | Word -> Word -> Word -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word
0xD800 Word
0xDBFF Word
unit -> Bool -> Word -> Word8 -> Word8 -> TextBuilder
decoder' Bool
bigEndian Word
unit Word8
b1 Word8
b2
        Word
unit -> [Word8] -> Word -> TextBuilder
forall a state.
Integral a =>
[Word8] -> a -> StateTextBuilder state
toUnicode [Word8
b1, Word8
b2] Word
unit

-- | __Encoding:__
--      @[shared UTF-16 decoder]
--      (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@
--      step 5
-- 
-- Process a character encoded as a surrogate pair according to the UTF-16
-- encoding scheme.
decoder'
    :: Bool
        -- ^ Endianness of the stream; 'True' is 'Utf16be', 'False' is 'Utf16le'.
    -> Word
        -- ^ The code point of the first surrogate in the pair.
    -> W.Word8
        -- ^ The first byte of the entire four-byte sequence, for error reporting.
    -> W.Word8
        -- ^ The second byte of the entire four-byte sequence, for error reporting.
    -> TextBuilder
decoder' :: Bool -> Word -> Word8 -> Word8 -> TextBuilder
decoder' Bool
bigEndian Word
unit1 Word8
b1 Word8
b2 = StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word8
b3 -> StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next StateT (Confidence, ()) (Parser ByteString) Word8
-> (Word8 -> TextBuilder) -> TextBuilder
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word8
b4 ->
    case Bool -> Word8 -> Word8 -> Word
decodeUnit Bool
bigEndian Word8
b3 Word8
b4 of
        Word
unit2 | Word -> Word -> Word -> Bool
forall a. Ord a => a -> a -> a -> Bool
range Word
0xDC00 Word
0xDFFF Word
unit2 -> [Word8] -> Word -> TextBuilder
forall a state.
Integral a =>
[Word8] -> a -> StateTextBuilder state
toUnicode [Word8
b1, Word8
b2, Word8
b3, Word8
b4] (Word -> TextBuilder) -> Word -> TextBuilder
forall a b. (a -> b) -> a -> b
$
            Word
0x10000 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word -> Int -> Word
forall a. Bits a => a -> Int -> a
B.shiftL (Word
unit1 Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0xD800) Int
10 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word
unit2 Word -> Word -> Word
forall a. Num a => a -> a -> a
- Word
0xDC00
        Word
_ -> Word8 -> StateT (Confidence, ()) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
b4 StateT (Confidence, ()) (Parser ByteString) ()
-> StateT (Confidence, ()) (Parser ByteString) ()
-> StateT (Confidence, ()) (Parser ByteString) ()
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> StateT (Confidence, ()) (Parser ByteString) ()
forall (m :: * -> *) stream token.
MonadParser m stream token =>
token -> m ()
push Word8
b3 StateT (Confidence, ()) (Parser ByteString) ()
-> TextBuilder -> TextBuilder
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> [Word8] -> TextBuilder
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
b1, Word8
b2]

-- | __Encoding:__
--      @[shared UTF-16 decoder]
--      (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@
--      step 4
-- 
-- Join a two-byte UTF-16 sequence into a single value, according to the
-- endianness of the stream.
decodeUnit :: Bool -> W.Word8 -> W.Word8 -> Word
decodeUnit :: Bool -> Word8 -> Word8 -> Word
decodeUnit Bool
True Word8
lead Word8
trail = Word -> Int -> Word
forall a. Bits a => a -> Int -> a
B.shiftL (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
lead) Int
8 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
trail
decodeUnit Bool
False Word8
lead Word8
trail = Word -> Int -> Word
forall a. Bits a => a -> Int -> a
B.shiftL (Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
trail) Int
8 Word -> Word -> Word
forall a. Num a => a -> a -> a
+ Word8 -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
lead