{-| Description: Character translation functions to and from the UTF-16 encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.Utf16 ( -- * Decoders decoderBigEndian , decoderLittleEndian -- * BOM Detection , byteOrderMarkBigEndian , byteOrderMarkLittleEndian ) where import qualified Control.Applicative as A import qualified Data.Bits as B import qualified Data.ByteString as BS import qualified Data.Word as W import Data.Functor ( ($>) ) import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser -- | The binary encoding of a byte-order mark character in UTF-16 with -- big-endian layout. -- -- Fails if the stream does not start with the exact sequence @[0xFE, 0xFF]@. -- If successful, always returns 'Utf16be'. byteOrderMarkBigEndian :: (A.Alternative gather, Monad gather) => ParserT BS.ByteString gather Encoding byteOrderMarkBigEndian = chunk (BS.pack [0xFE, 0xFF]) $> Utf16be -- | The binary encoding of a byte-order mark character in UTF-16 with -- little-endian layout. -- -- Fails if the stream does not start with the exact sequence @[0xFF, 0xFE]@. -- If successful, always returns 'Utf16le'. byteOrderMarkLittleEndian :: (A.Alternative gather, Monad gather) => ParserT BS.ByteString gather Encoding byteOrderMarkLittleEndian = chunk (BS.pack [0xFF, 0xFE]) $> Utf16le -- | __Encoding:__ -- @[UTF-16BE decoder] -- (https://encoding.spec.whatwg.org/#utf-16be-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'Utf16be' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoderBigEndian :: TextBuilder decoderBigEndian = decoder True -- | __Encoding:__ -- @[UTF-16LE decoder] -- (https://encoding.spec.whatwg.org/#utf-16le-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'Utf16le' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoderLittleEndian :: TextBuilder decoderLittleEndian = decoder False -- | __Encoding:__ -- @[shared UTF-16 decoder] -- (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with either UTF-16 encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: Bool -- ^ Endianness of the stream; 'True' is 'Utf16be', 'False' is 'Utf16le'. -> TextBuilder decoder bigEndian = next >>= \b1 -> next >>= \b2 -> case decodeUnit bigEndian b1 b2 of unit | range 0xDC00 0xDFFF unit -> decoderFailure [b1, b2] unit | range 0xD800 0xDBFF unit -> decoder' bigEndian unit b1 b2 unit -> toUnicode [b1, b2] unit -- | __Encoding:__ -- @[shared UTF-16 decoder] -- (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@ -- step 5 -- -- Process a character encoded as a surrogate pair according to the UTF-16 -- encoding scheme. decoder' :: Bool -- ^ Endianness of the stream; 'True' is 'Utf16be', 'False' is 'Utf16le'. -> Word -- ^ The code point of the first surrogate in the pair. -> W.Word8 -- ^ The first byte of the entire four-byte sequence, for error reporting. -> W.Word8 -- ^ The second byte of the entire four-byte sequence, for error reporting. -> TextBuilder decoder' bigEndian unit1 b1 b2 = next >>= \b3 -> next >>= \b4 -> case decodeUnit bigEndian b3 b4 of unit2 | range 0xDC00 0xDFFF unit2 -> toUnicode [b1, b2, b3, b4] $ 0x10000 + B.shiftL (unit1 - 0xD800) 10 + unit2 - 0xDC00 _ -> push b4 *> push b3 *> decoderFailure [b1, b2] -- | __Encoding:__ -- @[shared UTF-16 decoder] -- (https://encoding.spec.whatwg.org/#shared-utf-16-decoder)@ -- step 4 -- -- Join a two-byte UTF-16 sequence into a single value, according to the -- endianness of the stream. decodeUnit :: Bool -> W.Word8 -> W.Word8 -> Word decodeUnit True lead trail = B.shiftL (fromIntegral lead) 8 + fromIntegral trail decodeUnit False lead trail = B.shiftL (fromIntegral trail) 8 + fromIntegral lead