{-| Description: Character translation functions to and from the UTF-8 encoding scheme. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.Utf8 ( decoder , encoder , byteOrderMark ) where import qualified Control.Applicative as A import qualified Data.Bifunctor as F.B import qualified Data.Bits as B import qualified Data.ByteString as BS import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.Word as W import Data.Functor ( ($>) ) import Data.Bits ( (.&.), (.|.) ) import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser -- | The binary encoding of a byte-order mark character in UTF-8. -- -- Fails if the stream does not start with the exact sequence @[0xEF, 0xBB, -- 0xBF]@. If successful, always returns 'Utf8'. byteOrderMark :: (A.Alternative gather, Monad gather) => ParserT BS.ByteString gather Encoding byteOrderMark = chunk (BS.pack [0xEF, 0xBB, 0xBF]) $> Utf8 -- | __Encoding:__ -- @[UTF-8 decoder] -- (https://encoding.spec.whatwg.org/#utf-8-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with the 'Utf8' encoding -- scheme, or returns 'Left' if the stream starts with an invalid byte -- sequence. decoder :: TextBuilder decoder = do byte <- next c <- decoder' $ fromIntegral byte case c of Left err -> decoderFailure . (byte :) $ BS.SH.unpack err Right code -> emit [byte] . toEnum $ fromIntegral code where decoder' b | b <= 0x7F = return $ Right b | b <= 0xC1 = return $ Left BS.SH.empty | b <= 0xDF = packError . decodeChar 1 defL defH $ b .&. 0x1F | b == 0xE0 = packError . decodeChar 2 0xA0 defH $ b .&. 0xF | b == 0xED = packError . decodeChar 2 defL 0x9F $ b .&. 0xF | b <= 0xEF = packError . decodeChar 2 defL defH $ b .&. 0xF | b == 0xF0 = packError . decodeChar 3 0x90 defH $ b .&. 0x7 | b <= 0xF3 = packError . decodeChar 3 defL defH $ b .&. 0x7 | b == 0xF4 = packError . decodeChar 3 defL 0x8F $ b .&. 0x7 | otherwise = return $ Left BS.SH.empty defL = 0x80 defH = 0xBF packError = fmap $ F.B.first BS.SH.pack -- | __Encoding:__ -- @[UTF-8 decoder] -- (https://encoding.spec.whatwg.org/#utf-8-decoder)@ -- steps 4-11 -- -- Decodes the remainder of a 'Utf8' encoded 'Char', given its remaining length, -- the allowed range of continuation bytes, and the so-far-parsed portion -- (right-justified; i.e. when called for the second byte of a two-byte -- character, the first byte would be trimmed with @'.&.' 0x1F@ /not/ -- 'B.shiftR'). decodeChar :: Word -- ^ The remaining length of the binary sequence. -> W.Word8 -- ^ The lowest allowed value for this byte. -> W.Word8 -- ^ The highest allowed value for this byte. -> Word -- ^ The accumulated code point bits so far. -> Decoder (Either [W.Word8] Word) decodeChar 0 _ _ code = return $ Right code decodeChar len low high code = do byte <- next if range low high byte then F.B.first (byte :) <$> decodeChar (len - 1) 0x80 0xBF (B.shiftL code 6 .|. (fromIntegral byte .&. 0x3F)) else push byte $> Left [] -- | __Encoding:__ -- @[UTF-8 encoder] -- (https://encoding.spec.whatwg.org/#utf-8-encoder)@ -- -- Encode the first 'Char' in a string according to the 'Utf8' encoding scheme, -- or return that same character if that scheme doesn't define a binary -- representation for it. encoder :: BinaryBuilder encoder = do char <- next (len, offset) <- countOffset char let first = fromIntegral $ B.shiftR (fromEnum char) (6 * len) + offset return . pure . BS.SH.pack $ first : trail (fromEnum char) len where countOffset char | C.isAscii char = pure (0, 0) | char <= '\x07FF' = pure (1, 0xC0) | char <= '\xFFFF' = pure (2, 0xE0) | char <= '\x10FFFF' = pure (3, 0xF0) -- 'Char' can't even be initialized this big, but include to make -- the pattern match complete. | otherwise = A.empty trail _ 0 = [] trail code count = fromIntegral (0x80 .|. (B.shiftR code (6 * (count - 1)) .&. 0x3F)) : trail code (count - 1)