{-# LANGUAGE Trustworthy #-}

{-|
Description:    All encoders and decoders for one-byte-per-'Char' encodings.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.SingleByte
    ( decoder
    , encoder
    , encodings
    ) where


import qualified Control.Applicative as A
import qualified Control.Monad as N

import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.HashMap.Lazy as M.L
import qualified Data.HashMap.Strict as M.S
import qualified Data.Maybe as Y
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as V.M
import qualified Data.Word as W

import qualified System.IO.Unsafe as IO.Unsafe

import Data.Vector ( (!?) )

import Web.Willow.Common.Encoding.Common
import Web.Willow.Common.Parser


-- | __Encoding:__
--      @[single-byte decoder]
--      (https://encoding.spec.whatwg.org/#single-byte-decoder)@
-- 
-- Decodes a 'Char' from a binary stream encoded with a given
-- byte-per-character encoding, or returns 'Left' if the stream starts with a
-- byte not used by that encoding.
-- 
-- Fails if the 'Encoding' is handled by a different algorithm.
decoder :: Encoding -> TextBuilder
decoder :: Encoding -> TextBuilder
decoder Encoding
enc = do
    Vector (Maybe Char)
index <- StateT (Confidence, ()) (Parser ByteString) (Vector (Maybe Char))
-> (Vector (Maybe Char)
    -> StateT
         (Confidence, ()) (Parser ByteString) (Vector (Maybe Char)))
-> Maybe (Vector (Maybe Char))
-> StateT
     (Confidence, ()) (Parser ByteString) (Vector (Maybe Char))
forall b a. b -> (a -> b) -> Maybe a -> b
maybe StateT (Confidence, ()) (Parser ByteString) (Vector (Maybe Char))
forall (f :: * -> *) a. Alternative f => f a
A.empty Vector (Maybe Char)
-> StateT
     (Confidence, ()) (Parser ByteString) (Vector (Maybe Char))
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (Vector (Maybe Char))
 -> StateT
      (Confidence, ()) (Parser ByteString) (Vector (Maybe Char)))
-> Maybe (Vector (Maybe Char))
-> StateT
     (Confidence, ()) (Parser ByteString) (Vector (Maybe Char))
forall a b. (a -> b) -> a -> b
$ Encoding
-> HashMap Encoding (Vector (Maybe Char))
-> Maybe (Vector (Maybe Char))
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.L.lookup Encoding
enc HashMap Encoding (Vector (Maybe Char))
decodeIndices
    Word8
byte <- StateT (Confidence, ()) (Parser ByteString) Word8
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
    if Word8 -> Bool
isAsciiByte Word8
byte
        then [Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
byte] (Char -> TextBuilder) -> (Int -> Char) -> Int -> TextBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> TextBuilder) -> Int -> TextBuilder
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
byte
        else TextBuilder -> (Char -> TextBuilder) -> Maybe Char -> TextBuilder
forall b a. b -> (a -> b) -> Maybe a -> b
maybe ([Word8] -> TextBuilder
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
byte]) ([Word8] -> Char -> TextBuilder
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
byte]) (Maybe Char -> TextBuilder)
-> (Maybe (Maybe Char) -> Maybe Char)
-> Maybe (Maybe Char)
-> TextBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe (Maybe Char) -> Maybe Char
forall (m :: * -> *) a. Monad m => m (m a) -> m a
N.join (Maybe (Maybe Char) -> TextBuilder)
-> Maybe (Maybe Char) -> TextBuilder
forall a b. (a -> b) -> a -> b
$
            Vector (Maybe Char)
index Vector (Maybe Char) -> Int -> Maybe (Maybe Char)
forall a. Vector a -> Int -> Maybe a
!? Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8
byte Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- Word8
0x80)

-- | __Encoding:__
--      @[single-byte encoder]
--      (https://encoding.spec.whatwg.org/#single-byte-encoder)@
-- 
-- Encode the first 'Char' in a string according to a given byte-per-character
-- encoding scheme, or return that same character if that scheme doesn't define
-- a binary representation for it.
-- 
-- Fails if the 'Encoding' is handled by a different algorithm.
encoder :: Encoding -> BinaryBuilder
encoder :: Encoding -> BinaryBuilder
encoder Encoding
enc = do
    HashMap Char Word8
index <- StateT () (Parser Text) (HashMap Char Word8)
-> (HashMap Char Word8
    -> StateT () (Parser Text) (HashMap Char Word8))
-> Maybe (HashMap Char Word8)
-> StateT () (Parser Text) (HashMap Char Word8)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe StateT () (Parser Text) (HashMap Char Word8)
forall (f :: * -> *) a. Alternative f => f a
A.empty HashMap Char Word8 -> StateT () (Parser Text) (HashMap Char Word8)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe (HashMap Char Word8)
 -> StateT () (Parser Text) (HashMap Char Word8))
-> Maybe (HashMap Char Word8)
-> StateT () (Parser Text) (HashMap Char Word8)
forall a b. (a -> b) -> a -> b
$ Encoding
-> HashMap Encoding (HashMap Char Word8)
-> Maybe (HashMap Char Word8)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.L.lookup Encoding
enc HashMap Encoding (HashMap Char Word8)
encodeIndices
    Char
char <- StateT () (Parser Text) Char
forall (m :: * -> *) stream token.
MonadParser m stream token =>
m token
next
    BinaryBuilder -> Maybe BinaryBuilder -> BinaryBuilder
forall a. a -> Maybe a -> a
Y.fromMaybe (Char -> BinaryBuilder
forall state out. Char -> StateEncoder state (EncoderError out)
encoderFailure Char
char) (Maybe BinaryBuilder -> BinaryBuilder)
-> Maybe BinaryBuilder -> BinaryBuilder
forall a b. (a -> b) -> a -> b
$ if Char -> Bool
C.isAscii Char
char
        then BinaryBuilder -> Maybe BinaryBuilder
forall a. a -> Maybe a
Just (BinaryBuilder -> Maybe BinaryBuilder)
-> BinaryBuilder -> Maybe BinaryBuilder
forall a b. (a -> b) -> a -> b
$ Char -> BinaryBuilder
forall state. Char -> StateBinaryBuilder state
fromAscii Char
char
        else Either Char ShortByteString -> BinaryBuilder
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char ShortByteString -> BinaryBuilder)
-> (Word8 -> Either Char ShortByteString) -> Word8 -> BinaryBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> Either Char ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> Either Char ShortByteString)
-> (Word8 -> ShortByteString)
-> Word8
-> Either Char ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ShortByteString
BS.SH.pack ([Word8] -> ShortByteString)
-> (Word8 -> [Word8]) -> Word8 -> ShortByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> [Word8] -> [Word8]
forall a. a -> [a] -> [a]
: []) (Word8 -> [Word8]) -> (Word8 -> Word8) -> Word8 -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x80) (Word8 -> BinaryBuilder) -> Maybe Word8 -> Maybe BinaryBuilder
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Char -> HashMap Char Word8 -> Maybe Word8
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.S.lookup Char
char HashMap Char Word8
index


-- | __Encoding:__
--      @[Legacy single-byte encodings]
--      (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)@
--      table column 1
-- 
-- All byte-per-character encodings handled by 'decoder' and 'encoder'; those
-- parsers will fail if passed any 'Encoding' not in this list.
encodings :: [Encoding]
encodings :: [Encoding]
encodings = ((Encoding, String) -> Encoding)
-> [(Encoding, String)] -> [Encoding]
forall a b. (a -> b) -> [a] -> [b]
map (Encoding, String) -> Encoding
forall a b. (a, b) -> a
fst [(Encoding, String)]
indexNames


-- | The lookup tables generated for reading byte-per-character encodings.  The
-- 'V.Vector's themselves should hopefully not actually be evaluated (and
-- therefore read from disc) until they're needed.
decodeIndices :: M.L.HashMap Encoding (V.Vector (Maybe Char))
decodeIndices :: HashMap Encoding (Vector (Maybe Char))
decodeIndices = [(Encoding, Vector (Maybe Char))]
-> HashMap Encoding (Vector (Maybe Char))
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.L.fromList ([(Encoding, Vector (Maybe Char))]
 -> HashMap Encoding (Vector (Maybe Char)))
-> [(Encoding, Vector (Maybe Char))]
-> HashMap Encoding (Vector (Maybe Char))
forall a b. (a -> b) -> a -> b
$ ((Encoding, String) -> (Encoding, Vector (Maybe Char)))
-> [(Encoding, String)] -> [(Encoding, Vector (Maybe Char))]
forall a b. (a -> b) -> [a] -> [b]
map ((String -> Vector (Maybe Char))
-> (Encoding, String) -> (Encoding, Vector (Maybe Char))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Vector (Maybe Char)
readDecodeIndex) [(Encoding, String)]
indexNames

-- | Generate the in-memory representation of high bytes in byte-per-character
-- encodings.
-- 
-- As all the single-byte 'Encoding's used by the HTML standard pack the
-- limited space tightly, and because that space is indeed limited, using a
-- 'V.Vector' instead of a 'M.S.HashMap' is able to provide great random lookup
-- and space efficiency.
-- 
-- This uses 'IO.Unsafe.unsafePerformIO' internally, as the index files
-- shouldn't change during runtime, and the vector mutability is
-- tightly-scoped.  This should therefore be a safe use of it.
readDecodeIndex :: String -> V.Vector (Maybe Char)
readDecodeIndex :: String -> Vector (Maybe Char)
readDecodeIndex String
name = IO (Vector (Maybe Char)) -> Vector (Maybe Char)
forall a. IO a -> a
IO.Unsafe.unsafePerformIO (IO (Vector (Maybe Char)) -> Vector (Maybe Char))
-> IO (Vector (Maybe Char)) -> Vector (Maybe Char)
forall a b. (a -> b) -> a -> b
$ do
    IOVector (Maybe Char)
vector <- Int -> Maybe Char -> IO (MVector (PrimState IO) (Maybe Char))
forall (m :: * -> *) a.
PrimMonad m =>
Int -> a -> m (MVector (PrimState m) a)
V.M.replicate Int
0x80 Maybe Char
forall a. Maybe a
Nothing
    ((Word, Char) -> IO ()) -> [(Word, Char)] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (IOVector (Maybe Char) -> (Word, Char) -> IO ()
setDecodeIndex IOVector (Maybe Char)
vector) ([(Word, Char)] -> IO ()) -> [(Word, Char)] -> IO ()
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
name
    MVector (PrimState IO) (Maybe Char) -> IO (Vector (Maybe Char))
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> m (Vector a)
V.freeze IOVector (Maybe Char)
MVector (PrimState IO) (Maybe Char)
vector
{-# NOINLINE readDecodeIndex #-}

-- | Modify the data storage used for a particular 'Encoding' based on a single
-- pre-processed line in its index file.
setDecodeIndex :: V.M.IOVector (Maybe Char) -> (Word, Char) -> IO ()
setDecodeIndex :: IOVector (Maybe Char) -> (Word, Char) -> IO ()
setDecodeIndex IOVector (Maybe Char)
vector (Word
index, Char
char) = MVector (PrimState IO) (Maybe Char) -> Int -> Maybe Char -> IO ()
forall (m :: * -> *) a.
PrimMonad m =>
MVector (PrimState m) a -> Int -> a -> m ()
V.M.write IOVector (Maybe Char)
MVector (PrimState IO) (Maybe Char)
vector (Word -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word
index) (Char -> Maybe Char
forall a. a -> Maybe a
Just Char
char)


-- | The lookup tables generated for writing in byte-per-character encodings.
-- The value 'M.S.HashMap's themselves should hopefully not actually be
-- evaluated (and therefore read from disc) until they're needed.
encodeIndices :: M.L.HashMap Encoding (M.S.HashMap Char W.Word8)
encodeIndices :: HashMap Encoding (HashMap Char Word8)
encodeIndices = [(Encoding, HashMap Char Word8)]
-> HashMap Encoding (HashMap Char Word8)
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.L.fromList ([(Encoding, HashMap Char Word8)]
 -> HashMap Encoding (HashMap Char Word8))
-> [(Encoding, HashMap Char Word8)]
-> HashMap Encoding (HashMap Char Word8)
forall a b. (a -> b) -> a -> b
$ ((Encoding, String) -> (Encoding, HashMap Char Word8))
-> [(Encoding, String)] -> [(Encoding, HashMap Char Word8)]
forall a b. (a -> b) -> [a] -> [b]
map ((String -> HashMap Char Word8)
-> (Encoding, String) -> (Encoding, HashMap Char Word8)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> HashMap Char Word8
readEncodeIndex) [(Encoding, String)]
indexNames

-- | Generate the in-memory lookup of high bytes in byte-per-character
-- encodings.
-- 
-- The limited size and resulting high percentage of characters used per-file
-- mean that a strict 'M.S.HashMap' should take less memory than a lazy one,
-- with its thunks.
readEncodeIndex :: String -> M.S.HashMap Char W.Word8
readEncodeIndex :: String -> HashMap Char Word8
readEncodeIndex String
name = [(Char, Word8)] -> HashMap Char Word8
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.S.fromList ([(Char, Word8)] -> HashMap Char Word8)
-> ([(Word, Char)] -> [(Char, Word8)])
-> [(Word, Char)]
-> HashMap Char Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Word, Char) -> (Char, Word8))
-> [(Word, Char)] -> [(Char, Word8)]
forall a b. (a -> b) -> [a] -> [b]
map (Word, Char) -> (Char, Word8)
forall a b a. (Integral a, Num b) => (a, a) -> (a, b)
pack ([(Word, Char)] -> HashMap Char Word8)
-> [(Word, Char)] -> HashMap Char Word8
forall a b. (a -> b) -> a -> b
$ String -> [(Word, Char)]
loadIndex String
name
  where pack :: (a, a) -> (a, b)
pack (a
index, a
char) = (a
char, a -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
index)


-- | __Encoding:__
--      @[Legacy single-byte encodings]
--      (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)@
--      table columns 1-2
-- 
-- The mapping between byte-per-character encodings and their lookup index
-- file name (without extension or path).  If one isn't listed here, 'decoder'
-- and 'encoder' will fail if given that 'Encoding'.
indexNames :: [(Encoding, String)]
indexNames :: [(Encoding, String)]
indexNames =
    [ (Encoding
Ibm866, String
"ibm866")
    , (Encoding
Iso8859_2, String
"iso-8859-2")
    , (Encoding
Iso8859_3, String
"iso-8859-3")
    , (Encoding
Iso8859_4, String
"iso-8859-4")
    , (Encoding
Iso8859_5, String
"iso-8859-5")
    , (Encoding
Iso8859_6, String
"iso-8859-6")
    , (Encoding
Iso8859_7, String
"iso-8859-7")
    , (Encoding
Iso8859_8, String
"iso-8859-8")
    , (Encoding
Iso8859_8i, String
"iso-8859-8")
    , (Encoding
Iso8859_10, String
"iso-8859-10")
    , (Encoding
Iso8859_13, String
"iso-8859-13")
    , (Encoding
Iso8859_14, String
"iso-8859-14")
    , (Encoding
Iso8859_15, String
"iso-8859-15")
    , (Encoding
Iso8859_16, String
"iso-8859-16")
    , (Encoding
Koi8R, String
"koi8-r")
    , (Encoding
Koi8U, String
"koi8-u")
    , (Encoding
Macintosh, String
"macintosh")
    , (Encoding
MacintoshCyrillic, String
"x-mac-cyrillic")
    , (Encoding
Windows874, String
"windows-874")
    , (Encoding
Windows1250, String
"windows-1250")
    , (Encoding
Windows1251, String
"windows-1251")
    , (Encoding
Windows1252, String
"windows-1252")
    , (Encoding
Windows1253, String
"windows-1253")
    , (Encoding
Windows1254, String
"windows-1254")
    , (Encoding
Windows1255, String
"windows-1255")
    , (Encoding
Windows1256, String
"windows-1256")
    , (Encoding
Windows1257, String
"windows-1257")
    , (Encoding
Windows1258, String
"windows-1258")
    ]