{-# LANGUAGE Trustworthy #-} {-| Description: All encoders and decoders for one-byte-per-'Char' encodings. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: experimental Portability: portable -} module Web.Willow.Common.Encoding.SingleByte ( decoder , encoder , encodings ) where import qualified Control.Applicative as A import qualified Control.Monad as N import qualified Data.ByteString.Short as BS.SH import qualified Data.Char as C import qualified Data.HashMap.Lazy as M.L import qualified Data.HashMap.Strict as M.S import qualified Data.Maybe as Y import qualified Data.Vector as V import qualified Data.Vector.Mutable as V.M import qualified Data.Word as W import qualified System.IO.Unsafe as IO.Unsafe import Data.Vector ( (!?) ) import Web.Willow.Common.Encoding.Common import Web.Willow.Common.Parser -- | __Encoding:__ -- @[single-byte decoder] -- (https://encoding.spec.whatwg.org/#single-byte-decoder)@ -- -- Decodes a 'Char' from a binary stream encoded with a given -- byte-per-character encoding, or returns 'Left' if the stream starts with a -- byte not used by that encoding. -- -- Fails if the 'Encoding' is handled by a different algorithm. decoder :: Encoding -> TextBuilder decoder enc = do index <- maybe A.empty return $ M.L.lookup enc decodeIndices byte <- next if isAsciiByte byte then emit [byte] . toEnum $ fromIntegral byte else maybe (decoderFailure [byte]) (emit [byte]) . N.join $ index !? fromIntegral (byte - 0x80) -- | __Encoding:__ -- @[single-byte encoder] -- (https://encoding.spec.whatwg.org/#single-byte-encoder)@ -- -- Encode the first 'Char' in a string according to a given byte-per-character -- encoding scheme, or return that same character if that scheme doesn't define -- a binary representation for it. -- -- Fails if the 'Encoding' is handled by a different algorithm. encoder :: Encoding -> BinaryBuilder encoder enc = do index <- maybe A.empty return $ M.L.lookup enc encodeIndices char <- next Y.fromMaybe (encoderFailure char) $ if C.isAscii char then Just $ fromAscii char else return . pure . BS.SH.pack . (: []) . (+ 0x80) <$> M.S.lookup char index -- | __Encoding:__ -- @[Legacy single-byte encodings] -- (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)@ -- table column 1 -- -- All byte-per-character encodings handled by 'decoder' and 'encoder'; those -- parsers will fail if passed any 'Encoding' not in this list. encodings :: [Encoding] encodings = map fst indexNames -- | The lookup tables generated for reading byte-per-character encodings. The -- 'V.Vector's themselves should hopefully not actually be evaluated (and -- therefore read from disc) until they're needed. decodeIndices :: M.L.HashMap Encoding (V.Vector (Maybe Char)) decodeIndices = M.L.fromList $ map (fmap readDecodeIndex) indexNames -- | Generate the in-memory representation of high bytes in byte-per-character -- encodings. -- -- As all the single-byte 'Encoding's used by the HTML standard pack the -- limited space tightly, and because that space is indeed limited, using a -- 'V.Vector' instead of a 'M.S.HashMap' is able to provide great random lookup -- and space efficiency. -- -- This uses 'IO.Unsafe.unsafePerformIO' internally, as the index files -- shouldn't change during runtime, and the vector mutability is -- tightly-scoped. This should therefore be a safe use of it. readDecodeIndex :: String -> V.Vector (Maybe Char) readDecodeIndex name = IO.Unsafe.unsafePerformIO $ do vector <- V.M.replicate 0x80 Nothing mapM_ (setDecodeIndex vector) $ loadIndex name V.freeze vector {-# NOINLINE readDecodeIndex #-} -- | Modify the data storage used for a particular 'Encoding' based on a single -- pre-processed line in its index file. setDecodeIndex :: V.M.IOVector (Maybe Char) -> (Word, Char) -> IO () setDecodeIndex vector (index, char) = V.M.write vector (fromIntegral index) (Just char) -- | The lookup tables generated for writing in byte-per-character encodings. -- The value 'M.S.HashMap's themselves should hopefully not actually be -- evaluated (and therefore read from disc) until they're needed. encodeIndices :: M.L.HashMap Encoding (M.S.HashMap Char W.Word8) encodeIndices = M.L.fromList $ map (fmap readEncodeIndex) indexNames -- | Generate the in-memory lookup of high bytes in byte-per-character -- encodings. -- -- The limited size and resulting high percentage of characters used per-file -- mean that a strict 'M.S.HashMap' should take less memory than a lazy one, -- with its thunks. readEncodeIndex :: String -> M.S.HashMap Char W.Word8 readEncodeIndex name = M.S.fromList . map pack $ loadIndex name where pack (index, char) = (char, fromIntegral index) -- | __Encoding:__ -- @[Legacy single-byte encodings] -- (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)@ -- table columns 1-2 -- -- The mapping between byte-per-character encodings and their lookup index -- file name (without extension or path). If one isn't listed here, 'decoder' -- and 'encoder' will fail if given that 'Encoding'. indexNames :: [(Encoding, String)] indexNames = [ (Ibm866, "ibm866") , (Iso8859_2, "iso-8859-2") , (Iso8859_3, "iso-8859-3") , (Iso8859_4, "iso-8859-4") , (Iso8859_5, "iso-8859-5") , (Iso8859_6, "iso-8859-6") , (Iso8859_7, "iso-8859-7") , (Iso8859_8, "iso-8859-8") , (Iso8859_8i, "iso-8859-8") , (Iso8859_10, "iso-8859-10") , (Iso8859_13, "iso-8859-13") , (Iso8859_14, "iso-8859-14") , (Iso8859_15, "iso-8859-15") , (Iso8859_16, "iso-8859-16") , (Koi8R, "koi8-r") , (Koi8U, "koi8-u") , (Macintosh, "macintosh") , (MacintoshCyrillic, "x-mac-cyrillic") , (Windows874, "windows-874") , (Windows1250, "windows-1250") , (Windows1251, "windows-1251") , (Windows1252, "windows-1252") , (Windows1253, "windows-1253") , (Windows1254, "windows-1254") , (Windows1255, "windows-1255") , (Windows1256, "windows-1256") , (Windows1257, "windows-1257") , (Windows1258, "windows-1258") ]