{-# LANGUAGE Trustworthy #-}

{-|
Description:    Functions and objects used to build the encode/decode parsers.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.Common
    ( Encoding ( .. )
    , Confidence ( .. )
    , confidenceEncoding
    , ReparseData ( .. )
    , emptyReparseData
    , DecoderState ( .. )
    , EncoderState ( .. )
      -- * Predicates
    , toByte
    , asciiWhitespace
    , asciiWhitespaceB
    , toAsciiLower
    , toAsciiLowerB
    , isAsciiByte
    , range
    , search
      -- * Parsers
      -- ** Decoders
      -- *** Core
    , Decoder
    , StateDecoder
    , getDecoderState
    , modifyDecoderState
    , decoderFailure
    , decoderFailure1
    , decoderFailure2
      -- *** Output
    , TextBuilder
    , StateTextBuilder
    , DecoderError
    , toUnicode
    , toUnicode1
    , emit
    , emit'
      -- ** Encoders
      -- *** Core
    , Encoder
    , StateEncoder
    , getEncoderState
    , modifyEncoderState
    , encoderFailure
      -- *** Output
    , BinaryBuilder
    , StateBinaryBuilder
    , EncoderError
    , fromAscii
      -- * Memoization
    , MemoizationTable
    , DecoderMemoTable
    , EncoderMemoTable
    , lookupMemoizedIndex
      -- ** Construction
    , newMemoizationTable
    , loadIndex
    , loadIndex'
    ) where


import qualified Control.Applicative as A
import qualified Control.Concurrent as IO.C
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BS.L
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Hashable as H
import qualified Data.HashMap.Strict as M.S
import qualified Data.IORef as IO.R
import qualified Data.Maybe as Y
import qualified Data.Text as T
import qualified Data.Text.IO as T.IO
import qualified Data.Text.Read as T.R
import qualified Data.Word as W

import qualified System.IO as IO
import qualified System.IO.Unsafe as IO.Unsafe

import Data.Functor ( ($>) )
import System.FilePath ( (<.>) )

import Paths_willow

import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import {-# SOURCE #-} Web.Willow.Common.Encoding


-- | Allow specifying binary data as ASCII, purely to simplify authoring.
toByte :: Char -> W.Word8
toByte :: Char -> Word8
toByte = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum
{-# INLINE toByte #-}


-- | The underlying binary representations of 'asciiWhitespace', in any
-- ASCII-compatible encoding.
asciiWhitespaceB :: [W.Word8]
asciiWhitespaceB :: [Word8]
asciiWhitespaceB = (Char -> Word8) -> [Char] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> (Char -> Int) -> Char -> Word8
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
forall a. Enum a => a -> Int
fromEnum) [Char]
asciiWhitespace


-- | Map a byte representing an ASCII uppercase letter to its corresponding
-- lowercase value; compare with 'Data.Char.toLower'.
toAsciiLowerB :: W.Word8 -> W.Word8
toAsciiLowerB :: Word8 -> Word8
toAsciiLowerB Word8
c
    | Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
>= Char -> Word8
toByte Char
'A' Bool -> Bool -> Bool
&& Word8
c Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Char -> Word8
toByte Char
'Z' = Word8
c Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
0x20
    | Bool
otherwise = Word8
c


-- | __Encoding:__
--      @[encoding]
--      (https://encoding.spec.whatwg.org/#encoding)@
-- 
-- All character encoding schemes supported by the HTML standard, defined as a
-- bidirectional map between characters and binary sequences.  'Utf8' is
-- strongly encouraged for new content (including all encoding purposes), but
-- the others are retained for compatibility with existing pages.
-- 
-- Note that none of these are complete functions, to one degree or another,
-- and that no guarantee is made that the mapping round-trips.
data Encoding
    = Utf8
        -- ^ The UTF-8 encoding for Unicode.
    | Utf16be
        -- ^ The UTF-16 encoding for Unicode, in big endian order.
        -- 
        -- No encoder is provided for this scheme.
    | Utf16le
        -- ^ The UTF-16 encoding for Unicode, in little endian order.
        -- 
        -- No encoder is provided for this scheme.
    | Big5
        -- ^ [Big5](https://encoding.spec.whatwg.org/big5.html), primarily
        -- covering traditional Chinese characters.
    | EucJp
        -- ^ EUC-JP, primarily covering Japanese as the union of [JIS-0208]
        -- (https://encoding.spec.whatwg.org/jis0208.html) and [JIS-0212]
        -- (https://encoding.spec.whatwg.org/jis0212.html).
    | EucKr
        -- ^ [EUC-KR](https://encoding.spec.whatwg.org/euc-kr.html), primarily
        -- covering Hangul.
    | Gb18030
        -- ^ The [GB18030-2005 extension]
        -- (https://encoding.spec.whatwg.org/gb18030.html) to GBK, with one
        -- tweak for web compatibility, primarily covering both forms of
        -- Chinese characters.
        -- 
        -- Note that this encoding also includes a large number of four-byte
        -- sequences which aren't listed in the linked visualization.
    | Gbk
        -- ^ GBK, primarily covering simplified Chinese characters.
        -- 
        -- In practice, this is just 'Gb18030' with a restricted set of
        -- encodable characters; the decoder is identical.
    | Ibm866
        -- ^ DOS and OS/2 [code page]
        -- (https://encoding.spec.whatwg.org/ibm866.html) for Cyrillic
        -- characters.
    | Iso2022Jp
        -- ^ A Japanese-focused implementation of the ISO 2022 meta-encoding,
        -- including both [JIS-0208]
        -- (https://encoding.spec.whatwg.org/jis0208.html) and halfwidth
        -- katakana.
    | Iso8859_2
        -- ^ [Latin-2](https://encoding.spec.whatwg.org/iso-8859-2.html)
        -- (Central European).
    | Iso8859_3
        -- ^ [Latin-3](https://encoding.spec.whatwg.org/iso-8859-3.html)
        -- (South European and Esperanto)
    | Iso8859_4
        -- ^ [Latin-4](https://encoding.spec.whatwg.org/iso-8859-4.html)
        -- (North European).
    | Iso8859_5
        -- ^ [Latin/Cyrillic](https://encoding.spec.whatwg.org/iso-8859-5.html).
    | Iso8859_6
        -- ^ [Latin/Arabic](https://encoding.spec.whatwg.org/iso-8859-6.html).
    | Iso8859_7
        -- ^ [Latin/Greek](https://encoding.spec.whatwg.org/iso-8859-7.html)
        -- (modern monotonic).
    | Iso8859_8
        -- ^ [Latin/Hebrew](https://encoding.spec.whatwg.org/iso-8859-8.html)
        -- (visual order).
    | Iso8859_8i
        -- ^ [Latin/Hebrew](https://encoding.spec.whatwg.org/iso-8859-8.html)
        -- (logical order).
    | Iso8859_10
        -- ^ [Latin-6](https://encoding.spec.whatwg.org/iso-8859-10.html)
        -- (Nordic).
    | Iso8859_13
        -- ^ [Latin-7](https://encoding.spec.whatwg.org/iso-8859-13.html)
        -- (Baltic Rim).
    | Iso8859_14
        -- ^ [Latin-8](https://encoding.spec.whatwg.org/iso-8859-14.html)
        -- (Celtic).
    | Iso8859_15
        -- ^ [Latin-9](https://encoding.spec.whatwg.org/iso-8859-15.html)
        -- (revision of ISO 8859-1 Latin-1, Western European).
    | Iso8859_16
        -- ^ [Latin-10](https://encoding.spec.whatwg.org/iso-8859-16.html)
        -- (South-Eastern European).
    | Koi8R
        -- ^ KOI-8 [specialized](https://encoding.spec.whatwg.org/koi8-r.html)
        -- for Russian Cyrillic.
    | Koi8U
        -- ^ KOI-8 [specialized](https://encoding.spec.whatwg.org/koi8-u.html)
        -- for Ukrainian Cyrillic.
    | Macintosh
        -- ^ [Mac OS Roman](https://encoding.spec.whatwg.org/macintosh.html).
    | MacintoshCyrillic -- "x-mac-cyrillic"
        -- ^ [Mac OS Cyrillic]
        -- (https://encoding.spec.whatwg.org/x-mac-cyrillic.html) (as of Mac OS
        -- 9.0)
    | ShiftJis
        -- ^ The [Windows variant]
        -- (https://encoding.spec.whatwg.org/shift_jis.html) (code page 932) of
        -- Shift JIS.
    | Windows874
        -- ^ ISO 8859-11 [Latin\/Thai]
        -- (https://encoding.spec.whatwg.org/windows-874.html) with Windows
        -- extensions in the C1 control character slots.
        -- 
        -- Note that this encoding is always used instead of pure Latin/Thai.
    | Windows1250
        -- ^ The Windows [extension and rearrangement]
        -- (https://encoding.spec.whatwg.org/windows-1250.html) of ISO 8859-2
        -- Latin-2.
    | Windows1251
        -- ^ [Windows Cyrillic]
        -- (https://encoding.spec.whatwg.org/windows-1251.html).
    | Windows1252
        -- ^ The Windows extension of ISO 8859-1 [Latin-1]
        -- (https://encoding.spec.whatwg.org/windows-1252.html), replacing most
        -- of the C1 control characters with printable glyphs.
        -- 
        -- Note that this encoding is always used instead of pure Latin-1.
    | Windows1253
        -- ^ [Windows Greek]
        -- (https://encoding.spec.whatwg.org/windows-1253.html) (modern
        -- monotonic).
    | Windows1254
        -- ^ The Windows extension of ISO 8859-9 [Latin-5 (Turkish)]
        -- (https://encoding.spec.whatwg.org/windows-1254.html), replacing most
        -- of the C1 control characters with printable glyphs.
        -- 
        -- Note that this encoding is always used instead of pure Latin-5.
    | Windows1255
        -- ^ The Windows [extension and rearrangement]
        -- (https://encoding.spec.whatwg.org/windows-1255.html) of ISO 8859-8
        -- Latin/Hebrew.
    | Windows1256
        -- ^ [Windows Arabic]
        -- (https://encoding.spec.whatwg.org/windows-1256.html).
    | Windows1257
        -- ^ [Windows Baltic]
        -- (https://encoding.spec.whatwg.org/windows-1257.html).
    | Windows1258
        -- ^ [Windows Vietnamese]
        -- (https://encoding.spec.whatwg.org/windows-1258.html).
    | Replacement
        -- ^ The input is reduced to a single @\\xFFFD@ replacement character.
        -- 
        -- No encoder is provided for this scheme.
    | UserDefined --"x-user-defined"
        -- ^ Non-ASCII bytes (@\\x80@ through @\\xFF@) are mapped to a portion
        -- of the Unicode Private Use Area (@\\xF780@ through @\\xF7FF@).
  deriving ( Encoding -> Encoding -> Bool
(Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool) -> Eq Encoding
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Encoding -> Encoding -> Bool
$c/= :: Encoding -> Encoding -> Bool
== :: Encoding -> Encoding -> Bool
$c== :: Encoding -> Encoding -> Bool
Eq, Eq Encoding
Eq Encoding
-> (Encoding -> Encoding -> Ordering)
-> (Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Bool)
-> (Encoding -> Encoding -> Encoding)
-> (Encoding -> Encoding -> Encoding)
-> Ord Encoding
Encoding -> Encoding -> Bool
Encoding -> Encoding -> Ordering
Encoding -> Encoding -> Encoding
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Encoding -> Encoding -> Encoding
$cmin :: Encoding -> Encoding -> Encoding
max :: Encoding -> Encoding -> Encoding
$cmax :: Encoding -> Encoding -> Encoding
>= :: Encoding -> Encoding -> Bool
$c>= :: Encoding -> Encoding -> Bool
> :: Encoding -> Encoding -> Bool
$c> :: Encoding -> Encoding -> Bool
<= :: Encoding -> Encoding -> Bool
$c<= :: Encoding -> Encoding -> Bool
< :: Encoding -> Encoding -> Bool
$c< :: Encoding -> Encoding -> Bool
compare :: Encoding -> Encoding -> Ordering
$ccompare :: Encoding -> Encoding -> Ordering
$cp1Ord :: Eq Encoding
Ord, Encoding
Encoding -> Encoding -> Bounded Encoding
forall a. a -> a -> Bounded a
maxBound :: Encoding
$cmaxBound :: Encoding
minBound :: Encoding
$cminBound :: Encoding
Bounded, Int -> Encoding
Encoding -> Int
Encoding -> [Encoding]
Encoding -> Encoding
Encoding -> Encoding -> [Encoding]
Encoding -> Encoding -> Encoding -> [Encoding]
(Encoding -> Encoding)
-> (Encoding -> Encoding)
-> (Int -> Encoding)
-> (Encoding -> Int)
-> (Encoding -> [Encoding])
-> (Encoding -> Encoding -> [Encoding])
-> (Encoding -> Encoding -> [Encoding])
-> (Encoding -> Encoding -> Encoding -> [Encoding])
-> Enum Encoding
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: Encoding -> Encoding -> Encoding -> [Encoding]
$cenumFromThenTo :: Encoding -> Encoding -> Encoding -> [Encoding]
enumFromTo :: Encoding -> Encoding -> [Encoding]
$cenumFromTo :: Encoding -> Encoding -> [Encoding]
enumFromThen :: Encoding -> Encoding -> [Encoding]
$cenumFromThen :: Encoding -> Encoding -> [Encoding]
enumFrom :: Encoding -> [Encoding]
$cenumFrom :: Encoding -> [Encoding]
fromEnum :: Encoding -> Int
$cfromEnum :: Encoding -> Int
toEnum :: Int -> Encoding
$ctoEnum :: Int -> Encoding
pred :: Encoding -> Encoding
$cpred :: Encoding -> Encoding
succ :: Encoding -> Encoding
$csucc :: Encoding -> Encoding
Enum, Int -> Encoding -> ShowS
[Encoding] -> ShowS
Encoding -> [Char]
(Int -> Encoding -> ShowS)
-> (Encoding -> [Char]) -> ([Encoding] -> ShowS) -> Show Encoding
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Encoding] -> ShowS
$cshowList :: [Encoding] -> ShowS
show :: Encoding -> [Char]
$cshow :: Encoding -> [Char]
showsPrec :: Int -> Encoding -> ShowS
$cshowsPrec :: Int -> Encoding -> ShowS
Show, ReadPrec [Encoding]
ReadPrec Encoding
Int -> ReadS Encoding
ReadS [Encoding]
(Int -> ReadS Encoding)
-> ReadS [Encoding]
-> ReadPrec Encoding
-> ReadPrec [Encoding]
-> Read Encoding
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Encoding]
$creadListPrec :: ReadPrec [Encoding]
readPrec :: ReadPrec Encoding
$creadPrec :: ReadPrec Encoding
readList :: ReadS [Encoding]
$creadList :: ReadS [Encoding]
readsPrec :: Int -> ReadS Encoding
$creadsPrec :: Int -> ReadS Encoding
Read )
instance H.Hashable Encoding where
    hashWithSalt :: Int -> Encoding -> Int
hashWithSalt Int
s = (Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
s) (Int -> Int) -> (Encoding -> Int) -> Encoding -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Encoding -> Int
forall a. Enum a => a -> Int
fromEnum


-- | All the data which needs to be tracked for correct behaviour in decoding a
-- binary stream into readable text.
data DecoderState = DecoderState
    { DecoderState -> Confidence
decoderConfidence_ :: Confidence
        -- ^ The encoding scheme currently in use by the parser, along with how
        -- likely that scheme actually represents the binary stream.
    , DecoderState -> Maybe Bool
useBom :: Maybe Bool
        -- ^ Whether a byte-order mark at the beginning of the stream should
        -- override the encoding given by 'decoderConfidence_'.  A 'Nothing'
        -- value treats it as any other character, while a @'Just' 'False'@
        -- does the same if and only if it differs from the current encoding,
        -- silently consuming it (without output) if it matches.
    , DecoderState -> InnerDecoderState
innerDecoderState :: InnerDecoderState
        -- ^ Any state parameters specific to the encoding scheme used.  Note
        -- this value takes precidence over the 'Encoding' specified by
        -- 'decoderConfidence_' (but not the degree of confidence itself) if the
        -- two differ.
    , DecoderState -> ShortByteString
remainderBytes :: BS.SH.ShortByteString
        -- ^ Any leftover bytes at the end of the binary stream, which require
        -- further input to be processed in order to correctly map to a
        -- character or error value.
    }
  deriving ( DecoderState -> DecoderState -> Bool
(DecoderState -> DecoderState -> Bool)
-> (DecoderState -> DecoderState -> Bool) -> Eq DecoderState
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: DecoderState -> DecoderState -> Bool
$c/= :: DecoderState -> DecoderState -> Bool
== :: DecoderState -> DecoderState -> Bool
$c== :: DecoderState -> DecoderState -> Bool
Eq, Int -> DecoderState -> ShowS
[DecoderState] -> ShowS
DecoderState -> [Char]
(Int -> DecoderState -> ShowS)
-> (DecoderState -> [Char])
-> ([DecoderState] -> ShowS)
-> Show DecoderState
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [DecoderState] -> ShowS
$cshowList :: [DecoderState] -> ShowS
show :: DecoderState -> [Char]
$cshow :: DecoderState -> [Char]
showsPrec :: Int -> DecoderState -> ShowS
$cshowsPrec :: Int -> DecoderState -> ShowS
Show, ReadPrec [DecoderState]
ReadPrec DecoderState
Int -> ReadS DecoderState
ReadS [DecoderState]
(Int -> ReadS DecoderState)
-> ReadS [DecoderState]
-> ReadPrec DecoderState
-> ReadPrec [DecoderState]
-> Read DecoderState
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [DecoderState]
$creadListPrec :: ReadPrec [DecoderState]
readPrec :: ReadPrec DecoderState
$creadPrec :: ReadPrec DecoderState
readList :: ReadS [DecoderState]
$creadList :: ReadS [DecoderState]
readsPrec :: Int -> ReadS DecoderState
$creadsPrec :: Int -> ReadS DecoderState
Read )

-- | __HTML:__
--      @[confidence]
--      (https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence)@
-- 
-- How likely the specified encoding is to be the actual stream encoding.
-- 
-- The spec names a third confidence level @irrelevant@, to be used when the
-- stream doesn't depend on any particular encoding scheme (i.e. it is
-- composed directly of 'Char's rather than parsed from a binary stream).  This
-- has not been included in the sum type, as it makes little sense to have that
-- as a parameter of the /decoding/ stage.  Use @'Maybe' 'DecoderState'@ to
-- represent it instead.
data Confidence
    = Tentative Encoding ReparseData
        -- ^ The binary stream is likely the named encoding, but more data may
        -- prove it to be something else.  In the latter case, the
        -- 'ReparseData' (if available) may be used to transition to the proper
        -- encoding, or restart the stream if necessary.
    | Certain Encoding
        -- ^ The binary stream is confirmed to be of the given encoding.
  deriving ( Confidence -> Confidence -> Bool
(Confidence -> Confidence -> Bool)
-> (Confidence -> Confidence -> Bool) -> Eq Confidence
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Confidence -> Confidence -> Bool
$c/= :: Confidence -> Confidence -> Bool
== :: Confidence -> Confidence -> Bool
$c== :: Confidence -> Confidence -> Bool
Eq, Int -> Confidence -> ShowS
[Confidence] -> ShowS
Confidence -> [Char]
(Int -> Confidence -> ShowS)
-> (Confidence -> [Char])
-> ([Confidence] -> ShowS)
-> Show Confidence
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Confidence] -> ShowS
$cshowList :: [Confidence] -> ShowS
show :: Confidence -> [Char]
$cshow :: Confidence -> [Char]
showsPrec :: Int -> Confidence -> ShowS
$cshowsPrec :: Int -> Confidence -> ShowS
Show, ReadPrec [Confidence]
ReadPrec Confidence
Int -> ReadS Confidence
ReadS [Confidence]
(Int -> ReadS Confidence)
-> ReadS [Confidence]
-> ReadPrec Confidence
-> ReadPrec [Confidence]
-> Read Confidence
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [Confidence]
$creadListPrec :: ReadPrec [Confidence]
readPrec :: ReadPrec Confidence
$creadPrec :: ReadPrec Confidence
readList :: ReadS [Confidence]
$creadList :: ReadS [Confidence]
readsPrec :: Int -> ReadS Confidence
$creadsPrec :: Int -> ReadS Confidence
Read )

-- | Extract the underlying encoding scheme from the wrapping data.
confidenceEncoding :: Confidence -> Encoding
confidenceEncoding :: Confidence -> Encoding
confidenceEncoding (Tentative Encoding
enc ReparseData
_) = Encoding
enc
confidenceEncoding (Certain Encoding
enc) = Encoding
enc

-- | __HTML:__
--      @[change the encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#change-the-encoding)@
-- 
-- The data required to determine if a new encoding would produce an identical
-- output to what the current one has already done, and to restart the parsing
-- with the new one if the two are incompatible.  Values may be easily
-- initialized via 'emptyReparseData'.
data ReparseData = ReparseData
    { ReparseData -> HashMap ShortByteString Char
parsedChars :: M.S.HashMap BS.SH.ShortByteString Char
        -- ^ The input binary sequences and the resulting characters which are
        -- already emitted to the output.
    , ReparseData -> ByteString
streamStart :: BS.L.ByteString
        -- ^ The complete binary sequence parsed thus far, in case it needs to
        -- be re-processed under a new, incompatible encoding.
    }
  deriving ( ReparseData -> ReparseData -> Bool
(ReparseData -> ReparseData -> Bool)
-> (ReparseData -> ReparseData -> Bool) -> Eq ReparseData
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ReparseData -> ReparseData -> Bool
$c/= :: ReparseData -> ReparseData -> Bool
== :: ReparseData -> ReparseData -> Bool
$c== :: ReparseData -> ReparseData -> Bool
Eq, Int -> ReparseData -> ShowS
[ReparseData] -> ShowS
ReparseData -> [Char]
(Int -> ReparseData -> ShowS)
-> (ReparseData -> [Char])
-> ([ReparseData] -> ShowS)
-> Show ReparseData
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [ReparseData] -> ShowS
$cshowList :: [ReparseData] -> ShowS
show :: ReparseData -> [Char]
$cshow :: ReparseData -> [Char]
showsPrec :: Int -> ReparseData -> ShowS
$cshowsPrec :: Int -> ReparseData -> ShowS
Show, ReadPrec [ReparseData]
ReadPrec ReparseData
Int -> ReadS ReparseData
ReadS [ReparseData]
(Int -> ReadS ReparseData)
-> ReadS [ReparseData]
-> ReadPrec ReparseData
-> ReadPrec [ReparseData]
-> Read ReparseData
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [ReparseData]
$creadListPrec :: ReadPrec [ReparseData]
readPrec :: ReadPrec ReparseData
$creadPrec :: ReadPrec ReparseData
readList :: ReadS [ReparseData]
$creadList :: ReadS [ReparseData]
readsPrec :: Int -> ReadS ReparseData
$creadsPrec :: Int -> ReadS ReparseData
Read )

-- | The collection of data which would indicate nothing has yet been parsed.
emptyReparseData :: ReparseData
emptyReparseData :: ReparseData
emptyReparseData = ReparseData :: HashMap ShortByteString Char -> ByteString -> ReparseData
ReparseData
    { parsedChars :: HashMap ShortByteString Char
parsedChars = HashMap ShortByteString Char
forall k v. HashMap k v
M.S.empty
    , streamStart :: ByteString
streamStart = ByteString
BS.L.empty
    }


-- | All the data which needs to be tracked for correct behaviour in decoding a
-- binary stream into readable text.
data EncoderState = EncoderState
    { EncoderState -> Encoding
encoderScheme :: Encoding
        -- ^ The encoding scheme in use by the parser.
    , EncoderState -> InnerEncoderState
innerEncoderState :: InnerEncoderState
        -- ^ Any state parameters specific to the encoding scheme used.  Note
        -- that this value takes precidence over the 'Encoding' specified by
        -- 'encoderScheme' if the two differ.
    }
  deriving ( EncoderState -> EncoderState -> Bool
(EncoderState -> EncoderState -> Bool)
-> (EncoderState -> EncoderState -> Bool) -> Eq EncoderState
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: EncoderState -> EncoderState -> Bool
$c/= :: EncoderState -> EncoderState -> Bool
== :: EncoderState -> EncoderState -> Bool
$c== :: EncoderState -> EncoderState -> Bool
Eq, Int -> EncoderState -> ShowS
[EncoderState] -> ShowS
EncoderState -> [Char]
(Int -> EncoderState -> ShowS)
-> (EncoderState -> [Char])
-> ([EncoderState] -> ShowS)
-> Show EncoderState
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [EncoderState] -> ShowS
$cshowList :: [EncoderState] -> ShowS
show :: EncoderState -> [Char]
$cshow :: EncoderState -> [Char]
showsPrec :: Int -> EncoderState -> ShowS
$cshowsPrec :: Int -> EncoderState -> ShowS
Show, ReadPrec [EncoderState]
ReadPrec EncoderState
Int -> ReadS EncoderState
ReadS [EncoderState]
(Int -> ReadS EncoderState)
-> ReadS [EncoderState]
-> ReadPrec EncoderState
-> ReadPrec [EncoderState]
-> Read EncoderState
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
readListPrec :: ReadPrec [EncoderState]
$creadListPrec :: ReadPrec [EncoderState]
readPrec :: ReadPrec EncoderState
$creadPrec :: ReadPrec EncoderState
readList :: ReadS [EncoderState]
$creadList :: ReadS [EncoderState]
readsPrec :: Int -> ReadS EncoderState
$creadsPrec :: Int -> ReadS EncoderState
Read )


-- | Test whether the byte is within the range of ASCII (only lower seven bits
-- may be set).
isAsciiByte :: W.Word8 -> Bool
isAsciiByte :: Word8 -> Bool
isAsciiByte = (Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Char -> Word8
toByte Char
'\DEL')


-- | Convert an ASCII character into its respective singleton byte.  Fails if
-- the character is not included in the basic ASCII set.
fromAscii :: Char -> StateBinaryBuilder state
fromAscii :: Char -> StateBinaryBuilder state
fromAscii Char
char
    | Char -> Bool
C.isAscii Char
char = Either Char ShortByteString -> StateBinaryBuilder state
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char ShortByteString -> StateBinaryBuilder state)
-> (ShortByteString -> Either Char ShortByteString)
-> ShortByteString
-> StateBinaryBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> Either Char ShortByteString
forall (f :: * -> *) a. Applicative f => a -> f a
pure (ShortByteString -> StateBinaryBuilder state)
-> ShortByteString -> StateBinaryBuilder state
forall a b. (a -> b) -> a -> b
$ [Word8] -> ShortByteString
BS.SH.pack [Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> Int -> Word8
forall a b. (a -> b) -> a -> b
$ Char -> Int
forall a. Enum a => a -> Int
fromEnum Char
char]
    | Bool
otherwise = StateBinaryBuilder state
forall (f :: * -> *) a. Alternative f => f a
A.empty

-- | Convert a numeric value to its associated Unicode character, and wrap it
-- accordingly for the parsers.
toUnicode :: Integral a => [W.Word8] -> a -> StateTextBuilder state
toUnicode :: [Word8] -> a -> StateTextBuilder state
toUnicode [Word8]
bs = [Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs (Char -> StateTextBuilder state)
-> (a -> Char) -> a -> StateTextBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> (a -> Int) -> a -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

-- | Convert a numeric value to its associated Unicode character, and wrap it
-- accordingly for the parsers.
toUnicode1 :: W.Word8 -> StateTextBuilder state
toUnicode1 :: Word8 -> StateTextBuilder state
toUnicode1 Word8
b = [Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8
b] (Char -> StateTextBuilder state)
-> (Int -> Char) -> Int -> StateTextBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> StateTextBuilder state) -> Int -> StateTextBuilder state
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
b

-- | Wrap a single character in the enclosing functors required to use it as
-- the output of a binary-to-text parser.
emit :: [W.Word8] -> Char -> StateTextBuilder state
emit :: [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs Char
c = do
    ((Confidence, state) -> (Confidence, state))
-> StateT (Confidence, state) (Parser ByteString) ()
forall (m :: * -> *) s. Monad m => (s -> s) -> StateT s m ()
N.S.modify (((Confidence, state) -> (Confidence, state))
 -> StateT (Confidence, state) (Parser ByteString) ())
-> ((Confidence, state) -> (Confidence, state))
-> StateT (Confidence, state) (Parser ByteString) ()
forall a b. (a -> b) -> a -> b
$ \(Confidence, state)
state -> case (Confidence, state) -> Confidence
forall a b. (a, b) -> a
fst (Confidence, state)
state of
        Tentative Encoding
enc ReparseData
d' -> (Encoding -> ReparseData -> Confidence
Tentative Encoding
enc (ReparseData -> Confidence) -> ReparseData -> Confidence
forall a b. (a -> b) -> a -> b
$ ReparseData -> ReparseData
insertChar ReparseData
d', (Confidence, state) -> state
forall a b. (a, b) -> b
snd (Confidence, state)
state)
          where insertChar :: ReparseData -> ReparseData
insertChar ReparseData
d = ReparseData
d
                    { parsedChars :: HashMap ShortByteString Char
parsedChars = ShortByteString
-> Char
-> HashMap ShortByteString Char
-> HashMap ShortByteString Char
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.S.insert ([Word8] -> ShortByteString
BS.SH.pack [Word8]
bs) Char
c (HashMap ShortByteString Char -> HashMap ShortByteString Char)
-> HashMap ShortByteString Char -> HashMap ShortByteString Char
forall a b. (a -> b) -> a -> b
$ ReparseData -> HashMap ShortByteString Char
parsedChars ReparseData
d
                    }
        Confidence
_ -> (Confidence, state)
state
    Either ShortByteString [Char] -> StateTextBuilder state
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString [Char] -> StateTextBuilder state)
-> ([Char] -> Either ShortByteString [Char])
-> [Char]
-> StateTextBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> Either ShortByteString [Char]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([Char] -> StateTextBuilder state)
-> [Char] -> StateTextBuilder state
forall a b. (a -> b) -> a -> b
$ Char -> [Char]
forall (f :: * -> *) a. Applicative f => a -> f a
pure Char
c


-- | Wrap several characters in the enclosing functors required to use them as
-- the output of a binary-to-text parser.
emit' :: [W.Word8] -> String -> StateTextBuilder state
emit' :: [Word8] -> [Char] -> StateTextBuilder state
emit' [Word8]
bs [Char
c] = [Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs Char
c
emit' [Word8]
bs [Char]
cs = do
    -- This is not the actual character(s) emitted, but the string is
    -- inherantly not going to be the same as any single one emitted by any
    -- other encoding, so any non-equal value will work as a placeholder.
    Either ShortByteString [Char]
_ <- [Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs Char
'\NUL'
    Either ShortByteString [Char] -> StateTextBuilder state
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString [Char] -> StateTextBuilder state)
-> Either ShortByteString [Char] -> StateTextBuilder state
forall a b. (a -> b) -> a -> b
$ [Char] -> Either ShortByteString [Char]
forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
cs


-- | A parser combinator written as part of the algorithm to extract text from
-- a binary stream, carrying some persistent state.
type StateDecoder state = StateParser (Confidence, state) BS.ByteString

-- | A parser combinator written as part of the algorithm to extract text from
-- a binary stream.  For compatibility with the few encodings which require
-- tracking some state, all of the core bytes-to-text algorithms are written
-- within a null 'Control.Monad.State.Lazy.MonadState'.
type Decoder = StateDecoder ()

-- | The fallible output of one of the core bytes-to-text parsers.  If some
-- binary sequence proves uninterpretable, the 'Left' value contains the
-- original sequence unchanged; such a value should be either ignored or
-- reported verbatim, with no further fallback attempts at parsing.
type DecoderError out = Either BS.SH.ShortByteString out


-- | Fetch the current value of the state determining how to transform a binary
-- stream into a 'Char' representation within an encoding scheme.
getDecoderState :: StateDecoder state state
getDecoderState :: StateDecoder state state
getDecoderState = (Confidence, state) -> state
forall a b. (a, b) -> b
snd ((Confidence, state) -> state)
-> StateT
     (Confidence, state) (Parser ByteString) (Confidence, state)
-> StateDecoder state state
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> StateT (Confidence, state) (Parser ByteString) (Confidence, state)
forall (m :: * -> *) s. Monad m => StateT s m s
N.S.get

-- | Update the current state of the binary-to-'Char' parser according to the
-- given function.
modifyDecoderState :: (state -> state) -> StateDecoder state ()
modifyDecoderState :: (state -> state) -> StateDecoder state ()
modifyDecoderState state -> state
f = ((Confidence, state) -> (Confidence, state))
-> StateDecoder state ()
forall (m :: * -> *) s. Monad m => (s -> s) -> StateT s m ()
N.S.modify (((Confidence, state) -> (Confidence, state))
 -> StateDecoder state ())
-> ((Confidence, state) -> (Confidence, state))
-> StateDecoder state ()
forall a b. (a -> b) -> a -> b
$ (state -> state) -> (Confidence, state) -> (Confidence, state)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
F.B.second state -> state
f


-- | A parser combinator written as part of the algorithm to serialize text
-- into a binary stream, carrying some persistent state.
type StateEncoder state = StateParser state T.Text

-- | A parser combinator written as part of the algorithm to serialize text
-- into a binary stream.  For compatibility with the few encodings which
-- require tracking some state, all of the core bytes-to-text algorithms are
-- written within a null 'Control.Monad.State.Lazy.MonadState'.
type Encoder = StateEncoder ()

-- | The fallible output of one of the core text-to-bytes parsers.  If some
-- character is not representable in the encoding, the 'Left' value contains
-- the original 'Char' unchanged; such a value should be either ignored or
-- reported verbatim, with no further fallback attempts at encoding.
type EncoderError out = Either Char out


-- | Fetch the current value of the state determining how to serialize 'Char's
-- into a binary representation within an encoding scheme.
getEncoderState :: StateEncoder state state
getEncoderState :: StateEncoder state state
getEncoderState = StateEncoder state state
forall (m :: * -> *) s. Monad m => StateT s m s
N.S.get

-- | Update the current state of the 'Char'-to-binary parser according to the
-- given function.
modifyEncoderState :: (state -> state) -> StateEncoder state ()
modifyEncoderState :: (state -> state) -> StateEncoder state ()
modifyEncoderState = (state -> state) -> StateEncoder state ()
forall (m :: * -> *) s. Monad m => (s -> s) -> StateT s m ()
N.S.modify


-- | Efficient construction of 'T.Text' strings from a 'BS.ByteString'.  This
-- will be called in a loop until failure or the end of the stream, and so
-- should return a sensible minimum string given the restriction of consuming
-- whole bytes (e.g. a UTF-7 'T.B.Builder' might translate blocks of eight
-- bytes into three 'Char's, while UTF-8 would translate a 'Char' at a time
-- from a variable number of bytes).
-- 
-- If the current such block is invalid for the encoding, it should be returned
-- in the 'Left' instead of any "sensible" default, so failure only occurs at
-- the end of the stream.  Note that a failing base case must eventually be
-- reached on an empty stream, or the generated 'T.Text' will be infinite.
type TextBuilder = Decoder (DecoderError String)

-- | As 'TextBuilder', for those few encodings which require tracking a
-- persistent state across character boundaries.
type StateTextBuilder state = StateDecoder state (DecoderError String)


-- | Efficient construction of 'BS.ByteString's from a 'T.Text' string.  This
-- will be called in a loop until failure or the end of the stream, and so
-- should operate on a single 'Char' at a time unless some property of the
-- encoding requires a longer span (e.g. UTF-7 requires three 'Char's to return
-- to a byte boundary).  No attempt at Unicode (de)composition is attempted,
-- even if that would succeed where the input form fails.
-- 
-- If the current 'Char' can not be represented in the encoding, it should be
-- returned as a 'Left' value, so failure only occurs at the end of the stream.
-- Note that a failing base case must eventually be reached on an empty stream,
-- or the generated 'BS.ByteString' will be infinite.
type BinaryBuilder = Encoder (EncoderError BS.SH.ShortByteString)

-- | As 'BinaryBuilder', for those few encodings which require tracking a
-- persistent state across character boundaries.
type StateBinaryBuilder state = StateEncoder state (EncoderError BS.SH.ShortByteString)


-- | Pack the given binary sequence as a failed parse.
decoderFailure :: [W.Word8] -> StateTextBuilder state
decoderFailure :: [Word8] -> StateTextBuilder state
decoderFailure [Word8]
bs = do
    Either ShortByteString [Char]
_ <- [Word8] -> Char -> StateTextBuilder state
forall state. [Word8] -> Char -> StateTextBuilder state
emit [Word8]
bs Char
replacementChar
    Either ShortByteString [Char] -> StateTextBuilder state
forall (m :: * -> *) a. Monad m => a -> m a
return (Either ShortByteString [Char] -> StateTextBuilder state)
-> (ShortByteString -> Either ShortByteString [Char])
-> ShortByteString
-> StateTextBuilder state
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShortByteString -> Either ShortByteString [Char]
forall a b. a -> Either a b
Left (ShortByteString -> StateTextBuilder state)
-> ShortByteString -> StateTextBuilder state
forall a b. (a -> b) -> a -> b
$ [Word8] -> ShortByteString
BS.SH.pack [Word8]
bs

-- | Pack the given byte as a singleton sequence triggering a failed parse.
decoderFailure1 :: W.Word8 -> StateTextBuilder state
decoderFailure1 :: Word8 -> StateTextBuilder state
decoderFailure1 Word8
b = [Word8] -> StateTextBuilder state
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
b]

-- | Combine the given bytes as a binary sequence triggering a failed parse.
-- This is purely driven by a few fallback statements in multi-byte encodings,
-- where the lead byte is known from the invocation, and the trail byte is
-- piped in from the 'Web.Willow.Common.Parser.Switch.switch' block.
decoderFailure2 :: W.Word8 -> W.Word8 -> StateTextBuilder state
decoderFailure2 :: Word8 -> Word8 -> StateTextBuilder state
decoderFailure2 Word8
b Word8
b' = [Word8] -> StateTextBuilder state
forall state. [Word8] -> StateTextBuilder state
decoderFailure [Word8
b, Word8
b']

-- | Pack the given character as invalid for the target encoding.
encoderFailure :: Char -> StateEncoder state (EncoderError out)
encoderFailure :: Char -> StateEncoder state (EncoderError out)
encoderFailure = EncoderError out -> StateEncoder state (EncoderError out)
forall (m :: * -> *) a. Monad m => a -> m a
return (EncoderError out -> StateEncoder state (EncoderError out))
-> (Char -> EncoderError out)
-> Char
-> StateEncoder state (EncoderError out)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> EncoderError out
forall a b. a -> Either a b
Left


-- | Retrieve a character index file distributed as part of the HTML standard,
-- and process it into an easily-digestible type.  Note that this is a heavy
-- function and should be cached whenever possible.
-- 
-- This may cause a panic if the encoding index file doesn't exist or the user
-- doesn't have the proper permissions to read it.
loadIndex :: String -> [(Word, Char)]
loadIndex :: [Char] -> [(Word, Char)]
loadIndex = ((Word, Char) -> Bool) -> [Char] -> [(Word, Char)]
loadIndex' (((Word, Char) -> Bool) -> [Char] -> [(Word, Char)])
-> ((Word, Char) -> Bool) -> [Char] -> [(Word, Char)]
forall a b. (a -> b) -> a -> b
$ Bool -> (Word, Char) -> Bool
forall a b. a -> b -> a
const Bool
True

-- | Retrieve a character index file distributed as part of the HTML standard,
-- and process it into an easily-digestible type, filtering out a selection of
-- the original values in the process.  Note that this is a heavy function and
-- should be cached whenever possible.
-- 
-- This may cause a panic if the encoding index file doesn't exist or the user
-- doesn't have the proper permissions to read it.
loadIndex' :: ((Word, Char) -> Bool) -> String -> [(Word, Char)]
loadIndex' :: ((Word, Char) -> Bool) -> [Char] -> [(Word, Char)]
loadIndex' (Word, Char) -> Bool
test [Char]
name = IO [(Word, Char)] -> [(Word, Char)]
forall a. IO a -> a
IO.Unsafe.unsafePerformIO (IO [(Word, Char)] -> [(Word, Char)])
-> IO [(Word, Char)] -> [(Word, Char)]
forall a b. (a -> b) -> a -> b
$ do
    [Char]
path <- [Char] -> IO [Char]
getDataFileName ([Char] -> IO [Char]) -> [Char] -> IO [Char]
forall a b. (a -> b) -> a -> b
$ [Char]
name [Char] -> ShowS
<.> [Char]
"index"
    -- In addition to the expected errors above, this may theoretically throw
    -- an 'IO.isAlreadyInUseError'; however, that should mostly be an issue
    -- when writing, rather than in 'IO.ReadMode'.
    Text
index <- [Char] -> IOMode -> (Handle -> IO Text) -> IO Text
forall r. [Char] -> IOMode -> (Handle -> IO r) -> IO r
IO.withFile [Char]
path IOMode
IO.ReadMode Handle -> IO Text
T.IO.hGetContents
    [(Word, Char)] -> IO [(Word, Char)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Word, Char)] -> IO [(Word, Char)])
-> ([Text] -> [(Word, Char)]) -> [Text] -> IO [(Word, Char)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> Maybe (Word, Char)) -> [Text] -> [(Word, Char)]
forall a b. (a -> Maybe b) -> [a] -> [b]
Y.mapMaybe (((Word, Char) -> Bool) -> Text -> Maybe (Word, Char)
indexLine (Word, Char) -> Bool
test) ([Text] -> IO [(Word, Char)]) -> [Text] -> IO [(Word, Char)]
forall a b. (a -> b) -> a -> b
$ Text -> [Text]
T.lines Text
index
{-# NOINLINE loadIndex' #-}

-- | Process a line in the format used by the HTML index files, discarding the
-- result if it fails the given predicate.
indexLine :: ((Word, Char) -> Bool) -> T.Text -> Maybe (Word, Char)
indexLine :: ((Word, Char) -> Bool) -> Text -> Maybe (Word, Char)
indexLine (Word, Char) -> Bool
test Text
l
    | Text -> Bool
T.null Text
l = Maybe (Word, Char)
forall a. Maybe a
Nothing
    | Text -> Char
T.head Text
l Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' = Maybe (Word, Char)
forall a. Maybe a
Nothing
    | Bool
otherwise = [Text] -> Maybe (Word, Char)
indexLine' ([Text] -> Maybe (Word, Char)) -> [Text] -> Maybe (Word, Char)
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Text -> [Text]
T.split (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\t') Text
l
  where indexLine' :: [Text] -> Maybe (Word, Char)
indexLine' (Text
i':Text
c':[Text]
_) = do
            (Word, Text)
i <- ([Char] -> Maybe (Word, Text))
-> ((Word, Text) -> Maybe (Word, Text))
-> Either [Char] (Word, Text)
-> Maybe (Word, Text)
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either [Char] -> Maybe (Word, Text)
forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail (Word, Text) -> Maybe (Word, Text)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either [Char] (Word, Text) -> Maybe (Word, Text))
-> (Text -> Either [Char] (Word, Text))
-> Text
-> Maybe (Word, Text)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Either [Char] (Word, Text)
forall a. Integral a => Reader a
T.R.decimal (Text -> Maybe (Word, Text)) -> Text -> Maybe (Word, Text)
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.strip Text
i'
            (Int, Text)
c <- ([Char] -> Maybe (Int, Text))
-> ((Int, Text) -> Maybe (Int, Text))
-> Either [Char] (Int, Text)
-> Maybe (Int, Text)
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either [Char] -> Maybe (Int, Text)
forall (m :: * -> *) a. MonadFail m => [Char] -> m a
fail (Int, Text) -> Maybe (Int, Text)
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either [Char] (Int, Text) -> Maybe (Int, Text))
-> (Text -> Either [Char] (Int, Text)) -> Text -> Maybe (Int, Text)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Either [Char] (Int, Text)
forall a. Integral a => Reader a
T.R.hexadecimal (Text -> Maybe (Int, Text)) -> Text -> Maybe (Int, Text)
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.strip Text
c'
            let entry :: (Word, Char)
entry = ((Word, Text) -> Word
forall a b. (a, b) -> a
fst (Word, Text)
i, Int -> Char
forall a. Enum a => Int -> a
toEnum (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ (Int, Text) -> Int
forall a b. (a, b) -> a
fst (Int, Text)
c)
            if (Word, Char) -> Bool
test (Word, Char)
entry then (Word, Char) -> Maybe (Word, Char)
forall (m :: * -> *) a. Monad m => a -> m a
return (Word, Char)
entry else Maybe (Word, Char)
forall a. Maybe a
Nothing
        indexLine' [Text]
_ = Maybe (Word, Char)
forall a. Maybe a
Nothing


-- | Memoization table for lookups into the CJK index files.  Storing the
-- entire thing in a 'Data.Vector.Vector' or other indexable form would take a
-- megabyte of memory each in the best case, and many characters would likely
-- never be used.  At the same time, performing a file lookup on every
-- character would be unacceptably slow.  Instead, take the performance hit
-- /once/ per character, and store the result for faster access in future
-- calls.
-- 
-- Depending on the frequency of the updates, some updates (via
-- @updateIndices@) may be lost.  This should still be a smaller performance
-- hit than getting blocked while several other threads try to do the same.
type MemoizationTable k v = IO.R.IORef (M.S.HashMap k v)

-- | Specialization of the CJK lookup table for parsers comprising a
-- bytes-to-text algorithm.
type DecoderMemoTable = IO (MemoizationTable Word (Maybe Char))

-- | Specialization of the CJK lookup table for parsers comprising a
-- text-to-bytes algorithm.
type EncoderMemoTable = IO (MemoizationTable Char (Maybe Word))

-- | Allocate a lookup table for caching the results of expensive computations.
-- Note that the table is mutable and threaded access should be reasoned
-- through carefully.
newMemoizationTable :: IO (MemoizationTable k v)
newMemoizationTable :: IO (MemoizationTable k v)
newMemoizationTable = HashMap k v -> IO (MemoizationTable k v)
forall a. a -> IO (IORef a)
IO.R.newIORef HashMap k v
forall k v. HashMap k v
M.S.empty


-- | Look for the index of a given character, checking chached values before
-- running a (potentially expensive) direct retrieval function.  Specialized
-- for symmetric lookup functions, and so populates the inverse table at the
-- same time, if given one.
lookupMemoizedIndex
    :: (Eq k, H.Hashable k, Eq v, H.Hashable v)
    => IO (MemoizationTable k (Maybe v))
    -> Maybe (IO (MemoizationTable v (Maybe k)))
    -> k
    -> (k -> Maybe v)
    -> Maybe v
lookupMemoizedIndex :: IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> k
-> (k -> Maybe v)
-> Maybe v
lookupMemoizedIndex IO (MemoizationTable k (Maybe v))
memo Maybe (IO (MemoizationTable v (Maybe k)))
inverse k
key k -> Maybe v
valueF = IO (Maybe v) -> Maybe v
forall a. IO a -> a
IO.Unsafe.unsafePerformIO (IO (Maybe v) -> Maybe v) -> IO (Maybe v) -> Maybe v
forall a b. (a -> b) -> a -> b
$ do
    -- 'IO.Unsafe.unsafePerformIO' should be safe, as the only effect the
    -- environment theoretically has on the computation is the time it takes to
    -- complete (i.e. whether that particular character has been memoized yet).
    HashMap k (Maybe v)
is <- IO (MemoizationTable k (Maybe v))
memo IO (MemoizationTable k (Maybe v))
-> (MemoizationTable k (Maybe v) -> IO (HashMap k (Maybe v)))
-> IO (HashMap k (Maybe v))
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= MemoizationTable k (Maybe v) -> IO (HashMap k (Maybe v))
forall a. IORef a -> IO a
IO.R.readIORef
    case k -> HashMap k (Maybe v) -> Maybe (Maybe v)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.S.lookup k
key HashMap k (Maybe v)
is of
        Just Maybe v
i -> Maybe v -> IO (Maybe v)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe v
i
        Maybe (Maybe v)
Nothing -> k
-> Maybe v
-> IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> IO (Maybe v)
forall k v.
(Eq k, Hashable k, Eq v, Hashable v) =>
k
-> Maybe v
-> IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> IO (Maybe v)
updateIndices k
key (k -> Maybe v
valueF k
key) IO (MemoizationTable k (Maybe v))
memo Maybe (IO (MemoizationTable v (Maybe k)))
inverse
{-# NOINLINE lookupMemoizedIndex #-}

-- | Given either an index offset or a 'Char' code point, and whether the other
-- was found in the on-disc index file, update one or both memoization tables
-- to avoid having to look for it again.
updateIndices
    :: (Eq k, H.Hashable k, Eq v, H.Hashable v)
    => k
    -> Maybe v
    -> IO (MemoizationTable k (Maybe v))
    -> Maybe (IO (MemoizationTable v (Maybe k)))
    -> IO (Maybe v)
updateIndices :: k
-> Maybe v
-> IO (MemoizationTable k (Maybe v))
-> Maybe (IO (MemoizationTable v (Maybe k)))
-> IO (Maybe v)
updateIndices k
k Maybe v
Nothing IO (MemoizationTable k (Maybe v))
l Maybe (IO (MemoizationTable v (Maybe k)))
_ =
    (IO (MemoizationTable k (Maybe v))
l IO (MemoizationTable k (Maybe v))
-> (MemoizationTable k (Maybe v) -> IO ThreadId) -> IO ThreadId
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= IO () -> IO ThreadId
IO.C.forkIO (IO () -> IO ThreadId)
-> (MemoizationTable k (Maybe v) -> IO ())
-> MemoizationTable k (Maybe v)
-> IO ThreadId
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (MemoizationTable k (Maybe v)
 -> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ())
-> (HashMap k (Maybe v) -> HashMap k (Maybe v))
-> MemoizationTable k (Maybe v)
-> IO ()
forall a b c. (a -> b -> c) -> b -> a -> c
flip MemoizationTable k (Maybe v)
-> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ()
forall a. IORef a -> (a -> a) -> IO ()
IO.R.modifyIORef' (k -> Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.S.insert k
k Maybe v
forall a. Maybe a
Nothing)) IO ThreadId -> Maybe v -> IO (Maybe v)
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Maybe v
forall a. Maybe a
Nothing
updateIndices k
k v' :: Maybe v
v'@(Just v
v) IO (MemoizationTable k (Maybe v))
l Maybe (IO (MemoizationTable v (Maybe k)))
Nothing =
    (IO (MemoizationTable k (Maybe v))
l IO (MemoizationTable k (Maybe v))
-> (MemoizationTable k (Maybe v) -> IO ThreadId) -> IO ThreadId
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= IO () -> IO ThreadId
IO.C.forkIO (IO () -> IO ThreadId)
-> (MemoizationTable k (Maybe v) -> IO ())
-> MemoizationTable k (Maybe v)
-> IO ThreadId
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (MemoizationTable k (Maybe v)
 -> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ())
-> (HashMap k (Maybe v) -> HashMap k (Maybe v))
-> MemoizationTable k (Maybe v)
-> IO ()
forall a b c. (a -> b -> c) -> b -> a -> c
flip MemoizationTable k (Maybe v)
-> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ()
forall a. IORef a -> (a -> a) -> IO ()
IO.R.modifyIORef' (k -> Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.S.insert k
k (Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v))
-> Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v)
forall a b. (a -> b) -> a -> b
$ v -> Maybe v
forall a. a -> Maybe a
Just v
v)) IO ThreadId -> Maybe v -> IO (Maybe v)
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Maybe v
v'
updateIndices k
k v' :: Maybe v
v'@(Just v
v) IO (MemoizationTable k (Maybe v))
l (Just IO (MemoizationTable v (Maybe k))
r) =
    (IO (MemoizationTable k (Maybe v))
l IO (MemoizationTable k (Maybe v))
-> (MemoizationTable k (Maybe v) -> IO ThreadId) -> IO ThreadId
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= IO () -> IO ThreadId
IO.C.forkIO (IO () -> IO ThreadId)
-> (MemoizationTable k (Maybe v) -> IO ())
-> MemoizationTable k (Maybe v)
-> IO ThreadId
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (MemoizationTable k (Maybe v)
 -> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ())
-> (HashMap k (Maybe v) -> HashMap k (Maybe v))
-> MemoizationTable k (Maybe v)
-> IO ()
forall a b c. (a -> b -> c) -> b -> a -> c
flip MemoizationTable k (Maybe v)
-> (HashMap k (Maybe v) -> HashMap k (Maybe v)) -> IO ()
forall a. IORef a -> (a -> a) -> IO ()
IO.R.modifyIORef' (k -> Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.S.insert k
k (Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v))
-> Maybe v -> HashMap k (Maybe v) -> HashMap k (Maybe v)
forall a b. (a -> b) -> a -> b
$ v -> Maybe v
forall a. a -> Maybe a
Just v
v)) IO ThreadId -> IO ThreadId -> IO ThreadId
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*>
    (IO (MemoizationTable v (Maybe k))
r IO (MemoizationTable v (Maybe k))
-> (MemoizationTable v (Maybe k) -> IO ThreadId) -> IO ThreadId
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= IO () -> IO ThreadId
IO.C.forkIO (IO () -> IO ThreadId)
-> (MemoizationTable v (Maybe k) -> IO ())
-> MemoizationTable v (Maybe k)
-> IO ThreadId
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (MemoizationTable v (Maybe k)
 -> (HashMap v (Maybe k) -> HashMap v (Maybe k)) -> IO ())
-> (HashMap v (Maybe k) -> HashMap v (Maybe k))
-> MemoizationTable v (Maybe k)
-> IO ()
forall a b c. (a -> b -> c) -> b -> a -> c
flip MemoizationTable v (Maybe k)
-> (HashMap v (Maybe k) -> HashMap v (Maybe k)) -> IO ()
forall a. IORef a -> (a -> a) -> IO ()
IO.R.modifyIORef' (v -> Maybe k -> HashMap v (Maybe k) -> HashMap v (Maybe k)
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.S.insert v
v (Maybe k -> HashMap v (Maybe k) -> HashMap v (Maybe k))
-> Maybe k -> HashMap v (Maybe k) -> HashMap v (Maybe k)
forall a b. (a -> b) -> a -> b
$ k -> Maybe k
forall a. a -> Maybe a
Just k
k)) IO ThreadId -> Maybe v -> IO (Maybe v)
forall (f :: * -> *) a b. Functor f => f a -> b -> f b
$> Maybe v
v'

-- | Find the value of a given key in an ordered list.  At the moment, just an
-- alias for an O(n) 'lookup', but will hopefully be improved at some point.
search :: Ord k => k -> [(k, v)] -> Maybe v
search :: k -> [(k, v)] -> Maybe v
search = k -> [(k, v)] -> Maybe v
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup