{-# LANGUAGE Trustworthy #-}

{-|
Description:    Functions and objects used to build the encode/decode parsers.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      experimental
Portability:    portable
-}
module Web.Willow.Common.Encoding.Common
    ( Encoding ( .. )
    , Confidence ( .. )
    , confidenceEncoding
    , ReparseData ( .. )
    , emptyReparseData
    , DecoderState ( .. )
    , EncoderState ( .. )
      -- * Predicates
    , toByte
    , asciiWhitespace
    , asciiWhitespaceB
    , toAsciiLower
    , toAsciiLowerB
    , isAsciiByte
    , range
    , search
      -- * Parsers
      -- ** Decoders
      -- *** Core
    , Decoder
    , StateDecoder
    , getDecoderState
    , modifyDecoderState
    , decoderFailure
    , decoderFailure1
    , decoderFailure2
      -- *** Output
    , TextBuilder
    , StateTextBuilder
    , DecoderError
    , toUnicode
    , toUnicode1
    , emit
    , emit'
      -- ** Encoders
      -- *** Core
    , Encoder
    , StateEncoder
    , getEncoderState
    , modifyEncoderState
    , encoderFailure
      -- *** Output
    , BinaryBuilder
    , StateBinaryBuilder
    , EncoderError
    , fromAscii
      -- * Memoization
    , MemoizationTable
    , DecoderMemoTable
    , EncoderMemoTable
    , lookupMemoizedIndex
      -- ** Construction
    , newMemoizationTable
    , loadIndex
    , loadIndex'
    ) where


import qualified Control.Applicative as A
import qualified Control.Concurrent as IO.C
import qualified Control.Monad.Trans.State as N.S

import qualified Data.Bifunctor as F.B
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BS.L
import qualified Data.ByteString.Short as BS.SH
import qualified Data.Char as C
import qualified Data.Hashable as H
import qualified Data.HashMap.Strict as M.S
import qualified Data.IORef as IO.R
import qualified Data.Maybe as Y
import qualified Data.Text as T
import qualified Data.Text.IO as T.IO
import qualified Data.Text.Read as T.R
import qualified Data.Word as W

import qualified System.IO as IO
import qualified System.IO.Unsafe as IO.Unsafe

import Data.Functor ( ($>) )
import System.FilePath ( (<.>) )

import Paths_willow

import Web.Willow.Common.Encoding.Character
import Web.Willow.Common.Parser
import Web.Willow.Common.Parser.Util

import {-# SOURCE #-} Web.Willow.Common.Encoding


-- | Allow specifying binary data as ASCII, purely to simplify authoring.
toByte :: Char -> W.Word8
toByte = fromIntegral . fromEnum
{-# INLINE toByte #-}


-- | The underlying binary representations of 'asciiWhitespace', in any
-- ASCII-compatible encoding.
asciiWhitespaceB :: [W.Word8]
asciiWhitespaceB = map (fromIntegral . fromEnum) asciiWhitespace


-- | Map a byte representing an ASCII uppercase letter to its corresponding
-- lowercase value; compare with 'Data.Char.toLower'.
toAsciiLowerB :: W.Word8 -> W.Word8
toAsciiLowerB c
    | c >= toByte 'A' && c <= toByte 'Z' = c + 0x20
    | otherwise = c


-- | __Encoding:__
--      @[encoding]
--      (https://encoding.spec.whatwg.org/#encoding)@
-- 
-- All character encoding schemes supported by the HTML standard, defined as a
-- bidirectional map between characters and binary sequences.  'Utf8' is
-- strongly encouraged for new content (including all encoding purposes), but
-- the others are retained for compatibility with existing pages.
-- 
-- Note that none of these are complete functions, to one degree or another,
-- and that no guarantee is made that the mapping round-trips.
data Encoding
    = Utf8
        -- ^ The UTF-8 encoding for Unicode.
    | Utf16be
        -- ^ The UTF-16 encoding for Unicode, in big endian order.
        -- 
        -- No encoder is provided for this scheme.
    | Utf16le
        -- ^ The UTF-16 encoding for Unicode, in little endian order.
        -- 
        -- No encoder is provided for this scheme.
    | Big5
        -- ^ [Big5](https://encoding.spec.whatwg.org/big5.html), primarily
        -- covering traditional Chinese characters.
    | EucJp
        -- ^ EUC-JP, primarily covering Japanese as the union of [JIS-0208]
        -- (https://encoding.spec.whatwg.org/jis0208.html) and [JIS-0212]
        -- (https://encoding.spec.whatwg.org/jis0212.html).
    | EucKr
        -- ^ [EUC-KR](https://encoding.spec.whatwg.org/euc-kr.html), primarily
        -- covering Hangul.
    | Gb18030
        -- ^ The [GB18030-2005 extension]
        -- (https://encoding.spec.whatwg.org/gb18030.html) to GBK, with one
        -- tweak for web compatibility, primarily covering both forms of
        -- Chinese characters.
        -- 
        -- Note that this encoding also includes a large number of four-byte
        -- sequences which aren't listed in the linked visualization.
    | Gbk
        -- ^ GBK, primarily covering simplified Chinese characters.
        -- 
        -- In practice, this is just 'Gb18030' with a restricted set of
        -- encodable characters; the decoder is identical.
    | Ibm866
        -- ^ DOS and OS/2 [code page]
        -- (https://encoding.spec.whatwg.org/ibm866.html) for Cyrillic
        -- characters.
    | Iso2022Jp
        -- ^ A Japanese-focused implementation of the ISO 2022 meta-encoding,
        -- including both [JIS-0208]
        -- (https://encoding.spec.whatwg.org/jis0208.html) and halfwidth
        -- katakana.
    | Iso8859_2
        -- ^ [Latin-2](https://encoding.spec.whatwg.org/iso-8859-2.html)
        -- (Central European).
    | Iso8859_3
        -- ^ [Latin-3](https://encoding.spec.whatwg.org/iso-8859-3.html)
        -- (South European and Esperanto)
    | Iso8859_4
        -- ^ [Latin-4](https://encoding.spec.whatwg.org/iso-8859-4.html)
        -- (North European).
    | Iso8859_5
        -- ^ [Latin/Cyrillic](https://encoding.spec.whatwg.org/iso-8859-5.html).
    | Iso8859_6
        -- ^ [Latin/Arabic](https://encoding.spec.whatwg.org/iso-8859-6.html).
    | Iso8859_7
        -- ^ [Latin/Greek](https://encoding.spec.whatwg.org/iso-8859-7.html)
        -- (modern monotonic).
    | Iso8859_8
        -- ^ [Latin/Hebrew](https://encoding.spec.whatwg.org/iso-8859-8.html)
        -- (visual order).
    | Iso8859_8i
        -- ^ [Latin/Hebrew](https://encoding.spec.whatwg.org/iso-8859-8.html)
        -- (logical order).
    | Iso8859_10
        -- ^ [Latin-6](https://encoding.spec.whatwg.org/iso-8859-10.html)
        -- (Nordic).
    | Iso8859_13
        -- ^ [Latin-7](https://encoding.spec.whatwg.org/iso-8859-13.html)
        -- (Baltic Rim).
    | Iso8859_14
        -- ^ [Latin-8](https://encoding.spec.whatwg.org/iso-8859-14.html)
        -- (Celtic).
    | Iso8859_15
        -- ^ [Latin-9](https://encoding.spec.whatwg.org/iso-8859-15.html)
        -- (revision of ISO 8859-1 Latin-1, Western European).
    | Iso8859_16
        -- ^ [Latin-10](https://encoding.spec.whatwg.org/iso-8859-16.html)
        -- (South-Eastern European).
    | Koi8R
        -- ^ KOI-8 [specialized](https://encoding.spec.whatwg.org/koi8-r.html)
        -- for Russian Cyrillic.
    | Koi8U
        -- ^ KOI-8 [specialized](https://encoding.spec.whatwg.org/koi8-u.html)
        -- for Ukrainian Cyrillic.
    | Macintosh
        -- ^ [Mac OS Roman](https://encoding.spec.whatwg.org/macintosh.html).
    | MacintoshCyrillic -- "x-mac-cyrillic"
        -- ^ [Mac OS Cyrillic]
        -- (https://encoding.spec.whatwg.org/x-mac-cyrillic.html) (as of Mac OS
        -- 9.0)
    | ShiftJis
        -- ^ The [Windows variant]
        -- (https://encoding.spec.whatwg.org/shift_jis.html) (code page 932) of
        -- Shift JIS.
    | Windows874
        -- ^ ISO 8859-11 [Latin\/Thai]
        -- (https://encoding.spec.whatwg.org/windows-874.html) with Windows
        -- extensions in the C1 control character slots.
        -- 
        -- Note that this encoding is always used instead of pure Latin/Thai.
    | Windows1250
        -- ^ The Windows [extension and rearrangement]
        -- (https://encoding.spec.whatwg.org/windows-1250.html) of ISO 8859-2
        -- Latin-2.
    | Windows1251
        -- ^ [Windows Cyrillic]
        -- (https://encoding.spec.whatwg.org/windows-1251.html).
    | Windows1252
        -- ^ The Windows extension of ISO 8859-1 [Latin-1]
        -- (https://encoding.spec.whatwg.org/windows-1252.html), replacing most
        -- of the C1 control characters with printable glyphs.
        -- 
        -- Note that this encoding is always used instead of pure Latin-1.
    | Windows1253
        -- ^ [Windows Greek]
        -- (https://encoding.spec.whatwg.org/windows-1253.html) (modern
        -- monotonic).
    | Windows1254
        -- ^ The Windows extension of ISO 8859-9 [Latin-5 (Turkish)]
        -- (https://encoding.spec.whatwg.org/windows-1254.html), replacing most
        -- of the C1 control characters with printable glyphs.
        -- 
        -- Note that this encoding is always used instead of pure Latin-5.
    | Windows1255
        -- ^ The Windows [extension and rearrangement]
        -- (https://encoding.spec.whatwg.org/windows-1255.html) of ISO 8859-8
        -- Latin/Hebrew.
    | Windows1256
        -- ^ [Windows Arabic]
        -- (https://encoding.spec.whatwg.org/windows-1256.html).
    | Windows1257
        -- ^ [Windows Baltic]
        -- (https://encoding.spec.whatwg.org/windows-1257.html).
    | Windows1258
        -- ^ [Windows Vietnamese]
        -- (https://encoding.spec.whatwg.org/windows-1258.html).
    | Replacement
        -- ^ The input is reduced to a single @\\xFFFD@ replacement character.
        -- 
        -- No encoder is provided for this scheme.
    | UserDefined --"x-user-defined"
        -- ^ Non-ASCII bytes (@\\x80@ through @\\xFF@) are mapped to a portion
        -- of the Unicode Private Use Area (@\\xF780@ through @\\xF7FF@).
  deriving ( Eq, Ord, Bounded, Enum, Show, Read )
instance H.Hashable Encoding where
    hashWithSalt s = (+ s) . fromEnum


-- | All the data which needs to be tracked for correct behaviour in decoding a
-- binary stream into readable text.
data DecoderState = DecoderState
    { decoderConfidence_ :: Confidence
        -- ^ The encoding scheme currently in use by the parser, along with how
        -- likely that scheme actually represents the binary stream.
    , useBom :: Maybe Bool
        -- ^ Whether a byte-order mark at the beginning of the stream should
        -- override the encoding given by 'decoderConfidence_'.  A 'Nothing'
        -- value treats it as any other character, while a @'Just' 'False'@
        -- does the same if and only if it differs from the current encoding,
        -- silently consuming it (without output) if it matches.
    , innerDecoderState :: InnerDecoderState
        -- ^ Any state parameters specific to the encoding scheme used.  Note
        -- this value takes precidence over the 'Encoding' specified by
        -- 'decoderConfidence_' (but not the degree of confidence itself) if the
        -- two differ.
    , remainderBytes :: BS.SH.ShortByteString
        -- ^ Any leftover bytes at the end of the binary stream, which require
        -- further input to be processed in order to correctly map to a
        -- character or error value.
    }
  deriving ( Eq, Show, Read )

-- | __HTML:__
--      @[confidence]
--      (https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence)@
-- 
-- How likely the specified encoding is to be the actual stream encoding.
-- 
-- The spec names a third confidence level @irrelevant@, to be used when the
-- stream doesn't depend on any particular encoding scheme (i.e. it is
-- composed directly of 'Char's rather than parsed from a binary stream).  This
-- has not been included in the sum type, as it makes little sense to have that
-- as a parameter of the /decoding/ stage.  Use @'Maybe' 'DecoderState'@ to
-- represent it instead.
data Confidence
    = Tentative Encoding ReparseData
        -- ^ The binary stream is likely the named encoding, but more data may
        -- prove it to be something else.  In the latter case, the
        -- 'ReparseData' (if available) may be used to transition to the proper
        -- encoding, or restart the stream if necessary.
    | Certain Encoding
        -- ^ The binary stream is confirmed to be of the given encoding.
  deriving ( Eq, Show, Read )

-- | Extract the underlying encoding scheme from the wrapping data.
confidenceEncoding :: Confidence -> Encoding
confidenceEncoding (Tentative enc _) = enc
confidenceEncoding (Certain enc) = enc

-- | __HTML:__
--      @[change the encoding]
--      (https://html.spec.whatwg.org/multipage/parsing.html#change-the-encoding)@
-- 
-- The data required to determine if a new encoding would produce an identical
-- output to what the current one has already done, and to restart the parsing
-- with the new one if the two are incompatible.  Values may be easily
-- initialized via 'emptyReparseData'.
data ReparseData = ReparseData
    { parsedChars :: M.S.HashMap BS.SH.ShortByteString Char
        -- ^ The input binary sequences and the resulting characters which are
        -- already emitted to the output.
    , streamStart :: BS.L.ByteString
        -- ^ The complete binary sequence parsed thus far, in case it needs to
        -- be re-processed under a new, incompatible encoding.
    }
  deriving ( Eq, Show, Read )

-- | The collection of data which would indicate nothing has yet been parsed.
emptyReparseData :: ReparseData
emptyReparseData = ReparseData
    { parsedChars = M.S.empty
    , streamStart = BS.L.empty
    }


-- | All the data which needs to be tracked for correct behaviour in decoding a
-- binary stream into readable text.
data EncoderState = EncoderState
    { encoderScheme :: Encoding
        -- ^ The encoding scheme in use by the parser.
    , innerEncoderState :: InnerEncoderState
        -- ^ Any state parameters specific to the encoding scheme used.  Note
        -- that this value takes precidence over the 'Encoding' specified by
        -- 'encoderScheme' if the two differ.
    }
  deriving ( Eq, Show, Read )


-- | Test whether the byte is within the range of ASCII (only lower seven bits
-- may be set).
isAsciiByte :: W.Word8 -> Bool
isAsciiByte = (<= toByte '\DEL')


-- | Convert an ASCII character into its respective singleton byte.  Fails if
-- the character is not included in the basic ASCII set.
fromAscii :: Char -> StateBinaryBuilder state
fromAscii char
    | C.isAscii char = return . pure $ BS.SH.pack [fromIntegral $ fromEnum char]
    | otherwise = A.empty

-- | Convert a numeric value to its associated Unicode character, and wrap it
-- accordingly for the parsers.
toUnicode :: Integral a => [W.Word8] -> a -> StateTextBuilder state
toUnicode bs = emit bs . toEnum . fromIntegral

-- | Convert a numeric value to its associated Unicode character, and wrap it
-- accordingly for the parsers.
toUnicode1 :: W.Word8 -> StateTextBuilder state
toUnicode1 b = emit [b] . toEnum $ fromIntegral b

-- | Wrap a single character in the enclosing functors required to use it as
-- the output of a binary-to-text parser.
emit :: [W.Word8] -> Char -> StateTextBuilder state
emit bs c = do
    N.S.modify $ \state -> case fst state of
        Tentative enc d' -> (Tentative enc $ insertChar d', snd state)
          where insertChar d = d
                    { parsedChars = M.S.insert (BS.SH.pack bs) c $ parsedChars d
                    }
        _ -> state
    return . pure $ pure c


-- | Wrap several characters in the enclosing functors required to use them as
-- the output of a binary-to-text parser.
emit' :: [W.Word8] -> String -> StateTextBuilder state
emit' bs [c] = emit bs c
emit' bs cs = do
    -- This is not the actual character(s) emitted, but the string is
    -- inherantly not going to be the same as any single one emitted by any
    -- other encoding, so any non-equal value will work as a placeholder.
    _ <- emit bs '\NUL'
    return $ pure cs


-- | A parser combinator written as part of the algorithm to extract text from
-- a binary stream, carrying some persistent state.
type StateDecoder state = StateParser (Confidence, state) BS.ByteString

-- | A parser combinator written as part of the algorithm to extract text from
-- a binary stream.  For compatibility with the few encodings which require
-- tracking some state, all of the core bytes-to-text algorithms are written
-- within a null 'Control.Monad.State.Lazy.MonadState'.
type Decoder = StateDecoder ()

-- | The fallible output of one of the core bytes-to-text parsers.  If some
-- binary sequence proves uninterpretable, the 'Left' value contains the
-- original sequence unchanged; such a value should be either ignored or
-- reported verbatim, with no further fallback attempts at parsing.
type DecoderError out = Either BS.SH.ShortByteString out


-- | Fetch the current value of the state determining how to transform a binary
-- stream into a 'Char' representation within an encoding scheme.
getDecoderState :: StateDecoder state state
getDecoderState = snd <$> N.S.get

-- | Update the current state of the binary-to-'Char' parser according to the
-- given function.
modifyDecoderState :: (state -> state) -> StateDecoder state ()
modifyDecoderState f = N.S.modify $ F.B.second f


-- | A parser combinator written as part of the algorithm to serialize text
-- into a binary stream, carrying some persistent state.
type StateEncoder state = StateParser state T.Text

-- | A parser combinator written as part of the algorithm to serialize text
-- into a binary stream.  For compatibility with the few encodings which
-- require tracking some state, all of the core bytes-to-text algorithms are
-- written within a null 'Control.Monad.State.Lazy.MonadState'.
type Encoder = StateEncoder ()

-- | The fallible output of one of the core text-to-bytes parsers.  If some
-- character is not representable in the encoding, the 'Left' value contains
-- the original 'Char' unchanged; such a value should be either ignored or
-- reported verbatim, with no further fallback attempts at encoding.
type EncoderError out = Either Char out


-- | Fetch the current value of the state determining how to serialize 'Char's
-- into a binary representation within an encoding scheme.
getEncoderState :: StateEncoder state state
getEncoderState = N.S.get

-- | Update the current state of the 'Char'-to-binary parser according to the
-- given function.
modifyEncoderState :: (state -> state) -> StateEncoder state ()
modifyEncoderState = N.S.modify


-- | Efficient construction of 'T.Text' strings from a 'BS.ByteString'.  This
-- will be called in a loop until failure or the end of the stream, and so
-- should return a sensible minimum string given the restriction of consuming
-- whole bytes (e.g. a UTF-7 'T.B.Builder' might translate blocks of eight
-- bytes into three 'Char's, while UTF-8 would translate a 'Char' at a time
-- from a variable number of bytes).
-- 
-- If the current such block is invalid for the encoding, it should be returned
-- in the 'Left' instead of any "sensible" default, so failure only occurs at
-- the end of the stream.  Note that a failing base case must eventually be
-- reached on an empty stream, or the generated 'T.Text' will be infinite.
type TextBuilder = Decoder (DecoderError String)

-- | As 'TextBuilder', for those few encodings which require tracking a
-- persistent state across character boundaries.
type StateTextBuilder state = StateDecoder state (DecoderError String)


-- | Efficient construction of 'BS.ByteString's from a 'T.Text' string.  This
-- will be called in a loop until failure or the end of the stream, and so
-- should operate on a single 'Char' at a time unless some property of the
-- encoding requires a longer span (e.g. UTF-7 requires three 'Char's to return
-- to a byte boundary).  No attempt at Unicode (de)composition is attempted,
-- even if that would succeed where the input form fails.
-- 
-- If the current 'Char' can not be represented in the encoding, it should be
-- returned as a 'Left' value, so failure only occurs at the end of the stream.
-- Note that a failing base case must eventually be reached on an empty stream,
-- or the generated 'BS.ByteString' will be infinite.
type BinaryBuilder = Encoder (EncoderError BS.SH.ShortByteString)

-- | As 'BinaryBuilder', for those few encodings which require tracking a
-- persistent state across character boundaries.
type StateBinaryBuilder state = StateEncoder state (EncoderError BS.SH.ShortByteString)


-- | Pack the given binary sequence as a failed parse.
decoderFailure :: [W.Word8] -> StateTextBuilder state
decoderFailure bs = do
    _ <- emit bs replacementChar
    return . Left $ BS.SH.pack bs

-- | Pack the given byte as a singleton sequence triggering a failed parse.
decoderFailure1 :: W.Word8 -> StateTextBuilder state
decoderFailure1 b = decoderFailure [b]

-- | Combine the given bytes as a binary sequence triggering a failed parse.
-- This is purely driven by a few fallback statements in multi-byte encodings,
-- where the lead byte is known from the invocation, and the trail byte is
-- piped in from the 'Web.Willow.Common.Parser.Switch.switch' block.
decoderFailure2 :: W.Word8 -> W.Word8 -> StateTextBuilder state
decoderFailure2 b b' = decoderFailure [b, b']

-- | Pack the given character as invalid for the target encoding.
encoderFailure :: Char -> StateEncoder state (EncoderError out)
encoderFailure = return . Left


-- | Retrieve a character index file distributed as part of the HTML standard,
-- and process it into an easily-digestible type.  Note that this is a heavy
-- function and should be cached whenever possible.
-- 
-- This may cause a panic if the encoding index file doesn't exist or the user
-- doesn't have the proper permissions to read it.
loadIndex :: String -> [(Word, Char)]
loadIndex = loadIndex' $ const True

-- | Retrieve a character index file distributed as part of the HTML standard,
-- and process it into an easily-digestible type, filtering out a selection of
-- the original values in the process.  Note that this is a heavy function and
-- should be cached whenever possible.
-- 
-- This may cause a panic if the encoding index file doesn't exist or the user
-- doesn't have the proper permissions to read it.
loadIndex' :: ((Word, Char) -> Bool) -> String -> [(Word, Char)]
loadIndex' test name = IO.Unsafe.unsafePerformIO $ do
    path <- getDataFileName $ name <.> "index"
    -- In addition to the expected errors above, this may theoretically throw
    -- an 'IO.isAlreadyInUseError'; however, that should mostly be an issue
    -- when writing, rather than in 'IO.ReadMode'.
    index <- IO.withFile path IO.ReadMode T.IO.hGetContents
    return . Y.mapMaybe (indexLine test) $ T.lines index
{-# NOINLINE loadIndex' #-}

-- | Process a line in the format used by the HTML index files, discarding the
-- result if it fails the given predicate.
indexLine :: ((Word, Char) -> Bool) -> T.Text -> Maybe (Word, Char)
indexLine test l
    | T.null l = Nothing
    | T.head l == '#' = Nothing
    | otherwise = indexLine' $ T.split (== '\t') l
  where indexLine' (i':c':_) = do
            i <- either fail pure . T.R.decimal $ T.strip i'
            c <- either fail pure . T.R.hexadecimal $ T.strip c'
            let entry = (fst i, toEnum $ fst c)
            if test entry then return entry else Nothing
        indexLine' _ = Nothing


-- | Memoization table for lookups into the CJK index files.  Storing the
-- entire thing in a 'Data.Vector.Vector' or other indexable form would take a
-- megabyte of memory each in the best case, and many characters would likely
-- never be used.  At the same time, performing a file lookup on every
-- character would be unacceptably slow.  Instead, take the performance hit
-- /once/ per character, and store the result for faster access in future
-- calls.
-- 
-- Depending on the frequency of the updates, some updates (via
-- @updateIndices@) may be lost.  This should still be a smaller performance
-- hit than getting blocked while several other threads try to do the same.
type MemoizationTable k v = IO.R.IORef (M.S.HashMap k v)

-- | Specialization of the CJK lookup table for parsers comprising a
-- bytes-to-text algorithm.
type DecoderMemoTable = IO (MemoizationTable Word (Maybe Char))

-- | Specialization of the CJK lookup table for parsers comprising a
-- text-to-bytes algorithm.
type EncoderMemoTable = IO (MemoizationTable Char (Maybe Word))

-- | Allocate a lookup table for caching the results of expensive computations.
-- Note that the table is mutable and threaded access should be reasoned
-- through carefully.
newMemoizationTable :: IO (MemoizationTable k v)
newMemoizationTable = IO.R.newIORef M.S.empty


-- | Look for the index of a given character, checking chached values before
-- running a (potentially expensive) direct retrieval function.  Specialized
-- for symmetric lookup functions, and so populates the inverse table at the
-- same time, if given one.
lookupMemoizedIndex
    :: (Eq k, H.Hashable k, Eq v, H.Hashable v)
    => IO (MemoizationTable k (Maybe v))
    -> Maybe (IO (MemoizationTable v (Maybe k)))
    -> k
    -> (k -> Maybe v)
    -> Maybe v
lookupMemoizedIndex memo inverse key valueF = IO.Unsafe.unsafePerformIO $ do
    -- 'IO.Unsafe.unsafePerformIO' should be safe, as the only effect the
    -- environment theoretically has on the computation is the time it takes to
    -- complete (i.e. whether that particular character has been memoized yet).
    is <- memo >>= IO.R.readIORef
    case M.S.lookup key is of
        Just i -> return i
        Nothing -> updateIndices key (valueF key) memo inverse
{-# NOINLINE lookupMemoizedIndex #-}

-- | Given either an index offset or a 'Char' code point, and whether the other
-- was found in the on-disc index file, update one or both memoization tables
-- to avoid having to look for it again.
updateIndices
    :: (Eq k, H.Hashable k, Eq v, H.Hashable v)
    => k
    -> Maybe v
    -> IO (MemoizationTable k (Maybe v))
    -> Maybe (IO (MemoizationTable v (Maybe k)))
    -> IO (Maybe v)
updateIndices k Nothing l _ =
    (l >>= IO.C.forkIO . flip IO.R.modifyIORef' (M.S.insert k Nothing)) $> Nothing
updateIndices k v'@(Just v) l Nothing =
    (l >>= IO.C.forkIO . flip IO.R.modifyIORef' (M.S.insert k $ Just v)) $> v'
updateIndices k v'@(Just v) l (Just r) =
    (l >>= IO.C.forkIO . flip IO.R.modifyIORef' (M.S.insert k $ Just v)) *>
    (r >>= IO.C.forkIO . flip IO.R.modifyIORef' (M.S.insert v $ Just k)) $> v'

-- | Find the value of a given key in an ordered list.  At the moment, just an
-- alias for an O(n) 'lookup', but will hopefully be improved at some point.
search :: Ord k => k -> [(k, v)] -> Maybe v
search = lookup