{-# LANGUAGE Trustworthy #-}

{-|
Description:    HTML named character reference definitions and lookup.

Copyright:      (c) 2020 Sam May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      provisional
Portability:    portable

The __[HTML](https://html.spec.whatwg.org/)__ standard defines a large number
of iconic character names for accessing commonly-used characters outside of the
file encoding, or just the capabilities of the keyboard used.  A simple
implementation would be a @'M.HashMap' 'String' 'String'@ (as a few names map
to multiple Unicode characters), but unfortunately the compatibility
restrictions of the parsing algorithm mean that type would be
less-than-performant; as the parser needs to check for a valid reference on
every character, the full reference pool would need to be searched each time
for @O(n*log m)@ with a large @m@.  (While @m@ can be reduced by filtering the
map at each step, amortized for @O(n*m*log m)@, there's no guarantee the
smaller @m@ is ultimately an improvement.)

This module instead organizes the character references into a search tree
indexed by 'Char', allowing each test step to operate over a much smaller
search space for @O(n*log 62)@, at the expense of slightly greater space
overhead.
-}
module Web.Mangrove.Parse.Common.Character
    ( CharacterReferenceTree ( .. )
    , ReferenceValue ( .. )
    , lookupCharacterReference
    , characterReferences
    ) where


import qualified Data.Aeson as J
import qualified Data.Bifunctor as F.B
import qualified Data.Either as E
import qualified Data.HashMap.Strict as M
import qualified Data.Text as T

import qualified System.IO.Unsafe as IO.Unsafe

import Paths_mangrove

import Control.Applicative ( (<|>) )
import Data.Aeson ( (.:) )
import System.FilePath ( (<.>) )


-- | __HTML:__
--      @[named character references]
--      (https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references)@
-- 
-- The decomposition of the HTML named character reference table into a
-- search-optimized form.  The value type consists of the character reference
-- 'String' whose name terminates with the key 'Char', if one exists, alongside
-- any character references whose names are prefixed appropriately.
-- 
-- The ampersand and semicolon delimiting the character reference are not
-- considered part of the name for storage; the former is silently dropped,
-- while the latter is indicated by the value of 'isSemicolonOptional'.
-- 
-- For example, a minimal tree defining only the reference names @"&cent"@,
-- @"&cent;"@, and @"&centerdot;"@ would have the structure:
-- 
-- > [ ( 'c'
-- >   , Nothing
-- >   , [ ( 'e'
-- >       , Nothing
-- >       , [ ( 'n'
-- >           , Nothing
-- >           , [ ( 't'
-- >               , Just $ ReferenceValue True '\xA2'
-- >               , [ ( 'e'
-- >                   , Nothing
-- >                   , [ ( 'r'
-- >                       , Nothing
-- >                       , [ ( 'd'
-- >                           , Nothing
-- >                           , [ ( 'o'
-- >                               , Nothing
-- >                               , [ ( 't'
-- >                                   , Just $ ReferenceValue False '\xB7'
-- >                                   , []
-- >   ) ] ) ] ) ] ) ] ) ] ) ] ) ] ) ] ) ]
newtype CharacterReferenceTree =
    CharacterReferenceTree (M.HashMap Char (Maybe ReferenceValue, CharacterReferenceTree))
  deriving ( Eq, Show, Read )


-- | A collection of data describing how to replace some named character
-- reference with a Unicode character sequence.
data ReferenceValue = ReferenceValue
    { isSemicolonOptional :: Bool
        -- ^ Whether the reference allows a compatibility form without a
        -- terminating semicolon.
    , referenceValue :: String
        -- ^ The 'Char'(s) to insert into the document in place of the
        -- reference.
    }
  deriving ( Eq, Show, Read )


-- | User-friendly access into 'characterReferences', if the full name of the
-- potential character reference is already known.  Note that the underlying
-- map isn't structured as a traditional 'M.HashMap', and so lookup is @O(n)@
-- over the length of the name rather than @O(log m)@ over the size of the map.
-- 
-- This doesn't perform the longest-match calculations described by the HTML
-- standard, just a simple "does this string match a reference name" as if the
-- underlying structure were a flat @'M.HashMap' 'String' ref@.  The leading
-- ampersand and trailing semicolon may be present, but neither is required.
lookupCharacterReference :: String -> Maybe ReferenceValue
lookupCharacterReference = lookupCharacterReference' characterReferences . dropAmpersand
  where dropAmpersand ('&':cs) = cs
        dropAmpersand cs = cs

-- | Iterate through the reference tree according to the remainder of the
-- reference name.
lookupCharacterReference' :: CharacterReferenceTree -> String -> Maybe ReferenceValue
lookupCharacterReference' _ [] = Nothing
lookupCharacterReference' (CharacterReferenceTree refs) [c] = M.lookup c refs >>= fst
lookupCharacterReference' (CharacterReferenceTree refs) [c, ';'] = M.lookup c refs >>= fst
lookupCharacterReference' (CharacterReferenceTree refs) (c:cs) = do
    (_, refs') <- M.lookup c refs
    lookupCharacterReference' refs' cs


-- | The full set of named character references defined by the HTML standard,
-- in a search-optimized form.  Unless the potential reference name isn't
-- completely and unambiguously known (e.g., during the resolution algorithm
-- described by the HTML standard), 'lookupCharacterReference' is the better
-- interface to use.
-- 
-- Uses 'IO.Unsafe.unsafePerformIO' internally, as the underlying file should
-- never change at runtime, and so every evaluation would be pure.
characterReferences :: CharacterReferenceTree
characterReferences = IO.Unsafe.unsafePerformIO $ do
    entities <- getDataFileName $ "entities" <.> "json"
    maybe (CharacterReferenceTree M.empty) repackReferences <$> J.decodeFileStrict entities
{-# NOINLINE characterReferences #-}


-- | Given a naïve map of character reference names to Unicode values, optimize
-- it for searching character-by-character.
repackReferences :: M.HashMap String CharacterData -> CharacterReferenceTree
repackReferences = foldr (repackReferences' . F.B.bimap (drop 1) (map toEnum . codepoints))
    (CharacterReferenceTree M.empty) . M.toList

-- | Add a single key-value character reference pair to the growing search tree.
repackReferences' :: (String, String) -> CharacterReferenceTree -> CharacterReferenceTree
repackReferences' ref (CharacterReferenceTree refs) = CharacterReferenceTree $
    M.unionWith joinTree (singletonReferenceTree ref) refs

-- | 'M.unionWith' any character references sharing a prefix.  Note that if
-- multiple references have the same name (modulo semicolons) but different
-- values, the resulting value isn't necessarily predictable, nor guaranteed to
-- be stable between even minor library versions.
joinTree
    :: (Maybe ReferenceValue, CharacterReferenceTree)
        -- ^ The reference value to be added at some character.
    -> (Maybe ReferenceValue, CharacterReferenceTree)
        -- ^ The reference value already existing in the map.
    -> (Maybe ReferenceValue, CharacterReferenceTree)
joinTree (l, CharacterReferenceTree ls) (r, CharacterReferenceTree rs) =
    (fmap checkOptional $ l <|> r, CharacterReferenceTree $ M.unionWith joinTree ls rs)
  where checkOptional ref = ref
            { isSemicolonOptional =
                maybe False isSemicolonOptional l || maybe False isSemicolonOptional r
            }

-- | Generate a search tree containing the single key-value character reference
-- pair, to be merged into a larger accumulation.
singletonReferenceTree
    :: (String, String)
    -> M.HashMap Char (Maybe ReferenceValue, CharacterReferenceTree)
singletonReferenceTree (key, ref) = E.fromRight M.empty $
    foldr singletonReferenceTree' (Left True) key
  where singletonReferenceTree' ';' _ = Left False
        singletonReferenceTree' c (Left semicolon) = Right $
            M.singleton c (Just $ ReferenceValue semicolon ref, CharacterReferenceTree M.empty)
        singletonReferenceTree' c (Right ref') = Right $
            M.singleton c (Nothing, CharacterReferenceTree ref')


-- | Internal representation of the character reference definitions in the
-- @entities.json@ file; the names are provided by JSON dictionary keys.
newtype CharacterData = CharacterData
    { codepoints :: [Int]
        -- ^ The Unicode code points represented by a given name.
 -- , characters :: String
 --     Uses surrogate character points rather than high-Unicode characters,
 --     and so isn't as desirable to build from.
    }
  deriving ( Eq, Show, Read )
instance J.FromJSON CharacterData where
    parseJSON = J.withObject "reference" $ \v -> CharacterData
        <$> v .: T.pack "codepoints"
     -- <*> v .: T.pack "characters"