{-# LANGUAGE Trustworthy #-} {-| Description: HTML named character reference definitions and lookup. Copyright: (c) 2020 Sam May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable The __[HTML](https://html.spec.whatwg.org/)__ standard defines a large number of iconic character names for accessing commonly-used characters outside of the file encoding, or just the capabilities of the keyboard used. A simple implementation would be a @'M.HashMap' 'String' 'String'@ (as a few names map to multiple Unicode characters), but unfortunately the compatibility restrictions of the parsing algorithm mean that type would be less-than-performant; as the parser needs to check for a valid reference on every character, the full reference pool would need to be searched each time for @O(n*log m)@ with a large @m@. (While @m@ can be reduced by filtering the map at each step, amortized for @O(n*m*log m)@, there's no guarantee the smaller @m@ is ultimately an improvement.) This module instead organizes the character references into a search tree indexed by 'Char', allowing each test step to operate over a much smaller search space for @O(n*log 62)@, at the expense of slightly greater space overhead. -} module Web.Mangrove.Parse.Common.Character ( CharacterReferenceTree ( .. ) , ReferenceValue ( .. ) , lookupCharacterReference , characterReferences ) where import qualified Data.Aeson as J import qualified Data.Bifunctor as F.B import qualified Data.Either as E import qualified Data.HashMap.Strict as M import qualified Data.Text as T import qualified System.IO.Unsafe as IO.Unsafe import Paths_mangrove import Control.Applicative ( (<|>) ) import Data.Aeson ( (.:) ) import System.FilePath ( (<.>) ) -- | __HTML:__ -- @[named character references] -- (https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references)@ -- -- The decomposition of the HTML named character reference table into a -- search-optimized form. The value type consists of the character reference -- 'String' whose name terminates with the key 'Char', if one exists, alongside -- any character references whose names are prefixed appropriately. -- -- The ampersand and semicolon delimiting the character reference are not -- considered part of the name for storage; the former is silently dropped, -- while the latter is indicated by the value of 'isSemicolonOptional'. -- -- For example, a minimal tree defining only the reference names @"¢"@, -- @"¢"@, and @"·"@ would have the structure: -- -- > [ ( 'c' -- > , Nothing -- > , [ ( 'e' -- > , Nothing -- > , [ ( 'n' -- > , Nothing -- > , [ ( 't' -- > , Just $ ReferenceValue True '\xA2' -- > , [ ( 'e' -- > , Nothing -- > , [ ( 'r' -- > , Nothing -- > , [ ( 'd' -- > , Nothing -- > , [ ( 'o' -- > , Nothing -- > , [ ( 't' -- > , Just $ ReferenceValue False '\xB7' -- > , [] -- > ) ] ) ] ) ] ) ] ) ] ) ] ) ] ) ] ) ] newtype CharacterReferenceTree = CharacterReferenceTree (M.HashMap Char (Maybe ReferenceValue, CharacterReferenceTree)) deriving ( Eq, Show, Read ) -- | A collection of data describing how to replace some named character -- reference with a Unicode character sequence. data ReferenceValue = ReferenceValue { isSemicolonOptional :: Bool -- ^ Whether the reference allows a compatibility form without a -- terminating semicolon. , referenceValue :: String -- ^ The 'Char'(s) to insert into the document in place of the -- reference. } deriving ( Eq, Show, Read ) -- | User-friendly access into 'characterReferences', if the full name of the -- potential character reference is already known. Note that the underlying -- map isn't structured as a traditional 'M.HashMap', and so lookup is @O(n)@ -- over the length of the name rather than @O(log m)@ over the size of the map. -- -- This doesn't perform the longest-match calculations described by the HTML -- standard, just a simple "does this string match a reference name" as if the -- underlying structure were a flat @'M.HashMap' 'String' ref@. The leading -- ampersand and trailing semicolon may be present, but neither is required. lookupCharacterReference :: String -> Maybe ReferenceValue lookupCharacterReference = lookupCharacterReference' characterReferences . dropAmpersand where dropAmpersand ('&':cs) = cs dropAmpersand cs = cs -- | Iterate through the reference tree according to the remainder of the -- reference name. lookupCharacterReference' :: CharacterReferenceTree -> String -> Maybe ReferenceValue lookupCharacterReference' _ [] = Nothing lookupCharacterReference' (CharacterReferenceTree refs) [c] = M.lookup c refs >>= fst lookupCharacterReference' (CharacterReferenceTree refs) [c, ';'] = M.lookup c refs >>= fst lookupCharacterReference' (CharacterReferenceTree refs) (c:cs) = do (_, refs') <- M.lookup c refs lookupCharacterReference' refs' cs -- | The full set of named character references defined by the HTML standard, -- in a search-optimized form. Unless the potential reference name isn't -- completely and unambiguously known (e.g., during the resolution algorithm -- described by the HTML standard), 'lookupCharacterReference' is the better -- interface to use. -- -- Uses 'IO.Unsafe.unsafePerformIO' internally, as the underlying file should -- never change at runtime, and so every evaluation would be pure. characterReferences :: CharacterReferenceTree characterReferences = IO.Unsafe.unsafePerformIO $ do entities <- getDataFileName $ "entities" <.> "json" maybe (CharacterReferenceTree M.empty) repackReferences <$> J.decodeFileStrict entities {-# NOINLINE characterReferences #-} -- | Given a naïve map of character reference names to Unicode values, optimize -- it for searching character-by-character. repackReferences :: M.HashMap String CharacterData -> CharacterReferenceTree repackReferences = foldr (repackReferences' . F.B.bimap (drop 1) (map toEnum . codepoints)) (CharacterReferenceTree M.empty) . M.toList -- | Add a single key-value character reference pair to the growing search tree. repackReferences' :: (String, String) -> CharacterReferenceTree -> CharacterReferenceTree repackReferences' ref (CharacterReferenceTree refs) = CharacterReferenceTree $ M.unionWith joinTree (singletonReferenceTree ref) refs -- | 'M.unionWith' any character references sharing a prefix. Note that if -- multiple references have the same name (modulo semicolons) but different -- values, the resulting value isn't necessarily predictable, nor guaranteed to -- be stable between even minor library versions. joinTree :: (Maybe ReferenceValue, CharacterReferenceTree) -- ^ The reference value to be added at some character. -> (Maybe ReferenceValue, CharacterReferenceTree) -- ^ The reference value already existing in the map. -> (Maybe ReferenceValue, CharacterReferenceTree) joinTree (l, CharacterReferenceTree ls) (r, CharacterReferenceTree rs) = (fmap checkOptional $ l <|> r, CharacterReferenceTree $ M.unionWith joinTree ls rs) where checkOptional ref = ref { isSemicolonOptional = maybe False isSemicolonOptional l || maybe False isSemicolonOptional r } -- | Generate a search tree containing the single key-value character reference -- pair, to be merged into a larger accumulation. singletonReferenceTree :: (String, String) -> M.HashMap Char (Maybe ReferenceValue, CharacterReferenceTree) singletonReferenceTree (key, ref) = E.fromRight M.empty $ foldr singletonReferenceTree' (Left True) key where singletonReferenceTree' ';' _ = Left False singletonReferenceTree' c (Left semicolon) = Right $ M.singleton c (Just $ ReferenceValue semicolon ref, CharacterReferenceTree M.empty) singletonReferenceTree' c (Right ref') = Right $ M.singleton c (Nothing, CharacterReferenceTree ref') -- | Internal representation of the character reference definitions in the -- @entities.json@ file; the names are provided by JSON dictionary keys. newtype CharacterData = CharacterData { codepoints :: [Int] -- ^ The Unicode code points represented by a given name. -- , characters :: String -- Uses surrogate character points rather than high-Unicode characters, -- and so isn't as desirable to build from. } deriving ( Eq, Show, Read ) instance J.FromJSON CharacterData where parseJSON = J.withObject "reference" $ \v -> CharacterData <$> v .: T.pack "codepoints" -- <*> v .: T.pack "characters"