{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
   Module      : Text.Pandoc.XML
   Copyright   : Copyright (C) 2006-2023 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
                         escapeStringForXML,
                         escapeNCName,
                         inTags,
                         selfClosingTag,
                         inTagsSimple,
                         inTagsIndented,
                         toEntities,
                         toHtml5Entities,
                         fromEntities,
                         lookupEntity,
                         html4Attributes,
                         html5Attributes,
                         rdfaAttributes ) where

import Data.Char (isAscii, isSpace, ord, isLetter, isDigit)
import Data.Text (Text)
import qualified Data.Text as T
import Commonmark.Entity (lookupEntity)
import Text.HTML.TagSoup.Entity (htmlEntities)
import Text.DocLayout
    ( ($$), char, hcat, nest, text, Doc, HasChars )
import Text.Printf (printf)
import qualified Data.Map as M
import Data.String ( IsString )
import qualified Data.Set as Set

-- | Escape one character as needed for XML.
escapeCharForXML :: Char -> Text
escapeCharForXML :: Char -> Text
escapeCharForXML Char
x = case Char
x of
                       Char
'&' -> Text
"&amp;"
                       Char
'<' -> Text
"&lt;"
                       Char
'>' -> Text
"&gt;"
                       Char
'"' -> Text
"&quot;"
                       Char
c   -> Char -> Text
T.singleton Char
c

-- | Escape string as needed for XML.  Entity references are not preserved.
escapeStringForXML :: Text -> Text
escapeStringForXML :: Text -> Text
escapeStringForXML = (Char -> Text) -> Text -> Text
T.concatMap Char -> Text
escapeCharForXML forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Text -> Text
T.filter Char -> Bool
isLegalXMLChar
  where isLegalXMLChar :: Char -> Bool
isLegalXMLChar Char
c = Char
c forall a. Eq a => a -> a -> Bool
== Char
'\t' Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'\n' Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'\r' Bool -> Bool -> Bool
||
                           (Char
c forall a. Ord a => a -> a -> Bool
>= Char
'\x20' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\xD7FF') Bool -> Bool -> Bool
||
                           (Char
c forall a. Ord a => a -> a -> Bool
>= Char
'\xE000' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\xFFFD') Bool -> Bool -> Bool
||
                           (Char
c forall a. Ord a => a -> a -> Bool
>= Char
'\x10000' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x10FFFF')
  -- see https://www.w3.org/TR/xml/#charsets

-- | Escape newline characters as &#10;
escapeNls :: Text -> Text
escapeNls :: Text -> Text
escapeNls = (Char -> Text) -> Text -> Text
T.concatMap forall a b. (a -> b) -> a -> b
$ \case
  Char
'\n' -> Text
"&#10;"
  Char
c    -> Char -> Text
T.singleton Char
c

-- | Return a text object with a string of formatted XML attributes.
attributeList :: (HasChars a, IsString a) => [(Text, Text)] -> Doc a
attributeList :: forall a. (HasChars a, IsString a) => [(Text, Text)] -> Doc a
attributeList = forall a. [Doc a] -> Doc a
hcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map
  (\(Text
a, Text
b) -> forall a. HasChars a => String -> Doc a
text (Text -> String
T.unpack forall a b. (a -> b) -> a -> b
$ Text
" " forall a. Semigroup a => a -> a -> a
<> Text -> Text
escapeStringForXML Text
a forall a. Semigroup a => a -> a -> a
<> Text
"=\"" forall a. Semigroup a => a -> a -> a
<>
  Text -> Text
escapeNls (Text -> Text
escapeStringForXML Text
b) forall a. Semigroup a => a -> a -> a
<> Text
"\""))

-- | Put the supplied contents between start and end tags of tagType,
--   with specified attributes and (if specified) indentation.
inTags :: (HasChars a, IsString a)
      => Bool -> Text -> [(Text, Text)] -> Doc a -> Doc a
inTags :: forall a.
(HasChars a, IsString a) =>
Bool -> Text -> [(Text, Text)] -> Doc a -> Doc a
inTags Bool
isIndented Text
tagType [(Text, Text)]
attribs Doc a
contents =
  let openTag :: Doc a
openTag = forall a. HasChars a => Char -> Doc a
char Char
'<' forall a. Semigroup a => a -> a -> a
<> forall a. HasChars a => String -> Doc a
text (Text -> String
T.unpack Text
tagType) forall a. Semigroup a => a -> a -> a
<> forall a. (HasChars a, IsString a) => [(Text, Text)] -> Doc a
attributeList [(Text, Text)]
attribs forall a. Semigroup a => a -> a -> a
<>
                forall a. HasChars a => Char -> Doc a
char Char
'>'
      closeTag :: Doc a
closeTag  = forall a. HasChars a => String -> Doc a
text String
"</" forall a. Semigroup a => a -> a -> a
<> forall a. HasChars a => String -> Doc a
text (Text -> String
T.unpack Text
tagType) forall a. Semigroup a => a -> a -> a
<> forall a. HasChars a => Char -> Doc a
char Char
'>'
  in  if Bool
isIndented
         then Doc a
openTag forall a. Doc a -> Doc a -> Doc a
$$ forall a. IsString a => Int -> Doc a -> Doc a
nest Int
2 Doc a
contents forall a. Doc a -> Doc a -> Doc a
$$ Doc a
closeTag
         else Doc a
openTag forall a. Semigroup a => a -> a -> a
<> Doc a
contents forall a. Semigroup a => a -> a -> a
<> Doc a
closeTag

-- | Return a self-closing tag of tagType with specified attributes
selfClosingTag :: (HasChars a, IsString a)
               => Text -> [(Text, Text)] -> Doc a
selfClosingTag :: forall a.
(HasChars a, IsString a) =>
Text -> [(Text, Text)] -> Doc a
selfClosingTag Text
tagType [(Text, Text)]
attribs =
  forall a. HasChars a => Char -> Doc a
char Char
'<' forall a. Semigroup a => a -> a -> a
<> forall a. HasChars a => String -> Doc a
text (Text -> String
T.unpack Text
tagType) forall a. Semigroup a => a -> a -> a
<> forall a. (HasChars a, IsString a) => [(Text, Text)] -> Doc a
attributeList [(Text, Text)]
attribs forall a. Semigroup a => a -> a -> a
<> forall a. HasChars a => String -> Doc a
text String
" />"

-- | Put the supplied contents between start and end tags of tagType.
inTagsSimple :: (HasChars a, IsString a)
             => Text -> Doc a -> Doc a
inTagsSimple :: forall a. (HasChars a, IsString a) => Text -> Doc a -> Doc a
inTagsSimple Text
tagType = forall a.
(HasChars a, IsString a) =>
Bool -> Text -> [(Text, Text)] -> Doc a -> Doc a
inTags Bool
False Text
tagType []

-- | Put the supplied contents in indented block btw start and end tags.
inTagsIndented :: (HasChars a, IsString a)
               => Text -> Doc a -> Doc a
inTagsIndented :: forall a. (HasChars a, IsString a) => Text -> Doc a -> Doc a
inTagsIndented Text
tagType = forall a.
(HasChars a, IsString a) =>
Bool -> Text -> [(Text, Text)] -> Doc a -> Doc a
inTags Bool
True Text
tagType []

-- | Escape all non-ascii characters using numerical entities.
toEntities :: Text -> Text
toEntities :: Text -> Text
toEntities = (Char -> Text) -> Text -> Text
T.concatMap Char -> Text
go
  where go :: Char -> Text
go Char
c | Char -> Bool
isAscii Char
c = Char -> Text
T.singleton Char
c
             | Bool
otherwise = String -> Text
T.pack (forall r. PrintfType r => String -> r
printf String
"&#x%X;" (Char -> Int
ord Char
c))

-- | Escape all non-ascii characters using HTML5 entities, falling
-- back to numerical entities.
toHtml5Entities :: Text -> Text
toHtml5Entities :: Text -> Text
toHtml5Entities = (Char -> Text) -> Text -> Text
T.concatMap Char -> Text
go
  where go :: Char -> Text
go Char
c | Char -> Bool
isAscii Char
c = Char -> Text
T.singleton Char
c
             | Bool
otherwise =
                 case forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup Char
c Map Char Text
html5EntityMap of
                   Just Text
t  -> Char -> Text
T.singleton Char
'&' forall a. Semigroup a => a -> a -> a
<> Text
t forall a. Semigroup a => a -> a -> a
<> Char -> Text
T.singleton Char
';'
                   Maybe Text
Nothing -> String -> Text
T.pack (String
"&#" forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show (Char -> Int
ord Char
c) forall a. [a] -> [a] -> [a]
++ String
";")

html5EntityMap :: M.Map Char Text
html5EntityMap :: Map Char Text
html5EntityMap = forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr forall {k}. Ord k => (String, [k]) -> Map k Text -> Map k Text
go forall a. Monoid a => a
mempty [(String, String)]
htmlEntities
  where go :: (String, [k]) -> Map k Text -> Map k Text
go (String
ent, [k]
s) Map k Text
entmap =
         case [k]
s of
           [k
c] -> forall k a. Ord k => (a -> a -> a) -> k -> a -> Map k a -> Map k a
M.insertWith
                   (\Text
new Text
old -> if Text -> Int
T.length Text
new forall a. Ord a => a -> a -> Bool
> Text -> Int
T.length Text
old
                                   then Text
old
                                   else Text
new) k
c Text
ent' Map k Text
entmap
                  where ent' :: Text
ent' = (Char -> Bool) -> Text -> Text
T.takeWhile (forall a. Eq a => a -> a -> Bool
/=Char
';') (String -> Text
T.pack String
ent)
           [k]
_ -> Map k Text
entmap

-- | Converts a string into an NCName, i.e., an XML name without colons.
-- Disallowed characters are escaped using @ux%x@, where @%x@ is the
-- hexadecimal unicode identifier of the escaped character.
escapeNCName :: Text -> Text
escapeNCName :: Text -> Text
escapeNCName Text
t = case Text -> Maybe (Char, Text)
T.uncons Text
t of
  Maybe (Char, Text)
Nothing -> Text
T.empty
  Just (Char
c, Text
cs) -> Char -> Text
escapeStartChar Char
c forall a. Semigroup a => a -> a -> a
<> (Char -> Text) -> Text -> Text
T.concatMap Char -> Text
escapeNCNameChar Text
cs
  where
    escapeStartChar :: Char -> Text
    escapeStartChar :: Char -> Text
escapeStartChar Char
c = if Char -> Bool
isLetter Char
c Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
'_'
                        then Char -> Text
T.singleton Char
c
                        else Char -> Text
escapeChar Char
c

    escapeNCNameChar :: Char -> Text
    escapeNCNameChar :: Char -> Text
escapeNCNameChar Char
c = if Char -> Bool
isNCNameChar Char
c
                         then Char -> Text
T.singleton Char
c
                         else Char -> Text
escapeChar Char
c

    isNCNameChar :: Char -> Bool
    isNCNameChar :: Char -> Bool
isNCNameChar Char
c = Char -> Bool
isLetter Char
c Bool -> Bool -> Bool
|| Char
c forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` (String
"_-.·" :: String) Bool -> Bool -> Bool
|| Char -> Bool
isDigit Char
c
      Bool -> Bool -> Bool
|| Char
'\x0300' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x036f'
      Bool -> Bool -> Bool
|| Char
'\x203f' forall a. Ord a => a -> a -> Bool
<= Char
c Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x2040'

    escapeChar :: Char -> Text
    escapeChar :: Char -> Text
escapeChar = String -> Text
T.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall r. PrintfType r => String -> r
printf String
"U%04X" forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord

-- | Unescapes XML entities
fromEntities :: Text -> Text
fromEntities :: Text -> Text
fromEntities Text
t
  = let (Text
x, Text
y) = (Char -> Bool) -> Text -> (Text, Text)
T.break (forall a. Eq a => a -> a -> Bool
== Char
'&') Text
t
     in if Text -> Bool
T.null Text
y
           then Text
t
           else Text
x forall a. Semigroup a => a -> a -> a
<>
             let (Text
ent, Text
rest) = (Char -> Bool) -> Text -> (Text, Text)
T.break (\Char
c -> Char -> Bool
isSpace Char
c Bool -> Bool -> Bool
|| Char
c forall a. Eq a => a -> a -> Bool
== Char
';') Text
y
                 rest' :: Text
rest' = case Text -> Maybe (Char, Text)
T.uncons Text
rest of
                            Just (Char
';',Text
ys) -> Text
ys
                            Maybe (Char, Text)
_ -> Text
rest
                 ent' :: Text
ent' = Int -> Text -> Text
T.drop Int
1 Text
ent forall a. Semigroup a => a -> a -> a
<> Text
";"
              in case Text -> Maybe Text
lookupEntity Text
ent' of
                   Just Text
c  -> Text
c forall a. Semigroup a => a -> a -> a
<> Text -> Text
fromEntities Text
rest'
                   Maybe Text
Nothing  -> Text
ent forall a. Semigroup a => a -> a -> a
<> Text -> Text
fromEntities Text
rest


html5Attributes :: Set.Set Text
html5Attributes :: Set Text
html5Attributes = forall a. Ord a => [a] -> Set a
Set.fromList
  [ Text
"abbr"
  , Text
"accept"
  , Text
"accept-charset"
  , Text
"accesskey"
  , Text
"action"
  , Text
"allow"
  , Text
"allowfullscreen"
  , Text
"allowpaymentrequest"
  , Text
"allowusermedia"
  , Text
"alt"
  , Text
"as"
  , Text
"async"
  , Text
"autocapitalize"
  , Text
"autocomplete"
  , Text
"autofocus"
  , Text
"autoplay"
  , Text
"charset"
  , Text
"checked"
  , Text
"cite"
  , Text
"class"
  , Text
"color"
  , Text
"cols"
  , Text
"colspan"
  , Text
"content"
  , Text
"contenteditable"
  , Text
"controls"
  , Text
"coords"
  , Text
"crossorigin"
  , Text
"data"
  , Text
"datetime"
  , Text
"decoding"
  , Text
"default"
  , Text
"defer"
  , Text
"dir"
  , Text
"dirname"
  , Text
"disabled"
  , Text
"download"
  , Text
"draggable"
  , Text
"enctype"
  , Text
"enterkeyhint"
  , Text
"for"
  , Text
"form"
  , Text
"formaction"
  , Text
"formenctype"
  , Text
"formmethod"
  , Text
"formnovalidate"
  , Text
"formtarget"
  , Text
"headers"
  , Text
"height"
  , Text
"hidden"
  , Text
"high"
  , Text
"href"
  , Text
"hreflang"
  , Text
"http-equiv"
  , Text
"id"
  , Text
"imagesizes"
  , Text
"imagesrcset"
  , Text
"inputmode"
  , Text
"integrity"
  , Text
"is"
  , Text
"ismap"
  , Text
"itemid"
  , Text
"itemprop"
  , Text
"itemref"
  , Text
"itemscope"
  , Text
"itemtype"
  , Text
"kind"
  , Text
"label"
  , Text
"lang"
  , Text
"list"
  , Text
"loading"
  , Text
"loop"
  , Text
"low"
  , Text
"manifest"
  , Text
"max"
  , Text
"maxlength"
  , Text
"media"
  , Text
"method"
  , Text
"min"
  , Text
"minlength"
  , Text
"multiple"
  , Text
"muted"
  , Text
"name"
  , Text
"nomodule"
  , Text
"nonce"
  , Text
"novalidate"
  , Text
"onabort"
  , Text
"onafterprint"
  , Text
"onauxclick"
  , Text
"onbeforeprint"
  , Text
"onbeforeunload"
  , Text
"onblur"
  , Text
"oncancel"
  , Text
"oncanplay"
  , Text
"oncanplaythrough"
  , Text
"onchange"
  , Text
"onclick"
  , Text
"onclose"
  , Text
"oncontextmenu"
  , Text
"oncopy"
  , Text
"oncuechange"
  , Text
"oncut"
  , Text
"ondblclick"
  , Text
"ondrag"
  , Text
"ondragend"
  , Text
"ondragenter"
  , Text
"ondragexit"
  , Text
"ondragleave"
  , Text
"ondragover"
  , Text
"ondragstart"
  , Text
"ondrop"
  , Text
"ondurationchange"
  , Text
"onemptied"
  , Text
"onended"
  , Text
"onerror"
  , Text
"onfocus"
  , Text
"onhashchange"
  , Text
"oninput"
  , Text
"oninvalid"
  , Text
"onkeydown"
  , Text
"onkeypress"
  , Text
"onkeyup"
  , Text
"onlanguagechange"
  , Text
"onload"
  , Text
"onloadeddata"
  , Text
"onloadedmetadata"
  , Text
"onloadend"
  , Text
"onloadstart"
  , Text
"onmessage"
  , Text
"onmessageerror"
  , Text
"onmousedown"
  , Text
"onmouseenter"
  , Text
"onmouseleave"
  , Text
"onmousemove"
  , Text
"onmouseout"
  , Text
"onmouseover"
  , Text
"onmouseup"
  , Text
"onoffline"
  , Text
"ononline"
  , Text
"onpagehide"
  , Text
"onpageshow"
  , Text
"onpaste"
  , Text
"onpause"
  , Text
"onplay"
  , Text
"onplaying"
  , Text
"onpopstate"
  , Text
"onprogress"
  , Text
"onratechange"
  , Text
"onrejectionhandled"
  , Text
"onreset"
  , Text
"onresize"
  , Text
"onscroll"
  , Text
"onsecuritypolicyviolation"
  , Text
"onseeked"
  , Text
"onseeking"
  , Text
"onselect"
  , Text
"onstalled"
  , Text
"onstorage"
  , Text
"onsubmit"
  , Text
"onsuspend"
  , Text
"ontimeupdate"
  , Text
"ontoggle"
  , Text
"onunhandledrejection"
  , Text
"onunload"
  , Text
"onvolumechange"
  , Text
"onwaiting"
  , Text
"onwheel"
  , Text
"open"
  , Text
"optimum"
  , Text
"pattern"
  , Text
"ping"
  , Text
"placeholder"
  , Text
"playsinline"
  , Text
"poster"
  , Text
"preload"
  , Text
"readonly"
  , Text
"referrerpolicy"
  , Text
"rel"
  , Text
"required"
  , Text
"reversed"
  , Text
"role"
  , Text
"rows"
  , Text
"rowspan"
  , Text
"sandbox"
  , Text
"scope"
  , Text
"selected"
  , Text
"shape"
  , Text
"size"
  , Text
"sizes"
  , Text
"slot"
  , Text
"span"
  , Text
"spellcheck"
  , Text
"src"
  , Text
"srcdoc"
  , Text
"srclang"
  , Text
"srcset"
  , Text
"start"
  , Text
"step"
  , Text
"style"
  , Text
"tabindex"
  , Text
"target"
  , Text
"title"
  , Text
"translate"
  , Text
"type"
  , Text
"typemustmatch"
  , Text
"updateviacache"
  , Text
"usemap"
  , Text
"value"
  , Text
"width"
  , Text
"workertype"
  , Text
"wrap"
  ]

-- See https://en.wikipedia.org/wiki/RDFa, https://www.w3.org/TR/rdfa-primer/
rdfaAttributes :: Set.Set Text
rdfaAttributes :: Set Text
rdfaAttributes = forall a. Ord a => [a] -> Set a
Set.fromList
  [ Text
"about"
  , Text
"rel"
  , Text
"rev"
  , Text
"src"
  , Text
"href"
  , Text
"resource"
  , Text
"property"
  , Text
"content"
  , Text
"datatype"
  , Text
"typeof"
  , Text
"vocab"
  , Text
"prefix"
  ]

html4Attributes :: Set.Set Text
html4Attributes :: Set Text
html4Attributes = forall a. Ord a => [a] -> Set a
Set.fromList
  [ Text
"abbr"
  , Text
"accept"
  , Text
"accept-charset"
  , Text
"accesskey"
  , Text
"action"
  , Text
"align"
  , Text
"alink"
  , Text
"alt"
  , Text
"archive"
  , Text
"axis"
  , Text
"background"
  , Text
"bgcolor"
  , Text
"border"
  , Text
"cellpadding"
  , Text
"cellspacing"
  , Text
"char"
  , Text
"charoff"
  , Text
"charset"
  , Text
"checked"
  , Text
"cite"
  , Text
"class"
  , Text
"classid"
  , Text
"clear"
  , Text
"code"
  , Text
"codebase"
  , Text
"codetype"
  , Text
"color"
  , Text
"cols"
  , Text
"colspan"
  , Text
"compact"
  , Text
"content"
  , Text
"coords"
  , Text
"data"
  , Text
"datetime"
  , Text
"declare"
  , Text
"defer"
  , Text
"dir"
  , Text
"disabled"
  , Text
"enctype"
  , Text
"face"
  , Text
"for"
  , Text
"frame"
  , Text
"frameborder"
  , Text
"headers"
  , Text
"height"
  , Text
"href"
  , Text
"hreflang"
  , Text
"hspace"
  , Text
"http-equiv"
  , Text
"id"
  , Text
"ismap"
  , Text
"label"
  , Text
"lang"
  , Text
"language"
  , Text
"link"
  , Text
"longdesc"
  , Text
"marginheight"
  , Text
"marginwidth"
  , Text
"maxlength"
  , Text
"media"
  , Text
"method"
  , Text
"multiple"
  , Text
"name"
  , Text
"nohref"
  , Text
"noresize"
  , Text
"noshade"
  , Text
"nowrap"
  , Text
"object"
  , Text
"onblur"
  , Text
"onchange"
  , Text
"onclick"
  , Text
"ondblclick"
  , Text
"onfocus"
  , Text
"onkeydown"
  , Text
"onkeypress"
  , Text
"onkeyup"
  , Text
"onload"
  , Text
"onmousedown"
  , Text
"onmousemove"
  , Text
"onmouseout"
  , Text
"onmouseover"
  , Text
"onmouseup"
  , Text
"onreset"
  , Text
"onselect"
  , Text
"onsubmit"
  , Text
"onunload"
  , Text
"profile"
  , Text
"prompt"
  , Text
"readonly"
  , Text
"rel"
  , Text
"rev"
  , Text
"rows"
  , Text
"rowspan"
  , Text
"rules"
  , Text
"scheme"
  , Text
"scope"
  , Text
"scrolling"
  , Text
"selected"
  , Text
"shape"
  , Text
"size"
  , Text
"span"
  , Text
"src"
  , Text
"standby"
  , Text
"start"
  , Text
"style"
  , Text
"summary"
  , Text
"tabindex"
  , Text
"target"
  , Text
"text"
  , Text
"title"
  , Text
"usemap"
  , Text
"valign"
  , Text
"value"
  , Text
"valuetype"
  , Text
"version"
  , Text
"vlink"
  , Text
"vspace"
  , Text
"width"
  ]