{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | A 'Markup' parser and printer of strict bytestrings. 'Markup' is a representation of data such as HTML, SVG or XML but the parsing is sub-standard. module MarkupParse ( -- $usage -- * Markup Markup (..), Standard (..), markup, markup_, RenderStyle (..), markdown, normalize, wellFormed, isWellFormed, -- * Warnings MarkupWarning (..), Result, resultError, resultEither, resultMaybe, -- * Token components TagName, name, selfClosers, AttrName, AttrValue, Attr (..), attrs, -- * Tokens Token (..), tokenize, tokenize_, token, detokenize, gather, gather_, degather, degather_, -- * XML specific Parsers xmlVersionInfo, xmlEncodingDecl, xmlStandalone, xmlVersionNum, xmlEncName, xmlYesNo, ) where import Control.Category ((>>>)) import Control.DeepSeq import Control.Monad import Data.Bifunctor import Data.Bool import Data.ByteString (ByteString) import Data.ByteString.Char8 qualified as B import Data.Char hiding (isDigit) import Data.Foldable import Data.Function import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe import Data.String.Interpolate import Data.These import Data.Tree import Data.TreeDiff import FlatParse.Basic hiding (Result, cut, take) import GHC.Generics import MarkupParse.FlatParse import Prelude hiding (replicate) -- $setup -- >>> :set -XTemplateHaskell -- >>> :set -XQuasiQuotes -- >>> :set -XOverloadedStrings -- >>> import MarkupParse -- >>> import MarkupParse.Patch -- >>> import MarkupParse.FlatParse -- >>> import FlatParse.Basic -- >>> import Data.String.Interpolate -- >>> import Data.ByteString.Char8 qualified as B -- >>> import Data.Tree -- $usage -- -- > import MarkupParse -- > import Data.ByteString qualified as B -- > -- > bs <- B.readFile "other/line.svg" -- > m = markup_ bs -- -- @'markdown' . ''markup_'@ is an approximate round trip from 'ByteString' to 'Markup' back to ByteString'. The underscores represent versions of main functions that throw an exception on warnings encountered along the way. -- -- At a lower level, a round trip pipeline might look something like: -- -- > :t tokenize Html >=> gather Html >>> fmap (Markup Html >>> normalize) >=> degather >>> fmap (fmap (detokenize Html) >>> mconcat) -- > ByteString -> These [MarkupWarning] ByteString -- -- From left to right: -- -- - 'tokenize' converts a 'ByteString' to a 'Token' list, -- -- - 'gather' takes the tokens and gathers them into 'Tree's of tokens -- -- - this is then wrapped into the 'Markup' data type. -- -- - 'normalize' concatenates content, and normalizes attributes, -- -- - 'degather' turns the markup tree back into a token list. Finally, -- -- - 'detokenize' turns a token back into a bytestring. -- -- Along the way, the kleisi fishies and compose forward usage accumulates any warnings via the 'These' monad instance. -- | From a parsing pov, Html & Xml (& Svg) are close enough that they share a lot of parsing logic, so that parsing and printing just need some tweaking. -- -- The xml parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/ -- -- The html parsing was based on a reading of , but ignores the various '\x00' to '\xfffd' & eof directives that form part of the html standards. data Standard = Html | Xml deriving (Eq, Show, Ord, Generic, NFData) instance ToExpr Standard -- | A 'Tree' list of markup 'Token's -- -- >>> markup Html "baz" -- That (Markup {standard = Html, markupTree = [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]}) data Markup = Markup {standard :: Standard, markupTree :: [Tree Token]} deriving (Show, Eq, Ord, Generic, NFData) instance ToExpr Markup -- | markup-parse generally tries to continue on parse errors, and return what has/can still be parsed, together with any warnings. data MarkupWarning = -- | A tag ending with "/>" that is not an element of 'selfClosers' (Html only). BadEmptyElemTag | -- | A tag ending with "/>" that has children. Cannot happen in the parsing phase. SelfCloserWithChildren | -- | Only a 'StartTag' can have child tokens. LeafWithChildren | -- | A CloseTag with a different name to the currently open StartTag. TagMismatch TagName TagName | -- | An EndTag with no corresponding StartTag. UnmatchedEndTag | -- | An EndTag with corresponding StartTag. UnclosedTag | -- | An EndTag should never appear in 'Markup' EndTagInTree | -- | Empty Content, Comment, Decl or Doctype EmptyContent | MarkupParser ParserWarning deriving (Eq, Show, Ord, Generic, NFData) showWarnings :: [MarkupWarning] -> String showWarnings = List.nub >>> fmap show >>> unlines -- | The structure of many returning functions. -- -- A common computation pipeline is to take advantage of the 'These' Monad instance eg -- -- > markup s bs = bs & (tokenize s >=> gather s) & second (Markup s) type Result a = These [MarkupWarning] a -- | Convert any warnings to an 'error' -- -- >>> resultError $ (tokenize Html) " a resultError = these (showWarnings >>> error) id (\xs a -> bool (error (showWarnings xs)) a (xs == [])) -- | Returns Left on any warnings -- -- >>> resultEither $ (tokenize Html) " Either [MarkupWarning] a resultEither = these Left Right (\xs a -> bool (Left xs) (Right a) (xs == [])) -- | Returns results if any, ignoring warnings. -- -- >>> resultMaybe $ (tokenize Html) " Maybe a resultMaybe = these (const Nothing) Just (\_ a -> Just a) -- | Convert bytestrings to 'Markup' -- -- >>> markup Html "
ByteString -> These [MarkupWarning] Markup markup s bs = bs & (tokenize s >=> gather s) & second (Markup s) -- | markup but errors on warnings. markup_ :: Standard -> ByteString -> Markup markup_ s bs = markup s bs & resultError -- | concatenate sequential content, and normalize attributes; unwording class values and removing duplicate attributes (taking last). -- -- >>> B.putStr $ markdown Compact $ normalize (markup_ Xml [i||]) -- normalize :: Markup -> Markup normalize (Markup s trees) = Markup s (normContentTrees $ fmap (fmap normTokenAttrs) trees) -- | Are the trees in the markup well-formed? isWellFormed :: Markup -> Bool isWellFormed = (== []) . wellFormed -- | Check for well-formedness and rerturn warnings encountered. -- -- >>> wellFormed $ Markup Html [Node (Comment "") [], Node (EndTag "foo") [], Node (EmptyElemTag "foo" []) [Node (Content "bar") []], Node (EmptyElemTag "foo" []) []] -- [EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag] wellFormed :: Markup -> [MarkupWarning] wellFormed (Markup s trees) = List.nub $ mconcat (foldTree checkNode <$> trees) where checkNode (StartTag _ _) xs = mconcat xs checkNode (EmptyElemTag n _) [] = bool [] [BadEmptyElemTag] (not (n `elem` selfClosers) && s == Html) checkNode (EndTag _) [] = [EndTagInTree] checkNode (Content bs) [] = bool [] [EmptyContent] (bs == "") checkNode (Comment bs) [] = bool [] [EmptyContent] (bs == "") checkNode (Decl bs) [] = bool [] [EmptyContent] (bs == "") checkNode (Doctype bs) [] = bool [] [EmptyContent] (bs == "") checkNode _ _ = [LeafWithChildren] -- | Name of token type TagName = ByteString -- | A Markup token -- -- >>> runParser_ (many (token Html)) [i|content|] -- [StartTag "foo" [],Content "content",EndTag "foo"] -- -- >>> runParser_ (token Xml) [i||] -- EmptyElemTag "foo" [] -- -- >>> runParser_ (token Html) "" -- Comment " Comment " -- -- >>> runParser_ (token Xml) [i||] -- Decl "xml version=\"1.0\" encoding=\"UTF-8\"" -- -- >>> runParser_ (token Html) "" -- Doctype "DOCTYPE html" -- -- >>> runParser_ (token Xml) "" -- Doctype "DOCTYPE foo [ declarations ]" -- -- >>> runParser (token Html) [i||] -- OK (StartTag "foo" [Attr "a" "a",Attr "b" "b",Attr "c" "c",Attr "check" ""]) "" -- -- >>> runParser (token Xml) [i||] -- Fail data Token = -- | A start tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag StartTag !TagName ![Attr] | -- | An empty element tag. Optional for XML and kind of not allowed in HTML. EmptyElemTag !TagName ![Attr] | -- | A closing tag. EndTag !TagName | -- | The content between tags. Content !ByteString | -- | Contents of a comment. Comment !ByteString | -- | Contents of a declaration Decl !ByteString | -- | Contents of a doctype declaration. Doctype !ByteString deriving (Show, Ord, Eq, Generic) instance NFData Token instance ToExpr Token -- | A flatparse 'Token' parser. -- -- >>> runParser (token Html) "content" -- OK (StartTag "foo" []) "content" token :: Standard -> Parser String Token token Html = tokenHtml token Xml = tokenXml -- | Parse a bytestring into tokens -- -- >>> tokenize Html [i|content|] -- That [StartTag "foo" [],Content "content",EndTag "foo"] tokenize :: Standard -> ByteString -> These [MarkupWarning] [Token] tokenize s bs = first ((: []) . MarkupParser) $ runParserWarn (many (token s)) bs -- | tokenize but errors on warnings. tokenize_ :: Standard -> ByteString -> [Token] tokenize_ s bs = tokenize s bs & resultError -- | Html tags that self-close selfClosers :: [TagName] selfClosers = [ "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr" ] -- | Name of an attribute. type AttrName = ByteString -- | Value of an attribute. "" is equivalent to true with respect to boolean attributes. type AttrValue = ByteString -- | An attribute of a tag -- -- In parsing, boolean attributes, which are not required to have a value in HTML, -- will be set a value of "", which is ok. But this will then be rendered. -- -- >>> detokenize Html <$> tokenize_ Html [i||] -- [""] data Attr = Attr !AttrName !AttrValue deriving (Generic, Show, Eq, Ord) instance NFData Attr instance ToExpr Attr normTokenAttrs :: Token -> Token normTokenAttrs (StartTag n as) = StartTag n (normAttrs as) normTokenAttrs (EmptyElemTag n as) = EmptyElemTag n (normAttrs as) normTokenAttrs x = x -- | normalize an attribution list, removing duplicate AttrNames, and space concatenating class values. normAttrs :: [Attr] -> [Attr] normAttrs as = uncurry Attr <$> ( Map.toList $ foldl' ( \s (Attr n v) -> Map.insertWithKey ( \k new old -> case k of "class" -> old <> " " <> new _ -> new ) n v s ) Map.empty as ) -- | render attributes renderAttrs :: [Attr] -> ByteString renderAttrs = B.unwords . fmap renderAttr -- | render an attribute -- -- Does not attempt to escape double quotes. renderAttr :: Attr -> ByteString renderAttr (Attr k v) = [i|#{k}="#{v}"|] commentClose :: Parser e () commentClose = $(string "-->") charNotMinus :: Parser e ByteString charNotMinus = byteStringOf $ satisfy (/= '-') minusPlusChar :: Parser e ByteString minusPlusChar = byteStringOf ($(char '-') *> charNotMinus) comment :: Parser e Token comment = Comment <$> byteStringOf (many (charNotMinus <|> minusPlusChar)) <* commentClose content :: Parser e Token content = Content <$> byteStringOf (some (satisfy (/= '<'))) -- | bytestring representation of 'Token'. -- -- >>> detokenize Html (StartTag "foo" []) -- "" detokenize :: Standard -> Token -> ByteString detokenize s = \case (StartTag n []) -> [i|<#{n}>|] (StartTag n as) -> [i|<#{n} #{renderAttrs as}>|] (EmptyElemTag n as) -> bool [i|<#{n} #{renderAttrs as}/>|] [i|<#{n} #{renderAttrs as} />|] (s == Html) (EndTag n) -> [i||] (Content t) -> t (Comment t) -> [i||] (Doctype t) -> [i||] (Decl t) -> bool [i||] [i||] (s == Html) -- | Indented 0 puts newlines in between the tags. data RenderStyle = Compact | Indented Int deriving (Eq, Show, Generic) indentChildren :: RenderStyle -> [ByteString] -> [ByteString] indentChildren Compact = id indentChildren (Indented x) = fmap (B.replicate x ' ' <>) finalConcat :: RenderStyle -> [ByteString] -> ByteString finalConcat Compact = mconcat finalConcat (Indented _) = B.intercalate (B.singleton '\n') . filter (/= "") -- | Convert 'Markup' to bytestrings -- -- >>> B.putStr $ markdown (Indented 4) (markup_ Html [i|
|]) -- --
-- markdown :: RenderStyle -> Markup -> ByteString markdown r (Markup std tree) = finalConcat r $ mconcat $ foldTree (renderBranch r std) <$> normContentTrees tree -- note that renderBranch adds in EndTags for StartTags when needed renderBranch :: RenderStyle -> Standard -> Token -> [[ByteString]] -> [ByteString] renderBranch r std s@(StartTag n _) children | n `elem` selfClosers && std == Html = [detokenize std s] <> indentChildren r (mconcat children) | otherwise = [detokenize std s] <> indentChildren r (mconcat children) <> [detokenize std (EndTag n)] renderBranch r std x children = -- ignoring that this should be an error [detokenize std x] <> indentChildren r (mconcat children) normContentTrees :: [Tree Token] -> [Tree Token] normContentTrees trees = foldTree (\x xs -> Node x (filter ((/= Content "") . rootLabel) $ concatContent xs)) <$> concatContent trees concatContent :: [Tree Token] -> [Tree Token] concatContent = \case ((Node (Content t) _) : (Node (Content t') _) : ts) -> concatContent $ Node (Content (t <> t')) [] : ts (t : ts) -> t : concatContent ts [] -> [] -- | Gather together token trees from a token list, placing child elements in nodes and removing EndTags. -- -- >>> gather Html =<< tokenize Html "baz" -- That [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}] gather :: Standard -> [Token] -> These [MarkupWarning] [Tree Token] gather s ts = case (finalSibs, finalParents, warnings) of (sibs, [], []) -> That (reverse sibs) ([], [], xs) -> This xs (sibs, ps, xs) -> These (xs <> [UnclosedTag]) (reverse $ foldl' (\ss' (p, ss) -> Node p (reverse ss') : ss) sibs ps) where (Cursor finalSibs finalParents, warnings) = foldl' (\(c, xs) t -> incCursor s t c & second (maybeToList >>> (<> xs))) (Cursor [] [], []) ts -- | gather but errors on warnings. gather_ :: Standard -> [Token] -> [Tree Token] gather_ s ts = gather s ts & resultError incCursor :: Standard -> Token -> Cursor -> (Cursor, Maybe MarkupWarning) -- Only StartTags are ever pushed on to the parent list, here: incCursor Xml t@(StartTag _ _) (Cursor ss ps) = (Cursor [] ((t, ss) : ps), Nothing) incCursor Html t@(StartTag n _) (Cursor ss ps) = (bool (Cursor [] ((t, ss) : ps)) (Cursor (Node t [] : ss) ps) (n `elem` selfClosers), Nothing) incCursor Xml t@(EmptyElemTag _ _) (Cursor ss ps) = (Cursor (Node t [] : ss) ps, Nothing) incCursor Html t@(EmptyElemTag n _) (Cursor ss ps) = ( Cursor (Node t [] : ss) ps, bool (Just BadEmptyElemTag) Nothing (n `elem` selfClosers) ) incCursor _ (EndTag n) (Cursor ss ((p@(StartTag n' _), ss') : ps)) = ( Cursor (Node p (reverse ss) : ss') ps, bool (Just (TagMismatch n n')) Nothing (n == n') ) -- Non-StartTag on parent list incCursor _ (EndTag _) (Cursor ss ((p, ss') : ps)) = ( Cursor (Node p (reverse ss) : ss') ps, Just LeafWithChildren ) incCursor _ (EndTag _) (Cursor ss []) = ( Cursor ss [], Just UnmatchedEndTag ) incCursor _ t (Cursor ss ps) = (Cursor (Node t [] : ss) ps, Nothing) data Cursor = Cursor { -- siblings, not (yet) part of another element _sibs :: [Tree Token], -- open elements and their siblings. _stack :: [(Token, [Tree Token])] } -- | Convert a markup into a token list, adding end tags. -- -- >>> degather =<< markup Html "baz" -- That [StartTag "foo" [Attr "class" "bar"],Content "baz",EndTag "foo"] degather :: Markup -> These [MarkupWarning] [Token] degather (Markup s tree) = rconcats $ foldTree (addCloseTags s) <$> tree -- | degather but errors on warning degather_ :: Markup -> [Token] degather_ m = degather m & resultError rconcats :: [Result [a]] -> Result [a] rconcats rs = case bimap mconcat mconcat $ partitionHereThere rs of ([], xs) -> That xs (es, []) -> This es (es, xs) -> These es xs addCloseTags :: Standard -> Token -> [These [MarkupWarning] [Token]] -> These [MarkupWarning] [Token] addCloseTags std s@(StartTag n _) children | children /= [] && n `elem` selfClosers && std == Html = These [SelfCloserWithChildren] [s] <> rconcats children | n `elem` selfClosers && std == Html = That [s] <> rconcats children | otherwise = That [s] <> rconcats children <> That [EndTag n] addCloseTags _ x xs = case xs of [] -> That [x] cs -> These [LeafWithChildren] [x] <> rconcats cs tokenXml :: Parser e Token tokenXml = $( switch [| case _ of "