markup-parse-0.0.0.2: A markup parser.
Safe HaskellSafe-Inferred
LanguageGHC2021

MarkupParse

Description

A Markup parser and printer of strict bytestrings. Markup is a representation of data such as HTML, SVG or XML but the parsing is sub-standard.

Synopsis

Documentation

import MarkupParse
import Data.ByteString qualified as B

bs <- B.readFile "other/line.svg"
m = markup_ bs

markdown . 'markup_ is an approximate round trip from ByteString to Markup back to ByteString'. The underscores represent versions of main functions that throw an exception on warnings encountered along the way.

At a lower level, a round trip pipeline might look something like:

:t tokenize Html >=> gather Html >>> fmap (Markup Html >>> normalize) >=> degather >>> fmap (fmap (detokenize Html) >>> mconcat)
ByteString -> These [MarkupWarning] ByteString

From left to right:

Along the way, the kleisi fishies and compose forward usage accumulates any warnings via the These monad instance.

Markup

data Markup Source #

A Tree list of markup Tokens

>>> markup Html "<foo class=\"bar\">baz</foo>"
That (Markup {standard = Html, markupTree = [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})

Constructors

Markup 

Instances

Instances details
Generic Markup Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep Markup :: Type -> Type #

Methods

from :: Markup -> Rep Markup x #

to :: Rep Markup x -> Markup #

Show Markup Source # 
Instance details

Defined in MarkupParse

NFData Markup Source # 
Instance details

Defined in MarkupParse

Methods

rnf :: Markup -> () #

Eq Markup Source # 
Instance details

Defined in MarkupParse

Methods

(==) :: Markup -> Markup -> Bool #

(/=) :: Markup -> Markup -> Bool #

Ord Markup Source # 
Instance details

Defined in MarkupParse

ToExpr Markup Source # 
Instance details

Defined in MarkupParse

Methods

toExpr :: Markup -> Expr #

listToExpr :: [Markup] -> Expr #

type Rep Markup Source # 
Instance details

Defined in MarkupParse

type Rep Markup = D1 ('MetaData "Markup" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Markup" 'PrefixI 'True) (S1 ('MetaSel ('Just "standard") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Standard) :*: S1 ('MetaSel ('Just "markupTree") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Tree Token])))

data Standard Source #

From a parsing pov, Html & Xml (& Svg) are close enough that they share a lot of parsing logic, so that parsing and printing just need some tweaking.

The xml parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/

The html parsing was based on a reading of html-parse, but ignores the various 'x00' to 'xfffd' & eof directives that form part of the html standards.

Constructors

Html 
Xml 

Instances

Instances details
Generic Standard Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep Standard :: Type -> Type #

Methods

from :: Standard -> Rep Standard x #

to :: Rep Standard x -> Standard #

Show Standard Source # 
Instance details

Defined in MarkupParse

NFData Standard Source # 
Instance details

Defined in MarkupParse

Methods

rnf :: Standard -> () #

Eq Standard Source # 
Instance details

Defined in MarkupParse

Ord Standard Source # 
Instance details

Defined in MarkupParse

ToExpr Standard Source # 
Instance details

Defined in MarkupParse

type Rep Standard Source # 
Instance details

Defined in MarkupParse

type Rep Standard = D1 ('MetaData "Standard" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Html" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Xml" 'PrefixI 'False) (U1 :: Type -> Type))

markup :: Standard -> ByteString -> These [MarkupWarning] Markup Source #

Convert bytestrings to Markup

>>> markup Html "<foo><br></foo><baz"
These [MarkupParser (ParserLeftover "<baz")] (Markup {standard = Html, markupTree = [Node {rootLabel = StartTag "foo" [], subForest = [Node {rootLabel = StartTag "br" [], subForest = []}]}]})

markup_ :: Standard -> ByteString -> Markup Source #

markup but errors on warnings.

data RenderStyle Source #

Indented 0 puts newlines in between the tags.

Constructors

Compact 
Indented Int 

Instances

Instances details
Generic RenderStyle Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep RenderStyle :: Type -> Type #

Show RenderStyle Source # 
Instance details

Defined in MarkupParse

Eq RenderStyle Source # 
Instance details

Defined in MarkupParse

type Rep RenderStyle Source # 
Instance details

Defined in MarkupParse

type Rep RenderStyle = D1 ('MetaData "RenderStyle" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Compact" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Indented" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int)))

markdown :: RenderStyle -> Markup -> ByteString Source #

Convert Markup to bytestrings

>>> B.putStr $ markdown (Indented 4) (markup_ Html [i|<foo><br></foo>|])
<foo>
    <br>
</foo>

normalize :: Markup -> Markup Source #

concatenate sequential content, and normalize attributes; unwording class values and removing duplicate attributes (taking last).

>>> B.putStr $ markdown Compact $ normalize (markup_ Xml [i|<foo class="a" class="b" bar="first" bar="last"/>|])
<foo bar="last" class="a b"/>

wellFormed :: Markup -> [MarkupWarning] Source #

Check for well-formedness and rerturn warnings encountered.

>>> wellFormed $ Markup Html [Node (Comment "") [], Node (EndTag "foo") [], Node (EmptyElemTag "foo" []) [Node (Content "bar") []], Node (EmptyElemTag "foo" []) []]
[EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag]

isWellFormed :: Markup -> Bool Source #

Are the trees in the markup well-formed?

Warnings

data MarkupWarning Source #

markup-parse generally tries to continue on parse errors, and return what has/can still be parsed, together with any warnings.

Constructors

BadEmptyElemTag

A tag ending with "/>" that is not an element of selfClosers (Html only).

SelfCloserWithChildren

A tag ending with "/>" that has children. Cannot happen in the parsing phase.

LeafWithChildren

Only a StartTag can have child tokens.

TagMismatch TagName TagName

A CloseTag with a different name to the currently open StartTag.

UnmatchedEndTag

An EndTag with no corresponding StartTag.

UnclosedTag

An EndTag with corresponding StartTag.

EndTagInTree

An EndTag should never appear in Markup

EmptyContent

Empty Content, Comment, Decl or Doctype

MarkupParser ParserWarning 

Instances

Instances details
Generic MarkupWarning Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep MarkupWarning :: Type -> Type #

Show MarkupWarning Source # 
Instance details

Defined in MarkupParse

NFData MarkupWarning Source # 
Instance details

Defined in MarkupParse

Methods

rnf :: MarkupWarning -> () #

Eq MarkupWarning Source # 
Instance details

Defined in MarkupParse

Ord MarkupWarning Source # 
Instance details

Defined in MarkupParse

type Rep MarkupWarning Source # 
Instance details

Defined in MarkupParse

type Rep MarkupWarning = D1 ('MetaData "MarkupWarning" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (((C1 ('MetaCons "BadEmptyElemTag" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "SelfCloserWithChildren" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "LeafWithChildren" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "TagMismatch" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 TagName) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 TagName)))) :+: ((C1 ('MetaCons "UnmatchedEndTag" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "UnclosedTag" 'PrefixI 'False) (U1 :: Type -> Type)) :+: (C1 ('MetaCons "EndTagInTree" 'PrefixI 'False) (U1 :: Type -> Type) :+: (C1 ('MetaCons "EmptyContent" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "MarkupParser" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 ParserWarning))))))

type Result a = These [MarkupWarning] a Source #

The structure of many returning functions.

A common computation pipeline is to take advantage of the These Monad instance eg

markup s bs = bs & (tokenize s >=> gather s) & second (Markup s)

resultError :: Result a -> a Source #

Convert any warnings to an error

>>> resultError $ (tokenize Html) "<foo"
*** Exception: MarkupParser (ParserLeftover "<foo")
...

resultEither :: Result a -> Either [MarkupWarning] a Source #

Returns Left on any warnings

>>> resultEither $ (tokenize Html) "<foo><baz"
Left [MarkupParser (ParserLeftover "<baz")]

resultMaybe :: Result a -> Maybe a Source #

Returns results if any, ignoring warnings.

>>> resultMaybe $ (tokenize Html) "<foo><baz"
Just [StartTag "foo" []]

Token components

type TagName = ByteString Source #

Name of token

name :: Standard -> Parser e ByteString Source #

Parse a tag name. Each standard is slightly different.

selfClosers :: [TagName] Source #

Html tags that self-close

type AttrName = ByteString Source #

Name of an attribute.

type AttrValue = ByteString Source #

Value of an attribute. "" is equivalent to true with respect to boolean attributes.

data Attr Source #

An attribute of a tag

In parsing, boolean attributes, which are not required to have a value in HTML, will be set a value of "", which is ok. But this will then be rendered.

>>> detokenize Html <$> tokenize_ Html [i|<input checked>|]
["<input checked=\"\">"]

Constructors

Attr !AttrName !AttrValue 

Instances

Instances details
Generic Attr Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep Attr :: Type -> Type #

Methods

from :: Attr -> Rep Attr x #

to :: Rep Attr x -> Attr #

Show Attr Source # 
Instance details

Defined in MarkupParse

Methods

showsPrec :: Int -> Attr -> ShowS #

show :: Attr -> String #

showList :: [Attr] -> ShowS #

NFData Attr Source # 
Instance details

Defined in MarkupParse

Methods

rnf :: Attr -> () #

Eq Attr Source # 
Instance details

Defined in MarkupParse

Methods

(==) :: Attr -> Attr -> Bool #

(/=) :: Attr -> Attr -> Bool #

Ord Attr Source # 
Instance details

Defined in MarkupParse

Methods

compare :: Attr -> Attr -> Ordering #

(<) :: Attr -> Attr -> Bool #

(<=) :: Attr -> Attr -> Bool #

(>) :: Attr -> Attr -> Bool #

(>=) :: Attr -> Attr -> Bool #

max :: Attr -> Attr -> Attr #

min :: Attr -> Attr -> Attr #

ToExpr Attr Source # 
Instance details

Defined in MarkupParse

Methods

toExpr :: Attr -> Expr #

listToExpr :: [Attr] -> Expr #

type Rep Attr Source # 
Instance details

Defined in MarkupParse

type Rep Attr = D1 ('MetaData "Attr" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Attr" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrName) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrValue)))

attrs :: Standard -> Parser a [Attr] Source #

Parse attributions

Tokens

data Token Source #

A Markup token

>>> runParser_ (many (token Html)) [i|<foo>content</foo>|]
[StartTag "foo" [],Content "content",EndTag "foo"]
>>> runParser_ (token Xml) [i|<foo/>|]
EmptyElemTag "foo" []
>>> runParser_ (token Html) "<!-- Comment -->"
Comment " Comment "
>>> runParser_ (token Xml) [i|<?xml version="1.0" encoding="UTF-8"?>|]
Decl "xml version=\"1.0\" encoding=\"UTF-8\""
>>> runParser_ (token Html) "<!DOCTYPE html>"
Doctype "DOCTYPE html"
>>> runParser_ (token Xml) "<!DOCTYPE foo [ declarations ]>"
Doctype "DOCTYPE foo [ declarations ]"
>>> runParser (token Html) [i|<foo a="a" b="b" c=c check>|]
OK (StartTag "foo" [Attr "a" "a",Attr "b" "b",Attr "c" "c",Attr "check" ""]) ""
>>> runParser (token Xml) [i|<foo a="a" b="b" c=c check>|]
Fail

Constructors

StartTag !TagName ![Attr]

A start tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag

EmptyElemTag !TagName ![Attr]

An empty element tag. Optional for XML and kind of not allowed in HTML.

EndTag !TagName

A closing tag.

Content !ByteString

The content between tags.

Comment !ByteString

Contents of a comment.

Decl !ByteString

Contents of a declaration

Doctype !ByteString

Contents of a doctype declaration.

Instances

Instances details
Generic Token Source # 
Instance details

Defined in MarkupParse

Associated Types

type Rep Token :: Type -> Type #

Methods

from :: Token -> Rep Token x #

to :: Rep Token x -> Token #

Show Token Source # 
Instance details

Defined in MarkupParse

Methods

showsPrec :: Int -> Token -> ShowS #

show :: Token -> String #

showList :: [Token] -> ShowS #

NFData Token Source # 
Instance details

Defined in MarkupParse

Methods

rnf :: Token -> () #

Eq Token Source # 
Instance details

Defined in MarkupParse

Methods

(==) :: Token -> Token -> Bool #

(/=) :: Token -> Token -> Bool #

Ord Token Source # 
Instance details

Defined in MarkupParse

Methods

compare :: Token -> Token -> Ordering #

(<) :: Token -> Token -> Bool #

(<=) :: Token -> Token -> Bool #

(>) :: Token -> Token -> Bool #

(>=) :: Token -> Token -> Bool #

max :: Token -> Token -> Token #

min :: Token -> Token -> Token #

ToExpr Token Source # 
Instance details

Defined in MarkupParse

Methods

toExpr :: Token -> Expr #

listToExpr :: [Token] -> Expr #

type Rep Token Source # 
Instance details

Defined in MarkupParse

tokenize :: Standard -> ByteString -> These [MarkupWarning] [Token] Source #

Parse a bytestring into tokens

>>> tokenize Html [i|<foo>content</foo>|]
That [StartTag "foo" [],Content "content",EndTag "foo"]

tokenize_ :: Standard -> ByteString -> [Token] Source #

tokenize but errors on warnings.

token :: Standard -> Parser String Token Source #

A flatparse Token parser.

>>> runParser (token Html) "<foo>content</foo>"
OK (StartTag "foo" []) "content</foo>"

detokenize :: Standard -> Token -> ByteString Source #

bytestring representation of Token.

>>> detokenize Html (StartTag "foo" [])
"<foo>"

gather :: Standard -> [Token] -> These [MarkupWarning] [Tree Token] Source #

Gather together token trees from a token list, placing child elements in nodes and removing EndTags.

>>> gather Html =<< tokenize Html "<foo class=\"bar\">baz</foo>"
That [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]

gather_ :: Standard -> [Token] -> [Tree Token] Source #

gather but errors on warnings.

degather :: Markup -> These [MarkupWarning] [Token] Source #

Convert a markup into a token list, adding end tags.

>>> degather =<< markup Html "<foo class=\"bar\">baz</foo>"
That [StartTag "foo" [Attr "class" "bar"],Content "baz",EndTag "foo"]

degather_ :: Markup -> [Token] Source #

degather but errors on warning

XML specific Parsers

xmlVersionInfo :: Parser e ByteString Source #

xml production [24]

xmlEncodingDecl :: Parser e ByteString Source #

xml production [80]

xmlStandalone :: Parser e ByteString Source #

Xml production [32]

xmlVersionNum :: Parser e ByteString Source #

xml production [26]

xmlEncName :: Parser e ByteString Source #

xml production [81]