| Safe Haskell | Safe-Inferred |
|---|---|
| Language | GHC2021 |
MarkupParse
Description
A Markup parser and printer of strict ByteStrings focused on optimising performance. Markup is a representation of data such as HTML, SVG or XML but the parsing is not always at standards.
Synopsis
- newtype Markup = Markup {}
- data Standard
- markup :: Standard -> ByteString -> Warn Markup
- markup_ :: Standard -> ByteString -> Markup
- data RenderStyle
- markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString
- markdown_ :: RenderStyle -> Standard -> Markup -> ByteString
- normalize :: Markup -> Markup
- normContent :: Markup -> Markup
- wellFormed :: Standard -> Markup -> [MarkupWarning]
- isWellFormed :: Standard -> Markup -> Bool
- data MarkupWarning
- type Warn a = These [MarkupWarning] a
- warnError :: Warn a -> a
- warnEither :: Warn a -> Either [MarkupWarning] a
- warnMaybe :: Warn a -> Maybe a
- type Element = Tree Token
- element :: NameTag -> [Attr] -> Markup -> Markup
- element_ :: NameTag -> [Attr] -> Markup
- emptyElem :: NameTag -> [Attr] -> Markup
- elementc :: NameTag -> [Attr] -> ByteString -> Markup
- content :: ByteString -> Markup
- contentRaw :: ByteString -> Markup
- type NameTag = ByteString
- selfClosers :: [NameTag]
- addAttrs :: [Attr] -> Token -> Maybe Token
- doctypeHtml :: Markup
- doctypeXml :: Markup
- type AttrName = ByteString
- type AttrValue = ByteString
- data Attr = Attr {}
- attrsP :: Standard -> Parser a [Attr]
- nameP :: Standard -> Parser e ByteString
- data OpenTagType
- data Token
- = OpenTag !OpenTagType !NameTag ![Attr]
- | EndTag !NameTag
- | Content !ByteString
- | Comment !ByteString
- | Decl !ByteString ![Attr]
- | Doctype !ByteString
- tokenize :: Standard -> ByteString -> Warn [Token]
- tokenize_ :: Standard -> ByteString -> [Token]
- tokenP :: Standard -> Parser e Token
- detokenize :: Standard -> Token -> ByteString
- gather :: Standard -> [Token] -> Warn Markup
- gather_ :: Standard -> [Token] -> Markup
- degather :: Standard -> Markup -> Warn [Token]
- degather_ :: Standard -> Markup -> [Token]
- xmlVersionInfoP :: Parser e ByteString
- xmlEncodingDeclP :: Parser e ByteString
- xmlStandaloneP :: Parser e ByteString
- xmlVersionNumP :: Parser e ByteString
- xmlEncNameP :: Parser e ByteString
- xmlYesNoP :: Parser e ByteString
- utf8ToStr :: ByteString -> String
- strToUtf8 :: String -> ByteString
- escapeChar :: Char -> ByteString
- escape :: ByteString -> ByteString
- data Tree a = Node {}
Usage
import MarkupParse import Data.ByteString qualified as B bs <- B.readFile "other/line.svg" m = markup_ bs
is an isomorphic round trip from markdown_ . markup_Markup to ByteString to Markup:
- This is subject to the Markup being
wellFormed. - The round-trip
is not isomorphic as parsing forgets whitespace within tags, comments and declarations.markup_.markdown_ - The underscores represent versions of main functions that throw an exception on warnings encountered along the way.
At a lower level, a round trip pipeline might look something like:
tokenize Html >=>
tokenizeconverts aByteStringto aTokenlist.
gather Html >=>
(normalize >>> pure) >=>
normalizeconcatenates content, and normalizes attributes,
degather >=>
degatherturns the markup tree back into a token list. Finally,
fmap (detokenize Html) >>> pure
detokenizeturns a token back into a bytestring.
Along the way, the kleisi fishies and compose forward usage accumulates any warnings via the These monad instance.
Markup
A list of Elements or Tree Tokens
>>>markup Html "<foo class=\"bar\">baz</foo>"That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
From a parsing pov, Html & Xml (& Svg) are close enough that they share a lot of parsing logic, so that parsing and printing just need some tweaking.
The xml parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/
The html parsing was based on a reading of html-parse, but ignores the various 'x00' to 'xfffd' & eof directives that form part of the html standards.
markup :: Standard -> ByteString -> Warn Markup Source #
Convert bytestrings to Markup
>>>markup Html "<foo><br></foo><baz"These [MarkupParser (ParserLeftover "<baz")] (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]})
data RenderStyle Source #
Indented 0 puts newlines in between the tags.
Instances
| Generic RenderStyle Source # | |
Defined in MarkupParse Associated Types type Rep RenderStyle :: Type -> Type # | |
| Show RenderStyle Source # | |
Defined in MarkupParse Methods showsPrec :: Int -> RenderStyle -> ShowS # show :: RenderStyle -> String # showList :: [RenderStyle] -> ShowS # | |
| Eq RenderStyle Source # | |
Defined in MarkupParse | |
| type Rep RenderStyle Source # | |
Defined in MarkupParse type Rep RenderStyle = D1 ('MetaData "RenderStyle" "MarkupParse" "markup-parse-0.1.0.0-EUIevmPWNIb1WdbRKKaIpt" 'False) (C1 ('MetaCons "Compact" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Indented" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int))) | |
markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString Source #
Convert Markup to bytestrings
>>>markdown (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])That "<foo>\n <br>\n</foo>"
markdown_ :: RenderStyle -> Standard -> Markup -> ByteString Source #
Convert Markup to ByteString and error on warnings.
>>>B.putStr $ markdown_ (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])<foo> <br> </foo>
normalize :: Markup -> Markup Source #
Concatenate sequential content and normalize attributes; unwording class values and removing duplicate attributes (taking last).
>>>B.putStr $ warnError $ markdown Compact Xml $ normalize (markup_ Xml [i|<foo class="a" class="b" bar="first" bar="last"/>|])<foo bar="last" class="a b"/>
normContent :: Markup -> Markup Source #
Normalise Content in Markup, concatenating adjacent Content, and removing mempty Content.
>>>normContent $ content "a" <> content "" <> content "b"Markup {elements = [Node {rootLabel = Content "ab", subForest = []}]}
wellFormed :: Standard -> Markup -> [MarkupWarning] Source #
Check for well-formedness and return warnings encountered.
>>>wellFormed Html $ Markup [Node (Comment "") [], Node (EndTag "foo") [], Node (OpenTag EmptyElemTag "foo" []) [Node (Content "bar") []], Node (OpenTag EmptyElemTag "foo" []) []][EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag]
Warnings
data MarkupWarning Source #
markup-parse generally tries to continue on parse errors, and return what has/can still be parsed, together with any warnings.
Constructors
| BadEmptyElemTag | A tag ending with "/>" that is not an element of |
| SelfCloserWithChildren | A tag ending with "/>" that has children. Cannot happen in the parsing phase. |
| LeafWithChildren | Only a |
| TagMismatch NameTag NameTag | A CloseTag with a different name to the currently open StartTag. |
| UnmatchedEndTag | An EndTag with no corresponding StartTag. |
| UnclosedTag | An EndTag with corresponding StartTag. |
| EndTagInTree | An EndTag should never appear in |
| EmptyContent | Empty Content, Comment, Decl or Doctype |
| BadDecl | Badly formed declaration |
| MarkupParser ParserWarning |
Instances
type Warn a = These [MarkupWarning] a Source #
A type synonym for the common returning type of many functions. A common computation pipeline is to take advantage of the These Monad instance eg
markup s bs = bs & (tokenize s >=> gather s) & second (Markup s)
warnError :: Warn a -> a Source #
Convert any warnings to an error
>>>warnError $ (tokenize Html) "<foo"*** Exception: MarkupParser (ParserLeftover "<foo") ...
warnEither :: Warn a -> Either [MarkupWarning] a Source #
Returns Left on any warnings
>>>warnEither $ (tokenize Html) "<foo><baz"Left [MarkupParser (ParserLeftover "<baz")]
warnMaybe :: Warn a -> Maybe a Source #
Returns results, if any, ignoring warnings.
>>>warnMaybe $ (tokenize Html) "<foo><baz"Just [OpenTag StartTag "foo" []]
element creation
element :: NameTag -> [Attr] -> Markup -> Markup Source #
Create a Markup element from a NameTag and attributes that wraps some other Markup.
>>>element "div" [] (element_ "br" [])Markup {elements = [Node {rootLabel = OpenTag StartTag "div" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]}
element_ :: NameTag -> [Attr] -> Markup Source #
Create a Markup element from a NameTag and attributes that doesn't wrap some other Markup. OpenTagType is StartTag. Use emptyElem if you want to create a EmptyElemTag.
>>>(element_ "br" [])Markup {elements = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}
emptyElem :: NameTag -> [Attr] -> Markup Source #
Create a Markup element from a NameTag and attributes using EmptyElemTag, that doesn't wrap some other Markup. No checks are made on whether this creates well-formed Markup.
>>>emptyElem "br" []Markup {elements = [Node {rootLabel = OpenTag EmptyElemTag "br" [], subForest = []}]}
elementc :: NameTag -> [Attr] -> ByteString -> Markup Source #
Create a Markup element from a NameTag and attributes that wraps some Content.
>>>elementc "div" [] "content"Markup {elements = [Node {rootLabel = OpenTag StartTag "div" [], subForest = [Node {rootLabel = Content "content", subForest = []}]}]}
content :: ByteString -> Markup Source #
Create a Markup element from a bytestring, escaping the usual characters.
>>>content "<content>"Markup {elements = [Node {rootLabel = Content "<content>", subForest = []}]}
contentRaw :: ByteString -> Markup Source #
Create a Markup element from a bytestring, not escaping the usual characters.
>>>contentRaw "<content>"Markup {elements = [Node {rootLabel = Content "<content>", subForest = []}]}
>>>markup_ Html $ markdown_ Compact Html $ contentRaw "<content>"*** Exception: UnclosedTag ...
Token components
type NameTag = ByteString Source #
Name of token
selfClosers :: [NameTag] Source #
Html tags that self-close
addAttrs :: [Attr] -> Token -> Maybe Token Source #
Append attributes to the existing Token attribute list.
doctypeHtml :: Markup Source #
Standard Html Doctype
>>>markdown_ Compact Html doctypeHtml"<!DOCTYPE html>"
doctypeXml :: Markup Source #
Standard Xml Doctype
>>>markdown_ Compact Xml doctypeXml"<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">"
type AttrName = ByteString Source #
Name of an attribute.
type AttrValue = ByteString Source #
Value of an attribute. "" is equivalent to true with respect to boolean attributes.
An attribute of a tag
In parsing, boolean attributes, which are not required to have a value in HTML, will be set a value of "", which is ok. But this will then be rendered.
>>>detokenize Html <$> tokenize_ Html [i|<input checked>|]["<input checked=\"\">"]
Instances
| Generic Attr Source # | |
| Show Attr Source # | |
| NFData Attr Source # | |
Defined in MarkupParse | |
| Eq Attr Source # | |
| Ord Attr Source # | |
| ToExpr Attr Source # | |
Defined in MarkupParse | |
| type Rep Attr Source # | |
Defined in MarkupParse type Rep Attr = D1 ('MetaData "Attr" "MarkupParse" "markup-parse-0.1.0.0-EUIevmPWNIb1WdbRKKaIpt" 'False) (C1 ('MetaCons "Attr" 'PrefixI 'True) (S1 ('MetaSel ('Just "attrName") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrName) :*: S1 ('MetaSel ('Just "attrValue") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrValue))) | |
nameP :: Standard -> Parser e ByteString Source #
Parse a tag name. Each standard is slightly different.
Tokens
data OpenTagType Source #
Whether an opening tag is a start tag or an empty element tag.
Constructors
| StartTag | |
| EmptyElemTag |
Instances
| Generic OpenTagType Source # | |
Defined in MarkupParse Associated Types type Rep OpenTagType :: Type -> Type # | |
| Show OpenTagType Source # | |
Defined in MarkupParse Methods showsPrec :: Int -> OpenTagType -> ShowS # show :: OpenTagType -> String # showList :: [OpenTagType] -> ShowS # | |
| NFData OpenTagType Source # | |
Defined in MarkupParse Methods rnf :: OpenTagType -> () # | |
| Eq OpenTagType Source # | |
Defined in MarkupParse | |
| Ord OpenTagType Source # | |
Defined in MarkupParse Methods compare :: OpenTagType -> OpenTagType -> Ordering # (<) :: OpenTagType -> OpenTagType -> Bool # (<=) :: OpenTagType -> OpenTagType -> Bool # (>) :: OpenTagType -> OpenTagType -> Bool # (>=) :: OpenTagType -> OpenTagType -> Bool # max :: OpenTagType -> OpenTagType -> OpenTagType # min :: OpenTagType -> OpenTagType -> OpenTagType # | |
| ToExpr OpenTagType Source # | |
Defined in MarkupParse | |
| type Rep OpenTagType Source # | |
A Markup token
>>>runParser_ (many (tokenP Html)) [i|<foo>content</foo>|][OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
>>>runParser_ (tokenP Xml) [i|<foo/>|]OpenTag EmptyElemTag "foo" []
>>>runParser_ (tokenP Html) "<!-- Comment -->"Comment " Comment "
>>>runParser_ (tokenP Xml) [i|<?xml version="1.0" encoding="UTF-8"?>|]Decl "xml" [Attr {attrName = "version", attrValue = " version=\"1.0\""},Attr {attrName = "encoding", attrValue = "UTF-8"}]
>>>runParser_ (tokenP Html) "<!DOCTYPE html>"Doctype "DOCTYPE html"
>>>runParser_ (tokenP Xml) "<!DOCTYPE foo [ declarations ]>"Doctype "DOCTYPE foo [ declarations ]"
>>>runParser (tokenP Html) [i|<foo a="a" b="b" c=c check>|]OK (OpenTag StartTag "foo" [Attr {attrName = "a", attrValue = "a"},Attr {attrName = "b", attrValue = "b"},Attr {attrName = "c", attrValue = "c"},Attr {attrName = "check", attrValue = ""}]) ""
>>>runParser (tokenP Xml) [i|<foo a="a" b="b" c=c check>|]Fail
Constructors
| OpenTag !OpenTagType !NameTag ![Attr] | A tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag |
| EndTag !NameTag | A closing tag. |
| Content !ByteString | The content between tags. |
| Comment !ByteString | Contents of a comment. |
| Decl !ByteString ![Attr] | Contents of a declaration |
| Doctype !ByteString | Contents of a doctype declaration. |
Instances
tokenize :: Standard -> ByteString -> Warn [Token] Source #
Parse a bytestring into tokens
>>>tokenize Html [i|<foo>content</foo>|]That [OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
tokenP :: Standard -> Parser e Token Source #
A flatparse Token parser.
>>>runParser (tokenP Html) "<foo>content</foo>"OK (OpenTag StartTag "foo" []) "content</foo>"
detokenize :: Standard -> Token -> ByteString Source #
bytestring representation of Token.
>>>detokenize Html (OpenTag StartTag "foo" [])"<foo>"
gather :: Standard -> [Token] -> Warn Markup Source #
Gather together token trees from a token list, placing child elements in nodes and removing EndTags.
>>>gather Html =<< tokenize Html "<foo class=\"bar\">baz</foo>"That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
degather :: Standard -> Markup -> Warn [Token] Source #
Convert a markup into a token list, adding end tags.
>>>degather Html =<< markup Html "<foo class=\"bar\">baz</foo>"That [OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}],Content "baz",EndTag "foo"]
XML specific Parsers
xmlVersionInfoP :: Parser e ByteString Source #
xml production [24]
xmlEncodingDeclP :: Parser e ByteString Source #
xml production [80]
xmlStandaloneP :: Parser e ByteString Source #
Xml production [32]
xmlVersionNumP :: Parser e ByteString Source #
xml production [26]
xmlEncNameP :: Parser e ByteString Source #
xml production [81]
xmlYesNoP :: Parser e ByteString Source #
Xml yes/no
bytestring support
utf8ToStr :: ByteString -> String #
Convert a ByteString to an UTF8-encoded String.
strToUtf8 :: String -> ByteString #
Convert an UTF8-encoded String to a ByteString.
escapeChar :: Char -> ByteString Source #
Escape a single character.
escape :: ByteString -> ByteString Source #
Escape Content
>>>escape [i|<foo class="a" bar='b'>|]"<foo class="a" bar=&aposb&apos>"
Tree support
Non-empty, possibly infinite, multi-way trees; also known as rose trees.
Instances
| MonadFix Tree | Since: containers-0.5.11 |
| MonadZip Tree | |
| Foldable Tree | |
Defined in Data.Tree Methods fold :: Monoid m => Tree m -> m # foldMap :: Monoid m => (a -> m) -> Tree a -> m # foldMap' :: Monoid m => (a -> m) -> Tree a -> m # foldr :: (a -> b -> b) -> b -> Tree a -> b # foldr' :: (a -> b -> b) -> b -> Tree a -> b # foldl :: (b -> a -> b) -> b -> Tree a -> b # foldl' :: (b -> a -> b) -> b -> Tree a -> b # foldr1 :: (a -> a -> a) -> Tree a -> a # foldl1 :: (a -> a -> a) -> Tree a -> a # elem :: Eq a => a -> Tree a -> Bool # maximum :: Ord a => Tree a -> a # | |
| Eq1 Tree | Since: containers-0.5.9 |
| Ord1 Tree | Since: containers-0.5.9 |
| Read1 Tree | Since: containers-0.5.9 |
Defined in Data.Tree | |
| Show1 Tree | Since: containers-0.5.9 |
| Traversable Tree | |
| Applicative Tree | |
| Functor Tree | |
| Monad Tree | |
| Hashable1 Tree | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
| Generic1 Tree | |
| Data a => Data (Tree a) | |
Defined in Data.Tree Methods gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> Tree a -> c (Tree a) # gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c (Tree a) # toConstr :: Tree a -> Constr # dataTypeOf :: Tree a -> DataType # dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c (Tree a)) # dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c (Tree a)) # gmapT :: (forall b. Data b => b -> b) -> Tree a -> Tree a # gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQr :: forall r r'. (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQ :: (forall d. Data d => d -> u) -> Tree a -> [u] # gmapQi :: Int -> (forall d. Data d => d -> u) -> Tree a -> u # gmapM :: Monad m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # | |
| Generic (Tree a) | |
| Read a => Read (Tree a) | |
| Show a => Show (Tree a) | |
| NFData a => NFData (Tree a) | |
| Eq a => Eq (Tree a) | |
| Ord a => Ord (Tree a) | Since: containers-0.6.5 |
| Hashable v => Hashable (Tree v) | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
| ToExpr a => ToExpr (Tree a) | |
Defined in Data.TreeDiff.Class | |
| type Rep1 Tree | Since: containers-0.5.8 |
Defined in Data.Tree type Rep1 Tree = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) Par1 :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) ([] :.: Rec1 Tree))) | |
| type Rep (Tree a) | Since: containers-0.5.8 |
Defined in Data.Tree type Rep (Tree a) = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a) :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Tree a]))) | |