Safe Haskell | None |
---|---|
Language | Haskell98 |
This module provides both a native Haskell solution for parsing XML documents into a stream of events, and a set of parser combinators for dealing with a stream of events.
As a simple example, if you have the following XML file:
<?xml version="1.0" encoding="utf-8"?> <people> <person age="25">Michael</person> <person age="2">Eliezer</person> </people>
Then this code:
{-# LANGUAGE OverloadedStrings #-} import Control.Monad.Trans.Resource import Data.Conduit (($$)) import Data.Text (Text, unpack) import Text.XML.Stream.Parse data Person = Person Int Text deriving Show parsePerson = tagName "person" (requireAttr "age") $ \age -> do name <- content return $ Person (read $ unpack age) name parsePeople = tagNoAttr "people" $ many parsePerson main = do people <- runResourceT $ parseFile def "people.xml" $$ force "people required" parsePeople print people
will produce:
[Person {age = 25, name = "Michael"},Person {age = 2, name = "Eliezer"}]
This module also supports streaming results using yield
.
This allows parser results to be processed using conduits
while a particular parser (e.g. many
) is still running.
Without using streaming results, you have to wait until the parser finished
before you can process the result list. Large XML files might be easier
to process by using streaming results.
See http://stackoverflow.com/q/21367423/2597135 for a related discussion.
{-# LANGUAGE OverloadedStrings #-} import Control.Monad.Trans.Resource import Data.Conduit import Data.Text (Text, unpack) import Text.XML.Stream.Parse import Text.XML (Name) import Control.Monad.Trans.Class (lift) import Control.Monad (void) import qualified Data.Conduit.List as CL data Person = Person Int Text deriving Show parsePerson = tagName "person" (requireAttr "age") $ \age -> do name <- content return $ Person (read $ unpack age) name parsePeople = void $ tagNoAttr "people" $ manyYield parsePerson main = runResourceT $ parseFile def "people.xml" $$ parsePeople =$ CL.mapM_ (lift . print)
Previous versions of this module contained a number of more sophisticated functions written by Aristid Breitkreuz and Dmitry Olshansky. To keep this package simpler, those functions are being moved to a separate package. This note will be updated with the name of the package(s) when available.
- parseBytes :: MonadThrow m => ParseSettings -> Conduit ByteString m Event
- parseBytesPos :: MonadThrow m => ParseSettings -> Conduit ByteString m EventPos
- parseText' :: MonadThrow m => ParseSettings -> Conduit Text m Event
- parseText :: MonadThrow m => ParseSettings -> Conduit Text m EventPos
- parseTextPos :: MonadThrow m => ParseSettings -> Conduit Text m EventPos
- detectUtf :: MonadThrow m => Conduit ByteString m Text
- parseFile :: MonadResource m => ParseSettings -> FilePath -> Producer m Event
- parseLBS :: MonadThrow m => ParseSettings -> ByteString -> Producer m Event
- data ParseSettings
- def :: Default a => a
- type DecodeEntities = Text -> Content
- psDecodeEntities :: ParseSettings -> DecodeEntities
- psRetainNamespaces :: ParseSettings -> Bool
- decodeXmlEntities :: DecodeEntities
- decodeHtmlEntities :: DecodeEntities
- tag :: MonadThrow m => (Name -> Maybe a) -> (a -> AttrParser b) -> (b -> ConduitM Event o m c) -> ConduitM Event o m (Maybe c)
- tagPredicate :: MonadThrow m => (Name -> Bool) -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b)
- tagName :: MonadThrow m => Name -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b)
- tagNoAttr :: MonadThrow m => Name -> ConduitM Event o m a -> ConduitM Event o m (Maybe a)
- tagIgnoreAttrs :: MonadThrow m => Name -> ConduitM Event o m a -> ConduitM Event o m (Maybe a)
- tagPredicateIgnoreAttrs :: MonadThrow m => (Name -> Bool) -> ConduitM Event o m a -> ConduitM Event o m (Maybe a)
- content :: MonadThrow m => Consumer Event m Text
- contentMaybe :: MonadThrow m => Consumer Event m (Maybe Text)
- ignoreTag :: MonadThrow m => (Name -> Bool) -> ConduitM Event o m (Maybe ())
- ignoreTagName :: MonadThrow m => Name -> ConduitM Event o m (Maybe ())
- ignoreAnyTagName :: MonadThrow m => [Name] -> ConduitM Event o m (Maybe ())
- ignoreAllTags :: MonadThrow m => ConduitM Event o m (Maybe ())
- ignoreTree :: MonadThrow m => (Name -> Bool) -> ConduitM Event o m (Maybe ())
- ignoreTreeName :: MonadThrow m => Name -> ConduitM Event o m (Maybe ())
- ignoreAnyTreeName :: MonadThrow m => [Name] -> ConduitM Event o m (Maybe ())
- ignoreAllTrees :: MonadThrow m => ConduitM Event o m (Maybe ())
- ignoreAllTreesContent :: MonadThrow m => ConduitM Event o m (Maybe ())
- data AttrParser a
- attr :: Name -> AttrParser (Maybe Text)
- requireAttr :: Name -> AttrParser Text
- optionalAttr :: Name -> AttrParser (Maybe Text)
- requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b
- optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b)
- ignoreAttrs :: AttrParser ()
- orE :: Monad m => Consumer Event m (Maybe a) -> Consumer Event m (Maybe a) -> Consumer Event m (Maybe a)
- choose :: Monad m => [ConduitM Event o m (Maybe a)] -> ConduitM Event o m (Maybe a)
- many :: Monad m => Consumer Event m (Maybe a) -> Consumer Event m [a]
- manyIgnore :: Monad m => Consumer Event m (Maybe a) -> Consumer Event m (Maybe ()) -> Consumer Event m [a]
- many' :: MonadThrow m => Consumer Event m (Maybe a) -> Consumer Event m [a]
- force :: MonadThrow m => String -> m (Maybe a) -> m a
- manyYield :: Monad m => ConduitM a b m (Maybe b) -> Conduit a m b
- manyIgnoreYield :: MonadThrow m => ConduitM Event b m (Maybe b) -> Consumer Event m (Maybe ()) -> Conduit Event m b
- manyYield' :: MonadThrow m => ConduitM Event b m (Maybe b) -> Conduit Event m b
- takeAllTreesContent :: MonadThrow m => Conduit Event m Event
- data XmlException
- = XmlException { }
- | InvalidEndElement Name (Maybe Event)
- | InvalidEntity String (Maybe Event)
- | MissingAttribute String
- | UnparsedAttributes [(Name, [Content])]
- data PositionRange :: *
- type EventPos = (Maybe PositionRange, Event)
Parsing XML files
parseBytes :: MonadThrow m => ParseSettings -> Conduit ByteString m Event Source #
Parses a byte stream into Event
s. This function is implemented fully in
Haskell using attoparsec-text for parsing. The produced error messages do
not give line/column information, so you may prefer to stick with the parser
provided by libxml-enumerator. However, this has the advantage of not
relying on any C libraries.
This relies on detectUtf
to determine character encoding, and parseText'
to do the actual parsing.
parseBytesPos :: MonadThrow m => ParseSettings -> Conduit ByteString m EventPos Source #
parseText' :: MonadThrow m => ParseSettings -> Conduit Text m Event Source #
Parses a character stream into Event
s. This function is implemented
fully in Haskell using attoparsec-text for parsing. The produced error
messages do not give line/column information, so you may prefer to stick
with the parser provided by libxml-enumerator. However, this has the
advantage of not relying on any C libraries.
Since 1.2.4
parseText :: MonadThrow m => ParseSettings -> Conduit Text m EventPos Source #
Deprecated: Please use parseText'
or parseTextPos
.
parseTextPos :: MonadThrow m => ParseSettings -> Conduit Text m EventPos Source #
Same as parseText'
, but includes the position of each event.
Since 1.2.4
detectUtf :: MonadThrow m => Conduit ByteString m Text Source #
Automatically determine which UTF variant is being used. This function first checks for BOMs, removing them as necessary, and then check for the equivalent of <?xml for each of UTF-8, UTF-16LEBE, and UTF-32LEBE. It defaults to assuming UTF-8.
parseFile :: MonadResource m => ParseSettings -> FilePath -> Producer m Event Source #
A helper function which reads a file from disk using enumFile
, detects
character encoding using detectUtf
, parses the XML using parseBytes
, and
then hands off control to your supplied parser.
parseLBS :: MonadThrow m => ParseSettings -> ByteString -> Producer m Event Source #
Parse an event stream from a lazy ByteString
.
Parser settings
type DecodeEntities = Text -> Content Source #
psRetainNamespaces :: ParseSettings -> Bool Source #
Whether the original xmlns attributes should be retained in the parsed values. For more information on motivation, see:
https://github.com/snoyberg/xml/issues/38
Default: False
Since 1.2.1
Entity decoding
decodeXmlEntities :: DecodeEntities Source #
Default implementation of DecodeEntities
: handles numeric entities and
the five standard character entities (lt, gt, amp, quot, apos).
decodeHtmlEntities :: DecodeEntities Source #
HTML4-compliant entity decoder. Handles numerics, the five standard character entities, and the additional 248 entities defined by HTML 4 and XHTML 1.
Note that HTML 5 introduces a drastically larger number of entities, and this code does not recognize most of them.
Event parsing
:: MonadThrow m | |
=> (Name -> Maybe a) | Check if this is a correct tag name
and return a value that can be used to get an |
-> (a -> AttrParser b) | Given the value returned by the name checker, this function will
be used to get an |
-> (b -> ConduitM Event o m c) | Handler function to handle the attributes and children
of a tag, given the value return from the |
-> ConduitM Event o m (Maybe c) |
Deprecated: The signature of this function will change in next release.
The most generic way to parse a tag. It takes a predicate for checking if
this is the correct tag name, an AttrParser
for handling attributes, and
then a parser for dealing with content.
Events
are consumed if and only if the predicate holds.
This function automatically absorbs its balancing closing tag, and will
throw an exception if not all of the attributes or child elements are
consumed. If you want to allow extra attributes, see ignoreAttrs
.
This function automatically ignores comments, instructions and whitespace.
:: MonadThrow m | |
=> (Name -> Bool) | Name predicate that returns |
-> AttrParser a | The attribute parser to be used for tags matching the predicate |
-> (a -> ConduitM Event o m b) | Handler function to handle the attributes and children
of a tag, given the value return from the |
-> ConduitM Event o m (Maybe b) |
Deprecated: This function will be removed in next release.
A simplified version of tag
which matches against boolean predicates.
:: MonadThrow m | |
=> Name | The tag name this parser matches to (includes namespaces) |
-> AttrParser a | The attribute parser to be used for tags matching the predicate |
-> (a -> ConduitM Event o m b) | Handler function to handle the attributes and children
of a tag, given the value return from the |
-> ConduitM Event o m (Maybe b) |
Deprecated: This function will be removed in next release.
A simplified version of tag
which matches for specific tag names instead
of taking a predicate function. This is often sufficient, and when combined
with OverloadedStrings and the IsString instance of Name
, can prove to be
very concise.
.
Note that Name
is namespace sensitive. When using the IsString
instance of name,
use
> "{http:/ab}c" :: Name
to match the tag c
in the XML namespace http://a/b
:: MonadThrow m | |
=> Name | The name this parser matches to |
-> ConduitM Event o m a | Handler function to handle the children of the matched tag |
-> ConduitM Event o m (Maybe a) |
Deprecated: The signature of this function will change in next release.
A further simplified tag parser, which requires that no attributes exist.
:: MonadThrow m | |
=> Name | The name this parser matches to |
-> ConduitM Event o m a | Handler function to handle the children of the matched tag |
-> ConduitM Event o m (Maybe a) |
Deprecated: The signature of this function will change in next release.
A further simplified tag parser, which ignores all attributes, if any exist
tagPredicateIgnoreAttrs Source #
:: MonadThrow m | |
=> (Name -> Bool) | The name predicate this parser matches to |
-> ConduitM Event o m a | Handler function to handle the children of the matched tag |
-> ConduitM Event o m (Maybe a) |
Deprecated: This function will be removed in next release.
A further simplified tag parser, which ignores all attributes, if any exist
content :: MonadThrow m => Consumer Event m Text Source #
Grabs the next piece of content. If none if available, returns empty
.
This is simply a wrapper around contentMaybe
.
contentMaybe :: MonadThrow m => Consumer Event m (Maybe Text) Source #
Grabs the next piece of content if available. This function skips over any comments and instructions and concatenates all content until the next start or end tag.
Ignoring tags/trees
Deprecated: The signature of this function will change in next release.
Ignore an empty tag and all of its attributes by predicate.
This does not ignore the tag recursively
(i.e. it assumes there are no child elements).
This functions returns Just
if the tag matched.
:: MonadThrow m | |
=> Name | The name to match to |
-> ConduitM Event o m (Maybe ()) |
Deprecated: This function will be removed in next release.
Like ignoreTag
, but matches an exact name
:: MonadThrow m | |
=> [Name] | The name to match to |
-> ConduitM Event o m (Maybe ()) |
Deprecated: This function will be removed in next release.
Like ignoreTagName
, but matches any name from a list of names.
ignoreAllTags :: MonadThrow m => ConduitM Event o m (Maybe ()) Source #
Deprecated: This function will be removed in next release.
Like ignoreTag
, but matches all tag name.
ignoreAllTags = ignoreTag (const True)
Deprecated: The signature of this function will change in next release.
Ignore an empty tag, its attributes and its children subtree recursively.
Both content and text events are ignored.
This functions returns Just
if the tag matched.
ignoreTreeName :: MonadThrow m => Name -> ConduitM Event o m (Maybe ()) Source #
Deprecated: This function will be removed in next release.
Like ignoreTagName
, but also ignores non-empty tabs
:: MonadThrow m | |
=> [Name] | The name to match to |
-> ConduitM Event o m (Maybe ()) |
Deprecated: This function will be removed in next release.
Like ignoreTagName
, but matches any name from a list of names.
ignoreAllTrees :: MonadThrow m => ConduitM Event o m (Maybe ()) Source #
Deprecated: This function will be removed in next release.
Like ignoreAllTags
, but ignores entire subtrees.
ignoreAllTrees = ignoreTree (const True)
ignoreAllTreesContent :: MonadThrow m => ConduitM Event o m (Maybe ()) Source #
Deprecated: This function will be renamed into ignoreAnyTreeContent
in next release.
Like ignoreAllTrees
, but also ignores all content events
Attribute parsing
data AttrParser a Source #
A monad for parsing attributes. By default, it requires you to deal with
all attributes present on an element, and will throw an exception if there
are unhandled attributes. Use the requireAttr
, attr
et al
functions for handling an attribute, and ignoreAttrs
if you would like to
skip the rest of the attributes on an element.
Alternative
instance behaves like First
monoid: it chooses first
parser which doesn't fail.
requireAttr :: Name -> AttrParser Text Source #
optionalAttr :: Name -> AttrParser (Maybe Text) Source #
Deprecated: Please use attr
.
requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b Source #
optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b) Source #
ignoreAttrs :: AttrParser () Source #
Skip the remaining attributes on an element. Since this will clear the
list of attributes, you must call this after any calls to requireAttr
,
optionalAttr
, etc.
Combinators
many :: Monad m => Consumer Event m (Maybe a) -> Consumer Event m [a] Source #
Keep parsing elements as long as the parser returns Just
.
manyIgnore :: Monad m => Consumer Event m (Maybe a) -> Consumer Event m (Maybe ()) -> Consumer Event m [a] Source #
many' :: MonadThrow m => Consumer Event m (Maybe a) -> Consumer Event m [a] Source #
Like many
, but any tags and content the consumer doesn't match on
are silently ignored.
:: MonadThrow m | |
=> String | Error message |
-> m (Maybe a) | Optional parser to be forced |
-> m a |
Streaming combinators
manyYield' :: MonadThrow m => ConduitM Event b m (Maybe b) -> Conduit Event m b Source #
takeAllTreesContent :: MonadThrow m => Conduit Event m Event Source #
Deprecated: This function will be removed in next release.
Like ignoreAllTreesContent
, but stream the corresponding Event
s rather than ignoring them.
Incomplete elements (without a closing-tag) will trigger an XmlException
.
>>>
runResourceT $ parseLBS def "text<a></a>" $$ takeAllTreesContent =$= consume
Just [ EventContent (ContentText "text"), EventBeginElement "a" [], EventEndElement "a"]
>>>
runResourceT $ parseLBS def "</a><b></b>" $$ takeAllTreesContent =$= consume
Just [ ]
>>>
runResourceT $ parseLBS def "<b><c></c></b></a>text" $$ takeAllTreesContent =$= consume
Just [ EventBeginElement "b" [], EventBeginElement "c" [], EventEndElement "c", EventEndElement "b" ]
Since 1.4.0
Exceptions
data XmlException Source #
Other types
data PositionRange :: * #