| Safe Haskell | None | 
|---|---|
| Language | Haskell98 | 
Text.XML.Stream.Parse
Contents
Description
This module provides both a native Haskell solution for parsing XML documents into a stream of events, and a set of parser combinators for dealing with a stream of events.
As a simple example:
>>>:set -XOverloadedStrings>>>import Data.Conduit (runConduit, (.|))>>>import Data.Text (Text, unpack)>>>import Data.XML.Types (Event)>>>data Person = Person Int Text Text deriving Show>>>:{let parsePerson :: MonadThrow m => ConduitT Event o m (Maybe Person) parsePerson = tag' "person" parseAttributes $ \(age, goodAtHaskell) -> do name <- content return $ Person (read $ unpack age) name goodAtHaskell where parseAttributes = (,) <$> requireAttr "age" <*> requireAttr "goodAtHaskell" <* ignoreAttrs parsePeople :: MonadThrow m => ConduitT Event o m (Maybe [Person]) parsePeople = tagNoAttr "people" $ many parsePerson inputXml = mconcat [ "<?xml version=\"1.0\" encoding=\"utf-8\"?>" , "<people>" , " <person age=\"25\" goodAtHaskell=\"yes\">Michael</person>" , " <person age=\"2\" goodAtHaskell=\"might become\">Eliezer</person>" , "</people>" ] :}
>>>runConduit $ parseLBS def inputXml .| force "people required" parsePeople[Person 25 "Michael" "yes",Person 2 "Eliezer" "might become"]
This module also supports streaming results using yield.
 This allows parser results to be processed using conduits
 while a particular parser (e.g. many) is still running.
 Without using streaming results, you have to wait until the parser finished
 before you can process the result list. Large XML files might be easier
 to process by using streaming results.
 See http://stackoverflow.com/q/21367423/2597135 for a related discussion.
>>>import Data.Conduit.List as CL>>>:{let parsePeople' :: MonadThrow m => ConduitT Event Person m (Maybe ()) parsePeople' = tagNoAttr "people" $ manyYield parsePerson :}
>>>runConduit $ parseLBS def inputXml .| force "people required" parsePeople' .| CL.mapM_ printPerson 25 "Michael" "yes" Person 2 "Eliezer" "might become"
Previous versions of this module contained a number of more sophisticated functions written by Aristid Breitkreuz and Dmitry Olshansky. To keep this package simpler, those functions are being moved to a separate package. This note will be updated with the name of the package(s) when available.
Synopsis
- parseBytes :: MonadThrow m => ParseSettings -> ConduitT ByteString Event m ()
- parseBytesPos :: MonadThrow m => ParseSettings -> ConduitT ByteString EventPos m ()
- parseText' :: MonadThrow m => ParseSettings -> ConduitT Text Event m ()
- parseText :: MonadThrow m => ParseSettings -> ConduitT Text EventPos m ()
- parseTextPos :: MonadThrow m => ParseSettings -> ConduitT Text EventPos m ()
- detectUtf :: MonadThrow m => ConduitT ByteString Text m ()
- parseFile :: MonadResource m => ParseSettings -> FilePath -> ConduitT i Event m ()
- parseLBS :: MonadThrow m => ParseSettings -> ByteString -> ConduitT i Event m ()
- data ParseSettings
- def :: Default a => a
- type DecodeEntities = Text -> Content
- type DecodeIllegalCharacters = Int -> Maybe Char
- psDecodeEntities :: ParseSettings -> DecodeEntities
- psDecodeIllegalCharacters :: ParseSettings -> DecodeIllegalCharacters
- psRetainNamespaces :: ParseSettings -> Bool
- decodeXmlEntities :: DecodeEntities
- decodeHtmlEntities :: DecodeEntities
- tag :: MonadThrow m => NameMatcher a -> (a -> AttrParser b) -> (b -> ConduitT Event o m c) -> ConduitT Event o m (Maybe c)
- tag' :: MonadThrow m => NameMatcher a -> AttrParser b -> (b -> ConduitT Event o m c) -> ConduitT Event o m (Maybe c)
- tagNoAttr :: MonadThrow m => NameMatcher a -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
- tagIgnoreAttrs :: MonadThrow m => NameMatcher a -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
- content :: MonadThrow m => ConduitT Event o m Text
- contentMaybe :: MonadThrow m => ConduitT Event o m (Maybe Text)
- ignoreTag :: MonadThrow m => NameMatcher a -> ConduitT Event o m (Maybe ())
- ignoreEmptyTag :: MonadThrow m => NameMatcher a -> ConduitT Event o m (Maybe ())
- ignoreTree :: MonadThrow m => NameMatcher a -> ConduitT Event o m (Maybe ())
- ignoreTreeContent :: MonadThrow m => NameMatcher a -> ConduitT Event o m (Maybe ())
- ignoreAnyTreeContent :: MonadThrow m => ConduitT Event o m (Maybe ())
- ignoreAllTreesContent :: MonadThrow m => ConduitT Event o m (Maybe ())
- takeContent :: MonadThrow m => ConduitT Event Event m (Maybe ())
- takeTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ())
- takeTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ())
- takeAnyTreeContent :: MonadThrow m => ConduitT Event Event m (Maybe ())
- takeAllTreesContent :: MonadThrow m => ConduitT Event Event m (Maybe ())
- newtype NameMatcher a = NameMatcher {- runNameMatcher :: Name -> Maybe a
 
- matching :: (Name -> Bool) -> NameMatcher Name
- anyOf :: [Name] -> NameMatcher Name
- anyName :: NameMatcher Name
- data AttrParser a
- attr :: Name -> AttrParser (Maybe Text)
- requireAttr :: Name -> AttrParser Text
- optionalAttr :: Name -> AttrParser (Maybe Text)
- requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b
- optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b)
- ignoreAttrs :: AttrParser ()
- orE :: Monad m => ConduitT Event o m (Maybe a) -> ConduitT Event o m (Maybe a) -> ConduitT Event o m (Maybe a)
- choose :: Monad m => [ConduitT Event o m (Maybe a)] -> ConduitT Event o m (Maybe a)
- many :: Monad m => ConduitT Event o m (Maybe a) -> ConduitT Event o m [a]
- many_ :: MonadThrow m => ConduitT Event o m (Maybe a) -> ConduitT Event o m ()
- manyIgnore :: Monad m => ConduitT Event o m (Maybe a) -> ConduitT Event o m (Maybe b) -> ConduitT Event o m [a]
- many' :: MonadThrow m => ConduitT Event o m (Maybe a) -> ConduitT Event o m [a]
- force :: MonadThrow m => String -> m (Maybe a) -> m a
- manyYield :: Monad m => ConduitT a b m (Maybe b) -> ConduitT a b m ()
- manyYield' :: MonadThrow m => ConduitT Event b m (Maybe b) -> ConduitT Event b m ()
- manyIgnoreYield :: MonadThrow m => ConduitT Event b m (Maybe b) -> ConduitT Event b m (Maybe ()) -> ConduitT Event b m ()
- data XmlException- = XmlException { }
- | InvalidEndElement Name (Maybe Event)
- | InvalidEntity String (Maybe Event)
- | MissingAttribute String
- | UnparsedAttributes [(Name, [Content])]
 
- data PositionRange
- type EventPos = (Maybe PositionRange, Event)
Parsing XML files
parseBytes :: MonadThrow m => ParseSettings -> ConduitT ByteString Event m () Source #
Parses a byte stream into Events. This function is implemented fully in
 Haskell using attoparsec-text for parsing. The produced error messages do
 not give line/column information, so you may prefer to stick with the parser
 provided by libxml-enumerator. However, this has the advantage of not
 relying on any C libraries.
This relies on detectUtf to determine character encoding, and parseText'
 to do the actual parsing.
parseBytesPos :: MonadThrow m => ParseSettings -> ConduitT ByteString EventPos m () Source #
parseText' :: MonadThrow m => ParseSettings -> ConduitT Text Event m () Source #
Parses a character stream into Events. This function is implemented
 fully in Haskell using attoparsec-text for parsing. The produced error
 messages do not give line/column information, so you may prefer to stick
 with the parser provided by libxml-enumerator. However, this has the
 advantage of not relying on any C libraries.
Since 1.2.4
parseText :: MonadThrow m => ParseSettings -> ConduitT Text EventPos m () Source #
Deprecated: Please use parseText' or parseTextPos.
parseTextPos :: MonadThrow m => ParseSettings -> ConduitT Text EventPos m () Source #
Same as parseText', but includes the position of each event.
Since 1.2.4
detectUtf :: MonadThrow m => ConduitT ByteString Text m () Source #
Automatically determine which UTF variant is being used. This function first checks for BOMs, removing them as necessary, and then check for the equivalent of <?xml for each of UTF-8, UTF-16LEBE, and UTF-32LEBE. It defaults to assuming UTF-8.
parseFile :: MonadResource m => ParseSettings -> FilePath -> ConduitT i Event m () Source #
A helper function which reads a file from disk using enumFile, detects
 character encoding using detectUtf, parses the XML using parseBytes, and
 then hands off control to your supplied parser.
parseLBS :: MonadThrow m => ParseSettings -> ByteString -> ConduitT i Event m () Source #
Parse an event stream from a lazy ByteString.
Parser settings
data ParseSettings Source #
Instances
| Default ParseSettings Source # | |
| Defined in Text.XML.Stream.Parse Methods def :: ParseSettings # | |
type DecodeEntities = Text -> Content Source #
psDecodeIllegalCharacters :: ParseSettings -> DecodeIllegalCharacters Source #
How to decode illegal character references (&#[0-9]+; or &#x[0-9a-fA-F]+;).
Character references within the legal ranges defined by the standard are automatically parsed. Others are passed to this function.
Default: const Nothing
Since 1.7.1
psRetainNamespaces :: ParseSettings -> Bool Source #
Whether the original xmlns attributes should be retained in the parsed values. For more information on motivation, see:
https://github.com/snoyberg/xml/issues/38
Default: False
Since 1.2.1
Entity decoding
decodeXmlEntities :: DecodeEntities Source #
Default implementation of DecodeEntities, which leaves the
 entity as-is. Numeric character references and the five standard
 entities (lt, gt, amp, quot, pos) are handled internally by the
 parser.
decodeHtmlEntities :: DecodeEntities Source #
HTML4-compliant entity decoder. Handles the additional 248 entities defined by HTML 4 and XHTML 1.
Note that HTML 5 introduces a drastically larger number of entities, and this code does not recognize most of them.
Event parsing
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name
   and return a value that can be used to get an  | 
| -> (a -> AttrParser b) | Given the value returned by the name checker, this function will
   be used to get an  | 
| -> (b -> ConduitT Event o m c) | Handler function to handle the attributes and children
   of a tag, given the value return from the  | 
| -> ConduitT Event o m (Maybe c) | 
The most generic way to parse a tag. It takes a NameMatcher to check whether
 this is a correct tag name, an AttrParser to handle attributes, and
 then a parser to deal with content.
Events are consumed if and only if the tag name and its attributes match.
This function automatically absorbs its balancing closing tag, and will
 throw an exception if not all of the attributes or child elements are
 consumed. If you want to allow extra attributes, see ignoreAttrs.
This function automatically ignores comments, instructions and whitespace.
tag' :: MonadThrow m => NameMatcher a -> AttrParser b -> (b -> ConduitT Event o m c) -> ConduitT Event o m (Maybe c) Source #
A simplified version of tag where the NameMatcher result isn't forwarded to the attributes parser.
Since 1.5.0
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m b | Handler function to handle the children of the matched tag | 
| -> ConduitT Event o m (Maybe b) | 
A further simplified tag parser, which requires that no attributes exist.
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m b | Handler function to handle the children of the matched tag | 
| -> ConduitT Event o m (Maybe b) | 
A further simplified tag parser, which ignores all attributes, if any exist
content :: MonadThrow m => ConduitT Event o m Text Source #
Grabs the next piece of content. If none if available, returns empty.
 This is simply a wrapper around contentMaybe.
contentMaybe :: MonadThrow m => ConduitT Event o m (Maybe Text) Source #
Grabs the next piece of content if available. This function skips over any comments and instructions and concatenates all content until the next start or end tag.
Ignoring tags/trees
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m (Maybe ()) | 
Deprecated: Please use ignoreEmptyTag.
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m (Maybe ()) | 
Ignore an empty tag and all of its attributes.
   This does not ignore the tag recursively
   (i.e. it assumes there are no child elements).
   This function returns Just () if the tag matched.
Since 1.5.0
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m (Maybe ()) | 
Deprecated: Please use ignoreTreeContent.
Arguments
| :: MonadThrow m | |
| => NameMatcher a | Check if this is a correct tag name | 
| -> ConduitT Event o m (Maybe ()) | 
Ignore a tag, its attributes and its children subtrees recursively.
   Both content and text events are ignored.
   This function returns Just () if the tag matched.
Since 1.5.0
ignoreAnyTreeContent :: MonadThrow m => ConduitT Event o m (Maybe ()) Source #
Like ignoreTreeContent, but matches any name and also ignores content events.
>>>:set -XOverloadedStrings>>>import Data.Conduit>>>import Data.Conduit.List (consume)
>>>runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreAnyTreeContent >> consume)[EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
>>>runConduit $ parseLBS def "text<b></b>" .| (ignoreAnyTreeContent >> consume)[EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
ignoreAllTreesContent :: MonadThrow m => ConduitT Event o m (Maybe ()) Source #
Deprecated: Please use ignoreAnyTreeContent.
Streaming events
takeContent :: MonadThrow m => ConduitT Event Event m (Maybe ()) Source #
takeTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ()) Source #
Stream Events corresponding to a single XML element that matches given NameMatcher and AttrParser, from the opening- to the closing-tag.
>>>:set -XOverloadedStrings>>>import Control.Monad (void)>>>import Data.Conduit>>>import Data.Conduit.List (consume)
>>>runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "a" ignoreAttrs) .| consume[EventBeginDocument,EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...})]
>>>runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "b" ignoreAttrs) .| consume[EventBeginDocument]
If next Event isn't an element, nothing is consumed.
>>>runConduit $ parseLBS def "text<a></a>" .| void (takeTree "a" ignoreAttrs) .| consume[EventBeginDocument]
If an opening-tag is consumed but no matching closing-tag is found, an XmlException is thrown.
>>>runConduit $ parseLBS def "<a><b></b>" .| void (takeTree "a" ignoreAttrs) .| consume*** Exception: InvalidEndElement (Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}) Nothing
This function automatically ignores comments, instructions and whitespace.
Returns Just () if an element was consumed, Nothing otherwise.
Since 1.5.0
takeTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ()) Source #
takeAnyTreeContent :: MonadThrow m => ConduitT Event Event m (Maybe ()) Source #
Like takeTreeContent, without checking for tag name or attributes.
>>>:set -XOverloadedStrings>>>import Control.Monad (void)>>>import Data.Conduit ((.|), runConduit)>>>import Data.Conduit.List (consume)
>>>runConduit $ parseLBS def "text<a></a>" .| void takeAnyTreeContent .| consume[EventBeginDocument,EventContent (ContentText "text")]
>>>runConduit $ parseLBS def "</a><b></b>" .| void takeAnyTreeContent .| consume[EventBeginDocument]
>>>runConduit $ parseLBS def "<b><c></c></b></a>text" .| void takeAnyTreeContent .| consume[EventBeginDocument,EventBeginElement (Name {nameLocalName = "b", ...}) [],EventBeginElement (Name {nameLocalName = "c", ...}) [],EventEndElement (Name {nameLocalName = "c", ...}),EventEndElement (Name {nameLocalName = "b", ...})]
Since 1.5.0
takeAllTreesContent :: MonadThrow m => ConduitT Event Event m (Maybe ()) Source #
Deprecated: Please use takeAnyTreeContent.
Tag name matching
newtype NameMatcher a Source #
A NameMatcher describes which names a tag parser is allowed to match.
Since 1.5.0
Constructors
| NameMatcher | |
| Fields 
 | |
Instances
matching :: (Name -> Bool) -> NameMatcher Name Source #
matching f matches name iff f name is true. Returns the matched Name.
Since 1.5.0
Attribute parsing
data AttrParser a Source #
A monad for parsing attributes. By default, it requires you to deal with
 all attributes present on an element, and will throw an exception if there
 are unhandled attributes. Use the requireAttr, attr et al
 functions for handling an attribute, and ignoreAttrs if you would like to
 skip the rest of the attributes on an element.
Alternative instance behaves like First monoid: it chooses first
 parser which doesn't fail.
Instances
requireAttr :: Name -> AttrParser Text Source #
optionalAttr :: Name -> AttrParser (Maybe Text) Source #
Deprecated: Please use attr.
requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b Source #
optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b) Source #
ignoreAttrs :: AttrParser () Source #
Skip the remaining attributes on an element. Since this will clear the
 list of attributes, you must call this after any calls to requireAttr,
 optionalAttr, etc.
Combinators
many :: Monad m => ConduitT Event o m (Maybe a) -> ConduitT Event o m [a] Source #
Keep parsing elements as long as the parser returns Just.
many_ :: MonadThrow m => ConduitT Event o m (Maybe a) -> ConduitT Event o m () Source #
Like many but discards the results without building an intermediate list.
Since 1.5.0
manyIgnore :: Monad m => ConduitT Event o m (Maybe a) -> ConduitT Event o m (Maybe b) -> ConduitT Event o m [a] Source #
many' :: MonadThrow m => ConduitT Event o m (Maybe a) -> ConduitT Event o m [a] Source #
Like many, but any tags and content the consumer doesn't match on
   are silently ignored.
Arguments
| :: MonadThrow m | |
| => String | Error message | 
| -> m (Maybe a) | Optional parser to be forced | 
| -> m a | 
Streaming combinators
manyYield' :: MonadThrow m => ConduitT Event b m (Maybe b) -> ConduitT Event b m () Source #
Like many', but uses yield so the result list can be streamed
   to downstream conduits without waiting for manyYield' to finish
Arguments
| :: MonadThrow m | |
| => ConduitT Event b m (Maybe b) | Consuming parser that generates the result stream | 
| -> ConduitT Event b m (Maybe ()) | Ignore parser that consumes elements to be ignored | 
| -> ConduitT Event b m () | 
Like manyIgnore, but uses yield so the result list can be streamed
   to downstream conduits without waiting for manyIgnoreYield to finish
Exceptions
data XmlException Source #
Constructors
| XmlException | |
| Fields | |
| InvalidEndElement Name (Maybe Event) | |
| InvalidEntity String (Maybe Event) | |
| MissingAttribute String | |
| UnparsedAttributes [(Name, [Content])] | |
Instances
| Show XmlException Source # | |
| Defined in Text.XML.Stream.Parse Methods showsPrec :: Int -> XmlException -> ShowS # show :: XmlException -> String # showList :: [XmlException] -> ShowS # | |
| Exception XmlException Source # | |
| Defined in Text.XML.Stream.Parse Methods toException :: XmlException -> SomeException # fromException :: SomeException -> Maybe XmlException # displayException :: XmlException -> String # | |
Other types
data PositionRange #
Instances
| Eq PositionRange | |
| Defined in Data.Conduit.Attoparsec Methods (==) :: PositionRange -> PositionRange -> Bool # (/=) :: PositionRange -> PositionRange -> Bool # | |
| Ord PositionRange | |
| Defined in Data.Conduit.Attoparsec Methods compare :: PositionRange -> PositionRange -> Ordering # (<) :: PositionRange -> PositionRange -> Bool # (<=) :: PositionRange -> PositionRange -> Bool # (>) :: PositionRange -> PositionRange -> Bool # (>=) :: PositionRange -> PositionRange -> Bool # max :: PositionRange -> PositionRange -> PositionRange # min :: PositionRange -> PositionRange -> PositionRange # | |
| Show PositionRange | |
| Defined in Data.Conduit.Attoparsec Methods showsPrec :: Int -> PositionRange -> ShowS # show :: PositionRange -> String # showList :: [PositionRange] -> ShowS # | |