{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE StandaloneDeriving #-} -- | Mid-level XML parsers, built on top of "Data.XML.Parser.Low": -- -- - some formatting details are abstracted away (e.g. quoting, whitespacing), therefore parsers are not reversible -- - entities delimited by an opening and closing sequence are recognized, except for tags which need a more complex, recursive logic -- - token parsers do not overlap, therefore XML document can be tokenized in a stateless way -- -- All documentation examples assume the following setup: -- -- > :set -XOverloadedStrings -- > import Data.Attoparsec.ByteString module Data.XML.Parser.Mid ( module Data.XML.Parser.Mid.Attribute , module Data.XML.Parser.Mid.Comment , module Data.XML.Parser.Mid.Doctype , module Data.XML.Parser.Mid.Instruction , XMLDeclaration(..) , StartTag(..) , EmptyElementTag(..) , Token(..) , TokenParser() , runTokenParser , tokenInstruction , tokenComment , tokenCdata , tokenDoctype , tokenXmlDeclaration , tokenStartTag , tokenEndTag , tokenEmptyElementTag , tokenData , anyToken ) where import Control.Applicative import Control.Arrow ((>>>)) import Control.Monad.Compat import Control.Monad.Fail.Compat import Data.Char import Data.Functor import Data.Maybe import Data.Text (Text) import qualified Data.Text as Text import Data.XML.Parser.Low import Data.XML.Parser.Mid.Attribute import Data.XML.Parser.Mid.Comment import Data.XML.Parser.Mid.Doctype import Data.XML.Parser.Mid.Instruction import Numeric import Text.Parser.Char import Text.Parser.Combinators -- $setup -- >>> :set -XOverloadedStrings -- >>> import Data.Attoparsec.ByteString data Token = TokenXMLDeclaration XMLDeclaration | TokenDoctype Doctype | TokenInstruction Instruction | TokenStartTag StartTag | TokenEndTag QName | TokenEmptyElementTag EmptyElementTag | TokenData [Content] | TokenComment Text | TokenCDATA Text deriving (Eq, Ord, Show) -- | data XMLDeclaration = XMLDeclaration Text (Maybe Text) (Maybe Bool) deriving (Eq, Ord, Read, Show) -- | data StartTag = StartTag QName [Attribute] deriving (Eq, Ord, Read, Show) -- | data EmptyElementTag = EmptyElementTag QName [Attribute] deriving (Eq, Ord, Read, Show) -- | A parser that consumes whole 'Token's. newtype TokenParser m a = TokenParser { runTokenParser :: m a } deriving instance Functor m => Functor (TokenParser m) deriving instance Applicative m => Applicative (TokenParser m) deriving instance Alternative m => Alternative (TokenParser m) deriving instance Monad m => Monad (TokenParser m) instance (Parsing m, Monad m) => MonadFail (TokenParser m) where fail = TokenParser . unexpected -- | tokenDoctype :: CharParsing m => Monad m => TokenParser m Doctype tokenDoctype = TokenParser doctype -- | tokenInstruction :: CharParsing m => Monad m => TokenParser m Instruction tokenInstruction = TokenParser instruction -- | tokenComment :: CharParsing m => Monad m => TokenParser m Text tokenComment = TokenParser comment -- | -- -- >>> parseOnly (runTokenParser tokenCdata) "Hello, world!]]>" -- Right "Hello, world!" tokenCdata :: CharParsing m => Monad m => TokenParser m Text tokenCdata = TokenParser $ do tokenCdataOpen content <- manyTill anyChar $ try tokenCdataClose return $ Text.pack content -- | -- -- >>> parseOnly (runTokenParser tokenXmlDeclaration) "" -- Right (XMLDeclaration "1.0" (Just "UTF-8") (Just True)) tokenXmlDeclaration :: CharParsing m => Monad m => TokenParser m XMLDeclaration tokenXmlDeclaration = TokenParser $ do tokenXmlDeclarationOpen tokenWhitespace Attribute key value <- attribute guard $ key == QName "" "version" version <- expandContents decodePredefinedEntities value encoding <- optional $ do tokenWhitespace Attribute key value <- attribute guard $ key == QName "" "encoding" expandContents decodePredefinedEntities value standalone <- optional $ do tokenWhitespace Attribute key value <- attribute guard $ key == QName "" "standalone" boolean <- expandContents decodePredefinedEntities value case boolean of "yes" -> return True "no" -> return False _ -> empty optional tokenWhitespace tokenXmlDeclarationClose return $ XMLDeclaration version encoding standalone -- | -- -- >>> parseOnly (runTokenParser tokenStartTag) "" -- Right (StartTag (QName {namePrefix = "", nameLocal = "termdef"}) [Attribute (QName {namePrefix = "", nameLocal = "id"}) [ContentText "dt-dog"],Attribute (QName {namePrefix = "", nameLocal = "term"}) [ContentText "dog"]]) -- >>> parse (runTokenParser tokenStartTag) "2003-12-13T18:30:02Z" -- Done "2003-12-13T18:30:02Z" (StartTag (QName {namePrefix = "", nameLocal = "updated"}) []) tokenStartTag :: CharParsing m => Monad m => TokenParser m StartTag tokenStartTag = TokenParser $ do name <- tokenStartTagOpen attributes <- many (tokenWhitespace >> attribute) optional tokenWhitespace tokenElementClose return $ StartTag name attributes -- | -- -- >>> parseOnly (runTokenParser tokenEndTag) "" -- Right (QName {namePrefix = "", nameLocal = "termdef"}) tokenEndTag :: CharParsing m => Monad m => TokenParser m QName tokenEndTag = TokenParser $ do name <- tokenEndTagOpen optional tokenWhitespace tokenElementClose return name -- | -- -- >>> parseOnly (runTokenParser tokenEmptyElementTag) "" -- Right (EmptyElementTag (QName {namePrefix = "", nameLocal = "IMG"}) [Attribute (QName {namePrefix = "", nameLocal = "align"}) [ContentText "left"],Attribute (QName {namePrefix = "", nameLocal = "src"}) [ContentText "http://www.w3.org/Icons/WWW/w3c_home"]]) tokenEmptyElementTag :: CharParsing m => Monad m => TokenParser m EmptyElementTag tokenEmptyElementTag = TokenParser $ do name <- tokenStartTagOpen attributes <- optional $ do tokenWhitespace attribute `sepBy` tokenWhitespace optional tokenWhitespace tokenEmptyElementTagClose return $ EmptyElementTag name $ fromMaybe mempty attributes -- | -- -- >>> parseOnly (runTokenParser tokenData) "Rock & roll" -- Right [ContentText "Rock ",ContentReference (EntityRef "amp"),ContentText " roll"] tokenData :: CharParsing m => Monad m => TokenParser m [Content] tokenData = TokenParser $ some (tokenContent "<") -- | Parse any 'Token'. anyToken :: CharParsing m => Monad m => TokenParser m Token anyToken = TokenDoctype <$> tokenDoctype <|> TokenInstruction <$> tokenInstruction <|> TokenComment <$> tokenComment <|> TokenCDATA <$> tokenCdata <|> TokenXMLDeclaration <$> tokenXmlDeclaration <|> TokenStartTag <$> tokenStartTag <|> TokenEndTag <$> tokenEndTag <|> TokenEmptyElementTag <$> tokenEmptyElementTag <|> TokenData <$> tokenData