{-# LANGUAGE FlexibleInstances, MultiParamTypeClasses, TypeFamilies, FlexibleContexts, EmptyDataDecls #-} -- | An extended variant of /Node/ intended to implement the entire XML -- specification. DTDs are not yet supported, however. -- -- The names conflict with those in /Tree/ so you must use qualified import -- if you want to use both modules. module Text.XML.Expat.Extended ( -- * Tree structure Document, DocumentG(..), Node, NodeG(..), UDocument, LDocument, ULDocument, UNode, LNode, ULNode, -- * Generic document/node manipulation module Text.XML.Expat.Internal.DocumentClass, module Text.XML.Expat.Internal.NodeClass, -- * Annotation-specific modifyAnnotation, mapAnnotation, mapDocumentAnnotation, -- * Qualified nodes QDocument, QLDocument, QNode, QLNode, module Text.XML.Expat.Internal.Qualified, -- * Namespaced nodes NDocument, NLDocument, NNode, NLNode, module Text.XML.Expat.Internal.Namespaced, -- * Parse to tree ParseOptions(..), defaultParseOptions, Encoding(..), parse, parse', XMLParseError(..), XMLParseLocation(..), -- * Variant that throws exceptions parseThrowing, XMLParseException(..), -- * Convert from SAX saxToTree, -- * Abstraction of string types GenericXMLString(..) ) where import Control.Arrow import Text.XML.Expat.SAX ( Encoding(..) , GenericXMLString(..) , ParseOptions(..) , defaultParseOptions , SAXEvent , XMLParseError(..) , XMLParseException(..) , XMLParseLocation(..) ) import qualified Text.XML.Expat.SAX as SAX import Text.XML.Expat.Internal.DocumentClass import Text.XML.Expat.Internal.Namespaced import Text.XML.Expat.Internal.NodeClass import Text.XML.Expat.Internal.Qualified import Control.Monad (mplus, mzero) import Control.DeepSeq import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import Data.List.Class import Data.Maybe import Data.Monoid -- | Document representation of the XML document, intended to support the entire -- XML specification. DTDs are not yet supported, however. data DocumentG a c tag text = Document { dXMLDeclaration :: Maybe (XMLDeclaration text), dDocumentTypeDeclaration :: Maybe (DocumentTypeDeclaration c tag text), dTopLevelMiscs :: c (Misc text), dRoot :: NodeG a c tag text } instance (Show tag, Show text, Show a) => Show (DocumentG a [] tag text) where showsPrec d (Document xd dtd m r) = showParen (d > 10) $ ("Document "++) . showsPrec 11 xd . (" "++) . showsPrec 11 dtd . (" "++) . showsPrec 11 m . (" "++) . showsPrec 11 r instance (Eq tag, Eq text, Eq a) => Eq (DocumentG a [] tag text) where Document xd1 dtd1 m1 r1 == Document xd2 dtd2 m2 r2 = xd1 == xd2 && dtd1 == dtd2 && m1 == m2 && r1 == r2 -- | A pure representation of an XML document that uses a list as its container type. type Document a tag text = DocumentG a [] tag text type instance NodeType (DocumentG ann) = NodeG ann instance (Functor c, List c) => DocumentClass (DocumentG ann) c where getXMLDeclaration = dXMLDeclaration getDocumentTypeDeclaration = dDocumentTypeDeclaration getTopLevelMiscs = dTopLevelMiscs getRoot = dRoot mkDocument = Document -- | Extended variant of the tree representation of the XML document, intended -- to support the entire XML specification. DTDs are not yet supported, however. -- -- @c@ is the container type for the element's children, which is [] in the -- @hexpat@ package, and a monadic list type for @hexpat-iteratee@. -- -- @tag@ is the tag type, which can either be one of several string types, -- or a special type from the @Text.XML.Expat.Namespaced@ or -- @Text.XML.Expat.Qualified@ modules. -- -- @text@ is the string type for text content. -- -- @a@ is the type of the annotation. One of the things this can be used for -- is to store the XML parse location, which is useful for error handling. -- -- Note that some functions in the @Text.XML.Expat.Cursor@ module need to create -- new nodes through the 'MkElementClass' type class. Normally this can only be done -- if @a@ is a Maybe type or () (so it can provide the Nothing value for the annotation -- on newly created nodes). Or, you can write your own 'MkElementClass' instance. -- Apart from that, there is no requirement for @a@ to be a Maybe type. data NodeG a c tag text = Element { eName :: !tag, eAttributes :: ![(tag,text)], eChildren :: c (NodeG a c tag text), eAnn :: a } | Text !text | CData !text | Misc (Misc text) type instance ListOf (NodeG a c tag text) = c (NodeG a c tag text) -- | A pure tree representation that uses a list as its container type, -- extended variant. -- -- In the @hexpat@ package, a list of nodes has the type @[Node tag text]@, but note -- that you can also use the more general type function 'ListOf' to give a list of -- any node type, using that node's associated list type, e.g. -- @ListOf (UNode Text)@. type Node a tag text = NodeG a [] tag text instance (Show tag, Show text, Show a) => Show (NodeG a [] tag text) where showsPrec d (Element na at ch an) = showParen (d > 10) $ ("Element "++) . showsPrec 11 na . (" "++) . showsPrec 11 at . (" "++) . showsPrec 11 ch . (" "++) . showsPrec 11 an showsPrec d (Text t) = showParen (d > 10) $ ("Text "++) . showsPrec 11 t showsPrec d (CData t) = showParen (d > 10) $ ("CData "++) . showsPrec 11 t showsPrec d (Misc m) = showParen (d > 10) $ ("Misc "++) . showsPrec 11 m instance (Eq tag, Eq text, Eq a) => Eq (NodeG a [] tag text) where Element na1 at1 ch1 an1 == Element na2 at2 ch2 an2 = na1 == na2 && at1 == at2 && ch1 == ch2 && an1 == an2 Text t1 == Text t2 = t1 == t2 CData t1 == CData t2 = t1 == t2 Misc t1 == Misc t2 = t1 == t2 _ == _ = False instance (NFData tag, NFData text, NFData a) => NFData (NodeG a [] tag text) where rnf (Element nam att chi ann) = rnf (nam, att, chi, ann) rnf (Text txt) = rnf txt rnf (CData txt) = rnf txt rnf (Misc m) = rnf m instance (Functor c, List c) => NodeClass (NodeG a) c where textContentM (Element _ _ children _) = foldlL mappend mempty $ joinM $ fmap textContentM children textContentM (Text txt) = return txt textContentM (CData txt) = return txt textContentM (Misc (ProcessingInstruction _ _)) = return mempty textContentM (Misc (Comment _)) = return mempty isElement (Element _ _ _ _) = True isElement _ = False isText (Text _) = True isText (CData _) = True isText _ = False isCData (CData _) = True isCData _ = False isProcessingInstruction (Misc (ProcessingInstruction _ _)) = True isProcessingInstruction _ = False isComment (Misc (Comment _)) = True isComment _ = False isNamed nm (Element nm' _ _ _) = nm == nm' isNamed _ _ = False getName (Element name _ _ _) = name getName _ = mempty hasTarget t (Misc (ProcessingInstruction t' _ )) = t == t' hasTarget _ _ = False getTarget (Misc (ProcessingInstruction target _)) = target getTarget _ = mempty getAttributes (Element _ attrs _ _) = attrs getAttributes _ = [] getChildren (Element _ _ ch _) = ch getChildren _ = mzero getText (Text txt) = txt getText (CData txt) = txt getText (Misc (ProcessingInstruction _ txt)) = txt getText (Misc (Comment txt)) = txt getText (Element _ _ _ _) = mempty modifyName f (Element n a c ann) = Element (f n) a c ann modifyName _ node = node modifyAttributes f (Element n a c ann) = Element n (f a) c ann modifyAttributes _ node = node modifyChildren f (Element n a c ann) = Element n a (f c) ann modifyChildren _ node = node mapAllTags f (Element n a c ann) = Element (f n) (map (first f) a) (fmap (mapAllTags f) c) ann mapAllTags _ (Text txt) = Text txt mapAllTags _ (CData txt) = CData txt mapAllTags _ (Misc (ProcessingInstruction n txt)) = Misc (ProcessingInstruction n txt) mapAllTags _ (Misc (Comment txt)) = Misc (Comment txt) modifyElement f (Element n a c ann) = let (n', a', c') = f (n, a, c) in Element n' a' c' ann modifyElement _ (Text txt) = Text txt modifyElement _ (CData txt) = CData txt modifyElement _ (Misc (ProcessingInstruction n txt)) = Misc (ProcessingInstruction n txt) modifyElement _ (Misc (Comment txt)) = Misc (Comment txt) mapNodeContainer f (Element n a ch an) = do ch' <- mapNodeListContainer f ch return $ Element n a ch' an mapNodeContainer _ (Text txt) = return $ (Text txt) mapNodeContainer _ (CData txt) = return $ (CData txt) mapNodeContainer _ (Misc (ProcessingInstruction n txt)) = return $ Misc (ProcessingInstruction n txt) mapNodeContainer _ (Misc (Comment txt)) = return $ Misc (Comment txt) mkText = Text instance (Functor c, List c) => MkElementClass (NodeG (Maybe a)) c where mkElement name attrs children = Element name attrs children Nothing instance (Functor c, List c) => MkElementClass (NodeG ()) c where mkElement name attrs children = Element name attrs children () -- | Type alias for an extended document with unqualified tag names where -- tag and text are the same string type type UDocument a text = Document a text text -- | Type alias for an extended document, annotated with parse location type LDocument tag text = Document XMLParseLocation tag text -- | Type alias for an extended document with unqualified tag names where -- tag and text are the same string type, annotated with parse location type ULDocument text = Document XMLParseLocation text text -- | Type alias for an extended document where qualified names are used for tags type QDocument a text = Document a (QName text) text -- | Type alias for an extended document where qualified names are used for tags, annotated with parse location type QLDocument text = Document XMLParseLocation (QName text) text -- | Type alias for an extended document where namespaced names are used for tags type NDocument a text = Document a (NName text) text -- | Type alias for an extended document where namespaced names are used for tags, annotated with parse location type NLDocument text = Document XMLParseLocation (NName text) text -- | Type alias for an extended node with unqualified tag names where -- tag and text are the same string type type UNode a text = Node a text text -- | Type alias for an extended node, annotated with parse location type LNode tag text = Node XMLParseLocation tag text -- | Type alias for an extended node with unqualified tag names where -- tag and text are the same string type, annotated with parse location type ULNode text = LNode text text -- | Type alias for an extended node where qualified names are used for tags type QNode a text = Node a (QName text) text -- | Type alias for an extended node where qualified names are used for tags, annotated with parse location type QLNode text = LNode (QName text) text -- | Type alias for an extended node where namespaced names are used for tags type NNode a text = Node a (NName text) text -- | Type alias for an extended node where namespaced names are used for tags, annotated with parse location type NLNode text = LNode (NName text) text -- | Modify this node's annotation (non-recursively) if it's an element, otherwise no-op. modifyAnnotation :: (a -> a) -> Node a tag text -> Node a tag text f `modifyAnnotation` Element na at ch an = Element na at ch (f an) _ `modifyAnnotation` Text t = Text t _ `modifyAnnotation` CData t = CData t _ `modifyAnnotation` Misc (ProcessingInstruction n t) = Misc (ProcessingInstruction n t) _ `modifyAnnotation` Misc (Comment t) = Misc (Comment t) -- | Modify this node's annotation and all its children recursively if it's an element, otherwise no-op. mapAnnotation :: (a -> b) -> Node a tag text -> Node b tag text f `mapAnnotation` Element na at ch an = Element na at (map (f `mapAnnotation`) ch) (f an) _ `mapAnnotation` Text t = Text t _ `mapAnnotation` CData t = CData t _ `mapAnnotation` Misc (ProcessingInstruction n t) = Misc (ProcessingInstruction n t) _ `mapAnnotation` Misc (Comment t) = Misc (Comment t) -- | Modify the annotation of every node in the document recursively. mapDocumentAnnotation :: (a -> b) -> Document a tag text -> Document b tag text mapDocumentAnnotation f doc = Document { dXMLDeclaration = dXMLDeclaration doc, dDocumentTypeDeclaration = dDocumentTypeDeclaration doc, dTopLevelMiscs = dTopLevelMiscs doc, dRoot = mapAnnotation f (dRoot doc) } -- | A lower level function that lazily converts a SAX stream into a tree structure. -- Variant that takes annotations for start tags. saxToTree :: (GenericXMLString tag, Monoid text) => [(SAXEvent tag text, a)] -> (Document a tag text, Maybe XMLParseError) saxToTree ((SAX.XMLDeclaration ver mEnc mSD, _):events) = let (doc, mErr) = saxToTree events in (doc { dXMLDeclaration = Just $ XMLDeclaration ver mEnc mSD }, mErr) saxToTree events = let (nodes, mError, _) = ptl events False [] doc = Document { dXMLDeclaration = Nothing, dDocumentTypeDeclaration = Nothing, dTopLevelMiscs = findTopLevelMiscs nodes, dRoot = findRoot nodes } in (doc, mError) where findRoot (elt@(Element _ _ _ _):_) = elt findRoot (_:nodes) = findRoot nodes findRoot [] = Element (gxFromString "") [] [] (error "saxToTree null annotation") findTopLevelMiscs = mapMaybe $ \node -> case node of Misc m -> Just m _ -> Nothing ptl ((SAX.StartElement name attrs,ann):rema) isCD cd = let (children, err1, rema') = ptl rema isCD cd elt = Element name attrs children ann (out, err2, rema'') = ptl rema' isCD cd in (elt:out, err1 `mplus` err2, rema'') ptl ((SAX.EndElement _, _):rema) _ _ = ([], Nothing, rema) ptl ((SAX.CharacterData txt, _):rema) isCD cd = if isCD then ptl rema isCD (txt:cd) else let (out, err, rema') = ptl rema isCD cd in (Text txt:out, err, rema') ptl ((SAX.StartCData,_) :rema) _ _ = ptl rema True mzero ptl ((SAX.EndCData, _) :rema) _ cd = let (out, err, rema') = ptl rema False mzero in (CData (mconcat $ reverse cd):out, err, rema') ptl ((SAX.Comment txt, _):rema) isCD cd = let (out, err, rema') = ptl rema isCD cd in (Misc (Comment txt):out, err, rema') ptl ((SAX.ProcessingInstruction target txt, _):rema) isCD cd = let (out, err, rema') = ptl rema isCD cd in (Misc (ProcessingInstruction target txt):out, err, rema') ptl ((SAX.FailDocument err, _):_) _ _ = ([], Just err, []) ptl ((SAX.XMLDeclaration _ _ _, _):rema) isCD cd = ptl rema isCD cd -- doesn't appear in the middle of a document ptl [] _ _ = ([], Nothing, []) -- | Lazily parse XML to tree. Note that forcing the XMLParseError return value -- will force the entire parse. Therefore, to ensure lazy operation, don't -- check the error status until you have processed the tree. parse :: (GenericXMLString tag, GenericXMLString text) => ParseOptions tag text -- ^ Parse options -> L.ByteString -- ^ Input text (a lazy ByteString) -> (LDocument tag text, Maybe XMLParseError) parse opts bs = saxToTree $ SAX.parseLocations opts bs -- | Lazily parse XML to tree. In the event of an error, throw 'XMLParseException'. -- -- @parseThrowing@ can throw an exception from pure code, which is generally a bad -- way to handle errors, because Haskell\'s lazy evaluation means it\'s hard to -- predict where it will be thrown from. However, it may be acceptable in -- situations where it's not expected during normal operation, depending on the -- design of your program. parseThrowing :: (GenericXMLString tag, GenericXMLString text) => ParseOptions tag text -- ^ Parse options -> L.ByteString -- ^ Input text (a lazy ByteString) -> LDocument tag text parseThrowing opts bs = fst $ saxToTree $ SAX.parseLocationsThrowing opts bs -- | Strictly parse XML to tree. Returns error message or valid parsed tree. parse' :: (GenericXMLString tag, GenericXMLString text) => ParseOptions tag text -- ^ Parse options -> B.ByteString -- ^ Input text (a strict ByteString) -> Either XMLParseError (LDocument tag text) parse' opts bs = case parse opts (L.fromChunks [bs]) of (_, Just err) -> Left err (root, Nothing) -> Right root