{-# LANGUAGE DeriveDataTypeable, TypeSynonymInstances #-} -- hexpat, a Haskell wrapper for expat -- Copyright (C) 2008 Evan Martin -- Copyright (C) 2009 Stephen Blackheath -- | This module provides functions to parse an XML document to a tree structure, -- either strictly or lazily, as well as a lazy SAX-style interface. -- -- The 'GenericXMLString' type class allows you to use any string type. Three -- string types are provided for here: 'String', 'ByteString' and 'Text'. -- -- Here is a complete example to get you started: -- -- > -- | A "hello world" example of hexpat that lazily parses a document, printing -- > -- it to standard out. -- > -- > import Text.XML.Expat.Tree -- > import Text.XML.Expat.Format -- > import System.Environment -- > import System.Exit -- > import System.IO -- > import qualified Data.ByteString.Lazy as L -- > -- > main = do -- > args <- getArgs -- > case args of -- > [filename] -> process filename -- > otherwise -> do -- > hPutStrLn stderr "Usage: helloworld " -- > exitWith $ ExitFailure 1 -- > -- > process :: String -> IO () -- > process filename = do -- > inputText <- L.readFile filename -- > -- Note: Because we're not using the tree, Haskell can't infer the type of -- > -- strings we're using so we need to tell it explicitly with a type signature. -- > let (xml, mErr) = parse defaultParserOptions inputText :: (UNode String, Maybe XMLParseError) -- > -- Process document before handling error, so we get lazy processing. -- > L.hPutStr stdout $ format xml -- > putStrLn "" -- > case mErr of -- > Nothing -> return () -- > Just err -> do -- > hPutStrLn stderr $ "XML parse failed: "++show err -- > exitWith $ ExitFailure 2 -- -- Error handling in strict parses is very straight forward - just check the -- 'Either' return value. Lazy parses are not so simple. Here are two working -- examples that illustrate the ways to handle errors. Here they are: -- -- Way no. 1 - Using a Maybe value -- -- > import Text.XML.Expat.Tree -- > import qualified Data.ByteString.Lazy as L -- > import Data.ByteString.Internal (c2w) -- > -- > -- This is the recommended way to handle errors in lazy parses -- > main = do -- > let (tree, mError) = parse defaultParserOptions -- > (L.pack $ map c2w $ "") -- > print (tree :: UNode String) -- > -- > -- Note: We check the error _after_ we have finished our processing -- > -- on the tree. -- > case mError of -- > Just err -> putStrLn $ "It failed : "++show err -- > Nothing -> putStrLn "Success!" -- -- Way no. 2 - Using exceptions -- -- 'parseThrowing' can throw an exception from pure code, which is generally a bad -- way to handle errors, because Haskell\'s lazy evaluation means it\'s hard to -- predict where it will be thrown from. However, it may be acceptable in -- situations where it's not expected during normal operation, depending on the -- design of your program. -- -- > ... -- > import Control.Exception.Extensible as E -- > -- > -- This is not the recommended way to handle errors. -- > main = do -- > do -- > let tree = parseThrowing defaultParserOptions -- > (L.pack $ map c2w $ "") -- > print (tree :: UNode String) -- > -- Because of lazy evaluation, you should not process the tree outside -- > -- the 'do' block, or exceptions could be thrown that won't get caught. -- > `E.catch` (\exc -> -- > case E.fromException exc of -- > Just (XMLParseException err) -> putStrLn $ "It failed : "++show err -- > Nothing -> E.throwIO exc) module Text.XML.Expat.Tree ( -- * Tree structure Node(..), Nodes, Attributes, UNode, UNodes, UAttributes, textContent, isElement, isNamed, isText, getAttribute, getChildren, modifyChildren, -- * Parse functions ParserOptions(..), defaultParserOptions, parse, parse', parseThrowing, -- * Deprecated parse functions parseTree, parseTree', parseSAX, parseSAXLocations, parseTreeThrowing, parseSAXThrowing, parseSAXLocationsThrowing, extractText, -- * Parse to tree Encoding(..), XMLParseError(..), XMLParseLocation(..), -- * SAX-style parse SAXEvent(..), saxToTree, -- * Variants that throw exceptions XMLParseException(..), -- * Abstraction of string types GenericXMLString(..) ) where ------------------------------------------------------------------------------ import Text.XML.Expat.IO hiding (parse,parse') import qualified Text.XML.Expat.IO as IO import Text.XML.Expat.SAX ( Encoding(..) , ParserOptions(..) , XMLParseException(..) , XMLParseError(..) , XMLParseLocation(..) , SAXEvent(..) , defaultParserOptions , mkText , parseSAX , parseSAXLocations , parseSAXLocationsThrowing , parseSAXThrowing , GenericXMLString(..) ) import qualified Text.XML.Expat.SAX as SAX ------------------------------------------------------------------------------ import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Internal as I import Data.IORef import qualified Data.Monoid as M import Data.Monoid import Control.Parallel.Strategies import Control.Monad import System.IO.Unsafe import Foreign.C.String import Foreign.Ptr -- | The tree representation of the XML document. data Node tag text = Element { eName :: !tag, eAttrs :: ![(tag,text)], eChildren :: [Node tag text] } | Text !text deriving (Eq, Show) instance (NFData tag, NFData text) => NFData (Node tag text) where rnf (Element nam att chi) = rnf (nam, att, chi) rnf (Text txt) = rnf txt -- | Type shortcut for attributes type Attributes tag text = [(tag, text)] -- | DEPRECATED: Use [Node tag text] instead. -- -- Type shortcut for nodes. type Nodes tag text = [Node tag text] {-# DEPRECATED Nodes "use [Node tag text] instead" #-} -- | DEPRECATED: Use [UNode text] instead. -- -- Type shortcut for nodes with unqualified tag names where tag and -- text are the same string type. Deprecated type UNodes text = Nodes text text {-# DEPRECATED UNodes "use [UNode text] instead" #-} -- | Type shortcut for a single node with unqualified tag names where tag and -- text are the same string type. type UNode text = Node text text -- | Type shortcut for attributes with unqualified names where tag and -- text are the same string type. type UAttributes text = Attributes text text -- | Extract all text content from inside a tag into a single string, including -- any text contained in children. textContent :: Monoid text => Node tag text -> text textContent (Element _ _ children) = mconcat $ map textContent children textContent (Text txt) = txt -- | DEPRECATED: Renamed to 'textContent'. extractText :: Monoid text => Node tag text -> text {-# DEPRECATED extractText "renamed to textContent" #-} extractText = textContent -- | Is the given node an element? isElement :: Node tag text -> Bool isElement (Element _ _ _) = True isElement _ = False -- | Is the given node text? isText :: Node tag text -> Bool isText (Text _) = True isText _ = False -- | Is the given node a tag with the given name? isNamed :: (Eq tag) => tag -> Node tag text -> Bool isNamed _ (Text _) = False isNamed nm (Element nm' _ _) = nm == nm' -- | Get the value of the attribute having the specified name. getAttribute :: GenericXMLString tag => Node tag text -> tag -> Maybe text getAttribute n t = lookup t $ eAttrs n -- | Get children of a node if it's an element, return empty list otherwise. getChildren :: Node tag text -> [Node tag text] getChildren (Text _) = [] getChildren (Element _ _ ch) = ch -- | Modify a node's children using the specified function. modifyChildren :: ([Node tag text] -> [Node tag text]) -> Node tag text -> Node tag text modifyChildren _ node@(Text _) = node modifyChildren f (Element n a c) = Element n a (f c) setEntityDecoder :: (GenericXMLString tag, GenericXMLString text) => Parser -> IORef [Node tag text] -> (tag -> Maybe text) -> IO () setEntityDecoder parser queueRef decoder = do setUseForeignDTD parser True setExternalEntityRefHandler parser eh setSkippedEntityHandler parser skip where text str (cur:rest) = modifyChildren (Text str:) cur : rest text _ [] = undefined skip _ 1 = return False skip entityName 0 = do en <- mkText entityName let mbt = decoder en maybe (return False) (\t -> do modifyIORef queueRef $ text t return True) mbt skip _ _ = undefined eh p ctx _ systemID publicID = if systemID == nullPtr && publicID == nullPtr then withCStringLen "" $ \c -> do parseExternalEntityReference p ctx Nothing c else return False -- | Strictly parse XML to tree. Returns error message or valid parsed tree. parse' :: (GenericXMLString tag, GenericXMLString text) => ParserOptions tag text -- ^ Parser options -> ByteString -- ^ Input text (a strict ByteString) -> Either XMLParseError (Node tag text) parse' opts doc = unsafePerformIO $ runParse where runParse = do let enc = parserEncoding opts let mEntityDecoder = entityDecoder opts parser <- newParser enc -- We maintain the invariant that the stack always has one element, -- whose only child at the end of parsing is the root of the actual tree. let emptyString = gxFromString "" stack <- newIORef [Element emptyString [] []] maybe (return ()) (setEntityDecoder parser stack) mEntityDecoder setStartElementHandler parser $ \cName cAttrs -> do name <- mkText cName attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do attrName <- mkText cAttrName attrValue <- mkText cAttrValue return (attrName, attrValue) modifyIORef stack (start name attrs) return True setEndElementHandler parser $ \_ -> do modifyIORef stack end return True setCharacterDataHandler parser $ \cText -> do txt <- gxFromCStringLen cText modifyIORef stack (text txt) return True mError <- IO.parse' parser doc case mError of Just err -> return $ Left err Nothing -> do [Element _ _ [root]] <- readIORef stack return $ Right root start name attrs stack = Element name attrs [] : stack text str (cur:rest) = modifyChildren (Text str:) cur : rest text _ [] = impossible end (cur:parent:rest) = let node = modifyChildren reverse cur in modifyChildren (node:) parent : rest end _ = impossible impossible = error "parse' impossible" -- | DEPRECATED: use 'parse' instead. -- -- Strictly parse XML to tree. Returns error message or valid parsed tree. parseTree' :: (GenericXMLString tag, GenericXMLString text) => Maybe Encoding -- ^ Optional encoding override -> ByteString -- ^ Input text (a strict ByteString) -> Either XMLParseError (Node tag text) {-# DEPRECATED parseTree' "use Text.XML.Expat.parse' instead" #-} parseTree' enc = parse' (ParserOptions enc Nothing) -- | A lower level function that lazily converts a SAX stream into a tree structure. saxToTree :: GenericXMLString tag => [SAXEvent tag text] -> (Node tag text, Maybe XMLParseError) saxToTree events = let (nodes, mError, _) = ptl events in (safeHead nodes, mError) where safeHead (a:_) = a safeHead [] = Element (gxFromString "") [] [] ptl (StartElement name attrs:rema) = let (children, err1, rema') = ptl rema elt = Element name attrs children (out, err2, rema'') = ptl rema' in (elt:out, err1 `mplus` err2, rema'') ptl (EndElement _:rema) = ([], Nothing, rema) ptl (CharacterData txt:rema) = let (out, err, rema') = ptl rema in (Text txt:out, err, rema') ptl (FailDocument err:_) = ([], Just err, []) ptl [] = ([], Nothing, []) -- | Lazily parse XML to tree. Note that forcing the XMLParseError return value -- will force the entire parse. Therefore, to ensure lazy operation, don't -- check the error status until you have processed the tree. parse :: (GenericXMLString tag, GenericXMLString text) => ParserOptions tag text -- ^ Parser options -> L.ByteString -- ^ Input text (a lazy ByteString) -> (Node tag text, Maybe XMLParseError) parse opts bs = saxToTree $ SAX.parse opts bs -- | DEPREACTED: Use 'parse' instead. -- -- Lazily parse XML to tree. Note that forcing the XMLParseError return value -- will force the entire parse. Therefore, to ensure lazy operation, don't -- check the error status until you have processed the tree. parseTree :: (GenericXMLString tag, GenericXMLString text) => Maybe Encoding -- ^ Optional encoding override -> L.ByteString -- ^ Input text (a lazy ByteString) -> (Node tag text, Maybe XMLParseError) {-# DEPRECATED parseTree "use Text.XML.Expat.Tree.parse instead" #-} parseTree mEnc = parse (ParserOptions mEnc Nothing) -- | Lazily parse XML to tree. In the event of an error, throw 'XMLParseException'. -- -- @parseThrowing@ can throw an exception from pure code, which is generally a bad -- way to handle errors, because Haskell\'s lazy evaluation means it\'s hard to -- predict where it will be thrown from. However, it may be acceptable in -- situations where it's not expected during normal operation, depending on the -- design of your program. parseThrowing :: (GenericXMLString tag, GenericXMLString text) => ParserOptions tag text -- ^ Parser options -> L.ByteString -- ^ Input text (a lazy ByteString) -> Node tag text parseThrowing opts bs = fst $ saxToTree $ SAX.parseThrowing opts bs -- | DEPRECATED: Use 'parseThrowing' instead. -- -- Lazily parse XML to tree. In the event of an error, throw 'XMLParseException'. parseTreeThrowing :: (GenericXMLString tag, GenericXMLString text) => Maybe Encoding -- ^ Optional encoding override -> L.ByteString -- ^ Input text (a lazy ByteString) -> Node tag text {-# DEPRECATED parseTreeThrowing "use Text.XML.Expat.Tree.parseThrowing instead" #-} parseTreeThrowing mEnc = parseThrowing (ParserOptions mEnc Nothing)