-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Parser.XmlParsec Copyright : Copyright (C) 2005 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Xml Parsec parser with pure filter interface -} -- ------------------------------------------------------------ module Text.XML.HXT.Parser.XmlParsec ( charData , charData' , comment , pI , cDSect , document , document' , prolog , xMLDecl , xMLDecl' , versionInfo , misc , doctypedecl , markupdecl , sDDecl , element , content , contentWithTextDecl , textDecl , encodingDecl , xread , parseXmlContent , parseXmlDocEncodingSpec , parseXmlDocument , parseXmlDTDPart , parseXmlEncodingSpec , parseXmlEntityEncodingSpec , parseXmlEntityValueAsAttrValue , parseXmlEntityValueAsContent , parseXmlPart , parseXmlText , parseNMToken , parseName , removeEncodingSpec ) where import Control.Applicative ( (<$>) ) import Text.ParserCombinators.Parsec ( runParser , (), (<|>) , char , string , eof , between , many , many1 , notFollowedBy , option , try , unexpected , getPosition , getInput , sourceName ) import Text.XML.HXT.DOM.ShowXml ( xshow ) import Text.XML.HXT.DOM.Interface import Text.XML.HXT.DOM.XmlNode ( mkElement' , mkAttr' , mkRoot' , mkDTDElem' , mkText' , mkCmt' , mkCdata' , mkError' , mkPi' , isText , isRoot , getText , getChildren , getAttrl , getAttrName , changeAttrl , mergeAttrl ) import Text.XML.HXT.Parser.XmlCharParser ( xmlChar , XParser , SimpleXParser , XPState , withNormNewline , withoutNormNewline ) import qualified Text.XML.HXT.Parser.XmlTokenParser as XT import qualified Text.XML.HXT.Parser.XmlDTDTokenParser as XD import Control.FlatSeq import Data.Char (toLower) import Data.Maybe -- import Debug.Trace -- ------------------------------------------------------------ -- -- Character Data (2.4) charData :: XParser s XmlTrees charData = many (charData' <|> XT.referenceT) charData' :: XParser s XmlTree charData' = do t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>" return (mkText' t) -- ------------------------------------------------------------ -- -- Comments (2.5) comment :: XParser s XmlTree comment = comment'' $ XT.checkString "")) (XT.allBut many "--") return (mkCmt' c) ) "comment" -- ------------------------------------------------------------ -- -- Processing Instructions pI :: XParser s XmlTree pI = pI'' $ XT.checkString "> return ()) pI'' :: XParser s () -> XParser s XmlTree pI'' op = between op (string "?>") ( do n <- pITarget p <- option "" (XT.sPace >> XT.allBut many "?>" ) return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]]) ) "processing instruction" where pITarget :: XParser s String pITarget = ( do n <- XT.name if map toLower n == t_xml then unexpected n else return n ) -- ------------------------------------------------------------ -- -- CDATA Sections (2.7) cDSect :: XParser s XmlTree cDSect = cDSect'' $ XT.checkString "> return ()) cDSect'' :: XParser s () -> XParser s XmlTree cDSect'' op = do t <- between op (string "]]>") (XT.allBut many "]]>") return (mkCdata' t) "CDATA section" -- ------------------------------------------------------------ -- -- Document (2.1) and Prolog (2.8) document :: XParser s XmlTree document = do pos <- getPosition dl <- document' return (mkRoot' [ mkAttr' (mkName a_source) [mkText' (sourceName pos)] , mkAttr' (mkName a_status) [mkText' (show c_ok)] ] dl ) document' :: XParser s XmlTrees document' = do pl <- prolog el <- element ml <- many misc eof return (pl ++ [el] ++ ml) prolog :: XParser s XmlTrees prolog = do xml <- option [] xMLDecl' misc1 <- many misc dtdPart <- option [] doctypedecl misc2 <- many misc return (xml ++ misc1 ++ dtdPart ++ misc2) xMLDecl :: XParser s XmlTrees xMLDecl = between (try $ string "") ( do vi <- versionInfo ed <- option [] encodingDecl sd <- option [] sDDecl XT.skipS0 return (vi ++ ed ++ sd) ) "xml declaration" xMLDecl' :: XParser s XmlTrees xMLDecl' = do al <- xMLDecl return [mkPi' (mkName t_xml) al] xMLDecl'' :: XParser s XmlTree xMLDecl'' = do al <- option [] (try xMLDecl) return (mkRoot' al []) versionInfo :: XParser s XmlTrees versionInfo = ( do try ( XT.skipS >> XT.keyword a_version >> return () ) XT.eq vi <- XT.quoted XT.versionNum return [mkAttr' (mkName a_version) [mkText' vi]] ) "version info (with quoted version number)" misc :: XParser s XmlTree misc = comment <|> pI <|> ( ( do ws <- XT.sPace return (mkText' ws) ) "" ) -- ------------------------------------------------------------ -- -- Document Type definition (2.8) doctypedecl :: XParser s XmlTrees doctypedecl = between (try $ string "') ( do XT.skipS n <- XT.name exId <- option [] ( try ( do XT.skipS externalID ) ) XT.skipS0 markup <- option [] ( do m <- between (char '[' ) (char ']') markupOrDeclSep XT.skipS0 return m ) return [mkDTDElem' DOCTYPE ((a_name, n) : exId) markup] ) markupOrDeclSep :: XParser s XmlTrees markupOrDeclSep = ( do ll <- many ( markupdecl <|> declSep <|> XT.mkList conditionalSect ) return (concat ll) ) declSep :: XParser s XmlTrees declSep = XT.mkList XT.peReferenceT <|> ( do XT.skipS return [] ) markupdecl :: XParser s XmlTrees markupdecl = XT.mkList ( pI <|> comment <|> XD.dtdDeclTokenizer ) -- ------------------------------------------------------------ -- -- Standalone Document Declaration (2.9) sDDecl :: XParser s XmlTrees sDDecl = do try ( XT.skipS >> XT.keyword a_standalone >> return () ) XT.eq sd <- XT.quoted (XT.keywords [v_yes, v_no]) return [mkAttr' (mkName a_standalone) [mkText' sd]] -- ------------------------------------------------------------ -- -- element, tags and content (3, 3.1) element :: XParser s XmlTree element = char '<' >> element' element' :: XParser s XmlTree element' = ( do e <- elementStart rwnf e `seq` elementRest e -- evaluate name and attribute list before parsing contents ) "element" elementStart :: XParser s (QName, XmlTrees) elementStart = do n <- XT.name al <- attrList XT.skipS0 return (mkName n, al) where attrList = option [] ( do XT.skipS attrList' ) attrList' = option [] ( do a1 <- attribute al <- attrList let n = fromJust . getAttrName $ a1 if n `elem` map (fromJust . getAttrName) al then unexpected ( "attribute name " ++ show (qualifiedName n) ++ " occurs twice in attribute list" ) else return (a1 : al) ) elementRest :: (QName, XmlTrees) -> XParser s XmlTree elementRest (n, al) = ( do XT.checkString "/>" return $ mkElement' n al [] ) <|> ( do XT.gt c <- content eTag n return $ mkElement' n al c ) "proper attribute list followed by \"/>\" or \">\"" eTag :: QName -> XParser s () eTag n' = do XT.checkString " "" n <- XT.name XT.skipS0 XT.gt if n == qualifiedName n' then return () else unexpected ("illegal end tag found, expected") attribute :: XParser s XmlTree attribute = do n <- XT.name XT.eq v <- XT.attrValueT return $ mkAttr' (mkName n) v {- this parser corresponds to the XML spec but it's inefficent because of more than 1 char lookahead content :: XParser s XmlTrees content = do c1 <- charData cl <- many ( do l <- ( element <|> cDSect <|> pI <|> comment ) c <- charData return (l : c) ) return (c1 ++ concat cl) -} -- this simpler content parser does not need more than a single lookahead -- so no try parsers (inefficient) are neccessary content :: XParser s XmlTrees content = XT.mergeTextNodes <$> many ( ( do -- parse markup but no closing tags try ( XT.lt >> notFollowedBy (char '/') >> return () ) markup ) <|> charData' <|> XT.referenceT ) where markup = element' <|> pI' <|> ( char '!' >> ( comment' <|> cDSect' ) ) contentWithTextDecl :: XParser s XmlTrees contentWithTextDecl = option [] textDecl >> content -- ------------------------------------------------------------ -- -- Conditional Sections (3.4) -- -- conditional sections are parsed in two steps, -- first the whole content is detected, -- and then, after PE substitution include sections are parsed again conditionalSect :: XParser s XmlTree conditionalSect = do XT.checkString "" >> return "" ) <|> ( do XT.checkString "" ++ cs2) ) <|> ( do c <- xmlChar cs <- condSectCont return (c : cs) ) -- ------------------------------------------------------------ -- -- External Entities (4.2.2) externalID :: XParser s Attributes externalID = ( do _ <- XT.keyword k_system XT.skipS lit <- XT.systemLiteral return [(k_system, lit)] ) <|> ( do _ <- XT.keyword k_public XT.skipS pl <- XT.pubidLiteral XT.skipS sl <- XT.systemLiteral return [ (k_system, sl) , (k_public, pl) ] ) "SYSTEM or PUBLIC declaration" -- ------------------------------------------------------------ -- -- Text Declaration (4.3.1) textDecl :: XParser s XmlTrees textDecl = between (try $ string "") ( do vi <- option [] versionInfo ed <- encodingDecl XT.skipS0 return (vi ++ ed) ) "text declaration" textDecl'' :: XParser s XmlTree textDecl'' = do al <- option [] (try textDecl) return (mkRoot' al []) -- ------------------------------------------------------------ -- -- Encoding Declaration (4.3.3) encodingDecl :: XParser s XmlTrees encodingDecl = do try ( XT.skipS >> XT.keyword a_encoding >> return () ) XT.eq ed <- XT.quoted XT.encName return [mkAttr' (mkName a_encoding) [mkText' ed]] -- ------------------------------------------------------------ -- -- the main entry points: -- parsing the content of a text node -- or parsing the text children from a tag node -- | -- the inverse function to 'xshow', (for XML content). -- -- the string parameter is parsed with the XML content parser. -- result is the list of trees or in case of an error a single element list with the -- error message as node. No entity or character subtitution is done here, -- but the XML parser can do this for the predefined XML or the char references for performance reasons -- -- see also: 'parseXmlContent' xread :: String -> XmlTrees xread str = parseXmlFromString parser (withNormNewline ()) loc str where loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str) parser = do res <- content -- take the content parser for parsing the string eof -- and test on everything consumed return res -- | -- the filter version of 'xread' parseXmlContent :: XmlTree -> XmlTrees parseXmlContent = xread . xshow . (:[]) -- | -- a more general version of 'parseXmlContent'. -- The parser to be used and the context are extra parameter parseXmlText :: SimpleXParser XmlTrees -> XPState () -> String -> XmlTree -> XmlTrees parseXmlText p s0 loc = parseXmlFromString p s0 loc . xshow . (:[]) parseXmlDocument :: String -> String -> XmlTrees parseXmlDocument = parseXmlFromString document' (withNormNewline ()) parseXmlFromString :: SimpleXParser XmlTrees -> XPState () -> String -> String -> XmlTrees parseXmlFromString parser s0 loc = either ((:[]) . mkError' c_err . (++ "\n") . show) id . runParser parser s0 loc -- ------------------------------------------------------------ -- removeEncodingSpec :: XmlTree -> XmlTrees removeEncodingSpec t | isText t = ( either ((:[]) . mkError' c_err . (++ "\n") . show) ((:[]) . mkText') . runParser parser (withNormNewline ()) "remove encoding spec" . fromMaybe "" . getText ) t | otherwise = [t] where parser :: XParser s String parser = option [] textDecl >> getInput -- ------------------------------------------------------------ -- | -- general parser for parsing arbitray parts of a XML document parseXmlPart :: SimpleXParser XmlTrees -> String -> String -> XmlTree -> XmlTrees parseXmlPart parser expected context t = parseXmlText ( do res <- parser eof expected return res ) (withoutNormNewline ()) context $ t -- ------------------------------------------------------------ -- | -- Parser for parts of a DTD parseXmlDTDPart :: String -> XmlTree -> XmlTrees parseXmlDTDPart = parseXmlPart markupOrDeclSep "markup declaration" -- ------------------------------------------------------------ -- | -- Parser for general entites parseXmlEntityValueAsContent :: String -> XmlTree -> XmlTrees parseXmlEntityValueAsContent = parseXmlPart content "general entity value" -- ------------------------------------------------------------ -- | -- Parser for entity substitution within attribute values parseXmlEntityValueAsAttrValue :: String -> XmlTree -> XmlTrees parseXmlEntityValueAsAttrValue = parseXmlPart (XT.attrValueT' "<&") "attribute value" -- ------------------------------------------------------------ -- | -- Parser for NMTOKENs parseNMToken :: String -> XmlTree -> XmlTrees parseNMToken = parseXmlPart (many1 XT.nmtokenT) "nmtoken" -- ------------------------------------------------------------ -- | -- Parser for XML names parseName :: String -> XmlTree -> XmlTrees parseName = parseXmlPart (many1 XT.nameT) "name" -- ------------------------------------------------------------ -- | -- try to parse a xml encoding spec. -- -- -- * 1.parameter encParse : the parser for the encoding decl -- -- - 2.parameter root : a document root -- -- - returns : the same tree, but with an additional -- attribute \"encoding\" in the root node -- in case of a valid encoding spec -- else the unchanged tree parseXmlEncodingSpec :: SimpleXParser XmlTree -> XmlTree -> XmlTrees parseXmlEncodingSpec encDecl x = (:[]) . ( if isRoot x then parseEncSpec else id ) $ x where parseEncSpec r = case ( runParser encDecl (withNormNewline ()) source . xshow . getChildren $ r ) of Right t -> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r Left _ -> r where -- arrow \"getAttrValue a_source\" programmed on the tree level (oops!) source = xshow . concat . map getChildren . filter ((== a_source) . maybe "" qualifiedName . getAttrName) . fromMaybe [] . getAttrl $ r parseXmlEntityEncodingSpec :: XmlTree -> XmlTrees parseXmlEntityEncodingSpec = parseXmlEncodingSpec textDecl'' parseXmlDocEncodingSpec :: XmlTree -> XmlTrees parseXmlDocEncodingSpec = parseXmlEncodingSpec xMLDecl'' -- ------------------------------------------------------------