{-# LANGUAGE CPP #-} -- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Parser.XmlParsec Copyright : Copyright (C) 2005 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Xml Parsec parser with pure filter interface -} -- ------------------------------------------------------------ module Text.XML.HXT.Parser.XmlParsec ( charData , charData' , comment , pI , cDSect , document , document' , prolog , xMLDecl , xMLDecl' , versionInfo , misc , doctypedecl , markupdecl , sDDecl , element , content , contentWithTextDecl , textDecl , encodingDecl , xread , xreadDoc , parseXmlContent , parseXmlDocEncodingSpec , parseXmlDocument , parseXmlDTDPart , parseXmlEncodingSpec , parseXmlEntityEncodingSpec , parseXmlEntityValueAsAttrValue , parseXmlEntityValueAsContent , parseXmlPart , parseXmlText , parseNMToken , parseName , removeEncodingSpec ) where #if MIN_VERSION_base(4,8,2) #else import Control.Applicative ((<$>)) #endif import Text.ParserCombinators.Parsec (between, char, eof, getInput, getPosition, many, many1, notFollowedBy, option, runParser, sourceName, string, try, unexpected, (), (<|>)) import Text.XML.HXT.DOM.Interface import Text.XML.HXT.DOM.ShowXml (xshow) import Text.XML.HXT.DOM.XmlNode (changeAttrl, getAttrName, getAttrl, getChildren, getText, isRoot, isText, mergeAttrl, mkAttr', mkCdata', mkCmt', mkDTDElem', mkElement', mkError', mkPi', mkRoot', mkText') import Text.XML.HXT.Parser.XmlCharParser (SimpleXParser, XPState, XParser, withNormNewline, withoutNormNewline, xmlChar) import qualified Text.XML.HXT.Parser.XmlDTDTokenParser as XD import qualified Text.XML.HXT.Parser.XmlTokenParser as XT import Control.FlatSeq import Data.Char (toLower) import Data.Maybe -- import Debug.Trace -- ------------------------------------------------------------ -- -- Character Data (2.4) charData :: XParser s XmlTrees charData = many (charData' <|> XT.referenceT) charData' :: XParser s XmlTree charData' = do t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>" return (mkText' t) -- ------------------------------------------------------------ -- -- Comments (2.5) comment :: XParser s XmlTree comment = comment'' $ XT.checkString "")) (XT.allBut many "--") return (mkCmt' c) ) "comment" -- ------------------------------------------------------------ -- -- Processing Instructions pI :: XParser s XmlTree pI = pI'' $ XT.checkString "> return ()) pI'' :: XParser s () -> XParser s XmlTree pI'' op = between op (string "?>") ( do n <- pITarget p <- option "" (XT.sPace >> XT.allBut many "?>" ) return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]]) ) "processing instruction" where pITarget :: XParser s String pITarget = ( do n <- XT.name if map toLower n == t_xml then unexpected n else return n ) -- ------------------------------------------------------------ -- -- CDATA Sections (2.7) cDSect :: XParser s XmlTree cDSect = cDSect'' $ XT.checkString "> return ()) cDSect'' :: XParser s () -> XParser s XmlTree cDSect'' op = do t <- between op (string "]]>") (XT.allBut many "]]>") return (mkCdata' t) "CDATA section" -- ------------------------------------------------------------ -- -- Document (2.1) and Prolog (2.8) document :: XParser s XmlTree document = do pos <- getPosition dl <- document' return (mkRoot' [ mkAttr' (mkName a_source) [mkText' (sourceName pos)] , mkAttr' (mkName a_status) [mkText' (show c_ok)] ] dl ) document' :: XParser s XmlTrees document' = do pl <- prolog el <- element ml <- many misc eof return (pl ++ [el] ++ ml) prolog :: XParser s XmlTrees prolog = do xml <- option [] xMLDecl' misc1 <- many misc dtdPart <- option [] doctypedecl misc2 <- many misc return (xml ++ misc1 ++ dtdPart ++ misc2) xMLDecl :: XParser s XmlTrees xMLDecl = between (try $ string "") ( do vi <- versionInfo ed <- option [] encodingDecl sd <- option [] sDDecl XT.skipS0 return (vi ++ ed ++ sd) ) "xml declaration" xMLDecl' :: XParser s XmlTrees xMLDecl' = do al <- xMLDecl return [mkPi' (mkName t_xml) al] xMLDecl'' :: XParser s XmlTree xMLDecl'' = do al <- option [] (try xMLDecl) return (mkRoot' al []) versionInfo :: XParser s XmlTrees versionInfo = ( do try ( XT.skipS >> XT.keyword a_version >> return () ) XT.eq vi <- XT.quoted XT.versionNum return [mkAttr' (mkName a_version) [mkText' vi]] ) "version info (with quoted version number)" misc :: XParser s XmlTree misc = comment <|> pI <|> ( ( do ws <- XT.sPace return (mkText' ws) ) "" ) -- ------------------------------------------------------------ -- -- Document Type definition (2.8) doctypedecl :: XParser s XmlTrees doctypedecl = between (try $ string "') ( do XT.skipS n <- XT.name exId <- option [] ( try ( do XT.skipS externalID ) ) XT.skipS0 markup <- option [] ( do m <- between (char '[' ) (char ']') markupOrDeclSep XT.skipS0 return m ) return [mkDTDElem' DOCTYPE ((a_name, n) : exId) markup] ) markupOrDeclSep :: XParser s XmlTrees markupOrDeclSep = ( do ll <- many ( markupdecl <|> declSep <|> XT.mkList conditionalSect ) return (concat ll) ) declSep :: XParser s XmlTrees declSep = XT.mkList XT.peReferenceT <|> ( do XT.skipS return [] ) markupdecl :: XParser s XmlTrees markupdecl = XT.mkList ( pI <|> comment <|> XD.dtdDeclTokenizer ) -- ------------------------------------------------------------ -- -- Standalone Document Declaration (2.9) sDDecl :: XParser s XmlTrees sDDecl = do try ( XT.skipS >> XT.keyword a_standalone >> return () ) XT.eq sd <- XT.quoted (XT.keywords [v_yes, v_no]) return [mkAttr' (mkName a_standalone) [mkText' sd]] -- ------------------------------------------------------------ -- -- element, tags and content (3, 3.1) element :: XParser s XmlTree element = char '<' >> element' element' :: XParser s XmlTree element' = ( do e <- elementStart rwnf e `seq` elementRest e -- evaluate name and attribute list before parsing contents ) "element" elementStart :: XParser s (QName, XmlTrees) elementStart = do n <- XT.name al <- attrList XT.skipS0 return (mkName n, al) where attrList = option [] ( do XT.skipS attrList' ) attrList' = option [] ( do a1 <- attribute al <- attrList let n = fromJust . getAttrName $ a1 if n `elem` map (fromJust . getAttrName) al then unexpected ( "attribute name " ++ show (qualifiedName n) ++ " occurs twice in attribute list" ) else return (a1 : al) ) elementRest :: (QName, XmlTrees) -> XParser s XmlTree elementRest (n, al) = ( do XT.checkString "/>" return $ mkElement' n al [] ) <|> ( do XT.gt c <- content eTag n return $ mkElement' n al c ) "proper attribute list followed by \"/>\" or \">\"" eTag :: QName -> XParser s () eTag n' = do XT.checkString " "" n <- XT.name XT.skipS0 XT.gt if n == qualifiedName n' then return () else unexpected ("illegal end tag found, expected") attribute :: XParser s XmlTree attribute = do n <- XT.name XT.eq v <- XT.attrValueT return $ mkAttr' (mkName n) v {- this parser corresponds to the XML spec but it's inefficent because of more than 1 char lookahead content :: XParser s XmlTrees content = do c1 <- charData cl <- many ( do l <- ( element <|> cDSect <|> pI <|> comment ) c <- charData return (l : c) ) return (c1 ++ concat cl) -} -- this simpler content parser does not need more than a single lookahead -- so no try parsers (inefficient) are neccessary content :: XParser s XmlTrees content = XT.mergeTextNodes <$> many ( ( do -- parse markup but no closing tags try ( XT.lt >> notFollowedBy (char '/') >> return () ) markup ) <|> charData' <|> XT.referenceT ) where markup = element' <|> pI' <|> ( char '!' >> ( comment' <|> cDSect' ) ) contentWithTextDecl :: XParser s XmlTrees contentWithTextDecl = option [] textDecl >> content -- ------------------------------------------------------------ -- -- Conditional Sections (3.4) -- -- conditional sections are parsed in two steps, -- first the whole content is detected, -- and then, after PE substitution include sections are parsed again conditionalSect :: XParser s XmlTree conditionalSect = do XT.checkString "" >> return "" ) <|> ( do XT.checkString "" ++ cs2) ) <|> ( do c <- xmlChar cs <- condSectCont return (c : cs) ) -- ------------------------------------------------------------ -- -- External Entities (4.2.2) externalID :: XParser s Attributes externalID = ( do _ <- XT.keyword k_system XT.skipS lit <- XT.systemLiteral return [(k_system, lit)] ) <|> ( do _ <- XT.keyword k_public XT.skipS pl <- XT.pubidLiteral XT.skipS sl <- XT.systemLiteral return [ (k_system, sl) , (k_public, pl) ] ) "SYSTEM or PUBLIC declaration" -- ------------------------------------------------------------ -- -- Text Declaration (4.3.1) textDecl :: XParser s XmlTrees textDecl = between (try $ string "") ( do vi <- option [] versionInfo ed <- encodingDecl XT.skipS0 return (vi ++ ed) ) "text declaration" textDecl'' :: XParser s XmlTree textDecl'' = do al <- option [] (try textDecl) return (mkRoot' al []) -- ------------------------------------------------------------ -- -- Encoding Declaration (4.3.3) encodingDecl :: XParser s XmlTrees encodingDecl = do try ( XT.skipS >> XT.keyword a_encoding >> return () ) XT.eq ed <- XT.quoted XT.encName return [mkAttr' (mkName a_encoding) [mkText' ed]] -- ------------------------------------------------------------ -- -- the main entry points: -- parsing the content of a text node -- or parsing the text children from a tag node -- | -- the inverse function to 'xshow', (for XML content). -- -- the string parameter is parsed with the XML content parser. -- result is the list of trees or in case of an error a single element list with the -- error message as node. No entity or character subtitution is done here, -- but the XML parser can do this for the predefined XML or the char references for performance reasons -- -- see also: 'parseXmlContent' xread :: String -> XmlTrees xread = xread' content -- take the content parser for parsing the string xreadDoc :: String -> XmlTrees xreadDoc = xread' document' -- take the document' parser for parsing the string xread' :: XParser () XmlTrees -> String -> XmlTrees xread' content' str = parseXmlFromString parser (withNormNewline ()) loc str where loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str) parser = do res <- content' eof -- test on everything consumed return res -- | -- the filter version of 'xread' parseXmlContent :: XmlTree -> XmlTrees parseXmlContent = xread . xshow . (:[]) -- | -- a more general version of 'parseXmlContent'. -- The parser to be used and the context are extra parameter parseXmlText :: SimpleXParser XmlTrees -> XPState () -> String -> XmlTree -> XmlTrees parseXmlText p s0 loc = parseXmlFromString p s0 loc . xshow . (:[]) parseXmlDocument :: String -> String -> XmlTrees parseXmlDocument = parseXmlFromString document' (withNormNewline ()) parseXmlFromString :: SimpleXParser XmlTrees -> XPState () -> String -> String -> XmlTrees parseXmlFromString parser s0 loc = either ((:[]) . mkError' c_err . (++ "\n") . show) id . runParser parser s0 loc -- ------------------------------------------------------------ -- removeEncodingSpec :: XmlTree -> XmlTrees removeEncodingSpec t | isText t = ( either ((:[]) . mkError' c_err . (++ "\n") . show) ((:[]) . mkText') . runParser parser (withNormNewline ()) "remove encoding spec" . fromMaybe "" . getText ) t | otherwise = [t] where parser :: XParser s String parser = option [] textDecl >> getInput -- ------------------------------------------------------------ -- | -- general parser for parsing arbitray parts of a XML document parseXmlPart :: SimpleXParser XmlTrees -> String -> String -> XmlTree -> XmlTrees parseXmlPart parser expected context t = parseXmlText ( do res <- parser eof expected return res ) (withoutNormNewline ()) context $ t -- ------------------------------------------------------------ -- | -- Parser for parts of a DTD parseXmlDTDPart :: String -> XmlTree -> XmlTrees parseXmlDTDPart = parseXmlPart markupOrDeclSep "markup declaration" -- ------------------------------------------------------------ -- | -- Parser for general entites parseXmlEntityValueAsContent :: String -> XmlTree -> XmlTrees parseXmlEntityValueAsContent = parseXmlPart content "general entity value" -- ------------------------------------------------------------ -- | -- Parser for entity substitution within attribute values parseXmlEntityValueAsAttrValue :: String -> XmlTree -> XmlTrees parseXmlEntityValueAsAttrValue = parseXmlPart (XT.attrValueT' "<&") "attribute value" -- ------------------------------------------------------------ -- | -- Parser for NMTOKENs parseNMToken :: String -> XmlTree -> XmlTrees parseNMToken = parseXmlPart (many1 XT.nmtokenT) "nmtoken" -- ------------------------------------------------------------ -- | -- Parser for XML names parseName :: String -> XmlTree -> XmlTrees parseName = parseXmlPart (many1 XT.nameT) "name" -- ------------------------------------------------------------ -- | -- try to parse a xml encoding spec. -- -- -- * 1.parameter encParse : the parser for the encoding decl -- -- - 2.parameter root : a document root -- -- - returns : the same tree, but with an additional -- attribute \"encoding\" in the root node -- in case of a valid encoding spec -- else the unchanged tree parseXmlEncodingSpec :: SimpleXParser XmlTree -> XmlTree -> XmlTrees parseXmlEncodingSpec encDecl x = (:[]) . ( if isRoot x then parseEncSpec else id ) $ x where parseEncSpec r = case ( runParser encDecl (withNormNewline ()) source . xshow . getChildren $ r ) of Right t -> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r Left _ -> r where -- arrow \"getAttrValue a_source\" programmed on the tree level (oops!) source = xshow . concat . map getChildren . filter ((== a_source) . maybe "" qualifiedName . getAttrName) . fromMaybe [] . getAttrl $ r parseXmlEntityEncodingSpec :: XmlTree -> XmlTrees parseXmlEntityEncodingSpec = parseXmlEncodingSpec textDecl'' parseXmlDocEncodingSpec :: XmlTree -> XmlTrees parseXmlDocEncodingSpec = parseXmlEncodingSpec xMLDecl'' -- ------------------------------------------------------------