----------------------------------------------------------------------------- -- Copyright 2019, Ideas project team. This file is distributed under the -- terms of the Apache License 2.0. For more information, see the files -- "LICENSE.txt" and "NOTICE.txt", which are included in the distribution. ----------------------------------------------------------------------------- -- | -- Maintainer : bastiaan.heeren@ou.nl -- Stability : provisional -- Portability : portable (depends on ghc) -- -- A parser for XML documents, directly derived from the specification: -- -- http://www.w3.org/TR/2006/REC-xml-20060816 -- ----------------------------------------------------------------------------- module Ideas.Text.XML.Parser (document, extParsedEnt, extSubset) where import Control.Monad import Data.Char (toUpper, ord) import Data.List (foldl') import Data.Maybe (catMaybes) import Ideas.Text.XML.Document hiding (versionInfo, name, content) import Ideas.Text.XML.Unicode import Ideas.Utils.Parsing hiding (digit, letter, space) import Prelude hiding (seq) import qualified Ideas.Text.XML.Document as D letter, digit, combiningChar, extender :: Parser Char letter = satisfy isLetter digit = satisfy isDigit combiningChar = satisfy isCombiningChar extender = satisfy isExtender -- combinators without lexing (no spaces are consumed) parens, brackets, singleQuoted, doubleQuoted :: Parser a -> Parser a parens = between (char '(') (char ')') brackets = between (char '[') (char ']') singleQuoted = between (char '\'') (char '\'') doubleQuoted = between (char '"') (char '"') -------------------------------------------------- -- * 2 Documents -------------------------------------------------- -- ** 2.1 Well-Formed XML Documents -- [1] document ::= prolog element Misc* document :: Parser XMLDoc document = do (mxml, mdtd) <- prolog rt <- element miscs let (ver, enc, sa) = case mxml of Just (a, b, c) -> (Just a, b, c) Nothing -> (Nothing, Nothing, Nothing) return XMLDoc { D.versionInfo = ver , D.encoding = enc , D.standalone = sa , D.dtd = mdtd , D.externals = [] , root = rt } -------------------------------------------------- -- ** 2.2 Characters -- [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] {- char :: Parser Char char = ranges xs <|> oneOf "\x9\xA\xD" where xs = [('\x20', '\xD7FF'), ('\xE000', '\xFFFD'), ('\x10000', '\x10FFFF')] -} -------------------------------------------------- -- ** 2.3 Common Syntactic Constructs -- [3] S ::= (#x20 | #x9 | #xD | #xA)+ space :: Parser () space = void (many1 (oneOf "\x20\x9\xA\xD")) mspace :: Parser () -- for S? mspace = void (many (oneOf "\x20\x9\xA\xD")) -- [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender nameChar :: Parser Char nameChar = letter <|> digit <|> combiningChar <|> extender <|> oneOf ".-_:" -- [5] Name ::= (Letter | '_' | ':') (NameChar)* name :: Parser String name = do c <- letter <|> oneOf "_:" cs <- many nameChar return (c:cs) spacedName :: Parser String spacedName = space *> name <* space {- -- [6] Names ::= Name (#x20 Name)* names :: Parser [String] names = sepBy1 name (char '\x20') -} -- [7] Nmtoken ::= (NameChar)+ nmtoken :: Parser String nmtoken = many1 nameChar {- -- [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* nmtokens :: Parser [String] nmtokens = sepBy1 nmtoken (char '\x20') -} -- [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' -- | "'" ([^%&'] | PEReference | Reference)* "'" entityValue :: Parser EntityValue entityValue = doubleQuoted (p "%&\"") <|> singleQuoted (p "%&'") where p s = many (fmap Left (noneOf s) <|> fmap Right (fmap Left peReference <|> fmap Right reference)) -- [10] AttValue ::= '"' ([^<&"] | Reference)* '"' -- | "'" ([^<&'] | Reference)* "'" attValue :: Parser AttValue attValue = doubleQuoted (p "<&\"") <|> singleQuoted (p "<&'") where p s = many (fmap Left (noneOf s) <|> fmap Right reference) -- [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") systemLiteral :: Parser String systemLiteral = doubleQuoted (p "\"") <|> singleQuoted (p "'") where p :: String -> Parser String p s = many (noneOf s) -- [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" pubidLiteral :: Parser String pubidLiteral = doubleQuoted (many (pubidChar True)) <|> singleQuoted (many (pubidChar False)) -- [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] pubidChar :: Bool -> Parser Char pubidChar withSingleQuote = ranges xs <|> oneOf "\x20\xD\xA-()+,./:=?;!*#@$_%" <|> singleQuote where xs = [('a', 'z'), ('A', 'Z'), ('0', '9')] singleQuote | withSingleQuote = char '\'' >> return '\'' | otherwise = fail "pubidChar" -------------------------------------------------- -- ** 2.4 Character Data and Markup -- [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) charData :: Parser String -- This implementation is too liberal since it allows "]]>" charData = stopOn ["<", "&", "]]>"] -------------------------------------------------- -- ** 2.5 Comments -- [15] Comment ::= '' comment :: Parser String comment = between (string "") (stopOn ["--"]) -------------------------------------------------- -- ** 2.6 Processing Instructions -- [16] PI ::= '' Char*)))? '?>' pInstr :: Parser String pInstr = between (string "") p where p = piTarget >> option "" (space >> stopOn ["?>"]) -- [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) piTarget :: Parser String piTarget = do n <- name when (map toUpper n == "XML") $ fail "XML in piTarget" return n -------------------------------------------------- -- ** 2.7 CDATA Sections -- [18] CDSect ::= CDStart CData CDEnd -- [19] CDStart ::= '' Char*)) -- [21] CDEnd ::= ']]>' cdSect :: Parser XML cdSect = between (string "") p where p = do s <- stopOn ["]]>"] return (CDATA s) -------------------------------------------------- -- ** 2.8 Prolog and Document Type Declaration type XMLDecl = (String, Maybe String, Maybe Bool) -- [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? prolog :: Parser (Maybe XMLDecl, Maybe DTD) prolog = do ma <- optionMaybe (try xmlDecl) miscs mb <- optionMaybe $ try $ do mb <- doctypedecl miscs return mb return (ma, mb) -- [23] XMLDecl ::= '' xmlDecl :: Parser XMLDecl xmlDecl = do skip (string "") return (x, y, z) -- [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') versionInfo :: Parser String versionInfo = space >> string "version" >> eq >> p where p = singleQuoted versionNum <|> doubleQuoted versionNum -- [25] Eq ::= S? '=' S? eq :: Parser () eq = skip (mspace >> char '=' >> mspace) -- [26] VersionNum ::= '1.0' versionNum :: Parser String versionNum = do skip (string "1.0") return "1.0" -- [27] Misc ::= Comment | PI | S misc :: Parser () misc = try (skip comment) <|> try (skip pInstr) <|> skip space miscs :: Parser () miscs = skip (many misc) -- [28] doctypedecl ::= '' doctypedecl :: Parser DTD doctypedecl = do skip (string "> externalID)) mspace z <- option [] $ do z <- brackets intSubset mspace return z skip (char '>') return (DTD x y z) -- [28a] DeclSep ::= PEReference | S declSep :: Parser (Maybe DocTypeDecl) declSep = fmap (Just . DTDParameter) peReference <|> (space >> return Nothing) -- [28b] intSubset ::= (markupdecl | DeclSep)* intSubset :: Parser [DocTypeDecl] intSubset = fmap catMaybes (many (markupdecl <|> declSep)) -- [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment markupdecl :: Parser (Maybe DocTypeDecl) markupdecl = fmap Just (choice (map try list)) <|> ((try pInstr <|> comment) >> return Nothing) where list = [elementdecl, attlistDecl, entityDecl, notationDecl] -- [30] extSubset ::= TextDecl? extSubsetDecl extSubset :: Parser (Maybe TextDecl, [DocTypeDecl]) extSubset = do m <- optionMaybe textDecl e <- extSubsetDecl return (m, e) -- [31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* extSubsetDecl :: Parser [DocTypeDecl] extSubsetDecl = fmap catMaybes (many (choice [markupdecl, fmap (Just . DTDConditional) conditionalSect, declSep])) -------------------------------------------------- -- ** 2.9 Standalone Document Declaration -- [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) sdDecl :: Parser Bool sdDecl = space >> string "standalone" >> eq >> (singleQuoted bool <|> doubleQuoted bool) where bool = (string "yes" >> return True) <|> (string "no" >> return False) -------------------------------------------------- -- ** 2.10 White Space Handling -------------------------------------------------- -- * 3 Logical Structures -- [39] element ::= EmptyElemTag | STag content ETag element :: Parser Element element = do (s1, as, closed) <- sTag if closed then return (Element s1 as []) else do c <- content s2 <- eTag when (s1/=s2) $ fail "WFC: element" return (Element s1 as c) -------------------------------------------------- -- ** 3.1 Start-Tags, End-Tags, and Empty-Element Tags -- [40] STag ::= '<' Name (S Attribute)* S? '>' -- [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' -- The boolean indicates whether the tag was closed immediately (an EmptyElemTag) sTag :: Parser (Name, Attributes, Bool) sTag = do skip (char '<') n <- name as <- many (try (space >> attribute)) mspace b <- (char '>' >> return False) <|> (string "/>" >> return True) return (n, as, b) -- [41] Attribute ::= NSAttName Eq AttValue -- | Name Eq AttValue attribute :: Parser Attribute attribute = do n <- name eq a <- attValue return (n := a) -- [42] ETag ::= '' eTag :: Parser Name eTag = do skip (string "') return n -- [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* -- Note: since CharData accepts epsilon, there is no need to make it optional content :: Parser Content content = chainr1 (fmap g charData) (fmap f ps) where f ma l r = l ++ maybe [] return ma ++ r g s = [ CharData s | not (null s) ] ps = try (fmap Just (choice (map try [fmap Tagged element, fmap Reference reference, cdSect])) <|> ((try pInstr <|> comment) >> return Nothing)) -------------------------------------------------- -- ** 3.2 Element Type Declarations -- [45] elementdecl ::= '' elementdecl :: Parser DocTypeDecl elementdecl = do skip (string "') return (ElementDecl n cs) -- [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children contentspec :: Parser ContentSpec contentspec = choice [ string "EMPTY" >> return Empty , string "ANY" >> return Any , try mixed , children ] -- [47] children ::= (choice | seq) ('?' | '*' | '+')? children :: Parser ContentSpec children = do a <- try cpChoice <|> cpSeq f <- option id multi return (Children (f a)) multi :: Parser (CP -> CP) multi = (char '?' >> return QuestionMark) <|> (char '*' >> return Star) <|> (char '+' >> return Plus) -- [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? cp :: Parser CP cp = do a <- fmap CPName name <|> try cpChoice <|> cpSeq f <- option id multi return (f a) -- [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' cpChoice :: Parser CP cpChoice = parens $ do mspace x <- cp xs <- many1 (try (mspace >> char '|' >> mspace >> cp)) mspace return (Choice (x:xs)) -- [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' cpSeq :: Parser CP cpSeq = parens $ do mspace x <- cp xs <- many (try (mspace >> char ',' >> mspace >> cp)) mspace return (Sequence (x:xs)) -- [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' -- | '(' S? '#PCDATA' S? ')' mixed :: Parser ContentSpec mixed = char '(' >> mspace >> string "#PCDATA" >> (rest1 <|> rest2) where p = mspace >> char '|' >> mspace >> name rest1 = try $ do xs <- many (try p) mspace skip (string ")*") return (Mixed True xs) rest2 = mspace >> char ')' >> return (Mixed False []) -------------------------------------------------- -- ** 3.3 Attribute-List Declarations -- [52] AttlistDecl ::= '' attlistDecl :: Parser DocTypeDecl attlistDecl = do skip (string "') return (AttListDecl n ds) -- [53] AttDef ::= S Name S AttType S DefaultDecl attDef :: Parser AttDef attDef = do n <- spacedName tp <- attType space dd <- defaultDecl return (n, tp, dd) -- [54] AttType ::= StringType | TokenizedType | EnumeratedType attType :: Parser AttType attType = stringType <|> tokenizedType <|> enumeratedType -- [55] StringType ::= 'CDATA' stringType :: Parser AttType stringType = string "CDATA" >> return StringType -- [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' tokenizedType :: Parser AttType tokenizedType = choice (map f xs) where f (tp, s) = try (string s) >> return tp xs = [ (IdRefsType, "IDREFS"), (IdRefType, "IDREF"), (IdType, "ID"), (EntityType, "ENTITY") , (EntitiesType, "ENTITIES"), (NmTokensType, "NMTOKENS"), (NmTokenType, "NMTOKEN") ] -- [57] EnumeratedType ::= NotationType | Enumeration enumeratedType :: Parser AttType enumeratedType = notationType <|> enumeration -- [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' notationType :: Parser AttType notationType = string "NOTATION" >> space >> parens p where p = do mspace n <- name ns <- many (try (mspace >> char '|' >> mspace >> name)) mspace return (NotationType (n:ns)) -- [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' enumeration :: Parser AttType enumeration = parens $ do mspace x <- nmtoken xs <- many (try (mspace >> char '|' >> mspace >> nmtoken)) mspace return (EnumerationType (x:xs)) -- [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) defaultDecl :: Parser DefaultDecl defaultDecl = try (string "#REQUIRED" >> return Required) <|> try (string "#IMPLIED" >> return Implied) <|> do f <- option Value (string "#FIXED" >> space >> return Fixed) a <- attValue return (f a) -------------------------------------------------- -- ** 3.4 Conditional Sections -- [61] conditionalSect ::= includeSect | ignoreSect conditionalSect :: Parser Conditional conditionalSect = try includeSect <|> ignoreSect -- [62] includeSect ::= '' includeSect :: Parser Conditional includeSect = do skip (string "") return (Include ds) -- [63] ignoreSect ::= '' ignoreSect :: Parser Conditional ignoreSect = do skip (string "") return (Ignore (concat xss)) -- [64] ignoreSectContents ::= Ignore ('' Ignore)* ignoreSectContents :: Parser [String] ignoreSectContents = do x <- ignore xss <- many $ do skip (string "") y <- ignore return (ys++[y]) return (x:concat xss) -- [65] Ignore ::= Char* - (Char* ('') Char*) ignore :: Parser String ignore = stopOn [""] -------------------------------------------------- -- * 4 Physical Structures -------------------------------------------------- -- ** 4.1 Character and Entity References -- [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' charRef :: Parser Reference charRef = do skip (string "&#") n <- p <|> (char 'x' >> q) skip (char ';') return (CharRef n) where p = fmap (foldl' (\a b -> a*10+ord b-48) 0) (many1 ('0' <..> '9')) q = fmap hexa (many1 (ranges [('0', '9'), ('a', 'f'), ('A', 'F')])) hexa :: String -> Int hexa = rec 0 where rec n [] = n rec n (x:xs) = rec (16*n + ord x - correct) xs where correct | x <= '9' = ord '0' | x <= 'F' = ord 'A' - 10 | otherwise = ord 'a' - 10 -- [67] Reference ::= EntityRef | CharRef reference :: Parser Reference reference = try entityRef <|> charRef -- [68] EntityRef ::= '&' Name ';' entityRef :: Parser Reference entityRef = between (char '&') (char ';') (fmap EntityRef name) -- [69] PEReference ::= '%' Name ';' peReference :: Parser Parameter peReference = between (char '%') (char ';') (fmap Parameter name) -------------------------------------------------- -- ** 4.2 Entity Declarations -- [70] EntityDecl ::= GEDecl | PEDecl entityDecl :: Parser DocTypeDecl entityDecl = try geDecl <|> peDecl -- [71] GEDecl ::= '' geDecl :: Parser DocTypeDecl geDecl = do skip (string "') return (EntityDecl True n ed) -- [72] PEDecl ::= '' peDecl :: Parser DocTypeDecl peDecl = do skip (string "') return (EntityDecl False n (either Left (\a -> Right (a, Nothing)) e)) -- [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) entityDef :: Parser EntityDef entityDef = fmap Left entityValue <|> do e <- externalID ms <- optionMaybe (try nDataDecl) return (Right (e, ms)) -- [74] PEDef ::= EntityValue | ExternalID peDef :: Parser (Either EntityValue ExternalID) peDef = fmap Left entityValue <|> fmap Right externalID -- [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral externalID :: Parser ExternalID externalID = (string "SYSTEM" >> space >> fmap System systemLiteral) <|> do skip (string "PUBLIC") space x <- pubidLiteral space y <- systemLiteral return (Public x y) -- [76] NDataDecl ::= S 'NDATA' S Name nDataDecl :: Parser String nDataDecl = space >> string "NDATA" >> space >> name -------------------------------------------------- -- ** 4.3 Parsed Entities -- [77] TextDecl ::= '' textDecl :: Parser TextDecl textDecl = do skip (string "") return (v, e) -- [78] extParsedEnt ::= TextDecl? content extParsedEnt :: Parser (Maybe TextDecl, Content) extParsedEnt = do td <- optionMaybe (try textDecl) c <- content return (td, c) -- [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) encodingDecl :: Parser String encodingDecl = space >> string "encoding" >> eq >> (singleQuoted encName <|> doubleQuoted encName) -- [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* encName :: Parser String encName = do x <- ranges [('A', 'Z'), ('a', 'z')] xs <- many (ranges [('A', 'Z'), ('a', 'z'), ('0', '9')] <|> oneOf "._-") return (x:xs) -------------------------------------------------- -- ** 4.7 Notation Declarations -- [82] NotationDecl ::= '' notationDecl :: Parser DocTypeDecl notationDecl = do skip (string " fmap Right publicID mspace skip (char '>') return (NotationDecl n e) -- [83] PublicID ::= 'PUBLIC' S PubidLiteral publicID :: Parser PublicID publicID = string "PUBLIC" >> space >> pubidLiteral