module XMLParse
( XMLEvent(..)
, parseInstance, parseDTD, parseDocument
) where
import XMLScanner
import LLParsing
import XML
import DTD
import Misc
import List(unfoldr)
parseInstance :: String -> [XMLEvent]
newtype UNPARSED = UNPARSED String
deriving Show
replaceGERefs (UNPARSED s) = expandReferences DTD.predefinedEntities s
replacePERefs (UNPARSED s) = s
attributeValueLiteral = replaceGERefs <$> pLiteral
parameterLiteral = replacePERefs <$> pLiteral
systemLiteral = unparsed <$> pLiteral
where unparsed (UNPARSED s) = s
data XMLEvent =
StartEvent Name [(Name,String)]
| EmptyEvent Name [(Name,String)]
| EndEvent Name
| TextEvent String
| PIEvent Name String
| GERefEvent Name
| CommentEvent String
| ErrorEvent String
deriving (Read,Show)
parseInstance = unfoldr (pRun instanceItem) . pcdataMode
parseDTD = foldl (\a b->b a) emptyDTD . unfoldr (pRun dtdItem) . markupMode
parseDocument text =
case pRun prolog (pcdataMode text) of
Just (_, rest) -> unfoldr (pRun instanceItem) rest
Nothing -> [ErrorEvent "Error parsing prolog"]
pDelim d = pTest (==d)
pKeyword kw = pTest (\d->case d of NAME n -> n == kw; _ -> False)
rniName kw = pTest (\d->case d of RNINAME n -> n == kw; _ -> False)
pName = pCheck (\d->case d of NAME n -> Just n ; _ -> Nothing)
pGEREF = pCheck (\d->case d of GEREF n -> Just n ; _ -> Nothing)
pPEREF = pCheck (\d->case d of PEREF n -> Just n ; _ -> Nothing)
pLiteral = pCheck literal where
literal (LITERAL s) = Just (UNPARSED s)
literal _ = Nothing
pCDATA = pCheck cdata where
cdata (CDATA txt) = Just txt
cdata (WS ws) = Just ws
cdata _ = Nothing
dtdItem =
dtdDeclaration
<|> const id <$> processingInstruction
<|> const id <$> sgmlCommentDeclaration
<|> const id <$> pPEREF
dtdDeclaration =
pDelim MDO *> (
pKeyword "ELEMENT" *> elementDeclaration
<|> pKeyword "ATTLIST" *> attlistDeclaration
<|> pKeyword "ENTITY" *> entityDeclaration
<|> pKeyword "NOTATION" *> notationDeclaration
) <* pDelim MDC
prolog =
pair <$> (ws *> pMaybe xmlDeclaration) <*> (ws *> pMaybe doctypeDeclaration)
ws = () <$ pList (pTest (\d -> case d of WS _ -> True ; _ -> False))
xmlDeclaration = processingInstruction
doctypeDeclaration =
pDelim MDO *> pKeyword "DOCTYPE" *> doctype <* pDelim MDC
doctype =
() <$ pName <* externalIdentifier
elementNames =
wrap <$> pName <|> nameGroup
nameGroup =
(:) <$ pDelim GRPO <*> pName <*>
( (:) <$ pDelim SEQ <*> pName <*> pList (pDelim SEQ *> pName)
<|> (:) <$ pDelim OR <*> pName <*> pList (pDelim OR *> pName)
<|> (:) <$ pDelim AND <*> pName <*> pList (pDelim AND *> pName)
<|> pSucceed []
) <* pDelim GRPC
externalIdentifier =
pair Nothing
<$ pKeyword "SYSTEM" <*> pMaybe systemLiteral
<|> pair <$ pKeyword "PUBLIC"
<*> (Just <$> systemLiteral) <*> pMaybe systemLiteral
<|> pSucceed (Nothing,Nothing)
xmlCommentDeclaration =
pDelim MDOCOM *> pcdata <* pDelim COM <* pDelim MDC
where pcdata = pFoldr (++) [] pCDATA
sgmlCommentDeclaration =
(++) <$ pDelim MDOCOM <*> (pcdata <* pDelim COM) <*> comments <* pDelim MDC
where pcdata = pFoldr (++) [] pCDATA
comments = pFoldr (++) [] (pDelim COM *> pcdata <* pDelim COM)
processingInstruction =
makePI . concat <$ pDelim PIO <*> pList pCDATA <* pDelim PIC where
makePI string =
let (pitgt, rest) = span isNMCHAR string
pival = dropWhile isSEPCHAR rest
in (pitgt,pival)
instanceItem =
pTag
<|> TextEvent <$> pCDATA
<|> GERefEvent <$> pGEREF
<|> uncurry PIEvent <$> processingInstruction
<|> CommentEvent <$> xmlCommentDeclaration
pTag =
startEvent <$ pDelim STAGO <*> pName <*> attributes <*> tagc
<|> EndEvent <$ pDelim ETAGO <*> pName <* pDelim TAGC
where
attributes = pList (pName <* pDelim VI <^> attributeValue)
tagc = StartEvent <$ pDelim TAGC
<|> EmptyEvent <$ pDelim EETAGC
startEvent name atts closing = closing name atts
attributeValue =
attributeValueLiteral <|> pName
elementDeclaration =
declareElements
<$> elementNames <*> omissibility <*> contentDefinition <*> exceptions
where
omissibility =
(pair <$> dashoro <*> dashoro) <?> (False,False)
dashoro =
(True <$ pKeyword "O" <|> False <$ pDelim MINUS)
exceptions =
pair <$> (pDelim MINUS *> nameGroup <?> [])
<*> (pDelim PLUS *> nameGroup <?> [])
contentDefinition =
DC_EMPTY <$ pKeyword "EMPTY"
<|> DC_ANY <$ pKeyword "ANY"
<|> DC_MODELGRP <$> contentModel
contentModel =
( Prim . ELEMENT <$> pName
<|> Prim PCDATA <$ rniName "#PCDATA"
<|> pDelim GRPO *> contentModel <**>
( mk Seq <$> pSome (pDelim SEQ *> contentModel)
<|> mk And <$> pSome (pDelim AND *> contentModel)
<|> mk Or <$> pSome (pDelim OR *> contentModel)
<|> pSucceed id ) <* pDelim GRPC
) <**> occurrenceIndicator
where mk f l a = f (a:l)
occurrenceIndicator =
Plus <$ pDelim PLUS
<|> Rep <$ pDelim REP
<|> Opt <$ pDelim OPT
<|> pSucceed id
attlistDeclaration =
declareAttlist <$> elementNames <*> pList attributeDefinition
attributeDefinition =
ATTDEF <$> pName <*> declaredValue <*> defaultValue
declaredValue =
ATcdata <$ pKeyword "CDATA"
<|> ATid <$ pKeyword "ID"
<|> ATidref <$ pKeyword "IDREF"
<|> ATidrefs <$ pKeyword "IDREFS"
<|> ATentity <$ pKeyword "ENTITY"
<|> ATentities <$ pKeyword "ENTITIES"
<|> ATnmtoken <$ pKeyword "NMTOKEN"
<|> ATnmtokens <$ pKeyword "NMTOKENS"
<|> ATnotation <$ pKeyword "NOTATION" <*> nameGroup
<|> ATenumerated <$> nameGroup
<|> ATnmtoken <$ pKeyword "NAME"
<|> ATnmtoken <$ pKeyword "NUMBER"
<|> ATnmtoken <$ pKeyword "NUTOKEN"
<|> ATnmtokens <$ pKeyword "NAMES"
<|> ATnmtokens <$ pKeyword "NUMBERS"
<|> ATnmtokens <$ pKeyword "NUTOKENS"
defaultValue =
ADVimplied <$ rniName "#IMPLIED"
<|> ADVrequired <$ rniName "#REQUIRED"
<|> ADVfixed <$ rniName "#FIXED" <*> attributeValue
<|> ADVdefault <$> attributeValue
<|> ADVcurrent <$ rniName "#CURRENT"
<|> ADVconref <$ rniName "#CONREF"
entityDeclaration =
declareParameterEntity <$ pDelim PERO <*> pName <*> entityText
<|> declareGeneralEntity <$> pName <*> entityText
entityText =
EN_INTERNAL <$> parameterLiteral
<|> EN_EXTERNAL <$> externalIdentifier <* entityType where
entityType =
ETsubdoc <$ pKeyword "SUBDOC"
<|> csndata <* pName <* dataAttributes
dataAttributes =
pDelim DSO
*> pList (pair <$> pName <* pDelim VI <*> attributeValue)
<* pDelim DSC
csndata =
ETcdata <$ pKeyword "CDATA"
<|> ETsdata <$ pKeyword "SDATA"
<|> ETndata <$ pKeyword "NDATA"
notationDeclaration =
declareNotation <$> pName <*> externalIdentifier