module Data.XML.DTD.Parse
(
dtd
, parseDTD
, parseDTDWithExtern
, SymTable
, textDecl
, dtdComponent
, entityDecl
, entityValue
, pERef
, notation
, notationSrc
, elementDecl
, contentDecl
, contentModel
, repeatChar
, attList
, attDecl
, attType
, attDefault
, instruction
, comment
, externalID
, name
, nameSS
, quoted
)
where
import Data.XML.DTD.Types
import Data.XML.Types (ExternalID(PublicID, SystemID),
Instruction(Instruction))
import Data.Attoparsec.Text (Parser, try, satisfy, takeTill,
anyChar, char, digit, (<*.), (.*>))
import qualified Data.Attoparsec.Text as A
import Data.Attoparsec.Text.Lazy (parse, Result(Done, Fail), maybeResult)
import Data.Attoparsec.Combinator (many, manyTill, choice, sepBy1)
import Data.Functor ((<$>))
import Control.Applicative (pure, optional, (<*>), (<*), (*>), (<|>))
import Control.Monad (guard, join)
import Data.Text (Text)
import Data.Char (isSpace)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Map as M
import Data.Maybe (fromMaybe, catMaybes)
import Data.List (groupBy)
parseDTD :: L.Text -> DTD
parseDTD = parseDTDWithExtern M.empty
parseDTDWithExtern :: SymTable -> L.Text -> DTD
parseDTDWithExtern ext = continue . parse (skipWS *> textDecl <* skipWS)
where
continue (Done inp decl) = DTD (Just decl) $ parseCmps ext M.empty inp
continue (Fail inp _ _) = DTD Nothing $ parseCmps ext M.empty inp
data PreParse =
PPERef PERef
| PInstruction Instruction
| PComment Text
| PMarkup [MarkupText]
deriving (Eq, Show)
data MarkupText = MTUnquoted Text | MTQuoted Text | MTPERef PERef
deriving (Eq, Show)
type IntSymTable = M.Map Text (Maybe [EntityValue])
type SymTable = M.Map Text L.Text
parseCmps :: SymTable -> IntSymTable -> L.Text -> [DTDComponent]
parseCmps ext int = handlePre . parse (preparse <* skipWS)
where
handlePre (Done c (PPERef r)) = handlePERef ext int c r
handlePre (Done c (PComment t)) = DTDComment t : parseCmps ext int c
handlePre (Done c (PInstruction i)) = DTDInstruction i :
parseCmps ext int c
handlePre (Done c (PMarkup m)) = handleMarkup ext int c m
handlePre _ = []
handlePERef :: SymTable -> IntSymTable -> L.Text -> Text -> [DTDComponent]
handlePERef ext int cont name = maybe moveOn rescan refVal
where
moveOn = DTDPERef name : parseCmps ext int cont
rescan = (++ parseCmps ext int cont) .
parseCmps ext intNoRecurse .
L.concat . map (L.fromStrict . renderValue)
refVal = join $ M.lookup name int
intNoRecurse = M.insert name Nothing int
preparse :: Parser PreParse
preparse = choice
[ PPERef <$> try pERef
, PInstruction <$> try instruction
, PComment <$> try comment
, PMarkup <$> markup
]
markup :: Parser [MarkupText]
markup = mkMarkup <$>
("<" .*> unquoted) <*>
manyTillS ((:) . MTQuoted <$> try quoted <*> unquoted) ">"
where
unquoted = chunk2 <$> unqText <*> many (list2 <$> pct <*> unqText)
unqText = MTUnquoted <$> takeTill (`elem` "%>'\"")
pct = "%" .*> (MTUnquoted <$> (ws *> pure "% ") <|>
MTPERef <$> takeTill (== ';') <*. ";")
mkMarkup ts tss = wrap . concat $ ts : tss
wrap = (MTUnquoted "<" :) . (++ [MTUnquoted ">"])
chunk2 x xss = x : concat xss
handleMarkup :: SymTable -> IntSymTable -> L.Text -> [MarkupText] ->
[DTDComponent]
handleMarkup ext int cont =
handleCmp . parse dtdComponent . L.fromStrict . renderMarkup int
where
handleCmp (Done _ (DTDEntityDecl e)) = handleEntity ext int cont e
handleCmp (Done _ cmp) = cmp : parseCmps ext int cont
handleCmp _ = []
renderMarkup :: IntSymTable -> [MarkupText] -> Text
renderMarkup syms = T.concat . concatMap render
where
render (MTQuoted t) = ["\"", t, "\""]
render (MTUnquoted t) = [t]
render (MTPERef r) = renderPERef syms r
handleEntity :: SymTable -> IntSymTable -> L.Text -> EntityDecl ->
[DTDComponent]
handleEntity ext int cont e = DTDEntityDecl e' : parseCmps ext int' cont
where
(e', int') = case e of
InternalGeneralEntityDecl n val -> ige n val
InternalParameterEntityDecl n val -> ipe n val
ExternalParameterEntityDecl n _ -> epe n
other -> (other, int)
ige n v = (InternalGeneralEntityDecl n $ resolveValue int v, int)
ipe n v = let v' = resolveValue int v
in (InternalParameterEntityDecl n v', insertPE n v')
epe n = (e, maybe int (insertPE n) $ resolveEPE n)
insertPE n v = M.insertWith (const id) n (Just v) int
resolveEPE n = fmap (resolveValue int) $
M.lookup n ext >>= maybeResult . parse parseEPE
parseEPE = many $
EntityPERef <$> try pERef <|> EntityText <$> takeTill (== '%')
resolveValue :: IntSymTable -> [EntityValue] -> [EntityValue]
resolveValue syms = concatMap combine . groupBy bothText . concatMap resolve
where
resolve e@(EntityPERef r) = fromMaybe [e] . join $ M.lookup r syms
resolve e = [e]
bothText (EntityText {}) (EntityText {}) = True
bothText _ _ = False
combine es@(EntityText {}:_) = [EntityText . T.concat . catMaybes $
map justText es]
combine es = es
justText (EntityText t) = Just t
justText _ = Nothing
renderPERef :: IntSymTable -> PERef -> [Text]
renderPERef syms ref = maybe [pERefText ref] render . join $
M.lookup ref syms
where
render = (" " :) . (++ [" "]) . map renderValue
renderValue :: EntityValue -> Text
renderValue (EntityText t) = t
renderValue (EntityPERef r) = pERefText r
pERefText :: PERef -> Text
pERefText r = T.concat ["%", r, ";"]
dtd :: Parser DTD
dtd = DTD <$> (skipWS *> optional (textDecl <* skipWS)) <*>
many (dtdComponent <* skipWS)
textDecl :: Parser DTDTextDecl
textDecl = do
"<?" .*> xml .*> ws *> skipWS
enc1 <- optional $ try encoding
ver <- optional $ try (maybeSpace version enc1)
enc <- maybe (maybeSpace encoding ver) return enc1
skipWS *> "?>" .*> pure (DTDTextDecl ver enc)
where
xml = ("X" <|> "x") .*> ("M" <|> "m") .*> ("L" <|> "l")
version = attr "version" $ const versionNum
versionNum = T.append <$> "1." <*> (T.singleton <$> digit)
encoding = attr "encoding" $ takeTill . (==)
attr name val = try (attrQ '"' name val) <|> attrQ '\'' name val
attrQ q name val = name .*> skipWS *> "=" .*> skipWS *>
char q *> val q <* char q
maybeSpace p = maybe p (const $ ws *> skipWS *> p)
dtdComponent :: Parser DTDComponent
dtdComponent = choice $ map try
[ DTDPERef <$> pERef
, DTDEntityDecl <$> entityDecl
, DTDElementDecl <$> elementDecl
, DTDAttList <$> attList
, DTDNotation <$> notation
, DTDInstruction <$> instruction
] ++
[ DTDComment <$> comment
]
instruction :: Parser Instruction
instruction = Instruction <$> ("<?" .*> skipWS *> nameSS) <*>
idata <*. "?>"
where
idata = T.concat . concat <$> manyTillS chunk "?>"
chunk = list2 . T.singleton <$> anyChar <*> takeTill (== '?')
entityDecl :: Parser EntityDecl
entityDecl = "<!ENTITY" .*> ws *> skipWS *>
choice [try internalParam, try externalParam,
try internalGen, externalGen]
<* skipWS <*. ">"
where
internalParam = InternalParameterEntityDecl <$>
(param *> nameSS) <*> entityValue
externalParam = ExternalParameterEntityDecl <$>
(param *> nameSS) <*> externalID
internalGen = InternalGeneralEntityDecl <$> nameSS <*> entityValue
externalGen = ExternalGeneralEntityDecl <$>
nameSS <*> externalID <*> optional (try ndata)
param = "%" .*> ws *> skipWS
ndata = skipWS *> "NDATA" .*> ws *> skipWS *> name
name :: Parser Text
name = nonNull $ takeTill notNameChar
where
notNameChar c = isSpace c || c `elem` syntaxChars
syntaxChars = "()[]<>!%&;'\"?*+|,="
nonNull parser = do
text <- parser
guard . not . T.null $ text
return text
nameSS :: Parser Text
nameSS = name <* skipWS
entityValue :: Parser [EntityValue]
entityValue = try (quotedVal '"') <|> quotedVal '\''
where
quotedVal q = char q *> manyTill (content q) (char q)
content q = EntityPERef <$> try pERef <|> EntityText <$> text q
text q = takeTill $ \c -> c == '%' || c == q
pERef :: Parser PERef
pERef = "%" .*> name <*. ";"
elementDecl :: Parser ElementDecl
elementDecl = ElementDecl <$> ("<!ELEMENT" .*> ws *> skipWS *> nameSS) <*>
contentDecl <* skipWS <*. ">"
contentDecl :: Parser ContentDecl
contentDecl = choice $ map try
[ pure ContentEmpty <*. "EMPTY"
, pure ContentAny <*. "ANY"
, ContentMixed <$> pcdata
] ++
[ ContentElement <$> contentModel
]
where
pcdata = "(" .*> skipWS *> "#PCDATA" .*> skipWS *>
(try tags <|> noTagsNoStar)
tags = many ("|" .*> skipWS *> nameSS) <*. ")*"
noTagsNoStar = ")" .*> pure []
contentModel = choice $ map (<*> repeatChar)
[ CMChoice <$> try (cmList '|')
, CMSeq <$> try (cmList ',')
, CMName <$> name
]
where
cmList sep = "(" .*> skipWS *>
((contentModel <* skipWS) `sepBy1` (char sep *> skipWS)) <*. ")"
repeatChar :: Parser Repeat
repeatChar = choice
[ char '?' *> pure ZeroOrOne
, char '*' *> pure ZeroOrMore
, char '+' *> pure OneOrMore
, pure One
]
attList :: Parser AttList
attList = AttList <$> ("<!ATTLIST" .*> ws *> skipWS *> nameSS) <*>
many attDecl <*. ">"
attDecl :: Parser AttDecl
attDecl = AttDecl <$>
nameSS <*> attType <* skipWS <*> attDefault <* skipWS
attType :: Parser AttType
attType = choice $ map try
[ "CDATA" .*> ws *> pure AttStringType
, "ID" .*> ws *> pure AttIDType
, "IDREF" .*> ws *> pure AttIDRefType
, "IDREFS" .*> ws *> pure AttIDRefsType
, "ENTITY" .*> ws *> pure AttEntityType
, "ENTITIES" .*> ws *> pure AttEntitiesType
, "NMTOKEN" .*> ws *> pure AttNmTokenType
, "NMTOKENS" .*> ws *> pure AttNmTokensType
, AttEnumType <$> enumType
] ++
[ AttNotationType <$> notationType
]
where
enumType = nameList
notationType = "NOTATION" .*> ws *> skipWS *> nameList
nameList = "(" .*> skipWS *>
(nameSS `sepBy1` ("|" .*> skipWS)) <*. ")"
attDefault :: Parser AttDefault
attDefault = choice $ map try
[ "#REQUIRED" .*> pure AttRequired
, "#IMPLIED" .*> pure AttImplied
, AttFixed <$> ("#FIXED" .*> ws *> skipWS *> quoted)
] ++
[ AttDefaultValue <$> quoted
]
quoted :: Parser Text
quoted = quotedWith '"' <|> quotedWith '\''
where
quotedWith q = char q *> takeTill (== q) <* char q
notation :: Parser Notation
notation = Notation <$>
("<!NOTATION" .*> ws *> skipWS *> name) <* ws <* skipWS <*>
notationSrc <*. ">"
notationSrc :: Parser NotationSource
notationSrc = try system <|> public
where
system = NotationSysID <$>
("SYSTEM" .*> ws *> skipWS *> quoted <* ws <* skipWS)
public = mkPublic <$>
("PUBLIC" .*> ws *> skipWS *> quoted) <*>
optional (try $ ws *> skipWS *> quoted) <* skipWS
mkPublic pubID = maybe (NotationPubID pubID) (NotationPubSysID pubID)
externalID :: Parser ExternalID
externalID = try system <|> public
where
system = SystemID <$> ("SYSTEM" .*> ws *> skipWS *> quoted)
public = PublicID <$> ("PUBLIC" .*> ws *> skipWS *> quoted) <*
ws <* skipWS <*> quoted
comment :: Parser Text
comment = "<!--" .*> (T.concat . concat <$> manyTillS chunk "--") <*. ">"
where
chunk = list2 . T.singleton <$> anyChar <*> takeTill (== '-')
isXMLSpace :: Char -> Bool
isXMLSpace = (`elem` "\x20\x9\xD\xA")
ws :: Parser Char
ws = satisfy isXMLSpace
skipWS :: Parser ()
skipWS = A.takeWhile isXMLSpace *> pure ()
manyTillS :: Parser a -> Parser Text -> Parser [a]
manyTillS = manyTill
list2 :: a -> a -> [a]
list2 x y = [x, y]