-- | This is a parser for HTML documents. Unlike for XML documents, it
-- must include a certain amount of error-correction to account for
-- HTML features like self-terminating tags, unterminated tags, and
-- incorrect nesting. The input is tokenised by the
-- XML lexer (a separate lexer is not required for HTML).
-- It uses a slightly extended version of the Hutton/Meijer parser
-- combinators.
module Text.XML.HaXml.Html.Parse
( htmlParse
) where
import Prelude hiding (either,maybe,sequence)
import qualified Prelude (either)
import Maybe hiding (maybe)
import Char (toLower, isSpace, isDigit, isHexDigit)
import Numeric (readDec,readHex)
import Monad
import Text.XML.HaXml.Types
import Text.XML.HaXml.Lex
import Text.ParserCombinators.HuttonMeijerWallace
-- #define DEBUG
#if defined(DEBUG)
# if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import Debug.Trace(trace)
# elif defined(__GLASGOW_HASKELL__)
import IOExts(trace)
# elif defined(__NHC__) || defined(__HBC__)
import NonStdTrace
# endif
debug :: Monad m => String -> m ()
debug s = trace s (return ())
#else
debug :: Monad m => String -> m ()
debug s = return ()
#endif
-- | The first argument is the name of the file, the second is the string
-- contents of the file. The result is the generic representation of
-- an XML document.
htmlParse :: String -> String -> Document
htmlParse name = simplify . sanitycheck . Prelude.either error id
. papply' document () . xmlLex name
sanitycheck :: Show p => [(a,s,[Either String (p,t)])] -> a
sanitycheck [] = error "***Error at line 0: document not HTML?"
sanitycheck ((x,_,[]):_) = x
sanitycheck ((x,_,s@(Right (n,_):_)):xs) =
error ("***Error at "++show n++": data beyond end of parsed document")
---- Document simplification ----
simplify :: Document -> Document
simplify (Document p st (Elem n avs cs) ms) =
Document p st (Elem n avs (deepfilter simp cs)) ms
where
simp (CElem (Elem "null" [] [])) = False
simp (CElem (Elem n _ [])) | n `elem` ["font","p","i","b","em"
,"tt","big","small"] = False
simp (CString False s) | all isSpace s = False
simp _ = True
deepfilter p =
filter p . map (\c-> case c of
(CElem (Elem n avs cs)) -> CElem (Elem n avs (deepfilter p cs))
_ -> c)
-- opening any of these, they close again immediately
selfclosingtags = ["img","hr","br","meta","col","link","base"
,"param","area","frame","input"]
--closing this, implicitly closes any of those which are contained in it
closeInnerTags =
[ ("ul", ["li"])
, ("ol", ["li"])
, ("dl", ["dt","dd"])
, ("tr", ["th","td"])
, ("div", ["p"])
, ("thead", ["th","tr","td"])
, ("tfoot", ["th","tr","td"])
, ("tbody", ["th","tr","td"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("caption", ["p"])
, ("th", ["p"])
, ("td", ["p"])
, ("li", ["p"])
, ("dt", ["p"])
, ("dd", ["p"])
, ("object", ["p"])
, ("map", ["p"])
, ("body", ["p"])
]
--opening this, implicitly closes that
closes :: Name -> Name -> Bool
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"dd" `closes` t | t `elem` ["dt","dd"] = True
"form" `closes` "form" = True
"label" `closes` "label" = True
_ `closes` "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
"colgroup" `closes` "colgroup" = True
t `closes` "p"
| t `elem` ["p","h1","h2","h3","h4","h5","h6"
,"hr","div","ul","dl","ol","table"] = True
_ `closes` _ = False
---- Misc ----
fst3 (a,_,_) = a
snd3 (_,a,_) = a
thd3 (_,_,a) = a
---- Auxiliary Parsing Functions ----
type HParser a = Parser () (Posn,TokenT) String a
name :: HParser Name
name = do {(p,TokName s) <- item; return s}
string, freetext :: HParser String
string = do {(p,TokName s) <- item; return s}
freetext = do {(p,TokFreeText s) <- item; return s}
maybe :: HParser a -> HParser (Maybe a)
maybe p =
( p >>= return . Just) +++
( return Nothing)
either :: HParser a -> HParser b -> HParser (Either a b)
either p q =
( p >>= return . Left) +++
( q >>= return . Right)
word :: String -> HParser ()
word s = P (\st inp-> case inp of {
(Left err: _) -> Left err;
(Right (p,TokName n):ts) -> if s==n then Right [((),st,ts)]
else Right [];
(Right (p,TokFreeText n):ts) -> if s==n then Right [((),st,ts)]
else Right [];
ts -> Right [] } )
posn :: HParser Posn
posn = P (\st inp-> case inp of {
(Left err: _) -> Left err;
(Right (p,_):_) -> Right [(p,st,inp)];
[] -> Right [(Pn "unknown" 0 0 Nothing,st,inp)]; } )
nmtoken :: HParser NmToken
nmtoken = (string +++ freetext)
---- XML Parsing Functions ----
document :: HParser Document
document = do
p <- prolog `elserror` "unrecognisable XML prolog"
es <- many1 (element "HTML document")
ms <- many misc
return (Document p emptyST (case map snd es of
[e] -> e
es -> Elem "html" [] (map CElem es))
ms)
comment :: HParser Comment
comment = do
bracket (tok TokCommentOpen) freetext (tok TokCommentClose)
processinginstruction :: HParser ProcessingInstruction
processinginstruction = do
tok TokPIOpen
n <- string `elserror` "processing instruction has no target"
f <- freetext
(tok TokPIClose +++ tok TokAnyClose) `elserror` "missing ?> or >"
return (n, f)
cdsect :: HParser CDSect
cdsect = do
tok TokSectionOpen
bracket (tok (TokSection CDATAx)) chardata (tok TokSectionClose)
prolog :: HParser Prolog
prolog = do
x <- maybe xmldecl
m1 <- many misc
dtd <- maybe doctypedecl
m2 <- many misc
return (Prolog x m1 dtd m2)
xmldecl :: HParser XMLDecl
xmldecl = do
tok TokPIOpen
(word "xml" +++ word "XML")
p <- posn
s <- freetext
tok TokPIClose `elserror` "missing ?> in "
(raise . papply' aux () . xmlReLex p) s
where
aux = do
v <- versioninfo `elserror` "missing XML version info"
e <- maybe encodingdecl
s <- maybe sddecl
return (XMLDecl v e s)
raise (Left err) = mzero `elserror` err
raise (Right ok) = (return . fst3 . head) ok
versioninfo :: HParser VersionInfo
versioninfo = do
(word "version" +++ word "VERSION")
tok TokEqual
bracket (tok TokQuote) freetext (tok TokQuote)
misc :: HParser Misc
misc =
( comment >>= return . Comment) +++
( processinginstruction >>= return . PI)
-- Question: for HTML, should we disallow in-line DTDs, allowing only externals?
-- Answer: I think so.
doctypedecl :: HParser DocTypeDecl
doctypedecl = do
tok TokSpecialOpen
tok (TokSpecial DOCTYPEx)
n <- name
eid <- maybe externalid
-- es <- maybe (bracket (tok TokSqOpen)
-- (many markupdecl)
-- (tok TokSqClose))
tok TokAnyClose `elserror` "missing > in DOCTYPE decl"
-- return (DTD n eid (case es of { Nothing -> []; Just e -> e }))
return (DTD n eid [])
--markupdecl :: HParser MarkupDecl
--markupdecl =
-- ( elementdecl >>= return . Element) +++
-- ( attlistdecl >>= return . AttList) +++
-- ( entitydecl >>= return . Entity) +++
-- ( notationdecl >>= return . Notation) +++
-- ( misc >>= return . MarkupMisc) +++
-- PEREF(MarkupPE,markupdecl)
--
--extsubset :: HParser ExtSubset
--extsubset = do
-- td <- maybe textdecl
-- ds <- many extsubsetdecl
-- return (ExtSubset td ds)
--
--extsubsetdecl :: HParser ExtSubsetDecl
--extsubsetdecl =
-- ( markupdecl >>= return . ExtMarkupDecl) +++
-- ( conditionalsect >>= return . ExtConditionalSect) +++
-- PEREF(ExtPEReference,extsubsetdecl)
sddecl :: HParser SDDecl
sddecl = do
(word "standalone" +++ word "STANDALONE")
tok TokEqual `elserror` "missing = in 'standalone' decl"
bracket (tok TokQuote)
( (word "yes" >> return True) +++
(word "no" >> return False) `elserror`
"'standalone' decl requires 'yes' or 'no' value" )
(tok TokQuote)
----
-- VERY IMPORTANT NOTE: The stack returned here contains those tags which
-- have been closed implicitly and need to be reopened again at the
-- earliest opportunity.
type Stack = [(Name,[Attribute])]
element :: Name -> HParser (Stack,Element)
element ctx =
do
tok TokAnyOpen
(ElemTag e avs) <- elemtag
( if e `closes` ctx then
-- insert the missing close-tag, fail forward, and reparse.
( do debug ("/")
unparse ([TokEndOpen, TokName ctx, TokAnyClose,
TokAnyOpen, TokName e] ++ reformatAttrs avs)
return ([], Elem "null" [] []))
else if e `elem` selfclosingtags then
-- complete the parse straightaway.
( do tok TokEndClose -- self-closing
debug (e++"[+]")
return ([], Elem e avs [])) +++
-- ( do tok TokAnyClose -- sequence (**not HTML?**)
-- debug (e++"[+")
-- n <- bracket (tok TokEndOpen) name (tok TokAnyClose)
-- debug "]"
-- if e == (map toLower n :: Name)
-- then return ([], Elem e avs [])
-- else return (error "no nesting in empty tag")) +++
( do tok TokAnyClose -- with no close (e.g. )
debug (e++"[+]")
return ([], Elem e avs []))
else
(( do tok TokEndClose
debug (e++"[]")
return ([], Elem e avs [])) +++
( do tok TokAnyClose `elserror` "missing > or /> in element tag"
debug (e++"[")
zz <- many (content e)
-- (if null zz then return (error ("empty content in context: "++e)) else return ())
let (ss,cs) = unzip zz
let s = if null ss then [] else last ss
n <- bracket (tok TokEndOpen) name (tok TokAnyClose)
debug "]"
( if e == (map toLower n :: Name) then
do unparse (reformatTags (closeInner e s))
debug "^"
return ([], Elem e avs cs)
else
do unparse [TokEndOpen, TokName n, TokAnyClose]
debug "-"
return (((e,avs):s), Elem e avs cs))
) `elserror` ("failed to repair non-matching tags in context: "++ctx)))
closeInner :: Name -> [(Name,[Attribute])] -> [(Name,[Attribute])]
closeInner c ts =
case lookup c closeInnerTags of
(Just these) -> filter ((`notElem` these).fst) ts
Nothing -> ts
unparse ts = do p <- posn
reparse (map Right (zip (repeat p) ts))
reformatAttrs avs = concatMap f0 avs
where f0 (a, AttValue [Left s]) = [TokName a, TokEqual, TokQuote,
TokFreeText s, TokQuote]
reformatTags ts = concatMap f0 ts
where f0 (t,avs) = [TokAnyOpen, TokName t]++reformatAttrs avs++[TokAnyClose]
content :: Name -> HParser (Stack,Content)
content ctx =
( element ctx >>= \(s,e)-> return (s, CElem e)) +++
( chardata >>= \s-> return ([], CString False s)) +++
( reference >>= \r-> return ([], CRef r)) +++
( cdsect >>= \c-> return ([], CString True c)) +++
( misc >>= \m-> return ([], CMisc m))
----
elemtag :: HParser ElemTag
elemtag = do
n <- name `elserror` "malformed element tag"
as <- many attribute
return (ElemTag (map toLower n) as)
attribute :: HParser Attribute
attribute = do
n <- name
v <- (do tok TokEqual
attvalue) +++
(return (AttValue [Left "TRUE"]))
return (map toLower n,v)
--elementdecl :: HParser ElementDecl
--elementdecl = do
-- tok TokSpecialOpen
-- tok (TokSpecial ELEMENTx)
-- n <- name `elserror` "missing identifier in ELEMENT decl"
-- c <- contentspec `elserror` "missing content spec in ELEMENT decl"
-- tok TokAnyClose `elserror` "expected > terminating ELEMENT decl"
-- return (ElementDecl n c)
--
--contentspec :: HParser ContentSpec
--contentspec =
-- ( word "EMPTY" >> return EMPTY) +++
-- ( word "ANY" >> return ANY) +++
-- ( mixed >>= return . Mixed) +++
-- ( cp >>= return . ContentSpec) +++
-- PEREF(ContentPE,contentspec)
--
--choice :: HParser [CP]
--choice = do
-- bracket (tok TokBraOpen)
-- (cp `sepby1` (tok TokPipe))
-- (tok TokBraClose)
--
--sequence :: HParser [CP]
--sequence = do
-- bracket (tok TokBraOpen)
-- (cp `sepby1` (tok TokComma))
-- (tok TokBraClose)
--
--cp :: HParser CP
--cp =
-- ( do n <- name
-- m <- modifier
-- return (TagName n m)) +++
-- ( do ss <- sequence
-- m <- modifier
-- return (Seq ss m)) +++
-- ( do cs <- choice
-- m <- modifier
-- return (Choice cs m)) +++
-- PEREF(CPPE,cp)
--
--modifier :: HParser Modifier
--modifier =
-- ( tok TokStar >> return Star) +++
-- ( tok TokQuery >> return Query) +++
-- ( tok TokPlus >> return Plus) +++
-- ( return None)
--
--mixed :: HParser Mixed
--mixed = do
-- tok TokBraOpen
-- tok TokHash
-- word "PCDATA"
-- cont
-- where
-- cont = ( tok TokBraClose >> return PCDATA) +++
-- ( do cs <- many ( do tok TokPipe
-- n <- name
-- return n)
-- tok TokBraClose
-- tok TokStar
-- return (PCDATAplus cs))
--
--attlistdecl :: HParser AttListDecl
--attlistdecl = do
-- tok TokSpecialOpen
-- tok (TokSpecial ATTLISTx)
-- n <- name `elserror` "missing identifier in ATTLIST"
-- ds <- many attdef
-- tok TokAnyClose `elserror` "missing > terminating ATTLIST"
-- return (AttListDecl n ds)
--
--attdef :: HParser AttDef
--attdef = do
-- n <- name
-- t <- atttype `elserror` "missing attribute type in attlist defn"
-- d <- defaultdecl
-- return (AttDef n t d)
--
--atttype :: HParser AttType
--atttype =
-- ( word "CDATA" >> return StringType) +++
-- ( tokenizedtype >>= return . TokenizedType) +++
-- ( enumeratedtype >>= return . EnumeratedType)
--
--tokenizedtype :: HParser TokenizedType
--tokenizedtype =
-- ( word "ID" >> return ID) +++
-- ( word "IDREF" >> return IDREF) +++
-- ( word "IDREFS" >> return IDREFS) +++
-- ( word "ENTITY" >> return ENTITY) +++
-- ( word "ENTITIES" >> return ENTITIES) +++
-- ( word "NMTOKEN" >> return NMTOKEN) +++
-- ( word "NMTOKENS" >> return NMTOKENS)
--
--enumeratedtype :: HParser EnumeratedType
--enumeratedtype =
-- ( notationtype >>= return . NotationType) +++
-- ( enumeration >>= return . Enumeration)
--
--notationtype :: HParser NotationType
--notationtype = do
-- word "NOTATION"
-- bracket (tok TokBraOpen)
-- (name `sepby1` (tok TokPipe))
-- (tok TokBraClose)
--
--enumeration :: HParser Enumeration
--enumeration =
-- bracket (tok TokBraOpen)
-- (nmtoken `sepby1` (tok TokPipe))
-- (tok TokBraClose)
--
--defaultdecl :: HParser DefaultDecl
--defaultdecl =
-- ( tok TokHash >> word "REQUIRED" >> return REQUIRED) +++
-- ( tok TokHash >> word "IMPLIED" >> return IMPLIED) +++
-- ( do f <- maybe (tok TokHash >> word "FIXED" >> return FIXED)
-- a <- attvalue
-- return (DefaultTo a f))
--
--conditionalsect :: HParser ConditionalSect
--conditionalsect =
-- ( do tok TokSectionOpen
-- tok (TokSection INCLUDEx)
-- tok TokSqOpen `elserror` "missing [ after INCLUDE"
-- i <- extsubsetdecl `elserror` "missing ExtSubsetDecl in INCLUDE"
-- tok TokSectionClose `elserror` "missing ] after INCLUDE"
-- return (IncludeSect i)) +++
-- ( do tok TokSectionOpen
-- tok (TokSection IGNOREx)
-- tok TokSqOpen `elserror` "missing [ after IGNORE"
-- i <- many ignoresectcontents
-- tok TokSectionClose `elserror` "missing ] after IGNORE"
-- return (IgnoreSect i))
--
--ignoresectcontents :: HParser IgnoreSectContents
--ignoresectcontents = do
-- i <- ignore
-- is <- many (do tok TokSectionOpen
-- ic <- ignoresectcontents
-- tok TokSectionClose
-- ig <- ignore
-- return (ic,ig))
-- return (IgnoreSectContents i is)
--
--ignore :: HParser Ignore
--ignore = freetext >>= return . Ignore
reference :: HParser Reference
reference = do
bracket (tok TokAmp) (freetext >>= val) (tok TokSemi)
where
val ('#':'x':i) | all isHexDigit i
= return . RefChar . fst . head . readHex $ i
val ('#':i) | all isDigit i
= return . RefChar . fst . head . readDec $ i
val name = return . RefEntity $ name
{-
reference :: HParser Reference
reference =
( charref >>= return . RefChar) +++
( entityref >>= return . RefEntity)
entityref :: HParser EntityRef
entityref = do
n <- bracket (tok TokAmp) name (tok TokSemi)
return n
charref :: HParser CharRef
charref = do
bracket (tok TokAmp) (freetext >>= readCharVal) (tok TokSemi)
where
readCharVal ('#':'x':i) = return . fst . head . readHex $ i
readCharVal ('#':i) = return . fst . head . readDec $ i
readCharVal _ = mzero
-}
--pereference :: HParser PEReference
--pereference = do
-- bracket (tok TokPercent) nmtoken (tok TokSemi)
--
--entitydecl :: HParser EntityDecl
--entitydecl =
-- ( gedecl >>= return . EntityGEDecl) +++
-- ( pedecl >>= return . EntityPEDecl)
--
--gedecl :: HParser GEDecl
--gedecl = do
-- tok TokSpecialOpen
-- tok (TokSpecial ENTITYx)
-- n <- name
-- e <- entitydef `elserror` "missing entity defn in G ENTITY decl"
-- tok TokAnyClose `elserror` "expected > terminating G ENTITY decl"
-- return (GEDecl n e)
--
--pedecl :: HParser PEDecl
--pedecl = do
-- tok TokSpecialOpen
-- tok (TokSpecial ENTITYx)
-- tok TokPercent
-- n <- name
-- e <- pedef `elserror` "missing entity defn in P ENTITY decl"
-- tok TokAnyClose `elserror` "expected > terminating P ENTITY decl"
-- return (PEDecl n e)
--
--entitydef :: HParser EntityDef
--entitydef =
-- ( entityvalue >>= return . DefEntityValue) +++
-- ( do eid <- externalid
-- ndd <- maybe ndatadecl
-- return (DefExternalID eid ndd))
--
--pedef :: HParser PEDef
--pedef =
-- ( entityvalue >>= return . PEDefEntityValue) +++
-- ( externalid >>= return . PEDefExternalID)
externalid :: HParser ExternalID
externalid =
( do word "SYSTEM"
s <- systemliteral
return (SYSTEM s)) +++
( do word "PUBLIC"
p <- pubidliteral
s <- (systemliteral +++ return (SystemLiteral ""))
return (PUBLIC p s))
--ndatadecl :: HParser NDataDecl
--ndatadecl = do
-- word "NDATA"
-- n <- name
-- return (NDATA n)
textdecl :: HParser TextDecl
textdecl = do
tok TokPIOpen
(word "xml" +++ word "XML")
v <- maybe versioninfo
e <- encodingdecl
tok TokPIClose `elserror` "expected ?> terminating text decl"
return (TextDecl v e)
--extparsedent :: HParser ExtParsedEnt
--extparsedent = do
-- t <- maybe textdecl
-- (_,c) <- (content "")
-- return (ExtParsedEnt t c)
--
--extpe :: HParser ExtPE
--extpe = do
-- t <- maybe textdecl
-- e <- extsubsetdecl
-- return (ExtPE t e)
encodingdecl :: HParser EncodingDecl
encodingdecl = do
(word "encoding" +++ word "ENCODING")
tok TokEqual `elserror` "expected = in 'encoding' decl"
f <- bracket (tok TokQuote) freetext (tok TokQuote)
return (EncodingDecl f)
--notationdecl :: HParser NotationDecl
--notationdecl = do
-- tok TokSpecialOpen
-- word "NOTATION"
-- n <- name
-- e <- either externalid publicid
-- tok TokAnyClose `elserror` "expected > terminating NOTATION decl"
-- return (NOTATION n e)
publicid :: HParser PublicID
publicid = do
word "PUBLICID"
p <- pubidliteral
return (PUBLICID p)
entityvalue :: HParser EntityValue
entityvalue = do
evs <- bracket (tok TokQuote) (many ev) (tok TokQuote)
return (EntityValue evs)
ev :: HParser EV
ev =
( freetext >>= return . EVString) +++
-- PEREF(EVPERef,ev) +++
( reference >>= return . EVRef)
attvalue :: HParser AttValue
attvalue =
( do avs <- bracket (tok TokQuote)
(many (either freetext reference))
(tok TokQuote)
return (AttValue avs) ) +++
( do p <- (tok TokPlus >> return ('+':))
+++ (tok TokHash >> return ('#':))
+++ (return id)
v <- nmtoken
return (AttValue [Left (p v)]) )
systemliteral :: HParser SystemLiteral
systemliteral = do
s <- bracket (tok TokQuote) freetext (tok TokQuote)
return (SystemLiteral s) -- note: need to fold &...; escapes
pubidliteral :: HParser PubidLiteral
pubidliteral = do
s <- bracket (tok TokQuote) freetext (tok TokQuote)
return (PubidLiteral s) -- note: need to fold &...; escapes
chardata :: HParser CharData
chardata = freetext -- >>= return . CharData