module Text.XML.HaXml.Html.Parse
( htmlParse
) where
import Prelude hiding (either,maybe,sequence)
import qualified Prelude (either)
import Maybe hiding (maybe)
import Char (toLower, isSpace, isDigit, isHexDigit)
import Numeric (readDec,readHex)
import Monad
import Text.XML.HaXml.Types
import Text.XML.HaXml.Lex
import Text.ParserCombinators.HuttonMeijerWallace
#if defined(DEBUG)
# if ( defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ > 502 ) || \
( defined(__NHC__) && __NHC__ > 114 ) || defined(__HUGS__)
import Debug.Trace(trace)
# elif defined(__GLASGOW_HASKELL__)
import IOExts(trace)
# elif defined(__NHC__) || defined(__HBC__)
import NonStdTrace
# endif
debug :: Monad m => String -> m ()
debug s = trace s (return ())
#else
debug :: Monad m => String -> m ()
debug s = return ()
#endif
htmlParse :: String -> String -> Document
htmlParse name = simplify . sanitycheck . Prelude.either error id
. papply' document () . xmlLex name
sanitycheck :: Show p => [(a,s,[Either String (p,t)])] -> a
sanitycheck [] = error "***Error at line 0: document not HTML?"
sanitycheck ((x,_,[]):_) = x
sanitycheck ((x,_,s@(Right (n,_):_)):xs) =
error ("***Error at "++show n++": data beyond end of parsed document")
simplify :: Document -> Document
simplify (Document p st (Elem n avs cs) ms) =
Document p st (Elem n avs (deepfilter simp cs)) ms
where
simp (CElem (Elem "null" [] [])) = False
simp (CElem (Elem n _ [])) | n `elem` ["font","p","i","b","em"
,"tt","big","small"] = False
simp (CString False s) | all isSpace s = False
simp _ = True
deepfilter p =
filter p . map (\c-> case c of
(CElem (Elem n avs cs)) -> CElem (Elem n avs (deepfilter p cs))
_ -> c)
selfclosingtags = ["img","hr","br","meta","col","link","base"
,"param","area","frame","input"]
closeInnerTags =
[ ("ul", ["li"])
, ("ol", ["li"])
, ("dl", ["dt","dd"])
, ("tr", ["th","td"])
, ("div", ["p"])
, ("thead", ["th","tr","td"])
, ("tfoot", ["th","tr","td"])
, ("tbody", ["th","tr","td"])
, ("table", ["th","tr","td","thead","tfoot","tbody"])
, ("caption", ["p"])
, ("th", ["p"])
, ("td", ["p"])
, ("li", ["p"])
, ("dt", ["p"])
, ("dd", ["p"])
, ("object", ["p"])
, ("map", ["p"])
, ("body", ["p"])
]
closes :: Name -> Name -> Bool
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"dd" `closes` t | t `elem` ["dt","dd"] = True
"form" `closes` "form" = True
"label" `closes` "label" = True
_ `closes` "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
"colgroup" `closes` "colgroup" = True
t `closes` "p"
| t `elem` ["p","h1","h2","h3","h4","h5","h6"
,"hr","div","ul","dl","ol","table"] = True
_ `closes` _ = False
fst3 (a,_,_) = a
snd3 (_,a,_) = a
thd3 (_,_,a) = a
type HParser a = Parser () (Posn,TokenT) String a
name :: HParser Name
name = do {(p,TokName s) <- item; return s}
string, freetext :: HParser String
string = do {(p,TokName s) <- item; return s}
freetext = do {(p,TokFreeText s) <- item; return s}
maybe :: HParser a -> HParser (Maybe a)
maybe p =
( p >>= return . Just) +++
( return Nothing)
either :: HParser a -> HParser b -> HParser (Either a b)
either p q =
( p >>= return . Left) +++
( q >>= return . Right)
word :: String -> HParser ()
word s = P (\st inp-> case inp of {
(Left err: _) -> Left err;
(Right (p,TokName n):ts) -> if s==n then Right [((),st,ts)]
else Right [];
(Right (p,TokFreeText n):ts) -> if s==n then Right [((),st,ts)]
else Right [];
ts -> Right [] } )
posn :: HParser Posn
posn = P (\st inp-> case inp of {
(Left err: _) -> Left err;
(Right (p,_):_) -> Right [(p,st,inp)];
[] -> Right [(Pn "unknown" 0 0 Nothing,st,inp)]; } )
nmtoken :: HParser NmToken
nmtoken = (string +++ freetext)
document :: HParser Document
document = do
p <- prolog `elserror` "unrecognisable XML prolog"
es <- many1 (element "HTML document")
ms <- many misc
return (Document p emptyST (case map snd es of
[e] -> e
es -> Elem "html" [] (map CElem es))
ms)
comment :: HParser Comment
comment = do
bracket (tok TokCommentOpen) freetext (tok TokCommentClose)
processinginstruction :: HParser ProcessingInstruction
processinginstruction = do
tok TokPIOpen
n <- string `elserror` "processing instruction has no target"
f <- freetext
(tok TokPIClose +++ tok TokAnyClose) `elserror` "missing ?> or >"
return (n, f)
cdsect :: HParser CDSect
cdsect = do
tok TokSectionOpen
bracket (tok (TokSection CDATAx)) chardata (tok TokSectionClose)
prolog :: HParser Prolog
prolog = do
x <- maybe xmldecl
m1 <- many misc
dtd <- maybe doctypedecl
m2 <- many misc
return (Prolog x m1 dtd m2)
xmldecl :: HParser XMLDecl
xmldecl = do
tok TokPIOpen
(word "xml" +++ word "XML")
p <- posn
s <- freetext
tok TokPIClose `elserror` "missing ?> in <?xml ...?>"
(raise . papply' aux () . xmlReLex p) s
where
aux = do
v <- versioninfo `elserror` "missing XML version info"
e <- maybe encodingdecl
s <- maybe sddecl
return (XMLDecl v e s)
raise (Left err) = mzero `elserror` err
raise (Right ok) = (return . fst3 . head) ok
versioninfo :: HParser VersionInfo
versioninfo = do
(word "version" +++ word "VERSION")
tok TokEqual
bracket (tok TokQuote) freetext (tok TokQuote)
misc :: HParser Misc
misc =
( comment >>= return . Comment) +++
( processinginstruction >>= return . PI)
doctypedecl :: HParser DocTypeDecl
doctypedecl = do
tok TokSpecialOpen
tok (TokSpecial DOCTYPEx)
n <- name
eid <- maybe externalid
tok TokAnyClose `elserror` "missing > in DOCTYPE decl"
return (DTD n eid [])
sddecl :: HParser SDDecl
sddecl = do
(word "standalone" +++ word "STANDALONE")
tok TokEqual `elserror` "missing = in 'standalone' decl"
bracket (tok TokQuote)
( (word "yes" >> return True) +++
(word "no" >> return False) `elserror`
"'standalone' decl requires 'yes' or 'no' value" )
(tok TokQuote)
type Stack = [(Name,[Attribute])]
element :: Name -> HParser (Stack,Element)
element ctx =
do
tok TokAnyOpen
(ElemTag e avs) <- elemtag
( if e `closes` ctx then
( do debug ("/")
unparse ([TokEndOpen, TokName ctx, TokAnyClose,
TokAnyOpen, TokName e] ++ reformatAttrs avs)
return ([], Elem "null" [] []))
else if e `elem` selfclosingtags then
( do tok TokEndClose
debug (e++"[+]")
return ([], Elem e avs [])) +++
( do tok TokAnyClose
debug (e++"[+]")
return ([], Elem e avs []))
else
(( do tok TokEndClose
debug (e++"[]")
return ([], Elem e avs [])) +++
( do tok TokAnyClose `elserror` "missing > or /> in element tag"
debug (e++"[")
zz <- many (content e)
let (ss,cs) = unzip zz
let s = if null ss then [] else last ss
n <- bracket (tok TokEndOpen) name (tok TokAnyClose)
debug "]"
( if e == (map toLower n :: Name) then
do unparse (reformatTags (closeInner e s))
debug "^"
return ([], Elem e avs cs)
else
do unparse [TokEndOpen, TokName n, TokAnyClose]
debug "-"
return (((e,avs):s), Elem e avs cs))
) `elserror` ("failed to repair non-matching tags in context: "++ctx)))
closeInner :: Name -> [(Name,[Attribute])] -> [(Name,[Attribute])]
closeInner c ts =
case lookup c closeInnerTags of
(Just these) -> filter ((`notElem` these).fst) ts
Nothing -> ts
unparse ts = do p <- posn
reparse (map Right (zip (repeat p) ts))
reformatAttrs avs = concatMap f0 avs
where f0 (a, AttValue [Left s]) = [TokName a, TokEqual, TokQuote,
TokFreeText s, TokQuote]
reformatTags ts = concatMap f0 ts
where f0 (t,avs) = [TokAnyOpen, TokName t]++reformatAttrs avs++[TokAnyClose]
content :: Name -> HParser (Stack,Content)
content ctx =
( element ctx >>= \(s,e)-> return (s, CElem e)) +++
( chardata >>= \s-> return ([], CString False s)) +++
( reference >>= \r-> return ([], CRef r)) +++
( cdsect >>= \c-> return ([], CString True c)) +++
( misc >>= \m-> return ([], CMisc m))
elemtag :: HParser ElemTag
elemtag = do
n <- name `elserror` "malformed element tag"
as <- many attribute
return (ElemTag (map toLower n) as)
attribute :: HParser Attribute
attribute = do
n <- name
v <- (do tok TokEqual
attvalue) +++
(return (AttValue [Left "TRUE"]))
return (map toLower n,v)
reference :: HParser Reference
reference = do
bracket (tok TokAmp) (freetext >>= val) (tok TokSemi)
where
val ('#':'x':i) | all isHexDigit i
= return . RefChar . fst . head . readHex $ i
val ('#':i) | all isDigit i
= return . RefChar . fst . head . readDec $ i
val name = return . RefEntity $ name
externalid :: HParser ExternalID
externalid =
( do word "SYSTEM"
s <- systemliteral
return (SYSTEM s)) +++
( do word "PUBLIC"
p <- pubidliteral
s <- (systemliteral +++ return (SystemLiteral ""))
return (PUBLIC p s))
textdecl :: HParser TextDecl
textdecl = do
tok TokPIOpen
(word "xml" +++ word "XML")
v <- maybe versioninfo
e <- encodingdecl
tok TokPIClose `elserror` "expected ?> terminating text decl"
return (TextDecl v e)
encodingdecl :: HParser EncodingDecl
encodingdecl = do
(word "encoding" +++ word "ENCODING")
tok TokEqual `elserror` "expected = in 'encoding' decl"
f <- bracket (tok TokQuote) freetext (tok TokQuote)
return (EncodingDecl f)
publicid :: HParser PublicID
publicid = do
word "PUBLICID"
p <- pubidliteral
return (PUBLICID p)
entityvalue :: HParser EntityValue
entityvalue = do
evs <- bracket (tok TokQuote) (many ev) (tok TokQuote)
return (EntityValue evs)
ev :: HParser EV
ev =
( freetext >>= return . EVString) +++
( reference >>= return . EVRef)
attvalue :: HParser AttValue
attvalue =
( do avs <- bracket (tok TokQuote)
(many (either freetext reference))
(tok TokQuote)
return (AttValue avs) ) +++
( do p <- (tok TokPlus >> return ('+':))
+++ (tok TokHash >> return ('#':))
+++ (return id)
v <- nmtoken
return (AttValue [Left (p v)]) )
systemliteral :: HParser SystemLiteral
systemliteral = do
s <- bracket (tok TokQuote) freetext (tok TokQuote)
return (SystemLiteral s)
pubidliteral :: HParser PubidLiteral
pubidliteral = do
s <- bracket (tok TokQuote) freetext (tok TokQuote)
return (PubidLiteral s)
chardata :: HParser CharData
chardata = freetext