-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Parser.HtmlParsec Copyright : Copyright (C) 2005 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable This parser tries to interprete everything as HTML no errors are emitted during parsing. If something looks weired, warning messages are inserted in the document tree. All filter are pure XmlFilter, errror handling and IO is done in 'Text.XML.HXT.Parser.HtmlParser' or other modules -} -- ------------------------------------------------------------ module Text.XML.HXT.Parser.HtmlParsec ( parseHtmlText , parseHtmlDocument , parseHtmlContent , isEmptyHtmlTag , isInnerHtmlTagOf , closesHtmlTag , emptyHtmlTags ) where import Control.Applicative ( (<$>) ) import Data.Char ( toLower , toUpper ) import Data.Char.Properties.XMLCharProps ( isXmlChar ) import Data.Maybe ( fromMaybe , fromJust ) import qualified Data.Map as M import Text.ParserCombinators.Parsec ( SourcePos , anyChar , between -- , char , eof , getPosition , many , many1 , noneOf , option , runParser , satisfy , string , try , (<|>) ) import Text.XML.HXT.DOM.Interface import Text.XML.HXT.DOM.XmlNode ( mkText' , mkError' , mkCdata' , mkCmt' , mkCharRef' , mkElement' , mkAttr' , mkDTDElem' , mkPi' , isEntityRef , getEntityRef ) import Text.XML.HXT.Parser.XmlTokenParser ( allBut , amp , dq , eq , gt , lt , name , pubidLiteral , skipS , skipS0 , sPace , sq , systemLiteral , checkString , singleCharsT , referenceT , mergeTextNodes ) import Text.XML.HXT.Parser.XmlParsec ( misc , parseXmlText , xMLDecl' ) import Text.XML.HXT.Parser.XmlCharParser ( xmlChar , SimpleXParser , withNormNewline ) import Text.XML.HXT.Parser.XhtmlEntities ( xhtmlEntities ) -- ------------------------------------------------------------ parseHtmlText :: String -> XmlTree -> XmlTrees parseHtmlText loc t = parseXmlText htmlDocument (withNormNewline ()) loc $ t -- ------------------------------------------------------------ parseHtmlFromString :: SimpleXParser XmlTrees -> String -> String -> XmlTrees parseHtmlFromString parser loc = either ((:[]) . mkError' c_err . (++ "\n") . show) id . runParser parser (withNormNewline ()) loc parseHtmlDocument :: String -> String -> XmlTrees parseHtmlDocument = parseHtmlFromString htmlDocument parseHtmlContent :: String -> XmlTrees parseHtmlContent = parseHtmlFromString htmlContent "text" -- ------------------------------------------------------------ type Context = (XmlTreeFl, OpenTags) type XmlTreeFl = XmlTrees -> XmlTrees type OpenTags = [(String, XmlTrees, XmlTreeFl)] -- ------------------------------------------------------------ htmlDocument :: SimpleXParser XmlTrees htmlDocument = do pl <- htmlProlog el <- htmlContent eof return (pl ++ el) htmlProlog :: SimpleXParser XmlTrees htmlProlog = do xml <- option [] ( try xMLDecl' <|> ( do pos <- getPosition checkString " ( do pos <- getPosition upperCaseString " htmlContent' htmlContent' :: SimpleXParser XmlTrees htmlContent' = option [] ( do context <- hContent (id, []) pos <- getPosition return $ closeTags pos context ) where closeTags _pos (body, []) = body [] closeTags pos' (body, ((tn, al, body1) : restOpen)) = closeTags pos' ( addHtmlWarn (show pos' ++ ": no closing tag found for \"<" ++ tn ++ " ...>\"") . addHtmlTag tn al body $ (body1, restOpen) ) -- ------------------------------------------------------------ hElement :: Context -> SimpleXParser Context hElement context = ( do t <- hSimpleData return (addHtmlElem t context) ) <|> hCloseTag context <|> hOpenTag context <|> ( do -- wrong tag, take it as text pos <- getPosition c <- xmlChar return ( addHtmlWarn (show pos ++ " markup char " ++ show c ++ " not allowed in this context") . addHtmlElem (mkText' [c]) $ context ) ) <|> ( do pos <- getPosition c <- anyChar return ( addHtmlWarn ( show pos ++ " illegal data in input or illegal XML char " ++ show c ++ " found and ignored, possibly wrong encoding scheme used") $ context ) ) hSimpleData :: SimpleXParser XmlTree hSimpleData = charData'' <|> hReference' <|> hComment <|> hpI <|> hcDSect where charData'' = do t <- many1 (satisfy (\ x -> isXmlChar x && not (x == '<' || x == '&'))) return (mkText' t) hCloseTag :: Context -> SimpleXParser Context hCloseTag context = do checkString " in tag \" SimpleXParser Context hOpenTag context = ( do e <- hOpenTagStart hOpenTagRest e context ) hOpenTagStart :: SimpleXParser ((SourcePos, String), XmlTrees) hOpenTagStart = do np <- try ( do lt pos <- getPosition n <- lowerCaseName return (pos, n) ) skipS0 as <- hAttrList return (np, as) hOpenTagRest :: ((SourcePos, String), XmlTrees) -> Context -> SimpleXParser Context hOpenTagRest ((pos, tn), al) context = ( do checkString "/>" return (addHtmlTag tn al id context) ) <|> ( do context1 <- checkSymbol gt ("closing > in tag \"<" ++ tn ++ "...\" expected") context return ( let context2 = closePrevTag pos tn context1 in ( if isEmptyHtmlTag tn then addHtmlTag tn al id else openTag tn al ) context2 ) ) hAttrList :: SimpleXParser XmlTrees hAttrList = many (try hAttribute) where hAttribute = do n <- lowerCaseName v <- hAttrValue skipS0 return $ mkAttr' (mkName n) v hAttrValue :: SimpleXParser XmlTrees hAttrValue = option [] ( eq >> hAttrValue' ) hAttrValue' :: SimpleXParser XmlTrees hAttrValue' = try ( between dq dq (hAttrValue'' "&\"") ) <|> try ( between sq sq (hAttrValue'' "&\'") ) <|> ( do -- HTML allows unquoted attribute values cs <- many (noneOf " \r\t\n>\"\'") return [mkText' cs] ) hAttrValue'' :: String -> SimpleXParser XmlTrees hAttrValue'' notAllowed = many ( hReference' <|> singleCharsT notAllowed) hReference' :: SimpleXParser XmlTree hReference' = try hReferenceT <|> ( do amp return (mkText' "&") ) hReferenceT :: SimpleXParser XmlTree hReferenceT = do r <- referenceT return ( if isEntityRef r then substRef r else r ) where -- optimization: HTML entity refs are substituted by char refs, so a later entity ref substituion isn't required substRef r = case (lookup en xhtmlEntities) of Just i -> mkCharRef' i Nothing -> r -- not found: the entity ref remains as it is -- this is also done in the XML parser {- alternative def Nothing -> mkText' ("&" ++ en ++ ";") -- not found: the entity ref is taken as text -} where en = fromJust . getEntityRef $ r hContent :: Context -> SimpleXParser Context hContent context = option context ( hElement context >>= hContent ) -- ------------------------------------------------------------ -- hComment allows "--" in comments -- comment from XML spec does not hComment :: SimpleXParser XmlTree hComment = do checkString "" closeCmt pos c where closeCmt pos c = ( do checkString "-->" return (mkCmt' c) ) <|> ( return $ mkError' c_warn (show pos ++ " no closing comment sequence \"-->\" found") ) -- ------------------------------------------------------------ hpI :: SimpleXParser XmlTree hpI = checkString "> ( try ( do n <- name p <- sPace >> allBut many "?>" string "?>" >> return (mkPi' (mkName n) [mkAttr' (mkName a_value) [mkText' p]]) ) <|> ( do pos <- getPosition return $ mkError' c_warn (show pos ++ " illegal PI found") ) ) -- ------------------------------------------------------------ hcDSect :: SimpleXParser XmlTree hcDSect = do checkString "" closeCD pos t where closeCD pos t = ( do checkString "]]>" return (mkCdata' t) ) <|> ( return $ mkError' c_warn (show pos ++ " no closing CDATA sequence \"]]>\" found") ) -- ------------------------------------------------------------ checkSymbol :: SimpleXParser () -> String -> Context -> SimpleXParser Context checkSymbol p msg context = ( p >> return context ) <|> ( do pos <- getPosition return $ addHtmlWarn (show pos ++ " " ++ msg) context ) lowerCaseName :: SimpleXParser String lowerCaseName = do n <- name return (map toLower n) upperCaseString :: String -> SimpleXParser () upperCaseString s = try (sequence (map (\ c -> satisfy (( == c) . toUpper)) s)) >> return () -- ------------------------------------------------------------ addHtmlTag :: String -> XmlTrees -> XmlTreeFl -> Context -> Context addHtmlTag tn al body context = e `seq` addHtmlElem e context where e = mkElement' (mkName tn) al (body []) addHtmlWarn :: String -> Context -> Context addHtmlWarn msg = addHtmlElem (mkError' c_warn msg) addHtmlElem :: XmlTree -> Context -> Context addHtmlElem elem' (body, openTags) = (body . (elem' :), openTags) openTag :: String -> XmlTrees -> Context -> Context openTag tn al (body, openTags) = (id, (tn, al, body) : openTags) closeTag :: SourcePos -> String -> Context -> Context closeTag pos n context | n `elem` (map ( \ (n1, _, _) -> n1) $ snd context) = closeTag' n context | otherwise = addHtmlWarn (show pos ++ " no opening tag found for ") . addHtmlTag n [] id $ context where closeTag' n' (body', (n1, al1, body1) : restOpen) = close context1 where context1 = addHtmlTag n1 al1 body' (body1, restOpen) close | n' == n1 = id | n1 `isInnerHtmlTagOf` n' = closeTag pos n' | otherwise = addHtmlWarn (show pos ++ " no closing tag found for \"<" ++ n1 ++ " ...>\"") . closeTag' n' closeTag' _ _ = error "illegal argument for closeTag'" closePrevTag :: SourcePos -> String -> Context -> Context closePrevTag _pos _n context@(_body, []) = context closePrevTag pos n context@(body, (n1, al1, body1) : restOpen) | n `closesHtmlTag` n1 = closePrevTag pos n ( addHtmlWarn (show pos ++ " tag \"<" ++ n1 ++ " ...>\" implicitly closed by opening tag \"<" ++ n ++ " ...>\"") . addHtmlTag n1 al1 body $ (body1, restOpen) ) | otherwise = context -- ------------------------------------------------------------ -- -- taken from HaXml and extended isEmptyHtmlTag :: String -> Bool isEmptyHtmlTag n = n `elem` emptyHtmlTags emptyHtmlTags :: [String] emptyHtmlTags = [ "area" , "base" , "br" , "col" , "frame" , "hr" , "img" , "input" , "link" , "meta" , "param" ] {-# INLINE emptyHtmlTags #-} isInnerHtmlTagOf :: String -> String -> Bool n `isInnerHtmlTagOf` tn = n `elem` ( fromMaybe [] . lookup tn $ [ ("body", ["p"]) , ("caption", ["p"]) , ("dd", ["p"]) , ("div", ["p"]) , ("dl", ["dt","dd"]) , ("dt", ["p"]) , ("li", ["p"]) , ("map", ["p"]) , ("object", ["p"]) , ("ol", ["li"]) , ("table", ["th","tr","td","thead","tfoot","tbody"]) , ("tbody", ["th","tr","td"]) , ("td", ["p"]) , ("tfoot", ["th","tr","td"]) , ("th", ["p"]) , ("thead", ["th","tr","td"]) , ("tr", ["th","td"]) , ("ul", ["li"]) ] ) -- a bit more efficient implementation of closes closesHtmlTag :: String -> String -> Bool closesHtmlTag t t2 = fromMaybe False . fmap ($ t) . M.lookup t2 $ closedByTable {-# INLINE closesHtmlTag #-} closedByTable :: M.Map String (String -> Bool) closedByTable = M.fromList $ [ ("a", (== "a")) , ("li", (== "li" )) , ("th", (`elem` ["th", "td", "tr"] )) , ("td", (`elem` ["th", "td", "tr"] )) , ("tr", (== "tr")) , ("dt", (`elem` ["dt", "dd"] )) , ("dd", (`elem` ["dt", "dd"] )) , ("p", (`elem` ["hr" , "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("colgroup", (`elem` ["colgroup", "thead", "tfoot", "tbody"] )) , ("form", (`elem` ["form"] )) , ("label", (`elem` ["label"] )) , ("map", (`elem` ["map"] )) , ("option", const True) , ("script", const True) , ("style", const True) , ("textarea", const True) , ("title", const True) , ("select", ( /= "option")) , ("thead", (`elem` ["tfoot","tbody"] )) , ("tbody", (== "tbody" )) , ("tfoot", (== "tbody" )) , ("h1", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("h2", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("h3", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("h4", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("h5", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) , ("h6", (`elem` ["h1", "h2", "h3", "h4", "h5", "h6", "dl", "ol", "ul", "table", "div", "p"] )) ] {- closesHtmlTag :: String -> String -> Bool closesHtmlTag = closes closes :: String -> String -> Bool "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "dd" `closes` t | t `elem` ["dt","dd"] = True "hr" `closes` "p" = True "colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True "object" `closes` "object" = True _ `closes` t | t `elem` ["option" ,"script" ,"style" ,"textarea" ,"title" ] = True t `closes` "select" | t /= "option" = True "thead" `closes` t | t `elem` ["colgroup"] = True "tfoot" `closes` t | t `elem` ["thead" ,"colgroup"] = True "tbody" `closes` t | t `elem` ["tbody" ,"tfoot" ,"thead" ,"colgroup"] = True t `closes` t2 | t `elem` ["h1","h2","h3" ,"h4","h5","h6" ,"dl","ol","ul" ,"table" ,"div","p" ] && t2 `elem` ["h1","h2","h3" ,"h4","h5","h6" ,"p" -- not "div" ] = True _ `closes` _ = False -} -- ------------------------------------------------------------