-- ------------------------------------------------------------ {- | Module : Yuuko.Text.XML.HXT.Parser.HtmlParsec Copyright : Copyright (C) 2005 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable This parser tries to interprete everything as HTML no errors are emitted during parsing. If something looks weired, warning messages are inserted in the document tree. All filter are pure XmlFilter, errror handling and IO is done in 'Yuuko.Text.XML.HXT.Parser.HtmlParser' or other modules -} -- ------------------------------------------------------------ module Yuuko.Text.XML.HXT.Parser.HtmlParsec ( parseHtmlText , parseHtmlDocument , parseHtmlContent , isEmptyHtmlTag , isInnerHtmlTagOf , closesHtmlTag , emptyHtmlTags ) where import Yuuko.Text.XML.HXT.DOM.Interface import Yuuko.Text.XML.HXT.DOM.XmlNode ( mkText , mkError , mkCmt , mkElement , mkAttr , mkDTDElem ) import Text.ParserCombinators.Parsec ( Parser , SourcePos , anyChar , between , char , eof , getPosition , many , noneOf , option , parse , satisfy , string , try , (<|>) ) import Yuuko.Text.XML.HXT.Parser.XmlTokenParser ( allBut , dq , eq , gt , name , pubidLiteral , skipS , skipS0 , sq , systemLiteral , singleCharsT , referenceT ) import Yuuko.Text.XML.HXT.Parser.XmlParsec ( cDSect , charData' , misc , parseXmlText , pI , xMLDecl' ) import Yuuko.Text.XML.HXT.Parser.XmlCharParser ( xmlChar ) import Data.Maybe ( fromMaybe ) import Data.Char ( toLower , toUpper ) -- ------------------------------------------------------------ parseHtmlText :: String -> XmlTree -> XmlTrees parseHtmlText loc t = parseXmlText htmlDocument loc $ t -- ------------------------------------------------------------ parseHtmlFromString :: Parser XmlTrees -> String -> String -> XmlTrees parseHtmlFromString parser loc = either ((:[]) . mkError c_err . (++ "\n") . show) id . parse parser loc parseHtmlDocument :: String -> String -> XmlTrees parseHtmlDocument = parseHtmlFromString htmlDocument parseHtmlContent :: String -> XmlTrees parseHtmlContent = parseHtmlFromString htmlContent "text" -- ------------------------------------------------------------ htmlDocument :: Parser XmlTrees htmlDocument = do pl <- htmlProlog el <- htmlContent eof return (pl ++ el) htmlProlog :: Parser XmlTrees htmlProlog = do xml <- option [] ( try xMLDecl' <|> ( do pos <- getPosition _ <- try (string " ( do pos <- getPosition _ <- try (upperCaseString "') ( do skipS n <- name exId <- ( do skipS option [] externalID ) skipS0 return [mkDTDElem DOCTYPE ((a_name, n) : exId) []] ) externalID :: Parser Attributes externalID = do _ <- try (upperCaseString k_public) skipS pl <- pubidLiteral sl <- option "" $ try ( do skipS systemLiteral ) return $ (k_public, pl) : if null sl then [] else [(k_system, sl)] htmlContent :: Parser XmlTrees htmlContent = option [] ( do context <- hContent ([], []) pos <- getPosition return $ closeTags pos context ) where closeTags _ (body, []) = reverse body closeTags pos' (body, ((tn, al, body1) : restOpen)) = closeTags pos' (addHtmlWarn (show pos' ++ ": no closing tag found for \"<" ++ tn ++ " ...>\"") . addHtmlTag tn al body $ (body1, restOpen) ) type OpenTags = [(String, XmlTrees, XmlTrees)] type Context = (XmlTrees, OpenTags) hElement :: Context -> Parser Context hElement context = ( do t <- hSimpleData return (addHtmlElems [t] context) ) <|> hOpenTag context <|> hCloseTag context <|> ( do -- wrong tag, take it as text pos <- getPosition c <- xmlChar return ( addHtmlWarn (show pos ++ " markup char " ++ show c ++ " not allowed in this context") . addHtmlElems [mkText [c]] $ context ) ) <|> ( do pos <- getPosition c <- anyChar return ( addHtmlWarn ( show pos ++ " illegal data in input or illegal XML char " ++ show c ++ " found and ignored, possibly wrong encoding scheme used") $ context ) ) hSimpleData :: Parser XmlTree hSimpleData = charData' <|> try referenceT <|> try hComment <|> try pI <|> try cDSect hCloseTag :: Context -> Parser Context hCloseTag context = do n <- try ( do _ <- string " in tag \" Parser Context hOpenTag context = ( do pos <- getPosition e <- hOpenTagStart hOpenTagRest pos e context ) hOpenTagStart :: Parser (String, XmlTrees) hOpenTagStart = do n <- try ( do _ <- char '<' n <- lowerCaseName return n ) skipS0 as <- hAttrList return (n, as) hOpenTagRest :: SourcePos -> (String, XmlTrees) -> Context -> Parser Context hOpenTagRest pos (tn, al) context = ( do _ <- try $ string "/>" return (addHtmlTag tn al [] context) ) <|> ( do context1 <- checkSymbol gt ("closing > in tag \"<" ++ tn ++ "...\" expected") context return ( let context2 = closePrevTag pos tn context1 in ( if isEmptyHtmlTag tn then addHtmlTag tn al [] else openTag tn al ) context2 ) ) hAttrList :: Parser XmlTrees hAttrList = many (try hAttribute) where hAttribute = do n <- lowerCaseName v <- hAttrValue skipS0 return $ mkAttr (mkName n) v hAttrValue :: Parser XmlTrees hAttrValue = option [] ( try ( do eq hAttrValue' ) ) hAttrValue' :: Parser XmlTrees hAttrValue' = try ( between dq dq (hAttrValue'' "&\"") ) <|> try ( between sq sq (hAttrValue'' "&\'") ) <|> ( do -- HTML allows unquoted attribute values cs <- many (noneOf " \r\t\n>\"\'") return [mkText cs] ) hAttrValue'' :: String -> Parser XmlTrees hAttrValue'' notAllowed = many ( hReference' <|> singleCharsT notAllowed) hReference' :: Parser XmlTree hReference' = try referenceT <|> ( do _ <- char '&' return (mkText "&") ) hContent :: Context -> Parser Context hContent context = option context ( do context1 <- hElement context hContent context1 ) -- hComment allows "--" in comments -- comment from XML spec does not hComment :: Parser XmlTree hComment = do c <- between (try $ string "") (allBut many "-->") return (mkCmt c) checkSymbol :: Parser a -> String -> Context -> Parser Context checkSymbol p msg context = do pos <- getPosition option (addHtmlWarn (show pos ++ " " ++ msg) context) ( do _ <- try p return context ) lowerCaseName :: Parser String lowerCaseName = do n <- name return (map toLower n) upperCaseString :: String -> Parser String upperCaseString = sequence . map (\ c -> satisfy (( == c) . toUpper)) -- ------------------------------------------------------------ addHtmlTag :: String -> XmlTrees -> XmlTrees -> Context -> Context addHtmlTag tn al body (body1, openTags) = ([mkElement (mkName tn) al (reverse body)] ++ body1, openTags) addHtmlWarn :: String -> Context -> Context addHtmlWarn msg = addHtmlElems [mkError c_warn msg] addHtmlElems :: XmlTrees -> Context -> Context addHtmlElems elems (body, openTags) = (reverse elems ++ body, openTags) openTag :: String -> XmlTrees -> Context -> Context openTag tn al (body, openTags) = ([], (tn, al, body) : openTags) closeTag :: SourcePos -> String -> Context -> Context closeTag pos n context | n `elem` (map ( \ (n1, _, _) -> n1) $ snd context) = closeTag' n context | otherwise = addHtmlWarn (show pos ++ " no opening tag found for ") . addHtmlTag n [] [] $ context where closeTag' n' (body', (n1, al1, body1) : restOpen) = close context1 where context1 = addHtmlTag n1 al1 body' (body1, restOpen) close | n' == n1 = id | n1 `isInnerHtmlTagOf` n' = closeTag pos n' | otherwise = addHtmlWarn (show pos ++ " no closing tag found for \"<" ++ n1 ++ " ...>\"") . closeTag' n' closeTag' _ _ = error "illegal argument for closeTag'" closePrevTag :: SourcePos -> String -> Context -> Context closePrevTag _pos _n context@(_body, []) = context closePrevTag pos n context@(body, (n1, al1, body1) : restOpen) | n `closes` n1 = closePrevTag pos n ( addHtmlWarn (show pos ++ " tag \"<" ++ n1 ++ " ...>\" implicitly closed by opening tag \"<" ++ n ++ " ...>\"") . addHtmlTag n1 al1 body $ (body1, restOpen) ) | otherwise = context -- ------------------------------------------------------------ -- -- taken from HaXml and extended isEmptyHtmlTag :: String -> Bool isEmptyHtmlTag n = n `elem` emptyHtmlTags emptyHtmlTags :: [String] emptyHtmlTags = [ "area" , "base" , "br" , "col" , "frame" , "hr" , "img" , "input" , "link" , "meta" , "param" ] isInnerHtmlTagOf :: String -> String -> Bool n `isInnerHtmlTagOf` tn = n `elem` ( fromMaybe [] . lookup tn $ [ ("body", ["p"]) , ("caption", ["p"]) , ("dd", ["p"]) , ("div", ["p"]) , ("dl", ["dt","dd"]) , ("dt", ["p"]) , ("li", ["p"]) , ("map", ["p"]) , ("object", ["p"]) , ("ol", ["li"]) , ("table", ["th","tr","td","thead","tfoot","tbody"]) , ("tbody", ["th","tr","td"]) , ("td", ["p"]) , ("tfoot", ["th","tr","td"]) , ("th", ["p"]) , ("thead", ["th","tr","td"]) , ("tr", ["th","td"]) , ("ul", ["li"]) ] ) closesHtmlTag , closes :: String -> String -> Bool closesHtmlTag = closes "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "dd" `closes` t | t `elem` ["dt","dd"] = True "hr" `closes` "p" = True "colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True "object" `closes` "object" = True _ `closes` t | t `elem` ["option" ,"script" ,"style" ,"textarea" ,"title" ] = True t `closes` "select" | t /= "option" = True "thead" `closes` t | t `elem` ["colgroup"] = True "tfoot" `closes` t | t `elem` ["thead" ,"colgroup"] = True "tbody" `closes` t | t `elem` ["tbody" ,"tfoot" ,"thead" ,"colgroup"] = True t `closes` t2 | t `elem` ["h1","h2","h3" ,"h4","h5","h6" ,"dl","ol","ul" ,"table" ,"div","p" ] && t2 `elem` ["h1","h2","h3" ,"h4","h5","h6" ,"p" -- not "div" ] = True _ `closes` _ = False -- ------------------------------------------------------------