-- |A structure-recovering parser for malformed documents. -- -- Copyright 2007-2008 Arjun Guha. -- Based on HtmlPrag 0.16 Copyright (C) 2003 - 2005 Neil W. Van Dyke. -- -- This program is Free Software; you can redistribute it and/or modify it under -- the terms of the GNU Lesser General Public License as published by the Free -- Software Foundation; either version 2.1 of the License, or (at your option) -- any later version. This program is distributed in the hope that it will be -- useful, but without any warranty; without even the implied warranty of -- merchantability or fitness for a particular purpose. See -- for details. For other license -- options and consulting, contact the author. module WebBits.Html.PermissiveParser ( html , parseHtmlFromFile , parseHtmlFromString -- tokenizer is exported primarily for testing , tokens , Token ) where import Control.Monad import Text.ParserCombinators.Parsec hiding (token,tokens) import qualified Text.ParserCombinators.Parsec as Parsec import Data.Char (toLower) import Data.List (intersperse) import qualified WebBits.Html.Syntax as Html import WebBits.Html.Syntax (HtmlId,Attribute,Script(..)) type ParsedHtml s = Html.Html SourcePos s type ParsedAttribute s = Html.Attribute SourcePos s -------------------------------------------------------------------------------- -- Parsers generate warnings data Warning = StringWarning SourcePos String instance Show Warning where show (StringWarning p s) = "Warning parsing HTML: " ++ s showList [] s = s showList (x:xs) s = show x ++ ('\n':showList xs s) warn:: String -> GenParser tok [Warning] () warn s = do p <- getPosition updateState ((StringWarning p s):) noWarnings:: [Warning] noWarnings = [] -- A structure-recovering parser for malformed documents, derived from -- Neil W. Van Dyke's htmlprag library for PLT Scheme -- The elements in the list can legally enclose the 1st element of the pair. parentConstraints:: [(HtmlId,[HtmlId])] parentConstraints = [("area",["map"]), ("body",["html"]), ("caption", ["table"]), ("colgroup", ["table"]), ("dd", ["dl"]), ("dt", ["dl"]), ("frame", ["frameset"]), ("head", ["html"]), ("isindex", ["head"]), ("li", ["dir", "menu", "ol", "ul"]), ("meta", ["head"]), ("noframes", ["frameset"]), ("option", ["select"]), ("p", ["body", "td", "th"]), ("param", ["applet"]), ("tbody", ["table"]), ("td", ["tr"]), ("th", ["tr"]), ("thead", ["table"]), ("title", ["head"]), ("tr", ["table", "tbody", "thead"])] -- |List of HTML elements that are empty. emptyElements:: [HtmlId] emptyElements = ["area", "base", "br", "frame", "hr", "img", "input", "isindex", "keygen", "link", "meta", "object", "param", "spacer", "wbr"] isLegalChildOf:: HtmlId -> HtmlId -> Bool isLegalChildOf child parent = case lookup child parentConstraints of Nothing -> True (Just legalParents) -> parent `elem` legalParents isEmptyElement:: HtmlId -> Bool isEmptyElement element = element `elem` emptyElements --}}} -------------------------------------------------------------------------------- -- Parses an HTML file into a stream of tokens. -- The auxillary parsing functions return values of this type. data Script s => Token s = Text SourcePos String | EntityToken SourcePos String | EntityInt SourcePos Int | Tag SourcePos HtmlId [Attribute SourcePos s] Bool {-closed?-} | Script SourcePos s | Inline SourcePos s String | EndTag SourcePos HtmlId | Comment SourcePos String | DoctypeToken SourcePos String String String (Maybe String) token:: Script s => Bool -> [Attribute SourcePos s] -> CharParser [Warning] (Token s) token expectedScript prevAttrs = case expectedScript of True -> (liftM2 Script getPosition (parseScriptBlock prevAttrs)) "expected a script after a