module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
rawHtmlBlock,
anyHtmlBlockTag,
anyHtmlInlineTag,
anyHtmlTag,
anyHtmlEndTag,
htmlEndTag,
extractTagType,
htmlBlockElement
) where
import Text.ParserCombinators.Parsec
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.CharacterReferences ( characterReference,
decodeCharacterReferences )
import Data.Maybe ( fromMaybe )
import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
import Data.Char ( toUpper, toLower, isAlphaNum )
readHtml :: ParserState
-> String
-> Pandoc
readHtml = readWith parseHtml
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
"map", "area", "object", "script"]
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"br", "cite", "code", "dfn", "em", "font", "i", "img",
"input", "kbd", "label", "q", "s", "samp", "select",
"small", "span", "strike", "strong", "sub", "sup",
"textarea", "tt", "u", "var"] ++ eitherBlockOrInline
blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "hr", "isindex", "menu", "noframes",
"noscript", "ol", "p", "pre", "table", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr"] ++ eitherBlockOrInline
blocksTilEnd tag = do
blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
return $ filter (/= Null) blocks
inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
extractTagType :: String -> String
extractTagType ('<':rest) =
let isSpaceOrSlash c = c `elem` "/ \n\t" in
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
extractTagType _ = ""
anyHtmlTag = try $ do
char '<'
spaces
tag <- many1 alphaNum
attribs <- many htmlAttribute
spaces
ender <- option "" (string "/")
let ender' = if null ender then "" else " /"
spaces
char '>'
return $ "<" ++ tag ++
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
anyHtmlEndTag = try $ do
char '<'
spaces
char '/'
spaces
tagType <- many1 alphaNum
spaces
char '>'
return $ "</" ++ tagType ++ ">"
htmlTag :: String -> GenParser Char st (String, [(String, String)])
htmlTag tag = try $ do
char '<'
spaces
stringAnyCase tag
attribs <- many htmlAttribute
spaces
optional (string "/")
spaces
char '>'
return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
quoted quoteChar = do
result <- between (char quoteChar) (char quoteChar)
(many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
htmlMinimizedAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
return (name, name, name)
htmlRegularAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
char '='
spaces
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
(do
a <- many (alphaNum <|> (oneOf "-._:"))
return (a,"")) ]
return (name, content,
(name ++ "=" ++ quoteStr ++ content ++ quoteStr))
htmlEndTag tag = try $ do
char '<'
spaces
char '/'
spaces
stringAnyCase tag
spaces
char '>'
return $ "</" ++ tag ++ ">"
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
isBlock tag = (extractTagType tag) `elem` blockHtmlTags
anyHtmlBlockTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if isBlock tag then return tag else fail "inline tag"
anyHtmlInlineTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if isInline tag then return tag else fail "not an inline tag"
htmlScript = try $ do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
return $ open ++ rest ++ "</script>"
htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
rawHtmlBlock = try $ do
notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
sp <- many space
state <- getState
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
htmlComment = try $ do
string "<!--"
comment <- manyTill anyChar (try (string "-->"))
return $ "<!--" ++ comment ++ "-->"
xmlDec = try $ do
string "<?"
rest <- manyTill anyChar (char '>')
return $ "<?" ++ rest ++ ">"
definition = try $ do
string "<!"
rest <- manyTill anyChar (char '>')
return $ "<!" ++ rest ++ ">"
nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
((rawHtmlBlock >> return ' ') <|> anyChar)
parseTitle = try $ do
(tag, _) <- htmlTag "title"
contents <- inlinesTilEnd tag
spaces
return contents
parseHead = try $ do
htmlTag "head"
spaces
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
htmlTag "/head"
return (contents, [], "")
skipHtmlTag tag = optional (htmlTag tag)
bodyTitle = try $ do
(tag, attribs) <- htmlTag "h1"
cl <- case (extractAttribute "class" attribs) of
Just "title" -> return ""
otherwise -> fail "not title"
inlinesTilEnd "h1"
parseHtml = do
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
skipHtmlTag "html"
spaces
(title, authors, date) <- option ([], [], "") parseHead
spaces
skipHtmlTag "body"
spaces
optional bodyTitle
blocks <- parseBlocks
spaces
optional (htmlEndTag "body")
spaces
optional (htmlEndTag "html" >> many anyChar)
eof
return $ Pandoc (Meta title authors date) blocks
parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
block = choice [ codeBlock
, header
, hrule
, list
, blockQuote
, para
, plain
, rawHtmlBlock ] <?> "block"
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
headerLevel n = try $ do
let level = "h" ++ show n
(tag, attribs) <- htmlTag level
contents <- inlinesTilEnd level
return $ Header n (normalizeSpaces contents)
hrule = try $ do
(tag, attribs) <- htmlTag "hr"
state <- getState
if not (null attribs) && stateParseRaw state
then unexpected "attributes in hr"
else return HorizontalRule
codeBlock = try $ do
htmlTag "pre"
result <- manyTill
(many1 (satisfy (/= '<')) <|>
((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
(htmlEndTag "pre")
let result' = concat result
let result'' = if "\n" `isPrefixOf` result'
then drop 1 result'
else result'
let result''' = if "\n" `isSuffixOf` result''
then init result''
else result''
return $ CodeBlock $ decodeCharacterReferences result'''
blockQuote = try $ htmlTag "blockquote" >> spaces >>
blocksTilEnd "blockquote" >>= (return . BlockQuote)
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
orderedList = try $ do
(_, attribs) <- htmlTag "ol"
(start, style) <- option (1, DefaultStyle) $
do failIfStrict
let sta = fromMaybe "1" $
lookup "start" attribs
let sty = fromMaybe (fromMaybe "" $
lookup "style" attribs) $
lookup "class" attribs
let sty' = case sty of
"lower-roman" -> LowerRoman
"upper-roman" -> UpperRoman
"lower-alpha" -> LowerAlpha
"upper-alpha" -> UpperAlpha
"decimal" -> Decimal
_ -> DefaultStyle
return (read sta, sty')
spaces
items <- sepEndBy1 (blocksIn "li") spaces
htmlEndTag "ol"
return $ OrderedList (start, style, DefaultDelim) items
bulletList = try $ do
htmlTag "ul"
spaces
items <- sepEndBy1 (blocksIn "li") spaces
htmlEndTag "ul"
return $ BulletList items
definitionList = try $ do
failIfStrict
tag <- htmlTag "dl"
spaces
items <- sepEndBy1 definitionListItem spaces
htmlEndTag "dl"
return $ DefinitionList items
definitionListItem = try $ do
terms <- sepEndBy1 (inlinesIn "dt") spaces
defs <- sepEndBy1 (blocksIn "dd") spaces
let term = joinWithSep [LineBreak] terms
return (term, concat defs)
para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
return . Para . normalizeSpaces
plain = many1 inline >>= return . Plain . normalizeSpaces
inline = choice [ charRef
, strong
, emph
, superscript
, subscript
, strikeout
, spanStrikeout
, code
, str
, linebreak
, whitespace
, link
, image
, rawHtmlInline
] <?> "inline"
code = try $ do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
joinWithSep " " $ lines result
rawHtmlInline = do
result <- htmlScript <|> htmlComment <|> anyHtmlInlineTag
state <- getState
if stateParseRaw state then return (HtmlInline result) else return (Str "")
betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
return . normalizeSpaces
emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
return . Strikeout
spanStrikeout = try $ do
failIfStrict
(tag, attributes) <- htmlTag "span"
result <- case (extractAttribute "class" attributes) of
Just "strikeout" -> inlinesTilEnd "span"
_ -> fail "not a strikeout"
return $ Strikeout result
whitespace = many1 space >> return Space
linebreak = htmlTag "br" >> optional newline >> return LineBreak
str = many1 (noneOf "<& \t\n") >>= return . Str
extractAttribute name [] = Nothing
extractAttribute name ((attrName, contents):rest) =
let name' = map toLower name
attrName' = map toLower attrName
in if attrName' == name'
then Just (decodeCharacterReferences contents)
else extractAttribute name rest
link = try $ do
(tag, attributes) <- htmlTag "a"
url <- case (extractAttribute "href" attributes) of
Just url -> return url
Nothing -> fail "no href"
let title = fromMaybe "" $ extractAttribute "title" attributes
label <- inlinesTilEnd "a"
return $ Link (normalizeSpaces label) (url, title)
image = try $ do
(tag, attributes) <- htmlTag "img"
url <- case (extractAttribute "src" attributes) of
Just url -> return url
Nothing -> fail "no src"
let title = fromMaybe "" $ extractAttribute "title" attributes
let alt = fromMaybe "" (extractAttribute "alt" attributes)
return $ Image [Str alt] (url, title)