{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {- Copyright (C) 2006-2018 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.HTML Copyright : Copyright (C) 2006-2018 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of HTML to 'Pandoc' document. -} module Text.Pandoc.Readers.HTML ( readHtml , htmlTag , htmlInBalanced , isInlineTag , isBlockTag , NamedTag(..) , isTextTag , isCommentTag ) where import Prelude import Control.Applicative ((<|>)) import Control.Arrow (first) import Control.Monad (guard, mplus, msum, mzero, unless, void) import Control.Monad.Except (throwError) import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT) import Data.Char (isAlphaNum, isDigit, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List (isPrefixOf) import Data.List.Split (wordsBy, splitWhen) import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import Network.URI (URI, nonStrictRelativeTo, parseURIReference) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad (..)) import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import Text.Pandoc.Definition import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) import Text.Pandoc.Readers.LaTeX.Types (Macro) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options ( Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs, Ext_native_spans, Ext_raw_html, Ext_line_blocks, Ext_raw_tex), ReaderOptions (readerExtensions, readerStripComments), extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, extractSpaces, safeRead, underlineSpan) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: PandocMonad m => ReaderOptions -- ^ Reader options -> Text -- ^ String to parse (assumes @'\n'@ line endings) -> m Pandoc readHtml opts inp = do let tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } (crFilter inp) parseDoc = do blocks <- fixPlains False . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState bs' <- replaceNotes (B.toList blocks) reportLogMessages return $ Pandoc meta bs' getError (errorMessages -> ms) = case ms of [] -> "" (m:_) -> messageString m result <- flip runReaderT def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty [] M.empty) "source" tags case result of Right doc -> return doc Left err -> throwError $ PandocParseError $ getError err replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes = walkM replaceNotes' replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes where getNotes = noteTable <$> getState replaceNotes' x = return x data HTMLState = HTMLState { parserState :: ParserState, noteTable :: [(String, Blocks)], baseHref :: Maybe URI, identifiers :: Set.Set String, headerMap :: M.Map Inlines String, logMessages :: [LogMessage], macros :: M.Map Text Macro } data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext , inChapter :: Bool -- ^ Set if in chapter section , inPlain :: Bool -- ^ Set if in pPlain } setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInChapter = local (\s -> s {inChapter = True}) setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) type TagParser m = HTMLParser m [Tag Text] pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAnyTag for_ (lookup "lang" attr) $ updateState . B.setMeta "lang" . B.text . T.unpack pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks pBody = pInTags "body" block pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) where pTitle = pInTags "title" inline >>= setTitle . trimInlines setTitle t = mempty <$ updateState (B.setMeta "title" t) pMetaTag = do mt <- pSatisfy (matchTagOpen "meta" []) let name = T.unpack $ fromAttrib "name" mt if null name then return mempty else do let content = T.unpack $ fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ stateMeta = addMetaField name (B.text content) (stateMeta ps) } } return mempty pBaseTag = do bt <- pSatisfy (matchTagOpen "base" []) updateState $ \st -> st{ baseHref = parseURIReference $ T.unpack $ fromAttrib "href" bt } return mempty block :: PandocMonad m => TagParser m Blocks block = do res <- choice [ eSection , eSwitch B.para block , mempty <$ eFootnote , mempty <$ eTOC , mempty <$ eTitlePage , pPara , pHeader , pBlockQuote , pCodeBlock , pList , pHrule , pTable , pHtml , pHead , pBody , pLineBlock , pDiv , pPlain , pFigure , pRawHtmlBlock ] trace (take 60 $ show $ B.toList res) return res namespaces :: PandocMonad m => [(String, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] mathMLNamespace :: String mathMLNamespace = "http://www.w3.org/1998/Math/MathML" eSwitch :: (PandocMonad m, Monoid a) => (Inlines -> a) -> TagParser m a -> TagParser m a eSwitch constructor parser = try $ do guardEnabled Ext_epub_html_exts pSatisfy (matchTagOpen "switch" []) cases <- getFirst . mconcat <$> manyTill (First <$> (eCase <* skipMany pBlank) ) (lookAhead $ try $ pSatisfy (matchTagOpen "default" [])) skipMany pBlank fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) skipMany pBlank pSatisfy (matchTagClose "switch") return $ maybe fallback constructor cases eCase :: PandocMonad m => TagParser m (Maybe Inlines) eCase = do skipMany pBlank TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "case" []) let attr = toStringAttr attr' case flip lookup namespaces =<< lookup "required-namespace" attr of Just p -> Just <$> pInTags "case" (skipMany pBlank *> p <* skipMany pBlank) Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case")) eFootnote :: PandocMonad m => TagParser m () eFootnote = try $ do let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts (TagOpen tag attr') <- lookAhead pAnyTag let attr = toStringAttr attr' guard $ maybe False (`elem` notes) (lookup "type" attr) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block addNote ident content addNote :: PandocMonad m => String -> Blocks -> TagParser m () addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do guardEnabled Ext_epub_html_exts TagOpen tag attr' <- lookAhead pAnyTag let attr = toStringAttr attr' guard $ lookup "type" attr == Just "noteref" let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) guard (not (null ident)) pInTags tag block return $ B.rawInline "noteref" ident -- Strip TOC if there is one, better to generate again eTOC :: PandocMonad m => TagParser m () eTOC = try $ do guardEnabled Ext_epub_html_exts (TagOpen tag attr) <- lookAhead pAnyTag guard $ lookup "type" attr == Just "toc" void (pInTags tag block) pList :: PandocMonad m => TagParser m Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do pSatisfy (matchTagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && not (matchTagClose "ul" t)) -- note: if they have an
    or
      not in scope of a
    • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks pListItem nonItem = do TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) let attr = toStringAttr attr' let addId ident bs = case B.toList bs of (Plain ils:xs) -> B.fromList (Plain [Span (ident, [], []) ils] : xs) _ -> B.divWith (ident, [], []) bs maybe id addId (lookup "id" attr) <$> pInTags "li" block <* skipMany nonItem parseListStyleType :: String -> ListNumberStyle parseListStyleType "lower-roman" = LowerRoman parseListStyleType "upper-roman" = UpperRoman parseListStyleType "lower-alpha" = LowerAlpha parseListStyleType "upper-alpha" = UpperAlpha parseListStyleType "decimal" = Decimal parseListStyleType _ = DefaultStyle parseTypeAttr :: String -> ListNumberStyle parseTypeAttr "i" = LowerRoman parseTypeAttr "I" = UpperRoman parseTypeAttr "a" = LowerAlpha parseTypeAttr "A" = UpperAlpha parseTypeAttr "1" = Decimal parseTypeAttr _ = DefaultStyle pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) let attribs = toStringAttr attribs' let (start, style) = (sta', sty') where sta = fromMaybe "1" $ lookup "start" attribs sta' = if all isDigit sta then read sta else 1 pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"] typeAttr = fromMaybe "" $ lookup "type" attribs classAttr = fromMaybe "" $ lookup "class" attribs styleAttr = fromMaybe "" $ lookup "style" attribs listStyle = fromMaybe "" $ pickListStyle styleAttr sty' = foldOrElse DefaultStyle [ parseTypeAttr typeAttr , parseListStyleType classAttr , parseListStyleType listStyle ] let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && not (matchTagClose "ol" t)) -- note: if they have an
        or
          not in scope of a
        • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList = try $ do pSatisfy (matchTagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") return $ B.definitionList items pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) pDefListItem = try $ do let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) && not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t)) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem let term = foldl1 (\x y -> x <> B.linebreak <> y) terms return (term, map (fixPlains True) defs) fixPlains :: Bool -> Blocks -> Blocks fixPlains inList bs = if any isParaish bs' then B.fromList $ map plainToPara bs' else bs where isParaish Para{} = True isParaish CodeBlock{} = True isParaish Header{} = True isParaish BlockQuote{} = True isParaish BulletList{} = not inList isParaish OrderedList{} = not inList isParaish DefinitionList{} = not inList isParaish _ = False plainToPara (Plain xs) = Para xs plainToPara x = x bs' = B.toList bs pRawTag :: PandocMonad m => TagParser m Text pRawTag = do tag <- pAnyTag let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag then return mempty else return $ renderTags' [tag] pLineBlock :: PandocMonad m => TagParser m Blocks pLineBlock = try $ do guardEnabled Ext_line_blocks _ <- pSatisfy $ tagOpen (=="div") (== [("class","line-block")]) ils <- trimInlines . mconcat <$> manyTill inline (pSatisfy (tagClose (=="div"))) let lns = map B.fromList $ splitWhen (== LineBreak) $ filter (/= SoftBreak) $ B.toList ils return $ B.lineBlock lns pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs let isDivLike "div" = True isDivLike "section" = True isDivLike "main" = True isDivLike _ = False TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) let attr = toStringAttr attr' contents <- pInTags tag block let (ident, classes, kvs) = mkAttr attr let classes' = if tag == "section" then "section":classes else classes kvs' = if tag == "main" && isNothing (lookup "role" kvs) then ("role", "main"):kvs else kvs return $ B.divWith (ident, classes', kvs') contents pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag) exts <- getOption readerExtensions if extensionEnabled Ext_raw_html exts && not (null raw) then return $ B.rawBlock "html" raw else ignore raw ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a ignore raw = do pos <- getPosition -- raw can be null for tags like ; see paRawTag -- in this case we don't want a warning: unless (null raw) $ logMessage $ SkippedContent raw pos return mempty pHtmlBlock :: PandocMonad m => Text -> TagParser m Text pHtmlBlock t = try $ do open <- pSatisfy (matchTagOpen t []) contents <- manyTill pAnyTag (pSatisfy (matchTagClose t)) return $ renderTags' $ [open] <> contents <> [TagClose t] -- Sets chapter context eSection :: PandocMonad m => TagParser m Blocks eSection = try $ do let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as) let sectTag = tagOpen (`elem` sectioningContent) matchChapter TagOpen tag _ <- lookAhead $ pSatisfy sectTag setInChapter (pInTags tag block) headerLevel :: PandocMonad m => Text -> TagParser m Int headerLevel tagtype = case safeRead (T.unpack (T.drop 1 tagtype)) of Just level -> -- try (do -- guardEnabled Ext_epub_html_exts -- asks inChapter >>= guard -- return (level - 1)) -- <|> return level Nothing -> fail "Could not retrieve header level" eTitlePage :: PandocMonad m => TagParser m () eTitlePage = try $ do let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as) let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") isTitlePage TagOpen tag _ <- lookAhead $ pSatisfy groupTag () <$ pInTags tag block pHeader :: PandocMonad m => TagParser m Blocks pHeader = try $ do TagOpen tagtype attr' <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let attr = toStringAttr attr' let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] attr'' <- registerHeader (ident, classes, keyvals) contents return $ if bodyTitle then mempty -- skip a representation of the title in the body else B.headerWith attr'' level contents pHrule :: PandocMonad m => TagParser m Blocks pHrule = do pSelfClosing (=="hr") (const True) return B.horizontalRule pTable :: PandocMonad m => TagParser m Blocks pTable = try $ do TagOpen _ _ <- pSatisfy (matchTagOpen "table" []) skipMany pBlank caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank widths' <- (mconcat <$> many1 pColgroup) <|> many pCol let pTh = option [] $ pInTags "tr" (pCell "th") pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th") pTBody = pInTag True "tbody" $ many1 pTr head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh head' <- map snd <$> pInTag True "tbody" (if null head'' then pTh else return head'') topfoot <- option [] $ pInTag False "tfoot" $ many pTr rowsLs <- many pTBody bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr TagClose _ <- pSatisfy (matchTagClose "table") let rows'' = concat rowsLs <> topfoot <> bottomfoot let rows''' = map (map snd) rows'' -- let rows''' = map (map snd) rows'' -- fail on empty table guard $ not $ null head' && null rows''' let isSinglePlain x = case B.toList x of [] -> True [Plain _] -> True _ -> False let isSimple = all isSinglePlain $ concat (head':rows''') let cols = if null head' then maximum (map length rows''') else length head' -- add empty cells to short rows let addEmpties r = case cols - length r of n | n > 0 -> r <> replicate n mempty | otherwise -> r let rows = map addEmpties rows''' let aligns = case rows'' of (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault _ -> replicate cols AlignDefault let widths = if null widths' then if isSimple then replicate cols 0 else replicate cols (1.0 / fromIntegral cols) else widths' return $ B.table caption (zip aligns widths) head' rows pCol :: PandocMonad m => TagParser m Double pCol = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) let attribs = toStringAttr attribs' skipMany pBlank optional $ pSatisfy (matchTagClose "col") skipMany pBlank let width = case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> fromMaybe 0.0 $ safeRead (filter (`notElem` (" \t\r\n%'\";" :: [Char])) xs) _ -> 0.0 Just x | not (null x) && last x == '%' -> fromMaybe 0.0 $ safeRead (init x) _ -> 0.0 if width > 0.0 then return $ width / 100.0 else return 0.0 pColgroup :: PandocMonad m => TagParser m [Double] pColgroup = try $ do pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank noColOrRowSpans :: Tag Text -> Bool noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" where isNullOrOne x = case fromAttrib x t of "" -> True "1" -> True _ -> False pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)] pCell celltype = try $ do skipMany pBlank tag <- lookAhead $ pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t) let extractAlign' [] = "" extractAlign' ("text-align":x:_) = x extractAlign' (_:xs) = extractAlign' xs let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':']) let align = case maybeFromAttrib "align" tag `mplus` (extractAlign <$> maybeFromAttrib "style" tag) of Just "left" -> AlignLeft Just "right" -> AlignRight Just "center" -> AlignCenter _ -> AlignDefault res <- pInTags' celltype noColOrRowSpans block skipMany pBlank return [(align, res)] pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote = do contents <- pInTags "blockquote" block return $ B.blockQuote $ fixPlains False contents pPlain :: PandocMonad m => TagParser m Blocks pPlain = do contents <- setInPlain $ trimInlines . mconcat <$> many1 inline if B.isNull contents then return mempty else return $ B.plain contents pPara :: PandocMonad m => TagParser m Blocks pPara = do contents <- trimInlines <$> pInTags "p" inline (do guardDisabled Ext_empty_paragraphs guard (B.isNull contents) return mempty) <|> return (B.para contents) pFigure :: PandocMonad m => TagParser m Blocks pFigure = try $ do TagOpen _ _ <- pSatisfy (matchTagOpen "figure" []) skipMany pBlank let pImg = (\x -> (Just x, Nothing)) <$> (pInTag True "p" pImage <* skipMany pBlank) pCapt = (\x -> (Nothing, Just x)) <$> do bs <- pInTags "figcaption" block return $ blocksToInlines' $ B.toList bs pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure") res <- many (pImg <|> pCapt <|> pSkip) let mbimg = msum $ map fst res let mbcap = msum $ map snd res TagClose _ <- pSatisfy (matchTagClose "figure") let caption = fromMaybe mempty mbcap case B.toList <$> mbimg of Just [Image attr _ (url, tit)] -> return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption _ -> mzero pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) let attr = toStringAttr attr' contents <- manyTill pAnyTag (pCloses "pre" <|> eof) let rawText = concatMap tagToString contents -- drop leading newline if any let result' = case rawText of '\n':xs -> xs _ -> rawText -- drop trailing newline if any let result = case reverse result' of '\n':_ -> init result' _ -> result' return $ B.codeBlockWith (mkAttr attr) result tagToString :: Tag Text -> String tagToString (TagText s) = T.unpack s tagToString (TagOpen "br" _) = "\n" tagToString _ = "" inline :: PandocMonad m => TagParser m Inlines inline = choice [ eNoteref , eSwitch id inline , pTagText , pQ , pEmph , pStrong , pSuperscript , pSubscript , pSmall , pStrikeout , pUnderline , pLineBreak , pLink , pImage , pCode , pSpan , pMath False , pScriptMath , pRawHtmlInline ] pLocation :: PandocMonad m => TagParser m () pLocation = do (TagPosition r c) <- pSat isTagPosition setPosition $ newPos "input" r c pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSat f = do pos <- getPosition token show (const pos) (\x -> if f x then Just x else Nothing) pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSatisfy f = try $ optional pLocation >> pSat f pAnyTag :: PandocMonad m => TagParser m (Tag Text) pAnyTag = pSatisfy (const True) pSelfClosing :: PandocMonad m => (Text -> Bool) -> ([Attribute Text] -> Bool) -> TagParser m (Tag Text) pSelfClosing f g = do open <- pSatisfy (tagOpen f g) optional $ pSatisfy (tagClose f) return open pQ :: PandocMonad m => TagParser m Inlines pQ = do context <- asks quoteContext let quoteType = case context of InDoubleQuote -> SingleQuote _ -> DoubleQuote let innerQuoteContext = if quoteType == SingleQuote then InSingleQuote else InDoubleQuote let constructor = case quoteType of SingleQuote -> B.singleQuoted DoubleQuote -> B.doubleQuoted withQuoteContext innerQuoteContext $ pInlinesInTags "q" constructor pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph pStrong :: PandocMonad m => TagParser m Inlines pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong pSuperscript :: PandocMonad m => TagParser m Inlines pSuperscript = pInlinesInTags "sup" B.superscript pSubscript :: PandocMonad m => TagParser m Inlines pSubscript = pInlinesInTags "sub" B.subscript pSmall :: PandocMonad m => TagParser m Inlines pSmall = pInlinesInTags "small" (B.spanWith ("",["small"],[])) pStrikeout :: PandocMonad m => TagParser m Inlines pStrikeout = pInlinesInTags "s" B.strikeout <|> pInlinesInTags "strike" B.strikeout <|> pInlinesInTags "del" B.strikeout <|> try (do pSatisfy (matchTagOpen "span" [("class","strikeout")]) contents <- mconcat <$> manyTill inline (pCloses "span") return $ B.strikeout contents) pUnderline :: PandocMonad m => TagParser m Inlines pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan pLineBreak :: PandocMonad m => TagParser m Inlines pLineBreak = do pSelfClosing (=="br") (const True) return B.linebreak -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. maybeFromAttrib :: String -> Tag Text -> Maybe String maybeFromAttrib name (TagOpen _ attrs) = T.unpack <$> lookup (T.pack name) attrs maybeFromAttrib _ _ = Nothing pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) let title = T.unpack $ fromAttrib "title" tag -- take id from id attribute if present, otherwise name let uid = fromMaybe (T.unpack $ fromAttrib "name" tag) $ maybeFromAttrib "id" tag let cls = words $ T.unpack $ fromAttrib "class" tag lab <- mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of Nothing -> return $ extractSpaces (B.spanWith (uid, cls, [])) lab Just url' -> do mbBaseHref <- baseHref <$> getState let url = case (parseURIReference url', mbBaseHref) of (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) _ -> url' return $ extractSpaces (B.linkWith (uid, cls, []) (escapeURI url) title) lab pImage :: PandocMonad m => TagParser m Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState let url' = T.unpack $ fromAttrib "src" tag let url = case (parseURIReference url', mbBaseHref) of (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) _ -> url' let title = T.unpack $ fromAttrib "title" tag let alt = T.unpack $ fromAttrib "alt" tag let uid = T.unpack $ fromAttrib "id" tag let cls = words $ T.unpack $ fromAttrib "class" tag let getAtt k = case fromAttrib k tag of "" -> [] v -> [(T.unpack k, T.unpack v)] let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) pCode :: PandocMonad m => TagParser m Inlines pCode = try $ do (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) let attr = toStringAttr attr' result <- manyTill pAnyTag (pCloses open) return $ B.codeWith (mkAttr attr) $ unwords $ lines $ T.unpack $ innerText result pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do guardEnabled Ext_native_spans TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) let attr = toStringAttr attr' contents <- pInTags "span" inline let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes where styleAttr = fromMaybe "" $ lookup "style" attr fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr classes = fromMaybe [] $ words <$> lookup "class" attr let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) return $ tag contents pRawHtmlInline :: PandocMonad m => TagParser m Inlines pRawHtmlInline = do inplain <- asks inPlain result <- pSatisfy (tagComment (const True)) <|> if inplain then pSatisfy (not . isBlockTag) else pSatisfy isInlineTag exts <- getOption readerExtensions let raw = T.unpack $ renderTags' [result] if extensionEnabled Ext_raw_html exts then return $ B.rawInline "html" raw else ignore raw mathMLToTeXMath :: String -> Either String String mathMLToTeXMath s = writeTeX <$> readMathML s toStringAttr :: [(Text, Text)] -> [(String, String)] toStringAttr = map go where go (x,y) = (T.unpack x, T.unpack y) pScriptMath :: PandocMonad m => TagParser m Inlines pScriptMath = try $ do TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True) isdisplay <- case lookup "type" attr' of Just x | "math/tex" `T.isPrefixOf` x -> return $ "display" `T.isSuffixOf` x _ -> mzero contents <- T.unpack . innerText <$> manyTill pAnyTag (pSatisfy (matchTagClose "script")) return $ (if isdisplay then B.displayMath else B.math) contents pMath :: PandocMonad m => Bool -> TagParser m Inlines pMath inCase = try $ do open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True) -- we'll assume math tags are MathML unless specially marked -- otherwise... let attr = toStringAttr attr' unless inCase $ guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math")) case mathMLToTeXMath (T.unpack $ renderTags $ [open] <> contents <> [TagClose "math"]) of Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ T.unpack $ innerText contents Right [] -> return mempty Right x -> return $ case lookup "display" attr of Just "block" -> B.displayMath x _ -> B.math x pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines) -> TagParser m Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a pInTags tagtype parser = pInTags' tagtype (const True) parser pInTags' :: (PandocMonad m, Monoid a) => Text -> (Tag Text -> Bool) -> TagParser m a -> TagParser m a pInTags' tagtype tagtest parser = try $ do pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) mconcat <$> manyTill parser (pCloses tagtype <|> eof) -- parses p, preceded by an opening tag (optional if tagsOptional) -- and followed by a closing tag (optional if tagsOptional) pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a pInTag tagsOptional tagtype p = try $ do skipMany pBlank (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype []) skipMany pBlank x <- p skipMany pBlank (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype) skipMany pBlank return x pCloses :: PandocMonad m => Text -> TagParser m () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of (TagClose t') | t' == tagtype -> void pAnyTag (TagOpen t' _) | t' `closes` tagtype -> return () (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "dd" -> return () (TagClose "table") | tagtype == "td" -> return () (TagClose "table") | tagtype == "tr" -> return () (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags -> return () -- see #3794 _ -> mzero pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText st <- getState qu <- ask parsed <- lift $ lift $ flip runReaderT qu $ runParserT (many pTagContents) st "text" str case parsed of Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'" Right result -> return $ mconcat result pBlank :: PandocMonad m => TagParser m () pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ T.all isSpace str type InlinesParser m = HTMLParser m Text pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = B.displayMath <$> mathDisplay <|> B.math <$> mathInline <|> pStr <|> pSpace <|> smartPunctuation pTagContents <|> pRawTeX <|> pSymbol <|> pBad pRawTeX :: PandocMonad m => InlinesParser m Inlines pRawTeX = do lookAhead $ try $ do char '\\' choice $ map (try . string) ["begin", "eqref", "ref"] guardEnabled Ext_raw_tex inp <- getInput st <- getState res <- lift $ runParserT (withRaw rawLaTeXInline) st "chunk" (T.unpack inp) case res of Left _ -> mzero Right (contents, raw) -> do _ <- count (length raw) anyChar return $ B.rawInline "tex" contents pStr :: PandocMonad m => InlinesParser m Inlines pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) updateLastStrPos return $ B.str result isSpecial :: Char -> Bool isSpecial '"' = True isSpecial '\'' = True isSpecial '.' = True isSpecial '-' = True isSpecial '$' = True isSpecial '\\' = True isSpecial '\8216' = True isSpecial '\8217' = True isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False pSymbol :: PandocMonad m => InlinesParser m Inlines pSymbol = satisfy isSpecial >>= return . B.str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML pBad :: PandocMonad m => InlinesParser m Inlines pBad = do c <- satisfy isBad let c' = case c of '\128' -> '\8364' '\130' -> '\8218' '\131' -> '\402' '\132' -> '\8222' '\133' -> '\8230' '\134' -> '\8224' '\135' -> '\8225' '\136' -> '\710' '\137' -> '\8240' '\138' -> '\352' '\139' -> '\8249' '\140' -> '\338' '\142' -> '\381' '\145' -> '\8216' '\146' -> '\8217' '\147' -> '\8220' '\148' -> '\8221' '\149' -> '\8226' '\150' -> '\8211' '\151' -> '\8212' '\152' -> '\732' '\153' -> '\8482' '\154' -> '\353' '\155' -> '\8250' '\156' -> '\339' '\158' -> '\382' '\159' -> '\376' _ -> '?' return $ B.str [c'] pSpace :: PandocMonad m => InlinesParser m Inlines pSpace = many1 (satisfy isSpace) >>= \xs -> if '\n' `elem` xs then return B.softbreak else return B.space -- -- Constants -- eitherBlockOrInline :: Set.Set Text eitherBlockOrInline = Set.fromList ["audio", "applet", "button", "iframe", "embed", "del", "ins", "progress", "map", "area", "noscript", "script", "object", "svg", "video", "source"] blockHtmlTags :: Set.Set Text blockHtmlTags = Set.fromList ["?xml", "!DOCTYPE", "address", "article", "aside", "blockquote", "body", "canvas", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre", "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script", "style"] -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. blockDocBookTags :: Set.Set Text blockDocBookTags = Set.fromList ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "orderedlist", "segmentedlist", "simplelist", "variablelist", "caution", "important", "note", "tip", "warning", "address", "literallayout", "programlisting", "programlistingco", "screen", "screenco", "screenshot", "synopsis", "example", "informalexample", "figure", "informalfigure", "table", "informaltable", "para", "simpara", "formalpara", "equation", "informalequation", "figure", "screenshot", "mediaobject", "qandaset", "procedure", "task", "cmdsynopsis", "funcsynopsis", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] epubTags :: Set.Set Text epubTags = Set.fromList ["case", "switch", "default"] blockTags :: Set.Set Text blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] class NamedTag a where getTagName :: a -> Maybe Text instance NamedTag (Tag Text) where getTagName (TagOpen t _) = Just t getTagName (TagClose t) = Just t getTagName _ = Nothing instance NamedTag (Tag String) where getTagName (TagOpen t _) = Just (T.pack t) getTagName (TagClose t) = Just (T.pack t) getTagName _ = Nothing isInlineTag :: NamedTag (Tag a) => Tag a -> Bool isInlineTag t = isCommentTag t || case getTagName t of Nothing -> False Just x -> x `Set.notMember` blockTags || T.take 1 x == "?" -- processing instr. isBlockTag :: NamedTag (Tag a) => Tag a -> Bool isBlockTag t = isBlockTagName || isTagComment t where isBlockTagName = case getTagName t of Just x | "?" `T.isPrefixOf` x -> True | "!" `T.isPrefixOf` x -> True | otherwise -> x `Set.member` blockTags || x `Set.member` eitherBlockOrInline Nothing -> False isTextTag :: Tag a -> Bool isTextTag = tagText (const True) isCommentTag :: Tag a -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags closes :: Text -> Text -> Bool _ `closes` "body" = False _ `closes` "html" = False "body" `closes` "head" = True "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True -- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", "table", "ul"] = True _ `closes` "meta" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True "object" `closes` "object" = True _ `closes` t | t `elem` ["option","style","script","textarea","title"] = True t `closes` "select" | t /= "option" = True "thead" `closes` t | t `elem` ["colgroup"] = True "tfoot" `closes` t | t `elem` ["thead","colgroup"] = True "tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True t `closes` t2 | t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" t1 `closes` t2 | t1 `Set.member` blockTags && t2 `Set.notMember` blockTags && t2 `Set.notMember` eitherBlockOrInline = True _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. htmlInBalanced :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT String st m String htmlInBalanced f = try $ do lookAhead (char '<') inp <- getInput let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = True, optTagPosition = True } inp case ts of (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do guard $ f t guard $ not $ hasTagWarning (t : take 1 rest) case htmlInBalanced' tn (t:rest) of [] -> mzero xs -> case reverse xs of (TagClose _ : TagPosition er ec : _) -> do let ls = er - sr let cs = ec - sc lscontents <- unlines <$> count ls anyLine cscontents <- count cs anyChar closetag <- do x <- many (satisfy (/='>')) char '>' return (x <> ">") return (lscontents <> cscontents <> closetag) _ -> mzero _ -> mzero htmlInBalanced' :: String -> [Tag String] -> [Tag String] htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts where go :: Int -> [Tag String] -> Maybe [Tag String] go n (t@(TagOpen tn' _):rest) | tn' == tagname = (t :) <$> go (n + 1) rest go 1 (t@(TagClose tn'):_) | tn' == tagname = return [t] go n (t@(TagClose tn'):rest) | tn' == tagname = (t :) <$> go (n - 1) rest go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero hasTagWarning :: [Tag a] -> Bool hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: (HasReaderOptions st, Monad m) => (Tag String -> Bool) -> ParserT [Char] st m (Tag String, String) htmlTag f = try $ do lookAhead (char '<') startpos <- getPosition inp <- getInput let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = False , optTagPosition = True } (inp ++ " ") -- add space to ensure that -- we get a TagPosition after the tag (next, ln, col) <- case ts of (TagPosition{} : next : TagPosition ln col : _) | f next -> return (next, ln, col) _ -> mzero -- -- should NOT be parsed as an HTML tag, see #2277, -- so we exclude . even though it's a valid character -- in XML element names let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_' let isName s = case s of [] -> False (c:cs) -> isLetter c && all isNameChar cs let isPI s = case s of ('?':_) -> True -- processing instruction _ -> False let endpos = if ln == 1 then setSourceColumn startpos (sourceColumn startpos + (col - 1)) else setSourceColumn (setSourceLine startpos (sourceLine startpos + (ln - 1))) col let endAngle = try $ do char '>' pos <- getPosition guard $ pos >= endpos let handleTag tagname = do -- basic sanity check, since the parser is very forgiving -- and finds tags in stuff like x should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] guard $ last tagname /= ':' char '<' rendered <- manyTill anyChar endAngle return (next, "<" ++ rendered ++ ">") case next of TagComment s | "" stripComments <- getOption readerStripComments if stripComments then return (next, "") else return (next, "") | otherwise -> fail "bogus comment mode, HTML5 parse error" TagOpen tagname attr -> do guard $ isPI tagname || all (isName . fst) attr handleTag tagname TagClose tagname -> handleTag tagname _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr attribsClasses = words (fromMaybe "" $ lookup "class" attr) <> epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr -- Strip namespace prefixes stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix stripPrefix :: Tag Text -> Tag Text stripPrefix (TagOpen s as) = TagOpen (stripPrefix' s) (map (first stripPrefix') as) stripPrefix (TagClose s) = TagClose (stripPrefix' s) stripPrefix x = x stripPrefix' :: Text -> Text stripPrefix' s = if T.null t then s else T.drop 1 t where (_, t) = T.span (/= ':') s isSpace :: Char -> Bool isSpace ' ' = True isSpace '\t' = True isSpace '\n' = True isSpace '\r' = True isSpace _ = False -- Instances instance HasMacros HTMLState where extractMacros = macros updateMacros f st = st{ macros = f $ macros st } instance HasIdentifierList HTMLState where extractIdentifierList = identifiers updateIdentifierList f s = s{ identifiers = f (identifiers s) } instance HasHeaderMap HTMLState where extractHeaderMap = headerMap updateHeaderMap f s = s{ headerMap = f (headerMap s) } instance HasLogMessages HTMLState where addLogMessage m s = s{ logMessages = m : logMessages s } getLogMessages = reverse . logMessages -- This signature should be more general -- MonadReader HTMLLocal m => HasQuoteContext st m instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where getQuoteContext = asks quoteContext withQuoteContext q = local (\s -> s{quoteContext = q}) instance HasReaderOptions HTMLState where extractReaderOptions = extractReaderOptions . parserState instance HasMeta HTMLState where setMeta s b st = st {parserState = setMeta s b $ parserState st} deleteMeta s st = st {parserState = deleteMeta s $ parserState st} instance Default HTMLLocal where def = HTMLLocal NoQuote False False instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState -- For now we need a special version here; the one in Shared has String type renderTags' :: [Tag Text] -> Text renderTags' = renderTagsOptions renderOptions{ optMinimize = matchTags ["hr", "br", "img", "meta", "link"] , optRawTag = matchTags ["script", "style"] } where matchTags tags = flip elem tags . T.toLower -- EPUB Specific -- -- sectioningContent :: [Text] sectioningContent = ["article", "aside", "nav", "section"] groupingContent :: [Text] groupingContent = ["p", "hr", "pre", "blockquote", "ol" , "ul", "li", "dl", "dt", "dt", "dd" , "figure", "figcaption", "div", "main"] matchTagClose :: Text -> (Tag Text -> Bool) matchTagClose t = (~== TagClose t) matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) matchTagOpen t as = (~== TagOpen t as) {- types :: [(String, ([String], Int))] types = -- Document divisions map (\s -> (s, (["section", "body"], 0))) ["volume", "part", "chapter", "division"] <> -- Document section and components [ ("abstract", ([], 0))] -}