{-# LANGUAGE OverloadedStrings #-} module Cheapskate.Parse ( markdown ) where import Cheapskate.ParserCombinators import Cheapskate.Util import Cheapskate.Inlines import Cheapskate.Types import Data.Char hiding (Space) import qualified Data.Set as Set import Prelude hiding (takeWhile) import Data.Maybe (mapMaybe) import Data.Text (Text) import qualified Data.Text as T import Data.Monoid import Data.Foldable (toList) import Data.Sequence ((|>), viewr, ViewR(..), singleton, Seq) import qualified Data.Sequence as Seq import Control.Monad.RWS import Control.Applicative import qualified Data.Map as M import Data.List (intercalate) import Debug.Trace -- | Parses the input as a markdown document. Note that 'Doc' is an instance -- of 'ToMarkup', so the document can be converted to 'Html' using 'toHtml'. -- A simple 'Text' to 'Html' filter would be -- -- > markdownToHtml :: Text -> Html -- > markdownToHtml = toHtml . markdown def markdown :: Options -> Text -> Doc markdown opts | debug opts = (\x -> trace (show x) $ Doc opts mempty) . processLines | otherwise = Doc opts . processDocument . processLines -- General parsing strategy: -- -- Step 1: processLines -- -- We process the input line by line. Each line modifies the -- container stack, by adding a leaf to the current open container, -- sometimes after closing old containers and/or opening new ones. -- -- To open a container is to add it to the top of the container stack, -- so that new content will be added under this container. -- To close a container is to remove it from the container stack and -- make it a child of the container above it on the container stack. -- -- When all the input has been processed, we close all open containers -- except the root (Document) container. At this point we should also -- have a ReferenceMap containing any defined link references. -- -- Step 2: processDocument -- -- We then convert this container structure into an AST. This principally -- involves (a) gathering consecutive ListItem containers into lists, (b) -- gathering TextLine nodes that don't belong to verbatim containers into -- paragraphs, and (c) parsing the inline contents of non-verbatim TextLines. -------- -- Container stack definitions: data ContainerStack = ContainerStack Container {- top -} [Container] {- rest -} type LineNumber = Int -- Generic type for a container or a leaf. data Elt = C Container | L LineNumber Leaf deriving Show data Container = Container{ containerType :: ContainerType , children :: Seq Elt } data ContainerType = Document | BlockQuote | ListItem { markerColumn :: Int , padding :: Int , listType :: ListType } | FencedCode { startColumn :: Int , fence :: Text , info :: Text } | IndentedCode | RawHtmlBlock | Reference deriving (Eq, Show) instance Show Container where show c = show (containerType c) ++ "\n" ++ nest 2 (intercalate "\n" (map showElt $ toList $ children c)) nest :: Int -> String -> String nest num = intercalate "\n" . map ((replicate num ' ') ++) . lines showElt :: Elt -> String showElt (C c) = show c showElt (L _ (TextLine s)) = show s showElt (L _ lf) = show lf -- Scanners that must be satisfied if the current open container -- is to be continued on a new line (ignoring lazy continuations). containerContinue :: Container -> Scanner containerContinue c = case containerType c of BlockQuote -> scanNonindentSpace *> scanBlockquoteStart IndentedCode -> scanIndentSpace FencedCode{startColumn = col} -> scanSpacesToColumn col RawHtmlBlock -> nfb scanBlankline li@ListItem{} -> scanBlankline <|> (do scanSpacesToColumn (markerColumn li + 1) upToCountChars (padding li - 1) (==' ') return ()) Reference{} -> nfb scanBlankline >> nfb (scanNonindentSpace *> scanReference) _ -> return () {-# INLINE containerContinue #-} -- Defines parsers that open new containers. containerStart :: Bool -> Parser ContainerType containerStart _lastLineIsText = scanNonindentSpace *> ( (BlockQuote <$ scanBlockquoteStart) <|> parseListMarker ) -- Defines parsers that open new verbatim containers (containers -- that take only TextLine and BlankLine as children). verbatimContainerStart :: Bool -> Parser ContainerType verbatimContainerStart lastLineIsText = scanNonindentSpace *> ( parseCodeFence <|> (guard (not lastLineIsText) *> (IndentedCode <$ char ' ' <* nfb scanBlankline)) <|> (guard (not lastLineIsText) *> (RawHtmlBlock <$ parseHtmlBlockStart)) <|> (guard (not lastLineIsText) *> (Reference <$ scanReference)) ) -- Leaves of the container structure (they don't take children). data Leaf = TextLine Text | BlankLine Text | ATXHeader Int Text | SetextHeader Int Text | Rule deriving (Show) type ContainerM = RWS () ReferenceMap ContainerStack -- Close the whole container stack, leaving only the root Document container. closeStack :: ContainerM Container closeStack = do ContainerStack top rest <- get if null rest then return top else closeContainer >> closeStack -- Close the top container on the stack. If the container is a Reference -- container, attempt to parse the reference and update the reference map. -- If it is a list item container, move a final BlankLine outside the list -- item. closeContainer :: ContainerM () closeContainer = do ContainerStack top rest <- get case top of (Container Reference{} cs'') -> case parse pReference (T.strip $ joinLines $ map extractText $ toList cs'') of Right (lab, lnk, tit) -> do tell (M.singleton (normalizeReference lab) (lnk, tit)) case rest of (Container ct' cs' : rs) -> put $ ContainerStack (Container ct' (cs' |> C top)) rs [] -> return () Left _ -> -- pass over in silence if ref doesn't parse? case rest of (c:cs) -> put $ ContainerStack c cs [] -> return () (Container li@ListItem{} cs'') -> case rest of -- move final BlankLine outside of list item (Container ct' cs' : rs) -> case viewr cs'' of (zs :> b@(L _ BlankLine{})) -> put $ ContainerStack (if Seq.null zs then Container ct' (cs' |> C (Container li zs)) else Container ct' (cs' |> C (Container li zs) |> b)) rs _ -> put $ ContainerStack (Container ct' (cs' |> C top)) rs [] -> return () _ -> case rest of (Container ct' cs' : rs) -> put $ ContainerStack (Container ct' (cs' |> C top)) rs [] -> return () -- Add a leaf to the top container. addLeaf :: LineNumber -> Leaf -> ContainerM () addLeaf lineNum lf = do ContainerStack top rest <- get case (top, lf) of (Container ct@(ListItem{}) cs, BlankLine{}) -> case viewr cs of (_ :> L _ BlankLine{}) -> -- two blanks break out of list item: closeContainer >> addLeaf lineNum lf _ -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest (Container ct cs, _) -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest -- Add a container to the container stack. addContainer :: ContainerType -> ContainerM () addContainer ct = modify $ \(ContainerStack top rest) -> ContainerStack (Container ct mempty) (top:rest) -- Step 2 -- Convert Document container and reference map into an AST. processDocument :: (Container, ReferenceMap) -> Blocks processDocument (Container ct cs, refmap) = case ct of Document -> processElts refmap (toList cs) _ -> error "top level container is not Document" -- Turn the result of `processLines` into a proper AST. -- This requires grouping text lines into paragraphs -- and list items into lists, handling blank lines, -- parsing inline contents of texts and resolving referencess. processElts :: ReferenceMap -> [Elt] -> Blocks processElts _ [] = mempty processElts refmap (L _lineNumber lf : rest) = case lf of -- Gobble text lines and make them into a Para: TextLine t -> singleton (Para $ parseInlines refmap txt) <> processElts refmap rest' where txt = T.stripEnd $ joinLines $ map T.stripStart $ t : map extractText textlines (textlines, rest') = span isTextLine rest isTextLine (L _ (TextLine _)) = True isTextLine _ = False -- Blanks at outer level are ignored: BlankLine{} -> processElts refmap rest -- Headers: ATXHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <> processElts refmap rest SetextHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <> processElts refmap rest -- Horizontal rule: Rule -> singleton HRule <> processElts refmap rest processElts refmap (C (Container ct cs) : rest) = case ct of Document -> error "Document container found inside Document" BlockQuote -> singleton (Blockquote $ processElts refmap (toList cs)) <> processElts refmap rest -- List item? Gobble up following list items of the same type -- (skipping blank lines), determine whether the list is tight or -- loose, and generate a List. ListItem { listType = listType' } -> singleton (List isTight listType' items') <> processElts refmap rest' where xs = takeListItems rest rest' = drop (length xs) rest -- take list items as long as list type matches and we -- don't hit two blank lines: takeListItems (C c@(Container ListItem { listType = lt' } _) : zs) | listTypesMatch lt' listType' = C c : takeListItems zs takeListItems (lf@(L _ (BlankLine _)) : c@(C (Container ListItem { listType = lt' } _)) : zs) | listTypesMatch lt' listType' = lf : c : takeListItems zs takeListItems _ = [] listTypesMatch (Bullet c1) (Bullet c2) = c1 == c2 listTypesMatch (Numbered w1 _) (Numbered w2 _) = w1 == w2 listTypesMatch _ _ = False items = mapMaybe getItem (Container ct cs : [c | C c <- xs]) getItem (Container ListItem{} cs') = Just $ toList cs' getItem _ = Nothing items' = map (processElts refmap) items isTight = tightListItem xs && all tightListItem items FencedCode _ _ info' -> singleton (CodeBlock attr txt) <> processElts refmap rest where txt = joinLines $ map extractText $ toList cs attr = CodeAttr x (T.strip y) (x,y) = T.break (==' ') info' IndentedCode -> singleton (CodeBlock (CodeAttr "" "") txt) <> processElts refmap rest' where txt = joinLines $ stripTrailingEmpties $ concatMap extractCode cbs stripTrailingEmpties = reverse . dropWhile (T.all (==' ')) . reverse -- explanation for next line: when we parsed -- the blank line, we dropped 0-3 spaces. -- but for this, code block context, we want -- to have dropped 4 spaces. we simply drop -- one more: extractCode (L _ (BlankLine t)) = [T.drop 1 t] extractCode (C (Container IndentedCode cs')) = map extractText $ toList cs' extractCode _ = [] (cbs, rest') = span isIndentedCodeOrBlank (C (Container ct cs) : rest) isIndentedCodeOrBlank (L _ BlankLine{}) = True isIndentedCodeOrBlank (C (Container IndentedCode _)) = True isIndentedCodeOrBlank _ = False RawHtmlBlock -> singleton (HtmlBlock txt) <> processElts refmap rest where txt = joinLines (map extractText (toList cs)) -- References have already been taken into account in the reference map, -- so we just skip. Reference{} -> processElts refmap rest where isBlankLine (L _ BlankLine{}) = True isBlankLine _ = False tightListItem [] = True tightListItem xs = not $ any isBlankLine xs extractText :: Elt -> Text extractText (L _ (TextLine t)) = t extractText _ = mempty -- Step 1 processLines :: Text -> (Container, ReferenceMap) processLines t = (doc, refmap) where (doc, refmap) = evalRWS (mapM_ processLine lns >> closeStack) () startState lns = zip [1..] (map tabFilter $ T.lines t) startState = ContainerStack (Container Document mempty) [] -- The main block-parsing function. -- We analyze a line of text and modify the container stack accordingly, -- adding a new leaf, or closing or opening containers. processLine :: (LineNumber, Text) -> ContainerM () processLine (lineNumber, txt) = do ContainerStack top@(Container ct cs) rest <- get -- Apply the line-start scanners appropriate for each nested container. -- Return the remainder of the string, and the number of unmatched -- containers. let (t', numUnmatched) = tryOpenContainers (reverse $ top:rest) txt -- Some new containers can be started only after a blank. let lastLineIsText = numUnmatched == 0 && case viewr cs of (_ :> L _ (TextLine _)) -> True _ -> False -- Process the rest of the line in a way that makes sense given -- the container type at the top of the stack (ct): case ct of -- If it's a verbatim line container, add the line. RawHtmlBlock{} | numUnmatched == 0 -> addLeaf lineNumber (TextLine t') IndentedCode | numUnmatched == 0 -> addLeaf lineNumber (TextLine t') FencedCode{ fence = fence' } -> -- here we don't check numUnmatched because we allow laziness if fence' `T.isPrefixOf` t' -- closing code fence then closeContainer else addLeaf lineNumber (TextLine t') -- otherwise, parse the remainder to see if we have new container starts: _ -> case tryNewContainers lastLineIsText (T.length txt - T.length t') t' of -- lazy continuation: text line, last line was text, no new containers, -- some unmatched containers: ([], TextLine t) | numUnmatched > 0 , case viewr cs of (_ :> L _ (TextLine _)) -> True _ -> False , ct /= IndentedCode -> addLeaf lineNumber (TextLine t) -- if it's a setext header line and the top container has a textline -- as last child, add a setext header: ([], SetextHeader lev _) | numUnmatched == 0 -> case viewr cs of (cs' :> L _ (TextLine t)) -> -- replace last text line with setext header put $ ContainerStack (Container ct (cs' |> L lineNumber (SetextHeader lev t))) rest -- Note: the following case should not occur, since -- we don't add a SetextHeader leaf unless lastLineIsText. _ -> error "setext header line without preceding text line" -- otherwise, close all the unmatched containers, add the new -- containers, and finally add the new leaf: (ns, lf) -> do -- close unmatched containers, add new ones replicateM numUnmatched closeContainer mapM_ addContainer ns case (reverse ns, lf) of -- don't add extra blank at beginning of fenced code block (FencedCode{}:_, BlankLine{}) -> return () _ -> addLeaf lineNumber lf -- Try to match the scanners corresponding to any currently open containers. -- Return remaining text after matching scanners, plus the number of open -- containers whose scanners did not match. (These will be closed unless -- we have a lazy text line.) tryOpenContainers :: [Container] -> Text -> (Text, Int) tryOpenContainers cs t = case parse (scanners $ map containerContinue cs) t of Right (t', n) -> (t', n) Left e -> error $ "error parsing scanners: " ++ show e where scanners [] = (,) <$> takeText <*> pure 0 scanners (p:ps) = (p *> scanners ps) <|> ((,) <$> takeText <*> pure (length (p:ps))) -- Try to match parsers for new containers. Return list of new -- container types, and the leaf to add inside the new containers. tryNewContainers :: Bool -> Int -> Text -> ([ContainerType], Leaf) tryNewContainers lastLineIsText offset t = case parse newContainers t of Right (cs,t') -> (cs, t') Left err -> error (show err) where newContainers = do getPosition >>= \pos -> setPosition pos{ column = offset + 1 } regContainers <- many (containerStart lastLineIsText) verbatimContainers <- option [] $ count 1 (verbatimContainerStart lastLineIsText) if null verbatimContainers then (,) <$> pure regContainers <*> leaf lastLineIsText else (,) <$> pure (regContainers ++ verbatimContainers) <*> textLineOrBlank textLineOrBlank :: Parser Leaf textLineOrBlank = consolidate <$> takeText where consolidate ts | T.all (==' ') ts = BlankLine ts | otherwise = TextLine ts -- Parse a leaf node. leaf :: Bool -> Parser Leaf leaf lastLineIsText = scanNonindentSpace *> ( (ATXHeader <$> parseAtxHeaderStart <*> (T.strip . removeATXSuffix <$> takeText)) <|> (guard lastLineIsText *> (SetextHeader <$> parseSetextHeaderLine <*> pure mempty)) <|> (Rule <$ scanHRuleLine) <|> textLineOrBlank ) where removeATXSuffix t = case T.dropWhileEnd (`elem` " #") t of t' | T.null t' -> t' -- an escaped \# | T.last t' == '\\' -> t' <> "#" | otherwise -> t' -- Scanners scanReference :: Scanner scanReference = () <$ lookAhead (pLinkLabel >> scanChar ':') -- Scan the beginning of a blockquote: up to three -- spaces indent, the `>` character, and an optional space. scanBlockquoteStart :: Scanner scanBlockquoteStart = scanChar '>' >> option () (scanChar ' ') -- Parse the sequence of `#` characters that begins an ATX -- header, and return the number of characters. We require -- a space after the initial string of `#`s, as not all markdown -- implementations do. This is because (a) the ATX reference -- implementation requires a space, and (b) since we're allowing -- headers without preceding blank lines, requiring the space -- avoids accidentally capturing a line like `#8 toggle bolt` as -- a header. parseAtxHeaderStart :: Parser Int parseAtxHeaderStart = do char '#' hashes <- upToCountChars 5 (== '#') -- hashes must be followed by space unless empty header: notFollowedBy (skip (/= ' ')) return $ T.length hashes + 1 parseSetextHeaderLine :: Parser Int parseSetextHeaderLine = do d <- satisfy (\c -> c == '-' || c == '=') let lev = if d == '=' then 1 else 2 skipWhile (== d) scanBlankline return lev -- Scan a horizontal rule line: "...three or more hyphens, asterisks, -- or underscores on a line by themselves. If you wish, you may use -- spaces between the hyphens or asterisks." scanHRuleLine :: Scanner scanHRuleLine = do c <- satisfy (\c -> c == '*' || c == '_' || c == '-') count 2 $ scanSpaces >> skip (== c) skipWhile (\x -> x == ' ' || x == c) endOfInput -- Parse an initial code fence line, returning -- the fence part and the rest (after any spaces). parseCodeFence :: Parser ContainerType parseCodeFence = do col <- column <$> getPosition cs <- takeWhile1 (=='`') <|> takeWhile1 (=='~') guard $ T.length cs >= 3 scanSpaces rawattr <- takeWhile (\c -> c /= '`' && c /= '~') endOfInput return $ FencedCode { startColumn = col , fence = cs , info = rawattr } -- Parse the start of an HTML block: either an HTML tag or an -- HTML comment, with no indentation. parseHtmlBlockStart :: Parser () parseHtmlBlockStart = () <$ lookAhead ((do t <- pHtmlTag guard $ f $ fst t return $ snd t) <|> string "" ) where f (Opening name) = name `Set.member` blockHtmlTags f (SelfClosing name) = name `Set.member` blockHtmlTags f (Closing name) = name `Set.member` blockHtmlTags -- List of block level tags for HTML 5. blockHtmlTags :: Set.Set Text blockHtmlTags = Set.fromList [ "article", "header", "aside", "hgroup", "blockquote", "hr", "body", "li", "br", "map", "button", "object", "canvas", "ol", "caption", "output", "col", "p", "colgroup", "pre", "dd", "progress", "div", "section", "dl", "table", "dt", "tbody", "embed", "textarea", "fieldset", "tfoot", "figcaption", "th", "figure", "thead", "footer", "footer", "tr", "form", "ul", "h1", "h2", "h3", "h4", "h5", "h6", "video"] -- Parse a list marker and return the list type. parseListMarker :: Parser ContainerType parseListMarker = do col <- column <$> getPosition ty <- parseBullet <|> parseListNumber -- padding is 1 if list marker followed by a blank line -- or indented code. otherwise it's the length of the -- whitespace between the list marker and the following text: padding' <- (1 <$ scanBlankline) <|> (1 <$ (skip (==' ') *> lookAhead (count 4 (char ' ')))) <|> (T.length <$> takeWhile (==' ')) -- text can't immediately follow the list marker: guard $ padding' > 0 return $ ListItem { listType = ty , markerColumn = col , padding = padding' + listMarkerWidth ty } listMarkerWidth :: ListType -> Int listMarkerWidth (Bullet _) = 1 listMarkerWidth (Numbered _ n) | n < 10 = 2 | n < 100 = 3 | n < 1000 = 4 | otherwise = 5 -- Parse a bullet and return list type. parseBullet :: Parser ListType parseBullet = do c <- satisfy (\c -> c == '+' || c == '*' || c == '-') unless (c == '+') $ nfb $ (count 2 $ scanSpaces >> skip (== c)) >> skipWhile (\x -> x == ' ' || x == c) >> endOfInput -- hrule return $ Bullet c -- Parse a list number marker and return list type. parseListNumber :: Parser ListType parseListNumber = do num <- (read . T.unpack) <$> takeWhile1 isDigit wrap <- PeriodFollowing <$ skip (== '.') <|> ParenFollowing <$ skip (== ')') return $ Numbered wrap num