module Comark.Parser
( parse
, ParserOption(..)
) where
import Prelude hiding (takeWhile)
import Control.Applicative
import Control.Arrow (second)
import Control.Bool
import Control.Monad
import Control.Monad.Trans.RWS.Strict
import Data.Char
import Data.Either
import Data.Foldable
import Data.List (intercalate)
import qualified Data.Map as Map
import Data.Maybe (mapMaybe)
import Data.Monoid
import Data.Sequence
(Seq, ViewL(..), ViewR(..), singleton, viewl, viewr, (<|), (><), (|>))
import qualified Data.Sequence as Seq
import qualified Data.Set as Set
import Data.Text.Extended (Text)
import qualified Data.Text.Extended as Text
import Comark.Parser.Inline
import Comark.Parser.Options
import Comark.Parser.Util
import Comark.ParserCombinators
import Comark.Syntax
parse :: [ParserOption] -> Text -> Doc Text
parse (parserOptions -> opts) text =
Doc $ processDocument
$ second extendRefmap
$ processLines text
where
extendRefmap refmap =
opts { _poLinkReferences =
\t -> lookupLinkReference refmap t <|> _poLinkReferences opts t
}
data ContainerStack
= ContainerStack
{ csTop :: Container
, csRest :: [Container]
}
type LineNumber = Int
data Elt = C Container
| L LineNumber Leaf
deriving (Show)
data Container =
Container
{ containerType :: ContainerType
, containerChildren :: Seq Elt
}
data ContainerType
= Document
| BlockQuote
| ListItem
{ liPadding :: Int
, liType :: ListType
}
| FencedCode
{ codeStartColumn :: Int
, codeFence :: Text
, codeInfo :: Maybe Text
}
| IndentedCode
| RawHtmlBlock Condition
| Reference
deriving (Show, Eq)
isIndentedCode :: Elt -> Bool
isIndentedCode (C (Container IndentedCode _)) = True
isIndentedCode _ = False
isBlankLine :: Elt -> Bool
isBlankLine (L _ BlankLine{}) = True
isBlankLine _ = False
isTextLine :: Elt -> Bool
isTextLine (L _ (TextLine _)) = True
isTextLine _ = False
isListItem :: ContainerType -> Bool
isListItem ListItem{} = True
isListItem _ = False
instance Show Container where
show c = mconcat
[ show (containerType c), "\n"
, nest 2 (intercalate "\n" $ map pptElt $ toList $ containerChildren c)
]
nest :: Int -> String -> String
nest num = intercalate "\n" . map (replicate num ' ' <>) . lines
pptElt :: Elt -> String
pptElt (C c) = show c
pptElt (L _ (TextLine s)) = show s
pptElt (L _ lf) = show lf
containerContinue :: Container -> Scanner
containerContinue c = case containerType c of
BlockQuote -> pNonIndentSpaces *> scanBlockquoteStart
IndentedCode -> void pIndentSpaces
FencedCode{..} -> void $ pSpacesUpToColumn codeStartColumn
ListItem{..} -> void pBlankline <|> (tabCrusher *> replicateM_ liPadding (char ' '))
Reference -> notFollowedBy
(void pBlankline
<|> (do _ <- pNonIndentSpaces
scanReference <|> scanBlockquoteStart <|> scanTBreakLine)
<|> void parseAtxHeadingStart)
_ -> pure ()
containerStart :: Bool -> Bool -> Parser ContainerType
containerStart afterListItem lastLineIsText = asum
[ pNonIndentSpaces
*> scanBlockquoteStart
*> pure BlockQuote
, parseListMarker afterListItem lastLineIsText
]
verbatimContainerStart :: Bool -> Parser ContainerType
verbatimContainerStart lastLineIsText = asum
[ pNonIndentSpaces *> parseCodeFence
, do guard (not lastLineIsText)
void pIndentSpaces
notFollowedBy pBlankline
pure IndentedCode
, RawHtmlBlock <$> pHtmlBlockStart lastLineIsText
, guard (not lastLineIsText) *> pNonIndentSpaces *> (Reference <$ scanReference)
]
type Leaf = GenLeaf Text
data GenLeaf t
= TextLine t
| BlankLine t
| ATXHeading HeadingLevel t
| SetextHeading HeadingLevel t
| SetextToken HeadingLevel t
| Rule
deriving (Show, Functor)
type ContainerM = RWS () ReferenceMap ContainerStack
closeStack :: ContainerM Container
closeStack =
get >>= \case
ContainerStack top [] -> pure top
ContainerStack _ _ -> closeContainer *> closeStack
closeContainer :: ContainerM ()
closeContainer =
get >>= \case
ContainerStack top@(Container Reference cs'') rest ->
case runParser ((,) <$> pReference <*> untilTheEnd) input of
Right ((lab, lnk, tit), unconsumed) -> do
tell (Map.singleton (normalizeReference lab) (lnk, tit))
case rest of
(Container ct' cs' : rs)
| Text.null unconsumed ->
put $ ContainerStack (Container ct' (rest' <> cs' |> C top)) rs
| otherwise ->
let children = (L (1) (TextLine unconsumed) <| rest') >< (cs' |> C top)
in put $ ContainerStack (Container ct' children) rs
[] -> pure ()
Left _ ->
case rest of
(Container ct' cs' : rs) ->
put $ ContainerStack (Container ct' (cs' <> cs'')) rs
[] -> return ()
where
input = Text.strip $ Text.joinLines $ map extractText $ toList textlines
(textlines, rest') = Seq.spanl isTextLine cs''
ContainerStack top rest
| Container li@ListItem{} (viewr -> zs :> b) <- top
, Container ct' cs' : rs <- rest
, isBlankLine b ->
let els = if null zs
then cs' |> C (Container li zs)
else cs' |> C (Container li zs) |> b
in put $ ContainerStack (Container ct' els) rs
ContainerStack top (Container ct' cs' : rs) ->
put $ ContainerStack (Container ct' (cs' |> C top)) rs
ContainerStack _ [] -> pure ()
addLeaf :: LineNumber -> Leaf -> ContainerM ()
addLeaf lineNum lf = do
ContainerStack top rest <- get
case containerType top of
ListItem{}
| (firstLine :< _) <- viewl $ containerChildren top
, BlankLine{} <- lf
, isBlankLine firstLine -> do
closeContainer
addLeaf lineNum lf
_ -> put $ ContainerStack
(Container
(containerType top)
(containerChildren top |> L lineNum lf))
rest
addContainer :: ContainerType -> ContainerM ()
addContainer ct =
modify $ \ContainerStack{..} ->
ContainerStack (Container ct mempty) (csTop:csRest)
processDocument :: (Container, ParserOptions) -> Blocks Text
processDocument (Container Document cs, opts) = processElts opts (toList cs)
processDocument _ = error "top level container is not Document"
processElts :: ParserOptions -> [Elt] -> Blocks Text
processElts _ [] = mempty
processElts opts (L _lineNumber lf : rest) =
case lf of
TextLine t ->
singleton (Para $ parseInlines opts txt) <> processElts opts rest'
where txt = Text.stripEnd $ Text.joinLines $ map Text.stripStart
$ t : map extractText textlines
(textlines, rest') = span isTextLine rest
BlankLine{} -> processElts opts rest
ATXHeading lvl t -> Heading lvl (parseInlines opts t) <| processElts opts rest
SetextHeading lvl t -> Heading lvl (parseInlines opts t) <| processElts opts rest
SetextToken _ _ -> error "Setext token wasn't converted to setext header"
Rule -> ThematicBreak <| processElts opts rest
processElts opts (C (Container ct cs) : rest) =
case ct of
Document -> error "Document container found inside Document"
BlockQuote -> Quote (processElts opts (toList cs)) <| processElts opts rest
ListItem { liType = itemType } ->
List itemType isTight (Seq.fromList items') <| processElts opts rest'
where
xs = takeListItems rest
rest' = drop (length xs) rest
takeListItems (c@(C (Container ListItem { liType = lt } _)) : zs)
| listTypesMatch lt itemType = c : takeListItems zs
takeListItems (lf@(L _ (BlankLine _)) : c@(C (Container ListItem { liType = lt } _)) : zs)
| listTypesMatch lt itemType = lf : c : takeListItems zs
takeListItems _ = []
listTypesMatch (Bullet c1) (Bullet c2) = c1 == c2
listTypesMatch (Ordered w1 _) (Ordered w2 _) = w1 == w2
listTypesMatch _ _ = False
items = mapMaybe getItem (Container ct cs : [c | C c <- xs])
getItem (Container ListItem{} cs') = Just $ toList cs'
getItem _ = Nothing
items' = map (processElts opts) items
isTight = not (any isBlankLine xs) && all tightListItem items
tightListItem [] = True
tightListItem [_] = True
tightListItem (_:is) = not $ any isBlankLine $ init is
FencedCode _ _ info -> CodeBlock (parseInfoString <$> info)
(Text.unlines $ map extractText $ toList cs)
<| processElts opts rest
IndentedCode -> CodeBlock Nothing txt <| processElts opts rest'
where txt = Text.unlines $ stripTrailingEmpties $ concatMap extractCode cbs
stripTrailingEmpties = reverse . dropWhile (Text.all (== ' ')) . reverse
extractCode (L _ (BlankLine t)) = [Text.drop 1 t]
extractCode (C (Container IndentedCode cs')) =
map extractText $ toList cs'
extractCode _ = []
(cbs, rest') = span (isIndentedCode <||> isBlankLine)
(C (Container ct cs) : rest)
RawHtmlBlock _ -> HtmlBlock txt <| processElts opts rest
where txt = Text.unlines (map extractText (toList cs))
Reference -> processElts opts rest
extractText :: Elt -> Text
extractText (L _ (TextLine t)) = t
extractText _ = mempty
processLines :: Text -> (Container, ReferenceMap)
processLines t = evalRWS (mapM_ processLine lns >> closeStack) () initState
where
lns = zip [1..] $ Text.lines' $ Text.replace "\0" "\xFFFD" t
initState = ContainerStack (Container Document mempty) []
processLine :: (LineNumber, Text) -> ContainerM ()
processLine (lineNumber, txt) = do
ContainerStack top@(Container ct cs) rest <- get
let (t', numUnmatched) = tryOpenContainers (reverse $ top:rest) txt
let lastLineIsText = numUnmatched == 0 &&
case viewr cs of
(_ :> L _ (TextLine _)) -> True
_ -> False
case ct of
RawHtmlBlock c
| numUnmatched == 0 -> do
addLeaf lineNumber (TextLine t')
when (isRight $ runParser (blockEnd c) t')
closeContainer
IndentedCode
| numUnmatched == 0 -> addLeaf lineNumber (TextLine t')
FencedCode { codeFence = fence' }
| numUnmatched == 0 -> if
| isRight $ runParser scanClosing t'
-> closeContainer
| otherwise
-> addLeaf lineNumber (TextLine t')
where
scanClosing = satisfyUpTo 3 (== ' ')
*> string fence' *> skipWhile (== Text.head fence')
*> pSpaces
*> endOfInput
_ -> let (verbatimContainers, leaf) =
tryNewContainers (isListItem ct) lastLineIsText (Text.length txt Text.length t') t'
in case (Seq.viewl verbatimContainers, leaf) of
(Seq.EmptyL, TextLine t)
| numUnmatched > 0
, _ :> L _ TextLine{} <- viewr cs
, ct /= IndentedCode
-> addLeaf lineNumber (TextLine t)
(IndentedCode :< _, TextLine t)
| numUnmatched > 0
, _ :> L _ TextLine{} <- viewr cs
, ListItem{} <- ct
-> addLeaf lineNumber $ TextLine $ Text.strip t
(IndentedCode :< _, TextLine t)
| numUnmatched > 0
, _ :> L _ TextLine{} <- viewr cs
, BlockQuote{} <- ct
-> addLeaf lineNumber $ TextLine $ Text.strip t
(Seq.EmptyL, SetextToken lev _setextText) | numUnmatched == 0 ->
case Seq.spanr isTextLine cs of
(textlines, cs')
| not (Seq.null textlines)
-> put $ ContainerStack
(Container ct
(cs' |> L lineNumber
(SetextHeading
lev
(Text.strip $ Text.unlines
$ fmap extractText
$ toList textlines))))
rest
| otherwise -> error "setext header line without preceding text lines"
(RawHtmlBlock condition :< _, TextLine t)
| Right () <- runParser (blockEnd condition) t
-> do closeContainer
addContainer (RawHtmlBlock condition)
addLeaf lineNumber (TextLine t)
closeContainer
(ns, lf) -> do
_ <- replicateM numUnmatched closeContainer
_ <- mapM_ addContainer ns
case (Seq.viewr verbatimContainers, lf) of
(_ :>FencedCode{}, BlankLine{}) -> pure ()
_ -> addLeaf lineNumber lf
tabCrusher :: Parser ()
tabCrusher = do
p <- getPosition
replacing (go (column p 1) "")
where
go cnt acc = do
c <- peekChar
case c of
Just ' ' -> char ' ' *> go (cnt + 1) (acc <> " ")
Just '\t' -> char '\t' *> go (cnt + 4 (cnt `mod` 4)) (acc <> Text.replicate (4 (cnt `mod` 4)) " ")
_ -> pure acc
tryOpenContainers :: [Container] -> Text -> (Text, Int)
tryOpenContainers cs t =
case runParser (scanners $ map containerContinue cs) t of
Right (t', n) -> (t', n)
Left e -> error $ "error parsing scanners: " ++ show e
where
scanners [] = (,0) <$> untilTheEnd
scanners (p:ps) = (p *> scanners ps) <|> ((,length (p:ps)) <$> untilTheEnd)
tryNewContainers :: Bool -> Bool -> Int -> Text -> (Seq ContainerType, Leaf)
tryNewContainers afterListItem lastLineIsText offset t =
case runParser newContainers t of
Right (cs,t') -> (cs, t')
Left err -> error (show err)
where
newContainers = do
getPosition >>= \pos -> setPosition pos{ column = offset + 1 }
regContainers <- Seq.fromList <$> many (containerStart afterListItem lastLineIsText)
mVerbatimContainer <- optional $ verbatimContainerStart lastLineIsText
case mVerbatimContainer of
Just verbatimContainer
-> (regContainers |> verbatimContainer,) <$> textLineOrBlank
Nothing -> (regContainers,) <$> parseLeaf lastLineIsText
textLineOrBlank :: Parser Leaf
textLineOrBlank = consolidate <$> untilTheEnd
where consolidate ts | Text.all (==' ') ts = BlankLine ts
| otherwise = TextLine ts
parseLeaf :: Bool -> Parser Leaf
parseLeaf lastLineIsText = pNonIndentSpaces *> asum
[ ATXHeading <$> parseAtxHeadingStart <*> parseAtxHeadingContent
, guard lastLineIsText *> parseSetextToken
, Rule <$ scanTBreakLine
, textLineOrBlank
]
scanReference :: Scanner
scanReference = void $ lookAhead (char '[')
scanBlockquoteStart :: Scanner
scanBlockquoteStart = char '>' *> tabCrusher *> discardOpt (char ' ')
parseAtxHeadingStart :: Parser HeadingLevel
parseAtxHeadingStart = do
_ <- char '#'
hashes <- satisfyUpTo 5 (== '#')
notFollowedBy (skip ((/= ' ') <&&> (/= '\t')))
pure $ case (Text.length hashes + 1) of
1 -> Heading1
2 -> Heading2
3 -> Heading3
4 -> Heading4
5 -> Heading5
6 -> Heading6
_ -> error $ "IMPOSSIBLE HAPPENED: parseAtxHeading parsed more than 6 characters "
parseAtxHeadingContent :: Parser Text
parseAtxHeadingContent = Text.strip . removeATXSuffix <$> untilTheEnd
where
removeATXSuffix t =
case dropTrailingHashes of
t' | Text.null t' -> t'
| Text.last t' == '\\' -> t' <> Text.replicate trailingHashes "#"
| Text.last t' /= ' ' -> t
| otherwise -> t'
where
dropTrailingSpaces = Text.dropWhileEnd (== ' ') t
dropTrailingHashes = Text.dropWhileEnd (== '#') dropTrailingSpaces
trailingHashes = Text.length dropTrailingSpaces Text.length dropTrailingHashes
parseSetextToken :: Parser Leaf
parseSetextToken = fmap (uncurry SetextToken) $ withConsumed $ do
d <- satisfy (\c -> c == '-' || c == '=')
skipWhile (== d)
void pBlankline
pure $ if d == '=' then Heading1 else Heading2
scanTBreakLine :: Scanner
scanTBreakLine = do
c <- satisfy ((== '*') <||> (== '_') <||> (== '-'))
replicateM_ 2 $ skipWhile ((== ' ') <||> (== '\t')) *> skip (== c)
skipWhile ((== ' ') <||> (== '\t') <||> (== c))
endOfInput
parseCodeFence :: Parser ContainerType
parseCodeFence = do
col <- column <$> getPosition
cs <- takeWhile1 (=='`') <|> takeWhile1 (=='~')
guard $ Text.length cs >= 3
void pSpaces
rawattr <- optional (takeWhile1 (\c -> c /= '`' && c /= '~'))
endOfInput
pure FencedCode
{ codeStartColumn = col
, codeFence = cs
, codeInfo = rawattr
}
pHtmlBlockStart :: Bool -> Parser Condition
pHtmlBlockStart lastLineIsText = lookAhead $ do
discardOpt pNonIndentSpaces
asum starters
where
starters =
[ condition1 <$ blockStart condition1
, condition2 <$ blockStart condition2
, condition3 <$ blockStart condition3
, condition4 <$ blockStart condition4
, condition5 <$ blockStart condition5
, condition6 <$ blockStart condition6
, condition7 <$ if lastLineIsText then mzero else blockStart condition7
]
data Condition =
Condition
{ blockStart :: Parser ()
, blockEnd :: Parser ()
}
instance Show Condition where
show _ = "Condition{}"
instance Eq Condition where
_ == _ = False
lineContains :: Foldable t => t Text -> Parser ()
lineContains terms = do
line <- Text.toCaseFold <$> takeTill isLineEnding
guard $ any (`Text.isInfixOf` line) terms
condition1, condition2, condition3, condition4, condition5, condition6, condition7 :: Condition
condition1 = Condition
{ blockStart = do
_ <- asum $ map stringCaseless ["<script", "<pre", "<style"]
void pWhitespace <|> void ">" <|> void pLineEnding <|> endOfInput
, blockEnd = lineContains ["</script>", "</pre>", "</style>"]
}
condition2 = Condition
{ blockStart = void "<!--"
, blockEnd = void $ lineContains ["-->"]
}
condition3 = Condition
{ blockStart = void "<?"
, blockEnd = void $ lineContains ["?>"]
}
condition4 = Condition
{ blockStart = void $ "<!" *> satisfy isAsciiUpper
, blockEnd = void $ lineContains [">"]
}
condition5 = Condition
{ blockStart = void $ "<![CDATA["
, blockEnd = void $ lineContains ["]]>"]
}
condition6 = Condition
{ blockStart = do
void $ "</" <|> "<"
tag <- takeTill (isWhitespace <||> (== '/') <||> (== '>'))
guard $ isBlockHtmlTag (Text.toLower tag)
void pWhitespace <|> void pLineEnding <|> void ">" <|> void "/>"
, blockEnd = void pBlankline
}
condition7 = Condition
{ blockStart = (openTag <|> closeTag) *> (void pWhitespace <|> endOfInput)
, blockEnd = void pBlankline
}
where
tagName = do
c <- satisfy (inClass "A-Za-z")
cs <- takeWhile ((== '-') <||> inClass "A-Za-z0-9")
guard (Text.cons c cs `notElem` ["script", "style", "pre"])
attr = pWhitespace *> attrName *> optional attrValueSpec
attrName = satisfy (inClass "_:A-Za-z") *> skipWhile (inClass "A-Za-z0-9_.:-")
attrValueSpec = optional pWhitespace *> char '=' *>
optional pWhitespace *> attrValue
attrValue = void unquoted <|> void singleQuoted <|> void doubleQuoted
unquoted = skipWhile1 (notInClass " \"'=<>`")
singleQuoted = "'" *> skipWhile (/= '\'') *> "'"
doubleQuoted = "\"" *> skipWhile (/= '"') *> "\""
openTag = "<" *> tagName *> many attr *> optional pWhitespace
*> optional "/" *> ">"
closeTag = "</" *> tagName *> optional pWhitespace *> ">"
isBlockHtmlTag :: Text -> Bool
isBlockHtmlTag name = Text.toLower name `Set.member` Set.fromList
[ "address", "article", "aside", "base", "basefont", "blockquote"
, "body", "caption", "center", "col", "colgroup", "dd", "details"
, "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption"
, "figure", "footer", "form", "frame", "frameset"
, "h1", "h2", "h3", "h4", "h5", "h6", "head", "header"
, "hr", "html", "iframe", "legend", "li", "link", "main"
, "menu", "menuitem", "meta", "nav", "noframes", "ol", "optgroup"
, "option", "p", "param", "section", "source", "summary", "table"
, "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul"
]
parseListMarker :: Bool -> Bool -> Parser ContainerType
parseListMarker afterListItem lastLineIsText = do
tabCrusher
markerPadding <- Text.length <$> if afterListItem then pSpaces else pNonIndentSpaces
ty <- parseBullet <|> parseListNumber lastLineIsText
tabCrusher
contentPadding <- (1 <$ pBlankline)
<|> (1 <$ (skip (==' ') *> lookAhead pIndentSpaces))
<|> (Text.length <$> pSpaces)
guard $ contentPadding > 0
when lastLineIsText $ notFollowedBy endOfInput
pure ListItem
{ liType = ty
, liPadding = markerPadding + contentPadding + listMarkerWidth ty
}
listMarkerWidth :: ListType -> Int
listMarkerWidth (Bullet _) = 1
listMarkerWidth (Ordered _ n)
| n < 10 = 2
| n < 100 = 3
| n < 1000 = 4
| otherwise = 5
parseBullet :: Parser ListType
parseBullet = do
(bulletType, bulletChar) <- ((Plus,) <$> char '+')
<|> ((Minus,) <$> char '-')
<|> ((Asterisk,) <$> char '*')
unless (bulletType == Plus) $
notFollowedBy $ do
replicateM_ 2 $ do
skipWhile ((== ' ') <||> (== '\t'))
skip (== bulletChar)
skipWhile (\x -> x == '\t' || x == ' ' || x == bulletChar)
endOfInput
return $ Bullet bulletType
parseListNumber :: Bool -> Parser ListType
parseListNumber lastLineIsText = do
num :: Integer <- decimal
when lastLineIsText $
guard $ num == 1
guard $ num < (10 ^ (9 :: Integer))
wrap <- asum [Period <$ char '.', Paren <$ char ')']
return $ Ordered wrap (fromInteger num)