module Text.Pandoc.Readers.Markdown (
readMarkdown
) where
import Data.List ( transpose, isSuffixOf, lookup, sortBy )
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
htmlBlockElement )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
readMarkdown :: ParserState -> String -> Pandoc
readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n")
spaceChars = " \t"
bulletListMarkers = "*+-"
hruleChars = "*-_"
titleOpeners = "\"'("
setextHChars = "=-"
specialChars = "\\[]*_~`<>$!^-.&'\""
indentSpaces = try $ do
state <- getState
let tabStop = stateTabStop state
try (count tabStop (char ' ')) <|>
(many (char ' ') >> string "\t") <?> "indentation"
nonindentSpaces = do
state <- getState
let tabStop = stateTabStop state
choice $ map (\n -> (try (count n (char ' ')))) $ reverse [0..(tabStop 1)]
failUnlessBeginningOfLine = do
pos <- getPosition
if sourceColumn pos == 1 then return () else fail "not beginning of line"
failUnlessSmart = do
state <- getState
if stateSmart state then return () else fail "Smart typography feature"
inlineString str = try $ do
(Str res) <- inline
if res == str then return res else fail $ "unexpected Str content"
inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline]
inlinesInBalanced opener closer = try $ do
string opener
result <- manyTill ( (do lookAhead (inlineString opener)
bal <- inlinesInBalanced opener closer
return $ [Str opener] ++ bal ++ [Str closer])
<|> (count 1 inline))
(try (string closer))
return $ concat result
titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
authorsLine = try $ do
char '%'
skipSpaces
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
newline
return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
dateLine = try $ do
char '%'
skipSpaces
date <- many (noneOf "\n")
newline
return $ decodeCharacterReferences $ removeTrailingSpace date
titleBlock = try $ do
failIfStrict
title <- option [] titleLine
author <- option [] authorsLine
date <- option "" dateLine
optional blanklines
return (title, author, date)
parseMarkdown = do
updateState (\state -> state { stateParseRaw = True })
(title, author, date) <- option ([],[],"") titleBlock
refs <- manyTill (referenceKey <|> (lineClump >>= return . LineClump)) eof
let keys = map (\(KeyBlock label target) -> (label, target)) $
filter isKeyBlock refs
let rawlines = map (\(LineClump ln) -> ln) $ filter isLineClump refs
setInput $ concat rawlines
updateState (\state -> state { stateKeys = keys })
refs <- manyTill (noteBlock <|> (lineClump >>= return . LineClump)) eof
let notes = map (\(NoteBlock label blocks) -> (label, blocks)) $
filter isNoteBlock refs
let rawlines = map (\(LineClump ln) -> ln) $ filter isLineClump refs
setInput $ concat rawlines
updateState (\state -> state { stateNotes = notes })
blocks <- parseBlocks
let blocks' = filter (/= Null) blocks
return $ Pandoc (Meta title author date) blocks'
referenceKey = try $ do
nonindentSpaces
label <- reference
char ':'
skipSpaces
optional (char '<')
src <- many (noneOf "> \n\t")
optional (char '>')
tit <- option "" title
blanklines
return $ KeyBlock label (removeTrailingSpace src, tit)
noteMarker = try $ do
char '['
char '^'
manyTill (noneOf " \t\n") (char ']')
rawLine = try $ do
notFollowedBy blankline
notFollowedBy' noteMarker
contents <- many1 nonEndline
end <- option "" (newline >> optional indentSpaces >> return "\n")
return $ contents ++ end
rawLines = many1 rawLine >>= return . concat
noteBlock = try $ do
failIfStrict
ref <- noteMarker
char ':'
optional blankline
optional indentSpaces
raw <- sepBy rawLines (try (blankline >> indentSpaces))
optional blanklines
rest <- getInput
setInput $ (joinWithSep "\n" raw) ++ "\n\n"
contents <- parseBlocks
setInput rest
return $ NoteBlock ref contents
parseBlocks = manyTill block eof
block = choice [ header
, table
, codeBlock
, hrule
, list
, blockQuote
, htmlBlock
, rawLaTeXEnvironment'
, para
, plain
, nullBlock ] <?> "block"
header = setextHeader <|> atxHeader <?> "header"
atxHeader = try $ do
lead <- many1 (char '#')
notFollowedBy (char '.' <|> char ')')
skipSpaces
txt <- manyTill inline atxClosing
return $ Header (length lead) (normalizeSpaces txt)
atxClosing = try $ skipMany (char '#') >> skipSpaces >> newline >>
option "" blanklines
setextHeader = choice $
map (\x -> setextH x) $ enumFromTo 1 (length setextHChars)
setextH n = try $ do
txt <- many1Till inline newline
many1 (char (setextHChars !! (n1)))
skipSpaces
newline
optional blanklines
return $ Header n (normalizeSpaces txt)
hruleWith chr = try $ do
count 3 (skipSpaces >> char chr)
skipMany (skipSpaces >> char chr)
newline
optional blanklines
return HorizontalRule
hrule = choice (map hruleWith hruleChars) <?> "hrule"
indentedLine = try $ do
indentSpaces
result <- manyTill anyChar newline
return $ result ++ "\n"
indentedBlock = try $ do
res1 <- indentedLine
blanks <- many blankline
res2 <- indentedBlock <|> indentedLine
return $ res1 ++ blanks ++ res2
codeBlock = (indentedBlock <|> indentedLine) >>~ optional blanklines >>=
return . CodeBlock . stripTrailingNewlines
emacsBoxQuote = try $ do
failIfStrict
string ",----"
manyTill anyChar newline
raw <- manyTill
(try (char '|' >> optional (char ' ') >> manyTill anyChar newline))
(try (string "`----"))
blanklines
return raw
emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
emailBlockQuote = try $ do
emailBlockQuoteStart
raw <- sepBy (many (nonEndline <|>
(try (endline >> notFollowedBy emailBlockQuoteStart >>
return '\n'))))
(try (newline >> emailBlockQuoteStart))
newline <|> (eof >> return '\n')
optional blanklines
return raw
blockQuote = do
raw <- emailBlockQuote <|> emacsBoxQuote
rest <- getInput
setInput $ (joinWithSep "\n" raw) ++ "\n\n"
contents <- parseBlocks
setInput rest
return $ BlockQuote contents
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
bulletListStart = try $ do
optional newline
nonindentSpaces
notFollowedBy' hrule
oneOf bulletListMarkers
spaceChar
skipSpaces
anyOrderedListStart = try $ do
optional newline
nonindentSpaces
state <- getState
if stateStrict state
then do many1 digit
char '.'
return (1, DefaultStyle, DefaultDelim)
else anyOrderedListMarker
orderedListStart style delim = try $ do
optional newline
nonindentSpaces
state <- getState
if stateStrict state
then do many1 digit
char '.'
return 1
else orderedListMarker style delim
oneOf spaceChars
skipSpaces
listLine start = try $ do
notFollowedBy' start
notFollowedBy blankline
notFollowedBy' (do indentSpaces
many (spaceChar)
bulletListStart <|> (anyOrderedListStart >> return ()))
line <- manyTill anyChar newline
return $ line ++ "\n"
rawListItem start = try $ do
start
result <- many1 (listLine start)
blanks <- many blankline
return $ concat result ++ blanks
listContinuation start = try $ do
lookAhead indentSpaces
result <- many1 (listContinuationLine start)
blanks <- many blankline
return $ concat result ++ blanks
listContinuationLine start = try $ do
notFollowedBy blankline
notFollowedBy' start
optional indentSpaces
result <- manyTill anyChar newline
return $ result ++ "\n"
listItem start = try $ do
first <- rawListItem start
continuations <- many (listContinuation start)
state <- getState
let oldContext = stateParserContext state
setState $ state {stateParserContext = ListItemState}
rest <- getInput
let raw = concat (first:continuations)
setInput raw
contents <- parseBlocks
setInput rest
updateState (\st -> st {stateParserContext = oldContext})
return contents
orderedList = do
(start, style, delim) <- lookAhead anyOrderedListStart
items <- many1 (listItem (orderedListStart style delim))
return $ OrderedList (start, style, delim) $ compactify items
bulletList = many1 (listItem bulletListStart) >>=
return . BulletList . compactify
definitionListItem = try $ do
notFollowedBy blankline
notFollowedBy' indentSpaces
term <- manyTill inline newline
raw <- many1 defRawBlock
state <- getState
let oldContext = stateParserContext state
rest <- getInput
setInput (concat raw)
contents <- parseBlocks
setInput rest
updateState (\st -> st {stateParserContext = oldContext})
return ((normalizeSpaces term), contents)
defRawBlock = try $ do
char ':'
state <- getState
let tabStop = stateTabStop state
try (count (tabStop 1) (char ' ')) <|> (many (char ' ') >> string "\t")
firstline <- anyLine
rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
trailing <- option "" blanklines
return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
definitionList = do
failIfStrict
items <- many1 definitionListItem
let (terms, defs) = unzip items
let defs' = compactify defs
let items' = zip terms defs'
return $ DefinitionList items'
para = try $ do
result <- many1 inline
newline
st <- getState
if stateStrict st
then choice [ lookAhead blockQuote, lookAhead header,
(blanklines >> return Null) ]
else choice [ lookAhead emacsBoxQuote >> return Null,
(blanklines >> return Null) ]
return $ Para $ normalizeSpaces result
plain = many1 inline >>= return . Plain . normalizeSpaces
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
htmlBlock = do
st <- getState
if stateStrict st
then try $ do failUnlessBeginningOfLine
first <- htmlElement
finalSpace <- many (oneOf spaceChars)
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
else rawHtmlBlocks
isSelfClosing tag =
isSuffixOf "/>" $ filter (\c -> (not (c `elem` " \n\t"))) tag
strictHtmlBlock = try $ do
tag <- anyHtmlBlockTag
let tag' = extractTagType tag
if isSelfClosing tag || tag' == "hr"
then return tag
else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
(htmlElement <|> (count 1 anyChar)))
end <- htmlEndTag tag'
return $ tag ++ concat contents ++ end
rawHtmlBlocks = try $ do
htmlBlocks <- many1 rawHtmlBlock
let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
let combined' = if not (null combined) && last combined == '\n'
then init combined
else combined
return $ RawHtml combined'
rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment
dashedLine ch = try $ do
dashes <- many1 (char ch)
sp <- many spaceChar
return $ (length dashes, length $ dashes ++ sp)
simpleTableHeader = try $ do
rawContent <- anyLine
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines) = unzip dashes
let indices = scanl (+) (length initSp) lines
let rawHeads = tail $ splitByIndices (init indices) rawContent
let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
return (rawHeads, aligns, indices)
tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
rawTableLine indices = do
notFollowedBy' (blanklines <|> tableFooter)
line <- many1Till anyChar newline
return $ map removeLeadingTrailingSpace $ tail $
splitByIndices (init indices) line
tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
multilineRow indices = try $ do
colLines <- many1 (rawTableLine indices)
optional blanklines
let cols = map unlines $ transpose colLines
mapM (parseFromString (many plain)) cols
widthsFromIndices :: Int
-> [Int]
-> [Float]
widthsFromIndices _ [] = []
widthsFromIndices numColumns indices =
let lengths = zipWith () indices (0:indices)
totLength = sum lengths
quotient = if totLength > numColumns
then fromIntegral totLength
else fromIntegral numColumns
fracs = map (\l -> (fromIntegral l) / quotient) lengths in
tail fracs
tableCaption = try $ do
nonindentSpaces
string "Table:"
result <- many1 inline
blanklines
return $ normalizeSpaces result
tableWith headerParser lineParser footerParser = try $ do
(rawHeads, aligns, indices) <- headerParser
lines <- many1Till (lineParser indices) footerParser
caption <- option [] tableCaption
heads <- mapM (parseFromString (many plain)) rawHeads
state <- getState
let numColumns = stateColumns state
let widths = widthsFromIndices numColumns indices
return $ Table caption aligns widths heads lines
simpleTable = tableWith simpleTableHeader tableLine blanklines
multilineTable = tableWith multilineTableHeader multilineRow tableFooter
multilineTableHeader = try $ do
tableSep
rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines) = unzip dashes
let indices = scanl (+) (length initSp) lines
let rawHeadsList = transpose $ map
(\ln -> tail $ splitByIndices (init indices) ln)
rawContent
let rawHeads = map (joinWithSep " ") rawHeadsList
let aligns = zipWith alignType rawHeadsList lengths
return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
alignType :: [String] -> Int -> Alignment
alignType [] len = AlignDefault
alignType strLst len =
let str = head $ sortBy (comparing length) $
map removeTrailingSpace strLst
leftSpace = if null str then False else (str !! 0) `elem` " \t"
rightSpace = length str < len || (str !! (len 1)) `elem` " \t"
in case (leftSpace, rightSpace) of
(True, False) -> AlignRight
(False, True) -> AlignLeft
(True, True) -> AlignCenter
(False, False) -> AlignDefault
table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table"
inline = choice [ rawLaTeXInline'
, escapedChar
, charRef
, note
, inlineNote
, link
, referenceLink
, rawHtmlInline'
, autoLink
, image
, math
, strong
, emph
, strikeout
, superscript
, subscript
, smartPunctuation
, code
, ltSign
, symbol
, str
, linebreak
, tabchar
, whitespace
, endline ] <?> "inline"
escapedChar = try $ do
char '\\'
state <- getState
result <- if stateStrict state
then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum)
return $ Str [result]
ltSign = try $ do
notFollowedBy (noneOf "<")
notFollowedBy' rawHtmlBlocks
char '<'
return $ Str ['<']
specialCharsMinusLt = filter (/= '<') specialChars
symbol = do
result <- oneOf specialCharsMinusLt
return $ Str [result]
code = try $ do
starts <- many1 (char '`')
let num = length starts
result <- many1Till anyChar (try (count num (char '`')))
return $ Code $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result
mathWord = many1 ((noneOf " \t\n\\$") <|>
(try (char '\\') >>~ notFollowedBy (char '$')))
math = try $ do
failIfStrict
char '$'
notFollowedBy space
words <- sepBy1 mathWord (many1 space)
char '$'
return $ TeX ("$" ++ (joinWithSep " " words) ++ "$")
emph = ((enclosed (char '*') (char '*') inline) <|>
(enclosed (char '_') (char '_') inline)) >>=
return . Emph . normalizeSpaces
strong = ((enclosed (string "**") (string "**") inline) <|>
(enclosed (string "__") (string "__") inline)) >>=
return . Strong . normalizeSpaces
strikeout = failIfStrict >> enclosed (string "~~") (string "~~") inline >>=
return . Strikeout . normalizeSpaces
superscript = failIfStrict >> enclosed (char '^') (char '^')
(notFollowedBy' whitespace >> inline) >>=
return . Superscript
subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy' whitespace >> inline) >>=
return . Subscript
smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ]
apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
quoted = doubleQuoted <|> singleQuoted
withQuoteContext context parser = do
oldState <- getState
let oldQuoteContext = stateQuoteContext oldState
setState oldState { stateQuoteContext = context }
result <- parser
newState <- getState
setState newState { stateQuoteContext = oldQuoteContext }
return result
singleQuoted = try $ do
singleQuoteStart
withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
return . Quoted SingleQuote . normalizeSpaces
doubleQuoted = try $ do
doubleQuoteStart
withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
return . Quoted DoubleQuote . normalizeSpaces
failIfInQuoteContext context = do
st <- getState
if (stateQuoteContext st == context)
then fail "already inside quotes"
else return ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
char '\8216' <|>
do char '\''
notFollowedBy (oneOf ")!],.;:-? \t\n")
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
return '\''
singleQuoteEnd = (char '\'' <|> char '\8217') >> notFollowedBy alphaNum
doubleQuoteStart = failIfInQuoteContext InDoubleQuote >>
(char '"' <|> char '\8220') >>
notFollowedBy (oneOf " \t\n")
doubleQuoteEnd = char '"' <|> char '\8221'
ellipses = try $ oneOfStrings ["...", " . . . ", ". . .", " . . ."] >>
return Ellipses
dash = enDash <|> emDash
enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >>
skipSpaces >> return EmDash
whitespace = (many1 (oneOf spaceChars) >> return Space) <?> "whitespace"
tabchar = tab >> return (Str "\t")
linebreak = try $ oneOf spaceChars >> many1 (oneOf spaceChars) >>
endline >> return LineBreak
nonEndline = satisfy (/='\n')
strChar = noneOf (specialChars ++ spaceChars ++ "\n")
str = many1 strChar >>= return . Str
endline = try $ do
newline
notFollowedBy blankline
st <- getState
if stateStrict st
then do
notFollowedBy emailBlockQuoteStart
notFollowedBy (char '#')
notFollowedBy (manyTill anyChar newline >> oneOf setextHChars)
else return ()
if stateParserContext st == ListItemState
then notFollowedBy' (bulletListStart <|>
(anyOrderedListStart >> return ()))
else return ()
return Space
reference = notFollowedBy' (string "[^") >>
inlinesInBalanced "[" "]" >>= (return . normalizeSpaces)
source = try $ do
char '('
optional (char '<')
src <- many (noneOf ")> \t\n")
optional (char '>')
tit <- option "" title
skipSpaces
char ')'
return (removeTrailingSpace src, tit)
titleWith startChar endChar = try $ do
leadingSpace <- many1 (oneOf " \t\n")
if length (filter (=='\n') leadingSpace) > 1
then fail "title must be separated by space and on same or next line"
else return ()
char startChar
tit <- manyTill anyChar (try (char endChar >> skipSpaces >>
notFollowedBy (noneOf ")\n")))
return $ decodeCharacterReferences tit
title = choice [ titleWith '(' ')',
titleWith '"' '"',
titleWith '\'' '\''] <?> "title"
link = choice [explicitLink, referenceLink] <?> "link"
explicitLink = try $ do
label <- reference
src <- source
return $ Link label src
referenceLink = try $ do
label <- reference
ref <- option [] (try (skipSpaces >> optional newline >>
skipSpaces >> reference))
let ref' = if null ref then label else ref
state <- getState
case lookupKeySrc (stateKeys state) ref' of
Nothing -> fail "no corresponding key"
Just target -> return (Link label target)
autoLink = autoLinkEmail <|> autoLinkRegular
autoLinkEmail = try $ do
char '<'
name <- many1Till (noneOf "/:<> \t\n") (char '@')
domain <- sepBy1 (many1 (noneOf "/:.@<> \t\n")) (char '.')
let src = name ++ "@" ++ (joinWithSep "." domain)
char '>'
return $ Link [Code src] (("mailto:" ++ src), "")
autoLinkRegular = try $ do
char '<'
prot <- oneOfStrings ["http:", "ftp:", "mailto:"]
rest <- many1Till (noneOf " \t\n<>") (char '>')
let src = prot ++ rest
return $ Link [Code src] (src, "")
image = try $ do
char '!'
(Link label src) <- link
return $ Image label src
note = try $ do
failIfStrict
ref <- noteMarker
state <- getState
let notes = stateNotes state
case lookup ref notes of
Nothing -> fail "note not found"
Just contents -> return $ Note contents
inlineNote = try $ do
failIfStrict
char '^'
contents <- inlinesInBalanced "[" "]"
return $ Note [Para contents]
rawLaTeXInline' = failIfStrict >> rawLaTeXInline
rawHtmlInline' = do
st <- getState
result <- choice $ if stateStrict st
then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
else [htmlBlockElement, anyHtmlInlineTag]
return $ HtmlInline result