{-# LANGUAGE RelaxedPolyRec, FlexibleInstances, TypeSynonymInstances, FlexibleContexts #-} -- RelaxedPolyRec needed for inlinesBetween on GHC < 7 {- Copyright (C) 2014 Alexander Sulfrian This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.TWiki Copyright : Copyright (C) 2014 Alexander Sulfrian License : GNU GPL, version 2 or above Maintainer : Alexander Sulfrian Stability : alpha Portability : portable Conversion of twiki text to 'Pandoc' document. -} module Text.Pandoc.Readers.TWiki ( readTWiki , readTWikiWithWarnings ) where import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (enclosed, macro, nested) import Text.Pandoc.Readers.HTML (htmlTag, isCommentTag) import Data.Monoid (Monoid, mconcat, mempty) import Control.Applicative ((<$>), (<*), (*>), (<$)) import Control.Monad import Text.Printf (printf) import Debug.Trace (trace) import Text.Pandoc.XML (fromEntities) import Data.Maybe (fromMaybe) import Text.HTML.TagSoup import Data.Char (isAlphaNum) import qualified Data.Foldable as F import Text.Pandoc.Error -- | Read twiki from an input string and return a Pandoc document. readTWiki :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Either PandocError Pandoc readTWiki opts s = (readWith parseTWiki) def{ stateOptions = opts } (s ++ "\n\n") readTWikiWithWarnings :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Either PandocError (Pandoc, [String]) readTWikiWithWarnings opts s = (readWith parseTWikiWithWarnings) def{ stateOptions = opts } (s ++ "\n\n") where parseTWikiWithWarnings = do doc <- parseTWiki warnings <- stateWarnings <$> getState return (doc, warnings) type TWParser = Parser [Char] ParserState -- -- utility functions -- tryMsg :: String -> TWParser a -> TWParser a tryMsg msg p = try p msg skip :: TWParser a -> TWParser () skip parser = parser >> return () nested :: TWParser a -> TWParser a nested p = do nestlevel <- stateMaxNestingLevel <$> getState guard $ nestlevel > 0 updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } res <- p updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res htmlElement :: String -> TWParser (Attr, String) htmlElement tag = tryMsg tag $ do (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag []) content <- manyTill anyChar (endtag <|> endofinput) return (htmlAttrToPandoc attr, trim content) where endtag = skip $ htmlTag (~== TagClose tag) endofinput = lookAhead $ try $ skipMany blankline >> skipSpaces >> eof trim = dropWhile (=='\n') . reverse . dropWhile (=='\n') . reverse htmlAttrToPandoc :: [Attribute String] -> Attr htmlAttrToPandoc attrs = (ident, classes, keyvals) where ident = fromMaybe "" $ lookup "id" attrs classes = maybe [] words $ lookup "class" attrs keyvals = [(k,v) | (k,v) <- attrs, k /= "id" && k /= "class"] parseHtmlContentWithAttrs :: String -> TWParser a -> TWParser (Attr, [a]) parseHtmlContentWithAttrs tag parser = do (attr, content) <- htmlElement tag parsedContent <- try $ parseContent content return (attr, parsedContent) where parseContent = parseFromString $ nested $ manyTill parser endOfContent endOfContent = try $ skipMany blankline >> skipSpaces >> eof parseHtmlContent :: String -> TWParser a -> TWParser [a] parseHtmlContent tag p = parseHtmlContentWithAttrs tag p >>= return . snd -- -- main parser -- parseTWiki :: TWParser Pandoc parseTWiki = do bs <- mconcat <$> many block spaces eof return $ B.doc bs -- -- block parsers -- block :: TWParser B.Blocks block = do tr <- getOption readerTrace pos <- getPosition res <- mempty <$ skipMany1 blankline <|> blockElements <|> para skipMany blankline when tr $ trace (printf "line %d: %s" (sourceLine pos) (take 60 $ show $ B.toList res)) (return ()) return res blockElements :: TWParser B.Blocks blockElements = choice [ separator , header , verbatim , literal , list "" , table , blockQuote , noautolink ] separator :: TWParser B.Blocks separator = tryMsg "separator" $ string "---" >> newline >> return B.horizontalRule header :: TWParser B.Blocks header = tryMsg "header" $ do string "---" level <- many1 (char '+') >>= return . length guard $ level <= 6 classes <- option [] $ string "!!" >> return ["unnumbered"] skipSpaces content <- B.trimInlines . mconcat <$> manyTill inline newline attr <- registerHeader ("", classes, []) content return $ B.headerWith attr level $ content verbatim :: TWParser B.Blocks verbatim = (htmlElement "verbatim" <|> htmlElement "pre") >>= return . (uncurry B.codeBlockWith) literal :: TWParser B.Blocks literal = htmlElement "literal" >>= return . rawBlock where format (_, _, kvs) = fromMaybe "html" $ lookup "format" kvs rawBlock (attrs, content) = B.rawBlock (format attrs) content list :: String -> TWParser B.Blocks list prefix = choice [ bulletList prefix , orderedList prefix , definitionList prefix] definitionList :: String -> TWParser B.Blocks definitionList prefix = tryMsg "definitionList" $ do indent <- lookAhead $ string prefix *> (many1 $ string " ") <* string "$ " elements <- many $ parseDefinitionListItem (prefix ++ concat indent) return $ B.definitionList elements where parseDefinitionListItem :: String -> TWParser (B.Inlines, [B.Blocks]) parseDefinitionListItem indent = do string (indent ++ "$ ") >> skipSpaces term <- many1Till inline $ string ": " line <- listItemLine indent $ string "$ " return $ (mconcat term, [line]) bulletList :: String -> TWParser B.Blocks bulletList prefix = tryMsg "bulletList" $ parseList prefix (char '*') (char ' ') orderedList :: String -> TWParser B.Blocks orderedList prefix = tryMsg "orderedList" $ parseList prefix (oneOf "1iIaA") (string ". ") parseList :: Show a => String -> TWParser Char -> TWParser a -> TWParser B.Blocks parseList prefix marker delim = do (indent, style) <- lookAhead $ string prefix *> listStyle <* delim blocks <- many $ parseListItem (prefix ++ indent) (char style <* delim) return $ case style of '1' -> B.orderedListWith (1, DefaultStyle, DefaultDelim) blocks 'i' -> B.orderedListWith (1, LowerRoman, DefaultDelim) blocks 'I' -> B.orderedListWith (1, UpperRoman, DefaultDelim) blocks 'a' -> B.orderedListWith (1, LowerAlpha, DefaultDelim) blocks 'A' -> B.orderedListWith (1, UpperAlpha, DefaultDelim) blocks _ -> B.bulletList blocks where listStyle = do indent <- many1 $ string " " style <- marker return (concat indent, style) parseListItem :: Show a => String -> TWParser a -> TWParser B.Blocks parseListItem prefix marker = string prefix >> marker >> listItemLine prefix marker listItemLine :: Show a => String -> TWParser a -> TWParser B.Blocks listItemLine prefix marker = lineContent >>= parseContent >>= return . mconcat where lineContent = do content <- anyLine continuation <- optionMaybe listContinuation return $ filterSpaces content ++ "\n" ++ (maybe "" (" " ++) continuation) filterSpaces = reverse . dropWhile (== ' ') . reverse listContinuation = notFollowedBy (string prefix >> marker) >> string " " >> lineContent parseContent = parseFromString $ many1 $ nestedList <|> parseInline parseInline = many1Till inline (lastNewline <|> newlineBeforeNestedList) >>= return . B.plain . mconcat nestedList = list prefix lastNewline = try $ char '\n' <* eof newlineBeforeNestedList = try $ char '\n' <* lookAhead nestedList table :: TWParser B.Blocks table = try $ do tableHead <- optionMaybe $ many1Till tableParseHeader newline >>= return . unzip rows <- many1 tableParseRow return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead where buildTable caption rows (aligns, heads) = B.table caption aligns heads rows align rows = replicate (columCount rows) (AlignDefault, 0) columns rows = replicate (columCount rows) mempty columCount rows = length $ head rows tableParseHeader :: TWParser ((Alignment, Double), B.Blocks) tableParseHeader = try $ do char '|' leftSpaces <- many spaceChar >>= return . length char '*' content <- tableColumnContent (char '*' >> skipSpaces >> char '|') char '*' rightSpaces <- many spaceChar >>= return . length optional tableEndOfRow return (tableAlign leftSpaces rightSpaces, content) where tableAlign left right | left >= 2 && left == right = (AlignCenter, 0) | left > right = (AlignRight, 0) | otherwise = (AlignLeft, 0) tableParseRow :: TWParser [B.Blocks] tableParseRow = many1Till tableParseColumn newline tableParseColumn :: TWParser B.Blocks tableParseColumn = char '|' *> skipSpaces *> tableColumnContent (skipSpaces >> char '|') <* skipSpaces <* optional tableEndOfRow tableEndOfRow :: TWParser Char tableEndOfRow = lookAhead (try $ char '|' >> char '\n') >> char '|' tableColumnContent :: Show a => TWParser a -> TWParser B.Blocks tableColumnContent end = manyTill content (lookAhead $ try end) >>= return . B.plain . mconcat where content = continuation <|> inline continuation = try $ char '\\' >> newline >> return mempty blockQuote :: TWParser B.Blocks blockQuote = parseHtmlContent "blockquote" block >>= return . B.blockQuote . mconcat noautolink :: TWParser B.Blocks noautolink = do (_, content) <- htmlElement "noautolink" st <- getState setState $ st{ stateAllowLinks = False } blocks <- try $ parseContent content setState $ st{ stateAllowLinks = True } return $ mconcat blocks where parseContent = parseFromString $ many $ block para :: TWParser B.Blocks para = many1Till inline endOfParaElement >>= return . result . mconcat where endOfParaElement = lookAhead $ endOfInput <|> endOfPara <|> newBlockElement endOfInput = try $ skipMany blankline >> skipSpaces >> eof endOfPara = try $ blankline >> skipMany1 blankline newBlockElement = try $ blankline >> skip blockElements result content = if F.all (==Space) content then mempty else B.para $ B.trimInlines content -- -- inline parsers -- inline :: TWParser B.Inlines inline = choice [ whitespace , br , macro , strong , strongHtml , strongAndEmph , emph , emphHtml , boldCode , smart , link , htmlComment , code , codeHtml , nop , autoLink , str , symbol ] "inline" whitespace :: TWParser B.Inlines whitespace = (lb <|> regsp) >>= return where lb = try $ skipMany spaceChar >> linebreak >> return B.space regsp = try $ skipMany1 spaceChar >> return B.space br :: TWParser B.Inlines br = try $ string "%BR%" >> return B.linebreak linebreak :: TWParser B.Inlines linebreak = newline >> notFollowedBy newline >> (lastNewline <|> innerNewline) where lastNewline = eof >> return mempty innerNewline = return B.space between :: (Show b, Monoid c) => TWParser a -> TWParser b -> (TWParser b -> TWParser c) -> TWParser c between start end p = mconcat <$> try (start >> notFollowedBy whitespace >> many1Till (p end) end) enclosed :: (Show a, Monoid b) => TWParser a -> (TWParser a -> TWParser b) -> TWParser b enclosed sep p = between sep (try $ sep <* endMarker) p where endMarker = lookAhead $ skip endSpace <|> skip (oneOf ".,!?:)|") <|> eof endSpace = (spaceChar <|> newline) >> return B.space macro :: TWParser B.Inlines macro = macroWithParameters <|> withoutParameters where withoutParameters = enclosed (char '%') (\_ -> macroName) >>= return . emptySpan emptySpan name = buildSpan name [] mempty macroWithParameters :: TWParser B.Inlines macroWithParameters = try $ do char '%' name <- macroName (content, kvs) <- attributes char '%' return $ buildSpan name kvs $ B.str content buildSpan :: String -> [(String, String)] -> B.Inlines -> B.Inlines buildSpan className kvs = B.spanWith attrs where attrs = ("", ["twiki-macro", className] ++ additionalClasses, kvsWithoutClasses) additionalClasses = maybe [] words $ lookup "class" kvs kvsWithoutClasses = [(k,v) | (k,v) <- kvs, k /= "class"] macroName :: TWParser String macroName = do first <- letter rest <- many $ alphaNum <|> char '_' return (first:rest) attributes :: TWParser (String, [(String, String)]) attributes = char '{' *> spnl *> many (attribute <* spnl) <* char '}' >>= return . foldr (either mkContent mkKvs) ([], []) where spnl = skipMany (spaceChar <|> newline) mkContent c ([], kvs) = (c, kvs) mkContent c (rest, kvs) = (c ++ " " ++ rest, kvs) mkKvs kv (cont, rest) = (cont, (kv : rest)) attribute :: TWParser (Either String (String, String)) attribute = withKey <|> withoutKey where withKey = try $ do key <- macroName char '=' parseValue False >>= return . (curry Right key) withoutKey = try $ parseValue True >>= return . Left parseValue allowSpaces = (withQuotes <|> withoutQuotes allowSpaces) >>= return . fromEntities withQuotes = between (char '"') (char '"') (\_ -> count 1 $ noneOf ['"']) withoutQuotes allowSpaces | allowSpaces == True = many1 $ noneOf "}" | otherwise = many1 $ noneOf " }" nestedInlines :: Show a => TWParser a -> TWParser B.Inlines nestedInlines end = innerSpace <|> nestedInline where innerSpace = try $ whitespace <* (notFollowedBy end) nestedInline = notFollowedBy whitespace >> nested inline strong :: TWParser B.Inlines strong = try $ enclosed (char '*') nestedInlines >>= return . B.strong strongHtml :: TWParser B.Inlines strongHtml = (parseHtmlContent "strong" inline <|> parseHtmlContent "b" inline) >>= return . B.strong . mconcat strongAndEmph :: TWParser B.Inlines strongAndEmph = try $ enclosed (string "__") nestedInlines >>= return . B.emph . B.strong emph :: TWParser B.Inlines emph = try $ enclosed (char '_') nestedInlines >>= return . B.emph emphHtml :: TWParser B.Inlines emphHtml = (parseHtmlContent "em" inline <|> parseHtmlContent "i" inline) >>= return . B.emph . mconcat nestedString :: Show a => TWParser a -> TWParser String nestedString end = innerSpace <|> (count 1 nonspaceChar) where innerSpace = try $ many1 spaceChar <* notFollowedBy end boldCode :: TWParser B.Inlines boldCode = try $ enclosed (string "==") nestedString >>= return . B.strong . B.code . fromEntities htmlComment :: TWParser B.Inlines htmlComment = htmlTag isCommentTag >> return mempty code :: TWParser B.Inlines code = try $ enclosed (char '=') nestedString >>= return . B.code . fromEntities codeHtml :: TWParser B.Inlines codeHtml = do (attrs, content) <- parseHtmlContentWithAttrs "code" anyChar return $ B.codeWith attrs $ fromEntities content autoLink :: TWParser B.Inlines autoLink = try $ do state <- getState guard $ stateAllowLinks state (text, url) <- parseLink guard $ checkLink (head $ reverse url) return $ makeLink (text, url) where parseLink = notFollowedBy nop >> (uri <|> emailAddress) makeLink (text, url) = B.link url "" $ B.str text checkLink c | c == '/' = True | otherwise = isAlphaNum c str :: TWParser B.Inlines str = (many1 alphaNum <|> count 1 characterReference) >>= return . B.str nop :: TWParser B.Inlines nop = try $ (skip exclamation <|> skip nopTag) >> followContent where exclamation = char '!' nopTag = stringAnyCase "" followContent = many1 nonspaceChar >>= return . B.str . fromEntities symbol :: TWParser B.Inlines symbol = count 1 nonspaceChar >>= return . B.str smart :: TWParser B.Inlines smart = do getOption readerSmart >>= guard doubleQuoted <|> singleQuoted <|> choice [ apostrophe , dash , ellipses ] singleQuoted :: TWParser B.Inlines singleQuoted = try $ do singleQuoteStart withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= (return . B.singleQuoted . B.trimInlines . mconcat) doubleQuoted :: TWParser B.Inlines doubleQuoted = try $ do doubleQuoteStart contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline) (withQuoteContext InDoubleQuote $ doubleQuoteEnd >> return (B.doubleQuoted $ B.trimInlines contents)) <|> (return $ (B.str "\8220") B.<> contents) link :: TWParser B.Inlines link = try $ do st <- getState guard $ stateAllowLinks st setState $ st{ stateAllowLinks = False } (url, title, content) <- linkText setState $ st{ stateAllowLinks = True } return $ B.link url title content linkText :: TWParser (String, String, B.Inlines) linkText = do string "[[" url <- many1Till anyChar (char ']') content <- option [B.str url] linkContent char ']' return (url, "", mconcat content) where linkContent = (char '[') >> many1Till anyChar (char ']') >>= parseLinkContent parseLinkContent = parseFromString $ many1 inline