{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE TypeApplications #-} {- | Module : Text.Pandoc.Readers.Pod Copyright : © 2024 Evan Silberman License : GNU GPL, version 2 or above Maintainer : Evan Silberman Stability : WIP Portability : portable Conversion of Pod to 'Pandoc' documents -} module Text.Pandoc.Readers.Pod (readPod) where import Control.Monad (void) import Control.Monad.Except (throwError) import Data.Char (isAsciiUpper, digitToInt) import Data.Default (Default) import Text.Pandoc.Logging import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Parsing.General (isSpaceChar) import Text.Pandoc.XML (lookupEntity) import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) import Text.Pandoc.Definition (Pandoc) import Text.Pandoc.Error import Text.Pandoc.Builder (Blocks, Inlines) import qualified Text.Pandoc.Builder as B import qualified Data.Text as T import qualified Data.Text.Read as TR import Text.Pandoc.Shared (stringify, textToIdentifier, tshow) import Data.Set (Set) import Data.Functor (($>)) import Data.Maybe (listToMaybe, fromMaybe) import Numeric (readOct) data PodState = PodState { logMessages :: [LogMessage] , headerIds :: Set T.Text , options :: ReaderOptions } deriving (Show) instance HasLogMessages PodState where addLogMessage msg st = st{ logMessages = msg : logMessages st } getLogMessages st = reverse $ logMessages st instance HasIdentifierList PodState where extractIdentifierList = headerIds updateIdentifierList f st = st{headerIds = f (headerIds st)} instance HasReaderOptions PodState where extractReaderOptions = options instance Default PodState where def = PodState { logMessages = [] , headerIds = mempty , options = def } data PodLinkDestination = LinkUrl Inlines T.Text | LinkMan Inlines (Maybe Inlines) | LinkInternal Inlines deriving (Show) defaultLinkName :: PodLinkDestination -> Inlines defaultLinkName (LinkUrl inl _) = inl defaultLinkName (LinkMan nm (Just sec)) = B.doubleQuoted sec <> " in " <> nm defaultLinkName (LinkMan nm Nothing) = nm defaultLinkName (LinkInternal sec) = B.doubleQuoted sec type PodParser m = ParsecT Sources PodState m readPod :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc readPod _ s = do let sources = toSources s p <- readWithM parsePod def sources case p of Right result -> return result Left e -> throwError e parsePod :: PandocMonad m => PodParser m Pandoc parsePod = do -- We don't actually start processing Pod until we encounter a Pod command. -- If we never encounter a Pod command, the document is still valid Pod, it -- just contains no content. notPod bs <- manyTill block eof reportLogMessages return $ B.doc $ mconcat bs block :: PandocMonad m => PodParser m Blocks block = verbatim <|> paragraph <|> command "Pod paragraph" command :: PandocMonad m => PodParser m Blocks command = do try (char '=' >> notFollowedBy (string "item" <|> string "back" <|> string "end")) header <|> pod <|> cut <|> over <|> for <|> begin <|> encoding "Pod command" cmd :: PandocMonad m => T.Text -> PodParser m () cmd nm = do textStr nm notFollowedBy nonspaceChar void $ many spaceChar encoding :: PandocMonad m => PodParser m Blocks encoding = do cmd "encoding" anyLine optional blanklines logMessage $ IgnoredElement "=encoding; Pandoc requires UTF-8 input" return mempty header :: PandocMonad m => PodParser m Blocks header = do string "head" dig <- oneOf "123456" void blankline <|> skipMany1 spaceChar ins <- inlines attrs <- registerHeader B.nullAttr ins optional blanklines return $ B.headerWith attrs (digitToInt dig) ins pod :: PandocMonad m => PodParser m Blocks pod = do cmd "pod" optional (try inlines) optional blanklines return mempty cut :: PandocMonad m => PodParser m Blocks cut = cmd "cut" *> notPod notPod :: PandocMonad m => PodParser m Blocks notPod = do manyTill anyLine (eof <|> void (try (lookAhead (char '=' *> letter)))) return mempty over :: PandocMonad m => PodParser m Blocks over = do cmd "over" anyLine blanklines optional $ try (char '=' *> cut) bs <- list <|> blockquote string "=back" <* blanklines return bs list :: PandocMonad m => PodParser m Blocks list = try bulletList <|> try orderedList <|> definitionList bulletList :: PandocMonad m => PodParser m Blocks bulletList = B.bulletList <$> many1 (item (many spaceChar *> optional (char '*'))) orderedList :: PandocMonad m => PodParser m Blocks orderedList = do start <- item1 more <- many orderedItem return $ B.orderedList (start : more) where item1 = item $ spaces *> char '1' *> optional (char '.') orderedItem = item $ spaces *> many digit *> optional (char '.') item :: PandocMonad m => PodParser m () -> PodParser m Blocks item p = do try (cmd "=item") p blanklines mconcat <$> many block "runaway item" definitionList :: PandocMonad m => PodParser m Blocks definitionList = B.definitionList <$> many1 dlItem where dlItem = do try (cmd "=item") spaces term <- inlines blanklines -- perlpodspec sez the /section part of a link can refer to either -- a header or a dl item, hence treating it as a "header" here attrs <- registerHeader B.nullAttr term defn <- mconcat <$> many block "runaway dlitem" return (B.spanWith attrs term, [defn]) blockquote :: PandocMonad m => PodParser m Blocks blockquote = B.blockQuote . mconcat <$> many block "runaway blockquote" paragraph :: PandocMonad m => PodParser m Blocks paragraph = do try (notFollowedBy (char '=' *> letter)) inl <- inlines optional blanklines return $ B.para $ B.trimInlines inl inlines :: PandocMonad m => PodParser m Inlines inlines = mconcat <$> many1 (format <|> whitespace <|> str) -- perlpodspec sez: -- If a Pod processor sees any formatting code other than the ones listed, -- that processor must by default treat this as an error. format :: PandocMonad m => PodParser m Inlines format = try $ do ctrl <- satisfy isAsciiUpper p <- getPosition lookAhead (char '<') case ctrl of 'B' -> B.strong <$> argument 'C' -> B.code . stringify <$> argument 'F' -> B.spanWith (mempty, ["filename"], mempty) <$> argument 'I' -> B.emph <$> argument 'S' -> argument -- TODO map nbsps 'X' -> argument $> mempty 'Z' -> argument $> mempty 'E' -> do a <- stringify <$> argument case entity a of -- per spec: -- Pod parsers, when faced with some unknown "E" code, -- shouldn't simply replace it with nullstring (by default, at -- least), but may pass it through as a string consisting of the -- literal characters E, less-than, identifier, greater-than. Nothing -> do logMessage $ SkippedContent ("unknown entity " <> a) p return $ B.str $ "E<" <> a <> ">" Just e -> return $ B.str e 'L' -> link x -> throwError $ PandocParseError $ T.snoc "unknown Pod formatting code " x where argument = try expandedArg <|> compactArg "argument" innerStr = B.str <$> many1Char (podCharLess ">") compactArg = do char '<' mconcat <$> manyTill (format <|> whitespace <|> innerStr) (char '>') expandedArg = do openLen <- length <$> many1 (char '<') let close = T.pack $ replicate openLen '>' skipMany1 spaceChar <|> void blankline arg <- mconcat <$> many (format <|> try (whitespace <* notFollowedBy (textStr close)) <|> str) many1 spaceChar textStr close return arg -- Some legacy entity names are required to be parsed by Pod formatters oct = listToMaybe . readOct @Integer entity "apos" = Just "'" entity "sol" = Just "/" entity "verbar" = Just "|" entity "lchevron" = Just "«" entity "rchevron" = Just "»" entity (T.stripPrefix "0x" -> Just suf) = lookupEntity $ "#x" <> suf entity (T.stripPrefix "0" -> Just suf) | Just (n, "") <- oct (T.unpack suf) = lookupEntity $ "#" <> tshow n entity (TR.decimal @Integer -> Right (x, "")) = lookupEntity $ "#" <> tshow x -- named entities in Commonmark.Entity de facto have to be looked up with -- the semicolon at the end. perlpodspec says arguments to E<> must be -- alphanumeric, so an argument that already has a trailing semicolon -- is bogus anyway, so just paste the semicolon on unconditionally. entity x = lookupEntity (x <> ";") -- god knows there must be a higher order way of writing this thing, where we -- have multiple different possible parser states within the link argument -- varying depending on whether the link is expanded or not, but at least I -- understand what I've done. This would be less wacky with a lexing step. link :: PandocMonad m => PodParser m Inlines link = do identifier <- textToIdentifier <$> getOption readerExtensions (name, dest) <- try expandedLinkArg <|> compactLinkArg return $ mkLink identifier name dest where compactLinkArg = do char '<' name <- linkName whitespace ">" dest <- linkDest whitespace (char '>') ">" char '>' return (mconcat <$> name, dest) expandedLinkArg = do openLen <- length <$> many1 (char '<') let closeStr = textStr (T.pack $ replicate openLen '>') let close = skipMany1 spaceChar *> closeStr let sp = try $ many1 spaceChar *> notFollowedBy closeStr $> B.space many1 spaceChar name <- linkName sp "" dest <- linkDest sp close "" close return (mconcat <$> name, dest) mkLink identifier name dest = let name' = fromMaybe (defaultLinkName dest) name in case dest of LinkUrl _ href -> B.link href "" name' LinkMan nm Nothing -> B.linkWith (mempty, mempty, [("manual", stringify nm)]) "" "" name' LinkMan nm (Just sc) -> B.linkWith (mempty, mempty, [("manual", stringify nm), ("section", stringify sc)]) "" "" name' LinkInternal sc -> B.link ("#" <> identifier (stringify sc)) "" name' linkName sp ex = optionMaybe $ try $ many (try format <|> sp <|> B.str <$> many1Char (podCharLess ('|':ex))) <* char '|' linkDest sp close ex = try (url ex) <|> internal sp close ex <|> man sp close ex -- perlpodspec sez: -- Note that you can distinguish URL-links from anything else by the -- fact that they match m/\A\w+:[^:\s]\S*\z/. -- This is obviously not an RFC-compliant matcher for a URI scheme, but -- this is what the specification and the canonical implementation (Pod::Simple) -- do for deciding that a link target "looks like" a URL, as opposed to a -- manual page reference, so what we are doing here is roughly equivalent -- even though it is nonsense url ex = do scheme <- many1Char (letter <|> digit <|> char '_') colon <- T.singleton <$> char ':' <* notFollowedBy (char ':') rst <- many (format <|> B.str <$> many1Char (podCharLess ex)) return $ LinkUrl (B.str scheme <> B.str colon <> mconcat rst) (scheme <> colon <> stringify rst) quotedSection sp close ex = do let mystr = B.str <$> many1Char (podCharLess ('\"':ex) <|> try (char '"' <* notFollowedBy close)) char '"' ins <- mconcat <$> many1 (format <|> sp <|> mystr) char '"' return ins section sp close ex = try (quotedSection sp close ex) <|> mconcat <$> many1 (format <|> sp <|> B.str <$> many1Char (podCharLess ex)) internal sp close ex = do char '/' LinkInternal <$> section sp close ex notSlash sp ex = format <|> sp <|> B.str <$> many1Char (podCharLess ('/':ex)) man sp close ex = do page <- mconcat <$> many (notSlash sp ex) sec <- optionMaybe $ char '/' *> section sp close ex return $ LinkMan page sec whitespace :: PandocMonad m => PodParser m Inlines whitespace = try $ do many1 spaceChar *> optional newline <|> many spaceChar *> void newline notFollowedBy blankline return B.space podCharLess :: PandocMonad m => String -> PodParser m Char podCharLess exclude = try (satisfy isAsciiUpper <* notFollowedBy (char '<')) <|> satisfy (\c -> not (isSpaceChar c || isAsciiUpper c || elem c exclude)) podChar :: PandocMonad m => PodParser m Char podChar = try (satisfy isAsciiUpper <* notFollowedBy (char '<')) <|> satisfy (\c -> not (isSpaceChar c || isAsciiUpper c)) str :: PandocMonad m => PodParser m Inlines str = B.str <$> many1Char podChar nonEmptyLine :: PandocMonad m => PodParser m T.Text nonEmptyLine = try $ do pre <- manyChar spaceChar something <- T.singleton <$> nonspaceChar post <- anyLineNewline return $ pre <> something <> post verbatim :: PandocMonad m => PodParser m Blocks verbatim = do start <- startVerbatimLine lns <- many (nonEmptyLine <|> try (do b <- blanklines l <- startVerbatimLine return $ b <> l)) optional blanklines return $ B.codeBlock $ mconcat $ start:lns where startVerbatimLine = many1Char spaceChar <> nonEmptyLine -- =begin/=end/=for and data paragraphs -- The =begin/=end (and single-paragraph =for variant) markers in Pod are -- designed as an extension point for specific formatters -- -- this doesn't strictly match the intent of "=begin :ident" pod blocks, which -- are still meant to be processed specially by the formatter, and only land in -- the output upon request, i.e. pod2html will process "=begin :html" blocks as -- Pod and include them in the regular output. Since the regions contain Pod -- markup it seems to me that the best thing to do is parse the markup and put -- a classname on it, allowing users to respond as desired with filters. -- Pandoc doesn't have a built-in concept of parsed Divs that are only rendered -- to certain formats, just raw blocks. -- -- perlpodspec allows nesting of =begin/=end regions but we currently don't -- because it would be annoying and we have something somewhat useful we -- can do with these blocks which is treat them as RawBlocks, which matches -- the intent reasonably well, and that gets weirder if we parse a nested -- structure. It seems unlikely this would be encountered in the wild. regionIdentifier :: PandocMonad m => PodParser m T.Text regionIdentifier = many1Char (alphaNum <|> oneOf "-_") for :: PandocMonad m => PodParser m Blocks for = do string "for" many1 spaceChar forDiv <|> forData forDiv :: PandocMonad m => PodParser m Blocks forDiv = do char ':' cls <- regionIdentifier many1 spaceChar B.divWith (mempty, [cls], mempty) <$> paragraph forData :: PandocMonad m => PodParser m Blocks forData = do fmt <- regionIdentifier ln1 <- anyLineNewline lns <- many nonEmptyLine optional blanklines return $ B.rawBlock fmt (T.concat (ln1 : lns)) begin :: PandocMonad m => PodParser m Blocks begin = do cmd "begin" beginDiv <|> beginData beginDiv :: PandocMonad m => PodParser m Blocks beginDiv = do char ':' cls <- regionIdentifier anyLine -- "parameters" may appear in this position blanklines bs <- mconcat <$> many block textStr ("=end :" <> cls) <* blanklines return $ B.divWith (mempty, [cls], mempty) bs beginData :: PandocMonad m => PodParser m Blocks beginData = do fmt <- regionIdentifier anyLine blanklines lns <- mconcat <$> many (try rawCut <|> rawLine) textStr ("=end " <> fmt) <* blanklines return $ B.rawBlock fmt lns where rawCut = do char '=' *> cut pod "=pod to close =cut within =begin/=end" return mempty rawLine = do try (notFollowedBy (char '=' *> letter)) anyLineNewline