> {-# LANGUAGE OverloadedStrings #-} > module Unlit.Text (unlit, relit, Style(..), Name(..), name2style) where > > import Prelude hiding (all) > import Control.Applicative ((<|>)) > import Data.Maybe (maybe,maybeToList,listToMaybe,fromMaybe) > import Data.Text.Lazy (Text) > import qualified Data.Text.Lazy as T > import qualified Data.Text.Lazy.IO as T What are literate programs? =========================== There are several styles of literate programming. Most commonly, these are: LaTeX-style code tags, Bird tags and Markdown fenced code blocks. Each of these styles is characterised by its own set of delimiters: > data Delim = BeginCode | EndCode > | BirdTag > | TildeFence | BacktickFence > deriving (Eq) > instance Show Delim where > show BeginCode = "\\begin{code}" > show EndCode = "\\end{code}" > show BirdTag = ">" > show TildeFence = "~~~" > show BacktickFence = "```" In LaTeX-style, a codeblock is delimited by `\begin{code}` and `\end{code}` tags. > beginCode, endCode :: Text > beginCode = "\\begin{code}" > endCode = "\\end{code}" > > isBeginCode, isEndCode :: Text -> Bool > isBeginCode l = beginCode `T.isPrefixOf` l > isEndCode l = endCode `T.isPrefixOf` l In Bird-style, every line in a codeblock must start with a Bird tag. A tagged line is defined as *either* a line containing solely the symbol '>', or a line starting with the symbol '>' followed by at least one space. > isBirdTag :: Text -> Bool > isBirdTag l = (l == ">") || ("> " `T.isPrefixOf` l) Due to this definition, whenever we strip a bird tag, we also remove a the first space following it. > stripBirdTag :: Text -> Text > stripBirdTag l > | l == ">" = "" > | otherwise = T.drop 2 l Lastly, Markdown supports two styles of fenced codeblocks: using tildes or using backticks. > tildeFence, backtickFence :: Text > tildeFence = "~~~" > backtickFence = "```" > > isTildeFence, isBacktickFence :: Text -> Bool > isTildeFence l = tildeFence `T.isPrefixOf` l > isBacktickFence l = backtickFence `T.isPrefixOf` l These two fences have support for adding metadata, in the form of a CSS-style dictionary (`{#mycode .haskell .numberLines startFrom=100}`) for long fences or a list of classes for short fences.[^fenced-code-attributes] In general, we will also need a function that checks, for a given line, whether it conforms to *any* of the styles. > isDelim :: [Delim] -> Text -> Maybe Delim > isDelim ds l = > listToMaybe . map fst . filter (\(d,p) -> d `elem` ds && p l) $ detectors > where > detectors :: [(Delim, Text -> Bool)] > detectors = > [ (BeginCode , isBeginCode) > , (EndCode , isEndCode) > , (BirdTag , isBirdTag) > , (TildeFence , isTildeFence) > , (BacktickFence , isBacktickFence) > ] And, for the styles which use opening and closing brackets, we will need a function that checks if these pairs match. > match :: Delim -> Delim -> Bool > match BeginCode EndCode = True > match TildeFence TildeFence = True > match BacktickFence BacktickFence = True > match _ _ = False Note that Bird-tags are notably absent from the `match` function, as they are a special case. What do we want `unlit` to do? ============================== The `unlit` program that we will implement below will do the following: it will read a literate program from the standard input---allowing one or more styles of code block---and emit only the code to the standard output. The options for source styles are as follows: > data Name = All | Bird | Haskell | LaTeX | Markdown deriving (Show) > data Style = Style { name :: Name, allowed :: [Delim] } > > latex, bird, markdown :: Style > all = Style All [BeginCode, EndCode, BirdTag, TildeFence, BacktickFence] > bird = Style Bird [BirdTag] > haskell = Style Haskell [BeginCode, EndCode, BirdTag] > latex = Style LaTeX [BeginCode, EndCode] > markdown = Style Markdown [BirdTag, TildeFence, BacktickFence] > > name2style All = all > name2style Bird = bird > name2style Haskell = haskell > name2style LaTeX = latex > name2style Markdown = markdown Additionally, when the source style is set to `Nothing`, the program will guess the style based on the first delimiter it encounters, always guessing the most permissive style---i.e. when it encounters a Bird-tag it will assume that it is dealing with a Markdown-style literate file and also allow fenced code blocks. > infer :: Maybe Delim -> Maybe Style > infer Nothing = Nothing > infer (Just BeginCode) = Just latex > infer (Just _) = Just markdown Thus, the `unlit` function will have two parameters: its source style and the text to convert. > unlit :: Maybe Style -> [(Int, Text)] -> [Text] > unlit ss = unlit' ss Nothing However, the helper function `unlit'` is best thought of as a finite state automaton, where the states are used to remember the what kind of code block (if any) the automaton currently is in. > type State = Maybe Delim > unlit' :: Maybe Style -> State -> [(Int, Text)] -> [Text] > unlit' _ _ [] = [] > unlit' ss q ((n, l):ls) = case (q, q') of > > (Nothing , Nothing) -> continue > (Nothing , Just BirdTag) -> blockOpen $ Just (stripBirdTag l) > (Just BirdTag , Just BirdTag) -> blockContinue $ stripBirdTag l > (Just BirdTag , Nothing) -> blockClose > (Nothing , Just EndCode) -> spurious EndCode > (Nothing , Just o) -> blockOpen $ Nothing > (Just o , Nothing) -> blockContinue $ l > (Just o , Just c) -> if o `match` c then blockClose else spurious c > > where > q' = isDelim (allowed (fromMaybe all ss)) l > continueWith q = unlit' (ss <|> infer q') q ls > continue = continueWith q > blockOpen l = maybeToList l ++ continueWith q' > blockContinue l = l : continue > blockClose = T.empty : continueWith Nothing > spurious q = error ("at line " ++ show n ++ ": spurious " ++ show q) What do we want `relit` to do? ============================== Sadly, no, `relit` won't be able to take source code and automatically convert it to literate code. I'm not quite up to the challenge of automatically generating meaningful documentation from arbitrary code... I wish I was. What `relit` will do is read a literate file using one style of delimiters and emit the same file using an other style of delimiters. > relit :: Maybe Style -> Name -> [(Int, Text)] -> [Text] > relit ss ts = relit' ss ts Nothing Again, we will interpret the helper function `relit'` as an automaton, which remembers the current state. However, we now also need a function which can emit code blocks in a certain style. For this purpose we will define a triple of functions. > emitBirdTag :: Text -> Text > emitBirdTag l = "> " `T.append` l > > emitOpen :: Name -> Maybe Text -> [Text] > emitOpen Bird l = T.empty : map emitBirdTag (maybeToList l) > emitOpen Markdown l = backtickFence : maybeToList l > emitOpen _ l = beginCode : maybeToList l > > emitCode :: Name -> Text -> Text > emitCode Bird l = emitBirdTag l > emitCode _ l = l > > emitClose :: Name -> Text > emitClose Bird = T.empty > emitClose Markdown = backtickFence > emitClose _ = endCode Using these simple functions we can easily define the `relit'` function. > relit' :: Maybe Style -> Name -> State -> [(Int, Text)] -> [Text] > relit' _ _ _ [] = [] > relit' ss ts q ((n, l):ls) = case (q, q') of > > (Nothing , Nothing) -> l : continue > (Nothing , Just BirdTag) -> blockOpen $ Just (stripBirdTag l) > (Just BirdTag , Just BirdTag) -> blockContinue $ stripBirdTag l > (Just BirdTag , Nothing) -> blockClose > (Nothing , Just EndCode) -> spurious EndCode > (Nothing , Just o) -> blockOpen $ Nothing > (Just o , Nothing) -> blockContinue $ l > (Just o , Just c) -> if o `match` c then blockClose else spurious c > > where > q' = isDelim (allowed (fromMaybe all ss)) l > continueWith q = relit' (ss <|> infer q') ts q ls > continue = continueWith q > blockOpen l = emitOpen ts l ++ continueWith q' > blockContinue l = emitCode ts l : continue > blockClose = emitClose ts : continueWith Nothing > spurious q = error ("at line " ++ show n ++ ": spurious " ++ show q) [^fenced-code-attributes]: http://johnmacfarlane.net/pandoc/demo/example9/pandocs-markdown.html#extension-fenced_code_attributes