||| A simple module to process 'literate' documents.
|||
||| The module uses a lexer to split the document into code blocks,
||| delineated by user-defined markers, and code lines that are
||| indicated be a line marker. The lexer returns a document stripped
||| of non-code elements but preserving the original document's line
||| count. Column numbering of code lines are not preserved.
|||
||| The underlying tokeniser is greedy.
|||
||| Once it identifies a line marker it reads a prettifying space then
||| consumes until the end of line. Once identifies a starting code
||| block marker, the lexer will consume input until the next
||| identifiable end block is encountered. Any other content is
||| treated as part of the original document.
|||
||| Thus, the input literate files *must* be well-formed w.r.t
||| to code line markers and code blocks.
|||
||| A further restriction is that literate documents cannot contain
||| the markers within the document's main text: This will confuse the
||| lexer.
|||
module Text.Literate

import Text.Lexer

import Data.List.Views

%default total

untilEOL : Recognise False
untilEOL = manyUntil (is '\n') any

line : String -> Lexer
line s = exact s <+> space <+> untilEOL

block : String -> String -> Lexer
block s e = exact s <+> manyUntil (exact e) any 

data Token = CodeBlock String String String
           | Any String
           | CodeLine String String

Show Token where
  showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x
  showPrec d (Any x)           = showCon d "Any" $ showArg x
  showPrec d (CodeLine m x)    = showCon d "CodeLine" $ showArg m ++ showArg x

rawTokens : (delims  : List (String, String))
         -> (markers : List String)
         -> TokenMap (Token)
rawTokens delims ls =
          map (\(l,r) => (block l r, CodeBlock l r)) delims
       ++ map (\m => (line m, CodeLine m)) ls
       ++ [(any, Any)]

||| Merge the tokens into a single source file.
reduce : List (TokenData Token) -> String -> String
reduce [] acc = acc
reduce (MkToken _ _ (Any x) :: rest) acc = reduce rest (acc ++ blank_content)
  where
    -- Preserve the original document's line count.
    blank_content : String
    blank_content = if elem '\n' (unpack x)
      then concat $ replicate (length (lines x)) "\n"
      else ""
reduce (MkToken _ _ (CodeLine m src) :: rest) acc =
    reduce rest (acc ++ (substr
                           (length m + 1) -- remove space to right of marker.
                           (length src)
                           src))

reduce (MkToken _ _ (CodeBlock l r src) :: rest) acc with (lines src) -- Strip the deliminators surrounding the block.
  reduce (MkToken _ _ (CodeBlock l r src) :: rest) acc | [] = reduce rest acc -- 1
  reduce (MkToken _ _ (CodeBlock l r src) :: rest) acc | (s :: ys) with (snocList ys)
    reduce (MkToken _ _ (CodeBlock l r src) :: rest) acc | (s :: []) | Empty = reduce rest acc -- 2
    reduce (MkToken _ _ (CodeBlock l r src) :: rest) acc | (s :: (srcs ++ [f])) | (Snoc rec) =
        reduce rest (acc ++ "\n" ++ unlines srcs ++ "\n")

-- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators.


public export
record LiterateError where
  constructor MkLitErr
  line   : Int
  column : Int
  input  : String

||| Description of literate styles.
|||
||| A 'literate' style comprises of
|||
||| + a list of code block deliminators (`deliminators`);
||| + a list of code line markers (`line_markers`); and
||| + a list of known file extensions `file_extensions`.
|||
||| Some example specifications:
|||
||| + Bird Style
|||
|||```
|||MkLitStyle Nil [">", "<"] Nil
|||```
|||
||| + Literate Haskell (for LaTeX)
|||
|||```
|||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]
|||           Nil
|||           ["lhs", "tex"]
|||```
|||
||| + OrgMode
|||
|||```
|||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]
|||           ["#+IDRIS:"]
|||           ["org"]
|||```
|||
public export
record LiterateStyle where
  constructor MkLitStyle
  ||| The pairs of start and end tags for code blocks.
  deliminators : List (String, String)

  ||| Line markers that indicate a line contains code.
  line_markers : List String

  ||| Recognised file extensions. Not used by the module, but will be
  ||| of use when connecting to code that reads in the original source
  ||| files.
  file_extensions : List String

||| Given a 'literate specification' extract the code from the
||| literate source file (`litStr`) that follows the presented style.
|||
||| @specification The literate specification to use.
||| @litStr  The literate source file.
|||
||| Returns a `LiterateError` if the literate file contains malformed
||| code blocks or code lines.
|||
export
extractCode : (specification : LiterateStyle)
           -> (litStr        : String)
           -> Either LiterateError String
extractCode (MkLitStyle delims markers exts) str =
      case lex (rawTokens delims markers) str of
        (toks, (_,_,"")) => Right $ reduce toks ""
        (_, (l,c,i))     => Left (MkLitErr l c i)

-- --------------------------------------------------------------------- [ EOF ]