{-# LANGUAGE CPP #-}
{-# LANGUAGE ViewPatterns #-}
-- | Preprocessors for literate code formats
module Agda.Syntax.Parser.Literate (
   literateProcessors
  ,literateExts
  ,literateExtsShortList
  ,literateSrcFile
  ,literateTeX
  ,literateRsT
  ,literateMd
  ,illiterate
  ,atomizeLayers
  ,Processor
  ,Layers
  ,Layer(..)
  ,LayerRole(..)
  ,isCode
  ,isCodeLayer
  )
  where

import Prelude hiding (getLine)
import Data.Char (isSpace, isControl)
import Data.List (isPrefixOf)
import Agda.Syntax.Position
import Text.Regex.TDFA

#include "undefined.h"
import Agda.Utils.Impossible

-- | Role of a character in the file
data LayerRole = Markup | Comment | Code
                deriving (Show, Eq)

-- | A sequence of characters in a file playing the same role
data Layer = Layer {
  layerRole    :: LayerRole
 ,interval     :: Interval
 ,layerContent :: String
} deriving (Show)

-- | A list of contiguous layers
type Layers = [Layer]

instance HasRange Layer where
  getRange = getRange . interval

-- | Annotates a tokenized string with position information.
mkLayers :: Position -> [(LayerRole, String)] -> Layers
mkLayers pos [] = emptyLiterate pos
-- Ignore empty layers
mkLayers pos ((_,""):xs) = mkLayers pos xs
mkLayers pos ((ty,s):xs) = let next = movePosByString pos s in
                           (Layer ty (Interval pos next) s):(mkLayers next xs)

unMkLayers :: Layers -> [(LayerRole, String)]
unMkLayers = map ((,) <$> layerRole <*> layerContent)

atomizeLayers :: Layers -> [(LayerRole, Char)]
atomizeLayers = (>>= fmap <$> ((,) . fst) <*> snd) . unMkLayers

-- | Type of a literate preprocessor:
--   Invariants:
--
--   > f : Processor
--
--   prop> f pos s /= []
--
--   prop> f pos s >>= layerContent == s
type Processor = Position -> String -> [Layer]

literateSrcFile :: [Layer] -> SrcFile
literateSrcFile [] = __IMPOSSIBLE__
literateSrcFile (Layer{interval}:_) = getIntervalFile interval

-- | List of valid extensions for literate Agda files, and their corresponding
--   preprocessors.
--
--   If you add new extensions, remember to update test/Utils.hs so that test
--   cases ending in the new extensions are found.
literateProcessors :: [(String, Processor)]
literateProcessors = map ((,) <$> (".lagda" ++) . fst <*> snd)
                 [(""    , literateTeX)
                 ,(".rst", literateRsT)
                 ,(".tex", literateTeX)
                 ,(".md", literateMd)
                 ]

-- | Returns `True` if a role corresponds to Agda code
isCode :: LayerRole -> Bool
isCode Code    = True
isCode Markup  = False
isCode Comment = False

-- | Returns `True` a layer contains Agda code
isCodeLayer :: Layer -> Bool
isCodeLayer = isCode . layerRole

-- | Blanks the non-code parts of a given file, preserving positions of
--   characters corresponding to code. This way, there is a direct
--   correspondence between source positions and positions in the
--   processed result.
illiterate :: [Layer] -> String
illiterate xs = concat [
  (if isCode layerRole then id else bleach) layerContent
  | Layer{layerRole,layerContent} <- xs]

-- | Replaces non-space characters in a string with spaces.
bleach :: String -> String
bleach s = map go s
  where
    go c | isSpace c = c
    go _             = ' '

-- | Check if a character is a blank character.
isBlank :: Char -> Bool
isBlank = (&&) <$> isSpace <*> not . (== '\n')

-- | Possible extensions for a literate Agda file
literateExts :: [String]
literateExts = map fst literateProcessors

-- | Short list of extensions for literate Agda files
--   For display purposes.
literateExtsShortList :: [String]
literateExtsShortList = [".lagda"]

-- | break a list just *after* an element satisfying the predicate is found
--
--   >>> break1 even [1,3,5,2,4,7,8]
--   ([1,3,5,2],[4,7,8])
--
break1 :: (a -> Bool) -> [a] -> ([a],[a])
break1 _ []           =  ([], [])
break1 p (x:xs) | p x = (x:[],xs)
break1 p (x:xs)       = let (ys,zs) = break1 p xs in (x:ys,zs)

-- | Returns a tuple consisting of the first line of the input, and the rest
--   of the input.
getLine :: String -> (String, String)
getLine = break1 (== '\n')

-- | Canonical decomposition of an empty literate file
emptyLiterate :: Position -> [Layer]
emptyLiterate pos = [Layer (Markup) (Interval pos pos) ""]

-- | Create a regular expression that:
--   - Must match the whole string
--   - Works across line boundaries
rex :: String -> Regex
rex s = makeRegexOpts blankCompOpt{newSyntax = True} blankExecOpt$ "\\`" ++ s ++ "\\'"

-- | Preprocessor for literate TeX
literateTeX :: Position -> String -> [Layer]
literateTeX pos s = mkLayers pos$ tex s
  where
  tex :: String -> [(LayerRole, String)]
  tex [] = []
  tex s  = let (line, rest) = getLine s in
    case r_begin `matchM` line of
      Just (getAllTextSubmatches -> [_, pre, _, markup, whitespace]) ->
        (Comment, pre):(Markup, markup):(Code, whitespace):code rest
      Just _                 -> __IMPOSSIBLE__
      Nothing                -> (Comment, line):tex rest

  r_begin = rex "(([^\\%]|\\\\.)*)(\\\\begin\\{code\\}[^\n]*)(\n)?"


  code :: String -> [(LayerRole, String)]
  code [] = []
  code s = let (line, rest) = getLine s in
    case r_end `matchM` line of
      Just (getAllTextSubmatches -> [_, code, markup, post]) ->
        (Code,code):(Markup, markup):(Comment, post):tex rest
      Just _ -> __IMPOSSIBLE__
      Nothing             -> (Code, line):code rest

  r_end   = rex "([[:blank:]]*)(\\\\end\\{code\\})(.*)"

-- | Preprocessor for Markdown
literateMd :: Position -> String -> [Layer]
literateMd pos s = mkLayers pos$ md s
  where
  md :: String -> [(LayerRole, String)]
  md [] = []
  md s  = let (line, rest) = getLine s in
    case md_begin `matchM` line of
      Just (getAllTextSubmatches -> [_, pre, markup, _]) ->
        (Comment, pre):(Markup, markup):code rest
      Just _                 -> __IMPOSSIBLE__
      Nothing                ->
        (Comment, line):
          if md_begin_other `match` line
            then code_other rest
            else md rest

  md_begin = rex "(.*)([[:space:]]*```(agda)?[[:space:]]*)"
  md_begin_other = rex "[[:space:]]*```[a-zA-Z0-9-]*[[:space:]]*"

  code :: String -> [(LayerRole, String)]
  code [] = []
  code s = let (line, rest) = getLine s in
    case md_end `matchM` line of
      Just (getAllTextSubmatches -> [_, markup]) ->
        (Markup, markup):md rest
      Just _ -> __IMPOSSIBLE__
      Nothing             -> (Code, line):code rest

  -- A non-Agda code block.
  code_other :: String -> [(LayerRole, String)]
  code_other [] = []
  code_other s = let (line, rest) = getLine s in
    (Comment, line):
      if md_end `match` line
        then md rest
        else code_other rest

  md_end   = rex "([[:space:]]*```[[:space:]]*)"

-- | Preprocessor for reStructuredText
literateRsT :: Position -> String -> [Layer]
literateRsT pos s = mkLayers pos$ rst s
  where
  rst :: String -> [(LayerRole, String)]
  rst [] = []
  rst s  = maybe_code s

  maybe_code s =
    if r_comment `match` line then
      not_code
    else case r_code `match` line of
      []      -> not_code
      [[_, before, "::", after]] ->
        -- Code starts
          if null before || isBlank (last before) then
             (Markup,  line):code rest
          else
             (Comment, before ++ ":"):(Markup, ':':after):code rest

      _ -> __IMPOSSIBLE__
    where
      (line, rest) = getLine s
      not_code = (Comment, line):rst rest


  -- | Finds the next indented block in the input
  code :: String -> [(LayerRole, String)]
  code [] = []
  code s = let (line, rest) = getLine s in
    if all isSpace line then
      (Markup, line):(code rest)
    else
      let (xs,ys) = span isBlank line in
      case xs of
        [] -> maybe_code s
        _  -> (Code, line):
              (indented xs rest)

  -- | Process an indented block
  indented :: String -> String -> [(LayerRole, String)]
  indented _   [] = []
  indented ind s  = let (line, rest) = getLine s in
      if all isSpace line then
        (Code, line):(indented ind rest)
      else if ind `isPrefixOf` line then
        (Code, line):(indented ind rest)
      else
        maybe_code s

  -- | Beginning of a code block
  r_code = rex "(.*)(::)([[:space:]]*)"

  -- | Beginning of a comment block
  r_comment = rex "[[:space:]]*\\.\\.([[:space:]].*)?"