{- |
Module      :  $Header$
Description :  scan tokens of Haskell sources
Copyright   :  (c) C. Maeder 2010
License     :  BSD

Maintainer  :  chr.maeder@web.de
Stability   :  provisional
Portability :  portable

using parsec as scanner to tokenize haskell sources
and check the spacing

-}

module Language.Haskell.Scanner
  ( splitLines
  , showScan
  , processScan
  , scan
  , PosTok
  ) where

import Control.Monad

import Data.Char
import Data.List

import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos

-- * monad shortcuts

infixl 1 <<

(<<) :: Monad m => m a -> m b -> m a
(<<) = liftM2 const

infixr 5 <:>

(<:>) :: Monad m => m a -> m [a] -> m [a]
(<:>) = liftM2 (:)

infixr 5 <++>

(<++>) :: Monad m => m [a] -> m [a] -> m [a]
(<++>) = liftM2 (++)

single :: Monad m => m a -> m [a]
single = liftM return

flat :: Monad m => m [[a]] -> m [a]
flat = liftM concat

enclosedBy :: Monad m => m [a] -> m a -> m [a]
enclosedBy p q = q <:> p <++> single q

-- * parsec shortcuts

-- | parse an optional list
optionL :: GenParser tok st [a] -> GenParser tok st [a]
optionL = option []

-- | shortcut for @try . string@
tryString :: String -> CharParser st String
tryString = try . string

-- | nested comments, open and closing strings must have at least two chars
nestedComment :: String -> String -> CharParser st String
nestedComment op cl = case (op, cl) of
  (oh : ot : _, ch : ct : _) ->
    tryString op <++>
    flat (many $ single
          (noneOf [oh, ch]
           <|> try (char ch << notFollowedBy (char ct))
           <|> try (char oh << notFollowedBy (char ot)) <?> "")
          <|> nestedComment op cl)
    <++> string cl
  _ -> error "nestedComment"

-- * haskell tokens

-- | the haskell nested comments
nestComment :: CharParser st String
nestComment = nestedComment "{-" "-}"

-- | line comment without final newline
lineComment :: CharParser st String
lineComment =
  try ((char '-' <:> many1 (char '-'))
       << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~"))
  <++> many (noneOf "\n")

-- | text in double quotes
stringLit :: CharParser st String
stringLit = enclosedBy (flat $ many $ single (noneOf "\\\"")
                        <|> char '\\' <:> single anyChar) $ char '\"'

-- | text in single quotes
charLit :: CharParser st String
charLit = tryString "'''" <|>
          enclosedBy (flat $ many $ single (noneOf "\\\'")
                      <|> char '\\' <:> single anyChar)
          (char '\'')

-- | a precise number scanner
number :: Parser String
number =
    try (char '0' <:> single (oneOf "oO")) <++> many octDigit
  <|>
    try (char '0' <:> single (oneOf "xX")) <++> many hexDigit
  <|> many1 digit
          <++> optionL (try (char '.' <:> single digit) <++> many digit)
          <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-")
                             <++> single digit) <++> many digit)

-- | any character within identifiers
hChar :: Parser Char
hChar = alphaNum <|> oneOf "_'"

-- | the underscore parser
uL :: Parser Char
uL = char '_'

-- | lower case identifiers (aka variables)
lId :: Parser String
lId = (uL <|> lower) <:> many hChar

-- | upper case identifiers (aka constructors)
uId :: Parser String
uId = upper <:> many hChar

-- | any character within operators
opSym :: Parser Char
opSym = oneOf "!#$%&*+-./:<=>?@\\^|~"

-- | any operator (mainly infixes)
operator :: Parser String
operator = many1 opSym

-- | possible qualified entities: lower or upper case words or symbols
data QualElem = Var | Cons | Sym deriving Eq

-- | a name qualified or not with its representation
data QualName = Name Bool QualElem String

-- | the show instance renders the original string
instance Show QualName where
  show (Name _ _ s) = s

-- | any qualified or unqualified name or operator
qId :: Parser QualName
qId = fmap (Name False Var) lId
  <|> fmap (Name False Sym) operator
  <|> do
    n <- uId
    option (Name False Cons n) $ do
      d <- try (char '.' << lookAhead (uL <|> letter <|> opSym))
      Name _ k r <- qId
      return $ Name True k $ n ++ d : r

-- | parse any 'qId' within back ticks. This is more liberal than haskell!
infixOp :: Parser String
infixOp = enclosedBy (fmap show qId) $ char '`'

-- | the separators are comma, semicolon and various kinds of parens
seps :: String
seps = "[({,;})]\n"

-- | beside names and separators we have a couple of more token kinds
data TokenKind = LineComment | BlockComment | Literal | Infix

-- | the data type for tokens
data Token
  = QualName QualName
  | Sep Char
  | Token TokenKind String
  | Start -- ^ the void token at the very beginning

-- | the show instance renders the original string without kind information
instance Show Token where
  show t = case t of
    QualName q -> show q
    Sep c -> [c]
    Token _ s -> s
    Start -> ""

-- | all white spaces except newline
isWhite :: Char -> Bool
isWhite c = isSpace c && c /= '\n'

-- | parsing as many 'isWhite' characters as possible (or none)
white :: Parser String
white = many $ satisfy isWhite

-- | parse any non-white token
tok :: Parser Token
tok = fmap (Token BlockComment) nestComment
  <|> fmap (Token LineComment) lineComment
  <|> fmap QualName qId
  <|> fmap Sep (oneOf seps)
  <|> fmap (Token Literal) (charLit <|> stringLit <|> number)
  <|> fmap (Token Infix) infixOp

-- | tokens enriched by positions and following white space
data PosTok = PosTok SourcePos Token String

instance Show PosTok where
  show (PosTok _ t s) = shows t s

-- | attach positions and subsequent 'white' spaces to tokens
posTok :: Parser PosTok
posTok = do
  p <- getPosition
  t <- tok
  w <- white
  return $ PosTok p t w

-- | the start position (without file name)
startPos :: SourcePos
startPos = initialPos ""

-- | the initial 'white' stuff
startTok :: Parser PosTok
startTok = fmap (PosTok startPos Start) white

-- | the final scanner
scan :: Parser [PosTok]
scan = startTok <:> many posTok << eof

-- * splitting lines

{- | a generic splitting function that keeps the separator as first element
except in the first list. -}
splitBy :: (a -> Bool) -> [a] -> [[a]]
splitBy p l = let (fr, rt) = break p l in fr : case rt of
  [] -> []
  d : tl -> let hd : tll = splitBy p tl in (d : hd) : tll

-- | split lines at newline tokens
splitLines :: [PosTok] -> [[PosTok]]
splitLines = splitBy isIndent

-- | removing more than two consecutive lists fulfilling the predicate
removeBlankLines :: Int -> ([a] -> Bool) -> [[a]] -> [[a]]
removeBlankLines c p l = case l of
  [] -> []
  x : r ->
    if p x
    then if c > 1
         then removeBlankLines c p r
         else x : removeBlankLines (c + 1) p r
    else x : removeBlankLines 0 p r

-- * message data type

-- | messages with positions
data Diag = Diag SourcePos String

instance Show Diag where
  show (Diag p s) = show p ++ ' ' : s

-- * checking tokens

-- | only show the token without spaces
showTok :: PosTok -> String
showTok (PosTok _ t _) = show t

isInfixOp :: PosTok -> Bool
isInfixOp (PosTok _ t _) = case t of
  QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~"
  Token Infix _ -> True
  _ -> False

isComment :: PosTok -> Bool
isComment (PosTok _ t _) = case t of
  Token k _ -> case k of
    LineComment -> True
    BlockComment -> True
    _ -> False
  _ -> False

noLineComment :: PosTok -> Bool
noLineComment (PosTok _ t _) = case t of
  Token LineComment _ -> False
  _ -> True

isSepIn :: String -> PosTok -> Bool
isSepIn cs (PosTok _ t _) = case t of
  Sep c -> elem c cs
  _ -> False

isIndent :: PosTok -> Bool
isIndent = isSepIn "\n"

isOpPar :: PosTok -> Bool
isOpPar = isSepIn "[({"

isClPar :: PosTok -> Bool
isClPar = isSepIn "})]"

isOpParOrInfix :: PosTok -> Bool
isOpParOrInfix t = isOpPar t || isInfixOp t

isClParOrInfix :: PosTok -> Bool
isClParOrInfix t = isClPar t || isInfixOp t

isNonPar :: PosTok -> Bool
isNonPar = isSepIn ",;"

isFstInfixMinusArg :: PosTok -> Bool
isFstInfixMinusArg t@(PosTok _ u _) = case u of
  QualName (Name _ k _) -> k /= Sym
  Sep _ -> isClPar t
  Token k _ -> case k of
    Literal -> True
    Infix -> True
    _ -> False
  Start -> False

noSpaceNeededBefore :: PosTok -> Bool
noSpaceNeededBefore t =
  isSepIn ",;})]" t || showTok t == "@"

noSpaceNeededAfter :: PosTok -> Bool
noSpaceNeededAfter t@(PosTok _ u _) =
  isOpPar t || elem (show u) (map (: []) "!#-@~")

-- * adjusting tokens

-- | replace all white spaces by blanks
untabify :: SourcePos -> String -> String
untabify p s =
  let p2 = updatePosString p s
      bs = sourceColumn p2 - sourceColumn p
  in replicate bs ' '

untab :: PosTok -> PosTok
untab (PosTok p t w) = PosTok p t $ untabify (updatePosString p $ show t) w

-- | remove trailing spaces
rmSp :: PosTok -> PosTok
rmSp (PosTok p t _) = PosTok p t ""

-- | append exactly one blank
blank :: PosTok -> PosTok
blank (PosTok p t _) = PosTok p t " "

multipleBlanks :: PosTok -> [Diag]
multipleBlanks (PosTok p t w) = let n = length w in
  [ Diag (updatePosString p $ show t) $ "multiple (" ++ show n ++ ") blanks"
  | n > 1 ]

-- | tidy up line comments
adjustLineComment :: String -> String
adjustLineComment = ("--" ++)
  . reverse . dropWhile isSpace . reverse
  . (' ' :) . dropWhile isSpace . drop 2 -- cut of initial line comment marker

-- | check if prefix extended by one char from next string is still a prefix
hasLongerPrefix :: String -> String -> String -> Bool
hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in
  any (`isPrefixOf` s) ps

-- | insert a single blank after prefix
adjustPrefix :: String -> String -> String -> String
adjustPrefix p n s =
   if hasLongerPrefix p n s then s else case stripPrefix p s of
     Nothing -> s
     Just r -> p ++ case dropWhile isWhite r of
         rt@('\n' : _) -> case s of
           '}' : _ -> r
           _ -> rt
         rt -> ' ' : rt

-- | adjust comment at both ends. Input is prefix, suffix and extension chars.
adjustBothEnds :: String -> String -> String -> String -> String
adjustBothEnds p q n s =
    if hasLongerPrefix p n s then s else
    concatMap (reverse . dropWhile isWhite . reverse)
    $ removeBlankLines 0 (all isSpace)
    $ splitBy (== '\n')
    $ reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s

-- | adjust a block comment
adjustComment :: String -> String
adjustComment = adjustBothEnds "{-" "-}" "!#"
    . adjustBothEnds "{-!" "!-}" ""
    . adjustBothEnds "{-#" "#-}" ""

-- * analyse and adjust lines

-- | utility to combine results
(<+>) :: (PosTok, [Diag]) -> ([PosTok], [Diag]) -> ([PosTok], [Diag])
(t, ds) <+> (ts, es) = (t : ts, ds ++ es)

infixr 4 <+>

anaPosTok :: PosTok -> (PosTok, [Diag])
anaPosTok t@(PosTok p u w) = case u of
  Token BlockComment s -> let
    n = adjustComment s
    s5 = take 5 s
    nr = take 5 $ reverse n
    sr = take 5 $ reverse s
    in (PosTok p (Token BlockComment n) w,
       [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ]
       ++ [ Diag (updatePosString p s) $ "non-conventional comment end: "
            ++ reverse sr | sr /= nr ])
  _ | show u == ";" -> (t, [Diag p "use layout instead of ;"])
  _ -> (t, [])

anaLine :: [PosTok] -> ([PosTok], [Diag])
anaLine l = case l of
  [] -> ([], [])
  t0 : r1 -> let (t1@(PosTok p1 u1 w1), cs) = anaPosTok t0 in case r1 of
    [] -> case u1 of
      Token LineComment s -> let
        n = adjustLineComment s
        s4 = take 4 s
        in ([PosTok p1 (Token LineComment n) ""], cs ++
           [Diag p1 $ "leave a single blank after line comment sign: " ++ s4
           | s4 /= take 4 n])
      _ -> ([rmSp t1], cs ++ [Diag p1 "trailing white space" | not (null w1)])
    t2@(PosTok p2 u2 w2) : r2 -> case u1 of
      Start -> (rmSp t1, cs ++ [Diag p1 "leading white space" | not (null w1)])
        <+> anaLine r1
      _ | isIndent t1 -> let (ft, rt) = span (== ' ') w1 in
           (untab t1, cs ++
           [ Diag (updatePosString p1 ft) "use only blanks for indentation"
           | not (null rt) ]) <+> anaLine r1
      _ -> let
        s1 = show u1
        s2 = show u2
        n1 = length s1
        n2 = length s2
        lt = n1 <= n2
        both = s1 ++ s2
        after = case () of
          _ | isNonPar t1 -> True
            | isOpPar t2 -> False
            | s1 == "\\" -> True
            | isInfixOp t1 -> if isInfixOp t2 then lt else True
            | isInfixOp t2 || s2 == ".." -> False
          _ -> lt
        parsOfBoth = filter (`elem` "[({})]") both
        pos = case both of
             _ : _ : _ -> "between"
             _ | isOpPar t1 -> "after"
               | isClPar t2 -> "before"
             _ -> "here"
        omitSpace = isOpParOrInfix t1 && isClParOrInfix t2
          && not (isInfixOp t1 && isInfixOp t2)
          && (s1 /= ".." || s2 /= "]")
        addSpace = not (noSpaceNeededAfter t1)
          && not (noSpaceNeededBefore t2)
        (newT1, ds) =
          if null w1
          then if addSpace
            then (blank t1, Diag p2
                  ("leave space " ++
                   if after then "after " ++ s1 else "before " ++ s2)
               : [ Diag p1 "but may be template haskell splice $("
                 | both == "$(" ])
            else (t1, [])
          else if omitSpace
            then (rmSp t1,
               [Diag p2 $ " no space needed " ++ pos ++ " " ++ parsOfBoth])
            else if isComment t2
              then (untab t1, [])
              else (blank t1, multipleBlanks t1)
        pt0 = (newT1, cs ++ ds)
        in case r2 of
          [] -> pt0 <+> anaLine r1
          t3@(PosTok _ u3 _) : _ -> let
            s3 = show u3
            ms = [Diag p2 "put spaces around infix -" | null w1 || null w2 ]
                 ++ multipleBlanks t1 ++ multipleBlanks t2
            in if s2 == "-" && not (noSpaceNeededBefore t3)
                  && isFstInfixMinusArg t1
               then (blank t1, cs ++ ms) <+> anaLine (blank t2 : r2)
               else (if elem s2 ["do", "of"] && s3 /= "{" && noLineComment t3
                 then (newT1,
                   cs ++ Diag p2 ("break line after " ++ show s2) : ds)
                 else pt0) <+> anaLine r1

-- * ensure final newline

isBlankLine :: [PosTok] -> Bool
isBlankLine x = case x of
  [PosTok _ Start _] -> True
  [t] -> isIndent t
  _ -> False

removeFinalBlankLines :: [[PosTok]] -> [[PosTok]]
removeFinalBlankLines ll = reverse $ [PosTok startPos (Sep '\n') ""]
  : dropWhile isBlankLine (reverse ll)

-- * main functions

-- | create adjusted source file
processScan :: [[PosTok]] -> String
processScan = concatMap (concatMap show . fst . anaLine)
  . removeBlankLines 1 isBlankLine . removeFinalBlankLines

-- | list all diagnostics
showScan :: [[PosTok]] -> String
showScan = intercalate "\n" . concatMap (map show . snd . anaLine)