{- |
Module      :  $Header$
Description :  scan tokens of Haskell sources
Copyright   :  (c) C. Maeder 2010
License     :  BSD

Maintainer  :  chr.maeder@web.de
Stability   :  provisional
Portability :  portable

using parsec as scanner to tokenize haskell sources
and check the spacing

-}

module Language.Haskell.Scanner
  ( scan
  , Opts (..)
  , defaultOpts
  , anaPosToks
  , PosTok (..)
  , Token
  , showPosTok
  , Diag (..)
  , showDiag
  , showSourcePos
  ) where

import Control.Monad

import Data.Char
import Data.List

import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos

-- * monad shortcuts

infixl 1 <<

(<<) :: Monad m => m a -> m b -> m a
(<<) = liftM2 const

infixr 5 <:>

(<:>) :: Monad m => m a -> m [a] -> m [a]
(<:>) = liftM2 (:)

infixr 5 <++>

(<++>) :: Monad m => m [a] -> m [a] -> m [a]
(<++>) = liftM2 (++)

single :: Monad m => m a -> m [a]
single = liftM return

flat :: Monad m => m [[a]] -> m [a]
flat = liftM concat

enclosedBy :: Monad m => m [a] -> m a -> m [a]
enclosedBy p q = q <:> p <++> single q

-- | a literal enclosed in quotes and a backslash as escape character
quotedLit :: Char -> CharParser st String
quotedLit q = enclosedBy (flat $ many $ single (noneOf ['\\', q])
                        <|> char '\\' <:> single anyChar) $ char q

-- | text in double quotes
stringLit :: CharParser st String
stringLit = quotedLit '"'

-- * parsec shortcuts

-- | parse an optional list
optionL :: GenParser tok st [a] -> GenParser tok st [a]
optionL = option []

-- | shortcut for @try . string@
tryString :: String -> CharParser st String
tryString = try . string

-- | nested comments, open and closing strings must have at least one char
nestedComment :: String -> String -> CharParser st String
nestedComment op cl =
    let inComment = tryString cl
           <|> (nestedComment op cl <|> single anyChar) <++> inComment
    in tryString op <++> inComment

-- * haskell tokens

-- | the haskell nested comments
nestComment :: CharParser st String
nestComment = nestedComment "{-" "-}"

-- | line comment without final newline
lineComment :: CharParser st String
lineComment =
  try ((char '-' <:> many1 (char '-'))
       << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~"))
  <++> many (noneOf "\n")

-- | text in single quotes
charLit :: CharParser st String
charLit = tryString "'''" <|> quotedLit '\''

-- | a precise number scanner
number :: Parser String
number =
    try (char '0' <:> single (oneOf "oO")) <++> many octDigit
  <|>
    try (char '0' <:> single (oneOf "xX")) <++> many hexDigit
  <|> many1 digit
          <++> optionL (try (char '.' <:> single digit) <++> many digit)
          <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-")
                             <++> single digit) <++> many digit)

-- | any character within identifiers
hChar :: Parser Char
hChar = alphaNum <|> oneOf "_'"

-- | lower case identifiers (aka variables)
lId :: Parser String
lId = (char '_' <|> lower) <:> many hChar

-- | upper case identifiers (aka constructors)
uId :: Parser String
uId = upper <:> many hChar

-- | any character within operators
opSym :: Char -> Bool
opSym c = elem c "!#$%&*+-./:<=>?@\\^|~"
  || not (isAscii c) && (isSymbol c || isPunctuation c)

-- | any operator (mainly infixes)
operator :: Parser String
operator = many1 $ satisfy opSym

-- | possible qualified entities: lower or upper case words or symbols
data QualElem = Var | Cons | Sym

-- | a name qualified or not with its representation
data QualName = Name Bool QualElem String

-- | the original string
showQualName :: QualName -> String
showQualName (Name _ _ s) = s

isLetterOrUnderscore :: Char -> Bool
isLetterOrUnderscore c = c == '_' || isLetter c

-- | the possible characters starting a name
qIdStart :: Parser Char
qIdStart = satisfy $ \ c -> isLetterOrUnderscore c || opSym c

-- | any quoted, qualified, or unqualified name or operator
qId :: Parser QualName
qId = fmap (Name False Var) lId
  <|> fmap (Name False Sym) operator
  <|> do
    n <- uId
    option (Name False Cons n) $ do
      d <- try (char '.' << lookAhead qIdStart)
      Name _ k r <- qId
      return $ Name True k $ n ++ d : r
  <|> do
    let noCh = notFollowedBy (char '\'')
    s <- try (string "'" << lookAhead (satisfy isLetterOrUnderscore >> noCh))
      <|> try (string "''" << noCh)
    Name b k r <- qId
    return $ Name b k $ s ++ r

-- | parse any 'qId' within back ticks. This is more liberal than haskell!
infixOp :: Parser String
infixOp = enclosedBy (fmap showQualName qId) $ char '`'

-- | the separators are comma, semicolon and three kinds of parens
seps :: String
seps = "[({,;})]"

-- | beside names and separators we have a couple of more token kinds
data TokenKind =
    LineComment  -- ^ a line comment
  | BlockComment -- ^ single- or multiline (nested) block comment
  | Literal      -- ^ number, character, or string
  | Infix        -- ^ something in back quotes
  | Sep          -- ^ a separator
  | Indent       -- ^ a newline or nothing at the very beginning
  | QuasiQuote   -- ^ a quasi-quote
  | ThQuote      -- ^ a possible template haskell quotation

-- | the data type for tokens
data Token
  = QualName QualName
  | Token TokenKind String

-- | renders the original string without kind information
showToken :: Token -> String
showToken t = case t of
    QualName q -> showQualName q
    Token _ s -> s

-- | all white spaces except newline
isWhite :: Char -> Bool
isWhite c = isSpace c && c /= '\n'

-- | parsing as many 'isWhite' characters as possible (or none)
white :: Parser String
white = many $ satisfy isWhite

-- | a literal followed by up to two (one for 'charLit') optional magic hashes
literal :: Parser String
literal = let optH = optionL (string "#") in
  charLit <++> optH
  <|> (stringLit <|> number) <++> optionL (char '#' <:> optH)

-- | the template haskell quasi-quotes
quasiQuote :: Parser Token
quasiQuote = try $ do
  q <- char '[' >> lId << char '|'
  let start = '[' : q ++ "|"
      -- check for plain TH quotation
  if elem q $ single "dept" then return $ Token ThQuote start else
    fmap (Token QuasiQuote . (start ++) . (++ "|]"))
             $ manyTill anyChar $ tryString "|]"

-- | parse any non-white token
tok :: Parser Token
tok = fmap (Token BlockComment) nestComment
  <|> fmap (Token LineComment) lineComment
  <|> quasiQuote
  <|> fmap QualName qId
  <|> fmap (Token Sep) (single $ oneOf seps)
  <|> fmap (Token Indent) (string "\n")
  <|> fmap (Token Literal) literal
  <|> fmap (Token Infix) infixOp

-- | tokens enriched by positions and following white space
data PosTok = PosTok SourcePos Token String

showPosTok :: PosTok -> String
showPosTok (PosTok _ t s) = showToken t ++ s

-- | attach positions and subsequent 'white' spaces to tokens
posTok :: Parser PosTok
posTok = do
  p <- getPosition
  t <- tok
  w <- white
  return $ PosTok p t w

-- | the start position (without file name)
startPos :: SourcePos
startPos = initialPos ""

-- | the initial 'white' stuff
startTok :: Parser PosTok
startTok = fmap (PosTok startPos $ Token Indent "") white

-- | the final scanner
scan :: Parser [PosTok]
scan = startTok <:> many posTok << eof

-- * message data type

-- | messages with positions
data Diag = Diag SourcePos String

-- | show parsec positions like hlint does
showSourcePos :: SourcePos -> String
showSourcePos p = sourceName p ++ ":" ++ shows (sourceLine p) ":"
  ++ shows (sourceColumn p) ":"

-- | show the message with its position
showDiag :: Diag -> String
showDiag (Diag p s) = showSourcePos p ++ ' ' : s

-- * checking tokens

isInfixOp :: Token -> Bool
isInfixOp t = case t of
  QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~"
  Token Infix _ -> True
  _ -> False

isComment :: Token -> Bool
isComment t = case t of
  Token k _ -> case k of
    LineComment -> True
    BlockComment -> True
    _ -> False
  _ -> False

noLineComment :: Token -> Bool
noLineComment t = case t of
  Token LineComment _ -> False
  _ -> True

isSepIn :: String -> Token -> Bool
isSepIn cs t = case t of
  Token Sep [c] -> elem c cs
  _ -> False

isIndent :: Token -> Bool
isIndent t = case t of
  Token Indent _ -> True
  _ -> False

isOpPar :: Token -> Bool
isOpPar = isSepIn "[({"

isClPar :: Token -> Bool
isClPar = isSepIn "})]"

isFstInfixMinusArg :: Token -> Bool
isFstInfixMinusArg t = case t of
  QualName (Name _ k _) -> case k of
    Sym -> False
    _ -> True
  Token k _ -> case k of
    Literal -> True
    Sep -> isClPar t
    _ -> False

isSymb :: [String] -> Token -> Bool
isSymb s t = case t of
  QualName (Name False Sym r) -> elem r s
  _ -> False

isKeyw :: [String] -> Token -> Bool
isKeyw s t = case t of
  QualName (Name False Var r) -> elem r s
  _ -> False

spaceNeededBefore :: Token -> Bool
spaceNeededBefore t =
  not $ isIndent t || isSymb ["@"] t || isSepIn ",;})]" t

-- * adjusting tokens

-- | remove trailing spaces
rmSp :: PosTok -> PosTok
rmSp (PosTok p t _) = PosTok p t ""

topKeys :: [String]
topKeys = ["import", "data", "newtype", "class", "instance"]
-- "module" may occur in export lists, "type" within type families!

layoutKeys :: [String]
layoutKeys = ["do", "of", "where"]

-- | ensure proper amount of space and diagnose too many blanks
multipleBlanks :: Opts -> Token -> PosTok -> (PosTok, [Diag])
multipleBlanks opts u pTok@(PosTok p t w) =
  let s = showToken t
      q = updatePosString p s
      aft = noMultBlanksAfter opts t
      m = maxBlanks opts
      n = length w
      diag = Diag q $ "up to column " ++ show (sourceColumn q + n)
       ++ " multiple (" ++ show n ++ ") blanks"
       ++ if aft then " after " ++ show s else ""
      bTok = PosTok p t " "
  in case w of
    "" -> (bTok, []) -- insert a missing blank (without diag)
    _ | allowMultBlanksBefore opts u
      -> (pTok, []) -- no change
    _ | noMultBlanksAfter opts t
      -> (bTok, [diag | n > 1])
    _ -> (PosTok p t $ take m w, [diag | n > m])

-- | cut of initial line comment marker and spaces
trim2 :: String -> String
trim2 = dropWhile isSpace . drop 2

-- | add two dashes to a string plus a space for a non-emtpy string
addLineCommentMark :: String -> String
addLineCommentMark r = "--" ++ if null r then r else ' ' : r

-- | tidy up line comments
adjustLineComment :: String -> String
adjustLineComment = addLineCommentMark . trim2

-- | check if prefix extended by one char from next string is still a prefix
hasLongerPrefix :: String -> String -> String -> Bool
hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in
  any (`isPrefixOf` s) ps

-- | insert a single blank after prefix
adjustPrefix :: String -> String -> String -> String
adjustPrefix p n s =
   if hasLongerPrefix p n s then s else case stripPrefix p s of
     Nothing -> s
     Just r -> p ++ case dropWhile isWhite r of
         rt@('\n' : _) -> case s of
           '}' : _ -> r
           _ -> rt
         rt -> ' ' : rt

-- | adjust comment at both ends. Input is prefix, suffix and extension chars.
adjustBothEnds :: String -> String -> String -> String -> String
adjustBothEnds p q n s =
    if hasLongerPrefix p n s then s else
    reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s

-- | adjust a block comment
adjustComment :: String -> String
adjustComment = adjustBothEnds "{-" "-}" "!#"
    . adjustBothEnds "{-!" "!-}" ""
    . adjustBothEnds "{-#" "#-}" ""

-- * analyse and adjust lines

anaPosTok :: Opts -> PosTok -> (PosTok, [Diag])
anaPosTok opts t@(PosTok p u w) =
  let cC = checkComments opts in case u of
  Token BlockComment s | cC -> let
    n = adjustComment s
    s5 = take 5 s
    nr = take 5 $ reverse n
    sr = take 5 $ reverse s
    in (PosTok p (Token BlockComment n) w,
       [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ]
       ++ [ Diag p "avoid nested comments" | isInfixOf "{-" $ drop 2 s ]
       ++ [ Diag (updatePosString p s) $ "non-conventional comment end: "
            ++ reverse sr | sr /= nr ])
  Token LineComment s | cC -> let
    n = adjustLineComment s
    s4 = take 4 s
    sr = takeWhile isPrint $ dropWhile isSpace $ drop 2 s4
    q = incSourceColumn p 2
    in (PosTok p (Token LineComment n) "",
        [ Diag q $ "put " ++ (if null sr then "only " else "")
          ++ "one blank after --"
          ++ if null sr then "" else " in --" ++ sr
        | s4 /= take 4 n ]
        ++ [ Diag q "missing comment after --" | n == "--" ]
        ++ [ Diag q "line comment contains block comment start"
             | isInfixOf "{-" n ]
        ++ [ Diag q "line comment contains block comment end"
             | isInfixOf "-}" n ])
  Token ThQuote s | not $ tempHask opts -> let
    (f, r) = splitAt 2 s
    in (PosTok p (Token ThQuote $ f ++ " |") w,
          [ Diag (updatePosString p f) $ "put blank between " ++ f ++ " and |"
          | r == "|" ])
  _ -> (t, [ Diag p "use layout instead of ;" | isSepIn ";" u ])

-- | a comment string that is no pragma or directive
isNormalCommentString :: String -> Bool
isNormalCommentString s = case drop 2 s of
  c : _ -> notElem c "!#"
  _ -> True

-- | a comment that is no pragma or directive
isNormalComment :: Token -> Bool
isNormalComment u =
  isComment u && isNormalCommentString (showToken u)

-- | check if no other normal comment follows
noOtherComment :: [PosTok] -> Bool
noOtherComment ps = case ps of
  [] -> True
  PosTok _ u _ : _ -> not $ isNormalComment u

-- | the data type for options passed to the analysis
data Opts = Opts
  { tempHask :: Bool
  , windowsOutput :: Bool
  , lineLength :: Int
  , maxBlankLines :: Int
  , maxBlanks :: Int
  , allowMultBlanksBefore :: Token -> Bool
  , noMultBlanksAfter :: Token -> Bool
  , makeLineComments :: Bool
  , joinComments :: Bool
  , checkComments :: Bool
  , commentGap :: Int
  , checkSpacing :: Bool
  }

-- | the default options to use
defaultOpts :: Opts
defaultOpts = Opts
  { tempHask = False
  , windowsOutput = False
  , lineLength = 80
  , maxBlankLines = 2
  , maxBlanks = 1
  , allowMultBlanksBefore = isComment
  , noMultBlanksAfter = isKeyw $ "let" : layoutKeys
  , makeLineComments = True
  , joinComments = True
  , checkComments = True
  , commentGap = 0
  , checkSpacing = True
  }

-- | analyse consecutive tokens
anaPosToks :: Opts -> [PosTok] -> ([PosTok], [Diag])
anaPosToks opts l = case l of
  [] -> ([], [])
  t0 : r1 ->
    let (t1@(PosTok p1 u1 w1), cs) = anaPosTok opts t0
    in case r1 of
    [] -> ([t1], cs)
    t2@(PosTok p2 u2 w2) : r2 -> let
        (ar1, rds) = anaPosToks opts r1
        s1 = showToken u1
        s2 = showToken u2
        i1 = isInfixOp u1
        i2 = isInfixOp u2
        o1 = isOpPar u1
        c2 = isClPar u2
        ii1 = isIndent u1
        ni1 = not ii1
        lt = length s1 <= length s2
        both = s1 ++ s2
        nw1 = null w1
        (bt1, m1) = multipleBlanks opts u2 t1
        al = t1 : ar1
        bl = bt1 : ar1
        aS = "after "
        bS = "before "
        aft = case () of
          _ | isSepIn ",;" u1 -> True
            | isOpPar u2 -> False
            | isSymb ["\\"] u1 -> True
            | i1 -> if i2 then lt else True
            | i2 || isSymb [".."] u2 -> False
          _ -> lt
        parsOfBoth = filter (`elem` "[({})]") both
        pos = case parsOfBoth of
          _ : _ : _ -> "between "
          _ | o1 -> aS
            | c2 -> bS
          _ -> "here "
        checkS = checkSpacing opts
        joinC = joinComments opts
        omitSpace = checkS && o1 && (isInfixOp u2 || c2)
          || isInfixOp u1 && c2 && not (isSymb [".."] u1 && isSepIn "]" u2)
        addSpace = checkS && ni1 && nw1
          && (not (o1 || isSymb (single "!#-@~") u1) || isComment u2)
          && spaceNeededBefore u2
          && if tempHask opts then not $ isSymb ["$"] u1 else True
        trim = reverse . trim2
        s1C = trim $ trim s1
        in case r2 of
      _ | makeLineComments opts && isNormalComment u1 && isIndent u2
          && noOtherComment r2 && noLineComment u1
          && notElem '\n' s1C
        -> ( PosTok p1 (Token LineComment $ addLineCommentMark s1C) "" : ar1
           , cs ++ Diag p1 "could be a line comment" : rds)
        | joinC && isNormalComment u1 && isNormalComment u2
        -> (al, cs ++ Diag p2 "consecutive comments" : rds)
      t3@(PosTok _ u3 _) : r3
        | joinC && isNormalComment u1 && isIndent u2 && isNormalComment u3
        -> let (PosTok p3 nu3 w3, cs3) = anaPosTok opts t3
               su3 = showToken nu3
               h3 = trim2 su3
               f3 = take (length su3 - length h3) su3
               hh3 = head h3
               cC = checkComments opts
               isHaddockStart = cC && not (null h3)
                 && elem hh3 "$*^|"
               s3 = if isHaddockStart
                    then dropWhile isSpace
                      $ if hh3 == '*' then dropWhile (== '*') h3 else tail h3
                    else if cC then h3 else drop 2 su3
               nt3 = PosTok p1 (Token BlockComment
                 $ (if noLineComment u1
                    then reverse . dropWhile (== ' ') . drop 2 $ reverse s1
                   else "{-" ++ drop 2 s1)
                   ++ w1 ++ s2 ++ w2 ++ replicate (commentGap opts) ' '
                   ++ if noLineComment nu3 then s3
                      else s3 ++ " -}") w3
               (ar3, rds3) = anaPosToks opts $ nt3 : r3
           in (ar3
              , cs ++ Diag p3 "join (or separate) consecutive comments"
                : [ Diag (updatePosString p3 f3)
                     $ "unexpected haddock marker " ++ [hh3]
                     | isHaddockStart ]
                ++ cs3 ++ rds3)
        | checkS && isFstInfixMinusArg u1 && isSymb ["-"] u2
          && spaceNeededBefore u3
        -> let (bt2, m2) = multipleBlanks opts u3 t2 in
           (bt1 : bt2 : tail ar1, cs ++ m1
             ++ [ Diag p2 "put blanks around infix -" | nw1 || null w2 ]
             ++ m2 ++ rds)
        | ni1 && isKeyw layoutKeys u2
          && not (isSepIn "{" u3) && noLineComment u3 && not (isIndent u3)
        -> (al, cs ++ m1 ++ Diag p2 ("break line after " ++ show s2) : rds)
      _ | addSpace
        -> (bl, cs ++ Diag p2
             ("put blank " ++ if aft then aS ++ s1 else bS ++ s2) : rds)
      _ | not nw1 && omitSpace
        -> (rmSp t1 : ar1, cs
           ++ Diag p2 ("no space needed " ++ pos ++ parsOfBoth) : rds)
      _ | not nw1 && ii1 && isKeyw topKeys u2
        -> (al, cs ++ Diag p2 ("start in column 1 with keyword " ++ s2) : rds)
      _ | nw1 || ii1
        -> (al, cs ++ rds)
      _ -> (bl, cs ++ m1 ++ rds)