{- |
Module      :  $Header$
Description :  scan tokens of Haskell sources
Copyright   :  (c) C. Maeder 2010
License     :  BSD

Maintainer  :  chr.maeder@web.de
Stability   :  provisional
Portability :  portable

using parsec as scanner to tokenize haskell sources
and check the spacing

-}

module Language.Haskell.Scanner
  ( scan
  , Opts (..)
  , defaultOpts
  , anaPosToks
  , PosTok
  , showPosTok
  , Diag (..)
  , showDiag
  , showSourcePos
  ) where

import Control.Monad

import Data.Char
import Data.List

import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos

-- * monad shortcuts

infixl 1 <<

(<<) :: Monad m => m a -> m b -> m a
(<<) = liftM2 const

infixr 5 <:>

(<:>) :: Monad m => m a -> m [a] -> m [a]
(<:>) = liftM2 (:)

infixr 5 <++>

(<++>) :: Monad m => m [a] -> m [a] -> m [a]
(<++>) = liftM2 (++)

single :: Monad m => m a -> m [a]
single = liftM return

flat :: Monad m => m [[a]] -> m [a]
flat = liftM concat

enclosedBy :: Monad m => m [a] -> m a -> m [a]
enclosedBy p q = q <:> p <++> single q

-- * parsec shortcuts

-- | parse an optional list
optionL :: GenParser tok st [a] -> GenParser tok st [a]
optionL = option []

-- | shortcut for @try . string@
tryString :: String -> CharParser st String
tryString = try . string

-- | nested comments, open and closing strings must have at least two chars
nestedComment :: String -> String -> CharParser st String
nestedComment op cl = case (op, cl) of
  (oh : ot : _, ch : ct : _) ->
    tryString op <++>
    flat (many $ single
          (noneOf [oh, ch]
           <|> try (char ch << notFollowedBy (char ct))
           <|> try (char oh << notFollowedBy (char ot)) <?> "")
          <|> nestedComment op cl)
    <++> string cl
  _ -> error "nestedComment"

-- * haskell tokens

-- | the haskell nested comments
nestComment :: CharParser st String
nestComment = nestedComment "{-" "-}"

-- | line comment without final newline
lineComment :: CharParser st String
lineComment =
  try ((char '-' <:> many1 (char '-'))
       << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~"))
  <++> many (noneOf "\n")

-- | text in double quotes
stringLit :: CharParser st String
stringLit = enclosedBy (flat $ many $ single (noneOf "\\\"")
                        <|> char '\\' <:> single anyChar) $ char '\"'

-- | text in single quotes
charLit :: CharParser st String
charLit = tryString "'''" <|>
          enclosedBy (flat $ many $ single (noneOf "\\\'")
                      <|> char '\\' <:> single anyChar)
          (char '\'')

-- | a precise number scanner
number :: Parser String
number =
    try (char '0' <:> single (oneOf "oO")) <++> many octDigit
  <|>
    try (char '0' <:> single (oneOf "xX")) <++> many hexDigit
  <|> many1 digit
          <++> optionL (try (char '.' <:> single digit) <++> many digit)
          <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-")
                             <++> single digit) <++> many digit)

-- | any character within identifiers
hChar :: Parser Char
hChar = alphaNum <|> oneOf "_'"

-- | lower case identifiers (aka variables)
lId :: Parser String
lId = (char '_' <|> lower) <:> many hChar

-- | upper case identifiers (aka constructors)
uId :: Parser String
uId = upper <:> many hChar

-- | any character within operators
opSym :: Char -> Bool
opSym c = elem c "!#$%&*+-./:<=>?@\\^|~"
  || not (isAscii c) && (isSymbol c || isPunctuation c)

-- | any operator (mainly infixes)
operator :: Parser String
operator = many1 $ satisfy opSym

-- | possible qualified entities: lower or upper case words or symbols
data QualElem = Var | Cons | Sym

-- | a name qualified or not with its representation
data QualName = Name Bool QualElem String

-- | the original string
showQualName :: QualName -> String
showQualName (Name _ _ s) = s

-- | the possible characters starting a name
qIdStart :: Parser Char
qIdStart = satisfy $ \ c -> c == '_' || isLetter c || opSym c

-- | any quoted, qualified, or unqualified name or operator
qId :: Parser QualName
qId = fmap (Name False Var) lId
  <|> fmap (Name False Sym) operator
  <|> do
    n <- uId
    option (Name False Cons n) $ do
      d <- try (char '.' << lookAhead qIdStart)
      Name _ k r <- qId
      return $ Name True k $ n ++ d : r
  <|> do
    let noCh = notFollowedBy (char '\'')
    s <- try (string "'" << lookAhead (letter << noCh)) -- limited recognition
      <|> try (string "''" << noCh)
    Name b k r <- qId
    return $ Name b k $ s ++ r

-- | parse any 'qId' within back ticks. This is more liberal than haskell!
infixOp :: Parser String
infixOp = enclosedBy (fmap showQualName qId) $ char '`'

-- | the separators are comma, semicolon and three kinds of parens
seps :: String
seps = "[({,;})]"

-- | beside names and separators we have a couple of more token kinds
data TokenKind =
    LineComment  -- ^ a line comment
  | BlockComment -- ^ single- or multiline (nested) block comment
  | Literal      -- ^ number, character, or string
  | Infix        -- ^ something in back quotes
  | Sep          -- ^ a separator
  | Indent       -- ^ a newline or nothing at the very beginning

-- | the data type for tokens
data Token
  = QualName QualName
  | Token TokenKind String

-- | renders the original string without kind information
showToken :: Token -> String
showToken t = case t of
    QualName q -> showQualName q
    Token _ s -> s

-- | all white spaces except newline
isWhite :: Char -> Bool
isWhite c = isSpace c && c /= '\n'

-- | parsing as many 'isWhite' characters as possible (or none)
white :: Parser String
white = many $ satisfy isWhite

-- | a literal followed by up to two (one for 'charLit') optional magic hashes
literal :: Parser String
literal = let optH = optionL (string "#") in
  charLit <++> optH
  <|> (stringLit <|> number) <++> optionL (char '#' <:> optH)

-- | parse any non-white token
tok :: Parser Token
tok = fmap (Token BlockComment) nestComment
  <|> fmap (Token LineComment) lineComment
  <|> fmap QualName qId
  <|> fmap (Token Sep) (single $ oneOf seps)
  <|> fmap (Token Indent) (string "\n")
  <|> fmap (Token Literal) literal
  <|> fmap (Token Infix) infixOp

-- | tokens enriched by positions and following white space
data PosTok = PosTok SourcePos Token String

showPosTok :: PosTok -> String
showPosTok (PosTok _ t s) = showToken t ++ s

-- | attach positions and subsequent 'white' spaces to tokens
posTok :: Parser PosTok
posTok = do
  p <- getPosition
  t <- tok
  w <- white
  return $ PosTok p t w

-- | the start position (without file name)
startPos :: SourcePos
startPos = initialPos ""

-- | the initial 'white' stuff
startTok :: Parser PosTok
startTok = fmap (PosTok startPos $ Token Indent "") white

-- | the final scanner
scan :: Parser [PosTok]
scan = startTok <:> many posTok << eof

-- * message data type

-- | messages with positions
data Diag = Diag SourcePos String

showSourcePos :: SourcePos -> String
showSourcePos p = sourceName p ++ ":" ++ shows (sourceLine p) ":"
  ++ shows (sourceColumn p) ":"

showDiag :: Diag -> String
showDiag (Diag p s) = showSourcePos p ++ ' ' : s

-- * checking tokens

isInfixOp :: Token -> Bool
isInfixOp t = case t of
  QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~"
  Token Infix _ -> True
  _ -> False

isComment :: Token -> Bool
isComment t = case t of
  Token k _ -> case k of
    LineComment -> True
    BlockComment -> True
    _ -> False
  _ -> False

noLineComment :: Token -> Bool
noLineComment t = case t of
  Token LineComment _ -> False
  _ -> True

isSepIn :: String -> Token -> Bool
isSepIn cs t = case t of
  Token Sep [c] -> elem c cs
  _ -> False

isIndent :: Token -> Bool
isIndent t = case t of
  Token Indent _ -> True
  _ -> False

isOpPar :: Token -> Bool
isOpPar = isSepIn "[({"

isClPar :: Token -> Bool
isClPar = isSepIn "})]"

isFstInfixMinusArg :: Token -> Bool
isFstInfixMinusArg t = case t of
  QualName (Name _ k _) -> case k of
    Sym -> False
    _ -> True
  Token k _ -> case k of
    Literal -> True
    Sep -> isClPar t
    _ -> False

isSymb :: [String] -> Token -> Bool
isSymb s t = case t of
  QualName (Name False Sym r) -> elem r s
  _ -> False

isKeyw :: [String] -> Token -> Bool
isKeyw s t = case t of
  QualName (Name False Var r) -> elem r s
  _ -> False

spaceNeededBefore :: Token -> Bool
spaceNeededBefore t =
  not $ isIndent t || isSymb ["@"] t || isSepIn ",;})]" t

-- * adjusting tokens

-- | remove trailing spaces
rmSp :: PosTok -> PosTok
rmSp (PosTok p t _) = PosTok p t ""

-- | ensure proper amount of space
blank :: Opts -> Token -> PosTok -> PosTok
blank opts u (PosTok p t w) = PosTok p t
  $ let n = maxBlanks opts in
    if null w || noMultBlanksAfter opts t || noMultBlanksBefore opts u
    then " " else
        if allowMultBlanksAfter opts t || allowMultBlanksBefore opts u
        then if n > 1 then take n w else w
        else take n w

topKeys :: [String]
topKeys = ["import", "data", "newtype", "type", "class", "instance"]
-- "module" may occur in export lists!

layoutKeys :: [String]
layoutKeys = ["do", "of", "where"]

multipleBlanks :: Opts -> Token -> PosTok -> [Diag]
multipleBlanks opts u (PosTok p t w) =
  let s = showToken t
      q = updatePosString p s
      aft = noMultBlanksAfter opts t
      m = maxBlanks opts
      n = length w
  in [ Diag q $ "up to column " ++ show (sourceColumn q + n)
       ++ " multiple (" ++ show n ++ ") blanks"
       ++ if aft then " after " ++ show s else ""
     | n > m && m > 1 || n > 1 && (aft || noMultBlanksBefore opts u)
       || n > 1 && m == 1
       && not (allowMultBlanksAfter opts t || allowMultBlanksBefore opts u)
     ]

-- | tidy up line comments
adjustLineComment :: String -> String
adjustLineComment s = -- cut of initial line comment marker and spaces
  let r = dropWhile isSpace $ drop 2 s in
  "--" ++ if null r then r else ' ' : r

-- | check if prefix extended by one char from next string is still a prefix
hasLongerPrefix :: String -> String -> String -> Bool
hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in
  any (`isPrefixOf` s) ps

-- | insert a single blank after prefix
adjustPrefix :: String -> String -> String -> String
adjustPrefix p n s =
   if hasLongerPrefix p n s then s else case stripPrefix p s of
     Nothing -> s
     Just r -> p ++ case dropWhile isWhite r of
         rt@('\n' : _) -> case s of
           '}' : _ -> r
           _ -> rt
         rt -> ' ' : rt

-- | adjust comment at both ends. Input is prefix, suffix and extension chars.
adjustBothEnds :: String -> String -> String -> String -> String
adjustBothEnds p q n s =
    if hasLongerPrefix p n s then s else
    reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s

-- | adjust a block comment
adjustComment :: String -> String
adjustComment = adjustBothEnds "{-" "-}" "!#"
    . adjustBothEnds "{-!" "!-}" ""
    . adjustBothEnds "{-#" "#-}" ""

-- * analyse and adjust lines

anaPosTok :: PosTok -> (PosTok, [Diag])
anaPosTok t@(PosTok p u w) = case u of
  Token BlockComment s -> let
    n = adjustComment s
    s5 = take 5 s
    nr = take 5 $ reverse n
    sr = take 5 $ reverse s
    in (PosTok p (Token BlockComment n) w,
       [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ]
       ++ [ Diag (updatePosString p s) $ "non-conventional comment end: "
            ++ reverse sr | sr /= nr ])
  Token LineComment s -> let
    n = adjustLineComment s
    s4 = take 4 s
    sr = takeWhile isPrint $ dropWhile isSpace $ drop 2 s4
    q = incSourceColumn p 2
    in (PosTok p (Token LineComment n) "",
        [ Diag q $ "put " ++ (if null sr then "only " else "")
          ++ "one blank after --"
          ++ if null sr then "" else " in --" ++ sr
        | s4 /= take 4 n ]
        ++ [ Diag q "missing comment after --" | n == "--" ])
  _ -> (t, [ Diag p "use layout instead of ;" | isSepIn ";" u ])

-- | the data type for options passed to the analysis
data Opts = Opts
  { tempHask :: Bool
  , maxBlanks :: Int
  , allowMultBlanksBefore :: Token -> Bool
  , allowMultBlanksAfter :: Token -> Bool
  , noMultBlanksBefore :: Token -> Bool
  , noMultBlanksAfter :: Token -> Bool
  }

-- | the default options to use
defaultOpts :: Opts
defaultOpts = Opts
  { tempHask = False
  , maxBlanks = 1
  , allowMultBlanksBefore = isComment
  , allowMultBlanksAfter = const False
  , noMultBlanksBefore = const False
  , noMultBlanksAfter = isKeyw $ "let" : layoutKeys
  }

-- | analyse consecutive tokens
anaPosToks :: Opts -> [PosTok] -> ([PosTok], [Diag])
anaPosToks opts l = case l of
  [] -> ([], [])
  t0 : r1 -> let (t1@(PosTok p1 u1 w1), cs) = anaPosTok t0 in case r1 of
    [] -> ([t1], cs)
    t2@(PosTok p2 u2 w2) : r2 -> let
        (ar1, rds) = anaPosToks opts r1
        s1 = showToken u1
        s2 = showToken u2
        i1 = isInfixOp u1
        i2 = isInfixOp u2
        o1 = isOpPar u1
        c2 = isClPar u2
        ii1 = isIndent u1
        ni1 = not ii1
        lt = length s1 <= length s2
        both = s1 ++ s2
        nw1 = null w1
        bt1 = blank opts u2 t1
        al = t1 : ar1
        bl = bt1 : ar1
        rl = rmSp t1 : ar1
        m1 = multipleBlanks opts u2 t1
        aS = "after "
        bS = "before "
        aft = case () of
          _ | isSepIn ",;" u1 -> True
            | isOpPar u2 -> False
            | isSymb ["\\"] u1 -> True
            | i1 -> if i2 then lt else True
            | i2 || isSymb [".."] u2 -> False
          _ -> lt
        parsOfBoth = filter (`elem` "[({})]") both
        pos = case parsOfBoth of
          _ : _ : _ -> "between "
          _ | o1 -> aS
            | c2 -> bS
          _ -> "here "
        omitSpace = o1 && (isInfixOp u2 || c2)
          || isInfixOp u1 && c2 && not (isSymb [".."] u1 && isSepIn "]" u2)
        addSpace = ni1 && nw1
          && (not (o1 || isSymb (map (: []) "!#-@~") u1) || isComment u2)
          && spaceNeededBefore u2
          && if tempHask opts then not $ isSymb ["$"] u1 else True
        in case r2 of
      _ | isComment u1 && isIndent u2
        -> (al, cs ++ [ Diag p1 "could be a line comment"
             | isPrefixOf "{- " s1 && not (any (== '\n') s1) ] ++ rds)
      PosTok _ u3 _ : _
        | isFstInfixMinusArg u1 && isSymb ["-"] u2 && spaceNeededBefore u3
        -> (bt1 : blank opts u3 t2 : tail ar1, cs ++ m1
             ++ [ Diag p2 "put blanks around infix -" | nw1 || null w2 ]
             ++ multipleBlanks opts u3 t2 ++ rds)
        | ni1 && isKeyw layoutKeys u2
          && not (isSepIn "{" u3) && noLineComment u3 && not (isIndent u3)
        -> (bl, cs ++ m1 ++ Diag p2 ("break line after " ++ show s2) : rds)
      _ | addSpace
        -> (bl, cs ++ Diag p2
             ("put blank " ++ if aft then aS ++ s1 else bS ++ s2) : rds)
      _ | not nw1 && omitSpace
        -> (rl, cs
           ++ Diag p2 ("no space needed " ++ pos ++ parsOfBoth) : rds)
      _ | not nw1 && ii1 && isKeyw topKeys u2
        -> (rl, cs ++ Diag p2 ("start in column 1 with keyword " ++ s2) : rds)
      _ | nw1 || ii1
        -> (al, cs ++ rds)
      _ -> (bl, cs ++ m1 ++ rds)