{- | Module : $Header$ Description : scan tokens of Haskell sources Copyright : (c) C. Maeder 2010 License : BSD Maintainer : chr.maeder@web.de Stability : provisional Portability : portable using parsec as scanner to tokenize haskell sources and check the spacing -} module Language.Haskell.Scanner ( scan , Opts (..) , defaultOpts , anaPosToks , PosTok (..) , Token , showPosTok , Diag (..) , showDiag , showSourcePos ) where import Control.Monad import Data.Char import Data.List import Text.ParserCombinators.Parsec import Text.ParserCombinators.Parsec.Pos -- * monad shortcuts infixl 1 << (<<) :: Monad m => m a -> m b -> m a (<<) = liftM2 const infixr 5 <:> (<:>) :: Monad m => m a -> m [a] -> m [a] (<:>) = liftM2 (:) infixr 5 <++> (<++>) :: Monad m => m [a] -> m [a] -> m [a] (<++>) = liftM2 (++) single :: Monad m => m a -> m [a] single = liftM return flat :: Monad m => m [[a]] -> m [a] flat = liftM concat enclosedBy :: Monad m => m [a] -> m a -> m [a] enclosedBy p q = q <:> p <++> single q -- | a literal enclosed in quotes and a backslash as escape character quotedLit :: Char -> CharParser st String quotedLit q = enclosedBy (flat $ many $ single (noneOf ['\\', q]) <|> char '\\' <:> single anyChar) $ char q -- | text in double quotes stringLit :: CharParser st String stringLit = quotedLit '"' -- * parsec shortcuts -- | parse an optional list optionL :: GenParser tok st [a] -> GenParser tok st [a] optionL = option [] -- | shortcut for @try . string@ tryString :: String -> CharParser st String tryString = try . string -- | nested comments, open and closing strings must have at least one char nestedComment :: String -> String -> CharParser st String nestedComment op cl = let inComment = tryString cl <|> (nestedComment op cl <|> single anyChar) <++> inComment in tryString op <++> inComment -- * haskell tokens -- | the haskell nested comments nestComment :: CharParser st String nestComment = nestedComment "{-" "-}" -- | line comment without final newline lineComment :: CharParser st String lineComment = try ((char '-' <:> many1 (char '-')) << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~")) <++> many (noneOf "\n") -- | text in single quotes charLit :: CharParser st String charLit = tryString "'''" <|> quotedLit '\'' -- | a precise number scanner number :: Parser String number = try (char '0' <:> single (oneOf "oO")) <++> many octDigit <|> try (char '0' <:> single (oneOf "xX")) <++> many hexDigit <|> many1 digit <++> optionL (try (char '.' <:> single digit) <++> many digit) <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-") <++> single digit) <++> many digit) -- | any character within identifiers hChar :: Parser Char hChar = alphaNum <|> oneOf "_'" -- | lower case identifiers (aka variables) lId :: Parser String lId = (char '_' <|> lower) <:> many hChar -- | upper case identifiers (aka constructors) uId :: Parser String uId = upper <:> many hChar -- | any character within operators opSym :: Char -> Bool opSym c = elem c "!#$%&*+-./:<=>?@\\^|~" || not (isAscii c) && (isSymbol c || isPunctuation c) -- | any operator (mainly infixes) operator :: Parser String operator = many1 $ satisfy opSym -- | possible qualified entities: lower or upper case words or symbols data QualElem = Var | Cons | Sym -- | a name qualified or not with its representation data QualName = Name Bool QualElem String -- | the original string showQualName :: QualName -> String showQualName (Name _ _ s) = s isLetterOrUnderscore :: Char -> Bool isLetterOrUnderscore c = c == '_' || isLetter c -- | the possible characters starting a name qIdStart :: Parser Char qIdStart = satisfy $ \ c -> isLetterOrUnderscore c || opSym c -- | any quoted, qualified, or unqualified name or operator qId :: Parser QualName qId = fmap (Name False Var) lId <|> fmap (Name False Sym) operator <|> do n <- uId option (Name False Cons n) $ do d <- try (char '.' << lookAhead qIdStart) Name _ k r <- qId return $ Name True k $ n ++ d : r <|> do let noCh = notFollowedBy (char '\'') s <- try (string "'" << lookAhead (satisfy isLetterOrUnderscore >> noCh)) <|> try (string "''" << noCh) Name b k r <- qId return $ Name b k $ s ++ r -- | parse any 'qId' within back ticks. This is more liberal than haskell! infixOp :: Parser String infixOp = enclosedBy (fmap showQualName qId) $ char '`' -- | the separators are comma, semicolon and three kinds of parens seps :: String seps = "[({,;})]" -- | beside names and separators we have a couple of more token kinds data TokenKind = LineComment -- ^ a line comment | BlockComment -- ^ single- or multiline (nested) block comment | Literal -- ^ number, character, or string | Infix -- ^ something in back quotes | Sep -- ^ a separator | Indent -- ^ a newline or nothing at the very beginning | QuasiQuote -- ^ a quasi-quote | ThQuote -- ^ a possible template haskell quotation -- | the data type for tokens data Token = QualName QualName | Token TokenKind String -- | renders the original string without kind information showToken :: Token -> String showToken t = case t of QualName q -> showQualName q Token _ s -> s -- | all white spaces except newline isWhite :: Char -> Bool isWhite c = isSpace c && c /= '\n' -- | parsing as many 'isWhite' characters as possible (or none) white :: Parser String white = many $ satisfy isWhite -- | a literal followed by up to two (one for 'charLit') optional magic hashes literal :: Parser String literal = let optH = optionL (string "#") in charLit <++> optH <|> (stringLit <|> number) <++> optionL (char '#' <:> optH) -- | the template haskell quasi-quotes quasiQuote :: Parser Token quasiQuote = try $ do q <- char '[' >> lId << char '|' let start = '[' : q ++ "|" -- check for plain TH quotation if elem q $ single "dept" then return $ Token ThQuote start else fmap (Token QuasiQuote . (start ++) . (++ "|]")) $ manyTill anyChar $ tryString "|]" -- | parse any non-white token tok :: Parser Token tok = fmap (Token BlockComment) nestComment <|> fmap (Token LineComment) lineComment <|> quasiQuote <|> fmap QualName qId <|> fmap (Token Sep) (single $ oneOf seps) <|> fmap (Token Indent) (string "\n") <|> fmap (Token Literal) literal <|> fmap (Token Infix) infixOp -- | tokens enriched by positions and following white space data PosTok = PosTok SourcePos Token String showPosTok :: PosTok -> String showPosTok (PosTok _ t s) = showToken t ++ s -- | attach positions and subsequent 'white' spaces to tokens posTok :: Parser PosTok posTok = do p <- getPosition t <- tok w <- white return $ PosTok p t w -- | the start position (without file name) startPos :: SourcePos startPos = initialPos "" -- | the initial 'white' stuff startTok :: Parser PosTok startTok = fmap (PosTok startPos $ Token Indent "") white -- | the final scanner scan :: Parser [PosTok] scan = startTok <:> many posTok << eof -- * message data type -- | messages with positions data Diag = Diag SourcePos String -- | show parsec positions like hlint does showSourcePos :: SourcePos -> String showSourcePos p = sourceName p ++ ":" ++ shows (sourceLine p) ":" ++ shows (sourceColumn p) ":" -- | show the message with its position showDiag :: Diag -> String showDiag (Diag p s) = showSourcePos p ++ ' ' : s -- * checking tokens isInfixOp :: Token -> Bool isInfixOp t = case t of QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~" Token Infix _ -> True _ -> False isComment :: Token -> Bool isComment t = case t of Token k _ -> case k of LineComment -> True BlockComment -> True _ -> False _ -> False noLineComment :: Token -> Bool noLineComment t = case t of Token LineComment _ -> False _ -> True isSepIn :: String -> Token -> Bool isSepIn cs t = case t of Token Sep [c] -> elem c cs _ -> False isIndent :: Token -> Bool isIndent t = case t of Token Indent _ -> True _ -> False isOpPar :: Token -> Bool isOpPar = isSepIn "[({" isClPar :: Token -> Bool isClPar = isSepIn "})]" isFstInfixMinusArg :: Token -> Bool isFstInfixMinusArg t = case t of QualName (Name _ k _) -> case k of Sym -> False _ -> True Token k _ -> case k of Literal -> True Sep -> isClPar t _ -> False isSymb :: [String] -> Token -> Bool isSymb s t = case t of QualName (Name False Sym r) -> elem r s _ -> False isKeyw :: [String] -> Token -> Bool isKeyw s t = case t of QualName (Name False Var r) -> elem r s _ -> False spaceNeededBefore :: Token -> Bool spaceNeededBefore t = not $ isIndent t || isSymb ["@"] t || isSepIn ",;})]" t -- * adjusting tokens -- | remove trailing spaces rmSp :: PosTok -> PosTok rmSp (PosTok p t _) = PosTok p t "" topKeys :: [String] topKeys = ["import", "data", "newtype", "class", "instance"] -- "module" may occur in export lists, "type" within type families! layoutKeys :: [String] layoutKeys = ["do", "of", "where"] -- | ensure proper amount of space and diagnose too many blanks multipleBlanks :: Opts -> Token -> PosTok -> (PosTok, [Diag]) multipleBlanks opts u pTok@(PosTok p t w) = let s = showToken t q = updatePosString p s aft = noMultBlanksAfter opts t m = maxBlanks opts n = length w diag = Diag q $ "up to column " ++ show (sourceColumn q + n) ++ " multiple (" ++ show n ++ ") blanks" ++ if aft then " after " ++ show s else "" bTok = PosTok p t " " in case w of "" -> (bTok, []) -- insert a missing blank (without diag) _ | allowMultBlanksBefore opts u -> (pTok, []) -- no change _ | noMultBlanksAfter opts t -> (bTok, [diag | n > 1]) _ -> (PosTok p t $ take m w, [diag | n > m]) -- | cut of initial line comment marker and spaces trim2 :: String -> String trim2 = dropWhile isSpace . drop 2 -- | add two dashes to a string plus a space for a non-emtpy string addLineCommentMark :: String -> String addLineCommentMark r = "--" ++ if null r then r else ' ' : r -- | tidy up line comments adjustLineComment :: String -> String adjustLineComment = addLineCommentMark . trim2 -- | check if prefix extended by one char from next string is still a prefix hasLongerPrefix :: String -> String -> String -> Bool hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in any (`isPrefixOf` s) ps -- | insert a single blank after prefix adjustPrefix :: String -> String -> String -> String adjustPrefix p n s = if hasLongerPrefix p n s then s else case stripPrefix p s of Nothing -> s Just r -> p ++ case dropWhile isWhite r of rt@('\n' : _) -> case s of '}' : _ -> r _ -> rt rt -> ' ' : rt -- | adjust comment at both ends. Input is prefix, suffix and extension chars. adjustBothEnds :: String -> String -> String -> String -> String adjustBothEnds p q n s = if hasLongerPrefix p n s then s else reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s -- | adjust a block comment adjustComment :: String -> String adjustComment = adjustBothEnds "{-" "-}" "!#" . adjustBothEnds "{-!" "!-}" "" . adjustBothEnds "{-#" "#-}" "" -- * analyse and adjust lines anaPosTok :: Opts -> PosTok -> (PosTok, [Diag]) anaPosTok opts t@(PosTok p u w) = let cC = checkComments opts in case u of Token BlockComment s | cC -> let n = adjustComment s s5 = take 5 s nr = take 5 $ reverse n sr = take 5 $ reverse s in (PosTok p (Token BlockComment n) w, [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ] ++ [ Diag p "avoid nested comments" | isInfixOf "{-" $ drop 2 s ] ++ [ Diag (updatePosString p s) $ "non-conventional comment end: " ++ reverse sr | sr /= nr ]) Token LineComment s | cC -> let n = adjustLineComment s s4 = take 4 s sr = takeWhile isPrint $ dropWhile isSpace $ drop 2 s4 q = incSourceColumn p 2 in (PosTok p (Token LineComment n) "", [ Diag q $ "put " ++ (if null sr then "only " else "") ++ "one blank after --" ++ if null sr then "" else " in --" ++ sr | s4 /= take 4 n ] ++ [ Diag q "missing comment after --" | n == "--" ] ++ [ Diag q "line comment contains block comment start" | isInfixOf "{-" n ] ++ [ Diag q "line comment contains block comment end" | isInfixOf "-}" n ]) Token ThQuote s | not $ tempHask opts -> let (f, r) = splitAt 2 s in (PosTok p (Token ThQuote $ f ++ " |") w, [ Diag (updatePosString p f) $ "put blank between " ++ f ++ " and |" | r == "|" ]) _ -> (t, [ Diag p "use layout instead of ;" | isSepIn ";" u ]) -- | a comment string that is no pragma or directive isNormalCommentString :: String -> Bool isNormalCommentString s = case drop 2 s of c : _ -> notElem c "!#" _ -> True -- | a comment that is no pragma or directive isNormalComment :: Token -> Bool isNormalComment u = isComment u && isNormalCommentString (showToken u) -- | check if no other normal comment follows noOtherComment :: [PosTok] -> Bool noOtherComment ps = case ps of [] -> True PosTok _ u _ : _ -> not $ isNormalComment u -- | the data type for options passed to the analysis data Opts = Opts { tempHask :: Bool , windowsOutput :: Bool , lineLength :: Int , maxBlankLines :: Int , maxBlanks :: Int , allowMultBlanksBefore :: Token -> Bool , noMultBlanksAfter :: Token -> Bool , makeLineComments :: Bool , joinComments :: Bool , checkComments :: Bool , commentGap :: Int , checkSpacing :: Bool } -- | the default options to use defaultOpts :: Opts defaultOpts = Opts { tempHask = False , windowsOutput = False , lineLength = 80 , maxBlankLines = 2 , maxBlanks = 1 , allowMultBlanksBefore = isComment , noMultBlanksAfter = isKeyw $ "let" : layoutKeys , makeLineComments = True , joinComments = True , checkComments = True , commentGap = 0 , checkSpacing = True } -- | analyse consecutive tokens anaPosToks :: Opts -> [PosTok] -> ([PosTok], [Diag]) anaPosToks opts l = case l of [] -> ([], []) t0 : r1 -> let (t1@(PosTok p1 u1 w1), cs) = anaPosTok opts t0 in case r1 of [] -> ([t1], cs) t2@(PosTok p2 u2 w2) : r2 -> let (ar1, rds) = anaPosToks opts r1 s1 = showToken u1 s2 = showToken u2 i1 = isInfixOp u1 i2 = isInfixOp u2 o1 = isOpPar u1 c2 = isClPar u2 ii1 = isIndent u1 ni1 = not ii1 lt = length s1 <= length s2 both = s1 ++ s2 nw1 = null w1 (bt1, m1) = multipleBlanks opts u2 t1 al = t1 : ar1 bl = bt1 : ar1 aS = "after " bS = "before " aft = case () of _ | isSepIn ",;" u1 -> True | isOpPar u2 -> False | isSymb ["\\"] u1 -> True | i1 -> if i2 then lt else True | i2 || isSymb [".."] u2 -> False _ -> lt parsOfBoth = filter (`elem` "[({})]") both pos = case parsOfBoth of _ : _ : _ -> "between " _ | o1 -> aS | c2 -> bS _ -> "here " checkS = checkSpacing opts joinC = joinComments opts omitSpace = checkS && o1 && (isInfixOp u2 || c2) || isInfixOp u1 && c2 && not (isSymb [".."] u1 && isSepIn "]" u2) addSpace = checkS && ni1 && nw1 && (not (o1 || isSymb (single "!#-@~") u1) || isComment u2) && spaceNeededBefore u2 && if tempHask opts then not $ isSymb ["$"] u1 else True trim = reverse . trim2 s1C = trim $ trim s1 in case r2 of _ | makeLineComments opts && isNormalComment u1 && isIndent u2 && noOtherComment r2 && noLineComment u1 && notElem '\n' s1C -> ( PosTok p1 (Token LineComment $ addLineCommentMark s1C) "" : ar1 , cs ++ Diag p1 "could be a line comment" : rds) | joinC && isNormalComment u1 && isNormalComment u2 -> (al, cs ++ Diag p2 "consecutive comments" : rds) t3@(PosTok _ u3 _) : r3 | joinC && isNormalComment u1 && isIndent u2 && isNormalComment u3 -> let (PosTok p3 nu3 w3, cs3) = anaPosTok opts t3 su3 = showToken nu3 h3 = trim2 su3 f3 = take (length su3 - length h3) su3 hh3 = head h3 cC = checkComments opts isHaddockStart = cC && not (null h3) && elem hh3 "$*^|" s3 = if isHaddockStart then dropWhile isSpace $ if hh3 == '*' then dropWhile (== '*') h3 else tail h3 else if cC then h3 else drop 2 su3 nt3 = PosTok p1 (Token BlockComment $ (if noLineComment u1 then reverse . dropWhile (== ' ') . drop 2 $ reverse s1 else "{-" ++ drop 2 s1) ++ w1 ++ s2 ++ w2 ++ replicate (commentGap opts) ' ' ++ if noLineComment nu3 then s3 else s3 ++ " -}") w3 (ar3, rds3) = anaPosToks opts $ nt3 : r3 in (ar3 , cs ++ Diag p3 "join (or separate) consecutive comments" : [ Diag (updatePosString p3 f3) $ "unexpected haddock marker " ++ [hh3] | isHaddockStart ] ++ cs3 ++ rds3) | checkS && isFstInfixMinusArg u1 && isSymb ["-"] u2 && spaceNeededBefore u3 -> let (bt2, m2) = multipleBlanks opts u3 t2 in (bt1 : bt2 : tail ar1, cs ++ m1 ++ [ Diag p2 "put blanks around infix -" | nw1 || null w2 ] ++ m2 ++ rds) | ni1 && isKeyw layoutKeys u2 && not (isSepIn "{" u3) && noLineComment u3 && not (isIndent u3) -> (al, cs ++ m1 ++ Diag p2 ("break line after " ++ show s2) : rds) _ | addSpace -> (bl, cs ++ Diag p2 ("put blank " ++ if aft then aS ++ s1 else bS ++ s2) : rds) _ | not nw1 && omitSpace -> (rmSp t1 : ar1, cs ++ Diag p2 ("no space needed " ++ pos ++ parsOfBoth) : rds) _ | not nw1 && ii1 && isKeyw topKeys u2 -> (al, cs ++ Diag p2 ("start in column 1 with keyword " ++ s2) : rds) _ | nw1 || ii1 -> (al, cs ++ rds) _ -> (bl, cs ++ m1 ++ rds)