{- | Module : $Header$ Description : scan tokens of Haskell sources Copyright : (c) C. Maeder 2010 License : BSD Maintainer : chr.maeder@web.de Stability : provisional Portability : portable using parsec as scanner to tokenize haskell sources and check the spacing -} module Language.Haskell.Scanner ( scan , Opts (..) , defaultOpts , anaPosToks , PosTok , showPosTok , Diag (..) , showDiag , showSourcePos ) where import Control.Monad import Data.Char import Data.List import Text.ParserCombinators.Parsec import Text.ParserCombinators.Parsec.Pos -- * monad shortcuts infixl 1 << (<<) :: Monad m => m a -> m b -> m a (<<) = liftM2 const infixr 5 <:> (<:>) :: Monad m => m a -> m [a] -> m [a] (<:>) = liftM2 (:) infixr 5 <++> (<++>) :: Monad m => m [a] -> m [a] -> m [a] (<++>) = liftM2 (++) single :: Monad m => m a -> m [a] single = liftM return flat :: Monad m => m [[a]] -> m [a] flat = liftM concat enclosedBy :: Monad m => m [a] -> m a -> m [a] enclosedBy p q = q <:> p <++> single q -- * parsec shortcuts -- | parse an optional list optionL :: GenParser tok st [a] -> GenParser tok st [a] optionL = option [] -- | shortcut for @try . string@ tryString :: String -> CharParser st String tryString = try . string -- | nested comments, open and closing strings must have at least two chars nestedComment :: String -> String -> CharParser st String nestedComment op cl = case (op, cl) of (oh : ot : _, ch : ct : _) -> tryString op <++> flat (many $ single (noneOf [oh, ch] <|> try (char ch << notFollowedBy (char ct)) <|> try (char oh << notFollowedBy (char ot)) "") <|> nestedComment op cl) <++> string cl _ -> error "nestedComment" -- * haskell tokens -- | the haskell nested comments nestComment :: CharParser st String nestComment = nestedComment "{-" "-}" -- | line comment without final newline lineComment :: CharParser st String lineComment = try ((char '-' <:> many1 (char '-')) << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~")) <++> many (noneOf "\n") -- | text in double quotes stringLit :: CharParser st String stringLit = enclosedBy (flat $ many $ single (noneOf "\\\"") <|> char '\\' <:> single anyChar) $ char '\"' -- | text in single quotes charLit :: CharParser st String charLit = tryString "'''" <|> enclosedBy (flat $ many $ single (noneOf "\\\'") <|> char '\\' <:> single anyChar) (char '\'') -- | a precise number scanner number :: Parser String number = try (char '0' <:> single (oneOf "oO")) <++> many octDigit <|> try (char '0' <:> single (oneOf "xX")) <++> many hexDigit <|> many1 digit <++> optionL (try (char '.' <:> single digit) <++> many digit) <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-") <++> single digit) <++> many digit) -- | any character within identifiers hChar :: Parser Char hChar = alphaNum <|> oneOf "_'" -- | lower case identifiers (aka variables) lId :: Parser String lId = (char '_' <|> lower) <:> many hChar -- | upper case identifiers (aka constructors) uId :: Parser String uId = upper <:> many hChar -- | any character within operators opSym :: Char -> Bool opSym c = elem c "!#$%&*+-./:<=>?@\\^|~" || not (isAscii c) && (isSymbol c || isPunctuation c) -- | any operator (mainly infixes) operator :: Parser String operator = many1 $ satisfy opSym -- | possible qualified entities: lower or upper case words or symbols data QualElem = Var | Cons | Sym -- | a name qualified or not with its representation data QualName = Name Bool QualElem String -- | the original string showQualName :: QualName -> String showQualName (Name _ _ s) = s -- | the possible characters starting a name qIdStart :: Parser Char qIdStart = satisfy $ \ c -> c == '_' || isLetter c || opSym c -- | any quoted, qualified, or unqualified name or operator qId :: Parser QualName qId = fmap (Name False Var) lId <|> fmap (Name False Sym) operator <|> do n <- uId option (Name False Cons n) $ do d <- try (char '.' << lookAhead qIdStart) Name _ k r <- qId return $ Name True k $ n ++ d : r <|> do let noCh = notFollowedBy (char '\'') s <- try (string "'" << lookAhead (letter << noCh)) -- limited recognition <|> try (string "''" << noCh) Name b k r <- qId return $ Name b k $ s ++ r -- | parse any 'qId' within back ticks. This is more liberal than haskell! infixOp :: Parser String infixOp = enclosedBy (fmap showQualName qId) $ char '`' -- | the separators are comma, semicolon and three kinds of parens seps :: String seps = "[({,;})]" -- | beside names and separators we have a couple of more token kinds data TokenKind = LineComment -- ^ a line comment | BlockComment -- ^ single- or multiline (nested) block comment | Literal -- ^ number, character, or string | Infix -- ^ something in back quotes | Sep -- ^ a separator | Indent -- ^ a newline or nothing at the very beginning -- | the data type for tokens data Token = QualName QualName | Token TokenKind String -- | renders the original string without kind information showToken :: Token -> String showToken t = case t of QualName q -> showQualName q Token _ s -> s -- | all white spaces except newline isWhite :: Char -> Bool isWhite c = isSpace c && c /= '\n' -- | parsing as many 'isWhite' characters as possible (or none) white :: Parser String white = many $ satisfy isWhite -- | a literal followed by up to two (one for 'charLit') optional magic hashes literal :: Parser String literal = let optH = optionL (string "#") in charLit <++> optH <|> (stringLit <|> number) <++> optionL (char '#' <:> optH) -- | parse any non-white token tok :: Parser Token tok = fmap (Token BlockComment) nestComment <|> fmap (Token LineComment) lineComment <|> fmap QualName qId <|> fmap (Token Sep) (single $ oneOf seps) <|> fmap (Token Indent) (string "\n") <|> fmap (Token Literal) literal <|> fmap (Token Infix) infixOp -- | tokens enriched by positions and following white space data PosTok = PosTok SourcePos Token String showPosTok :: PosTok -> String showPosTok (PosTok _ t s) = showToken t ++ s -- | attach positions and subsequent 'white' spaces to tokens posTok :: Parser PosTok posTok = do p <- getPosition t <- tok w <- white return $ PosTok p t w -- | the start position (without file name) startPos :: SourcePos startPos = initialPos "" -- | the initial 'white' stuff startTok :: Parser PosTok startTok = fmap (PosTok startPos $ Token Indent "") white -- | the final scanner scan :: Parser [PosTok] scan = startTok <:> many posTok << eof -- * message data type -- | messages with positions data Diag = Diag SourcePos String showSourcePos :: SourcePos -> String showSourcePos p = sourceName p ++ ":" ++ shows (sourceLine p) ":" ++ shows (sourceColumn p) ":" showDiag :: Diag -> String showDiag (Diag p s) = showSourcePos p ++ ' ' : s -- * checking tokens isInfixOp :: Token -> Bool isInfixOp t = case t of QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~" Token Infix _ -> True _ -> False isComment :: Token -> Bool isComment t = case t of Token k _ -> case k of LineComment -> True BlockComment -> True _ -> False _ -> False noLineComment :: Token -> Bool noLineComment t = case t of Token LineComment _ -> False _ -> True isSepIn :: String -> Token -> Bool isSepIn cs t = case t of Token Sep [c] -> elem c cs _ -> False isIndent :: Token -> Bool isIndent t = case t of Token Indent _ -> True _ -> False isOpPar :: Token -> Bool isOpPar = isSepIn "[({" isClPar :: Token -> Bool isClPar = isSepIn "})]" isFstInfixMinusArg :: Token -> Bool isFstInfixMinusArg t = case t of QualName (Name _ k _) -> case k of Sym -> False _ -> True Token k _ -> case k of Literal -> True Sep -> isClPar t _ -> False isSymb :: [String] -> Token -> Bool isSymb s t = case t of QualName (Name False Sym r) -> elem r s _ -> False isKeyw :: [String] -> Token -> Bool isKeyw s t = case t of QualName (Name False Var r) -> elem r s _ -> False spaceNeededBefore :: Token -> Bool spaceNeededBefore t = not $ isIndent t || isSymb ["@"] t || isSepIn ",;})]" t -- * adjusting tokens -- | remove trailing spaces rmSp :: PosTok -> PosTok rmSp (PosTok p t _) = PosTok p t "" -- | ensure proper amount of space blank :: Opts -> Token -> PosTok -> PosTok blank opts u (PosTok p t w) = PosTok p t $ let n = maxBlanks opts in if null w || noMultBlanksAfter opts t || noMultBlanksBefore opts u then " " else if allowMultBlanksAfter opts t || allowMultBlanksBefore opts u then if n > 1 then take n w else w else take n w topKeys :: [String] topKeys = ["import", "data", "newtype", "type", "class", "instance"] -- "module" may occur in export lists! layoutKeys :: [String] layoutKeys = ["do", "of", "where"] multipleBlanks :: Opts -> Token -> PosTok -> [Diag] multipleBlanks opts u (PosTok p t w) = let s = showToken t q = updatePosString p s aft = noMultBlanksAfter opts t m = maxBlanks opts n = length w in [ Diag q $ "up to column " ++ show (sourceColumn q + n) ++ " multiple (" ++ show n ++ ") blanks" ++ if aft then " after " ++ show s else "" | n > m && m > 1 || n > 1 && (aft || noMultBlanksBefore opts u) || n > 1 && m == 1 && not (allowMultBlanksAfter opts t || allowMultBlanksBefore opts u) ] -- | tidy up line comments adjustLineComment :: String -> String adjustLineComment s = -- cut of initial line comment marker and spaces let r = dropWhile isSpace $ drop 2 s in "--" ++ if null r then r else ' ' : r -- | check if prefix extended by one char from next string is still a prefix hasLongerPrefix :: String -> String -> String -> Bool hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in any (`isPrefixOf` s) ps -- | insert a single blank after prefix adjustPrefix :: String -> String -> String -> String adjustPrefix p n s = if hasLongerPrefix p n s then s else case stripPrefix p s of Nothing -> s Just r -> p ++ case dropWhile isWhite r of rt@('\n' : _) -> case s of '}' : _ -> r _ -> rt rt -> ' ' : rt -- | adjust comment at both ends. Input is prefix, suffix and extension chars. adjustBothEnds :: String -> String -> String -> String -> String adjustBothEnds p q n s = if hasLongerPrefix p n s then s else reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s -- | adjust a block comment adjustComment :: String -> String adjustComment = adjustBothEnds "{-" "-}" "!#" . adjustBothEnds "{-!" "!-}" "" . adjustBothEnds "{-#" "#-}" "" -- * analyse and adjust lines anaPosTok :: PosTok -> (PosTok, [Diag]) anaPosTok t@(PosTok p u w) = case u of Token BlockComment s -> let n = adjustComment s s5 = take 5 s nr = take 5 $ reverse n sr = take 5 $ reverse s in (PosTok p (Token BlockComment n) w, [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ] ++ [ Diag (updatePosString p s) $ "non-conventional comment end: " ++ reverse sr | sr /= nr ]) Token LineComment s -> let n = adjustLineComment s s4 = take 4 s sr = takeWhile isPrint $ dropWhile isSpace $ drop 2 s4 q = incSourceColumn p 2 in (PosTok p (Token LineComment n) "", [ Diag q $ "put " ++ (if null sr then "only " else "") ++ "one blank after --" ++ if null sr then "" else " in --" ++ sr | s4 /= take 4 n ] ++ [ Diag q "missing comment after --" | n == "--" ]) _ -> (t, [ Diag p "use layout instead of ;" | isSepIn ";" u ]) -- | the data type for options passed to the analysis data Opts = Opts { tempHask :: Bool , maxBlanks :: Int , allowMultBlanksBefore :: Token -> Bool , allowMultBlanksAfter :: Token -> Bool , noMultBlanksBefore :: Token -> Bool , noMultBlanksAfter :: Token -> Bool } -- | the default options to use defaultOpts :: Opts defaultOpts = Opts { tempHask = False , maxBlanks = 1 , allowMultBlanksBefore = isComment , allowMultBlanksAfter = const False , noMultBlanksBefore = const False , noMultBlanksAfter = isKeyw $ "let" : layoutKeys } -- | analyse consecutive tokens anaPosToks :: Opts -> [PosTok] -> ([PosTok], [Diag]) anaPosToks opts l = case l of [] -> ([], []) t0 : r1 -> let (t1@(PosTok p1 u1 w1), cs) = anaPosTok t0 in case r1 of [] -> ([t1], cs) t2@(PosTok p2 u2 w2) : r2 -> let (ar1, rds) = anaPosToks opts r1 s1 = showToken u1 s2 = showToken u2 i1 = isInfixOp u1 i2 = isInfixOp u2 o1 = isOpPar u1 c2 = isClPar u2 ii1 = isIndent u1 ni1 = not ii1 lt = length s1 <= length s2 both = s1 ++ s2 nw1 = null w1 bt1 = blank opts u2 t1 al = t1 : ar1 bl = bt1 : ar1 rl = rmSp t1 : ar1 m1 = multipleBlanks opts u2 t1 aS = "after " bS = "before " aft = case () of _ | isSepIn ",;" u1 -> True | isOpPar u2 -> False | isSymb ["\\"] u1 -> True | i1 -> if i2 then lt else True | i2 || isSymb [".."] u2 -> False _ -> lt parsOfBoth = filter (`elem` "[({})]") both pos = case parsOfBoth of _ : _ : _ -> "between " _ | o1 -> aS | c2 -> bS _ -> "here " omitSpace = o1 && (isInfixOp u2 || c2) || isInfixOp u1 && c2 && not (isSymb [".."] u1 && isSepIn "]" u2) addSpace = ni1 && nw1 && (not (o1 || isSymb (map (: []) "!#-@~") u1) || isComment u2) && spaceNeededBefore u2 && if tempHask opts then not $ isSymb ["$"] u1 else True in case r2 of _ | isComment u1 && isIndent u2 -> (al, cs ++ [ Diag p1 "could be a line comment" | isPrefixOf "{- " s1 && not (any (== '\n') s1) ] ++ rds) PosTok _ u3 _ : _ | isFstInfixMinusArg u1 && isSymb ["-"] u2 && spaceNeededBefore u3 -> (bt1 : blank opts u3 t2 : tail ar1, cs ++ m1 ++ [ Diag p2 "put blanks around infix -" | nw1 || null w2 ] ++ multipleBlanks opts u3 t2 ++ rds) | ni1 && isKeyw layoutKeys u2 && not (isSepIn "{" u3) && noLineComment u3 && not (isIndent u3) -> (bl, cs ++ m1 ++ Diag p2 ("break line after " ++ show s2) : rds) _ | addSpace -> (bl, cs ++ Diag p2 ("put blank " ++ if aft then aS ++ s1 else bS ++ s2) : rds) _ | not nw1 && omitSpace -> (rl, cs ++ Diag p2 ("no space needed " ++ pos ++ parsOfBoth) : rds) _ | not nw1 && ii1 && isKeyw topKeys u2 -> (rl, cs ++ Diag p2 ("start in column 1 with keyword " ++ s2) : rds) _ | nw1 || ii1 -> (al, cs ++ rds) _ -> (bl, cs ++ m1 ++ rds)