{- | Module : $Header$ Description : scan tokens of Haskell sources Copyright : (c) C. Maeder 2010 License : BSD Maintainer : chr.maeder@web.de Stability : provisional Portability : portable using parsec as scanner to tokenize haskell sources and check the spacing -} module Language.Haskell.Scanner ( splitLines , showScan , processScan , scan , PosTok ) where import Control.Monad import Data.Char import Data.List import Text.ParserCombinators.Parsec import Text.ParserCombinators.Parsec.Pos -- * monad shortcuts infixl 1 << (<<) :: Monad m => m a -> m b -> m a (<<) = liftM2 const infixr 5 <:> (<:>) :: Monad m => m a -> m [a] -> m [a] (<:>) = liftM2 (:) infixr 5 <++> (<++>) :: Monad m => m [a] -> m [a] -> m [a] (<++>) = liftM2 (++) single :: Monad m => m a -> m [a] single = liftM return flat :: Monad m => m [[a]] -> m [a] flat = liftM concat enclosedBy :: Monad m => m [a] -> m a -> m [a] enclosedBy p q = q <:> p <++> single q -- * parsec shortcuts -- | parse an optional list optionL :: GenParser tok st [a] -> GenParser tok st [a] optionL = option [] -- | shortcut for @try . string@ tryString :: String -> CharParser st String tryString = try . string -- | nested comments, open and closing strings must have at least two chars nestedComment :: String -> String -> CharParser st String nestedComment op cl = case (op, cl) of (oh : ot : _, ch : ct : _) -> tryString op <++> flat (many $ single (noneOf [oh, ch] <|> try (char ch << notFollowedBy (char ct)) <|> try (char oh << notFollowedBy (char ot)) "") <|> nestedComment op cl) <++> string cl _ -> error "nestedComment" -- * haskell tokens -- | the haskell nested comments nestComment :: CharParser st String nestComment = nestedComment "{-" "-}" -- | line comment without final newline lineComment :: CharParser st String lineComment = try ((char '-' <:> many1 (char '-')) << notFollowedBy (oneOf "!#$%&*+./<=>?@\\^|~")) <++> many (noneOf "\n") -- | text in double quotes stringLit :: CharParser st String stringLit = enclosedBy (flat $ many $ single (noneOf "\\\"") <|> char '\\' <:> single anyChar) $ char '\"' -- | text in single quotes charLit :: CharParser st String charLit = tryString "'''" <|> enclosedBy (flat $ many $ single (noneOf "\\\'") <|> char '\\' <:> single anyChar) (char '\'') -- | a precise number scanner number :: Parser String number = try (char '0' <:> single (oneOf "oO")) <++> many octDigit <|> try (char '0' <:> single (oneOf "xX")) <++> many hexDigit <|> many1 digit <++> optionL (try (char '.' <:> single digit) <++> many digit) <++> optionL (try (oneOf "eE" <:> optionL (single $ oneOf "+-") <++> single digit) <++> many digit) -- | any character within identifiers hChar :: Parser Char hChar = alphaNum <|> oneOf "_'" -- | the underscore parser uL :: Parser Char uL = char '_' -- | lower case identifiers (aka variables) lId :: Parser String lId = (uL <|> lower) <:> many hChar -- | upper case identifiers (aka constructors) uId :: Parser String uId = upper <:> many hChar -- | any character within operators opSym :: Parser Char opSym = oneOf "!#$%&*+-./:<=>?@\\^|~" -- | any operator (mainly infixes) operator :: Parser String operator = many1 opSym -- | possible qualified entities: lower or upper case words or symbols data QualElem = Var | Cons | Sym deriving Eq -- | a name qualified or not with its representation data QualName = Name Bool QualElem String -- | the show instance renders the original string instance Show QualName where show (Name _ _ s) = s -- | any qualified or unqualified name or operator qId :: Parser QualName qId = fmap (Name False Var) lId <|> fmap (Name False Sym) operator <|> do n <- uId option (Name False Cons n) $ do d <- try (char '.' << lookAhead (uL <|> letter <|> opSym)) Name _ k r <- qId return $ Name True k $ n ++ d : r -- | parse any 'qId' within back ticks. This is more liberal than haskell! infixOp :: Parser String infixOp = enclosedBy (fmap show qId) $ char '`' -- | the separators are comma, semicolon and various kinds of parens seps :: String seps = "[({,;})]\n" -- | beside names and separators we have a couple of more token kinds data TokenKind = LineComment | BlockComment | Literal | Infix -- | the data type for tokens data Token = QualName QualName | Sep Char | Token TokenKind String | Start -- ^ the void token at the very beginning -- | the show instance renders the original string without kind information instance Show Token where show t = case t of QualName q -> show q Sep c -> [c] Token _ s -> s Start -> "" -- | all white spaces except newline isWhite :: Char -> Bool isWhite c = isSpace c && c /= '\n' -- | parsing as many 'isWhite' characters as possible (or none) white :: Parser String white = many $ satisfy isWhite -- | parse any non-white token tok :: Parser Token tok = fmap (Token BlockComment) nestComment <|> fmap (Token LineComment) lineComment <|> fmap QualName qId <|> fmap Sep (oneOf seps) <|> fmap (Token Literal) (charLit <|> stringLit <|> number) <|> fmap (Token Infix) infixOp -- | tokens enriched by positions and following white space data PosTok = PosTok SourcePos Token String instance Show PosTok where show (PosTok _ t s) = shows t s -- | attach positions and subsequent 'white' spaces to tokens posTok :: Parser PosTok posTok = do p <- getPosition t <- tok w <- white return $ PosTok p t w -- | the start position (without file name) startPos :: SourcePos startPos = initialPos "" -- | the initial 'white' stuff startTok :: Parser PosTok startTok = fmap (PosTok startPos Start) white -- | the final scanner scan :: Parser [PosTok] scan = startTok <:> many posTok << eof -- * splitting lines {- | a generic splitting function that keeps the separator as first element except in the first list. -} splitBy :: (a -> Bool) -> [a] -> [[a]] splitBy p l = let (fr, rt) = break p l in fr : case rt of [] -> [] d : tl -> let hd : tll = splitBy p tl in (d : hd) : tll -- | split lines at newline tokens splitLines :: [PosTok] -> [[PosTok]] splitLines = splitBy isIndent -- | removing more than two consecutive lists fulfilling the predicate removeBlankLines :: Int -> ([a] -> Bool) -> [[a]] -> [[a]] removeBlankLines c p l = case l of [] -> [] x : r -> if p x then if c > 1 then removeBlankLines c p r else x : removeBlankLines (c + 1) p r else x : removeBlankLines 0 p r -- * message data type -- | messages with positions data Diag = Diag SourcePos String instance Show Diag where show (Diag p s) = show p ++ ' ' : s -- * checking tokens -- | only show the token without spaces showTok :: PosTok -> String showTok (PosTok _ t _) = show t isInfixOp :: PosTok -> Bool isInfixOp (PosTok _ t _) = case t of QualName (Name _ Sym s) -> notElem s $ map (: []) "!#@\\~" Token Infix _ -> True _ -> False isComment :: PosTok -> Bool isComment (PosTok _ t _) = case t of Token k _ -> case k of LineComment -> True BlockComment -> True _ -> False _ -> False noLineComment :: PosTok -> Bool noLineComment (PosTok _ t _) = case t of Token LineComment _ -> False _ -> True isSepIn :: String -> PosTok -> Bool isSepIn cs (PosTok _ t _) = case t of Sep c -> elem c cs _ -> False isIndent :: PosTok -> Bool isIndent = isSepIn "\n" isOpPar :: PosTok -> Bool isOpPar = isSepIn "[({" isClPar :: PosTok -> Bool isClPar = isSepIn "})]" isOpParOrInfix :: PosTok -> Bool isOpParOrInfix t = isOpPar t || isInfixOp t isClParOrInfix :: PosTok -> Bool isClParOrInfix t = isClPar t || isInfixOp t isNonPar :: PosTok -> Bool isNonPar = isSepIn ",;" isFstInfixMinusArg :: PosTok -> Bool isFstInfixMinusArg t@(PosTok _ u _) = case u of QualName (Name _ k _) -> k /= Sym Sep _ -> isClPar t Token k _ -> case k of Literal -> True Infix -> True _ -> False Start -> False noSpaceNeededBefore :: PosTok -> Bool noSpaceNeededBefore t = isSepIn ",;})]" t || showTok t == "@" noSpaceNeededAfter :: PosTok -> Bool noSpaceNeededAfter t@(PosTok _ u _) = isOpPar t || elem (show u) (map (: []) "!#-@~") -- * adjusting tokens -- | replace all white spaces by blanks untabify :: SourcePos -> String -> String untabify p s = let p2 = updatePosString p s bs = sourceColumn p2 - sourceColumn p in replicate bs ' ' untab :: PosTok -> PosTok untab (PosTok p t w) = PosTok p t $ untabify (updatePosString p $ show t) w -- | remove trailing spaces rmSp :: PosTok -> PosTok rmSp (PosTok p t _) = PosTok p t "" -- | append exactly one blank blank :: PosTok -> PosTok blank (PosTok p t _) = PosTok p t " " multipleBlanks :: PosTok -> [Diag] multipleBlanks (PosTok p t w) = let n = length w in [ Diag (updatePosString p $ show t) $ "multiple (" ++ show n ++ ") blanks" | n > 1 ] -- | tidy up line comments adjustLineComment :: String -> String adjustLineComment = ("--" ++) . reverse . dropWhile isSpace . reverse . (' ' :) . dropWhile isSpace . drop 2 -- cut of initial line comment marker -- | check if prefix extended by one char from next string is still a prefix hasLongerPrefix :: String -> String -> String -> Bool hasLongerPrefix p n s = let ps = map (\ c -> p ++ [c]) n in any (`isPrefixOf` s) ps -- | insert a single blank after prefix adjustPrefix :: String -> String -> String -> String adjustPrefix p n s = if hasLongerPrefix p n s then s else case stripPrefix p s of Nothing -> s Just r -> p ++ case dropWhile isWhite r of rt@('\n' : _) -> case s of '}' : _ -> r _ -> rt rt -> ' ' : rt -- | adjust comment at both ends. Input is prefix, suffix and extension chars. adjustBothEnds :: String -> String -> String -> String -> String adjustBothEnds p q n s = if hasLongerPrefix p n s then s else concatMap (reverse . dropWhile isWhite . reverse) $ removeBlankLines 0 (all isSpace) $ splitBy (== '\n') $ reverse $ adjustPrefix (reverse q) n $ reverse $ adjustPrefix p n s -- | adjust a block comment adjustComment :: String -> String adjustComment = adjustBothEnds "{-" "-}" "!#" . adjustBothEnds "{-!" "!-}" "" . adjustBothEnds "{-#" "#-}" "" -- * analyse and adjust lines -- | utility to combine results (<+>) :: (PosTok, [Diag]) -> ([PosTok], [Diag]) -> ([PosTok], [Diag]) (t, ds) <+> (ts, es) = (t : ts, ds ++ es) infixr 4 <+> anaPosTok :: PosTok -> (PosTok, [Diag]) anaPosTok t@(PosTok p u w) = case u of Token BlockComment s -> let n = adjustComment s s5 = take 5 s nr = take 5 $ reverse n sr = take 5 $ reverse s in (PosTok p (Token BlockComment n) w, [ Diag p $ "non-conventional comment start: " ++ s5 | s5 /= take 5 n ] ++ [ Diag (updatePosString p s) $ "non-conventional comment end: " ++ reverse sr | sr /= nr ]) _ | show u == ";" -> (t, [Diag p "use layout instead of ;"]) _ -> (t, []) anaLine :: [PosTok] -> ([PosTok], [Diag]) anaLine l = case l of [] -> ([], []) t0 : r1 -> let (t1@(PosTok p1 u1 w1), cs) = anaPosTok t0 in case r1 of [] -> case u1 of Token LineComment s -> let n = adjustLineComment s s4 = take 4 s in ([PosTok p1 (Token LineComment n) ""], cs ++ [Diag p1 $ "leave a single blank after line comment sign: " ++ s4 | s4 /= take 4 n]) _ -> ([rmSp t1], cs ++ [Diag p1 "trailing white space" | not (null w1)]) t2@(PosTok p2 u2 w2) : r2 -> case u1 of Start -> (rmSp t1, cs ++ [Diag p1 "leading white space" | not (null w1)]) <+> anaLine r1 _ | isIndent t1 -> let (ft, rt) = span (== ' ') w1 in (untab t1, cs ++ [ Diag (updatePosString p1 ft) "use only blanks for indentation" | not (null rt) ]) <+> anaLine r1 _ -> let s1 = show u1 s2 = show u2 n1 = length s1 n2 = length s2 lt = n1 <= n2 both = s1 ++ s2 after = case () of _ | isNonPar t1 -> True | isOpPar t2 -> False | s1 == "\\" -> True | isInfixOp t1 -> if isInfixOp t2 then lt else True | isInfixOp t2 || s2 == ".." -> False _ -> lt parsOfBoth = filter (`elem` "[({})]") both pos = case both of _ : _ : _ -> "between" _ | isOpPar t1 -> "after" | isClPar t2 -> "before" _ -> "here" omitSpace = isOpParOrInfix t1 && isClParOrInfix t2 && not (isInfixOp t1 && isInfixOp t2) && (s1 /= ".." || s2 /= "]") addSpace = not (noSpaceNeededAfter t1) && not (noSpaceNeededBefore t2) (newT1, ds) = if null w1 then if addSpace then (blank t1, Diag p2 ("leave space " ++ if after then "after " ++ s1 else "before " ++ s2) : [ Diag p1 "but may be template haskell splice $(" | both == "$(" ]) else (t1, []) else if omitSpace then (rmSp t1, [Diag p2 $ " no space needed " ++ pos ++ " " ++ parsOfBoth]) else if isComment t2 then (untab t1, []) else (blank t1, multipleBlanks t1) pt0 = (newT1, cs ++ ds) in case r2 of [] -> pt0 <+> anaLine r1 t3@(PosTok _ u3 _) : _ -> let s3 = show u3 ms = [Diag p2 "put spaces around infix -" | null w1 || null w2 ] ++ multipleBlanks t1 ++ multipleBlanks t2 in if s2 == "-" && not (noSpaceNeededBefore t3) && isFstInfixMinusArg t1 then (blank t1, cs ++ ms) <+> anaLine (blank t2 : r2) else (if elem s2 ["do", "of"] && s3 /= "{" && noLineComment t3 then (newT1, cs ++ Diag p2 ("break line after " ++ show s2) : ds) else pt0) <+> anaLine r1 -- * ensure final newline isBlankLine :: [PosTok] -> Bool isBlankLine x = case x of [PosTok _ Start _] -> True [t] -> isIndent t _ -> False removeFinalBlankLines :: [[PosTok]] -> [[PosTok]] removeFinalBlankLines ll = reverse $ [PosTok startPos (Sep '\n') ""] : dropWhile isBlankLine (reverse ll) -- * main functions -- | create adjusted source file processScan :: [[PosTok]] -> String processScan = concatMap (concatMap show . fst . anaLine) . removeBlankLines 1 isBlankLine . removeFinalBlankLines -- | list all diagnostics showScan :: [[PosTok]] -> String showScan = intercalate "\n" . concatMap (map show . snd . anaLine)