{-# LANGUAGE OverloadedStrings, PatternGuards, ScopedTypeVariables #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
{- | Tagify haskell source.

    Annotate lines, strip comments, tokenize, then search for tags.

    It loads the existing tags and updates them for the given file.  Then a
    SaveBuf action runs tags every time you save a file.  Alternatively,
    extend lushtags; the question is whether complete parsing is ok.  It
    means I can't do it on save, since I'd have to invoke the build system
    to de-hsc.
-}
module Main where
import qualified Control.Exception as Exception
import Control.Monad
import qualified Data.Char as Char
import qualified Data.Either as Either
import qualified Data.List as List
import qualified Data.Monoid as Monoid
import qualified Data.Set as Set
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.Text.IO as Text.IO
import qualified Debug.Trace as Trace

import qualified System.Console.GetOpt as GetOpt
import qualified System.Environment as Environment
import qualified System.Exit
import qualified System.IO as IO
import qualified System.IO.Error as IO.Error


main :: IO ()
main = do
    args <- Environment.getArgs
    (flags, inputs) <- case GetOpt.getOpt GetOpt.Permute options args of
        (flags, inputs, []) -> return (flags, inputs)
        (_, _, errs) -> usage $ "flag errors:\n" ++ List.intercalate ", " errs
    let output = last $ "tags" : [fn | Output fn <- flags]
        verbose = Verbose `elem` flags
    oldTags <- fmap (maybe [vimMagicLine] T.lines) $
        catchENOENT $ Text.IO.readFile output
    newTags <- fmap concat $ forM (zip [0..] inputs) $ \(i :: Int, fn) -> do
        tags <- processFile fn
        -- This has the side-effect of forcing the tags, which is essential
        -- if I'm tagging a lot of files at once.
        let (warnings, newTags) = Either.partitionEithers tags
        forM_ warnings $ \warn -> IO.hPutStrLn IO.stderr warn
        when verbose $ do
            let line = show i ++ " of " ++ show (length inputs - 1)
                    ++ ": " ++ fn
            putStr $ '\r' : line ++ replicate (78 - length line) ' '
            IO.hFlush IO.stdout
        return newTags
    when verbose $ putChar '\n'
    let write = if output == "-" then Text.IO.hPutStr IO.stdout
            else Text.IO.writeFile output
    write $ T.unlines (mergeTags inputs oldTags newTags)
    where
    usage msg = putStr (GetOpt.usageInfo msg options)
        >> System.Exit.exitSuccess

mergeTags :: [FilePath] -> [Text] -> [Pos TagVal] -> [Text]
mergeTags inputs old new =
    merge (List.sort (map showTag new)) (filter (not . isNewTag textFns) old)
    where
    -- Turns out GHC will not float out the T.pack, and it makes a big
    -- performance difference.
    textFns = Set.fromList $ map T.pack inputs

data Flag = Output FilePath | Verbose
    deriving (Eq, Show)

options :: [GetOpt.OptDescr Flag]
options =
    [ GetOpt.Option ['o'] [] (GetOpt.ReqArg Output "filename")
        "output file, defaults to 'tags'"
    , GetOpt.Option ['v'] [] (GetOpt.NoArg Verbose)
        "print files as they are tagged, useful to track down slow files"
    ]
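-- A usage sketch (the executable name here is only a guess; the flags are
-- the ones defined above):
--
-- > tagify -o tags -v src/Foo.hs src/Bar.hs
--
-- tags both files, prints per-file progress because of -v, and merges the
-- new tags with whatever was already in the @tags@ file.  Passing @-o -@
-- writes the result to stdout instead.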
-- | Documented in vim :h tags-file-format.
-- This tells vim that the file is sorted (but not case folded) so that
-- it can do a bsearch and never needs to fall back to linear search.
vimMagicLine :: Text
vimMagicLine = "!_TAG_FILE_SORTED\t1\t~"

isNewTag :: Set.Set Text -> Text -> Bool
isNewTag textFns line = Set.member fn textFns
    where fn = T.takeWhile (/='\t') $ T.drop 1 $ T.dropWhile (/='\t') line

merge :: [Text] -> [Text] -> [Text]
merge [] ys = ys
merge xs [] = xs
merge (x:xs) (y:ys)
    | x <= y = x : merge xs (y:ys)
    | otherwise = y : merge (x:xs) ys

-- | Convert a Tag to text, e.g.: AbsoluteMark\tCmd/TimeStep.hs 67 ;" f
showTag :: Pos TagVal -> Text
showTag (Pos (SrcPos fn lineno) (Tag text typ)) = T.concat
    [text, "\t", T.pack fn, "\t", T.pack (show lineno), " ;\" ",
        T.singleton (showType typ)]

-- | Vim takes this to be the \"kind:\" annotation.  It's just an arbitrary
-- string and these letters conform to no standard.  Presumably there are
-- some vim extensions that can make use of it.
showType :: Type -> Char
showType typ = case typ of
    Module -> 'm'
    Function -> 'f'
    Class -> 'c'
    Type -> 't'
    Constructor -> 'C'

-- * types

data TagVal = Tag !Text !Type
    deriving (Eq, Show)

data Type = Module | Function | Class | Type | Constructor
    deriving (Eq, Show)

data TokenVal = Token !Text | Newline !Int
    deriving (Eq, Show)

type Tag = Either String (Pos TagVal)

type Token = Pos TokenVal

-- | Newlines have to remain in the tokens because 'breakBlocks' relies on
-- them.  But they make pattern matching on the tokens unreliable because
-- newlines might be anywhere.  A newtype makes sure that the tokens only
-- get stripped once and that I don't do any pattern matching on unstripped
-- tokens.
newtype UnstrippedTokens = UnstrippedTokens [Token]
    deriving (Show, Monoid.Monoid)

mapTokens :: ([Token] -> [Token]) -> UnstrippedTokens -> UnstrippedTokens
mapTokens f (UnstrippedTokens tokens) = UnstrippedTokens (f tokens)

unstrippedTokensOf :: UnstrippedTokens -> [Token]
unstrippedTokensOf (UnstrippedTokens tokens) = tokens

type Line = Pos Text

data Pos a = Pos {
    posOf :: !SrcPos
    , valOf :: !a
    }

data SrcPos = SrcPos {
    posFile :: !FilePath
    , posLine :: !Int
    }

instance (Show a) => Show (Pos a) where
    show (Pos pos val) = show pos ++ ":" ++ show val
instance Show SrcPos where
    show (SrcPos fn line) = fn ++ ":" ++ show line

processFile :: FilePath -> IO [Tag]
processFile fn = fmap (process fn) (Text.IO.readFile fn)
    `Exception.catch` \(exc :: Exception.SomeException) -> do
        -- readFile will crash on files that are not UTF8.  Unfortunately
        -- not all haskell source files are.
        IO.hPutStrLn IO.stderr $ "exception reading " ++ show fn ++ ": "
            ++ show exc
        return []

process :: FilePath -> Text -> [Tag]
process fn = dropDups tagText . concatMap blockTags . breakBlocks
    . stripComments . Monoid.mconcat . map tokenize . stripCpp . annotate fn
    where
    tagText (Right (Pos _ (Tag text _))) = text
    tagText (Left warn) = T.pack warn

-- * tokenize

annotate :: FilePath -> Text -> [Line]
annotate fn text =
    [Pos (SrcPos fn num) line | (num, line) <- zip [1..] (T.lines text)]

-- | Also strips out hsc detritus.
stripCpp :: [Line] -> [Line]
stripCpp = filter $ not . ("#" `T.isPrefixOf`) . valOf

tokenize :: Line -> UnstrippedTokens
tokenize (Pos pos line) = UnstrippedTokens $ map (Pos pos) (tokenizeLine line)
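-- A sketch of what the tokenizer below produces for a single line
-- (hypothetical input, not taken from a real file):
--
-- > tokenizeLine "  foo, bar :: Int -> Int"
-- >     == [ Newline 2, Token "foo", Token ",", Token "bar"
-- >        , Token "::", Token "Int", Token "->", Token "Int" ]
--
-- The leading 'Newline' carries the indent of the line, which 'breakBlocks'
-- later uses to split the token stream into top-level chunks.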
tokenizeLine :: Text -> [TokenVal]
tokenizeLine text = Newline nspaces : go line
    where
    nspaces = T.count " " spaces + T.count "\t" spaces * 8
    (spaces, line) = T.break (not . Char.isSpace) text
    symbols = ["--", "{-", "-}", "=>", "->", "::"]
    go unstripped
        | T.null text = []
        | Just sym <- List.find (`T.isPrefixOf` text) symbols =
            Token sym : go (T.drop (T.length sym) text)
        | c == '\'' = let (token, rest) = breakChar cs
            in Token (T.cons c token) : go rest
        | c == '"' = go (skipString cs)
        | (token, rest) <- spanSymbol text, not (T.null token) =
            Token token : go rest
        -- This will tokenize differently than haskell should, e.g.
        -- 9x will be "9x" not "9" "x".  But I just need a wordlike chunk,
        -- not an actual token.  Otherwise I'd have to tokenize numbers.
        | otherwise = case T.span identChar text of
            ("", _) -> Token (T.singleton c) : go cs
            (token, rest) -> Token token : go rest
        where
        text = T.dropWhile Char.isSpace unstripped
        c = T.head text
        cs = T.tail text

startIdentChar :: Char -> Bool
startIdentChar c = Char.isAlpha c || c == '_'

identChar :: Char -> Bool
identChar c = Char.isAlphaNum c || c == '.' || c == '\'' || c == '_'

-- | Span a symbol, making sure to not eat comments.
spanSymbol :: Text -> (Text, Text)
spanSymbol text
    | any (`T.isPrefixOf` post) [",", "--", "-}", "{-"] = (pre, post)
    | Just (c, cs) <- T.uncons post, c == '-' || c == '{' =
        let (pre2, post2) = spanSymbol cs
        in (pre <> T.cons c pre2, post2)
    | otherwise = (pre, post)
    where
    (pre, post) = T.break (\c -> T.any (==c) "-{," || not (symbolChar c)) text

symbolChar :: Char -> Bool
symbolChar c = Char.isSymbol c || Char.isPunctuation c

breakChar :: Text -> (Text, Text)
breakChar text
    | T.null text = ("", "")
    | T.head text == '\\' = T.splitAt 3 text
    | otherwise = T.splitAt 2 text

-- | Skip until the ending double-quote of a string.  Tags never happen in
-- strings so I don't need to keep it.
--
-- TODO \ continuation isn't supported.  I'd have to tokenize at the file
-- level instead of the line level.
skipString :: Text -> Text
skipString text = case T.uncons (T.dropWhile (not . end) text) of
    Nothing -> ""
    Just (c, cs)
        | c == '"' -> cs
        | otherwise -> skipString (T.drop 1 cs)
    where end c = c == '\\' || c == '"'

stripComments :: UnstrippedTokens -> UnstrippedTokens
stripComments = mapTokens (go 0)
    where
    go :: Int -> [Token] -> [Token]
    go _ [] = []
    go nest (pos@(Pos _ token) : rest)
        | token == Token "--" = go nest (dropLine rest)
        | token == Token "{-" = go (nest+1) rest
        | token == Token "-}" = go (nest-1) rest
        | nest > 0 = go nest rest
        | otherwise = pos : go nest rest

-- | Break the input up into blocks based on indentation.
breakBlocks :: UnstrippedTokens -> [UnstrippedTokens]
breakBlocks = map UnstrippedTokens . filter (not . null) . go . filterBlank
    . unstrippedTokensOf
    where
    go [] = []
    go tokens = pre : go post
        where (pre, post) = breakBlock tokens
    -- Blank lines mess up the indentation.
    filterBlank [] = []
    filterBlank (Pos _ (Newline _) : xs@(Pos _ (Newline _) : _)) =
        filterBlank xs
    filterBlank (x:xs) = x : filterBlank xs

-- | Take until a newline, then take lines until the indent established
-- after that newline decreases.
breakBlock :: [Token] -> ([Token], [Token])
breakBlock (t@(Pos _ tok):ts) = case tok of
    Newline indent -> collectIndented indent ts
    _ -> let (pre, post) = breakBlock ts in (t:pre, post)
    where
    collectIndented indent (t@(Pos _ tok) : ts) = case tok of
        Newline n | n <= indent -> ([], t:ts)
        _ -> let (pre, post) = collectIndented indent ts in (t:pre, post)
    collectIndented _ [] = ([], [])
breakBlock [] = ([], [])
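-- A sketch of how the splitter above behaves on a made-up fragment:
--
-- > f :: Int
-- > f = 1
-- >   where g = 2
-- > h :: Int
--
-- 'breakBlocks' yields three blocks: the signature of @f@, the equation of
-- @f@ together with its indented where clause, and the line for @h@.  A new
-- block starts whenever a line's indent is at or below the indent that
-- opened the current block.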
-- * extract tags

-- | Get all the tags in one indented block.
blockTags :: UnstrippedTokens -> [Tag]
blockTags tokens = case stripNewlines tokens of
    [] -> []
    Pos _ (Token "module") : Pos pos (Token name) : _ ->
        [mktag pos (snd (T.breakOnEnd "." name)) Module]
    -- newtype X * = X *
    Pos _ (Token "newtype") : Pos pos (Token name) : rest ->
        mktag pos name Type : newtypeTags pos rest
    -- type family X ...
    Pos _ (Token "type") : Pos _ (Token "family") : Pos pos (Token name) : _ ->
        [mktag pos name Type]
    -- type X * = ...
    Pos _ (Token "type") : Pos pos (Token name) : _ -> [mktag pos name Type]
    -- data family X ...
    Pos _ (Token "data") : Pos _ (Token "family") : Pos pos (Token name) : _ ->
        [mktag pos name Type]
    -- data X * = X { X :: *, X :: * }
    -- data X * where ...
    Pos _ (Token "data") : Pos pos (Token name) : _ ->
        mktag pos name Type : dataTags pos (mapTokens (drop 2) tokens)
    -- class * => X where X :: * ...
    Pos pos (Token "class") : _ -> classTags pos (mapTokens (drop 1) tokens)
    -- x, y, z :: *
    stripped -> fst $ functionTags False stripped

-- | It's easier to scan for tokens without pesky newlines popping up
-- everywhere.  But I need to keep the newlines in, in case I hit a @where@
-- and need to call 'breakBlocks' again.
stripNewlines :: UnstrippedTokens -> [Token]
stripNewlines = filter (not . isNewline) . (\(UnstrippedTokens t) -> t)

-- | Get tags from a function type declaration: token , token , token ::
-- Return the tokens left over.
functionTags :: Bool -- ^ expect constructors, not functions
    -> [Token] -> ([Tag], [Token])
functionTags constructors = go []
    where
    go tags (Pos pos (Token name) : Pos _ (Token "::") : rest)
        | Just name <- functionName constructors name =
            (reverse $ mktag pos name Function : tags, rest)
    go tags (Pos pos (Token name) : Pos _ (Token ",") : rest)
        | Just name <- functionName constructors name =
            go (mktag pos name Function : tags) rest
    go tags tokens = (tags, tokens)

functionName :: Bool -> Text -> Maybe Text
functionName constructors text
    | isFunction text = Just text
    | isOperator text && not (T.null stripped) = Just stripped
    | otherwise = Nothing
    where
    isFunction text = case T.uncons text of
        Just (c, cs) -> firstChar c && startIdentChar c && T.all identChar cs
        Nothing -> False
    firstChar = if constructors then Char.isUpper else not . Char.isUpper
    -- Technically I could insist on colons if constructors is True, but
    -- let's let ghc decide about the syntax.
    isOperator text = "(" `T.isPrefixOf` text && ")" `T.isSuffixOf` text
        && T.all symbolChar stripped
    stripped = T.drop 1 $ T.take (T.length text - 1) text
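-- A few examples of what 'functionName' accepts (made-up inputs):
--
-- > functionName False "fooBar" == Just "fooBar"
-- > functionName False "(<|>)"  == Just "<|>"
-- > functionName False "FooBar" == Nothing  -- needs constructors = True
--
-- Operators only count when they come wrapped in parens, since that's how
-- they appear in a type signature.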
-- | * = X *
newtypeTags :: SrcPos -> [Token] -> [Tag]
newtypeTags prevPos tokens = case dropUntil "=" tokens of
    Pos pos (Token name) : _ -> [mktag pos name Constructor]
    rest -> unexpected prevPos (UnstrippedTokens tokens) rest "newtype * ="

-- | [] (empty data declaration)
-- * = X { X :: *, X :: * }
-- * where X :: * X :: *
-- * = X | X
dataTags :: SrcPos -> UnstrippedTokens -> [Tag]
dataTags prevPos unstripped
    | any ((== Token "where") . valOf) (unstrippedTokensOf unstripped) =
        concatMap gadtTags (whereBlock unstripped)
    | otherwise = case dropUntil "=" (stripNewlines unstripped) of
        [] -> [] -- empty data declaration
        Pos pos (Token name) : rest ->
            mktag pos name Constructor : collectRest rest
        rest -> unexpected prevPos unstripped rest "data * ="
    where
    collectRest tokens
        | (tags@(_:_), rest) <- functionTags False tokens =
            tags ++ collectRest rest
    collectRest (Pos _ (Token "|") : Pos pos (Token name) : rest) =
        mktag pos name Constructor : collectRest rest
    collectRest (_ : rest) = collectRest rest
    collectRest [] = []

gadtTags :: UnstrippedTokens -> [Tag]
gadtTags = fst . functionTags True . stripNewlines

-- | * => X where X :: * ...
classTags :: SrcPos -> UnstrippedTokens -> [Tag]
classTags prevPos unstripped = case dropContext (stripNewlines unstripped) of
    Pos pos (Token name) : _ ->
        -- Drop the where and start expecting functions.
        mktag pos name Class : concatMap classBodyTags (whereBlock unstripped)
    rest -> unexpected prevPos unstripped rest "class * =>"
    where
    dropContext tokens = if any ((== Token "=>") . valOf) tokens
        then dropUntil "=>" tokens else tokens

classBodyTags :: UnstrippedTokens -> [Tag]
classBodyTags unstripped = case stripNewlines unstripped of
    Pos _ (Token typedata) : Pos pos (Token name) : _
        | typedata `elem` ["type", "data"] -> [mktag pos name Type]
    tokens -> fst $ functionTags False tokens

-- | Skip to the where and split the indented block below it.
whereBlock :: UnstrippedTokens -> [UnstrippedTokens]
whereBlock = breakBlocks . mapTokens (dropUntil "where")

-- * util

mktag :: SrcPos -> Text -> Type -> Tag
mktag pos name typ = Right $ Pos pos (Tag name typ)

warning :: SrcPos -> String -> Tag
warning pos warn = Left $ show pos ++ ": " ++ warn

unexpected :: SrcPos -> UnstrippedTokens -> [Token] -> String -> [Tag]
unexpected prevPos (UnstrippedTokens tokensBefore) tokensHere declaration =
    [warning pos ("unexpected " ++ thing ++ " after " ++ declaration)]
    where
    thing = if null tokensHere then "end of block"
        else show (valOf (head tokensHere))
    pos
        | not (null tokensHere) = posOf (head tokensHere)
        | not (null tokensBefore) = posOf (last tokensBefore)
        | otherwise = prevPos

dropLine :: [Token] -> [Token]
dropLine = drop 1 . dropWhile (not . isNewline)

isNewline :: Token -> Bool
isNewline (Pos _ (Newline _)) = True
isNewline _ = False

dropUntil :: Text -> [Token] -> [Token]
dropUntil token = drop 1 . dropWhile ((/= Token token) . valOf)

-- * misc

tracem :: (Show a) => String -> a -> a
tracem msg x = Trace.trace (msg ++ ": " ++ show x) x

(<>) :: (Monoid.Monoid a) => a -> a -> a
(<>) = Monoid.mappend

-- | If @op@ raised ENOENT, return Nothing.
catchENOENT :: IO a -> IO (Maybe a)
catchENOENT op = Exception.handleJust (guard . IO.Error.isDoesNotExistError)
    (const (return Nothing)) (fmap Just op)

dropDups :: (Eq k) => (a -> k) -> [a] -> [a]
dropDups key (x:xs) = go x xs
    where
    go a [] = [a]
    go a (b:bs)
        | key a == key b = go a bs
        | otherwise = a : go b bs
dropDups _ [] = []
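-- For instance (hypothetical values):
--
-- > dropDups id [1, 1, 2, 2, 3] == [1, 2, 3]
--
-- Only adjacent duplicates are collapsed; 'process' uses it keyed on the
-- tag text to drop adjacent repeats within one file.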