{-# LANGUAGE CPP #-}

-- should this be named Data.Hasktags or such?
module Hasktags (
  FileData,
  generate,
  findWithCache,
  findThings,
  findThingsInBS,
  Mode(..),
  -- TODO think about these: must they be exported?
  getMode,
  getOutFile,

  dirToFiles
) where

import Control.Monad (when)

import qualified Data.ByteString.Lazy.Char8 as BS (ByteString, readFile, unpack)
import qualified Data.ByteString.Lazy.UTF8 as BS8 (fromString)
import Data.Char (isSpace)
import Data.List (isPrefixOf, isSuffixOf, groupBy, tails)
import Data.Maybe (maybeToList)
import DebugShow (trace_)

import System.Directory (doesDirectoryExist, doesFileExist,
                         getDirectoryContents, getModificationTime,
#if MIN_VERSION_directory(1,3,0)
                         pathIsSymbolicLink)
#else
                         isSymbolicLink)
#endif
import System.FilePath ((</>))
import System.IO (Handle, IOMode (AppendMode, WriteMode), hClose,
                  hGetContents, openFile, stdin, stdout)

import Tags (FileData (..), FileName, FoundThing (..),
             FoundThingType (FTClass, FTCons, FTConsAccessor, FTConsGADT,
                             FTData, FTDataGADT, FTFuncImpl, FTFuncTypeDef,
                             FTInstance, FTModule, FTNewtype, FTPattern,
                             FTPatternTypeDef, FTType),
             Pos (..), Scope, mywords, writectagsfile, writeetagsfile)

import Text.JSON.Generic (decodeJSON, encodeJSON)

-- search for definitions of things
-- we do this by looking for the following patterns:
--   data XXX = ...      giving a datatype location
--   newtype XXX = ...   giving a newtype location
--   bla :: ...          giving a function location
--
-- by doing it this way, we avoid picking up local definitions
-- (whether this is good or not is a matter for debate)
--
-- We generate both CTAGS and ETAGS format tags files
-- The former is for use in most sensible editors, while EMACS uses ETAGS
-- alternatives: http://haskell.org/haskellwiki/Tags

{- .hs or literate .lhs haskell file?
  Really not an easy question - maybe there is an answer - I don't know

  .hs  -> non-literate Haskell file
  .lhs -> literate Haskell file
  .chs -> is this always plain?
  .whatsoever -> try to get to know the answer (*)
      contains any '> ... ' line -> interpreted as literate
      else                       -> non-literate

  (*) This is difficult because System.Log.Logger is using

  {- [...]
  > module Example where
  > [...]
  -}
  module System.Log.Logger(

  so it might look like being a .lhs file

  My first fix was checking for a \\begin occurrence (doesn't work because
  HUnit is using > but no \\begin)

  Further ideas:
  * use the unlit executable distributed with ghc or the like
    and check for errors? (Will this work if cpp is used as well?)
  * Remove comments before checking for '> ..'
    doesn't work because {- -} may be unbalanced in literate comments

  So my solution is: take the file extension and keep guessing code for all
  unknown files
-}

-- Reference: http://ctags.sourceforge.net/FORMAT

-- | getMode takes a list of modes and extracts the mode with the
-- highest precedence. These are as follows: Both, CTags, ETags.
-- The default case is Both.
getMode :: [Mode] -> Mode
getMode [] = BothTags
getMode xs = maximum xs
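-- A rough usage sketch for getMode (illustrative GHCi session, not a
-- checked doctest); the precedence follows the derived Ord of Mode:
--
-- >>> getMode [ETags, CTags]
-- CTags
-- >>> getMode []
-- BothTags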
-- | getOutFile scans the modes searching for output redirection;
-- if not found, open the file with the name passed as a parameter.
-- Handle the special file "-", which means stdout.
getOutFile :: String -> IOMode -> [Mode] -> IO Handle
getOutFile _ _ (OutRedir "-" : _) = return stdout
getOutFile _ openMode (OutRedir f : _) = openFile f openMode
getOutFile name openMode (_:xs) = getOutFile name openMode xs
getOutFile defaultName openMode [] = openFile defaultName openMode

data Mode = ExtendedCtag
          | ETags
          | CTags
          | BothTags
          | Append
          | OutRedir String
          | CacheFiles
          | FollowDirectorySymLinks
          | Help
          | HsSuffixes [String]
          | AbsolutePath
          deriving (Ord, Eq, Show)

data Token = Token String Pos
           | NewLine Int -- indentation, counting a tab as 8 spaces (8*" " = "\t")
  deriving (Eq)

instance Show Token where
  -- show (Token t (Pos _ l _ _) ) = "Token " ++ t ++ " " ++ show l
  show (Token t (Pos _ _l _ _)) = " " ++ t ++ " "
  show (NewLine i) = "NewLine " ++ show i

tokenString :: Token -> String
tokenString (Token s _) = s
tokenString (NewLine _) = "\n"

isNewLine :: Maybe Int -> Token -> Bool
isNewLine Nothing (NewLine _) = True
isNewLine (Just c) (NewLine c') = c == c'
isNewLine _ _ = False

trimNewlines :: [Token] -> [Token]
trimNewlines = filter (not . isNewLine Nothing)

generate :: [Mode] -> [FileName] -> IO ()
generate modes filenames = do
  let mode = getMode (filter (`elem` [BothTags, CTags, ETags]) modes)
      openFileMode = if Append `elem` modes
                       then AppendMode
                       else WriteMode
  filedata <- mapM (findWithCache (CacheFiles `elem` modes)) filenames

  when (mode == CTags) (do
    ctagsfile <- getOutFile "tags" openFileMode modes
    writectagsfile ctagsfile (ExtendedCtag `elem` modes) filedata
    hClose ctagsfile)

  when (mode == ETags) (do
    etagsfile <- getOutFile "TAGS" openFileMode modes
    writeetagsfile etagsfile filedata
    hClose etagsfile)

  -- avoid problems when both is used in combination
  -- with redirection to stdout
  when (mode == BothTags) (do
    etagsfile <- getOutFile "TAGS" openFileMode modes
    writeetagsfile etagsfile filedata
    ctagsfile <- getOutFile "ctags" openFileMode modes
    writectagsfile ctagsfile (ExtendedCtag `elem` modes) filedata
    hClose etagsfile
    hClose ctagsfile)

-- Find the definitions in a file, or load from cache if the file
-- hasn't changed since last time.
findWithCache :: Bool -> FileName -> IO FileData
findWithCache cache filename = do
  cacheExists <- if cache then doesFileExist cacheFilename else return False
  if cacheExists
    then do
      fileModified <- getModificationTime filename
      cacheModified <- getModificationTime cacheFilename
      if cacheModified > fileModified
        then do
          bytes <- BS.readFile cacheFilename
          return (decodeJSON (BS.unpack bytes))
        else findAndCache
    else findAndCache
  where
    cacheFilename = filenameToTagsName filename
    filenameToTagsName = (++ "tags") . reverse . dropWhile (/= '.') . reverse
    findAndCache = do
      filedata <- findThings filename
      when cache (writeFile cacheFilename (encodeJSON filedata))
      return filedata

-- eg Data.Text says that using ByteStrings could be fastest depending on ghc,
-- platform and whatnot - so let's keep the hacky BS.readFile >>= BS.unpack
-- usage till there is a problem. We still need to match utf-8 chars like ⇒
-- to get correct class names, eg MonadBaseControl
-- (testcase testcases/monad-base-control.hs),
-- so use the same conversion which is applied to files when they get read.
utf8_to_char8_hack :: String -> String
utf8_to_char8_hack = BS.unpack . BS8.fromString
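-- A minimal sketch of what the hack above does (illustrative GHCi session,
-- not a checked doctest): the character is UTF-8 encoded and then unpacked
-- byte-per-Char, matching how file contents read via BS.readFile look.
--
-- >>> utf8_to_char8_hack "⇒"
-- "\226\135\146"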
-- Find the definitions in a file
findThings :: FileName -> IO FileData
findThings filename =
  fmap (findThingsInBS filename) $ BS.readFile filename

findThingsInBS :: String -> BS.ByteString -> FileData
findThingsInBS filename bs = do
  let aslines = lines $ BS.unpack bs

  let stripNonHaskellLines =
        let emptyLine = all (all isSpace . tokenString)
                        . filter (not . isNewLine Nothing)
            cppLine (_nl:t:_) = ("#" `isPrefixOf`) $ tokenString t
            cppLine _ = False
        in filter (not . emptyLine) . filter (not . cppLine)

  let debugStep m = (\s -> trace_ (m ++ " result") s s)

  let (isLiterate, slines) =
        debugStep "fromLiterate" $ fromLiterate filename $ zip aslines [0..]

  -- remove -- comments, then break each line into tokens (adding line
  -- numbers), then remove {- -} comments,
  -- then split by lines again (to get the indentation)
  let (fileLines, numbers) = unzip slines
  let tokenLines {- :: [[Token]] -} =
          debugStep "stripNonHaskellLines" $ stripNonHaskellLines
        $ debugStep "stripslcomments" $ stripslcomments
        $ debugStep "splitByNL" $ splitByNL Nothing
        $ debugStep "stripblockcomments pipe" $ stripblockcomments
        $ concat
        $ zipWith3 (withline filename)
                   (map (filter (not . all isSpace) . mywords False) fileLines)
                   fileLines
                   numbers

  -- TODO ($defines / empty lines etc)

  -- separate by top level declarations (everything starting with the
  -- same topmost indentation is what I call a section here)
  -- so that z in
  --   let x = 7
  --       z = 20
  -- won't be found as a function
  let topLevelIndent =
        debugStep "top level indent" $ getTopLevelIndent isLiterate tokenLines
  let sections = map tail -- strip leading NL (no longer needed)
                 $ filter (not . null)
                 $ splitByNL (Just topLevelIndent)
                 $ concat (trace_ "tokenLines" tokenLines tokenLines)

  -- only take one of
  --   a 'x' = 7
  --   a _   = 0
  let filterAdjacentFuncImpl = map head
        . groupBy (\(FoundThing t1 n1 (Pos f1 _ _ _))
                    (FoundThing t2 n2 (Pos f2 _ _ _))
                      -> f1 == f2 && n1 == n2 && areFuncImpls t1 t2)

      areFuncImpls (FTFuncImpl _) (FTFuncImpl _) = True
      areFuncImpls _ _ = False

  let iCI = map head
        . groupBy (\(FoundThing t1 n1 (Pos f1 l1 _ _))
                    (FoundThing t2 n2 (Pos f2 l2 _ _))
                      -> f1 == f2 && n1 == n2 && skipCons t1 t2
                         && ((<= 7) $ abs $ l2 - l1))

      skipCons FTData (FTCons _ _) = False
      skipCons FTDataGADT (FTConsGADT _) = False
      skipCons _ _ = True

  let things = iCI $ filterAdjacentFuncImpl
        $ concatMap (flip findstuff Nothing)
        $ map (\s -> trace_ "section in findThingsInBS" s s) sections

  let -- If there's a module with the same name as another definition, we
      -- are not interested in the module, but only in the definition.
      uniqueModuleName (FoundThing FTModule moduleName _) =
        not $ any (\(FoundThing thingType thingName _) ->
                     thingType /= FTModule && thingName == moduleName) things
      uniqueModuleName _ = True

  FileData filename $ filter uniqueModuleName things

-- Create tokens from words, by recording their line number
-- and which token they are through that line
withline :: FileName -> [String] -> String -> Int -> [Token]
withline filename sourceWords fullline i =
  let countSpaces (' ':xs) = 1 + countSpaces xs
      countSpaces ('\t':xs) = 8 + countSpaces xs
      countSpaces _ = 0
  in NewLine (countSpaces fullline)
       : zipWith (\w t -> Token w (Pos filename i t fullline)) sourceWords [1 ..]

-- comments stripping
stripslcomments :: [[Token]] -> [[Token]]
stripslcomments =
  let f (NewLine _ : Token ('-':'-':_) _ : _) = False
      f _ = True
      isCmt (Token ('-':'-':_) _) = True
      isCmt _ = False
  in map (takeWhile (not . isCmt)) . filter f
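-- A rough sketch of the single-line comment stripping above (hypothetical,
-- simplified token lists; positions omitted):
--
--   tokens of "x = 1 -- set x"      keep  [NewLine 0, "x", "=", "1"]
--   tokens of "-- whole-line note"  are dropped entirely by the filter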
stripblockcomments :: [Token] -> [Token]
stripblockcomments (Token "{-" pos : xs) =
  trace_ "{- found at " (show pos) $
  afterblockcomend xs
stripblockcomments (x:xs) = x : stripblockcomments xs
stripblockcomments [] = []

afterblockcomend :: [Token] -> [Token]
afterblockcomend (t@(Token _ pos):xs)
  | contains "-}" (tokenString t) =
      trace_ "-} found at " (show pos) $
      stripblockcomments xs
  | otherwise = afterblockcomend xs
afterblockcomend [] = []
afterblockcomend (_:xs) = afterblockcomend xs

-- does one string contain another string
contains :: Eq a => [a] -> [a] -> Bool
contains sub = any (isPrefixOf sub) . tails
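-- A quick illustration of contains (GHCi-style, not a checked doctest):
--
-- >>> contains "-}" "end of comment -} here"
-- True
-- >>> contains "-}" "still open"
-- False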
-- actually pick up definitions
findstuff :: [Token] -> Scope -> [FoundThing]
findstuff (Token "module" _ : Token name pos : _) _ =
  trace_ "module" pos $
  [FoundThing FTModule name pos] -- nothing will follow this section
findstuff tokens@(Token "data" _ : Token name pos : xs) _
  | any ((== "where") . tokenString) xs -- GADT
    -- TODO will be found as FTCons (not FTConsGADT), the same for
    -- functions - but they are found :)
    = trace_ "findstuff data b1" tokens $
      FoundThing FTDataGADT name pos
        : getcons2 (FTConsGADT name) "" xs ++ fromWhereOn xs Nothing -- ++ (findstuff xs)
  | otherwise
    = trace_ "findstuff data otherwise" tokens $
      FoundThing FTData name pos
        : getcons (FTCons FTData name) (trimNewlines xs) -- ++ (findstuff xs)
findstuff tokens@(Token "newtype" _ : ts@(Token name pos : _)) _ =
  trace_ "findstuff newtype" tokens $
  FoundThing FTNewtype name pos
    : getcons (FTCons FTNewtype name) (trimNewlines ts) -- ++ (findstuff xs)
  -- FoundThing FTNewtype name pos : findstuff xs
findstuff tokens@(Token "type" _ : Token name pos : xs) _ =
  trace_ "findstuff type" tokens $
  case break ((== "where") . tokenString) xs of
    (ys, []) -> trace_ "findstuff type b1 " ys $
        [FoundThing FTType name pos]
    (ys, r) -> trace_ "findstuff type b2 " (ys, r) $
        FoundThing FTType name pos : fromWhereOn r Nothing
findstuff tokens@(Token "class" _ : xs) _ =
  trace_ "findstuff class" tokens $
  case break ((== "where") . tokenString) xs of
    (ys, []) -> trace_ "findstuff class b1 " ys $
        maybeToList $ className ys
    (ys, r) -> trace_ "findstuff class b2 " (ys, r) $
        maybe [] (\n@(FoundThing _ name _) ->
                    n : fromWhereOn r (Just (FTClass, name)))
          $ className ys
  where
    isParenOpen (Token "(" _) = True
    isParenOpen _ = False
    className lst =
      case (head'
            . dropWhile isParenOpen
            . reverse
            . takeWhile ((not . (`elem` ["=>", utf8_to_char8_hack "⇒"])) . tokenString)
            . reverse) lst of
        (Just (Token name p)) -> Just $ FoundThing FTClass name p
        _ -> Nothing
findstuff tokens@(Token "instance" _ : xs) _ =
  trace_ "findstuff instance" tokens $
  case break ((== "where") . tokenString) xs of
    (ys, []) -> trace_ "findstuff instance b1 " ys $
        maybeToList $ instanceName ys
    (ys, r) -> trace_ "findstuff instance b2 " (ys, r) $
        maybe [] (\n@(FoundThing _ name _) ->
                    n : fromWhereOn r (Just (FTInstance, name)))
          $ instanceName ys
  where
    instanceName lst@(Token _ p : _) =
      Just $ FoundThing FTInstance
               (map (\a -> if a == '.' then '-' else a) $ concatTokens lst) p
    instanceName _ = Nothing
findstuff tokens@(Token "pattern" _ : Token name pos : Token "::" _ : sig) _ =
  trace_ "findstuff pattern type annotation" tokens $
  [FoundThing (FTPatternTypeDef (concatTokens sig)) name pos]
findstuff tokens@(Token "pattern" _ : Token name pos : xs) scope =
  trace_ "findstuff pattern" tokens $
  FoundThing FTPattern name pos : findstuff xs scope
findstuff xs scope =
  trace_ "findstuff rest " xs $
  findFunc xs scope ++ findFuncTypeDefs [] xs scope

findFuncTypeDefs :: [Token] -> [Token] -> Scope -> [FoundThing]
findFuncTypeDefs found (t@(Token _ _) : Token "," _ : xs) scope =
  findFuncTypeDefs (t : found) xs scope
findFuncTypeDefs found (t@(Token _ _) : Token "::" _ : sig) scope =
  map (\(Token name p) -> FoundThing (FTFuncTypeDef (concatTokens sig) scope) name p)
      (t : found)
findFuncTypeDefs found xs@(Token "(" _ : _) scope =
  case break myBreakF xs of
    (inner@(Token _ p : _), rp : xs') ->
      let merged = Token (concatMap (\(Token x _) -> x) $ inner ++ [rp]) p
      in findFuncTypeDefs found (merged : xs') scope
    _ -> []
  where myBreakF (Token ")" _) = True
        myBreakF _ = False
findFuncTypeDefs _ _ _ = []

fromWhereOn :: [Token] -> Scope -> [FoundThing]
fromWhereOn [] _ = []
fromWhereOn [_] _ = []
fromWhereOn (_ : xs@(NewLine _ : _)) scope =
  concatMap (flip findstuff scope . tail')
    $ splitByNL (Just (minimum . (10000:)
                       . map (\(NewLine i) -> i)
                       . filter (isNewLine Nothing) $ xs)) xs
fromWhereOn (_:xw) scope = findstuff xw scope

findFunc :: [Token] -> Scope -> [FoundThing]
findFunc x scope =
  case findInfix x scope of
    a@(_:_) -> a
    _ -> findF x scope

findInfix :: [Token] -> Scope -> [FoundThing]
findInfix x scope =
  case dropWhile ((/= "`") . tokenString) (takeWhile ((/= "=") . tokenString) x) of
    _ : Token name p : _ -> [FoundThing (FTFuncImpl scope) name p]
    _ -> []

findF :: [Token] -> Scope -> [FoundThing]
findF ts@(Token "(" p : _) scope =
  let (name, xs) = extractOperator ts
  in [FoundThing (FTFuncImpl scope) name p | any (("=" ==) . tokenString) xs]
findF (Token name p : xs) scope =
  [FoundThing (FTFuncImpl scope) name p | any (("=" ==) . tokenString) xs]
findF _ _ = []
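-- A rough sketch of the function detection above (hypothetical token lists;
-- positions omitted, and assuming the tokenizer (mywords) yields separate
-- "`", "(" and ")" tokens as the patterns above expect):
--
--   tokens of "inc x = x + 1"       ->  FTFuncImpl tag for "inc"    (findF)
--   tokens of "a `plus` b = a + b"  ->  FTFuncImpl tag for "plus"   (findInfix)
--   tokens of "(<+>) a b = a"       ->  FTFuncImpl tag for "(<+>)"  (findF via extractOperator)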
head' :: [a] -> Maybe a
head' (x:_) = Just x
head' [] = Nothing

tail' :: [a] -> [a]
tail' (_:xs) = xs
tail' [] = []

-- get the constructor definitions, knowing that a datatype has just started
getcons :: FoundThingType -> [Token] -> [FoundThing]
getcons ftt (Token "=" _ : Token name pos : xs) =
  FoundThing ftt name pos : getcons2 ftt name xs
getcons ftt (_:xs) = getcons ftt xs
getcons _ [] = []

getcons2 :: FoundThingType -> String -> [Token] -> [FoundThing]
getcons2 ftt@(FTCons pt p) c (Token name pos : Token "::" _ : xs) =
  FoundThing (FTConsAccessor pt p c) name pos : getcons2 ftt c xs
getcons2 ftt@(FTConsGADT p) _ (Token name pos : Token "::" _ : xs) =
  FoundThing ftt name pos : getcons2 ftt p xs
getcons2 ftt _ (Token "|" _ : Token name pos : xs) =
  FoundThing ftt name pos : getcons2 ftt name xs
getcons2 ftt c (_:xs) = getcons2 ftt c xs
getcons2 _ _ [] = []

splitByNL :: Maybe Int -> [Token] -> [[Token]]
splitByNL maybeIndent (nl@(NewLine _):ts) =
  let (a, b) = break (isNewLine maybeIndent) ts
  in (nl : a) : splitByNL maybeIndent b
splitByNL _ _ = []

-- this only exists for test case testcases/HUnitBase.lhs (bird literate haskell style)
getTopLevelIndent :: Bool -> [[Token]] -> Int
getTopLevelIndent _ [] = 0 -- no import found, assuming indent 0 (this can be
                           -- done better but should suffice for most needs)
getTopLevelIndent isLiterate ((nl:next:_):xs) =
  if "import" == tokenString next
    then let (NewLine i) = nl in i
    else getTopLevelIndent isLiterate xs
getTopLevelIndent isLiterate (_:xs) = getTopLevelIndent isLiterate xs

-- According to http://www.haskell.org/onlinereport/literate.html either
-- birdstyle or LaTeX style should be used. However simple experiments show
-- that unlit distributed by GHC has the following behavior:
-- * The space after ">" can be omitted
-- * ">" must be the first char in the line to be read as birdstyle (then it's
--   replaced by a space)
-- * \begin{code} gets recognized if it's indented, but \end{code} does not (?)
--
-- Attention: Base.lhs (shipping with GHC) has birdstyle in block comments
fromLiterate :: FilePath -> [(String, Int)]
             -> (Bool -- is literate
                , [(String, Int)])
fromLiterate file lns =
  if ".lhs" `isSuffixOf` file
    then (True, unlit lns)
    else (False, lns)
  where
    unlit, returnCode :: [(String, Int)] -> [(String, Int)]
    unlit ((('>':' ':xs), n):ns) = ((' ':xs), n) : unlit ns -- unlit keeps the space, so do we
    unlit ((line, _):ns) = if "\\begin{code}" `isPrefixOf` line
                             then returnCode ns
                             else unlit ns
    unlit [] = []

    -- in \begin{code} block
    returnCode (t@(line, _):ns) = if "\\end{code}" `isPrefixOf` line
                                    then unlit ns
                                    else t : returnCode ns
    returnCode [] = [] -- unexpected - but hasktags does tagging, not compiling,
                       -- so don't treat a missing \end{code} as an error

-- suffixes: [".hs",".lhs"], use "" to match all files
dirToFiles :: Bool -> [String] -> FilePath -> IO [FilePath]
dirToFiles _ _ "STDIN" = fmap lines $ hGetContents stdin
dirToFiles followSyms suffixes p = do
  isD <- doesDirectoryExist p
#if MIN_VERSION_directory(1,3,0)
  isSymLink <- pathIsSymbolicLink p
#else
  isSymLink <- isSymbolicLink p
#endif
  case isD of
    False -> return $ if matchingSuffix then [p] else []
    True ->
      if isSymLink && not followSyms
        then return []
        else do
          -- filter . .. and hidden files .*
          contents <- filter ((/=) '.' . head) `fmap` getDirectoryContents p
          concat `fmap` (mapM (dirToFiles followSyms suffixes . (</>) p) contents)
  where matchingSuffix = any (`isSuffixOf` p) suffixes
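-- A rough usage sketch for dirToFiles (illustrative; "src" is a hypothetical
-- directory):
--
--   dirToFiles False [".hs", ".lhs"] "src"
--
-- recursively collects the .hs/.lhs files below "src", skipping hidden
-- entries and (with False) not descending into symlinked directories;
-- the magic path "STDIN" instead reads one file path per line from stdin.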
concatTokens :: [Token] -> String
concatTokens =
  smartUnwords . map (\(Token name _) -> name) . filter (not . isNewLine Nothing)
  where
    smartUnwords [] = []
    smartUnwords a = foldr (\v -> (glueNext v ++)) "" $ a `zip` tail (a ++ [""])
    glueNext (a@("("), _) = a
    glueNext (a, ")") = a
    glueNext (a@("["), _) = a
    glueNext (a, "]") = a
    glueNext (a, ",") = a
    glueNext (a, "") = a
    glueNext (a, _) = a ++ " "

extractOperator :: [Token] -> (String, [Token])
extractOperator ts@(Token "(" _ : _) =
  (\(a, b) -> (foldr ((++) . tokenString) ")" a, tail' b))
    $ break ((== ")") . tokenString) ts
extractOperator _ = ("", [])
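-- A quick illustration of the helpers above (hypothetical, simplified token
-- lists; positions omitted):
--
--   concatTokens (tokens of "Maybe ( IO a )")   ~>  "Maybe (IO a)"
--   extractOperator (tokens of "(<+>) a = ...") ~>  ("(<+>)", tokens of "a = ...")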