{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE ScopedTypeVariables #-} module Data.Conduit.Find ( -- * Introduction -- $intro -- ** Basic comparison with GNU find -- $gnufind -- ** Performance -- $performance -- ** Other notes -- $notes -- * Finding functions sourceFindFiles , find , findFiles , findFilePaths , FindOptions(..) , defaultFindOptions , test , ltest , stat , lstat , hasStatus -- * File path predicates , glob , regex , ignoreVcs -- * GNU find compatibility predicates , depth_ , follow_ , noleaf_ , prune_ , maxdepth_ , mindepth_ , ignoreErrors_ , noIgnoreErrors_ , amin_ , atime_ , anewer_ , empty_ , executable_ , gid_ , name_ , getDepth , filename_ , pathname_ , getFilePath -- * File entry predicates (uses stat information) , regular , directory , hasMode , executable , lastAccessed_ , lastModified_ -- * Predicate combinators , module Cond , (=~) -- * Types and type classes , FileEntry(..) ) where import Conduit import Control.Applicative import Control.Exception import Control.Monad hiding (forM_, forM) import Control.Monad.Morph import Control.Monad.State.Class import Data.Attoparsec.Text as A import Data.Bits import qualified Data.Cond as Cond import Data.Cond hiding (test) import qualified Data.Conduit.Filesystem as CF #if LEAFOPT import Data.IORef #endif import Data.Maybe (fromMaybe,fromJust) import Data.Monoid import Data.Text (Text, unpack, pack) import Data.Time import Data.Time.Clock.POSIX import Filesystem.Path.CurrentOS (FilePath, encodeString, decodeString) import Prelude hiding (FilePath) import qualified System.FilePath as FP import System.PosixCompat.Files import System.PosixCompat.Types import Text.Regex.Posix ((=~)) {- $intro **find-conduit** is essentially a souped version of GNU find for Haskell, using a DSL to provide both ease of us, and extensive flexbility. In its simplest form, let's compare some uses of find to find-conduit. Bear in mind that the result of the find function is a conduit, so you're expected to either sink it to a list, or operate on the file paths as they are yielded. -} {- $gnufind A typical find command: @ find src -name '*.hs' -type f -print @ Would in find-conduit be: @ find "src" (glob \"*.hs\" \<\> regular) $$ mapM_C (liftIO . print) @ The 'glob' predicate matches the file basename against the globbing pattern, while the 'regular' predicate matches plain files. A more complicated example: @ find . -size +100M -perm 644 -mtime 1 @ Now in find-conduit: @ let megs = 1024 * 1024 days = 86400 now <- liftIO getCurrentTime find \".\" ( fileSize (> 100*megs) \<\> hasMode 0o644 \<\> lastModified (> addUTCTime now (-(1*days))) ) @ Appending predicates like this expressing an "and" relationship. Use '<|>' to express "or". You can also negate any predicate: @ find \".\" (not_ (hasMode 0o644)) @ By default, predicates, whether matching or not, will allow recursion into directories. In order to express that matching predicate should disallow recursion, use 'prune': @ find \".\" (prune (depth (> 2))) @ This is the same as using '-maxdepth 2' in find. @ find \".\" (prune (filename_ (== \"dist\"))) @ This is the same as: @ find . \\( -name dist -prune \\) -o -print @ -} {- $performance find-conduit strives to make file-finding a well performing operation. To this end, a composed Predicate will only call stat once per entry being considered; and if you prune a directory, it is not traversed at all. By default, 'find' calls stat for every file before it applies the predicate, in order to ensure that only one such call is needed. Sometimes, however, you know just from the FilePath that you don't want to consider a certain file, or you want to prune a directory tree. To support these types of optimized queries, a variant of find is provided called 'findWithPreFilter'. This takes two predicates: one that is applied to only the FilePath, before stat (or lstat) is called; and one that is applied to the full file information after the stat. -} {- $notes See 'Data.Cond' for more details on the Monad used to build predicates. -} data FindOptions = FindOptions { findFollowSymlinks :: !Bool , findContentsFirst :: !Bool , findIgnoreErrors :: !Bool , findIgnoreResults :: !Bool , findLeafOptimization :: !Bool } defaultFindOptions :: FindOptions defaultFindOptions = FindOptions { findFollowSymlinks = True , findContentsFirst = False , findIgnoreErrors = False , findIgnoreResults = False , findLeafOptimization = True } data FileEntry = FileEntry { entryPath :: !FP.FilePath , entryDepth :: !Int , entryFindOptions :: !FindOptions , entryStatus :: !(Maybe FileStatus) -- ^ This is Nothing until we determine stat should be called. } newFileEntry :: FP.FilePath -> Int -> FindOptions -> FileEntry newFileEntry fp d f = FileEntry fp d f Nothing instance Show FileEntry where show entry = "FileEntry " ++ show (entryPath entry) ++ " " ++ show (entryDepth entry) getFilePath :: Monad m => CondT FileEntry m FP.FilePath getFilePath = gets entryPath pathname_ :: Monad m => (FP.FilePath -> Bool) -> CondT FileEntry m () pathname_ f = guard . f =<< getFilePath filename_ :: Monad m => (FP.FilePath -> Bool) -> CondT FileEntry m () filename_ f = pathname_ (f . FP.takeFileName) getDepth :: Monad m => CondT FileEntry m Int getDepth = gets entryDepth modifyFindOptions :: Monad m => (FindOptions -> FindOptions) -> CondT FileEntry m () modifyFindOptions f = modify $ \e -> e { entryFindOptions = f (entryFindOptions e) } ------------------------------------------------------------------------ -- Workalike options for emulating GNU find. ------------------------------------------------------------------------ depth_ :: Monad m => CondT FileEntry m () depth_ = modifyFindOptions $ \opts -> opts { findContentsFirst = True } follow_ :: Monad m => CondT FileEntry m () follow_ = modifyFindOptions $ \opts -> opts { findFollowSymlinks = True } noleaf_ :: Monad m => CondT FileEntry m () noleaf_ = modifyFindOptions $ \opts -> opts { findLeafOptimization = False } prune_ :: Monad m => CondT a m () prune_ = prune ignoreErrors_ :: Monad m => CondT FileEntry m () ignoreErrors_ = modifyFindOptions $ \opts -> opts { findIgnoreErrors = True } noIgnoreErrors_ :: Monad m => CondT FileEntry m () noIgnoreErrors_ = modifyFindOptions $ \opts -> opts { findIgnoreErrors = False } maxdepth_ :: Monad m => Int -> CondT FileEntry m () maxdepth_ l = getDepth >>= guard . (<= l) mindepth_ :: Monad m => Int -> CondT FileEntry m () mindepth_ l = getDepth >>= guard . (>= l) -- xdev_ = error "NYI" timeComp :: MonadIO m => ((UTCTime -> Bool) -> CondT FileEntry m ()) -> Int -> CondT FileEntry m () timeComp f n = do now <- liftIO getCurrentTime f (\t -> diffUTCTime now t > fromIntegral n) amin_ :: MonadIO m => Int -> CondT FileEntry m () amin_ n = timeComp lastAccessed_ (n * 60) atime_ :: MonadIO m => Int -> CondT FileEntry m () atime_ n = timeComp lastAccessed_ (n * 24 * 3600) anewer_ :: MonadIO m => FP.FilePath -> CondT FileEntry m () anewer_ path = do e <- get es <- applyStat Nothing ms <- getStat Nothing e { entryPath = path , entryStatus = Nothing } case ms of Nothing -> prune >> error "This is never reached" Just (s, _) -> guard $ diffUTCTime (f s) (f es) > 0 where f = posixSecondsToUTCTime . realToFrac . accessTime -- cmin_ = error "NYI" -- cnewer_ = error "NYI" -- ctime_ = error "NYI" empty_ :: MonadIO m => CondT FileEntry m () empty_ = (regular >> hasStatus ((== 0) . fileSize)) <|> (directory >> hasStatus ((== 2) . linkCount)) executable_ :: MonadIO m => CondT FileEntry m () executable_ = executable gid_ :: MonadIO m => Int -> CondT FileEntry m () gid_ n = hasStatus ((== n) . fromIntegral . fileGroup) {- group_ name ilname_ pat iname_ pat inum_ n ipath_ pat iregex_ pat iwholename_ pat links_ n lname_ pat mmin_ mtime_ -} name_ :: Monad m => FP.FilePath -> CondT FileEntry m () name_ = filename_ . (==) {- newer_ path newerXY_ ref nogroup_ nouser_ path_ pat perm_ mode :: Perm readable_ regex_ pat samefile_ path size_ n :: Size type_ c uid_ n used_ n user_ name wholename_ pat writable_ xtype_ c -} ------------------------------------------------------------------------ statFilePath :: Bool -> Bool -> FP.FilePath -> IO (Maybe FileStatus) statFilePath follow ignoreErrors path = do let doStat = (if follow then getFileStatus else getSymbolicLinkStatus) path catch (Just <$> doStat) $ \e -> if ignoreErrors then return Nothing else throwIO (e :: IOException) -- | Get the current status for the file. If the status being requested is -- already cached in the entry information, simply return it from there. getStat :: MonadIO m => Maybe Bool -> FileEntry -> m (Maybe (FileStatus, FileEntry)) getStat mfollow entry = case entryStatus entry of Just s | maybe True (== follow entry) mfollow -> return $ Just (s, entry) | otherwise -> fmap (, entry) `liftM` wrapStat Nothing -> do ms <- wrapStat return $ case ms of Just s -> Just (s, entry { entryStatus = Just s }) Nothing -> Nothing where follow = findFollowSymlinks . entryFindOptions wrapStat = liftIO $ statFilePath (fromMaybe (findFollowSymlinks opts) mfollow) (findIgnoreErrors opts) (entryPath entry) where opts = entryFindOptions entry applyStat :: MonadIO m => Maybe Bool -> CondT FileEntry m FileStatus applyStat mfollow = do ms <- lift . getStat mfollow =<< get case ms of Nothing -> prune >> error "This is never reached" Just (s, e') -> s <$ put e' lstat :: MonadIO m => CondT FileEntry m FileStatus lstat = applyStat (Just False) stat :: MonadIO m => CondT FileEntry m FileStatus stat = applyStat (Just True) hasStatus :: MonadIO m => (FileStatus -> Bool) -> CondT FileEntry m () hasStatus f = guard . f =<< applyStat Nothing regular :: MonadIO m => CondT FileEntry m () regular = hasStatus isRegularFile executable :: MonadIO m => CondT FileEntry m () executable = hasMode ownerExecuteMode directory :: MonadIO m => CondT FileEntry m () directory = hasStatus isDirectory hasMode :: MonadIO m => FileMode -> CondT FileEntry m () hasMode m = hasStatus (\s -> fileMode s .&. m /= 0) withStatusTime :: MonadIO m => (FileStatus -> EpochTime) -> (UTCTime -> Bool) -> CondT FileEntry m () withStatusTime g f = hasStatus (f . posixSecondsToUTCTime . realToFrac . g) lastAccessed_ :: MonadIO m => (UTCTime -> Bool) -> CondT FileEntry m () lastAccessed_ = withStatusTime accessTime lastModified_ :: MonadIO m => (UTCTime -> Bool) -> CondT FileEntry m () lastModified_ = withStatusTime modificationTime regex :: Monad m => String -> CondT FileEntry m () regex pat = filename_ (=~ pat) -- | Return all entries, except for those within version-control metadata -- directories (and not including the version control directory itself either). ignoreVcs :: Monad m => CondT FileEntry m () ignoreVcs = when_ (filename_ (`elem` vcsDirs)) prune where vcsDirs = [ ".git", "CVS", "RCS", "SCCS", ".svn", ".hg", "_darcs" ] -- | Find every entry whose filename part matching the given filename globbing -- expression. For example: @glob "*.hs"@. glob :: Monad m => String -> CondT FileEntry m () glob g = case parseOnly globParser (pack g) of Left e -> error $ "Failed to parse glob: " ++ e Right x -> regex ("^" <> unpack x <> "$") where globParser :: Parser Text globParser = fmap mconcat $ many $ char '*' *> return ".*" <|> char '?' *> return "." <|> string "[]]" *> return "[]]" <|> (\x y z -> pack ((x:y) ++ [z])) <$> char '[' <*> manyTill anyChar (A.try (char ']')) <*> char ']' <|> do x <- anyChar return . pack $ if x `elem` (".()^$" :: String) then ['\\', x] else [x] #if LEAFOPT type DirCounter = IORef LinkCount newDirCounter :: MonadIO m => m DirCounter newDirCounter = liftIO $ newIORef 1 #else type DirCounter = () newDirCounter :: MonadIO m => m DirCounter newDirCounter = return () #endif -- | Find file entries in a directory tree, recursively, applying the given -- recursion predicate to the search. This conduit yields pairs of type -- @(FileEntry, a)@, where is the return value from the predicate at each -- step. sourceFindFiles :: (MonadIO m, MonadResource m) => FindOptions -> FilePath -> CondT FileEntry m a -> Producer m (FileEntry, a) sourceFindFiles findOptions startPath predicate = do startDc <- newDirCounter walk startDc (newFileEntry (encodeString startPath) 0 findOptions) (encodeString startPath) predicate where walk :: MonadResource m => DirCounter -> FileEntry -> FP.FilePath -> CondT FileEntry m a -> Producer m (FileEntry, a) walk !dc !entry !path !cond = do ((!mres, !mcond), !entry') <- lift $ applyCondT entry cond let opts' = entryFindOptions entry this = unless (findIgnoreResults opts') $ yieldEntry entry' mres next = walkChildren dc entry' path mcond if findContentsFirst opts' then next >> this else this >> next where yieldEntry _ Nothing = return () yieldEntry entry' (Just res) = yield (entry', res) walkChildren :: MonadResource m => DirCounter -> FileEntry -> FP.FilePath -> Maybe (CondT FileEntry m a) -> Producer m (FileEntry, a) walkChildren _ _ _ Nothing = return () -- If the conditional matched, we are requested to recurse if this is a -- directory walkChildren !dc !entry !path (Just !cond) = do st <- lift $ checkIfDirectory dc entry path when (fmap isDirectory st == Just True) $ do #if LEAFOPT -- Update directory count for the parent directory. liftIO $ modifyIORef dc pred -- Track the directory count for this child path. let leafOpt = findLeafOptimization (entryFindOptions entry) let lc = linkCount (fromJust st) - 2 opts' = (entryFindOptions entry) { findLeafOptimization = leafOpt && lc >= 0 } dc' <- liftIO $ newIORef lc #else let dc' = dc opts' = entryFindOptions entry #endif CF.sourceDirectory path =$= awaitForever (go dc' opts') where go dc' opts' fp = let entry' = newFileEntry fp (succ (entryDepth entry)) opts' in walk dc' entry' fp cond -- Return True if the given entry is a directory. We can sometimes use -- "leaf optimization" on Linux to answer this question without performing -- a stat call. This is possible because the link count of a directory is -- two more than the number of sub-directories it contains, so we've seen -- that many sub-directories, the remaining entries must be files. checkIfDirectory :: MonadResource m => DirCounter -> FileEntry -> FP.FilePath -> m (Maybe FileStatus) checkIfDirectory !dc !entry !path = do #if LEAFOPT let leafOpt = findLeafOptimization (entryFindOptions entry) doStat <- if leafOpt then (> 0) <$> liftIO (readIORef dc) else return True #else let doStat = dc == () -- to quiet hlint warnings #endif let opts = entryFindOptions entry if doStat then liftIO $ statFilePath (findFollowSymlinks opts) (findIgnoreErrors opts) path else return Nothing findFiles :: (MonadIO m, MonadBaseControl IO m, MonadThrow m) => FindOptions -> FilePath -> CondT FileEntry m a -> m () findFiles opts path predicate = runResourceT $ sourceFindFiles opts { findIgnoreResults = True } path (hoist lift predicate) $$ sinkNull -- | A simpler version of 'findFiles', which yields only 'FilePath' values, -- and ignores any values returned by the predicate action. findFilePaths :: (MonadIO m, MonadResource m) => FindOptions -> FilePath -> CondT FileEntry m a -> Producer m FilePath findFilePaths opts path predicate = mapOutput decodeString $ sourceFindFiles opts path predicate =$= mapC (entryPath . fst) -- | Calls 'findFilePaths' with the default set of finding options. -- Equivalent to @findFilePaths defaultFindOptions@. find :: (MonadIO m, MonadResource m) => FilePath -> CondT FileEntry m a -> Producer m FilePath find = findFilePaths defaultFindOptions -- | Test a file path using the same type of predicate that is accepted by -- 'findFiles'. test :: MonadIO m => CondT FileEntry m () -> FilePath -> m Bool test matcher path = Cond.test (newFileEntry (encodeString path) 0 defaultFindOptions) matcher -- | Test a file path using the same type of predicate that is accepted by -- 'findFiles', but do not follow symlinks. ltest :: MonadIO m => CondT FileEntry m () -> FilePath -> m Bool ltest matcher path = Cond.test (newFileEntry (encodeString path) 0 defaultFindOptions { findFollowSymlinks = False }) matcher