Extract sequences matching a list of labels \begin{code} module Main where import Data.Set hiding (null,filter,partition) import System.Environment (getArgs) import Data.List (foldl',partition) import System.IO usage :: String usage = "xcerpt [-r|-f] [] \n\n" ++ "where is the name of a file containing the IDs\n" ++ "of sequences to extract from the file.\n" ++ "The default is to output matching sequences, -r outputs the non-matching sequences" ++ "With -f, the result ends up in xcerpt.match and xcerpt.rest instead of standard output." main :: IO () main = do (opts,args) <- return . partition (\a->not (null a) && head a=='-') =<< getArgs let dict = case length args of 1 -> hGetContents stdin 2 -> readFile (args!!0) _ -> error usage d <- return . mkdict =<< dict (match_out,rest_out) <- case opts of [] -> return (Just stdout,Nothing) ["-r"] -> return (Nothing,Just stdout) ["-f"] -> do m <- openFile "xcerpt.match" WriteMode r <- openFile "xcerpt.rest" WriteMode return (Just m, Just r) _ -> error usage xcerpt d match_out rest_out (last args) maybe (return ()) hClose match_out maybe (return ()) hClose rest_out mkdict :: String -> Set String mkdict = foldl' (flip insert) empty . words xcerpt :: Set String -> Maybe Handle -> Maybe Handle -> String -> IO () xcerpt dict m r input = do i <- readFile input xtr dict m r $ filter (\l->(not.null) l && head l /= '#') $ lines i xtr _ m r [] = return () xtr _ _ _ [x] = error ("Odd number of lines?\n"++x) xtr d m r (l1:ls) = if head l1 == '>' then let f = if (drop 1 $ head $ words l1) `member` d then m else r in do let (sequence,rest) = break ((=='>').head) ls case f of Just x -> hPutStr x $ unlines (l1:sequence) Nothing -> return () xtr d m r rest else error ("Not a FASTA header:\n"++l1) \end{code}