{- Dephd - decode phd (phred output) files. Generates seq/qual output, quality plots, or rankings. -} module Main where import Control.Concurrent import Control.Monad import Data.Char import Data.List (tails,groupBy,sortBy) import Data.Maybe import System.Console.GetOpt import System.Directory import System.Environment (getArgs) import System.Exit import System.IO import System.IO.Unsafe import System.Process import Text.Printf import Text.Regex import qualified Data.ByteString.Lazy.Char8 as B import Bio.Sequence -- ------------------------------------------------------------ -- Option Handling -- ------------------------------------------------------------ data MyOpts = O { actions :: [(FilePath,Sequence) -> IO ()] -- ^ Apply to each sequence , outputs :: [Handle] -- ^ Output handles that must be closed , filters :: (FilePath,Sequence) -> (FilePath,Sequence) -- ^ Filter sequences before processing , inputs :: [String] -> IO [(FilePath,Sequence)] -- ^ Turn args into sequences , verbose :: Bool -- ^ Verbose output (progress reporting) and sequence trimming } defaultopts :: MyOpts defaultopts = O { actions = [], outputs = [], filters = id , inputs = readFiles, verbose = False } getOptions :: IO (MyOpts, [String], [String]) getOptions = do (os,ns,es) <- getArgs >>= (return . getOpt Permute options) os' <- foldl (>>=) (return defaultopts) os return (os',ns,es) options :: [OptDescr (MyOpts -> IO MyOpts)] options = [ Option ['v'] [] (NoArg (\opt -> return opt {verbose = True})) "Verbose output" , Option ['h'] ["help"] (NoArg (\_ -> do {putStrLn (usage []); exitWith ExitSuccess})) "Display usage information" -- Output options , Option ['R'] ["output-ranks"] (ReqArg setrank "file") "Set ranked output file" , Option ['F'] ["output-fasta"] (ReqArg setfasta "file") "Set fasta output file" , Option ['Q'] ["output-qual"] (ReqArg setqual "file") "Set quality output file" , Option ['P'] ["output-plot"] (NoArg (setplot False)) "Generate quality plots" , Option ['X'] ["output-xplot"] (NoArg (setplot True)) "Display quality plots" -- Filter options , Option ['t'] ["filter-trim"] (NoArg filterTrim) "Trim output sequences" , Option ['q'] ["filter-qual"] (NoArg filterQual) "Mask by quality" -- Input options , Option [] ["input-dirs"] (NoArg (\opt -> return opt { inputs = readDirs })) "Read directories containing PHD files" , Option [] ["input-list"] (NoArg inputList) "Read the files listed in an index file" , Option ['i'] ["input-fasta-qual"] (NoArg inputFQ) "Read a fasta- and optionally a qual file" ] where inputList opt = return $ opt { inputs = \args -> case args of [arg] -> do s <- readFile arg readFiles $ lines s _ -> error ("Please specify only one file for list input: "++show args)} inputFQ opt = return $ opt { inputs = \arg -> do ss <- case arg of [fa,q] -> readFastaQual fa q [fa] -> readFasta fa _ -> error ("Too many files specified for fasta/qual input: "++show arg) return $ map (\s -> (toStr $ seqlabel s, s)) ss } addFilter act opt = let f = filters opt in return $ opt { filters = f . act } filterTrim = addFilter trimSeq filterQual = addFilter (\(p,s) -> (p,qualAdjust s)) addAction act arg opt = let as = actions opt hs = outputs opt in do h <- openFile arg WriteMode return opt { actions = act h:as , outputs = h:hs } setrank = addAction (\h -> hPutStrLn h . unwords {- columns -} . qualchk . snd) setfasta = addAction (\h -> hWriteFasta h . return . snd) setqual = addAction (\h -> hWriteQual h . return . snd) setplot b opt = do g <- hasgp if g then let as = actions opt in return $ opt { actions = plot b : as } else error ("You requested quality plots, but I can't find 'gnuplot' in your search path.\n") trimSeq :: (FilePath,Sequence) -> (FilePath,Sequence) trimSeq (i,s@(Seq h d mq)) = case trims s of [t1,t2] -> let clip = B.take (fromIntegral t2-fromIntegral t1) . B.drop (fromIntegral t1) in (i,Seq h (clip d) (case mq of Nothing -> Nothing Just q -> Just (clip q))) _ -> (i,s) -- todo: clip only 'n's? hasgp :: IO Bool hasgp = return . isJust =<< findExecutable "gnuplot" readDirs, readFiles :: [FilePath] -> IO [(FilePath,Sequence)] readDirs dirs = mapM' myReadPhd =<< filterM isPhdFile =<< return . concat =<< mapM myGetDirectoryContents =<< filterM doesDirectoryExist dirs readFiles = mapM' myReadPhd mapM' :: (a -> IO b) -> [a] -> IO [b] mapM' _ [] = hPutStrLn stderr "Warning: no files found" >> return [] mapM' action xs = mapM action xs -- ------------------------------------------------------------ -- Processing -- ------------------------------------------------------------ -- | Quality is divided into four categories, mainly based on length of -- a sliding average of quality greater than 20. data Quality = Junk | Poor | Good | Excl deriving (Show,Eq) main :: IO () main = do (opts,fs,errs) <- getOptions when (not $ null errs) $ error $ usage errs let process xs = if not (verbose opts) then return xs else countIO ("processing "++show (length xs)++" sequences: ") ", done.\n" 100 xs mapM_ (sequence_ . zipWith ($) (actions opts) . repeat) =<< return . map (filters opts) =<< process =<< (inputs opts) fs mapM_ hClose (outputs opts) phd_rx :: Regex phd_rx = mkRegex ".phd(.[0-9]+)?$" isPhdFile :: FilePath -> IO Bool isPhdFile fn = do exists <- doesFileExist fn let matches = matchRegex phd_rx fn return (exists && isJust matches) myGetDirectoryContents :: FilePath -> IO [FilePath] myGetDirectoryContents d = return . map ((d++"/")++) =<< getDirectoryContents d -- | Adjust sequence content according to quality. -- Upper case is >20 and sliding avg >30 qualAdjust :: Sequence -> Sequence qualAdjust (Seq _ _ Nothing) = error "no quality data - impossible!" qualAdjust (Seq l d (Just q)) = Seq l (B.unfoldr conv avgs) (Just q) where avgs = (sliding_avg 1 q, sliding_avg 20 q, d) conv (a:as,s:ss,dd) = Just (if a>20 && s>25 then toUpper (B.head dd) else if a<4 || s<9 then 'n' else toLower (B.head dd),(as,ss,B.tail dd)) conv ([],[],dd) | B.null dd = Nothing -- else broken invariant conv _ = error "internal error in 'qualAdjust/conv'" myReadPhd :: FilePath -> IO (FilePath,Sequence) myReadPhd f = unsafeInterleaveIO (do p <- readPhd f p `seq` return (f,p)) usage :: [String] -> String usage errs = usageInfo msg options where msg = (if (not $ null errs) then concat errs++"\n" else "dephd: analyze phd files (phred output)\n") ++"Usage: dephd -[RFQPX] [phdfile..]\n" ++" dephd -[RFQPX] --input-dirs [phddir..]\n" -- | Align columns: should be a standard function? columns :: [[String]] -> [String] columns ls = map (pad (head lens:map negate (tail lens))) ls where lens = collens ls pad :: [Int] -> [String] -> String pad x = unwords . zipWith padto x where padto l s = if l<0 then replicate (negate l-length s) ' '++s else s ++ replicate (l-length s) ' ' collens :: [[String]] -> [Int] collens ls = let ls' = filter (not . null) ls in if null ls' then [] else (maximum . map (length . head) $ ls'):collens (map tail ls') -- ugly hack myhead :: [Int] -> Int myhead x = if null x then 0 else head x -- | Report (various?) quality estimates qualchk :: Sequence -> [String] qualchk s = [toStr $ seqlabel s ,printf "%.1f" avgqual] ++ map show [s15, s30, a20] ++[show qtot] where s15 = stretch 15 qs s30 = stretch 30 qs a20 = stretch 20 . sliding_avg 20 . seqqual $ s qtot | a20 < 75 && s15 < 50 = Junk | a20 < 150 = Poor | a20 < 250 || s30 < 150 = Good | otherwise = Excl qs = map (fromIntegral . ord) . toStr . seqqual $ s avgqual = sum qs / fromIntegral (seqlength s) :: Double stretch i = myhead . stretches i stretches i = sortOn negate . map (subtract 1 . length) . groupBy (const (>i)) sortOn f = sortBy (\x y -> compare (f x) (f y)) -- | Plot the quality of a sequence in the background bgplot :: Bool -> (FilePath,Sequence) -> IO ThreadId bgplot x = forkIO . plot x -- | Feed the quality data to gnuplot (check if installed?) plot :: Bool -> (FilePath,Sequence) -> IO () plot z (f,s) = case seqlength s of 0 -> hPutStrLn stderr ("cannot plot empty sequence: "++f) >> return () _ -> do (i,o,e,p) <- runInteractiveCommand "gnuplot -persist" hPutStr i header hPutStr i $ unlines $ map (show . ord) $ toStr $ seqqual s hPutStr i "e\n" hPutStr i $ unlines $ map show $ sliding_avg 20 $ seqqual s hPutStr i "e\n" hClose i x <- waitForProcess p hGetContents o >>= hPutStr stderr hGetContents e >>= hPutStr stderr case x of ExitSuccess -> return () ExitFailure j -> hPutStr stderr (errmsg++show j) >> return () where name = toStr (seqlabel s) f' = subRegex phd_rx f ".jpg" arrow (i,c) = "set arrow "++show i++" from "++show c++",0 to "++show c ++",20 nohead\n" mtrim = concatMap arrow $ zip [(1::Int)..] (trims s) header = (if z then "" else "set term jpeg\nset out \""++f'++"\"\n") ++"set xlabel \"position\"\nset ylabel \"quality\"\n" ++ mtrim ++ "set title \""++name++"\"\nset yrange [0:75]\nplot \"-\" t \"qual\" with points,\"-\" t \"avg\" with lines, 20 with lines t \"thresh\"\n" errmsg = "'gnuplot' failed for "++f++" with exit code " trims :: Sequence -> [Int] trims s = case dropWhile ((/=) (B.pack "TRIM:")) (B.words $ seqheader s) of (_:x:y:_) -> [read $ B.unpack x, read $ B.unpack y] _ -> [] -- | Calculate a sliding average. Slightly biased for even i. sliding_avg :: Int -> QualData -> [Double] sliding_avg i q = map avg $ take (fromIntegral $ B.length q) (iblocks i qs) where avg xs = fromIntegral (sum xs) / fromIntegral (length xs) qs = map ord . toStr $ q -- | select centered words iblocks :: Int -> [a] -> [[a]] iblocks i ss = [take n ss | n <- [(i+1) `div` 2..(i-1)]] ++ map (take i) (tails ss) -- copied from RBR countIO :: String -> String -> Int -> [a] -> IO [a] countIO msg post step xs = sequence $ map unsafeInterleaveIO ((blank >> outmsg (0::Int) >> c):cs) where (c:cs) = ct 0 xs output = hPutStr stderr blank = output ('\r':take 70 (repeat ' ')) outmsg x = output ('\r':msg++show x) >> hFlush stderr ct s ys = let (a,b) = splitAt (step-1) ys next = s+step in case b of [b1] -> map return a ++ [outmsg (s+step) >> hPutStr stderr post >> return b1] [] -> map return (init a) ++ [outmsg (s+length a) >> hPutStr stderr post >> return (last a)] _ -> map return a ++ [outmsg s >> return (head b)] ++ ct next (tail b)