-- FlowEr - FLOWgram ExtractoR module Main (main) where import Bio.Sequence.SFF import Bio.Sequence.SFF_filters import Bio.Sequence.Fasta import Bio.Sequence.FastQ import Bio.Util (countIO) import Print import Text.Printf import System.IO (stdout) import System.Environment (getArgs) import Numeric (showFFloat) import Data.Char (toLower) import Data.List (intersperse, partition) import Data.ByteString.Char8 (pack,unpack,ByteString) import qualified Data.ByteString.Char8 as B import qualified Data.ByteString as B1 import qualified Data.ByteString.Lazy.Char8 as L import qualified Data.ByteString.Lazy as L1 import Data.Array.Unboxed import Data.Array.ST import Control.Monad.ST import Metrics main :: IO () main = do args <- getArgs let (opts,files) = partition (\p -> case p of ('-':_) -> True; _ -> False) args reader :: String -> IO SFF reader f = if "-v" `elem` opts then do SFF h rs <- readSFF f rs' <- countIO "reads: " "done" 20 rs return (SFF h rs') else readSFF f writer :: SFF -> IO () writer = case filter (/="-v") opts of ["-r"] -> \s@(SFF ch _) -> hWriteFasta stdout . trim_keys ch . sffToSequence $ s ["-R"] -> writeFastaQual "flower.fasta" "flower.qual" . sffToSequence ["-q"] -> hWriteFastQ stdout . sffToSequence ["-f"] -> L1.putStrLn . L1.fromChunks . intersperse (B.pack "\n") . showflow ["-h"] -> putStr . sffToHistogram ["-H"] -> putStr . sffToHistogramClip ["-i"] -> putStr . getHeader ["-s"] -> summarize [] -> dumpText _ -> error ("Usage: flower -[f|h|H|i|q|r|R|s] [ ..]\n" ++" -r output reads in Fasta format\n" ++" -R output reads in Fasta format with associated .qual\n" ++" (generates files instead of writing to )\n" ++" -q output in FastQ format\n" ++" -f output the flowgram in tabular format\n" ++" -h output a histogram table of flow values\n" ++" -H output a histogram of flows after clipping\n" ++" -i output header information\n" ++" -s output a summary of each read" ) writer `seq` mapM_ (\f -> writer =<< reader f) files -- trim keys if they match, eliminate reads that don't. trim_keys _ [] = [] trim_keys ch (s:ss) = maybe id ((:) . id) (trimKey ch s) $ trim_keys ch ss -- ------------------------------------------------------------ -- No option - dump as text format -- ------------------------------------------------------------ dumpText :: SFF -> IO () dumpText (SFF _ rs) = putStr . concat . map toText $ rs where toText :: ReadBlock -> String toText r = concat [ gt, B.unpack (read_name rh), nl , maybe "" ((\s->info++s++nl) . formatRN) $ decodeReadName (read_name rh) , let (lf,rt) = (clip_adapter_right rh, clip_adapter_left rh) in if lf /= 0 || rt /= 0 then adapter ++ show lf ++ sp++ show rt else "" , clip, show (clip_qual_left rh), sp, show (clip_qual_right rh), nl , flows, unwords $ map show $ flowgram r, nl , index, unwords $ map show $ cumulative_index' r, nl , base, L.unpack (masked_bases' r), nl , qual, unwords $ map show $ L1.unpack (quality r), nl ] where rh = read_header r gt = ">" nl = "\n" sp = " " info = " Info: \t" clip = " Clip: \t" adapter = " Adap: \t" flows = " Flows:\t" index = " Index:\t" base = " Bases:\t" qual = " Quals:\t" formatRN (ReadName (yr,mo,dy) (h,m,s) r x y) = printf "%4d-%02d-%02d %02d:%02d:%02d R%d (%d,%d)" yr mo dy h m s r x y -- Get the bases, but lower case masked bases. Also in biolib's SFF.hs (sans prime), -- so this is only for backwards compatibility with biolib <= 0.4.6. masked_bases' :: ReadBlock -> SeqData masked_bases' rb = let l = fromIntegral $ clip_qual_left $ read_header rb r = fromIntegral $ clip_qual_right $ read_header rb s = bases rb in L.concat [ L.map toLower $ L.take (l-1) s , L.take r (L.drop (l-1) s) , L.map toLower $ L.drop r s] cumulative_index' :: ReadBlock -> [Int] cumulative_index' = scanl1 (+) . map fromIntegral . B1.unpack . flow_index -- ------------------------------------------------------------ -- The -i option: Print header info -- ------------------------------------------------------------ getHeader :: SFF -> String getHeader (SFF h _) = unlines ["Index: \t" ++ show (index_offset h,index_length h) ,"Num_reads:\t" ++ show (num_reads h) ,"Num_flows:\t" ++ show (flow_length h) ,"Key: \t" ++ unpack (key h) ] -- ---------------------------------------------------------- -- The -s option: Summarize each read on one line -- ---------------------------------------------------------- -- | Summarize each read on one line of output summarize :: SFF -> IO () summarize (SFF _rh rs) = do putStrLn "# name........\tdate......\ttime....\treg\ttrim_l\ttrim_r\tx_loc\ty_loc\tlen\tK2\ttrimK2\tncount\tavgQ\ttravgQ\tf:sgnt\tf:q20" L1.putStrLn . toLazyByteString . mconcat . map sum1 $ rs -- todo: date and time are usually constants! sum1 :: ReadBlock -> Builder sum1 r = let rh = read_header r nb = num_bases rh h = read_name rh tr = trim r (rndec1,rndec2) = case decodeReadName h of Just rn -> let ((y,m,d),reg,(hh,mm,ss)) = (date rn,region rn,time rn) in ([putDate y m d, putTime hh mm ss, putInt2 reg] ,[putInt (fromIntegral $ x_loc rn), putInt (fromIntegral $ y_loc rn)]) Nothing -> ([q,q,q],[q,q]) (qleft,qright) = (clip_qual_left rh, clip_qual_right rh) avg_qual q = let l = fromIntegral (L1.length q) in if l>0 then putFix 2 $ sum (map fromIntegral $ L1.unpack q) * 100 `div` l else putFix 2 0 in mconcat $ intersperse tb ([fromByteString h] ++ rndec1 ++ [putInt (fromIntegral qleft), putInt (fromIntegral qright)] ++ rndec2 ++ [putInt (fromIntegral nb) , fromByteString (fi $ quals $ flowgram r), fromByteString (fi $ quals $ flowgram tr) , putInt (n_count r) , avg_qual $ quality r, avg_qual $ quality tr , putInt (flowToBasePos r $ sigint r), putInt (qual20 r)]) ++ [nl] tb, nl, q :: Builder tb = char '\t' nl = char '\n' q = char '?' -- ---------------------------------------------------------- -- The -f option: Output the sequence of flows, one flow per line -- ---------------------------------------------------------- -- | output a list of flows showflow :: SFF -> [ByteString] showflow (SFF h rs) = concatMap (showread h) rs fi :: Flow -> ByteString fi f | f <= 9999 && f >= 0 = farray!f | otherwise = let (i,r) = f `divMod` 100 in B.pack (show i++"."++show r) -- error ("Can't show flow values outside [0..99.99] (You had: "++show f++")") farray :: Array Flow ByteString farray = listArray (0,9999) [B.pack (showFFloat (Just 2) i "") | i <- [0,0.01..99.99::Double]] tab :: ByteString tab = B.pack "\t" showread :: CommonHeader -> ReadBlock -> [ByteString] showread h rd = let rh = read_header rd rn = read_name rh maskFlows = mask rh 1 qgroups . unpack qgroups = qgroup (B1.unpack $ flow_index rd) (L1.unpack $ quality rd) format p c v q = B.concat [rn,tab,B.pack (show p),tab,B.pack [c],tab,fi v,tab,B.pack (init $ drop 1 $ show q)] in zipWith4 format [(1::Int)..] (maskFlows $ flow h) (flowgram rd) qgroups -- lower case based on the clip_qual values mask rh p _ [] = [] -- qgroups are infinite mask rh p (q1:qs) (c:cs) = c' : mask rh (p+length q1) qs cs where c' = if fromIntegral p < clip_qual_left rh || fromIntegral p > clip_qual_right rh then toLower c else c zipWith4 :: (a -> b -> c -> d -> e) -> [a] -> [b] -> [c] -> [d] -> [e] zipWith4 f (a:as) (b:bs) (c:cs) (d:ds) = f a b c d : zipWith4 f as bs cs ds zipWith4 _ _ _ _ _ = [] -- | Take the unpacked index_offsets and quality values, and return -- a list of groups of quality values, each group corresponding to a flow value. -- Flow values < 0.5 result in empty groups. qgroup :: [Index] -> [Qual] -> [[Qual]] qgroup [] [] = let rest = []:rest in rest qgroup is@(1:_) qs = let (iz,irest) = span (==0) (tail is) (q1,qrest) = splitAt (length iz+1) qs in q1 : qgroup irest qrest qgroup (i:is) qs = [] : qgroup (i-1:is) qs -- ---------------------------------------------------------- -- The -h and -H options: Output a histogram of flow values -- ---------------------------------------------------------- -- | Generate a histogram of flow values from an SFF file sffToHistogram, sffToHistogramClip :: SFF -> String sffToHistogram (SFF h rs) = showHist . histogram (B.unpack $ flow h) . map flowgram $ rs sffToHistogramClip (SFF h rs) = showHist . histogram (B.unpack $ flow h) . map clip_flowgram $ rs clip_flowgram rd = let (l,r) = (fromIntegral $ clip_qual_left (read_header rd)-1, fromIntegral $ clip_qual_right (read_header rd)) ps = take r $ B1.unpack $ flow_index rd p1 = fromIntegral $ sum $ take l ps p2 = fromIntegral $ sum $ drop l ps in take p2 $ drop p1 $ flowgram rd type Hist = UArray Flow Int histogram :: String -> [[Flow]] -> (Hist,Hist,Hist,Hist) histogram fl scores = runST $ do let zero = newArray (0,9999) 0 :: ST s (STUArray s Flow Int) a <- zero c <- zero g <- zero t <- zero let ins1 ('A',i) = bump a i ins1 ('C',i) = bump c i ins1 ('G',i) = bump g i ins1 ('T',i) = bump t i ins1 (x,_) = error ("Illegal character "++show x++" in flow!") bump ar i = readArray ar i >>= \x -> writeArray ar i (x+1) mapM_ ins1 (zip (cycle fl) (map (\x->if x>9999 || x<0 then 9999 else x) $ concat scores)) a' <- unsafeFreeze a c' <- unsafeFreeze c g' <- unsafeFreeze g t' <- unsafeFreeze t return (a',c',g',t') showHist :: (Hist,Hist,Hist,Hist) -> String showHist (as,cs,gs,ts) = "Score\tA\tC\tG\tT\tsum\n" ++ unlines [concat $ intersperse "\t" $ showFFloat (Just 2) (fromIntegral sc/100::Double) "" : map show [as!sc,cs!sc,gs!sc,ts!sc, as!sc+cs!sc+gs!sc+ts!sc] | sc <- [0..9999]]