-- FlowEr - FLOWgram ExtractoR module Main (main) where import Bio.Sequence.SFF import Bio.Sequence.Fasta import Bio.Sequence.FastQ import Print import System.IO (stdout) import System.Environment (getArgs) import Numeric (showFFloat) import Data.List (intersperse, partition) import Data.ByteString.Char8 (pack,unpack,ByteString) import qualified Data.ByteString.Char8 as B import qualified Data.ByteString as B1 import qualified Data.ByteString.Lazy as L1 import Data.Array.Unboxed import Data.Array.ST import Control.Monad.ST main :: IO () main = do args <- getArgs let (opts,files) = partition (\p -> case p of ('-':_) -> True; _ -> False) args case opts of ["-r"] -> mapM_ (\f -> hWriteFasta stdout . sffToSequence =<< readSFF f) files ["-R"] -> mapM_ (\f -> writeFastaQual (f++".fasta") (f++".qual") . sffToSequence =<< readSFF f) files ["-q"] -> mapM_ (\f -> hWriteFastQ stdout . sffToSequence =<< readSFF f) files ["-f"] -> L1.putStrLn . L1.fromChunks . intersperse (B.pack "\n") . concat =<< mapM showflow files ["-h"] -> mapM_ (\f -> (putStr . sffToHistogram) =<< readSFF f) files ["-s"] -> mapM_ (\f -> summarize =<< readSFF f) files _ -> error ("Usage: flower -[f|q|r|R] [ ..]\n" ++" -r output reads in Fasta format\n" ++" -R output reads in Fasta format with associated .qual\n" ++" (generates files instead of writing to )\n" ++" -q output in FastQ format\n" ++" -f output the flowgram in tabular format\n" ++" -h output a histogram table of flow values\n" ++" -s output a summary of each read" ) -- ---------------------------------------------------------- summarize :: SFF -> IO () summarize (SFF _rh rs) = do putStrLn "# name........\tdate......\ttime....\treg\tx_loc\ty_loc\tlen\tqual" L1.putStrLn . toLazyByteString . mconcat . map sum1 $ rs -- todo: date and time are usually constants! sum1 :: ReadBlock -> Builder sum1 r = let rh = read_header r nb = num_bases rh h = read_name rh rn = decodeReadName h (y,m,d) = date rn reg = region rn (hh,mm,ss) = time rn in mconcat ([fromByteString h, tb, putDate y m d, tb, putTime hh mm ss, tb, putInt2 reg ,tb, putInt (fromIntegral $ x_loc rn), tb, putInt (fromIntegral $ y_loc rn), tb, putInt (fromIntegral nb) ,tb, fromByteString (fi $ quals $ flowgram r), nl]) -- | Take the fractional parts of the flows, and sum their squares quals :: [Flow] -> Flow quals q = floor $ (*(100/fromIntegral (length q))) $ sqrt $ sum $ map (fromIntegral . (^2) . (flip (-) 50) . (`mod` 100) . (+50)) $ q tb, nl :: Builder tb = char '\t' nl = char '\n' -- these are clumsy, since we just might need the file name showflow :: FilePath -> IO [ByteString] showflow f = return . {- map (\s -> B.concat [B.pack f,t,s]) . -} showrun =<< readSFF f fi :: Flow -> ByteString fi = (!) farray farray :: Array Flow ByteString farray = listArray (0,10000) [B.pack (showFFloat (Just 2) i "") | i <- [0,0.01..99.99::Double]] showrun :: SFF -> [ByteString] showrun (SFF h rs) = concatMap (showread h) rs tab :: ByteString tab = B.pack "\t" showread :: CommonHeader -> ReadBlock -> [ByteString] showread h rd = let rn = read_name $ read_header rd qgroups = qgroup (B1.unpack $ flow_index rd) (L1.unpack $ quality rd) format p c v q = B.concat [rn,tab,B.pack (show p),tab,B.pack [c],tab,fi v,tab,B.pack (show q)] in zipWith4 format [(1::Int)..] (unpack $ flow h) (flowgram rd) qgroups zipWith4 :: (a -> b -> c -> d -> e) -> [a] -> [b] -> [c] -> [d] -> [e] zipWith4 f (a:as) (b:bs) (c:cs) (d:ds) = f a b c d : zipWith4 f as bs cs ds zipWith4 _ _ _ _ _ = [] -- | Take the unpacked index_offsets and quality values, and return -- a list of groups of quality values, each group corresponding to a flow value. -- Flow values < 0.5 result in empty groups. qgroup :: [Index] -> [Qual] -> [[Qual]] qgroup [] [] = let rest = []:rest in rest qgroup is@(1:_) qs = let (iz,irest) = span (==0) (tail is) (q1,qrest) = splitAt (length iz+1) qs in q1 : qgroup irest qrest qgroup (i:is) qs = [] : qgroup (i-1:is) qs sffToHistogram :: SFF -> String sffToHistogram (SFF h rs) = showHist . histogram (B.unpack $ flow h) . map flowgram $ rs type Hist = UArray Flow Int histogram :: String -> [[Flow]] -> (Hist,Hist,Hist,Hist) histogram fl scores = runST $ do let zero = newArray (0,9999) 0 :: ST s (STUArray s Flow Int) a <- zero c <- zero g <- zero t <- zero let ins1 ('A',i) = bump a i ins1 ('C',i) = bump c i ins1 ('G',i) = bump g i ins1 ('T',i) = bump t i ins1 (x,_) = error ("Illegal character "++show x++" in flow!") bump ar i = readArray ar i >>= \x -> writeArray ar i (x+1) mapM_ ins1 (zip (cycle fl) (concat scores)) a' <- unsafeFreeze a c' <- unsafeFreeze c g' <- unsafeFreeze g t' <- unsafeFreeze t return (a',c',g',t') showHist :: (Hist,Hist,Hist,Hist) -> String showHist (as,cs,gs,ts) = "Score\tA\tC\tG\tT\tsum\n" ++ unlines [concat $ intersperse "\t" $ (showFFloat (Just 2) (fromIntegral sc/100::Double) "") : map show [as!sc,cs!sc,gs!sc,ts!sc, as!sc+cs!sc+gs!sc+ts!sc] | sc <- [0..9999]]