{-# LANGUAGE DeriveDataTypeable #-} module Main where import Bio.Sequence.SFF hiding (trim) import Bio.Util (countIO) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC import qualified Data.IntMap as M import Data.IntMap (IntMap) -- import Bloom import qualified Data.IntSet as S import Data.IntSet (IntSet) import Text.Printf import System.IO import Control.Monad (when) import System.Console.CmdArgs type FingerPrints = IntSet type DupMap = IntMap [ReadBlock] data ResultList = Then ReadBlock ResultList | EndWith DupMap splitRes :: ResultList -> ([ReadBlock], DupMap) splitRes (Then x rs) = let (ys,e) = splitRes rs in (x:ys,e) splitRes (EndWith e) = ([],e) -- trim :: ReadBlock -> ReadBlock -- trim = id -- trimFromTo 4 10000 <- this trims to last base called position! (use trimKey?) trim :: ReadBlock -> ReadBlock trim rb = let rh = read_header rb in trimFromTo (clip_qual_left rh) (clip_qual_right rh) rb data Options = O { thresh :: Double , fplen :: Int , summarize :: FilePath , clusters :: Bool , input :: [FilePath] } deriving (Data,Typeable,Show,Eq) mymode :: Options mymode = O { thresh = 50 &= help "similarity threshold" , fplen = 20 &= help "fingerprint size" , summarize = def &= {- empty "-" &= -} help "output cluster summary" , clusters = True &= help "output complete clusters" , input = def &= args &= typFile } &= program "flowt" &= summary "flowt v0.7 - filter out reads from duplicate clones in 454 sequencing." vlog :: Bool -> String -> IO () vlog v s = when v (hPutStr stderr s >> hFlush stderr) main :: IO () main = do opts <- cmdArgs mymode -- putStrLn $ show opts verb <- isLoud vlog verb "Building the fingerprint index" let sff = case input opts of [x] -> x; _ -> error "You need to specify a (single) file name!\n(or use 'flowt -h' for help)." dups <- mkbf opts sff vlog verb (seq dups "...done!") SFF hs rs <- readSFF sff let (uqs,ds) = splitRes $ filter_unique opts dups rs writeSFF' "unique.sff" =<< SFF hs `fmap` (if verb then countIO "Output unique: " "..done!" 100 else return) uqs when (not . null . summarize $ opts) (gensum (summarize opts) ds) let cg = concatMap (clusterGroups opts) (M.elems ds) writeSFF' "duplic.sff" =<< SFF hs `fmap` (if verb then countIO "Output cluster reps: " "..done!" 100 else return) (map head cg) when (clusters opts) $ BC.writeFile "clusters.txt" $ BC.unlines $ map (BC.unwords . map (read_name . read_header)) $ cg return () gensum :: String -> IntMap [ReadBlock] -> IO () gensum f ds = write $ unlines $ map showcluster $ M.assocs ds where write = if f == "-" then putStrLn else writeFile f showcluster (k,v) = let a = map fromIntegral $ flowgram $ trim $ head v -- averageflow v in printf "%16x" k++":\t" ++ show (length v) ++ unwords (map (BC.unpack . read_name . read_header) v) ++ "\n" ++ unlines [let fs = flowgram (trim x) in printf "%8.1f " (dist a $ map fromIntegral fs) ++ concatMap (printf "%3d ") fs | x <- v] filter_unique :: Options -> FingerPrints -> [ReadBlock] -> ResultList filter_unique opts dups = go M.empty where go dm (r:rs) = let fp = fingerprint (fplen opts) r in if fp `S.member` dups then let dm' = myinsert fp r dm in dm' `seq` go dm' rs else r `Then` go dm rs go dm [] = EndWith dm myinsert fp r dm = let v = M.findWithDefault [] fp dm v' = r:v in v `seq` v' `seq` dm `seq` M.insert fp v' dm mkbf :: Options -> FilePath -> IO FingerPrints mkbf opts sff = do SFF _ rs <- readSFF sff let go seen dup (fp:rest) = if fp `S.member` seen then go seen (S.insert fp dup) rest else go (S.insert fp seen) dup rest go _ dup [] = dup return $ go S.empty S.empty $ map (fingerprint $ fplen opts) rs -- | Calculating a fingerprint - basically just a hash of the first 20 elements of the flow_index. On 64 bits, it is -- possible to use more, this is a sensitivity/specificity tradeoff. fingerprint :: Int -> ReadBlock -> Int fingerprint fpl = foldr (\x y -> y*3+x-1) 0 . map fromIntegral . B.unpack . B.take fpl . B.filter (/=0) . flow_index . trim -- trim == drop 4 for the TCAG key, then 3^20 ~ 2^32 for range (or should that be 4^16?) -- | Align each cluster and merge reads that appear to be from the same clone. cluster :: Options -> DupMap -> [ReadBlock] cluster opts = concatMap (map mergeCG . clusterGroups opts) . M.elems mergeCG :: [ReadBlock] -> ReadBlock mergeCG = head -- todo: build a consensus flowgram and base call it. clusterGroups :: Options -> [ReadBlock] -> [[ReadBlock]] clusterGroups opts (c:cs) = let (this,rest) = span ((<= thresh opts) . matches c) cs in (c:this) : clusterGroups opts rest clusterGroups _ [] = [] -- crude flowgram match check, valid range? matches :: ReadBlock -> ReadBlock -> Double matches old new = let f = map fromIntegral . flowgram . trim in dist (f old) (f new) -- Flowgram distance, not including zeroes at end of flowgram dist :: [Double] -> [Double] -> Double dist as bs = go (reverse as) (reverse bs) where go (x:xs) (y:ys) = if x==0 || y==0 then go xs ys else sum $ zipWith (\i j -> (6*(i-j)/(i+j+4))^(2::Int)) (x:xs) (y:ys) go _ _ = 0 averageflow :: [ReadBlock] -> [Double] averageflow = go . map (map fromIntegral . flowgram . trim) where go [] = [] go xs = avg (map head xs) : go (filter (not . null) $ map tail xs) avg xs = sum xs / fromIntegral (length xs)