{- | FlowSim - simulate 454 flowgrams from a reference genome Flow values are modelled from a log-normal noise, and a sequence of normal distributions, one for each homopolymer length. These distributions can be estimated from real data. Todo: * Paired ends models B-linker-A * Support multiple sequences: - scan seqs + lenghts - gen n numbers (0..total length), transform to deltas (cf. rselect) - sort and generate -} {-# options -XParallelListComp #-} module Main where import Bio.Sequence import Bio.Sequence.SFF import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString as BS2 import Data.Char (toUpper) import Data.Int import Statistics import Generations.GenBase import Config import Control.Monad (when) -- ------------------------------------------------------------ -- picking random postions/directions for reads data Dir = Fwd | Rev deriving (Eq,Enum,Bounded) instance Random Dir where random g = let (b,g') = random g in (if b then Fwd else Rev,g') randomR = error "randomR is undefined for Dir" main :: IO () main = do (gen,is,hplc,o) <- mkconf ss <- is case ss of [] -> error ("Input appears to be empty?") _ -> return () sff <- evalRandIO $ sim454 gen hplc ss n <- writeSFF' o sff whenLoud $ inform ("Wrote "++show n++" reads to '"++o++"'.") -- testing test_s :: Sequence Nuc test_s = Seq (fromStr "foo") (fromStr "aacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacgg") Nothing -- | Heavy lifting. Building an SFF file from the specified information sim454 :: RandomGen g => Generation -> HPLprob -> [Sequence Nuc] -> Rand g SFF sim454 gen hplc ss = do let ch = makeCommonHeader gen tf r = foldr ($) r $ trim_filters gen df r = and $ zipWith ($) (disc_filters gen) (repeat r) rbs <- map tf `fmap` filter df `fmap` mapM (\s -> makeReadBlock gen hplc s ch) ss return (SFF ch rbs) -- Generate a sequence of models -- The generation specifies a series of initial models, this function -- uses these with the degradation to generate a progression of models to use makeModels :: RandomGen g => Generation -> Rand g [Model] makeModels gen = case models gen of [m] -> iterativeWorsen (f_len gen) (degrade gen) m im@(_:_) -> do let ns = f_len gen `div` length im return $ concat $ map (replicate ns) im iterativeWorsen :: RandomGen g => Int -> Distribution -> Model -> Rand g [Model] iterativeWorsen 0 _ _ = return [] iterativeWorsen count deg m = do ps <- perturbs 1 count return $ zipWith (fmap . worsen) ps $ replicate count m where perturbs _ 0 = return [] perturbs cur n = do w <- sample deg let new = cur+w rest <- perturbs new (n-1) return (new : rest) -- -------------------------------------------------- -- Generate the flow values -- generate the "absolute" flow values from the cycle and the origin sequence data makeFlows :: [Char] -> SeqData -> [(Char,Int)] makeFlows c s | B.null s = [] | otherwise = let (c1,s1) = makeCycle [] (take 4 c) s s2 = case B.uncons s1 of Just (x,_) -> if not (toUpper x `elem` "ACGT") then B.tail s1 else s1; _ -> s1 in c1 ++ makeFlows c s2 makeCycle :: [(Char,Int)] -> [Char] -> SeqData -> ([(Char,Int)],SeqData) makeCycle acc [] s = (reverse acc, s) makeCycle acc (c:cs) s = let (this,rest) = B.span ((==toUpper c).toUpper) s in makeCycle ((c,fromIntegral $ B.length this):acc) cs rest -- | Takes a sequence of flows with exact homopolymer lengths, -- permutes and calls them. Next step is prepareData permuteAndCall :: RandomGen g => Generation -> HPLprob -> [(Char,Int)] -> Rand g [(Flow,[Char],[Qual])] permuteAndCall gen hplc fs = do -- when (trace (show fs) False) (return ()) let n = fromIntegral $ f_len gen flow_cont = drop (length fs `mod` length (f_cycle gen)) $ cycle (f_cycle gen) xs = fs ++ zip flow_cont (replicate (n-length fs) 0) ms <- makeModels gen -- if and $ zipWith (==) (map fst xs) (cycle $ f_cycle gen) then return () -- else error (map fst xs ++ "\n don't cycle: "++f_cycle gen) sequence $ take n $ zipWith (my_call (qcall gen) hplc) ms xs -- Titanium base/quality calling my_call :: RandomGen g => QualMethod -> HPLprob -> Model -> (Char,Int) -> Rand g (Flow,[Char],[Qual]) my_call qm ft md (c,hpl) = do f <- sample (md hpl) let fi = max 0 . round . (100*) $ f n = fromIntegral ((fi+50) `div` 100) cs = replicate n c qs = qm ft md (c,fi) when (length qs /= n) $ error ("Qual mismatch: "++show (f,fi,n,cs,qs)) return (fi, cs, qs) -- | Somewhat advanced unzip, calculating the index in the process. -- Also replace three consequtive empty lists with Ns convertCalls :: [(Flow,[Char],[Qual])] -> ([Flow],[Char],[Qual],[Int]) convertCalls = go 0 . addDots where go _ [] = ([],[],[],[]) go p ((f,cs,qs):fs) | null cs = let (f',c',q',i') = go (p+1) fs in (f:f',c',q',i') | otherwise = let (f',c',q',i') = go 0 fs in (f:f',cs++c',qs++q',(p+1):replicate (length cs-1) 0 ++ i') addDots ((f1,cs1,qs1):(f2,cs2,qs2):(f3,cs3,qs3):(f4,cs4,qs4):rest) | null cs1 && null cs2 && null cs3 && not (null cs4) = (f1,cs1,qs1):(f2,cs2,qs2):(f3,"N",[0]):addDots ((f4,cs4,qs4):rest) | otherwise = (f1,cs1,qs1) : addDots ((f2,cs2,qs2):(f3,cs3,qs3):(f4,cs4,qs4):rest) addDots fewerThanThree = fewerThanThree -- | Generate a ReadBlock -- direction and position chosen at random, and encoded in the read name makeReadBlock :: RandomGen g => Generation -> HPLprob -> Sequence Nuc -> CommonHeader -> Rand g ReadBlock makeReadBlock g hplc sq ch = do let sdata = seqdata sq rn = BS.concat $ BL.toChunks $ seqlabel sq fs = makeFlows (BS.unpack $ flow ch) $ BL.concat [BL.fromChunks [key ch], {- B.take (floor cl) $ -} sdata] (pfs,cs,qs,is) <- convertCalls `fmap` permuteAndCall g hplc fs return $ verifyRB (flow_length ch) $ ReadBlock { read_header = ReadHeader { name_length = fromIntegral $ BS.length rn -- :: Int16 , num_bases = fromIntegral $ length cs -- :: Int32 , clip_qual_left = 5 -- :: Int16 , clip_qual_right = fromIntegral $ length cs -- :: Int16 , clip_adapter_left = 0 -- :: Int16 , clip_adapter_right = 0 -- :: Int16 , read_name = rn -- :: ByteString } -- The data block , flow_data = packFlows pfs -- :: [Flow] , flow_index = BS2.pack $ map fromIntegral is -- :: ByteString , bases = fromStr cs -- :: SeqData , quality = BL.pack qs -- :: QualData } -- | Consistency check on generated ReadBlocks. verifyRB :: Int16 -> ReadBlock -> ReadBlock verifyRB fl rb | name_length rh == 0 = err "name_length is zero" | num_bases rh == 0 = err "num_bases is zero" -- | clip_qual_left rh > clip_qual_right rh = err "clipping gives negative sequence" | BS2.length (read_name rh) /= (fromIntegral $ name_length rh) = err "read_name has incorrect length" | BS.length (flow_data rb) `div` 2 /= fromIntegral fl = err ("Number of flows ("++show (BS.length (flow_data rb) `div` 2)++")do not match flow_length of "++show fl++" in CommonHeader") | BS.length (flow_data rb) `div` 2 < (fromIntegral $ sum $ BS2.unpack $ flow_index rb) = err "flow_index longer than flows" | B.length (bases rb) /= (fromIntegral $ BS.length $ flow_index rb) = err ("bases ("++show (B.length (bases rb))++") and flow_index ("++show (BS.length $ flow_index rb)++") have different lengths") | B.length (quality rb) /= (fromIntegral $ BS.length $ flow_index rb) = err ("quality ("++show (B.length (quality rb))++") and flow_index ("++show (BS.length $ flow_index rb)++") have different lengths") | otherwise = rb where rh = read_header rb err str = error (str ++ "\n" ++ show rb)