{- | CloneSim - fragment a genome sequence to simulate cloning Factored out from flowsim in order to separate clone generation from flow simulation and base calling. -} {-# Language DeriveDataTypeable #-} module Main where import Bio.Sequence hiding ((!)) import Statistics import System.IO (stdin,stdout,stderr,hPutStrLn) import Control.Monad (forM,when) import qualified Data.ByteString.Lazy as B import Data.Int (Int64) import System.Console.CmdArgs import Data.Array type Dist = String data Conf = Conf { lengths :: Dist , count :: Int , input :: [FilePath] } deriving (Data,Typeable,Show) opts :: Conf opts = Conf { lengths = "Uniform 400 800" &= help "model for clone lengths" &= typ "DIST" , count = 10 &= help "number of reads to generate" &= typ "INT" , input = def &= args &= typFile } &= program "clonesim" &= summary version &= details ["Available distributions (DIST):", "Uniform a b, Normal mu sigma, LogNormal mu sigma"] version :: String version = "clonesim v0.2.8, copyright 2010 Ketil Malde" data Dir = Fwd | Rev deriving (Eq,Enum,Bounded) instance Random Dir where random g = let (b,g') = random g in (if b then Fwd else Rev,g') randomR = error "randomR is undefined for Dir" main :: IO () main = do conf <- cmdArgs opts -- print conf let inf = case input conf of [] -> hPutStrLn stderr "clonesim: reading from stdin" >> hReadFasta stdin [x] -> readFasta x _ -> error "Please specify only a single input file" seq inf $ return () -- force the above warning ss <- map (defragSeq . castToNuc) `fmap` inf when (null ss) $ error "No sequences in input, exiting!" let sa = listArray (0,length ss-1) ss la = listArray (0,length ss) $ scanl (+) 0 $ map seqlength ss hWriteFasta stdout =<< evalRandIO (simulate conf sa la) type SeqArray = Array Int (Sequence Nuc) type LenArray = Array Int Int64 -- | the real 'main' simulate :: RandomGen g => Conf -> SeqArray -> LenArray -> Rand g [Sequence Nuc] simulate conf sa sl = do let ldist = read $ lengths conf (_,alz) = bounds sl top = fromIntegral (sl!alz) forM [1..count conf] $ const (do n <- round `fmap` sample (Uniform 0 top) let i = bsearch sl n p = n - sl!i s = sa!i l <- round `fmap` sample ldist dir <- getRandom return (mkClone dir p l s)) -- bsearch - returns largest array entry less than search value bsearch :: LenArray -> Int64 -> Int bsearch a v = go (bounds a) where go (x,y) | v >= a!y = y | y == x+1 = x | a!m <= v = go (m,y) | a!m > v = go (x,m) where m = (x+y) `div` 2 mkClone :: Dir -> Int64 -> Int64 -> Sequence Nuc -> Sequence Nuc mkClone dir pos len (Seq h s mq) = Seq (fromStr (toStr h++":"++show pos++(case dir of Fwd -> ":fwd"; Rev -> ":rev")++" clonesim")) sd qual where sd = B.take len . (case dir of Fwd -> snd; Rev -> revcompl' . fst) . B.splitAt pos $ s qual = B.take len `fmap` case dir of Fwd -> fst; Rev -> snd `fmap` B.splitAt pos `fmap` mq