{- | CloneSim - fragment a genome sequence to simulate cloning Factored out from flowsim in order to separate clone generation from flow simulation and base calling. -} {-# Language DeriveDataTypeable #-} module Main where import Bio.Sequence hiding ((!)) import Statistics import System.IO (stdin,stdout,stderr,hPutStrLn) import Control.Monad (forM,when) import qualified Data.ByteString.Lazy as B import Data.Int (Int64) import System.Console.CmdArgs import Data.Array type Dist = String data Conf = Conf { lengths :: Dist , count :: Int , input :: [FilePath] } deriving (Data,Typeable,Show) modes :: Mode Conf modes = mode $ Conf { lengths = "Uniform 400 800" &= text "model for clone lengths" & typ "DIST" , count = 10 &= text "number of reads to generate" & typ "INT" , input = def &= args & typFile } &= prog "clonesim" & text "simulate sequence cloning" & helpSuffix ["Available distributions (DIST):" ," Uniform a b, Normal mu sigma, LogNormal mu sigma"] version :: String version = "clonesim v0.2.7, copyright 2010 Ketil Malde" data Dir = Fwd | Rev deriving (Eq,Enum,Bounded) instance Random Dir where random g = let (b,g') = random g in (if b then Fwd else Rev,g') randomR = error "randomR is undefined for Dir" main :: IO () main = do conf <- cmdArgs version [modes] -- print conf let inf = case input conf of [] -> hPutStrLn stderr "clonesim: reading from stdin" >> hReadFasta stdin [x] -> readFasta x _ -> error "Please specify only a single input file" seq inf $ return () -- force the above warning ss <- map (defragSeq . castToNuc) `fmap` inf when (null ss) $ error "No sequences in input, exiting!" let sa = listArray (0,length ss-1) ss hWriteFasta stdout =<< evalRandIO (simulate conf sa) type SeqArray = Array Int (Sequence Nuc) -- | the real 'main' simulate :: RandomGen g => Conf -> SeqArray -> Rand g [Sequence Nuc] simulate conf sa = do let ldist = read $ lengths conf (_,asz) = bounds sa forM [1..count conf] $ const (do i <- floor `fmap` sample (Uniform 0 $ 1+fromIntegral asz) -- inclusive? let s = sa!i p <- round `fmap` sample (Uniform 0 $ fromIntegral (seqlength s-1)) l <- round `fmap` sample ldist dir <- getRandom return (mkClone dir p l s)) mkClone :: Dir -> Int64 -> Int64 -> Sequence Nuc -> Sequence Nuc mkClone dir pos len (Seq h s mq) = Seq (fromStr (toStr h++" "++show pos++case dir of Fwd -> "+"; Rev -> "-")) sd qual where sd = B.take len . (case dir of Fwd -> snd; Rev -> revcompl' . fst) . B.splitAt pos $ s qual = B.take len `fmap` case dir of Fwd -> fst; Rev -> snd `fmap` B.splitAt pos `fmap` mq