{- | CloneSim - fragment a genome sequence to simulate cloning Factored out from flowsim in order to separate clone generation from flow simulation and base calling. -} {-# Language DeriveDataTypeable #-} module Main where import Bio.Core.Sequence import Bio.Sequence.Fasta import Statistics import Version import System.IO (stdin,stdout,stderr,hPutStrLn) import Control.Monad (forM,when) import qualified Data.ByteString.Lazy as B import System.Console.CmdArgs import Data.Array import Data.Char type Dist = String data Conf = Conf { lengths :: Dist , count :: Int , input :: [FilePath] } deriving (Data,Typeable,Show) opts :: Conf opts = Conf { lengths = "Uniform 400 800" &= help "model for clone lengths" &= typ "DIST" , count = 10 &= help "number of reads to generate" &= typ "INT" , input = def &= args &= typFile } &= program "clonesim" &= summary ("clonesim "++version) &= details ["Available distributions (DIST):", "Uniform a b, Normal mu sigma, LogNormal mu sigma"] data Dir = Fwd | Rev deriving (Eq,Enum,Bounded) instance Random Dir where random g = let (b,g') = random g in (if b then Fwd else Rev,g') randomR = error "randomR is undefined for Dir" main :: IO () main = do conf <- cmdArgs opts -- print conf let inf = case input conf of [] -> hPutStrLn stderr "clonesim: reading from stdin" >> hReadFasta stdin [x] -> readFasta x _ -> error "Please specify only a single input file" seq inf $ return () -- force the above warning ss <- map defragSeq `fmap` inf when (null ss) $ error "No sequences in input, exiting!" let sa = listArray (0,length ss-1) ss la = listArray (0,length ss) $ scanl (+) 0 $ map seqlength ss hWriteFasta stdout =<< evalRandIO (simulate conf sa la) type SeqArray = Array Int Sequence type LenArray = Array Int Offset -- | the real 'main' simulate :: RandomGen g => Conf -> SeqArray -> LenArray -> Rand g [Sequence] simulate conf sa sl = do let ldist = read $ lengths conf (_,alz) = bounds sl top = fromIntegral (sl!alz) forM [1..count conf] $ const (do n <- floor `fmap` sample (Uniform 0 top) let i = bsearch sl n p = n - sl!i s = sa!i l <- round `fmap` sample ldist dir <- getRandom return $ mkClone dir p (Offset l) s) -- bsearch - returns largest array entry less than search value bsearch :: LenArray -> Offset -> Int bsearch a v = go (bounds a) where go (x,y) | v >= a!y = y | y == x+1 = x | a!m <= v = go (m,y) | a!m > v = go (x,m) where m = (x+y) `div` 2 mkClone :: Dir -> Offset -> Offset -> Sequence -> Sequence mkClone dir (Offset pos) (Offset len) (Seq h (SeqData s) mq) = Seq label (SeqData sd) (qual mq) where label = fromString (toString h++":"++show pos++ (case dir of Fwd -> ":fwd"; Rev -> ":rev")++" clonesim") sd = B.take len . (case dir of Fwd -> snd; Rev -> revcompl' . fst) . B.splitAt pos $ s qual Nothing = Nothing qual (Just (QualData qd)) = Just . QualData . B.take len . case dir of Fwd -> fst; Rev -> snd $ B.splitAt pos qd revcompl' :: B.ByteString -> B.ByteString revcompl' = B.reverse . B.map (fromIntegral . ord . compl . chr . fromIntegral) compl :: Char -> Char compl 'A' = 'T' compl 'T' = 'A' compl 'C' = 'G' compl 'G' = 'C' compl 'a' = 't' compl 't' = 'a' compl 'c' = 'g' compl 'g' = 'c' compl x = x defragSeq = id