{- Transcribe from a base sequence using unfoldr Probably more complicated than necessary. 'prime' selects starting points (in the form of initial state) 'transcribe' takes a list of mutators, a terminator, and an initial state and generates the transcript. -} module UnfoldMut where import Bio.Sequence hiding ((!)) import Data.Char (toUpper) -- for testing import Data.Int import Data.List (sort) import Data.List (unfoldr) import Data.Maybe (isJust) import System.Random import qualified Bio.Sequence as S at :: Cursor -> Char at (s,o,d) | o < 0 || o >= seqlength s = 'X' | d == Fwd = (S.!) s o | otherwise = compl $ (S.!) s o type Prob = Double -- | The cursor tracks position in the sequence (simulating polymerase) data Direction = Fwd | Rev deriving (Eq,Show) type Cursor = (Sequence,Offset,Direction) -- | A primer determines when to initiate transcripts type Primer = [Sequence] -> IO [MState] ------------------------------------------------------------ -- | A mutator inspects the state, returns the probability of an action, -- and the action, in the form of a string and a modified state type Mutator = MState -> (Prob,[Char], MState) type MutFunc = MState -> ([Char],MState) combine :: [Distribution] -> Distribution combine = foldr (\f g -> \x -> f x + g x) (const 0) mkmut :: Distribution -> [MutFunc] -> [Mutator] mkmut dist fs = map (\f -> \m -> let (s,m') = f m in (t m, s, m')) fs where t = (/fromIntegral (length fs)) . dist . tlen tlen (MS _ _ l) = fromIntegral l -- | Uniform probs for substitution and insertion subst, ins :: String -> [MutFunc] subst = map (\x -> \m -> ([x],fwd m)) ins = map (\x -> \m -> ([x],m)) -- | Deletion of current base del :: [MutFunc] del = [\m -> ([],fwd m)] -- | Duplicate current base without forwarding dup :: [MutFunc] dup = [\m@(MS _ c _) -> ([toUpper $ at c],m)] ------------------------------------------------------------ -- | The terminator examines the state, and the probablity of -- transcript termination type Terminator = MState -> Prob -- | A Model is shorthand for a complete parameter set type Model = (Primer,[Mutator],Terminator) -- | The state of the current transcription data MState = MS StdGen Cursor Int instance Show MState where show = const "xxx" ------------------------------------------------------------ -- | A distribution is a probability as a function of length type Distribution = Offset -> Prob -- | The sigmoid function provides a soft threshold -- center on s, scale by w sigma :: Double -> Double -> Distribution sigma s w = \t -> 1/(1+exp ((s-fromIntegral t)/w)) uniform :: Distribution uniform = const 1 gradient :: Double -> Double -> Distribution gradient s e = \t -> max 0 . min 1 $ (fromIntegral t-s)/(e-s) -- | For 454 (and other?) sequencing, we need a distribution -- that depends on sequence content homopolymer :: MState -> Prob homopolymer (MS _ c@(s,o,_) _) = (/20) . fromIntegral . length . takeWhile (== at c) . map ((S.!) s) $ [o..min (o+19) (seqlength s-1)] ------------------------------------------------------------ -- | Increments the cursor in the MState fwd :: MState -> MState fwd (MS sg (s,o,d) l) = (MS sg (s,if d == Fwd then o+1 else o-1,d) l) transcribe :: [Mutator] -> Terminator -> MState -> [Char] transcribe ms t is = concat . unfoldr (sequencer ms t) $ is -- | walk a list of mutator funcitions, pick one based on p -- whoa! this is almost :: [Mutator] -> Mutator! sequencer :: [Mutator] -> Terminator -> MState -> Maybe ([Char],MState) sequencer ms t = \m@(MS g c _) -> let (p,g') = randomR (0,1) g tp = t m new = mutate m (p-tp) ms in if p < tp then Nothing else update_ms g' $ if isJust new then new else Just ([at c], fwd m) -- | update rndgen and length update_ms :: StdGen -> Maybe (String,MState) -> Maybe (String,MState) update_ms g (Just (xs, MS _ c l)) = Just (xs, MS g c (l+length xs)) update_ms _ Nothing = error "this is impossible" -- | try all mutators in a list mutate :: MState -> Prob -> [Mutator] -> Maybe (String,MState) mutate st p (m:ms) = let (t,ch,st') = m st p' = p-t in if p' < 0 then Just (ch,st') else mutate st p' ms mutate _st _p [] = Nothing ------------------------------------------------------------ -- | A primer with a uniform (poisson) distribution p_uniform :: [String] -> Primer p_uniform [n,d] = \ss -> do (g,g') <- return . split =<< newStdGen let l = sum $ map seqlength ss os = sort . take (read n) . randomRs (0,l-1) $ g return (gen_ms (read d) g' 0 ss os) p_uniform _ = error "primer 'uniform' needs two arguments" -- | Generate MStates for transcript initiation gen_ms :: Double -> StdGen -> Offset -> [Sequence] -> [Offset] -> [MState] gen_ms d g c (s:ss) (p:ps) | p-c >= seqlength s = gen_ms d g (c+seqlength s) ss (p:ps) | otherwise = let (g1,g') = split g (z,g2) = randomR (0,1) g1 in MS g2 (s,p-c,if z (a,a) -> g -> (a,g) integralRandomR (a,b) g = case randomR (fromIntegral a :: Integer, fromIntegral b :: Integer) g of (x,g') -> (fromIntegral x, g')