{-# LANGUAGE ScopedTypeVariables #-} -- | -- Description : Work with transition probability matrices on rooted trees -- Copyright : 2021 Dominik Schrempf -- License : GPLv3 -- -- Maintainer : dominik.schrempf@gmail.com -- Stability : unstable -- Portability : non-portable (not tested) -- -- Calculate transition probability matrices, map rate matrices on trees, populate -- a tree with states according to a stationary distribution, etc. -- -- The implementation of the Markov process is more than basic and can be improved -- in a lot of ways. module ELynx.Simulate.MarkovProcessAlongTree ( -- * Single rate matrix. simulate, simulateAndFlatten, simulateAndFlattenPar, -- * Mixture models. simulateMixtureModel, simulateAndFlattenMixtureModel, simulateAndFlattenMixtureModelPar, ) where import Control.Concurrent import Control.Concurrent.Async import Control.Monad import Data.Tree import qualified Data.Vector as V import ELynx.MarkovProcess.RateMatrix import ELynx.Simulate.MarkovProcess import System.Random.MWC.Distributions import System.Random.Stateful -- XXX @performace. The horizontal concatenation might be slow. If so, -- 'concatenateSeqs' or 'concatenateAlignments' can be used, which directly -- appends vectors. -- A brain f***. As an example, let @xss@ be a list of alignments (i.e., a list -- of a list of a list of alleles). This function horizontally concatenates the -- sites. The number of species needs to be same in each alignment. No checks -- are performed! horizontalConcat :: [[[a]]] -> [[a]] horizontalConcat [xs] = xs horizontalConcat xss = foldr1 (zipWith (++)) xss toProbTree :: RateMatrix -> Tree Double -> Tree ProbMatrix toProbTree q = fmap (probMatrix q) getRootStates :: StatefulGen g m => Int -> StationaryDistribution -> g -> m [State] getRootStates n d g = replicateM n $ categorical d g -- | Simulate a number of sites for a given substitution model. Only the states -- at the leafs are retained. The states at internal nodes are removed. This has -- a lower memory footprint. -- XXX: Improve performance. Use vectors, not lists. I am actually not sure if -- this improves performance... simulateAndFlatten :: StatefulGen g m => Int -> StationaryDistribution -> ExchangeabilityMatrix -> Tree Double -> g -> m [[State]] simulateAndFlatten n d e t g = do let q = fromExchangeabilityMatrix e d pt = toProbTree q t is <- getRootStates n d g simulateAndFlatten' is pt g -- This is the heart of the simulation. Take a tree and a list of root states. -- Recursively jump down the branches to the leafs. Forget states at internal -- nodes. simulateAndFlatten' :: StatefulGen g m => [State] -> Tree ProbMatrix -> g -> m [[State]] simulateAndFlatten' is (Node p f) g = do is' <- mapM (\i -> jump i p g) is if null f then return [is'] else concat <$> sequence [simulateAndFlatten' is' t g | t <- f] -- | See 'simulateAndFlatten', parallel version. simulateAndFlattenPar :: RandomGen g => Int -> StationaryDistribution -> ExchangeabilityMatrix -> Tree Double -> IOGenM g -> IO [[State]] simulateAndFlattenPar n d e t g = do c <- getNumCapabilities rs <- replicateM c $ splitGenM g gs <- mapM newIOGenM rs let chunks = getChunks c n q = fromExchangeabilityMatrix e d pt = toProbTree q t -- The concurrent map returns a list of [[State]] objects. They have to be -- concatenated horizontally. sss <- mapConcurrently ( \(num, gen) -> do is <- getRootStates num d gen simulateAndFlatten' is pt gen ) (zip chunks gs) return $ horizontalConcat sss -- | Simulate a number of sites for a given substitution model. Keep states at -- internal nodes. The result is a tree with the list of simulated states as -- node labels. simulate :: StatefulGen g m => Int -> StationaryDistribution -> ExchangeabilityMatrix -> Tree Double -> g -> m (Tree [State]) simulate n d e t g = do let q = fromExchangeabilityMatrix e d pt = toProbTree q t is <- getRootStates n d g simulate' is pt g -- This is the heart of the simulation. Take a tree and a list of root states. -- Recursively jump down the branches to the leafs. simulate' :: StatefulGen g m => [State] -> Tree ProbMatrix -> g -> m (Tree [State]) simulate' is (Node p f) g = do is' <- mapM (\i -> jump i p g) is f' <- sequence [simulate' is' t g | t <- f] return $ Node is' f' toProbTreeMixtureModel :: V.Vector RateMatrix -> Tree Double -> Tree (V.Vector ProbMatrix) toProbTreeMixtureModel qs = -- XXX: This function is slow. Parallelization? fmap (\a -> V.map (`probMatrix` a) qs) getComponentsAndRootStates :: StatefulGen g m => Int -> V.Vector Double -> V.Vector StationaryDistribution -> g -> m ([Int], [State]) getComponentsAndRootStates n ws ds g = do cs <- replicateM n $ categorical ws g is <- sequence [categorical (ds V.! c) g | c <- cs] return (cs, is) -- | Simulate a number of sites for a given set of substitution models with -- corresponding weights. Forget states at internal nodes. See also -- 'simulateAndFlatten'. simulateAndFlattenMixtureModel :: StatefulGen g m => Int -> V.Vector Double -> V.Vector StationaryDistribution -> V.Vector ExchangeabilityMatrix -> Tree Double -> g -> -- | (IndicesOfComponents, [SimulatedSequenceForEachTip]) m ([Int], [[State]]) simulateAndFlattenMixtureModel n ws ds es t g = do let qs = V.zipWith fromExchangeabilityMatrix es ds pt = toProbTreeMixtureModel qs t (cs, is) <- getComponentsAndRootStates n ws ds g ss <- simulateAndFlattenMixtureModel' is cs pt g return (cs, ss) simulateAndFlattenMixtureModel' :: StatefulGen g m => [State] -> [Int] -> Tree (V.Vector ProbMatrix) -> g -> m [[State]] simulateAndFlattenMixtureModel' is cs (Node ps f) g = do is' <- sequence [jump i (ps V.! c) g | (i, c) <- zip is cs] if null f then return [is'] else concat <$> sequence [simulateAndFlattenMixtureModel' is' cs t g | t <- f] getChunks :: Int -> Int -> [Int] getChunks c n = ns where n' = n `div` c r = n `mod` c ns = replicate r (n' + 1) ++ replicate (c - r) n' -- NOTE: We could move away from IO here, but moving away from 'mapConcurrently' -- requires benchmarks. I am just not sure if it makes sense to spend more time -- on this since the parallelization itself is a bit weird. Like so, we walk -- along separate trees in each process. parComp :: RandomGen g => Int -> (Int -> IOGenM g -> IO b) -> IOGenM g -> IO [b] parComp num fun gen = do ncap <- getNumCapabilities let chunks = getChunks ncap num rs <- replicateM ncap $ splitGenM gen gs <- mapM newIOGenM rs mapConcurrently (uncurry fun) (zip chunks gs) -- | See 'simulateAndFlattenMixtureModel', parallel version. simulateAndFlattenMixtureModelPar :: RandomGen g => Int -> V.Vector Double -> V.Vector StationaryDistribution -> V.Vector ExchangeabilityMatrix -> Tree Double -> IOGenM g -> IO ([Int], [[State]]) simulateAndFlattenMixtureModelPar n ws ds es t g = do let qs = V.zipWith fromExchangeabilityMatrix es ds pt = toProbTreeMixtureModel qs t -- The concurrent computation returns a list of ([Int], [[State]]) objects. -- They have to be concatenated horizontally. csss <- parComp n ( \n' g' -> do (cs, is) <- getComponentsAndRootStates n' ws ds g' ss <- simulateAndFlattenMixtureModel' is cs pt g' return (cs, ss) ) g return (concatMap fst csss, horizontalConcat $ map snd csss) -- | Simulate a number of sites for a given set of substitution models with -- corresponding weights. Keep states at internal nodes. See also -- 'simulate'. simulateMixtureModel :: StatefulGen g m => Int -> V.Vector Double -> V.Vector StationaryDistribution -> V.Vector ExchangeabilityMatrix -> Tree Double -> g -> m (Tree [State]) simulateMixtureModel n ws ds es t g = do let qs = V.zipWith fromExchangeabilityMatrix es ds pt = toProbTreeMixtureModel qs t (cs, is) <- getComponentsAndRootStates n ws ds g simulateMixtureModel' is cs pt g -- See 'simulateAlongProbTree', only we have a number of mixture components. The -- starting states and the components for each site have to be provided. simulateMixtureModel' :: StatefulGen g m => [State] -> [Int] -> Tree (V.Vector ProbMatrix) -> g -> m (Tree [State]) simulateMixtureModel' is cs (Node ps f) g = do is' <- sequence [jump i (ps V.! c) g | (i, c) <- zip is cs] f' <- sequence [simulateMixtureModel' is' cs t g | t <- f] return $ Node is' f'