{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module SLynx.Simulate.Simulate
( simulateCmd,
)
where
import Control.Applicative ((<|>))
import Control.Concurrent
import Control.Concurrent.Async
import Control.Monad
( unless,
when,
)
import Control.Monad.IO.Class
import Control.Monad.Logger
import Control.Monad.Trans.Class
import Control.Monad.Trans.Reader (ask)
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List.NonEmpty (toList)
import Data.Maybe
import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Encoding as LT
import qualified Data.Vector.Unboxed as V
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.MarkovProcess.GammaRateHeterogeneity
import qualified ELynx.Data.MarkovProcess.MixtureModel as M
import qualified ELynx.Data.MarkovProcess.PhyloModel as P
import qualified ELynx.Data.MarkovProcess.SubstitutionModel as SM
import qualified ELynx.Data.Sequence.Alignment as A
import qualified ELynx.Data.Sequence.Sequence as Seq hiding
( name,
)
import ELynx.Tree
import ELynx.Export.Sequence.Fasta
import ELynx.Import.MarkovProcess.EDMModelPhylobayes
import ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
import ELynx.Simulate.MarkovProcessAlongTree
import ELynx.Tools
import Numeric.LinearAlgebra hiding
( toList,
(<>),
)
import SLynx.Simulate.Options
import SLynx.Simulate.PhyloModel
import System.Random.MWC
import Text.Printf
simulateAlignment ::
(Measurable e, Named a) =>
P.PhyloModel ->
Tree e a ->
Int ->
GenIO ->
IO A.Alignment
simulateAlignment pm t' n g = do
let t = getLen <$> toTreeBranchLabels t'
c <- getNumCapabilities
gs <- splitGen c g
let chunks = getChunks c n
leafStatesS <- case pm of
P.SubstitutionModel sm ->
mapConcurrently
(\(num, gen) -> simulateAndFlatten num d e t gen)
(zip chunks gs)
where
d = SM.stationaryDistribution sm
e = SM.exchangeabilityMatrix sm
P.MixtureModel mm -> simulateAndFlattenMixtureModelPar n ws ds es t g
where
ws = vector . toList $ M.getWeights mm
ds = map SM.stationaryDistribution $ toList $ M.getSubstitutionModels mm
es = map SM.exchangeabilityMatrix $ toList $ M.getSubstitutionModels mm
let leafStates = horizontalConcat leafStatesS
leafNames = map getName $ leaves t'
code = P.getAlphabet pm
alph = A.all $ alphabetSpec code
sequences =
[ Seq.Sequence sName "" code (V.fromList $ map (`Set.elemAt` alph) ss)
| (sName, ss) <- zip leafNames leafStates
]
return $ either error id $ A.fromSequences sequences
summarizeEDMComponents :: [EDMComponent] -> BL.ByteString
summarizeEDMComponents cs =
BL.pack $
"Empiricial distribution mixture model with "
++ show (length cs)
++ " components."
reportModel :: P.PhyloModel -> ELynx SimulateArguments ()
reportModel m = do
bn <- outFileBaseName . global <$> ask
case bn of
Nothing ->
$(logInfo)
"No output file provided; omit writing machine-readable phylogenetic model."
Just _ ->
out "model definition (machine readable)" (BL.pack (show m) <> "\n") ".model.gz"
pretty :: BranchLength -> String
pretty = printf "%.5f"
prettyRow :: String -> String -> BL.ByteString
prettyRow name val = alignLeft 33 n <> alignRight 8 v
where
n = BL.pack name
v = BL.pack val
summarizeBranchLengths :: Measurable e => Tree e a -> BL.ByteString
summarizeBranchLengths t =
BL.intercalate
"\n"
[ prettyRow "Origin height: " $ pretty h,
prettyRow "Average distance origin to leaves: " $ pretty h',
prettyRow "Total branch length: " $ pretty b
]
where
n = length $ leaves t
h = height t
h' = sum (distancesOriginLeaves t) / fromIntegral n
b = totalBranchLength t
summarizeSM :: SM.SubstitutionModel -> [BL.ByteString]
summarizeSM sm =
map BL.pack $
(show (SM.alphabet sm) ++ " substitution model: " ++ SM.name sm ++ ".") :
["Parameters: " ++ show (SM.params sm) ++ "." | not (null (SM.params sm))]
++ case SM.alphabet sm of
DNA ->
[ "Stationary distribution: "
++ dispv precision (SM.stationaryDistribution sm)
++ ".",
"Exchangeability matrix:\n"
++ dispmi 2 precision (SM.exchangeabilityMatrix sm)
++ ".",
"Scale: " ++ show (roundN precision $ SM.totalRate sm) ++ "."
]
Protein ->
[ "Stationary distribution: "
++ dispv precision (SM.stationaryDistribution sm)
++ ".",
"Scale: " ++ show (roundN precision $ SM.totalRate sm) ++ "."
]
_ ->
error
"Extended character sets are not supported with substitution models."
summarizeMMComponent :: M.Component -> [BL.ByteString]
summarizeMMComponent c =
BL.pack "Weight: "
<> (BB.toLazyByteString . BB.doubleDec $ M.weight c) :
summarizeSM (M.substModel c)
summarizeMM :: M.MixtureModel -> [BL.ByteString]
summarizeMM m =
[ BL.pack $ "Mixture model: " ++ M.name m ++ ".",
BL.pack $ "Number of components: " ++ show n ++ "."
]
++ detail
where
n = length $ M.components m
detail =
if n <= 100
then
concat
[ BL.pack ("Component " ++ show i ++ ":") : summarizeMMComponent c
| (i, c) <- zip [1 :: Int ..] (toList $ M.components m)
]
else []
summarizePM :: P.PhyloModel -> [BL.ByteString]
summarizePM (P.MixtureModel mm) = summarizeMM mm
summarizePM (P.SubstitutionModel sm) = summarizeSM sm
simulateCmd :: ELynx SimulateArguments ()
simulateCmd = do
l <- local <$> ask
let treeFile = argsTreeFile l
$(logInfo) ""
$(logInfo) $ T.pack $ "Read tree from file '" ++ treeFile ++ "'."
tree <- liftIO $ parseFileWith (newick Standard) treeFile
let t' = either error id $ phyloToLengthTree tree
$(logInfo) $ T.pack $ "Number of leaves: " ++ show (length $ leaves t')
$(logInfo) $ LT.toStrict $ LT.decodeUtf8 $ summarizeBranchLengths t'
let edmFile = argsEDMFile l
let sProfileFiles = argsSiteprofilesFiles l
$(logInfo) ""
$(logDebug) "Read EDM file or siteprofile files."
when (isJust edmFile && isJust sProfileFiles) $
error "Got both: --edm-file and --siteprofile-files."
edmCs <- case edmFile of
Nothing -> return Nothing
Just edmF -> do
$(logInfo) "Read EDM file."
liftIO $ Just <$> parseFileWith phylobayes edmF
maybe
(return ())
($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents)
edmCs
sProfiles <- case sProfileFiles of
Nothing -> return Nothing
Just fns -> do
$(logInfo) $
T.pack $
"Read siteprofiles from "
++ show (length fns)
++ " file(s)."
$(logDebug) $ T.pack $ "The file names are:" ++ show fns
xs <- liftIO $ mapM (parseFileWith siteprofiles) fns
return $ Just $ concat xs
maybe
(return ())
($(logInfo) . LT.toStrict . LT.decodeUtf8 . summarizeEDMComponents)
sProfiles
let edmCsOrSiteprofiles = edmCs <|> sProfiles
$(logInfo) "Read model string."
let ms = argsSubstitutionModelString l
mm = argsMixtureModelString l
mws = argsMixtureWeights l
eitherPhyloModel' = getPhyloModel ms mm mws edmCsOrSiteprofiles
phyloModel' <- case eitherPhyloModel' of
Left err -> lift $ error err
Right pm -> return pm
let maybeGammaParams = argsGammaParams l
phyloModel <- case maybeGammaParams of
Nothing -> do
$(logInfo) $
LT.toStrict $
LT.decodeUtf8 $
BL.unlines $
summarizePM
phyloModel'
return phyloModel'
Just (n, alpha) -> do
$(logInfo) $
LT.toStrict $
LT.decodeUtf8 $
BL.intercalate "\n" $
summarizePM phyloModel'
$(logInfo) ""
$(logInfo) $
LT.toStrict $
LT.decodeUtf8 $
BL.intercalate "\n" $
summarizeGammaRateHeterogeneity n alpha
return $ expand n alpha phyloModel'
unless
(isJust sProfiles)
( do
$(logInfo) ""
reportModel phyloModel
$(logInfo) ""
)
$(logInfo) "Simulate alignment."
let alignmentLength = argsLength l
$(logInfo) $ T.pack $ "Length: " <> show alignmentLength <> "."
gen <- case argsSeed l of
Random ->
error "simulateCmd: seed not available; please contact maintainer."
Fixed s -> liftIO $ initialize s
alignment <- liftIO $ simulateAlignment phyloModel t' alignmentLength gen
let output = (sequencesToFasta . A.toSequences) alignment
$(logInfo) ""
out "simulated multi sequence alignment" output ".fasta"