-- | -- Module : ELynx.Data.MarkovProcess.SubstitutionModel -- Description : Data type describing substitution model -- Copyright : (c) Dominik Schrempf 2020 -- License : GPL-3.0-or-later -- -- Maintainer : dominik.schrempf@gmail.com -- Stability : unstable -- Portability : portable -- -- Creation date: Tue Jan 29 19:10:46 2019. -- -- To be imported qualified. module ELynx.Data.MarkovProcess.SubstitutionModel ( -- * Types Name, Params, SubstitutionModel, -- * Lenses and other accessors alphabet, name, stationaryDistribution, exchangeabilityMatrix, rateMatrix, totalRate, -- * Building substitution models substitutionModel, -- * Transformations scale, normalize, appendName, -- * Output summarize, ) where import qualified Data.ByteString.Lazy.Char8 as L import ELynx.Data.Alphabet.Alphabet import qualified ELynx.Data.MarkovProcess.RateMatrix as R import ELynx.Tools import qualified Numeric.LinearAlgebra as LinAlg -- | Name of substitution model; abstracted and subject to change. type Name = String -- | Parameters of substitution model. May be the empty list. type Params = [Double] -- XXX: Use a proper data type. For example: -- data SubstitutionModelAA = LG | WAG | LG-Custom dist | ... -- data SubstitutionModelNuc = JC | HKY p1 p2 ... | GTR p1 p2 ... -- -- I thought about this a lot, and it seems easier like it is at the moment. -- Since the data types are abstracted anyways, not much harm can be done. Of -- course, conflicting substitution models can be declared, or duplicate ones -- with different names, but well... -- | Complete definition of a substitution model. Create instances with -- 'substitutionModel'. A substitution model has an alphabet, a name, and a list -- of parameters (e.g., the kappa value for the HKY model). Further, the -- transition rate matrix is defined by a stationary distribution and a set of -- exchangeabilities. data SubstitutionModel = SubstitutionModel { -- | Alphabet alphabet :: Alphabet, -- | Name name :: Name, -- | List of parameters params :: Params, -- | Stationary distribution stationaryDistribution :: R.StationaryDistribution, -- | Exchangeability matrix exchangeabilityMatrix :: R.ExchangeabilityMatrix } deriving (Show, Read) -- | Calculate rate matrix from substitution model. rateMatrix :: SubstitutionModel -> R.RateMatrix rateMatrix sm = R.fromExchangeabilityMatrix (exchangeabilityMatrix sm) (stationaryDistribution sm) -- | Get scale of substitution model. totalRate :: SubstitutionModel -> Double totalRate sm = R.totalRate (rateMatrix sm) -- | Create normalized 'SubstitutionModel'. See 'normalize'. substitutionModel :: Alphabet -> Name -> Params -> R.StationaryDistribution -> R.ExchangeabilityMatrix -> SubstitutionModel substitutionModel c n ps d e = if R.isValid d then normalize $ SubstitutionModel c n ps d' e else error $ "substitionModel: Stationary distribution does not sum to 1.0: " ++ show d where d' = normalizeSumVec 1.0 d -- | Scale the rate of a substitution model by given factor. scale :: Double -> SubstitutionModel -> SubstitutionModel scale r sm = sm {exchangeabilityMatrix = em'} where em' = LinAlg.scale r $ exchangeabilityMatrix sm -- | Normalize a substitution model, so that, on average, one substitution -- happens per unit time. normalize :: SubstitutionModel -> SubstitutionModel normalize sm = scale (1.0 / r) sm where r = totalRate sm -- | Abbend to name. appendName :: Name -> SubstitutionModel -> SubstitutionModel appendName n sm = sm {name = n'} where n' = name sm <> n -- | Summarize a substitution model; lines to be printed to screen or log. summarize :: SubstitutionModel -> [L.ByteString] summarize sm = map L.pack $ (show (alphabet sm) ++ " substitution model: " ++ name sm ++ ".") : ["Parameters: " ++ show (params sm) ++ "." | not (null (params sm))] ++ case alphabet sm of DNA -> [ "Stationary distribution: " ++ dispv precision (stationaryDistribution sm) ++ ".", "Exchangeability matrix:\n" ++ dispmi 2 precision (exchangeabilityMatrix sm) ++ ".", "Scale: " ++ show (roundN precision $ totalRate sm) ++ "." ] Protein -> [ "Stationary distribution: " ++ dispv precision (stationaryDistribution sm) ++ ".", "Scale: " ++ show (roundN precision $ totalRate sm) ++ "." ] _ -> error "Extended character sets are not supported with substitution models."