-- |
-- Module      :  ELynx.MarkovProcess.AminoAcid
-- Description :  Amino acid rate matrices such as LG
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Tue Jan 29 09:29:19 2019.
--
-- The order of amino acids is alphabetic.
module ELynx.MarkovProcess.AminoAcid
  ( -- * Amino acid substitution models
    lg,
    lgCustom,
    wag,
    wagCustom,
    poisson,
    poissonCustom,
    gtr20,

    -- * Convenience functions
    alphaToPamlVec,
    pamlToAlphaVec,
  )
where

import Data.ByteString.Internal (c2w)
import Data.Maybe (fromMaybe)
import qualified Data.Vector.Storable as V
import Data.Word (Word8)
import ELynx.Alphabet.Alphabet
import ELynx.MarkovProcess.RateMatrix
import ELynx.MarkovProcess.SubstitutionModel
import Numeric.LinearAlgebra

n :: Int
n :: Int
n = Int
20

-- Some matrices have to be converted from PAML order to alphabetical order. See
-- 'pamlToAlphaVec' and 'pamlToAlphaMat'.

-- Amno acids in alphabetical order.
aaAlphaOrder :: V.Vector Word8
aaAlphaOrder :: Vector Word8
aaAlphaOrder =
  forall a b.
(Storable a, Storable b) =>
(a -> b) -> Vector a -> Vector b
V.map
    Char -> Word8
c2w
    forall a b. (a -> b) -> a -> b
$ forall a. Storable a => [a] -> Vector a
V.fromList
      [ Char
'A',
        Char
'C',
        Char
'D',
        Char
'E',
        Char
'F',
        Char
'G',
        Char
'H',
        Char
'I',
        Char
'K',
        Char
'L',
        Char
'M',
        Char
'N',
        Char
'P',
        Char
'Q',
        Char
'R',
        Char
'S',
        Char
'T',
        Char
'V',
        Char
'W',
        Char
'Y'
      ]

-- Amino acids in PAML oder.
aaPamlOrder :: V.Vector Word8
aaPamlOrder :: Vector Word8
aaPamlOrder =
  forall a b.
(Storable a, Storable b) =>
(a -> b) -> Vector a -> Vector b
V.map
    Char -> Word8
c2w
    forall a b. (a -> b) -> a -> b
$ forall a. Storable a => [a] -> Vector a
V.fromList
      [ Char
'A',
        Char
'R',
        Char
'N',
        Char
'D',
        Char
'C',
        Char
'Q',
        Char
'E',
        Char
'G',
        Char
'H',
        Char
'I',
        Char
'L',
        Char
'K',
        Char
'M',
        Char
'F',
        Char
'P',
        Char
'S',
        Char
'T',
        Char
'W',
        Char
'Y',
        Char
'V'
      ]

alphaIndexToPamlIndex :: Int -> Int
alphaIndexToPamlIndex :: Int -> Int
alphaIndexToPamlIndex Int
i =
  forall a. a -> Maybe a -> a
fromMaybe
    (forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Could not convert index " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
i forall a. [a] -> [a] -> [a]
++ [Char]
".")
    (forall a. (Storable a, Eq a) => a -> Vector a -> Maybe Int
V.elemIndex Word8
aa Vector Word8
aaPamlOrder)
  where
    aa :: Word8
aa = Vector Word8
aaAlphaOrder forall a. Storable a => Vector a -> Int -> a
V.! Int
i

pamlIndexToAlphaIndex :: Int -> Int
pamlIndexToAlphaIndex :: Int -> Int
pamlIndexToAlphaIndex Int
i =
  forall a. a -> Maybe a -> a
fromMaybe
    (forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"Could not convert index " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
i forall a. [a] -> [a] -> [a]
++ [Char]
".")
    (forall a. (Storable a, Eq a) => a -> Vector a -> Maybe Int
V.elemIndex Word8
aa Vector Word8
aaAlphaOrder)
  where
    aa :: Word8
aa = Vector Word8
aaPamlOrder forall a. Storable a => Vector a -> Int -> a
V.! Int
i

-- | Convert an amino acid vector in PAML order to a vector in alphabetical order.
pamlToAlphaVec :: Vector R -> Vector R
pamlToAlphaVec :: Vector R -> Vector R
pamlToAlphaVec Vector R
v = forall d f (c :: * -> *) e. Build d f c e => d -> f -> c e
build Int
n (\R
i -> Vector R
v forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (forall a b. (RealFrac a, Integral b) => a -> b
round R
i))

-- | Convert an amino acid vector in alphabetical order to a vector in PAML order.
alphaToPamlVec :: Vector R -> Vector R
alphaToPamlVec :: Vector R -> Vector R
alphaToPamlVec Vector R
v = forall d f (c :: * -> *) e. Build d f c e => d -> f -> c e
build Int
n (\R
i -> Vector R
v forall c t. Indexable c t => c -> Int -> t
! Int -> Int
pamlIndexToAlphaIndex (forall a b. (RealFrac a, Integral b) => a -> b
round R
i))

-- Convert an amino acid matrix in PAML order to a matrix in alphabetical order.
pamlToAlphaMat :: Matrix R -> Matrix R
pamlToAlphaMat :: Matrix R -> Matrix R
pamlToAlphaMat Matrix R
m =
  forall d f (c :: * -> *) e. Build d f c e => d -> f -> c e
build
    (Int
n, Int
n)
    ( \R
i R
j -> Matrix R
m forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (forall a b. (RealFrac a, Integral b) => a -> b
round R
i) forall c t. Indexable c t => c -> Int -> t
! Int -> Int
alphaIndexToPamlIndex (forall a b. (RealFrac a, Integral b) => a -> b
round R
j)
    )

-- Lower triangular matrix of LG exchangeabilities in PAML order and in form of
-- a list.
lgExchRawPaml :: [Double]
lgExchRawPaml :: [R]
lgExchRawPaml =
  [ R
0.425093,
    R
0.276818,
    R
0.751878,
    R
0.395144,
    R
0.123954,
    R
5.076149,
    R
2.489084,
    R
0.534551,
    R
0.528768,
    R
0.062556,
    R
0.969894,
    R
2.807908,
    R
1.695752,
    R
0.523386,
    R
0.084808,
    R
1.038545,
    R
0.363970,
    R
0.541712,
    R
5.243870,
    R
0.003499,
    R
4.128591,
    R
2.066040,
    R
0.390192,
    R
1.437645,
    R
0.844926,
    R
0.569265,
    R
0.267959,
    R
0.348847,
    R
0.358858,
    R
2.426601,
    R
4.509238,
    R
0.927114,
    R
0.640543,
    R
4.813505,
    R
0.423881,
    R
0.311484,
    R
0.149830,
    R
0.126991,
    R
0.191503,
    R
0.010690,
    R
0.320627,
    R
0.072854,
    R
0.044265,
    R
0.008705,
    R
0.108882,
    R
0.395337,
    R
0.301848,
    R
0.068427,
    R
0.015076,
    R
0.594007,
    R
0.582457,
    R
0.069673,
    R
0.044261,
    R
0.366317,
    R
4.145067,
    R
0.536518,
    R
6.326067,
    R
2.145078,
    R
0.282959,
    R
0.013266,
    R
3.234294,
    R
1.807177,
    R
0.296636,
    R
0.697264,
    R
0.159069,
    R
0.137500,
    R
1.124035,
    R
0.484133,
    R
0.371004,
    R
0.025548,
    R
0.893680,
    R
1.672569,
    R
0.173735,
    R
0.139538,
    R
0.442472,
    R
4.273607,
    R
6.312358,
    R
0.656604,
    R
0.253701,
    R
0.052722,
    R
0.089525,
    R
0.017416,
    R
1.105251,
    R
0.035855,
    R
0.018811,
    R
0.089586,
    R
0.682139,
    R
1.112727,
    R
2.592692,
    R
0.023918,
    R
1.798853,
    R
1.177651,
    R
0.332533,
    R
0.161787,
    R
0.394456,
    R
0.075382,
    R
0.624294,
    R
0.419409,
    R
0.196961,
    R
0.508851,
    R
0.078281,
    R
0.249060,
    R
0.390322,
    R
0.099849,
    R
0.094464,
    R
4.727182,
    R
0.858151,
    R
4.008358,
    R
1.240275,
    R
2.784478,
    R
1.223828,
    R
0.611973,
    R
1.739990,
    R
0.990012,
    R
0.064105,
    R
0.182287,
    R
0.748683,
    R
0.346960,
    R
0.361819,
    R
1.338132,
    R
2.139501,
    R
0.578987,
    R
2.000679,
    R
0.425860,
    R
1.143480,
    R
1.080136,
    R
0.604545,
    R
0.129836,
    R
0.584262,
    R
1.033739,
    R
0.302936,
    R
1.136863,
    R
2.020366,
    R
0.165001,
    R
0.571468,
    R
6.472279,
    R
0.180717,
    R
0.593607,
    R
0.045376,
    R
0.029890,
    R
0.670128,
    R
0.236199,
    R
0.077852,
    R
0.268491,
    R
0.597054,
    R
0.111660,
    R
0.619632,
    R
0.049906,
    R
0.696175,
    R
2.457121,
    R
0.095131,
    R
0.248862,
    R
0.140825,
    R
0.218959,
    R
0.314440,
    R
0.612025,
    R
0.135107,
    R
1.165532,
    R
0.257336,
    R
0.120037,
    R
0.054679,
    R
5.306834,
    R
0.232523,
    R
0.299648,
    R
0.131932,
    R
0.481306,
    R
7.803902,
    R
0.089613,
    R
0.400547,
    R
0.245841,
    R
3.151815,
    R
2.547870,
    R
0.170887,
    R
0.083688,
    R
0.037967,
    R
1.959291,
    R
0.210332,
    R
0.245034,
    R
0.076701,
    R
0.119013,
    R
10.649107,
    R
1.702745,
    R
0.185202,
    R
1.898718,
    R
0.654683,
    R
0.296501,
    R
0.098369,
    R
2.188158,
    R
0.189510,
    R
0.249313
  ]

-- Exchangeabilities of LG model in alphabetical order.
lgExch :: ExchangeabilityMatrix
lgExch :: Matrix R
lgExch = Matrix R -> Matrix R
pamlToAlphaMat forall a b. (a -> b) -> a -> b
$ forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListLower Int
n [R]
lgExchRawPaml

normalizeSumVec :: Vector Double -> Vector Double
normalizeSumVec :: Vector R -> Vector R
normalizeSumVec Vector R
v = forall a b.
(Storable a, Storable b) =>
(a -> b) -> Vector a -> Vector b
V.map (forall a. Fractional a => a -> a -> a
/ R
s) Vector R
v
  where
    s :: R
s = forall a. (Storable a, Num a) => Vector a -> a
V.sum Vector R
v
{-# INLINE normalizeSumVec #-}

-- Stationary distribution in PAML order.
lgStatDistPaml :: StationaryDistribution
lgStatDistPaml :: Vector R
lgStatDistPaml =
  Vector R -> Vector R
normalizeSumVec forall a b. (a -> b) -> a -> b
$
    forall a. Storable a => [a] -> Vector a
fromList
      [ R
0.079066,
        R
0.055941,
        R
0.041977,
        R
0.053052,
        R
0.012937,
        R
0.040767,
        R
0.071586,
        R
0.057337,
        R
0.022355,
        R
0.062157,
        R
0.099081,
        R
0.064600,
        R
0.022951,
        R
0.042302,
        R
0.044040,
        R
0.061197,
        R
0.053287,
        R
0.012066,
        R
0.034155,
        R
0.069147
      ]

-- Stationary distribution of LG model in alphabetical order.
lgStatDist :: StationaryDistribution
lgStatDist :: Vector R
lgStatDist = Vector R -> Vector R
pamlToAlphaVec Vector R
lgStatDistPaml

-- | LG substitution model.
lg :: Normalize -> SubstitutionModel
lg :: Normalize -> SubstitutionModel
lg Normalize
nz = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"LG" [] Normalize
nz Vector R
lgStatDist Matrix R
lgExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
lgCustom :: Maybe String -> Normalize -> StationaryDistribution -> SubstitutionModel
lgCustom :: Maybe [Char] -> Normalize -> Vector R -> SubstitutionModel
lgCustom Maybe [Char]
mnm Normalize
nz Vector R
d = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Normalize
nz Vector R
d Matrix R
lgExch
  where
    nm :: [Char]
nm = forall a. a -> Maybe a -> a
fromMaybe [Char]
"LG-Custom" Maybe [Char]
mnm

-- WAG exchangeability list in PAML order.
wagExchRawPaml :: [Double]
wagExchRawPaml :: [R]
wagExchRawPaml =
  [ R
55.15710,
    R
50.98480,
    R
63.53460,
    R
73.89980,
    R
14.73040,
    R
542.94200,
    R
102.70400,
    R
52.81910,
    R
26.52560,
    R
3.02949,
    R
90.85980,
    R
303.55000,
    R
154.36400,
    R
61.67830,
    R
9.88179,
    R
158.28500,
    R
43.91570,
    R
94.71980,
    R
617.41600,
    R
2.13520,
    R
546.94700,
    R
141.67200,
    R
58.46650,
    R
112.55600,
    R
86.55840,
    R
30.66740,
    R
33.00520,
    R
56.77170,
    R
31.69540,
    R
213.71500,
    R
395.62900,
    R
93.06760,
    R
24.89720,
    R
429.41100,
    R
57.00250,
    R
24.94100,
    R
19.33350,
    R
18.69790,
    R
55.42360,
    R
3.94370,
    R
17.01350,
    R
11.39170,
    R
12.73950,
    R
3.04501,
    R
13.81900,
    R
39.79150,
    R
49.76710,
    R
13.15280,
    R
8.48047,
    R
38.42870,
    R
86.94890,
    R
15.42630,
    R
6.13037,
    R
49.94620,
    R
317.09700,
    R
90.62650,
    R
535.14200,
    R
301.20100,
    R
47.98550,
    R
7.40339,
    R
389.49000,
    R
258.44300,
    R
37.35580,
    R
89.04320,
    R
32.38320,
    R
25.75550,
    R
89.34960,
    R
68.31620,
    R
19.82210,
    R
10.37540,
    R
39.04820,
    R
154.52600,
    R
31.51240,
    R
17.41000,
    R
40.41410,
    R
425.74600,
    R
485.40200,
    R
93.42760,
    R
21.04940,
    R
10.27110,
    R
9.61621,
    R
4.67304,
    R
39.80200,
    R
9.99208,
    R
8.11339,
    R
4.99310,
    R
67.93710,
    R
105.94700,
    R
211.51700,
    R
8.88360,
    R
119.06300,
    R
143.85500,
    R
67.94890,
    R
19.50810,
    R
42.39840,
    R
10.94040,
    R
93.33720,
    R
68.23550,
    R
24.35700,
    R
69.61980,
    R
9.99288,
    R
41.58440,
    R
55.68960,
    R
17.13290,
    R
16.14440,
    R
337.07900,
    R
122.41900,
    R
397.42300,
    R
107.17600,
    R
140.76600,
    R
102.88700,
    R
70.49390,
    R
134.18200,
    R
74.01690,
    R
31.94400,
    R
34.47390,
    R
96.71300,
    R
49.39050,
    R
54.59310,
    R
161.32800,
    R
212.11100,
    R
55.44130,
    R
203.00600,
    R
37.48660,
    R
51.29840,
    R
85.79280,
    R
82.27650,
    R
22.58330,
    R
47.33070,
    R
145.81600,
    R
32.66220,
    R
138.69800,
    R
151.61200,
    R
17.19030,
    R
79.53840,
    R
437.80200,
    R
11.31330,
    R
116.39200,
    R
7.19167,
    R
12.97670,
    R
71.70700,
    R
21.57370,
    R
15.65570,
    R
33.69830,
    R
26.25690,
    R
21.24830,
    R
66.53090,
    R
13.75050,
    R
51.57060,
    R
152.96400,
    R
13.94050,
    R
52.37420,
    R
11.08640,
    R
24.07350,
    R
38.15330,
    R
108.60000,
    R
32.57110,
    R
54.38330,
    R
22.77100,
    R
19.63030,
    R
10.36040,
    R
387.34400,
    R
42.01700,
    R
39.86180,
    R
13.32640,
    R
42.84370,
    R
645.42800,
    R
21.60460,
    R
78.69930,
    R
29.11480,
    R
248.53900,
    R
200.60100,
    R
25.18490,
    R
19.62460,
    R
15.23350,
    R
100.21400,
    R
30.12810,
    R
58.87310,
    R
18.72470,
    R
11.83580,
    R
782.13000,
    R
180.03400,
    R
30.54340,
    R
205.84500,
    R
64.98920,
    R
31.48870,
    R
23.27390,
    R
138.82300,
    R
36.53690,
    R
31.47300
  ]

-- WAG exchangeability matrix n alphabetical order.
wagExch :: ExchangeabilityMatrix
wagExch :: Matrix R
wagExch = Matrix R -> Matrix R
pamlToAlphaMat forall a b. (a -> b) -> a -> b
$ forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListLower Int
n [R]
wagExchRawPaml

-- WAG stationary distribution in PAML order.
wagStatDistPaml :: StationaryDistribution
wagStatDistPaml :: Vector R
wagStatDistPaml =
  Vector R -> Vector R
normalizeSumVec forall a b. (a -> b) -> a -> b
$
    forall a. Storable a => [a] -> Vector a
fromList
      [ R
0.0866279,
        R
0.043972,
        R
0.0390894,
        R
0.0570451,
        R
0.0193078,
        R
0.0367281,
        R
0.0580589,
        R
0.0832518,
        R
0.0244313,
        R
0.048466,
        R
0.086209,
        R
0.0620286,
        R
0.0195027,
        R
0.0384319,
        R
0.0457631,
        R
0.0695179,
        R
0.0610127,
        R
0.0143859,
        R
0.0352742,
        R
0.0708957
      ]

-- WAG stationary distribution in alphabetical order.
wagStatDist :: StationaryDistribution
wagStatDist :: Vector R
wagStatDist = Vector R -> Vector R
pamlToAlphaVec Vector R
wagStatDistPaml

-- | LG substitution model.
wag :: Normalize -> SubstitutionModel
wag :: Normalize -> SubstitutionModel
wag Normalize
nz = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"WAG" [] Normalize
nz Vector R
wagStatDist Matrix R
wagExch

-- | LG substitution model with maybe a name and a custom stationary distribution.
wagCustom :: Maybe String -> Normalize -> StationaryDistribution -> SubstitutionModel
wagCustom :: Maybe [Char] -> Normalize -> Vector R -> SubstitutionModel
wagCustom Maybe [Char]
mnm Normalize
nz Vector R
d = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Normalize
nz Vector R
d Matrix R
wagExch
  where
    nm :: [Char]
nm = forall a. a -> Maybe a -> a
fromMaybe [Char]
"WAG-Custom" Maybe [Char]
mnm

matrixSetDiagToZero :: Matrix R -> Matrix R
matrixSetDiagToZero :: Matrix R -> Matrix R
matrixSetDiagToZero Matrix R
m = Matrix R
m forall a. Num a => a -> a -> a
- forall a. (Num a, Element a) => Vector a -> Matrix a
diag (forall t. Element t => Matrix t -> Vector t
takeDiag Matrix R
m)
{-# INLINE matrixSetDiagToZero #-}

uniformExch :: ExchangeabilityMatrix
uniformExch :: Matrix R
uniformExch = Matrix R -> Matrix R
matrixSetDiagToZero forall a b. (a -> b) -> a -> b
$ Int -> [R] -> Matrix R
matrix Int
n forall a b. (a -> b) -> a -> b
$ forall a. Int -> a -> [a]
replicate (Int
n forall a. Num a => a -> a -> a
* Int
n) R
1.0

poissonExch :: ExchangeabilityMatrix
poissonExch :: Matrix R
poissonExch = Matrix R
uniformExch

uniformVec :: Vector Double
uniformVec :: Vector R
uniformVec = forall a. Storable a => Int -> a -> Vector a
V.replicate Int
n (R
1 forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n)

-- | Poisson substitution model.
poisson :: Normalize -> SubstitutionModel
poisson :: Normalize -> SubstitutionModel
poisson Normalize
nz = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"Poisson" [] Normalize
nz Vector R
uniformVec Matrix R
poissonExch

-- | Poisson substitution model with maybe a name and a custom stationary distribution.
poissonCustom :: Maybe String -> Normalize -> StationaryDistribution -> SubstitutionModel
poissonCustom :: Maybe [Char] -> Normalize -> Vector R -> SubstitutionModel
poissonCustom Maybe [Char]
mnm Normalize
nz Vector R
d = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
nm [] Normalize
nz Vector R
d Matrix R
poissonExch
  where
    nm :: [Char]
nm = forall a. a -> Maybe a -> a
fromMaybe [Char]
"Poisson-Custom" Maybe [Char]
mnm

-- | General time reversible (GTR) substitution model for amino acids.
gtr20 :: Normalize -> [Double] -> StationaryDistribution -> SubstitutionModel
gtr20 :: Normalize -> [R] -> Vector R -> SubstitutionModel
gtr20 Normalize
nz [R]
es Vector R
d = Alphabet
-> [Char]
-> [R]
-> Normalize
-> Vector R
-> Matrix R
-> SubstitutionModel
substitutionModel Alphabet
Protein [Char]
"GTR" [R]
es Normalize
nz Vector R
d Matrix R
e
  where
    e :: Matrix R
e = forall a.
(RealFrac a, Container Vector a) =>
Int -> [a] -> Matrix a
exchFromListUpper Int
n [R]
es