module Biobase.SubstMatrix where
import Control.DeepSeq (NFData(..))
import Control.Lens
import Control.Monad.Except
import Control.Monad.IO.Class
import Data.Aeson (FromJSON,ToJSON)
import Data.Binary (Binary)
import Data.List (maximumBy,find)
import Data.Serialize (Serialize)
import Data.Vector.Unboxed.Deriving
import Debug.Trace (traceShow)
import GHC.Generics (Generic)
import GHC.Real(Ratio(..))
import Numeric.Log
import qualified Data.Map.Strict as M
import qualified Data.Vector.Unboxed as VU
import System.Directory (doesFileExist)
import System.Exit (exitSuccess)
import Text.Printf
import Biobase.GeneticCodes.Translation
import Biobase.GeneticCodes.Types
import Biobase.Primary.AA (aaRange)
import Biobase.Primary.Letter
import Biobase.Primary.Nuc.DNA ()
import Biobase.Primary.Trans
import Biobase.Types.BioSequence (DNA,AA)
import Biobase.Types.Codon
import Data.PrimitiveArray as PA
import Numeric.Discretized
import qualified Biobase.Primary.AA as AA
import qualified Biobase.Primary.Nuc.DNA as DNA
import StatisticalMechanics.Ensemble
import Statistics.Odds
import Statistics.Probability
import Biobase.SubstMatrix.Embedded
import Biobase.SubstMatrix.Import
import Biobase.SubstMatrix.Statistics
import Biobase.SubstMatrix.Types
mkANuc3SubstMat
∷ TranslationTable (Letter DNA n) (Letter AA n)
→ AASubstMat t (DiscLogOdds k) n n
→ ANuc3SubstMat t (Letter AA n, (DiscLogOdds k)) n n
mkANuc3SubstMat :: TranslationTable (Letter DNA n) (Letter AA n)
-> AASubstMat t (DiscLogOdds k) n n
-> ANuc3SubstMat t (Letter AA n, DiscLogOdds k) n n
mkANuc3SubstMat TranslationTable (Letter DNA n) (Letter AA n)
tbl (AASubstMat Unboxed ((Z :. Letter AA n) :. Letter AA n) (DiscLogOdds k)
m Double
l)
= Unboxed
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
(Letter AA n, DiscLogOdds k)
-> ANuc3SubstMat t (Letter AA n, DiscLogOdds k) n n
forall k k k (t :: k) s (a :: k) (n :: k).
Unboxed
((((Z :. Letter AA a) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
s
-> ANuc3SubstMat t s a n
ANuc3SubstMat
(Unboxed
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
(Letter AA n, DiscLogOdds k)
-> ANuc3SubstMat t (Letter AA n, DiscLogOdds k) n n)
-> Unboxed
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
(Letter AA n, DiscLogOdds k)
-> ANuc3SubstMat t (Letter AA n, DiscLogOdds k) n n
forall a b. (a -> b) -> a -> b
$ LimitType
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
-> (Letter AA n, DiscLogOdds k)
-> [((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n,
(Letter AA n, DiscLogOdds k))]
-> Unboxed
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
(Letter AA n, DiscLogOdds k)
forall (arr :: * -> * -> *) sh elm.
PrimArrayOps arr sh elm =>
LimitType sh -> elm -> [(sh, elm)] -> arr sh elm
fromAssocs (LimitType Z
ZZLimitType Z
-> LimitType (Letter AA n) -> LimitType (Z :. Letter AA n)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter AA n -> LimitType (Letter AA n)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter AA n
forall k (n :: k). Letter AA n
AA.UndefLimitType (Z :. Letter AA n)
-> LimitType (Letter DNA n)
-> LimitType ((Z :. Letter AA n) :. Letter DNA n)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter DNA n -> LimitType (Letter DNA n)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter DNA n
forall k (n :: k). Letter DNA n
DNA.NLimitType ((Z :. Letter AA n) :. Letter DNA n)
-> LimitType (Letter DNA n)
-> LimitType (((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter DNA n -> LimitType (Letter DNA n)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter DNA n
forall k (n :: k). Letter DNA n
DNA.NLimitType (((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
-> LimitType (Letter DNA n)
-> LimitType
((((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n)
forall zs z. LimitType zs -> LimitType z -> LimitType (zs :. z)
:..Letter DNA n -> LimitType (Letter DNA n)
forall k l (n :: k). Letter l n -> LimitType (Letter l n)
LtLetter Letter DNA n
forall k (n :: k). Letter DNA n
DNA.N) (Letter AA n
forall k (n :: k). Letter AA n
AA.Undef, Discretized k -> DiscLogOdds k
forall k (t :: k). Discretized t -> DiscLogOdds t
DiscLogOdds (Discretized k -> DiscLogOdds k)
-> (Int -> Discretized k) -> Int -> DiscLogOdds k
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Discretized k
forall k (b :: k). Int -> Discretized b
Discretized (Int -> DiscLogOdds k) -> Int -> DiscLogOdds k
forall a b. (a -> b) -> a -> b
$ -Int
999)
[ ( (Z
ZZ -> Letter AA n -> Z :. Letter AA n
forall a b. a -> b -> a :. b
:.Letter AA n
a(Z :. Letter AA n)
-> Letter DNA n -> (Z :. Letter AA n) :. Letter DNA n
forall a b. a -> b -> a :. b
:.Letter DNA n
u((Z :. Letter AA n) :. Letter DNA n)
-> Letter DNA n
-> ((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n
forall a b. a -> b -> a :. b
:.Letter DNA n
v(((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
-> Letter DNA n
-> (((Z :. Letter AA n) :. Letter DNA n) :. Letter DNA n)
:. Letter DNA n
forall a b. a -> b -> a :. b
:.Letter DNA n
w)
, (TargetType (Codon (Letter DNA n))
Letter AA n
t, DiscLogOdds k
x)
)
| Letter AA n
a <- Vector (Letter AA n) -> [Letter AA n]
forall a. Unbox a => Vector a -> [a]
VU.toList Vector (Letter AA n)
forall k (n :: k). Vector (Letter AA n)
aaRange
, Letter DNA n
u <- [Letter DNA n
forall k (n :: k). Letter DNA n
DNA.A .. Letter DNA n
forall k (n :: k). Letter DNA n
DNA.N], Letter DNA n
v <- [Letter DNA n
forall k (n :: k). Letter DNA n
DNA.A .. Letter DNA n
forall k (n :: k). Letter DNA n
DNA.N], Letter DNA n
w <- [Letter DNA n
forall k (n :: k). Letter DNA n
DNA.A .. Letter DNA n
forall k (n :: k). Letter DNA n
DNA.N]
, let b :: Codon (Letter DNA n)
b = Letter DNA n
-> Letter DNA n -> Letter DNA n -> Codon (Letter DNA n)
forall c. c -> c -> c -> Codon c
Codon Letter DNA n
u Letter DNA n
v Letter DNA n
w
, let t :: TargetType (Codon (Letter DNA n))
t = TranslationTable
(CodonType (Codon (Letter DNA n))) (AAType (Codon (Letter DNA n)))
-> Codon (Letter DNA n) -> TargetType (Codon (Letter DNA n))
forall t.
Translation t =>
TranslationTable (CodonType t) (AAType t) -> t -> TargetType t
translate TranslationTable
(CodonType (Codon (Letter DNA n))) (AAType (Codon (Letter DNA n)))
TranslationTable (Letter DNA n) (Letter AA n)
tbl Codon (Letter DNA n)
b
, DiscLogOdds k
x <- [DiscLogOdds k]
-> (DiscLogOdds k -> [DiscLogOdds k])
-> Maybe (DiscLogOdds k)
-> [DiscLogOdds k]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [] (DiscLogOdds k -> [DiscLogOdds k] -> [DiscLogOdds k]
forall a. a -> [a] -> [a]
:[]) (Maybe (DiscLogOdds k) -> [DiscLogOdds k])
-> Maybe (DiscLogOdds k) -> [DiscLogOdds k]
forall a b. (a -> b) -> a -> b
$ Unboxed ((Z :. Letter AA n) :. Letter AA n) (DiscLogOdds k)
mUnboxed ((Z :. Letter AA n) :. Letter AA n) (DiscLogOdds k)
-> ((Z :. Letter AA n) :. Letter AA n) -> Maybe (DiscLogOdds k)
forall (arr :: * -> * -> *) sh elm.
PrimArrayOps arr sh elm =>
arr sh elm -> sh -> Maybe elm
!?(Z
ZZ -> Letter AA n -> Z :. Letter AA n
forall a b. a -> b -> a :. b
:.Letter AA n
a(Z :. Letter AA n)
-> Letter AA n -> (Z :. Letter AA n) :. Letter AA n
forall a b. a -> b -> a :. b
:.TargetType (Codon (Letter DNA n))
Letter AA n
t)
]
fromFileOrCached ∷ (MonadIO m, MonadError String m) ⇒ FilePath → m (AASubstMat t (DiscLogOdds (1 :% 1)) a b)
fromFileOrCached :: FilePath -> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
fromFileOrCached FilePath
fname = do
Bool
dfe ← IO Bool -> m Bool
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO Bool -> m Bool) -> IO Bool -> m Bool
forall a b. (a -> b) -> a -> b
$ FilePath -> IO Bool
doesFileExist FilePath
fname
if | FilePath
fname FilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
== FilePath
"list" → do
((FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> m ())
-> [(FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)]
-> m ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (IO () -> m ()
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO () -> m ())
-> ((FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> IO ())
-> (FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> m ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FilePath -> FilePath -> IO ()
forall r. PrintfType r => FilePath -> r
printf FilePath
"%s\n" (FilePath -> IO ())
-> ((FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> FilePath)
-> (FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)
-> FilePath
forall a b. (a, b) -> a
fst) [(FilePath, AASubstMat Any (DiscLogOdds (1 ':% 1)) Any Any)]
forall k1 k2 k3 (t :: k1) (a :: k2) (b :: k3).
[(FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)]
embeddedPamBlosum
IO (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
-> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO IO (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall a. IO a
exitSuccess
| Bool
dfe → FilePath -> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall k1 k2 k3 k4 (m :: * -> *) (k5 :: k1) (t :: k2) (a :: k3)
(b :: k4).
(MonadIO m, MonadError FilePath m, Real (Discretized k5)) =>
FilePath -> m (AASubstMat t (DiscLogOdds k5) a b)
fromFile FilePath
fname
| Just (FilePath
k,AASubstMat t (DiscLogOdds (1 ':% 1)) a b
v) ← ((FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b) -> Bool)
-> [(FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)]
-> Maybe (FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((FilePath
fnameFilePath -> FilePath -> Bool
forall a. Eq a => a -> a -> Bool
==)(FilePath -> Bool)
-> ((FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
-> FilePath)
-> (FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
.(FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b) -> FilePath
forall a b. (a, b) -> a
fst) [(FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)]
forall k1 k2 k3 (t :: k1) (a :: k2) (b :: k3).
[(FilePath, AASubstMat t (DiscLogOdds (1 ':% 1)) a b)]
embeddedPamBlosum → AASubstMat t (DiscLogOdds (1 ':% 1)) a b
-> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall (m :: * -> *) a. Monad m => a -> m a
return AASubstMat t (DiscLogOdds (1 ':% 1)) a b
v
| Bool
otherwise → FilePath -> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall e (m :: * -> *) a. MonadError e m => e -> m a
throwError (FilePath -> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b))
-> FilePath -> m (AASubstMat t (DiscLogOdds (1 ':% 1)) a b)
forall a b. (a -> b) -> a -> b
$ FilePath
fname FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ FilePath
" is neither a file nor a known substitution matrix"
mkProbabilityMatrix
:: Double
-> AASubstMat t (DiscLogOdds k) n m
-> AASubstMat t (Log (Probability NotNormalized Double)) n m
mkProbabilityMatrix :: Double
-> AASubstMat t (DiscLogOdds k) n m
-> AASubstMat t (Log (Probability 'NotNormalized Double)) n m
mkProbabilityMatrix Double
invScale (AASubstMat Unboxed ((Z :. Letter AA n) :. Letter AA m) (DiscLogOdds k)
dlo Double
l) = Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
-> Double
-> AASubstMat t (Log (Probability 'NotNormalized Double)) n m
forall k k k (t :: k) s (a :: k) (b :: k).
Unboxed ((Z :. Letter AA a) :. Letter AA b) s
-> Double -> AASubstMat t s a b
AASubstMat ((Log (Probability 'NotNormalized Double)
-> Log (Probability 'NotNormalized Double))
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
forall k (arr :: k -> * -> *) (sh :: k) e e'.
PrimArrayMap arr sh e e' =>
(e -> e') -> arr sh e -> arr sh e'
mapArray (Log (Probability 'NotNormalized Double)
-> Log (Probability 'NotNormalized Double)
-> Log (Probability 'NotNormalized Double)
forall a. Fractional a => a -> a -> a
/Log (Probability 'NotNormalized Double)
nrm) (Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double)))
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
forall a b. (a -> b) -> a -> b
$ Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
dbl) Double
1.0
where dbl :: Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
dbl = (DiscLogOdds k -> Log (Probability 'NotNormalized Double))
-> Unboxed ((Z :. Letter AA n) :. Letter AA m) (DiscLogOdds k)
-> Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
forall k (arr :: k -> * -> *) (sh :: k) e e'.
PrimArrayMap arr sh e e' =>
(e -> e') -> arr sh e -> arr sh e'
mapArray (\(DiscLogOdds (Discretized Int
k)) → Double -> Double -> Log (Probability 'NotNormalized Double)
forall a.
StateProbability a =>
Double -> a -> Log (Probability 'NotNormalized Double)
stateLogProbability (Double -> Double
forall a. Num a => a -> a
negate Double
invScale) (Double -> Log (Probability 'NotNormalized Double))
-> Double -> Log (Probability 'NotNormalized Double)
forall a b. (a -> b) -> a -> b
$ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral @Int @Double Int
k) Unboxed ((Z :. Letter AA n) :. Letter AA m) (DiscLogOdds k)
dlo
nrm :: Log (Probability 'NotNormalized Double)
nrm = [Log (Probability 'NotNormalized Double)]
-> Log (Probability 'NotNormalized Double)
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
maximum ([Log (Probability 'NotNormalized Double)]
-> Log (Probability 'NotNormalized Double))
-> ([((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
-> [Log (Probability 'NotNormalized Double)])
-> [((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
-> Log (Probability 'NotNormalized Double)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))
-> Log (Probability 'NotNormalized Double))
-> [((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
-> [Log (Probability 'NotNormalized Double)]
forall a b. (a -> b) -> [a] -> [b]
Prelude.map ((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))
-> Log (Probability 'NotNormalized Double)
forall a b. (a, b) -> b
snd ([((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
-> Log (Probability 'NotNormalized Double))
-> [((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
-> Log (Probability 'NotNormalized Double)
forall a b. (a -> b) -> a -> b
$ Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
-> [((Z :. Letter AA n) :. Letter AA m,
Log (Probability 'NotNormalized Double))]
forall (arr :: * -> * -> *) sh elm.
(IndexStream sh, PrimArrayOps arr sh elm) =>
arr sh elm -> [(sh, elm)]
PA.assocs Unboxed
((Z :. Letter AA n) :. Letter AA m)
(Log (Probability 'NotNormalized Double))
dbl