module Biobase.SubstMatrix where import Control.DeepSeq (NFData(..)) import Control.Lens import Control.Monad.Except import Control.Monad.IO.Class import Data.Aeson (FromJSON,ToJSON) import Data.Binary (Binary) import Data.List (maximumBy,find) import Data.Serialize (Serialize) import Data.Vector.Unboxed.Deriving import GHC.Generics (Generic) import Numeric.Log import qualified Data.Map.Strict as M import qualified Data.Vector.Unboxed as VU import System.Directory (doesFileExist) import System.Exit (exitSuccess) import Text.Printf import Biobase.GeneticCodes.Translation import Biobase.GeneticCodes.Types import Biobase.Primary.AA (aaRange) import Biobase.Primary.Letter import Biobase.Primary.Nuc.DNA () import Biobase.Primary.Trans import Biobase.Types.BioSequence (DNA,AA) import Biobase.Types.Codon import Data.PrimitiveArray as PA import Numeric.Discretized import qualified Biobase.Primary.AA as AA import qualified Biobase.Primary.Nuc.DNA as DNA import StatisticalMechanics.Ensemble import Statistics.Odds import Statistics.Probability import Biobase.SubstMatrix.Embedded import Biobase.SubstMatrix.Import import Biobase.SubstMatrix.Types -- | The usual substitution matrix, but here with a codon and an amino acid -- to be compared. -- -- The resulting @AA@ are tagged with the name type from the @DNA n@. -- -- TODO Definitely use the correct upper bound constants here! mkANuc3SubstMat ∷ TranslationTable (Letter DNA n) (Letter AA n) → AASubstMat t (DiscLogOdds k) n → ANuc3SubstMat t (Letter AA n, (DiscLogOdds k)) n n mkANuc3SubstMat tbl (AASubstMat m) = ANuc3SubstMat $ fromAssocs (ZZ:..LtLetter AA.Undef:..LtLetter DNA.N:..LtLetter DNA.N:..LtLetter DNA.N) (AA.Undef, DiscLogOdds . Discretized $ -999) [ ( (Z:.a:.u:.v:.w) , (t, m!(Z:.a:.t)) ) | a <- VU.toList aaRange , u <- [DNA.A .. DNA.N], v <- [DNA.A .. DNA.N], w <- [DNA.A .. DNA.N] , let b = Codon u v w , let t = translate tbl b ] -- | This function does the following: -- 1. check if @fname@ is a file, and if so try to load it. -- 2. if not, check if @fname@ happens to be the name of one of the known @PAM/BLOSUM@ tables. fromFileOrCached ∷ (MonadIO m, MonadError String m) ⇒ FilePath → m (AASubstMat t (DiscLogOdds Unknown) a) fromFileOrCached fname = do dfe ← liftIO $ doesFileExist fname if | fname == "list" → do mapM_ (liftIO . printf "%s\n" . fst) embeddedPamBlosum liftIO exitSuccess | dfe → fromFile fname | Just (k,v) ← find ((fname==).fst) embeddedPamBlosum → return v | otherwise → throwError $ fname ++ " is neither a file nor a known substitution matrix" -- | Turn log-odds into log-probabilities. Normalizes over the whole set of -- values in the matrix. mkProbabilityMatrix ∷ Double → AASubstMat t (DiscLogOdds k) n → AASubstMat t (Log (Probability NotNormalized Double)) n mkProbabilityMatrix invScale (AASubstMat dlo) = AASubstMat $ PA.map (/nrm) $ dbl where dbl = PA.map (\(DiscLogOdds (Discretized k)) → stateLogProbability (negate invScale) $ fromIntegral @Int @Double k) dlo nrm = maximum . Prelude.map snd $ PA.assocs dbl {- -- | Create a 2-tuple to amino acid substitution matrix. Here, @f@ combines -- all to entries that have the same 2-tuple index. mkANuc2SubstMat ∷ ((Z:.Letter AA:.Letter DNA:.Letter DNA) → (Letter AA, DiscLogOdds) → (Letter AA, DiscLogOdds) → Ordering) → AASubstMat t DiscLogOdds → ANuc2SubstMat t (Letter AA, DiscLogOdds) mkANuc2SubstMat f (AASubstMat m) = ANuc2SubstMat $ fromAssocs (ZZ:..LtLetter (length aaRange):..LtLetter 5:..LtLetter 5) (AA.Undef, DiscLogOdds $ -999) . M.assocs . M.mapWithKey (\k → maximumBy (f k)) . M.fromListWith (++) $ [ ((Z:.a:.x:.y), [maybe (AA.Undef, DiscLogOdds $ -999) (\k -> (k, m!(Z:.a:.k))) $ M.lookup uvw dnaAAmap]) | a <- aaRange , u <- [DNA.A .. DNA.N], v <- [DNA.A .. DNA.N], w <- [DNA.A .. DNA.N] , (x,y) <- [ (u,v), (u,w), (v,w) ] , let uvw = VU.fromList [u,v,w] ] -- | The most degenerate case, where just a single nucleotide remains in -- the amino-acid / nucleotide substitution. Again, @f@ combines different -- entries. mkANuc1SubstMat ∷ ((Z:.Letter AA:.Letter DNA) → (Letter AA, DiscLogOdds) → (Letter AA, DiscLogOdds) → Ordering) → AASubstMat t DiscLogOdds → ANuc1SubstMat t (Letter AA, DiscLogOdds) mkANuc1SubstMat f (AASubstMat m) = ANuc1SubstMat $ fromAssocs (ZZ:..LtLetter (length aaRange):..LtLetter 5) (AA.Undef, DiscLogOdds $ -999) . M.assocs . M.mapWithKey (\k → maximumBy (f k)) . M.fromListWith (++) $ [ ((Z:.a:.x), [maybe (AA.Undef, DiscLogOdds $ -999) (\k -> (k, m!(Z:.a:.k))) $ M.lookup uvw dnaAAmap]) | a <- aaRange , u <- [DNA.A .. DNA.N], v <- [DNA.A .. DNA.N], w <- [DNA.A .. DNA.N] , x <- [u,v,w] , let uvw = VU.fromList [u,v,w] ] -}