module Bio.Tools.Sequence.CodonOptimization.Algo ( optimizeAA , optimizeDNA , score , scoreSequence , scoreCmp ) where import Bio.NucleicAcid.Nucleotide (symbol) import Bio.NucleicAcid.Nucleotide.Type (DNA (..)) import Bio.Protein.AminoAcid.Type (AA (..)) import Bio.Tools.Sequence.CodonOptimization.Constants (ak2Codon, ak2MaxFrequCodon, codon2ak, codonFrequencies, motiveScoreWindow, defaultMotiveScore, forbiddenMotiveScore) import Bio.Tools.Sequence.CodonOptimization.Types (CodonScoreConfig (..), standardTemperature) import Bio.Tools.Sequence.ViennaRNA.Fold (fold) import Data.List (foldl', maximumBy, take) import Data.Map as Map (lookup) import Data.Maybe (fromMaybe) import Text.Regex.TDFA ((=~)) -- | 'optimizeDNA' function does translation from [DNA] to [AA] and then calls 'optimizeAA' optimizeDNA :: CodonScoreConfig -- ^ Config data object. Contains main parameters of codon-optimization and all parameters for scoring function -> [DNA] -- ^ Initial, not optimized nucleotide sequence -> [DNA] -- ^ Result, optimized nucleotide sequence optimizeDNA cfg dna = optimizeAA cfg (translate dna) where translate :: [DNA] -> [AA] translate [] = [] translate dnaSeq = case Map.lookup (take 3 dnaSeq) codon2ak of Just ak -> ak : translate (drop 3 dnaSeq) _ -> error $ "Unknown codon: " ++ show (take 3 dnaSeq) -- | 'optimizeAA' function does codon-optimisation for incoming amino-acid sequence. -- Incoming amino-acid sequence transformed to nucleotide sequence and optimized used the codon-optimization algorithm. -- Algorithm described here doi: 10.1007/s11693-010-9062-3 optimizeAA :: CodonScoreConfig -- ^ Config data object. Contains main parameters of codon-optimization and all parameters for scoring function -> [AA] -- ^ Initial, not optimized amino-acid sequence -> [DNA] -- ^ Result, optimized nucleotide sequence optimizeAA cfg@(CodonScoreConfig organism initLen winLen _ _ _ _ _ _ _ _ _ _) aa = foldl' concatByScore initial variants where lenAA = length aa variants = generateVariants (drop initLen aa) winLen fequCodonsMap = ak2MaxFrequCodon organism initial = concatMap (\ak -> maybe "" fst (Map.lookup ak fequCodonsMap)) (take initLen aa) -- | 'concatByScore' function gets maximum by score variable string and then concat it to result string concatByScore :: [DNA] -- ^ initial string -> [[DNA]] -- ^ list of variable string -> [DNA] -- ^ result string concatByScore result vars | length result == 3 * (lenAA - winLen - 1) = result ++ maximumBy (scoreCmp cfg result) vars | otherwise = result ++ take 3 (maximumBy (scoreCmp cfg result) vars) -- | 'generateVariants' function generates list of all possible variants of nucleotide sequence for amino-acid sequence. -- It is just recursive execution of 'windowVariants' for all amino-acid sequence. -- Example: generateVariants [PHE,TRP,GLU,MET] 2 => [[[DT,DT,DT,DT,DG,DG,DG,DA,DA], [DT,DT,DC,DT,DG,DG,DG,DA,DA], [DT,DT,DT,DT,DG,DG,DG,DA,DG], [DT,DT,DC,DT,DG,DG,DG,DA,DG]], -- [[DT,DG,DG,DG,DA,DA,DA,DT,DG], [DT,DG,DG,DG,DA,DG,DA,DT,DG]]] -- Returns empty list in case of empty incoming string generateVariants :: [AA] -- ^ amino-acid sequence -> Int -- ^ length of window. means how much amino-acid from the right side side will be taken during scoring variant for one codon -> [[[DNA]]] -- ^ result list. for each amino-acid now there is list of all variants for nucleotide sequence generateVariants [] _ = [] generateVariants aa winLen | length aa == winLen = [] | otherwise = windowVariants aa winLen : generateVariants (drop 1 aa) winLen -- | 'windowVariants' function generates list of all possible variants of nucleotide sequence for amino-acid window. -- Example: windowVariants [PHE,TRP,GLU,MET] 2 => [[DT,DT,DT,DT,DG,DG,DG,DA,DA], [DT,DT,DC,DT,DG,DG,DG,DA,DA], [DT,DT,DT,DT,DG,DG,DG,DA,DG], [DT,DT,DC,DT,DG,DG,DG,DA,DG]] -- Returns empty list in case of empty incoming string windowVariants :: [AA] -> Int -> [[DNA]] windowVariants sequ winLen = map concat . mapM getCodons . take (winLen + 1) $ sequ -- | 'getCodons' function gets list of codons for amino-acid -- Example: 'getCodons' PRO => [[DC,DC,DT], [DC,DC,DC], [DC,DC,DA], [DC,DC,DG]] -- Returns empty list in case of unknown amino-acid getCodons :: AA -> [[DNA]] getCodons ak = fromMaybe [] (Map.lookup ak ak2Codon) -- | 'scoreSequence' function calculates the average score for full sequence scoreSequence :: CodonScoreConfig -> [DNA] -> Double scoreSequence cnf@(CodonScoreConfig _ initLen winLen _ _ _ _ _ _ _ _ _ _) nkSequ = sum res / realToFrac (length res) where res = scr ((initLen + winLen + 1) * 3) [] scr :: Int -> [Double] -> [Double] scr partLen acc | partLen > length nkSequ = acc | otherwise = scr (partLen + winLen * 3) (score cnf (take partLen nkSequ) : acc) -- | 'score' function gets scoring for incoming string. -- Scoring function is a composite function of several scoring. More about scoring algorithm see here doi: 10.1007/s11693-010-9062-3 score :: CodonScoreConfig -- ^ Config data object. Contains main parameters of codon-optimization and all parameters for scoring function -> [DNA] -- ^ nucleotide sequence to score -> Double -- ^ result score value score (CodonScoreConfig organism _ winLen codonUsageWeight gcWeight gcFactor gcWindow rnaFoldingWeight rnaFoldingFactor rnaFoldingWindow forbiddenDNAWeight gcContentDesired forbiddenRegexp) nkSequ = scoreCU + scoreGC - scoreMT - realToFrac scoreRNAFold where sequLen = length nkSequ optimizedLen = sequLen - (winLen + 1) * 3 scoreGC = if optimizedLen < gcWindow - optimizedLen -- check if we have enough sequence for gcWindow then 0 else realToFrac gcWeight * gcScore (drop (sequLen - gcWindow) nkSequ) scoreCU = realToFrac codonUsageWeight * codonUsage (drop optimizedLen nkSequ) scoreMT = realToFrac forbiddenDNAWeight * motiveScore nkSequ scoreRNAFold = scoreRnaf nkSequ -- | 'gcScore' function for the GC content. -- It is negatively counted absolute difference between the desired GC content and the GC content of a test sequence. gcScore :: [DNA] -> Double gcScore sequ = -abs (gc / (at + gc) * 100 - realToFrac gcContentDesired) ** gcFactor where gc = realToFrac $ length $ filter (\s -> s == DC || s == DG) sequ at = realToFrac $ length $ filter (\s -> s == DA || s == DT) sequ -- | 'codonUsage' function gets higest score for most frequently used codons codonUsage :: [DNA] -> Double codonUsage sequ = (cai ** (1 / codonCount)) * 100 where codonCount = realToFrac (length sequ) / 3 cai = countWeight sequ 1.0 -- | 'countWeight' function is recursive counting weight for incoming string according to codon usage frequencies countWeight :: [DNA] -> Double -> Double countWeight [] acc = acc countWeight windowSeq acc = countWeight (drop 3 windowSeq) (acc * fromMaybe 0 (countWeightMb windowSeq)) countWeightMb :: [DNA] -> Maybe Double countWeightMb str = do let codon = take 3 str codonFreq <- Map.lookup codon (codonFrequencies organism) ak <- Map.lookup codon codon2ak (_, codonMaxFreq) <- Map.lookup ak (ak2MaxFrequCodon organism) return $ codonFreq / codonMaxFreq -- | 'motiveScore' counts score for the occurrence of desired and unwanted DNA motifs. motiveScore :: [DNA] -> Double motiveScore sequ = if any (drop (length sequ - motiveScoreWindow) (map symbol sequ) =~) forbiddenRegexp then forbiddenMotiveScore else defaultMotiveScore -- | 'scoreRnaf' counts energy of RNA folding scoreRnaf :: [DNA] -> Int scoreRnaf sequ = truncate $ rnaFoldingWeight * (abs result ** rnaFoldingFactor) where result = fst $ fold standardTemperature (drop (length sequ - rnaFoldingWindow) sequ) -- | 'scoreCmp' is compare function for two strings using 'score' function scoreCmp :: CodonScoreConfig -> [DNA] -> [DNA] -> [DNA] -> Ordering scoreCmp cfg optimized str1 str2 = compare score1 score2 where score1 = score cfg (optimized ++ str1) score2 = score cfg (optimized ++ str2)