{-# LANGUAGE BangPatterns, OverloadedStrings #-}
module TransformCloneList ( onlyMutations
, frequentMutations
) where
import Data.Maybe
import Data.List
import qualified Data.Text as T
import qualified Data.Map.Strict as Map
import Data.Tuple
import Data.Monoid
import Control.Arrow (second)
import Data.Fasta.Text
import qualified Data.List.Split as Split
import Types
import Utility
import Diversity
noGaps :: T.Text -> Bool
noGaps = not . any (\x -> x == '.' || x == '-') . T.unpack
onlyMutations :: CodonMut -> T.Text -> T.Text -> CloneEntry -> CloneEntry
onlyMutations codonMut codonMutType mutType = newEntry
where
newEntry (!germ, !fseqs) = (germ, map (removeCodon germ) fseqs)
removeCodon germ fseq = fseq { fastaSeq = remove (fastaSeq germ)
. fastaSeq
$ fseq
}
remove germSeq = mconcat
. map (snd . replaceCodon)
. zip (codonSplit germSeq)
. codonSplit
replaceCodon (x, y)
| (codonMutOp codonMutType) (hamming x y) codonMut
&& isMutType (T.toUpper mutType) x y
&& noGaps x
&& noGaps y = (x, y)
| otherwise = ("---", "---")
codonSplit = fullCodon . T.chunksOf 3
fullCodon = filter ((== 3) . T.length)
codonMutOp ">" = (>)
codonMutOp "<" = (<)
codonMutOp "=" = (==)
isMutType "REPLACEMENT" x y = codon2aa x /= codon2aa y
isMutType "SILENT" x y = codon2aa x == codon2aa y
isMutType "ALL" _ _ = True
isMutType _ _ _ = error "Unknown mutation type"
frequentMutations :: Maybe Int
-> Maybe Int
-> Maybe Double
-> CloneEntry
-> CloneEntry
frequentMutations minSeqs mutCount mutPercent entry@(!germline, !fseqs) =
newEntry
where
newEntry = (germline, map removeCodon fseqs)
removeCodon fseq = fseq { fastaSeq = replace fseq }
replace = rejoinCodons
. replaceCodon minSeqs mutCount mutPercent numSeqs countMap
. positionalCodons germline
rejoinCodons = T.pack . concatMap (map (snd . snd))
countMap = getCountMap germline fseqs
numSeqs = length fseqs
replaceCodon :: Maybe Int
-> Maybe Int
-> Maybe Double
-> Int
-> CountMap
-> CodonMutations
-> CodonMutations
replaceCodon minSeqs mutCount mutPercent numSeqs countMap = map replace
where
replace x = if (any isValid . filter isMutation $ x)
&& (not . any (isGapMut . snd) $ x)
then x
else map ((second . second) (const '-')) x
isGapMut (x, y) = x == '.' || x == '-' || y == '.' || y == '-'
isValid x = case Map.lookup x countMap of
(Just count) -> not
. fromMaybe False
. fmap getAny
$ minSeqsValid
<> mutCountValid count
<> mutPercentValid count
Nothing -> error ("Mutation not found: " ++ show x)
minSeqsValid = fmap (Any . not . (>=) numSeqs) minSeqs
mutCountValid count = case mutCount of
(Just 0) -> fmap (Any . not . (==) count)
$ Just numSeqs
(Just x) -> Just . Any . not . (>=) count $ x
Nothing -> Nothing
mutPercentValid count =
fmap
(Any . not . (>=) ((fromIntegral count / fromIntegral numSeqs) * 100))
mutPercent
isMutation (_, (x, y)) = x /= y
positionalCodons :: Germline -> FastaSequence -> CodonMutations
positionalCodons germline = fullCodon . codonSplit . T.unpack . fastaSeq
where
codonSplit = fullCodon
. Split.chunksOf 3
. zip [1..]
. zip (T.unpack . fastaSeq $ germline)
fullCodon = filter ((== 3) . length)
getCountMap :: Germline
-> [FastaSequence]
-> CountMap
getCountMap (FastaSequence { fastaSeq = germlineSeq }) =
Map.unionsWith (+) . map initializeMaps
where
initializeMaps = Map.fromList
. map swap
. zip [1,1..]
. zip [1..]
. T.zip germlineSeq
. fastaSeq