-- FilterCloneMap module. -- By G.W. Schwartz -- -- Collection of functions for the filtering of a CloneMap {-# LANGUAGE BangPatterns, OverloadedStrings, FlexibleContexts #-} module FilterCloneMap where -- Built in import Data.List import Data.Char import Data.Maybe import Data.Either import qualified Data.Set as S import qualified Data.Map as M import Text.Regex.TDFA import Text.Regex.TDFA.Text import qualified Data.Text as T -- Cabal import Data.Fasta.Text -- Local import Types import Diversity -- | Check if the data structure is Right isRight' :: Either a b -> Bool isRight' (Right _) = True isRight' _ = False -- | Altered version of listToMaybe listToMaybe' :: [a] -> Maybe [a] listToMaybe' [] = Nothing listToMaybe' x = Just x -- | Remove highly mutated sequences (sequences with more than a third of -- their sequence being mutated). filterHighlyMutated :: GeneticUnit -> CodonTable -> CloneMap -> (CloneMap, Maybe String) filterHighlyMutated !genUnit !table !cloneMap = (newCloneMap, errorString) where newCloneMap = M.map (map snd . filter (not . fst) . rights) errorCloneMap errorString = listToMaybe' . unlines . filter (not . null) . map snd . M.toAscList . M.map (intercalate "\n" . lefts) $ errorCloneMap errorCloneMap = M.mapWithKey assignMutated cloneMap assignMutated k = map (isHighlyMutated (snd k)) isHighlyMutated !k !x = case (readSeq genUnit k, readSeq genUnit x) of ((Right a), (Right b)) -> (\n -> Right (n, b)) $ ( (fromIntegral (T.length (fastaSeq a)) :: Double) / 3 ) <= ( ( genericLength . realMutations (fastaSeq a) $ fastaSeq b ) ) ((Left a), (Right _)) -> Left (unwords ["Germline: ", T.unpack a]) ((Right _), (Left b)) -> Left (unwords ["Sequence: ", T.unpack b]) ((Left a), (Left b)) -> Left (unwords [ "Sequence:" , T.unpack b , "with Germline:" , T.unpack a ] ) realMutations k x = filterCodonMutStab (\(!y, !z) -> y /= z) . map snd . mutation k $ x filterCodonMutStab isWhat = filter (filterRules genUnit isWhat) filterRules AminoAcid isWhat x = isWhat x && not (inTuple '-' x) && not (inTuple '.' x) && not (inTuple '~' x) filterRules Nucleotide isWhat x = isWhat x && not (inTuple '-' x) && not (inTuple '.' x) && not (inTuple '~' x) && not (inTuple 'N' x) inTuple c (x, y) | c == x || c == y = True | otherwise = False mutation x y = zip [1..] . T.zip x $ y readSeq Nucleotide x = Right x readSeq AminoAcid x = customTranslate table 1 x -- | Replace codons that have more than CodonMut mutations (make them "---" -- codons). removeCodonMutCount :: CodonMut -> T.Text -> T.Text -> CloneMap -> CloneMap removeCodonMutCount codonMut codonMutType mutType = M.mapWithKey mapRemove where mapRemove (_, germ) = map (removeCodon germ) removeCodon germ clone = clone { fastaSeq = remove (fastaSeq germ) . fastaSeq $ clone } remove germSeq = mconcat . map (snd . replaceCodon) . zip (codonSplit germSeq) . codonSplit replaceCodon (x, y) | (codonMutOp codonMutType) (hamming x y) codonMut && isMutType (T.toUpper mutType) x y = (x, y) | otherwise = ("---", "---") codonSplit = fullCodon . T.chunksOf 3 fullCodon = filter ((== 3) . T.length) codonMutOp ">" = (>) codonMutOp "<" = (<) codonMutOp "=" = (==) isMutType "REPLACEMENT" x y = codon2aa x /= codon2aa y isMutType "SILENT" x y = codon2aa x == codon2aa y isMutType _ _ _ = True -- | Remove clone sequences that have stop codons in the first stopRange -- codons removeStopsCloneMap :: GeneticUnit -> CodonTable -> Int -> CloneMap -> (CloneMap, Maybe String) removeStopsCloneMap !genUnit !table !stopRange !cloneMap = ( newCloneMap , errorString ) where errorString = listToMaybe' . unlines . filter (not . null) . map snd . M.toAscList . M.map ( intercalate "\n" . map T.unpack . lefts . map (customTranslate table 1) ) $ cloneMap newCloneMap = M.map (filter (filterStops genUnit)) cloneMap filterStops Nucleotide x = (isRight' . customTranslate table 1 $ x) && ( not . T.isInfixOf "*" . T.take stopRange . fastaSeq . fromEither . customTranslate table 1 ) x filterStops AminoAcid x = not . T.isInfixOf "*" . T.take stopRange . fastaSeq $ x fromEither (Right x) = x fromEither (Left x) = error (T.unpack x) -- | Remove duplicate sequences removeDuplicatesCloneMap :: CloneMap -> CloneMap removeDuplicatesCloneMap cloneMap = M.map (filter (`S.member` duplicateSet)) cloneMap where duplicateSet = S.fromList . nubBy (\x y -> fastaSeq x == fastaSeq y) . concatMap snd . M.toAscList $ cloneMap -- | Remove out of frame sequences removeOutOfFrameSeqs :: CloneMap -> CloneMap removeOutOfFrameSeqs = M.map (filter isInFrame) where isInFrame = (== 0) . mod 3 . T.length . T.filter (\x -> not $ T.isInfixOf (T.singleton x) ".-") . fastaSeq -- | Remove sequences that do not contain the string customFilter in the -- customField location, split by "|". Note that this is 1 indexed and -- 0 means to search the entire header for the customFilter. If the -- customRemove option is enabled, this function will instead remove -- sequences that have headers which match the custom filter, as opposed to -- the other way around (this is defined in the "equal" function). Also -- takes into account whether to filter on the germline versus the actual -- sequences. removeCustomFilter :: Bool -> Bool -> Maybe Int -> T.Text -> CloneMap -> CloneMap removeCustomFilter germ rm customField customFilter cloneMap | germ && ((customField == Just 0) || (isNothing customField)) = M.filterWithKey (\(_, k) _ -> inField k) cloneMap | germ && customField > Just 0 = M.filterWithKey (\(_, k) _ -> inCustomField k) cloneMap | (customField == Just 0) || (isNothing customField) = M.map (filter inField) cloneMap | customField > Just 0 = M.map (filter inCustomField) cloneMap where inField = equal rm customFilter . fastaHeader inCustomField x = equal rm customFilter . (!!) (T.splitOn "|" . fastaHeader $ x) $ (fromJust customField - 1) equal False x y = y =~ x :: Bool equal True x y = not . equal False x $ y removeAllCustomFilters :: Bool -> Bool -> CloneMap -> [(Maybe Int, T.Text)] -> CloneMap removeAllCustomFilters germ rm = foldl' filterMap where filterMap acc (x, y) = removeCustomFilter germ rm x y acc -- | Remove clones that do not have any sequences after the filtrations removeEmptyClone :: CloneMap -> CloneMap removeEmptyClone = M.filter (not . null) -- | Convert sequences to amino acids convertToAminoAcidsCloneMap :: CodonTable -> CloneMap -> (CloneMap, Maybe String) convertToAminoAcidsCloneMap table cloneMap = (newCloneMap, errorString) where newCloneMap = M.mapKeysWith (++) (\(!x, !y) -> (x, fromEither y)) . M.filterWithKey (\(_, !y) _ -> isRight' y) . M.map rights $ errorCloneMap errorString = listToMaybe' . concatMap snd . M.toAscList . M.mapWithKey (\(_, !y) v -> (++) (eitherToString y) . concatMap T.unpack . lefts $ v ) $ errorCloneMap errorCloneMap = M.mapKeys keyMap . M.map (map (customTranslate table 1)) $ cloneMap keyMap (!x, !y) = (x, customTranslate table 1 y) eitherToString (Right _) = "" eitherToString (Left x) = T.unpack x fromEither (Right x) = x fromEither (Left x) = error (T.unpack x)