-- | Trimming of reads as found in BAM files. Implements trimming low -- quality sequence from the 3' end. module Bio.Bam.Trim ( trim_3 , trim_3' , trim_low_quality , default_fwd_adapters , default_rev_adapters , find_merge , mergeBam , find_trim , trimBam , mergeTrimBam , twoMins , merged_seq , merged_qual ) where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Bam.Rmdup ( ECig(..), setMD, toECig ) import Bio.Iteratee import Bio.Prelude import Foreign.C.Types ( CInt(..) ) import qualified Data.ByteString as B import qualified Data.Vector.Generic as V import qualified Data.Vector.Storable as W -- | Trims from the 3' end of a sequence. -- @trim_3' p b@ trims the 3' end of the sequence in @b@ at the -- earliest position such that @p@ evaluates to true on every suffix -- that was trimmed off. Note that the 3' end may be the beginning of -- the sequence if it happens to be stored in reverse-complemented form. -- Also note that trimming from the 3' end may not make sense for reads -- that were constructed by merging paired end data (but we cannot take -- care of that here). Further note that trimming may break dependent -- information, notably the "mate" information of the mate and many -- optional fields. trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec trim_3' p b | b_flag b `testBit` 4 = trim_rev | otherwise = trim_fwd where trim_fwd = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $ zip (inits . reverse . V.toList $ b_seq b) (inits . reverse . V.toList $ b_qual b) in trim_3 l b trim_rev = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $ zip (inits . V.toList $ b_seq b) (inits . V.toList $ b_qual b) in trim_3 l b trim_3 :: Int -> BamRec -> BamRec trim_3 l b | b_flag b `testBit` 4 = trim_rev | otherwise = trim_fwd where trim_fwd = let (_, cigar') = trim_back_cigar (b_cigar b) l c = modMd (takeECig (V.length (b_seq b) - l)) b in c { b_seq = V.take (V.length (b_seq c) - l) (b_seq c) , b_qual = V.take (V.length (b_qual c) - l) (b_qual c) , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.take (B.length t - l) t)) _ -> (k,e) ) (b_exts c) } trim_rev = let (off, cigar') = trim_fwd_cigar (b_cigar b) l c = modMd (dropECig l) b in c { b_seq = V.drop l (b_seq c) , b_qual = V.drop l (b_qual c) , b_pos = b_pos c + off , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.drop l t)) _ -> (k,e) ) (b_exts c) } trim_set = ["BQ","CQ","CS","E2","OQ","U2"] modMd :: (ECig -> ECig) -> BamRec -> BamRec modMd f br = maybe br (setMD br . f . toECig (b_cigar br)) (getMd br) endOf :: ECig -> ECig endOf WithMD = WithMD endOf WithoutMD = WithoutMD endOf (Mat' _ es) = endOf es endOf (Ins' _ es) = endOf es endOf (SMa' _ es) = endOf es endOf (Rep' _ es) = endOf es endOf (Del' _ es) = endOf es endOf (Nop' _ es) = endOf es endOf (HMa' _ es) = endOf es endOf (Pad' _ es) = endOf es takeECig :: Int -> ECig -> ECig takeECig 0 es = endOf es takeECig _ WithMD = WithMD takeECig _ WithoutMD = WithoutMD takeECig n (Mat' m es) = Mat' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Ins' m es) = Ins' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (SMa' m es) = SMa' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Rep' ns es) = Rep' ns $ takeECig (n-1) es takeECig n (Del' ns es) = Del' ns $ takeECig n es takeECig n (Nop' m es) = Nop' m $ takeECig n es takeECig n (HMa' m es) = HMa' m $ takeECig n es takeECig n (Pad' m es) = Pad' m $ takeECig n es dropECig :: Int -> ECig -> ECig dropECig 0 es = es dropECig _ WithMD = WithMD dropECig _ WithoutMD = WithoutMD dropECig n (Mat' m es) = if n > m then dropECig (n-m) es else Mat' n WithMD dropECig n (Ins' m es) = if n > m then dropECig (n-m) es else Ins' n WithMD dropECig n (SMa' m es) = if n > m then dropECig (n-m) es else SMa' n WithMD dropECig n (Rep' _ es) = dropECig (n-1) es dropECig n (Del' _ es) = dropECig n es dropECig n (Nop' _ es) = dropECig n es dropECig n (HMa' _ es) = dropECig n es dropECig n (Pad' _ es) = dropECig n es trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar ) trim_back_cigar c l = (o, V.fromList $ reverse c') where (o,c') = sanitize_cigar . trim_cigar l $ reverse $ V.toList c trim_fwd_cigar c l = (o, V.fromList c') where (o,c') = sanitize_cigar $ trim_cigar l $ V.toList c sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar]) sanitize_cigar (o, [ ]) = (o, []) sanitize_cigar (o, (op:*l):xs) | op == Pad = sanitize_cigar (o,xs) -- del P | op == Del || op == Nop = sanitize_cigar (o + l, xs) -- adjust D,N | op == Ins = (o, (SMa :* l):xs) -- I --> S | otherwise = (o, (op :* l):xs) -- rest is fine trim_cigar :: Int -> [Cigar] -> (Int, [Cigar]) trim_cigar 0 cs = (0, cs) trim_cigar _ [] = (0, []) trim_cigar l ((op:*ll):cs) | bad_op op = let (o,cs') = trim_cigar l cs in (o + reflen op ll, cs') | otherwise = case l `compare` ll of LT -> (reflen op l, (op :* (ll-l)):cs) EQ -> (reflen op ll, cs) GT -> let (o,cs') = trim_cigar (l - ll) cs in (o + reflen op ll, cs') where reflen op' = if ref_op op' then id else const 0 bad_op o = o /= Mat && o /= Ins && o /= SMa ref_op o = o == Mat || o == Del -- | Trim predicate to get rid of low quality sequence. -- @trim_low_quality q ns qs@ evaluates to true if all qualities in @qs@ -- are smaller (i.e. worse) than @q@. trim_low_quality :: Qual -> a -> [Qual] -> Bool trim_low_quality q = const $ all (< q) -- | Finds the merge point. Input is list of forward adapters, list of -- reverse adapters, sequence1, quality1, sequence2, quality2; output is -- merge point and two qualities (YM, YN). find_merge :: [W.Vector Nucleotides] -> [W.Vector Nucleotides] -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> W.Vector Qual -> (Int, Int, Int) find_merge ads1 ads2 r1 q1 r2 q2 = (mlen, score2 - score1, plain_score - score1) where plain_score = 6 * fromIntegral (V.length r1 + V.length r2) (score1, mlen, score2) = twoMins plain_score (V.length r1 + V.length r2) $ merge_score ads1 ads2 r1 q1 r2 q2 -- | Overlap-merging of read pairs. We shall compute the likelihood -- for every possible overlap, then select the most likely one (unless it -- looks completely random), compute a quality from the second best -- merge, then merge and clamp the quality accordingly. -- (We could try looking for chimaera after completing the merge, if -- only we knew which ones to expect?) -- -- Two reads go in, with two adapter lists. We return 'Nothing' if all -- merges looked mostly random. Else we return the two original reads, -- flagged as 'eflagVestigial' *and* the merged version, flagged as -- 'eflagMerged' and optionally 'eflagTrimmed'. All reads contain the -- computed qualities (in YM and YN), which we also return. -- -- The merging automatically limits quality scores some of the time. We -- additionally impose a hard limit of 63 to avoid difficulties -- representing the result, and even that is ridiculous. Sane people -- would further limit the returned quality! (In practice, map quality -- later imposes a limit anyway, so no worries...) mergeBam :: Int -> Int -> [W.Vector Nucleotides] -> [W.Vector Nucleotides] -> BamRec -> BamRec -> [BamRec] mergeBam lowq highq ads1 ads2 r1 r2 | V.null (b_seq r1) && V.null (b_seq r2) = [ ] | qual1 < lowq || mlen < 0 = [ r1', r2' ] | qual1 >= highq && mlen == 0 = [ ] | qual1 >= highq = [ rm ] | mlen < len_r1-20 || mlen < len_r2-20 = [ rm ] | otherwise = map flag_alternative [ r1', r2', rm ] where len_r1 = V.length $ b_seq r1 len_r2 = V.length $ b_seq r2 b_seq_r1 = V.convert $ b_seq r1 b_seq_r2 = V.convert $ b_seq r2 b_qual_r1 = V.convert $ b_qual r1 b_qual_r2 = V.convert $ b_qual r2 (mlen, qual1, qual2) = find_merge ads1 ads2 b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 flag_alternative br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagAlternative) $ b_exts br } store_quals br = br { b_exts = updateE "YM" (Int qual1) $ updateE "YN" (Int qual2) $ b_exts br } pair_flags = flagPaired.|.flagProperlyPaired.|.flagMateUnmapped.|.flagMateReversed.|.flagFirstMate.|.flagSecondMate r1' = store_quals r1 r2' = store_quals r2 rm = store_quals $ merged_read mlen (fromIntegral $ min 63 qual1) merged_read l qmax = nullBamRec { b_qname = b_qname r1, b_flag = flagUnmapped .|. complement pair_flags .&. b_flag r1, b_seq = V.convert $ merged_seq l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2, b_qual = V.convert $ merged_qual qmax l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2, b_exts = let ff = if l < len_r1 then eflagTrimmed else 0 in updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagMerged .|. ff) $ b_exts r1 } {-# INLINE merged_seq #-} merged_seq :: (V.Vector v Nucleotides, V.Vector v Qual) => Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Nucleotides merged_seq l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 = V.concat [ V.take (l - len_r2) b_seq_r1 , V.zipWith4 zz (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_seq_r2 ] where len_r1 = V.length b_qual_r1 len_r2 = V.length b_qual_r2 zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = n1 | q1 > q2 = n1 | otherwise = compls n2 {-# INLINE merged_qual #-} merged_qual :: (V.Vector v Nucleotides, V.Vector v Qual) => Word8 -> Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Qual merged_qual qmax l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 = V.concat [ V.take (l - len_r2) b_qual_r1 , V.zipWith4 zz (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_qual_r2 ] where len_r1 = V.length b_qual_r1 len_r2 = V.length b_qual_r2 zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = Q $ min qmax (q1 + q2) | q1 > q2 = Q $ q1 - q2 | otherwise = Q $ q2 - q1 -- | Finds the trimming point. Input is list of forward adapters, -- sequence, quality; output is trim point and two qualities (YM, YN). find_trim :: [W.Vector Nucleotides] -> W.Vector Nucleotides -> W.Vector Qual -> (Int, Int, Int) find_trim ads1 r1 q1 = (mlen, score2 - score1, plain_score - score1) where plain_score = 6 * fromIntegral (V.length r1) (score1, mlen, score2) = twoMins plain_score (V.length r1) $ merge_score ads1 [V.empty] r1 q1 V.empty V.empty -- | Trimming for a single read: we need one adapter only (the one coming -- /after/ the read), here provided as a list of options, and then we -- merge with an empty second read. Results in up to two reads (the -- original, possibly flagged, and the trimmed one, definitely flagged, -- and two qualities). trimBam :: Int -> Int -> [W.Vector Nucleotides] -> BamRec -> [BamRec] trimBam lowq highq ads1 r1 | V.null (b_seq r1) = [ ] | mlen == 0 && qual1 >= highq = [ ] | qual1 < lowq || mlen < 0 = [ r1' ] | qual1 >= highq = [ r1t ] | otherwise = map flag_alternative [ r1', r1t ] where -- the "merge" score if there is no trimming b_seq_r1 = V.convert $ b_seq r1 b_qual_r1 = V.convert $ b_qual r1 (mlen, qual1, qual2) = find_trim ads1 b_seq_r1 b_qual_r1 flag_alternative br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagAlternative) $ b_exts br } store_quals br = br { b_exts = updateE "YM" (Int qual1) $ updateE "YN" (Int qual2) $ b_exts br } r1' = store_quals r1 r1t = store_quals $ trimmed_read mlen trimmed_read l = nullBamRec { b_qname = b_qname r1, b_flag = flagUnmapped .|. b_flag r1, b_seq = V.take l $ b_seq r1, b_qual = V.take l $ b_qual r1, b_exts = updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagTrimmed) $ b_exts r1 } -- | For merging, we don't need the complete adapters (length around 70!), -- only a sufficient prefix. Taking only the more-or-less constant -- part (length around 30), there aren't all that many different -- adapters in the world. To deal with pretty much every library, we -- only need the following forward adapters, which will be the default -- (defined here in the direction they would be sequenced in): Genomic -- R2, Multiplex R2, Fraft P7. default_fwd_adapters :: [ W.Vector Nucleotides ] default_fwd_adapters = map (W.fromList. map toNucleotides) [ {- Genomic R2 -} "AGATCGGAAGAGCGGTTCAG" , {- Multiplex R2 -} "AGATCGGAAGAGCACACGTC" , {- Graft P7 -} "AGATCGGAAGAGCTCGTATG" ] -- | Like 'default_rev_adapters', these are the few adapters needed for -- the reverse read (defined in the direction they would be sequenced in -- as part of the second read): Genomic R1, CL 72. default_rev_adapters :: [ W.Vector Nucleotides ] default_rev_adapters = map (W.fromList. map toNucleotides) [ {- Genomic_R1 -} "AGATCGGAAGAGCGTCGTGT" , {- CL72 -} "GGAAGAGCGTCGTGTAGGGA" ] -- We need to compute the likelihood of a read pair given an assumed -- insert length. The likelihood of the first read is the likelihood of -- a match with the adapter where it overlaps the 3' adapter, elsewhere -- it's 1/4 per position. The likelihood of the second read is the -- likelihood of a match with the adapter where it overlaps the adapter, -- the likehood of a read-read match where it overlaps read one, 1/4 per -- position elsewhere. (Yes, this ignores base composition. It doesn't -- matter enough.) merge_score :: [ W.Vector Nucleotides ] -- 3' adapters as they appear in the first read -> [ W.Vector Nucleotides ] -- 5' adapters as they appear in the second read -> W.Vector Nucleotides -> W.Vector Qual -- first read, qual -> W.Vector Nucleotides -> W.Vector Qual -- second read, qual -> Int -- assumed insert length -> Int -- score (roughly sum of qualities at mismatches) merge_score fwd_adapters rev_adapters !read1 !qual1 !read2 !qual2 !l = 6 * fromIntegral (l `min` V.length read1) -- read1, part before adapter + 6 * fromIntegral (max 0 (l - V.length read1)) -- read2, part before overlap + foldl' (\acc fwd_ad -> min acc (match_adapter l read1 qual1 fwd_ad + -- read1, match with forward adapter 6 * fromIntegral (max 0 (V.length read1 - V.length fwd_ad - l))) -- read1, part after (known) adapter ) maxBound fwd_adapters + foldl' (\acc rev_ad -> min acc (match_adapter l read2 qual2 rev_ad + -- read2, match with reverse adapter 6 * fromIntegral (max 0 (V.length read2 - V.length rev_ad - l))) -- read2, part after (known) adapter ) maxBound rev_adapters + match_reads l read1 qual1 read2 qual2 {-# INLINE match_adapter #-} match_adapter :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> Int match_adapter !off !rd !qs !ad | V.length rd /= V.length qs = error "read/qual length mismatch" | efflength <= 0 = 0 | otherwise = fromIntegral . unsafePerformIO $ W.unsafeWith rd $ \p_rd -> W.unsafeWith qs $ \p_qs -> W.unsafeWith ad $ \p_ad -> prim_match_ad (fromIntegral off) (fromIntegral efflength) p_rd p_qs p_ad where !efflength = (V.length rd - off) `min` V.length ad foreign import ccall unsafe "prim_match_ad" prim_match_ad :: CInt -> CInt -> Ptr Nucleotides -> Ptr Qual -> Ptr Nucleotides -> IO CInt -- | Computes overlap score for two reads (with qualities) assuming an -- insert length. {-# INLINE match_reads #-} match_reads :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> W.Vector Qual -> Int match_reads !l !rd1 !qs1 !rd2 !qs2 | V.length rd1 /= V.length qs1 || V.length rd2 /= V.length qs2 = error "read/qual length mismatch" | efflength <= 0 = 0 | otherwise = fromIntegral . unsafePerformIO $ W.unsafeWith rd1 $ \p_rd1 -> W.unsafeWith qs1 $ \p_qs1 -> W.unsafeWith rd2 $ \p_rd2 -> W.unsafeWith qs2 $ \p_qs2 -> prim_match_reads (fromIntegral minidx1) (fromIntegral maxidx2) (fromIntegral efflength) p_rd1 p_qs1 p_rd2 p_qs2 where -- vec1, forward !minidx1 = (l - V.length rd2) `max` 0 -- vec2, backward !maxidx2 = l `min` V.length rd2 -- effective length !efflength = ((V.length rd1 + V.length rd2 - l) `min` l) `max` 0 foreign import ccall unsafe "prim_match_reads" prim_match_reads :: CInt -> CInt -> CInt -> Ptr Nucleotides -> Ptr Qual -> Ptr Nucleotides -> Ptr Qual -> IO CInt {-# INLINE twoMins #-} twoMins :: (Bounded a, Ord a) => a -> Int -> (Int -> a) -> (a,Int,a) twoMins a0 imax f = go a0 (-1) maxBound 0 0 where go !m1 !i1 !m2 !i2 !i | i == imax = (m1,i1,m2) | otherwise = case f i of x | x < m1 -> go x i m1 i1 (i+1) | x < m2 -> go m1 i1 x i (i+1) | otherwise -> go m1 i1 m2 i2 (i+1) mergeTrimBam :: Monad m => Int -> Int -> [W.Vector Nucleotides] -> [W.Vector Nucleotides] -> Enumeratee [BamRec] [BamRec] m a mergeTrimBam lowq highq fwd_ads rev_ads = convStream go where go = do r1 <- headStream if isPaired r1 then tryHead >>= go2 r1 else return $ trimBam lowq highq fwd_ads r1 go2 r1 Nothing = error $ "Lone mate found: " ++ show (b_qname r1) go2 r1 (Just r2) = return $ mergeBam lowq highq fwd_ads rev_ads r1 r2