-- | Trimming of reads as found in BAM files. Implements trimming low -- quality sequence from the 3' end. module Bio.Bam.Trim where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Bam.Rmdup ( ECig(..), setMD, toECig ) import Bio.Iteratee import Bio.Prelude import Foreign.C.Types ( CInt(..) ) import Foreign.Ptr ( Ptr ) import qualified Data.ByteString as B import qualified Data.Vector.Generic as V import qualified Data.Vector.Storable as W -- | Trims from the 3' end of a sequence. -- @trim_3\' p b@ trims the 3' end of the sequence in @b@ at the -- earliest position such that @p@ evaluates to true on every suffix -- that was trimmed off. Note that the 3' end may be the beginning of -- the sequence if it happens to be stored in reverse-complemented form. -- Also note that trimming from the 3' end may not make sense for reads -- that were constructed by merging paired end data (but we cannot take -- care of that here). Further note that trimming may break dependent -- information, notably the "mate" information of the mate and many -- optional fields. trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec trim_3' p b | b_flag b `testBit` 4 = trim_rev | otherwise = trim_fwd where trim_fwd = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $ zip (inits . reverse . V.toList $ b_seq b) (inits . reverse . V.toList $ b_qual b) in trim_3 l b trim_rev = let l = subtract 1 . fromIntegral . length . takeWhile (uncurry p) $ zip (inits . V.toList $ b_seq b) (inits . V.toList $ b_qual b) in trim_3 l b trim_3 :: Int -> BamRec -> BamRec trim_3 l b | b_flag b `testBit` 4 = trim_rev | otherwise = trim_fwd where trim_fwd = let (_, cigar') = trim_back_cigar (b_cigar b) l c = modMd (takeECig (V.length (b_seq b) - l)) b in c { b_seq = V.take (V.length (b_seq c) - l) (b_seq c) , b_qual = V.take (V.length (b_qual c) - l) (b_qual c) , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.take (B.length t - l) t)) _ -> (k,e) ) (b_exts c) } trim_rev = let (off, cigar') = trim_fwd_cigar (b_cigar b) l c = modMd (dropECig l) b in c { b_seq = V.drop l (b_seq c) , b_qual = V.drop l (b_qual c) , b_pos = b_pos c + off , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.drop l t)) _ -> (k,e) ) (b_exts c) } trim_set = ["BQ","CQ","CS","E2","OQ","U2"] modMd :: (ECig -> ECig) -> BamRec -> BamRec modMd f br = maybe br (setMD br . f . toECig (b_cigar br)) (getMd br) endOf :: ECig -> ECig endOf WithMD = WithMD endOf WithoutMD = WithoutMD endOf (Mat' _ es) = endOf es endOf (Ins' _ es) = endOf es endOf (SMa' _ es) = endOf es endOf (Rep' _ es) = endOf es endOf (Del' _ es) = endOf es endOf (Nop' _ es) = endOf es endOf (HMa' _ es) = endOf es endOf (Pad' _ es) = endOf es takeECig :: Int -> ECig -> ECig takeECig 0 es = endOf es takeECig _ WithMD = WithMD takeECig _ WithoutMD = WithoutMD takeECig n (Mat' m es) = Mat' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Ins' m es) = Ins' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (SMa' m es) = SMa' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Rep' ns es) = Rep' ns $ takeECig (n-1) es takeECig n (Del' ns es) = Del' ns $ takeECig n es takeECig n (Nop' m es) = Nop' m $ takeECig n es takeECig n (HMa' m es) = HMa' m $ takeECig n es takeECig n (Pad' m es) = Pad' m $ takeECig n es dropECig :: Int -> ECig -> ECig dropECig 0 es = es dropECig _ WithMD = WithMD dropECig _ WithoutMD = WithoutMD dropECig n (Mat' m es) = if n > m then dropECig (n-m) es else Mat' n WithMD dropECig n (Ins' m es) = if n > m then dropECig (n-m) es else Ins' n WithMD dropECig n (SMa' m es) = if n > m then dropECig (n-m) es else SMa' n WithMD dropECig n (Rep' _ es) = dropECig (n-1) es dropECig n (Del' _ es) = dropECig n es dropECig n (Nop' _ es) = dropECig n es dropECig n (HMa' _ es) = dropECig n es dropECig n (Pad' _ es) = dropECig n es trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar ) trim_back_cigar c l = (o, V.fromList $ reverse c') where (o,c') = sanitize_cigar . trim_cigar l $ reverse $ V.toList c trim_fwd_cigar c l = (o, V.fromList c') where (o,c') = sanitize_cigar $ trim_cigar l $ V.toList c sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar]) sanitize_cigar (o, [ ]) = (o, []) sanitize_cigar (o, (op:*l):xs) | op == Pad = sanitize_cigar (o,xs) -- del P | op == Del || op == Nop = sanitize_cigar (o + l, xs) -- adjust D,N | op == Ins = (o, (SMa :* l):xs) -- I --> S | otherwise = (o, (op :* l):xs) -- rest is fine trim_cigar :: Int -> [Cigar] -> (Int, [Cigar]) trim_cigar 0 cs = (0, cs) trim_cigar _ [] = (0, []) trim_cigar l ((op:*ll):cs) | bad_op op = let (o,cs') = trim_cigar l cs in (o + reflen op ll, cs') | otherwise = case l `compare` ll of LT -> (reflen op l, (op :* (ll-l)):cs) EQ -> (reflen op ll, cs) GT -> let (o,cs') = trim_cigar (l - ll) cs in (o + reflen op ll, cs') where reflen op' = if ref_op op' then id else const 0 bad_op o = o /= Mat && o /= Ins && o /= SMa ref_op o = o == Mat || o == Del -- | Trim predicate to get rid of low quality sequence. -- @trim_low_quality q ns qs@ evaluates to true if all qualities in @qs@ -- are smaller (i.e. worse) than @q@. trim_low_quality :: Qual -> a -> [Qual] -> Bool trim_low_quality q = const $ all (< q) -- | Overlap-merging of read pairs. We shall compute the likelihood -- for every possible overlap, then select the most likely one (unless it -- looks completely random), compute a quality from the second best -- merge, then merge and clamp the quality accordingly. -- (We could try looking for chimaera after completing the merge, if -- only we knew which ones to expect?) -- -- Two reads go in, with two adapter lists. We return 'Nothing' if all -- merges looked mostly random. Else we return the two original reads, -- flagged as 'eflagVestigial' *and* the merged version, flagged as -- 'eflagMerged' and optionally 'eflagTrimmed'. All reads contain the -- computed qualities (in YM and YN), which we also return. -- -- The merging automatically limits quality scores some of the time. We -- additionally impose a hard limit of 63 to avoid difficulties -- representing the result, and even that is ridiculous. Sane people -- would further limit the returned quality! (In practice, map quality -- later imposes a limit anyway, so no worries...) merge_overlap :: BamRec -> [ W.Vector Nucleotides ] -> BamRec -> [ W.Vector Nucleotides ] -> Maybe ( BamRec, BamRec, BamRec, Int, Int ) merge_overlap r1 ads1 r2 ads2 | V.null (b_seq r1) && V.null (b_seq r2) = Nothing | otherwise = result mlen score1 score2 where -- the "merge" score if there is no overlap plain_score = 6 * fromIntegral (len_r1 + len_r2) len_r1 = V.length $ b_seq r1 len_r2 = V.length $ b_seq r2 b_seq_r1 = V.convert $ b_seq r1 b_seq_r2 = V.convert $ b_seq r2 b_qual_r1 = V.convert $ b_qual r1 b_qual_r2 = V.convert $ b_qual r2 (score1, mlen, score2) = twoMins plain_score (len_r1 + len_r2) $ merge_score ads1 ads2 b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 flag_vestigial br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br } store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2 - s1) $ updateE "YN" (Int $ plain_score - s1) $ b_exts br } result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1 , store_quals s1 s2 $ flag_vestigial r2 , store_quals s1 s2 $ merged_read l (fromIntegral . min 63 $ s2-s1) , s2 - s1, plain_score - s1 ) merged_read l qmax | V.length merged_seq /= l = error $ "Logic error in merged_read: " ++ show (V.length merged_seq, l) | otherwise = nullBamRec { b_qname = b_qname r1, b_flag = flagUnmapped .|. complement pair_flags .&. b_flag r1, b_seq = merged_seq, b_qual = merged_qual, b_exts = let ff = if l < len_r1 then eflagTrimmed else 0 in updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagMerged .|. ff) $ b_exts r1 } where merged_seq = V.convert $ V.concat [ V.take (l - len_r2) (b_seq_r1) , merge_seqs (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_seq_r2 ] merged_qual = V.convert $ V.concat [ V.take (l - len_r2) (b_qual_r1) , merge_quals qmax (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_qual_r2 ] pair_flags = flagPaired.|.flagProperlyPaired.|.flagMateUnmapped.|.flagMateReversed.|.flagFirstMate.|.flagSecondMate merge_seqs v1 v2 v3 v4 = V.zipWith4 zz v1 v2 v3 v4 where zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = n1 | q1 > q2 = n1 | otherwise = compls n2 merge_quals qmax v1 v2 v3 v4 = V.zipWith4 zz v1 v2 v3 v4 where zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = Q $ min qmax (q1 + q2) | q1 > q2 = Q $ q1 - q2 | otherwise = Q $ q2 - q1 -- | Trimming for a single read: we need one adapter only (the one coming -- /after/ the read), here provided as a list of options, and then we -- merge with an empty second read. Results in up to two reads (the -- original, possibly flagged, and the trimmed one, definitely flagged, -- and two qualities). trim_adapter :: BamRec -> [ W.Vector Nucleotides ] -> Maybe ( BamRec, BamRec, Int, Int ) trim_adapter r1 ads1 | V.null (b_seq r1) = Nothing | otherwise = result mlen score1 score2 where -- the "merge" score if there is no trimming plain_score = 6 * fromIntegral (V.length (b_seq r1)) b_seq_r1 = V.convert $ b_seq r1 b_qual_r1 = V.convert $ b_qual r1 (score1, mlen, score2) = twoMins plain_score (V.length (b_seq r1)) $ merge_score ads1 [V.empty] b_seq_r1 b_qual_r1 V.empty V.empty flag_vestigial br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagVestigial) $ b_exts br } store_quals s1 s2 br = br { b_exts = updateE "YM" (Int $ s2 - s1) $ updateE "YN" (Int $ plain_score - s1) $ b_exts br } result l s1 s2 = Just ( store_quals s1 s2 $ flag_vestigial r1 , store_quals s1 s2 $ trimmed_read l , s2 - s1, plain_score - s1 ) trimmed_read l = nullBamRec { b_qname = b_qname r1, b_flag = flagUnmapped .|. b_flag r1, b_seq = V.take l $ b_seq r1, b_qual = V.take l $ b_qual r1, b_exts = updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagTrimmed) $ b_exts r1 } -- | For merging, we don't need the complete adapters (length around 70!), -- only a sufficient prefix. Taking only the more-or-less constant -- part (length around 30), there aren't all that many different -- adapters in the world. To deal with pretty much every library, we -- only need the following forward adapters, which will be the default -- (defined here in the direction they would be sequenced in): Genomic -- R2, Multiplex R2, Fraft P7. default_fwd_adapters :: [ W.Vector Nucleotides ] default_fwd_adapters = map (W.fromList. map toNucleotides) [ {- Genomic R2 -} "AGATCGGAAGAGCGGTTCAG" , {- Multiplex R2 -} "AGATCGGAAGAGCACACGTC" , {- Graft P7 -} "AGATCGGAAGAGCTCGTATG" ] -- | Like 'default_rev_adapters', these are the few adapters needed for -- the reverse read (defined in the direction they would be sequenced in -- as part of the second read): Genomic R1, CL 72. default_rev_adapters :: [ W.Vector Nucleotides ] default_rev_adapters = map (W.fromList. map toNucleotides) [ {- Genomic_R1 -} "AGATCGGAAGAGCGTCGTGT" , {- CL72 -} "GGAAGAGCGTCGTGTAGGGA" ] -- We need to compute the likelihood of a read pair given an assumed -- insert length. The likelihood of the first read is the likelihood of -- a match with the adapter where it overlaps the 3' adapter, elsewhere -- it's 1/4 per position. The likelihood of the second read is the -- likelihood of a match with the adapter where it overlaps the adapter, -- the likehood of a read-read match where it overlaps read one, 1/4 per -- position elsewhere. (Yes, this ignores base composition. It doesn't -- matter enough.) merge_score :: [ W.Vector Nucleotides ] -- 3' adapters as they appear in the first read -> [ W.Vector Nucleotides ] -- 5' adapters as they appear in the second read -> W.Vector Nucleotides -> W.Vector Qual -- first read, qual -> W.Vector Nucleotides -> W.Vector Qual -- second read, qual -> Int -- assumed insert length -> Int -- score (roughly sum of qualities at mismatches) merge_score fwd_adapters rev_adapters !read1 !qual1 !read2 !qual2 !l = 6 * fromIntegral (l `min` V.length read1) -- read1, part before adapter + 6 * fromIntegral (max 0 (l - V.length read1)) -- read2, part before overlap + foldl' (\acc fwd_ad -> min acc (match_adapter l read1 qual1 fwd_ad + -- read1, match with forward adapter 6 * fromIntegral (max 0 (V.length read1 - V.length fwd_ad - l))) -- read1, part after (known) adapter ) maxBound fwd_adapters + foldl' (\acc rev_ad -> min acc (match_adapter l read2 qual2 rev_ad + -- read2, match with reverse adapter 6 * fromIntegral (max 0 (V.length read2 - V.length rev_ad - l))) -- read2, part after (known) adapter ) maxBound rev_adapters + match_reads l read1 qual1 read2 qual2 {-# INLINE match_adapter #-} match_adapter :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> Int match_adapter !off !rd !qs !ad | V.length rd /= V.length qs = error "read/qual length mismatch" | efflength <= 0 = 0 | otherwise = fromIntegral . unsafePerformIO $ W.unsafeWith rd $ \p_rd -> W.unsafeWith qs $ \p_qs -> W.unsafeWith ad $ \p_ad -> prim_match_ad (fromIntegral off) (fromIntegral efflength) p_rd p_qs p_ad where !efflength = (V.length rd - off) `min` V.length ad foreign import ccall unsafe "prim_match_ad" prim_match_ad :: CInt -> CInt -> Ptr Nucleotides -> Ptr Qual -> Ptr Nucleotides -> IO CInt -- | Computes overlap score for two reads (with qualities) assuming an -- insert length. {-# INLINE match_reads #-} match_reads :: Int -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> W.Vector Qual -> Int match_reads !l !rd1 !qs1 !rd2 !qs2 | V.length rd1 /= V.length qs1 || V.length rd2 /= V.length qs2 = error "read/qual length mismatch" | efflength <= 0 = 0 | otherwise = fromIntegral . unsafePerformIO $ W.unsafeWith rd1 $ \p_rd1 -> W.unsafeWith qs1 $ \p_qs1 -> W.unsafeWith rd2 $ \p_rd2 -> W.unsafeWith qs2 $ \p_qs2 -> prim_match_reads (fromIntegral minidx1) (fromIntegral maxidx2) (fromIntegral efflength) p_rd1 p_qs1 p_rd2 p_qs2 where -- vec1, forward !minidx1 = (l - V.length rd2) `max` 0 -- vec2, backward !maxidx2 = l `min` V.length rd2 -- effective length !efflength = ((V.length rd1 + V.length rd2 - l) `min` l) `max` 0 foreign import ccall unsafe "prim_match_reads" prim_match_reads :: CInt -> CInt -> CInt -> Ptr Nucleotides -> Ptr Qual -> Ptr Nucleotides -> Ptr Qual -> IO CInt {-# INLINE twoMins #-} twoMins :: (Bounded a, Ord a) => a -> Int -> (Int -> a) -> (a,Int,a) twoMins a0 imax f = go a0 0 maxBound 0 0 where go !m1 !i1 !m2 !i2 !i | i == imax = (m1,i1,m2) | otherwise = case f i of x | x < m1 -> go x i m1 i1 (i+1) | x < m2 -> go m1 i1 x i (i+1) | otherwise -> go m1 i1 m2 i2 (i+1) mergeTrimBam :: Monad m => [W.Vector Nucleotides] -> [W.Vector Nucleotides] -> Enumeratee [BamRec] [BamRec] m a mergeTrimBam fwd_ads rev_ads = convStream go where go = do r1 <- headStream if isPaired r1 then tryHead >>= go2 r1 else case trim_adapter r1 fwd_ads of Nothing -> return [r1] Just (r1',r1t,_q1,_q2) -> return [r1t,r1'] go2 r1 Nothing = error $ "Lone mate found: " ++ show (b_qname r1) go2 r1 (Just r2) = case merge_overlap r1 fwd_ads r2 rev_ads of Nothing -> return [r1,r2] Just (r1',r2',rm,_q1,_q2) -> return [rm,r1',r2']