-- | Trimming and fusing of reads as found in BAM files. -- -- This API is remarkably ugly because the core loop is implemented in -- C. This requires the adapters to be in storable vectors, and since -- they shouldn't be constantly copied around, the ugly 'withADSeqs' -- function is needed. The performance gain seems to be worth it, -- though. module Bio.Bam.Trim ( trim_3 , trim_3' , trim_low_quality , AD_Seqs , withADSeqs , default_fwd_adapters , default_rev_adapters , find_merge , mergeBam , find_trim , trimBam , merged_seq , merged_qual ) where import Bio.Bam.Header import Bio.Bam.Rec import Bio.Bam.Rmdup ( ECig(..), setMD, toECig ) import Bio.Prelude import Control.Monad.Trans.Control ( MonadBaseControl, control ) import Foreign.C.Types ( CInt(..) ) import Foreign.Marshal.Array ( allocaArray ) import qualified Data.ByteString as B import qualified Data.Vector.Generic as V import qualified Data.Vector.Storable as W -- | Trims from the 3' end of a sequence. -- @trim_3' p b@ trims the 3' end of the sequence in @b@ at the -- earliest position such that @p@ evaluates to true on every suffix -- that was trimmed off. Note that the 3' end may be the beginning of -- the sequence if it happens to be stored in reverse-complemented form. -- Also note that trimming from the 3' end may not make sense for reads -- that were constructed by merging paired end data (but we cannot take -- care of that here). Further note that trimming may break dependent -- information, notably the "mate" information and many optional fields. -- Since the intention is to trim based on quality scores, reads without -- qualities are passed along unchanged. trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec trim_3' p b = case b_qual b of Nothing -> b Just qs | b_flag b `testBit` 4 -> trim_3 len_rev b | otherwise -> trim_3 len_fwd b where len_fwd = subtract 1 . length . takeWhile (uncurry p) $ zip (inits . reverse . V.toList $ b_seq b) (inits . reverse . V.toList $ qs) len_rev = subtract 1 . length . takeWhile (uncurry p) $ zip (inits . V.toList $ b_seq b) (inits . V.toList $ qs) trim_3 :: Int -> BamRec -> BamRec trim_3 l b | b_flag b `testBit` 4 = trim_rev | otherwise = trim_fwd where trim_fwd = let (_, cigar') = trim_back_cigar (b_cigar b) l c = modMd (takeECig (V.length (b_seq b) - l)) b in c { b_seq = V.take (V.length (b_seq c) - l) $ b_seq c , b_qual = V.take (V.length (b_seq c) - l) <$> b_qual c , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.take (B.length t - l) t)) _ -> (k,e) ) (b_exts c) } trim_rev = let (off, cigar') = trim_fwd_cigar (b_cigar b) l c = modMd (dropECig l) b in c { b_seq = V.drop l $ b_seq c , b_qual = V.drop l <$> b_qual c , b_pos = b_pos c + off , b_cigar = cigar' , b_exts = map (\(k,e) -> case e of Text t | k `elem` trim_set -> (k, Text (B.drop l t)) _ -> (k,e) ) (b_exts c) } trim_set = ["BQ","CQ","CS","E2","OQ","U2"] modMd :: (ECig -> ECig) -> BamRec -> BamRec modMd f br = maybe br (setMD br . f . toECig (b_cigar br)) (getMd br) endOf :: ECig -> ECig endOf WithMD = WithMD endOf WithoutMD = WithoutMD endOf (Mat' _ es) = endOf es endOf (Ins' _ es) = endOf es endOf (SMa' _ es) = endOf es endOf (Rep' _ es) = endOf es endOf (Del' _ es) = endOf es endOf (Nop' _ es) = endOf es endOf (HMa' _ es) = endOf es endOf (Pad' _ es) = endOf es takeECig :: Int -> ECig -> ECig takeECig 0 es = endOf es takeECig _ WithMD = WithMD takeECig _ WithoutMD = WithoutMD takeECig n (Mat' m es) = Mat' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Ins' m es) = Ins' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (SMa' m es) = SMa' n $ if n > m then takeECig (n-m) es else WithMD takeECig n (Rep' ns es) = Rep' ns $ takeECig (n-1) es takeECig n (Del' ns es) = Del' ns $ takeECig n es takeECig n (Nop' m es) = Nop' m $ takeECig n es takeECig n (HMa' m es) = HMa' m $ takeECig n es takeECig n (Pad' m es) = Pad' m $ takeECig n es dropECig :: Int -> ECig -> ECig dropECig 0 es = es dropECig _ WithMD = WithMD dropECig _ WithoutMD = WithoutMD dropECig n (Mat' m es) = if n > m then dropECig (n-m) es else Mat' n WithMD dropECig n (Ins' m es) = if n > m then dropECig (n-m) es else Ins' n WithMD dropECig n (SMa' m es) = if n > m then dropECig (n-m) es else SMa' n WithMD dropECig n (Rep' _ es) = dropECig (n-1) es dropECig n (Del' _ es) = dropECig n es dropECig n (Nop' _ es) = dropECig n es dropECig n (HMa' _ es) = dropECig n es dropECig n (Pad' _ es) = dropECig n es trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar ) trim_back_cigar c l = (o, V.fromList $ reverse c') where (o,c') = sanitize_cigar . trim_cigar l $ reverse $ V.toList c trim_fwd_cigar c l = (o, V.fromList c') where (o,c') = sanitize_cigar $ trim_cigar l $ V.toList c sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar]) sanitize_cigar (o, [ ]) = (o, []) sanitize_cigar (o, (op:*l):xs) | op == Pad = sanitize_cigar (o,xs) -- del P | op == Del || op == Nop = sanitize_cigar (o + l, xs) -- adjust D,N | op == Ins = (o, (SMa :* l):xs) -- I --> S | otherwise = (o, (op :* l):xs) -- rest is fine trim_cigar :: Int -> [Cigar] -> (Int, [Cigar]) trim_cigar 0 cs = (0, cs) trim_cigar _ [] = (0, []) trim_cigar l ((op:*ll):cs) | bad_op op = let (o,cs') = trim_cigar l cs in (o + reflen op ll, cs') | otherwise = case l `compare` ll of LT -> (reflen op l, (op :* (ll-l)):cs) EQ -> (reflen op ll, cs) GT -> let (o,cs') = trim_cigar (l - ll) cs in (o + reflen op ll, cs') where reflen op' = if ref_op op' then id else const 0 bad_op o = o /= Mat && o /= Ins && o /= SMa ref_op o = o == Mat || o == Del -- | Trim predicate to get rid of low quality sequence. -- @trim_low_quality q ns qs@ evaluates to true if all qualities in @qs@ -- are smaller (i.e. worse) than @q@. trim_low_quality :: Qual -> a -> [Qual] -> Bool trim_low_quality q = const $ all (< q) -- | Finds the merge point. Input is list of forward adapters, list of -- reverse adapters, sequence1, quality1, sequence2, quality2; output is -- merge point and two qualities (YM, YN). find_merge :: AD_Seqs -> AD_Seqs -> W.Vector Nucleotides -> W.Vector Qual -> W.Vector Nucleotides -> W.Vector Qual -> IO (Int, Int, Int) find_merge pads1 pads2 r1 q1 r2 q2 = with_fw_seq r1 q1 $ \pr1 -> with_fw_seq r2 q2 $ \pr2 -> with_rc_seq r2 q2 $ \prv2 -> do min_merge_score pads1 pads2 pr1 pr2 prv2 -- | Overlap-merging of read pairs. We shall compute the likelihood -- for every possible overlap, then select the most likely one (unless it -- looks completely random), compute a quality from the second best -- merge, then merge and clamp the quality accordingly. -- (We could try looking for chimaera after completing the merge, if -- only we knew which ones to expect?) -- -- Two reads go in, with two adapter lists. We return 'Nothing' if all -- merges looked mostly random. Else we return the two original reads, -- flagged as 'eflagVestigial' *and* the merged version, flagged as -- 'eflagMerged' and optionally 'eflagTrimmed'. All reads contain the -- computed qualities (in YM and YN), which we also return. -- -- The merging automatically limits quality scores some of the time. We -- additionally impose a hard limit of 63 to avoid difficulties -- representing the result, and even that is ridiculous. Sane people -- would further limit the returned quality! (In practice, map quality -- later imposes a limit anyway, so no worries...) mergeBam :: Int -> Int -> AD_Seqs -> AD_Seqs -> BamRec -> BamRec -> IO [BamRec] mergeBam lowq highq ads1 ads2 r1 r2 = do let len_r1 = V.length $ b_seq r1 len_r2 = V.length $ b_seq r2 b_seq_r1 = V.convert $ b_seq r1 b_seq_r2 = V.convert $ b_seq r2 b_qual_r1 = fromMaybe (V.map (const (Q 23)) b_seq_r1) (b_qual r1) b_qual_r2 = fromMaybe (V.map (const (Q 23)) b_seq_r2) (b_qual r2) (mlen, qual1, qual2) <- find_merge ads1 ads2 b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 let flag_alternative br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagAlternative) $ b_exts br } store_quals br = br { b_exts = updateE "YM" (Int qual1) $ updateE "YN" (Int qual2) $ b_exts br } pair_flags = flagPaired.|.flagProperlyPaired.|.flagMateUnmapped.|.flagMateReversed.|.flagFirstMate.|.flagSecondMate r1' = store_quals r1 r2' = store_quals r2 rm = store_quals $ merged_read mlen (fromIntegral $ min 63 qual1) merged_read l qmax = nullBamRec { b_qname = b_qname r1 , b_flag = flagUnmapped .|. complement pair_flags .&. b_flag r1 , b_seq = V.convert $ merged_seq l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 , b_qual = Just $ merged_qual qmax l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 , b_exts = let ff = if l < len_r1 then eflagTrimmed else 0 in updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagMerged .|. ff) $ b_exts r1 } return $ case () of _ | V.null (b_seq r1) && V.null (b_seq r2) -> [ ] | qual1 < lowq || mlen < 0 -> [ r1', r2' ] | qual1 >= highq && mlen == 0 -> [ ] | qual1 >= highq -> [ rm ] | mlen < len_r1-20 || mlen < len_r2-20 -> [ rm ] | otherwise -> map flag_alternative [ r1', r2', rm ] {-# INLINE merged_seq #-} merged_seq :: (V.Vector v Nucleotides, V.Vector v Qual) => Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Nucleotides merged_seq l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 = V.concat [ V.take (l - len_r2) b_seq_r1 , V.zipWith4 zz (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_seq_r2 ] where len_r1 = V.length b_qual_r1 len_r2 = V.length b_qual_r2 zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = n1 | q1 > q2 = n1 | otherwise = compls n2 {-# INLINE merged_qual #-} merged_qual :: (V.Vector v Nucleotides, V.Vector v Qual) => Word8 -> Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Qual merged_qual qmax l b_seq_r1 b_qual_r1 b_seq_r2 b_qual_r2 = V.concat [ V.take (l - len_r2) b_qual_r1 , V.zipWith4 zz (V.take l $ V.drop (l - len_r2) b_seq_r1) (V.take l $ V.drop (l - len_r2) b_qual_r1) (V.reverse $ V.take l $ V.drop (l - len_r1) b_seq_r2) (V.reverse $ V.take l $ V.drop (l - len_r1) b_qual_r2) , V.reverse $ V.take (l - len_r1) b_qual_r2 ] where len_r1 = V.length b_qual_r1 len_r2 = V.length b_qual_r2 zz !n1 (Q !q1) !n2 (Q !q2) | n1 == compls n2 = Q $ min qmax (q1 + q2) | q1 > q2 = Q $ q1 - q2 | otherwise = Q $ q2 - q1 -- | Finds the trimming point. Input is list of forward adapters, -- sequence, quality; output is trim point and two qualities (YM, YN). find_trim :: AD_Seqs -> W.Vector Nucleotides -> W.Vector Qual -> IO (Int, Int, Int) find_trim pads1 r1 q1 = withADSeqs [W.empty] $ \pads2 -> with_fw_seq r1 q1 $ \pr1 -> min_merge_score pads1 pads2 pr1 (FW_Seq nullPtr nullPtr 0) (RC_Seq nullPtr nullPtr 0) -- | Trimming for a single read: we need one adapter only (the one coming -- /after/ the read), here provided as a list of options, and then we -- merge with an empty second read. Results in up to two reads (the -- original, possibly flagged, and the trimmed one, definitely flagged, -- and two qualities). trimBam :: Int -> Int -> AD_Seqs -> BamRec -> IO [BamRec] trimBam lowq highq ads1 r1 = do let b_seq_r1 = V.convert $ b_seq r1 (mlen, qual1, qual2) <- find_trim ads1 b_seq_r1 $ fromMaybe (V.map (const (Q 23)) b_seq_r1) (b_qual r1) let flag_alternative br = br { b_exts = updateE "FF" (Int $ extAsInt 0 "FF" br .|. eflagAlternative) $ b_exts br } store_quals br = br { b_exts = updateE "YM" (Int qual1) $ updateE "YN" (Int qual2) $ b_exts br } r1' = store_quals r1 r1t = store_quals $ trimmed_read mlen trimmed_read l = nullBamRec { b_qname = b_qname r1, b_flag = flagUnmapped .|. b_flag r1, b_seq = V.take l $ b_seq r1, b_qual = V.take l <$> b_qual r1, b_exts = updateE "FF" (Int $ extAsInt 0 "FF" r1 .|. eflagTrimmed) $ b_exts r1 } return $ case () of _ | V.null (b_seq r1) -> [ ] | mlen == 0 && qual1 >= highq -> [ ] | qual1 < lowq || mlen < 0 -> [ r1' ] | qual1 >= highq -> [ r1t ] | otherwise -> map flag_alternative [ r1', r1t ] -- | For merging, we don't need the complete adapters (length around 70!), -- only a sufficient prefix. Taking only the more-or-less constant -- part (length around 30), there aren't all that many different -- adapters in the world. To deal with pretty much every library, we -- only need the following forward adapters, which will be the default -- (defined here in the direction they would be sequenced in): Genomic -- R2, Multiplex R2, Fraft P7. default_fwd_adapters :: [W.Vector Nucleotides] default_fwd_adapters = map (W.fromList . map toNucleotides . map c2w) [ {- Genomic R2 -} "AGATCGGAAGAGCGGTTCAG" , {- Multiplex R2 -} "AGATCGGAAGAGCACACGTC" , {- Graft P7 -} "AGATCGGAAGAGCTCGTATG" ] -- | Like 'default_rev_adapters', these are the few adapters needed for -- the reverse read (defined in the direction they would be sequenced in -- as part of the second read): Genomic R1, CL 72. default_rev_adapters :: [W.Vector Nucleotides] default_rev_adapters = map (W.fromList . map toNucleotides . map c2w) [ {- Genomic_R1 -} "AGATCGGAAGAGCGTCGTGT" , {- CL72 -} "GGAAGAGCGTCGTGTAGGGA" ] -- We need to compute the likelihood of a read pair given an assumed -- insert length. The likelihood of the first read is the likelihood of -- a match with the adapter where it overlaps the 3' adapter, elsewhere -- it's 1/4 per position. The likelihood of the second read is the -- likelihood of a match with the adapter where it overlaps the adapter, -- the likehood of a read-read match where it overlaps read one, 1/4 per -- position elsewhere. (Yes, this ignores base composition. It doesn't -- matter enough.) min_merge_score :: AD_Seqs -- 3' adapters as they appear in the first read -> AD_Seqs -- 5' adapters as they appear in the second read -> FW_Seq -- first read, prepped -> FW_Seq -- second read, qual, prepped -> RC_Seq -- second read, qual, reversed and prepped -> IO (Int,Int,Int) -- best length, min score, 2nd min score min_merge_score (AD_Seqs !p_fwd_ads !p_fwd_lns !n_fwd_ads) (AD_Seqs !p_rev_ads !p_rev_lns !n_rev_ads) (FW_Seq !p_rd1 !p_qs1 !l1) (FW_Seq !p_rd2 !p_qs2 !l2) (RC_Seq !p_rrd2 !p_rqs2 _) = allocaArray 2 $ \pmins -> liftM3 (,,) (fromIntegral <$> prim_merge_score p_fwd_ads p_fwd_lns (fromIntegral n_fwd_ads) p_rev_ads p_rev_lns (fromIntegral n_rev_ads) p_rd1 p_qs1 (fromIntegral l1) p_rd2 p_qs2 (fromIntegral l2) p_rrd2 p_rqs2 pmins) (fromIntegral <$> peekElemOff pmins 0) (fromIntegral <$> peekElemOff pmins 1) foreign import ccall unsafe "prim_merge_score" prim_merge_score :: Ptr (Ptr Nucleotides) -> Ptr CInt -> CInt -> Ptr (Ptr Nucleotides) -> Ptr CInt -> CInt -> Ptr Nucleotides -> Ptr Qual -> CInt -> Ptr Nucleotides -> Ptr Qual -> CInt -> Ptr Nucleotides -> Ptr Qual -> Ptr CInt -> IO CInt data AD_Seqs = AD_Seqs !(Ptr (Ptr Nucleotides)) !(Ptr CInt) !Int data FW_Seq = FW_Seq !(Ptr Nucleotides) !(Ptr Qual) !Int data RC_Seq = RC_Seq !(Ptr Nucleotides) !(Ptr Qual) !Int -- Maybe pad with something suitable? withADSeqs :: MonadBaseControl IO m => [W.Vector Nucleotides] -> (AD_Seqs -> m r) -> m r withADSeqs ads0 k = control $ \run_io -> allocaArray (length ads0) $ \pps -> allocaArray (length ads0) $ \pls -> let go !n [ ] = run_io (k $! AD_Seqs pps pls n) go !n (v:vs) = W.unsafeWith v $ \pa -> do pokeElemOff pps n pa pokeElemOff pls n (fromIntegral (W.length v)) go (succ n) vs in go 0 ads0 -- Maybe pad with something suitable? with_fw_seq :: W.Vector Nucleotides -> W.Vector Qual -> (FW_Seq -> IO r) -> IO r with_fw_seq ns qs k | W.length ns == W.length qs = W.unsafeWith ns $ \p_ns -> W.unsafeWith qs $ \p_qs -> k (FW_Seq p_ns p_qs $ W.length ns) | otherwise = throwIO $ LengthMismatch "forward adapter" {-# INLINE with_fw_seq #-} -- Maybe pad with something suitable? with_rc_seq :: W.Vector Nucleotides -> W.Vector Qual -> (RC_Seq -> IO r) -> IO r with_rc_seq ns qs k | W.length ns == W.length qs = W.unsafeWith (W.reverse $ W.map compls ns) $ \p_rns -> W.unsafeWith (W.reverse qs) $ \p_rqs -> k (RC_Seq p_rns p_rqs $ W.length ns) | otherwise = throwIO $ LengthMismatch "reverse adapter" {-# INLINE with_rc_seq #-}