-- | This implements a number of filters used in the Titanium pipeline, -- based on published documentation. module Bio.Sequence.SFF_filters where import Bio.Sequence.SFF (ReadBlock(..), ReadHeader(..) , flowToBasePos, flowgram, cumulative_index) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString as SB import qualified Data.ByteString.Lazy.Char8 as BL import Data.List (tails) import Data.Char (toUpper) -- Ti uses a set of filters, described in the (something) manual. -- (GS Run Processor Application, section 3.2.2++) -- ** Discarding filters -- | DiscardFilters determine whether a read is to be retained or discarded type DiscardFilter = ReadBlock -> Bool -- True to retain, False to discard -- | This filter discards empty sequences. discard_empty :: DiscardFilter discard_empty rb = num_bases (read_header rb) >= 5 -- | Discard sequences that don't have the given key tag (typically TCAG) at the start -- of the read. discard_key :: String -> DiscardFilter discard_key key rb = (map toUpper key==) $ take (length key) $ BL.unpack $ bases rb -- | 3.2.2.1.2 The "dots" filter discards sequences where the last positive flow is -- before flow 84, and flows with >5% dots (i.e. three successive noise values) -- before the last postitive flow. The percentage can be given as a parameter. discard_dots :: Double -> DiscardFilter discard_dots p rb = let dotcount = SB.length $ SB.filter (>3) $ flow_index rb in fromIntegral dotcount / fromIntegral (BL.length $ bases rb) < p && last (cumulative_index rb) >= 84 -- | 3.2.2.1.3 The "mixed" filter discards sequences with more than 70% positive flows. -- Also, discard with <30% noise, >20% middle (0.45..0.75) or <30% positive. discard_mixed :: DiscardFilter discard_mixed rb = let fs = dropWhile (<50) . reverse . flowgram $ rb fl = dlength fs in and [ (dlength (filter (>50) fs) / fl) < 0.7 -- 70% positive , (dlength (filter (<45) fs) / fl) > 0.3 -- 30% noise , (dlength (filter (>75) fs) / fl) > 0.3 -- 30% postivie , (dlength (filter (\f -> f<=75 && f>=45) fs) / fl) < 0.2 ] -- | Discard a read if the number of untrimmed flows is less than n (n=186 for Titanium) discard_length :: Int -> DiscardFilter discard_length n rb = length (flowgram rb) >= n -- ** Trimming filters -- | TrimFilters modify the read, typically trimming it for quality type TrimFilter = ReadBlock -> ReadBlock -- | 3.2.2.1.4 Signal intensity trim - trim back until <3% borderline flows (0.5..0.7). -- Then trim borderline values or dots from the end (use a window). trim_sigint :: TrimFilter trim_sigint rb = clipSeq rb (sigint rb) -- n counts the "bad" flow values, m counts flow position sigint :: ReadBlock -> Int sigint rb = let bs = drop 1 $ scanl (\(n,m,_) f -> if f >= 50 && f <= 70 then (n+1,m+1,f) else (n,m+1,f)) (0,0,0) $ flowgram rb xs = dropWhile (\(_,_,f) -> f<=70) $ dropWhile (\(n,m,_)->(1000*n) `div` m > (30::Int)) $ reverse bs in case xs of [] -> error "no sequence left?" ((_,m,_):_) -> flowToBasePos rb m -- | 3.2.2.1.5 Primer filter -- This looks for the B-adaptor at the end of the read. The 454 implementation isn't very -- effective at finding mutated adaptors. trim_primer :: String -> TrimFilter trim_primer s rb = clipSeq rb (find_primer s rb) find_primer :: String -> ReadBlock -> Int find_primer s rb = go (num_bases (read_header rb) - 10) where go i | i <= 5 = fromIntegral (num_bases $ read_header rb) | match i = fromIntegral i | otherwise = go (i-1) match j = s' `B.isPrefixOf` B.drop (fromIntegral j) (bases rb) s' = BL.pack $ map toUpper $ take 14 s -- 3.2.2.1.6 Trimback valley filter is ignored, we don't understand the description. -- | 3.2.2.1.7 Quality score trimming trims using a 10-base window until a Q20 average is found. trim_qual20 :: Int -> TrimFilter trim_qual20 w rs = clipSeq rs $ qual20 w rs qual20 :: Int -> ReadBlock -> Int qual20 w rs = (fromIntegral $ num_bases $ read_header rs) - (length . takeWhile (<20) . map (avg . take w) . tails . reverse . B.unpack $ quality rs) -- ** Utility functions -- | List length as a double (eliminates many instances of fromIntegral) dlength :: [a] -> Double dlength = fromIntegral . length -- | Calculate average of a list avg :: Integral a => [a] -> Double avg xs = sum (map fromIntegral xs) / dlength xs -- | Translate a number of flows to position in sequence, and update clipping data accordingly clipFlows :: ReadBlock -> Int -> ReadBlock clipFlows rb n = clipSeq rb (flowToBasePos rb n) -- | Update clip_qual_right if more severe than previous value clipSeq :: ReadBlock -> Int -> ReadBlock clipSeq rb n' = let n = fromIntegral n' rh = read_header rb in if clip_qual_right rh <= n then rb else rb { read_header = rh {clip_qual_right = n }} -- ** Data -- Celera docs, at http://sourceforge.net/apps/mediawiki/wgs-assembler/index.php?title=SffToCA -- These are used for mate-pair libraries, should be located around the middle of the read: flx_linker,ti_linker,rna_adapter,rna_adapter2,rna_adapter3,rapid_adapter,ti_adapter_b :: String flx_linker = "GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC" -- Celera ti_linker = "TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG" -- 20K cod jump -- ti_linker and this: "AGCATATTGAAGCATATTACATACGATATGCTTCAATAATGC" -- from "GS FLX Titanium 3 kb Span Paired End Library Preparation Method Manual April 2009" -- ftp://ftp.genome.ou.edu/pub/for_broe/titanium/ -- These are used at the end of RNA (cDNA) sequences, after the poly-A tail: rna_adapter = "ggcgggcgatgtctcgtctgagcgggctggcaaggc" -- cod transcripts? rna_adapter2 = "ttcgcagtgagtgacaggctagtagctgagcgggctggcaaggc" -- Cod_c.sff rna_adapter3 = "gacggggcggatgtctcgtctgagcgggcgtggcaaggc" -- COD1.sff -- These are used at the end of DNA sequencing reads: rapid_adapter = "agtcgtggaggcaaggcacacagggatagg" -- sea louse reads, key GACT ti_adapter_b = "ctgagactgccaaggcacacagggggatagg" -- sea bass and l.s.Ca