-- | This implements a number of filters used in the Titanium pipeline module Bio.Sequence.SFF_filters where import Bio.Sequence.SFF (ReadBlock(..), ReadHeader(..), flowToBasePos, flowgram) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Char8 as BL import Data.List (tails) -- Ti uses a set of filters, described in the (something) manual. -- (GS Run Processor Application, section 3.2.2++) -- ** Discarding filters ** -- | DiscardFilters determine whether a read is to be retained or discarded type DiscardFilter = ReadBlock -> Bool -- True to retain, False to discard filter_dots, filter_mixed, filter_key, filter_empty :: DiscardFilter filter_empty rb = num_bases (read_header rb) >= 5 filter_key rb = ("TCAG"==) $ take 4 $ BL.unpack $ bases rb -- | 3.2.2.1.2 The "dots" filter discards sequences where the last positive flow is -- before flow 84, and flows with >5% dots (i.e. three successive noise values) -- before the last postitive flow. (Interpreted as 5% of called sequence length is Ns?) filter_dots rb = let dots = BL.length $ BL.filter (=='N') $ bases rb in fromIntegral dots / fromIntegral (BL.length $ bases rb) < (0.05 :: Double) -- | 3.2.2.1.3 The "mixed" filter discards sequences with more than 70% positive flows. -- Also, discard with <30% noise, >20% middle (0.45..0.75) or <30% positive. filter_mixed rb = let fs = dropWhile (<50) . reverse . flowgram $ rb fl = dlength fs in and [ (dlength (filter (>50) fs) / fl) > 0.7 , (dlength (filter (<45) fs) / fl) > 0.3 , (dlength (filter (>75) fs) / fl) > 0.3 , (dlength (filter (\f -> f<=75 && f>=45) fs) / fl) < 0.2 ] -- | Discard a read if the number of untrimmed flows is less than n (n=186 for Titanium) filter_length :: Int -> DiscardFilter filter_length n rb = length (flowgram rb) >= n -- ** Trimming filters ** -- | TrimFilters modify the read, typically trimming it for quality type TrimFilter = ReadBlock -> ReadBlock filter_sigint, filter_qual20 :: TrimFilter -- | 3.2.2.1.4 Signal intensity trim - trim back until <3% borderline flows (0.5..0.7). -- Then trim borderline values or dots from the end (use a window). filter_sigint rb = clipFlows rb (sigint rb) sigint :: ReadBlock -> Int sigint rb = let bs = scanl (\(n,m,_) f -> if f >= 50 && f <= 70 then (n+1,m+1,f) else (n,m+1,f)) (0,0,0) $ flowgram rb in length $ reverse $ dropWhile (\(_,_,f) -> f<=70) $ dropWhile (\(n,m,_)->n `div` (m*1000) > (30::Int)) $ reverse bs -- 3.2.2.1.5 Primer filter and 3.2.2.1.6 Trimback valley filter are ignored, -- since we don't know how to detect the primer, and don't understand the description of tbv. -- | 3.2.2.1.7 Quality score trimming trims using a 10-base window until a Q20 average is found. filter_qual20 rs = clipSeq rs $ qual20 rs qual20 :: ReadBlock -> Int qual20 rs = (fromIntegral $ num_bases $ read_header rs) - (length . takeWhile (<20) . map (avg . take 10) . tails . reverse . B.unpack $ quality rs) -- ** Utility functions ** -- | List length as a double (eliminates many instances of fromIntegral) dlength :: [a] -> Double dlength = fromIntegral . length -- | Calculate average of a list avg :: Integral a => [a] -> Double avg xs = fromIntegral (sum xs) / dlength xs -- | Translate a number of flows to position in sequence, and update clipping data accordingly clipFlows :: ReadBlock -> Int -> ReadBlock clipFlows rb n = clipSeq rb (flowToBasePos rb n) -- | Update clip_qual_right if more severe than previous value clipSeq :: ReadBlock -> Int -> ReadBlock clipSeq rb n' = let n = fromIntegral n' rh = read_header rb in if clip_qual_right rh <= n then rb else rb { read_header = rh {clip_qual_right = n }}