-- | This implements a number of filters used in the Titanium pipeline
module Bio.Sequence.SFF_filters where

import Bio.Sequence.SFF (ReadBlock(..), ReadHeader(..), flowToBasePos, flowgram)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (tails)

-- Ti uses a set of filters, described in the (something) manual.
-- (GS Run Processor Application, section 3.2.2++)

-- ** Discarding filters **

-- | DiscardFilters determine whether a read is to be retained or discarded
type DiscardFilter = ReadBlock -> Bool -- True to retain, False to discard

filter_dots, filter_mixed, filter_key, filter_empty :: DiscardFilter

filter_empty rb = num_bases (read_header rb) >= 5

filter_key rb = ("TCAG"==) $ take 4 $ BL.unpack $ bases rb

-- | 3.2.2.1.2 The "dots" filter discards sequences where the last positive flow is 
--   before flow 84, and flows with >5% dots (i.e. three successive noise values) 
--   before the last postitive flow. (Interpreted as 5% of called sequence length is Ns?)
filter_dots rb = let dots = BL.length $ BL.filter (=='N') $ bases rb
                 in fromIntegral dots / fromIntegral (BL.length $ bases rb) < (0.05 :: Double)

-- | 3.2.2.1.3 The "mixed" filter discards sequences with more than 70% positive flows.  
--   Also, discard with <30% noise, >20% middle (0.45..0.75) or <30% positive.
filter_mixed rb = let fs = dropWhile (<50) . reverse . flowgram $ rb
                      fl = dlength fs
                  in and [ (dlength (filter (>50) fs) / fl) > 0.7
                         , (dlength (filter (<45) fs) / fl) > 0.3 
                         , (dlength (filter (>75) fs) / fl) > 0.3
                         , (dlength (filter (\f -> f<=75 && f>=45) fs) / fl) < 0.2
                         ]
-- | Discard a read if the number of untrimmed flows is less than n (n=186 for Titanium)
filter_length :: Int -> DiscardFilter
filter_length n rb = length (flowgram rb) >= n

-- ** Trimming filters **

-- | TrimFilters modify the read, typically trimming it for quality
type TrimFilter = ReadBlock -> ReadBlock
filter_sigint, filter_qual20 :: TrimFilter

-- | 3.2.2.1.4 Signal intensity trim - trim back until <3% borderline flows (0.5..0.7).
--   Then trim borderline values or dots from the end (use a window).
filter_sigint rb = clipFlows rb (sigint rb)

sigint :: ReadBlock -> Int
sigint rb = let bs = scanl (\(n,m,_) f -> if f >= 50 && f <= 70 then (n+1,m+1,f) else (n,m+1,f)) (0,0,0) $ flowgram rb 
            in length $ reverse 
                   $ dropWhile (\(_,_,f) -> f<=70)
                   $ dropWhile (\(n,m,_)->n `div` (m*1000) > (30::Int)) $ reverse bs


-- 3.2.2.1.5 Primer filter and 3.2.2.1.6 Trimback valley filter are ignored, 
-- since we don't know how to detect the primer, and don't understand the description of tbv.

-- | 3.2.2.1.7 Quality score trimming trims using a 10-base window until a Q20 average is found. 
filter_qual20 rs = clipSeq rs $ qual20 rs

qual20 :: ReadBlock -> Int
qual20 rs = (fromIntegral $ num_bases $ read_header rs) 
            - (length . takeWhile (<20) . map (avg . take 10) . tails . reverse . B.unpack $ quality rs)

-- ** Utility functions **

-- | List length as a double (eliminates many instances of fromIntegral)
dlength :: [a] -> Double
dlength = fromIntegral . length

-- | Calculate average of a list
avg :: Integral a => [a] -> Double
avg xs = fromIntegral (sum xs) / dlength xs

-- | Translate a number of flows to position in sequence, and update clipping data accordingly
clipFlows :: ReadBlock -> Int -> ReadBlock
clipFlows rb n = clipSeq rb (flowToBasePos rb n)

-- | Update clip_qual_right if more severe than previous value
clipSeq :: ReadBlock -> Int -> ReadBlock
clipSeq rb n' = let n = fromIntegral n' 
                    rh = read_header rb 
                in if clip_qual_right rh <= n then rb else rb { read_header = rh {clip_qual_right = n }}