-- | Quality filters adapted from prehistoric pipeline.

module Bio.Bam.Filter (
    filterPairs, QualFilter,
    complexSimple, complexEntropy,
    qualityAverage, qualityMinimum,
    qualityFromOldIllumina, qualityFromNewIllumina
                      ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude
import Bio.Streaming

import qualified Data.Vector.Generic as V

-- | A filter/transformation applied to pairs of reads.  We supply a
-- predicate to be applied to single reads and one to be applied to
-- pairs, the latter can get incomplete pairs, too, if mates have been
-- separated or filtered asymmetrically.  This fails spectacularly if
-- the input isn't grouped by name.

filterPairs :: Monad m => (BamRec -> [BamRec])
                       -> (Maybe BamRec -> Maybe BamRec -> [BamRec])
                       -> Stream (Of BamRec) m r -> Stream (Of BamRec) m r
filterPairs :: (BamRec -> [BamRec])
-> (Maybe BamRec -> Maybe BamRec -> [BamRec])
-> Stream (Of BamRec) m r
-> Stream (Of BamRec) m r
filterPairs ps :: BamRec -> [BamRec]
ps pp :: Maybe BamRec -> Maybe BamRec -> [BamRec]
pp = Stream (Of BamRec) m r -> Stream (Of BamRec) m r
forall b. Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step
  where
    step :: Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step = m (Either b (Of BamRec (Stream (Of BamRec) m b)))
-> Stream
     (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m (Either b (Of BamRec (Stream (Of BamRec) m b)))
 -> Stream
      (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b))))
-> (Stream (Of BamRec) m b
    -> m (Either b (Of BamRec (Stream (Of BamRec) m b))))
-> Stream (Of BamRec) m b
-> Stream
     (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b)))
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Stream (Of BamRec) m b
-> m (Either b (Of BamRec (Stream (Of BamRec) m b)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect (Stream (Of BamRec) m b
 -> Stream
      (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b))))
-> (Either b (Of BamRec (Stream (Of BamRec) m b))
    -> Stream (Of BamRec) m b)
-> Stream (Of BamRec) m b
-> Stream (Of BamRec) m b
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> (b -> Stream (Of BamRec) m b)
-> (Of BamRec (Stream (Of BamRec) m b) -> Stream (Of BamRec) m b)
-> Either b (Of BamRec (Stream (Of BamRec) m b))
-> Stream (Of BamRec) m b
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either b -> Stream (Of BamRec) m b
forall (f :: * -> *) a. Applicative f => a -> f a
pure Of BamRec (Stream (Of BamRec) m b) -> Stream (Of BamRec) m b
step'
    step' :: Of BamRec (Stream (Of BamRec) m b) -> Stream (Of BamRec) m b
step' (b :: BamRec
b :> s :: Stream (Of BamRec) m b
s)
        | BamRec -> Bool
isPaired BamRec
b = m (Either b (Of BamRec (Stream (Of BamRec) m b)))
-> Stream
     (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Stream (Of BamRec) m b
-> m (Either b (Of BamRec (Stream (Of BamRec) m b)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect Stream (Of BamRec) m b
s) Stream
  (Of BamRec) m (Either b (Of BamRec (Stream (Of BamRec) m b)))
-> (Either b (Of BamRec (Stream (Of BamRec) m b))
    -> Stream (Of BamRec) m b)
-> Stream (Of BamRec) m b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= BamRec
-> Either b (Of BamRec (Stream (Of BamRec) m b))
-> Stream (Of BamRec) m b
step'' BamRec
b
        | Bool
otherwise  = [BamRec] -> Stream (Of BamRec) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
each (BamRec -> [BamRec]
ps BamRec
b) Stream (Of BamRec) m ()
-> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step Stream (Of BamRec) m b
s

    step'' :: BamRec
-> Either b (Of BamRec (Stream (Of BamRec) m b))
-> Stream (Of BamRec) m b
step'' b :: BamRec
b (Left r :: b
r) = [BamRec] -> Stream (Of BamRec) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
each (Maybe BamRec -> Maybe BamRec -> [BamRec]
pp (BamRec -> Maybe BamRec
forall a. a -> Maybe a
Just BamRec
b) Maybe BamRec
forall a. Maybe a
Nothing) Stream (Of BamRec) m ()
-> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> Stream (Of BamRec) m b
forall (f :: * -> *) a. Applicative f => a -> f a
pure b
r

    step'' b :: BamRec
b (Right (c :: BamRec
c :> s :: Stream (Of BamRec) m b
s))
        | BamRec -> Refseq
b_rname BamRec
b Refseq -> Refseq -> Bool
forall a. Eq a => a -> a -> Bool
/= BamRec -> Refseq
b_rname BamRec
c Bool -> Bool -> Bool
|| Bool -> Bool
not (BamRec -> Bool
isPaired BamRec
c) =
                let b' :: [BamRec]
b' = if BamRec -> Bool
isSecondMate BamRec
b then Maybe BamRec -> Maybe BamRec -> [BamRec]
pp Maybe BamRec
forall a. Maybe a
Nothing (BamRec -> Maybe BamRec
forall a. a -> Maybe a
Just BamRec
b) else Maybe BamRec -> Maybe BamRec -> [BamRec]
pp (BamRec -> Maybe BamRec
forall a. a -> Maybe a
Just BamRec
b) Maybe BamRec
forall a. Maybe a
Nothing
                in [BamRec] -> Stream (Of BamRec) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
each [BamRec]
b' Stream (Of BamRec) m ()
-> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Of BamRec (Stream (Of BamRec) m b) -> Stream (Of BamRec) m b
step' (BamRec
c BamRec
-> Stream (Of BamRec) m b -> Of BamRec (Stream (Of BamRec) m b)
forall a b. a -> b -> Of a b
:> Stream (Of BamRec) m b
s)

        | BamRec -> Bool
isFirstMate BamRec
c Bool -> Bool -> Bool
&& BamRec -> Bool
isSecondMate BamRec
b = BamRec
-> BamRec -> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step''' BamRec
c BamRec
b Stream (Of BamRec) m b
s
        | Bool
otherwise                       = BamRec
-> BamRec -> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step''' BamRec
b BamRec
c Stream (Of BamRec) m b
s

    step''' :: BamRec
-> BamRec -> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step''' b :: BamRec
b c :: BamRec
c s :: Stream (Of BamRec) m b
s = [BamRec] -> Stream (Of BamRec) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
each (Maybe BamRec -> Maybe BamRec -> [BamRec]
pp (BamRec -> Maybe BamRec
forall a. a -> Maybe a
Just BamRec
b) (BamRec -> Maybe BamRec
forall a. a -> Maybe a
Just BamRec
c)) Stream (Of BamRec) m ()
-> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Stream (Of BamRec) m b -> Stream (Of BamRec) m b
step Stream (Of BamRec) m b
s


-- | A quality filter is simply a transformation on @BamRec@s.  By
-- convention, quality filters should set @flagFailsQC@, a further step
-- can then remove the failed reads.  Filtering of individual reads
-- tends to result in mate pairs with inconsistent flags, which in turn
-- will result in lone mates and all sort of troubles with programs that
-- expect non-broken BAM files.  It is therefore recommended to use
-- @pairFilter@ with suitable predicates to do the post processing.

type QualFilter = BamRec -> BamRec

{-# INLINE count #-}
count :: (V.Vector v a, Eq a) => a -> v a -> Int
count :: a -> v a -> Int
count x :: a
x = (Int -> a -> Int) -> Int -> v a -> Int
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
V.foldl' (\acc :: Int
acc y :: a
y -> if a
x a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
y then Int
accInt -> Int -> Int
forall a. Num a => a -> a -> a
+1 else Int
acc) 0

-- | Simple complexity filter aka "Nancy Filter".  A read is considered
-- not-sufficiently-complex if the most common base accounts for greater
-- than the @cutoff@ fraction of all non-N bases.
{-# INLINE complexSimple #-}
complexSimple :: Double -> QualFilter
complexSimple :: Double -> QualFilter
complexSimple r :: Double
r b :: BamRec
b = if Bool
p then BamRec
b else BamRec
b'
  where
    b' :: BamRec
b' = Char -> QualFilter
setQualFlag 'C' QualFilter -> QualFilter
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFailsQC }
    p :: Bool
p  = let counts :: [Int]
counts = [ Nucleotides -> Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. (Vector v a, Eq a) => a -> v a -> Int
count Nucleotides
x (Vector_Nucs_half Nucleotides -> Int)
-> Vector_Nucs_half Nucleotides -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b | Nucleotides
x <- [Nucleotides]
properBases ]
             lim :: Int
lim = Double -> Int
forall a b. (RealFrac a, Integral b) => a -> b
floor (Double -> Int) -> Double -> Int
forall a b. (a -> b) -> a -> b
$ Double
r Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Int] -> Int
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Int]
counts)
         in (Int -> Bool) -> [Int] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
lim) [Int]
counts

-- | Filter on order zero empirical entropy.  Entropy per base must be
-- greater than cutoff.
{-# INLINE complexEntropy #-}
complexEntropy :: Double -> QualFilter
complexEntropy :: Double -> QualFilter
complexEntropy r :: Double
r b :: BamRec
b = if Bool
p then BamRec
b else BamRec
b'
  where
    b' :: BamRec
b' = Char -> QualFilter
setQualFlag 'C' QualFilter -> QualFilter
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFailsQC }
    p :: Bool
p = Double
ent Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
>= Double
r Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
total

    counts :: [Int]
counts = [ Nucleotides -> Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. (Vector v a, Eq a) => a -> v a -> Int
count Nucleotides
x (Vector_Nucs_half Nucleotides -> Int)
-> Vector_Nucs_half Nucleotides -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b | Nucleotides
x <- [Nucleotides]
properBases ]
    total :: Double
total = Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> Int -> Double
forall a b. (a -> b) -> a -> b
$ Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (Vector_Nucs_half Nucleotides -> Int)
-> Vector_Nucs_half Nucleotides -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b
    ent :: Double
ent   = [Double] -> Double
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
c Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double -> Double
forall a. Floating a => a -> a
log (Double
total Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
c) | Int
c <- [Int]
counts, Int
c Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= 0 ] Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double -> Double
forall a. Floating a => a -> a
log 2

-- | Filter on average quality.  Reads without qualities pass.
{-# INLINE qualityAverage #-}
qualityAverage :: Int -> QualFilter
qualityAverage :: Int -> QualFilter
qualityAverage q :: Int
q b :: BamRec
b = if Bool
p then BamRec
b else BamRec
b'
  where
    b' :: BamRec
b' = Char -> QualFilter
setQualFlag 'Q' QualFilter -> QualFilter
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFailsQC }
    p :: Bool
p  = case BamRec -> Maybe (Vector Qual)
b_qual BamRec
b of Nothing -> Bool
True
                          Just qs :: Vector Qual
qs -> let total :: Int
total = (Int -> Qual -> Int) -> Int -> Vector Qual -> Int
forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
V.foldl' (\a :: Int
a x :: Qual
x -> Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Qual -> Word8
unQ Qual
x)) 0 Vector Qual
qs
                                     in Int
total Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
q Int -> Int -> Int
forall a. Num a => a -> a -> a
* Vector Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length Vector Qual
qs

-- | Filter on minimum quality.  In @qualityMinimum n q@, a read passes
-- if it has no more than @n@ bases with quality less than @q@.  Reads
-- without qualities pass.
{-# INLINE qualityMinimum #-}
qualityMinimum :: Int -> Qual -> QualFilter
qualityMinimum :: Int -> Qual -> QualFilter
qualityMinimum n :: Int
n (Q q :: Word8
q) b :: BamRec
b = if Bool
p then BamRec
b else BamRec
b'
  where
    b' :: BamRec
b' = Char -> QualFilter
setQualFlag 'Q' QualFilter -> QualFilter
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFailsQC }
    p :: Bool
p  = case BamRec -> Maybe (Vector Qual)
b_qual BamRec
b of Nothing -> Bool
True
                          Just qs :: Vector Qual
qs -> Vector Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length ((Qual -> Bool) -> Vector Qual -> Vector Qual
forall (v :: * -> *) a. Vector v a => (a -> Bool) -> v a -> v a
V.filter (Qual -> Qual -> Bool
forall a. Ord a => a -> a -> Bool
< Word8 -> Qual
Q Word8
q) Vector Qual
qs) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
n


-- | Convert quality scores from old Illumina scale (different formula
-- and offset 64 in FastQ).
qualityFromOldIllumina :: BamRec -> BamRec
qualityFromOldIllumina :: QualFilter
qualityFromOldIllumina b :: BamRec
b = BamRec
b { b_qual :: Maybe (Vector Qual)
b_qual = (Qual -> Qual) -> Vector Qual -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map Qual -> Qual
conv (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
b }
  where
    conv :: Qual -> Qual
conv (Q s :: Word8
s) = let s' :: Double
                     s' :: Double
s' = Double -> Double
forall a. Floating a => a -> a
exp (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ Double -> Double
forall a. Floating a => a -> a
log 10 Double -> Double -> Double
forall a. Num a => a -> a -> a
* (Word8 -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
s Double -> Double -> Double
forall a. Num a => a -> a -> a
- 31) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (-10)
                     p :: Double
p  = Double
s' Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ (1Double -> Double -> Double
forall a. Num a => a -> a -> a
+Double
s')
                     q :: Double
q  = - 10 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double -> Double
forall a. Floating a => a -> a
log Double
p Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double -> Double
forall a. Floating a => a -> a
log 10
                 in Word8 -> Qual
Q (Double -> Word8
forall a b. (RealFrac a, Integral b) => a -> b
round Double
q)

-- | Convert quality scores from new Illumina scale (standard formula
-- but offset 64 in FastQ).
qualityFromNewIllumina :: BamRec -> BamRec
qualityFromNewIllumina :: QualFilter
qualityFromNewIllumina b :: BamRec
b = BamRec
b { b_qual :: Maybe (Vector Qual)
b_qual = (Qual -> Qual) -> Vector Qual -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Word8 -> Qual
Q (Word8 -> Qual) -> (Qual -> Word8) -> Qual -> Qual
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
subtract 31 (Word8 -> Word8) -> (Qual -> Word8) -> Qual -> Word8
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Qual -> Word8
unQ) (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
b }