-- | This module contains stuff relating to conventions local to MPI
-- EVAN.  The code is needed regularly, but it can be harmful when
-- applied to BAM files that follow different conventions.  Most
-- importantly, no program should call these functions by default.

module Bio.Bam.Evan
    ( fixupFlagAbuse
    , fixupBwaFlags
    , removeWarts
    ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude

import qualified Data.ByteString.Char8 as S

-- | Fixes abuse of flags valued 0x800 and 0x1000.  We used them for
-- low quality and low complexity, but they have since been redefined.
-- If set, we clear them and store them into the ZQ field.  Also fixes
-- abuse of the combination of the paired, 1st mate and 2nd mate flags
-- used to indicate merging or trimming.  These are canonicalized and
-- stored into the FF field.  This function is unsafe on BAM files of
-- unclear origin!
fixupFlagAbuse :: BamRec -> BamRec
fixupFlagAbuse :: BamRec -> BamRec
fixupFlagAbuse b :: BamRec
b =
    (if BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
flag_low_quality Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= 0 then Char -> BamRec -> BamRec
setQualFlag 'Q' else BamRec -> BamRec
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id) (BamRec -> BamRec) -> BamRec -> BamRec
forall a b. (a -> b) -> a -> b
$          -- low qual, new convention
    (if BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
flag_low_complexity Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= 0 then Char -> BamRec -> BamRec
setQualFlag 'C' else BamRec -> BamRec
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id) (BamRec -> BamRec) -> BamRec -> BamRec
forall a b. (a -> b) -> a -> b
$       -- low complexity, new convention
    BamRec
b { b_flag :: Int
b_flag = Int
cleaned_flags, b_exts :: Extensions
b_exts = Extensions
cleaned_exts }
  where
        -- removes old flag abuse
        flags' :: Int
flags' = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement (Int
flag_low_quality Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flag_low_complexity)
        cleaned_flags :: Int
cleaned_flags | Int
flags' Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
flagPaired Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0 = Int
flags' Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement (Int
flagFirstMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate)
                      | Bool
otherwise                  = Int
flags'

        flag_low_quality :: Int
flag_low_quality    =  0x800
        flag_low_complexity :: Int
flag_low_complexity = 0x1000

        -- merged & trimmed from old flag abuse
        is_merged :: Bool
is_merged  = Int
flags' Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. (Int
flagPaired Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFirstMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
flagFirstMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate
        is_trimmed :: Bool
is_trimmed = Int
flags' Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. (Int
flagPaired Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFirstMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
flagSecondMate
        newflags :: Int
newflags = (if Bool
is_merged then Int
eflagMerged else 0) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (if Bool
is_trimmed then Int
eflagTrimmed else 0)

        -- Extended flags, renamed to avoid collision with BWA.  Goes like this:  if FF is there, use
        -- it.  Else check if XF is there __and is numeric__.  If so, use it, remove it, and set FF
        -- instead.  Else use 0 and leave it alone.  Note that this resolves the collision with BWA,
        -- since BWA puts a character there, not an int.
        cleaned_exts :: Extensions
cleaned_exts = case (BamKey -> Extensions -> Maybe Ext
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup "FF" (BamRec -> Extensions
b_exts BamRec
b), BamKey -> Extensions -> Maybe Ext
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup "XF" (BamRec -> Extensions
b_exts BamRec
b)) of
                ( Just (Int i :: Int
i), _ ) -> BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int
i Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
newflags))                (BamRec -> Extensions
b_exts BamRec
b)
                ( _, Just (Int i :: Int
i) ) -> BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int
i Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
newflags)) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamKey -> Extensions -> Extensions
deleteE "XF" (BamRec -> Extensions
b_exts BamRec
b)
                _ | Int
newflags Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= 0   -> BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int        Int
newflags )                (BamRec -> Extensions
b_exts BamRec
b)
                  | Bool
otherwise       ->                                                     BamRec -> Extensions
b_exts BamRec
b


-- | Fixes typical inconsistencies produced by Bwa: sometimes, 'mate unmapped' should be set, and we
-- can see it, because we match the mate's coordinates.  Sometimes 'properly paired' should not be
-- set, because one mate is unmapped.  This function is generally safe, but needs to be called only
-- on the output of affected (older?) versions of Bwa.
fixupBwaFlags :: BamRec -> BamRec
fixupBwaFlags :: BamRec -> BamRec
fixupBwaFlags b :: BamRec
b = BamRec
b { b_flag :: Int
b_flag = Int -> Int
fixPP (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. if Bool
mu then Int
flagMateUnmapped else 0 }
  where
        -- Set "mate unmapped" if self coordinates and mate coordinates are equal, but self is
        -- paired and mapped.  (BWA forgets this flag for invalid mate alignments)
        mu :: Bool
mu = [Bool] -> Bool
forall (t :: * -> *). Foldable t => t Bool -> Bool
and [ BamRec -> Bool
isPaired BamRec
b, Bool -> Bool
not (BamRec -> Bool
isUnmapped BamRec
b)
                 , BamRec -> Bool
isReversed BamRec
b Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Bool
isMateReversed BamRec
b
                 , BamRec -> Refseq
b_rname BamRec
b Refseq -> Refseq -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Refseq
b_mrnm BamRec
b, BamRec -> Int
b_pos BamRec
b Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Int
b_mpos BamRec
b ]

        -- If either mate is unmapped, remove "properly paired".
        fixPP :: Int -> Int
fixPP f :: Int
f | Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. (Int
flagUnmapped Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagMateUnmapped) Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0 = Int
f
                | Bool
otherwise = Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement Int
flagProperlyPaired


-- | Removes syntactic warts from old read names or the read names used
-- in FastQ files.  Supported conventions:
--
-- * A name suffix of @/1@ or @/2@ is turned into the first mate or second
--   mate flag and the read is flagged as paired.
--
-- * Same for name prefixes of @F_@ or @R_@, respectively.
--
-- * A name prefix of @M_@ flags the sequence as unpaired and merged
--
-- * A name prefix of @T_@ flags the sequence as unpaired and trimmed
--
-- * A name prefix of @C_@, optionally before or after any of the other
--   prefixes, is turned into the extra flag @XP:i:-1@ (result of
--   duplicate removal with unknown duplicate count).
--
-- * A collection of tags separated from the name by an octothorpe is
--   removed and put into the fields @XI@ and @XJ@ as text.

removeWarts :: BamRec -> BamRec
removeWarts :: BamRec -> BamRec
removeWarts br :: BamRec
br = BamRec
br { b_qname :: Bytes
b_qname = Bytes
name, b_flag :: Int
b_flag = Int
flags, b_exts :: Extensions
b_exts = Extensions
tags }
  where
    (name :: Bytes
name, flags :: Int
flags, tags :: Extensions
tags) = (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
checkFR ((Bytes, Int, Extensions) -> (Bytes, Int, Extensions))
-> (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall a b. (a -> b) -> a -> b
$ (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC ((Bytes, Int, Extensions) -> (Bytes, Int, Extensions))
-> (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall a b. (a -> b) -> a -> b
$ (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkSharp (BamRec -> Bytes
b_qname BamRec
br, BamRec -> Int
b_flag BamRec
br, BamRec -> Extensions
b_exts BamRec
br)

    checkFR :: (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
checkFR (n :: Bytes
n,f :: Int
f,t :: Extensions
t) | "F_" Bytes -> Bytes -> Bool
`S.isPrefixOf` Bytes
n = (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC (Int -> Bytes -> Bytes
S.drop 2 Bytes
n, Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFirstMate  Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagPaired, Extensions
t)
                    | "R_" Bytes -> Bytes -> Bool
`S.isPrefixOf` Bytes
n = (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC (Int -> Bytes -> Bytes
S.drop 2 Bytes
n, Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagPaired, Extensions
t)
                    | "M_" Bytes -> Bytes -> Bool
`S.isPrefixOf` Bytes
n = (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC (Int -> Bytes -> Bytes
S.drop 2 Bytes
n, Int
f,   BamKey -> Ext -> Extensions -> Extensions
insertE "FF" (Int -> Ext
Int  Int
eflagMerged) Extensions
t)
                    | "T_" Bytes -> Bytes -> Bool
`S.isPrefixOf` Bytes
n = (Bytes, Int, Extensions) -> (Bytes, Int, Extensions)
forall b. (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC (Int -> Bytes -> Bytes
S.drop 2 Bytes
n, Int
f,   BamKey -> Ext -> Extensions -> Extensions
insertE "FF" (Int -> Ext
Int Int
eflagTrimmed) Extensions
t)
                    | "/1" Bytes -> Bytes -> Bool
`S.isSuffixOf` Bytes
n =        ( Int -> Bytes -> Bytes
rdrop 2 Bytes
n, Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFirstMate  Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagPaired, Extensions
t)
                    | "/2" Bytes -> Bytes -> Bool
`S.isSuffixOf` Bytes
n =        ( Int -> Bytes -> Bytes
rdrop 2 Bytes
n, Int
f Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagPaired, Extensions
t)
                    | Bool
otherwise             =        (         Bytes
n, Int
f,                                   Extensions
t)

    checkC :: (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkC (n :: Bytes
n,f :: b
f,t :: Extensions
t) | "C_" Bytes -> Bytes -> Bool
`S.isPrefixOf` Bytes
n  = (Int -> Bytes -> Bytes
S.drop 2 Bytes
n, b
f, BamKey -> Ext -> Extensions -> Extensions
insertE "XP" (Int -> Ext
Int (-1)) Extensions
t)
                   | Bool
otherwise              = (         Bytes
n, b
f,                         Extensions
t)

    rdrop :: Int -> Bytes -> Bytes
rdrop n :: Int
n s :: Bytes
s = Int -> Bytes -> Bytes
S.take (Bytes -> Int
S.length Bytes
s Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
n) Bytes
s

    checkSharp :: (Bytes, b, Extensions) -> (Bytes, b, Extensions)
checkSharp (n :: Bytes
n,f :: b
f,t :: Extensions
t) = case Char -> Bytes -> [Bytes]
S.split '#' Bytes
n of [n' :: Bytes
n',ts :: Bytes
ts] -> (Bytes
n', b
f, Bytes -> Extensions -> Extensions
insertTags Bytes
ts Extensions
t)
                                               _       -> ( Bytes
n, b
f,               Extensions
t)

    insertTags :: Bytes -> Extensions -> Extensions
insertTags ts :: Bytes
ts t :: Extensions
t | Bytes -> Bool
S.null Bytes
y  = BamKey -> Ext -> Extensions -> Extensions
insertE "XI" (Bytes -> Ext
Text Bytes
ts) Extensions
t
                    | Bool
otherwise = BamKey -> Ext -> Extensions -> Extensions
insertE "XI" (Bytes -> Ext
Text  Bytes
x) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamKey -> Ext -> Extensions -> Extensions
insertE "XJ" (Bytes -> Ext
Text (Bytes -> Ext) -> Bytes -> Ext
forall a b. (a -> b) -> a -> b
$ Bytes -> Bytes
S.tail Bytes
y) Extensions
t
        where (x :: Bytes
x,y :: Bytes
y) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
S.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== ',') Bytes
ts