module Bio.ABI.Clean
  ( Cleanable (..)

  , Thresholds (..)
  , defaultThresholds
  ) where

import           Control.Monad (guard, join)
import qualified Data.Vector   as V

import           Bio.Sequence            (mean, meanInRange)
import qualified Bio.Sequence            as S (drop, length, reverse, tail, take)
import           Bio.Sequence.Basecalled (BasecalledSequence, BasecalledSequenceWithRawData (..))

-- | Ability to clean initial data.
-- Returns cleaned variant or 'Nothing' if it can not be cleaned.
--
class Cleanable a where
  clean :: a -> Maybe a
  clean = Thresholds -> a -> Maybe a
forall a. Cleanable a => Thresholds -> a -> Maybe a
cleanWith Thresholds
defaultThresholds

  cleanWith :: Thresholds -> a -> Maybe a

-- | Thresholds to clean the data.
--
-- ABI file contains sequence with quality.
-- By design of sanger sequencing method start and end of the sequence have bad quality.
-- Moreover, internal part of the sequence can also has bad quality.
-- To clean the data we make 2 steps.
--
-- Step 1. Clean edges:
--
--   * take frame with @frameSize@ and go through the sequence;
--   * on each step evaluate mean value;
--   * if mean value less than 'edgeThreshold', go further;
--   * if mean value more than 'edgeThreshold' stop and cut the sequence from END of this frame;
--   * repeat this algorithm for the right edge.
--
-- Step 2. Evaluate quality:
--
--   * for cropped sequence evaluate mean value;
--   * if mean value less then 'innerThreshold', sequence is bad;
--   * if mean value more then 'innerThreshold', sequence is acceptable.
--
-- Logic of this algorithm and 'defaultThresholds' were obtained by taking experiments with read ABI files.
--
data Thresholds
  = Thresholds
      { Thresholds -> Int
frameSize      :: Int
      , Thresholds -> Double
edgeThreshold  :: Double
      , Thresholds -> Double
innerThreshold :: Double
      }
  deriving (Thresholds -> Thresholds -> Bool
(Thresholds -> Thresholds -> Bool)
-> (Thresholds -> Thresholds -> Bool) -> Eq Thresholds
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Thresholds -> Thresholds -> Bool
$c/= :: Thresholds -> Thresholds -> Bool
== :: Thresholds -> Thresholds -> Bool
$c== :: Thresholds -> Thresholds -> Bool
Eq, Int -> Thresholds -> ShowS
[Thresholds] -> ShowS
Thresholds -> String
(Int -> Thresholds -> ShowS)
-> (Thresholds -> String)
-> ([Thresholds] -> ShowS)
-> Show Thresholds
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Thresholds] -> ShowS
$cshowList :: [Thresholds] -> ShowS
show :: Thresholds -> String
$cshow :: Thresholds -> String
showsPrec :: Int -> Thresholds -> ShowS
$cshowsPrec :: Int -> Thresholds -> ShowS
Show)

-- | These thresholds were selected by many experiments on ab1-files.
--
defaultThresholds :: Thresholds
defaultThresholds :: Thresholds
defaultThresholds = Int -> Double -> Double -> Thresholds
Thresholds Int
10 Double
20 Double
30

instance Cleanable BasecalledSequence where
  cleanWith :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
cleanWith Thresholds
thr BasecalledSequence
input = do
      BasecalledSequence
cut <- Maybe BasecalledSequence
fromBoth
      Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds
thr BasecalledSequence
cut
      BasecalledSequence -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequence
cut
    where
      fromLeft :: Maybe BasecalledSequence
fromLeft = Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
thr BasecalledSequence
input
      fromBoth :: Maybe BasecalledSequence
fromBoth =  (BasecalledSequence -> BasecalledSequence)
-> Maybe BasecalledSequence -> Maybe BasecalledSequence
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse
               (Maybe BasecalledSequence -> Maybe BasecalledSequence)
-> (Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence)
-> Maybe (Maybe BasecalledSequence)
-> Maybe BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
.  Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join
               (Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence)
-> Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence
forall a b. (a -> b) -> a -> b
$  Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
thr
               (BasecalledSequence -> Maybe BasecalledSequence)
-> (BasecalledSequence -> BasecalledSequence)
-> BasecalledSequence
-> Maybe BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
.  BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse
              (BasecalledSequence -> Maybe BasecalledSequence)
-> Maybe BasecalledSequence -> Maybe (Maybe BasecalledSequence)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe BasecalledSequence
fromLeft

instance Cleanable BasecalledSequenceWithRawData where
  cleanWith :: Thresholds
-> BasecalledSequenceWithRawData
-> Maybe BasecalledSequenceWithRawData
cleanWith Thresholds
thr input :: BasecalledSequenceWithRawData
input@BasecalledSequenceWithRawData{Vector Int
Vector Int16
BasecalledSequence
bsPeakLocations :: BasecalledSequenceWithRawData -> Vector Int
bsRawC :: BasecalledSequenceWithRawData -> Vector Int16
bsRawT :: BasecalledSequenceWithRawData -> Vector Int16
bsRawA :: BasecalledSequenceWithRawData -> Vector Int16
bsRawG :: BasecalledSequenceWithRawData -> Vector Int16
bsSequence :: BasecalledSequenceWithRawData -> BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
..} = do
    Int
toDropLeft <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
thr BasecalledSequence
bsSequence
    let leftDroppedSequ :: BasecalledSequence
leftDroppedSequ = Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.drop Int
toDropLeft BasecalledSequence
bsSequence
    let leftDroppedPloc :: Vector Int
leftDroppedPloc = Int -> Vector Int -> Vector Int
forall a. Int -> Vector a -> Vector a
V.drop Int
toDropLeft Vector Int
bsPeakLocations

    Int
toDropRight <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
thr (BasecalledSequence -> Maybe Int)
-> BasecalledSequence -> Maybe Int
forall a b. (a -> b) -> a -> b
$ BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse BasecalledSequence
leftDroppedSequ
    let rightDroppedSequ :: BasecalledSequence
rightDroppedSequ = Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.take (BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
leftDroppedSequ Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
toDropRight) BasecalledSequence
leftDroppedSequ
    let rightDroppedPloc :: Vector Int
rightDroppedPloc = Int -> Vector Int -> Vector Int
forall a. Int -> Vector a -> Vector a
V.take (Vector Int -> Int
forall a. Vector a -> Int
V.length Vector Int
leftDroppedPloc Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
toDropRight) Vector Int
leftDroppedPloc

    Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds
thr BasecalledSequence
rightDroppedSequ
    BasecalledSequenceWithRawData
-> Maybe BasecalledSequenceWithRawData
forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequenceWithRawData
input { bsSequence :: BasecalledSequence
bsSequence = BasecalledSequence
rightDroppedSequ, bsPeakLocations :: Vector Int
bsPeakLocations = Vector Int
rightDroppedPloc }

-------------------------------------------------------------------------------
-- INTERNAL
-------------------------------------------------------------------------------

checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds{Double
Int
innerThreshold :: Double
edgeThreshold :: Double
frameSize :: Int
innerThreshold :: Thresholds -> Double
edgeThreshold :: Thresholds -> Double
frameSize :: Thresholds -> Int
..} = (Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
innerThreshold) (Double -> Bool)
-> (BasecalledSequence -> Double) -> BasecalledSequence -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. BasecalledSequence -> Double
forall s. ContainsWeight s => s -> Double
mean

doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
t BasecalledSequence
sequ = do
    Int
toDrop <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
t BasecalledSequence
sequ
    BasecalledSequence -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => a -> m a
return (BasecalledSequence -> Maybe BasecalledSequence)
-> BasecalledSequence -> Maybe BasecalledSequence
forall a b. (a -> b) -> a -> b
$ Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.drop Int
toDrop BasecalledSequence
sequ

cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge t :: Thresholds
t@Thresholds{Double
Int
innerThreshold :: Double
edgeThreshold :: Double
frameSize :: Int
innerThreshold :: Thresholds -> Double
edgeThreshold :: Thresholds -> Double
frameSize :: Thresholds -> Int
..} BasecalledSequence
sequ | BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
frameSize                    = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
0
                              | Double
meanInR Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
edgeThreshold Bool -> Bool -> Bool
&& BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 = (Int
1Int -> Int -> Int
forall a. Num a => a -> a -> a
+) (Int -> Int) -> Maybe Int -> Maybe Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
t (BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => s -> s
S.tail BasecalledSequence
sequ)
                              | BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
frameSize                    = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
frameSize
                              | Bool
otherwise                                    = Maybe Int
forall a. Maybe a
Nothing
  where
    meanInR :: Double
meanInR = BasecalledSequence -> RangeInclusive -> Double
forall s. ContainsWeight s => s -> RangeInclusive -> Double
meanInRange BasecalledSequence
sequ (Int
0, Int
frameSize Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)