module Bio.ABI.Clean
  ( Cleanable (..)

  , Thresholds (..)
  , defaultThresholds
  ) where

import           Control.Monad (guard, join)
import qualified Data.Vector   as V

import           Bio.Sequence            (mean, meanInRange)
import qualified Bio.Sequence            as S (drop, length, reverse, tail, take)
import           Bio.Sequence.Basecalled (BasecalledSequence, BasecalledSequenceWithRawData (..))

-- | Ability to clean initial data.
-- Returns cleaned variant or 'Nothing' if it can not be cleaned.
--
class Cleanable a where
  clean :: a -> Maybe a
  clean = cleanWith defaultThresholds

  cleanWith :: Thresholds -> a -> Maybe a

-- | Thresholds to clean the data.
--
-- ABI file contains sequence with quality.
-- By design of sanger sequencing method start and end of the sequence have bad quality.
-- Moreover, internal part of the sequence can also has bad quality.
-- To clean the data we make 2 steps.
--
-- Step 1. Clean edges:
--
--   * take frame with @frameSize@ and go through the sequence;
--   * on each step evaluate mean value;
--   * if mean value less than 'edgeThreshold', go further;
--   * if mean value more than 'edgeThreshold' stop and cut the sequence from END of this frame;
--   * repeat this algorithm for the right edge.
--
-- Step 2. Evaluate quality:
--
--   * for cropped sequence evaluate mean value;
--   * if mean value less then 'innerThreshold', sequence is bad;
--   * if mean value more then 'innerThreshold', sequence is acceptable.
--
-- Logic of this algorithm and 'defaultThresholds' were obtained by taking experiments with read ABI files.
--
data Thresholds
  = Thresholds
      { frameSize      :: Int
      , edgeThreshold  :: Double
      , innerThreshold :: Double
      }
  deriving (Eq, Show)

-- | These thresholds were selected by many experiments on ab1-files.
--
defaultThresholds :: Thresholds
defaultThresholds = Thresholds 10 20 30

instance Cleanable BasecalledSequence where
  cleanWith thr input = do
      cut <- fromBoth
      guard $ checkInner thr cut
      return cut
    where
      fromLeft = doCutEdge thr input
      fromBoth =  fmap S.reverse
               .  join
               $  doCutEdge thr
               .  S.reverse
              <$> fromLeft

instance Cleanable BasecalledSequenceWithRawData where
  cleanWith thr input@BasecalledSequenceWithRawData{..} = do
    toDropLeft <- cutEdge thr bsSequence
    let leftDroppedSequ = S.drop toDropLeft bsSequence
    let leftDroppedPloc = V.drop toDropLeft bsPeakLocations

    toDropRight <- cutEdge thr $ S.reverse leftDroppedSequ
    let rightDroppedSequ = S.take (S.length leftDroppedSequ - toDropRight) leftDroppedSequ
    let rightDroppedPloc = V.take (V.length leftDroppedPloc - toDropRight) leftDroppedPloc

    guard $ checkInner thr rightDroppedSequ
    return input { bsSequence = rightDroppedSequ, bsPeakLocations = rightDroppedPloc }

-------------------------------------------------------------------------------
-- INTERNAL
-------------------------------------------------------------------------------

checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds{..} = (> innerThreshold) . mean

doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge t sequ = do
    toDrop <- cutEdge t sequ
    return $ S.drop toDrop sequ

cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge t@Thresholds{..} sequ | S.length sequ < frameSize                    = Just 0
                              | meanInR < edgeThreshold && S.length sequ > 1 = (1+) <$> cutEdge t (S.tail sequ)
                              | S.length sequ > frameSize                    = Just frameSize
                              | otherwise                                    = Nothing
  where
    meanInR = meanInRange sequ (0, frameSize - 1)