module Bio.ABI.Clean
( Cleanable (..)
, Thresholds (..)
, defaultThresholds
) where
import Control.Monad (guard, join)
import qualified Data.Vector as V
import Bio.Sequence (mean, meanInRange)
import qualified Bio.Sequence as S (drop, length, reverse, tail, take)
import Bio.Sequence.Basecalled (BasecalledSequence, BasecalledSequenceWithRawData (..))
class Cleanable a where
clean :: a -> Maybe a
clean = cleanWith defaultThresholds
cleanWith :: Thresholds -> a -> Maybe a
data Thresholds
= Thresholds
{ frameSize :: Int
, edgeThreshold :: Double
, innerThreshold :: Double
}
deriving (Eq, Show)
defaultThresholds :: Thresholds
defaultThresholds = Thresholds 10 20 30
instance Cleanable BasecalledSequence where
cleanWith thr input = do
cut <- fromBoth
guard $ checkInner thr cut
return cut
where
fromLeft = doCutEdge thr input
fromBoth = fmap S.reverse
. join
$ doCutEdge thr
. S.reverse
<$> fromLeft
instance Cleanable BasecalledSequenceWithRawData where
cleanWith thr input@BasecalledSequenceWithRawData{..} = do
toDropLeft <- cutEdge thr bsSequence
let leftDroppedSequ = S.drop toDropLeft bsSequence
let leftDroppedPloc = V.drop toDropLeft bsPeakLocations
toDropRight <- cutEdge thr $ S.reverse leftDroppedSequ
let rightDroppedSequ = S.take (S.length leftDroppedSequ - toDropRight) leftDroppedSequ
let rightDroppedPloc = V.take (V.length leftDroppedPloc - toDropRight) leftDroppedPloc
guard $ checkInner thr rightDroppedSequ
return input { bsSequence = rightDroppedSequ, bsPeakLocations = rightDroppedPloc }
checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds{..} = (> innerThreshold) . mean
doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge t sequ = do
toDrop <- cutEdge t sequ
return $ S.drop toDrop sequ
cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge t@Thresholds{..} sequ | S.length sequ < frameSize = Just 0
| meanInR < edgeThreshold && S.length sequ > 1 = (1+) <$> cutEdge t (S.tail sequ)
| S.length sequ > frameSize = Just frameSize
| otherwise = Nothing
where
meanInR = meanInRange sequ (0, frameSize - 1)