module Bio.ABI.Clean ( Cleanable (..) , Thresholds (..) ) where import Bio.Sequence (mean, meanInRange) import qualified Bio.Sequence as S (drop, length, reverse, tail) import Bio.Sequence.Basecalled (BasecalledSequence) import Control.Monad (join) -- | Ability to clean initial data. -- Returns cleaned variant or 'Nothing' if it can not be cleaned. -- class Cleanable a where clean :: a -> Maybe a clean = cleanWith defaultThresholds cleanWith :: Thresholds -> a -> Maybe a -- | Thresholds to clean the data. -- ABI file contains sequence with quality. -- By design of sanger sequencing method start and end of the sequence have bad quality. -- Moreover, internal part of the sequence can also has bad quality. -- To clean the data we make 2 steps. -- -- Step 1. Clean edges: -- * take frame with @frameSize@ and go through the sequence; -- * on each step evaluate mean value; -- * if mean value less than @edgeThreshold@, go further; -- * if mean value more than @edgeThreshold@, stop and cut the sequence from END of this frame; -- * repeat this algorithm for the right edge. -- -- Step 2. Evaluate quality: -- * for cropped sequence evaluate mean value; -- * if mean value less then @innerThreshold@, sequence is bad; -- * if mean value more then @innerThreshold@, sequence is acceptable. -- -- Logic of this algorithm and 'defaultThresholds' were obtained by taking experiments with read ABI files. -- data Thresholds = Thresholds { frameSize :: Int , edgeThreshold :: Double , innerThreshold :: Double } deriving (Eq, Show) -- | This thresholds were selected by many experiments on ab1-files. -- defaultThresholds :: Thresholds defaultThresholds = Thresholds 10 20 30 instance Cleanable BasecalledSequence where cleanWith thr input = if fmap (checkInner thr) fromBoth == Just True then fromBoth else Nothing where fromLeft = cutEdge defaultThresholds input fromBoth = fmap S.reverse . join $ cutEdge defaultThresholds . S.reverse <$> fromLeft ------------------------------------------------------------------------------- -- INTERNAL ------------------------------------------------------------------------------- checkInner :: Thresholds -> BasecalledSequence -> Bool checkInner Thresholds{..} = (> innerThreshold) . mean cutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence cutEdge t@Thresholds{..} sequ | S.length sequ < frameSize = Just sequ | meanInR < edgeThreshold && S.length sequ > 1 = cutEdge t $ S.tail sequ | S.length sequ > frameSize = Just $ S.drop frameSize sequ | otherwise = Nothing where meanInR = meanInRange sequ (0, frameSize - 1)