{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
module Bio.Data.Fastq
( Fastq(..)
, streamFastqGzip
, streamFastq
, sinkFastqGzip
, sinkFastq
, parseFastqC
, fastqToByteString
, qualitySummary
, trimPolyA
) where
import Conduit
import Data.Conduit.Zlib (ungzip, multiple, gzip)
import Control.Monad (when)
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as BS
import Data.Maybe (isJust)
import qualified Data.Attoparsec.ByteString as A
import Data.Attoparsec.ByteString.Char8
import Data.Conduit.Attoparsec
data Fastq = Fastq
{ fastqSeqId :: B.ByteString
, fastqSeq :: B.ByteString
, fastqSeqQual :: B.ByteString
} deriving (Show, Eq)
streamFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m)
=> FilePath -> ConduitT i Fastq m ()
streamFastqGzip fl = sourceFileBS fl .| multiple ungzip .| parseFastqC
streamFastq :: (MonadResource m, MonadThrow m)
=> FilePath -> ConduitT i Fastq m ()
streamFastq fl = sourceFileBS fl .| parseFastqC
sinkFastq :: (MonadResource m, MonadThrow m)
=> FilePath -> ConduitT Fastq o m ()
sinkFastq fl = mapC fastqToByteString .| unlinesAsciiC .| sinkFileBS fl
sinkFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m)
=> FilePath -> ConduitT Fastq o m ()
sinkFastqGzip fl = mapC fastqToByteString .| unlinesAsciiC .| gzip .| sinkFileBS fl
parseFastqC :: MonadThrow m => ConduitT B.ByteString Fastq m ()
parseFastqC = conduitParser fastqParser .| mapC snd
{-# INLINE parseFastqC #-}
fastqParser :: Parser Fastq
fastqParser = do
_ <- skipWhile (/='@') >> char8 '@'
ident <- A.takeTill isEndOfLine
endOfLine
sequence <- BS.filter (not . isEndOfLine) <$> takeTill (=='+')
char8 '+' >> A.skipWhile (not . isEndOfLine) >> endOfLine
score <- BS.filter (not . isEndOfLine) <$>
A.scan 0 (f (B.length sequence))
skipWhile (/='@')
return $ Fastq ident sequence score
where
f n i x | i >= n = Nothing
| isEndOfLine x = Just i
| otherwise = Just $ i + 1
{-# INLINE fastqParser #-}
fastqToByteString :: Fastq -> B.ByteString
fastqToByteString (Fastq a b c) = "@" <> a <> "\n" <> b <> "\n+\n" <> c
{-# INLINE fastqToByteString #-}
qualitySummary :: Monad m => ConduitT Fastq o m [(Double, Double)]
qualitySummary = mapC (map fromIntegral . decodeQualSc) .| meanVarianceC
meanVarianceC :: Monad m => ConduitT [Double] o m [(Double, Double)]
meanVarianceC = peekC >>= \case
Nothing -> error "Empty input"
Just x -> fst <$> foldlC f (replicate (length x) (0,0), 0)
where
f (acc, n) xs = let acc' = zipWith g acc xs in (acc', n')
where
n' = n + 1
g (m, s) x = (m', s')
where
m' = m + d / fromIntegral n'
s' = s + d * (x - m')
d = x - m
{-# INLINE meanVarianceC #-}
decodeQualSc :: Fastq -> [Int]
decodeQualSc = map (fromIntegral . (\x -> x - 33)) . BS.unpack .fastqSeqQual
{-# INLINE decodeQualSc #-}
pError :: Int -> Double
pError x = 10 ** (negate (fromIntegral x) / 10)
{-# INLINE pError #-}
trimPolyA :: Int -> Fastq -> Fastq
trimPolyA n f@(Fastq a b c)
| B.length trailing >= n = Fastq a b' $ B.take (B.length b') c
| otherwise = f
where
(b', trailing) = B.spanEnd (=='A') b