{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
module Bio.Data.Fastq
    ( Fastq(..)
    , streamFastqGzip
    , streamFastq
    , sinkFastqGzip
    , sinkFastq
    , parseFastqC
    , parseFastqC'
    , fastqToByteString
    , qualitySummary
    , trimPolyA
    ) where

import           Conduit
import Data.Conduit.Zlib (ungzip, multiple, gzip)
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString as BS
import qualified Data.Attoparsec.ByteString as A
import Data.Attoparsec.ByteString.Char8
import Data.Conduit.Attoparsec
import GHC.Generics (Generic)
import Control.DeepSeq

-- | A FASTQ file normally uses four lines per sequence.
--
--     * Line 1 begins with a '@' character and is followed by a sequence
--       identifier and an optional description (like a FASTA title line).
--
--     * Line 2 is the raw sequence letters.
--
--     * Line 3 begins with a '+' character and is optionally followed by the
--       same sequence identifier (and any description) again.
--
--     * Line 4 encodes the quality values for the sequence in Line 2, and must
--       contain the same number of symbols as letters in the sequence.
data Fastq = Fastq
    { Fastq -> ByteString
fastqSeqId   :: B.ByteString
    , Fastq -> ByteString
fastqSeq     :: B.ByteString
    , Fastq -> ByteString
fastqSeqQual :: B.ByteString
    } deriving (Int -> Fastq -> ShowS
[Fastq] -> ShowS
Fastq -> String
(Int -> Fastq -> ShowS)
-> (Fastq -> String) -> ([Fastq] -> ShowS) -> Show Fastq
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Fastq] -> ShowS
$cshowList :: [Fastq] -> ShowS
show :: Fastq -> String
$cshow :: Fastq -> String
showsPrec :: Int -> Fastq -> ShowS
$cshowsPrec :: Int -> Fastq -> ShowS
Show, Fastq -> Fastq -> Bool
(Fastq -> Fastq -> Bool) -> (Fastq -> Fastq -> Bool) -> Eq Fastq
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Fastq -> Fastq -> Bool
$c/= :: Fastq -> Fastq -> Bool
== :: Fastq -> Fastq -> Bool
$c== :: Fastq -> Fastq -> Bool
Eq, (forall x. Fastq -> Rep Fastq x)
-> (forall x. Rep Fastq x -> Fastq) -> Generic Fastq
forall x. Rep Fastq x -> Fastq
forall x. Fastq -> Rep Fastq x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Fastq x -> Fastq
$cfrom :: forall x. Fastq -> Rep Fastq x
Generic, Fastq -> ()
(Fastq -> ()) -> NFData Fastq
forall a. (a -> ()) -> NFData a
rnf :: Fastq -> ()
$crnf :: Fastq -> ()
NFData)

-- | Read gzipped fastq file.
streamFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m) 
                => FilePath -> ConduitT i Fastq m ()
streamFastqGzip :: String -> ConduitT i Fastq m ()
streamFastqGzip String
fl = String -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
String -> ConduitT i ByteString m ()
sourceFileBS String
fl ConduitT i ByteString m ()
-> ConduitM ByteString Fastq m () -> ConduitT i Fastq m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString m ()
-> ConduitT ByteString ByteString m ()
forall (m :: * -> *) a.
Monad m =>
ConduitT ByteString a m () -> ConduitT ByteString a m ()
multiple ConduitT ByteString ByteString m ()
forall (m :: * -> *).
(PrimMonad m, MonadThrow m) =>
ConduitT ByteString ByteString m ()
ungzip ConduitT ByteString ByteString m ()
-> ConduitM ByteString Fastq m () -> ConduitM ByteString Fastq m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM ByteString Fastq m ()
forall (m :: * -> *). Monad m => ConduitT ByteString Fastq m ()
parseFastqC

streamFastq :: (MonadResource m, MonadThrow m)
            => FilePath -> ConduitT i Fastq m ()
streamFastq :: String -> ConduitT i Fastq m ()
streamFastq String
fl = String -> ConduitT i ByteString m ()
forall (m :: * -> *) i.
MonadResource m =>
String -> ConduitT i ByteString m ()
sourceFileBS String
fl ConduitT i ByteString m ()
-> ConduitM ByteString Fastq m () -> ConduitT i Fastq m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM ByteString Fastq m ()
forall (m :: * -> *). Monad m => ConduitT ByteString Fastq m ()
parseFastqC

sinkFastq :: (MonadResource m, MonadThrow m)
          => FilePath -> ConduitT Fastq o m ()
sinkFastq :: String -> ConduitT Fastq o m ()
sinkFastq String
fl = (Fastq -> ByteString) -> ConduitT Fastq ByteString m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC Fastq -> ByteString
fastqToByteString ConduitT Fastq ByteString m ()
-> ConduitM ByteString o m () -> ConduitT Fastq o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString m ()
forall (m :: * -> *) seq.
(Monad m, IsSequence seq, Element seq ~ Word8) =>
ConduitT seq seq m ()
unlinesAsciiC ConduitT ByteString ByteString m ()
-> ConduitM ByteString o m () -> ConduitM ByteString o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| String -> ConduitM ByteString o m ()
forall (m :: * -> *) o.
MonadResource m =>
String -> ConduitT ByteString o m ()
sinkFileBS String
fl

sinkFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m)
              => FilePath -> ConduitT Fastq o m ()
sinkFastqGzip :: String -> ConduitT Fastq o m ()
sinkFastqGzip String
fl = (Fastq -> ByteString) -> ConduitT Fastq ByteString m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC Fastq -> ByteString
fastqToByteString ConduitT Fastq ByteString m ()
-> ConduitM ByteString o m () -> ConduitT Fastq o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString m ()
forall (m :: * -> *) seq.
(Monad m, IsSequence seq, Element seq ~ Word8) =>
ConduitT seq seq m ()
unlinesAsciiC ConduitT ByteString ByteString m ()
-> ConduitM ByteString o m () -> ConduitM ByteString o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString m ()
forall (m :: * -> *).
(MonadThrow m, PrimMonad m) =>
ConduitT ByteString ByteString m ()
gzip ConduitT ByteString ByteString m ()
-> ConduitM ByteString o m () -> ConduitM ByteString o m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| String -> ConduitM ByteString o m ()
forall (m :: * -> *) o.
MonadResource m =>
String -> ConduitT ByteString o m ()
sinkFileBS String
fl

data FQBuilder = Init FQBuilder
               | FQ1 B.ByteString FQBuilder
               | FQ2 B.ByteString FQBuilder
               | FQ3 B.ByteString FQBuilder
               | Complete

fqBuilder :: FQBuilder -> Fastq
fqBuilder :: FQBuilder -> Fastq
fqBuilder = ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go ([], [], [])
  where
    go :: ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go ([ByteString], [ByteString], [ByteString])
acc (Init FQBuilder
bldr) = ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go ([ByteString], [ByteString], [ByteString])
acc FQBuilder
bldr
    go ([ByteString]
f1,[ByteString]
f2,[ByteString]
f3) (FQ1 ByteString
x FQBuilder
bldr) = ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go (ByteString
xByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
f1, [ByteString]
f2, [ByteString]
f3) FQBuilder
bldr
    go ([ByteString]
f1,[ByteString]
f2,[ByteString]
f3) (FQ2 ByteString
x FQBuilder
bldr) = ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go ([ByteString]
f1, ByteString
xByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
f2, [ByteString]
f3) FQBuilder
bldr
    go ([ByteString]
f1,[ByteString]
f2,[ByteString]
f3) (FQ3 ByteString
x FQBuilder
bldr) = ([ByteString], [ByteString], [ByteString]) -> FQBuilder -> Fastq
go ([ByteString]
f1, [ByteString]
f2, ByteString
xByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
f3) FQBuilder
bldr
    go ([ByteString]
f1,[ByteString]
f2,[ByteString]
f3) FQBuilder
Complete = ByteString -> ByteString -> ByteString -> Fastq
Fastq ([ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ [ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
f1)
        ([ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ [ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
f2) ([ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ [ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
f3)
{-# INLINE fqBuilder #-}

parseFastqC :: Monad m => ConduitT B.ByteString Fastq m ()
parseFastqC :: ConduitT ByteString Fastq m ()
parseFastqC = ConduitT ByteString Fastq m (Maybe ByteString)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT ByteString Fastq m (Maybe ByteString)
-> (Maybe ByteString -> ConduitT ByteString Fastq m ())
-> ConduitT ByteString Fastq m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT ByteString Fastq m ()
-> (ByteString -> ConduitT ByteString Fastq m ())
-> Maybe ByteString
-> ConduitT ByteString Fastq m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (String -> ConduitT ByteString Fastq m ()
forall a. HasCallStack => String -> a
error String
"Empty input") ( \ByteString
x -> do
    if ByteString -> Char
B.head ByteString
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'@'
        then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
forall (m :: * -> *).
Monad m =>
(FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
Init Char
'a' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
x
        else String -> ConduitT ByteString Fastq m ()
forall a. HasCallStack => String -> a
error String
"Record does not start with \'@\'" )
  where
    tryRead1 :: ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input | ByteString -> Bool
B.null ByteString
input = ConduitT ByteString o m (Maybe ByteString)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT ByteString o m (Maybe ByteString)
-> (Maybe ByteString -> ConduitT ByteString o m ByteString)
-> ConduitT ByteString o m ByteString
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT ByteString o m ByteString
-> (ByteString -> ConduitT ByteString o m ByteString)
-> Maybe ByteString
-> ConduitT ByteString o m ByteString
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (String -> ConduitT ByteString o m ByteString
forall a. HasCallStack => String -> a
error String
"Unexpected EOF") ByteString -> ConduitT ByteString o m ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return
                   | Bool
otherwise = ByteString -> ConduitT ByteString o m ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
input
    loop :: (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
st ByteString
input = case Char
st of
        Char
'a' -> do
            (ByteString
x, ByteString
rest) <- (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n') (ByteString -> (ByteString, ByteString))
-> ConduitT ByteString Fastq m ByteString
-> ConduitT ByteString Fastq m (ByteString, ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ConduitT ByteString Fastq m ByteString
forall (m :: * -> *) o.
Monad m =>
ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input
            if ByteString -> Bool
B.null ByteString
rest
                then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ1 ByteString
x) Char
'a' ByteString
rest
                else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ1 ByteString
x) Char
'b' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
rest
        Char
'b' -> do 
            (ByteString
x, ByteString
rest) <- (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n') (ByteString -> (ByteString, ByteString))
-> ConduitT ByteString Fastq m ByteString
-> ConduitT ByteString Fastq m (ByteString, ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ConduitT ByteString Fastq m ByteString
forall (m :: * -> *) o.
Monad m =>
ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input
            if ByteString -> Bool
B.null ByteString
rest
                then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ2 ByteString
x) Char
'b' ByteString
rest
                else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ2 ByteString
x) Char
'B' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
rest
        Char
'B' -> do 
            ByteString
input' <- ByteString -> ConduitT ByteString Fastq m ByteString
forall (m :: * -> *) o.
Monad m =>
ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input
            if ByteString -> Char
B.head ByteString
input' Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'+'
                then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
'c' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
input'
                else do
                    let (ByteString
x, ByteString
rest) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n') ByteString
input'
                    if ByteString -> Bool
B.null ByteString
rest
                        then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ2 ByteString
x) Char
'b' ByteString
rest
                        else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ2 ByteString
x) Char
'B' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
rest
        Char
'c' -> do
            (ByteString
x, ByteString
rest) <- (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n') (ByteString -> (ByteString, ByteString))
-> ConduitT ByteString Fastq m ByteString
-> ConduitT ByteString Fastq m (ByteString, ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ConduitT ByteString Fastq m ByteString
forall (m :: * -> *) o.
Monad m =>
ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input
            if ByteString -> Bool
B.null ByteString
rest
                then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
'c' ByteString
rest
                else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
'd' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
rest
        Char
'd' -> do
            (ByteString
x, ByteString
rest) <- (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n') (ByteString -> (ByteString, ByteString))
-> ConduitT ByteString Fastq m ByteString
-> ConduitT ByteString Fastq m (ByteString, ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> ConduitT ByteString Fastq m ByteString
forall (m :: * -> *) o.
Monad m =>
ByteString -> ConduitT ByteString o m ByteString
tryRead1 ByteString
input
            if ByteString -> Bool
B.null ByteString
rest 
                then (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ3 ByteString
x) Char
'd' ByteString
rest
                else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop (FQBuilder -> FQBuilder
acc (FQBuilder -> FQBuilder)
-> (FQBuilder -> FQBuilder) -> FQBuilder -> FQBuilder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FQBuilder -> FQBuilder
FQ3 ByteString
x) Char
'D' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
rest
        Char
'D' -> if ByteString -> Bool
B.null ByteString
input
            then ConduitT ByteString Fastq m (Maybe ByteString)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT ByteString Fastq m (Maybe ByteString)
-> (Maybe ByteString -> ConduitT ByteString Fastq m ())
-> ConduitT ByteString Fastq m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
                Maybe ByteString
Nothing -> Fastq -> ConduitT ByteString Fastq m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Fastq -> ConduitT ByteString Fastq m ())
-> Fastq -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ FQBuilder -> Fastq
fqBuilder (FQBuilder -> Fastq) -> FQBuilder -> Fastq
forall a b. (a -> b) -> a -> b
$ FQBuilder -> FQBuilder
acc FQBuilder
Complete
                Just ByteString
input' -> if ByteString -> Char
B.head ByteString
input' Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'@'
                    then do
                        Fastq -> ConduitT ByteString Fastq m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Fastq -> ConduitT ByteString Fastq m ())
-> Fastq -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ FQBuilder -> Fastq
fqBuilder (FQBuilder -> Fastq) -> FQBuilder -> Fastq
forall a b. (a -> b) -> a -> b
$ FQBuilder -> FQBuilder
acc FQBuilder
Complete
                        (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
Init Char
'a' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
input'
                    else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
'd' ByteString
input'
            else if ByteString -> Char
B.head ByteString
input Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'@'
                then do
                    Fastq -> ConduitT ByteString Fastq m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Fastq -> ConduitT ByteString Fastq m ())
-> Fastq -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ FQBuilder -> Fastq
fqBuilder (FQBuilder -> Fastq) -> FQBuilder -> Fastq
forall a b. (a -> b) -> a -> b
$ FQBuilder -> FQBuilder
acc FQBuilder
Complete
                    (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
Init Char
'a' (ByteString -> ConduitT ByteString Fastq m ())
-> ByteString -> ConduitT ByteString Fastq m ()
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString
B.tail ByteString
input
                else (FQBuilder -> FQBuilder)
-> Char -> ByteString -> ConduitT ByteString Fastq m ()
loop FQBuilder -> FQBuilder
acc Char
'd' ByteString
input
{-# INLINE parseFastqC #-}

parseFastqC' :: MonadThrow m => ConduitT B.ByteString Fastq m ()
parseFastqC' :: ConduitT ByteString Fastq m ()
parseFastqC' = Parser ByteString Fastq
-> ConduitT ByteString (PositionRange, Fastq) m ()
forall a (m :: * -> *) b.
(AttoparsecInput a, MonadThrow m) =>
Parser a b -> ConduitT a (PositionRange, b) m ()
conduitParser Parser ByteString Fastq
fastqParser ConduitT ByteString (PositionRange, Fastq) m ()
-> ConduitM (PositionRange, Fastq) Fastq m ()
-> ConduitT ByteString Fastq m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ((PositionRange, Fastq) -> Fastq)
-> ConduitM (PositionRange, Fastq) Fastq m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC (PositionRange, Fastq) -> Fastq
forall a b. (a, b) -> b
snd
  where
    fastqParser :: Parser ByteString Fastq
fastqParser = do
        Word8
_ <- (Char -> Bool) -> Parser ()
skipWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'@') Parser () -> Parser ByteString Word8 -> Parser ByteString Word8
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser ByteString Word8
char8 Char
'@'
        ByteString
ident <- (Word8 -> Bool) -> Parser ByteString
A.takeTill Word8 -> Bool
isEndOfLine
        Parser ()
endOfLine
        ByteString
sequ <- (Word8 -> Bool) -> ByteString -> ByteString
BS.filter (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Bool
isEndOfLine) (ByteString -> ByteString)
-> Parser ByteString -> Parser ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Char -> Bool) -> Parser ByteString
takeTill (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'+')
        Char -> Parser ByteString Word8
char8 Char
'+' Parser ByteString Word8 -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Word8 -> Bool) -> Parser ()
A.skipWhile (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Bool
isEndOfLine) Parser () -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
endOfLine
        ByteString
score <- (Word8 -> Bool) -> ByteString -> ByteString
BS.filter (Bool -> Bool
not (Bool -> Bool) -> (Word8 -> Bool) -> Word8 -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Bool
isEndOfLine) (ByteString -> ByteString)
-> Parser ByteString -> Parser ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
            Int -> (Int -> Word8 -> Maybe Int) -> Parser ByteString
forall s. s -> (s -> Word8 -> Maybe s) -> Parser ByteString
A.scan Int
0 (Int -> Int -> Word8 -> Maybe Int
forall a. (Ord a, Num a) => a -> a -> Word8 -> Maybe a
f (ByteString -> Int
B.length ByteString
sequ))
        (Char -> Bool) -> Parser ()
skipWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'@')
        Fastq -> Parser ByteString Fastq
forall (m :: * -> *) a. Monad m => a -> m a
return (Fastq -> Parser ByteString Fastq)
-> Fastq -> Parser ByteString Fastq
forall a b. (a -> b) -> a -> b
$ ByteString -> ByteString -> ByteString -> Fastq
Fastq ByteString
ident ByteString
sequ ByteString
score
      where
        f :: a -> a -> Word8 -> Maybe a
f a
n a
i Word8
x | a
i a -> a -> Bool
forall a. Ord a => a -> a -> Bool
>= a
n = Maybe a
forall a. Maybe a
Nothing
                | Word8 -> Bool
isEndOfLine Word8
x = a -> Maybe a
forall a. a -> Maybe a
Just a
i
                | Bool
otherwise = a -> Maybe a
forall a. a -> Maybe a
Just (a -> Maybe a) -> a -> Maybe a
forall a b. (a -> b) -> a -> b
$ a
i a -> a -> a
forall a. Num a => a -> a -> a
+ a
1
{-# INLINE parseFastqC' #-}

fastqToByteString :: Fastq -> B.ByteString
fastqToByteString :: Fastq -> ByteString
fastqToByteString (Fastq ByteString
a ByteString
b ByteString
c) = [ByteString] -> ByteString
B.concat [ByteString
"@", ByteString
a, ByteString
"\n", ByteString
b, ByteString
"\n+\n", ByteString
c]
{-# INLINE fastqToByteString #-}

-- | Get the mean and variance of quality scores at every position.
qualitySummary :: Monad m => ConduitT Fastq o m [(Double, Double)]
qualitySummary :: ConduitT Fastq o m [(Double, Double)]
qualitySummary = (Fastq -> [Double]) -> ConduitT Fastq [Double] m ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC ((Int -> Double) -> [Int] -> [Double]
forall a b. (a -> b) -> [a] -> [b]
map Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Int] -> [Double]) -> (Fastq -> [Int]) -> Fastq -> [Double]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Fastq -> [Int]
decodeQualSc) ConduitT Fastq [Double] m ()
-> ConduitM [Double] o m [(Double, Double)]
-> ConduitT Fastq o m [(Double, Double)]
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM [Double] o m [(Double, Double)]
forall (m :: * -> *) o.
Monad m =>
ConduitT [Double] o m [(Double, Double)]
meanVarianceC

meanVarianceC :: Monad m => ConduitT [Double] o m [(Double, Double)]
meanVarianceC :: ConduitT [Double] o m [(Double, Double)]
meanVarianceC = ConduitT [Double] o m (Maybe [Double])
forall (m :: * -> *) a o. Monad m => ConduitT a o m (Maybe a)
peekC ConduitT [Double] o m (Maybe [Double])
-> (Maybe [Double] -> ConduitT [Double] o m [(Double, Double)])
-> ConduitT [Double] o m [(Double, Double)]
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Maybe [Double]
Nothing -> String -> ConduitT [Double] o m [(Double, Double)]
forall a. HasCallStack => String -> a
error String
"Empty input"
    Just [Double]
x -> ([(Double, Double)], Int) -> [(Double, Double)]
forall a b. (a, b) -> a
fst (([(Double, Double)], Int) -> [(Double, Double)])
-> ConduitT [Double] o m ([(Double, Double)], Int)
-> ConduitT [Double] o m [(Double, Double)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (([(Double, Double)], Int)
 -> [Double] -> ([(Double, Double)], Int))
-> ([(Double, Double)], Int)
-> ConduitT [Double] o m ([(Double, Double)], Int)
forall (m :: * -> *) a b o.
Monad m =>
(a -> b -> a) -> a -> ConduitT b o m a
foldlC ([(Double, Double)], Int) -> [Double] -> ([(Double, Double)], Int)
forall b b.
(Integral b, Fractional b) =>
([(b, b)], b) -> [b] -> ([(b, b)], b)
f (Int -> (Double, Double) -> [(Double, Double)]
forall a. Int -> a -> [a]
replicate ([Double] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Double]
x) (Double
0,Double
0), Int
0 :: Int)
  where
    f :: ([(b, b)], b) -> [b] -> ([(b, b)], b)
f ([(b, b)]
acc, b
n) [b]
xs = let acc' :: [(b, b)]
acc' = ((b, b) -> b -> (b, b)) -> [(b, b)] -> [b] -> [(b, b)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (b, b) -> b -> (b, b)
forall b. Fractional b => (b, b) -> b -> (b, b)
g [(b, b)]
acc [b]
xs in ([(b, b)]
acc', b
n')
      where
        n' :: b
n' = b
n b -> b -> b
forall a. Num a => a -> a -> a
+ b
1
        g :: (b, b) -> b -> (b, b)
g (b
m, b
s) b
x = (b
m', b
s')
          where
            m' :: b
m' = b
m b -> b -> b
forall a. Num a => a -> a -> a
+ b
d b -> b -> b
forall a. Fractional a => a -> a -> a
/ b -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral b
n'
            s' :: b
s' = b
s b -> b -> b
forall a. Num a => a -> a -> a
+ b
d b -> b -> b
forall a. Num a => a -> a -> a
* (b
x b -> b -> b
forall a. Num a => a -> a -> a
- b
m')
            d :: b
d  = b
x b -> b -> b
forall a. Num a => a -> a -> a
- b
m
{-# INLINE meanVarianceC #-}

decodeQualSc :: Fastq -> [Int]
decodeQualSc :: Fastq -> [Int]
decodeQualSc = (Word8 -> Int) -> [Word8] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word8 -> Int) -> (Word8 -> Word8) -> Word8 -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Word8
x -> Word8
x Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- Word8
33)) ([Word8] -> [Int]) -> (Fastq -> [Word8]) -> Fastq -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [Word8]
BS.unpack (ByteString -> [Word8])
-> (Fastq -> ByteString) -> Fastq -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Fastq -> ByteString
fastqSeqQual
{-# INLINE decodeQualSc #-}

pError :: Int -> Double
pError :: Int -> Double
pError Int
x = Double
10 Double -> Double -> Double
forall a. Floating a => a -> a -> a
** (Double -> Double
forall a. Num a => a -> a
negate (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
x) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Double
10)
{-# INLINE pError #-}

{-
mkFastqRecord l1 l2 l3 l4 = Fastq (parseLine1 l1) (parseLine2 l2)
    (parseLine3 l3) (parseLine4 l4)
  where
    parseLine1 x | B.head x == '@' = B.tail x
                 | otherwise = error $ "Parse line 1 failed: " ++ B.unpack x
    parseLine2 x | B.all f x = x
                 | otherwise = error $ "Parse line 2 failed: " ++ B.unpack x
      where
        f 'C' = True
        f 'G' = True
        f 'T' = True
        f 'A' = True
        f 'N' = True
        f _   = False
    parseLine3 x | B.head x == '+' = B.tail x
                 | otherwise = error $ "Parse line 3 failed: " ++ B.unpack x
    parseLine4 x | BS.all f x = x
                 | otherwise = error $ "Parse line 4 failed: " ++ B.unpack x
      where
        f b = let b' = fromIntegral b :: Int
              in b' >= 33 && b' <= 126
-}

-- | Remove trailing 'A'
trimPolyA :: Int -> Fastq -> Fastq
trimPolyA :: Int -> Fastq -> Fastq
trimPolyA Int
n f :: Fastq
f@(Fastq ByteString
a ByteString
b ByteString
c)
    | ByteString -> Int
B.length ByteString
trailing Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n = ByteString -> ByteString -> ByteString -> Fastq
Fastq ByteString
a ByteString
b' (ByteString -> Fastq) -> ByteString -> Fastq
forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
B.take (ByteString -> Int
B.length ByteString
b') ByteString
c
    | Bool
otherwise = Fastq
f
  where
    (ByteString
b', ByteString
trailing) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.spanEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'A') ByteString
b