{-# LANGUAGE OverloadedStrings #-}

{-| Module to read and parse through a Fasta file. The Fasta format is defined here:
<https://en.wikipedia.org/wiki/FASTA_format>
-}
module SequenceFormats.Fasta (readNextFastaEntry, loadFastaChrom) where

import           SequenceFormats.Utils            (Chrom (..))

import           Control.Exception.Base           (AssertionFailed (..),
                                                   throwIO)
import           Control.Monad                    (void)
import           Control.Monad.IO.Class           (MonadIO, liftIO)
import           Control.Monad.Trans.State.Strict (runStateT)
import qualified Data.Attoparsec.ByteString.Char8 as A
import qualified Data.ByteString.Char8            as B
import           Data.Char                        (isSpace)
import           Lens.Family2                     (view)
import           Pipes                            (Producer, next, runEffect,
                                                   (>->))
import           Pipes.Attoparsec                 (parse)
import qualified Pipes.ByteString                 as P
import           Pipes.Prelude                    (drain)
import           System.IO                        (Handle, hPutStr, stderr)

-- |A function to select out a specific chromosome from a Fasta File. Expects a file handle to the
-- file and a chromosome. Note that by Chromosome I simply denote a fasta header line, as is the
-- case for example for the human reference genome. Returns a Bytestring-Producer over the single sequence followed the specified header (the chromosome).
loadFastaChrom :: Handle -> Chrom -> IO (Producer B.ByteString IO ())
loadFastaChrom :: Handle -> Chrom -> IO (Producer ByteString IO ())
loadFastaChrom Handle
refFileHandle Chrom
chrom = do
    let prod :: Proxy x' x () ByteString IO ()
prod = forall (m :: * -> *).
MonadIO m =>
Handle -> Producer' ByteString m ()
P.fromHandle Handle
refFileHandle
    Producer ByteString IO () -> IO (Producer ByteString IO ())
go forall {x'} {x}. Proxy x' x () ByteString IO ()
prod
  where
    go :: Producer ByteString IO () -> IO (Producer ByteString IO ())
go Producer ByteString IO ()
prod = do
        (Chrom
chrom_, Producer ByteString IO (Producer ByteString IO ())
prod') <- forall (m :: * -> *).
MonadIO m =>
Producer ByteString m ()
-> m (Chrom, Producer ByteString m (Producer ByteString m ()))
readNextFastaEntry Producer ByteString IO ()
prod
        Handle -> String -> IO ()
hPutStr Handle
stderr (String
"found chromosome " forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> String
show Chrom
chrom_)
        if Chrom
chrom_ forall a. Eq a => a -> a -> Bool
== Chrom
chrom
        then forall (m :: * -> *) a. Monad m => a -> m a
return (forall (f :: * -> *) a. Functor f => f a -> f ()
void Producer ByteString IO (Producer ByteString IO ())
prod')
        else do
            Producer ByteString IO ()
newProd <- forall (m :: * -> *) r. Monad m => Effect m r -> m r
runEffect forall a b. (a -> b) -> a -> b
$ Producer ByteString IO (Producer ByteString IO ())
prod' forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> forall (m :: * -> *) a r. Functor m => Consumer' a m r
drain
            Producer ByteString IO () -> IO (Producer ByteString IO ())
go Producer ByteString IO ()
newProd

-- |This function takes a Bytestring-Producer over a Fasta-file, reads in the first header and then returns a produer over its sequence. The return of that producer is the Bytestring-Producer of the rest of the fasta file.
readNextFastaEntry :: (MonadIO m) => Producer B.ByteString m () ->
                      m (Chrom, Producer B.ByteString m (Producer B.ByteString m ()))
readNextFastaEntry :: forall (m :: * -> *).
MonadIO m =>
Producer ByteString m ()
-> m (Chrom, Producer ByteString m (Producer ByteString m ()))
readNextFastaEntry Producer ByteString m ()
prod = do
    (Maybe (Either ParsingError Chrom)
res, Producer ByteString m ()
rest) <- forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
runStateT (forall (m :: * -> *) a b.
(Monad m, ParserInput a) =>
Parser a b -> Parser a m (Maybe (Either ParsingError b))
parse Parser Chrom
fastaHeaderLineParser) Producer ByteString m ()
prod
    Chrom
header <- case Maybe (Either ParsingError Chrom)
res of
        Maybe (Either ParsingError Chrom)
Nothing -> forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall e a. Exception e => e -> IO a
throwIO forall a b. (a -> b) -> a -> b
$ String -> AssertionFailed
AssertionFailed String
"Could not find chromosome. Fasta file exhausted."
        Just (Left ParsingError
e_) -> do
            Either () (ByteString, Producer ByteString m ())
x <- forall (m :: * -> *) a r.
Monad m =>
Producer a m r -> m (Either r (a, Producer a m r))
next Producer ByteString m ()
rest
            case Either () (ByteString, Producer ByteString m ())
x of
                (Right (ByteString
chunk, Producer ByteString m ()
_)) -> do
                    let msg :: String
msg = forall a. Show a => a -> String
show ParsingError
e_ forall a. [a] -> [a] -> [a]
++ ByteString -> String
B.unpack ByteString
chunk
                    forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall e a. Exception e => e -> IO a
throwIO forall a b. (a -> b) -> a -> b
$ String -> AssertionFailed
AssertionFailed (String
"Fasta header parsing error: " forall a. [a] -> [a] -> [a]
++ String
msg)
                Either () (ByteString, Producer ByteString m ())
_ -> forall a. HasCallStack => String -> a
error String
"should not happen"
        Just (Right Chrom
h) -> forall (m :: * -> *) a. Monad m => a -> m a
return Chrom
h
    forall (m :: * -> *) a. Monad m => a -> m a
return (Chrom
header, forall a s t b. FoldLike a s t a b -> s -> a
view (forall (m :: * -> *) x.
Monad m =>
(Word8 -> Bool)
-> Lens'
     (Producer ByteString m x)
     (Producer ByteString m (Producer ByteString m x))
P.break (forall a. Eq a => a -> a -> Bool
==Word8
62)) Producer ByteString m ()
rest forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> forall (m :: * -> *) r.
Monad m =>
(Word8 -> Bool) -> Pipe ByteString ByteString m r
P.filter (\Word8
c -> Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
10 Bool -> Bool -> Bool
&& Word8
c forall a. Eq a => a -> a -> Bool
/= Word8
13))
-- '>' == 62, '\n' == 10, \r == 13

fastaHeaderLineParser :: A.Parser Chrom
fastaHeaderLineParser :: Parser Chrom
fastaHeaderLineParser = do
    Char
_ <- Char -> Parser Char
A.char Char
'>'
    ByteString
chrom <- (Char -> Bool) -> Parser ByteString
A.takeWhile forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace
    (Char -> Bool) -> Parser ()
A.skipWhile (\Char
c -> Char
c forall a. Eq a => a -> a -> Bool
/= Char
'\n' Bool -> Bool -> Bool
&& Char
c forall a. Eq a => a -> a -> Bool
/= Char
'\r')
    Parser ()
A.endOfLine
    forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Chrom
Chrom forall a b. (a -> b) -> a -> b
$ ByteString
chrom