{-# LANGUAGE OverloadedStrings #-}

{-| Module to read and parse through a Fasta file. The Fasta format is defined here:
<https://en.wikipedia.org/wiki/FASTA_format>
-}
module SequenceFormats.Fasta (readNextFastaEntry, loadFastaChrom) where

import           SequenceFormats.Utils            (Chrom (..))

import           Control.Exception.Base           (AssertionFailed (..),
                                                   throwIO)
import           Control.Monad                    (void)
import           Control.Monad.IO.Class           (MonadIO, liftIO)
import           Control.Monad.Trans.State.Strict (runStateT)
import qualified Data.Attoparsec.ByteString.Char8 as A
import qualified Data.ByteString.Char8            as B
import           Data.Char                        (isSpace)
import           Lens.Family2                     (view)
import           Pipes                            (Producer, next, runEffect,
                                                   (>->))
import           Pipes.Attoparsec                 (parse)
import qualified Pipes.ByteString                 as P
import           Pipes.Prelude                    (drain)
import           System.IO                        (Handle, hPutStr, stderr)

-- |A function to select out a specific chromosome from a Fasta File. Expects a file handle to the
-- file and a chromosome. Note that by Chromosome I simply denote a fasta header line, as is the
-- case for example for the human reference genome. Returns a Bytestring-Producer over the single sequence followed the specified header (the chromosome).
loadFastaChrom :: Handle -> Chrom -> IO (Producer B.ByteString IO ())
loadFastaChrom :: Handle -> Chrom -> IO (Producer ByteString IO ())
loadFastaChrom Handle
refFileHandle Chrom
chrom = do
    let prod :: Proxy x' x () ByteString IO ()
prod = Handle -> Producer' ByteString IO ()
forall (m :: * -> *).
MonadIO m =>
Handle -> Producer' ByteString m ()
P.fromHandle Handle
refFileHandle
    Producer ByteString IO () -> IO (Producer ByteString IO ())
go Producer ByteString IO ()
Producer' ByteString IO ()
prod
  where
    go :: Producer ByteString IO () -> IO (Producer ByteString IO ())
go Producer ByteString IO ()
prod = do
        (Chrom
chrom_, Producer ByteString IO (Producer ByteString IO ())
prod') <- Producer ByteString IO ()
-> IO (Chrom, Producer ByteString IO (Producer ByteString IO ()))
forall (m :: * -> *).
MonadIO m =>
Producer ByteString m ()
-> m (Chrom, Producer ByteString m (Producer ByteString m ()))
readNextFastaEntry Producer ByteString IO ()
prod
        Handle -> String -> IO ()
hPutStr Handle
stderr (String
"found chromosome " String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Chrom -> String
forall a. Show a => a -> String
show Chrom
chrom_)
        if Chrom
chrom_ Chrom -> Chrom -> Bool
forall a. Eq a => a -> a -> Bool
== Chrom
chrom
        then Producer ByteString IO () -> IO (Producer ByteString IO ())
forall (m :: * -> *) a. Monad m => a -> m a
return (Producer ByteString IO (Producer ByteString IO ())
-> Producer ByteString IO ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void Producer ByteString IO (Producer ByteString IO ())
prod')
        else do
            Producer ByteString IO ()
newProd <- Effect IO (Producer ByteString IO ())
-> IO (Producer ByteString IO ())
forall (m :: * -> *) r. Monad m => Effect m r -> m r
runEffect (Effect IO (Producer ByteString IO ())
 -> IO (Producer ByteString IO ()))
-> Effect IO (Producer ByteString IO ())
-> IO (Producer ByteString IO ())
forall a b. (a -> b) -> a -> b
$ Producer ByteString IO (Producer ByteString IO ())
prod' Producer ByteString IO (Producer ByteString IO ())
-> Proxy () ByteString () X IO (Producer ByteString IO ())
-> Effect IO (Producer ByteString IO ())
forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> Proxy () ByteString () X IO (Producer ByteString IO ())
forall (m :: * -> *) a r. Functor m => Consumer' a m r
drain
            Producer ByteString IO () -> IO (Producer ByteString IO ())
go Producer ByteString IO ()
newProd

-- |This function takes a Bytestring-Producer over a Fasta-file, reads in the first header and then returns a produer over its sequence. The return of that producer is the Bytestring-Producer of the rest of the fasta file.
readNextFastaEntry :: (MonadIO m) => Producer B.ByteString m () ->
                      m (Chrom, Producer B.ByteString m (Producer B.ByteString m ()))
readNextFastaEntry :: Producer ByteString m ()
-> m (Chrom, Producer ByteString m (Producer ByteString m ()))
readNextFastaEntry Producer ByteString m ()
prod = do
    (Maybe (Either ParsingError Chrom)
res, Producer ByteString m ()
rest) <- StateT
  (Producer ByteString m ()) m (Maybe (Either ParsingError Chrom))
-> Producer ByteString m ()
-> m (Maybe (Either ParsingError Chrom), Producer ByteString m ())
forall s (m :: * -> *) a. StateT s m a -> s -> m (a, s)
runStateT (Parser ByteString Chrom
-> Parser ByteString m (Maybe (Either ParsingError Chrom))
forall (m :: * -> *) a b.
(Monad m, ParserInput a) =>
Parser a b -> Parser a m (Maybe (Either ParsingError b))
parse Parser ByteString Chrom
fastaHeaderLineParser) Producer ByteString m ()
prod
    Chrom
header <- case Maybe (Either ParsingError Chrom)
res of
        Maybe (Either ParsingError Chrom)
Nothing -> IO Chrom -> m Chrom
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO Chrom -> m Chrom)
-> (AssertionFailed -> IO Chrom) -> AssertionFailed -> m Chrom
forall b c a. (b -> c) -> (a -> b) -> a -> c
. AssertionFailed -> IO Chrom
forall e a. Exception e => e -> IO a
throwIO (AssertionFailed -> m Chrom) -> AssertionFailed -> m Chrom
forall a b. (a -> b) -> a -> b
$ String -> AssertionFailed
AssertionFailed String
"Could not find chromosome. Fasta file exhausted."
        Just (Left ParsingError
e_) -> do
            Either () (ByteString, Producer ByteString m ())
x <- Producer ByteString m ()
-> m (Either () (ByteString, Producer ByteString m ()))
forall (m :: * -> *) a r.
Monad m =>
Producer a m r -> m (Either r (a, Producer a m r))
next Producer ByteString m ()
rest
            case Either () (ByteString, Producer ByteString m ())
x of
                (Right (ByteString
chunk, Producer ByteString m ()
_)) -> do
                    let msg :: String
msg = ParsingError -> String
forall a. Show a => a -> String
show ParsingError
e_ String -> String -> String
forall a. [a] -> [a] -> [a]
++ ByteString -> String
B.unpack ByteString
chunk
                    IO Chrom -> m Chrom
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO Chrom -> m Chrom)
-> (AssertionFailed -> IO Chrom) -> AssertionFailed -> m Chrom
forall b c a. (b -> c) -> (a -> b) -> a -> c
. AssertionFailed -> IO Chrom
forall e a. Exception e => e -> IO a
throwIO (AssertionFailed -> m Chrom) -> AssertionFailed -> m Chrom
forall a b. (a -> b) -> a -> b
$ String -> AssertionFailed
AssertionFailed (String
"Fasta header parsing error: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
msg)
                Either () (ByteString, Producer ByteString m ())
_ -> String -> m Chrom
forall a. HasCallStack => String -> a
error String
"should not happen"
        Just (Right Chrom
h) -> Chrom -> m Chrom
forall (m :: * -> *) a. Monad m => a -> m a
return Chrom
h
    (Chrom, Producer ByteString m (Producer ByteString m ()))
-> m (Chrom, Producer ByteString m (Producer ByteString m ()))
forall (m :: * -> *) a. Monad m => a -> m a
return (Chrom
header, FoldLike
  (Producer ByteString m (Producer ByteString m ()))
  (Producer ByteString m ())
  (Producer ByteString m ())
  (Producer ByteString m (Producer ByteString m ()))
  (Producer ByteString m (Producer ByteString m ()))
-> Producer ByteString m ()
-> Producer ByteString m (Producer ByteString m ())
forall a s t b. FoldLike a s t a b -> s -> a
view ((Word8 -> Bool)
-> Lens'
     (Producer ByteString m ())
     (Producer ByteString m (Producer ByteString m ()))
forall (m :: * -> *) x.
Monad m =>
(Word8 -> Bool)
-> Lens'
     (Producer ByteString m x)
     (Producer ByteString m (Producer ByteString m x))
P.break (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
==Word8
62)) Producer ByteString m ()
rest Producer ByteString m (Producer ByteString m ())
-> Proxy () ByteString () ByteString m (Producer ByteString m ())
-> Producer ByteString m (Producer ByteString m ())
forall (m :: * -> *) a' a b r c' c.
Functor m =>
Proxy a' a () b m r -> Proxy () b c' c m r -> Proxy a' a c' c m r
>-> (Word8 -> Bool)
-> Proxy () ByteString () ByteString m (Producer ByteString m ())
forall (m :: * -> *) r.
Monad m =>
(Word8 -> Bool) -> Pipe ByteString ByteString m r
P.filter (\Word8
c -> Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
10 Bool -> Bool -> Bool
&& Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
13))
-- '>' == 62, '\n' == 10, \r == 13

fastaHeaderLineParser :: A.Parser Chrom
fastaHeaderLineParser :: Parser ByteString Chrom
fastaHeaderLineParser = do
    Char
_ <- Char -> Parser Char
A.char Char
'>'
    ByteString
chrom <- (Char -> Bool) -> Parser ByteString
A.takeWhile ((Char -> Bool) -> Parser ByteString)
-> (Char -> Bool) -> Parser ByteString
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace
    (Char -> Bool) -> Parser ()
A.skipWhile (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\n' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\r')
    Parser ()
A.endOfLine
    Chrom -> Parser ByteString Chrom
forall (m :: * -> *) a. Monad m => a -> m a
return (Chrom -> Parser ByteString Chrom)
-> (ByteString -> Chrom) -> ByteString -> Parser ByteString Chrom
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Chrom
Chrom (ByteString -> Parser ByteString Chrom)
-> ByteString -> Parser ByteString Chrom
forall a b. (a -> b) -> a -> b
$ ByteString
chrom