{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}

module Bio.Data.Fasta
    ( FastaLike(..)
    , fastaReader
    ) where

import Bio.Motif
import Bio.Seq
import qualified Data.ByteString.Char8 as B
import Conduit

class FastaLike f where
    -- | Convert a FASTA record, consisting of a record header and a record body,
    -- to a specific data type
    fromFastaRecord :: (B.ByteString, [B.ByteString]) -> f

    readFasta :: FilePath -> ConduitT i f (ResourceT IO) ()
    readFasta FilePath
fl = FilePath -> ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
forall i.
FilePath -> ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
fastaReader FilePath
fl ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
-> ConduitM (ByteString, [ByteString]) f (ResourceT IO) ()
-> ConduitT i f (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ((ByteString, [ByteString]) -> f)
-> ConduitM (ByteString, [ByteString]) f (ResourceT IO) ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC (ByteString, [ByteString]) -> f
forall f. FastaLike f => (ByteString, [ByteString]) -> f
fromFastaRecord

    -- | non-stream version, read whole file in memory
    readFasta' :: FilePath -> IO [f]
    readFasta' FilePath
fl = ResourceT IO [f] -> IO [f]
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO [f] -> IO [f]) -> ResourceT IO [f] -> IO [f]
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) [f] -> ResourceT IO [f]
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) [f] -> ResourceT IO [f])
-> ConduitT () Void (ResourceT IO) [f] -> ResourceT IO [f]
forall a b. (a -> b) -> a -> b
$ FilePath -> ConduitT () f (ResourceT IO) ()
forall f i.
FastaLike f =>
FilePath -> ConduitT i f (ResourceT IO) ()
readFasta FilePath
fl ConduitT () f (ResourceT IO) ()
-> ConduitM f Void (ResourceT IO) [f]
-> ConduitT () Void (ResourceT IO) [f]
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM f Void (ResourceT IO) [f]
forall (m :: * -> *) a o. Monad m => ConduitT a o m [a]
sinkList
    {-# MINIMAL fromFastaRecord #-}

instance BioSeq s a => FastaLike (s a) where
    fromFastaRecord :: (ByteString, [ByteString]) -> s a
fromFastaRecord (ByteString
_, [ByteString]
xs) = case ByteString -> Either FilePath (s a)
forall (seq :: * -> *) alphabet.
BioSeq seq alphabet =>
ByteString -> Either FilePath (seq alphabet)
fromBS ([ByteString] -> ByteString
B.concat [ByteString]
xs) of
        Left FilePath
err -> FilePath -> s a
forall a. HasCallStack => FilePath -> a
error FilePath
err
        Right s a
x -> s a
x
    {-# INLINE fromFastaRecord #-}

instance FastaLike Motif where
    fromFastaRecord :: (ByteString, [ByteString]) -> Motif
fromFastaRecord (ByteString
name, [ByteString]
mat) = ByteString -> PWM -> Motif
Motif ByteString
name ([ByteString] -> PWM
toPWM [ByteString]
mat)
    {-# INLINE fromFastaRecord #-}

fastaReader :: FilePath
            -> ConduitT i (B.ByteString, [B.ByteString]) (ResourceT IO) ()
fastaReader :: FilePath -> ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
fastaReader FilePath
fl = FilePath -> ConduitT i ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
sourceFile FilePath
fl ConduitT i ByteString (ResourceT IO) ()
-> ConduitM ByteString (ByteString, [ByteString]) (ResourceT IO) ()
-> ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString (ResourceT IO) ()
forall (m :: * -> *) seq.
(Monad m, IsSequence seq, Element seq ~ Word8) =>
ConduitT seq seq m ()
linesUnboundedAsciiC ConduitT ByteString ByteString (ResourceT IO) ()
-> ConduitM ByteString (ByteString, [ByteString]) (ResourceT IO) ()
-> ConduitM ByteString (ByteString, [ByteString]) (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| [ByteString]
-> ConduitM ByteString (ByteString, [ByteString]) (ResourceT IO) ()
forall (m :: * -> *).
Monad m =>
[ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
loop []
  where
    loop :: [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
loop [ByteString]
acc = do
        Maybe ByteString
x <- ConduitT ByteString (ByteString, [ByteString]) m (Maybe ByteString)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
        case Maybe ByteString
x of
            Just ByteString
l -> case () of
                ()
_ | ByteString -> Bool
B.null ByteString
l -> [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
loop [ByteString]
acc  -- empty line, go to next line
                  | ByteString -> Char
B.head ByteString
l Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'>' -> [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
forall (m :: * -> *) a i.
Monad m =>
[a] -> ConduitT i (a, [a]) m ()
output ([ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
acc) ConduitT ByteString (ByteString, [ByteString]) m ()
-> ConduitT ByteString (ByteString, [ByteString]) m ()
-> ConduitT ByteString (ByteString, [ByteString]) m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
loop [ByteString -> ByteString
B.tail ByteString
l]
                  | Bool
otherwise -> [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
loop (ByteString
lByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
acc)
            Maybe ByteString
Nothing -> [ByteString] -> ConduitT ByteString (ByteString, [ByteString]) m ()
forall (m :: * -> *) a i.
Monad m =>
[a] -> ConduitT i (a, [a]) m ()
output ([ByteString]
 -> ConduitT ByteString (ByteString, [ByteString]) m ())
-> [ByteString]
-> ConduitT ByteString (ByteString, [ByteString]) m ()
forall a b. (a -> b) -> a -> b
$ [ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
acc
    output :: [a] -> ConduitT i (a, [a]) m ()
output (a
x:[a]
xs) = (a, [a]) -> ConduitT i (a, [a]) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (a
x, [a]
xs)
    output [a]
_ = () -> ConduitT i (a, [a]) m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()
{-# INLINE fastaReader #-}