{-# LANGUAGE Rank2Types #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE OverloadedStrings #-} -- Conduit-based FASTA file format reading. Designed to be used in streaming -- applications. -- -- On parsing, one can choose the chunk size depending on the application. On -- rendering into bytestrings, the number of columns for each data line can be -- selected. This should be less than 80. module Biobase.Fasta.Import where import Control.Arrow (second) import Control.Monad.IO.Class (liftIO, MonadIO (..)) import Control.Monad (unless) import Data.ByteString (ByteString, breakByte, takeWhile, empty, null, uncons) import Data.Char import Data.Conduit as C import Data.Conduit.Binary as C import Data.Conduit.List as CL import Prelude as P hiding (null) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC import Bio.Core.Sequence (Offset(..)) import Biobase.Fasta -- | Parse from 'ByteString' into 'FastaWindow's with a past. parseFastaWindows :: Monad m => Int -> Conduit ByteString m FastaWindow parseFastaWindows wsize = parseEvents wsize =$= CL.concatMapAccum go Nothing where go (Header i d) _ = (Just (0,i,d,""), []) -- offset, identifier, description, past go (Data x) Nothing = (Just (0,"","",""), [FastaW "" "" 0 x ""]) go (Data x) (Just (k,i,d,p)) = (Just (k + (fromIntegral $ B.length x), i, d, x), [FastaW i d (Offset k) x p]) go Done _ = (Nothing, []) -- | Render from 'FastaWindow's into 'ByteString's. renderFastaWindows :: Monad m => Int -> Conduit FastaWindow m ByteString renderFastaWindows cols = CL.concatMapAccum go Nothing =$= renderEvents cols where go fw Nothing = (Just (_identifier fw), [Header (_identifier fw) (_description fw), Data (_fasta fw)]) go fw (Just i) = if _identifier fw == i then (Just i, [Data (_fasta fw)]) else go fw Nothing -- | An event is either a FASTA header or a part of a FASTA data stream, -- chunked into user-defineable pieces. If there is no more input, we are -- 'Done'. But we are only 'Done' if there was some input in the first place! data Event = Header !ByteString !ByteString | Data !ByteString | Done deriving (Eq,Show) isHeader :: Event -> Bool isHeader (Header _ _) = True isHeader _ = False -- | Parse from 'ByteString' into 'Event's. parseEvents :: Monad m => Int -> GInfConduit ByteString m Event parseEvents wsize = awaitE >>= either return goU where loopU = awaitE >>= either finishU goU loopH front = awaitE >>= either (finishH front) (goH front) loopD k front = awaitE >>= either (finishD k front) (goD k front) finishU r = yield Done >> return r finishH front r = let final = front empty in unless (null final) (yield . uncurry Header . second (B.drop 1) . breakByte 32 . B.drop 1 $ final) >> yield Done >> return r finishD k front r = let final = front empty in unless (null final) (yield $ Data final) >> yield Done >> return r goU s = case BC.uncons s of Just ('>', _) -> goH id s Just _ -> goD 0 id s Nothing -> loopU goH sofar more = case uncons rpart of Just (_, rpart') -> yield (uncurry Header . second (B.drop 1) . breakByte 32 . B.drop 1 $ sofar fpart) >> goU rpart' Nothing -> loopH . B.append $ sofar more where (fpart,rpart) = breakByte 10 more goD k sofar more | Just ('>',_) <- BC.uncons more = let final = sofar empty in unless (null final) (yield $ Data final) >> goU more | otherwise = case uncons rpart of Just (_, rpart') -> let k' = k + B.length fpart in case k' `compare` wsize of LT -> goD k' (B.append $ sofar fpart) rpart' EQ -> yield (Data $ sofar fpart) >> goU rpart' GT -> let (lp,rp) = B.splitAt wsize $ sofar fpart in yield (Data lp) >> goD 0 id (B.append rp rpart) Nothing -> let k' = k + B.length more in case k' `compare` wsize of LT -> loopD k' . B.append $ sofar more EQ -> yield (Data $ sofar more) >> loopU GT -> let (lp,rp) = B.splitAt wsize $ sofar more in yield (Data lp) >> goD 0 id rp where (fpart,rpart) = breakByte 10 more -- | Render from 'Event's into 'ByteStrings'. 'cols' is the number of -- characters after which a newline is introduced into the stream. Such -- newlines are introduced only into 'Data' events. renderEvents :: Monad m => Int -> Conduit Event m ByteString renderEvents cols = CL.concatMap go =$= CL.map (`BC.snoc` '\n') where go (Header i d) = [printHeader $ Header i d] go (Data xs) = rows xs go (Done) = [] rows xs = let (x,xs') = B.splitAt cols xs in if B.length xs <= cols then [xs] else x : rows xs' printHeader (Header i d) = BC.concat $ [">",i] ++ (if null d then [] else [" ", d]) test :: IO () test = do let prnt (Header i d) = BC.putStr i >> BC.putStrLn d prnt (Data d) = BC.putStrLn d runResourceT $ sourceFile "big.fa" $= parseEvents 1000 $$ CL.foldM (\_ x -> liftIO $ prnt x) ()