-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | streaming FASTA parser -- -- Stream-based handling of FASTA files. The user selects a window size, -- the library then handles the window. For each window, the previous -- (past) window is available, in case some data sits on the boundary -- between windows. -- -- FastaTool is a simple tool providing information on FASTA files, and -- allowing extraction of sequences and subsequences. -- -- Greg Schwartz' http://hackage.haskell.org/package/fasta package -- is a lot more complete. This one is mostly tailored to my usage -- requirements (and may at some point use his library). @package BiobaseFasta @version 0.2.0.0 module Biobase.Fasta.Types data Fasta Fasta :: ByteString -> ByteString -> Fasta [fastaHeader] :: Fasta -> ByteString [fastaSequence] :: Fasta -> ByteString newtype RawFastaEntry RawFastaEntry :: ByteString -> RawFastaEntry [_rawFastaEntry] :: RawFastaEntry -> ByteString -- | StreamEvents are chunked pieces of data, where the raw data is -- a strict ByteString. Each element also retains information on -- the first and last line and column (via streamLines) that are -- part of this chunk. data StreamEvent -- | A Header event. Multiple header events signal that the header name was -- longer than the chunk size. StreamHeader :: !ByteString -> !LineInfo -> StreamEvent [streamHeader] :: StreamEvent -> !ByteString [streamLines] :: StreamEvent -> !LineInfo -- | A data event. We keep a pointer to the previous chunk (which is useful -- for some algorithms). The chunk is free of newlines! StreamFasta :: !ByteString -> !ByteString -> !LineInfo -> !ByteString -> StreamEvent [streamFasta] :: StreamEvent -> !ByteString [prevStreamFasta] :: StreamEvent -> !ByteString [streamLines] :: StreamEvent -> !LineInfo [streamHeader] :: StreamEvent -> !ByteString -- | Complete information on line and column start and end for a chunk. -- -- TODO This is a 1-based format? 
Let's use the BiobaseTypes facilities! data LineInfo LineInfo :: !Int -> !Int -> !Int -> !Int -> !Int -> LineInfo -- | first line for this chunk (lines in complete file!) [firstLine] :: LineInfo -> !Int -- | first column in first line for this chunk [firstCol] :: LineInfo -> !Int -- | last line for this chunk (lines in complete file!) [lastLine] :: LineInfo -> !Int -- | last column in last line for this chunk [lastCol] :: LineInfo -> !Int -- | first index in this fasta block. Counts just the number of symbols in -- the Fasta payload. [firstIndex] :: LineInfo -> !Int instance GHC.Generics.Generic Biobase.Fasta.Types.StreamEvent instance GHC.Classes.Ord Biobase.Fasta.Types.StreamEvent instance GHC.Classes.Eq Biobase.Fasta.Types.StreamEvent instance GHC.Show.Show Biobase.Fasta.Types.StreamEvent instance GHC.Generics.Generic Biobase.Fasta.Types.LineInfo instance GHC.Classes.Ord Biobase.Fasta.Types.LineInfo instance GHC.Classes.Eq Biobase.Fasta.Types.LineInfo instance GHC.Show.Show Biobase.Fasta.Types.LineInfo instance GHC.Classes.Ord Biobase.Fasta.Types.RawFastaEntry instance GHC.Classes.Eq Biobase.Fasta.Types.RawFastaEntry instance GHC.Show.Show Biobase.Fasta.Types.RawFastaEntry instance GHC.Classes.Eq Biobase.Fasta.Types.Fasta instance Control.DeepSeq.NFData Biobase.Fasta.Types.StreamEvent instance Control.DeepSeq.NFData Biobase.Fasta.Types.LineInfo -- | Streaming Fasta handling via the streaming library. -- -- The functions in here should be streaming in constant memory. -- -- TODO Check if this is actually true with some unit tests. module Biobase.Fasta.Streaming -- | Control structure for streamingFasta. 
data FindHeader FindHeader :: [ByteString] -> !Int -> FindHeader -- | the collected header parts (in reverse order) [headerParts] :: FindHeader -> [ByteString] -- | accumulated header length [headerLength] :: FindHeader -> !Int HasHeader :: !ByteString -> !ByteString -> [ByteString] -> !Int -> !Int -> FindHeader -- | the (size-truncated) header for this fasta file [header] :: FindHeader -> !ByteString -- | overlap (if any) from earlier parts of the fasta file [dataOverlap] :: FindHeader -> !ByteString -- | collection of dataParts, in reverse order! [dataParts] :: FindHeader -> [ByteString] -- | total length of data parts, simplifies checking if enough data was -- collected [dataLength] :: FindHeader -> !Int -- | count how many entries we have seen [entries] :: FindHeader -> !Int -- | Current Fasta window, together with the start index (0-based). data Current (which :: k) Current :: ByteString -> Index 0 -> Current [currentFasta] :: Current -> ByteString [currentStart] :: Current -> Index 0 newtype Overlap (which :: k) Overlap :: ByteString -> Overlap [getOverlap] :: Overlap -> ByteString newtype Header (which :: k) Header :: ByteString -> Header [getHeader] :: Header -> ByteString newtype CurrentSize CurrentSize :: Int -> CurrentSize newtype OverlapSize OverlapSize :: Int -> OverlapSize newtype HeaderSize HeaderSize :: Int -> HeaderSize -- | Fully stream a fasta file, making sure to never exceed a constant -- amount of memory. The go function yields values of type -- a down the line for continued streaming. -- --
--   r4 = toList . streamingFasta (HeaderSize 2) (OverlapSize 1) (CurrentSize 2) go . S8.fromStrict $ BS.pack t0
--    where go (Header h) (Overlap o) (Current c) = yield (h,o,c)
--   
streamingFasta :: forall m w r a. Monad m => HeaderSize -> OverlapSize -> CurrentSize -> (Header w -> Overlap w -> Current w -> Stream (Of a) m ()) -> ByteString m r -> Stream (Of a) m r eachFasta :: Monad m => Header which1 -> Overlap which2 -> Current which3 -> Stream (Of (ByteString, ByteString, ByteString)) m () parseFastaFile :: FilePath -> IO [Fasta] parseFasta :: ByteString -> [Fasta] instance forall k (which :: k). GHC.Show.Show (Biobase.Fasta.Streaming.Current which) instance forall k (which :: k). GHC.Classes.Ord (Biobase.Fasta.Streaming.Current which) instance forall k (which :: k). GHC.Classes.Eq (Biobase.Fasta.Streaming.Current which) instance forall k (which :: k). GHC.Show.Show (Biobase.Fasta.Streaming.Overlap which) instance forall k (which :: k). GHC.Classes.Ord (Biobase.Fasta.Streaming.Overlap which) instance forall k (which :: k). GHC.Classes.Eq (Biobase.Fasta.Streaming.Overlap which) instance forall k (which :: k). GHC.Show.Show (Biobase.Fasta.Streaming.Header which) instance forall k (which :: k). GHC.Classes.Ord (Biobase.Fasta.Streaming.Header which) instance forall k (which :: k). 
GHC.Classes.Eq (Biobase.Fasta.Streaming.Header which) instance GHC.Show.Show Biobase.Fasta.Streaming.CurrentSize instance GHC.Classes.Ord Biobase.Fasta.Streaming.CurrentSize instance GHC.Classes.Eq Biobase.Fasta.Streaming.CurrentSize instance GHC.Show.Show Biobase.Fasta.Streaming.OverlapSize instance GHC.Classes.Ord Biobase.Fasta.Streaming.OverlapSize instance GHC.Classes.Eq Biobase.Fasta.Streaming.OverlapSize instance GHC.Show.Show Biobase.Fasta.Streaming.HeaderSize instance GHC.Classes.Ord Biobase.Fasta.Streaming.HeaderSize instance GHC.Classes.Eq Biobase.Fasta.Streaming.HeaderSize -- | Fasta export module Biobase.Fasta.Export prettyPrintFasta :: Int -> Fasta -> String prettyByteStringFasta :: Int -> Fasta -> ByteString breakByteString :: Int -> ByteString -> [ByteString] fromIntToInt64 :: Int -> Int64 writeFastaFile :: FilePath -> [Fasta] -> IO () instance GHC.Show.Show Biobase.Fasta.Types.Fasta