-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | streaming FASTA parser -- -- Stream-based handling of FASTA files. The user selects a window size, -- the library then handles the window. For each window, the previous -- (past) window is available, in case some data sits on the boundary -- between windows. -- -- Greg Schwartz' http://hackage.haskell.org/package/fasta package -- is a lot more complete. This one is mostly tailored to my usage -- requirements (and may at some point use his library). @package BiobaseFasta @version 0.4.0.1 -- | Streaming Fasta handling via the streaming library. -- -- The functions in here should be streaming in constant memory. -- -- A typical, slightly complicated example is this: forEach :: forall r . -- Stream (ByteString m) m r -> m (Stream (Of ()) m r) forEach dna = -- do -- extract the header, but at most 123 characters, dropping the -- rest hdr SP.:> dta ← extractHeader (Just 123) dna -- create windows -- ws of a particular type. Include the prefix, the suffix, and -- make each window 10 characters long let ws = (streamedWindows True -- True (Just 10) (SequenceIdentifier hdr) PlusStrand dta :: SP.Stream -- (SP.Of (BioSequenceWindow DNA DNA 0)) m r) -- count the number -- of characters in dna, get the return value, print each window -- count SP.:> r ← SP.mapM_ (liftIO . print) . bswSeqLength $ SP.copy -- ws liftIO $ print count liftIO $ putStrLn "" -- yield one vacuous -- () result, return the remainder r from dna. return $ -- SP.yield () *> return r -- -- TODO Check if this is actually true with some unit tests. module Biobase.Fasta.Streaming streamedFasta :: Monad m => ByteStream m r -> Stream (Stream (ByteStream m) m) m r -- | Here each individual fasta file will be a stream. -- -- TODO Once this works, streamingFasta should be S.concats -- . streamOfStreamedFasta ... streamOfStreamedFasta :: forall m r. 
Monad m => ByteStream m r -> Stream (Stream (ByteStream m) m) m r -- | Given a 'Stream (ByteString m) m r' which is a Stream of -- lines, split off the first Fasta entry. splitFasta :: Monad m => Stream (ByteStream m) m r -> Stream (ByteStream m) m (Stream (ByteStream m) m r) -- | Given a stream, roughly like [BS Header, BS Data1, -- BS Data2, ...] create a stream like [BS Header, -- BS Data]. This means that the resulting stream holds -- exactly two ByteStrings. collapseData :: Monad m => Stream (ByteStream m) m r -> Stream (ByteStream m) m r -- | Rechunk a stream of bytestrings. reChunkBS :: Monad m => Int -> Stream (ByteStream m) m r -> Stream (ByteStream m) m r -- | Assuming a "rechunked" stream of bytestrings, create sequence windows. chunksToWindows :: Monad m => SequenceIdentifier w -> Strand -> Stream (ByteStream m) m r -> Stream (Of (Location w FwdPosition (BioSequence ty))) m r -- | Make it possible to take a fasta stream and produce a stream of -- BioSequenceWindows. This is a convenience function around -- 'withSuffix . withPrefix . chunksToWindows . reChunks'. -- -- In case of a Nothing window size, a single huge -- Fasta entry is produced (and materialized!). -- -- TODO In case of Nothing window size, we use the -- collapseData function which has one check too many, and will be -- slightly slower. However, the check should be once per -- ByteString. streamedWindows :: Monad m => Maybe Int -> Maybe Int -> Maybe Int -> SequenceIdentifier w -> Strand -> Stream (ByteStream m) m r -> Stream (Of (PIS w FwdPosition (BioSequence ty))) m r -- | Get the full length of a stream of BioSequenceWindows, -- counted in characters in each bswSequence. -- -- To use, start with bswSeqLength $ SP.copy xs. Then consume -- this stream normally. It still provides a Stream of -- BioSequenceWindows. However, the return type is now not just -- r, but it provides Int SP.:> r, where the -- Int provides the total length of characters within this -- Fasta entry. 
-- -- This value may then be used to fully update negative strand -- information. streamLocationLength :: (Monad m, ModifyLocation posTy seqTy) => Stream (Of (Location i posTy seqTy)) m r -> m (Of Int r) -- | As a first function, the header should be extracted from a -- Fasta stream. Since headers may be malformed / malicious, we -- make it possible to limit the number of header characters that -- are kept, dropping the rest. extractHeader :: Monad m => Maybe Int -> Stream (ByteStream m) m r -> m (Of ByteString (Stream (ByteStream m) m r)) -- | A convenience module for *small* Fasta entries, that are -- completely in memory and *not* to be streamed. -- -- The Data.ByteString.Strict.Lens module is very helpful for -- further handling of Fasta entries. -- -- For convenience, the convertString function from -- string-conversions is supplied. module Biobase.Fasta.Strict -- | A *strict* Fasta entry. data Fasta which ty Fasta :: !SequenceIdentifier which -> !BioSequence ty -> Fasta which ty [_header] :: Fasta which ty -> !SequenceIdentifier which [_fasta] :: Fasta which ty -> !BioSequence ty -- | If you don't want to deal with the phantom types. type FastaUntyped = Fasta Void Void fasta :: forall k_ad2l (which_acOY :: k_ad2l) k_ad2n (ty_acOZ :: k_ad2n) k_afxU (ty_afxT :: k_afxU). Lens (Fasta (which_acOY :: k_ad2l) (ty_acOZ :: k_ad2n)) (Fasta (which_acOY :: k_ad2l) (ty_afxT :: k_afxU)) (BioSequence ty_acOZ) (BioSequence ty_afxT) header :: forall k_ad2l (which_acOY :: k_ad2l) k_ad2n (ty_acOZ :: k_ad2n) k_afxW (which_afxV :: k_afxW). Lens (Fasta (which_acOY :: k_ad2l) (ty_acOZ :: k_ad2n)) (Fasta (which_afxV :: k_afxW) (ty_acOZ :: k_ad2n)) (SequenceIdentifier which_acOY) (SequenceIdentifier which_afxV) -- | Render a Fasta entry to a ByteString. Will end with a -- final newline in any case. fastaToByteString :: Int -> Fasta which ty -> ByteString -- | Render a Fasta entry to a Builder. Will end with a -- final newline in any case. 
fastaToBuilder :: Int -> Fasta which ty -> Builder -- | Try to parse a ByteString as a Fasta, failing with -- Left, succeeding with Right. byteStringToFasta :: ByteString -> Either String (Fasta which ty) -- | Try to parse a ByteString as multiple Fasta entries. -- Even though this is using the underlying streaming interface, this is -- not streaming. -- -- A lens that goes from a BioSequenceWindow to a Fasta. -- -- A prism from a ByteString to a Fasta. Note that this -- will only be an identity if the underlying fasta file is rendered with -- k characters per line. rawFasta :: Int -> Prism' ByteString (Fasta which ty) convertString :: ConvertibleStrings a b => a -> b instance forall k1 (which :: k1) k2 (ty :: k2). GHC.Generics.Generic (Biobase.Fasta.Strict.Fasta which ty) instance forall k1 (which :: k1) k2 (ty :: k2). GHC.Show.Show (Biobase.Fasta.Strict.Fasta which ty) instance forall k1 (which :: k1) k2 (ty :: k2). GHC.Read.Read (Biobase.Fasta.Strict.Fasta which ty) instance forall k1 (which :: k1) k2 (ty :: k2). GHC.Classes.Ord (Biobase.Fasta.Strict.Fasta which ty) instance forall k1 (which :: k1) k2 (ty :: k2). GHC.Classes.Eq (Biobase.Fasta.Strict.Fasta which ty)