-- | A convenience module for *small* @Fasta@ entries, that are completely in -- memory and *not* to be streamed. -- -- The @Data.ByteString.Strict.Lens@ module is very helpful for further -- handling of 'Fasta' entries. -- -- For convenience, the 'convertString' function from @string-conversions@ is -- supplied. module Biobase.Fasta.Strict ( module Biobase.Fasta.Strict , convertString ) where import Control.Lens import Data.Bifunctor (first) import Data.ByteString (ByteString) import Data.String.Conversions import Data.Void import GHC.Generics (Generic) import qualified Data.ByteString.Builder as BB import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy as BSL import qualified Data.ByteString.Streaming as BSS import qualified Streaming.Prelude as SP import Biobase.Fasta.Streaming as FS import Biobase.Types.BioSequence -- | A *strict* @Fasta@ entry. data Fasta which ty = Fasta { _header ∷ !(SequenceIdentifier which) , _fasta ∷ !(BioSequence ty) } deriving (Eq,Ord,Read,Show,Generic) makeLenses ''Fasta -- | If you don't want to deal with the phantom types. type FastaUntyped = Fasta Void Void -- | Render a 'Fasta' entry to a 'ByteString'. Will end with a final @\n@ in -- any case. fastaToByteString ∷ Int → Fasta which ty → ByteString {-# Inlinable fastaToByteString #-} fastaToByteString k' Fasta{..} = BS.cons '>' (_header^._Wrapped) <> "\n" <> go (_fasta^._Wrapped) where go (BS.splitAt k → (hd,tl)) | BS.null hd = mempty | otherwise = hd <> "\n" <> go tl k = max 1 k' -- | Render a 'Fasta' entry to a 'Builder'. Will end with a final @\n@ in -- any case. fastaToBuilder ∷ Int → Fasta which ty → BB.Builder {-# Inlinable fastaToBuilder #-} fastaToBuilder k' Fasta{..} = BB.char8 '>' <> (BB.byteString $ _header^._Wrapped) <> BB.char8 '\n' <> go (_fasta^._Wrapped) where go (BS.splitAt k → (hd,tl)) | BS.null hd = mempty | otherwise = BB.byteString hd <> BB.char8 '\n' <> go tl k = max 1 k' -- | Try to parse a 'ByteString' as a 'Fasta', failing with 'Left', succees -- with 'Right'. byteStringToFasta ∷ ByteString → Either String (Fasta which ty) {-# Inlinable byteStringToFasta #-} byteStringToFasta (BS.lines → ls) | null ls = Left "empty bytestring" | Just (z, hdr) ← BS.uncons h, z `BS.elem` ">;" = Right $ Fasta { _header = SequenceIdentifier hdr, _fasta = BioSequence $ BS.concat ts } | otherwise = Left "no '>'/';' first character" where h:ts = ls -- | Try to parse a 'ByteString' as multiple 'Fasta' entries. Even though this -- is using the underlying streaming interface, this is not streaming. byteStringToMultiFasta ∷ BSL.ByteString → [Fasta which ty] {-# Inlinable byteStringToMultiFasta #-} byteStringToMultiFasta bsl = map (view windowedFasta) $ runIdentity bss where bss = SP.toList_ . streamingFasta (HeaderSize maxBound) (OverlapSize 0) (CurrentSize maxBound) $ BSS.fromLazy bsl -- | A lens that goes from a 'BioSequenceWindow' to a 'Fasta'. windowedFasta ∷ Lens' (BioSequenceWindow w ty k) (Fasta w ty) {-# Inline windowedFasta #-} windowedFasta = lens lr rl where lr bsw = Fasta { _header = bsw^.bswIdentifier, _fasta = bsw^.bswSequence } rl bsw f = set bswSequence (f^.fasta) $ set bswIdentifier (f^.header) bsw -- | A prism from a 'ByteString' to a 'Fasta'. Note that this will only be an -- identity if the underlying fasta file is rendered with @k@ characters per -- line. rawFasta ∷ Int → Prism' ByteString (Fasta which ty) {-# Inline rawFasta #-} rawFasta k = prism (fastaToByteString k) $ \bs → first (const bs) $ byteStringToFasta bs