{-| This module defines common data structures for biosequences, i.e. data that represents nucleotide or protein sequences. Basically, anything resembling or wrapping a sequence should implement the 'BioSeq' class (and 'BioSeqQual' if quality information is available). The data types are mostly wrappers from lazy bytestrings from 'Data.ByteString.Lazy' and 'Data.ByteString.Lazy.Char8'. -} {-# LANGUAGE GeneralizedNewtypeDeriving, DeriveDataTypeable #-} module Bio.Core.Sequence ( -- * Data definitions Qual (..), Offset (..), SeqData (..), SeqLabel (..), QualData (..), -- * Class definitions BioSeq (..), BioSeqQual (..), -- * Helper functions toFasta, toFastaQual, toFastQ ) where import qualified Data.ByteString.Lazy.Char8 as LC import qualified Data.ByteString.Lazy as L import Data.Int import Data.Typeable (Typeable) import Data.Word import Data.String -- | Sequence data are lazy bytestrings of ASCII characters. newtype SeqData = SeqData { unSD :: LC.ByteString } deriving (Eq,Ord,IsString,Show,Typeable) -- | Sequence data are lazy bytestrings of ASCII characters. newtype SeqLabel = SeqLabel { unSL :: LC.ByteString } deriving (Eq,Ord,IsString,Show,Typeable) -- | A quality value is in the range 0..255. newtype Qual = Qual { unQual :: Word8 } deriving (Show,Eq,Ord,Num) -- | Quality data are lazy bytestrings of 'Qual's. newtype QualData = QualData { unQD :: L.ByteString } deriving (Eq,Ord,Show,Typeable) -- | An 'Offset' is a zero-based index into a sequence newtype Offset = Offset { unOff :: Int64 } deriving (Show,Eq,Ord,Num,Enum,Real,Integral,Typeable) -- | The 'BioSeq' class models sequence data, and any data object that -- represents a biological sequence should implement it. class BioSeq s where seqlabel :: s -> SeqLabel seqdata :: s -> SeqData seqlength :: s -> Offset -- | Any 'BioSeq' can be formatted as Fasta, 60-char lines. toFasta :: BioSeq s => s -> LC.ByteString -- any kind of string-like data type? Use builder? toFasta s = LC.concat (gt:unSL (seqlabel s):nl:wrap (unSD $ seqdata s)) where wrap x = if LC.null x then [] else let (ln,rest) = LC.splitAt 60 x in ln : nl : wrap rest nl = LC.pack "\n" gt = LC.pack ">" -- | The BioSeqQual class extends BioSeq with quality data. Any correspondig data object -- should be an instance, this will allow Fasta formatted quality data 'toFastaQual', as -- well as the combined FastQ format (via 'toFastQ'). class BioSeq sq => BioSeqQual sq where seqqual :: sq -> QualData -- | Output Fasta-formatted quality data (.qual files), where quality values are output as -- whitespace-separated integers. toFastaQual :: BioSeqQual s => s -> LC.ByteString toFastaQual s = LC.concat (gt:unSL (seqlabel s):nl:wrap (L.unpack $ unQD $ seqqual s)) where wrap x = if null x then [] else let (ln,rest) = splitAt 20 x in LC.pack (unwords $ map show ln) : nl : wrap rest nl = LC.pack "\n" gt = LC.pack ">" -- | Output FastQ-formatted data. For simplicity, only the Sanger quality format is supported, -- and only four lines per sequence (i.e. no line breaks in sequence or quality data). toFastQ :: BioSeqQual s => s -> LC.ByteString toFastQ s = LC.unlines [LC.cons '@' (unSL $ seqlabel s) , unSD (seqdata s) , LC.cons '+' (unSL $ seqlabel s) , L.map (+33) (unQD $ seqqual s)]