src/Bio/Core/Sequence.hs

{-| This module defines common data structures for biosequences,
    i.e. data that represents nucleotide or protein sequences.

    Basically, anything resembling or wrapping a sequence should
    implement the 'BioSeq' class (and 'BioSeqQual' if quality information
    is available).

    The data types are mostly wrappers from lazy bytestrings from
    'Data.ByteString.Lazy' and 'Data.ByteString.Lazy.Char8'.
-}

{-# LANGUAGE GeneralizedNewtypeDeriving, DeriveDataTypeable #-}

module Bio.Core.Sequence (
  -- * Data definitions
  Qual (..), Offset (..),
  SeqData (..), SeqLabel (..), QualData (..),

  -- * Class definitions
  BioSeq (..), BioSeqQual (..),

  -- * Helper functions
  toFasta, toFastaQual, toFastQ
  ) where

import qualified Data.ByteString.Lazy.Char8 as LC
import qualified Data.ByteString.Lazy as L
import Data.Int
import Data.Typeable (Typeable)
import Data.Word
import Data.String

-- | Sequence data are lazy bytestrings of ASCII characters.
newtype SeqData  = SeqData { unSD :: LC.ByteString } deriving (Eq,Ord,IsString,Show,Typeable)

-- | Sequence data are lazy bytestrings of ASCII characters.
newtype SeqLabel = SeqLabel { unSL :: LC.ByteString } deriving (Eq,Ord,IsString,Show,Typeable)

-- | A quality value is in the range 0..255.
newtype Qual     = Qual { unQual :: Word8 } deriving (Show,Eq,Ord,Num)

-- | Quality data are lazy bytestrings of 'Qual's.
newtype QualData = QualData { unQD :: L.ByteString } deriving (Eq,Ord,Show,Typeable)

-- | An 'Offset' is a zero-based index into a sequence
newtype Offset   = Offset { unOff :: Int64 } deriving (Show,Eq,Ord,Num,Enum,Real,Integral,Typeable)

-- | The 'BioSeq' class models sequence data, and any data object that
--   represents a biological sequence should implement it.
class BioSeq s where
  seqlabel  :: s -> SeqLabel
  seqdata   :: s -> SeqData
  seqlength :: s -> Offset

-- | Any 'BioSeq' can be formatted as Fasta, 60-char lines.
toFasta :: BioSeq s => s -> LC.ByteString -- any kind of string-like data type?  Use builder?
toFasta s = LC.concat (gt:unSL (seqlabel s):nl:wrap (unSD $ seqdata s))
  where wrap x = if LC.null x then [] else let (ln,rest) = LC.splitAt 60 x in ln : nl : wrap rest
        nl = LC.pack "\n"
        gt = LC.pack ">"
                                         
-- | The BioSeqQual class extends BioSeq with quality data.  Any correspondig data object
--   should be an instance, this will allow Fasta formatted quality data 'toFastaQual', as
--   well as the combined FastQ format (via 'toFastQ').
class BioSeq sq => BioSeqQual sq where
  seqqual  :: sq -> QualData

-- | Output Fasta-formatted quality data (.qual files), where quality values are output as 
--   whitespace-separated integers.
toFastaQual :: BioSeqQual s => s -> LC.ByteString
toFastaQual s = LC.concat (gt:unSL (seqlabel s):nl:wrap (L.unpack $ unQD $ seqqual s))
  where wrap x = if null x then [] else let (ln,rest) = splitAt 20 x in LC.pack (unwords $ map show ln) : nl : wrap rest
        nl = LC.pack "\n"
        gt = LC.pack ">"

-- | Output FastQ-formatted data.  For simplicity, only the Sanger quality format is supported,
--   and only four lines per sequence (i.e. no line breaks in sequence or quality data).
toFastQ :: BioSeqQual s => s -> LC.ByteString
toFastQ s = LC.unlines [LC.cons '@' (unSL $ seqlabel s)
                       , unSD (seqdata s)
                       , LC.cons '+' (unSL $ seqlabel s)
                       , L.map (+33) (unQD $ seqqual s)]