Haskell Code by HsColour

{- |
   Data structures for manipulating (biological) sequences.

   Generally supports both nucleotide and protein sequences, some functions,
   like @revcompl@, only makes sense for nucleotides.
-}

module Bio.Sequence.SeqData 
    (
      -- * Data structure
      -- | A sequence is a header, sequence data itself, and optional quality data.
      --   All items are lazy bytestrings.  The Offset type can be used for indexing.
      Sequence(..), Offset, SeqData,

      -- | Quality data is normally associated with nucleotide sequences
      Qual, QualData

      -- * Accessor functions
    , (!), seqlength,seqlabel,seqheader,seqdata
    , (?), hasqual, seqqual

      -- * Converting to and from [Char]
    , fromStr, toStr

      -- * Nucleotide functionality
      -- | Nucleotide sequences contain the alphabet [A,C,G,T].
      -- IUPAC specifies an extended nucleotide alphabet with wildcards, but
      -- it is not supported at this point.
    , compl, revcompl

      -- * Protein functionality
      -- | Proteins are chains of amino acids, represented by the IUPAC alphabet.
    , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids
    ) where

import Data.List (unfoldr)
import Data.Char (toUpper)
import Data.Int
import Data.Word
import Data.Maybe (fromJust, isJust)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB


-- | An offset, index, or length of a 'SeqData'
type Offset  = Int64

-- | The basic data type used in 'Sequence's
type SeqData = B.ByteString

-- | Basic type for quality data.  Range 0..255.  Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from bad.
type Qual = Word8

-- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'.
type QualData = BB.ByteString -- mild abuse

-- | A sequence consists of a header, the sequence data itself, and optional quality data.
data Sequence = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence
                deriving (Show,Eq)

-- | Convert a String to 'SeqData'
fromStr :: String -> SeqData
fromStr = B.pack

-- | Convert a 'SeqData' to a String
toStr :: SeqData -> String
toStr   = B.unpack

-- | Read the character at the specified position in the sequence.
{-# INLINE (!) #-}
(!) :: Sequence -> Offset -> Char
(!) (Seq _ bs _) = B.index bs

{-# INLINE (?) #-}
(?) :: Sequence -> Offset -> Qual
(?) (Seq _ _ (Just qs)) = BB.index qs

-- | Return sequence length.
seqlength :: Sequence -> Offset
seqlength (Seq _ bs _) = B.length bs

-- | Return sequence label (first word of header)
seqlabel :: Sequence -> SeqData
seqlabel (Seq l _ _) = head (B.words l)

-- | Return full header.
seqheader :: Sequence -> SeqData
seqheader (Seq l _ _) = l

-- | Return the sequence data.
seqdata :: Sequence -> SeqData
seqdata (Seq _ d _) = d

-- | Return the quality data, or error if none exist.  Use hasqual if in doubt.
seqqual :: Sequence -> QualData
seqqual (Seq _ _ (Just q)) = q

-- | Check whether the sequence has associated quality data.
hasqual :: Sequence -> Bool
hasqual (Seq _ _ q) = isJust q

------------------------------------------------------------
-- Nucleotide (DNA, RNA) specific stuff
------------------------------------------------------------

-- | Calculate the reverse complement.
--   This is only relevant for the nucleotide alphabet, 
--   and it leaves other characters unmodified.  
revcompl :: Sequence -> Sequence
revcompl (Seq l bs qs) = Seq l (B.reverse $ B.map compl bs)
                        (maybe Nothing (Just . BB.reverse) qs)

-- | Complement a single character.  I.e. identify the nucleotide it 
--   can hybridize with.  Note that for multiple nucleotides, you usually
--   want the reverse complement (see 'revcompl' for that).
compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl  x  =  x

------------------------------------------------------------
-- Amino acid (protein) stuff
------------------------------------------------------------

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence.  This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence -> Offset -> [Amino]
translate s' o' = unfoldr triples (s',o')
   where triples (s,o) = 
             if o > seqlength s - 3 then Nothing
             else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3))

-- prop_trans s = tail (translate s 0) == translate s 3

trans1 :: String -> Amino
trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper)
    where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T

data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
           | His | Ile | Leu | Lys | Met | Phe | Pro | Ser
           | Thr | Tyr | Trp | Val 
           | STP | Asx | Glx | Xle | Xaa -- unknowns
     deriving (Show,Eq)

trans_tbl :: [(String,Amino)]
trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala),
             ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg),
             ("AAT",Asn),("AAC",Asn),
             ("GAT",Asp),("GAC",Asp),
--             ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G)
             ("TGT",Cys),("TGC",Cys),
             ("CAA",Gln),("CAG",Gln),
             ("GAA",Glu),("GAG",Glu),
--             ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G
             ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly),
             ("CAT",His),("CAC",His),
             ("ATT",Ile),("ATC",Ile),("ATA",Ile),
             ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu),
             ("AAA",Lys),("AAG",Lys),
             ("ATG",Met),
             ("TTT",Phe),("TTC",Phe),
             ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro),
             ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser),
             ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr),
             ("TAT",Tyr),("TAC",Tyr),
             ("TGG",Trp),
             ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val),
             ("TAG",STP),("TGA",STP),("TAA",STP)
            ]
-- todo: extend with more IUPAC nucleotide wildcards?

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac') . B.unpack

iupac :: [(Amino,Char)]
iupac = [(Ala,'A')        ,(Arg,'R')        ,(Asn,'N')
        ,(Asp,'D')        ,(Cys,'C')        ,(Gln,'Q')
        ,(Glu,'E')        ,(Gly,'G')        ,(His,'H')
        ,(Ile,'I')        ,(Leu,'L')        ,(Lys,'K')
        ,(Met,'M')        ,(Phe,'F')        ,(Pro,'P')
        ,(Ser,'S')        ,(Thr,'T')        ,(Tyr,'Y')
        ,(Trp,'W')        ,(Val,'V')
        ,(Asx,'B') -- Asn or Asp
        ,(Glx,'Z') -- Gln or Glu
        ,(Xle,'J') -- Ile or Leu
        ,(STP,'X')        ,(Xaa,'X')
        ]
iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac