Bio/Sequence/SeqData.hs

{- |
   Data structures for manipulating (biological) sequences.

   Generally supports both nucleotide and protein sequences, some functions,
   like @revcompl@, only makes sense for nucleotides.
-}

{-# LANGUAGE EmptyDataDecls #-}

module Bio.Sequence.SeqData 
    (
      -- * Data structure
      -- | A sequence is a header, sequence data itself, and optional quality data. 
      --   Sequences are type-tagged to identify them as nucleotide, amino acids,
      --   or unknown type. 
      --   All items are lazy bytestrings.  The Offset type can be used for indexing.
      --
      --   If you use overloaded strings (e.g., ghc -XOverloadedString), you can
      --   easily construct sequences from string literals.
      Sequence(..), Offset, SeqData,

      -- | Quality data is normally associated with nucleotide sequences
      Qual, QualData

      -- * Accessor functions
    , (!), seqlength,seqlabel,seqheader,seqdata
    , (?), hasqual, seqqual

      -- * Adding information to header
    , appendHeader, setHeader

      -- * Converting to and from [Char]
      -- | It is probably better to use the 'IsString' class from 'Data.String' for this.
    , fromStr, toStr

      -- * Sequence utilities
    , defragSeq, seqmap, castSeq

      -- * Nucleotide functionality
      -- | Nucleotide sequences contain the alphabet [A,C,G,T].
      -- IUPAC specifies an extended nucleotide alphabet with wildcards, but
      -- it is not supported at this point.
    , compl, revcompl, revcompl', Nuc, castToNuc

      -- * Protein functionality
      -- | Proteins are chains of amino acids, represented by the IUPAC alphabet.
    , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids
    , castToAmino
    
      -- * Display a nicely formated sequence.
    , putSeqLn, seqToStr

      -- * Default type for sequences
    , Unknown
    ) where

import Data.List (unfoldr, intercalate, isPrefixOf)
import Data.Char (toUpper, isNumber)
import Data.Int
import Data.Word
import Data.Maybe (fromJust, isJust)

import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString as BBS
import Text.Printf (printf)

-- | An offset, index, or length of a 'SeqData'
type Offset  = Int64

-- | The basic data type used in 'Sequence's
type SeqData = B.ByteString

-- | Basic type for quality data.  Range 0..255.  Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from bad.
type Qual = Word8

-- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'.
type QualData = BB.ByteString -- mild abuse

-- | A sequence consists of a header, the sequence data itself, and optional quality data.
--   The type parameter is a phantom type to separate nucleotide and amino acid sequences
data Sequence t = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence
                  deriving Eq

-- | For type tagging sequences (protein sequences use 'Amino' below)
data Nuc
data Unknown

-- | Phantom type functionality, unchecked conversion between sequence types
castSeq :: Sequence a -> Sequence b
castSeq (Seq h d mq) = Seq h d mq

castToNuc :: Sequence a -> Sequence Nuc
castToNuc = castSeq

castToAmino :: Sequence a -> Sequence Amino
castToAmino = castSeq

-- | A more arranged show instance for Sequences reassembling the display of
--   the fasta-format
instance Show (Sequence a) where
    show s = seqToStr s 6 10 []

-- | Returns a properly formatted and probably highlighted string
-- | representation of a sequence. Highlighting is done using ANSI-Escape
-- | sequences.
seqToStr :: Sequence a -> Int -> Int -> [(Int, Int)] -> [Char]
seqToStr (Seq header sd _) chunks len parts =
    let strSeq     = toStr sd
        strHeader  = words $ toStr header
        identifier = case strHeader of (i:_) -> i; _ -> ""
        comment    = intercalate " " $ case strHeader of (_:rs) -> rs; _ -> []

        numLength  = (length . show . length) strSeq
        width      = chunks * len + numLength + chunks
        format     = intercalate "\n" . splits (width)
    in
    (line "ID" width)      ++ (format identifier) ++ "\n\n" ++
    (line "COMMENT" width) ++ (format comment)    ++ "\n\n" ++
    (line "DATA" width)    ++ (showDNA strSeq len chunks parts)
 
-- | A simple function to display a sequence: we generate the sequence string and
-- | call putStrLn
putSeqLn :: Sequence a -> Int -> Int -> [(Int, Int)] -> IO ()
putSeqLn s chunks len parts = putStrLn $ seqToStr s chunks len parts
 
-- Creates a header line of length n
line :: [Char] -> Int -> [Char]
line header width =
  header ++ " " ++ (take (width - (length header) - 1) (repeat '-')) ++ "\n"
 
-- | Splits a string into parts of size width. The last element can be shorter.
splits :: Int -> [t] -> [[t]]
splits _     [] = []
splits width xs =
    let (a, b) = splitAt width xs in
    [a] ++ (splits width b)

-- Highlighting and Dehighlighting ANSI-Escape sequences
beginHighlight, endHighlight :: String
beginHighlight = "\ESC[7m"
endHighlight   = "\ESC[0m"

-- Creates a string representation of the dna-part of a sequence. We can either
-- highlight specific areas, or, if this is empty, numberize the rows. At the
-- moment, both at the same time is not possible (see comment in the numberize-
-- function). If you want numberized output, take a look at the showDNANumber-
-- function.
showDNA :: [Char] -> Int -> Int -> [(Int, Int)] -> [Char]
showDNA xs len chunks highs =
    let hs  = highlight xs highs
        ks  = splitsSkip hs len [beginHighlight, endHighlight]
        ch  = chunkify ks chunks       
        ns  = numberize ch chunks len
        ns' = clean ns False
    in ns'
    
-- Add orientation numbers to each line.
numberize :: [[Char]] -> Int -> Int -> [Char]
numberize ls chunks len = 
    let -- The initial numbering of the single elements in lines
        numbered = zip [0, (chunks * len)..] ls 
        
        -- Create the format string with the right alignment, depending on the 
        -- number of elements in lines
        numLength   = (show . length . show ) (chunks * len * (length ls))
        formatStr   = "%" ++ numLength ++ "d "
        toNum (a,b) = (printf formatStr a) ++ b        
    in intercalate "\n" (map toNum numbered)
  
-- Concatenates each n elements of a list.    
chunkify :: [[Char]] -> Int -> [[Char]]
chunkify [] _n = []
chunkify xs  n =  
    [intercalate " " (take n xs)] ++ (chunkify (drop n xs) n)
      
-- Highlighting should not occur on spaces or over newline borders. We walk
-- through a generated string and add appropiate stop-highlight and start-
-- highlight markers.
--
-- This is one of the functions which can be definitely optimized by a seasoned
-- Haskell programmer.
clean :: [Char] -> Bool -> [Char]
clean []         _      = []

-- Handling spaces
clean (' ':xs)   True   = endHighlight ++ " " ++ beginHighlight ++ clean xs True
clean (' ':xs)   False  = " " ++ clean xs False

-- Newlines and numbering
clean ('\n':xs)   True   = 
    let (num, dna) = span (\c -> isNumber c) xs
    in endHighlight ++ "\n" ++ num ++ beginHighlight ++ clean dna True
clean ('\n':xs)   False  = "\n" ++ clean xs False

-- Checking for highlighting at all
clean str@(x:xs) isHigh =
    -- Special cases: check for the beginning or ending of highlighting 
    let a = isPrefixOf beginHighlight str
        b = isPrefixOf endHighlight str
    in if a then x : clean xs True
            else if b then x : clean xs False
                      else x : clean xs isHigh

-- Highlights a range of strings. Each start and end to be highlighted is 
-- described by a position-tuple (start, end).
highlight :: [Char] -> [(Int, Int)] -> [Char]
highlight ss [] = ss 
highlight ss ((start, end):rest) = 
    let sl = [beginHighlight, endHighlight]
        a  = insert ss start   beginHighlight sl
        b  = insert a  (end+1) endHighlight   sl
    in highlight b rest

-- Generates lists of length len from xs, skipping sublist in skips while
-- counting list elements.
splitsSkip :: (Eq t) => [t] -> Int -> [[t]] -> [[t]]
splitsSkip [] _   _     = []
splitsSkip xs len skips =
    (takeSkip xs len skips) : splitsSkip (dropSkip xs len skips) len skips
    
-- Emulates the normal take behaviour, skipping over sublists in skips.
takeSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a]
takeSkip []         _ _     = []
takeSkip _          0 _     = []
takeSkip str@(x:xs) n skips =
    let n' = case (testPrefixes str skips) of
               Nothing  -> n-1
               Just len -> n + len - 1
    in x : takeSkip xs n' skips

-- Emulates the normal drop behaviour, skipping over sublists in skips.
dropSkip :: (Eq a) => [a] -> Int -> [[a]] -> [a]
dropSkip []         _ _     = []
dropSkip xs         0 _     = xs
dropSkip str@(_:xs) n skips =
    let n' = case (testPrefixes str skips) of
               Nothing  -> n-1
               Just len -> n + len - 1
    in dropSkip xs n' skips

-- Another definition of insert, which provides support for handling skipping
-- of strings. Inserts elem *before* pos into xs, but skips all strings in
-- skips while calculating the specified position
insert :: (Eq a) => [a] -> Int -> [a] -> [[a]] -> [a]
insert []         _   _    _     = []
insert xs         0   e    _     = e ++ xs
insert str@(x:xs) pos e skips = 
    let pos' = case (testPrefixes str skips) of
               Nothing  -> pos-1
               Just len -> pos + len - 1
    in x : insert xs pos' e skips
        
-- Test, if one of the prefixes is the prefix of xs. If yes, return it's length
-- else Nothing  
testPrefixes :: (Eq a) => [a] -> [[a]] -> Maybe Int
testPrefixes _  []     = Nothing
testPrefixes xs (p:ps) = 
    if (isPrefixOf p xs)
        then return (length p)
        else testPrefixes xs ps
        
-- | Convert a String to 'SeqData'
fromStr :: String -> SeqData
fromStr = B.pack

-- | Convert a 'SeqData' to a String
toStr :: SeqData -> String
toStr   = B.unpack

-- | Read the character at the specified position in the sequence.
{-# INLINE (!) #-}
(!) :: Sequence a -> Offset -> Char
(!) (Seq _ bs _) = B.index bs

{-# INLINE (?) #-}
(?) :: Sequence a -> Offset -> Qual
(?) (Seq _ _ (Just qs)) = BB.index qs
(?) (Seq _ _ Nothing) = 
    error "Attempting to use 'seqqual' on sequence without associated quality data"

-- | Return sequence length.
seqlength :: Sequence a -> Offset
seqlength (Seq _ bs _) = B.length bs

-- | Return sequence label (first word of header)
seqlabel :: Sequence a -> SeqData
seqlabel (Seq l _ _) = case B.words l of (x:_) -> x; [] -> B.empty

-- | Return full header.
seqheader :: Sequence a -> SeqData
seqheader (Seq l _ _) = l

-- | Return the sequence data.
seqdata :: Sequence a -> SeqData
seqdata (Seq _ d _) = d

-- | Return the quality data, or error if none exist.  Use hasqual if in doubt.
seqqual :: Sequence a -> QualData
seqqual (Seq _ _ (Just q)) = q
seqqual (Seq _ _ Nothing) = 
    error "Attempting to use 'seqqual' on sequence without associated quality data"

-- | Check whether the sequence has associated quality data.
hasqual :: Sequence a -> Bool
hasqual (Seq _ _ q) = isJust q

-- | Modify the header by appending text, or by replacing
--   all but the sequence label (i.e. first word).
appendHeader, setHeader :: Sequence a -> String -> Sequence a
appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q)
setHeader s@(Seq _ d q) t = (Seq (B.append (seqlabel s) (B.pack (" "++t))) d q)

-- | Returns a sequence with all internal storage freshly copied and
-- with sequence and quality data present as a single chunk.  
--  
-- By freshly copying internal storage, 'defragSeq' allows garbage
-- collection of the original data source whence the sequence was
-- read; otherwise, use of just a short sequence name can cause an
-- entire sequence file buffer to be retained.
-- 
-- By compacting sequence data into a single chunk, 'defragSeq' avoids
-- linear-time traversal of sequence chunks during random access into
-- sequence data.
defragSeq :: Sequence t -> Sequence t
defragSeq (Seq name sequ qual) = Seq (defragB name) (defragB sequ) (fmap defragBB qual)
    where defragB = B.fromChunks . (: []) . BBS.concat . B.toChunks
          defragBB = BB.fromChunks . (: []) . BBS.concat . BB.toChunks


-- | map over sequences, treating them as a sequence of (char,word8) pairs.
--   This will work on sequences without quality, as long as the function doesn't 
--   try to examine it.
--   The current implementation is not very efficient.
seqmap :: ((Char,Qual) -> (Char,Qual)) -> Sequence t -> Sequence t
seqmap f (Seq n ds Nothing) = Seq n xs Nothing 
    where xs = B.map (\x -> fst $ f (x,e)) ds
          e = error "seqmap: tried to access qual data, but none available"
seqmap f (Seq n ds (Just qs)) = Seq n (B.pack xs) (Just $ BB.pack ys)
    where (xs,ys) = unzip $ map f $ zip (B.unpack ds) (BB.unpack qs)

------------------------------------------------------------
-- Nucleotide (DNA, RNA) specific stuff
------------------------------------------------------------

-- | Calculate the reverse complement.
--   This is only relevant for the nucleotide alphabet, 
--   and it leaves other characters unmodified.  
revcompl :: Sequence Nuc -> Sequence Nuc
revcompl (Seq l bs qs) = Seq l (revcompl' bs)
                        (maybe Nothing (Just . BB.reverse) qs)

-- | Calculate the reverse complent for SeqData only.
revcompl' :: SeqData -> SeqData
revcompl' = B.map compl . B.reverse

-- | Complement a single character.  I.e. identify the nucleotide it 
--   can hybridize with.  Note that for multiple nucleotides, you usually
--   want the reverse complement (see 'revcompl' for that).
compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl  x  =  x

------------------------------------------------------------
-- Amino acid (protein) stuff
------------------------------------------------------------

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence.  This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence Nuc -> Offset -> [Amino]
translate s' o' = unfoldr triples (s',o')
   where triples (s,o) = 
             if o > seqlength s - 3 then Nothing
             else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3))

-- prop_trans s = tail (translate s 0) == translate s 3

trans1 :: String -> Amino
trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper)
    where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T

data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
           | His | Ile | Leu | Lys | Met | Phe | Pro | Ser
           | Thr | Tyr | Trp | Val 
           | STP | Asx | Glx | Xle | Xaa -- unknowns
     deriving (Show,Eq)

trans_tbl :: [(String,Amino)]
trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala),
             ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg),
             ("AAT",Asn),("AAC",Asn),
             ("GAT",Asp),("GAC",Asp),
--             ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G)
             ("TGT",Cys),("TGC",Cys),
             ("CAA",Gln),("CAG",Gln),
             ("GAA",Glu),("GAG",Glu),
--             ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G
             ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly),
             ("CAT",His),("CAC",His),
             ("ATT",Ile),("ATC",Ile),("ATA",Ile),
             ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu),
             ("AAA",Lys),("AAG",Lys),
             ("ATG",Met),
             ("TTT",Phe),("TTC",Phe),
             ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro),
             ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser),
             ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr),
             ("TAT",Tyr),("TAC",Tyr),
             ("TGG",Trp),
             ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val),
             ("TAG",STP),("TGA",STP),("TAA",STP)
            ]
-- todo: extend with more IUPAC nucleotide wildcards?

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
toIUPAC = B.pack . map (fromJust . flip lookup iupac)

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]
fromIUPAC = map (maybe Xaa id . flip lookup iupac' . toUpper) . B.unpack

iupac :: [(Amino,Char)]
iupac = [(Ala,'A')        ,(Arg,'R')        ,(Asn,'N')
        ,(Asp,'D')        ,(Cys,'C')        ,(Gln,'Q')
        ,(Glu,'E')        ,(Gly,'G')        ,(His,'H')
        ,(Ile,'I')        ,(Leu,'L')        ,(Lys,'K')
        ,(Met,'M')        ,(Phe,'F')        ,(Pro,'P')
        ,(Ser,'S')        ,(Thr,'T')        ,(Tyr,'Y')
        ,(Trp,'W')        ,(Val,'V')
        ,(Asx,'B') -- Asn or Asp
        ,(Glx,'Z') -- Gln or Glu
        ,(Xle,'J') -- Ile or Leu
        ,(STP,'X')        ,(Xaa,'X')
        ]
iupac' :: [(Char,Amino)]
iupac' = map (\(a,b)->(b,a)) iupac