-- src/Data/Fasta/ByteString/Translation.hs

-- Translation Module
-- By Gregory W. Schwartz

{- | Collects all functions pertaining to the translation of nucleotides to
amino acids for ByteStrings
-}

{-# LANGUAGE OverloadedStrings #-}

module Data.Fasta.ByteString.Translation ( chunksOf
                                         , codon2aa
                                         , translate ) where

-- Built in
import Data.Char
import Data.Either
import qualified Data.ByteString.Char8 as B

-- Local
import Data.Fasta.ByteString.Types

-- | Splits a bytestring into consecutive pieces of @k@ characters, in
-- order. The last piece is shorter than @k@ when the length of the input
-- is not a multiple of @k@. An empty input, or a @k@ for which 'B.splitAt'
-- yields an empty prefix, results in an empty list.
chunksOf :: Int -> B.ByteString -> [B.ByteString]
chunksOf k bytes
    | B.null piece = []
    | otherwise    = piece : chunksOf k rest
  where
    -- Peel one piece off the front; the remainder is handled recursively.
    (piece, rest) = B.splitAt k bytes

-- | Converts a codon to its single-letter amino acid. The codon is
-- upper-cased before matching, so the input case does not matter.
--
-- Exact matches are tried first: the standard codons, the stop codons
-- (reported as @*@) and the full gap codons @---@, @...@ and @~~~@
-- (reported as @-@). A codon without an exact match is reported as @X@
-- (unknown amino acid) if it contains an @N@, and otherwise as @-@ if it
-- contains a @-@ or a @.@. Anything else is returned as a 'Left' holding
-- an error message that includes the upper-cased codon.
codon2aa :: Codon -> Either B.ByteString B.ByteString
codon2aa x = maybe partialMatch Right (lookup codon codonTable)
  where
    codon = B.map toUpper x

    -- Fallbacks for codons that have no exact entry in the table. The
    -- check for "N" deliberately comes before the gap characters.
    partialMatch
        | "N" `B.isInfixOf` codon                = Right "X"
        | any (`B.isInfixOf` codon) ["-", "."]   = Right "-"
        | otherwise                              =
            Left ("Unidentified codon: " `B.append` codon)

    -- One (codon, amino acid) pair for every codon with an exact match.
    codonTable :: [(B.ByteString, B.ByteString)]
    codonTable = [ (c, aa) | (aa, cs) <- aminoAcids, c <- cs ]

    -- Each amino acid (or stop / gap symbol) with the codons encoding it.
    aminoAcids :: [(B.ByteString, [B.ByteString])]
    aminoAcids =
        [ ("A", ["GCT", "GCC", "GCA", "GCG"])
        , ("R", ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"])
        , ("N", ["AAT", "AAC"])
        , ("D", ["GAT", "GAC"])
        , ("C", ["TGT", "TGC"])
        , ("Q", ["CAA", "CAG"])
        , ("E", ["GAA", "GAG"])
        , ("G", ["GGT", "GGC", "GGA", "GGG"])
        , ("H", ["CAT", "CAC"])
        , ("I", ["ATT", "ATC", "ATA"])
        , ("M", ["ATG"])
        , ("L", ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"])
        , ("K", ["AAA", "AAG"])
        , ("F", ["TTT", "TTC"])
        , ("P", ["CCT", "CCC", "CCA", "CCG"])
        , ("S", ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"])
        , ("T", ["ACT", "ACC", "ACA", "ACG"])
        , ("W", ["TGG"])
        , ("Y", ["TAT", "TAC"])
        , ("V", ["GTT", "GTC", "GTA", "GTG"])
        , ("*", ["TAA", "TGA", "TAG"])
        , ("-", ["---", "...", "~~~"])
        ]

-- | Translates the nucleotides of a fasta sequence into amino acids for
-- the given reading frame. The frame @pos@ is meant to be 1, 2, or 3, which
-- skips the first 0, 1, or 2 nucleotides respectively. Trailing
-- nucleotides that do not fill a complete codon are ignored.
--
-- Returns the input record with its sequence replaced by the translation,
-- or, if any codon is invalid, the error message of the first codon that
-- 'codon2aa' rejects.
translate :: Int -> FastaSequence -> Either B.ByteString FastaSequence
translate pos x =
    -- mapM in the Either monad stops at the first failing codon.
    case mapM codon2aa codons of
        Left err  -> Left err
        Right aas -> Right (x { fastaSeq = B.concat aas })
  where
    -- Nucleotides left after shifting to the requested reading frame.
    nucleotides = B.drop (pos - 1) (fastaSeq x)
    -- Only complete triplets are translated.
    codons      = [ c | c <- chunksOf 3 nucleotides, B.length c == 3 ]