-- Translation Module
-- By Gregory W. Schwartz

{- | Collects all functions pertaining to the translation of nucleotides to
amino acids for Lazy Text.
-}

{-# LANGUAGE OverloadedStrings #-}

module Data.Fasta.Text.Lazy.Translation ( codon2aa
                                        , customCodon2aa
                                        , translate
                                        , customTranslate
                                        ) where

-- Built in
import Data.Either
import qualified Data.Text.Lazy as T
import Data.Int

-- Local
import Data.Fasta.Text.Lazy.Types

-- | Converts a codon to an amino acid
-- Remember, if there is an "N" in that DNA sequence, then it is translated
-- as an X, an unknown amino acid.
codon2aa :: Codon -> Either T.Text Char
codon2aa x
    | codon `elem` ["GCT", "GCC", "GCA", "GCG"]               = Right 'A'
    | codon `elem` ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"] = Right 'R'
    | codon `elem` ["AAT", "AAC"]                             = Right 'N'
    | codon `elem` ["GAT", "GAC"]                             = Right 'D'
    | codon `elem` ["TGT", "TGC"]                             = Right 'C'
    | codon `elem` ["CAA", "CAG"]                             = Right 'Q'
    | codon `elem` ["GAA", "GAG"]                             = Right 'E'
    | codon `elem` ["GGT", "GGC", "GGA", "GGG"]               = Right 'G'
    | codon `elem` ["CAT", "CAC"]                             = Right 'H'
    | codon `elem` ["ATT", "ATC", "ATA"]                      = Right 'I'
    | codon `elem` ["ATG"]                                    = Right 'M'
    | codon `elem` ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"] = Right 'L'
    | codon `elem` ["AAA", "AAG"]                             = Right 'K'
    | codon `elem` ["TTT", "TTC"]                             = Right 'F'
    | codon `elem` ["CCT", "CCC", "CCA", "CCG"]               = Right 'P'
    | codon `elem` ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"] = Right 'S'
    | codon `elem` ["ACT", "ACC", "ACA", "ACG"]               = Right 'T'
    | codon `elem` ["TGG"]                                    = Right 'W'
    | codon `elem` ["TAT", "TAC"]                             = Right 'Y'
    | codon `elem` ["GTT", "GTC", "GTA", "GTG"]               = Right 'V'
    | codon `elem` ["TAA", "TGA", "TAG"]                      = Right '*'
    | codon `elem` ["---", "..."]                             = Right '-'
    | codon == "~~~"                                          = Right '-'
    | "N" `T.isInfixOf` codon                                 = Right 'X'
    | "-" `T.isInfixOf` codon                                 = Right '-'
    | "." `T.isInfixOf` codon                                 = Right '-'
    | otherwise                                               = Left errorMsg
  where
    codon    = T.toUpper x
    errorMsg = T.append "Unidentified codon: " codon

-- | Translate a codon using a custom table
customCodon2aa :: [(Codon, Char)] -> Codon -> Either T.Text AA
customCodon2aa table codon = case lookup codon table of
                                (Just x) -> Right x
                                Nothing  -> codon2aa codon

-- | Translates a text of nucleotides given a reading frame (1, 2, or
-- 3) -- drops the first 0, 1, or 2 nucleotides respectively. Returns
-- a text with the error if the codon is invalid. Also has customized codon
-- translations as well overriding the defaults.
customTranslate :: [(Codon, AA)]
                -> Int64
                -> FastaSequence
                -> Either T.Text FastaSequence
customTranslate table pos x
    | any isLeft' translation = Left $ head . lefts $ translation
    | otherwise               = Right $ x { fastaSeq = T.pack
                                                     . rights
                                                     $ translation }
  where
    translation = map (customCodon2aa table)
                . filter ((== 3) . T.length)
                . T.chunksOf 3
                . T.drop (pos - 1)
                . fastaSeq
                $ x
    isLeft' (Left _) = True
    isLeft' _        = False

-- | Translates a text of nucleotides given a reading frame (1, 2, or
-- 3) -- drops the first 0, 1, or 2 nucleotides respectively. Returns
-- a text with the error if the codon is invalid.
translate :: Int64 -> FastaSequence -> Either T.Text FastaSequence
translate = customTranslate []