{-# LANGUAGE OverloadedStrings, FlexibleContexts #-} -- | Conversion of PDB residue codes to FASTA single-letter sequence codes. module Bio.PDB.Fasta(resname2fastacode, fastacode2resname , defaultResname, defaultFastaCode , fastaSequence, fastaGappedSequence, fastaRecord, fastaGappedRecord ) where import Bio.PDB.Iterable as Iter import Bio.PDB.Structure as PDB import Data.Map as Map -- | Standard nucleic acid codes codebookNucleicAcids = [ -- RNA codes ("A", 'A'), ("C", 'C'), ("G", 'G'), ("U", 'U'), ("I", 'I'), -- DNA codes ("DA", 'A'), ("DG", 'G'), ("DC", 'C'), ("DT", 'T'), ("DI", 'I'), -- incorrect name for thymine (instead of DT) ("T", 'T') ] -- | List of all correspondences between FASTA 1-letter codes, and PDB 3-letter codes. codebook = codebookNucleicAcids ++ codebookProtein -- | Standard protein codes codebookStandardProtein = [ ("ALA", 'A'), ("CYS", 'C'), ("ASP", 'D'), ("GLU", 'E'), ("PHE", 'F'), ("GLY", 'G'), ("HIS", 'H'), ("ILE", 'I'), ("LYS", 'K'), ("LEU", 'L'), ("MET", 'M'), ("ASN", 'N'), ("PRO", 'P'), ("GLN", 'Q'), ("ARG", 'R'), ("SER", 'S'), ("THR", 'T'), ("VAL", 'V'), ("TRP", 'W'), ("TYR", 'Y')] -- | List of both standard and non-standard protein codes. codebookProtein = codebookStandardProtein ++ [ -- Protein codes (common variants) ("MSE", 'M')] -- selenomethionine -- | Dictionary of translations from all 3-letter PDB codes into 1-letter FASTA aminoacid codes. resname2fastacodeDictionary = Map.fromList codebook -- | Dictionary of translations from 1-letter FASTA aminoacid (standard -- protein) codes into 3-letter PDB codes. fastacode2resnameDictionary = Map.fromList . Prelude.map (\(a, b) -> (b, a)) $ codebookStandardProtein -- | Three-letter PDB code for an unknown type of residue. defaultResname = "UNK" -- | One-letter aminoacid code for an unknown type of residue. defaultFastaCode = 'X' -- | Dictionary mapping three-letter PDB residue code to a single-letter FASTA code. resname2fastacode resname = Map.findWithDefault defaultFastaCode resname resname2fastacodeDictionary -- | Dictionary mapping single-letter FASTA standard aminoacid code to a PDB residue name fastacode2resname code = Map.findWithDefault defaultResname code fastacode2resnameDictionary -- | Converts a `Bio.PDB.Structure.Residue` into a one character aminoacid code. res2code :: Residue -> Char res2code r = resname2fastacode . resName $ r -- | Converts an `Iterable` yielding `Residue`s into a list of aminoacid one-character codes. fastaSequence :: (Iterable a Residue) => a -> [Char] fastaSequence = Iter.itfoldr (\a b -> res2code a : b) [] -- | Converts an `Iterable` yielding `Residue`s into a list of aminoacid one-character codes. fastaGappedSequence :: (Iterable a Residue) => a -> [Char] fastaGappedSequence = concat . scan2 insertGaps projectAA . Iter.itfoldr (\a b -> (resSeq a, res2code a) : b) [] where projectAA (i, aa) = [aa] insertGaps (i, _ ) (j, aa) = ['-' | _ <- [2..j-i]] ++ [aa] -- | Scans a list and applies first argument to all consecutive pairs, -- and second argument to the beginning or `lone wolf`, mapping to -- a list of the same length. scan2 f g [] = [] scan2 f g (b:bs) = g b:scan2' b bs where scan2' b (c:cs) = f b c:scan2' c cs scan2' b [] = [] -- | Convert a filename and Chain into a text of FASTA format record. -- First argument tells if we want gaps included. fastaRecord' :: Bool -> [Char] -> Chain -> [Char] fastaRecord' withGaps ident c = ">" ++ header ++ "\n" ++ fastaSeq c where header = if chainId c == ' ' then ident else ident ++ "|" ++ [chainId c] fastaSeq = if withGaps then fastaSequence else fastaGappedSequence -- | Returns 'String' with ungapped sequence of a given PDB 'Chain'. fastaRecord = fastaRecord' False -- | Returns 'String' with gapped sequence of a given PDB 'Chain'. -- Gaps are placed to assure consistent numbering of residues and -- indices in the output 'String'. fastaGappedRecord = fastaRecord' True