-- Module : PDBtools -- Copyright : (c) 2012 Grant Rotskoff -- License : GPL-3 -- -- Maintainer : gmr1887@gmail.com -- Stability : experimental -- The suite of source files in the PDButil directory are meant to complement high-throughput analyses -- of three-dimensional protein structure data in the PDB format. Because the source files rely heavily -- on one another, it is convenient to import them all into a single module. Examples of analysis projects -- are available at http://www.github.com/rotskoff/Haskell-PDB-Utilities module PDBtools.Base where -- Long list of imports... import PDBtools.PDButil.PDBparse import PDBtools.PDButil.Vectors import Data.List import Data.ByteString.Char8 (ByteString) import qualified Data.ByteString.Char8 as B import qualified Data.Map as Map import Data.Maybe -- Pull all atoms of a given name from a list of atoms atomtype :: String -> [Atom] -> [Atom] atomtype t atmlist = filter matcht atmlist where matcht a = atype a == B.pack t -- Match the atom's name in the PDB file rather than the underlying type atomname :: String -> [Atom] -> [Atom] atomname t atmlist = filter matcht atmlist where matcht a = name a == B.pack t atomnames :: [String] -> [Atom] -> [Atom] atomnames ts atmlist = filter matchts atmlist where matchts a = name a `elem` (map B.pack ts) -- Match the residue type, input the three letter abbreviation restype :: String -> [Atom] -> [Atom] restype t atmlist = filter matcht atmlist where matcht a = resname a == B.pack t -- Extract the list of alpha-Carbons from a protein backbone :: Protein -> [Atom] backbone = atomname "CA" . atoms -- Extract the list of residue name, residue number pairs resSeq :: Protein -> [(ByteString,Int)] resSeq p = zip (map resname bbatms) (map resid bbatms) where bbatms = backbone p --Naive homology calculation --this could be greatly improved by some alignment effort homology :: Protein -> Protein -> Double homology a b = (fromIntegral identities) / (fromIntegral totalLength) where identities = 2 * length (resSeq a `intersect` resSeq b) totalLength = length(resSeq a) + length(resSeq b) -- Euclidean Distance between two atoms distance :: Atom -> Atom -> Double distance a1 a2 = norm $ vSub (coords a1) (coords a2) -- Root Mean Squared Deviation, a measure of the total distance change -- Only well-defined if you input the same protein! rmsd :: [Atom] -> [Atom] -> Double rmsd atms atms' = sqrt $ avg sqdist where avg ds = (1/fromIntegral(length(ds))) * sum(ds) sqdist = map (^2) $ zipWith distance (atms) (atms') -- Collect all the atoms within a given distance within :: Double -> Atom -> [Atom] -> [Atom] within range a = (delete a) . filter withinRange where withinRange a' = (distance a a') <= range withinClusive :: Double -> Atom -> [Atom] -> [Atom] withinClusive range a = filter withinRange where withinRange a' = (distance a a') <= range -- Centers the list of atoms around the specified atom center :: Atom -> [Atom] -> [Atom] center a = a `shift` [0,0,0] -- Shift the entire atomlist by specifying the new location of a single atom shift :: Atom -> [Double] -> [Atom] -> [Atom] shift a newCoords as = map (\s -> s {coords = (translate s)}) as where translate s = vAdd (coords s) shiftFactor shiftFactor = vSub newCoords (coords a) -- Global translate by a vector translateBy :: [Double] -> [Atom] -> [Atom] translateBy vect = map (\s -> s {coords = (vAdd (coords s) vect)}) -- Compute the angle between three atoms, return value in radians! atmAngle :: Atom -> Atom -> Atom -> Double atmAngle a b c = angle baVec bcVec where baVec = (coords a) `vSub` (coords b) bcVec = (coords c) `vSub` (coords c) dihedrals :: Protein -> [(Double,Double)] dihedrals p = map (\cAlpha -> dihedral cAlpha (atoms p)) $ backbone p kth :: Int -> Atom -> String -> [Atom] -> [Double] kth k cAlpha t atms | (getAdjacent k cAlpha t atms) == [] = [0,0,0] | otherwise = coords $ head $ getAdjacent k cAlpha t atms getAdjacent :: Int -> Atom -> String -> [Atom] -> [Atom] getAdjacent k cAlpha t atms = filter (\s -> resid s == resid cAlpha+k) $ atomname t atms --TODO TESTING dihedral :: Atom -> [Atom] -> (Double,Double) dihedral cAlpha atms | prevCO == [0,0,0] || nextN == [0,0,0] = (0,0) | name cAlpha == B.pack "CA" = (phi v1 v2,psi v2 v3) | otherwise = error "Please input a Carbon Alpha atom." where prevCO = kth (-1) cAlpha "C" atms currCA = coords $ cAlpha currN = kth 0 cAlpha "N" atms currCO = kth 0 cAlpha "C" atms nextN = kth 1 cAlpha "N" atms a = vSub prevCO currCA b = vSub currN currCA c = vSub currCA currCO d = vSub currCA nextN v1 = a `cross` b v2 = c `cross` b v3 = c `cross` d phi v1 v2 | a `dot` v1 < 0 = angle v1 v2 | otherwise = -(angle v1 v2) psi v2 v3 | d `dot` v3 > 0 = angle v2 v3 | otherwise = -(angle v2 v3) rama :: Protein -> IO() rama p = do let toTSV (phi,psi) = (show phi) ++ "\t" ++ (show psi) mapM_ putStrLn $ map toTSV $ dihedrals p -- Convert a protein to FASTA sequence format -- TODO, headers in FASTA file spec protein2fasta :: Protein -> ByteString protein2fasta protein = B.pack $ concatMap (\s -> convert (resname s)) (backbone protein) convert :: ByteString -> String convert name | query == Nothing = "X" | otherwise = fromJust query where query = Map.lookup (B.unpack name) resMap resMap = Map.fromList [("ALA","A"), ("CYS","C"), ("ASP","D"), ("GLU","E"), ("PHE","F"), ("GLY","G"), ("HIS","H"), ("ILE","I"), ("LYS","K"), ("LEU","L"), ("MET","M"), ("ASN","N"), ("PYL","O"), ("PRO","P"), ("GLN","Q"), ("ARG","R"), ("SER","S"), ("THR","T"), --Selenocysteine ("VAL","V"), ("TRP","W"), ("TYR","Y")] -- otherwise, use 'X'