PDBtools/PDButil/PDBparse.hs

-- Module	: PDBtools.PDButil.PDBparse
-- Copyright	: (c) 2012 Grant Rotskoff
-- License 	: GPL-3
--
-- Maintainer 	: gmr1887@gmail.com
-- Stability 	: experimental

module PDBtools.PDButil.PDBparse where

import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as B
import System.IO (FilePath)

-- | One coordinate record of a PDB file, as produced by 'parseAtom'.
--
-- The column ranges quoted on each field are the 1-based, inclusive columns
-- of the record line that 'parseAtom' reads the field from.
data Atom = Atom
    { name    :: ByteString  -- ^ atom name (columns 13-16)
    , atid    :: Int         -- ^ atom serial number (columns 7-11)
    , chain   :: ByteString  -- ^ chain identifier (column 22)
    , resid   :: Int         -- ^ residue number (columns 23-26)
    , resname :: ByteString  -- ^ residue name (columns 18-20)
    , coords  :: [Double]    -- ^ 'parseAtom' stores three values here, from
                             --   columns 31-38, 39-46 and 47-54
    , aField  :: Double      -- ^ columns 55-60; presumably the occupancy
                             --   (confirm against the PDB format)
    , bField  :: Double      -- ^ columns 61-66; presumably the temperature
                             --   (B) factor (confirm against the PDB format)
    , atype   :: ByteString  -- ^ columns 77-78; presumably the element symbol
                             --   (confirm against the PDB format)
    }
    deriving (Show, Eq)

-- | A list of atoms treated as one chain. 'splitChains' builds one 'Protein'
-- for each run of consecutive atoms that share the same 'chain' value.
data Protein = Protein
    { atoms :: [Atom]  -- ^ the atoms belonging to this chain
    }
    deriving (Show)

--Sample record:
-- ATOM      1  N   ASP A  28      52.958  39.871  41.308  1.00 89.38           N  

{- We only want record lines that begin with ATOM or HETATM.
   ATOM lines contain the coordinates of the protein(s) in a PDB file.
   HETATM lines (short for heteroatom) contain coordinate information for
   other molecules present in the structure: ligands, DNA, RNA, waters, etc. -}

-- | Parse one fixed-width PDB coordinate record (an @ATOM@ or @HETATM@ line)
-- into an 'Atom'.
--
-- Each field is taken from a hard-coded, 1-based, inclusive column range of
-- the record, and every space inside that range is removed before the text is
-- used. Columns lying beyond the end of a short record contribute nothing, so
-- a text field there is simply empty.
--
-- Numeric fields are evaluated lazily. If the text of a numeric column range
-- is not a valid number (this includes an empty range), an 'error' naming the
-- field, its columns, the offending text and the whole record is raised when
-- that field is first forced, not when 'parseAtom' itself is called.
parseAtom :: ByteString -> Atom
parseAtom record = Atom
    { name    = field 13 16
    , atid    = number "atid" 7 11
    , chain   = field 22 22
    , resid   = number "resid" 23 26
    , resname = field 18 20
    , coords  = [ number "coords x" 31 38
                , number "coords y" 39 46
                , number "coords z" 47 54
                ]
    , aField  = number "aField" 55 60
    , bField  = number "bField" 61 66
    , atype   = field 77 78
    }
  where
    -- Text of columns m..n with all spaces removed. B.filter works on the
    -- ByteString directly, so no round trip through String is needed.
    field :: Int -> Int -> ByteString
    field m n = B.filter (/= ' ') . B.drop (m - 1) . B.take n $ record

    -- Numeric value of columns m..n. The acceptance rule mirrors the Haskell
    -- Report definition of 'read' (exactly one parse, nothing but white space
    -- left over); only the failure message differs, so that a bad record can
    -- be located. The explicit signature keeps the helper usable at both Int
    -- and Double even when local bindings are not generalised.
    number :: Read a => String -> Int -> Int -> a
    number label m n =
        case [v | (v, rest) <- reads text, ("", "") <- lex rest] of
            [v] -> v
            _   -> error $
                "parseAtom: cannot read " ++ label
                    ++ " from columns " ++ show m ++ "-" ++ show n
                    ++ " (" ++ show text ++ ") of record: "
                    ++ B.unpack record
      where
        text = B.unpack (field m n)


-- | 'True' exactly when the line starts with the four characters @ATOM@.
isAtom :: ByteString -> Bool
isAtom = B.isPrefixOf (B.pack "ATOM")

-- | 'True' exactly when the line starts with the six characters @HETATM@.
isHETATM :: ByteString -> Bool
isHETATM = B.isPrefixOf (B.pack "HETATM")


-- | Read the PDB file at the given path and parse its coordinate records.
--
-- The first component holds the @ATOM@ records grouped into chains by
-- 'splitChains'; the second holds the @HETATM@ records as a flat list.
-- Lines that are neither are ignored.
parse :: FilePath -> IO ([Protein],[Atom])
parse pdb = do
    contents <- B.readFile pdb
    let records     = B.lines contents
        proteinAtms = [parseAtom l | l <- records, isAtom l]
        heteroAtms  = [parseAtom l | l <- records, isHETATM l]
    return (splitChains proteinAtms, heteroAtms)

-- | Read the PDB file at the given path and return only the atoms parsed
-- from its @HETATM@ records, in file order.
parseCofactorOnly :: FilePath -> IO [Atom]
parseCofactorOnly pdb = fmap heteroAtoms (B.readFile pdb)
  where
    -- Keep the HETATM lines and parse each one.
    heteroAtoms contents = [parseAtom l | l <- B.lines contents, isHETATM l]

-- | Read the PDB file at the given path and return only the chains built
-- from its @ATOM@ records (see 'splitChains').
parseProteinOnly :: FilePath -> IO [Protein]
parseProteinOnly pdb = fmap toChains (B.readFile pdb)
  where
    -- Lines -> ATOM lines -> atoms -> chains.
    toChains = splitChains . map parseAtom . filter isAtom . B.lines

-- | Group a list of atoms into chains.
--
-- Each 'Protein' holds one maximal run of consecutive atoms whose 'chain'
-- value equals that of the first atom of the run. Atoms are never reordered,
-- so a chain identifier that reappears later in the list starts a new
-- 'Protein'. The empty list yields no chains.
splitChains :: [Atom] -> [Protein]
splitChains [] = []
splitChains contents@(firstAtom : _) =
    Protein { atoms = sameChain } : splitChains others
  where
    -- span performs the takeWhile/dropWhile split in a single pass.
    (sameChain, others) = span inFirstChain contents
    inFirstChain atom   = chain firstAtom == chain atom