{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

-- | Importing of FR3D data. Both "basepairs" and "near interactions" are
-- currently supported. More parsers will come if required.

module Biobase.FR3D.Import where

import Control.Arrow
import Data.ByteString.Char8 as BS
import Data.Char
import Data.Iteratee as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.Iteratee.ListLike as I
import Data.List as L
import Data.Maybe
import qualified Data.Map as M
import System.FilePath.Find as F

import Biobase.Secondary
import Biobase.FR3D



-- | An Iteratee from a bytestring to one FR3D entry. Since each file contains
-- exactly one entry, this is no problem.
--
-- The input is split into lines, which are then consumed in this order:
--
-- 1. one line that is discarded (the FR3D header),
-- 2. one line that is discarded (the sequence header),
-- 3. every following line whose first character is @#@ (one per chain),
-- 4. one line that is discarded (the basepairs header),
-- 5. all remaining lines, each handed to 'bs2basepair'.
--
-- The 'pdbid' is the first four characters of the first basepair line, or
-- the empty string if there are no basepair lines at all.

iFR3D :: (Monad m) => Iteratee ByteString m FR3D
iFR3D = joinI (enumLinesBS entry)
  where
    entry = do
      _ <- I.head -- fr3d header
      _ <- I.head -- sequence header
      chainLines <- I.break ((/= "#") . BS.take 1)
      _ <- I.head -- basepairs header
      pairLines <- stream2list -- and all basepairs
      let cs = L.map chainEntry chainLines
      return FR3D
        { pdbid = maybe "" (BS.take 4) (listToMaybe pairLines)
        , chains = cs
        , basepairs = {- L.map (fixSeqpos cs) . -} L.map bs2basepair pairLines
        }
    -- Turn one chain line into a pair: skip the first two characters, take
    -- the leading alphanumeric run as the first component, and use what
    -- follows (minus one separating character) as the second component.
    chainEntry = second (BS.drop 1) . BS.span isAlphaNum . BS.drop 2

{-
 - This would be for fixing sequence position information, but it seems that
 - FR3D does not store this info consistently...
 -
fixSeqpos :: [(ByteString,ByteString)] -> Basepair -> Basepair
fixSeqpos cs bp@Basepair{..} = bp{seqpos1 = seqpos1 - cl M.! chain1, seqpos2 = seqpos2 - cl M.! chain2} where
  cl = M.fromList . snd . L.mapAccumL f 0 $ cs
  f acc x = (acc + BS.length (snd x), (fst x, acc))
-}



-- | Helper function turning a bytestring line into a basepair entry.
--
-- The line must consist of exactly ten whitespace-separated words; any other
-- word count raises an 'error' that quotes the offending line. The first
-- word is ignored. The remaining nine are, in order: interaction type,
-- nucleotide 1, PDB number 1, chain 1, sequence position 1, nucleotide 2,
-- PDB number 2, chain 2, sequence position 2.
--
-- Numeric words that cannot be read as an integer become @-1@. Sequence
-- positions that can be read are stored as the read value minus one
-- (presumably converting from 1-based to 0-based positions -- confirm
-- against the FR3D format).

bs2basepair :: ByteString -> Basepair
bs2basepair s =
  case BS.words s of
    [_, inter, nuc1, pdb1, chn1, pos1, nuc2, pdb2, chn2, pos2] -> Basepair
      { interaction = threeChar (BS.unpack inter)
      , nucleotide1 = BS.head nuc1
      , pdbnumber1  = pdbNumber pdb1
      , chain1      = chn1
      , seqpos1     = seqPosition pos1
      , nucleotide2 = BS.head nuc2
      , pdbnumber2  = pdbNumber pdb2
      , chain2      = chn2
      , seqpos2     = seqPosition pos2
      }
    _ -> error $ "can't parse line: " ++ BS.unpack s
  where
    -- Leading integer of the word, or -1 if there is none.
    pdbNumber w = maybe (-1) fst (BS.readInt w)
    -- Leading integer of the word minus one, or -1 if there is none.
    seqPosition w = maybe (-1) (subtract 1 . fst) (BS.readInt w)



-- | Convenience function: given a file name glob pattern and a directory
-- name, extracts a list of all FR3D entries.
--
-- The directory is searched (always recursing into subdirectories) for files
-- whose file name matches the glob @select@; every match is parsed with
-- 'iFR3D'. A malformed basepair line surfaces as the 'error' raised by
-- 'bs2basepair'.

fromDirSelect :: String -> FilePath -> IO [FR3D]
fromDirSelect select fp = do
  matches <- F.find always (fileName ~~? select) fp
  mapM parseFile matches
  where
    -- Buffer size handed to 'enumFile'.
    bufferSize = 8192
    parseFile :: FilePath -> IO FR3D
    parseFile path = run =<< enumFile bufferSize path iFR3D

-- | This one selects the "near interactions".

fromDirNear :: FilePath -> IO [FR3D]
fromDirNear = fromDirSelect "*near_interactions_FR3D.txt"

-- | And this one the "basepairs" (this one you normally want).

fromDir :: FilePath -> IO [FR3D]
fromDir = fromDirSelect "*basepairs_FR3D.txt"