{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE BangPatterns #-} -- | Imports an Rfam Fasta file and provides simultaneous export to four -- different data structures for lookups. module Biobase.Infernal.RfamFasta.Import where import Control.Arrow ((***)) import Data.ByteString.Char8 as BS import Data.Iteratee as I import Data.Iteratee.Char as I import Data.Iteratee.IO as I import Data.Iteratee.ZLib as IZ import Data.Map as M import Prelude as P import Biobase.Infernal.RfamFasta import Biobase.Infernal.Types -- | Enumeratee for RfamFasta entries from a ByteString. eneeRfamFasta :: (Monad m) => Enumeratee ByteString [RfamFasta] m a eneeRfamFasta = enumLinesBS ><> convStream f where f = do th <- I.tryHead case th of Nothing -> error "huh?" Just h -> do let (ana,sps) = (BS.split ';' *** BS.split ':' . BS.dropWhile (==' ')) . BS.break (==' ') $ h fs <- I.takeWhile (\s -> ">" /= BS.take 1 s) return . (:[]) $ RfamFasta { modelAccession = ModelAccession . read . P.drop 2 . unpack $ ana!!0 , modelIdentifier = ModelIdentification $ ana!!1 , sequenceAccession = mkEmblAccession $ ana!!2 -- , speciesAC = maybe (error $ "ERROR: " ++ show (unpack $ sps!!0,unpack s)) fst . readInt $ sps!!0 , speciesAccession = SpeciesAccession . maybe (-1) fst . readInt $ sps!!0 , speciesName = SpeciesName $ sps!!1 , fastaData = StrictSeqData . BS.copy . BS.concat $ fs } -- * In-memory lookup -- | Create a mapping between rfam family accession numbers and rfam family -- names. iModelAC2ID :: (Monad m) => Iteratee [RfamFasta] m ModelAC2ID iModelAC2ID = I.foldl' f M.empty where f !m x = insertWith' const (modelAccession x) (modelIdentifier x) m -- | Create a mapping between rfam family names and rfam family accession -- numbers. iModelID2AC :: (Monad m) => Iteratee [RfamFasta] m ModelID2AC iModelID2AC = I.foldl' f M.empty where f !m x = insertWith' const (modelIdentifier x) (modelAccession x) m -- | Provides a mapping between (Rfam accession, sequence accession) and the -- complete 'RfamFasta'. iACAC2RfamFasta :: (Monad m) => Iteratee [RfamFasta] m ACAC2RfamFasta iACAC2RfamFasta = I.foldl' f M.empty where f !m x = insertWith' union (modelAccession x) (M.singleton (sequenceAccession x) x) m -- | Provides a mapping between (Rfam name, sequence accession) and the complete -- 'RfamFasta'. iIDAC2RfamFasta :: (Monad m) => Iteratee [RfamFasta] m IDAC2RfamFasta iIDAC2RfamFasta = I.foldl' f M.empty where f !m x = insertWith' union (modelIdentifier x) (M.singleton (sequenceAccession x) x) m -- * File reading. -- | Convenience function creating all maps. fromFileZip :: FilePath -> IO (ModelAC2ID, ModelID2AC, ACAC2RfamFasta, IDAC2RfamFasta) fromFileZip fp = run =<< ( enumFile 8192 fp . joinI . enumInflate GZipOrZlib defaultDecompressParams . joinI . eneeRfamFasta $ I.zip4 iModelAC2ID iModelID2AC iACAC2RfamFasta iIDAC2RfamFasta ) -- | Convenience function creating all maps. fromFile :: FilePath -> IO (ModelAC2ID, ModelID2AC, ACAC2RfamFasta, IDAC2RfamFasta) fromFile fp = run =<< ( enumFile 8192 fp . joinI . eneeRfamFasta $ I.zip4 iModelAC2ID iModelID2AC iACAC2RfamFasta iIDAC2RfamFasta )