module Biobase.Infernal.RfamFasta.Import where
import Control.Arrow ((***))
import Data.ByteString.Char8 as BS
import Data.Iteratee as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.Iteratee.ZLib as IZ
import Data.Map as M
import Prelude as P
import Biobase.Infernal.RfamFasta
import Biobase.Infernal.Types
-- | Convert a stream of bytes into a stream of 'RfamFasta' records.
--
-- The input is first cut into lines with 'enumLinesBS'. Each record is read
-- as one header line followed by every subsequent line that does not start
-- with @>@; those lines are concatenated into 'fastaData'.
--
-- The header is split at its first space. The part before the space is split
-- on @;@ into model accession, model identifier and sequence accession. The
-- part after the space (leading spaces removed) is split on @:@ into species
-- accession and species name. A species accession that 'readInt' cannot
-- parse becomes @1@.
--
-- A header that lacks one of these fields, or whose model accession is not
-- readable, raises an 'error' naming the missing field and quoting the header
-- line. The error is raised when the affected field is evaluated.
eneeRfamFasta :: (Monad m) => Enumeratee ByteString [RfamFasta] m a
eneeRfamFasta = enumLinesBS ><> convStream f where
  f = do
    th <- I.tryHead
    case th of
      -- NOTE(review): presumably unreachable, assuming 'convStream' checks for
      -- end of stream before running the converter - confirm.
      Nothing -> error "eneeRfamFasta: unexpected end of input while expecting a FASTA header line"
      Just h -> do
        let (ana,sps) = (BS.split ';' *** BS.split ':' . BS.dropWhile (==' ')) . BS.break (==' ') $ h
        fs <- I.takeWhile (\s -> ">" /= BS.take 1 s)
        return . (:[]) $ RfamFasta
          -- NOTE(review): the first two characters of the first header field
          -- are dropped before the number is parsed - confirm this matches
          -- the prefix actually present in the input files.
          { modelAccession    = ModelAccession . parseNumber h "model accession" . P.drop 2 . unpack $ field h "model accession" 0 ana
          , modelIdentifier   = ModelIdentification $ field h "model identifier" 1 ana
          , sequenceAccession = mkEmblAccession $ field h "sequence accession" 2 ana
          , speciesAccession  = SpeciesAccession . maybe (1) fst . readInt $ field h "species accession" 0 sps
          , speciesName       = SpeciesName $ field h "species name" 1 sps
          -- 'BS.copy' gives the record its own buffer instead of a slice of
          -- the input chunk.
          , fastaData         = StrictSeqData . BS.copy . BS.concat $ fs
          }

  -- Abort with a message that names the problem and quotes the header line.
  malformed :: ByteString -> String -> b
  malformed hdr what = error $ "eneeRfamFasta: " ++ what ++ " in header line " ++ P.show hdr

  -- Total replacement for '!!': the i-th element, or a descriptive error.
  field :: ByteString -> String -> Int -> [ByteString] -> ByteString
  field hdr what i xs = case P.drop i xs of
    (x:_) -> x
    []    -> malformed hdr ("missing " ++ what)

  -- Accepts exactly the inputs 'read' accepts (one complete parse, optional
  -- trailing white space) but fails with a descriptive error otherwise.
  parseNumber :: (Read b) => ByteString -> String -> String -> b
  parseNumber hdr what s = case [v | (v,rest) <- P.reads s, ("","") <- P.lex rest] of
    [v] -> v
    _   -> malformed hdr ("unreadable " ++ what ++ " " ++ P.show s)
-- | Fold a stream of 'RfamFasta' records into a map keyed by model accession
-- whose values are model identifiers. If an accession occurs more than once,
-- the identifier of the most recently seen record is kept.
iModelAC2ID :: (Monad m) => Iteratee [RfamFasta] m ModelAC2ID
iModelAC2ID = I.foldl' step M.empty
  where
    -- Force the accumulated map before inserting into it.
    step acc entry =
      acc `seq` M.insertWith' keepNew (modelAccession entry) (modelIdentifier entry) acc
    keepNew new _old = new
-- | Fold a stream of 'RfamFasta' records into a map keyed by model identifier
-- whose values are model accessions. If an identifier occurs more than once,
-- the accession of the most recently seen record is kept.
iModelID2AC :: (Monad m) => Iteratee [RfamFasta] m ModelID2AC
iModelID2AC = I.foldl' step M.empty
  where
    -- Force the accumulated map before inserting into it.
    step acc entry =
      acc `seq` M.insertWith' keepNew (modelIdentifier entry) (modelAccession entry) acc
    keepNew new _old = new
-- | Fold a stream of 'RfamFasta' records into a two-level map: the outer key
-- is the model accession, the inner key the sequence accession, and the
-- value the record itself. Inner maps for the same model are merged with
-- 'M.union', the newly read record on the left, so it wins on a duplicate
-- sequence accession.
iACAC2RfamFasta :: (Monad m) => Iteratee [RfamFasta] m ACAC2RfamFasta
iACAC2RfamFasta = I.foldl' step M.empty
  where
    -- Force the accumulated map before inserting into it.
    step acc entry =
      let bySequence = M.singleton (sequenceAccession entry) entry
      in  acc `seq` M.insertWith' M.union (modelAccession entry) bySequence acc
-- | Fold a stream of 'RfamFasta' records into a two-level map: the outer key
-- is the model identifier, the inner key the sequence accession, and the
-- value the record itself. Inner maps for the same model are merged with
-- 'M.union', the newly read record on the left, so it wins on a duplicate
-- sequence accession.
iIDAC2RfamFasta :: (Monad m) => Iteratee [RfamFasta] m IDAC2RfamFasta
iIDAC2RfamFasta = I.foldl' step M.empty
  where
    -- Force the accumulated map before inserting into it.
    step acc entry =
      let bySequence = M.singleton (sequenceAccession entry) entry
      in  acc `seq` M.insertWith' M.union (modelIdentifier entry) bySequence acc
-- | Read a compressed Rfam FASTA file and build all four lookup maps in a
-- single pass. The file contents are passed through 'enumInflate' with the
-- 'GZipOrZlib' format and default decompression parameters before being
-- parsed by 'eneeRfamFasta'.
fromFileZip :: FilePath -> IO (ModelAC2ID, ModelID2AC, ACAC2RfamFasta, IDAC2RfamFasta)
fromFileZip fp = do
  let -- All four folds consume the same record stream side by side.
      sink     = I.zip4 iModelAC2ID iModelID2AC iACAC2RfamFasta iIDAC2RfamFasta
      records  = joinI (eneeRfamFasta sink)
      inflated = joinI (enumInflate GZipOrZlib defaultDecompressParams records)
  fed <- enumFile 8192 fp inflated
  run fed
-- | Read an uncompressed Rfam FASTA file and build all four lookup maps in a
-- single pass: the file is enumerated into 'eneeRfamFasta' and the resulting
-- records are fed to the four map-building iteratees at once.
fromFile :: FilePath -> IO (ModelAC2ID, ModelID2AC, ACAC2RfamFasta, IDAC2RfamFasta)
fromFile fp = do
  let -- All four folds consume the same record stream side by side.
      sink    = I.zip4 iModelAC2ID iModelID2AC iACAC2RfamFasta iIDAC2RfamFasta
      records = joinI (eneeRfamFasta sink)
  fed <- enumFile 8192 fp records
  run fed