{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE NoMonomorphismRestriction #-} module Biobase.SElab.RfamNames.Import where import Control.Applicative import Control.Lens import Data.Attoparsec as A hiding (parse) import Data.Attoparsec.Char8 as A8 hiding (parse) import Data.Conduit as C import Data.Conduit.Attoparsec import Data.Conduit.Binary as CB import Data.Conduit.List as CL import Data.Conduit.Util as C import Data.Either.Unwrap as E import Data.Map (Map) import qualified Data.ByteString.Char8 as BS import qualified Data.Map as M import Biobase.SElab.RfamNames import Biobase.SElab.Types parse = CB.lines =$ CL.map (parseOnly mkRfamName) =$ CL.filter isRight =$ CL.map fromRight =$ C.zipSinks mapIdRfamNames mapAcRfamNames {-# INLINE parse #-} mkRfamName = f <$> rfamAC <* char ';' <*> rfamID <* char ';' <*> seqident <* spaces <*> specAC <* char ':' <*> specID where f rfac rfid sid spac spid = ModelNames rfac rfid spac spid rfamAC = ACC <$ string "RF" <*> decimal rfamID = IDD <$> A8.takeTill (==';') seqident = A8.takeTill isSpace specAC = (fmap (ACC . read . BS.unpack) . maybeBS) <$> A8.takeTill (==':') specID = (fmap IDD . maybeBS) <$> takeByteString spaces = many1 space maybeBS s | BS.null s = Nothing | otherwise = Just s {-# INLINE mkRfamName #-} mapIdRfamNames = CL.fold f M.empty where f !mp x = M.insertWith' (++) (x ^. modelID) [x] mp {-# INLINE mapIdRfamNames #-} mapAcRfamNames = CL.fold f M.empty where f !mp x = M.insertWith' (++) (x ^. modelAC) [x] mp {-# INLINE mapAcRfamNames #-} fromFile :: String -> IO ( Map (Identification Rfam) [ModelNames] , Map (Accession Rfam) [ModelNames] ) fromFile fname = do runResourceT $ CB.sourceFile fname $$ parse {-# NOINLINE fromFile #-}