-- | Import an Rfam @taxonomy.txt@ file. Provides a simple "fromFile" -- function that produces both maps in one pass. @fromFile@ will check the -- file suffix and on @.gz@ suffixes decompress the input on-the-fly. module Biobase.SElab.Taxonomy.Import where import Codec.Compression.GZip (decompress) import Control.Applicative ((<|>)) import Control.Arrow (second) import Control.Monad import Data.Attoparsec.Text.Lazy as AT import Data.Char (isDigit) import Data.HashMap.Strict (HashMap) import Data.List (foldl') import Data.Text.Lazy.Encoding (decodeUtf8) import Data.Text.Lazy.IO as TL import Data.Text (Text) import Data.Vector (fromList) import qualified Data.ByteString.Lazy as BL import qualified Data.HashMap.Strict as HM import qualified Data.Text as T import qualified Data.Text.Lazy as TL import System.FilePath (takeExtension) import Data.String.Conversions.Monomorphic import Biobase.Types.Accession (Accession(..),Species) import Biobase.Types.Names import Biobase.Types.Taxonomy -- | Parse a single Taxon line. -- -- TODO there are unknown words at the end of each line. make those known parseTaxon :: Parser Taxon parseTaxon = do accession <- Accession <$> takeWhile1 isDigit "accession" skipSpace "1st space" species <- speciesName <$> takeWhile1 (/='\t') "species" skipSpace "2nd space" classification <- (fromList . map (,Unknown) . map fromST) <$> takeWhile1 (\z -> z/=';' && z/='.') `sepBy` "; " "classification" unknowns <- manyTill anyChar (endOfInput <|> endOfLine) return $ Taxon {..} -- | Taxonomy according to @Infernal@ stored in two hashmaps. The first -- from @Accession@ to @Taxon@, the second from species name to @Taxon@. type Taxonomy = ( HashMap (Accession Species) Taxon -- ^ find @Taxon@ via accession number , HashMap SpeciesName Taxon -- ^ find @Taxon@ via species name ) -- | Parses the taxonomy.txt file. parseTaxonomy :: Parser Taxonomy parseTaxonomy = foldl' go (HM.empty , HM.empty) <$> manyTill parseTaxon endOfInput where go (!a, !s) x = (HM.insert (accession x) x a, HM.insert (species x) x s) -- | Read @taxonomy.txt.gz@ / @taxonomy.txt@ file into structure. fromFile :: FilePath -> IO Taxonomy fromFile f = case takeExtension f of ".gz" -> (go . decodeUtf8 . decompress) <$> BL.readFile f _ -> go <$> TL.readFile f where go txt = case AT.parse parseTaxonomy txt of Done "" r -> r Done ncon r -> error $ "unconsumed input " ++ f ++ ": " ++ (TL.unpack $ TL.take 1000 ncon) Fail ncon ctx err -> error $ "error parsing " ++ f ++ ": " ++ show (ctx,err)