module Biobase.Infernal.Taxonomy.Import where
import Control.Applicative
import Data.Attoparsec as A
import Data.Attoparsec.Char8 as A8
import Data.Attoparsec.Iteratee
import Data.ByteString.Char8 as BS
import Data.Either.Unwrap as E
import Data.Iteratee as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.Iteratee.ListLike as I
import Data.List as L
import Data.Map as M
import Biobase.Infernal.Taxonomy
import Biobase.Infernal.Types
-- | Iteratee that consumes a stream of 'SpeciesTaxonomy' entries and
-- collects them into a map keyed by species name ('stName').
--
-- Entries are added with 'M.insert', so if two entries share the same
-- name the one that arrives later in the stream replaces the earlier one.
iSpeciesMap :: Monad m => Iteratee [SpeciesTaxonomy] m (M.Map SpeciesName SpeciesTaxonomy)
iSpeciesMap =
  I.foldl'
    -- Force the accumulated map before extending it so no chain of
    -- pending inserts builds up while streaming.
    (\acc sp -> acc `seq` M.insert (stName sp) sp acc)
    M.empty
-- | Iteratee that consumes a stream of 'SpeciesTaxonomy' entries and
-- collects them into a map keyed by accession / taxon id ('stAccession').
--
-- Entries are added with 'M.insert', so a later entry with the same
-- accession overwrites an earlier one.
iTaxIdMap :: Monad m => Iteratee [SpeciesTaxonomy] m (M.Map SpeciesAccession SpeciesTaxonomy)
iTaxIdMap = I.foldl' step M.empty
  where
    -- Evaluate the map built so far, then add the entry under its accession.
    step soFar entry = soFar `seq` M.insert (stAccession entry) entry soFar
-- | Enumeratee turning a raw 'ByteString' stream into parse results, one
-- per input line.
--
-- The input is first split into lines ('enumLinesBS'); every line is then
-- run through the 'mkSpecies' parser. A line that parses yields
-- @Right taxonomy@, a line that does not yields @Left message@. Nothing is
-- dropped here; filtering is left to the consumer.
eneeSpecies :: Monad m => Enumeratee ByteString [Either String SpeciesTaxonomy] m a
eneeSpecies = enumLinesBS ><> mapStream parseLine
  where
    -- Parse one complete line with the species parser.
    parseLine = parseOnly mkSpecies
-- | Parser for a single line of the taxonomy table.
--
-- Expected layout: a decimal taxon id, a tab, the species name (everything
-- up to the next tab), a tab, and the rest of the line as the
-- classification string.
--
-- The classification string has its final character removed (presumably a
-- terminating @.@ -- confirm against the input format), is split on @;@,
-- and each piece has its leading spaces stripped. An empty classification
-- field yields an empty classification list; previously the unconditional
-- 'BS.init' would raise an exception once the list was forced.
--
-- The name and every classification piece are passed through 'copy' so
-- that the stored values are fresh strings rather than slices of the
-- input buffer.
mkSpecies :: Parser SpeciesTaxonomy
mkSpecies =
  build <$> decimal <* tab <*> A8.takeWhile (/= '\t') <* tab <*> takeByteString
  where
    tab = char '\t'

    -- Assemble the record from the three parsed fields.
    build taxId name rest =
      SpeciesTaxonomy (SpeciesAccession taxId) (SpeciesName $ copy name) (classifications rest)

    -- Split the classification field into its components.
    classifications rest
      -- 'BS.init' is partial: guard the empty field explicitly. The result
      -- matches what a one-character field produces (split of "" is []).
      | BS.null rest = []
      | otherwise =
          L.map (Classification . copy . BS.dropWhile (== ' '))
            . BS.split ';'
            . BS.init
            $ rest
-- | Read a taxonomy table from the given file and build two lookup maps
-- from it: one keyed by species name and one keyed by accession.
--
-- The file is enumerated with a buffer size of 8192 and both maps are
-- filled from the same stream via 'I.zip'. Lines that fail to parse are
-- discarded without any report; only successfully parsed entries reach
-- the maps.
fromFile :: FilePath -> IO (M.Map SpeciesName SpeciesTaxonomy, M.Map SpeciesAccession SpeciesTaxonomy)
fromFile fp = enumFile 8192 fp (joinI (parsedSpecies bothMaps)) >>= run
  where
    -- Parse each line, keep only the successes, and unwrap them.
    parsedSpecies = eneeSpecies ><> I.filter isRight ><> mapStream fromRight
    -- Feed every parsed entry to both map builders at once.
    bothMaps = I.zip iSpeciesMap iTaxIdMap