{-# LANGUAGE BangPatterns #-}

-- | Iteratee-based importer. Provides a simple "fromFile" function that
-- produces both maps in one pass.

module Biobase.Infernal.Taxonomy.Import where

import Control.Applicative
import Data.Attoparsec as A
import Data.Attoparsec.Char8 as A8
import Data.Attoparsec.Iteratee
import Data.ByteString.Char8 as BS
import Data.Either.Unwrap as E
import Data.Iteratee as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.Iteratee.ListLike as I
import Data.List as L
import Data.Map as M

import Biobase.Infernal.Taxonomy



-- | Name-based lookup table, the most common usage scenario. The incoming
-- stream of 'Species' is folded into a map keyed by the species 'name'; if a
-- name occurs more than once, the entry seen last replaces the earlier one.
--
-- TODO there are 9 duplicates in the names, let's find them and see what is
-- going on

iSpeciesMap :: Monad m => Iteratee [Species] m (M.Map ByteString Species)
iSpeciesMap = I.foldl' addByName M.empty where
  -- force the map built so far before inserting the next species
  addByName acc sp = acc `seq` M.insert (name sp) sp acc

-- | Lookup table keyed by taxon id. The incoming stream of 'Species' is
-- folded into a map from 'taxid' to the species; if a taxon id occurs more
-- than once, the entry seen last replaces the earlier one.

iTaxIdMap :: Monad m => Iteratee [Species] m (M.Map Int Species)
iTaxIdMap = I.foldl' addByTaxId M.empty where
  -- force the map built so far before inserting the next species
  addByTaxId acc sp = acc `seq` M.insert (taxid sp) sp acc

-- | Imports taxonomy data: the raw 'ByteString' stream is split into lines
-- and each line is run through 'mkSpecies'. A line that fails to parse shows
-- up as a 'Left' carrying the parser's error message, a successfully parsed
-- line as a 'Right' carrying the 'Species'.

eneeSpecies :: Monad m => Enumeratee ByteString [Either String Species] m a
eneeSpecies = enumLinesBS ><> mapStream parseLine where
  -- parse exactly one line of the taxonomy file
  parseLine :: ByteString -> Either String Species
  parseLine = parseOnly mkSpecies

-- | Parser turning one line of the taxonomy file into a 'Species' entry.
--
-- NOTE The taxonomy format is, for each species, a line consisting of: taxid -
-- tab - species name - tab - semicolon separated list of classification names
-- - dot - end of line.
--
-- The terminating dot is removed only when it is actually present. An empty
-- classification field therefore yields an empty list instead of hitting the
-- partial 'BS.init', and a line lacking the final dot keeps the last
-- character of its last classification name. Leading spaces are dropped from
-- every classification name. The name and the classification entries are
-- 'copy'ed so that they get their own storage instead of sharing the input
-- buffer.

mkSpecies :: Parser Species
mkSpecies = f <$> ptaxid <* tab <*> pname <* tab <*> takeByteString where
  f k n xs = let
               cs = L.map (copy . BS.dropWhile (==' ')) . BS.split ';' . dropDot $ xs
             in Species (copy n) cs k
  -- strip the end-of-record dot, but never fail on empty or dot-less input
  dropDot bs
    | BS.null bs        = bs
    | BS.last bs == '.' = BS.init bs
    | otherwise         = bs
  ptaxid   = decimal
  pname    = A8.takeWhile (/='\t')
  tab      = char '\t'

-- | Convenience function: given a taxonomy file, produce both maps
-- simultaneously. The file is read in chunks of 8192 bytes and traversed only
-- once; lines that do not parse as a species are silently skipped. The result
-- is the pair (map by species name, map by taxon id).

fromFile :: FilePath -> IO (M.Map ByteString Species, M.Map Int Species)
fromFile fp = enumFile chunkSize fp bothMaps >>= run where
  chunkSize :: Int
  chunkSize = 8192
  -- parse lines, keep only the successful parses, feed them to both folds
  bothMaps :: Iteratee ByteString IO (M.Map ByteString Species, M.Map Int Species)
  bothMaps = joinI
    $ (eneeSpecies ><> I.filter isRight ><> mapStream fromRight)
        (I.zip iSpeciesMap iTaxIdMap)

-- * Testing

{-
test :: IO ()
test = do
  (s,t) <- fromFile "/home/choener/tmp/taxonomy"
  print $ M.size s
  print $ M.size t
  return ()
-}