{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE BangPatterns #-}

-- | Iteratee-based importer. Provides a simple "fromFile" function that
-- produces both maps in one pass.

module Biobase.SElab.Taxonomy.Import where

import Control.Applicative
import Control.Lens
import Data.Attoparsec as A hiding (parse)
import Data.Attoparsec.Char8 (char,decimal)
import Data.ByteString.Char8 as BS
import Data.Conduit as C
import Data.Conduit.Attoparsec
import Data.Conduit.Binary as CB
import Data.Conduit.List as CL
import Data.Conduit.Util as C
import Data.Either.Unwrap as E
import Data.List as L
import Data.Map as M
import qualified Data.Attoparsec.ByteString as AB hiding (parse)
import qualified Data.Attoparsec.Char8 as A8

import Biobase.SElab.Taxonomy
import Biobase.SElab.Types



parse =  CB.lines
      =$ CL.map (parseOnly mkTaxonomy)
      =$ CL.filter isRight
      =$ CL.map fromRight
      =$ C.zipSinks mapIdTaxonomy mapAcTaxonomy
{-# INLINE parse #-}

mkTaxonomy :: Parser Taxonomy
mkTaxonomy = f <$> ptaxid <* tab <*> pname <* tab <*> takeByteString where
  f k n xs = let
               cs = L.map (Classification . copy . BS.dropWhile (==' ')) . BS.split ';' . BS.init $ xs
             in Taxonomy (ACC k) (IDD $ copy n) cs
  ptaxid   = decimal
  pname    = A8.takeWhile (/='\t')
  tab      = char '\t'
{-# INLINE mkTaxonomy #-}

mapIdTaxonomy :: Monad m => GSink Taxonomy m (M.Map (Identification Species) Taxonomy)
mapIdTaxonomy = CL.fold f M.empty where
  f !mp x = M.insert (x ^. name) x mp
{-# INLINE mapIdTaxonomy #-}

mapAcTaxonomy :: Monad m => GSink Taxonomy m (M.Map (Accession Species) Taxonomy)
mapAcTaxonomy = CL.fold f M.empty where
  f !mp x = M.insert (x ^. accession) x mp
{-# INLINE mapAcTaxonomy #-}

fromFile :: String -> IO ( Map (Identification Species) Taxonomy
                         , Map (Accession Species) Taxonomy
                         )
fromFile fname = do
  runResourceT $ CB.sourceFile fname $$ parse
{-# NOINLINE fromFile #-}