module Biobase.SElab.Taxonomy.Import where
import Control.Applicative
import Control.Lens
import Data.Attoparsec as A hiding (parse)
import Data.Attoparsec.Char8 (char,decimal)
import Data.ByteString.Char8 as BS
import Data.Conduit as C
import Data.Conduit.Attoparsec
import Data.Conduit.Binary as CB
import Data.Conduit.List as CL
import Data.Conduit.Util as C
import Data.Either.Unwrap as E
import Data.List as L
import Data.Map as M
import qualified Data.Attoparsec.ByteString as AB hiding (parse)
import qualified Data.Attoparsec.Char8 as A8
import Biobase.SElab.Taxonomy
import Biobase.SElab.Types
-- | Sink that consumes the raw bytes of a taxonomy file and produces two
-- lookup maps in one pass.
--
-- The input is split into lines and each line is run through 'mkTaxonomy'.
-- Lines on which the parser fails are dropped without any report; every
-- successfully parsed record is fed to both 'mapIdTaxonomy' and
-- 'mapAcTaxonomy', whose results are returned as a pair.
parse =
  CB.lines
    =$ CL.concatMap (either (const []) (: []) . parseOnly mkTaxonomy)
    =$ C.zipSinks mapIdTaxonomy mapAcTaxonomy
-- | Parser for a single tab-separated taxonomy record of the form
-- @\<decimal id\> TAB \<name\> TAB \<classification\>@.
--
-- The decimal id is wrapped in 'ACC' and the name in 'IDD'. Everything after
-- the second tab is the classification string: its final byte is discarded
-- (presumably a terminating character such as @.@ -- TODO confirm against the
-- input format), the remainder is split on @;@, and leading spaces are
-- stripped from every piece before it is wrapped in 'Classification'.
--
-- An empty classification string yields an empty classification list; the
-- previous code applied 'BS.init' unconditionally, which raises an error on
-- empty input once the list is forced.
mkTaxonomy :: Parser Taxonomy
mkTaxonomy = build <$> ptaxid <* tab <*> pname <* tab <*> takeByteString
  where
    -- 'BS.copy' gives each stored field its own buffer instead of a slice
    -- that shares storage with the input it was parsed from.
    build k n xs = Taxonomy (ACC k) (IDD $ BS.copy n) (classifications xs)

    classifications xs
      -- Guard the partial 'BS.init': nothing to split in an empty field.
      | BS.null xs = []
      | otherwise =
          L.map (Classification . BS.copy . BS.dropWhile (== ' '))
            . BS.split ';'
            . BS.init
            $ xs

    ptaxid = decimal
    pname = A8.takeWhile (/= '\t')
    tab = char '\t'
-- | Sink that folds every incoming 'Taxonomy' into a map keyed by the value
-- seen through the 'name' lens. When two records share a key, the one that
-- arrives later replaces the earlier one ('M.insert' semantics).
mapIdTaxonomy :: Monad m => GSink Taxonomy m (M.Map (Identification Species) Taxonomy)
mapIdTaxonomy = CL.fold step M.empty
  where
    -- Force the accumulated map before inserting the next record.
    step acc taxon = acc `seq` M.insert (taxon ^. name) taxon acc
-- | Sink that folds every incoming 'Taxonomy' into a map keyed by the value
-- seen through the 'accession' lens. When two records share a key, the one
-- that arrives later replaces the earlier one ('M.insert' semantics).
mapAcTaxonomy :: Monad m => GSink Taxonomy m (M.Map (Accession Species) Taxonomy)
mapAcTaxonomy = CL.fold step M.empty
  where
    -- Force the accumulated map before inserting the next record.
    step acc taxon = acc `seq` M.insert (taxon ^. accession) taxon acc
-- | Stream the file at the given path through 'parse' and return the
-- resulting pair of maps: the first keyed by identification, the second by
-- accession. The file is opened via 'CB.sourceFile' inside 'runResourceT'.
fromFile :: String -> IO ( Map (Identification Species) Taxonomy
                         , Map (Accession Species) Taxonomy
                         )
fromFile fname = runResourceT (CB.sourceFile fname $$ parse)