{-# LANGUAGE OverloadedStrings #-}

-- An enumeratee for conversion from bytestring to individual FASTA entries is
-- provided. In addition, convenience function for file- and compressed
-- file-loading are available.

module Biobase.Fasta.Import where

import Data.ByteString.Char8 as BS
import Data.Iteratee.Iteratee as I
import Data.Iteratee.ListLike as I
import Data.Iteratee.Char as I
import Data.Iteratee.IO as I
import Data.Iteratee.ZLib
import Prelude as P
import Data.Monoid
import Data.List as L

import Biobase.Fasta



-- | This is the type of the conversion function from FASTA data to the data
-- 'z'. Make certain that all input is used strictly! BangPatterns are the
-- easiest to do. In order, the function expects the current FASTA header, then
-- a data segment, and finally the starting position of the data segment within
-- the full FASTA data.
--
-- If you need the conversion to run in constant time, do not use the
-- convenience functions and replace the final conversion to a strict stream by
-- your own conversion (or output) function.

type FastaFunction z
  = FastaHeader     -- ^ the ">" header
  -> StartPos       -- ^ where in the original sequence to start
  -> WindowSize     -- ^ how many characters we are looking at
  -> PeekSize       -- ^ this many characters are from the next window (peeking into)
  -> FastaData      -- ^ the actual sequence data
  -> z              -- ^ and what we return as result

-- | Starting position in FASTA entry.

type StartPos = Int

-- | Current header (the line starting with '>')

type FastaHeader = ByteString

-- | FASTA data

type FastaData = ByteString

-- | Window

type WindowSize = Int

-- | How many characters to peek forward

type PeekSize = Int


-- * conversion from FASTA to data of type 'z'.

-- | Takes a bytestring sequence, applies 'f' to each bytestring of windowsize
-- and returns the results z.

rollingIter
  :: (Monad m, Functor m, Nullable z, Monoid z)
  => (StartPos -> WindowSize -> PeekSize -> FastaData -> z)
  -> WindowSize
  -> PeekSize
  -> Enumeratee ByteString z m a
rollingIter f windowSize peekSize = unfoldConvStream go 0 where
  go start = do
    yss <- roll windowSize (windowSize+peekSize)
    case yss of
      [ys] -> do let xs = BS.filter (/='\n') ys
                 let l = BS.length xs
                 return $ (start + l, f start windowSize peekSize xs)
      _ -> error "rollingIter: error"
{-# INLINE rollingIter #-}

-- | Outer enumeratee. See the two convenience functions for how to use it
-- (just like any enumeratee, basically).
--
-- The fasta function 'f' manipulates small stretches of fasta data and has
-- arguments: fasta header, fasta data, start position (all filled by
-- eneeFasta).
--
-- Next we have the window size, how many characters to read at once,
--
-- followed by the the number of characters to read in addition.
--
-- The work is actually done by 'rollingIter'.

eneeFasta
  :: (Monad m, Functor m, Nullable z, NullPoint z, Monoid z)
  => FastaFunction z
  -> WindowSize
  -> PeekSize
  -> Enumeratee ByteString z m a
eneeFasta f windowSize peekSize = unfoldConvStream go "" where
  go hdr = do
    hdr <- I.takeWhile (/=10) -- 10 == '\n'
    is <- joinI
            $   I.breakE (==62) -- 62 == '>'
            ><> rollingIter (f hdr) windowSize peekSize
            $   stream2stream
    return (hdr, is)
{-# INLINE eneeFasta #-}



-- * Convenience functions: final data is returned strictly.

-- | From an uncompressed file.

fromFile :: (Monoid z, Nullable z) => FastaFunction z -> Int -> Int -> FilePath -> IO z
fromFile ff windowSize peekSize fp
  = run =<< ( enumFile 8192 fp
            . joinI
            . eneeFasta ff windowSize peekSize
            $ stream2stream
            )
{-# INLINE fromFile #-}

-- | From a gzip-compressed file.

fromFileZip :: (Monoid z, Nullable z) => FastaFunction z -> Int -> Int -> FilePath -> IO z
fromFileZip ff windowSize peekSize fp
  = run =<< ( enumFile 8192 fp
            . joinI
            . enumInflate GZipOrZlib defaultDecompressParams
            . joinI
            . eneeFasta ff windowSize peekSize
            $ stream2stream
            )
{-# INLINE fromFileZip #-}