{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

-- | The importer for MAF files.
--
-- The importer is in enumerator form. In case of large alignments, move
-- computations into an iteratee.

module Biobase.MAF.Import where

import Control.Arrow
import Control.Monad
import Data.ByteString.Char8 as BS
import Data.Either as E
import Data.Iteratee as I
import Data.Iteratee.Char as C
import Data.Iteratee.IO
import Data.Iteratee.ListLike as LL
import Data.List as L
import Data.Map as M
import Data.Maybe
import Prelude as P

import Biobase.MAF



-- | Transforms a ByteString stream into a stream of "Either MAF Alignment".

eneeMAF :: (Monad m) => Enumeratee BS.ByteString [Either MAF Alignment] m a
eneeMAF = enumLinesBS ><> convStream f where
  f = icont step Nothing
  step (Chunk []) = f
  step (Chunk xs@(x:_))
    | BS.isPrefixOf "#" x = icont (mh xs) Nothing
    | otherwise           = icont (md xs) Nothing
  step str = idone [] str
  -- create the header
  mh xs (Chunk ys)
    | P.null ts = icont (mh hs) Nothing
    | otherwise = idone [mkMAF hs] (Chunk ts)
    where (hs,ts) = P.span (BS.isPrefixOf "#") $ xs++ys
  mh xs str = idone [] str -- TODO error if xs not null?
  -- create each data block
  md xs (Chunk ys)
    | P.length zs == 1 = icont (md (xs++ys)) Nothing
    | otherwise = idone (P.map mkAlignment $ P.init zs) (Chunk $ P.last zs)
    where zs = L.groupBy (\a b -> not $ BS.isPrefixOf "a" b)
             . L.filter (not . BS.isPrefixOf "#")
             . L.filter (not . BS.null)
             $ xs++ys
  md xs str = idone [mkAlignment xs] str
  mkMAF [] = error "eneeMAF: empty stream or no header"
  mkMAF xs = Left $ MAF (mkKVs $ P.head xs) (P.map BS.copy $ P.tail xs) [] []
  mkAlignment xs = Right $ Alignment (mkKVs $ P.head xs) (P.map mkAligned $ P.tail xs)
  mkKVs = M.fromList . P.map mkKV . P.drop 1 . BS.words
  mkKV = (BS.copy *** BS.copy . BS.drop 1) . BS.span (/='=')
  mkAligned x
    | P.length ws == 7 = Aligned
      { key = BS.copy $ ws!!1
      , start = read . BS.unpack $ ws!!2
      , length = read . BS.unpack $ ws!!3
      , strand = BS.head $ ws!!4
      , genomesize = read . BS.unpack $ ws!!5
      , value = BS.copy $ ws!!6
      }
    | otherwise = error $ "couldn't mkAligned from string: " ++ BS.unpack x
    where
        ws = BS.words x

-- | Very simple stream transformer that will remove the payload from alignment
-- blocks. The positional information is keps, only the aligned sequence data
-- is removed. For applications which do not requice the aligned sequences,
-- this can yield enormous space improvements.

eneeRemovePayload = mapStream f where
  f l@(Left _) = l
  f (Right x)  = Right $ x{sequences = P.map g $ sequences x}
  g a@Aligned{..} = a{value = BS.empty}

-- | Simple iteratee that builds a complete MAF out of a stream of "Either MAF
-- Alignment"

iMAF = do
  Left maf <- LL.head
  xs <- stream2list
  return maf{blocks = rights xs}