{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}

-- | Parses GFF3

module Biobase.GFF3.Import (gff3FromFile,
                             parseGFF3s,
                            ) where

import Prelude hiding (takeWhile)
import Data.Attoparsec.ByteString.Char8 hiding (isSpace)
import qualified Data.Attoparsec.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Builder as S
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Vector as V
import System.Directory
import Data.Char
import Control.Monad
import Debug.Trace
import Text.Printf
import Biobase.GFF3.Types
import qualified Data.Word8 as W

-- | Reads the file at @filePath@ and parses its contents with 'parseGFF3s'.
--
-- A progress line is printed to stdout before the file is touched. The file
-- is read lazily, so the returned list is produced by 'parseGFF3s' on demand.
-- If @filePath@ does not name an existing file the action fails via 'fail'
-- with a message that contains the offending path.
gff3FromFile :: String -> IO [GFF3]
gff3FromFile filePath = do
  printf "# reading GFF3 from file %s\n" filePath
  fileExists <- doesFileExist filePath
  if fileExists
     then parseGFF3s <$> B.readFile filePath
     -- Format the message with printf; plain (++) would leave the "%s"
     -- placeholder in the text and glue the path on after the newline.
     else fail (printf "# GFF3 file \"%s\" does not exist\n" filePath)

-- | Turns a lazy bytestring into a lazily produced list of 'GFF3' values.
--
-- 'genParseGFF3' is run repeatedly, each time on whatever input the previous
-- run left over, until nothing remains. A parse failure is raised with
-- 'error'; because the list is built lazily, elements parsed before the
-- failing position may already have been handed to the consumer.
parseGFF3s :: B.ByteString -> [GFF3]
parseGFF3s input =
  case L.parse genParseGFF3 input of
    L.Done rest record
      | B.null rest -> [record]
      | otherwise   -> record : parseGFF3s rest
    L.Fail rest contexts message ->
      error $ concat
        [ "parseGFF3s failed! "
        , message
        , " ctxt: "
        , show contexts
        , " head of remaining input: "
        , B.unpack (B.take 1000 rest)
        ]

-- | Parses one run of GFF3 entries.
--
-- Leading @#@ lines are skipped, then one or more entries are required.
-- The entries are packed into a vector; the second 'GFF3' field is filled
-- with an empty bytestring.
genParseGFF3 :: Parser GFF3
genParseGFF3 = leadingComments *> (wrap <$> entries)
  where
    leadingComments = skipMany (try genParseGFF3Comment)
    entries         = many1 (try genParseGFF3Entry) <?> "GFF3 entry"
    wrap parsed     = GFF3 (V.fromList parsed) B.empty

-- | Consumes one line that starts with @#@, together with its terminator.
--
-- The text of the line is discarded; the parser always yields the empty
-- string, so it is only useful for skipping.
genParseGFF3Comment :: Parser String
genParseGFF3Comment = do
  _ <- char '#'
  -- 'takeWhile' (not 'takeWhile1'): a line consisting of just "#" has no
  -- text after the marker and must still be skipped.
  _ <- takeWhile (/= '\n')
  -- Accept end of input as a terminator so a final line without a trailing
  -- newline is skipped as well.
  choice [endOfLine, endOfInput]
  return ""

-- | Parses a single nine-column, tab-separated GFF3 feature line.
--
-- Columns, in order: seqid, source, type, start, end, score, strand, phase
-- and attributes. Start and end are parsed with 'decimal'; strand must be
-- one of @+@, @-@ or @.@; the remaining text columns are kept as raw bytes
-- and must be non-empty. The line may be terminated either by a line break
-- or by the end of the input, so the last record of a file that lacks a
-- trailing newline is still accepted. Any @#@ lines that directly follow
-- the entry are skipped.
genParseGFF3Entry :: Parser GFF3Entry
genParseGFF3Entry = do
  seqid       <- textColumn "seqid"
  source      <- textColumn "source"
  featureType <- textColumn "type"
  start       <- (decimal <?> "start") <* tab
  end         <- (decimal <?> "end") <* tab
  score       <- textColumn "score"
  strand      <- (choice [char '+', char '-', char '.'] <?> "strand") <* tab
  phase       <- textColumn "phase"
  attributes  <- genParseGFF3Attributes <?> "GFF3 attributes"
  -- A bare 'endOfLine' would reject the final record of a file that does
  -- not end with a newline.
  choice [endOfLine, endOfInput]
  skipMany (try genParseGFF3Comment)
  return $
    GFF3Entry
      seqid
      source
      featureType
      start
      end
      score
      strand
      phase
      (V.fromList attributes)
  where
    -- Column separator.
    tab = char '\t'
    -- A non-empty text column followed by its separating tab; @label@ is
    -- attached to the column text for error reporting.
    textColumn label =
      B.fromStrict <$> (takeWhile1 (/= '\t') <?> label) <* tab

-- | Parses the attributes column: everything up to (not including) the next
-- newline, split on @;@.
--
-- The line terminator itself is left in the input for the caller. A single
-- trailing carriage return is dropped so that files with CRLF line endings
-- do not leak @\\r@ into the last attribute.
genParseGFF3Attributes :: Parser [B.ByteString]
genParseGFF3Attributes = do
  rawColumn <- takeTill (== '\n') <?> "attributes"
  let column = case C.unsnoc rawColumn of
        Just (body, '\r') -> body
        _                 -> rawColumn
  return $ map B.fromStrict (C.split ';' column)

-- | Converts a strict bytestring into a lazy one by way of a
-- 'S.Builder'.
toLB :: C.ByteString -> B.ByteString
toLB strictBytes = S.toLazyByteString (S.byteString strictBytes)