{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}
module Biobase.GFF3.Import (gff3FromFile,
parseGFF3s,
) where
import Prelude hiding (takeWhile)
import Data.Attoparsec.ByteString.Char8 hiding (isSpace)
import qualified Data.Attoparsec.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Builder as S
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Vector as V
import System.Directory
import Data.Char
import Control.Monad
import Debug.Trace
import Text.Printf
import Biobase.GFF3.Types
import qualified Data.Word8 as W
gff3FromFile :: String -> IO [GFF3]
gff3FromFile filePath = do
printf "# reading GFF3 from file %s\n" filePath
fileExists <- doesFileExist filePath
if fileExists
then parseGFF3s <$> B.readFile filePath
else fail ("# GFF3 file \"%s\" does not exist\n" ++ filePath)
parseGFF3s :: B.ByteString -> [GFF3]
parseGFF3s = go
where go xs = case L.parse genParseGFF3 xs of
L.Fail remainingInput ctxts err -> error $ "parseGFF3s failed! " ++ err ++ " ctxt: " ++ show ctxts ++ " head of remaining input: " ++ B.unpack (B.take 1000 remainingInput)
L.Done remainingInput btr
| B.null remainingInput -> [btr]
| otherwise -> btr : go remainingInput
genParseGFF3 :: Parser GFF3
genParseGFF3 = do
skipMany (try genParseGFF3Comment)
_entry <- many1 (try genParseGFF3Entry) <?> "GFF3 entry"
return $ GFF3 (V.fromList _entry) B.empty
genParseGFF3Comment :: Parser String
genParseGFF3Comment = do
string "#"
takeWhile1 (/= '\n')
endOfLine
return $ ""
genParseGFF3Entry :: Parser GFF3Entry
genParseGFF3Entry = do
_gff3Seqid <- takeWhile1 (/= '\t') <?> "seqid"
char '\t'
_gff3Source <- takeWhile1 (/= '\t') <?> "source"
char '\t'
_gff3Type <- takeWhile1 (/= '\t') <?> "type"
char '\t'
_gff3Start <- decimal <?> "start"
char '\t'
_gff3End <- decimal <?> "end"
char '\t'
_gff3Score <- takeWhile1 (/= '\t') <?> "score"
char '\t'
_gff3Strand <- (choice [char '+', char '-', char '.']) <?> "strand"
char '\t'
_gff3Phase <- takeWhile1 (/= '\t') <?> "phase"
char '\t'
_gff3Attributes <- genParseGFF3Attributes <?> "GFF3 attributes"
endOfLine
skipMany (try genParseGFF3Comment)
return $ GFF3Entry (B.fromStrict _gff3Seqid) (B.fromStrict _gff3Source) (B.fromStrict _gff3Type) _gff3Start _gff3End (B.fromStrict _gff3Score) _gff3Strand (B.fromStrict _gff3Phase) (V.fromList _gff3Attributes)
genParseGFF3Attributes :: Parser [B.ByteString]
genParseGFF3Attributes = do
_gff3AtributesString <- takeTill (\a -> a == '\n') <?> "attributes"
let _gff3Atributes = map B.fromStrict (C.split ';' _gff3AtributesString)
return $ _gff3Atributes
toLB :: C.ByteString -> B.ByteString
toLB = S.toLazyByteString . S.byteString