{-# LANGUAGE BangPatterns #-}
module ELynx.Import.Sequence.Fasta
( fastaSequence
, fasta
) where
import Control.Monad
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.Set as S
import Data.Void
import Data.Word8
import Text.Megaparsec
import Text.Megaparsec.Byte
import ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Sequence
import ELynx.Tools.ByteString (c2w)
type Parser = Parsec Void L.ByteString
isSpecial :: Word8 -> Bool
isSpecial w = w `elem` map c2w ['_', '|', '.', '-']
isHeaderChar :: Word8 -> Bool
isHeaderChar w = isAlphaNum w || isSpecial w
sequenceHeader :: Parser L.ByteString
sequenceHeader = do
_ <- char (c2w '>')
h <- takeWhile1P (Just "Header character") isHeaderChar
_ <- eol
return h
sequenceLine :: S.Set Word8 -> Parser L.ByteString
sequenceLine s = do
!xs <- takeWhile1P (Just "Alphabet character") (`S.member` s)
_ <- void eol <|> eof
return xs
fastaSequence :: Alphabet -> Parser Sequence
fastaSequence a = do hd <- sequenceHeader
let !alph = S.map toWord (A.all . alphabetSpec $ a)
lns <- some (sequenceLine alph)
_ <- many eol
return $ Sequence hd a (toCharacters $ L.concat lns)
fasta :: Alphabet -> Parser [Sequence]
fasta a = some (fastaSequence a) <* eof