{-# LANGUAGE OverloadedStrings #-}
module Bio.GB.Parser
( genBankP
) where
import Bio.GB.Type (Feature (..), Form (..),
GenBankSequence (..), Locus (..),
Meta (..), Reference (..), Source (..),
Version (..))
import Bio.Sequence (MarkedSequence, Range, markedSequence)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser, char, decimal, digit, endOfInput,
endOfLine, letter, many', many1',
satisfy, string, takeWhile, takeWhile1)
import Data.Bifunctor (bimap)
import Data.Char (isAlphaNum, isSpace, isUpper)
import Data.Functor (($>))
import Data.Text (Text, intercalate, pack, splitOn, unpack)
import Prelude hiding (takeWhile)
genBankP :: Parser GenBankSequence
genBankP = GenBankSequence
<$> metaP
<*> gbSeqP
<* string "//" <* eolSpaceP <* endOfInput
metaP :: Parser Meta
metaP = do
locus' <- locusP
definitionM <- wrapMP definitionP
accessionM <- wrapMP accessionP
versionM <- wrapMP versionP
keywordsM <- wrapMP keywordsP
sourceM <- wrapMP sourceP
referencesL <- many' referenceP
commentsL <- many' commentP
pure $ Meta locus' definitionM accessionM versionM keywordsM sourceM referencesL commentsL
locusP :: Parser Locus
locusP = string "LOCUS" *> space *> (Locus
<$> textP <* space
<*> decimal <* space <* string "bp" <* space
<*> textP <* space
<*> wrapMP formP <* space
<*> wrapMP (pack <$> many1' (satisfy isUpper)) <* space
<*> textP
<* eolSpaceP)
where
textP = takeWhile1 $ not . isSpace
formP :: Parser Form
formP = (string "linear" $> Linear) <|> (string "circular" *> pure Circular)
definitionP :: Parser Text
definitionP = string "DEFINITION" *> space *> (emptyP <|> someLinesP)
accessionP :: Parser Text
accessionP = string "ACCESSION" *> space *> (emptyP <|> (pack
<$> many1' (alphaNumChar <|> char '_')
<* eolSpaceP))
versionP :: Parser Version
versionP = string "VERSION" *> space
*> ((Version <$> emptyP <*> pure Nothing) <|> (Version
<$> (pack <$> many1' versionP')
<*> wrapMP (pack <$> (space *> string "GI:" *> many1' versionP'))
<* eolSpaceP))
where
versionP' = alphaNumChar <|> char '_' <|> char '.'
keywordsP :: Parser Text
keywordsP = string "KEYWORDS"
*> (emptyP
<|> (space *> textWithSpacesP <* eolSpaceP))
sourceP :: Parser Source
sourceP = string "SOURCE" *> space
*> ((flip Source Nothing <$> emptyP) <|> (Source
<$> someLinesP
<*> wrapMP organismP))
where
organismP = string " ORGANISM" *> space *> someLinesP
referenceP :: Parser Reference
referenceP = string "REFERENCE" *> space
*> (((\x -> Reference x Nothing Nothing Nothing Nothing) <$> emptyP) <|> (Reference
<$> someLinesP
<*> wrapMP (string " AUTHORS" *> space *> someLinesP)
<*> wrapMP (string " TITLE" *> space *> someLinesP)
<*> wrapMP (string " JOURNAL" *> space *> someLinesP)
<*> wrapMP (string " PUBMED" *> space *> someLinesP)))
commentP :: Parser Text
commentP = string "COMMENT" *> (emptyP <|> (many' (char ' ') *> someLinesP))
featuresP :: Parser [(Feature, Range)]
featuresP = string "FEATURES" *> space
*> textWithSpacesP <* eolSpaceP
*> many1' featureP
featureP :: Parser (Feature, Range)
featureP = do
_ <- string featureIndent1
featureName' <- takeWhile (not . isSpace) <* space
(strand53, range) <- rangeP <* eolSpaceP
props <- many1' propsP
pure (Feature featureName' strand53 props, range)
rangeP :: Parser (Bool, Range)
rangeP = (string "complement(" *> rP False <* char ')') <|> rP True
where
rP :: Bool -> Parser (Bool, Range)
rP b = fmap (bimap pred id)
<$> (,) b
<$> (((,) <$> decimal <* string ".." <*> decimal) <|> ((\x -> (x, x)) <$> decimal))
propsP :: Parser (Text, Text)
propsP = do
_ <- string featureIndent2
_ <- char '/'
propName <- takeWhile1 (/= '=')
_ <- char '='
propText <- ((char '\"' *> takeWhile1 (/= '\"') <* char '\"')
<|> textWithSpacesP)
<* eolSpaceP
let propTextCorrect = mconcat $ filter (/= featureIndent2) $ splitOn featureIndent2 propText
pure (propName, propTextCorrect)
featureIndent1 :: Text
featureIndent1 = pack $ replicate 5 ' '
featureIndent2 :: Text
featureIndent2 = pack $ replicate 21 ' '
originP :: Parser String
originP = string "ORIGIN" *> eolSpaceP
*> pure toText
<*> many1' (space *> many1' digit *> space1
*> many1' (many1' letter <* (space1 <|> eolSpaceP)))
where
toText :: [[String]] -> String
toText = concat . fmap concat
gbSeqP :: Parser (MarkedSequence Feature Char)
gbSeqP = do
features <- featuresP
origin <- originP
either (fail . unpack) pure (markedSequence origin features)
firstIndent :: Text
firstIndent = pack $ replicate 12 ' '
eolSpaceP :: Parser ()
eolSpaceP = () <$ many' (char ' ') <* endOfLine
emptyP :: Parser Text
emptyP = many' (char ' ') *> char '.' *> eolSpaceP *> pure "."
textWithSpacesP :: Parser Text
textWithSpacesP = takeWhile (`notElem` ['\n', '\r'])
someLinesP :: Parser Text
someLinesP = intercalate "\n" <$> someLinesIndentP firstIndent
someLinesIndentP :: Text -> Parser [Text]
someLinesIndentP indent = (:) <$> textWithSpacesP <* eolSpaceP
<*> (many' (string indent *> textWithSpacesP <* eolSpaceP))
wrapMP :: Parser a -> Parser (Maybe a)
wrapMP p = fmap Just p <|> pure Nothing
space :: Parser ()
space = () <$ (many' $ satisfy isSpace)
space1 :: Parser ()
space1 = () <$ (many1' $ satisfy isSpace)
alphaNumChar :: Parser Char
alphaNumChar = satisfy isAlphaNum