{-# LANGUAGE OverloadedStrings #-} module Bio.GB.Parser ( genBankP ) where import Bio.GB.Type (Feature (..), Form (..), GenBankSequence (..), Locus (..), Meta (..), Reference (..), Source (..), Version (..)) import Bio.Sequence (MarkedSequence, Range, markedSequence) import Control.Applicative ((<|>)) import Data.Attoparsec.Text (Parser, char, decimal, digit, endOfInput, endOfLine, letter, many', many1', satisfy, string, takeWhile, takeWhile1) import Data.Bifunctor (bimap) import Data.Char (isAlphaNum, isSpace, isUpper) import Data.Functor (($>)) import Data.Text (Text, intercalate, pack, splitOn, unpack) import Prelude hiding (takeWhile) -- | Parser of .gb file. -- genBankP :: Parser GenBankSequence genBankP = GenBankSequence <$> metaP <*> gbSeqP <* string "//" <* eolSpaceP <* endOfInput -------------------------------------------------------------------------------- -- Block with meta-information. -------------------------------------------------------------------------------- metaP :: Parser Meta metaP = do locus' <- locusP definitionM <- wrapMP definitionP accessionM <- wrapMP accessionP versionM <- wrapMP versionP keywordsM <- wrapMP keywordsP sourceM <- wrapMP sourceP referencesL <- many' referenceP commentsL <- many' commentP pure $ Meta locus' definitionM accessionM versionM keywordsM sourceM referencesL commentsL locusP :: Parser Locus locusP = string "LOCUS" *> space *> (Locus <$> textP <* space -- name <*> decimal <* space <* string "bp" <* space -- sequence length <*> textP <* space -- molecule type <*> wrapMP formP <* space -- form of sequence <*> wrapMP (pack <$> many1' (satisfy isUpper)) <* space -- GenBank division <*> textP -- modification date <* eolSpaceP) where textP = takeWhile1 $ not . isSpace formP :: Parser Form formP = (string "linear" $> Linear) <|> (string "circular" *> pure Circular) definitionP :: Parser Text definitionP = string "DEFINITION" *> space *> (emptyP <|> someLinesP) accessionP :: Parser Text accessionP = string "ACCESSION" *> space *> (emptyP <|> (pack <$> many1' (alphaNumChar <|> char '_') <* eolSpaceP)) versionP :: Parser Version versionP = string "VERSION" *> space *> ((Version <$> emptyP <*> pure Nothing) <|> (Version <$> (pack <$> many1' versionP') <*> wrapMP (pack <$> (space *> string "GI:" *> many1' versionP')) <* eolSpaceP)) where versionP' = alphaNumChar <|> char '_' <|> char '.' keywordsP :: Parser Text keywordsP = string "KEYWORDS" *> (emptyP <|> (space *> textWithSpacesP <* eolSpaceP)) sourceP :: Parser Source sourceP = string "SOURCE" *> space *> ((flip Source Nothing <$> emptyP) <|> (Source <$> someLinesP <*> wrapMP organismP)) where organismP = string " ORGANISM" *> space *> someLinesP referenceP :: Parser Reference referenceP = string "REFERENCE" *> space *> (((\x -> Reference x Nothing Nothing Nothing Nothing) <$> emptyP) <|> (Reference <$> someLinesP <*> wrapMP (string " AUTHORS" *> space *> someLinesP) <*> wrapMP (string " TITLE" *> space *> someLinesP) <*> wrapMP (string " JOURNAL" *> space *> someLinesP) <*> wrapMP (string " PUBMED" *> space *> someLinesP))) commentP :: Parser Text commentP = string "COMMENT" *> (emptyP <|> (many' (char ' ') *> someLinesP)) -------------------------------------------------------------------------------- -- Block with FEATURES table. -------------------------------------------------------------------------------- featuresP :: Parser [(Feature, Range)] featuresP = string "FEATURES" *> space *> textWithSpacesP <* eolSpaceP *> many1' featureP featureP :: Parser (Feature, Range) featureP = do _ <- string featureIndent1 featureName' <- takeWhile (not . isSpace) <* space (strand53, range) <- rangeP <* eolSpaceP props <- many1' propsP pure (Feature featureName' strand53 props, range) rangeP :: Parser (Bool, Range) rangeP = (string "complement(" *> rP False <* char ')') <|> rP True where rP :: Bool -> Parser (Bool, Range) rP b = fmap (bimap pred id) <$> (,) b <$> (((,) <$> decimal <* string ".." <*> decimal) <|> ((\x -> (x, x)) <$> decimal)) propsP :: Parser (Text, Text) propsP = do _ <- string featureIndent2 _ <- char '/' propName <- takeWhile1 (/= '=') _ <- char '=' propText <- ((char '\"' *> takeWhile1 (/= '\"') <* char '\"') <|> textWithSpacesP) <* eolSpaceP let propTextCorrect = mconcat $ filter (/= featureIndent2) $ splitOn featureIndent2 propText pure (propName, propTextCorrect) -- | First level of identation in FEATURES table file. -- featureIndent1 :: Text featureIndent1 = pack $ replicate 5 ' ' -- | Second level of identation in FEATURES table file. -- featureIndent2 :: Text featureIndent2 = pack $ replicate 21 ' ' -------------------------------------------------------------------------------- -- Block with ORIGIN table. -------------------------------------------------------------------------------- originP :: Parser String originP = string "ORIGIN" *> eolSpaceP *> pure toText <*> many1' (space *> many1' digit *> space1 *> many1' (many1' letter <* (space1 <|> eolSpaceP))) where toText :: [[String]] -> String toText = concat . fmap concat -------------------------------------------------------------------------------- -- Parser of 'GenBankSequence' from FEATURES and ORIGIN tables. -------------------------------------------------------------------------------- gbSeqP :: Parser (MarkedSequence Feature Char) gbSeqP = do features <- featuresP origin <- originP either (fail . unpack) pure (markedSequence origin features) -------------------------------------------------------------------------------- -- Utility functions. -------------------------------------------------------------------------------- -- | First level of identation in .gb file. -- firstIndent :: Text firstIndent = pack $ replicate 12 ' ' eolSpaceP :: Parser () eolSpaceP = () <$ many' (char ' ') <* endOfLine emptyP :: Parser Text emptyP = many' (char ' ') *> char '.' *> eolSpaceP *> pure "." textWithSpacesP :: Parser Text textWithSpacesP = takeWhile (`notElem` ['\n', '\r']) someLinesP :: Parser Text someLinesP = intercalate "\n" <$> someLinesIndentP firstIndent someLinesIndentP :: Text -> Parser [Text] someLinesIndentP indent = (:) <$> textWithSpacesP <* eolSpaceP <*> (many' (string indent *> textWithSpacesP <* eolSpaceP)) wrapMP :: Parser a -> Parser (Maybe a) wrapMP p = fmap Just p <|> pure Nothing space :: Parser () space = () <$ (many' $ satisfy isSpace) space1 :: Parser () space1 = () <$ (many1' $ satisfy isSpace) alphaNumChar :: Parser Char alphaNumChar = satisfy isAlphaNum