{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
module Bio.Uniprot.Parser where
import Prelude hiding (init, null)
import qualified Prelude as P (concat, id, init)
import Bio.Uniprot.Type
import Control.Applicative (liftA2, (<|>))
import Control.Monad (unless, when)
import Data.Attoparsec.Text
import Data.Bifunctor (second)
import Data.Functor (($>))
import Data.Text (Text, append, concat, init, null, pack,
splitOn, unpack, unwords)
import Data.Void
data NameType = RecName | AltName | SubName | Flags | None
deriving (Show)
parseID :: Parser ID
parseID = do
string "ID "
entryName <- pack <$> many1 (satisfy $ inClass "A-Z0-9_")
many1 space
status <- (string "Reviewed" $> Reviewed) <|>
(string "Unreviewed" $> Unreviewed)
char ';'
many1 space
seqLength <- decimal
space >> string "AA."
pure ID{..}
parseAC :: Parser AC
parseAC = do
parseStartAC
initAC <- P.concat <$> many' (parseOneAC <* endOfLine <* parseStartAC)
lastAC <- parseOneAC
let accessionNumbers = initAC ++ lastAC
pure AC{..}
where
parseStartAC :: Parser ()
parseStartAC = string "AC" >> count 3 space >> pure ()
parseOneAC :: Parser [Text]
parseOneAC = many1 $ do
res <- pack <$> many1 (satisfy $ inClass "A-Z0-9_")
char ';'
option ' ' (satisfy isHorizontalSpace)
pure res
parseDT :: Parser DT
parseDT = do
(dbIntegrationDate, dbName) <- parseOneDT "integrated into UniProtKB/" <* endOfLine
(seqVersionDate, seqVersion) <- second (read . unpack) <$> parseOneDT "sequence version " <* endOfLine
(entryVersionDate, entryVersion) <- second (read . unpack) <$> parseOneDT "entry version "
pure DT{..}
where
parseOneDT :: Text -> Parser (Text, Text)
parseOneDT txt = do
string "DT "
day <- pack <$> many1 (satisfy $ inClass "A-Z0-9-")
char ','
many1 space
string txt
x <- pack <$> many1 (satisfy $ inClass "A-Za-z0-9_-")
char '.'
pure (day, x)
parseDE :: Parser DE
parseDE = do
recName <- optional $ parseNameDE 0 RecName
altNames <- many' (endOfLine *> parseAltDE 0)
subNames <- many' (endOfLine *> parseNameDE 0 SubName)
includes <- many' (endOfLine *> parseInternal "Includes")
contains <- many' (endOfLine *> parseInternal "Contains")
flags <- optional (endOfLine *> parseFlagsDE)
pure DE{..}
where
parseNameDE :: Int -> NameType -> Parser Name
parseNameDE indent nameType = do
fullName <- parseDELine indent nameType "Full"
shortName <- many' $ endOfLine *> parseDELine indent None "Short"
ecNumber <- many' $ endOfLine *> parseDELine indent None "EC"
pure Name{..}
parseFlagsDE :: Parser Flag
parseFlagsDE = read . unpack <$> parseDELine 0 Flags ""
parseAltDE :: Int -> Parser AltName
parseAltDE indent =
(Simple <$> parseNameDE indent AltName) <|>
(Allergen <$> parseDELine indent AltName "Allergen") <|>
(Biotech <$> parseDELine indent AltName "Biotech") <|>
(CDAntigen <$> parseDELine indent AltName "CD_antigen") <|>
(INN <$> parseDELine indent AltName "INN")
parseDELine :: Int -> NameType -> Text -> Parser Text
parseDELine indent nameType tpe = do
string "DE "
count indent (char ' ')
case nameType of
None -> string " "
a -> string $ append (pack $ show a) ": "
unless (null tpe) $ do
string tpe
string "="
pure ()
res <- pack <$> many1 (notChar ';')
char ';'
pure res
parseInternal :: Text -> Parser DE
parseInternal name = do
string "DE " >> string name >> char ':'
endOfLine
recName <- optional $ parseNameDE 2 RecName
altNames <- many' (endOfLine *> parseAltDE 2)
pure $ DE recName altNames [] [] [] Nothing
parseGN :: Parser [GN]
parseGN = do
string "GN "
geneName <- optional parseGNName
optional $ parseBreak "GN"
synonyms <- option [] $ parseGNList "Synonyms"
optional $ parseBreak "GN"
orderedLocusNames <- option [] $ parseGNList "OrderedLocusNames"
optional $ parseBreak "GN"
orfNames <- option [] $ parseGNList "ORFNames"
let gn = GN{..}
rest <- option [] $ string "and" *> endOfLine *> parseGN
pure $ gn:rest
where
parseGNName :: Parser Text
parseGNName = parseDefItem "Name" (P.id <$>)
parseGNList :: Text -> Parser [Text]
parseGNList name = parseDefItem name (splitOn ", " <$>)
parseOS :: Parser OS
parseOS = OS . pack <$> parseOSStr
where
parseOSStr :: Parser String
parseOSStr = do
string "OS "
namePart <- many1 (satisfy $ not . isEndOfLine)
if last namePart == '.'
then pure $ P.init namePart
else do
rest <- endOfLine *> parseOSStr
pure $ namePart ++ rest
parseOG :: Parser OG
parseOG = parseOGNonPlasmid <|> (Plasmid <$> parseOGPlasmid)
where
parseOGNonPlasmid :: Parser OG
parseOGNonPlasmid = string "OG " *>
(string "Hydrogenosome." $> Hydrogenosome) <|>
(string "Mitochondrion." $> Mitochondrion) <|>
(string "Nucleomorph." $> Nucleomorph) <|>
(string "Plastid." $> Plastid PlastidSimple) <|>
(string "Plastid; Apicoplast." $> Plastid PlastidApicoplast) <|>
(string "Plastid; Chloroplast." $> Plastid PlastidChloroplast) <|>
(string "Plastid; Organellar chromatophore." $> Plastid PlastidOrganellarChromatophore) <|>
(string "Plastid; Cyanelle." $> Plastid PlastidCyanelle) <|>
(string "Plastid; Non-photosynthetic plastid." $> Plastid PlastidNonPhotosynthetic)
parseOGPlasmid :: Parser [Text]
parseOGPlasmid = do
string "OG "
name <- parseOnePlasmid
let separator = string "," >> optional " and"
rest <- many' $ separator *> space *> parseOnePlasmid
rest2 <- (char '.' $> []) <|> (separator *> endOfLine *> parseOGPlasmid)
pure $ name : rest ++ rest2
parseOnePlasmid :: Parser Text
parseOnePlasmid = do
string "Plasmid "
pack <$> many1 (satisfy $ liftA2 (&&) (notInClass ",.") (not . isEndOfLine))
parseOC :: Parser OC
parseOC = parseNodes ';' '.' "OC" OC
parseOX :: Parser OX
parseOX = do
string "OX "
databaseQualifier <- pack <$> many1 (notChar '=')
char '='
taxonomicCode <- pack <$> many1 (notChar ';')
char ';'
pure OX{..}
parseOH :: Parser OH
parseOH = do
string "OH NCBI_TaxID="
taxId <- pack <$> many1 (notChar ';')
string "; "
hostName <- pack <$> many1 (notChar '.')
char '.'
pure OH{..}
parseRef :: Parser Reference
parseRef = do
rn <- parseRN
endOfLine
rp <- parseRP
endOfLine
rc <- option [] (parseRCX STRAIN "RC" <* endOfLine)
rx <- option [] (parseRCX MEDLINE "RX" <* endOfLine)
rg <- optional (parseRG <* endOfLine)
ra <- option [] (parseNodes ',' ';' "RA" P.id <* endOfLine)
rt <- optional (parseRT <* endOfLine)
rl <- parseRL
pure Reference{..}
where
parseRN :: Parser Int
parseRN = (string "RN [" *> decimal) <* char ']'
parseRP :: Parser Text
parseRP = do
string "RP "
pack . P.init <$> parseMultiLineComment "RP" 3
parseRCX :: (Enum a, Show a) => a -> Text -> Parser [(a, Text)]
parseRCX start name = do
string name >> string " "
(:) <$> parseTokPair start
<*> many' (parseBreak name *> parseTokPair start)
where
parseTokPair :: (Enum a, Show a) => a -> Parser (a, Text)
parseTokPair x = foldl1 (<|>) $
(\x -> (x,) <$> parseDefItem (pack . show $ x) (P.id <$>)) <$> [x..]
parseRG :: Parser Text
parseRG = pack <$> (string "RG " *> many1 (satisfy $ not . isEndOfLine))
parseRT :: Parser Text
parseRT = do
string "RT \""
let p = many1 $ satisfy $ liftA2 (&&) (not . isEndOfLine) (notInClass "\"")
referenceTitle <- (:) <$> p <*> many' (endOfLine *> string "RT " *> p)
string "\";"
pure $ pack . hyphenConcat $ referenceTitle
parseRL :: Parser Text
parseRL = do
string "RL "
pack . P.init <$> parseMultiLineComment "RL" 3
parseCC :: Parser CC
parseCC = do
string "CC -!- "
topic <- pack <$> many1 (notChar ':')
char ':'
(char ' ' $> ()) <|> (endOfLine >> string "CC" >> count 7 space $> ())
comment <- pack <$> parseMultiLineComment "CC" 7
pure CC{..}
parseDR :: Parser DR
parseDR = do
string "DR "
resourceAbbr <- parseToken
string "; "
resourceId <- parseToken
string "; "
optionalInfo' <- (:) <$> parseToken <*> many' (string "; " *> parseToken)
let optionalInfo = P.init optionalInfo' ++ [init . last $ optionalInfo']
pure DR{..}
where
parseToken :: Parser Text
parseToken = pack <$> many1 (satisfy $ liftA2 (&&) (notInClass ";") (not . isEndOfLine))
parsePE :: Parser PE
parsePE = (string "PE 1: Evidence at protein level;" $> EvidenceAtProteinLevel) <|>
(string "PE 2: Evidence at transcript level;" $> EvidenceAtTranscriptLevel) <|>
(string "PE 3: Inferred from homology;" $> InferredFromHomology) <|>
(string "PE 4: Predicted;" $> Predicted) <|>
(string "PE 5: Uncertain;" $> Uncertain)
parseKW :: Parser KW
parseKW = parseNodes ';' '.' "KW" KW
parseFT :: Parser FT
parseFT = do
string "FT "
keyName <- pack <$> many1 (satisfy $ inClass "A-Z_")
many1 space
fromEP <- parseFTEndpoint
many1 space
toEP <- parseFTEndpoint
many1 space
description <- splitByMagic <$> parseMultiLineComment "FT" 32
pure FT{..}
where
parseFTEndpoint :: Parser Endpoint
parseFTEndpoint = (UncertainEP <$> (char '?' *> decimal)) <|>
(NTerminalEP <$> (char '<' *> decimal)) <|>
(CTerminalEP <$> (char '>' *> decimal)) <|>
(ExactEP <$> decimal) <|>
(char '?' $> UnknownEP)
splitByMagic :: String -> [Text]
splitByMagic txt = pack <$> splitStr 0 [] txt
where
splitStr :: Int -> String -> String -> [String]
splitStr 0 acc ['.'] = [reverse acc]
splitStr 0 acc ('.':' ':xs) = reverse acc : splitStr 0 [] xs
splitStr 0 acc ('.':xs) = reverse acc : splitStr 0 [] xs
splitStr n acc ('(':xs) = splitStr (n+1) ('(':acc) xs
splitStr n acc (')':xs) = splitStr (n-1) (')':acc) xs
splitStr n acc (x:xs) = splitStr n (x:acc) xs
parseSQ :: Parser SQ
parseSQ = do
string "SQ SEQUENCE"
many1 space
length <- decimal
space >> string "AA;"
many1 space
molWeight <- decimal
space >> string "MW;"
many1 space
crc64 <- pack <$> many1 (satisfy $ inClass "A-F0-9")
space >> string "CRC64;"
endOfLine
sequence <- pack . P.concat <$>
many1 (skipSpace *> many1 (satisfy $ inClass "ACDEFGHIKLMNPQRSTVWY"))
pure SQ{..}
parseEnd :: Parser ()
parseEnd = string "//" >> pure ()
parseRecord :: Parser Record
parseRecord = Record <$> (parseID <* endOfLine)
<*> (parseAC <* endOfLine)
<*> (parseDT <* endOfLine)
<*> (parseDE <* endOfLine)
<*> option [] (parseGN <* endOfLine)
<*> (parseOS <* endOfLine)
<*> optional (parseOG <* endOfLine)
<*> (parseOC <* endOfLine)
<*> (parseOX <* endOfLine)
<*> many' (parseOH <* endOfLine)
<*> many' (parseRef <* endOfLine)
<*> many' (parseCC <* endOfLine)
<*> many' (parseDR <* endOfLine)
<*> (parsePE <* endOfLine)
<*> optional (parseKW <* endOfLine)
<*> many' (parseFT <* endOfLine)
<*> (parseSQ <* endOfLine)
<* parseEnd
optional :: Parser a -> Parser (Maybe a)
optional par = option Nothing (Just <$> par)
parseNodes :: Char
-> Char
-> Text
-> ([Text] -> a)
-> Parser a
parseNodes del end start f = do
string start >> count 3 space
name <- parseNode
rest <- many' $ do
char del
string " " <|> (endOfLine >> string start >> count 3 space >> pure "")
parseNode
char end
pure $ f (name:rest)
where
parseNode :: Parser Text
parseNode = pack <$> many1 (satisfy $ liftA2 (&&) (notInClass [del,end]) (not . isEndOfLine))
parseTillEnd :: Parser String
parseTillEnd = many1 $ satisfy (not . isEndOfLine)
parseMultiLineComment :: Text -> Int -> Parser String
parseMultiLineComment start skip = do
comm <- (:) <$> parseTillEnd
<*> many' (do endOfLine
string start
count (skip - 1) (char ' ')
parseTillEnd)
pure $ hyphenConcat comm
parseBreak :: Text -> Parser ()
parseBreak txt = ((endOfLine >> string txt >> string " ") <|> string " ") $> ()
parseDefItem :: Text -> (Parser Text -> Parser a) -> Parser a
parseDefItem name f = do
string name >> char '='
f (pack <$> many1 (notChar ';')) <* char ';'
hyphenConcat :: [String] -> String
hyphenConcat [] = []
hyphenConcat [x] = x
hyphenConcat (x:y:ys) = x ++ hyphenConcat (sy:ys)
where
sy :: String
sy | last x == '-' = tail y
| isAA (last x) && isAA (y !! 1) = tail y
| otherwise = y
isAA :: Char -> Bool
isAA = inClass "ACDEFGHIKLMNPQRSTVWY"