{- | GOA - parse and index Gene Onthology Annotations In particular, the file 'gene_association.goa_uniprot' that contains links between GO terms and UniProt accessions. (Where to find the hierarchical relationship between GO terms?) contains isA relationships describes the format -} module Bio.Sequence.GOA where import Data.ByteString.Lazy.Char8 (ByteString,pack,unpack,copy) import qualified Data.ByteString.Lazy.Char8 as B -- | Read the goa_uniprot file (warning: this one is huge!) readGOA :: FilePath -> IO [Annotation] readGOA f = B.readFile f >>= return . map mkAnn . decomment -- | Read GO term definitions readGO :: FilePath -> IO [GoDef] readGO f = B.readFile f >>= return . map mkGoDef . decomment decomment :: ByteString -> [ByteString] decomment = filter (\l -> not (B.null l) && B.head l /= '!') . B.lines newtype GoTerm = GO Int deriving (Eq,Ord) type UniProtAcc = ByteString data GoClass = Func | Proc | Comp instance Read GoTerm where readsPrec n ('G':'O':':':xs) = map (\(i,s)-> (GO i,s)) (readsPrec n xs) readsPrec n e = error ("couldn't parse GO term: "++show e) instance Show GoTerm where show (GO x) = "GO:"++show x instance Read GoClass where readsPrec _ ('F':xs) = [(Func,xs)] readsPrec _ ('P':xs) = [(Proc,xs)] readsPrec _ ('C':xs) = [(Comp,xs)] readsPrec _ _ = [] instance Show GoClass where show Func = "F" show Proc = "P" show Comp = "C" -- | GOA Annotation - or multiple annotations? data Annotation = Ann !UniProtAcc !GoTerm !EvidenceCode deriving (Show) mkAnn :: ByteString -> Annotation mkAnn = pick . B.words where pick (_db:up:rest) = pick' up $ getGo rest pick' up' (go:_:ev:_) = Ann (copy up') (read $ unpack go) (read $ unpack ev) getGo = dropWhile (not . B.isPrefixOf (pack "GO:")) -- | GO maps GO terms (GO:xxxx for some number xxxx) to biologically meaningful terms. -- Defined in -- The format is "GO:0000000 [tab] text string [tab] F|P|C" data GoDef = GoDef !GoTerm !ByteString !GoClass deriving (Show) mkGoDef :: ByteString -> GoDef mkGoDef = pick . B.split '\t' where pick [go,desc,cls] = GoDef (read $ unpack go) (copy desc) (read $ unpack cls) pick _xs = error ("Couldn't decipher GO definition from: "++show _xs) -- | Evidence codes describe the type of support for an annotation -- data EvidenceCode = IC -- Inferred by Curator | IDA -- Inferred from Direct Assay | IEA -- Inferred from Electronic Annotation | IEP -- Inferred from Expression Pattern | IGC -- Inferred from Genomic Context | IGI -- Inferred from Genetic Interaction | IMP -- Inferred from Mutant Phenotype | IPI -- Inferred from Physical Interaction | ISS -- Inferred from Sequence or Structural Similarity | NAS -- Non-traceable Author Statement | ND -- No biological Data available | RCA -- inferred from Reviewed Computational Analysis | TAS -- Traceable Author Statement | NR -- Not Recorded deriving (Read,Show,Eq) -- | The vast majority of GOA data is IEA, while the most reliable information -- is manually curated. Filtering on this is useful to keep data set sizes -- manageable, too. isCurated :: EvidenceCode -> Bool isCurated = not . (`elem` [ND,IEA])