{- | GeneOntology - parse and index Gene Ontology Annotations.

   In particular, the file @gene_association.goa_uniprot@ that contains
   links between GO terms and UniProt accessions.

   Related resources (the link targets are missing from this copy of the
   file; only their descriptions remain):

   * Contains the hierarchy including isA relationships.

   * Describes the OBO format.

   * Contains the GOA-UniProt mapping (and a README file).

   * Contains GO definitions (not supported here yet).

   * GO definitions, simpler and more schematically.
-}

module Bio.Sequence.GeneOntology (
    -- * Basic data types
    GoTerm(..), GoDef(..)
    -- * Reading the OBO format
  , GoHierarchy, readObo
    -- * Reading 'terms and ids'
  , readTerms
    -- * Reading UniProt associations
  , Annotation(..), UniProtAcc, GoClass(..), EvidenceCode(..), readGOA, isCurated
    -- * Utility stuff
  , decomment
  ) where

import Data.ByteString.Lazy.Char8 (ByteString,pack,unpack,copy)
import qualified Data.ByteString.Lazy.Char8 as B

-- | Read the GO hierarchy from the obo file.
--   Note that the result is not quite a tree structure.
readObo :: FilePath -> IO GoHierarchy
readObo path = fmap (mkGoHier . decomment) (B.readFile path)

-- | Read the goa_uniprot file (warning: this one is huge!).
--   Every line that survives 'decomment' is parsed as one 'Annotation'.
readGOA :: FilePath -> IO [Annotation]
readGOA path = fmap (map mkAnn . decomment) (B.readFile path)

-- | Read GO term definitions, from the GO.terms_and_ids file.
--   Every line that survives 'decomment' is parsed as one 'GoDef'.
readTerms :: FilePath -> IO [GoDef]
readTerms path = fmap (map mkGoDef . decomment) (B.readFile path)

-- | Split the input into lines, discarding empty lines and lines whose
--   first character is @!@.
decomment :: ByteString -> [ByteString]
decomment contents = [ l | l <- B.lines contents, keep l ]
  where
    -- An empty line has no first character and is dropped as well.
    keep l = case B.uncons l of
      Nothing     -> False
      Just (c, _) -> c /= '!'

-- ----------------------------------------------------------
-- Reading the Obo file containing the ontology definition
-- ----------------------------------------------------------

-- | A list of Go definitions, with pointers to parent nodes.  Read from the .obo file.
--   The user may construct the explicit hierarchy by storing these in a Map or similar.
type GoHierarchy = [(GoDef,[GoTerm])]

-- Each entry may span multiple lines, thus this function is slightly different from its siblings.
-- Todo: strictness?

-- | Build the hierarchy from the (already decommented) lines of an .obo file.
--
--   Lines before the first @[Term]@ line are skipped.  A stanza consists of
--   the lines following @[Term]@ up to the next line starting with @[@;
--   stanzas with another header are skipped when looking for the next
--   @[Term]@.  Each term must carry exactly one @id:@, @name:@ and
--   @namespace:@ line and may carry any number of @is_a:@ lines.
--
--   Calls 'error' on an empty @[Term]@ stanza that is followed by more input,
--   on missing or duplicated mandatory fields, on a field line without a
--   value, and on an unknown namespace.
mkGoHier :: [ByteString] -> [(GoDef,[GoTerm])]
mkGoHier ls = go $ dropWhile (/= pack "[Term]") ls
  where
    go [] = []
    go (_ : zs) =
      case span (not . B.isPrefixOf (pack "[")) zs of
        ([], [])     -> []
        ([], _)      -> error "Parse failure in mkGoHier/go"
        (this, rest) -> mk1 this : mkGoHier rest

    -- Turn the lines of one stanza into a definition plus its parents.
    mk1 stanza =
      case (ids, names, spaces) of
        ([i], [n], [ns]) ->
          -- 'copy' detaches the name from the input buffer, as the sibling
          -- parsers 'mkGoDef' and 'mkAnn' do for the strings they keep.
          (GoDef (getGo i) (copy n) (readNS ns), map getGo isas)
        _ | any null [ids, names, spaces] ->
              error ("Failed to parse Go Term (missing field in entry):\n" ++ dump)
          | otherwise ->
              error ("Failed to parse Go Term (incorrect field multiplicity):\n" ++ dump)
      where
        ids    = secondWord "id:" stanza
        names  = map (B.unwords . drop 1 . B.words) (field "name:" stanza)
        spaces = secondWord "namespace:" stanza
        isas   = secondWord "is_a:" stanza
        dump   = unlines (map unpack $ concat [ids, names, spaces, isas])

    -- All lines of a stanza that start with the given tag.
    field key = filter (B.isPrefixOf (pack key))

    -- The first whitespace-separated word after the tag, for every matching
    -- line.  A line without such a word is reported instead of failing with
    -- an index error.
    secondWord key = map value . field key
      where
        value l = case B.words l of
          (_ : v : _) -> v
          _           -> error ("Failed to parse Go Term (no value in field): " ++ unpack l)

    -- Map the namespace name used in the .obo file to a 'GoClass'.
    readNS xs = case unpack xs of
      "biological_process" -> Proc
      "molecular_function" -> Func
      "cellular_component" -> Comp
      _                    -> error ("Unknown namespace: " ++ unpack xs)

-- ----------------------------------------------------------
-- Reading GoTerms from the GO.terms_and_ids file
-- ----------------------------------------------------------

-- | A GO term is a positive integer
newtype GoTerm = GO Int deriving (Eq,Ord)

-- | The three GO classes: 'Func' (molecular_function), 'Proc'
--   (biological_process) and 'Comp' (cellular_component).
data GoClass = Func | Proc | Comp

-- | Parses @GO:@ followed by an integer.  Input that does not start with
--   @GO:@ yields no parse (the empty list), so 'reads' can be used to test
--   for validity.
instance Read GoTerm where
  readsPrec n ('G':'O':':':xs) = [ (GO i, s) | (i, s) <- readsPrec n xs ]
  readsPrec _ _                = []

-- | Renders as @GO:@ followed by the plain number; note that the number is
--   not zero-padded.
instance Show GoTerm where
  show (GO x) = "GO:"++show x

-- | Parse a GO term from a 'ByteString' such as @GO:0008150@.  The first
--   three characters are dropped without being checked and the remainder
--   must start with an integer; otherwise 'error' is called.
getGo :: ByteString -> GoTerm
getGo bs = case B.readInt (B.drop 3 bs) of
  Just (n, _) -> GO n
  Nothing     -> error ("Unable to parse GO term: " ++ unpack bs)

-- | A GoDef maps a 'GoTerm' to a description and a 'GoClass'.
data GoDef = GoDef !GoTerm !ByteString !GoClass deriving (Show)

-- The format is "GO:0000000 [tab] text string [tab] F|P|C"
-- (the pointer to where this format is defined is missing from this copy).

-- | Parse a 'GoDef' from a line in the GO.terms_and_ids file.  Calls 'error'
--   unless the line consists of exactly three tab-separated fields holding a
--   GO term, a description and a class letter.
mkGoDef :: ByteString -> GoDef
mkGoDef = pick . B.split '\t'
  where
    pick [go, desc, cls] = GoDef (term go) (copy desc) (read $ unpack cls)
    pick _xs = error ("Couldn't decipher GO definition from: "++show _xs)

    -- Accept exactly what 'read' accepts (trailing white space allowed), but
    -- keep the descriptive message now that the 'Read' instance itself no
    -- longer calls 'error'.
    term bs =
      case [ t | (t, rest) <- reads str, ("", "") <- lex rest ] of
        [t] -> t
        _   -> error ("couldn't parse GO term: "++show str)
      where
        str = unpack bs

-- | Parses the one-letter class codes @F@, @P@ and @C@.
instance Read GoClass where
  readsPrec _ ('F':xs) = [(Func,xs)]
  readsPrec _ ('P':xs) = [(Proc,xs)]
  readsPrec _ ('C':xs) = [(Comp,xs)]
  readsPrec _ _        = []

-- | Renders the one-letter class codes @F@, @P@ and @C@.
instance Show GoClass where
  show Func = "F"
  show Proc = "P"
  show Comp = "C"

-- ----------------------------------------------------------
-- Reading Annotations from the GOA UniProt-GO association file
-- ----------------------------------------------------------

-- | A UniProt identifier (short string of capitals and numbers).
type UniProtAcc = ByteString

-- | A GOA annotation, containing a UniProt identifier, a GoTerm and an evidence code.
data Annotation = Ann !UniProtAcc !GoTerm !EvidenceCode deriving (Show)

-- | Reading an 'Annotation' from a line in the association file.
--
--   The line is split on white space.  The second word is taken as the
--   UniProt accession; the first later word starting with @GO:@ is the GO
--   term, and the second word after that is the evidence code.  Calls
--   'error' if the line does not have this shape.
mkAnn :: ByteString -> Annotation
mkAnn = pick . B.words
  where
    pick (_db : up : rest) = pick' up $ findGo rest
    pick _                 = error "Internal error: mkAnn/pick"

    -- 'copy' detaches the accession from the input buffer.
    pick' up' (go : _ : ev : _) = Ann (copy up') (getGo go) (getEC ev)
    pick' _ _                   = error "Internal error: mkAnn/pick'"

    -- Skip ahead to the first word that looks like a GO term.
    findGo = dropWhile (not . B.isPrefixOf (pack "GO:"))

-- | Evidence codes describe the type of support for an annotation
--
data EvidenceCode
  = IC  -- ^ Inferred by Curator
  | IDA -- ^ Inferred from Direct Assay
  | IEA -- ^ Inferred from Electronic Annotation
  | IEP -- ^ Inferred from Expression Pattern
  | IGC -- ^ Inferred from Genomic Context
  | IGI -- ^ Inferred from Genetic Interaction
  | IMP -- ^ Inferred from Mutant Phenotype
  | IPI -- ^ Inferred from Physical Interaction
  | ISS -- ^ Inferred from Sequence or Structural Similarity
  | NAS -- ^ Non-traceable Author Statement
  | ND  -- ^ No biological Data available
  | RCA -- ^ Inferred from Reviewed Computational Analysis
  | TAS -- ^ Traceable Author Statement
  | NR  -- ^ Not Recorded
  deriving (Read,Show,Eq)

-- | Read the evidence code from a ByteString.
--
--   Only as many leading characters as are needed to tell the codes apart
--   are inspected (at most three); anything after them is ignored.  Input
--   that matches no known code, including input that ends before the
--   distinguishing character, calls 'error' with an
--   @Illegal GO evidence code@ message.  The number in that message tells
--   which branch rejected the input.
getEC :: ByteString -> EvidenceCode
getEC s = case unpack (B.take 3 s) of
    'I':'C':_     -> IC
    'I':'D':_     -> IDA
    'I':'E':'A':_ -> IEA
    'I':'E':'P':_ -> IEP
    'I':'E':_     -> e 1
    'I':'G':'C':_ -> IGC
    'I':'G':'I':_ -> IGI
    'I':'G':_     -> e 2
    'I':'M':_     -> IMP
    'I':'P':_     -> IPI
    'I':'S':_     -> ISS
    'I':_         -> e 3
    'N':'A':_     -> NAS
    'N':'D':_     -> ND
    'N':'R':_     -> NR
    'N':_         -> e 4
    'R':_         -> RCA
    'T':_         -> TAS
    _             -> e 5
  where
    e :: Int -> a
    e n = error ("Illegal GO evidence code ("++show n++"): "++unpack s)

-- | The vast majority of GOA data is IEA, while the most reliable information
--   is manually curated.  Filtering on this is useful to keep data set sizes
--   manageable, too.
-- Every evidence code other than 'ND' and 'IEA' counts as curated.
isCurated :: EvidenceCode -> Bool
isCurated ev = ev /= ND && ev /= IEA