Bio/Sequence/GeneOntology.hs

{- | 
   GeneOntology - parse and index Gene Ontology Annotations
   In particular, the file 'gene_association.goa_uniprot' that contains
   links between GO terms and UniProt accessions.

   * <http://www.geneontology.org/ontology/gene_ontology.obo> 
   -- Contains the hierarchy including isA relationships.

   * <http://www.geneontology.org/GO.format.obo-1_2.shtml> 
   -- Describes the OBO format.

   * <ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/> 
   -- Contains the GOA-UniProt mapping (and a README file).

   * <http://www.geneontology.org/ontology/GO.defs> 
   -- Contains GO definitions (not supported here yet).

   * <http://www.geneontology.org/doc/GO.terms_and_ids> 
   -- GO definitions, simpler and more schematically.

-}

module Bio.Sequence.GeneOntology 
    (
    -- * Basic data types
    GoTerm(..), GoDef(..)

    -- * Reading the OBO format
    , GoHierarchy, readObo

    -- * Reading 'terms and ids'
    , readTerms

    -- * Reading UniProt associations
    , Annotation(..), UniProtAcc, GoClass(..), EvidenceCode(..), readGOA, isCurated

    -- * Utility stuff
    , decomment

    ) where

import Data.ByteString.Lazy.Char8 (ByteString,pack,unpack,copy)
import qualified Data.ByteString.Lazy.Char8 as B

-- | Read the GO hierarchy from the obo file.  Note that this is not quite a tree structure.
readObo :: FilePath -> IO GoHierarchy
readObo f = B.readFile f >>=
            return . mkGoHier . decomment

-- | Read the goa_uniprot file (warning: this one is huge!)
readGOA :: FilePath -> IO [Annotation]
readGOA f = B.readFile f >>= 
            return . map mkAnn . decomment

-- | Read GO term definitions, from the GO.terms_and_ids file
readTerms :: FilePath -> IO [GoDef]
readTerms f = B.readFile f >>= 
           return . map mkGoDef . decomment

decomment :: ByteString -> [ByteString]
decomment = filter (\l -> not (B.null l) && B.head l /= '!') . B.lines

-- ----------------------------------------------------------
-- Reading the Obo file containing the ontology definition
-- ----------------------------------------------------------

-- | A list of Go definitions, with pointers to parent nodes.  Read from the .obo file.
--   The user may construct the explicit hierachy by storing these in a Map or similar
type GoHierarchy = [(GoDef,[GoTerm])]

-- Each entry may span multiple lines, thus this function is slightly different from its siblings.
-- Todo: strictness? copy?
mkGoHier :: [ByteString] -> [(GoDef,[GoTerm])]
mkGoHier ls = go $ dropWhile (not . termStart) ls
    where termStart = (== B.pack "[Term]")
          go [] = []
          go (_:zs) = let (this,rest) = span (not . B.isPrefixOf (B.pack "[")) zs in
              if null this then 
                  if not (null rest) then error "Parse failure in mkGoHier/go" 
                  else []
              else (mk1 $ map ($ this) [getId, getName, getNamespace, getIsA]) : mkGoHier rest

          mk1 xs@[i,n,ns,isa] 
              | or (map null [i,n,ns]) = error ("Failed to parse Go Term (missing field in entry):\n"
                                                ++unlines (map unpack $ concat xs))
              | length i /= 1 || length n /= 1 || length ns /= 1 
                         = error ("Failed to parse Go Term (incorrect field multiplicity):\n"
                                  ++unlines (map unpack $ concat xs))
              | otherwise = (GoDef (getGo $ head i) (head n) (readNS $ head ns), map getGo isa)
          mk1 _ = error "This shouldn't happen!"

          getId        = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "id:"))
          getName      = map (B.unwords. tail  . B.words) . filter (B.isPrefixOf (pack "name:"))
          getNamespace = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "namespace:"))
          getIsA       = map ((!!1) . B.words) . filter (B.isPrefixOf (pack "is_a:"))
          readNS xs = case unpack xs of "biological_process" -> Proc
                                        "molecular_function" -> Func
                                        "cellular_component" -> Comp
                                        _ -> error ("Unknown function: "++unpack xs)


-- ----------------------------------------------------------
-- Reading GoTerms from the GO.terms_and_ids file
-- ----------------------------------------------------------

-- | A GO term is a positive integer
newtype GoTerm = GO Int deriving (Eq,Ord)
data GoClass = Func | Proc | Comp

instance Read GoTerm where 
    readsPrec n ('G':'O':':':xs) = map (\(i,s)-> (GO i,s)) (readsPrec n xs)
    readsPrec _ e = error ("couldn't parse GO term: "++show e)
instance Show GoTerm where show (GO x) = "GO:"++show x

getGo :: ByteString -> GoTerm
getGo bs = GO $ fst $ maybe e id (B.readInt $ B.drop 3 bs)
    where e = error ("Unable to parse GO term"++unpack bs)

-- | A GoDef maps a "GoTerm" to a description and a "GoClass".
data GoDef = GoDef !GoTerm !ByteString !GoClass deriving (Show)

--   Defined in <http://www.geneontology.org/doc/GO.terms_and_ids>
--   The format is  "GO:0000000 [tab] text string  [tab] F|P|C"

-- | Parse a "GoDef" from a line in the GO.terms_and_ids file.
mkGoDef :: ByteString -> GoDef
mkGoDef = pick . B.split '\t'
    where pick [go,desc,cls] = GoDef (read $ unpack go) (copy desc) (read $ unpack cls)
          pick _xs = error ("Couldn't decipher GO definition from: "++show _xs)

instance Read GoClass where
    readsPrec _ ('F':xs) = [(Func,xs)]
    readsPrec _ ('P':xs) = [(Proc,xs)]
    readsPrec _ ('C':xs) = [(Comp,xs)]
    readsPrec _ _ = []
instance Show GoClass where 
    show Func = "F"
    show Proc = "P"
    show Comp = "C"

-- ----------------------------------------------------------
-- Reading Annotations from the GOA UniProt-GO association file
-- ----------------------------------------------------------

-- | A UniProt identifier (short string of capitals and numbers).
type UniProtAcc = ByteString
-- | A GOA annotation, containing a UniProt identifier, a GoTerm and an evidence code.
data Annotation = Ann !UniProtAcc !GoTerm !EvidenceCode deriving (Show)

-- | Reading an "Annotation" from a line in the association file.
mkAnn :: ByteString -> Annotation
mkAnn = pick . B.words
    where pick (_db:up:rest) = pick' up $ findGo rest
          pick _ = error "Internal error: mkAnn/pick"
          pick' up' (go:_:ev:_) = Ann (copy up') (getGo go) (getEC ev)
          pick' _ _ = error "Internal error: mkAnn/pick'"
          findGo = dropWhile (not . B.isPrefixOf (pack "GO:"))

-- | Evidence codes describe the type of support for an annotation
-- <http://www.geneontology.org/GO.evidence.shtml>
data EvidenceCode = IC  -- ^ Inferred by Curator
	          | IDA -- ^ Inferred from Direct Assay
	          | IEA -- ^ Inferred from Electronic Annotation
	          | IEP -- ^ Inferred from Expression Pattern
	          | IGC -- ^ Inferred from Genomic Context
	          | IGI -- ^ Inferred from Genetic Interaction
	          | IMP -- ^ Inferred from Mutant Phenotype
	          | IPI -- ^ Inferred from Physical Interaction
	          | ISS -- ^ Inferred from Sequence or Structural Similarity
	          | NAS -- ^ Non-traceable Author Statement
	          | ND  -- ^ No biological Data available
	          | RCA -- ^ Inferred from Reviewed Computational Analysis
	          | TAS -- ^ Traceable Author Statement
	          | NR  -- ^ Not Recorded 
     deriving (Read,Show,Eq)

-- | Read the evidence code from a ByteString (no error checking!).
getEC :: ByteString -> EvidenceCode 
getEC s = case B.uncons s of
            Just ('I',s') -> case B.uncons s' of
                               Just ('C',_) -> IC
                               Just ('D',_) -> IDA
                               Just ('E',s'') -> case B.head s'' of 'A' -> IEA
                                                                    'P' -> IEP
                                                                    _ -> e 1
                               Just ('G',s'') -> case B.head s'' of 'C' -> IGC
                                                                    'I' -> IGI
                                                                    _ -> e 2
                               Just ('M',_) -> IMP
                               Just ('P',_) -> IPI
                               Just ('S',_) -> ISS
                               _ -> e 3
            Just ('N',s') -> case B.head s' of 'A' -> NAS
                                               'D' -> ND
                                               'R' -> NR
                                               _ -> e 4
            Just ('R',_) -> RCA
            Just ('T',_) -> TAS
            _ -> e 5
    where e :: Int -> a
          e n = error ("Illegal GO evidence code ("++show n++"): "++unpack s)

-- | The vast majority of GOA data is IEA, while the most reliable information
--   is manually curated.  Filtering on this is useful to keep data set sizes
--   manageable, too.
isCurated :: EvidenceCode -> Bool
isCurated = not . (`elem` [ND,IEA])