{- | 
   GOA - parse and index Gene Onthology Annotations
   In particular, the file 'gene_association.goa_uniprot' that contains
   links between GO terms and UniProt accessions.

   (Where to find the hierarchical relationship between GO terms?)
   <http://www.geneontology.org/ontology/gene_ontology.obo> contains isA relationships
   <http://www.geneontology.org/GO.format.obo-1_2.shtml> describes the format
-}

module Bio.Sequence.GOA where
import Data.ByteString.Lazy.Char8 (ByteString,pack,unpack,copy)
import qualified Data.ByteString.Lazy.Char8 as B

-- | Read the goa_uniprot file (warning: this one is huge!)
readGOA :: FilePath -> IO [Annotation]
readGOA f = B.readFile f >>= 
            return . map mkAnn . decomment

-- | Read GO term definitions
readGO :: FilePath -> IO [GoDef]
readGO f = B.readFile f >>= 
           return . map mkGoDef . decomment

decomment :: ByteString -> [ByteString]
decomment = filter (\l -> not (B.null l) && B.head l /= '!') . B.lines

newtype GoTerm = GO Int deriving (Eq,Ord)
type UniProtAcc = ByteString
data GoClass = Func | Proc | Comp

instance Read GoTerm where 
    readsPrec n ('G':'O':':':xs) = map (\(i,s)-> (GO i,s)) (readsPrec n xs)
    readsPrec n e = error ("couldn't parse GO term: "++show e)
instance Show GoTerm where show (GO x) = "GO:"++show x

instance Read GoClass where
    readsPrec _ ('F':xs) = [(Func,xs)]
    readsPrec _ ('P':xs) = [(Proc,xs)]
    readsPrec _ ('C':xs) = [(Comp,xs)]
    readsPrec _ _ = []
instance Show GoClass where 
    show Func = "F"
    show Proc = "P"
    show Comp = "C"

-- | GOA Annotation - or multiple annotations?
data Annotation = Ann !UniProtAcc !GoTerm !EvidenceCode deriving (Show)

mkAnn :: ByteString -> Annotation
mkAnn = pick . B.words
    where pick (_db:up:rest) = pick' up $ getGo rest
          pick' up' (go:_:ev:_) = Ann (copy up') (read $ unpack go) (read $ unpack ev)
          getGo = dropWhile (not . B.isPrefixOf (pack "GO:"))

-- | GO maps GO terms (GO:xxxx for some number xxxx) to biologically meaningful terms.
--   Defined in <http://www.geneontology.org/doc/GO.terms_and_ids>
--   The format is  "GO:0000000 [tab] text string  [tab] F|P|C"
data GoDef = GoDef !GoTerm !ByteString !GoClass deriving (Show)

mkGoDef :: ByteString -> GoDef
mkGoDef = pick . B.split '\t'
    where pick [go,desc,cls] = GoDef (read $ unpack go) (copy desc) (read $ unpack cls)
          pick _xs = error ("Couldn't decipher GO definition from: "++show _xs)

-- | Evidence codes describe the type of support for an annotation
-- <http://www.geneontology.org/GO.evidence.shtml>
data EvidenceCode = IC  -- Inferred by Curator
	          | IDA -- Inferred from Direct Assay
	          | IEA -- Inferred from Electronic Annotation
	          | IEP -- Inferred from Expression Pattern
	          | IGC -- Inferred from Genomic Context
	          | IGI -- Inferred from Genetic Interaction
	          | IMP -- Inferred from Mutant Phenotype
	          | IPI -- Inferred from Physical Interaction
	          | ISS -- Inferred from Sequence or Structural Similarity
	          | NAS -- Non-traceable Author Statement
	          | ND  -- No biological Data available
	          | RCA -- inferred from Reviewed Computational Analysis
	          | TAS -- Traceable Author Statement
	          | NR  -- Not Recorded 
     deriving (Read,Show,Eq)

-- | The vast majority of GOA data is IEA, while the most reliable information
--   is manually curated.  Filtering on this is useful to keep data set sizes
--   manageable, too.
isCurated :: EvidenceCode -> Bool
isCurated = not . (`elem` [ND,IEA])