{- | Link sequences (read from the xml file) to desired annotations 
     (Currently support GO, todo support KEGG)
 -}
module Annotate where

import Data.ByteString.Lazy.Char8 (ByteString,unpack,pack)
import qualified Data.ByteString.Lazy.Char8 as B
import Data.Map  (Map)
import qualified Data.Map as M
import Data.Set  (Set)
import qualified Data.Set as S
import Data.List (nub)
import Data.Maybe

import Control.Monad
import Control.Arrow (second)

import Bio.Sequence.GeneOntology
import Bio.Sequence.KEGG
import qualified Bio.Alignment.BlastData as BD
import Bio.Alignment.BlastData (BlastResult)
import Bio.Alignment.BlastFlat 
import Bio.Alignment.BlastXML as X -- new BlastFlat exports this too
import Data.Array.Unboxed
import Data.Word

import Options

type GoDefinitions = Map GoTerm (GoDef,[GoTerm])
type GoAnnotations = Map UniProtAcc (UArray Int Word16)
type KoAnnotations = Map UniProtAcc KO

{- | Read GO annotations using GOA.
   
   1. build a map of GO terms to description (25K lines - tiny)
   2. collect all proteins names from the XML file (1 pass over .xml)
   3. build a map of proteins to GO terms (1 pass over gene_assoc..), and add GO term desc
-}
read_go :: Options -> IO (GoDefinitions, GoAnnotations)
read_go opts  = do  -- (Opts (Just go) (Just goa),[xml],[])
  -- goterms are not necessary when we don't also use goanns
  case goann opts of
    Nothing -> return (M.empty,M.empty)
    Just goa -> do
      ts <- case godef opts of
              Nothing -> return M.empty
              Just obo -> do
                    t <- return . goTerms =<< (count opts) 1000 "reading GO terms: "
                           =<< readObo obo
                    when (isNothing $ M.lookup (GO 0) t) $ return () -- strictness hack
                    return t

      proteins <- getProts opts
      when (S.member (B.empty) proteins == False) $ return () -- strictness
      mt <- readGOA goa >>= (count opts) 1000 "reading GO annotations: "
            >>= return .  protTerms proteins
      when (isNothing $ M.lookup B.empty mt) $ return () -- strictness, again
      return (ts,mt)
  

-- ----------------------------------------------------------------------
-- Build tables of annotation information
-- ----------------------------------------------------------------------

-- | Parse GO.terms_and_ids into a map
goTerms :: [(GoDef,[GoTerm])] -> GoDefinitions
goTerms = M.fromList . map go2keyval
    where go2keyval gd@(GoDef gt _ _,_) = (gt,gd)

-- | Read blast xml output and collect UniProt hits
getProts :: Options -> IO (Set UniProtAcc)
getProts opts = do
  hs <- mapM (\f -> (count opts) 100 ("reading accessions from '"++f++"': ")
                    . concatMap BD.hits . concatMap BD.results =<< X.readXML f) (inputs opts)
  return . S.fromList . map (chop . BD.subject) $ concat hs

-- | Convert "UniProt_ABCDEF Blah blah blah" to "ABCDEF"
--   Warning: only tested on this format, may break!
chop :: ByteString -> ByteString
chop = B.copy . B.drop 1 . B.dropWhile (/='_') . head . B.words

-- | Using GO definitions and the set of uniprot terms, scan the associations file and
--   snarf all relevant data.  Requires the annotations to be grouped by protein.
protTerms :: (Set UniProtAcc) -> [Annotation] -> GoAnnotations
protTerms ps as = M.fromAscList . map toArray . partitions . filter isMember $ as
    where toArray (Ann up (GO gt1) _:xs) = let gs = nub (gt1:[ gt | Ann _ (GO gt) _ <- xs ])
                                               a  = listArray (0,length gs-1) $ map fromIntegral gs
                                           in a `seq` (pack . unpack $ up,a)
          toArray [] = error "cannot read an empty list of annotations!"
          isMember (Ann up _ _) = up `S.member` ps
          partitions xs@(Ann x _ _:_) = let (p1,pps) = span (\(Ann y _ _) -> y==x) xs
                                        in p1:partitions pps
          partitions [] = []


{- | Read KEGG annotations 
   KEGG is specified as abbreviated organism prefixes. 
   Files to be read are $prefix_ko.list and $prefix_uniprot.list.
 -}
read_kegg :: Options -> IO KoAnnotations
read_kegg opts = M.unions `fmap` mapM build_kegg (ko opts)
    where -- build KEGG info for a single organism
          build_kegg org = do 
            k <- return . M.fromList =<<
                 (count opts) 1000 ("parsing KEGG information for '"++org++"'.. ")
                     =<< merge_files org
            when (isNothing $ M.lookup (B.pack "") k) $ return () -- strictness hack
            return k
          -- parse and merge files; they seem to be lexicographically sorted
          merge_files org = do
            g2u <- map (second decodeUP) `fmap` genReadKegg (org++"_uniprot.list") 
            g2k <- map (second decodeKO) `fmap` genReadKegg (org++"_ko.list")
            return $ merge g2u g2k

          -- merge sort on fst, returning snd            
          merge :: [(ByteString,UniProtAcc)] -> [(ByteString,KO)] -> [(UniProtAcc,KO)]
          merge (x:xs) (y:ys) | fst x < fst y = merge xs (y:ys)
                              | fst x > fst y = merge (x:xs) ys
                              | otherwise     = (snd x,snd y) : merge xs ys
          merge [] _ = []
          merge _ [] = []