{- | 
   Functionality for manipulating KEGG annotations.

   KEGG is a bit hard find, but there exist species-specific tables
   Available organisms are listed in the table at

      <ftp://ftp.genome.jp/pub/kegg/genes/etc/all_species.tab>

   Data for each organism is stored its own subdirectory under

      <ftp://ftp.genome.jp/pub/kegg/genes/organisms/>

   Containing tables linking everything -- including external resources like
   UniProt, PDB, or NCBI -- together.
-}

module Bio.Sequence.KEGG where

import Bio.Sequence.GeneOntology (UniProtAcc)

import qualified Data.ByteString.Lazy.Char8 as B
import Data.ByteString.Lazy.Char8 (ByteString)
import Control.Arrow ((&&&))

-- | Most KEGG files that contain associations, have one association per line,
--   consisting of two items separated by whitespace.  This is a generalized reader
--   function.
genReadKegg :: FilePath -> IO [(ByteString,ByteString)]
genReadKegg f = return . map (((!!0) &&& (!!1)) . B.words) . B.lines =<< B.readFile f

newtype KO = KO ByteString
instance Show KO where show (KO x) = B.unpack x

-- | Convert UniProt IDs (up:xxxxxx) to the "UniProtAcc" type.
decodeUP :: ByteString -> UniProtAcc
decodeUP = removePrefix "up" "UniProt accession" id

-- | Convert KO IDs (ko:xxxxx) to the "KO" data type.
decodeKO :: ByteString -> KO
decodeKO = removePrefix "ko" "KEGG KO id" KO

-- | KEGG uses strings with an identifying prefix for IDs. This helper function checks
--   and removes prefix to construct native values.
removePrefix :: String -> String ->  (ByteString -> a) -> ByteString -> a
removePrefix pfx err conv bs
    | (B.pack (pfx++":") `B.isPrefixOf` bs) = conv $ B.drop (fromIntegral (length pfx+1)) bs
    | otherwise                             = error ("Can't parse as "++err ++":\n"
                                                        ++B.unpack bs)