{- | Functionality for manipulating KEGG annotations. KEGG is a bit hard find, but there exist species-specific tables Available organisms are listed in the table at Data for each organism is stored its own subdirectory under Containing tables linking everything -- including external resources like UniProt, PDB, or NCBI -- together. -} module Bio.Sequence.KEGG where import Bio.Sequence.GeneOntology (UniProtAcc) import qualified Data.ByteString.Lazy.Char8 as B import Data.ByteString.Lazy.Char8 (ByteString) import Control.Arrow ((&&&)) -- | Most KEGG files that contain associations, have one association per line, -- consisting of two items separated by whitespace. This is a generalized reader -- function. genReadKegg :: FilePath -> IO [(ByteString,ByteString)] genReadKegg f = return . map (((!!0) &&& (!!1)) . B.words) . B.lines =<< B.readFile f newtype KO = KO ByteString instance Show KO where show (KO x) = B.unpack x -- | Convert UniProt IDs (up:xxxxxx) to the "UniProtAcc" type. decodeUP :: ByteString -> UniProtAcc decodeUP = removePrefix "up" "UniProt accession" id -- | Convert KO IDs (ko:xxxxx) to the "KO" data type. decodeKO :: ByteString -> KO decodeKO = removePrefix "ko" "KEGG KO id" KO -- | KEGG uses strings with an identifying prefix for IDs. This helper function checks -- and removes prefix to construct native values. removePrefix :: String -> String -> (ByteString -> a) -> ByteString -> a removePrefix pfx err conv bs | (B.pack (pfx++":") `B.isPrefixOf` bs) = conv $ B.drop (fromIntegral (length pfx+1)) bs | otherwise = error ("Can't parse as "++err ++":\n" ++B.unpack bs)