{-# LANGUAGE OverloadedStrings #-} module Bio.GO.Parser ( readOWL , readOWLAsMap ) where import Control.Arrow ((&&&)) import qualified Data.ByteString.Lazy.Char8 as L import qualified Data.HashMap.Strict as M import Data.Maybe import qualified Data.Text as T import Data.Text.Encoding (encodeUtf8) import Text.XML.Expat.Proc import Text.XML.Expat.Tree import Bio.GO readOWL :: FilePath -> IO [GO] readOWL fl = do c <- L.readFile fl let (xml, _) = parse defaultParseOptions c goTerms = findChildren "owl:Class" (xml :: Node T.Text T.Text) return . map pickle $ goTerms where pickle x = let id' = encodeUtf8 . f $ findChild "oboInOwl:id" x label = f $ findChild "rdfs:label" x parent = ( encodeUtf8 . T.replace "_" ":" . snd . T.breakOnEnd "/" . snd . head . getAttributes ) <$> findChild "rdfs:subClassOf" x namespace = f $ findChild "oboInOwl:hasOBONamespace" x in GO id' label parent namespace f = getText . head . getChildren . fromJust readOWLAsMap :: FilePath -> IO GOMap readOWLAsMap fl = M.fromList . map (_oboId &&& id) <$> readOWL fl