module Bio.GO.Parser
( readOWL
, readOWLAsMap
) where
import Control.Arrow ((&&&))
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.HashMap.Strict as M
import Data.Maybe
import qualified Data.Text as T
import Data.Text.Encoding (encodeUtf8)
import Text.XML.Expat.Proc
import Text.XML.Expat.Tree
import Bio.GO
-- | Read Gene Ontology terms from an OWL (RDF/XML) file.
--
-- Every @owl:Class@ element that is a direct child of the document root is
-- turned into a 'GO' value built from:
--
--   * the text of its @oboInOwl:id@ child (UTF-8 encoded),
--   * the text of its @rdfs:label@ child,
--   * the first attribute value of the first @rdfs:subClassOf@ child that
--     carries any attribute, reduced to the part after the last @/@ with
--     every @_@ replaced by @:@ ('Nothing' when there is no such child),
--   * the text of its @oboInOwl:hasOBONamespace@ child.
--
-- A class lacking one of the three required children raises an 'error'
-- that names the missing tag. A required child that is present but empty
-- yields empty text.
readOWL :: FilePath -> IO [GO]
readOWL fl = do
    c <- L.readFile fl
    -- NOTE(review): the parser's error report (second tuple component) is
    -- discarded, so a malformed document is not signalled to the caller.
    -- Confirm this is intended before relying on complete results.
    let (xml, _) = parse defaultParseOptions c
        goTerms = findChildren "owl:Class" (xml :: Node T.Text T.Text)
    return (map pickle goTerms)
  where
    -- Assemble one GO term from a single owl:Class element.
    pickle x = GO id' label parent namespace
      where
        id' = encodeUtf8 (requiredText "oboInOwl:id" x)
        label = requiredText "rdfs:label" x
        -- subClassOf children without attributes are skipped rather than
        -- crashing on an empty attribute list.
        parent =
            toTermId . snd
                <$> listToMaybe
                        ( mapMaybe
                            (listToMaybe . getAttributes)
                            (findChildren "rdfs:subClassOf" x)
                        )
        namespace = requiredText "oboInOwl:hasOBONamespace" x

    -- Keep what follows the last '/' and rewrite every '_' as ':'.
    toTermId = encodeUtf8 . T.replace "_" ":" . snd . T.breakOnEnd "/"

    -- Text of the first child node of the first child element named @tag@.
    requiredText tag node =
        case findChild tag node of
            Nothing ->
                error
                    ( "Bio.GO.Parser.readOWL: owl:Class element has no <"
                        ++ T.unpack tag
                        ++ "> child"
                    )
            Just el -> maybe T.empty getText (listToMaybe (getChildren el))
-- | Read the GO terms of an OWL file with 'readOWL' and collect them into a
-- map in which each term is stored under its own '_oboId'.
readOWLAsMap :: FilePath -> IO GOMap
readOWLAsMap fl = do
    terms <- readOWL fl
    -- Pair every term with its identifier so it can serve as the map key.
    let keyed = map (_oboId &&& id) terms
    return (M.fromList keyed)