{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE StrictData #-}
module Bio.GO.Parser
( readOWL
, readOWLAsMap
, GAF(..)
, readGAF
) where
import Control.Arrow ((&&&))
import Conduit
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict as M
import Data.Text.Encoding (decodeUtf8)
import Text.XML.Expat.Proc
import Text.XML.Expat.Tree
import Data.Maybe
import qualified Data.CaseInsensitive as CI
import Bio.GO
import Bio.Utils.Misc (readInt)
-- | Read the non-deprecated @owl:Class@ entries of an OWL file as 'GO' terms.
--
-- The file is read as a lazy 'L.ByteString' and parsed with 'parseThrowing'.
-- Only direct children of the document root named @owl:Class@ that have no
-- @owl:deprecated@ child are converted. A class without an @oboInOwl:id@,
-- @rdfs:label@ or @oboInOwl:hasOBONamespace@ child is reported via 'error'.
readOWL :: FilePath -> IO [GO]
readOWL fl = do
    contents <- L.readFile fl
    let root :: Node B.ByteString B.ByteString
        root = parseThrowing defaultParseOptions contents
    return [toGO cls | cls <- filterChildren isLiveClass root]
  where
    isLiveClass node = getName node == "owl:Class" && not (isObsolete node)
    isObsolete node = isJust (findChild "owl:deprecated" node)

    -- Build one GO term from an @owl:Class@ element.
    toGO record = GO termId termLabel parents termNamespace
      where
        -- Concatenated text of the node's direct children.
        childText node = B.concat (map getText (getChildren node))

        -- Decoded text of a mandatory child element, or 'error' with @msg@.
        requiredText msg tag =
            maybe (error msg) (decodeUtf8 . childText) (findChild tag record)

        -- Numeric id: whatever follows the last ':' in the id text.
        termId = maybe
            (error $ "Cannot find id field for: " <> show record)
            (readInt . snd . B.breakEnd (== ':') . childText)
            (findChild "oboInOwl:id" record)

        termLabel = requiredText "readOWL: cannot find label field" "rdfs:label"

        termNamespace = requiredText
            "readOWL: cannot find namespace field" "oboInOwl:hasOBONamespace"

        -- Parent ids: whatever follows the last '_' of each
        -- @rdfs:subClassOf@ element's @rdf:resource@ attribute; elements
        -- without that attribute are skipped.
        parents = mapMaybe parentId (findChildren "rdfs:subClassOf" record)
        parentId node =
            readInt . snd . B.breakEnd (== '_')
                <$> lookup "rdf:resource" (getAttributes node)
-- | Read an OWL file with 'readOWL' and index the resulting terms by
-- '_oboId'. Two terms sharing the same id are reported via 'error'.
readOWLAsMap :: FilePath -> IO GOMap
readOWLAsMap fl = do
    terms <- readOWL fl
    return $ M.fromListWith rejectDuplicate (map keyed terms)
  where
    -- Pair every term with its own id as the map key.
    keyed term = (_oboId &&& id) term
    -- Combining function: only ever invoked when a key occurs twice.
    rejectDuplicate _ _ = error "readOWLAsMap: Duplicate records."
-- | One annotation record of a GAF file.
--
-- Fields are listed in the order of the tab-separated columns consumed by
-- the parser in this module. Columns described as \"split on @|@\" hold a
-- list of values; columns wrapped in 'Maybe' are 'Nothing' when the column
-- is empty.
data GAF = GAF
    { gafDb :: B.ByteString
      -- ^ Column 1, stored verbatim (presumably the source database name).
    , gafDbId :: B.ByteString
      -- ^ Column 2, stored verbatim (presumably the object id within 'gafDb').
    , gafSymbol :: CI.CI B.ByteString
      -- ^ Column 3, wrapped so that comparisons ignore case.
    , gafQualifier :: Maybe [B.ByteString]
      -- ^ Column 4, split on @|@.
    , gafGoId :: GOId
      -- ^ Column 5: the part after the last @:@ parsed as a number.
    , gafDbRef :: [B.ByteString]
      -- ^ Column 6, split on @|@.
    , gafEvidenceCode :: B.ByteString
      -- ^ Column 7, stored verbatim.
    , gafWithOrFrom :: Maybe [B.ByteString]
      -- ^ Column 8, split on @|@.
    , gafAspect :: B.ByteString
      -- ^ Column 9, stored verbatim.
    , gafName :: Maybe B.ByteString
      -- ^ Column 10.
    , gafSynonym :: Maybe [B.ByteString]
      -- ^ Column 11, split on @|@.
    , gafType :: B.ByteString
      -- ^ Column 12, stored verbatim.
    , gafTaxon :: [B.ByteString]
      -- ^ Column 13, split on @|@.
    , gafDate :: B.ByteString
      -- ^ Column 14, stored verbatim (unparsed).
    , gafAssignedBy :: B.ByteString
      -- ^ Column 15, stored verbatim.
    , gafAnnotationExtension :: Maybe [B.ByteString]
      -- ^ Column 16, split on @|@.
    , gafGeneProductID :: Maybe B.ByteString
      -- ^ Column 17.
    } deriving (Show, Ord, Eq)
-- | Stream the annotation records of a GAF file.
--
-- The file is split into lines; lines that are empty or start with @!@
-- (comments / header) are skipped wherever they occur, and every remaining
-- line is converted with 'parseLine'.
readGAF :: FilePath -> ConduitT i GAF (ResourceT IO) ()
readGAF input = sourceFileBS input .| linesUnboundedAsciiC .|
    filterC (not . isCom) .| mapC parseLine
  where
    -- Total check: 'B.uncons' avoids calling the partial 'B.head' on an
    -- empty line, which previously crashed before the emptiness test ran.
    isCom l = case B.uncons l of
        Nothing     -> True
        Just (c, _) -> c == '!'
{-# INLINE readGAF #-}
-- | Parse one tab-separated GAF annotation line into a 'GAF' record.
--
-- A line must contain 15, 16 or 17 fields. Lines with 15 or 16 fields are
-- accepted by treating the missing trailing columns (16 and 17, both
-- optional) as empty. Any other field count aborts with 'error', quoting
-- the offending line.
parseLine :: B.ByteString -> GAF
parseLine l = case padded of
    [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17] ->
        GAF f1 f2 (CI.mk f3) (optionals f4)
            (readInt $ snd $ B.breakEnd (==':') f5) (B.split '|' f6) f7
            (optionals f8) f9 (optional f10) (optionals f11) f12
            (B.split '|' f13) f14 f15 (optionals f16) (optional f17)
    _ -> error $ "readGAF: expected 15 to 17 tab-separated fields but found "
            ++ show nFields ++ " in line: " ++ show l
  where
    fields = B.split '\t' l
    nFields = length fields
    -- Pad short (but still valid) records up to the full 17 columns.
    padded
        | nFields == 15 || nFields == 16 =
            fields ++ replicate (17 - nFields) B.empty
        | otherwise = fields
    -- An empty column means "no value".
    optional x | B.null x = Nothing
               | otherwise = Just x
    -- As 'optional', but a non-empty column holds a '|'-separated list.
    optionals x | B.null x = Nothing
                | otherwise = Just $ B.split '|' x
{-# INLINE parseLine #-}