{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE StrictData #-}
module Bio.GO.Parser
    ( readOWL
    , readOWLAsMap
    , GAF(..)
    , readGAF
    ) where

import           Control.Arrow              ((&&&))
import Conduit
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict        as M
import           Data.Text.Encoding         (decodeUtf8)
import           Text.XML.Expat.Proc
import           Text.XML.Expat.Tree
import Data.Maybe
import qualified Data.CaseInsensitive as CI

import           Bio.GO
import           Bio.Utils.Misc (readInt)

readOWL :: FilePath -> IO [GO]
readOWL :: FilePath -> IO [GO]
readOWL FilePath
fl = do
    Node ByteString ByteString
xml <- ParseOptions ByteString ByteString
-> ByteString -> Node ByteString ByteString
forall tag text.
(GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text -> ByteString -> Node tag text
parseThrowing ParseOptions ByteString ByteString
forall tag text. ParseOptions tag text
defaultParseOptions (ByteString -> Node ByteString ByteString)
-> IO ByteString -> IO (Node ByteString ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath -> IO ByteString
L.readFile FilePath
fl :: IO (Node B.ByteString B.ByteString)
    [GO] -> IO [GO]
forall (m :: * -> *) a. Monad m => a -> m a
return ([GO] -> IO [GO]) -> [GO] -> IO [GO]
forall a b. (a -> b) -> a -> b
$ (Node ByteString ByteString -> GO)
-> [Node ByteString ByteString] -> [GO]
forall a b. (a -> b) -> [a] -> [b]
map Node ByteString ByteString -> GO
forall tag (n :: (* -> *) -> * -> * -> *).
(NodeClass n [], GenericXMLString tag, IsString tag,
 Show (n [] tag ByteString)) =>
n [] tag ByteString -> GO
process ([Node ByteString ByteString] -> [GO])
-> [Node ByteString ByteString] -> [GO]
forall a b. (a -> b) -> a -> b
$ (Node ByteString ByteString -> Bool)
-> Node ByteString ByteString -> [Node ByteString ByteString]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
NodeClass n c =>
(n c tag text -> Bool) -> n c tag text -> c (n c tag text)
filterChildren (\Node ByteString ByteString
x -> ByteString
"owl:Class" ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== Node ByteString ByteString -> ByteString
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
(NodeClass n c, Monoid tag) =>
n c tag text -> tag
getName Node ByteString ByteString
x Bool -> Bool -> Bool
&& Bool -> Bool
not (Node ByteString ByteString -> Bool
forall (n :: (* -> *) -> * -> * -> *) tag text.
(NodeClass n [], GenericXMLString tag, IsString tag) =>
n [] tag text -> Bool
isDeprecated Node ByteString ByteString
x)) Node ByteString ByteString
xml
  where
    isDeprecated :: n [] tag text -> Bool
isDeprecated n [] tag text
x = Maybe (n [] tag text) -> Bool
forall a. Maybe a -> Bool
isJust (Maybe (n [] tag text) -> Bool) -> Maybe (n [] tag text) -> Bool
forall a b. (a -> b) -> a -> b
$ tag -> n [] tag text -> Maybe (n [] tag text)
forall (n :: (* -> *) -> * -> * -> *) tag text.
(NodeClass n [], GenericXMLString tag) =>
tag -> n [] tag text -> Maybe (n [] tag text)
findChild tag
"owl:deprecated" n [] tag text
x
    process :: n [] tag ByteString -> GO
process n [] tag ByteString
record = GOId -> Text -> [GOId] -> Text -> GO
GO GOId
id' Text
label [GOId]
parent Text
namespace
      where
        id' :: GOId
id' = case tag -> n [] tag ByteString -> Maybe (n [] tag ByteString)
forall (n :: (* -> *) -> * -> * -> *) tag text.
(NodeClass n [], GenericXMLString tag) =>
tag -> n [] tag text -> Maybe (n [] tag text)
findChild tag
"oboInOwl:id" n [] tag ByteString
record of
            Maybe (n [] tag ByteString)
Nothing -> FilePath -> GOId
forall a. HasCallStack => FilePath -> a
error (FilePath -> GOId) -> FilePath -> GOId
forall a b. (a -> b) -> a -> b
$ FilePath
"Cannot find id field for: " FilePath -> FilePath -> FilePath
forall a. Semigroup a => a -> a -> a
<> n [] tag ByteString -> FilePath
forall a. Show a => a -> FilePath
show n [] tag ByteString
record
            Just n [] tag ByteString
i -> ByteString -> GOId
readInt (ByteString -> GOId) -> ByteString -> GOId
forall a b. (a -> b) -> a -> b
$ (ByteString, ByteString) -> ByteString
forall a b. (a, b) -> b
snd ((ByteString, ByteString) -> ByteString)
-> (ByteString, ByteString) -> ByteString
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
':') (ByteString -> (ByteString, ByteString))
-> ByteString -> (ByteString, ByteString)
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ (n [] tag ByteString -> ByteString)
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map n [] tag ByteString -> ByteString
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) text tag.
(NodeClass n c, Monoid text) =>
n c tag text -> text
getText ([n [] tag ByteString] -> [ByteString])
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> a -> b
$ n [] tag ByteString -> [n [] tag ByteString]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
NodeClass n c =>
n c tag text -> c (n c tag text)
getChildren n [] tag ByteString
i
        label :: Text
label = case tag -> n [] tag ByteString -> Maybe (n [] tag ByteString)
forall (n :: (* -> *) -> * -> * -> *) tag text.
(NodeClass n [], GenericXMLString tag) =>
tag -> n [] tag text -> Maybe (n [] tag text)
findChild tag
"rdfs:label" n [] tag ByteString
record of
            Maybe (n [] tag ByteString)
Nothing -> FilePath -> Text
forall a. HasCallStack => FilePath -> a
error FilePath
"readOWL: cannot find label field"
            Just n [] tag ByteString
l -> ByteString -> Text
decodeUtf8 (ByteString -> Text) -> ByteString -> Text
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ (n [] tag ByteString -> ByteString)
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map n [] tag ByteString -> ByteString
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) text tag.
(NodeClass n c, Monoid text) =>
n c tag text -> text
getText ([n [] tag ByteString] -> [ByteString])
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> a -> b
$ n [] tag ByteString -> [n [] tag ByteString]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
NodeClass n c =>
n c tag text -> c (n c tag text)
getChildren n [] tag ByteString
l
        namespace :: Text
namespace = case tag -> n [] tag ByteString -> Maybe (n [] tag ByteString)
forall (n :: (* -> *) -> * -> * -> *) tag text.
(NodeClass n [], GenericXMLString tag) =>
tag -> n [] tag text -> Maybe (n [] tag text)
findChild tag
"oboInOwl:hasOBONamespace" n [] tag ByteString
record of
            Maybe (n [] tag ByteString)
Nothing -> FilePath -> Text
forall a. HasCallStack => FilePath -> a
error FilePath
"readOWL: cannot find namespace field"
            Just n [] tag ByteString
ns -> ByteString -> Text
decodeUtf8 (ByteString -> Text) -> ByteString -> Text
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
B.concat ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ (n [] tag ByteString -> ByteString)
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map n [] tag ByteString -> ByteString
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) text tag.
(NodeClass n c, Monoid text) =>
n c tag text -> text
getText ([n [] tag ByteString] -> [ByteString])
-> [n [] tag ByteString] -> [ByteString]
forall a b. (a -> b) -> a -> b
$ n [] tag ByteString -> [n [] tag ByteString]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
NodeClass n c =>
n c tag text -> c (n c tag text)
getChildren n [] tag ByteString
ns
        parent :: [GOId]
parent =
            let f :: n c a ByteString -> Maybe GOId
f n c a ByteString
p = case a -> [(a, ByteString)] -> Maybe ByteString
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup a
"rdf:resource" (n c a ByteString -> [(a, ByteString)]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
NodeClass n c =>
n c tag text -> [(tag, text)]
getAttributes n c a ByteString
p) of
                    Maybe ByteString
Nothing -> Maybe GOId
forall a. Maybe a
Nothing
                    Just ByteString
at -> GOId -> Maybe GOId
forall a. a -> Maybe a
Just (GOId -> Maybe GOId) -> GOId -> Maybe GOId
forall a b. (a -> b) -> a -> b
$ ByteString -> GOId
readInt (ByteString -> GOId) -> ByteString -> GOId
forall a b. (a -> b) -> a -> b
$ (ByteString, ByteString) -> ByteString
forall a b. (a, b) -> b
snd ((ByteString, ByteString) -> ByteString)
-> (ByteString, ByteString) -> ByteString
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'_') ByteString
at
            in (n [] tag ByteString -> Maybe GOId)
-> [n [] tag ByteString] -> [GOId]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe n [] tag ByteString -> Maybe GOId
forall a (n :: (* -> *) -> * -> * -> *) (c :: * -> *).
(Eq a, IsString a, NodeClass n c) =>
n c a ByteString -> Maybe GOId
f ([n [] tag ByteString] -> [GOId])
-> [n [] tag ByteString] -> [GOId]
forall a b. (a -> b) -> a -> b
$ tag -> n [] tag ByteString -> [n [] tag ByteString]
forall (n :: (* -> *) -> * -> * -> *) (c :: * -> *) tag text.
(NodeClass n c, Eq tag, Monoid tag) =>
tag -> n c tag text -> c (n c tag text)
findChildren tag
"rdfs:subClassOf" n [] tag ByteString
record 

readOWLAsMap :: FilePath -> IO GOMap
readOWLAsMap :: FilePath -> IO GOMap
readOWLAsMap FilePath
fl = (GO -> GO -> GO) -> [(GOId, GO)] -> GOMap
forall k v.
(Eq k, Hashable k) =>
(v -> v -> v) -> [(k, v)] -> HashMap k v
M.fromListWith GO -> GO -> GO
forall a. a
errMsg ([(GOId, GO)] -> GOMap) -> ([GO] -> [(GOId, GO)]) -> [GO] -> GOMap
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (GO -> (GOId, GO)) -> [GO] -> [(GOId, GO)]
forall a b. (a -> b) -> [a] -> [b]
map (GO -> GOId
_oboId (GO -> GOId) -> (GO -> GO) -> GO -> (GOId, GO)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& GO -> GO
forall a. a -> a
id) ([GO] -> GOMap) -> IO [GO] -> IO GOMap
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> FilePath -> IO [GO]
readOWL FilePath
fl
  where
    errMsg :: a
errMsg = FilePath -> a
forall a. HasCallStack => FilePath -> a
error FilePath
"readOWLAsMap: Duplicate records."

data GAF = GAF
    { GAF -> ByteString
gafDb :: B.ByteString
    , GAF -> ByteString
gafDbId :: B.ByteString
    , GAF -> CI ByteString
gafSymbol :: CI.CI B.ByteString
    , GAF -> Maybe [ByteString]
gafQualifier :: Maybe [B.ByteString]
    , GAF -> GOId
gafGoId :: GOId
    , GAF -> [ByteString]
gafDbRef :: [B.ByteString]
    , GAF -> ByteString
gafEvidenceCode :: B.ByteString
    , GAF -> Maybe [ByteString]
gafWithOrFrom :: Maybe [B.ByteString]
    , GAF -> ByteString
gafAspect :: B.ByteString
    , GAF -> Maybe ByteString
gafName :: Maybe B.ByteString
    , GAF -> Maybe [ByteString]
gafSynonym :: Maybe [B.ByteString]
    , GAF -> ByteString
gafType :: B.ByteString
    , GAF -> [ByteString]
gafTaxon :: [B.ByteString]
    , GAF -> ByteString
gafDate :: B.ByteString
    , GAF -> ByteString
gafAssignedBy :: B.ByteString
    , GAF -> Maybe [ByteString]
gafAnnotationExtension :: Maybe [B.ByteString]
    , GAF -> Maybe ByteString
gafGeneProductID :: Maybe B.ByteString
    } deriving (GOId -> GAF -> FilePath -> FilePath
[GAF] -> FilePath -> FilePath
GAF -> FilePath
(GOId -> GAF -> FilePath -> FilePath)
-> (GAF -> FilePath) -> ([GAF] -> FilePath -> FilePath) -> Show GAF
forall a.
(GOId -> a -> FilePath -> FilePath)
-> (a -> FilePath) -> ([a] -> FilePath -> FilePath) -> Show a
showList :: [GAF] -> FilePath -> FilePath
$cshowList :: [GAF] -> FilePath -> FilePath
show :: GAF -> FilePath
$cshow :: GAF -> FilePath
showsPrec :: GOId -> GAF -> FilePath -> FilePath
$cshowsPrec :: GOId -> GAF -> FilePath -> FilePath
Show, Eq GAF
Eq GAF
-> (GAF -> GAF -> Ordering)
-> (GAF -> GAF -> Bool)
-> (GAF -> GAF -> Bool)
-> (GAF -> GAF -> Bool)
-> (GAF -> GAF -> Bool)
-> (GAF -> GAF -> GAF)
-> (GAF -> GAF -> GAF)
-> Ord GAF
GAF -> GAF -> Bool
GAF -> GAF -> Ordering
GAF -> GAF -> GAF
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: GAF -> GAF -> GAF
$cmin :: GAF -> GAF -> GAF
max :: GAF -> GAF -> GAF
$cmax :: GAF -> GAF -> GAF
>= :: GAF -> GAF -> Bool
$c>= :: GAF -> GAF -> Bool
> :: GAF -> GAF -> Bool
$c> :: GAF -> GAF -> Bool
<= :: GAF -> GAF -> Bool
$c<= :: GAF -> GAF -> Bool
< :: GAF -> GAF -> Bool
$c< :: GAF -> GAF -> Bool
compare :: GAF -> GAF -> Ordering
$ccompare :: GAF -> GAF -> Ordering
$cp1Ord :: Eq GAF
Ord, GAF -> GAF -> Bool
(GAF -> GAF -> Bool) -> (GAF -> GAF -> Bool) -> Eq GAF
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: GAF -> GAF -> Bool
$c/= :: GAF -> GAF -> Bool
== :: GAF -> GAF -> Bool
$c== :: GAF -> GAF -> Bool
Eq)

-- | GO Annotation File (GAF) Format 2.1 Parser. For details read:
-- http://geneontology.org/page/go-annotation-file-gaf-format-21.
readGAF :: FilePath -> ConduitT i GAF (ResourceT IO) ()
readGAF :: FilePath -> ConduitT i GAF (ResourceT IO) ()
readGAF FilePath
input = FilePath -> ConduitT i ByteString (ResourceT IO) ()
forall (m :: * -> *) i.
MonadResource m =>
FilePath -> ConduitT i ByteString m ()
sourceFileBS FilePath
input ConduitT i ByteString (ResourceT IO) ()
-> ConduitM ByteString GAF (ResourceT IO) ()
-> ConduitT i GAF (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT ByteString ByteString (ResourceT IO) ()
forall (m :: * -> *) seq.
(Monad m, IsSequence seq, Element seq ~ Word8) =>
ConduitT seq seq m ()
linesUnboundedAsciiC ConduitT ByteString ByteString (ResourceT IO) ()
-> ConduitM ByteString GAF (ResourceT IO) ()
-> ConduitM ByteString GAF (ResourceT IO) ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.|
    ((ByteString -> Bool) -> ConduitM ByteString GAF (ResourceT IO) ()
forall (m :: * -> *) a o.
Monad m =>
(a -> Bool) -> ConduitT a o m ()
dropWhileC ByteString -> Bool
isCom ConduitM ByteString GAF (ResourceT IO) ()
-> ConduitM ByteString GAF (ResourceT IO) ()
-> ConduitM ByteString GAF (ResourceT IO) ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (ByteString -> GAF) -> ConduitM ByteString GAF (ResourceT IO) ()
forall (m :: * -> *) a b. Monad m => (a -> b) -> ConduitT a b m ()
mapC ByteString -> GAF
parseLine)
  where
    isCom :: ByteString -> Bool
isCom ByteString
l = ByteString -> Char
B.head ByteString
l Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'!' Bool -> Bool -> Bool
|| ByteString -> Bool
B.null ByteString
l
{-# INLINE readGAF #-}

parseLine :: B.ByteString -> GAF
parseLine :: ByteString -> GAF
parseLine ByteString
l = ByteString
-> ByteString
-> CI ByteString
-> Maybe [ByteString]
-> GOId
-> [ByteString]
-> ByteString
-> Maybe [ByteString]
-> ByteString
-> Maybe ByteString
-> Maybe [ByteString]
-> ByteString
-> [ByteString]
-> ByteString
-> ByteString
-> Maybe [ByteString]
-> Maybe ByteString
-> GAF
GAF ByteString
f1 ByteString
f2 (ByteString -> CI ByteString
forall s. FoldCase s => s -> CI s
CI.mk ByteString
f3) (ByteString -> Maybe [ByteString]
optionals ByteString
f4)
    (ByteString -> GOId
readInt (ByteString -> GOId) -> ByteString -> GOId
forall a b. (a -> b) -> a -> b
$ (ByteString, ByteString) -> ByteString
forall a b. (a, b) -> b
snd ((ByteString, ByteString) -> ByteString)
-> (ByteString, ByteString) -> ByteString
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ByteString -> (ByteString, ByteString)
B.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
':') ByteString
f5) (Char -> ByteString -> [ByteString]
B.split Char
'|' ByteString
f6) ByteString
f7 (ByteString -> Maybe [ByteString]
optionals ByteString
f8)
    ByteString
f9 (ByteString -> Maybe ByteString
optional ByteString
f10) (ByteString -> Maybe [ByteString]
optionals ByteString
f11) ByteString
f12 (Char -> ByteString -> [ByteString]
B.split Char
'|' ByteString
f13) ByteString
f14 ByteString
f15
    (ByteString -> Maybe [ByteString]
optionals ByteString
f16) (ByteString -> Maybe ByteString
optional ByteString
f17)
  where
    [ByteString
f1,ByteString
f2,ByteString
f3,ByteString
f4,ByteString
f5,ByteString
f6,ByteString
f7,ByteString
f8,ByteString
f9,ByteString
f10,ByteString
f11,ByteString
f12,ByteString
f13,ByteString
f14,ByteString
f15,ByteString
f16,ByteString
f17] = Char -> ByteString -> [ByteString]
B.split Char
'\t' ByteString
l
    optional :: ByteString -> Maybe ByteString
optional ByteString
x | ByteString -> Bool
B.null ByteString
x = Maybe ByteString
forall a. Maybe a
Nothing
               | Bool
otherwise = ByteString -> Maybe ByteString
forall a. a -> Maybe a
Just ByteString
x
    optionals :: ByteString -> Maybe [ByteString]
optionals ByteString
x | ByteString -> Bool
B.null ByteString
x = Maybe [ByteString]
forall a. Maybe a
Nothing
                | Bool
otherwise = [ByteString] -> Maybe [ByteString]
forall a. a -> Maybe a
Just ([ByteString] -> Maybe [ByteString])
-> [ByteString] -> Maybe [ByteString]
forall a b. (a -> b) -> a -> b
$ Char -> ByteString -> [ByteString]
B.split Char
'|' ByteString
x
{-# INLINE parseLine #-}