{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedLists #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DoAndIfThenElse #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
module Text.RDF.RDF4H.XmlParser
( XmlParser(..)
, parseXmlDebug
) where
import Data.RDF.Types hiding (empty, resolveQName)
import qualified Data.RDF.Types as RDF
import Data.RDF.IRI
import Data.RDF.Graph.TList
import Text.RDF.RDF4H.ParserUtils hiding (Parser)
import Text.RDF.RDF4H.XmlParser.Identifiers
import Text.RDF.RDF4H.XmlParser.Xmlbf hiding (Node)
import qualified Text.RDF.RDF4H.XmlParser.Xeno as Xeno
import Control.Applicative
import Control.Monad
import Control.Monad.Except
import Control.Monad.State.Strict
import Data.Semigroup ((<>))
import Data.Set (Set)
import qualified Data.Set as S
import qualified Data.Map as Map
import Data.Maybe
import Data.Either
import Data.Bifunctor
import Data.HashSet (HashSet)
import qualified Data.HashSet as HS
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HM
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as T
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Builder as BB
instance RdfParser XmlParser where
parseString (XmlParser bUrl dUrl) = parseXmlRDF bUrl dUrl
parseFile (XmlParser bUrl dUrl) = parseFile' bUrl dUrl
parseURL (XmlParser bUrl dUrl) = parseURL' bUrl dUrl
data XmlParser =
XmlParser (Maybe BaseUrl)
(Maybe Text)
parseFile' :: (Rdf a)
=> Maybe BaseUrl
-> Maybe Text
-> FilePath
-> IO (Either ParseFailure (RDF a))
parseFile' bUrl dUrl fpath = parseXmlRDF bUrl dUrl <$> TIO.readFile fpath
parseURL' :: (Rdf a)
=> Maybe BaseUrl
-> Maybe Text
-> String
-> IO (Either ParseFailure (RDF a))
parseURL' bUrl docUrl = parseFromURL (parseXmlRDF bUrl docUrl)
type Parser = ParserT (ExceptT String (State ParseState))
data ParseState = ParseState
{ stateBaseUri :: Maybe BaseUrl
, stateIdSet :: Set Text
, statePrefixMapping :: PrefixMappings
, stateLang :: Maybe Text
, stateNodeAttrs :: HashMap Text Text
, stateSubject :: Maybe Subject
, stateCollectionIndex :: Int
, stateGenId :: Int
} deriving(Show)
parseXmlRDF :: (Rdf a)
=> Maybe BaseUrl
-> Maybe Text
-> Text
-> Either ParseFailure (RDF a)
parseXmlRDF bUrl dUrl = parseRdf . parseXml
where
bUrl' = BaseUrl <$> dUrl <|> bUrl
parseXml = Xeno.fromRawXml . T.encodeUtf8
parseRdf = first ParseFailure . join . second parseRdf'
parseRdf' ns = join $ evalState (runExceptT (parseM rdfParser ns)) initState
initState = ParseState bUrl' mempty mempty empty mempty empty 0 0
parseXmlDebug
:: FilePath
-> IO (RDF TList)
parseXmlDebug f = fromRight RDF.empty <$> parseFile (XmlParser (Just . BaseUrl $ "http://base-url.com/") (Just "http://doc-url.com/")) f
rdfParser :: Rdf a => Parser (RDF a)
rdfParser = do
bUri <- currentBaseUri
triples <- (pRdf <* pWs) <|> pNodeElementList
pEndOfInput
mkRdf triples bUri <$> currentPrefixMappings
pRdf :: Parser Triples
pRdf = pAnyElement $ do
attrs <- pRDFAttrs
uri <- pName >>= pQName
guard (uri == rdfTag)
unless (null attrs) $ throwError "rdf:RDF: The set of attributes should be empty."
pNodeElementList
pQName :: Text -> Parser Text
pQName qn = do
pm <- currentPrefixMappings
let qn' = resolveQName pm qn >>= validateIRI
either throwError pure qn'
pRDFAttrs :: Parser (HashMap Text Text)
pRDFAttrs = do
liftA2 (<|>) pLang currentLang >>= setLang
liftA2 (<|>) pBase currentBaseUri >>= setBaseUri
bUri <- currentBaseUri
attrs <- pAttrs
pm <- updatePrefixMappings (PrefixMappings $ HM.foldlWithKey' mkNameSpace mempty attrs)
let as = HM.foldlWithKey' (mkRdfAttribute pm bUri) mempty attrs
setNodeAttrs as
pure as
where
mkNameSpace
:: Map.Map Text Text
-> Text
-> Text
-> Map.Map Text Text
mkNameSpace ns qn iri =
let qn' = parseQName qn
ns' = f <$> qn' <*> validateIRI iri
f (Nothing , "xmlns") iri' = Map.insert mempty iri' ns
f (Just "xmlns", prefix ) iri' = Map.insert prefix iri' ns
f _ _ = ns
in either (const ns) id ns'
mkRdfAttribute
:: PrefixMappings
-> Maybe BaseUrl
-> HM.HashMap Text Text
-> Text
-> Text
-> HM.HashMap Text Text
mkRdfAttribute pm bUri as qn v =
let as' = parseQName qn >>= f
f (Nothing, n)
| T.isPrefixOf "xml" n = Right as
| otherwise = case bUri of
Nothing -> Right as
Just (BaseUrl bUri') -> (\a -> HM.insert a v as) <$> resolveIRI bUri' n
f qn'@(Just prefix, _)
| T.isPrefixOf "xml" prefix = Right as
| otherwise = (\a -> HM.insert a v as) <$> resolveQName' pm qn'
in either (const as) id as'
pRDFAttr :: Text -> Parser Text
pRDFAttr a = do
as <- currentNodeAttrs
maybe
(fail . mconcat $ ["Attribute \"", T.unpack a, "\" not found."])
pure
(HM.lookup a as)
pNodeElementList :: Parser Triples
pNodeElementList = pWs *> (mconcat <$> some (keepState pNodeElement <* pWs))
pWs :: Parser ()
pWs = maybe True (T.all ws . TL.toStrict) <$> optional pText >>= guard
where
ws c = c == '\x20' || c == '\x09' || c == '\x0d' || c == '\x0a'
pNodeElement :: Parser Triples
pNodeElement = pAnyElement $ do
void pRDFAttrs
(s, mt) <- pSubject
ts1 <- pPropertyAttrs s
ts2 <- keepState pPropertyEltList
setSubject (Just s)
let ts = ts1 <> ts2
pure $ maybe ts (:ts) mt
pSubject :: Parser (Node, Maybe Triple)
pSubject = do
s <- pUnodeId <|> pBnode <|> pUnode <|> pBnodeGen
setSubject (Just s)
uri <- pName >>= pQName
unless (checkNodeUri uri) (throwError $ "URI not allowed: " <> T.unpack uri)
mtype <- optional (pType1 s uri)
pure (s, mtype)
where
checkNodeUri uri = isNotCoreSyntaxTerm uri && uri /= rdfLi && isNotOldTerm uri
pUnodeId = (pIdAttr >>= mkUNodeID) <* removeNodeAttr rdfID
pBnode = (BNode <$> pNodeIdAttr) <* removeNodeAttr rdfNodeID
pUnode = (unode <$> pAboutAttr) <* removeNodeAttr rdfAbout
pBnodeGen = newBNode
pType1 n uri =
if uri /= rdfDescription
then pure $ Triple n rdfTypeNode (unode uri)
else empty
pPropertyAttrs :: Node -> Parser Triples
pPropertyAttrs s = do
attrs <- currentNodeAttrs
HM.elems <$> HM.traverseWithKey f attrs
where
f attr value
| not (isPropertyAttrURI attr) = throwError $ "URI not allowed for attribute: " <> T.unpack attr
| attr == rdfType = pure $ Triple s rdfTypeNode (unode value)
| otherwise = do
lang <- currentLang
pure $ let mkLiteral = maybe plainL (flip plainLL) lang
in Triple s (unode attr) (lnode (mkLiteral value))
pLang :: Parser (Maybe Text)
pLang = optional (pAttr "xml:lang")
pBase :: Parser (Maybe BaseUrl)
pBase = optional $ do
uri <- pAttr "xml:base"
BaseUrl <$> either
throwError
(pure . serializeIRI . removeIRIFragment)
(parseIRI uri)
pPropertyEltList :: Parser Triples
pPropertyEltList = pWs
*> resetCollectionIndex
*> fmap mconcat (many (pPropertyElt <* pWs))
pPropertyElt :: Parser Triples
pPropertyElt = pAnyElement $ do
void pRDFAttrs
uri <- pName >>= pQName >>= listExpansion
unless (isPropertyAttrURI uri) (throwError $ "URI not allowed for propertyElt: " <> T.unpack uri)
let p = unode uri
pParseTypeLiteralPropertyElt p
<|> pParseTypeResourcePropertyElt p
<|> pParseTypeCollectionPropertyElt p
<|> pParseTypeOtherPropertyElt p
<|> pResourcePropertyElt p
<|> pLiteralPropertyElt p
<|> pEmptyPropertyElt p
where
listExpansion u
| u == rdfLi = nextCollectionIndex
| otherwise = pure u
pResourcePropertyElt :: Node -> Parser Triples
pResourcePropertyElt p = do
pWs
(ts1, o) <- keepState $ liftA2 (,) pNodeElement currentSubject
pWs
mi <- optional pIdAttr <* removeNodeAttr rdfID
checkAllowedAttributes []
s <- currentSubject
let mt = flip Triple p <$> s <*> o
ts2 <- maybe (pure mempty) (uncurry reifyTriple) (liftA2 (,) mi mt)
pure $ maybe (ts1 <> ts2) (:(ts1 <> ts2)) mt
pLiteralPropertyElt :: Node -> Parser Triples
pLiteralPropertyElt p = do
l <- pText
pChildren >>= guard . null
mi <- optional pIdAttr <* removeNodeAttr rdfID
checkAllowedAttributes [rdfDatatype]
dt <- optional pDatatypeAttr
s <- currentSubject
lang <- currentLang
let l' = TL.toStrict l
o = lnode . fromMaybe (plainL l') $ (typedL l' <$> dt) <|> (plainLL l' <$> lang)
mt = (\s' -> Triple s' p o) <$> s
ts <- maybe (pure mempty) (uncurry reifyTriple) (liftA2 (,) mi mt)
pure $ maybe ts (:ts) mt
pParseTypeLiteralPropertyElt :: Node -> Parser Triples
pParseTypeLiteralPropertyElt p = do
pt <- pRDFAttr rdfParseType
guard (pt == "Literal")
mi <- optional pIdAttr <* removeNodeAttr rdfID
checkAllowedAttributes [rdfParseType]
l <- pXMLLiteral
s <- currentSubject
let o = lnode (typedL l rdfXmlLiteral)
mt = (\s' -> Triple s' p o) <$> s
ts <- maybe (pure mempty) (uncurry reifyTriple) (liftA2 (,) mi mt)
pure $ maybe ts (:ts) mt
pParseTypeResourcePropertyElt :: Node -> Parser Triples
pParseTypeResourcePropertyElt p = do
pt <- pRDFAttr rdfParseType
guard (pt == "Resource")
mi <- optional pIdAttr <* removeNodeAttr rdfID
checkAllowedAttributes [rdfParseType]
s <- currentSubject
o <- newBNode
let mt = (\s' -> Triple s' p o) <$> s
ts1 <- maybe (pure mempty) (uncurry reifyTriple) (liftA2 (,) mi mt)
setSubject (Just o)
ts2 <- keepCollectionIndex pPropertyEltList
pure $ maybe (ts1 <> ts2) ((<> ts2) . (:ts1)) mt
pParseTypeCollectionPropertyElt :: Node -> Parser Triples
pParseTypeCollectionPropertyElt p = do
pt <- pRDFAttr rdfParseType
guard (pt == "Collection")
mi <- optional pIdAttr <* removeNodeAttr rdfID
checkAllowedAttributes [rdfParseType]
s <- currentSubject
case s of
Nothing -> pure mempty
Just s' -> do
r <- optional pNodeElement
case r of
Nothing ->
let t = Triple s' p rdfNilNode
in ([t] <>) <$> maybe (pure mempty) (`reifyTriple` t) mi
Just ts1 -> do
s'' <- currentSubject
n <- newBNode
let t = Triple s' p n
ts2 = maybe mempty (\s''' -> [t, Triple n rdfFirstNode s''']) s''
ts3 <- go n
ts4 <- maybe (pure mempty) (`reifyTriple` t) mi
pure $ mconcat [ts1, ts2, ts3, ts4]
where
go s = do
r <- optional pNodeElement
case r of
Nothing -> pure [Triple s rdfRestNode rdfNilNode]
Just ts1 -> do
s' <- currentSubject
n <- newBNode
let ts2 = maybe mempty (\s'' -> [Triple s rdfRestNode n, Triple n rdfFirstNode s'']) s'
ts3 <- go n
pure $ mconcat [ts1, ts2, ts3]
pParseTypeOtherPropertyElt :: Node -> Parser Triples
pParseTypeOtherPropertyElt _p = do
pt <- pRDFAttr rdfParseType
guard (pt /= "Resource" && pt /= "Literal" && pt /= "Collection")
checkAllowedAttributes [rdfParseType]
_mi <- optional pIdAttr <* removeNodeAttr rdfID
throwError "Not implemented: rdf:parseType = other"
pEmptyPropertyElt :: Node -> Parser Triples
pEmptyPropertyElt p = do
s <- currentSubject
case s of
Nothing -> pure mempty
Just s' -> do
mi <- optional pIdAttr <* removeNodeAttr rdfID
o <- pResourceAttr' <|> pNodeIdAttr' <|> newBNode
let t = Triple s' p o
ts1 <- maybe (pure mempty) (`reifyTriple` t) mi
ts2 <- pPropertyAttrs o
pure (t:ts1 <> ts2)
where
pResourceAttr' = unode <$> pResourceAttr <* removeNodeAttr rdfResource
pNodeIdAttr' = BNode <$> pNodeIdAttr <* removeNodeAttr rdfNodeID
checkAllowedAttributes :: HashSet Text -> Parser ()
checkAllowedAttributes as = do
attrs <- currentNodeAttrs
let diff = HS.difference (HM.keysSet attrs) as
unless (null diff) (throwError $ "Attributes not allowed: " <> show diff)
pXMLLiteral :: Parser Text
pXMLLiteral =
T.decodeUtf8 . BL.toStrict . BB.toLazyByteString . encode <$> pChildren
pIdAttr :: Parser Text
pIdAttr = do
i <- pRDFAttr rdfID
i' <- either throwError pure (checkRdfId i)
checkIdIsUnique i'
pure i'
checkIdIsUnique :: Text -> Parser ()
checkIdIsUnique i = do
notUnique <- S.member i <$> currentIdSet
when notUnique (throwError $ "rdf:ID already used in this context: " <> T.unpack i)
updateIdSet i
pNodeIdAttr :: Parser Text
pNodeIdAttr = do
i <- pRDFAttr rdfNodeID
either throwError pure (checkRdfId i)
pAboutAttr :: Parser Text
pAboutAttr = pRDFAttr rdfAbout >>= checkIRI "rdf:about"
pResourceAttr :: Parser Text
pResourceAttr = pRDFAttr rdfResource >>= checkIRI "rdf:resource"
pDatatypeAttr :: Parser Text
pDatatypeAttr = pRDFAttr rdfDatatype >>= checkIRI "rdf:datatype"
reifyTriple :: Text -> Triple -> Parser Triples
reifyTriple i (Triple s p' o) = do
n <- mkUNodeID i
pure [ Triple n rdfTypeNode rdfStatementNode
, Triple n rdfSubjectNode s
, Triple n rdfPredicateNode p'
, Triple n rdfObjectNode o ]
checkIRI :: String -> Text -> Parser Text
checkIRI msg iri = do
bUri <- maybe mempty unBaseUrl <$> currentBaseUri
case uriValidate iri of
Nothing -> throwError $ mconcat ["Malformed IRI for \"", msg, "\": ", T.unpack iri]
Just iri' -> either throwError pure (resolveIRI bUri iri')
isPropertyAttrURI :: Text -> Bool
isPropertyAttrURI uri
= isNotCoreSyntaxTerm uri
&& uri /= rdfDescription
&& uri /= rdfLi
&& isNotOldTerm uri
isNotCoreSyntaxTerm :: Text -> Bool
isNotCoreSyntaxTerm uri
= uri /= rdfTag && uri /= rdfID && uri /= rdfAbout
&& uri /= rdfParseType && uri /= rdfResource
&& uri /= rdfNodeID && uri /= rdfDatatype
isNotOldTerm :: Text -> Bool
isNotOldTerm uri = uri /= rdfAboutEach
&& uri /= rdfAboutEachPrefix
&& uri /= rdfBagID
newBNode :: Parser Node
newBNode = do
modify $ \st -> st { stateGenId = stateGenId st + 1 }
BNodeGen . stateGenId <$> get
keepState :: Parser a -> Parser a
keepState p = do
st <- get
let bUri = stateBaseUri st
is = stateIdSet st
p <* do
st' <- get
let i = stateGenId st'
bUri' = stateBaseUri st'
is' = stateIdSet st'
if bUri /= bUri'
then put (st { stateGenId = i })
else put (st { stateGenId = i, stateIdSet = is <> is' })
currentIdSet :: Parser (Set Text)
currentIdSet = stateIdSet <$> get
updateIdSet :: Text -> Parser ()
updateIdSet i = do
is <- currentIdSet
modify (\st -> st { stateIdSet = S.insert i is })
currentNodeAttrs :: Parser (HashMap Text Text)
currentNodeAttrs = stateNodeAttrs <$> get
setNodeAttrs :: HashMap Text Text -> Parser ()
setNodeAttrs as = modify (\st -> st { stateNodeAttrs = as })
removeNodeAttr :: Text -> Parser ()
removeNodeAttr a = HM.delete a <$> currentNodeAttrs >>= setNodeAttrs
currentPrefixMappings :: Parser PrefixMappings
currentPrefixMappings = statePrefixMapping <$> get
updatePrefixMappings :: PrefixMappings -> Parser PrefixMappings
updatePrefixMappings pm = do
pm' <- (<> pm) <$> currentPrefixMappings
modify (\st -> st { statePrefixMapping = pm' })
pure pm'
currentCollectionIndex :: Parser Int
currentCollectionIndex = stateCollectionIndex <$> get
setCollectionIndex :: Int -> Parser ()
setCollectionIndex i = modify (\st -> st { stateCollectionIndex = i })
keepCollectionIndex :: Parser a -> Parser a
keepCollectionIndex p = do
i <- currentCollectionIndex
p <* setCollectionIndex i
nextCollectionIndex :: Parser Text
nextCollectionIndex = do
modify $ \st -> st { stateCollectionIndex = stateCollectionIndex st + 1 }
(rdfListIndex <>) . T.pack . show . stateCollectionIndex <$> get
resetCollectionIndex :: Parser ()
resetCollectionIndex = modify $ \st -> st { stateCollectionIndex = 0 }
currentBaseUri :: Parser (Maybe BaseUrl)
currentBaseUri = stateBaseUri <$> get
setBaseUri :: (Maybe BaseUrl) -> Parser ()
setBaseUri u = modify (\st -> st { stateBaseUri = u })
mkUNodeID :: Text -> Parser Node
mkUNodeID t = mkUnode <$> currentBaseUri
where
mkUnode = unode . \case
Nothing -> t
Just (BaseUrl u) -> mconcat [u, "#", t]
currentSubject :: Parser (Maybe Subject)
currentSubject = stateSubject <$> get
setSubject :: (Maybe Subject) -> Parser ()
setSubject s = modify (\st -> st { stateSubject = s })
currentLang :: Parser (Maybe Text)
currentLang = stateLang <$> get
setLang :: (Maybe Text) -> Parser ()
setLang lang = modify (\st -> st { stateLang = lang })