{-| Module : Data.RDF.Internal Description : Representation and Incremental Processing of RDF Data Copyright : Travis Whitaker 2016 License : MIT Maintainer : pi.boy.travis@gmail.com Stability : Provisional Portability : Portable Internal module. -} {-# LANGUAGE DeriveGeneric , DeriveAnyClass , OverloadedStrings #-} module Data.RDF.Internal where import Control.Applicative import Control.DeepSeq import qualified Data.Attoparsec.Combinator as A import qualified Data.Attoparsec.Text as A import Data.Char import Data.String import GHC.Generics import qualified Data.Text as T -- | A contiguous RDF graph with optional label. Note that a contiguous graph -- within an RDF data set will not appear as a single contiguous graph to this -- library if the graph's constituent triples are not contiguous in the -- original data set. This strategy allows for incremental processing of RDF -- data in constant space. data RDFGraph = RDFGraph { -- | A named RDF graph includes an 'IRI'. rdfLabel :: !(Maybe IRI) -- | The constituent triples. A proper graph is a strict set of triples -- (i.e. no duplicate nodes or edges), but this guarantee cannot be made -- if the triples are to be processed incrementally in constant space. -- Programs using this type for interpreting RDF graphs should ignore any -- supernumerary triples in this list. , rdfTriples :: [Triple] } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF quad, i.e. a triple belonging to a named graph. data Quad = Quad { quadTriple :: !Triple , quadGraph :: !(Maybe IRI) } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF triple. data Triple = Triple !Subject !Predicate !Object deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF subject, i.e. either an 'IRI' or a 'BlankNode'. -- -- This type has an 'IsString' instance, allowing string literals to be -- interpreted as 'Subject's with @-XOverloadedStrings@, like so: -- -- >>> " :: Subject -- IRISubject (IRI (...)) -- >>> "_:some-node" :: Subject -- BlankSubject (BlankNode {unBlankNode = "some-node"}) data Subject = IRISubject !IRI | BlankSubject !BlankNode deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF predicate. -- -- This type has an 'IsString' instance, allowing string literals to be -- interpreted as 'Predicate's with @-XOverloadedStrings@, like so: -- -- >>> "" :: Predicate -- Predicate {unPredicate = IRI (...)} newtype Predicate = Predicate { unPredicate :: IRI } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF object, i.e. either an 'IRI', a 'Literal', or a 'BlankNode'. -- -- This type has an 'IsString' instance, allowing string literals to be -- interpreted as 'Object's with @-XOverloadedStrings@, like so: -- -- >>> "" :: Object -- IRIObject (IRI (...)) -- >>> "_:some-node" :: Object -- BlankObject (BlankNode {unBlankNode = "some-node"}) -- >>> "computer" :: Object -- LiteralObject (Literal {litString = "computer", litType = LiteralUntyped}) -- -- The precedence for literal interpretation is IRI > BlankNode > Literal. To -- force a literal that is also a valid blank node label or IRI to be -- interpreted as a 'LiteralObject', wrap it in an extra set of double quotes: -- -- >>> "\"_:some-node\"" :: Object -- LiteralObject (Literal {litString = "_:some-node", litType = LiteralUntyped}) data Object = IRIObject !IRI | BlankObject !BlankNode | LiteralObject !Literal deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | A blank node with its local label, without the preceeding "_:". Other -- programs processing RDF are permitted to discard these node labels, i.e. -- all blank node labels are local to a specific representation of an RDF data -- set. -- -- This type has an 'IsString' instance, allowing string literals to be -- interpreted as 'BlankNode's with @-XOverloadedStrings@, like so: -- -- >>> "_:some-node" :: BlankNode -- BlankNode {unBlankNode = "some-node"} newtype BlankNode = BlankNode { unBlankNode :: T.Text } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF literal. As stipulated by the RDF standard, the 'litType' is merely -- metadata; all RDF processing programs must try to handle literals that are -- ill-typed. -- -- This type has an 'IsString' instance, allowing string literals to be -- interpreted as 'Literal's with @-XOverloadedStrings@, like so: -- -- >>> "computer" :: Literal -- Literal {litString = "computer", litType = LiteralUntyped} -- -- For untyped literals the extra double quotes are not required. They are -- required for typed literals: -- -- >>> "\"computer\"@en" :: Literal -- Literal {litString = "computer", litType = LiteralLangType "en"} -- -- >>> "\"computer\"^^" :: Literal -- Literal { litString = "computer", litType = LiteralIRIType (...)} data Literal = Literal { litString :: !T.Text , litType :: !LiteralType } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An RDF literal type. As stipulated by the RDF standard, this is merely -- metadata; all RDF processing programs must try to handle literals that are -- ill-typed. data LiteralType = LiteralIRIType !IRI | LiteralLangType !T.Text | LiteralUntyped deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An Internationalized Resource Identifier. This library preferentially -- follows RFC 3987 over the RDF 1.1 specification, as the two standards -- disagree about precisely what constitutes an IRI. A notable exception is -- the handling of IRI fragments; this library follows the RDF 1.1 -- specification, allowing IRI fragments to occur in absolute IRIs, even -- though this is expressly prohibited by RFC 3987. -- -- Unlike the @network-uri@ package's behavior with URI fields, this library -- does not include the sentinel tokens in the parsed fields. For example, -- when parsing @http://example.com@, @network-uri@ will provide the string -- @http:@ as the scheme, while this library will provide @http@ as the -- scheme. -- -- This type has an 'IsString' instnace, allowing string literals to be -- interpreted as 'IRI's with @-XOverloadedStrings@, like so: -- -- >>> "http://example.com" :: IRI -- IRI { iriScheme = "http" -- , iriAuth = Just (IRIAuth { iriUser = Nothing -- , iriHost = "example.com" -- , iriPort = Nothing -- }) -- , iriPath = "" -- , iriQuery = Nothing -- , iriFragment = Nothing -- } data IRI = IRI { -- | The IRI scheme, e.g. @http@ iriScheme :: !T.Text -- | The IRI authority, e.g. @example.com@ , iriAuth :: !(Maybe IRIAuth) -- | The IRI path, e.g. @/posts//index.html@ , iriPath :: !T.Text -- | The IRI query, i.e. the component after the @?@ if present. , iriQuery :: !(Maybe T.Text) -- | The IRI fragment, i.e. the component after the @#@ if present. , iriFragment :: !(Maybe T.Text) } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | An IRI Authority, as described by RFC 3987. data IRIAuth = IRIAuth { -- | The IRI user, i.e. the component before the @\@@ if present. iriUser :: !(Maybe T.Text) -- | The IRI host, e.g. @example.com@. , iriHost :: T.Text -- | The IRI port, i.e. the numeral after the @:@ if present. , iriPort :: !(Maybe T.Text) } deriving ( Eq , Ord , Read , Show , Generic , NFData ) -- | Predicate on 'Char's for acceptability for inclusion in an 'IRI'. isIRI :: Char -> Bool isIRI c = (c /= '<') && (c /= '>') && (c /= '"') && (c /= '{') && (c /= '}') && (c /= '|') && (c /= '^') && (c /= '`') && (c /= '\\') -- | 'IRI' parser. parseIRI :: A.Parser IRI parseIRI = IRI <$> (parseScheme <* A.char ':') <*> parseAuth <*> parsePath <*> parseQuery <*> parseFragment -- | 'IRI' scheme parser. parseScheme :: A.Parser T.Text parseScheme = A.takeWhile1 isScheme >>= check where check t | isAlpha (T.head t) = pure t | otherwise = fail "parseScheme: must start with letter." isScheme c = isAlphaNum c || (c == '+') || (c == '-') || (c == '.') -- | 'IRIAuth' parser. parseAuth :: A.Parser (Maybe IRIAuth) parseAuth = A.option Nothing (A.string "//" *> (Just <$> parseIRIAuth)) where parseIRIAuth = IRIAuth <$> parseUser <*> parseHost <*> parsePort -- | 'IRIAuth' user parser. parseUser :: A.Parser (Maybe T.Text) parseUser = A.option Nothing (Just <$> (A.takeWhile1 isUser <* A.char '@')) where isUser c = isIRI c && (c /= '@') -- | 'IRIAuth' host parser. parseHost :: A.Parser T.Text parseHost = A.takeWhile1 isHost where isHost c = isIRI c && (c /= '/') && (c /= ':') -- | 'IRIAuth' port parser. parsePort :: A.Parser (Maybe T.Text) parsePort = A.option Nothing (Just <$> (A.char ':' *> A.takeWhile1 isDigit)) -- | 'IRI' path parser. parsePath :: A.Parser T.Text parsePath = A.option "" (A.char '/' *> A.takeWhile1 isPath) where isPath c = isIRI c && (c /= '?') && (c /= '#') -- | 'IRI' query parser. parseQuery :: A.Parser (Maybe T.Text) parseQuery = A.option Nothing (Just <$> (A.char '?' *> A.takeWhile1 isQuery)) where isQuery c = isIRI c && (c/= '#') -- | 'IRI' fragment parser. parseFragment :: A.Parser (Maybe T.Text) parseFragment = A.option Nothing (Just <$> (A.char '#' *> A.takeWhile1 isIRI)) -- | Parser for graph labels, i.e. either an escaped 'IRI' or the empty string. parseGraphLabel :: A.Parser (Maybe IRI) parseGraphLabel = A.option Nothing (Just <$> parseEscapedIRI) -- | 'Subject' parser. parseSubject :: A.Parser Subject parseSubject = do c <- A.anyChar case c of '<' -> IRISubject <$> (parseIRI <* A.char '>') '_' -> BlankSubject <$> (A.char ':' *> parseBlankNodeLabel) _ -> fail "parseSubject: must be blank node or IRI." -- | 'Predicate' parser. parsePredicate :: A.Parser Predicate parsePredicate = Predicate <$> parseEscapedIRI -- | 'Object' parser. parseObject :: A.Parser Object parseObject = do c <- A.anyChar case c of '<' -> IRIObject <$> (parseIRI <* A.char '>') '_' -> BlankObject <$> (A.char ':' *> parseBlankNodeLabel) _ -> LiteralObject <$> parseLiteralBody -- | Parse an escaped 'IRI', i.e. an IRI enclosed in angle brackets. parseEscapedIRI :: A.Parser IRI parseEscapedIRI = A.char '<' *> parseIRI <* A.char '>' -- | Parse a blank node label. parseBlankNodeLabel :: A.Parser BlankNode parseBlankNodeLabel = BlankNode <$> (A.takeWhile1 isLabel >>= check) where check t | isHead (T.head t) && isTail (T.last t) = pure t | otherwise = fail "parseBlankNode" isLabel = not . isSpace isHead c = isLabel c && (c /= '-') && (c /= '.') isTail c = isLabel c && (c /= '.') -- | Parse a blank node label, with the preceeding @_:@. parseBlankNode :: A.Parser BlankNode parseBlankNode = A.string "_:" *> parseBlankNodeLabel -- | Like 'parseLiteral', but without the leading double quote. parseLiteralBody :: A.Parser Literal parseLiteralBody = Literal <$> escString <*> valType where valType = valIRIType <|> valLangType <|> pure LiteralUntyped valIRIType = LiteralIRIType <$> (A.string "^^" *> parseEscapedIRI) valLangType = LiteralLangType <$> (A.char '@' *> A.takeWhile1 isLang) isLang c = isAlphaNum c || (c == '-') escString = unescapeAll <$> A.scan False machine machine False '\\' = Just True machine False '"' = Nothing machine False _ = Just False machine True _ = Just False unescapeAll = T.concat . unescapeFrag . T.splitOn "\\" unescapeFrag [] = [] unescapeFrag (f:fs) = case T.uncons f of Nothing -> f : unescapeFrag fs (Just (e, f')) -> T.singleton (unescape e) : f' : unescapeFrag fs unescape 't' = '\t' unescape 'b' = '\b' unescape 'n' = '\n' unescape 'r' = '\r' unescape 'f' = '\f' unescape c = c -- | Parse an RDF 'Literal', including the 'LiteralType' if present. parseLiteral :: A.Parser Literal parseLiteral = A.char '"' *> parseLiteralBody -- | Parse an unescaped untyped RDF 'Literal'. parseUnescapedLiteral :: A.Parser Literal parseUnescapedLiteral = Literal <$> A.takeText <*> pure LiteralUntyped -- | Make implementations for 'fromString' from a 'A.Parser'. fromStringParser :: A.Parser a -- ^ The literal parser. -> String -- ^ The literal type name for error messages. -> (String -> a) -- ^ The 'fromString' implementation. fromStringParser p n s = let t = T.pack s r = A.parseOnly p t in case r of (Left e) -> error $ mconcat [ "Invalid " , n , " literal (" , s , ") " , e ] (Right x) -> x -- | This instance uses 'parseIRI' and calls 'error' if the literal is invalid. -- It is not clear exactly when 'fromString' is evaluated so this error is -- difficult to explictly catch. This can be solved by ensuring that your -- 'IRI' literals are eagerly evaluated so any malformed literals can be -- caught immediately. It would be nicer if this happened at compile time. instance IsString IRI where fromString = fromStringParser parseIRI "IRI" -- | This instance uses 'parseLiteral' and calls 'error' if the literal is -- invalid. It is not clear exactly when 'fromString' is evaluated so this -- error is difficult to explictly catch. This can be solved by ensuring that -- your 'Literal' literals are eagerly evaluated so any malformed literals can -- be caught immediately. It would be nicer if this happened at compile time. instance IsString Literal where fromString = fromStringParser p "Literal" where p = parseLiteral <|> parseUnescapedLiteral -- | This instance uses 'parseBlankNode' and calls 'error' if the literal is -- invalid. It is not clear exactly when 'fromString' is evaluated so this -- error is difficult to explictly catch. This can be solved by ensuring that -- your 'BlankNode' literals are eagerly evaluated so any malformed literals -- can be caught immediately. It would be nicer if this happened at compile -- time. instance IsString BlankNode where fromString = fromStringParser parseBlankNode "BlankNode" -- | This instance uses 'parseSubject' and calls 'error' if the literal -- is invalid. It is not clear exactly when 'fromString' is evaluated so this -- error is difficult to explictly catch. This can be solved by ensuring that -- your 'Subject' literals are eagerly evaluated so any malformed literals can -- be caught immediately. It would be nicer if this happened at compile time. instance IsString Subject where fromString = fromStringParser parseSubject "Subject" -- | This instance uses 'parsePredicate' and calls 'error' if the literal is -- invalid. It is not clear exactly when 'fromString' is evaluated so this -- error is difficult to explictly catch. This can be solved by ensuring that -- your 'Predicate' literals are eagerly evaluated so any malformed literals -- can be caught immediately. It would be nicer if this happened at compile -- time. instance IsString Predicate where fromString = fromStringParser parsePredicate "Predicate" -- | This instance uses 'parseObject' and calls 'error' if the literal is -- invalid. It is not clear exactly when 'fromString' is evaluated so this -- error is difficult to explictly catch. This can be solved by ensuring that -- your 'Object' literals are eagerly evaluated so any malformed literals can -- be caught immediately. It would be nicer if this happened at compile time. instance IsString Object where fromString = fromStringParser p "Object" where p = parseObject <|> (LiteralObject <$> parseUnescapedLiteral)