-------------------------------------------------------------------------------- -- See end of this file for licence information. -------------------------------------------------------------------------------- -- | -- Module : NTriples -- Copyright : (c) 2011, 2012, 2013 Douglas Burke -- License : GPL V2 -- -- Maintainer : Douglas Burke -- Stability : experimental -- Portability : H98 -- -- This Module implements a NTriples parser, returning a -- new 'RDFGraph' consisting of triples and namespace information parsed from -- the supplied NTriples input string, or an error indication. -- -- REFERENCES: -- -- - \"RDF Test Cases\", -- W3C Recommendation 10 February 2004, -- -- -- NOTES: -- -- - If the URI is actually an IRI (Internationalized Resource Identifiers) -- then the parser will fail since 'Network.URI.parseURI' fails. -- -- - The case of language tags is retained. -- -- - Update to the document \"N-Triples. A line-based syntax for an RDF graph\" -- W3C Working Group Note 09 April 2013, -- -- -------------------------------------------------------------------------------- module Swish.RDF.Parser.NTriples ( ParseResult , parseNT ) where import Swish.GraphClass (arc) import Swish.Namespace (ScopedName, makeURIScopedName) import Swish.RDF.Graph (RDFGraph, RDFLabel(..), addArc, emptyRDFGraph) import Swish.RDF.Vocabulary (LanguageTag, toLangTag) import Swish.RDF.Parser.Utils (ParseResult , runParserWithError , ignore , skipMany , noneOf , char , string , eoln , fullStop , hex4 , hex8 ) import Control.Applicative import Network.URI (parseURI) import qualified Data.Text as T import qualified Data.Text.Lazy as L import Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord) import Data.Maybe (fromMaybe) import Text.ParserCombinators.Poly.StateText ---------------------------------------------------------------------- -- Define parser state and helper functions ---------------------------------------------------------------------- -- | NT parser state data NTState = NTState { graphState :: RDFGraph -- Graph under construction } emptyState :: NTState emptyState = NTState { graphState = emptyRDFGraph } -- Return function to update graph in NT parser state, -- using the supplied function of a graph. This is for use -- with stUpdate. -- updateGraph :: (RDFGraph -> RDFGraph) -> NTState -> NTState updateGraph f s = s { graphState = f (graphState s) } ---------------------------------------------------------------------- -- Define top-level parser function: -- accepts a string and returns a graph or error ---------------------------------------------------------------------- -- | Parser that carries around a NTState record. type NTParser a = Parser NTState a -- | Parse a string. -- parseNT :: L.Text -- ^ input in NTriples format. -> ParseResult parseNT = parsefromText ntripleDoc {- -- useful for testing test :: String -> RDFGraph test = either error id . parseNT -} -- | Function to supply initial context and parse supplied term. -- -- Used for debugging. parsefromText :: NTParser a -- ^ parser to apply -> L.Text -- ^ input to be parsed -> Either String a parsefromText parser = runParserWithError parser emptyState -- helper routines {- lineFeed :: NTParser () lineFeed = ignore (char '\r') -} -- Add statement to graph in NT parser state addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> NTParser () addStatement s p o = stUpdate (updateGraph (addArc (arc s p o) )) ---------------------------------------------------------------------- -- Syntax productions ---------------------------------------------------------------------- {- EBNF from the specification, using the notation from XML 1.0, second edition, is included inline below. We do not force ASCII 7-bit semantics here yet. space ::= #x20 /* US-ASCII space - decimal 32 */ cr ::= #xD /* US-ASCII carriage return - decimal 13 */ lf ::= #xA /* US-ASCII line feed - decimal 10 */ tab ::= #x9 /* US-ASCII horizontal tab - decimal 9 */ The productions are kept as close as possible to the specification for now. -} {- ntripleDoc ::= line* line ::= ws* ( comment | triple )? eoln We relax the rule that the input must be empty or end with a new line. ntripleDoc :: NTParser RDFGraph ntripleDoc = graphState <$> (many line *> eof *> getState) line :: NTParser () line = skipMany ws *> optional (comment <|> triple) *> eoln -} ntripleDoc :: NTParser RDFGraph ntripleDoc = graphState <$> (sepBy line eoln *> optional eoln *> skipWS *> eof *> stGet) line :: NTParser () line = skipWS *> ignore (optional (comment <|> triple)) {- ws ::= space | tab Could use whiteSpace rule here, but that would permit constructs (e.g. comments) where we do not support them. -} isWS :: Char -> Bool isWS = (`elem` " \t") {- ws :: NTParser () -- ws = ignore (char ' ' <|> tab) ws = ignore $ satisfy isWS -} skipWS :: NTParser () skipWS = ignore $ manySatisfy isWS skip1WS :: NTParser () skip1WS = ignore $ many1Satisfy isWS {- comment ::= '#' ( character - ( cr | lf ) )* -} comment :: NTParser () comment = char '#' *> skipMany (noneOf "\r\n") {- eoln ::= cr | lf | cr lf -} {- name ::= [A-Za-z][A-Za-z0-9]* -} isaz, isAZ, is09 :: Char -> Bool isaz = isAsciiLower isAZ = isAsciiUpper is09 = isDigit isaZ, isaZ09 :: Char -> Bool isaZ c = isaz c || isAZ c isaZ09 c = isaZ c || is09 c isHeadChar, isBodyChar :: Char -> Bool isHeadChar = isaZ isBodyChar = isaZ09 name :: NTParser L.Text name = L.cons <$> satisfy isHeadChar <*> manySatisfy isBodyChar nameStr :: NTParser String nameStr = L.unpack <$> name {- triple ::= subject ws+ predicate ws+ object ws* '.' ws* -} triple :: NTParser () triple = do s <- subject <* skip1WS p <- predicate <* skip1WS o <- object <* (skipWS >> fullStop >> skipWS) addStatement s p o {- subject ::= uriref | nodeID predicate ::= uriref object ::= uriref | nodeID | literal -} subject :: NTParser RDFLabel subject = urirefLbl <|> nodeID predicate :: NTParser RDFLabel predicate = urirefLbl object :: NTParser RDFLabel object = urirefLbl <|> nodeID <|> literal {- uriref ::= '<' absoluteURI '>' absoluteURI ::= character+ with escapes as defined below (from section 'URI References') The absoluteURI production encodes a Unicode string representing an RDF URI references as specified in [RDF-CONCEPTS]. These are encoded in N-Triples using the escapes described in section Strings. -} uriref :: NTParser ScopedName uriref = do ignore $ char '<' uri <- manyFinally' character (char '>') maybe (failBad ("Invalid URI: <" ++ uri ++ ">")) (return . makeURIScopedName) (parseURI uri) urirefLbl :: NTParser RDFLabel urirefLbl = Res <$> uriref {- nodeID ::= '_:' name -} nodeID :: NTParser RDFLabel nodeID = Blank <$> (string "_:" *> nameStr) {- literal ::= langString | datatypeString langString ::= '"' string '"' ( '@' language )? datatypeString ::= '"' string '"' '^^' uriref language ::= [a-z]+ ('-' [a-z0-9]+ )* encoding a language tag. string ::= character* with escapes as defined in section Strings -} literal :: NTParser RDFLabel literal = do lit <- T.pack <$> ntstring opt <- optional dtlang return $ case opt of Just (Left lcode) -> LangLit lit lcode Just (Right dtype) -> TypedLit lit dtype _ -> Lit lit ntstring :: NTParser String ntstring = bracket (char '"') (char '"') (many character) dtlang :: NTParser (Either LanguageTag ScopedName) dtlang = (char '@' *> commit (Left <$> language)) <|> (string "^^" *> commit (Right <$> uriref)) -- Note that toLangTag may fail since it does some extra -- validation not done by the parser (mainly on the length of the -- primary and secondary tags). -- -- NOTE: This parser does not accept multiple secondary tags which RFC3066 -- does. -- -- Although the EBNF only lists [a-z] we also support upper case values, -- since the W3C Turtle test case includes a NTriples file with -- "...@en-UK" in it. -- language :: NTParser LanguageTag language = do h <- many1Satisfy isaZ mt <- optional $ L.cons <$> char '-' <*> many1Satisfy isaZ09 let lbl = L.toStrict $ L.append h $ fromMaybe L.empty mt case toLangTag lbl of Just lt -> return lt _ -> fail ("Invalid language tag: " ++ T.unpack lbl) -- should this be failBad? {- String handling: EBNF has: character ::= [#x20-#x7E] /* US-ASCII space to decimal 126 */ Additional information from: http://www.w3.org/TR/rdf-testcases/#ntrip_strings N-Triples strings are sequences of US-ASCII character productions encoding [UNICODE] character strings. The characters outside the US-ASCII range and some other specific characters are made available by \-escape sequences as follows: Unicode character (with code point u) N-Triples encoding [#x0-#x8] \uHHHH 4 required hexadecimal digits HHHH encoding Unicode character u #x9 \t #xA \n [#xB-#xC] \uHHHH 4 required hexadecimal digits HHHH encoding Unicode character u #xD \r [#xE-#x1F] \uHHHH 4 required hexadecimal digits HHHH encoding Unicode character u [#x20-#x21] the character u #x22 \" [#x23-#x5B] the character u #x5C \\ [#x5D-#x7E] the character u [#x7F-#xFFFF] \uHHHH 4 required hexadecimal digits HHHH encoding Unicode character u [#10000-#x10FFFF] \UHHHHHHHH 8 required hexadecimal digits HHHHHHHH encoding Unicode character u where H is a hexadecimal digit: [#x30-#x39],[#x41-#x46] (0-9, uppercase A-F). This escaping satisfies the [CHARMOD] section Reference Processing Model on making the full Unicode character range U+0 to U+10FFFF available to applications and providing only one way to escape any character. -} -- 0x22 is " and 0x5c is \ isAsciiChar :: Char -> Bool isAsciiChar c = let i = ord c in i >= 0x20 && i <= 0x21 || i >= 0x23 && i <= 0x5b || i >= 0x5d && i <= 0x7e protectedChar :: NTParser Char protectedChar = (char 't' *> return '\t') <|> (char 'n' *> return '\n') <|> (char 'r' *> return '\r') <|> (char '"' *> return '"') <|> (char '\\' *> return '\\') <|> (char 'u' *> hex4) <|> (char 'U' *> hex8) character :: NTParser Char character = (char '\\' *> protectedChar) <|> satisfy isAsciiChar -------------------------------------------------------------------------------- -- -- Copyright (c) 2011, 2012, 2013 Douglas Burke -- All rights reserved. -- -- This file is part of Swish. -- -- Swish is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- Swish is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with Swish; if not, write to: -- The Free Software Foundation, Inc., -- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -- --------------------------------------------------------------------------------