-- |A parser for RDF in N-Triples format -- . module Text.RDF.RDF4H.NTriplesParser (NTriplesParser(NTriplesParser), ParseFailure) where import Prelude hiding (init,pred) import Data.RDF.Types import Text.RDF.RDF4H.ParserUtils import Data.Char (isLetter, isDigit,isAlphaNum) import Data.Map as Map (empty) import Text.Parsec import Text.Parsec.Text import qualified Data.Text as T import qualified Data.Text.IO as TIO import Control.Monad (liftM,void,guard) -- |NTriplesParser is an 'RdfParser' implementation for parsing RDF in the -- NTriples format. It requires no configuration options. To use this parser, -- pass an 'NTriplesParser' value as the first argument to any of the -- 'parseString', 'parseFile', or 'parseURL' methods of the 'RdfParser' type -- class. data NTriplesParser = NTriplesParser -- |'NTriplesParser' is an instance of 'RdfParser'. instance RdfParser NTriplesParser where parseString _ = parseString' parseFile _ = parseFile' parseURL _ = parseURL' -- We define or redefine all here using same names as the spec, but with an -- 'nt_' prefix in order to avoid name clashes (e.g., ntripleDoc becomes -- nt_ntripleDoc). -- |nt_ntripleDoc is simply zero or more lines. nt_ntripleDoc :: GenParser () [Maybe Triple] nt_ntripleDoc = manyTill nt_line eof nt_line :: GenParser () (Maybe Triple) nt_line = skipMany nt_space >> ((nt_comment >>= \res -> return res) <|> try (nt_triple >>= \res -> nt_eoln >> return res) <|> try (nt_triple >>= \res -> char '#' >> manyTill anyChar nt_eoln >> return res) <|> (nt_empty >>= \res -> nt_eoln >> return res)) >>= \res -> return res nt_comment :: GenParser () (Maybe Triple) nt_comment = char '#' >> manyTill anyChar nt_eoln >> return Nothing -- A triple consists of whitespace-delimited subject, predicate, and object, -- followed by optional whitespace and a period, and possibly more -- whitespace. -- -- NTriples W3C test "minimal_whitespace" proposes no space: -- -- "tests absense of whitespace between subject, predicate, object and -- end-of-statement" -- -- `optional` lets this nt_triple parser succeed even if there is not -- a space or tab character between resources or the object and the '.'. nt_triple :: GenParser () (Maybe Triple) nt_triple = do subj <- nt_subject optional (skipMany1 nt_space) pred <- nt_predicate optional (skipMany1 nt_space) obj <- nt_object optional (skipMany1 nt_space) void (char '.') void (many nt_space) return $ Just (Triple subj pred obj) -- [6] literal nt_literal :: GenParser () LValue nt_literal = do s' <- nt_string_literal_quote let s = escapeRDFSyntax s' option (plainL s) $ do ((count 2 (char '^') >> nt_iriref >>= validateURI >>= isAbsoluteParser >>= \iri -> return (typedL s iri)) <|> (nt_langtag >>= \lang -> return (plainLL s lang))) -- [9] STRING_LITERAL_QUOTE nt_string_literal_quote :: GenParser () T.Text nt_string_literal_quote = between (char '"') (char '"') $ do T.concat <$> (many ((T.singleton <$> noneOf ['\x22','\x5C','\xA','\xD']) <|> nt_echar <|> nt_uchar)) -- [144s] LANGTAG nt_langtag :: GenParser () T.Text nt_langtag = do void (char '@') ss <- many1 (satisfy (\ c -> isLetter c)) rest <- concat <$> many (char '-' >> many1 (satisfy (\ c -> isAlphaNum c)) >>= \lang_str -> return ('-':lang_str)) return (T.pack (ss ++ rest)) -- [8] IRIREF nt_iriref :: GenParser () T.Text nt_iriref = do between (char '<') (char '>') $ do T.concat <$> many ( T.singleton <$> noneOf (['\x00'..'\x20'] ++ ['<','>','"','{','}','|','^','`','\\']) <|> nt_uchar ) -- [153s] ECHAR nt_echar :: GenParser () T.Text nt_echar = try $ do void (char '\\') c2 <- anyChar guard $ isEchar c2 return (T.pack [c2]) isEchar :: Char -> Bool isEchar = (`elem` ['t','b','n','r','f','"','\'','\\']) -- [10] UCHAR nt_uchar :: GenParser () T.Text nt_uchar = (try (char '\\' >> char 'u' >> count 4 hexDigit >>= \cs -> return $ T.pack ('\\':'u':cs)) <|> try (char '\\' >> char 'U' >> count 8 hexDigit >>= \cs -> return $ T.pack ('\\':'U':cs))) -- nt_empty is a line that isn't a comment or a triple. They appear in the -- parsed output as Nothing, whereas a real triple appears as (Just triple). nt_empty :: GenParser () (Maybe Triple) nt_empty = skipMany nt_space >> return Nothing -- A subject is either a URI reference for a resource or a node id for a -- blank node. nt_subject :: GenParser () Node nt_subject = liftM unode nt_uriref <|> liftM bnode nt_blank_node_label -- A predicate may only be a URI reference to a resource. nt_predicate :: GenParser () Node nt_predicate = liftM unode nt_uriref -- An object may be either a resource (represented by a URI reference), -- a blank node (represented by a node id), or an object literal. nt_object :: GenParser () Node nt_object = liftM unode nt_uriref <|> liftM bnode nt_blank_node_label <|> liftM LNode nt_literal validateUNode :: T.Text -> GenParser () Node validateUNode t = case unodeValidate t of Nothing -> unexpected ("Invalid URI in NTriples parser URI validation: " ++ show t) Just u@(UNode{}) -> return u Just node -> unexpected ("Unexpected node in NTriples parser URI validation: " ++ show node) validateURI :: T.Text -> GenParser () T.Text validateURI t = do UNode uri <- validateUNode t return uri isAbsoluteParser :: T.Text -> GenParser () T.Text isAbsoluteParser t = if isAbsoluteUri t then return t else unexpected ("Only absolute IRIs allowed in NTriples format, which this isn't: " ++ show t) absoluteURI :: T.Text -> GenParser () T.Text absoluteURI t = do uri <- isAbsoluteParser t return uri -- A URI reference is one or more nrab_character inside angle brackets. nt_uriref :: GenParser () T.Text nt_uriref = between (char '<') (char '>') $ do unvalidatedUri <- many (satisfy ( /= '>')) t <- validateURI (T.pack unvalidatedUri) absoluteURI t -- [141s] BLANK_NODE_LABEL nt_blank_node_label :: GenParser () T.Text nt_blank_node_label = do void (char '_' >> char ':') s1 <- (nt_pn_chars_u <|> satisfy isDigit) s2 <- option "" $ try $ do sub_dots <- many (char '.') sub_s1 <- many1 nt_pn_chars return (sub_dots ++ sub_s1) return (T.pack ("_:" ++ [s1] ++ s2)) -- [157s] PN_CHARS_BASE -- Not used. It used to be. What's happened? {- nt_pn_chars_base :: GenParser () Char nt_pn_chars_base = try $ do c <- anyChar guard $ isBaseChar c return c -} isBaseChar :: Char -> Bool isBaseChar c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '\x00C0' && c <= '\x00D6') || (c >= '\x00D8' && c <= '\x00F6') || (c >= '\x00F8' && c <= '\x02FF') || (c >= '\x0370' && c <= '\x037D') || (c >= '\x037F' && c <= '\x1FFF') || (c >= '\x200C' && c <= '\x200D') || (c >= '\x2070' && c <= '\x218F') || (c >= '\x2C00' && c <= '\x2FEF') || (c >= '\x3001' && c <= '\xD7FF') || (c >= '\xF900' && c <= '\xFDCF') || (c >= '\xFDF0' && c <= '\xFFFD') || (c >= '\x10000' && c <= '\xEFFFF') -- [158s] PN_CHARS_U nt_pn_chars_u :: GenParser () Char nt_pn_chars_u = try $ do c <- anyChar guard $ c == '_' || c == ':' || isBaseChar c return c -- [160s] PN_CHARS nt_pn_chars :: GenParser () Char nt_pn_chars = nt_pn_chars_u <|> try (do c <- anyChar guard $ c == '-' || c == '\x00B7' || isDigit c || (c >= '\x0300' && c <= '\x036F') || (c >= '\x203F' && c <= '\x2040') return c ) -- End-of-line consists of either lf or crlf. -- We also test for eof and consider that to match as well. nt_eoln :: GenParser () () nt_eoln = eof <|> void (nt_cr >> nt_lf) <|> void nt_lf -- Whitespace is either a space or tab character. We must avoid using the -- built-in space combinator here, because it includes newline. nt_space :: GenParser () Char nt_space = char ' ' <|> nt_tab -- Carriage return is \r. nt_cr :: GenParser () Char nt_cr = char '\r' -- Line feed is \n. nt_lf :: GenParser () Char nt_lf = char '\n' -- Tab is \t. nt_tab :: GenParser () Char nt_tab = char '\t' parseString' :: (Rdf a) => T.Text -> Either ParseFailure (RDF a) parseString' bs = handleParse mkRdf (runParser nt_ntripleDoc () "" bs) parseURL' :: (Rdf a) => String -> IO (Either ParseFailure (RDF a)) parseURL' = _parseURL parseString' parseFile' :: (Rdf a) => String -> IO (Either ParseFailure (RDF a)) parseFile' path = liftM (handleParse mkRdf . runParser nt_ntripleDoc () path) (TIO.readFile path) handleParse :: {-forall rdf. (RDF rdf) => -} (Triples -> Maybe BaseUrl -> PrefixMappings -> (RDF a)) -> Either ParseError [Maybe Triple] -> Either ParseFailure (RDF a) handleParse _mkRdf result -- | T.length rem /= 0 = (Left $ ParseFailure $ "Invalid Document. Unparseable end of document: " ++ T.unpack rem) | otherwise = case result of Left err -> Left $ ParseFailure $ "Parse failure: \n" ++ show err Right ts -> Right $ _mkRdf (conv ts) Nothing (PrefixMappings Map.empty) where conv [] = [] conv (Nothing:ts) = conv ts conv (Just t:ts) = t : conv ts