module Text.RDF.RDF4H.NTriplesParser
(NTriplesParser(NTriplesParser), NTriplesParserCustom(NTriplesParserCustom),ParseFailure)
where
import Prelude hiding (init,pred)
import Data.RDF.Types
import Text.RDF.RDF4H.ParserUtils
import Data.Attoparsec.ByteString (parse,IResult(..))
import Data.Char (isLetter, isDigit,isAlphaNum, isAsciiUpper, isAsciiLower)
import Data.Map as Map (empty)
import qualified Data.Text.Encoding as T
import Text.Parsec (runParser,ParseError)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Control.Monad (void)
import Text.Parser.Char
import Text.Parser.Combinators
import Control.Applicative
import Data.Maybe (catMaybes)
data NTriplesParser = NTriplesParser
data NTriplesParserCustom = NTriplesParserCustom Parser
instance RdfParser NTriplesParser where
parseString _ = parseStringParsec
parseFile _ = parseFileParsec
parseURL _ = parseURLParsec
instance RdfParser NTriplesParserCustom where
parseString (NTriplesParserCustom Parsec) = parseStringParsec
parseString (NTriplesParserCustom Attoparsec) = parseStringAttoparsec
parseFile (NTriplesParserCustom Parsec) = parseFileParsec
parseFile (NTriplesParserCustom Attoparsec) = parseFileAttoparsec
parseURL (NTriplesParserCustom Parsec) = parseURLParsec
parseURL (NTriplesParserCustom Attoparsec) = parseURLAttoparsec
nt_ntripleDoc :: (CharParsing m, Monad m) => m [Maybe Triple]
nt_ntripleDoc = manyTill nt_line eof
nt_line :: (CharParsing m, Monad m) => m (Maybe Triple)
nt_line =
skipMany nt_space *>
(nt_comment
<|> try (nt_triple <* nt_eoln)
<|> try (nt_triple <* char '#' <* manyTill anyChar nt_eoln)
<|> (nt_empty <* nt_eoln)
)
nt_comment :: CharParsing m => m (Maybe Triple)
nt_comment = char '#' *> manyTill anyChar nt_eoln *> pure Nothing
nt_triple :: (CharParsing m, Monad m) => m (Maybe Triple)
nt_triple =
do
subj <- nt_subject <* optional (skipSome nt_space)
pred <- nt_predicate <* optional (skipSome nt_space)
obj <- nt_object <* optional (skipSome nt_space) <* char '.' <* many nt_space
pure $ Just (Triple subj pred obj)
nt_literal :: (CharParsing m, Monad m) => m LValue
nt_literal = do
s <- escapeRDFSyntax <$> nt_string_literal_quote
option (plainL s) $
(count 2 (char '^') *> nt_iriref >>= validateURI >>= isAbsoluteParser >>= \iri -> pure (typedL s iri))
<|> (plainLL s <$> nt_langtag)
nt_string_literal_quote :: CharParsing m => m T.Text
nt_string_literal_quote =
between (char '"') (char '"') $
T.concat <$> many ((T.singleton <$> noneOf ['\x22','\x5C','\xA','\xD']) <|>
nt_echar <|>
nt_uchar)
nt_langtag :: (CharParsing m, Monad m) => m T.Text
nt_langtag = do
ss <- char '@' *> some (satisfy isLetter)
rest <- concat <$> many ((:) <$> char '-' <*> some (satisfy isAlphaNum))
pure (T.pack (ss ++ rest))
nt_iriref :: CharParsing m => m T.Text
nt_iriref =
between (char '<') (char '>') $
T.concat <$> many ( T.singleton <$> noneOf (['\x00'..'\x20'] ++ "<>\"{}|^`\\") <|> nt_uchar )
nt_echar :: CharParsing m => m T.Text
nt_echar = try $ T.singleton <$> (char '\\' *> satisfy isEchar)
isEchar :: Char -> Bool
isEchar = (`elem` ['t','b','n','r','f','"','\'','\\'])
nt_uchar :: CharParsing m => m T.Text
nt_uchar =
try (T.pack <$> ((++) <$> string "\\u" <*> count 4 hexDigit)) <|>
try (T.pack <$> ((++) <$> string "\\U" <*> count 8 hexDigit))
nt_empty :: CharParsing m => m (Maybe Triple)
nt_empty = skipMany nt_space *> pure Nothing
nt_subject :: (CharParsing m, Monad m) => m Node
nt_subject =
unode <$> nt_uriref <|>
bnode <$> nt_blank_node_label
nt_predicate :: (CharParsing m, Monad m) => m Node
nt_predicate = unode <$> nt_uriref
nt_object :: (CharParsing m, Monad m) => m Node
nt_object =
unode <$> nt_uriref <|>
bnode <$> nt_blank_node_label <|>
LNode <$> nt_literal
validateUNode :: CharParsing m => T.Text -> m Node
validateUNode t =
case unodeValidate t of
Just u@UNode{} -> pure u
Just node -> unexpected ("Unexpected node in NTriples parser URI validation: " ++ show node)
Nothing -> unexpected ("Invalid URI in NTriples parser URI validation: " ++ show t)
validateURI :: (CharParsing m, Monad m) => T.Text -> m T.Text
validateURI t = do
UNode uri <- validateUNode t
pure uri
isAbsoluteParser :: CharParsing m => T.Text -> m T.Text
isAbsoluteParser t =
if isAbsoluteUri t
then pure t
else unexpected ("Only absolute IRIs allowed in NTriples format, which this isn't: " ++ show t)
absoluteURI :: CharParsing m => T.Text -> m T.Text
absoluteURI = isAbsoluteParser
nt_uriref :: (CharParsing m, Monad m) => m T.Text
nt_uriref = between (char '<') (char '>') $ do
unvalidatedUri <- many (satisfy ( /= '>'))
absoluteURI =<< validateURI (T.pack unvalidatedUri)
nt_blank_node_label :: (CharParsing m, Monad m) => m T.Text
nt_blank_node_label = do
void (char '_' *> char ':')
s1 <- nt_pn_chars_u <|> satisfy isDigit
s2 <- option "" $ try $ (++) <$> many (char '.') <*> some nt_pn_chars
pure (T.pack ("_:" ++ s1:s2))
isBaseChar :: Char -> Bool
isBaseChar c
= isAsciiUpper c
|| isAsciiLower c
|| (c >= '\x00C0' && c <= '\x00D6')
|| (c >= '\x00D8' && c <= '\x00F6')
|| (c >= '\x00F8' && c <= '\x02FF')
|| (c >= '\x0370' && c <= '\x037D')
|| (c >= '\x037F' && c <= '\x1FFF')
|| (c >= '\x200C' && c <= '\x200D')
|| (c >= '\x2070' && c <= '\x218F')
|| (c >= '\x2C00' && c <= '\x2FEF')
|| (c >= '\x3001' && c <= '\xD7FF')
|| (c >= '\xF900' && c <= '\xFDCF')
|| (c >= '\xFDF0' && c <= '\xFFFD')
|| (c >= '\x10000' && c <= '\xEFFFF')
nt_pn_chars_u :: CharParsing m => m Char
nt_pn_chars_u = try $ satisfy $ \c -> c == '_' || c == ':' || isBaseChar c
nt_pn_chars :: CharParsing m => m Char
nt_pn_chars = nt_pn_chars_u
<|> try (satisfy $ \c ->
c == '-' || c == '\x00B7'
|| isDigit c
|| (c >= '\x0300' && c <= '\x036F')
|| (c >= '\x203F' && c <= '\x2040')
)
nt_eoln :: CharParsing m => m ()
nt_eoln = eof <|> void (nt_cr *> nt_lf) <|> void nt_lf
nt_space :: CharParsing m => m Char
nt_space = char ' ' <|> nt_tab
nt_cr :: CharParsing m => m Char
nt_cr = char '\r'
nt_lf :: CharParsing m => m Char
nt_lf = char '\n'
nt_tab :: CharParsing m => m Char
nt_tab = char '\t'
parseStringParsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
parseStringParsec bs = handleParsec mkRdf (runParser nt_ntripleDoc () "" bs)
parseFileParsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseFileParsec path =
handleParsec mkRdf . runParser nt_ntripleDoc () path
<$> TIO.readFile path
parseURLParsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseURLParsec = _parseURL parseStringParsec
handleParsec :: (Triples -> Maybe BaseUrl -> PrefixMappings -> RDF a) ->
Either ParseError [Maybe Triple] ->
Either ParseFailure (RDF a)
handleParsec _mkRdf result
= case result of
Left err -> Left $ ParseFailure $ "Parse failure: \n" ++ show err
Right ts -> Right $ _mkRdf (catMaybes ts) Nothing (PrefixMappings Map.empty)
parseFileAttoparsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseFileAttoparsec path = handleAttoparsec <$> TIO.readFile path
parseURLAttoparsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseURLAttoparsec = _parseURL handleAttoparsec
parseStringAttoparsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
parseStringAttoparsec = handleAttoparsec
handleAttoparsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
handleAttoparsec bs = handleResult $ parse nt_ntripleDoc (T.encodeUtf8 bs)
where
handleResult res = case res of
Fail _i _contexts err -> Left $ ParseFailure $ "Parse failure: \n" ++ show err
Partial f -> handleResult (f (T.encodeUtf8 T.empty))
Done _ ts -> Right $ mkRdf (catMaybes ts) Nothing (PrefixMappings Map.empty)