module Text.RDF.RDF4H.NTriplesParser
( NTriplesParser(NTriplesParser)
, NTriplesParserCustom(NTriplesParserCustom)
, ParseFailure
, nt_echar, nt_uchar, nt_langtag
, string_literal_quote, nt_string_literal_quote
, nt_pn_chars_base, nt_comment
, readFile
) where
import Prelude hiding (readFile)
import Data.Semigroup ((<>))
import Data.Char (isDigit, isLetter, isAlphaNum, isAsciiUpper, isAsciiLower)
import Control.Applicative
import Control.Monad (void)
import Data.RDF.Types hiding (empty)
import Data.RDF.IRI
import Text.RDF.RDF4H.ParserUtils
import Data.Attoparsec.ByteString (parse, IResult(..))
import Text.Parsec (runParser, ParseError)
import Text.Parser.LookAhead
import Text.Parser.Char
import Text.Parser.Combinators
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import System.IO (IOMode(..), withFile, hSetNewlineMode, noNewlineTranslation, hSetEncoding, utf8)
data NTriplesParser = NTriplesParser
newtype NTriplesParserCustom = NTriplesParserCustom Parser
instance RdfParser NTriplesParser where
parseString _ = parseStringParsec
parseFile _ = parseFileParsec
parseURL _ = parseURLParsec
instance RdfParser NTriplesParserCustom where
parseString (NTriplesParserCustom Parsec) = parseStringParsec
parseString (NTriplesParserCustom Attoparsec) = parseStringAttoparsec
parseFile (NTriplesParserCustom Parsec) = parseFileParsec
parseFile (NTriplesParserCustom Attoparsec) = parseFileAttoparsec
parseURL (NTriplesParserCustom Parsec) = parseURLParsec
parseURL (NTriplesParserCustom Attoparsec) = parseURLAttoparsec
nt_ntripleDoc :: (CharParsing m, LookAheadParsing m, Monad m) => m [Triple]
nt_ntripleDoc = many sep *> sepEndBy (try nt_triple) (many sep) <* eof
where sep = many nt_space *> (try nt_comment <|> try nt_eoln) *> many nt_space
nt_triple :: (CharParsing m, LookAheadParsing m, Monad m) => m Triple
nt_triple = Triple
<$> (nt_subject <* optional (skipSome nt_space))
<*> (nt_predicate <* optional (skipSome nt_space))
<*> (nt_object <* optional (skipSome nt_space) <* char '.' <* many nt_space)
nt_literal :: (CharParsing m, Monad m) => m LValue
nt_literal = do
str <- nt_string_literal_quote
option (plainL str) (langTag str <|> typeIRI str)
where
langTag str = plainLL str <$> try nt_langtag
typeIRI str = typedL str <$> try (count 2 (char '^') *> nt_iriref)
nt_string_literal_quote :: (CharParsing m, Monad m) => m T.Text
nt_string_literal_quote = string_literal_quote '"'
string_literal_quote :: (CharParsing m, Monad m) => Char -> m T.Text
string_literal_quote d = between (char d) (char d) string_literal
where string_literal = T.pack <$> many (try validLiteralChar)
validLiteralChar = noneOf [d,'\x5C','\xA','\xD']
<|> nt_echar
<|> nt_uchar
nt_langtag :: (CharParsing m, Monad m) => m T.Text
nt_langtag = do
ss <- char '@' *> some (satisfy isLetter)
rest <- concat <$> many (char '-' *> some (satisfy isAlphaNum) >>= \lang_str -> pure ('-':lang_str))
pure (T.pack (ss <> rest))
nt_iriref :: (CharParsing m, Monad m) => m T.Text
nt_iriref = between (char '<') (char '>') $ do
raw_iri <- iriFragment
either (const empty) pure (validateIRI raw_iri) <?> "Only absolute IRIs allowed in NTriples format, which this isn't: " <> show raw_iri
nt_echar :: (CharParsing m, Monad m) => m Char
nt_echar = try $ do
c2 <- char '\\' *> anyChar
case c2 of
't' -> pure '\t'
'b' -> pure '\b'
'n' -> pure '\n'
'r' -> pure '\r'
'f' -> pure '\f'
'"' -> pure '\"'
'\'' -> pure '\''
'\\' -> pure '\\'
_ -> empty
nt_uchar :: (CharParsing m, Monad m) => m Char
nt_uchar = uchar
nt_subject :: (CharParsing m, LookAheadParsing m, Monad m) => m Node
nt_subject = unode <$> try nt_iriref
<|> bnode <$> nt_blank_node_label
nt_predicate :: (CharParsing m, Monad m) => m Node
nt_predicate = unode <$> nt_iriref
nt_object :: (CharParsing m, LookAheadParsing m, Monad m) => m Node
nt_object = unode <$> try nt_iriref
<|> bnode <$> try nt_blank_node_label
<|> LNode <$> nt_literal
nt_blank_node_label :: (CharParsing m, LookAheadParsing m, Monad m) => m T.Text
nt_blank_node_label = do
void (string "_:")
firstChar <- nt_pn_chars_u <|> satisfy isDigit
otherChars <- option "" $ try $
many (nt_pn_chars <|> try (char '.' <* lookAhead (try nt_pn_chars)))
pure $ T.pack (firstChar : otherChars)
nt_pn_chars_base :: CharParsing m => m Char
nt_pn_chars_base = try $ satisfy isBaseChar
where isBaseChar c = (isAsciiUpper c)
|| (isAsciiLower c)
|| (c >= '\x00C0' && c <= '\x00D6')
|| (c >= '\x00D8' && c <= '\x00F6')
|| (c >= '\x00F8' && c <= '\x02FF')
|| (c >= '\x0370' && c <= '\x037D')
|| (c >= '\x037F' && c <= '\x1FFF')
|| (c >= '\x200C' && c <= '\x200D')
|| (c >= '\x2070' && c <= '\x218F')
|| (c >= '\x2C00' && c <= '\x2FEF')
|| (c >= '\x3001' && c <= '\xD7FF')
|| (c >= '\xF900' && c <= '\xFDCF')
|| (c >= '\xFDF0' && c <= '\xFFFD')
|| (c >= '\x10000' && c <= '\xEFFFF')
nt_pn_chars_u :: CharParsing m => m Char
nt_pn_chars_u = nt_pn_chars_base <|> try (char '_') <|> try (char ':')
nt_pn_chars :: CharParsing m => m Char
nt_pn_chars = nt_pn_chars_u
<|> try (char '-')
<|> try (char '\x00B7')
<|> try (satisfy f)
where f c = isDigit c
|| (c >= '\x0300' && c <= '\x036F')
|| (c >= '\x203F' && c <= '\x2040')
nt_eoln :: CharParsing m => m ()
nt_eoln = try (void (string "\r\n")) <|> void (char '\n')
nt_space :: CharParsing m => m ()
nt_space = void (try (char ' ') <|> try (char '\t'))
nt_comment :: CharParsing m => m ()
nt_comment = void (char '#' *> manyTill anyChar (try nt_eoln))
parseStringParsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
parseStringParsec bs = handleParsec mkRdf (runParser nt_ntripleDoc () "" bs)
parseFileParsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseFileParsec path =
handleParsec mkRdf . runParser nt_ntripleDoc () path
<$> readFile path
readFile :: FilePath -> IO T.Text
readFile fpath = withFile fpath ReadMode $ \h -> do
hSetNewlineMode h noNewlineTranslation
hSetEncoding h utf8
T.hGetContents h
parseURLParsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseURLParsec = parseFromURL parseStringParsec
handleParsec :: (Triples -> Maybe BaseUrl -> PrefixMappings -> RDF a) ->
Either ParseError [Triple] -> Either ParseFailure (RDF a)
handleParsec _mkRdf result = case result of
Left err -> Left $ ParseFailure $ "Parse failure: \n" <> show err
Right ts -> Right $ _mkRdf ts Nothing (PrefixMappings mempty)
parseFileAttoparsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseFileAttoparsec path = handleAttoparsec <$> readFile path
parseURLAttoparsec :: (Rdf a) => String -> IO (Either ParseFailure (RDF a))
parseURLAttoparsec = parseFromURL handleAttoparsec
parseStringAttoparsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
parseStringAttoparsec = handleAttoparsec
handleAttoparsec :: (Rdf a) => T.Text -> Either ParseFailure (RDF a)
handleAttoparsec bs = handleResult $ parse nt_ntripleDoc (T.encodeUtf8 bs)
where
handleResult res = case res of
Fail _i _contexts err -> Left $ ParseFailure $ "Parse failure: \n" <> show err
Partial f -> handleResult (f (T.encodeUtf8 mempty))
Done _ ts -> Right $ mkRdf ts Nothing (PrefixMappings mempty)