module Text.RDF.RDF4H.TurtleParser(
TurtleParser(TurtleParser)
)
where
import Data.RDF
import Data.RDF.Namespace
import Text.RDF.RDF4H.ParserUtils
import Text.Parsec
import Text.Parsec.ByteString.Lazy
import qualified Data.Map as Map
import Data.ByteString.Lazy.Char8(ByteString)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BL
import Data.Sequence(Seq, (|>))
import qualified Data.Sequence as Seq
import qualified Data.Foldable as F
import Data.Char (isDigit)
import Control.Monad
import Data.Maybe (fromMaybe)
import Debug.Trace(trace)
_trace = trace
data TurtleParser = TurtleParser (Maybe BaseUrl) (Maybe ByteString)
instance RdfParser TurtleParser where
parseString (TurtleParser bUrl dUrl) = parseString' bUrl dUrl
parseFile (TurtleParser bUrl dUrl) = parseFile' bUrl dUrl
parseURL (TurtleParser bUrl dUrl) = parseURL' bUrl dUrl
type ParseState =
(Maybe BaseUrl,
Maybe ByteString,
Int,
PrefixMappings,
[Subject],
[Predicate],
[Bool],
Seq Triple)
t_turtleDoc :: GenParser ByteString ParseState (Seq Triple, PrefixMappings)
t_turtleDoc =
many t_statement >> (eof <?> "eof") >> getState >>= \(_, _, _, pms, _, _, _, ts) -> return (ts, pms)
t_statement :: GenParser ByteString ParseState ()
t_statement = d <|> t <|> void (many1 t_ws <?> "blankline-whitespace")
where
d = void
(try t_directive >> (many t_ws <?> "directive-whitespace1") >>
(char '.' <?> "end-of-directive-period") >>
(many t_ws <?> "directive-whitespace2"))
t = void
(t_triples >> (many t_ws <?> "triple-whitespace1") >>
(char '.' <?> "end-of-triple-period") >>
(many t_ws <?> "triple-whitespace2"))
t_triples :: GenParser ByteString ParseState ()
t_triples = t_subject >> (many1 t_ws <?> "subject-predicate-whitespace") >> t_predicateObjectList >> resetSubjectPredicate
t_directive :: GenParser ByteString ParseState ()
t_directive = t_prefixID <|> t_base
t_resource :: GenParser ByteString ParseState ByteString
t_resource = try t_uriref <|> t_qname
t_prefixID :: GenParser ByteString ParseState ()
t_prefixID =
do try (string "@prefix" <?> "@prefix-directive")
pre <- (many1 t_ws <?> "whitespace-after-@prefix") >> option B.empty t_prefixName
char ':' >> (many1 t_ws <?> "whitespace-after-@prefix-colon")
uriFrag <- t_uriref
(bUrl, dUrl, _, PrefixMappings pms, _, _, _, _) <- getState
updatePMs $ Just (PrefixMappings $ Map.insert pre (absolutizeUrl bUrl dUrl uriFrag) pms)
return ()
t_base :: GenParser ByteString ParseState ()
t_base =
do try (string "@base" <?> "@base-directive")
many1 t_ws <?> "whitespace-after-@base"
urlFrag <- t_uriref
bUrl <- currBaseUrl
dUrl <- currDocUrl
updateBaseUrl (Just $ Just $ newBaseUrl bUrl (absolutizeUrl bUrl dUrl urlFrag))
t_verb :: GenParser ByteString ParseState ()
t_verb = (try t_predicate <|> (char 'a' >> return rdfTypeNode)) >>= pushPred
t_predicate :: GenParser ByteString ParseState Node
t_predicate = liftM (UNode . mkFastString) (t_resource <?> "resource")
t_nodeID :: GenParser ByteString ParseState ByteString
t_nodeID = do { try (string "_:"); cs <- t_name; return $! s2b "_:" `B.append` cs }
t_qname :: GenParser ByteString ParseState ByteString
t_qname =
do pre <- option B.empty (try t_prefixName)
char ':'
name <- option B.empty t_name
(bUrl, _, _, pms, _, _, _, _) <- getState
return $ resolveQName bUrl pre pms `B.append` name
t_subject :: GenParser ByteString ParseState ()
t_subject =
simpleBNode <|>
resource <|>
nodeId <|>
between (char '[') (char ']') poList
where
resource = liftM (UNode . mkFastString) (t_resource <?> "subject resource") >>= pushSubj
nodeId = liftM (BNode . mkFastString) (t_nodeID <?> "subject nodeID") >>= pushSubj
simpleBNode = try (string "[]") >> nextIdCounter >>= pushSubj . BNodeGen
poList = void
(nextIdCounter >>= pushSubj . BNodeGen >> many t_ws >>
t_predicateObjectList >>
many t_ws)
t_predicateObjectList :: GenParser ByteString ParseState ()
t_predicateObjectList =
do t_verb <?> "verb"
many1 t_ws <?> "polist-whitespace-after-verb"
t_objectList <?> "polist-objectList"
many (try (many t_ws >> char ';') >> many t_ws >> t_verb >> many1 t_ws >> t_objectList >> popPred)
popPred
return ()
t_objectList :: GenParser ByteString ParseState ()
t_objectList =
void
((t_object <?> "object") >>
many (try (many t_ws >> char ',' >> many t_ws >> t_object)))
t_object :: GenParser ByteString ParseState ()
t_object =
do inColl <- isInColl
onFirstItem <- onCollFirstItem
let processObject = (t_literal >>= addTripleForObject) <|>
(liftM (UNode . mkFastString) t_resource >>= addTripleForObject) <|>
blank_as_obj <|> t_collection
case (inColl, onFirstItem) of
(False, _) -> processObject
(True, True) -> liftM BNodeGen nextIdCounter >>= \bSubj -> addTripleForObject bSubj >>
pushSubj bSubj >> pushPred rdfFirstNode >> processObject >> collFirstItemProcessed
(True, False) -> liftM BNodeGen nextIdCounter >>= \bSubj -> pushPred rdfRestNode >>
addTripleForObject bSubj >> popPred >> popSubj >>
pushSubj bSubj >> processObject
t_collection:: GenParser ByteString ParseState ()
t_collection =
between (char '(') (char ')') $
do beginColl
many t_ws
emptyColl <- option True (try t_object >> many t_ws >> return False)
if emptyColl then void (addTripleForObject rdfNilNode) else
void
(many (many t_ws >> try t_object >> many t_ws) >> popPred >>
pushPred rdfRestNode >>
addTripleForObject rdfNilNode >>
popPred)
finishColl
return ()
blank_as_obj :: GenParser ByteString ParseState ()
blank_as_obj =
(liftM (BNode . mkFastString) t_nodeID >>= addTripleForObject) <|>
(genBlank >>= addTripleForObject) <|>
poList
where
genBlank = liftM BNodeGen (try (string "[]") >> nextIdCounter)
poList = between (char '[') (char ']') $
liftM BNodeGen nextIdCounter >>= \bSubj ->
void
(addTripleForObject bSubj >>
many t_ws >> pushSubj bSubj >>
t_predicateObjectList >> popSubj >> many t_ws)
rdfTypeNode, rdfNilNode, rdfFirstNode, rdfRestNode :: Node
rdfTypeNode = UNode $ mkFastString $ mkUri rdf $ s2b "type"
rdfNilNode = UNode $ mkFastString $ mkUri rdf $ s2b "nil"
rdfFirstNode = UNode $ mkFastString $ mkUri rdf $ s2b "first"
rdfRestNode = UNode $ mkFastString $ mkUri rdf $ s2b "rest"
xsdIntUri, xsdDoubleUri, xsdDecimalUri, xsdBooleanUri :: FastString
xsdIntUri = mkFastString $! mkUri xsd $! s2b "integer"
xsdDoubleUri = mkFastString $! mkUri xsd $! s2b "double"
xsdDecimalUri = mkFastString $! mkUri xsd $! s2b "decimal"
xsdBooleanUri = mkFastString $! mkUri xsd $! s2b "boolean"
t_literal :: GenParser ByteString ParseState Node
t_literal =
try str_literal <|>
liftM (`mkLNode` xsdIntUri) (try t_integer) <|>
liftM (`mkLNode` xsdDoubleUri) (try t_double) <|>
liftM (`mkLNode` xsdDecimalUri) (try t_decimal) <|>
liftM (`mkLNode` xsdBooleanUri) t_boolean
where
mkLNode :: ByteString -> FastString -> Node
mkLNode bs fs = LNode (typedL bs fs)
str_literal :: GenParser ByteString ParseState Node
str_literal =
do str <- t_quotedString <?> "quotedString"
liftM (LNode . typedL str . mkFastString)
(try (count 2 (char '^')) >> t_resource) <|>
liftM (lnode . plainLL str) (char '@' >> t_language) <|>
return (lnode $ plainL str)
t_quotedString :: GenParser ByteString ParseState ByteString
t_quotedString = t_longString <|> t_string
t_string :: GenParser ByteString ParseState ByteString
t_string = liftM B.concat (between (char '"') (char '"') (many t_scharacter))
t_longString :: GenParser ByteString ParseState ByteString
t_longString =
do
try tripleQuote
strVal <- liftM B.concat (many longString_char)
tripleQuote
return strVal
where
tripleQuote = count 3 (char '"')
t_integer :: GenParser ByteString ParseState ByteString
t_integer =
do sign <- sign_parser <?> "+-"
ds <- many1 digit <?> "digit"
notFollowedBy (char '.')
return $! (s2b sign `B.append` s2b ds)
t_double :: GenParser ByteString ParseState ByteString
t_double =
do sign <- sign_parser <?> "+-"
rest <- try (do { ds <- many1 digit <?> "digit"; char '.'; ds' <- many digit <?> "digit"; e <- t_exponent <?> "exponent"; return (s2b ds `B.snoc` '.' `B.append` s2b ds' `B.append` e) }) <|>
try (do { char '.'; ds <- many1 digit <?> "digit"; e <- t_exponent <?> "exponent"; return ('.' `B.cons` s2b ds `B.append` e) }) <|>
try (do { ds <- many1 digit <?> "digit"; e <- t_exponent <?> "exponent"; return (s2b ds `B.append` e) })
return $! s2b sign `B.append` rest
sign_parser :: GenParser ByteString ParseState String
sign_parser = option "" (oneOf "-+" >>= (\c -> return [c]))
t_decimal :: GenParser ByteString ParseState ByteString
t_decimal =
do sign <- sign_parser
rest <- try (do ds <- many digit <?> "digit"; char '.'; ds' <- option "" (many digit); return (ds ++ ('.':ds')))
<|> try (do { char '.'; ds <- many1 digit <?> "digit"; return ('.':ds) })
<|> many1 digit <?> "digit"
return $ s2b sign `B.append` s2b rest
t_exponent :: GenParser ByteString ParseState ByteString
t_exponent = do e <- oneOf "eE"
s <- option "" (oneOf "-+" >>= \c -> return [c])
ds <- many1 digit;
return $! (e `B.cons` (s2b s `B.append` s2b ds))
t_boolean :: GenParser ByteString ParseState ByteString
t_boolean =
try (liftM s2b (string "true") <|>
liftM s2b (string "true"))
t_comment :: GenParser ByteString ParseState ()
t_comment =
void (char '#' >> many (satisfy (\ c -> c /= '\n' && c /= '\r')))
t_ws :: GenParser ByteString ParseState ()
t_ws =
(void (try (char '\t' <|> char '\n' <|> char '\r' <|> char ' '))
<|> try t_comment)
<?> "whitespace-or-comment"
t_language :: GenParser ByteString ParseState ByteString
t_language =
do init <- many1 lower;
rest <- many (do {char '-'; cs <- many1 (lower <|> digit); return (s2b ('-':cs))})
return $! (s2b init `B.append` B.concat rest)
identifier :: GenParser ByteString ParseState Char -> GenParser ByteString ParseState Char -> GenParser ByteString ParseState ByteString
identifier initial rest = initial >>= \i -> many rest >>= \r -> return (s2b (i:r))
t_prefixName :: GenParser ByteString ParseState ByteString
t_prefixName = identifier t_nameStartCharMinusUnderscore t_nameChar
t_name :: GenParser ByteString ParseState ByteString
t_name = identifier t_nameStartChar t_nameChar
t_uriref :: GenParser ByteString ParseState ByteString
t_uriref = between (char '<') (char '>') t_relativeURI
t_relativeURI :: GenParser ByteString ParseState ByteString
t_relativeURI =
do frag <- liftM (B.pack . concat) (many t_ucharacter)
bUrl <- currBaseUrl
dUrl <- currDocUrl
return $ absolutizeUrl bUrl dUrl frag
t_ucharacter :: GenParser ByteString ParseState String
t_ucharacter =
try (liftM B.unpack unicode_escape) <|>
try (string "\\>") <|>
liftM B.unpack (non_ctrl_char_except ">")
t_nameChar :: GenParser ByteString ParseState Char
t_nameChar = t_nameStartChar <|> char '-' <|> char '\x00B7' <|> satisfy f
where
f = flip in_range [('0', '9'), ('\x0300', '\x036F'), ('\x203F', '\x2040')]
longString_char :: GenParser ByteString ParseState ByteString
longString_char =
specialChar <|>
try escapedChar <|>
try twoDoubleQuote <|>
try oneDoubleQuote <|>
safeNonCtrlChar <|>
try unicode_escape
where
specialChar = oneOf "\t\n\r" >>= bs1
escapedChar =
do char '\\'
(char 't' >> bs1 '\t') <|> (char 'n' >> bs1 '\n') <|> (char 'r' >> bs1 '\r') <|>
(char '\\' >> bs1 '\\') <|> (char '"' >> bs1 '"')
twoDoubleQuote = string "\"\"" >> notFollowedBy (char '"') >> bs "\"\""
oneDoubleQuote = char '"' >> notFollowedBy (char '"') >> bs1 '"'
safeNonCtrlChar = non_ctrl_char_except "\\\""
bs1 :: Char -> GenParser ByteString ParseState ByteString
bs1 = return . B.singleton
bs :: String -> GenParser ByteString ParseState ByteString
bs = return . B.pack
t_nameStartChar :: GenParser ByteString ParseState Char
t_nameStartChar = char '_' <|> t_nameStartCharMinusUnderscore
t_nameStartCharMinusUnderscore :: GenParser ByteString ParseState Char
t_nameStartCharMinusUnderscore = try $ satisfy $ flip in_range blocks
where
blocks = [('A', 'Z'), ('a', 'z'), ('\x00C0', '\x00D6'),
('\x00D8', '\x00F6'), ('\x00F8', '\x02FF'),
('\x0370', '\x037D'), ('\x037F', '\x1FFF'),
('\x200C', '\x200D'), ('\x2070', '\x218F'),
('\x2C00', '\x2FEF'), ('\x3001', '\xD7FF'),
('\xF900', '\xFDCF'), ('\xFDF0', '\xFFFD'),
('\x10000', '\xEFFFF')]
t_hex :: GenParser ByteString ParseState Char
t_hex = satisfy (\c -> isDigit c || (c >= 'A' && c <= 'F')) <?> "hexadecimal digit"
t_scharacter :: GenParser ByteString ParseState ByteString
t_scharacter =
(try (string "\\\"") >> return (B.singleton '"'))
<|> try (do {char '\\';
(char 't' >> return (B.singleton '\t')) <|>
(char 'n' >> return (B.singleton '\n')) <|>
(char 'r' >> return (B.singleton '\r'))})
<|> unicode_escape
<|> (non_ctrl_char_except "\\\"" >>= \s -> return $! s)
unicode_escape :: GenParser ByteString ParseState ByteString
unicode_escape =
(char '\\' >> return (B.singleton '\\')) >>
((char '\\' >> return (s2b "\\\\")) <|>
(char 'u' >> count 4 t_hex >>= \cs -> return $! s2b "\\u" `B.append` s2b cs) <|>
(char 'U' >> count 8 t_hex >>= \cs -> return $! s2b "\\U" `B.append` s2b cs))
non_ctrl_char_except :: String -> GenParser ByteString ParseState ByteString
non_ctrl_char_except cs =
liftM B.singleton
(satisfy (\ c -> c <= '\1114111' && (c >= ' ' && c `notElem` cs)))
in_range :: Char -> [(Char, Char)] -> Bool
in_range c = any (\(c1, c2) -> c >= c1 && c <= c2)
resolveQName :: Maybe BaseUrl -> ByteString -> PrefixMappings -> ByteString
resolveQName mbaseUrl prefix (PrefixMappings pms') =
case (mbaseUrl, B.null prefix) of
(Just (BaseUrl base), True) -> Map.findWithDefault base BL.empty pms'
(Nothing, True) -> err1
(_, _ ) -> Map.findWithDefault err2 prefix pms'
where
err1 = error "Cannot resolve empty QName prefix to a Base URL."
err2 = error ("Cannot resolve QName prefix: " ++ B.unpack prefix)
absolutizeUrl :: Maybe BaseUrl -> Maybe ByteString -> ByteString -> ByteString
absolutizeUrl mbUrl mdUrl urlFrag =
if isAbsoluteUri urlFrag then urlFrag else
(case (mbUrl, mdUrl) of
(Nothing, Nothing) -> urlFrag
(Just (BaseUrl bUrl), Nothing) -> bUrl `B.append` urlFrag
(Nothing, Just dUrl) -> if isHash urlFrag then
dUrl `B.append` urlFrag else urlFrag
(Just (BaseUrl bUrl), Just dUrl) -> (if isHash urlFrag then dUrl
else bUrl)
`B.append` urlFrag)
where
isHash bs = B.length bs == 1 && B.head bs == '#'
isAbsoluteUri :: ByteString -> Bool
isAbsoluteUri = B.elem ':'
newBaseUrl :: Maybe BaseUrl -> ByteString -> BaseUrl
newBaseUrl Nothing url = BaseUrl url
newBaseUrl (Just (BaseUrl bUrl)) url = BaseUrl $! mkAbsoluteUrl bUrl url
mkAbsoluteUrl :: ByteString -> ByteString -> ByteString
mkAbsoluteUrl base url =
if isAbsoluteUri url then url else base `B.append` url
currBaseUrl :: GenParser ByteString ParseState (Maybe BaseUrl)
currBaseUrl = getState >>= \(bUrl, _, _, _, _, _, _, _) -> return bUrl
currDocUrl :: GenParser ByteString ParseState (Maybe ByteString)
currDocUrl = getState >>= \(_, dUrl, _, _, _, _, _, _) -> return dUrl
pushSubj :: Subject -> GenParser ByteString ParseState ()
pushSubj s = getState >>= \(bUrl, dUrl, i, pms, ss, ps, cs, ts) ->
setState (bUrl, dUrl, i, pms, s:ss, ps, cs, ts)
popSubj :: GenParser ByteString ParseState Subject
popSubj = getState >>= \(bUrl, dUrl, i, pms, ss, ps, cs, ts) ->
setState (bUrl, dUrl, i, pms, tail ss, ps, cs, ts) >>
when (null ss) (error "Cannot pop subject off empty stack.") >>
return (head ss)
pushPred :: Predicate -> GenParser ByteString ParseState ()
pushPred p = getState >>= \(bUrl, dUrl, i, pms, ss, ps, cs, ts) ->
setState (bUrl, dUrl, i, pms, ss, p:ps, cs, ts)
popPred :: GenParser ByteString ParseState Predicate
popPred = getState >>= \(bUrl, dUrl, i, pms, ss, ps, cs, ts) ->
setState (bUrl, dUrl, i, pms, ss, tail ps, cs, ts) >>
when (null ps) (error "Cannot pop predicate off empty stack.") >>
return (head ps)
isInColl :: GenParser ByteString ParseState Bool
isInColl = getState >>= \(_, _, _, _, _, _, cs, _) -> return . not . null $ cs
updateBaseUrl :: Maybe (Maybe BaseUrl) -> GenParser ByteString ParseState ()
updateBaseUrl val = _modifyState val no no no no no
nextIdCounter :: GenParser ByteString ParseState Int
nextIdCounter = getState >>= \(bUrl, dUrl, i, pms, s, p, cs, ts) ->
setState (bUrl, dUrl, i+1, pms, s, p, cs, ts) >> return i
updatePMs :: Maybe PrefixMappings -> GenParser ByteString ParseState ()
updatePMs val = _modifyState no no val no no no
beginColl :: GenParser ByteString ParseState ()
beginColl = getState >>= \(bUrl, dUrl, i, pms, s, p, cs, ts) ->
setState (bUrl, dUrl, i, pms, s, p, True:cs, ts)
onCollFirstItem :: GenParser ByteString ParseState Bool
onCollFirstItem = getState >>= \(_, _, _, _, _, _, cs, _) -> return (not (null cs) && head cs)
collFirstItemProcessed :: GenParser ByteString ParseState ()
collFirstItemProcessed = getState >>= \(bUrl, dUrl, i, pms, s, p, _:cs, ts) ->
setState (bUrl, dUrl, i, pms, s, p, False:cs, ts)
finishColl :: GenParser ByteString ParseState Bool
finishColl = getState >>= \(bUrl, dUrl, i, pms, s, p, cs, ts) ->
let cs' = drop 1 cs
in setState (bUrl, dUrl, i, pms, s, p, cs', ts) >> return (not $ null cs')
no :: Maybe a
no = Nothing
resetSubjectPredicate :: GenParser ByteString ParseState ()
resetSubjectPredicate =
getState >>= \(bUrl, dUrl, n, pms, _, _, cs, ts) ->
setState (bUrl, dUrl, n, pms, [], [], cs, ts)
_modifyState :: Maybe (Maybe BaseUrl) -> Maybe (Int -> Int) -> Maybe PrefixMappings ->
Maybe Subject -> Maybe Predicate -> Maybe (Seq Triple) ->
GenParser ByteString ParseState ()
_modifyState mb_bUrl mb_n mb_pms mb_subj mb_pred mb_trps =
do (_bUrl, _dUrl, _n, _pms, _s, _p, _cs, _ts) <- getState
setState (fromMaybe _bUrl mb_bUrl,
_dUrl,
maybe _n (const _n) mb_n,
fromMaybe _pms mb_pms,
maybe _s (: _s) mb_subj,
maybe _p (: _p) mb_pred,
_cs,
fromMaybe _ts mb_trps)
addTripleForObject :: Object -> GenParser ByteString ParseState ()
addTripleForObject obj =
do (bUrl, dUrl, i, pms, ss, ps, cs, ts) <- getState
when (null ss) $
error $ "No Subject with which to create triple for: " ++ show obj
when (null ps) $
error $ "No Predicate with which to create triple for: " ++ show obj
setState (bUrl, dUrl, i, pms, ss, ps, cs, ts |> Triple (head ss) (head ps) obj)
--p
parseURL' :: forall rdf. (RDF rdf) =>
Maybe BaseUrl
-> Maybe ByteString
-> String
-> IO (Either ParseFailure rdf)
parseURL' bUrl docUrl = _parseURL (parseString' bUrl docUrl)
parseFile' :: forall rdf. (RDF rdf) => Maybe BaseUrl -> Maybe ByteString -> String -> IO (Either ParseFailure rdf)
parseFile' bUrl docUrl fpath =
B.readFile fpath >>= \bs -> return $ handleResult bUrl (runParser t_turtleDoc initialState (maybe "" B.unpack docUrl) bs)
where initialState = (bUrl, docUrl, 1, PrefixMappings Map.empty, [], [], [], Seq.empty)
parseString' :: forall rdf. (RDF rdf) => Maybe BaseUrl -> Maybe ByteString -> ByteString -> Either ParseFailure rdf
parseString' bUrl docUrl ttlStr = handleResult bUrl (runParser t_turtleDoc initialState "" ttlStr)
where initialState = (bUrl, docUrl, 1, PrefixMappings Map.empty, [], [], [], Seq.empty)
handleResult :: RDF rdf => Maybe BaseUrl -> Either ParseError (Seq Triple, PrefixMappings) -> Either ParseFailure rdf
handleResult bUrl result =
case result of
(Left err) -> Left (ParseFailure $ show err)
(Right (ts, pms)) -> Right $! mkRdf (F.toList ts) bUrl pms
_testParseState :: ParseState
_testParseState = (Nothing, Nothing, 1, PrefixMappings Map.empty, [], [], [], Seq.empty)