module Swish.RDF.TurtleParser
( ParseResult
, parseTurtle
, parseTurtlefromText
)
where
import Swish.RDF.RDFGraph
( RDFGraph, RDFLabel(..)
, ToRDFLabel(..)
, NamespaceMap
, addArc
, setNamespaces
, emptyRDFGraph
)
import Swish.RDF.GraphClass (arc)
import Swish.Utils.LookupMap
( LookupMap(..)
, LookupEntryClass(..)
, mapFindMaybe, mapReplaceOrAdd, mapAdd, mapReplace )
import Swish.Utils.Namespace
( Namespace, makeNamespace
, ScopedName
, getScopeNamespace
, getScopedNameURI
, getScopeNamespace
, makeURIScopedName
, makeNSScopedName
)
import Swish.RDF.Vocabulary
( langName
, rdfType
, rdfFirst, rdfRest, rdfNil
, xsdBoolean, xsdInteger, xsdDecimal, xsdDouble
, defaultBase
)
import Swish.RDF.RDFParser
( ParseResult
, runParserWithError
, ignore
, noneOf
, char
, ichar
, string
, stringT
, symbol
, isymbol
, lexeme
, whiteSpace
, mkTypedLit
, hex4
, hex8
, appendURIs
)
import Control.Applicative
import Control.Monad (foldM)
import Network.URI (URI(..), parseURIReference)
import Data.Char (ord)
import Data.Maybe (fromMaybe, fromJust)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import Text.ParserCombinators.Poly.StateText
data TurtleState = TurtleState
{ graphState :: RDFGraph
, prefixUris :: NamespaceMap
, baseUri :: URI
, nodeGen :: Int
} deriving Show
setPrefix :: Maybe T.Text -> URI -> TurtleState -> TurtleState
setPrefix pre uri st = st { prefixUris=p' }
where
p' = mapReplaceOrAdd (makeNamespace pre uri) (prefixUris st)
setBase :: URI -> TurtleState -> TurtleState
setBase buri st = st { baseUri = buri }
getDefaultPrefix :: TurtleParser Namespace
getDefaultPrefix = do
s <- stGet
case getPrefixURI s Nothing of
Just uri -> return $ makeNamespace Nothing uri
_ -> failBad "No default prefix defined; how unexpected (probably a programming error)!"
getPrefixURI :: TurtleState -> Maybe T.Text -> Maybe URI
getPrefixURI st pre = mapFindMaybe pre (prefixUris st)
findPrefixNamespace :: Maybe L.Text -> TurtleParser Namespace
findPrefixNamespace (Just p) = findPrefix (L.toStrict p)
findPrefixNamespace Nothing = getDefaultPrefix
updateGraph :: (RDFGraph -> RDFGraph) -> TurtleState -> TurtleState
updateGraph f s = s { graphState = f (graphState s) }
type TurtleParser a = Parser TurtleState a
parseTurtlefromText ::
L.Text
-> ParseResult
parseTurtlefromText = flip parseTurtle Nothing
parseTurtle ::
L.Text
-> Maybe URI
-> ParseResult
parseTurtle txt mbase = parseAnyfromText turtleDoc mbase txt
hashURI :: URI
hashURI = fromJust $ parseURIReference "#"
emptyState ::
Maybe URI
-> TurtleState
emptyState mbase =
let pmap = LookupMap [makeNamespace Nothing hashURI]
buri = fromMaybe (getScopedNameURI defaultBase) mbase
in TurtleState
{ graphState = emptyRDFGraph
, prefixUris = pmap
, baseUri = buri
, nodeGen = 0
}
parseAnyfromText ::
TurtleParser a
-> Maybe URI
-> L.Text
-> Either String a
parseAnyfromText parser mbase = runParserWithError parser (emptyState mbase)
newBlankNode :: TurtleParser RDFLabel
newBlankNode = do
n <- stQuery (succ . nodeGen)
stUpdate $ \s -> s { nodeGen = n }
return $ Blank (show n)
comma, semiColon , fullStop :: TurtleParser ()
comma = isymbol ","
semiColon = isymbol ";"
fullStop = isymbol "."
sQuot, dQuot, sQuot3, dQuot3 :: TurtleParser ()
sQuot = ichar '\''
dQuot = ichar '"'
sQuot3 = ignore $ string "'''"
dQuot3 = ignore $ string "\"\"\""
match :: (Ord a) => a -> [(a,a)] -> Bool
match v = any (\(l,h) -> v >= l && v <= h)
br :: String -> String -> TurtleParser a -> TurtleParser a
br lsym rsym = bracket (symbol lsym) (symbol rsym)
atWord :: T.Text -> TurtleParser ()
atWord s = char '@' *> lexeme (stringT s) *> pure ()
addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> TurtleParser ()
addStatement s p o@(Lit _ (Just dtype)) | dtype `elem` [xsdBoolean, xsdInteger, xsdDecimal, xsdDouble] = do
ost <- stGet
let stmt = arc s p o
oldp = prefixUris ost
ogs = graphState ost
newp = mapReplaceOrAdd (getScopeNamespace dtype) oldp
stUpdate $ \st -> st { prefixUris = newp, graphState = addArc stmt ogs }
addStatement s p o = stUpdate (updateGraph (addArc (arc s p o) ))
isaz, isAZ, isaZ, is09, isaZ09 :: Char -> Bool
isaz c = c >= 'a' && c <= 'z'
isAZ c = c >= 'A' && c <= 'Z'
isaZ c = isaz c || isAZ c
is09 c = c >= '0' && c <= '9'
isaZ09 c = isaZ c || is09 c
d2s :: L.Text -> RDFLabel
d2s =
let conv :: String -> Double
conv = read
in toRDFLabel . conv . L.unpack
operatorLabel :: ScopedName -> TurtleParser RDFLabel
operatorLabel snam = do
st <- stGet
let sns = getScopeNamespace snam
opmap = prefixUris st
pkey = entryKey sns
pval = entryVal sns
rval = Res snam
case mapFindMaybe pkey opmap of
Just val | val == pval -> return rval
| otherwise -> do
stUpdate $ \s -> s { prefixUris = mapReplace opmap sns }
return rval
_ -> do
stUpdate $ \s -> s { prefixUris = mapAdd opmap sns }
return rval
findPrefix :: T.Text -> TurtleParser Namespace
findPrefix pre = do
st <- stGet
case mapFindMaybe (Just pre) (prefixUris st) of
Just uri -> return $ makeNamespace (Just pre) uri
Nothing -> failBad $ "Undefined prefix '" ++ T.unpack pre ++ ":'."
turtleDoc :: TurtleParser RDFGraph
turtleDoc = mkGr <$> (whiteSpace *> many statement *> eof *> stGet)
where
mkGr s = setNamespaces (prefixUris s) (graphState s)
statement :: TurtleParser ()
statement = (directive <|> triples) *> fullStop
directive :: TurtleParser ()
directive = lexeme (prefixID <|> base)
prefixID :: TurtleParser ()
prefixID = do
_prefix
p <- lexeme _pnameNS
u <- _iriRef
stUpdate (setPrefix (fmap L.toStrict p) u)
base :: TurtleParser ()
base = _base >> _iriRef >>= stUpdate . setBase
triples :: TurtleParser ()
triples = subject >>= predicateObjectList
predicateObjectList :: RDFLabel -> TurtleParser ()
predicateObjectList subj =
let term = verb >>= objectList subj
in sepBy1 term semiColon *> ignore (optional semiColon)
objectList :: RDFLabel -> RDFLabel -> TurtleParser ()
objectList subj prd = sepBy1 object comma >>= mapM_ (addStatement subj prd)
verb :: TurtleParser RDFLabel
verb = predicate <|> (lexeme (char 'a') *> operatorLabel rdfType)
subject :: TurtleParser RDFLabel
subject = (Res <$> iriref) <|> blank
predicate :: TurtleParser RDFLabel
predicate = Res <$> iriref
object :: TurtleParser RDFLabel
object = (Res <$> iriref) <|> blank <|> literal
literal :: TurtleParser RDFLabel
literal = lexeme $ rdfLiteral <|> numericLiteral <|> booleanLiteral
blank :: TurtleParser RDFLabel
blank = lexeme (_blankNodeLabel
<|>
bracket (char '[') (char ']') handleBlankNode
<|>
collection
)
handleBlankNode :: TurtleParser RDFLabel
handleBlankNode = do
bNode <- newBlankNode
_manyws
ignore $ optional $ predicateObjectList bNode
_manyws
return bNode
collection :: TurtleParser RDFLabel
collection = do
os <- br "(" ")" (many object)
eNode <- operatorLabel rdfNil
case os of
[] -> return eNode
(x:xs) -> do
sNode <- newBlankNode
first <- operatorLabel rdfFirst
addStatement sNode first x
lNode <- foldM addElem sNode xs
rest <- operatorLabel rdfRest
addStatement lNode rest eNode
return sNode
where
addElem prevNode curElem = do
bNode <- newBlankNode
first <- operatorLabel rdfFirst
rest <- operatorLabel rdfRest
addStatement prevNode rest bNode
addStatement bNode first curElem
return bNode
_base :: TurtleParser ()
_base = atWord "base"
_prefix :: TurtleParser ()
_prefix = atWord "prefix"
_uchar :: TurtleParser Char
_uchar = char '\\' *> _uchar'
_uchar' :: TurtleParser Char
_uchar' = (char 'u' *> hex4) <|> (char 'U' *> hex8)
rdfLiteral :: TurtleParser RDFLabel
rdfLiteral = do
lbl <- turtleString
opt <- optional (_langTag <|> (string "^^" *> iriref))
return $ Lit (L.toStrict lbl) opt
numericLiteral :: TurtleParser RDFLabel
numericLiteral = numericLiteralNegative <|> numericLiteralPositive <|> numericLiteralUnsigned
numericLiteralUnsigned :: TurtleParser RDFLabel
numericLiteralUnsigned =
d2s <$> _double
<|>
(mkTypedLit xsdDecimal . L.toStrict <$> _decimal)
<|>
(mkTypedLit xsdInteger . L.toStrict <$> _integer)
numericLiteralPositive :: TurtleParser RDFLabel
numericLiteralPositive =
d2s <$> _doublePositive
<|>
(mkTypedLit xsdDecimal . L.toStrict <$> _decimalPositive)
<|>
(mkTypedLit xsdInteger . L.toStrict <$> _integerPositive)
numericLiteralNegative :: TurtleParser RDFLabel
numericLiteralNegative =
d2s <$> _doubleNegative
<|>
(mkTypedLit xsdDecimal . L.toStrict <$> _decimalNegative)
<|>
(mkTypedLit xsdInteger . L.toStrict <$> _integerNegative)
booleanLiteral :: TurtleParser RDFLabel
booleanLiteral = mkTypedLit xsdBoolean . T.pack <$> (string "true" <|> string "false")
turtleString :: TurtleParser L.Text
turtleString =
lexeme (
_stringLiteralLong1 <|> _stringLiteral1 <|>
_stringLiteralLong2 <|> _stringLiteral2)
iriref :: TurtleParser ScopedName
iriref = lexeme ((makeURIScopedName <$> _iriRef) <|> prefixedName)
prefixedName :: TurtleParser ScopedName
prefixedName =
_pnameLN <|>
flip makeNSScopedName T.empty <$> (_pnameNS >>= findPrefixNamespace)
_iriRef :: TurtleParser URI
_iriRef = do
utxt <- bracket (char '<') (char '>') $ manySatisfy (/= '>')
let ustr = L.unpack utxt
case parseURIReference ustr of
Nothing -> fail $ "Unable to convert <" ++ ustr ++ "> to a URI"
Just uref -> do
s <- stGet
either fail return $ appendURIs (baseUri s) uref
_pnameNS :: TurtleParser (Maybe L.Text)
_pnameNS = optional _pnPrefix <* char ':'
_pnameLN :: TurtleParser ScopedName
_pnameLN = makeNSScopedName
<$> (_pnameNS >>= findPrefixNamespace)
<*> fmap L.toStrict _pnLocal
_blankNodeLabel :: TurtleParser RDFLabel
_blankNodeLabel = (Blank . L.unpack) <$> (string "_:" *> _pnLocal)
_langTag :: TurtleParser ScopedName
_langTag = do
ichar '@'
h <- many1Satisfy isaZ
mt <- optional (L.cons <$> char '-' <*> many1Satisfy isaZ09)
return $ langName $ L.toStrict $ L.append h (fromMaybe L.empty mt)
_integer :: TurtleParser L.Text
_integer = many1Satisfy is09
_decimal :: TurtleParser L.Text
_decimal =
let dpart = L.cons <$> char '.' <*> (fromMaybe "0" <$> optional _integer)
in
(L.append <$> _integer <*> dpart)
<|>
(L.append "0." <$> (char '.' *> _integer))
_double :: TurtleParser L.Text
_double =
(L.append <$> _decimal <*> _exponent)
<|>
(L.append <$> _integer <*> _exponent)
_integerPositive, _decimalPositive, _doublePositive :: TurtleParser L.Text
_integerPositive = char '+' *> _integer
_decimalPositive = char '+' *> _decimal
_doublePositive = char '+' *> _double
_integerNegative, _decimalNegative, _doubleNegative :: TurtleParser L.Text
_integerNegative = L.cons <$> char '-' <*> _integer
_decimalNegative = L.cons <$> char '-' <*> _decimal
_doubleNegative = L.cons <$> char '-' <*> _double
_exponent :: TurtleParser L.Text
_exponent = do
ignore $ satisfy (`elem` "eE")
ms <- optional (satisfy (`elem` "+-"))
e <- _integer
case ms of
Just '-' -> return $ L.append "E-" e
_ -> return $ L.cons 'E' e
_stringLiteral1, _stringLiteral2 :: TurtleParser L.Text
_stringLiteral1 = _stringIt sQuot (_tChars "'\\\n\r")
_stringLiteral2 = _stringIt dQuot (_tChars "\"\\\n\r")
_stringLiteralLong1, _stringLiteralLong2 :: TurtleParser L.Text
_stringLiteralLong1 = _stringItLong sQuot3 (_tCharsLong '\'' "'\\")
_stringLiteralLong2 = _stringItLong dQuot3 (_tCharsLong '"' "\"\\")
_stringIt :: TurtleParser a -> TurtleParser Char -> TurtleParser L.Text
_stringIt sep chars = L.pack <$> bracket sep sep (many chars)
_stringItLong :: TurtleParser a -> TurtleParser L.Text -> TurtleParser L.Text
_stringItLong sep chars = L.concat <$> bracket sep sep (many chars)
_tChars :: String -> TurtleParser Char
_tChars excl = (char '\\' *> (_echar' <|> _uchar'))
<|> noneOf excl
_tCharsLong :: Char -> String -> TurtleParser L.Text
_tCharsLong c excl = do
mq <- optional $ oneOrTwo c
r <- _tChars excl
return $ L.append (fromMaybe L.empty mq) (L.singleton r)
oneOrTwo :: Char -> TurtleParser L.Text
oneOrTwo c = do
a <- char c
mb <- optional (char c)
case mb of
Just b -> return $ L.pack [a,b]
_ -> return $ L.singleton a
_echar :: TurtleParser Char
_echar = char '\\' *> _echar'
_echar' :: TurtleParser Char
_echar' =
(char 't' *> pure '\t') <|>
(char 'b' *> pure '\b') <|>
(char 'n' *> pure '\n') <|>
(char 'r' *> pure '\r') <|>
(char '\\' *> pure '\\') <|>
(char '"' *> pure '"') <|>
(char '\'' *> pure '\'')
_manyws :: TurtleParser ()
_manyws = ignore $ manySatisfy (`elem` " \t\r\n")
_pnCharsBase :: TurtleParser Char
_pnCharsBase =
let f c = let i = ord c
in isaZ c ||
match i [(0xc0, 0xd6), (0xd8, 0xf6), (0xf8, 0x2ff),
(0x370, 0x37d), (0x37f, 0x1fff), (0x200c, 0x200d),
(0x2070, 0x218f), (0x2c00, 0x2fef), (0x3001, 0xd7ff),
(0xf900, 0xfdcf), (0xfdf0, 0xfffd), (0x10000, 0xeffff)]
in satisfy f <|> _uchar
_pnCharsU :: TurtleParser Char
_pnCharsU = _pnCharsBase <|> char '_'
_pnChars :: TurtleParser Char
_pnChars =
_pnCharsU
<|>
satisfy (\c -> let i = ord c
in c == '-' || (c >= '0' && c <= '9') || i == 0xb7 ||
match i [(0x0300, 0x036f), (0x203f, 0x2040)])
_pnPrefix :: TurtleParser L.Text
_pnPrefix = L.cons <$> _pnCharsBase <*> _pnRest
_pnLocal :: TurtleParser L.Text
_pnLocal = L.cons <$> (_pnCharsU <|> satisfy is09)
<*> _pnRest
_pnRest :: TurtleParser L.Text
_pnRest = do
lbl <- many (_pnChars <|> char '.')
let (nret, lclean) = clean lbl
edl = id
snocdl x xs = xs . (x:)
appenddl = (.)
replicatedl n x = (replicate n x ++)
clean :: String -> (Int, String)
clean = go 0 edl
where
go n acc [] = (n, acc [])
go n acc ('.':xs) = go (n+1) acc xs
go 0 acc (x:xs) = go 0 (snocdl x acc) xs
go n acc (x:xs) = go 0 (appenddl acc (snocdl x (replicatedl n '.'))) xs
reparse $ L.replicate (fromIntegral nret) (L.singleton '.')
return $ L.pack lclean