{------------------------------------------------------------------------------------- - - An XQuery parser - Programmer: Leonidas Fegaras - Email: fegaras@cse.uta.edu - Web: http://lambda.uta.edu/ - Creation: 02/15/08, last update: 03/25/08 - - Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved. - This material is provided as is, with absolutely no warranty expressed or implied. - Any use is at your own risk. Permission is hereby granted to use or copy this program - for any purpose, provided the above notices are retained on all copies. - --------------------------------------------------------------------------------------} { module XQueryParser where import Char } %name parse %tokentype { Token } %error { parseError } %token 'return' { RETURN } 'some' { SOME } 'every' { EVERY } 'if' { IF } 'then' { THEN } 'else' { ELSE } '[' { LB } ']' { RB } '(' { LP } ')' { RP } '{' { LSB } '}' { RSB } '\'{' { LESCAPE } '}\'' { RESCAPE } 'to' { TO } '+' { PLUS } '-' { MINUS } '*' { TIMES } 'div' { DIV } 'idiv' { IDIV } 'mod' { MOD } '=' { TEQ } '!=' { TNEQ } '<' { TLT } '<=' { LEQ } '>' { TGT } '>=' { GEQ } 'and' { AND } 'or' { OR } 'not' { NOT } 'union' { UNION } 'intersect' { INTERSECT } 'except' { EXCEPT } 'for' { FOR } 'let' { LET } 'in' { IN } ',' { COMMA } ':=' { ASSIGN } 'where' { WHERE } 'order' { ORDER } 'by' { BY } 'ascending' { ASCENDING } 'descending' { DESCENDING } 'element' { ELEMENT } 'attribute' { ATTRIBUTE } '' { ETAG } 'satisfies' { SATISFIES } '@' { ATSIGN } '/' { SLASH } 'QName' { QName $$ } 'define' { DEFINE } 'function' { FUNCTION } 'at' { AT } '..' { DOTS } '.' { DOT } ';' { SEMI } 'Variable' { Variable $$ } 'XMLtext' { XMLtext $$ } 'Integer' { TInteger $$ } 'Double' { TFloat $$ } 'String' { TString $$ } %nonassoc 'for' 'let' 'satisfies' 'return' %nonassoc 'else' %left 'intersect' 'union' 'except' %right 'or' %right 'and' %nonassoc 'not' %left 'to' %left '=' '<' '>' '<=' '>=' '!=' %left '+' '-' %left '*' 'div' 'idiv' 'mod' %nonassoc UMINUS %% prog : expr { [$1] } | expr 'XMLtext' { [$1] } | 'define' 'function' 'QName' '(' params ')' '=' expr ';' prog { (Ast "define" ([Avar $3,$8]++$5)):$10 } | 'define' 'function' 'QName' '(' ')' '=' expr ';' prog { (Ast "define" ([Avar $3,$7])):$9 } params : var { [$1] } | params ',' var { $1++[$3] } var : 'Variable' { Avar $1 } expr : clauses opt_where opt_order 'return' expr { (snd $3) ($1 ($2 ((fst $3) $5))) } | 'some' for_bindings 'satisfies' expr { call "some" [$2 $4] } | 'every' for_bindings 'satisfies' expr { call "not" [call "some" [$2 (call "not" [$4])]] } | 'if' expr 'then' expr 'else' expr { call "if" [$2,$4,$6] } | full_path { $1 } | element { $1 } | computed { $1 } | expr 'to' expr { call "to" [$1,$3] } | expr '+' expr { call "+" [$1,$3] } | expr '-' expr { call "-" [$1,$3] } | expr '*' expr { call "*" [$1,$3] } | expr 'div' expr { call "div" [$1,$3] } | expr 'idiv' expr { call "idiv" [$1,$3] } | expr 'mod' expr { call "mod" [$1,$3] } | expr '=' expr { call "=" [$1,$3] } | expr '!=' expr { call "!=" [$1,$3] } | expr '<' expr { call "<" [$1,$3] } | expr '<=' expr { call "<=" [$1,$3] } | expr '>' expr { call ">" [$1,$3] } | expr '>=' expr { call ">=" [$1,$3] } | expr 'and' expr { call "and" [$1,$3] } | expr 'or' expr { call "or" [$1,$3] } | expr 'not' expr { call "not" [$1,$3] } | expr 'union' expr { call "union" [$1,$3] } | expr 'intersect' expr { call "itersect" [$1,$3] } | expr 'except' expr { call "except" [$1,$3] } | '+' expr %prec UMINUS { call "uplus" [$2] } | '-' expr %prec UMINUS { call "uminus" [$2] } | 'not' expr %prec UMINUS { call "not" [$2] } | 'String' { Astring $1 } | 'Integer' { Aint $1 } | 'Double' { Afloat $1 } expl : expr { [$1] } | expl ',' expr { $1++[$3] } clauses : 'for' for_bindings { $2 } | 'let' let_bindings { $2 } | clauses 'for' for_bindings { $1 . $3 } | clauses 'let' let_bindings { $1 . $3 } for_bindings : var 'in' expr { \x -> Ast "for" [$1,Avar "$",$3,x] } | var 'in' expr 'at' var { \x -> Ast "for" [$1,$5,$3,x] } | var 'at' var 'in' expr { \x -> Ast "for" [$1,$3,$5,x] } | for_bindings ',' var 'in' expr { \x -> $1(Ast "for" [$3,Avar "$",$5,x]) } | for_bindings ',' var 'in' expr 'at' var { \x -> $1(Ast "for" [$3,$7,$5,x]) } | for_bindings ',' var 'at' var 'in' expr { \x -> $1(Ast "for" [$3,$5,$7,x]) } let_bindings : var ':=' expr { \x -> Ast "let" [$1,$3,x] } | let_bindings ',' var ':=' expr { \x -> $1(Ast "let" [$3,$5,x]) } opt_where : 'where' expr { \x -> Ast "predicate" [$2,x] } | {- empty -} { id } opt_order : 'order' 'by' order_list { (\x -> Ast "sortTuple" (x:(fst $3)), \x -> Ast "sort" (x:(snd $3))) } | {- empty -} { (id,id) } order_list : expr mode { ([$1],[$2]) } | order_list ',' expr mode { ((fst $1)++[$3],(snd $1)++[$4]) } mode : 'ascending' { Avar "ascending" } | 'descending' { Avar "descending" } | {- empty -} { Avar "ascending" } computed : 'element' '(' 'QName' ')' { call "element" [Avar $3] } | 'attribute' '(' 'QName' ')' { call "attribute" [Avar $3] } element : stag '>' content '' { if head $1 == Astring $5 then Ast "construction" ($1++[concatAll $3]) else error("Unmatched tags in element construction: "++$5) } | stag '>' '' { if head $1 == Astring $4 then Ast "construction" ($1++[call "empty" []]) else error("Unmatched tags in element construction: "++$4) } | stag '/>' { Ast "construction" ($1++[call "empty" []]) } stag : '<' 'QName' { [Astring $2,Ast "attributes" []] } | '<' 'QName' attributes { [Astring $2,Ast "attributes" $3] } content : '{' expl '}' { $2 } | 'String' { [Astring $1] } | 'XMLtext' { [Astring $1] } | element { [$1] } | content '{' expl '}' { $1++$3 } | content 'String' { $1++[Astring $2] } | content 'XMLtext' { $1++[Astring $2] } | content element { $1++[$2] } attributes : 'QName' '=' 'String' { [Ast "pair" [Astring $1,Astring $3]] } | 'QName' '=' '\'{' expr '}\'' { [Ast "pair" [Astring $1,$4]] } | attributes 'QName' '=' 'String' { $1++[Ast "pair" [Astring $2,Astring $4]] } | attributes 'QName' '=' '\'{' expr '}\'' { $1++[Ast "pair" [Astring $2,$5]] } full_path : predicate_step { $1 "child" (Avar ".") } | '@' predicate_step { $2 "child_attribute" (Avar ".") } | predicate_step path { $2($1 "child" (Avar ".")) } | '@' predicate_step path { $3($2 "child_attribute" (Avar ".")) } path : step { $1 } | path step { $2 . $1 } step : '/' predicate_step { \e -> $2 "child" e } | '/' '@' predicate_step { \e -> $3 "child_attribute" e } | '/' '/' predicate_step { \e -> $3 "descendant" e } | '/' '/' '@' predicate_step { \e -> $4 "descendant_attribute" e } | '/' '..' { \e -> call "parent" [e] } predicate_step : simple_step { $1 } | predicate_step '[' expr ']' { \t e -> Ast "predicate" [$3,$1 t e] } simple_step : primary_expr { \t e -> $1 t e } | '*' { \t e -> call t [Astring "*",e] } | 'QName' { \t e -> call t [Astring $1,e] } primary_expr : var { \_ _ -> $1 } | '.' { \_ e -> e } | '(' expl ')' { \_ _ -> concatAll $2 } | '(' ')' { \_ _ -> call "empty" [] } | 'QName' '(' expl ')' { \_ _ -> call $1 $3 } | 'QName' '(' ')' { if elem $1 ["text"] then \_ e -> call $1 [e] else \_ _ -> call $1 [] } { -- Abstract Syntax Tree for XQueries data Ast = Ast String [Ast] | Avar String | Aint Int | Afloat Float | Astring String deriving Eq instance Show Ast where show (Ast s []) = s ++ "()" show (Ast s (x:xs)) = s ++ "(" ++ (show x) ++ (foldr (\a r -> ","++(show a)++r) "" xs) ++ ")" show (Avar s) = s show (Aint n) = show n show (Afloat n) = show n show (Astring s) = "\'" ++ s ++ "\'" call :: String -> [Ast] -> Ast call name args = Ast "call" ((Avar name):args) concatAll :: [Ast] -> Ast concatAll (x:xs) = foldl (\a r -> call "concatenate" [a,r]) x xs concatAll _ = call "empty" [] parseError tk = error ("Parse error: "++(show (take 10 tk))) data Token = RETURN | SOME | EVERY | IF | THEN | ELSE | LB | RB | LP | RP | LSB | RSB | LESCAPE | RESCAPE | TO | PLUS | MINUS | TIMES | DIV | IDIV | MOD | TEQ | TNEQ | TLT | LEQ | TGT | GEQ | AND | OR | NOT | UNION | INTERSECT | EXCEPT | FOR | LET | IN | COMMA | ASSIGN | WHERE | ORDER | BY | ASCENDING | DESCENDING | ELEMENT | ATTRIBUTE | STAG | ETAG | SATISFIES | ATSIGN | SLASH | DEFINE | FUNCTION | AT | SEMI | DOT | DOTS | TokenEOF | QName String | Variable String | XMLtext String | TInteger Int | TFloat Float | TString String deriving Show scan :: String -> [Token] scan cs = lexer cs [0] xmlText "" = [] xmlText text = [XMLtext text] -- scans XML PCDATA and returns an XMLtext token with the PCData text xml :: String -> String -> [Int] -> [Token] xml ('{':cs) text n = (xmlText text)++(LSB : lexer cs (0:n)) xml ('<':'/':cs) text n = (xmlText text)++(STAG : lexer cs n) xml ('<':cs) text (k:n) = (xmlText text)++(TLT : lexer cs (k+2:n)) xml ('\'':'{':cs) text n = (xmlText text)++(LESCAPE : lexer cs n) xml ('\"':'{':cs) text n = (xmlText text)++(LESCAPE : lexer cs n) xml (c:cs) text n = xml cs (text++[c]) n xml [] text _ = xmlText text isQN c = elem c "_:-" || isDigit c || isAlpha c isVar c = elem c "_" || isDigit c || isAlpha c -- the XQuery scanner lexer :: String -> [Int] -> [Token] lexer [] [0] = [] lexer [] _ = error("Unbalanced tags") lexer (' ':'>':' ':cs) n = TGT : lexer cs n lexer (c:cs) n | isSpace c = lexer cs n | isAlpha c = lexVar (c:cs) n | isDigit c = lexNum (c:cs) n lexer ('$':c:cs) n | isAlpha c = let (var,rest) = span isVar (c:cs) in (Variable var) : lexer rest n lexer ('\'':'{':cs) n = LESCAPE : lexer cs n lexer ('\"':'{':cs) n = LESCAPE : lexer cs n lexer (':':'=':cs) n = ASSIGN : lexer cs n lexer ('<':'/':cs) n = STAG : lexer cs n lexer ('<':'=':cs) n = LEQ : lexer cs n lexer ('>':'=':cs) n = GEQ : lexer cs n lexer ('/':'>':cs) (k:n) = ETAG : lexer cs (k-2:n) lexer ('.':'.':cs) n = DOTS : lexer cs n lexer ('.':cs) n = DOT : lexer cs n lexer ('}':'\'':cs) n = RESCAPE : lexer cs n lexer ('}':'\"':cs) n = RESCAPE : lexer cs n lexer ('!':'=':cs) n = TNEQ : lexer cs n lexer ('\'':cs) n = lexString cs n lexer ('\"':cs) n = lexString2 cs n lexer ('[':cs) n = LB : lexer cs n lexer (']':cs) n = RB : lexer cs n lexer ('(':cs) n = LP : lexer cs n lexer (')':cs) n = RP : lexer cs n lexer ('}':cs) (_:n) = RSB : xml cs "" n lexer ('+':cs) n = PLUS : lexer cs n lexer ('-':cs) n = MINUS : lexer cs n lexer ('*':cs) n = TIMES : lexer cs n lexer ('=':cs) n = TEQ : lexer cs n lexer ('<':c:cs) n | not(isAlpha c) = TLT : lexer (c:cs) n lexer ('<':cs) (k:n) = TLT : lexer cs (k+2:n) lexer ('>':cs) (k:n) = TGT : (if k==1 then lexer cs (k-1:n) else xml cs "" (k-1:n)) lexer (',':cs) n = COMMA : lexer cs n lexer ('@':cs) n = ATSIGN : lexer cs n lexer ('/':cs) n = SLASH : lexer cs n lexer ('{':cs) n = LSB : lexer cs n lexer (';':cs) n = SEMI : lexer cs n lexer (c:cs) n = error ("illegal character "++[c]) lexNum cs n = if null rest || head rest /= '.' then TInteger (read k) : lexer rest n else let (m,rest2) = span isDigit (tail rest) in TFloat (read (k++('.':m))) : lexer rest2 n where (k,rest) = span isDigit cs lexString cs n = TString s : lexer (tail rest) n where (s,rest) = span inString cs inString c = c /= '\'' lexString2 cs n = TString s : lexer (tail rest) n where (s,rest) = span inString cs inString c = c /= '\"' lexVar cs n = case span isQN cs of ("return",rest) -> RETURN : lexer rest n ("some",rest) -> SOME : lexer rest n ("every",rest) -> EVERY : lexer rest n ("if",rest) -> IF : lexer rest n ("then",rest) -> THEN : lexer rest n ("else",rest) -> ELSE : lexer rest n ("to",rest) -> TO : lexer rest n ("div",rest) -> DIV : lexer rest n ("idiv",rest) -> IDIV : lexer rest n ("mod",rest) -> MOD : lexer rest n ("and",rest) -> AND : lexer rest n ("or",rest) -> OR : lexer rest n ("not",rest) -> NOT : lexer rest n ("union",rest) -> UNION : lexer rest n ("intersect",rest) -> INTERSECT : lexer rest n ("except",rest) -> EXCEPT : lexer rest n ("for",rest) -> FOR : lexer rest n ("let",rest) -> LET : lexer rest n ("in",rest) -> IN : lexer rest n ("where",rest) -> WHERE : lexer rest n ("order",rest) -> ORDER : lexer rest n ("by",rest) -> BY : lexer rest n ("ascending",rest) -> ASCENDING : lexer rest n ("descending",rest) -> DESCENDING : lexer rest n ("element",rest) -> ELEMENT : lexer rest n ("attribute",rest) -> ATTRIBUTE : lexer rest n ("satisfies",rest) -> SATISFIES : lexer rest n ("define",rest) -> DEFINE : lexer rest n ("function",rest) -> FUNCTION : lexer rest n ("at",rest) -> AT : lexer rest n (var,rest) -> QName var : lexer rest n }