{------------------------------------------------------------------------------------- - - An XQuery parser - Programmer: Leonidas Fegaras - Email: fegaras@cse.uta.edu - Web: http://lambda.uta.edu/ - Creation: 02/15/08, last update: 05/15/08 - - Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved. - This material is provided as is, with absolutely no warranty expressed or implied. - Any use is at your own risk. Permission is hereby granted to use or copy this program - for any purpose, provided the above notices are retained on all copies. - --------------------------------------------------------------------------------------} { module XML.HXQ.Parser where import Char } %name parse %tokentype { Token } %error { parseError } %token 'return' { RETURN } 'some' { SOME } 'every' { EVERY } 'if' { IF } 'then' { THEN } 'else' { ELSE } '[' { LB } ']' { RB } '(' { LP } ')' { RP } '{' { LSB } '}' { RSB } 'to' { TO } '+' { PLUS } '-' { MINUS } '*' { TIMES } 'div' { DIV } 'idiv' { IDIV } 'mod' { MOD } '=' { TEQ } '!=' { TNE } '<' { TLT } '<=' { TLE } '>' { TGT } '>=' { TGE } '<<' { PRE } '>>' { POST } 'is' { IS } 'eq' { SEQ } 'ne' { SNE } 'lt' { SLT } 'le' { SLE } 'gt' { SGT } 'ge' { SGE } 'and' { AND } 'or' { OR } 'not' { NOT } 'union' { UNION } 'intersect' { INTERSECT } 'except' { EXCEPT } 'for' { FOR } 'let' { LET } 'in' { IN } ',' { COMMA } ':=' { ASSIGN } 'where' { WHERE } 'order' { ORDER } 'by' { BY } 'ascending' { ASCENDING } 'descending' { DESCENDING } 'element' { ELEMENT } 'attribute' { ATTRIBUTE } '' { ETAG } 'satisfies' { SATISFIES } '@' { ATSIGN } '/' { SLASH } 'QName' { QName $$ } 'declare' { DECLARE } 'function' { FUNCTION } 'variable' { VARIABLE } 'at' { AT } '..' { DOTS } '.' { DOT } ';' { SEMI } 'Variable' { Variable $$ } 'XMLtext' { XMLtext $$ } 'Integer' { TInteger $$ } 'Double' { TFloat $$ } 'String' { TString $$ } %nonassoc 'for' 'let' 'satisfies' 'return' %nonassoc 'else' %left 'intersect' 'union' 'except' %right 'or' %right 'and' %nonassoc 'not' %left 'to' %left '=' '<' '>' '<=' '>=' '!=' '>>' '<<' 'is' 'eq' 'ne' 'lt' 'le' 'gt' 'ge' %left '+' '-' %left '*' 'div' 'idiv' 'mod' %nonassoc UMINUS %% prog :: { [ Ast ] } prog : def { [$1] } | def 'XMLtext' { [$1] } | prog ';' def { $1++[$3] } | prog ';' def 'XMLtext' { $1++[$3] } def :: { Ast } def : expr { $1 } | 'declare' 'variable' var ':=' expr { Ast "variable" [$3,$5] } | 'declare' 'function' 'QName' '(' params ')' '{' expr '}' { Ast "function" ([Avar $3,$8]++$5) } | 'declare' 'function' 'QName' '(' ')' '{' expr '}' { Ast "function" [Avar $3,$7] } params :: { [ Ast ] } params : var { [$1] } | params ',' var { $1++[$3] } var :: { Ast } var : 'Variable' { Avar $1 } expr :: { Ast } expr : clauses opt_where opt_order 'return' expr { (snd $3) ($1 ($2 ((fst $3) $5))) } | 'some' for_bindings 'satisfies' expr { call "some" [$2 $4] } | 'every' for_bindings 'satisfies' expr { call "not" [call "some" [$2 (call "not" [$4])]] } | 'if' expr 'then' expr 'else' expr { call "if" [$2,$4,$6] } | full_path { $1 } | element { $1 } | computed { $1 } | expr 'to' expr { call "to" [$1,$3] } | expr '+' expr { call "+" [$1,$3] } | expr '-' expr { call "-" [$1,$3] } | expr '*' expr { call "*" [$1,$3] } | expr 'div' expr { call "div" [$1,$3] } | expr 'idiv' expr { call "idiv" [$1,$3] } | expr 'mod' expr { call "mod" [$1,$3] } | expr '=' expr { call "=" [$1,$3] } | expr '!=' expr { call "!=" [$1,$3] } | expr '<' expr { call "<" [$1,$3] } | expr '<=' expr { call "<=" [$1,$3] } | expr '>' expr { call ">" [$1,$3] } | expr '>=' expr { call ">=" [$1,$3] } | expr '<<' expr { call "<<" [$1,$3] } | expr '>>' expr { call ">>" [$1,$3] } | expr 'is' expr { call "is" [$1,$3] } | expr 'eq' expr { call "eq" [$1,$3] } | expr 'ne' expr { call "ne" [$1,$3] } | expr 'lt' expr { call "lt" [$1,$3] } | expr 'le' expr { call "le" [$1,$3] } | expr 'gt' expr { call "gt" [$1,$3] } | expr 'ge' expr { call "ge" [$1,$3] } | expr 'and' expr { call "and" [$1,$3] } | expr 'or' expr { call "or" [$1,$3] } | expr 'not' expr { call "not" [$1,$3] } | expr 'union' expr { call "union" [$1,$3] } | expr 'intersect' expr { call "intersect" [$1,$3] } | expr 'except' expr { call "except" [$1,$3] } | '+' expr %prec UMINUS { call "uplus" [$2] } | '-' expr %prec UMINUS { call "uminus" [$2] } | 'not' expr %prec UMINUS { call "not" [$2] } | string { $1 } | 'Integer' { Aint $1 } | 'Double' { Afloat $1 } expl :: { [ Ast ] } expl : expr { [$1] } | expl ',' expr { $1++[$3] } clauses :: { Ast -> Ast } clauses : 'for' for_bindings { $2 } | 'let' let_bindings { $2 } | clauses 'for' for_bindings { $1 . $3 } | clauses 'let' let_bindings { $1 . $3 } for_bindings :: { Ast -> Ast } for_bindings : var 'in' expr { \x -> Ast "for" [$1,Avar "$",$3,x] } | var 'at' var 'in' expr { \x -> Ast "for" [$1,$3,$5,x] } | for_bindings ',' var 'in' expr { \x -> $1(Ast "for" [$3,Avar "$",$5,x]) } | for_bindings ',' var 'at' var 'in' expr { \x -> $1(Ast "for" [$3,$5,$7,x]) } let_bindings :: { Ast -> Ast } let_bindings : var ':=' expr { \x -> Ast "let" [$1,$3,x] } | let_bindings ',' var ':=' expr { \x -> $1(Ast "let" [$3,$5,x]) } opt_where :: { Ast -> Ast } opt_where : 'where' expr { \x -> Ast "predicate" [$2,x] } | {- empty -} { id } opt_order :: { ( Ast -> Ast, Ast -> Ast ) } opt_order : 'order' 'by' order_list { (\x -> Ast "sortTuple" (x:(fst $3)), \x -> Ast "sort" (x:(snd $3))) } | {- empty -} { (id,id) } order_list :: { ( [ Ast ], [ Ast ] ) } order_list : expr mode { ([$1],[$2]) } | order_list ',' expr mode { ((fst $1)++[$3],(snd $1)++[$4]) } mode :: { Ast } mode : 'ascending' { Avar "ascending" } | 'descending' { Avar "descending" } | {- empty -} { Avar "ascending" } computed :: { Ast } computed : 'element' '(' 'QName' ')' { call "element" [Avar $3] } | 'attribute' '(' 'QName' ')' { call "attribute" [Avar $3] } element :: { Ast } element : stag '>' content '' { if head $1 == Astring $5 then Ast "element_construction" ($1++[Ast "append" $3]) else parseError [TError ("Unmatched tags in element construction: " ++(show (head $1))++" '"++$5++"'")] } | stag '>' '' { if head $1 == Astring $4 then Ast "element_construction" ($1++[Ast "append" []]) else parseError [TError ("Unmatched tags in element construction: " ++(show (head $1))++" '"++$4++"'")] } | stag '/>' { Ast "element_construction" ($1++[Ast "append" []]) } | 'element' '{' expr '}' '{' expl '}' { Ast "element_construction" [$3,Ast "attributes" [],concatenateAll $6] } | 'attribute' '{' expr '}''{' expl '}'{ Ast "attribute_construction" [$3,concatenateAll $6] } | 'element' 'QName' '{' expl '}' { Ast "element_construction" [Astring $2,Ast "attributes" [],concatenateAll $4] } | 'attribute' 'QName' '{' expl '}' { Ast "attribute_construction" [Astring $2,concatenateAll $4] } stag :: { [ Ast ] } stag : '<' 'QName' { [Astring $2,Ast "attributes" []] } | '<' 'QName' attributes { [Astring $2,Ast "attributes" $3] } content :: { [ Ast ] } content : '{' expl '}' { [concatenateAll $2] } | 'String' { [Astring $1] } | 'XMLtext' { [Astring $1] } | element { [$1] } | content '{' expl '}' { $1++[concatenateAll $3] } | content 'String' { $1++[Astring $2] } | content 'XMLtext' { $1++[Astring $2] } | content element { $1++[$2] } string :: { Ast } string : stringc { if length $1 == 1 then head $1 else Ast "append" $1 } stringc :: { [Ast] } stringc : 'String' { if $1=="" then [] else [Astring $1] } | '{' expl '}' { [concatenateAll $2] } | stringc 'String' { if $2=="" then $1 else $1++[Astring $2] } | stringc '{' expl '}' { $1++[concatenateAll $3] } attributes :: { [ Ast ] } attributes : 'QName' '=' string { [Ast "pair" [Astring $1,$3]] } | attributes 'QName' '=' string { $1++[Ast "pair" [Astring $2,$4]] } full_path :: { Ast } full_path : predicate_step { Ast "step" ($1 "child_step" (Avar ".")) } | '@' predicate_step { Ast "step" ($2 "attribute_step" (Avar ".")) } | predicate_step path { Ast "step" [$2 (Ast "step" ($1 "child_step" (Avar ".")))] } | '@' predicate_step path { Ast "step" (map $3 ($2 "attribute_step" (Avar "."))) } path :: { Ast -> Ast } path : step { $1 } | path step { $2 . $1 } step :: { Ast -> Ast } step : '/' predicate_step { \e -> Ast "step" ($2 "child_step" e) } | '/' '@' predicate_step { \e -> Ast "step" ($3 "attribute_step" e) } | '/' '/' predicate_step { \e -> Ast "step" ($3 "descendant_step" e) } | '/' '/' '@' predicate_step { \e -> Ast "step" ($4 "attribute_descendant_step" e) } | '/' '..' { \e -> Ast "step" [Ast "parent_step" [e]] } predicate_step :: { String -> Ast -> [ Ast ] } predicate_step : simple_step { \t e -> [$1 t e] } | predicate_step '[' expr ']' { \t e -> ($1 t e)++[$3] } simple_step :: { String -> Ast -> Ast } simple_step : primary_expr { \t e -> $1 t e } | '*' { \t e -> Ast t [Astring "*",e] } | 'QName' { \t e -> Ast t [Astring $1,e] } primary_expr :: { String -> Ast -> Ast } primary_expr : var { \_ _ -> $1 } | '.' { \_ e -> e } | '(' expl ')' { \t e -> if e == Avar "." then concatenateAll $2 else Ast "context" [e,Astring t,concatenateAll $2] } | '(' ')' { \_ _ -> call "empty" [] } | 'QName' '(' expl ')' { \t e -> if e == Avar "." then call $1 $3 else Ast "context" [e,Astring t,call $1 $3] } | 'QName' '(' ')' { \_ e -> call $1 (if e == Avar "." then [] else [e]) } { -- Abstract Syntax Tree for XQueries data Ast = Ast String [Ast] | Avar String | Aint Int | Afloat Float | Astring String deriving Eq instance Show Ast where show (Ast s []) = s ++ "()" show (Ast s (x:xs)) = s ++ "(" ++ show x ++ foldr (\a r -> ","++show a++r) "" xs ++ ")" show (Avar s) = s show (Aint n) = show n show (Afloat n) = show n show (Astring s) = "\'" ++ s ++ "\'" screenSize = 80::Int prettyAst :: Ast -> Int -> (String,Int) prettyAst (Avar s) p = (s,(length s)+p) prettyAst (Aint n) p = let s = show n in (s,(length s)+p) prettyAst (Afloat n) p = let s = show n in (s,(length s)+p) prettyAst (Astring s) p = ("\'" ++ s ++ "\'",(length s)+p+2) prettyAst (Ast s args) p = let (ps,np) = prettyArgs args in (s++"("++ps++")",np+1) where prettyArgs [] = ("",p+1) prettyArgs xs = let ss = show (head xs) ++ foldr (\a r -> ","++show a++r) "" (tail xs) np = (length s)+p+1 in if (length ss)+p < screenSize then (ss,(length ss)+p) else let ds = map (\x -> let (s,ep) = prettyAst x np in (s ++ ",\n" ++ space np,ep)) (init xs) (ls,lp) = prettyAst (last xs) np in (concatMap fst ds ++ ls,lp) space n = replicate n ' ' ppAst :: Ast -> String ppAst e = let (s,_) = prettyAst e 0 in s call :: String -> [Ast] -> Ast call name args = Ast "call" ((Avar name):args) concatenateAll :: [Ast] -> Ast concatenateAll [x] = x concatenateAll (x:xs) = foldl (\a r -> call "concatenate" [a,r]) x xs concatenateAll _ = call "empty" [] data Token = RETURN | SOME | EVERY | IF | THEN | ELSE | LB | RB | LP | RP | LSB | RSB | TO | PLUS | MINUS | TIMES | DIV | IDIV | MOD | TEQ | TNE | TLT | TLE | TGT | TGE | SEQ | SNE | SLT | SLE | SGT | SGE | AND | OR | NOT | UNION | INTERSECT | EXCEPT | FOR | LET | IN | COMMA | ASSIGN | WHERE | ORDER | BY | ASCENDING | DESCENDING | ELEMENT | ATTRIBUTE | STAG | ETAG | SATISFIES | ATSIGN | SLASH | DECLARE | SEMI | FUNCTION | VARIABLE |AT | DOT | DOTS | TokenEOF | PRE | POST | IS | QName String | Variable String | XMLtext String | TInteger Int | TFloat Float | TString String | TError String deriving Eq instance Show Token where show (QName s) = "QName("++s++")" show (Variable s) = "Variable("++s++")" show (XMLtext s) = "XMLtext("++s++")" show (TInteger n) = "Integer("++(show n)++")" show (TFloat n) = "Double("++(show n)++")" show (TString s) = "String("++s++")" show (TError s) = "'"++s++"'" show t = case filter (\(n,_) -> n==t) tokenList of (_,b):_ -> b _ -> "Illegal token" tokenList :: [(Token,String)] tokenList = [(RETURN,"return"),(SOME,"some"),(EVERY,"every"),(IF,"if"),(THEN,"then"),(ELSE,"else"), (LB,"["),(RB,"]"),(LP,"("),(RP,")"),(LSB,"{"),(RSB,"}"), (TO,"to"),(PLUS,"+"),(MINUS,"-"),(TIMES,"*"),(DIV,"div"),(IDIV,"idiv"),(MOD,"mod"), (TEQ,"="),(TNE,"!="),(TLT,"<"),(TLE,"<="),(TGT,">"),(TGE,">="),(PRE,"<<"),(POST,">>"), (IS,"is"),(SEQ,"eq"),(SNE,"ne"),(SLT,"lt"),(SLE,"le"),(SGT,"gt"),(SGE,"ge"),(AND,"and"), (OR,"or"),(NOT,"not"),(UNION,"union"),(INTERSECT,"intersect"),(EXCEPT,"except"), (FOR,"for"),(LET,"let"),(IN,"in"),(COMMA,"','"),(ASSIGN,":="),(WHERE,"where"),(ORDER,"order"), (BY,"by"),(ASCENDING,"ascending"),(DESCENDING,"descending"),(ELEMENT,"element"), (ATTRIBUTE,"attribute"),(STAG,""),(SATISFIES,"satisfies"),(ATSIGN,"@"), (SLASH,"/"),(DECLARE,"declare"),(FUNCTION,"function"),(VARIABLE,"variable"), (AT,"at"),(DOTS,".."),(DOT,"."),(SEMI,";")] parseError tk = error (case tk of ((TError s):_) -> "Parse error: "++s _ -> "Parse error: "++(foldr (\a r -> (show a)++" "++r) "" (take 20 tk))) scan :: String -> [Token] scan cs = lexer cs "" xmlText :: String -> [Token] xmlText "" = [] xmlText text = [XMLtext text] -- scans XML syntax and returns an XMLtext token with the text xml :: String -> String -> String -> [Token] xml ('{':cs) text n = (xmlText text)++(LSB : lexer cs ('{':n)) xml ('<':'/':cs) text n = (xmlText text)++(STAG : lexer cs ('<':'/':n)) xml ('<':'!':'-':cs) text n = xmlComment cs (text++" String -> String -> [Token] xqComment (':':')':cs) text n = xml cs text n xqComment (_:cs) text n = xqComment cs text n xqComment [] text _ = xmlText text xmlComment :: String -> String -> String -> [Token] xmlComment ('-':'>':cs) text n = xml cs (text++"->") n xmlComment (c:cs) text n = xmlComment cs (text++[c]) n xmlComment [] text _ = xmlText text isQN :: Char -> Bool isQN c = elem c "_:-" || isDigit c || isAlpha c isVar :: Char -> Bool isVar c = elem c "_" || isDigit c || isAlpha c inXML :: String -> Bool inXML ('>':'<':_) = True inXML _ = False -- the XQuery scanner lexer :: String -> String -> [Token] lexer [] "" = [] lexer [] _ = [ TError "Unexpected end of input" ] lexer (' ':'>':' ':cs) n = TGT : lexer cs n lexer (c:cs) n | isSpace c = lexer cs n | isAlpha c = lexVar (c:cs) n | isDigit c = lexNum (c:cs) n lexer ('$':c:cs) n | isAlpha c = let (var,rest) = span isVar (c:cs) in (Variable var) : lexer rest n lexer (':':'=':cs) n = ASSIGN : lexer cs n lexer ('<':'/':cs) n = STAG : lexer cs ('<':'/':n) lexer ('<':'=':cs) n = TLE : lexer cs n lexer ('>':'=':cs) n = TGE : lexer cs n lexer ('<':'<':cs) n = PRE : lexer cs n lexer ('>':'>':cs) n = POST : lexer cs n lexer ('/':'>':cs) m = case m of '<':n -> ETAG : (if inXML n then xml cs "" n else lexer cs n) _ -> [ TError "Unexpected token: '/>'" ] lexer ('(':':':cs) n = lexComment cs n lexer ('<':'!':'-':cs) n = lexXmlComment cs " RSB : lexString cs "" ('\"':n) '{':'\'':n -> RSB : lexString cs "" ('\'':n) '{':n -> RSB : (if inXML n then xml cs "" n else lexer cs n) _ -> [ TError "Unexpected token: '}'" ] lexer ('+':cs) n = PLUS : lexer cs n lexer ('-':cs) n = MINUS : lexer cs n lexer ('*':cs) n = TIMES : lexer cs n lexer ('=':cs) n = TEQ : lexer cs n lexer ('<':c:cs) n = TLT : (lexer (c:cs) (if isAlpha c then ('<':n) else n)) lexer ('>':cs) m = case m of '<':'/':'>':'<':n -> TGT : (if inXML n then xml cs "" n else lexer cs n) '<':n -> TGT : xml cs "" ('>':m) _ -> TGT : lexer cs m lexer (',':cs) n = COMMA : lexer cs n lexer ('@':cs) n = ATSIGN : lexer cs n lexer ('/':cs) n = SLASH : lexer cs n lexer ('{':cs) n = LSB : lexer cs ('{':n) lexer ('|':cs) n = UNION : lexer cs n lexer (';':cs) n = SEMI : lexer cs n lexer (c:cs) n = TError ("Illegal character: '"++[c,'\'']) : lexer cs n lexNum :: String -> String -> [Token] lexNum cs n = if null rest || head rest /= '.' then TInteger (read k) : lexer rest n else let (m,rest2) = span isDigit (tail rest) val::Float = read (k++('.':m)) in case rest2 of ('e':rest3) -> let (exp,rest4) = span isDigit rest3 in (TFloat (val*10^(read exp))) : lexer rest4 n _ -> (TFloat val) : lexer rest2 n where (k,rest) = span isDigit cs lexString :: String -> String -> String -> [Token] lexString ('\"':cs) s m = case m of '\"':n -> (TString s) : (lexer cs n) _ -> lexString cs (s++"\"") m lexString ('\'':cs) s m = case m of '\'':n -> (TString s) : (lexer cs n) _ -> lexString cs (s++"\'") m lexString ('{':cs) s n = (TString s) : LSB : (lexer cs ('{':n)) lexString (c:cs) s n = lexString cs (s++[c]) n lexString [] s n = [ TError "End of input while in string" ] lexComment :: String -> String -> [Token] lexComment (':':')':cs) n = lexer cs n lexComment (_:cs) n = lexComment cs n lexComment [] n = [ TError "End of input while in comment" ] lexXmlComment :: String -> String -> String -> [Token] lexXmlComment ('-':'>':cs) text n = (xmlText (text++"->"))++(lexer cs n) lexXmlComment (c:cs) text n = lexXmlComment cs (text++[c]) n lexXmlComment [] text _ = xmlText text lexVar :: String -> String -> [Token] lexVar cs n = let (nm,rest) = span isQN cs in (case nm of "return" -> RETURN "some" -> SOME "every" -> EVERY "if" -> IF "then" -> THEN "else" -> ELSE "to" -> TO "div" -> DIV "idiv" -> IDIV "mod" -> MOD "and" -> AND "or" -> OR "not" -> NOT "union" -> UNION "intersect" -> INTERSECT "except" -> EXCEPT "for" -> FOR "let" -> LET "in" -> IN "where" -> WHERE "order" -> ORDER "by" -> BY "ascending" -> ASCENDING "descending" -> DESCENDING "element" -> ELEMENT "attribute" -> ATTRIBUTE "satisfies" -> SATISFIES "declare" -> DECLARE "function" -> FUNCTION "variable" -> VARIABLE "at" -> AT "eq" -> SEQ "ne" -> SNE "lt" -> SLT "le" -> SLE "gt" -> SGT "ge" -> SGE "is" -> IS var -> QName var ) : lexer rest n }