{------------------------------------------------------------------------------------- - - A Compiler from XQuery to Haskell - Programmer: Leonidas Fegaras - Email: fegaras@cse.uta.edu - Web: http://lambda.uta.edu/ - Creation: 02/15/08, last update: 03/20/08 - - Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved. - This material is provided as is, with absolutely no warranty expressed or implied. - Any use is at your own risk. Permission is hereby granted to use or copy this program - for any purpose, provided the above notices are retained on all copies. - --------------------------------------------------------------------------------------} {-# OPTIONS_GHC -fth #-} module XQuery ( XTree(..), XSeq, xq, xe, cq ) where import Char(isDigit) import List(sortBy) import XMLParse(XMLEvent(..),parseDocument) import HXML(AttList) import Language.Haskell.TH import XQueryParser {--------------- XML Trees (rose trees) ----------------------------------------------} type Stream = [XMLEvent] type Tag = String data XTree = XElem Tag AttList [XTree] | XText String | XInt Int | XFloat Float deriving Eq type XSeq = [XTree] showAL :: AttList -> String showAL = foldr(\(a,v) r -> " "++a++"=\'"++v++"\'"++r) [] showXT :: XTree -> String showXT (XElem tag al xs) = "<"++tag++(showAL al)++">"++(showXS xs)++"" showXT (XText text) = text showXT (XInt n) = " "++(show n) showXT (XFloat n) = " "++(show n) showXS :: XSeq -> String showXS xs = concat (map showXT xs) instance Show XTree where show t = showXT t -- lazily materialize the SAX stream into a DOM tree materialize :: Stream -> XTree materialize stream = XElem "root" [] [head (filter (\x -> case x of XElem _ _ _ -> True; _ -> False) (fst (ml stream)))] where m ((TextEvent t):xs) = (XText t,xs) m ((EmptyEvent n atts):xs) = (XElem n atts [],xs) m ((StartEvent n atts):xs) = let (el,xs') = ml xs in (XElem n atts el,xs') m (_:xs) = (XText "unrecognized",xs) m [] = (XText "unrecognized",[]) ml ((EndEvent n):xs) = ([],xs) ml xs = let (e,xs') = m xs (el,xs'') = ml xs' in (e:el,xs'') {--------------- XPath Steps ---------------------------------------------------------} -- XPath step /tag or /* child :: Tag -> XTree -> XSeq child m x = case x of (XElem _ _ bs) -> foldr (\b s -> case b of (XElem k _ _) | (k==m || m=="*") -> b:s _ -> s) [] bs _ -> [] -- XPath step //tag or //* descendant :: Tag -> XTree -> XSeq descendant m (x@(XElem t al cs)) | m==t || m=="*" = x:(concat (map (descendant m) cs)) descendant m (XElem t al cs) = concat (map (descendant m) cs) descendant m _ = [] -- XPath step /@attr or /@* attribute :: Tag -> XTree -> XSeq attribute m x = case x of (XElem _ al _) -> foldr (\(k,v) s -> if k==m || m=="*" then (XText v):s else s) [] al _ -> [] -- XPath step //@attr or //@* attributeDescendant :: Tag -> XTree -> XSeq attributeDescendant m (x@(XElem _ al cs)) = foldr (\(k,v) s -> if k==m || m=="*" then (XText v):s else s) (concat (map (attributeDescendant m) cs)) al attributeDescendant m _ = [] {------------ Functions --------------------------------------------------------------} -- like foldr but with an index foldir :: (a -> Int -> b -> b) -> b -> [a] -> Int -> b foldir c n [] i = n foldir c n (x:xs) i = c x i (foldir c n xs (i+1)) trueXT = XText "true" toString :: XSeq -> [String] toString xs = foldr (\x r -> case x of XElem _ _ [XText t] -> t:r XText t -> t:r XInt n -> (show n):r XFloat n -> (show n):r _ -> r) [] xs readNum :: String -> Maybe XTree readNum cs = case span isDigit cs of (n,[]) -> Just (XInt (read n)) (n,'.':rest) -> case span isDigit rest of (k,[]) -> Just (XFloat (read (n++('.':k)))) _ -> Nothing _ -> Nothing toNum :: XSeq -> XSeq toNum xs = foldr (\x r -> case x of XElem _ _ [XText s] -> case readNum s of Just t -> t:r _ -> r XInt n -> x:r XFloat n -> x:r XText s -> case readNum s of Just t -> t:r _ -> r _ -> r) [] xs text :: XSeq -> XSeq text xs = foldr (\x r -> case x of XElem _ _ [z@(XText _)] -> z:r XElem _ _ [z@(XInt _)] -> z:r XElem _ _ [z@(XFloat _)] -> z:r XText _ -> x:r XInt _ -> x:r XFloat _ -> x:r _ -> r) [] xs toFloat :: XTree -> Float toFloat (XInt n) = fromIntegral n toFloat (XFloat n) = n contains :: String -> String -> Bool contains xs ys | ((take (length ys) xs) == ys) = True contains (_:xs) ys = contains xs ys contains [] ys = False distinct :: XSeq -> XSeq distinct = foldl (\r a -> if elem a r then r else r++[a]) [] arithmetic :: (Float -> Float -> Float) -> XTree -> XTree -> XTree arithmetic op (XInt n) (XInt m) = XInt (round (op (fromIntegral n) (fromIntegral m))) arithmetic op (XFloat n) (XFloat m) = XFloat (op n m) arithmetic op (XFloat n) (XInt m) = XFloat (op n (fromIntegral m)) arithmetic op (XInt n) (XFloat m) = XFloat (op (fromIntegral n) m) compareXTrees :: XTree -> XTree -> Ordering compareXTrees (XElem _ _ _) _ = EQ compareXTrees _ (XElem _ _ _) = EQ compareXTrees (XInt n) (XInt m) = compare n m compareXTrees (XFloat n) (XInt m) = compare n (fromIntegral m) compareXTrees (XInt n) (XFloat m) = compare (fromIntegral n) m compareXTrees (XFloat n) (XFloat m) = compare n m compareXTrees x y = let tx = case x of XText t -> t; XInt n -> show n; XFloat n -> show n ty = case y of XText t -> t; XInt n -> show n; XFloat n -> show n in compare tx ty compareXSeqs :: Bool -> XSeq -> XSeq -> Ordering compareXSeqs ord xs ys = let comps = [ compareXTrees x y | x <- xs, y <- ys ] in if ord then if all (\x -> x == LT) comps then LT else if all (\x -> x == GT) comps then GT else EQ else if all (\x -> x == LT) comps then GT else if all (\x -> x == GT) comps then LT else EQ type Function = [Q Exp] -> Q Exp functions :: [(Tag,Int,Function)] functions = [ ( "=", 2, \[xs,ys] -> [| [ trueXT | x <- text $xs, y <- text $ys, x == y ] |] ), ( "==", 2, \[xs,ys] -> [| [ trueXT | x <- $xs, y <- $ys, x == y ] |] ), ( "!=", 2, \[xs,ys] -> [| if null [ trueXT | x <- text $xs, y <- text $ys, x == y ] then [trueXT] else [] |] ), ( ">", 2, \[xs,ys] -> [| [ trueXT | x <- toNum $xs, y <- toNum $ys, compareXTrees x y == GT ] |] ), ( "<", 2, \[xs,ys] -> [| [ trueXT | x <- toNum $xs, y <- toNum $ys, compareXTrees x y == LT ] |] ), ( ">=", 2, \[xs,ys] -> [| [ trueXT | x <- toNum $xs, y <- toNum $ys, compareXTrees x y `elem` [GT,EQ] ] |] ), ( "<=", 2, \[xs,ys] -> [| [ trueXT | x <- toNum $xs, y <- toNum $ys, compareXTrees x y `elem` [LT,EQ] ] |] ), ( "+", 2, \[xs,ys] -> [| [ arithmetic (+) x y | x <- toNum $xs, y <- toNum $ys ] |] ), ( "-", 2, \[xs,ys] -> [| [ arithmetic (-) x y | x <- toNum $xs, y <- toNum $ys ] |] ), ( "*", 2, \[xs,ys] -> [| [ arithmetic (*) x y | x <- toNum $xs, y <- toNum $ys ] |] ), ( "div", 2, \[xs,ys] -> [| [ arithmetic (/) x y | x <- toNum $xs, y <- toNum $ys ] |] ), ( "idiv", 2, \[xs,ys] -> [| [ arithmetic div x y | x <- toNum $xs, y <- toNum $ys ] |] ), ( "uplus", 1, \[xs] -> [| [ x | x <- toNum $xs ] |] ), ( "uminus", 1, \[xs] -> [| [ case x of XInt n -> XInt (-n); XFloat n -> XFloat (-n) | x <- toNum $xs ] |] ), ( "and", 2, \[xs,ys] -> [| if (null $xs) || (null $ys) then [] else [trueXT] |] ), ( "or", 2, \[xs,ys] -> [| if (null $xs) && (null $ys) then [] else [trueXT] |] ), ( "not", 1, \[xs] -> [| if (null $xs) then [trueXT] else [] |] ), ( "some", 1, \[xs] -> [| if (null $xs) then [] else [trueXT] |] ), ( "count", 1, \[xs] -> [| [ XInt (length $xs) ] |] ), ( "sum", 1, \[xs] -> [| [ XFloat (sum [ toFloat x | x <- toNum $xs ]) ] |] ), ( "avg", 1, \[xs] -> [| let ys = $xs in [ XFloat ((sum [ toFloat x | x <- toNum ys ]) / (fromIntegral (length ys))) ] |] ), ( "min", 1, \[xs] -> [| [ XFloat (minimum [ toFloat x | x <- toNum $xs ]) ] |] ), ( "max", 1, \[xs] -> [| [ XFloat (maximum [ toFloat x | x <- toNum $xs ]) ] |] ), ( "to", 2, \[xs,ys] -> [| [ XInt i | XInt n <- toNum $xs, XInt m <- toNum $ys, i <- [n..m] ] |] ), ( "text", 1, \[xs] -> [| text $xs |] ), ( "node", 1, \[xs] -> [| $xs |] ), ( "empty", 0, \[] -> [| [] |] ), ( "child", 2, \[tags,xs] -> [| [ z | (XText tag) <- $tags, x <- $xs, z <- child tag x ] |] ), ( "descendant", 2, \[tags,xs] -> [| [ z | (XText tag) <- $tags, x <- $xs, z <- descendant tag x ] |] ), ( "attribute", 2, \[tags,xs] -> [| [ z | (XText tag) <- $tags, x <- $xs, z <- attribute tag x ] |] ), ( "descendant_attribute", 2, \[tags,xs] -> [| [ z | (XText tag) <- $tags, x <- $xs, z <- attributeDescendant tag x ] |] ), ( "if", 3, \[cs,ts,es] -> [| if null $cs then $es else $ts |] ), ( "element", 2, \[tags,xs] -> [| [ x | tag <- toString $tags, x <- $xs, case x of XElem t _ _ -> t==tag || tag=="*"; _ -> False ] |] ), ( "attribute", 2, \[tags,xs] -> [| [ z | tag <- toString $tags, x <- $xs, z <- attribute tag x ] |] ), ( "contains", 2, \[xs,text] -> [| [ trueXT | x <- toString $xs, t <- toString $text, contains x t ] |] ), ( "concat", 2, \[xs,ys] -> [| $xs ++ $ys |] ), ( "distinct-nodes", 1, \[xs] -> [| distinct $xs |] ), ( "union", 2, \[xs,ys] -> [| distinct ($xs ++ $ys) |] ), ( "intersect", 2, \[xs,ys] -> [| filter (\x -> elem x $ys) $xs |] ), ( "except", 2, \[xs,ys] -> [| filter (\x -> not (elem x $ys)) $xs |] ) ] -- make a function call callF :: Tag -> Function callF fname args = case filter (\(n,_,_) -> n==fname) functions of (_,len,f):_ -> if (length args) == len then f args else error ("wrong number of arguments in function call: " ++ fname) _ -> -- otherwise, it must be a Haskell function of type (XSeq,...,XSeq) -> XSeq let itp = foldr (\_ r -> appT r [t| XSeq |] ) (appT (tupleT (length args)) [t| XSeq |]) (tail args) fn = sigE (varE (mkName fname)) (appT (appT arrowT itp) [t| XSeq |]) in appE fn (tupE args) {------------ Optimizer --------------------------------------------------------------} -- get rid of backward steps optimize :: Ast -> Ast -- rule: x/tag/.. -> x[tag] optimize (Ast "call" [Avar "parent",Ast "call" [Avar "child",tag,x]]) = Ast "predicate" [Ast "call" [Avar "child",tag,Avar "."],optimize x] -- rule: x//tag/.. -> x//*[tag] optimize (Ast "call" [Avar "parent",Ast "call" [Avar "descendant",tag,x]]) = Ast "predicate" [Ast "call" [Avar "child",tag,Avar "."], Ast "call" [Avar "descendant",Astring "*",optimize x]] -- needs more rules optimize (Ast n args) = Ast n (map optimize args) optimize e = e {------------ Compiler ---------------------------------------------------------------} -- does the expression contain a last()? containsLast :: Ast -> Bool containsLast (Ast "call" [Avar "last"]) = True containsLast (Ast f _) | elem f ["let","for","predicate"] = False containsLast (Ast _ args) = or (map containsLast args) containsLast _ = False -- Compile the AST e into Haskell code -- context: context node -- index: the element position in the parent sequence (=position()) -- seqSize: the seqSize of the parent sequence (=last()) compile :: Ast -> Q Exp -> Q Exp -> Q Exp -> Q Exp compile e context index seqSize = case e of Avar "." -> context Avar v -> let x = varE (mkName v) in [| $x :: XSeq |] Aint n -> let x = litE (IntegerL (toInteger n)) in [| [ XInt $x ] |] Afloat n -> let x = litE (RationalL (toRational n)) in [| [ XFloat $x ] |] Astring s -> let x = litE (StringL s) in [| [ XText $x ] |] Ast "doc" [Aint n] -> let d = varE (mkName ("_doc"++(show n))) in [| [ $d ] |] Ast "call" [Avar "position"] -> index Ast "call" [Avar "last"] -> seqSize Ast "call" ((Avar f):args) -> callF f (map (\x -> compile x context index seqSize) args) Ast "construction" [Astring tag,Ast "attributes" al,body] -> let alc = foldr (\p r -> case p of Ast "pair" [Astring a,Astring v] -> let ac = litE (StringL a) vc = litE (StringL v) in [| ($ac,$vc) : $r |] Ast "pair" [Astring a,v] -> let ac = litE (StringL a) vc = compile v context index seqSize in [| ($ac,showXS $vc) : $r |]) [| [] |] al ct = litE (StringL tag) bc = compile body context index seqSize in [| [ XElem $ct $alc $bc ] |] Ast "predicate" [condition,body] | containsLast condition -- blocking: use only when last() is used in condition -> let c = compile condition b = compile body context index seqSize in [| let bl = $b len = length bl in foldir (\x i r -> case $(c [| [ x ] |] [| [ XInt i ] |] [| [ XInt len ] |]) of [] -> r [XInt n] -> if i==n then x:r else r -- indexing _ -> x:r) [] bl 1 |] Ast "predicate" [condition,body] -- non-blocking -> let c = compile condition b = compile body context index seqSize in [| foldir (\x i r -> case $(c [| [ x ] |] [| [ XInt i ] |] seqSize) of [] -> r [XInt n] -> if i==n then x:r else r -- indexing _ -> x:r) [] $b 1 |] Ast "let" [Avar var,source,body] -> do s <- compile source context index seqSize b <- compile body context index seqSize return (AppE (LamE [VarP (mkName var)] b) s) Ast "for" [Avar var,Avar "$",source,body] -- a for-loop without an index -> let b = compile body [| $(varE (mkName var)) |] [| [] |] [| [] |] f = lamE [varP (mkName var)] [| \x -> $b ++ x |] s = compile source context index seqSize in [| foldr (\x -> $f [x]) [] $s |] Ast "for" [Avar var,Avar ivar,source,body] -- a for-loop with an index -> let b = compile body [| $(varE (mkName var)) |] [| $(varE (mkName ivar)) |] [| [] |] f = lamE [varP (mkName var)] (lamE [varP (mkName ivar)] [| \x -> $b ++ x |]) s = compile source context index seqSize in [| foldir (\x i -> $f [x] [XInt i]) [] $s 1 |] Ast "sortTuple" (exp:orderBys) -- prepare each FLWOR tuple for sorting -> let res = foldl (\r a -> let ac = compile a context index seqSize in [| $r++[text $ac] |] ) [| [ $(compile exp context index seqSize) ] |] orderBys in [| [ $res ] |] Ast "sort" (exp:ordList) -> let ce = compile exp context index seqSize ordering = foldl (\r (Avar ord) -> let asc = if ord == "ascending" then [| True |] else [| False |] in [| \(x:xs) (y:ys) -> case compareXSeqs $asc x y of EQ -> $r xs ys o -> o |]) [| \xs ys -> EQ |] ordList in [| concatMap head (sortBy (\(_:xs) (_:ys) -> $ordering xs ys) ($ce::[[XSeq]])) |] _ -> error ("Illegal XQuery "++(show e)) -- collect all input documents and assign them a unique number getDocs :: Ast -> Int -> (Ast, Int, [(Int, String)]) getDocs query count = case query of Ast "call" [Avar "doc",Astring file] -> (Ast "doc" [Aint count], count+1, [(count,file)]) Ast n args -> let (s,c,ns) = foldr (\a r c -> let (e,c1,n1) = getDocs a c (s,c2,n2) = r c1 in (e:s,c2,n1++n2)) (\c -> ([],c,[])) args count in (Ast n s,c,ns) _ -> (query,count,[]) -- compile an XQuery AST that reads XML documents compileQuery :: Ast -> Q Exp -> Q Exp compileQuery query context = let (ast,_,ns) = getDocs query 0 code = compile (optimize ast) context [| [] |] [| [] |] in foldr (\(n,file) r -> let d = lamE [varP (mkName ("_doc"++(show n)))] r in [| do doc <- readFile $(litE (StringL file)) let x = materialize (parseDocument doc) $d x |]) [| return $code |] ns -- Display the AST and the Haskell code of an input XQuery cq :: String -> IO () cq query = do putStrLn "Abstract Syntax Tree:" let ast = parse (scan query) putStrLn (show ast) let opt = optimize ast putStrLn "Optimized AST:" putStrLn (show opt) putStrLn "Haskell Code:" let code = compileQuery opt [| [] |] runQ code >>= putStrLn.pprint -- Run an XQuery expression that does not read XML documents -- When evaluated, it returns XSeq xe :: String -> Q Exp xe query = compile (optimize (parse (scan query))) [| [] |] [| [] |] [| [] |] -- Run an XQuery that reads XML documents -- When evaluated, it returns IO XSeq xq :: String -> Q Exp xq query = compileQuery (parse (scan query)) [| [] |]