{------------------------------------------------------------------------------------- - - XML Trees (represented as rose trees) - Programmer: Leonidas Fegaras - Email: fegaras@cse.uta.edu - Web: http://lambda.uta.edu/ - Creation: 05/01/08, last update: 03/29/09 - - Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved. - This material is provided as is, with absolutely no warranty expressed or implied. - Any use is at your own risk. Permission is hereby granted to use or copy this program - for any purpose, provided the above notices are retained on all copies. - --------------------------------------------------------------------------------------} {-# OPTIONS_GHC -funbox-strict-fields #-} module Text.XML.HXQ.XTree ( XTree(..), XSeq, materialize, putXSeq, showXS ) where import System.IO import Char(isSpace) import XMLParse(XMLEvent(..)) import HXML(Name,AttList) import Text.XML.HXQ.Parser(Ast(..)) import System.CPUTime -- import Debug.Trace -- import System.IO.Unsafe -- | A rose tree representation of XML data. -- An XML element is: @XElem tagname atributes preorder parent children@. -- The preorder numbering is the document order of elements. -- The parent is a cyclic reference to the parent element. data XTree = XElem !Name !AttList !Int XTree [XTree] -- ^ an XML tree node (element) | XAttr !Name !String -- ^ attribute construction | XText !String -- ^ an XML tree leaf (PCDATA) | XInt !Int -- ^ an XML tree leaf (int) | XFloat !Double -- ^ an XML tree leaf (double) | XBool !Bool -- ^ an XML tree leaf (boolean) | XPI Name String -- ^ processing instruction | XGERef String -- ^ general entity reference | XComment String -- ^ comment | XError String -- ^ error report | XNull -- ^ null value | XType Ast -- ^ type information | XNoPad -- ^ marker for no padding in XSeq deriving Eq -- | A sequence of XML fragments type XSeq = [XTree] emptyElem :: XTree -> Bool emptyElem e = case e of XElem _ al _ _ xs -> emptyAL al && all emptyElem xs && not (null xs) XText text -> all isSpace text XNull -> True XNoPad -> True _ -> False where emptyAL = all (\(a,v) -> case (a,v) of (_,"") -> True; ('_':_,_) -> True; _ -> False) showAL :: AttList -> String showAL = foldr (\(a,v) r -> case (a,v) of ('_':_,_) -> r; _ -> " "++a++"=\""++v++"\""++r) [] showXT :: XTree -> Bool -> String showXT e pad = case e of XElem "_document" _ _ _ xs -> showXS xs XElem tag al _ _ [] -> "<"++tag++showAL al++"/>" XElem _ _ _ _ _ | emptyElem e -> "" XElem tag al _ _ xs | all emptyElem xs -> "<"++tag++showAL al++"/>" XElem tag al _ _ xs -> "<"++tag++showAL al++">"++showXS xs++"" XAttr tag val -> p++tag++"=\""++val++"\"" XText text -> p++text XInt n -> p++show n XFloat n -> p++show n XBool v -> p++if v then "true" else "false" XComment s -> "" XPI n s -> "" XError s -> error s XNull -> "?" XType tp -> show tp _ -> "" where p = if pad then " " else "" showXS :: XSeq -> String showXS [] = "" showXS (x:xs) = showXT x False ++ sXS xs where sXS (XNoPad:x:xs) = (showXT x False) ++ sXS xs sXS (x:xs) = (showXT x True) ++ sXS xs sXS _ = "" instance Show XTree where show t = showXT t False -- | Print the XQuery result (which is a sequence of XML fragments) without buffering. putXSeq :: XSeq -> IO () putXSeq xs = hSetBuffering stdout NoBuffering >> putStrLn (showXS xs) {--------------- Build the rose tree from the XML stream ----------------------------} type Stream = [XMLEvent] noParentError = XError "Undefined parent reference" statistics = id {- Collect statistics about the timing of I/O events statistics n = if mod n 1000 == 0 then unsafePerformIO (do t <- getCPUTime putStr $ show (div n 1000) putStrLn $ " " ++ show (div t (10^6)) return n) else n -} -- Lazily materialize the SAX stream into a DOM tree without setting parent references. materializeWithoutParent :: Int -> Stream -> XTree materializeWithoutParent level stream = XElem "_document" [] 1 noParentError ((\(x,_,_)->x) (mdl stream 2 level [])) where md s@(x@(StartEvent n atts):xs) i level ns | level == 0 = let f [] i = m s i f ((StartEvent n atts):ns) i = let (e,xs',i') = f ns $! i+1 in (XElem n atts i noParentError [e],xs',i') (e,xs',i') = (f $! (reverse ns)) i in (e,xs',i',ns) | otherwise = ((md xs $! i+1) $! level-1) $! x:ns md ((EndEvent n):xs) i level (x:ns) = (md xs i $! level+1) ns md (_:xs) i level ns = md xs i level ns md [] _ _ _ = (XText "",[],0,[]) mdl xs@(_:_) i level ns = let (e,xs',i',ns') = md xs i level ns (el,xs'',i'') = mdl xs' i' 0 ns' in (e:el,xs'',i'') mdl [] _ _ _ = ([],[],0) m ((TextEvent t):xs) i = (XText t,xs,i) m ((EmptyEvent n atts):xs) i = (XElem n atts i noParentError [],xs,i+1) m ((StartEvent n atts):xs) i = let (el,xs',i') = ml n xs $! i+1 in (XElem n atts i noParentError el,xs',i') m ((PIEvent n s):xs) i = (XPI n s,xs,i) m ((CommentEvent s):xs) i = (XComment s,xs,i) m ((GERefEvent n):xs) i = (XGERef n,xs,i) m ((ErrorEvent s):xs) i = (XError s,xs,i) m (_:xs) i = (XError "Unrecognized XML event",xs,i) m [] i = (XError "Unterminated element",[],i) ml _ [] i = ([],[],i) ml tag ((EndEvent n):xs) i | n==tag = ([],xs,i) | otherwise = error $ "Unmatched tags: <"++tag++">" ml tag xs i = let (e,xs',i') = m xs i (el,xs'',i'') = ml tag xs' i' in (e:el,xs'',i'') -- Lazily materialize the SAX stream into a DOM tree setting parent references. -- It has space leaks for large documents. -- Used only if the query has backward steps that cannot be eliminated. materializeWithParent :: Int -> Stream -> XTree materializeWithParent level stream = root where root = XElem "_document" [] 1 (XError "Trying to access the root parent") ((\(x,_,_)->x) (mdl stream 2 level root [])) md s@(x@(StartEvent n atts):xs) i level p ns | level == 0 = let f [] i p = m s i p f ((StartEvent n atts):ns) i p = let (e,xs',i') = (f ns $! i+1) x x = XElem n atts i p [e] in (x,xs',i') (e,xs',i') = (f $! (reverse ns)) i p in (e,xs',i',ns) | otherwise = ((md xs $! i+1) $! level-1) p $! x:ns md ((EndEvent n):xs) i level p (x:ns) = (md xs i $! level+1) p ns md (_:xs) i level p ns = md xs i level p ns md [] _ _ _ _ = (XText "",[],0,[]) mdl xs@(_:_) i level p ns = let (e,xs',i',ns') = md xs i level p ns (el,xs'',i'') = mdl xs' i' 0 p ns' in (e:el,xs'',i'') mdl [] _ _ _ _ = ([],[],0) m ((TextEvent t):xs) i _ = (XText t,xs,i) m ((EmptyEvent n atts):xs) i p = (XElem n atts i p [],xs,i+1) m ((StartEvent n atts):xs) i p = let (el,xs',i') = (ml n xs $! (i+1)) node node = XElem n atts i p el in (node,xs',i') m ((PIEvent n s):xs) i _ = (XPI n s,xs,i) m ((CommentEvent s):xs) i _ = (XComment s,xs,i) m ((GERefEvent n):xs) i _ = (XGERef n,xs,i) m ((ErrorEvent s):xs) i _ = (XError s,xs,i) m (_:xs) i _ = (XError "unrecognized XML event",xs,i) m [] i _ = (XError "unbalanced tags",[],i) ml _ [] i _ = ([],[],i) ml tag ((EndEvent n):xs) i _ | n==tag = ([],xs,i) | otherwise = error $ "Unmatched tags: <"++tag++">" ml tag xs i p = let (e,xs',i') = m xs i p (el,xs'',i'') = ml tag xs' i' p in (e:el,xs'',i'') -- It looks at the first maxStreamingUnitSize events in the stream to determine -- the minimum nesting depth. If this minimum depth is greater than 0, then -- the stream will be chopped into a list of XML elements at this depth level, -- where each XML element corresponds to at most maxStreamingUnitSize events. -- Otherwise, the cache requirenments would be the size of the largest root child docLevel :: Stream -> Int -> Int -> Int -> Int docLevel stream i level minLevel = if i < 0 then minLevel else case stream of ((StartEvent n _):xs) -> ((docLevel xs $! i-1) $! level+1) minLevel ((EndEvent n):xs) -> ((docLevel xs $! i-1) $! level-1) $! min minLevel (level-1) (_:xs) -> (docLevel xs $! i-1) level minLevel _ -> 0 -- | Maximum number of events that each streaming unit corresponds to maxStreamingUnitSize :: Int maxStreamingUnitSize = 10000 materialize :: Bool -> Stream -> XTree materialize withParent stream = let level = docLevel stream maxStreamingUnitSize 0 1000000 in if withParent then materializeWithParent level stream else materializeWithoutParent level stream