{------------------------------------------------------------------------------------- - - XML Trees (represented as rose trees) - Programmer: Leonidas Fegaras - Email: fegaras@cse.uta.edu - Web: http://lambda.uta.edu/ - Creation: 05/01/08, last update: 09/29/09 - - Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved. - This material is provided as is, with absolutely no warranty expressed or implied. - Any use is at your own risk. Permission is hereby granted to use or copy this program - for any purpose, provided the above notices are retained on all copies. - --------------------------------------------------------------------------------------} {-# OPTIONS_GHC -funbox-strict-fields #-} module Text.XML.HXQ.XTree where import System.IO import Char(isSpace) import XMLParse(XMLEvent(..)) import HXML(Name,AttList) import Text.XML.HXQ.Parser(Ast(..)) import System.CPUTime -- import System.IO.Unsafe -- | Namespace prefix type Prefix = String -- | Namespace URI type URI = String -- | Local name type LocalName = String -- | A qualified name has a namespace prefix, a URI, and a local name data QName = QName { prefix :: Prefix, uri :: URI, localName :: LocalName } instance Eq QName where (QName _ u1 ln1) == (QName _ u2 ln2) = u1 == u2 && ln1 == ln2 instance Ord QName where (QName _ u1 ln1) <= (QName _ u2 ln2) = u1 < u2 || (u1 == u2 && ln1 <= ln2) instance Show QName where show (QName "" _ ln) = ln show (QName ns _ ln) = ns++(':':ln) -- | XML attributes are bindings from qualified names to values type Attributes = [(QName,String)] matchQName x@(QName _ u1 ln1) y@(QName _ u2 ln2) = (u1 == u2 || u1 == "" || u2 == "") && (ln1 == ln2 || ln2 == "*" || ln1 == "*") documentRootTag = QName "" "" "_document" -- | A rose tree representation of XML data. -- An XML element is: @XElem tagname atributes preorder parent children@. -- The preorder numbering is the document order of elements. -- The parent is a cyclic reference to the parent element. data XTree = XElem !QName !Attributes !Int XTree [XTree] -- ^ an XML tree node (element) | XAttr !QName !String -- ^ attribute construction | XText !String -- ^ an XML tree leaf (PCDATA) | XInt !Int -- ^ an XML tree leaf (int) | XFloat !Double -- ^ an XML tree leaf (double) | XBool !Bool -- ^ an XML tree leaf (boolean) | XPI String String -- ^ processing instruction | XGERef String -- ^ general entity reference | XComment String -- ^ comment | XError String -- ^ error message | XNull -- ^ null value | XType Type -- ^ type information | XNoPad -- ^ marker for no padding in XSeq deriving Eq -- | A sequence of XML fragments type XSeq = [XTree] emptyElem :: XTree -> Bool emptyElem e = case e of XElem _ al _ _ xs -> emptyAL al && all emptyElem xs && not (null xs) XText text -> all isSpace text XNull -> True XNoPad -> True _ -> False where emptyAL = all (\(a,v) -> case (a,v) of (_,"") -> True (QName _ _ ('_':_),_) -> True _ -> False) showsAL :: Attributes -> String -> String showsAL al acc = foldr (\(a,v) r -> case (a,v) of (QName _ _ ('_':_),_) -> r _ -> ' ':shows a ('=':shows v r)) acc al showsXT :: XTree -> String -> Bool -> String showsXT e acc pad = case e of XElem tag _ _ _ xs | tag == documentRootTag -> showsXS xs acc XElem tag al _ _ [] -> '<':shows tag (showsAL al ('/':'>':acc)) XElem _ _ _ _ _ | emptyElem e -> acc XElem tag al _ _ xs | all emptyElem xs -> '<':shows tag (showsAL al ('/':'>':acc)) XElem tag al _ _ xs -> '<':shows tag (showsAL al ('>':showsXS xs ('<':'/':shows tag ('>':acc)))) XAttr tag val -> p (shows tag ('=':shows val acc)) XText text -> p (text++acc) XInt n -> p (shows n acc) XFloat n -> p (shows n acc) XBool v -> p (if v then "true"++acc else "false"++acc) XComment s -> ""++acc XPI n s -> "':acc))) XError s -> error s XNull -> '?':acc XType tp -> shows tp acc _ -> acc where p acc = if pad then ' ':acc else acc showsXS :: XSeq -> String -> String showsXS [] acc = acc showsXS (x:xs) acc = showsXT x (sXS xs acc) False where sXS (XNoPad:x:xs) acc = showsXT x (sXS xs acc) False sXS (x:xs) acc = showsXT x (sXS xs acc) True sXS _ acc = acc instance Show XTree where show t = showsXT t "" False -- | Print the XQuery result (which is a sequence of XML fragments) without buffering. putXSeq :: XSeq -> IO () putXSeq xs = hSetBuffering stdout NoBuffering >> putStrLn (showsXS xs "") {--------------------------- XQuery Types ---------------------------------------} -- | A type variable type TVar = Int -- | Type qualifier: *, +, or ? type TQualifier = Char -- | An XQuery type data Type = TVariable TVar -- ^ type variable (needed for polymorphic type inference) | TBase QName -- ^ xs:integer, xs:string, ... | TItem String -- ^ item(), node(), ... | TNamed QName -- ^ reference to a user-defined type | TElement String Type -- ^ element tag { t } | TAttribute String Type -- ^ attribute name { t } | TAny -- ^ any element or attribute content | TEmpty -- ^ () | TSequence Type Type -- ^ t1, t2 | TInterleaving Type Type -- ^ t1 & t2 | TChoice Type Type -- ^ t1 | t2 | TQualified Type TQualifier -- ^ t*, t+, or t? deriving Eq showsType :: Type -> Int -> String -> String showsType t prec acc = case t of TVariable s -> '#':shows s acc TBase s -> shows s acc TItem s -> s++('(':')':acc) TNamed s -> shows s acc TEmpty -> '(':')':acc TAny -> "xs:any"++acc TElement n TAny -> "element "++n++acc TAttribute a TAny -> "attribute "++a++acc TElement n t -> "element "++n++" { "++showsType t 3 (" }"++acc) TAttribute a t -> "attribute "++a++" { "++showsType t 3 (" }"++acc) TSequence t1 t2 -> paren 3 prec (showsType t1 3 (", "++showsType t2 3 acc)) TInterleaving t1 t2 -> paren 3 prec (showsType t1 2 (" & "++showsType t2 2 acc)) TChoice t1 t2 -> paren 2 prec (showsType t1 2 (" | "++showsType t2 2 acc)) TQualified t c -> paren 4 prec (showsType t 4 (c:acc)) where paren p1 p2 s | p1 NS -> QName tag s ns = if elem ':' s then case span (/= ':') s of (s1,_:s2) -> case lookup s1 (prefixes ns) of Just u -> QName s1 u s2 Nothing -> error ("Undeclared element namespace: "++s1) else QName "" (defaultElementNS ns) s attributeTag s ns = if elem ':' s then case span (/= ':') s of (s1,_:s2) -> case lookup s1 (prefixes ns) of Just u -> QName s1 u s2 Nothing -> error ("Undeclared attribute namespace: "++s1) else QName "" "" s functionTag :: String -> NS -> QName functionTag s ns = if elem ':' s then case span (/= ':') s of (s1,_:s2) -> case lookup s1 (prefixes ns) of Just u -> QName s1 u s2 Nothing -> error ("Undeclared function namespace: "++s1) else QName "" (defaultFunctionNS ns) s attributes :: AttList -> NS -> Attributes attributes al ns = map (\(s,v) -> (attributeTag s ns,v)) al elementNamespaces :: AttList -> NS -> NS elementNamespaces atts ns = ns { defaultElementNS = foldr (\(a,v) r -> case a of "xmlns" -> v; _ -> r) (defaultElementNS ns) atts, prefixes = foldr (\(a,v) r -> case splitAt 6 a of ("xmlns:",p) -> case lookup p predefinedNamespaces of Just uri -> if uri == v then (p,v):r else error ("You cannot redefine the system namespace "++p) Nothing -> (p,v):r _ -> r) (prefixes ns) atts } predefinedNamespaces :: NamespacePrefixes predefinedNamespaces = [ ("xml", "http://www.w3.org/XML/1998/namespace"), ("xmlns", "http://www.w3.org/2000/xmlns/"), ("xsl", "http://www.w3.org/1999/XSL/Transform"), ("xs", "http://www.w3.org/2001/XMLSchema"), ("xsi", "http://www.w3.org/2001/XMLSchema-instance"), ("fn", "http://www.w3.org/2005/xpath-functions"), ("xdt", "http://www.w3.org/2005/xpath-datatypes"), ("local", "http://www.w3.org/2005/xquery-local-functions") ] xsNamespace = let Just uri = lookup "xs" predefinedNamespaces in uri fnNamespace = let Just uri = lookup "fn" predefinedNamespaces in uri initialNS = NS { defaultElementNS="", defaultFunctionNS=fnNamespace, prefixes=predefinedNamespaces, schemas=[] } {--------------- Build an XTree from the XML stream ----------------------------} type Stream = [XMLEvent] noParentError = XError "Undefined parent reference" statistics = id {- Collect statistics about the timing of I/O events statistics n = if mod n 1000 == 0 then unsafePerformIO (do t <- getCPUTime putStr $ show (div n 1000) putStrLn $ " " ++ show (div t (10^6)) return n) else n -} -- Lazily materialize the SAX stream into a DOM tree without setting parent references. materializeWithoutParent :: Int -> Stream -> NS -> XTree materializeWithoutParent level stream ns = XElem documentRootTag [] 1 noParentError (if level<=1 then [head (filter (\x -> case x of XElem _ _ _ _ _ -> True; _ -> False) ((\(x,_,_)->x) (ml "" stream 2 ns)))] else ((\(x,_,_)->x) (mdl stream 2 level [] ns))) where md s@(x@(StartEvent n atts):xs) i level ls ns | level == 0 = let f [] i ns = m s i ns f ((StartEvent n atts):ls) i ns = let ns' = elementNamespaces atts ns (e,xs',i') = (f ls $! i+1) $! ns' in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError [e],xs',i') (e,xs',i') = (f $! (reverse ls)) i ns in (e,xs',i',ls) | otherwise = (((md xs $! i+1) $! level-1) $! x:ls) ns md ((EndEvent n):xs) i level (x:ls) ns = (md xs i $! level+1) ls ns md (_:xs) i level ls ns = md xs i level ls ns md [] _ _ _ _ = (XText "",[],0,[]) mdl xs@(_:_) i level ls ns = let (e,xs',i',ls') = md xs i level ls ns (el,xs'',i'') = mdl xs' i' 0 ls' ns in (e:el,xs'',i'') mdl [] _ _ _ _ = ([],[],0) m ((TextEvent t):xs) i _ = (XText t,xs,i) m ((EmptyEvent n atts):xs) i ns = let ns' = elementNamespaces atts ns in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError [],xs,i+1) m ((StartEvent n atts):xs) i ns = let ns' = elementNamespaces atts ns (el,xs',i') = (ml n xs $! i+1) $! ns' in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError el,xs',i') m ((PIEvent n s):xs) i _ = (XPI n s,xs,i) m ((CommentEvent s):xs) i _ = (XComment s,xs,i) m ((GERefEvent n):xs) i _ = (XGERef n,xs,i) m ((ErrorEvent s):xs) i _ = (XError s,xs,i) m (_:xs) i _ = (XError "Unrecognized XML event",xs,i) m [] i _ = (XError "Unterminated element",[],i) ml _ [] i _ = ([],[],i) ml tag ((EndEvent n):xs) i ns | n==tag = ([],xs,i) | otherwise = error $ "Unmatched tags: <"++show tag++">" ml tag xs i ns = let (e,xs',i') = m xs i ns (el,xs'',i'') = ml tag xs' i' ns in (e:el,xs'',i'') -- Lazily materialize the SAX stream into a DOM tree setting parent references. -- It causes space leaks for large documents. -- Used only if the query has backward steps that cannot be eliminated. materializeWithParent :: Int -> Stream -> NS -> XTree materializeWithParent level stream ns = root where root = XElem documentRootTag [] 1 (XError "Trying to access the root parent") (if level<=1 then [head (filter (\x -> case x of XElem _ _ _ _ _ -> True; _ -> False) ((\(x,_,_)->x) (ml "" stream 2 root ns)))] else ((\(x,_,_)->x) (mdl stream 2 level root [] ns))) md s@(x@(StartEvent n atts):xs) i level p ls ns | level == 0 = let f [] i p ns = m s i p ns f ((StartEvent n atts):ls) i p ns = let ns' = elementNamespaces atts ns (e,xs',i') = (f ls $! i+1) x $! ns' x = ((XElem $! (tag n ns')) $! (attributes atts ns')) i p [e] in (x,xs',i') (e,xs',i') = (f $! (reverse ls)) i p ns in (e,xs',i',ls) | otherwise = (((md xs $! i+1) $! level-1) p $! x:ls) ns md ((EndEvent n):xs) i level p (x:ls) ns = (md xs i $! level+1) p ls ns md (_:xs) i level p ls ns = md xs i level p ls ns md [] _ _ _ _ _ = (XText "",[],0,[]) mdl xs@(_:_) i level p ls ns = let (e,xs',i',ls') = md xs i level p ls ns (el,xs'',i'') = mdl xs' i' 0 p ls' ns in (e:el,xs'',i'') mdl [] _ _ _ _ _ = ([],[],0) m ((TextEvent t):xs) i _ _ = (XText t,xs,i) m ((EmptyEvent n atts):xs) i p ns = let ns' = elementNamespaces atts ns in (((XElem $! (tag n ns')) $! (attributes atts ns')) i p [],xs,i+1) m ((StartEvent n atts):xs) i p ns = let ns' = elementNamespaces atts ns (el,xs',i') = (ml n xs $! (i+1)) node $! ns' node = ((XElem $! (tag n ns')) $! (attributes atts ns')) i p el in (node,xs',i') m ((PIEvent n s):xs) i _ _ = (XPI n s,xs,i) m ((CommentEvent s):xs) i _ _ = (XComment s,xs,i) m ((GERefEvent n):xs) i _ _ = (XGERef n,xs,i) m ((ErrorEvent s):xs) i _ _ = (XError s,xs,i) m (_:xs) i _ _ = (XError "unrecognized XML event",xs,i) m [] i _ _ = (XError "unbalanced tags",[],i) ml _ [] i _ _ = ([],[],i) ml tag ((EndEvent n):xs) i _ _ | n==tag = ([],xs,i) | otherwise = error $ "Unmatched tags: <"++show tag++">" ml tag xs i p ns = let (e,xs',i') = m xs i p ns (el,xs'',i'') = ml tag xs' i' p ns in (e:el,xs'',i'') -- It looks at the first maxStreamingUnitSize events in the stream to determine -- the minimum nesting depth. If this minimum depth is greater than 1, then -- the stream will be chopped into a list of XML elements at this depth level, -- where each XML element corresponds to at most maxStreamingUnitSize events. -- Otherwise, the cache requirenments would be the size of the largest root child docLevel :: Stream -> Int -> Int -> Int -> Int docLevel stream i level minLevel = if i < 0 then minLevel else case stream of ((StartEvent n _):xs) -> ((docLevel xs $! i-1) $! level+1) minLevel ((EndEvent n):xs) -> ((docLevel xs $! i-1) $! level-1) $! min minLevel $! (level-1) (_:xs) -> (docLevel xs $! i-1) level minLevel _ -> 0 -- | Maximum number of events that each streaming unit corresponds to maxStreamingUnitSize :: Int maxStreamingUnitSize = 10000 materialize :: Bool -> Stream -> XTree materialize withParent stream = let level = docLevel stream maxStreamingUnitSize 0 1000000 in if withParent then materializeWithParent level stream initialNS else materializeWithoutParent level stream initialNS