{-------------------------------------------------------------------------------------
-
- XML Trees (represented as rose trees)
- Programmer: Leonidas Fegaras
- Email: fegaras@cse.uta.edu
- Web: http://lambda.uta.edu/
- Creation: 05/01/08, last update: 09/29/09
- 
- Copyright (c) 2008 by Leonidas Fegaras, the University of Texas at Arlington. All rights reserved.
- This material is provided as is, with absolutely no warranty expressed or implied.
- Any use is at your own risk. Permission is hereby granted to use or copy this program
- for any purpose, provided the above notices are retained on all copies.
-
--------------------------------------------------------------------------------------}


{-# OPTIONS_GHC -funbox-strict-fields #-}


module Text.XML.HXQ.XTree where

import System.IO
import Char(isSpace)
import XMLParse(XMLEvent(..))
import HXML(Name,AttList)
import Text.XML.HXQ.Parser(Ast(..))
import System.CPUTime
-- import System.IO.Unsafe


-- | Namespace prefix
type Prefix = String

-- | Namespace URI
type URI = String

-- | Local name
type LocalName = String

-- | A qualified name has a namespace prefix, a URI, and a local name
data QName = QName { prefix :: Prefix, uri :: URI, localName :: LocalName }

instance Eq QName where
    (QName _ u1 ln1) == (QName _ u2 ln2)
        = u1 == u2 && ln1 == ln2

instance Ord QName where
    (QName _ u1 ln1) <= (QName _ u2 ln2)
        = u1 < u2 || (u1 == u2 && ln1 <= ln2)

instance Show QName where
    show (QName "" _ ln) = ln
    show (QName ns _ ln) = ns++(':':ln)


-- | XML attributes are bindings from qualified names to values
type Attributes = [(QName,String)]


matchQName x@(QName _ u1 ln1) y@(QName _ u2 ln2)
    = (u1 == u2 || u1 == "" || u2 == "")
      && (ln1 == ln2 || ln2 == "*" || ln1 == "*")


documentRootTag = QName "" "" "_document"


-- | A rose tree representation of XML data.
-- An XML element is:  @XElem tagname atributes preorder parent children@.
-- The preorder numbering is the document order of elements.
-- The parent is a cyclic reference to the parent element.
data XTree =  XElem    !QName !Attributes !Int XTree [XTree]   -- ^ an XML tree node (element)
           |  XAttr    !QName !String   -- ^ attribute construction
           |  XText    !String          -- ^ an XML tree leaf (PCDATA)
           |  XInt     !Int             -- ^ an XML tree leaf (int)
           |  XFloat   !Double          -- ^ an XML tree leaf (double)
           |  XBool    !Bool            -- ^ an XML tree leaf (boolean)
           |  XPI      String String	-- ^ processing instruction
           |  XGERef   String		-- ^ general entity reference
           |  XComment String		-- ^ comment
           |  XError   String		-- ^ error message
           |  XNull                     -- ^ null value
           |  XType    Type             -- ^ type information
           |  XNoPad                    -- ^ marker for no padding in XSeq
           deriving Eq


-- | A sequence of XML fragments
type XSeq = [XTree]


emptyElem :: XTree -> Bool
emptyElem e
    = case e of
        XElem _ al _ _ xs
              -> emptyAL al && all emptyElem xs && not (null xs)
        XText text -> all isSpace text
        XNull -> True
        XNoPad -> True
        _ -> False
      where emptyAL = all (\(a,v) -> case (a,v) of
                                       (_,"") -> True
                                       (QName _ _ ('_':_),_) -> True
                                       _ -> False)


showsAL :: Attributes -> String -> String
showsAL al acc
    = foldr (\(a,v) r -> case (a,v) of
                           (QName _ _ ('_':_),_) -> r
                           _ -> ' ':shows a ('=':shows v r)) acc al


showsXT :: XTree -> String -> Bool -> String
showsXT e acc pad
    = case e of
        XElem tag _ _ _ xs
            | tag == documentRootTag
            -> showsXS xs acc
        XElem tag al _ _ []
            -> '<':shows tag (showsAL al ('/':'>':acc))
        XElem _ _ _ _ _
            | emptyElem e
            -> acc
        XElem tag al _ _ xs
            | all emptyElem xs
            -> '<':shows tag (showsAL al ('/':'>':acc))
        XElem tag al _ _ xs
            -> '<':shows tag (showsAL al ('>':showsXS xs ('<':'/':shows tag ('>':acc))))
        XAttr tag val
            -> p (shows tag ('=':shows val acc))
        XText text
            -> p (text++acc)
        XInt n
            -> p (shows n acc)
        XFloat n
            -> p (shows n acc)
        XBool v
            -> p (if v then "true"++acc else "false"++acc)
        XComment s
            -> "<!--"++s++"-->"++acc
        XPI n s
            -> "<?"++n++(' ':(s++('>':acc)))
        XError s -> error s
        XNull -> '?':acc
        XType tp -> shows tp acc
        _ -> acc
      where p acc = if pad then ' ':acc else acc

showsXS :: XSeq -> String -> String
showsXS [] acc = acc
showsXS (x:xs) acc = showsXT x (sXS xs acc) False
    where sXS (XNoPad:x:xs) acc = showsXT x  (sXS xs acc) False
          sXS (x:xs) acc = showsXT x (sXS xs acc) True
          sXS _ acc = acc

instance Show XTree where
    show t = showsXT t "" False


-- | Print the XQuery result (which is a sequence of XML fragments) without buffering.
putXSeq :: XSeq -> IO ()
putXSeq xs = hSetBuffering stdout NoBuffering >> putStrLn (showsXS xs "")


{--------------------------- XQuery Types ---------------------------------------}

-- | A type variable
type TVar = Int


-- | Type qualifier: *, +, or ?
type TQualifier = Char


-- | An XQuery type
data Type
    = TVariable TVar                 -- ^ type variable (needed for polymorphic type inference)
    | TBase QName                    -- ^ xs:integer, xs:string, ...
    | TItem String                   -- ^ item(), node(), ...
    | TNamed QName                   -- ^ reference to a user-defined type
    | TElement String Type           -- ^ element tag { t }
    | TAttribute String Type         -- ^ attribute name { t }
    | TAny                           -- ^ any element or attribute content
    | TEmpty                         -- ^ ()
    | TSequence Type Type            -- ^ t1, t2
    | TInterleaving Type Type        -- ^ t1 & t2
    | TChoice Type Type              -- ^ t1 | t2
    | TQualified Type TQualifier     -- ^ t*, t+, or t?
      deriving Eq


showsType :: Type -> Int -> String -> String
showsType t prec acc
    = case t of
        TVariable s -> '#':shows s acc
        TBase s -> shows s acc
        TItem s -> s++('(':')':acc)
        TNamed s -> shows s acc
        TEmpty -> '(':')':acc
        TAny -> "xs:any"++acc
        TElement n TAny -> "element "++n++acc
        TAttribute a TAny -> "attribute "++a++acc
        TElement n t
            -> "element "++n++" { "++showsType t 3 (" }"++acc)
        TAttribute a t
            -> "attribute "++a++" { "++showsType t 3 (" }"++acc)
        TSequence t1 t2
            -> paren 3 prec (showsType t1 3 (", "++showsType t2 3 acc))
        TInterleaving t1 t2
            -> paren 3 prec (showsType t1 2 (" & "++showsType t2 2 acc))
        TChoice t1 t2
            -> paren 2 prec (showsType t1 2 (" | "++showsType t2 2 acc))
        TQualified t c
            -> paren 4 prec (showsType t 4 (c:acc))
    where paren p1 p2 s | p1<p2 = "("++s++")"
                        | otherwise = s


instance Show Type where
    show t = showsType t 4 ""


-- | XML Schema bindings
type XMLSchema = [(LocalName,Type)]


-- | Binding from a namespace URI to an XML Schema
type Schemas = [(URI,XMLSchema)]


-- | Binding from namespace prefixes to URIs
type NamespacePrefixes = [(Prefix,URI)]


-- | The namespace context is the default element namespace, the default function namespace,
-- the namespace prefixes, and the XML Schemas associated with namespaces
data NS = NS { defaultElementNS :: URI, defaultFunctionNS :: URI,
               prefixes :: NamespacePrefixes, schemas :: Schemas }
          deriving Show


tag :: String -> NS -> QName
tag s ns
    = if elem ':' s
      then case span (/= ':') s of
             (s1,_:s2) -> case lookup s1 (prefixes ns) of
                            Just u -> QName s1 u s2
                            Nothing -> error ("Undeclared element namespace: "++s1)
      else QName "" (defaultElementNS ns) s


attributeTag s ns
    = if elem ':' s
      then case span (/= ':') s of
             (s1,_:s2) -> case lookup s1 (prefixes ns) of
                            Just u -> QName s1 u s2
                            Nothing -> error ("Undeclared attribute namespace: "++s1)
      else QName "" "" s


functionTag :: String -> NS -> QName
functionTag s ns
    = if elem ':' s
      then case span (/= ':') s of
             (s1,_:s2) -> case lookup s1 (prefixes ns) of
                            Just u -> QName s1 u s2
                            Nothing -> error ("Undeclared function namespace: "++s1)
      else QName "" (defaultFunctionNS ns) s


attributes :: AttList -> NS -> Attributes
attributes al ns
    = map (\(s,v) -> (attributeTag s ns,v)) al


elementNamespaces :: AttList -> NS -> NS
elementNamespaces atts ns
    = ns { defaultElementNS = foldr (\(a,v) r -> case a of "xmlns" -> v; _ -> r) (defaultElementNS ns) atts,
           prefixes = foldr (\(a,v) r -> case splitAt 6 a of
                                           ("xmlns:",p)
                                               -> case lookup p predefinedNamespaces of
                                                    Just uri
                                                        -> if uri == v
                                                           then (p,v):r
                                                           else error ("You cannot redefine the system namespace "++p)
                                                    Nothing -> (p,v):r
                                           _ -> r) (prefixes ns) atts }


predefinedNamespaces :: NamespacePrefixes
predefinedNamespaces
    = [ ("xml", "http://www.w3.org/XML/1998/namespace"),
        ("xmlns", "http://www.w3.org/2000/xmlns/"),
        ("xsl", "http://www.w3.org/1999/XSL/Transform"),
        ("xs", "http://www.w3.org/2001/XMLSchema"),
        ("xsi", "http://www.w3.org/2001/XMLSchema-instance"),
        ("fn", "http://www.w3.org/2005/xpath-functions"),
        ("xdt", "http://www.w3.org/2005/xpath-datatypes"),
        ("local", "http://www.w3.org/2005/xquery-local-functions") ]


xsNamespace = let Just uri = lookup "xs" predefinedNamespaces in uri
fnNamespace = let Just uri = lookup "fn" predefinedNamespaces in uri


initialNS = NS { defaultElementNS="", defaultFunctionNS=fnNamespace,
                 prefixes=predefinedNamespaces, schemas=[] }


{--------------- Build an XTree from the XML stream ----------------------------}


type Stream = [XMLEvent]

noParentError = XError "Undefined parent reference"


statistics = id
{- Collect statistics about the timing of I/O events
statistics n
    = if mod n 1000 == 0
      then unsafePerformIO (do t <- getCPUTime
                               putStr $ show (div n 1000)
                               putStrLn $ " " ++ show (div t (10^6))
                               return n)
      else n
-}


-- Lazily materialize the SAX stream into a DOM tree without setting parent references.
materializeWithoutParent :: Int -> Stream -> NS -> XTree
materializeWithoutParent level stream ns
    = XElem documentRootTag [] 1 noParentError
            (if level<=1
             then [head (filter (\x -> case x of XElem _ _ _ _ _ -> True; _ -> False)
                                ((\(x,_,_)->x) (ml "" stream 2 ns)))]
             else ((\(x,_,_)->x) (mdl stream 2 level [] ns)))
      where md s@(x@(StartEvent n atts):xs) i level ls ns
                | level == 0
                    = let f [] i ns = m s i ns
                          f ((StartEvent n atts):ls) i ns
                              = let ns' = elementNamespaces atts ns
                                    (e,xs',i') = (f ls $! i+1) $! ns'
                                in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError [e],xs',i')
                          (e,xs',i') = (f $! (reverse ls)) i ns
                      in (e,xs',i',ls)
                | otherwise
                    = (((md xs $! i+1) $! level-1) $! x:ls) ns
            md ((EndEvent n):xs) i level (x:ls) ns
                = (md xs i $! level+1) ls ns
            md (_:xs) i level ls ns
                    = md xs i level ls ns
            md [] _ _ _ _ = (XText "",[],0,[])
            mdl xs@(_:_) i level ls ns
                = let (e,xs',i',ls') = md xs i level ls ns
                      (el,xs'',i'') = mdl xs' i' 0 ls' ns
                  in (e:el,xs'',i'')
            mdl [] _ _ _ _ = ([],[],0)
            m ((TextEvent t):xs) i _ = (XText t,xs,i)
            m ((EmptyEvent n atts):xs) i ns
                = let ns' = elementNamespaces atts ns
                  in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError [],xs,i+1)
            m ((StartEvent n atts):xs) i ns
                = let ns' = elementNamespaces atts ns
                      (el,xs',i') = (ml n xs $! i+1) $! ns'
                  in (((XElem $! (tag n ns')) $! (attributes atts ns')) i noParentError el,xs',i')
            m ((PIEvent n s):xs) i _ = (XPI n s,xs,i)
            m ((CommentEvent s):xs) i _ = (XComment s,xs,i)
            m ((GERefEvent n):xs) i _ = (XGERef n,xs,i)
            m ((ErrorEvent s):xs) i _ = (XError s,xs,i)
            m (_:xs) i _ = (XError "Unrecognized XML event",xs,i)
            m [] i _ = (XError "Unterminated element",[],i)
            ml _ [] i _ = ([],[],i)
            ml tag ((EndEvent n):xs) i ns
                | n==tag = ([],xs,i)
                | otherwise = error $ "Unmatched tags: <"++show tag++"></"++n++">"
            ml tag xs i ns
                = let (e,xs',i') = m xs i ns
                      (el,xs'',i'') = ml tag xs' i' ns
                  in (e:el,xs'',i'')


-- Lazily materialize the SAX stream into a DOM tree setting parent references.
-- It causes space leaks for large documents.
-- Used only if the query has backward steps that cannot be eliminated.
materializeWithParent :: Int -> Stream -> NS -> XTree
materializeWithParent level stream ns = root
    where root = XElem documentRootTag [] 1 (XError "Trying to access the root parent")
                       (if level<=1
                        then [head (filter (\x -> case x of XElem _ _ _ _ _ -> True; _ -> False)
                                           ((\(x,_,_)->x) (ml "" stream 2 root ns)))]
                        else ((\(x,_,_)->x) (mdl stream 2 level root [] ns)))
          md s@(x@(StartEvent n atts):xs) i level p ls ns
              | level == 0
                  = let f [] i p ns = m s i p ns
                        f ((StartEvent n atts):ls) i p ns
                            = let ns' = elementNamespaces atts ns
                                  (e,xs',i') = (f ls $! i+1) x $! ns'
                                  x = ((XElem $! (tag n ns')) $! (attributes atts ns')) i p [e]
                              in (x,xs',i')
                        (e,xs',i') = (f $! (reverse ls)) i p ns
                    in (e,xs',i',ls)
              | otherwise
                  = (((md xs $! i+1) $! level-1) p $! x:ls) ns
          md ((EndEvent n):xs) i level p (x:ls) ns
              = (md xs i $! level+1) p ls ns
          md (_:xs) i level p ls ns
              = md xs i level p ls ns
          md [] _ _ _ _ _ = (XText "",[],0,[])
          mdl xs@(_:_) i level p ls ns
              = let (e,xs',i',ls') = md xs i level p ls ns
                    (el,xs'',i'') = mdl xs' i' 0 p ls' ns
                in (e:el,xs'',i'')
          mdl [] _ _ _ _ _ = ([],[],0)
          m ((TextEvent t):xs) i _ _ = (XText t,xs,i)
          m ((EmptyEvent n atts):xs) i p ns
              = let ns' = elementNamespaces atts ns
                in (((XElem $! (tag n ns')) $! (attributes atts ns')) i p [],xs,i+1)
          m ((StartEvent n atts):xs) i p ns
              = let ns' = elementNamespaces atts ns
                    (el,xs',i') = (ml n xs $! (i+1)) node $! ns'
                    node = ((XElem $! (tag n ns')) $!  (attributes atts ns')) i p el
                in (node,xs',i')
          m ((PIEvent n s):xs) i _ _ = (XPI n s,xs,i)
          m ((CommentEvent s):xs) i _ _ = (XComment s,xs,i)
          m ((GERefEvent n):xs) i _ _ = (XGERef n,xs,i)
          m ((ErrorEvent s):xs) i _ _ = (XError s,xs,i)
          m (_:xs) i _ _ = (XError "unrecognized XML event",xs,i)
          m [] i _ _ = (XError "unbalanced tags",[],i)
          ml _ [] i _ _ = ([],[],i)
          ml tag ((EndEvent n):xs) i _ _
                | n==tag = ([],xs,i)
                | otherwise = error $ "Unmatched tags: <"++show tag++"></"++n++">"
          ml tag xs i p ns
              = let (e,xs',i') = m xs i p ns
                    (el,xs'',i'') = ml tag xs' i' p ns
                in (e:el,xs'',i'')


-- It looks at the first maxStreamingUnitSize events in the stream to determine
-- the minimum nesting depth. If this minimum depth is greater than 1, then
-- the stream will be chopped into a list of XML elements at this depth level,
-- where each XML element corresponds to at most maxStreamingUnitSize events.
-- Otherwise, the cache requirenments would be the size of the largest root child
docLevel :: Stream -> Int -> Int -> Int -> Int
docLevel stream i level minLevel
    = if i < 0
      then minLevel
      else case stream of
             ((StartEvent n _):xs)
                 -> ((docLevel xs $! i-1) $! level+1) minLevel
             ((EndEvent n):xs)
                 -> ((docLevel xs $! i-1) $! level-1) $! min minLevel $! (level-1)
             (_:xs) -> (docLevel xs $! i-1) level minLevel
             _ -> 0


-- | Maximum number of events that each streaming unit corresponds to
maxStreamingUnitSize :: Int
maxStreamingUnitSize = 10000


materialize :: Bool -> Stream -> XTree
materialize withParent stream
    = let level = docLevel stream maxStreamingUnitSize 0 1000000
      in if withParent
         then materializeWithParent level stream initialNS
         else materializeWithoutParent level stream initialNS