-- ------------------------------------------------------------

{- |
   Module     : Text.XML.HXT.Parser.XmlParsec
   Copyright  : Copyright (C) 2005 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : experimental
   Portability: portable
   Version    : $Id: XmlParsec.hs,v 1.14 2005/09/02 17:09:39 hxml Exp $

   Xml Parsec parser with pure filter interface

-}

-- ------------------------------------------------------------

module Text.XML.HXT.Parser.XmlParsec
    ( charData
    , charData'
    , comment
    , pI
    , cDSect
    , document
    , document'
    , prolog
    , xMLDecl
    , xMLDecl'
    , versionInfo
    , misc
    , doctypedecl
    , markupdecl
    , sDDecl
    , element
    , content
    , contentWithTextDecl
    , textDecl
    , encodingDecl
    , xread

    , parseXmlAttrValue
    , parseXmlContent
    , parseXmlDocEncodingSpec
    , parseXmlDocument
    , parseXmlDTDPart
    , parseXmlEncodingSpec
    , parseXmlEntityEncodingSpec
    , parseXmlGeneralEntityValue
    , parseXmlPart
    , parseXmlText

    , parseNMToken
    , parseName

    , removeEncodingSpec
    )
where

import Text.ParserCombinators.Parsec
    ( GenParser
    , Parser
    , parse
    , (<?>), (<|>)
    , char
    , string
    , eof 
    , between
    , many, many1
    , option
    , try
    , unexpected
    , getPosition
    , getInput
    , sourceName
    )

import Text.XML.HXT.DOM.ShowXml
    ( xshow
    )

import Text.XML.HXT.DOM.Interface

import Text.XML.HXT.DOM.XmlNode
    ( mkElement
    , mkAttr
    , mkRoot
    , mkDTDElem
    , mkText
    , mkCmt
    , mkCdata
    , mkError
    , mkPi
    , isText
    , isRoot
    , getText
    , getChildren
    , getAttrl
    , getAttrName
    , changeAttrl
    , mergeAttrl
    )

import Text.XML.HXT.Parser.XmlCharParser
    ( xmlChar
    )
import qualified Text.XML.HXT.Parser.XmlTokenParser	as XT
import qualified Text.XML.HXT.Parser.XmlDTDTokenParser	as XD

import Data.Char	(toLower)
import Data.Maybe

-- ------------------------------------------------------------
--
-- Character Data (2.4)

charData		:: GenParser Char state XmlTrees
charData
    = many (charData' <|> XT.referenceT)

charData'		:: GenParser Char state XmlTree
charData'
    = try ( do
	    t <- XT.allBut1 many1 (\ c -> not (c `elem` "<&")) "]]>"
	    return (mkText $! t)
	  )

-- ------------------------------------------------------------
--
-- Comments (2.5)

comment		:: GenParser Char state XmlTree

comment
    = ( do
	c <- between (try $ string "<!--") (string "-->") (XT.allBut many "--")
	return (mkCmt $! c)
      ) <?> "comment"

-- ------------------------------------------------------------
--
-- Processing Instructions

pI		:: GenParser Char state XmlTree
pI
    = between (try $ string "<?") (string "?>")
      ( do
	n <- pITarget
	p <- option "" (do
			_ <- XT.sPace
			XT.allBut many "?>"
		       )
	return $ mkPi (mkName n) [mkAttr (mkName a_value) [mkText p]]
      ) <?> "processing instruction"
      where
      pITarget	:: GenParser Char state String
      pITarget = ( do
		   n <- XT.name
		   if map toLower n == t_xml
		      then unexpected n
		      else return n
		 )

-- ------------------------------------------------------------
--
-- CDATA Sections (2.7)

cDSect		:: GenParser Char state XmlTree

cDSect
    = do
      t <- between ( try $ string "<![CDATA[") (string "]]>") (XT.allBut many "]]>")
      return (mkCdata $! t)
      <?> "CDATA section"

-- ------------------------------------------------------------
--
-- Document (2.1) and Prolog (2.8)

document	:: GenParser Char state XmlTree
document
    = do
      pos <- getPosition
      dl <- document'
      return $ mkRoot [ mkAttr (mkName a_source) [mkText (sourceName pos)]
		      , mkAttr (mkName a_status) [mkText (show c_ok)]
		      ] dl

document'	:: GenParser Char state XmlTrees
document'
    = do
      pl <- prolog
      el <- element
      ml <- many misc
      eof
      return (pl ++ [el] ++ ml)

prolog		:: GenParser Char state XmlTrees
prolog
    = do
      xml     <- option [] xMLDecl'
      misc1   <- many misc
      dtdPart <- option [] doctypedecl
      misc2   <- many misc
      return (xml ++ misc1 ++ dtdPart ++ misc2)

xMLDecl		:: GenParser Char state XmlTrees
xMLDecl
    = between (try $ string "<?xml") (string "?>")
      ( do
	vi <- versionInfo
	ed <- option [] encodingDecl
	sd <- option [] sDDecl
	XT.skipS0
	return (vi ++ ed ++ sd)
      )
      <?> "xml declaration"

xMLDecl'	:: GenParser Char state XmlTrees
xMLDecl'
    = do
      al <- xMLDecl
      return [mkPi (mkName t_xml) al]

xMLDecl''	:: GenParser Char state XmlTree
xMLDecl''
    = do
      al     <- option [] (try xMLDecl)
      return (mkRoot al [])

versionInfo	:: GenParser Char state XmlTrees
versionInfo
    = ( do
	_ <- try ( do
		   XT.skipS
		   XT.keyword a_version
		 )
	XT.eq
	vi <- XT.quoted XT.versionNum
	return [mkAttr (mkName a_version) [mkText vi]]
      )
      <?> "version info (with quoted version number)"

misc		:: GenParser Char state XmlTree
misc
    = comment
      <|>
      pI
      <|>
      ( ( do
	  ws <- XT.sPace
	  return (mkText ws)
	) <?> ""
      )

-- ------------------------------------------------------------
--
-- Document Type definition (2.8)

doctypedecl	:: GenParser Char state XmlTrees
doctypedecl
    = between (try $ string "<!DOCTYPE") (char '>')
      ( do
	XT.skipS
	n <- XT.name
	exId <- option [] ( try ( do
				  XT.skipS
				  externalID
				)
			  )
	XT.skipS0
	markup <- option []
	          ( do
		    m <- between (char '[' ) (char ']') markupOrDeclSep
		    XT.skipS0
		    return m
		  )
	return [mkDTDElem DOCTYPE ((a_name, n) : exId) markup]
      )

markupOrDeclSep	:: GenParser Char state XmlTrees
markupOrDeclSep
    = ( do
	ll <- many ( markupdecl
		     <|>
		     declSep
		     <|>
		     XT.mkList conditionalSect
		   )
	return (concat ll)
      )

declSep		:: GenParser Char state XmlTrees
declSep
    = XT.mkList XT.peReferenceT
      <|>
      ( do
	XT.skipS
	return []
      )

markupdecl	:: GenParser Char state XmlTrees
markupdecl
    = XT.mkList
      ( pI
	<|>
	comment
	<|>
	XD.dtdDeclTokenizer
      )

-- ------------------------------------------------------------
--
-- Standalone Document Declaration (2.9)

sDDecl		:: GenParser Char state XmlTrees
sDDecl
    = do
      _ <- try (do
		XT.skipS
		XT.keyword a_standalone
	       )
      XT.eq
      sd <- XT.quoted (XT.keywords [v_yes, v_no])
      return [mkAttr (mkName a_standalone) [mkText sd]]

-- ------------------------------------------------------------
--
-- element, tags and content (3, 3.1)

element		:: GenParser Char state XmlTree
element
    = ( do
	e <- elementStart
	elementRest e
      ) <?> "element"

elementStart		:: GenParser Char state (String, [(String, XmlTrees)])
elementStart
    = do
      n <- ( try ( do
		   _ <- char '<'
		   n <- XT.name
		   return n
		 )
	     <?> "start tag"
	   )
      ass <- attrList
      XT.skipS0
      return (n, ass)
      where
      attrList
	  = option [] ( do
			XT.skipS
			attrList'
		      )
      attrList'
	  = option [] ( do
			a1 <- attribute
			al <- attrList
			let (n, _v) = a1
			if isJust . lookup n $ al
			  then unexpected ("attribute name " ++ show n ++ " occurs twice in attribute list")
			  else return (a1 : al)
		      )

elementRest	:: (String, [(String, XmlTrees)]) -> GenParser Char state XmlTree
elementRest (n, al)
    = ( do
	_ <- try $ string "/>"
	return $! (mkElement (mkName n) (map (mkA $!) al) [])
      )
      <|>
      ( do
	_ <- XT.gt
	c <- content
	eTag n
	return $! (mkElement (mkName n) (map (mkA $!) al) $! c)
      )
      <?> "proper attribute list followed by \"/>\" or \">\""
    where
    mkA (n', ts') = mkAttr (mkName n') ts'

eTag		:: String -> GenParser Char state ()
eTag n'
    = do
      _ <- try ( string "</" ) <?> ""
      n <- XT.name
      XT.skipS0
      _ <- XT.gt
      if n == n'
	 then return ()
	 else unexpected ("illegal end tag </" ++ n ++ "> found, </" ++ n' ++ "> expected")

attribute	:: GenParser Char state (String, XmlTrees)
attribute
    = do
      n <- XT.name
      XT.eq
      v <- XT.attrValueT
      return (n, v)

content		:: GenParser Char state XmlTrees
content
    = do
      c1 <- charData
      cl <- many
	    ( do
	      l <- ( element
		     <|>
		     cDSect
		     <|>
		     pI
		     <|>
		     comment
		   )
	      c <- charData
	      return (l : c)
	    )
      return (c1 ++ concat cl)

contentWithTextDecl	:: GenParser Char state XmlTrees
contentWithTextDecl
    = do
      _ <- option [] textDecl
      content

-- ------------------------------------------------------------
--
-- Conditional Sections (3.4)
--
-- conditional sections are parsed in two steps,
-- first the whole content is detected,
-- and then, after PE substitution include sections are parsed again

conditionalSect		:: GenParser Char state XmlTree
conditionalSect
    = do
      _ <- try $ string "<!["
      cs <- many XD.dtdToken
      _ <- char '['
      sect <- condSectCont
      return (mkDTDElem CONDSECT [(a_value, sect)] cs)
    where

    condSectCont	:: GenParser Char state String
    condSectCont
	= ( do
	    _ <- try $ string "]]>"
	    return ""
	  )
          <|>
	  ( do
	    _ <- try $ string "<!["
	    cs1 <- condSectCont
	    cs2 <- condSectCont
	    return ("<![" ++ cs1 ++ "]]>" ++ cs2)
	  )
	  <|>
	  ( do
	    c  <- xmlChar
	    cs <- condSectCont
	    return (c : cs)
	  )

-- ------------------------------------------------------------
--
-- External Entities (4.2.2)

externalID	:: GenParser Char state Attributes
externalID
    = ( do
	_ <- XT.keyword k_system
	XT.skipS
	lit <- XT.systemLiteral
	return [(k_system, lit)]
      )
      <|>
      ( do
	_ <- XT.keyword k_public
	XT.skipS
	pl <- XT.pubidLiteral
	XT.skipS
	sl <- XT.systemLiteral
	return [ (k_system, sl)
	       , (k_public, pl) ]
      )
      <?> "SYSTEM or PUBLIC declaration"

-- ------------------------------------------------------------
--
-- Text Declaration (4.3.1)

textDecl	:: GenParser Char state XmlTrees
textDecl
    = between (try $ string "<?xml") (string "?>")
      ( do
	vi <- option [] versionInfo
	ed <- encodingDecl
	XT.skipS0
	return (vi ++ ed)
      )
      <?> "text declaration"


textDecl''	:: GenParser Char state XmlTree
textDecl''
    = do
      al    <- option [] (try textDecl)
      return (mkRoot al [])

-- ------------------------------------------------------------
--
-- Encoding Declaration (4.3.3)

encodingDecl	:: GenParser Char state XmlTrees
encodingDecl
    = do
      _ <- try ( do
		 XT.skipS
		 XT.keyword a_encoding
	       )
      XT.eq
      ed <- XT.quoted XT.encName
      return [mkAttr (mkName a_encoding) [mkText ed]]

-- ------------------------------------------------------------
--
-- the main entry points:
--	parsing the content of a text node
--	or parsing the text children from a tag node

-- |
-- the inverse function to 'xshow', (for XML content).
--
-- the string parameter is parsed with the XML content parser.
-- result is the list of trees or in case of an error a single element list with the
-- error message as node. No entity or character subtitution is done.
--
-- see also: 'parseXmlContent'

xread			:: String -> XmlTrees
xread str
    = parseXmlFromString parser loc str
    where
    loc = "string: " ++ show (if length str > 40 then take 40 str ++ "..." else str)
    parser = do
	     res <- content		-- take the content parser for parsing the string
	     eof			-- and test on everything consumed
	     return res

-- |
-- the filter version of 'xread'

parseXmlContent		:: XmlTree -> XmlTrees
parseXmlContent
    = xread . xshow . (:[])

-- |
-- a more general version of 'parseXmlContent'.
-- The parser to be used and the context are extra parameter

parseXmlText		:: Parser XmlTrees -> String -> XmlTree -> XmlTrees
parseXmlText p loc	= parseXmlFromString p loc . xshow . (:[])

parseXmlDocument	:: String -> String -> XmlTrees
parseXmlDocument	= parseXmlFromString document'


parseXmlFromString	:: Parser XmlTrees -> String -> String -> XmlTrees
parseXmlFromString parser loc
    = either ((:[]) . mkError c_err . (++ "\n") . show) id . parse parser loc

-- ------------------------------------------------------------
--

removeEncodingSpec	:: XmlTree -> XmlTrees
removeEncodingSpec t
    | isText t
	= ( either ((:[]) . mkError c_err . (++ "\n") . show) ((:[]) . mkText)
	    . parse parser "remove encoding spec"
	    . fromMaybe ""
	    . getText
	  ) t
    | otherwise
	= [t]
    where
    parser :: Parser String
    parser = do
	     _ <- option [] textDecl
	     getInput

-- ------------------------------------------------------------

-- |
-- general parser for parsing arbitray parts of a XML document

parseXmlPart	:: Parser XmlTrees -> String -> String -> XmlTree -> XmlTrees
parseXmlPart parser expected context t
    = parseXmlText ( do
		     res <- parser
		     eof <?> expected
		     return res
		   ) context
      $ t

-- ------------------------------------------------------------

-- |
-- Parser for parts of a DTD

parseXmlDTDPart	:: String -> XmlTree -> XmlTrees
parseXmlDTDPart
    = parseXmlPart markupOrDeclSep "markup declaration"

-- ------------------------------------------------------------

-- |
-- Parser for general entites

parseXmlGeneralEntityValue	:: String -> XmlTree -> XmlTrees
parseXmlGeneralEntityValue
    = parseXmlPart content "general entity value"

-- ------------------------------------------------------------

-- |
-- Parser for attribute values

parseXmlAttrValue	:: String -> XmlTree -> XmlTrees
parseXmlAttrValue
    = parseXmlPart (XT.attrValueT' "<&") "attribute value"

-- ------------------------------------------------------------

-- |
-- Parser for NMTOKENs

parseNMToken		:: String -> XmlTree -> XmlTrees
parseNMToken
    = parseXmlPart (many1 XT.nmtokenT) "nmtoken"

-- ------------------------------------------------------------

-- |
-- Parser for XML names

parseName		:: String -> XmlTree -> XmlTrees
parseName
    = parseXmlPart (many1 XT.nameT) "name"

-- ------------------------------------------------------------

-- |
-- try to parse a xml encoding spec.
--
--
--    * 1.parameter encParse :  the parser for the encoding decl
--
--    - 2.parameter root :  a document root
--
--    - returns : the same tree, but with an additional
--			  attribute \"encoding\" in the root node
--			  in case of a valid encoding spec
--			  else the unchanged tree

parseXmlEncodingSpec	:: Parser XmlTree -> XmlTree -> XmlTrees
parseXmlEncodingSpec encDecl x
    = (:[]) .
      ( if isRoot x
	then parseEncSpec
	else id
      ) $ x
    where
    parseEncSpec r
	= case ( parse encDecl source . xshow . getChildren $ r ) of
	  Right t
	      -> changeAttrl (mergeAttrl . fromMaybe [] . getAttrl $ t) r
	  Left _
	      -> r
	where
	-- arrow \"getAttrValue a_source\" programmed on the tree level (oops!)
	source = xshow . concat . map getChildren . filter ((== a_source) . maybe "" qualifiedName . getAttrName) . fromMaybe [] . getAttrl $ r

parseXmlEntityEncodingSpec	:: XmlTree -> XmlTrees
parseXmlEntityEncodingSpec	= parseXmlEncodingSpec textDecl''

parseXmlDocEncodingSpec		:: XmlTree -> XmlTrees
parseXmlDocEncodingSpec		= parseXmlEncodingSpec xMLDecl''

-- ------------------------------------------------------------