-- ------------------------------------------------------------

{- |
   Module     : Text.XML.HXT.Parser.XmlTokenParser
   Copyright  : Copyright (C) 2005 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : experimental
   Portability: portable
   Version    : $Id: XmlTokenParser.hs,v 1.3 2005/09/02 17:09:39 hxml Exp $

   Parsec parser for XML tokens

-}

-- ------------------------------------------------------------

module Text.XML.HXT.Parser.XmlTokenParser
    ( allBut
    , allBut1
    , asciiLetter
    , attrValue
    , bar
    , charRef
    , comma
    , dq
    , encName
    , entityRef
    , entityValue
    , eq
    , gt
    , keyword
    , keywords
    , lpar
    , lt
    , name
    , names
    , ncName
    , nmtoken
    , nmtokens
    , peReference
    , pubidLiteral
    , qName
    , quoted
    , reference
    , rpar
    , semi
    , separator
    , singleChar
    , singleChars
    , skipS
    , skipS0
    , sPace
    , sPace0
    , sq
    , systemLiteral
    , versionNum

    , concRes
    , mkList

    , nameT
    , nmtokenT

    , entityValueT
    , entityTokensT
    , entityCharT

    , attrValueT
    , attrValueT'

    , referenceT
    , charRefT
    , entityRefT
    , peReferenceT

    , singleCharsT
    )
where

import Text.ParserCombinators.Parsec

import Text.XML.HXT.DOM.Interface

import Text.XML.HXT.DOM.XmlNode
    ( mkDTDElem
    , mkText
    , mkCharRef
    , mkEntityRef
    )

import Text.XML.HXT.DOM.Unicode
    ( isXmlChar
    , intToCharRef
    , intToCharRefHex
    )

import Text.XML.HXT.DOM.Util
    ( hexStringToInt
    , decimalStringToInt
    )

import Text.XML.HXT.Parser.XmlCharParser
    ( xmlNameChar
    , xmlNameStartChar
    , xmlNCNameChar
    , xmlNCNameStartChar
    , xmlSpaceChar
    )

-- ------------------------------------------------------------
--
-- Character (2.2) and White Space (2.3)
--
-- Unicode parsers in module XmlCharParser

-- ------------------------------------------------------------

sPace		:: GenParser Char state String
sPace
    = many1 xmlSpaceChar

sPace0		:: GenParser Char state String
sPace0
    = many xmlSpaceChar

skipS		:: GenParser Char state ()
skipS
    = skipMany1 xmlSpaceChar

skipS0		:: GenParser Char state ()
skipS0
    = skipMany xmlSpaceChar

-- ------------------------------------------------------------
--
-- Names and Tokens (2.3)

asciiLetter		:: GenParser Char state Char
asciiLetter
    = satisfy (\ c -> ( c >= 'A' && c <= 'Z' ||
			c >= 'a' && c <= 'z' )
	      )
      <?> "ASCII letter"

name		:: GenParser Char state String
name
    = try ( do
	    s1 <- xmlNameStartChar
	    sl <- many xmlNameChar
	    return (s1 : sl)
	  )
      <?> "Name"

-- Namespaces in XML: Rules [4-5] NCName:

ncName		:: GenParser Char state String
ncName
    = try ( do
	    s1 <- xmlNCNameStartChar
	    sl <- many xmlNCNameChar
	    return (s1 : sl)
	  )
      <?> "NCName"

-- Namespaces in XML: Rules [6-8] QName:

qName		:: GenParser Char state (String, String)
qName
    = do
      s1 <- ncName
      s2 <- option "" ( do
			char ':'
			ncName
		      )
      return ( if null s2 then (s2, s1) else (s1, s2) )

nmtoken		:: GenParser Char state String
nmtoken
    = try (many1 xmlNameChar)
      <?> "Nmtoken"

names		:: GenParser Char state [String]
names
    = sepBy1 name sPace

nmtokens	:: GenParser Char state [String]
nmtokens
    = sepBy1 nmtoken sPace

-- ------------------------------------------------------------
--
-- Literals (2.3)

singleChar		:: String -> GenParser Char state Char
singleChar notAllowed
    = satisfy (\ c -> isXmlChar c && not (c `elem` notAllowed))

singleChars		:: String -> GenParser Char state String
singleChars notAllowed
    = many1 (singleChar notAllowed)

entityValue	:: GenParser Char state String
entityValue
    = attrValue

attrValueDQ	:: GenParser Char state String
attrValueDQ
    = between dq dq (concRes $ many $ attrChar "<&\"")

attrValueSQ	:: GenParser Char state String
attrValueSQ
    = between sq sq (concRes $ many $ attrChar "<&\'")

attrValue	:: GenParser Char state String
attrValue
    = ( do
	v <- attrValueDQ
	return ("\"" ++ v ++ "\"")
      )
      <|>
      ( do
	v <- attrValueSQ
	return ("'" ++ v ++ "'")
      )
      <?> "attribute value (in quotes)"

attrChar	:: String -> GenParser Char state String
attrChar notAllowed
    = reference
      <|>
      mkList (singleChar notAllowed)
      <?> "legal attribute character or reference"


systemLiteral		:: GenParser Char state String
systemLiteral
    = between dq dq (many $ noneOf "\"")
      <|>
      between sq sq (many $ noneOf "\'")
      <?> "system literal (in quotes)"

pubidLiteral		:: GenParser Char state String
pubidLiteral
    = between dq dq (many $ pubidChar "\'")
      <|>
      between sq sq (many $ pubidChar "")
      <?> "pubid literal (in quotes)"
      where
      pubidChar		:: String -> GenParser Char state Char
      pubidChar quoteChars
	  = asciiLetter
	    <|>
	    digit
	    <|>
	    oneOf " \r\n"		-- no "\t" allowed, so xmlSpaceChar parser not used
	    <|>
	    oneOf "-()+,./:=?;!*#@$_%"
            <|>
	    oneOf quoteChars

-- ------------------------------------------------------------
--
-- Character and Entity References (4.1)

reference	:: GenParser Char state String
reference
    = ( do
	i <- charRef
	return ("&#" ++ show i ++ ";")
      )
      <|>
      ( do
        n <- entityRef
        return ("&" ++ n ++ ";")
      )
      

checkCharRef	:: Int -> GenParser Char state Int
checkCharRef i
    = if ( i <= fromEnum (maxBound::Char)
	   && isXmlChar (toEnum i)
	 )
        then return i
	else unexpected ("illegal value in character reference: " ++ intToCharRef i ++ " , in hex: " ++ intToCharRefHex i)

charRef		:: GenParser Char state Int
charRef
    = do
      try (string "&#x")
      d <- many1 hexDigit
      semi
      checkCharRef (hexStringToInt d)
      <|>
      do
      try (string "&#")
      d <- many1 digit
      semi
      checkCharRef (decimalStringToInt d)
      <?> "character reference"


entityRef	:: GenParser Char state String
entityRef
    = do
      char '&'
      n <- name
      semi
      return n
      <?> "entity reference"

peReference	:: GenParser Char state String
peReference
    = try ( do
	    char '%'
	    n <- name
	    semi
	    return n
	  )
      <?> "parameter-entity reference"

-- ------------------------------------------------------------
--
-- 4.3

encName		:: GenParser Char state String
encName
    = do
      c <- asciiLetter
      r <- many (asciiLetter <|> digit <|> oneOf "._-")
      return (c:r)

versionNum	:: GenParser Char state String
versionNum
    = many1 xmlNameChar


-- ------------------------------------------------------------
--
-- keywords

keyword		:: String -> GenParser Char state String
keyword kw
    = try ( do
	    n <- name
	    if n == kw
	      then return n
	      else unexpected n
	  )
      <?> kw

keywords	:: [String] -> GenParser Char state String
keywords
    = foldr1 (<|>) . map keyword

-- ------------------------------------------------------------
--
-- parser for quoted attribute values

quoted		:: GenParser Char state a -> GenParser Char state a
quoted p
    = between dq dq p
      <|>
      between sq sq p

-- ------------------------------------------------------------
--
-- simple char parsers

dq, sq, lt, gt, semi	:: GenParser Char state Char

dq	= char '\"'
sq	= char '\''
lt	= char '<'
gt	= char '>'
semi	= char ';'

separator	:: Char -> GenParser Char state ()
separator c
    = do
      try ( do
	    skipS0
	    char c
	  )
      skipS0
      <?> [c]

bar, comma, eq, lpar, rpar	:: GenParser Char state ()

bar	= separator '|'
comma	= separator ','
eq	= separator '='

lpar
    = do
      char '('
      skipS0

rpar
    = do
      skipS0
      char ')'
      return ()


-- ------------------------------------------------------------
--
-- all chars but not a special substring

allBut		:: (GenParser Char state Char -> GenParser Char state String) -> String -> GenParser Char state String
allBut p str
    = allBut1 p (const True) str

allBut1		:: (GenParser Char state Char -> GenParser Char state String) -> (Char -> Bool) -> String -> GenParser Char state String
allBut1 p prd (c:rest)
    = p ( satisfy (\ x -> isXmlChar x && prd x && not (x == c) )
	  <|>
	  try ( do
		char c
		notFollowedBy ( do
				try (string rest)
				return c
			      )
		return c
	      )
	)

allBut1 _p _prd str
    = error ("allBut1 _ _ " ++ show str ++ " is undefined")

-- ------------------------------------------------------------
--
-- concatenate parse results

concRes		:: GenParser Char state [[a]] -> GenParser Char state [a]
concRes p
    = do
      sl <- p
      return (concat sl)

mkList		:: GenParser Char state a -> GenParser Char state [a]
mkList p
    = do
      r <- p
      return [r]

-- ------------------------------------------------------------
--
-- token parsers returning XmlTrees
--
-- ------------------------------------------------------------
--
-- Literals (2.3)

nameT		:: GenParser Char state XmlTree
nameT
    = do
      n <- name
      return (mkDTDElem NAME [(a_name, n)] [])

nmtokenT	:: GenParser Char state XmlTree
nmtokenT
    = do
      n <- nmtoken
      return (mkDTDElem NAME [(a_name, n)] [])


entityValueT	:: GenParser Char state XmlTrees
entityValueT
    =  do
       sl <- between dq dq (entityTokensT "%&\"")
       return sl
       <|>
       do
       sl <- between sq sq (entityTokensT "%&\'")
       return sl
       <?> "entity value (in quotes)"

entityTokensT	:: String -> GenParser Char state XmlTrees
entityTokensT notAllowed
    = many (entityCharT notAllowed)

entityCharT	:: String -> GenParser Char state XmlTree
entityCharT notAllowed
    = peReferenceT
      <|>
      charRefT
      <|>
      bypassedEntityRefT
      <|>
      ( do
	cs <- many1 (singleChar notAllowed)
	return (mkText cs)
      )

attrValueT	:: GenParser Char state XmlTrees
attrValueT
    = between dq dq (attrValueT' "<&\"")
      <|>
      between sq sq (attrValueT' "<&\'")
      <?> "attribute value (in quotes)"

attrValueT'	:: String -> GenParser Char state XmlTrees
attrValueT' notAllowed
    = many ( referenceT <|> singleCharsT notAllowed)

singleCharsT	:: String -> GenParser Char state XmlTree
singleCharsT notAllowed
    = do
      cs <- singleChars notAllowed
      return (mkText cs)

-- ------------------------------------------------------------
--
-- Character and Entity References (4.1)

referenceT	:: GenParser Char state XmlTree
referenceT
    = charRefT
      <|>
      entityRefT

charRefT	:: GenParser Char state XmlTree
charRefT
    = do
      i <- charRef
      return $! (mkCharRef $! i)

entityRefT	:: GenParser Char state XmlTree
entityRefT
    = do
      n <- entityRef
      return $! (mkEntityRef $! n)

bypassedEntityRefT	:: GenParser Char state XmlTree
bypassedEntityRefT
    = do
      n <- entityRef
      return $! (mkText ("&" ++ n ++ ";"))

peReferenceT	:: GenParser Char state XmlTree
peReferenceT
    = do
      r <- peReference
      return $! (mkDTDElem PEREF [(a_peref, r)] [])

-- ------------------------------------------------------------