{- 
    Copyright 2009 Mario Blazevic

    This file is part of the Streaming Component Combinators (SCC) project.

    The SCC project is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later
    version.

    SCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along with SCC.  If not, see
    <http://www.gnu.org/licenses/>.
-}

-- | Module "XMLComponents" defines primitive components for parsing and manipulating XML.

{-# LANGUAGE DeriveDataTypeable, PatternGuards #-}

module Control.Concurrent.SCC.XMLComponents (
-- * Types
Token (..),
-- * Parsing XML
tokens, parseTokens, expandEntity,
-- * Showing XML
escapeAttributeCharacter, escapeContentCharacter,
-- * Splitters
element, elementContent, elementName, attribute, attributeName, attributeValue,
-- * Splitter combinators
elementHavingTag, havingText, havingOnlyText
)
where

import Control.Exception (assert)
import Control.Monad (liftM, when)
import Data.Char
import Data.Dynamic (Typeable)
import qualified Data.Map as Map
import Data.Maybe (fromJust, isJust, mapMaybe)
import Data.List (find, stripPrefix)
import qualified Data.Sequence as Seq
import Data.Sequence ((|>))
import Numeric (readDec, readHex)
import Debug.Trace (trace)

import Control.Concurrent.SCC.Foundation
import Control.Concurrent.SCC.ComponentTypes
import Control.Concurrent.SCC.Components (unparse)
import Control.Concurrent.SCC.Combinators ((>->), groupMarks, having, havingOnly, parseNestedRegions, splitterToMarker)


data Token = StartTag | EndTag | EmptyTag
           | ElementName | AttributeName | AttributeValue
           | EntityReferenceToken | EntityName
           | ProcessingInstruction | ProcessingInstructionText
           | Comment | CommentText
           | StartMarkedSectionCDATA | EndMarkedSection
           | ErrorToken String
             deriving (Eq, Show, Typeable)

-- | Escapes a character for inclusion into an XML attribute value.
escapeAttributeCharacter :: Char -> String
escapeAttributeCharacter '"' = "&quot;"
escapeAttributeCharacter '\t' = "&#9;"
escapeAttributeCharacter '\n' = "&#10;"
escapeAttributeCharacter '\r' = "&#13;"
escapeAttributeCharacter x = escapeContentCharacter x

-- | Escapes a character for inclusion into the XML data content.
escapeContentCharacter :: Char -> String
escapeContentCharacter '<' = "&lt;"
escapeContentCharacter '&' = "&amp;"
escapeContentCharacter x = [x]

-- | Converts an XML entity name into the text value it represents: @expandEntity \"lt\" = \"<\"@.
expandEntity :: String -> String
expandEntity "lt" = "<"
expandEntity "gt" = ">"
expandEntity "quot" = "\""
expandEntity "apos" = "'"
expandEntity "amp" = "&"
expandEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)]
expandEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)]

isNameStart x = isLetter x || x == '_'
isNameChar x = isAlphaNum x || x == '_' || x == '-'

-- | The 'tokens' splitter distinguishes XML markup from data content. It is used by 'parseTokens'.
tokens :: (ParallelizableMonad m) => Splitter m Char (Boundary Token)
tokens = liftAtomicSplitter "XML.tokens" 1 $
         \source true false edge->
         let getContent = get source
                          >>= maybe (return []) content
             content '<' = get source
                           >>= maybe (return "<") (\x-> tag x >> get source >>= maybe (return []) content)
             content '&' = entity >> next content
             content x = put false x
                         >>= cond getContent (return [x])
             tag '?' = put edge (Start ProcessingInstruction)
                       >> putList "<?" true
                       >>= whenNull (put edge (Start ProcessingInstructionText)
                                     >> processingInstruction)
             tag '!' = dispatchOnString source
                          (\other-> put edge (Point (ErrorToken ("Expecting <![CDATA[ or <!--, received "
                                                                 ++ show ("<![" ++ other))))
                                    >> return ("<!" ++ other))
                          [("--",
                            \match-> put edge (Start Comment)
                                     >> putList match true
                                     >>= whenNull (put edge (Start CommentText)
                                                   >> comment)),
                           ("[CDATA[",
                            \match-> put edge (Start StartMarkedSectionCDATA)
                                     >> putList match true
                                     >>= whenNull (put edge (End StartMarkedSectionCDATA)
                                                   >> markedSection))]
             tag '/' = {-# SCC "EndTag" #-}
                       do put edge (Start EndTag)
                          put true '<'
                          put true '/'
                          x <- next (name ElementName)
                          put true x
                          when (x /= '>') (put edge (Point (ErrorToken ("Invalid character " ++ show x ++ " in end tag")))
                                           >> return ())
                          put edge (End EndTag)
                          return []
             tag x | isNameStart x
                   = {-# SCC "StartTag" #-}
                     do put edge (Start StartTag)
                        put true '<'
                        y <- name ElementName x
                        z <- attributes y
                        w <- if z == '/'
                                then put true z >> put edge (Point EmptyTag) >> get' source
                                else return z
                        put true w
                        when (w /= '>') (put edge (Point (ErrorToken ("Invalid character " ++ show w
                                                                      ++ " in start tag")))
                                         >> return ())
                        put edge (End StartTag)
                        return []
             attributes x | isSpace x = put true x >> next attributes
             attributes x | isNameStart x = do y <- name AttributeName x
                                               when (y /= '=') (put edge (Point (ErrorToken ("Invalid character " ++ show y
                                                                                             ++ " following attribute name")))
                                                                >> return ())
                                               q <- if y == '"' || y == '\'' then return y else put true y >> get' source
                                               when
                                                  (q /= '"' && q /= '\'')
                                                  (put edge (Point (ErrorToken ("Invalid quote character " ++ show q)))
                                                   >> return ())
                                               put true q
                                               put edge (Start AttributeValue)
                                               next (attributeValue q)
                                               next attributes
             attributes x = return x
             attributeValue q x | q == x = do put edge (End AttributeValue)
                                              put true x
             attributeValue q '<' = do put edge (Start (ErrorToken "Invalid character '<' in attribute value."))
                                       put true '<'
                                       put edge (End (ErrorToken "Invalid character '<' in attribute value."))
                                       next (attributeValue q)
             attributeValue q '&' = entity >> next (attributeValue q)
             attributeValue q x = put true x >> next (attributeValue q)
             processingInstruction = {-# SCC "PI" #-}
                                     dispatchOnString source
                                        (\other-> if null other
                                                  then (put edge (Point (ErrorToken "Unterminated processing instruction"))
                                                        >> return [])
                                                  else putList other true >>= whenNull processingInstruction)
                                        [("?>",
                                          \match-> put edge (End ProcessingInstructionText)
                                                   >> putList match true
                                                   >>= whenNull (put edge (End ProcessingInstruction)
                                                                 >> getContent))]
             comment = {-# SCC "comment" #-}
                       dispatchOnString source
                          (\other-> if null other
                                    then (put edge (Point (ErrorToken "Unterminated comment"))
                                          >> return [])
                                    else putList other true >>= whenNull comment)
                          [("-->",
                            \match-> put edge (End CommentText)
                                     >> putList match true
                                     >>= whenNull (put edge (End Comment)
                                                   >> getContent))]
             markedSection = {-# SCC "<![CDATA[" #-}
                             dispatchOnString source
                                (\other-> if null other
                                          then (put edge (Point (ErrorToken "Unterminated marked section"))
                                                >> return [])
                                          else putList other true >>= whenNull markedSection)
                                [("]]>",
                                  \match-> put edge (Start EndMarkedSection)
                                           >> putList match true
                                           >>= whenNull (put edge (End EndMarkedSection)
                                                         >> getContent))]
             entity = do put edge (Start EntityReferenceToken)
                         put true '&'
                         x <- next (name EntityName)
                         when (x /= ';') (put edge (Point (ErrorToken ("Invalid character " ++ show x
                                                                       ++ " ends entity name.")))
                                          >> return ())
                         put true x
                         put edge (End EntityReferenceToken)
             name token x | isNameStart x = {-# SCC "name" #-} 
                                            do put edge (Start token)
                                               put true x
                                               next (nameTail token)
             name _ x = do put edge (Point (ErrorToken ("Invalid character " ++ show x ++ " in attribute value.")))
                           return x
             nameTail token x = if isNameChar x || x == ':'
                                then put true x >> next (nameTail token)
                                else put edge (End token) >> return x
             next f = {-# SCC "next" #-} get' source >>= f
         in getContent

-- | The XML token parser. This parser converts plain text to parsed text, which is a precondition for using the
-- remaining XML components.
parseTokens :: (ParallelizableMonad m) => Parser m Char Token
parseTokens = parseNestedRegions tokens

dispatchOnString :: Monad m => Source c Char -> (String -> Pipe c m r) -> [(String, String -> Pipe c m r)] -> Pipe c m r
dispatchOnString source failure fullCases = dispatch fullCases id
   where dispatch cases consumed
            = case find (null . fst) cases
              of Just ("", rhs) -> rhs (consumed "")
                 Nothing -> get source
                            >>= maybe
                                   (failure (consumed ""))
                                   (\x-> case mapMaybe (startingWith x) cases
                                         of [] -> failure (consumed [x])
                                            subcases -> dispatch (subcases ++ fullCases) (consumed . (x :)))
         startingWith x (y:rest, rhs) | x == y = Just (rest, rhs)
                                      | otherwise = Nothing

getElementName :: Monad m => Source c (Markup Char Token) -> ([Markup Char Token] -> [Markup Char Token])
               -> Pipe c m ([Markup Char Token], Maybe String)
getElementName source f = get source
                          >>= maybe
                                 (return (f [], Nothing))
                                 (\x-> case x of Markup (Start ElementName) -> getRestOfRegion ElementName source (f . (x:)) id
                                                 Markup (Point ErrorToken{}) -> getElementName source (f . (x:))
                                                 Content{} -> getElementName source (f . (x:))
                                                 _ -> error ("Expected an ElementName, received " ++ show x))

getRestOfRegion :: Monad m => Token -> Source c (Markup Char Token)
                -> ([Markup Char Token] -> [Markup Char Token]) -> (String -> String)
                -> Pipe c m ([Markup Char Token], Maybe String)
getRestOfRegion token source f g = get source
                                   >>= maybe
                                          (return (f [], Nothing))
                                          (\x-> case x of Markup (End token) -> return (f [x], Just (g ""))
                                                          Content y -> getRestOfRegion token source (f . (x:)) (g . (y:))
                                                          _ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x))

pourRestOfRegion :: Monad m
                    => Token -> Source c (Markup Char Token) -> Sink c (Markup Char Token) -> Sink c (Markup Char Token)
                             -> Pipe c m (Maybe [Markup Char Token])
pourRestOfRegion token source sink endSink
   = get source
     >>= maybe
            (return $ Just [])
            (\x-> case x
                  of Markup (End token') | token == token' -> put endSink x
                                                              >>= cond (return Nothing) (return $ Just [x])
                     Content y -> put sink x
                                  >>= cond (pourRestOfRegion token source sink endSink) (return $ Just [x])
                     _ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x))

pourRestOfTag :: Monad m => Source c (Markup Char Token) -> Sink c (Markup Char Token) -> Pipe c m Bool
pourRestOfTag source sink = get source
                            >>= maybe
                                   (return True)
                                   (\x-> put sink x
                                         >> case x of Markup (End StartTag) -> return True
                                                      Markup (End EndTag) -> return True
                                                      Markup (Point EmptyTag) -> pourRestOfTag source sink >> return False
                                                      _ -> pourRestOfTag source sink)

findEndTag :: Monad m => Source c (Markup Char Token) -> Sink c (Markup Char Token) -> Sink c (Markup Char Token) -> String
           -> Pipe c m [Markup Char Token]
findEndTag source sink endSink name = find where
   find = get source
          >>= maybe
                 (return [])
                 (\x-> case x
                       of Markup (Start EndTag) -> do (tokens, mn) <- getElementName source (x :)
                                                      maybe
                                                         (return tokens)
                                                         (\name'-> if name == name'
                                                                   then putList tokens endSink
                                                                        >>= whenNull
                                                                               (pourRestOfTag source endSink
                                                                                >> return [])
                                                                   else putList tokens sink
                                                                        >>= whenNull
                                                                               (pourRestOfTag source sink
                                                                                >> find))
                                                         mn
                          Markup (Start StartTag) -> do (tokens, mn) <- getElementName source (x :)
                                                        maybe
                                                           (return tokens)
                                                           (\name'-> putList tokens sink
                                                                     >>= whenNull
                                                                            (if name == name'
                                                                             then pourRestOfTag source sink
                                                                                  >>= cond
                                                                                         (findEndTag source sink sink name)
                                                                                         (return [])
                                                                                  >>= whenNull find
                                                                             else pourRestOfTag source sink
                                                                                  >> find))
                                                           mn
                          _ -> put sink x
                               >>= cond find (return [x]))

findStartTag :: Monad m => Source c (Markup Char Token) -> Sink c (Markup Char Token)
             -> Pipe c m (Either [Markup Char Token] (Markup Char Token))
findStartTag source sink = get source
                           >>= maybe
                                  (return $ Left [])
                                  (\x-> case x of Markup (Start StartTag) -> return $ Right x
                                                  _ -> put sink x
                                                       >>= cond (findStartTag source sink) (return $ Left [x]))

-- | Splits all top-level elements with all their content to /true/, all other input to /false/.
element :: (Monad m) => Splitter m (Markup Char Token) ()
element = liftAtomicSplitter "element" 1 $
          \source true false edge->
          let split0 = findStartTag source false
                       >>= either return
                              (\x-> put edge ()
                                    >> put true x
                                    >>= cond
                                           (do (tokens, mn) <- getElementName source id
                                               maybe
                                                  (putList tokens true)
                                                  (\name-> putList tokens true
                                                           >>= whenNull
                                                                  (pourRestOfTag source true
                                                                   >>= cond
                                                                          (split1 name)
                                                                          split0))
                                                  mn)
                                           (return [x]))
              split1 name = findEndTag source true true name
                            >>= whenNull split0
          in split0

-- | Splits the content of all top-level elements to /true/, their tags and intervening input to /false/.
elementContent :: (Monad m) => Splitter m (Markup Char Token) ()
elementContent = liftAtomicSplitter "elementContent" 1 $
                 \source true false edge->
                 let split0 = findStartTag source false
                              >>= either return
                                     (\x-> put false x
                                           >>= cond
                                                  (do (tokens, mn) <- getElementName source id
                                                      maybe
                                                         (putList tokens false)
                                                         (\name-> putList tokens false
                                                                  >>= whenNull (pourRestOfTag source false
                                                                                >>= cond
                                                                                       (put edge ()
                                                                                        >> split1 name)
                                                                                       split0))
                                                         mn)
                                                  (return [x]))
                     split1 name = findEndTag source true false name
                                   >>= whenNull split0
                 in split0

-- | Similiar to @('Control.Concurrent.SCC.Combinators.having' 'element')@, except it runs the argument splitter
-- only on each element's start tag, not on the entire element with its content.
elementHavingTag :: (ParallelizableMonad m, Typeable b)
                    => Splitter m (Markup Char Token) b -> Splitter m (Markup Char Token) b
elementHavingTag test
   = liftSplitter "elementHavingTag" (maxUsableThreads test) $
     \threads-> let test' = usingThreads threads test
                    configuration = ComponentConfiguration [AnyComponent test'] threads (cost test' + 2)
                    split source true false edge = split0 where
                       split0 = findStartTag source false
                                >>= either return
                                       (\x-> do (tokens, mn) <- getElementName source (x :)
                                                maybe
                                                   (return tokens)
                                                   (\name-> do (hasContent, rest) <- pipe (pourRestOfTag source) getList
                                                               let tag = tokens ++ rest
                                                               (_, (unconsumed, maybeTrue, (), maybeEdge))
                                                                  <- pipe
                                                                        (putList tag)
                                                                        (\tag-> splitToConsumers
                                                                                   test'
                                                                                   tag
                                                                                   get
                                                                                   consumeAndSuppress
                                                                                   get)
                                                               if isJust maybeTrue || isJust maybeEdge
                                                                  then maybe (return True) (put edge) maybeEdge
                                                                       >> putList tag true
                                                                       >>= whenNull (split1 hasContent true name)
                                                                  else putList tag false
                                                                       >>= whenNull (split1 hasContent false name))
                                                   mn)
                       split1 hasContent sink name = if hasContent
                                                     then findEndTag source sink sink name >>= whenNull split0
                                                     else split0
                in (configuration, split)

-- | Splits every attribute specification to /true/, everything else to /false/.
attribute :: (ParallelizableMonad m) => Splitter m (Markup Char Token) ()
attribute = liftAtomicSplitter "attribute" 1 $
            \source true false edge->
            let split0 = get source
                         >>= maybe
                                (return [])
                                (\x-> case x of Markup (Start AttributeName)
                                                   -> put edge ()
                                                      >> put true x
                                                      >>= cond
                                                             (pourRestOfRegion AttributeName source true true
                                                              >>= maybe split1 return)
                                                             (return [x])
                                                _ -> put false x
                                                     >>= cond split0 (return [x]))
                split1 = get source
                         >>= maybe
                                (return [])
                                (\x-> case x of Markup (Start AttributeValue)
                                                   -> put true x
                                                      >>= cond
                                                             (pourRestOfRegion AttributeValue source true true
                                                              >>= maybe split0 return)
                                                             (return [x])
                                                _ -> put true x
                                                     >>= cond split1 (return [x]))
            in split0

-- | Splits every element name, including the names of nested elements and names in end tags, to /true/, all the rest of
-- input to /false/.
elementName :: (ParallelizableMonad m) => Splitter m (Markup Char Token) ()
elementName = liftAtomicSplitter "elementName" 1 (splitSimpleRegions ElementName)

-- | Splits every attribute name to /true/, all the rest of input to /false/.
attributeName :: (ParallelizableMonad m) => Splitter m (Markup Char Token) ()
attributeName = liftAtomicSplitter "attributeName" 1  (splitSimpleRegions AttributeName)

-- | Splits every attribute value, excluding the quote delimiters, to /true/, all the rest of input to /false/.
attributeValue :: (ParallelizableMonad m) => Splitter m (Markup Char Token) ()
attributeValue = liftAtomicSplitter "attributeValue" 1 (splitSimpleRegions AttributeValue)

splitSimpleRegions token source true false edge = split
   where split = get source
                 >>= maybe
                        (return [])
                        (\x-> case x of Markup (Start token') | token == token'
                                           -> put false x
                                              >>= cond
                                                     (put edge ()
                                                      >> pourRestOfRegion token source true false
                                                      >>= maybe split return)
                                                     (return [x])
                                        _ -> put false x
                                             >>= cond split (return [x]))

-- | Behaves like 'Control.Concurrent.SCC.Combinators.having', but the right-hand splitter works on plain instead of
-- marked-up text. This allows regular 'Char' splitters to be applied to parsed XML.
havingText :: (ParallelizableMonad m, Typeable b1, Typeable b2)
              => Splitter m (Markup Char Token) b1 -> Splitter m Char b2 -> Splitter m (Markup Char Token) b1
havingText chunker tester
   = liftSplitter "havingText" (maxUsableThreads chunker + maxUsableThreads tester) $
     \threads-> let (configuration, chunker', tester', parallel) = optimalTwoParallelConfigurations threads chunker tester
                    split source true false edge
                       = liftM fst $
                         (if parallel then pipeP else pipe)
                            (transduce (splitterToMarker chunker') source)
                            (flip groupMarks test)
                               where test Nothing chunk = pour chunk false >> return []
                                     test (Just mb) chunk = pipe
                                                               (\sink1-> pipe (tee chunk sink1) getList)
                                                               (\chunk-> liftM snd $
                                                                         pipe
                                                                            (transduce unparse chunk)
                                                                            (\chunk-> splitToConsumers tester' chunk
                                                                                         (liftM isJust . get)
                                                                                         consumeAndSuppress
                                                                                         (liftM isJust . get)))
                                                            >>= \(((), prefix), (_, anyTrue, (), anyEdge))->
                                                                if anyTrue || anyEdge
                                                                then maybe (return True) (put edge) mb
                                                                     >> putList prefix true
                                                                     >>= whenNull (pour chunk true >> return [])
                                                                else putList prefix false
                                                                     >>= whenNull (pour chunk false >> return [])
                in (configuration, split)

-- | Behaves like 'Control.Concurrent.SCC.Combinators.havingOnly', but the right-hand splitter works on plain instead of
-- marked-up text. This allows regular 'Char' splitters to be applied to parsed XML.
havingOnlyText :: (ParallelizableMonad m, Typeable b1, Typeable b2)
                  => Splitter m (Markup Char Token) b1 -> Splitter m Char b2 -> Splitter m (Markup Char Token) b1
havingOnlyText chunker tester
   = liftSplitter "havingOnlyText" (maxUsableThreads chunker + maxUsableThreads tester) $
     \threads-> let (configuration, chunker', tester', parallel) = optimalTwoParallelConfigurations threads chunker tester
                    split source true false edge
                       = liftM fst $
                         (if parallel then pipeP else pipe)
                            (transduce (splitterToMarker chunker') source)
                            (flip groupMarks test)
                               where test Nothing chunk = pour chunk false >> return []
                                     test (Just mb) chunk = pipe
                                                               (\sink1-> pipe (tee chunk sink1) getList)
                                                               (\chunk-> liftM snd $
                                                                         pipe
                                                                            (transduce unparse chunk)
                                                                            (\chunk-> splitToConsumers tester' chunk
                                                                                         consumeAndSuppress
                                                                                         (liftM isJust . get)
                                                                                         consumeAndSuppress))
                                                            >>= \(((), prefix), (_, (), anyFalse, ()))->
                                                                if anyFalse
                                                                then putList prefix false
                                                                     >>= whenNull (pour chunk false >> return [])
                                                                else maybe (return True) (put edge) mb
                                                                     >> putList prefix true
                                                                     >>= whenNull (pour chunk true >> return [])
                in (configuration, split)