module Control.Concurrent.SCC.XML (
Token (..),
tokens, parseTokens, expandEntity,
escapeAttributeCharacter, escapeContentCharacter,
element, elementContent, elementName, attribute, attributeName, attributeValue,
elementHavingTag, havingText, havingOnlyText
)
where
import Control.Exception (assert)
import Control.Monad (liftM, when)
import Data.Char
import qualified Data.Map as Map
import Data.Maybe (fromJust, isJust, mapMaybe)
import Data.List (find, stripPrefix)
import qualified Data.Sequence as Seq
import Data.Sequence ((|>))
import Numeric (readDec, readHex)
import Debug.Trace (trace)
import Control.Concurrent.Coroutine
import Control.Concurrent.SCC.Streams
import Control.Concurrent.SCC.Types
import Control.Concurrent.SCC.Combinators (groupMarks, splitterToMarker, parseNestedRegions)
import Control.Concurrent.SCC.Primitives (unparse)
data Token = StartTag | EndTag | EmptyTag
| ElementName | AttributeName | AttributeValue
| EntityReferenceToken | EntityName
| ProcessingInstruction | ProcessingInstructionText
| Comment | CommentText
| StartMarkedSectionCDATA | EndMarkedSection
| ErrorToken String
deriving (Eq, Show)
escapeAttributeCharacter :: Char -> String
escapeAttributeCharacter '"' = """
escapeAttributeCharacter '\t' = "	"
escapeAttributeCharacter '\n' = " "
escapeAttributeCharacter '\r' = " "
escapeAttributeCharacter x = escapeContentCharacter x
escapeContentCharacter :: Char -> String
escapeContentCharacter '<' = "<"
escapeContentCharacter '&' = "&"
escapeContentCharacter x = [x]
expandEntity :: String -> String
expandEntity "lt" = "<"
expandEntity "gt" = ">"
expandEntity "quot" = "\""
expandEntity "apos" = "'"
expandEntity "amp" = "&"
expandEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)]
expandEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)]
isNameStart x = isLetter x || x == '_'
isNameChar x = isAlphaNum x || x == '_' || x == '-'
tokens :: Monad m => Splitter m Char (Boundary Token)
tokens = Splitter $
\source true false edge->
let getContent = get source
>>= maybe (return []) content
content '<' = get source
>>= maybe (return "<") (\x-> tag x >> get source >>= maybe (return []) content)
content '&' = entity >> next content
content x = put false x
>>= cond getContent (return [x])
tag '?' = put edge (Start ProcessingInstruction)
>> putList "<?" true
>>= whenNull (put edge (Start ProcessingInstructionText)
>> processingInstruction)
tag '!' = dispatchOnString source
(\other-> put edge (Point (ErrorToken ("Expecting <![CDATA[ or <!--, received "
++ show ("<![" ++ other))))
>> return ("<!" ++ other))
[("--",
\match-> put edge (Start Comment)
>> putList match true
>>= whenNull (put edge (Start CommentText)
>> comment)),
("[CDATA[",
\match-> put edge (Start StartMarkedSectionCDATA)
>> putList match true
>>= whenNull (put edge (End StartMarkedSectionCDATA)
>> markedSection))]
tag '/' =
do put edge (Start EndTag)
put true '<'
put true '/'
x <- next (name ElementName)
put true x
when (x /= '>')
(put edge (Point (ErrorToken ("Invalid character " ++ show x ++ " in end tag")))
>> return ())
put edge (End EndTag)
return []
tag x | isNameStart x
=
do put edge (Start StartTag)
put true '<'
y <- name ElementName x
z <- attributes y
w <- if z == '/'
then put true z >> put edge (Point EmptyTag)
>> get source
>>= maybe
(put edge (Point (ErrorToken ("Missing '>' at the end of start tag.")))
>> return '>')
return
else return z
put true w
when (w /= '>') (put edge (Point (ErrorToken ("Invalid character " ++ show w
++ " in start tag")))
>> return ())
put edge (End StartTag)
return []
tag x = put edge (Point (ErrorToken "Unescaped character '<' in content"))
>> put false '<'
>> put false x
>> return []
attributes x | isSpace x = put true x >> next attributes
attributes x | isNameStart x
= do y <- name AttributeName x
when (y /= '=') (put edge (Point (ErrorToken ("Invalid character " ++ show y
++ " following attribute name")))
>> return ())
q <- if y == '"' || y == '\''
then return y
else put true y >> get source
>>= maybe (put edge (Point (ErrorToken ("Truncated input after attribute name")))
>> return '"')
return
when
(q /= '"' && q /= '\'')
(put edge (Point (ErrorToken ("Invalid quote character " ++ show q)))
>> return ())
put true q
put edge (Start AttributeValue)
next (attributeValue q)
next attributes
attributes x = return x
attributeValue q x | q == x = do put edge (End AttributeValue)
put true x
attributeValue q '<' = do put edge (Start (ErrorToken "Invalid character '<' in attribute value."))
put true '<'
put edge (End (ErrorToken "Invalid character '<' in attribute value."))
next (attributeValue q)
attributeValue q '&' = entity >> next (attributeValue q)
attributeValue q x = put true x >> next (attributeValue q)
processingInstruction =
dispatchOnString source
(\other-> if null other
then (put edge (Point (ErrorToken "Unterminated processing instruction"))
>> return [])
else putList other true >>= whenNull processingInstruction)
[("?>",
\match-> put edge (End ProcessingInstructionText)
>> putList match true
>>= whenNull (put edge (End ProcessingInstruction)
>> getContent))]
comment =
dispatchOnString source
(\other-> if null other
then (put edge (Point (ErrorToken "Unterminated comment"))
>> return [])
else putList other true >>= whenNull comment)
[("-->",
\match-> put edge (End CommentText)
>> putList match true
>>= whenNull (put edge (End Comment)
>> getContent))]
markedSection =
dispatchOnString source
(\other-> if null other
then (put edge (Point (ErrorToken "Unterminated marked section"))
>> return [])
else putList other true >>= whenNull markedSection)
[("]]>",
\match-> put edge (Start EndMarkedSection)
>> putList match true
>>= whenNull (put edge (End EndMarkedSection)
>> getContent))]
entity = do put edge (Start EntityReferenceToken)
put true '&'
x <- next (name EntityName)
when (x /= ';') (put edge (Point (ErrorToken ("Invalid character " ++ show x
++ " ends entity name.")))
>> return ())
put true x
put edge (End EntityReferenceToken)
name token x | isNameStart x =
do put edge (Start token)
put true x
next (nameTail token)
name _ x = do put edge (Point (ErrorToken ("Invalid character " ++ show x ++ " in attribute value.")))
return x
nameTail token x = if isNameChar x || x == ':'
then put true x >> next (nameTail token)
else put edge (End token) >> return x
next f = get' source >>= f
in getContent
parseTokens :: Monad m => Parser m Char Token
parseTokens = parseNestedRegions tokens
dispatchOnString :: forall m a d r. (Monad m, AncestorFunctor a d) =>
Source m a Char -> (String -> Coroutine d m r) -> [(String, String -> Coroutine d m r)]
-> Coroutine d m r
dispatchOnString source failure fullCases = dispatch fullCases id
where dispatch cases consumed
= case find (null . fst) cases
of Just ("", rhs) -> rhs (consumed "")
Nothing -> get source
>>= maybe
(failure (consumed ""))
(\x-> case mapMaybe (startingWith x) cases
of [] -> failure (consumed [x])
subcases -> dispatch (subcases ++ fullCases) (consumed . (x :)))
startingWith x (y:rest, rhs) | x == y = Just (rest, rhs)
| otherwise = Nothing
getElementName :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup Token Char) -> ([Markup Token Char] -> [Markup Token Char])
-> Coroutine d m ([Markup Token Char], Maybe String)
getElementName source f = get source
>>= maybe
(return (f [], Nothing))
(\x-> case x
of Markup (Start ElementName) -> getRestOfRegion ElementName source (f . (x:)) id
Markup (Point ErrorToken{}) -> getElementName source (f . (x:))
Content{} -> getElementName source (f . (x:))
_ -> error ("Expected an ElementName, received " ++ show x))
getRestOfRegion :: forall m a d. (Monad m, AncestorFunctor a d) =>
Token -> Source m a (Markup Token Char)
-> ([Markup Token Char] -> [Markup Token Char]) -> (String -> String)
-> Coroutine d m ([Markup Token Char], Maybe String)
getRestOfRegion token source f g = get source
>>= maybe
(return (f [], Nothing))
(\x-> case x
of Markup (End token) -> return (f [x], Just (g ""))
Content y -> getRestOfRegion token source (f . (x:)) (g . (y:))
_ -> error ("Expected rest of " ++ show token
++ ", received " ++ show x))
pourRestOfRegion :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
Token -> Source m a1 (Markup Token Char)
-> Sink m a2 (Markup Token Char) -> Sink m a3 (Markup Token Char)
-> Coroutine d m (Maybe [Markup Token Char])
pourRestOfRegion token source sink endSink
= get source
>>= maybe
(return $ Just [])
(\x-> case x
of Markup (End token') | token == token' -> put endSink x
>>= cond (return Nothing) (return $ Just [x])
Content y -> put sink x
>>= cond (pourRestOfRegion token source sink endSink) (return $ Just [x])
_ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x))
pourRestOfTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup Token Char) -> Sink m a2 (Markup Token Char) -> Coroutine d m Bool
pourRestOfTag source sink = get source
>>= maybe
(return True)
(\x-> put sink x
>> case x of Markup (End StartTag) -> return True
Markup (End EndTag) -> return True
Markup (Point EmptyTag) -> pourRestOfTag source sink
>> return False
_ -> pourRestOfTag source sink)
findEndTag :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
Source m a1 (Markup Token Char) -> Sink m a2 (Markup Token Char) -> Sink m a3 (Markup Token Char)
-> String
-> Coroutine d m [Markup Token Char]
findEndTag source sink endSink name = find where
find = get source
>>= maybe
(return [])
(\x-> case x
of Markup (Start EndTag) -> do (tokens, mn) <- getElementName source (x :)
maybe
(return tokens)
(\name'-> if name == name'
then putList tokens endSink
>>= whenNull
(pourRestOfTag source endSink
>> return [])
else putList tokens sink
>>= whenNull
(pourRestOfTag source sink
>> find))
mn
Markup (Start StartTag) -> do (tokens, mn) <- getElementName source (x :)
maybe
(return tokens)
(\name'-> putList tokens sink
>>= whenNull
(if name == name'
then pourRestOfTag source sink
>>= cond
(findEndTag source sink sink name)
(return [])
>>= whenNull find
else pourRestOfTag source sink
>> find))
mn
_ -> put sink x
>>= cond find (return [x]))
findStartTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup Token Char) -> Sink m a2 (Markup Token Char)
-> Coroutine d m (Either [Markup Token Char] (Markup Token Char))
findStartTag source sink = get source
>>= maybe
(return $ Left [])
(\x-> case x of Markup (Start StartTag) -> return $ Right x
_ -> put sink x
>>= cond (findStartTag source sink) (return $ Left [x]))
element :: Monad m => Splitter m (Markup Token Char) ()
element = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= either return
(\x-> put edge ()
>> put true x
>>= cond
(do (tokens, mn) <- getElementName source id
maybe
(putList tokens true)
(\name-> putList tokens true
>>= whenNull
(pourRestOfTag source true
>>= cond
(split1 name)
split0))
mn)
(return [x]))
split1 name = findEndTag source true true name
>>= whenNull split0
in split0
elementContent :: Monad m => Splitter m (Markup Token Char) ()
elementContent = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= either return
(\x-> put false x
>>= cond
(do (tokens, mn) <- getElementName source id
maybe
(putList tokens false)
(\name-> putList tokens false
>>= whenNull (pourRestOfTag source false
>>= cond
(put edge ()
>> split1 name)
split0))
mn)
(return [x]))
split1 name = findEndTag source true false name
>>= whenNull split0
in split0
elementHavingTag :: forall m b. ParallelizableMonad m =>
Splitter m (Markup Token Char) b -> Splitter m (Markup Token Char) b
elementHavingTag test =
isolateSplitter $ \ source true false edge ->
let split0 = findStartTag source false
>>= either return
(\x-> do (tokens, mn) <- getElementName source (x :)
maybe
(return tokens)
(\name-> do (hasContent, rest) <- pipe
(pourRestOfTag source)
getList
let tag = tokens ++ rest
(_, (unconsumed, maybeTrue, (), maybeEdge))
<- pipe
(putList tag)
(\tag-> splitToConsumers
test
tag
get
consumeAndSuppress
get)
if isJust maybeTrue || isJust maybeEdge
then maybe (return True) (put edge) maybeEdge
>> putList tag true
>>= whenNull (split1 hasContent true name)
else putList tag false
>>= whenNull (split1 hasContent false name))
mn)
split1 hasContent sink name = if hasContent
then findEndTag source sink sink name >>= whenNull split0
else split0
in split0
attribute :: Monad m => Splitter m (Markup Token Char) ()
attribute = Splitter $
\source true false edge->
let split0 = get source
>>= maybe
(return [])
(\x-> case x of Markup (Start AttributeName)
-> put edge ()
>> put true x
>>= cond
(pourRestOfRegion AttributeName source true true
>>= maybe split1 return)
(return [x])
_ -> put false x
>>= cond split0 (return [x]))
split1 = get source
>>= maybe
(return [])
(\x-> case x of Markup (Start AttributeValue)
-> put true x
>>= cond
(pourRestOfRegion AttributeValue source true true
>>= maybe split0 return)
(return [x])
_ -> put true x
>>= cond split1 (return [x]))
in split0
elementName :: Monad m => Splitter m (Markup Token Char) ()
elementName = Splitter (splitSimpleRegions ElementName)
attributeName :: Monad m => Splitter m (Markup Token Char) ()
attributeName = Splitter (splitSimpleRegions AttributeName)
attributeValue :: Monad m => Splitter m (Markup Token Char) ()
attributeValue = Splitter (splitSimpleRegions AttributeValue)
splitSimpleRegions token source true false edge = split
where split = get source
>>= maybe
(return [])
(\x-> case x of Markup (Start token') | token == token'
-> put false x
>>= cond
(put edge ()
>> pourRestOfRegion token source true false
>>= maybe split return)
(return [x])
_ -> put false x
>>= cond split (return [x]))
havingText :: forall m b1 b2. ParallelizableMonad m =>
Bool -> Splitter m (Markup Token Char) b1 -> Splitter m Char b2 -> Splitter m (Markup Token Char) b1
havingText parallel chunker tester =
isolateSplitter $ \ source true false edge ->
let test Nothing chunk = pour chunk false >> return []
test (Just mb) chunk = pipe
(\sink1-> pipe (tee chunk sink1) getList)
(\chunk-> liftM snd $
pipe
(transduce unparse chunk)
(\chunk-> splitToConsumers tester chunk
(liftM isJust . get)
consumeAndSuppress
(liftM isJust . get)))
>>= \(((), prefix), (_, anyTrue, (), anyEdge))->
if anyTrue || anyEdge
then maybe (return True) (put edge) mb
>> putList prefix true
>>= whenNull (pour chunk true >> return [])
else putList prefix false
>>= whenNull (pour chunk false >> return [])
in liftM fst $
pipePS parallel
(transduce (splitterToMarker chunker) source)
(flip groupMarks test)
havingOnlyText :: forall m b1 b2. ParallelizableMonad m =>
Bool -> Splitter m (Markup Token Char) b1 -> Splitter m Char b2 -> Splitter m (Markup Token Char) b1
havingOnlyText parallel chunker tester =
isolateSplitter $ \ source true false edge ->
let test Nothing chunk = pour chunk false >> return []
test (Just mb) chunk = pipe
(\sink1-> pipe (tee chunk sink1) getList)
(\chunk-> liftM snd $
pipe
(transduce unparse chunk)
(\chunk-> splitToConsumers tester chunk
consumeAndSuppress
(liftM isJust . get)
consumeAndSuppress))
>>= \(((), prefix), (_, (), anyFalse, ()))->
if anyFalse
then putList prefix false
>>= whenNull (pour chunk false >> return [])
else maybe (return True) (put edge) mb
>> putList prefix true
>>= whenNull (pour chunk true >> return [])
in liftM fst $
pipePS parallel
(transduce (splitterToMarker chunker) source)
(flip groupMarks test)