module Control.Concurrent.SCC.XML (
xmlTokens, parseXMLTokens, expandXMLEntity, XMLToken(..),
xmlElement, xmlElementContent, xmlElementName, xmlAttribute, xmlAttributeName, xmlAttributeValue, xmlElementHavingTagWith
)
where
import Prelude hiding (mapM)
import Control.Category ((>>>))
import qualified Control.Category as Category
import Control.Exception (assert)
import Control.Monad (join, liftM, when)
import Data.Char
import qualified Data.Map as Map
import Data.Maybe (fromJust, isJust, mapMaybe)
import Data.List (find, stripPrefix)
import qualified Data.Sequence as Seq
import Data.Sequence (Seq, (|>))
import Data.Traversable (Traversable, mapM)
import Data.Text (Text, append)
import qualified Data.Text as Text
import Numeric (readDec, readHex)
import Debug.Trace (trace)
import Control.Monad.Coroutine
import Control.Monad.Parallel (MonadParallel(..))
import Control.Concurrent.SCC.Streams
import Control.Concurrent.SCC.Types
import Control.Concurrent.SCC.Coercions (coerce)
import Control.Concurrent.SCC.Combinators (groupMarks, parseEachNestedRegion, splitterToMarker,
findsTrueIn, findsFalseIn, teeConsumers)
import Control.Concurrent.SCC.Primitives (group)
data XMLToken = StartTag | EndTag | EmptyTag
| ElementName | AttributeName | AttributeValue
| EntityReferenceToken | EntityName
| ProcessingInstruction | ProcessingInstructionText
| Comment | CommentText
| StartMarkedSectionCDATA | EndMarkedSection
| ErrorToken String
deriving (Eq, Show)
expandXMLEntity :: String -> String
expandXMLEntity "lt" = "<"
expandXMLEntity "gt" = ">"
expandXMLEntity "quot" = "\""
expandXMLEntity "apos" = "'"
expandXMLEntity "amp" = "&"
expandXMLEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)]
expandXMLEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)]
isNameStart x = isLetter x || x == '_'
isNameChar x = isAlphaNum x || x == '_' || x == '-'
xmlTokens :: Monad m => Splitter m Char (Boundary XMLToken)
xmlTokens = Splitter $
\source true false edge->
let getContent = pourWhile (\x-> x /= '<' && x /= '&') source false
>> getWith contentEnd source
contentEnd '<' = get source
>>= maybe
(put edge (Point errorUnescapedContentLT) >> put false '<')
(\x-> tag x >> getContent)
contentEnd '&' = entity >> getContent
tag '?' = do put edge (Start ProcessingInstruction)
putList "<?" true
put edge (Start ProcessingInstructionText)
processingInstruction
tag '!' = dispatchOnString source
(\other-> put edge (Point (errorBadDeclarationType other)))
[("--",
\match-> do put edge (Start Comment)
putList match true
put edge (Start CommentText)
comment),
("[CDATA[",
\match-> do put edge (Start StartMarkedSectionCDATA)
putList match true
put edge (End StartMarkedSectionCDATA)
markedSection)]
tag '/' =
do put edge (Start EndTag)
putList "</" true
name <- getWhile (\x-> isNameChar x || x == ':') source
if null name
then put edge (Point errorNamelessEndTag)
else put edge (Start ElementName)
>> putList name true
>> put edge (End ElementName)
pourUntil (not . isSpace) source true
>>= maybe
(put edge (Point errorInputEndInEndTag))
(\x-> if x == '>'
then getWith (put true) source
else put edge (Point (errorBadEndTag x)))
put edge (End EndTag)
tag x | isNameStart x =
put edge (Start StartTag)
>> put true '<'
>> name ElementName x
>> attributes
>> put edge (End StartTag)
tag x = put edge (Point errorUnescapedContentLT)
>> put false '<'
>> put false x
startTagEnd '/' = get source
>> put edge (Point EmptyTag)
>> next errorInputEndInStartTag
(\x-> do when (x /= '>' ) (put edge (Point (errorBadStartTag x)))
putList ['/', x] true
return ())
startTagEnd '>' = getWith (put true) source
startTagEnd x = put edge (Point (errorBadStartTag x))
attributes= pourUntil (not . isSpace) source true
>>= maybe
(put edge (Point errorInputEndInStartTag))
(\x-> if isNameStart x then attribute >> attributes else startTagEnd x)
attribute= do put edge (Start AttributeName)
pourWhile (\x-> isNameChar x || x == ':') source true
put edge (End AttributeName)
next errorInputEndInStartTag
(\y-> do when (y /= '=') (put edge (Point (errorBadAttribute y)))
q <- if y == '"' || y == '\''
then return y
else put true y
>> get source
>>= maybe
(put edge (Point errorInputEndInAttributeValue)
>> return '"')
return
when (q /= '"' && q /= '\'') (put edge (Point (errorBadQuoteCharacter q)))
put true q
put edge (Start AttributeValue)
attributeValue q
put edge (End AttributeValue)
put true q)
attributeValue q = pourWhile (\x-> (x /= q && x/= '<' && x /= '&')) source true
>> next errorInputEndInAttributeValue
(\x-> case x
of '<' -> do put edge (Start errorUnescapedAttributeLT)
put true '<'
put edge (End errorUnescapedAttributeLT)
attributeValue q
'&' -> entity >> attributeValue q
_ -> return ())
processingInstruction =
dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInProcessingInstruction)
else putList other true >> processingInstruction)
[("?>",
\match-> do put edge (End ProcessingInstructionText)
putList match true
put edge (End ProcessingInstruction)
getContent)]
comment =
dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInComment)
else putList other true >> comment)
[("-->",
\match-> do put edge (End CommentText)
putList match true
put edge (End Comment)
getContent)]
markedSection =
dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInMarkedSection)
else putList other true >> markedSection)
[("]]>",
\match-> do put edge (Start EndMarkedSection)
putList match true
put edge (End EndMarkedSection)
getContent)]
entity = put edge (Start EntityReferenceToken)
>> put true '&'
>> next errorInputEndInEntityReference
(\x-> name EntityName x
>> next errorInputEndInEntityReference
(\x-> do when (x /= ';') (put edge (Point (errorBadEntityReference x)))
put true x))
>> put edge (End EntityReferenceToken)
name token x =
put edge (Start token)
>> nameTail x
>> put edge (End token)
nameTail x = getWhile (\x-> isNameChar x || x == ':') source
>>= \tail-> putList (x:tail) true
next error f = get source
>>= maybe (put edge (Point error)) f
in getContent
errorInputEndInComment = ErrorToken "Unterminated comment"
errorInputEndInMarkedSection = ErrorToken "Unterminated marked section"
errorInputEndInStartTag = ErrorToken "Missing '>' at the end of start tag."
errorInputEndInEndTag = ErrorToken "End of input in end tag"
errorInputEndInAttributeValue = ErrorToken "Truncated input after attribute name"
errorInputEndInEntityReference = ErrorToken "End of input in entity reference"
errorInputEndInProcessingInstruction = ErrorToken "Unterminated processing instruction"
errorBadQuoteCharacter q = ErrorToken ("Invalid quote character " ++ show q)
errorBadStartTag x = ErrorToken ("Invalid character " ++ show x ++ " in start tag")
errorBadEndTag x = ErrorToken ("Invalid character " ++ show x ++ " in end tag")
errorBadAttribute x = ErrorToken ("Invalid character " ++ show x ++ " following attribute name")
errorBadAttributeValue x = ErrorToken ("Invalid character " ++ show x ++ " in attribute value.")
errorBadEntityReference x = ErrorToken ("Invalid character " ++ show x ++ " ends entity name.")
errorBadDeclarationType other = ErrorToken ("Expecting <![CDATA[ or <!--, received " ++ show ("<![" ++ other))
errorNamelessEndTag = ErrorToken "Missing element name in end tag"
errorUnescapedContentLT = ErrorToken "Unescaped character '<' in content"
errorUnescapedAttributeLT = ErrorToken "Invalid character '<' in attribute value."
parseXMLTokens :: MonadParallel m => Transducer m Char (Markup XMLToken Text)
parseXMLTokens = parseEachNestedRegion sequentialBinder xmlTokens coerce
dispatchOnString :: forall m a d r. (Monad m, AncestorFunctor a d) =>
Source m a Char -> (String -> Coroutine d m r) -> [(String, String -> Coroutine d m r)]
-> Coroutine d m r
dispatchOnString source failure fullCases = dispatch fullCases id
where dispatch cases consumed
= case find (null . fst) cases
of Just ("", rhs) -> rhs (consumed "")
Nothing -> get source
>>= maybe
(failure (consumed ""))
(\x-> case mapMaybe (startingWith x) cases
of [] -> failure (consumed [x])
subcases -> dispatch (subcases ++ fullCases) (consumed . (x :)))
startingWith x (y:rest, rhs) | x == y = Just (rest, rhs)
| otherwise = Nothing
getElementName :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> ([Markup XMLToken Text] -> [Markup XMLToken Text])
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getElementName source f = get source
>>= maybe
(return (f [], Nothing))
(\x-> case x
of Markup (Start ElementName) -> getRestOfRegion ElementName source (f . (x:)) id
Markup (Point ErrorToken{}) -> getElementName source (f . (x:))
Content{} -> getElementName source (f . (x:))
_ -> error ("Expected an ElementName, received " ++ show x))
getRestOfRegion :: forall m a d. (Monad m, AncestorFunctor a d) =>
XMLToken -> Source m a (Markup XMLToken Text)
-> ([Markup XMLToken Text] -> [Markup XMLToken Text]) -> (Text -> Text)
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getRestOfRegion token source f g = getWhile isContent source
>>= \content-> get source
>>= \x-> case x
of Just y@(Markup (End token))
-> return (f (content ++ [y]),
Just (g $ Text.concat $ map fromContent content))
_ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x)
pourRestOfRegion :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
XMLToken -> Source m a1 (Markup XMLToken Text)
-> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Coroutine d m Bool
pourRestOfRegion token source sink endSink = pourWhile isContent source sink
>> get source
>>= maybe
(return False)
(\x-> case x
of Markup (End token') | token == token' -> put endSink x
>> return True
_ -> error ("Expected rest of " ++ show token
++ ", received " ++ show x))
pourRestOfTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Coroutine d m Bool
pourRestOfTag source sink = pourUntil isEndTag source sink
>>= maybe
(return True)
(\x-> put sink x
>> get source
>> case x of Markup (End StartTag) -> return True
Markup (End EndTag) -> return True
Markup (Point EmptyTag) -> pourRestOfTag source sink
>> return False)
where isEndTag (Markup (End StartTag)) = True
isEndTag (Markup (End EndTag)) = True
isEndTag (Markup (Point EmptyTag)) = True
isEndTag _ = False
findEndTag :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Text
-> Coroutine d m ()
findEndTag source sink endSink name = find where
find = pourUntil isTagStart source sink
>>= maybe (return ()) (\x-> get source >> consumeOne x)
isTagStart (Markup (Start StartTag)) = True
isTagStart (Markup (Start EndTag)) = True
isTagStart _ = False
consumeOne x@(Markup (Start EndTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> if name == name'
then do putList tokens endSink
pourRestOfTag source endSink
return ()
else do putList tokens sink
pourRestOfTag source sink
find)
mn
consumeOne x@(Markup (Start StartTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> putList tokens sink
>> if name == name'
then pourRestOfTag source sink
>>= flip when (findEndTag source sink sink name)
>> find
else pourRestOfTag source sink
>> find)
mn
findStartTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text)
-> Coroutine d m (Maybe (Markup XMLToken Text))
findStartTag source sink = pourUntil isStartTag source sink >> get source
where isStartTag (Markup (Start StartTag)) = True
isStartTag _ = False
xmlElement :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElement = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put edge ()
put true x
(tokens, mn) <- getElementName source id
maybe
(putList tokens true)
(\name-> do putList tokens true
hasContent <- pourRestOfTag source true
if hasContent
then split1 name
else split0)
mn)
split1 name = findEndTag source true true name
>> split0
in split0 >> return ()
xmlElementContent :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementContent = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put false x
(tokens, mn) <- getElementName source id
maybe
(putList tokens false)
(\name-> do putList tokens false
hasContent <- pourRestOfTag source false
if hasContent
then put edge () >> split1 name
else split0)
mn)
split1 name = findEndTag source true false name
>> split0
in split0 >> return ()
xmlElementHavingTagWith :: forall m b. MonadParallel m =>
Splitter m (Markup XMLToken Text) b -> Splitter m (Markup XMLToken Text) b
xmlElementHavingTagWith test =
isolateSplitter $ \ source true false edge ->
let split0 = findStartTag source false
>>= maybe (return ())
(\x-> do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name-> do (hasContent, rest) <- pipe
(pourRestOfTag source)
getList
let tag = tokens ++ rest
(_, found) <- pipe (putList tag) (findsTrueIn test)
case found of Just mb -> maybe (return ()) (put edge) mb
>> putList tag true
>> split1 hasContent true name
Nothing -> putList tag false
>> split1 hasContent false name)
mn)
split1 hasContent sink name = when hasContent (findEndTag source sink sink name)
>> split0
in split0
xmlAttribute :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttribute = Splitter $
\source true false edge->
let split0 = getWith
(\x-> case x
of Markup (Start AttributeName) ->
do put edge ()
put true x
pourRestOfRegion AttributeName source true true
>>= flip when split1
_ -> put false x >> split0)
source
split1 = getWith
(\x-> case x
of Markup (Start AttributeValue)
-> put true x
>> pourRestOfRegion AttributeValue source true true
>>= flip when split0
_ -> put true x >> split1)
source
in split0
xmlElementName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementName = Splitter (splitSimpleRegions ElementName)
xmlAttributeName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeName = Splitter (splitSimpleRegions AttributeName)
xmlAttributeValue :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeValue = Splitter (splitSimpleRegions AttributeValue)
splitSimpleRegions token source true false edge = split
where split = getWith consumeOne source
consumeOne x@(Markup (Start token')) | token == token' = put false x
>> put edge ()
>> pourRestOfRegion token source true false
>>= flip when split
consumeOne x = put false x >> split
justContent (Content x) = Just x
justContent _ = Nothing
isContent (Content x) = True
isContent _ = False
fromContent (Content x) = x
mapJoinM :: (Monad m, Monad t, Traversable t) => (a -> m (t b)) -> t a -> m (t b)
mapJoinM f ta = mapM f ta >>= return . join