module Control.Concurrent.SCC.XML (
xmlTokens, parseXMLTokens, expandXMLEntity, XMLToken(..),
xmlElement, xmlElementContent, xmlElementName, xmlAttribute, xmlAttributeName, xmlAttributeValue,
xmlElementHavingTagWith
)
where
import Control.Monad (when)
import Data.Char
import Data.Maybe (mapMaybe)
import Data.List (find)
import Data.Text (Text)
import qualified Data.Text as Text
import Numeric (readDec, readHex)
import Control.Cofunctor.Ticker (andThen, tickOne, tickWhile)
import Control.Monad.Coroutine (Coroutine, sequentialBinder)
import Control.Concurrent.SCC.Streams
import Control.Concurrent.SCC.Types
import Control.Concurrent.SCC.Coercions (coerce)
import Control.Concurrent.SCC.Combinators (parseEachNestedRegion, findsTrueIn)
data XMLToken = StartTag | EndTag | EmptyTag
| ElementName | AttributeName | AttributeValue
| EntityReferenceToken | EntityName
| ProcessingInstruction | ProcessingInstructionText
| Comment | CommentText
| StartMarkedSectionCDATA | EndMarkedSection
| ErrorToken String
deriving (Eq, Show)
expandXMLEntity :: String -> String
expandXMLEntity "lt" = "<"
expandXMLEntity "gt" = ">"
expandXMLEntity "quot" = "\""
expandXMLEntity "apos" = "'"
expandXMLEntity "amp" = "&"
expandXMLEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)]
expandXMLEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)]
expandXMLEntity e = error ("String \"" ++ e ++ "\" is not a built-in entity name.")
xmlTokens :: Monad m => Splitter m Char (Boundary XMLToken)
xmlTokens = Splitter $
\source true false edge->
let getContent = pourWhile (\x-> x /= '<' && x /= '&') source false
>> getWith contentEnd source
contentEnd '<' = get source
>>= maybe
(put edge (Point errorUnescapedContentLT) >> put false '<')
(\x-> tag x >> getContent)
contentEnd '&' = entity >> getContent
contentEnd _ = error "pourUntil returned early!"
tag '?' = put edge (Start ProcessingInstruction)
>> putList "<?" true
>> put edge (Start ProcessingInstructionText)
>> processingInstruction
tag '!' = dispatchOnString source
(\other-> put edge (Point (errorBadDeclarationType other)))
[("--",
const (put edge (Start Comment)
>> putList "<!--" true
>> put edge (Start CommentText)
>> comment)),
("[CDATA[",
const (put edge (Start StartMarkedSectionCDATA)
>> putList "<![CDATA[" true
>> put edge (End StartMarkedSectionCDATA)
>> markedSection))]
tag '/' =
do put edge (Start EndTag)
_ <- putList "</" true
elementName <- getWhile isNameChar source
if null elementName
then put edge (Point errorNamelessEndTag)
else put edge (Start ElementName)
>> putList elementName true
>> put edge (End ElementName)
pourUntil (not . isSpace) source true
>>= maybe
(put edge (Point errorInputEndInEndTag))
(\x-> if x == '>'
then getWith (put true) source
else put edge (Point (errorBadEndTag x)))
put edge (End EndTag)
tag x | isNameStart x =
put edge (Start StartTag)
>> put true '<'
>> name ElementName x
>> attributes
>> put edge (End StartTag)
tag x = put edge (Point errorUnescapedContentLT)
>> put false '<'
>> put false x
startTagEnd '/' = get source
>> put edge (Point EmptyTag)
>> next errorInputEndInStartTag
(\x-> when (x /= '>' ) (put edge (Point (errorBadStartTag x)))
>> putList ['/', x] true
>> return ())
startTagEnd '>' = getWith (put true) source
startTagEnd x = put edge (Point (errorBadStartTag x))
attributes= pourUntil (not . isSpace) source true
>>= maybe
(put edge (Point errorInputEndInStartTag))
(\x-> if isNameStart x then attribute >> attributes else startTagEnd x)
attribute= do put edge (Start AttributeName)
pourWhile isNameChar source true
put edge (End AttributeName)
next errorInputEndInStartTag
(\y-> do when (y /= '=') (put edge (Point (errorBadAttribute y)))
q <- if y == '"' || y == '\''
then return y
else put true y
>> get source
>>= maybe
(put edge (Point errorInputEndInAttributeValue)
>> return '"')
return
when (q /= '"' && q /= '\'') (put edge (Point (errorBadQuoteCharacter q)))
put true q
put edge (Start AttributeValue)
attributeValue q
put edge (End AttributeValue)
put true q)
attributeValue q = pourWhile (\x-> (x /= q && x/= '<' && x /= '&')) source true
>> next errorInputEndInAttributeValue
(\x-> case x
of '<' -> do put edge (Start errorUnescapedAttributeLT)
put true '<'
put edge (End errorUnescapedAttributeLT)
attributeValue q
'&' -> entity >> attributeValue q
_ -> return ())
processingInstruction =
pourWhile (/= '?') source true
>> dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInProcessingInstruction)
else putList other true >> processingInstruction)
[("?>",
\match-> put edge (End ProcessingInstructionText)
>> putList match true
>> put edge (End ProcessingInstruction)
>> getContent)]
comment =
pourWhile (/= '-') source true
>> dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInComment)
else putList other true >> comment)
[("-->",
\match-> put edge (End CommentText)
>> putList match true
>> put edge (End Comment)
>> getContent)]
markedSection =
pourWhile (/= ']') source true
>> dispatchOnString source
(\other-> if null other
then put edge (Point errorInputEndInMarkedSection)
else putList other true >> markedSection)
[("]]>",
\match-> put edge (Start EndMarkedSection)
>> putList match true
>> put edge (End EndMarkedSection)
>> getContent)]
entity = put edge (Start EntityReferenceToken)
>> put true '&'
>> next errorInputEndInEntityReference
(\x-> name EntityName x
>> next errorInputEndInEntityReference
(\y-> do when (y /= ';') (put edge (Point (errorBadEntityReference y)))
put true y))
>> put edge (End EntityReferenceToken)
name token x =
put edge (Start token)
>> nameTail x
>> put edge (End token)
nameTail x = getWhile isNameChar source
>>= \rest-> putList (x:rest) true
next errorToken f = get source
>>= maybe (put edge (Point errorToken)) f
in getContent
where errorInputEndInComment = ErrorToken "Unterminated comment"
errorInputEndInMarkedSection = ErrorToken "Unterminated marked section"
errorInputEndInStartTag = ErrorToken "Missing '>' at the end of start tag."
errorInputEndInEndTag = ErrorToken "End of input in end tag"
errorInputEndInAttributeValue = ErrorToken "Truncated input after attribute name"
errorInputEndInEntityReference = ErrorToken "End of input in entity reference"
errorInputEndInProcessingInstruction = ErrorToken "Unterminated processing instruction"
errorBadQuoteCharacter q = ErrorToken ("Invalid quote character " ++ show q)
errorBadStartTag x = ErrorToken ("Invalid character " ++ show x ++ " in start tag")
errorBadEndTag x = ErrorToken ("Invalid character " ++ show x ++ " in end tag")
errorBadAttribute x = ErrorToken ("Invalid character " ++ show x ++ " following attribute name")
errorBadEntityReference x = ErrorToken ("Invalid character " ++ show x ++ " ends entity name.")
errorBadDeclarationType other = ErrorToken ("Expecting <![CDATA[ or <!--, received " ++ show ("<![" ++ other))
errorNamelessEndTag = ErrorToken "Missing element name in end tag"
errorUnescapedContentLT = ErrorToken "Unescaped character '<' in content"
errorUnescapedAttributeLT = ErrorToken "Invalid character '<' in attribute value."
isNameStart x = isLetter x || x == '_'
isNameChar x = isAlphaNum x || x == '_' || x == '-' || x == ':'
parseXMLTokens :: Monad m => Transducer m Char (Markup XMLToken Text)
parseXMLTokens = parseEachNestedRegion sequentialBinder xmlTokens coerce
dispatchOnString :: forall m a d r. (Monad m, AncestorFunctor a d) =>
Source m a Char -> (String -> Coroutine d m r) -> [(String, String -> Coroutine d m r)]
-> Coroutine d m r
dispatchOnString source failure fullCases = dispatch fullCases id
where dispatch cases consumed
= case find (null . fst) cases
of Just (~"", rhs) -> rhs (consumed "")
Nothing -> get source
>>= maybe
(failure (consumed ""))
(\x-> case mapMaybe (startingWith x) cases
of [] -> failure (consumed [x])
subcases -> dispatch (subcases ++ fullCases) (consumed . (x :)))
startingWith x ~(y:rest, rhs) | x == y = Just (rest, rhs)
| otherwise = Nothing
getElementName :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> ([Markup XMLToken Text] -> [Markup XMLToken Text])
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getElementName source f = get source
>>= maybe
(return (f [], Nothing))
(\x-> let f' = f . (x:)
in case x
of Markup (Start ElementName) -> getRestOfRegion ElementName source f' id
Markup (Point ErrorToken{}) -> getElementName source f'
Content{} -> getElementName source f'
_ -> error ("Expected an ElementName, received " ++ show x))
getRestOfRegion :: forall m a d. (Monad m, AncestorFunctor a d) =>
XMLToken -> Source m a (Markup XMLToken Text)
-> ([Markup XMLToken Text] -> [Markup XMLToken Text]) -> (Text -> Text)
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getRestOfRegion token source f g = getWhile isContent source
>>= \content-> get source
>>= \x-> case x
of Just y@(Markup End{})
-> return (f (content ++ [y]),
Just (g $ Text.concat $ map fromContent content))
_ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x)
pourRestOfRegion :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
XMLToken -> Source m a1 (Markup XMLToken Text)
-> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Coroutine d m Bool
pourRestOfRegion token source sink endSink = pourWhile isContent source sink
>> get source
>>= maybe
(return False)
(\x-> case x
of Markup (End token') | token == token' -> put endSink x
>> return True
_ -> error ("Expected rest of " ++ show token
++ ", received " ++ show x))
getRestOfStartTag :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> Coroutine d m ([Markup XMLToken Text], Bool)
getRestOfStartTag source = do rest <- getWhile notEndTag source
end <- get source
case end of Nothing -> return (rest, False)
Just e@(Markup (End StartTag)) -> return (rest ++ [e], True)
Just e@(Markup (Point EmptyTag)) ->
getRestOfStartTag source
>>= \(rest', _)-> return (rest ++ (e: rest'), False)
_ -> error "getWhile returned early!"
where notEndTag (Markup (End StartTag)) = False
notEndTag (Markup (Point EmptyTag)) = False
notEndTag _ = True
getRestOfEndTag :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> Coroutine d m [Markup XMLToken Text]
getRestOfEndTag source = getWhile (/= Markup (End EndTag)) source
>>= \tokens-> get source
>>= maybe (error "No end to the end tag!") (return . (tokens ++) . (:[]))
findEndTag :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Text
-> Coroutine d m ()
findEndTag source sink endSink name = findTag where
findTag = pourWhile noTagStart source sink
>> get source
>>= maybe (return ()) consumeOne
noTagStart (Markup (Start StartTag)) = False
noTagStart (Markup (Start EndTag)) = False
noTagStart _ = True
consumeOne x@(Markup (Start EndTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> getRestOfEndTag source
>>= \rest-> if name == name'
then putList (tokens ++ rest) endSink
>> return ()
else putList (tokens ++ rest) sink
>> findTag)
mn
consumeOne x@(Markup (Start StartTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) sink
when hasContent (findEndTag source sink sink name')
findTag)
mn
consumeOne _ = error "pourUntil returned early!"
findStartTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text)
-> Coroutine d m (Maybe (Markup XMLToken Text))
findStartTag source sink = pourWhile (/= Markup (Start StartTag)) source sink >> get source
xmlElement :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElement = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put edge ()
put true x
(tokens, mn) <- getElementName source id
maybe
(putList tokens true)
(\name-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) true
if hasContent
then split1 name
else split0)
mn)
split1 name = findEndTag source true true name
>> split0
in split0 >> return ()
xmlElementContent :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementContent = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put false x
(tokens, mn) <- getElementName source id
maybe
(putList tokens false)
(\name-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) false
if hasContent
then put edge () >> split1 name
else split0)
mn)
split1 name = findEndTag source true false name
>> split0
in split0 >> return ()
xmlElementHavingTagWith :: forall m b. Monad m =>
Splitter m (Markup XMLToken Text) b -> Splitter m (Markup XMLToken Text) b
xmlElementHavingTagWith test =
isolateSplitter $ \ source true false edge ->
let split0 = findStartTag source false
>>= maybe (return ())
(\x-> do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name-> do (rest, hasContent) <- getRestOfStartTag source
let tag = tokens ++ rest
(_, found) <- pipe (putList tag) (findsTrueIn test)
case found of Just mb -> maybe (return ()) (put edge) mb
>> putList tag true
>> split1 hasContent true name
Nothing -> putList tag false
>> split1 hasContent false name)
mn)
split1 hasContent sink name = when hasContent (findEndTag source sink sink name)
>> split0
in split0
xmlAttribute :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttribute = Splitter $
\source true false edge->
let split0 = getWith
(\x-> case x
of Markup (Start AttributeName) ->
do put edge ()
put true x
pourRestOfRegion AttributeName source true true
>>= flip when split1
_ -> put false x >> split0)
source
split1 = getWith
(\x-> case x
of Markup (Start AttributeValue)
-> put true x
>> pourRestOfRegion AttributeValue source true true
>>= flip when split0
_ -> put true x >> split1)
source
in split0
xmlElementName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementName = Splitter (splitSimpleRegions ElementName)
xmlAttributeName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeName = Splitter (splitSimpleRegions AttributeName)
xmlAttributeValue :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeValue = Splitter (splitSimpleRegions AttributeValue)
splitSimpleRegions :: Monad m => XMLToken -> OpenSplitter m a1 a2 a3 a4 d (Markup XMLToken Text) () ()
splitSimpleRegions token source true false edge = split0
where split0 = getWith consumeOne source
consumeOne x@(Markup (Start token')) | token == token' = put false x
>> put edge ()
>> pourRestOfRegion token source true false
>>= flip when split0
consumeOne x = put false x >> split0
isContent :: Markup b x -> Bool
isContent Content{} = True
isContent _ = False
fromContent :: Markup b x -> x
fromContent (Content x) = x
fromContent _ = error "fromContent expects Content!"