module Control.Concurrent.SCC.XML (
xmlTokens, parseXMLTokens, expandXMLEntity, XMLToken(..),
xmlElement, xmlElementContent, xmlElementName, xmlAttribute, xmlAttributeName, xmlAttributeValue,
xmlElementHavingTagWith
)
where
import Prelude hiding (takeWhile)
import Control.Applicative (Alternative ((<|>)))
import Control.Arrow ((>>>))
import Control.Monad (when)
import Data.Char
import Data.Maybe (mapMaybe)
import Data.Monoid (Monoid(..))
import Data.List (find)
import Data.Text (Text, pack, unpack, singleton)
import qualified Data.Text as Text
import Numeric (readDec, readHex)
import Data.Functor.Contravariant.Ticker (andThen, tickOne, tickWhile)
import Text.ParserCombinators.Incremental (Parser, more, feed, anyToken, satisfy, concatMany, takeWhile, takeWhile1, string,
moptional, skip, lookAhead, notFollowedBy, mapIncremental, (><))
import qualified Text.ParserCombinators.Incremental.LeftBiasedLocal as LeftBiasedLocal (Parser)
import Text.ParserCombinators.Incremental.LeftBiasedLocal (leftmost)
import Control.Monad.Coroutine (Coroutine, sequentialBinder)
import Control.Concurrent.SCC.Streams
import Control.Concurrent.SCC.Types hiding (Parser)
import Control.Concurrent.SCC.Coercions (coerce)
import Control.Concurrent.SCC.Combinators (parserToSplitter, findsTrueIn)
data XMLToken = StartTag | EndTag | EmptyTag
| ElementName | AttributeName | AttributeValue
| EntityReference | EntityName
| ProcessingInstruction | ProcessingInstructionText
| Comment | CommentText
| StartMarkedSectionCDATA | EndMarkedSection | DoctypeDeclaration
| ErrorToken String
deriving (Eq, Show)
expandXMLEntity :: String -> String
expandXMLEntity "lt" = "<"
expandXMLEntity "gt" = ">"
expandXMLEntity "quot" = "\""
expandXMLEntity "apos" = "'"
expandXMLEntity "amp" = "&"
expandXMLEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)]
expandXMLEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)]
expandXMLEntity e = error ("String \"" ++ e ++ "\" is not a built-in entity name.")
newtype XMLStream = XMLStream {chunk :: [Markup XMLToken Text]} deriving (Show)
instance Monoid XMLStream where
mempty = XMLStream []
l `mappend` XMLStream [] = l
XMLStream [] `mappend` r = r
XMLStream l `mappend` XMLStream r@((Content rc):rt) =
case last l
of Content lc -> XMLStream (init l ++ Content (mappend lc rc) : rt)
_ -> XMLStream (l ++ r)
XMLStream l `mappend` XMLStream r = XMLStream (l ++ r)
xmlParser :: LeftBiasedLocal.Parser String XMLStream
xmlParser = concatMany (xmlContent <|> xmlMarkup)
where xmlContent = mapContent $ takeWhile1 (\x-> x /= "<" && x /= "&")
xmlMarkup = (string "<" >> ((startTag <|> endTag <|> processingInstruction <|> declaration)
<|> return (XMLStream [Markup $ Point errorUnescapedContentLT,
Content (singleton '<')])))
<|>
entityReference "&"
startTag = return (XMLStream [Markup (Start StartTag), Content (singleton '<'), Markup (Start ElementName)])
>< name
>< return (XMLStream [Markup (End ElementName)])
>< whiteSpace
>< attributes
>< moptional (string "/" >> return (XMLStream [Markup (Point EmptyTag), Content (singleton '/')]))
>< whiteSpace
>< (string ">" >> return (XMLStream [Content (singleton '>'), Markup (End StartTag)])
<|> return (XMLStream [Markup $ Point unterminatedStartTag, Markup $ End StartTag]))
entityReference s = string s
>> (return (XMLStream [Markup (Start EntityReference), Content (pack s),
Markup (Start EntityName)])
>< name
>< (string ";" >> return (XMLStream [Markup (End EntityName), Content (singleton ';'),
Markup (End EntityReference)]))
<|> return (XMLStream [Markup $ Point $ errorBadEntityReference, Content (pack s)]))
attributes = concatMany (attribute >< whiteSpace)
attribute = return (XMLStream [Markup (Start AttributeName)])
>< name
>< return (XMLStream [Markup (End AttributeName)])
>< (mapContent (string "=")
<|> (fmap (\x-> XMLStream [Markup $ Point $ errorBadAttribute x]) anyToken
>< whiteSpace >< moptional (mapContent $ string "=")))
>< ((string "\"" <|> string "\'")
>>= \quote-> return (XMLStream [Content $ pack quote, Markup (Start AttributeValue)])
>< mapContent (takeWhile (/= quote))
>< return (XMLStream [Markup (End AttributeValue), Content $ pack quote])
>< skip (string quote)
<|> (anyToken >>= \q-> return (XMLStream [Markup $ Point $ errorBadQuoteCharacter q,
Content $ pack quote])))
endTag = (string "/" >> return (XMLStream [Markup (Start EndTag), Content (pack "</"),
Markup (Start ElementName)]))
>< name
>< return (XMLStream [Markup (End ElementName)])
>< whiteSpace
>< (string ">" >> return (XMLStream [Content (singleton '>'), Markup (End EndTag)])
<|> return (XMLStream [Markup $ Point unterminatedEndTag, Markup (End EndTag)]))
processingInstruction = (string "?"
>> return (XMLStream [Markup (Start ProcessingInstruction), Content (pack "<?"),
Markup (Start ProcessingInstructionText)]))
>< upto "?>"
>< (string "?>"
>> return (XMLStream [Markup (End ProcessingInstructionText), Content (pack "?>"),
Markup (End ProcessingInstruction)])
<|> return (XMLStream [Markup $ Point unterminatedProcessingInstruction]))
declaration = string "!"
>> ((comment <|> cdataMarkedSection <|> doctypeDeclaration)
<|> return (XMLStream [Markup $ Point $ errorBadDeclarationType, Content (pack "<")]))
comment = (string "--" >> return (XMLStream [Markup (Start Comment), Content (pack "<!--"),
Markup (Start CommentText)]))
>< upto "-->"
>< (string "-->" >> return (XMLStream [Markup (End CommentText), Content (pack "-->"),
Markup (End Comment)])
<|> return (XMLStream [Markup $ Point unterminatedComment]))
cdataMarkedSection = (string "[CDATA["
>> return (XMLStream [Markup (Start StartMarkedSectionCDATA), Content (pack "<![CDATA["),
Markup (End StartMarkedSectionCDATA)]))
>< upto "]]>"
>< (string "]]>"
>> return (XMLStream [Markup (Start EndMarkedSection), Content (pack "]]>"),
Markup (End EndMarkedSection)])
<|> return (XMLStream [Markup $ Point unterminatedMarkedSection]))
doctypeDeclaration = (string "DOCTYPE" >> return (XMLStream [Markup (Start DoctypeDeclaration),
Content (pack "<!DOCTYPE")]))
>< whiteSpace
>< (name
>< whiteSpace
>< moptional ((mapContent (string "SYSTEM")
<|> mapContent (string "PUBLIC") >< whiteSpace >< literal)
>< whiteSpace >< literal >< whiteSpace)
>< moptional (mapContent (string "[") >< whiteSpace
>< concatMany ((markupDeclaration <|> comment <|> processingInstruction
<|> entityReference "%")
>< whiteSpace)
>< mapContent (string "]") >< whiteSpace)
>< mapContent (string ">")
<|> return (XMLStream [Markup (Point errorMalformedDoctypeDeclaration)]))
>< return (XMLStream [Markup (End DoctypeDeclaration)])
literal = (string "\"" <|> string "\'")
>>= \quote-> return (XMLStream [Content $ pack quote])
>< mapContent (takeWhile (/= quote))
>< return (XMLStream [Content $ pack quote])
>< skip (string quote)
markupDeclaration= mapContent (string "<!")
>< (concatMany (mapContent (takeWhile1 (\x-> x /= ">" && x /= "\"" && x /= "\'")) <|> literal)
>< mapContent (string ">")
<|> return (XMLStream [Markup $ Point unterminatedMarkupDeclaration]))
name = mapContent (takeWhile1 (isNameChar . head))
mapContent = mapIncremental (XMLStream . (:[]) . Content . pack)
whiteSpace = mapContent (takeWhile (isSpace . head))
upto end@(lead:_) = mapContent (concatMany (takeWhile1 (/= [lead]) <|> notFollowedBy (string end) >< anyToken))
errorBadQuoteCharacter q = ErrorToken ("Invalid quote character " ++ show q)
errorBadAttribute x = ErrorToken ("Invalid character " ++ show x ++ " following attribute name")
errorBadEntityReference = ErrorToken "Invalid entity reference."
errorBadDeclarationType = ErrorToken "The \"<!\" sequence must be followed by \"[CDATA[\" or \"--\"."
errorMalformedDoctypeDeclaration = ErrorToken "Malformed DOCTYPE declaration."
errorUnescapedContentLT = ErrorToken "Unescaped character '<' in content"
unterminatedComment = ErrorToken "Unterminated comment."
unterminatedMarkedSection = ErrorToken "Unterminated marked section."
unterminatedMarkupDeclaration = ErrorToken "Unterminated markup declaration."
unterminatedStartTag = ErrorToken "Missing '>' at the end of start tag."
unterminatedEndTag = ErrorToken "Missing '>' at the end of end tag."
unterminatedProcessingInstruction = ErrorToken "Unterminated processing instruction."
isNameStart x = isLetter x || x == '_'
isNameChar x = isAlphaNum x || x == '_' || x == '-' || x == ':'
xmlTokens :: Monad m => Splitter m Char (Boundary XMLToken)
xmlTokens = parserToSplitter (parseXMLTokens >>> statelessTransducer unpackContent)
where unpackContent :: Markup XMLToken Text -> [Markup XMLToken Char]
unpackContent (Markup b) = [Markup b]
unpackContent (Content c) = map Content (unpack c)
parseXMLTokens :: Monad m => Transducer m Char (Markup XMLToken Text)
parseXMLTokens = Transducer (pourParsed (mapIncremental chunk xmlParser))
dispatchOnString :: forall m a d r. (Monad m, AncestorFunctor a d) =>
Source m a Char -> (String -> Coroutine d m r) -> [(String, String -> Coroutine d m r)]
-> Coroutine d m r
dispatchOnString source failure fullCases = dispatch fullCases id
where dispatch cases consumed
= case find (null . fst) cases
of Just (~"", rhs) -> rhs (consumed "")
Nothing -> get source
>>= maybe
(failure (consumed ""))
(\x-> case mapMaybe (startingWith x) cases
of [] -> failure (consumed [x])
subcases -> dispatch (subcases ++ fullCases) (consumed . (x :)))
startingWith x ~(y:rest, rhs) | x == y = Just (rest, rhs)
| otherwise = Nothing
getElementName :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> ([Markup XMLToken Text] -> [Markup XMLToken Text])
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getElementName source f = get source
>>= maybe
(return (f [], Nothing))
(\x-> let f' = f . (x:)
in case x
of Markup (Start ElementName) -> getRestOfRegion ElementName source f' id
Markup (Point ErrorToken{}) -> getElementName source f'
Content{} -> getElementName source f'
_ -> error ("Expected an ElementName, received " ++ show x))
getRestOfRegion :: forall m a d. (Monad m, AncestorFunctor a d) =>
XMLToken -> Source m a (Markup XMLToken Text)
-> ([Markup XMLToken Text] -> [Markup XMLToken Text]) -> (Text -> Text)
-> Coroutine d m ([Markup XMLToken Text], Maybe Text)
getRestOfRegion token source f g = getWhile isContent source
>>= \content-> get source
>>= \x-> case x
of Just y@(Markup End{})
-> return (f (content ++ [y]),
Just (g $ Text.concat $ map fromContent content))
_ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x)
pourRestOfRegion :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
XMLToken -> Source m a1 (Markup XMLToken Text)
-> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Coroutine d m Bool
pourRestOfRegion token source sink endSink = pourWhile isContent source sink
>> get source
>>= maybe
(return False)
(\x-> case x
of Markup (End token') | token == token' -> put endSink x
>> return True
_ -> error ("Expected rest of " ++ show token
++ ", received " ++ show x))
getRestOfStartTag :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> Coroutine d m ([Markup XMLToken Text], Bool)
getRestOfStartTag source = do rest <- getWhile notEndTag source
end <- get source
case end of Nothing -> return (rest, False)
Just e@(Markup (End StartTag)) -> return (rest ++ [e], True)
Just e@(Markup (Point EmptyTag)) ->
getRestOfStartTag source
>>= \(rest', _)-> return (rest ++ (e: rest'), False)
_ -> error "getWhile returned early!"
where notEndTag (Markup (End StartTag)) = False
notEndTag (Markup (Point EmptyTag)) = False
notEndTag _ = True
getRestOfEndTag :: forall m a d. (Monad m, AncestorFunctor a d) =>
Source m a (Markup XMLToken Text) -> Coroutine d m [Markup XMLToken Text]
getRestOfEndTag source = getWhile (/= Markup (End EndTag)) source
>>= \tokens-> get source
>>= maybe (error "No end to the end tag!") (return . (tokens ++) . (:[]))
findEndTag :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text)
-> Text
-> Coroutine d m ()
findEndTag source sink endSink name = findTag where
findTag = pourWhile noTagStart source sink
>> get source
>>= maybe (return ()) consumeOne
noTagStart (Markup (Start StartTag)) = False
noTagStart (Markup (Start EndTag)) = False
noTagStart _ = True
consumeOne x@(Markup (Start EndTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> getRestOfEndTag source
>>= \rest-> if name == name'
then putList (tokens ++ rest) endSink
>> return ()
else putList (tokens ++ rest) sink
>> findTag)
mn
consumeOne x@(Markup (Start StartTag)) = do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name'-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) sink
when hasContent (findEndTag source sink sink name')
findTag)
mn
consumeOne _ = error "pourWhile returned early!"
findStartTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) =>
Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text)
-> Coroutine d m (Maybe (Markup XMLToken Text))
findStartTag source sink = pourWhile (/= Markup (Start StartTag)) source sink >> get source
xmlElement :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElement = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put edge ()
put true x
(tokens, mn) <- getElementName source id
maybe
(putList tokens true)
(\name-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) true
if hasContent
then split1 name
else split0)
mn)
split1 name = findEndTag source true true name
>> split0
in split0 >> return ()
xmlElementContent :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementContent = Splitter $
\source true false edge->
let split0 = findStartTag source false
>>= maybe (return [])
(\x-> do put false x
(tokens, mn) <- getElementName source id
maybe
(putList tokens false)
(\name-> do (rest, hasContent) <- getRestOfStartTag source
_ <- putList (tokens ++ rest) false
if hasContent
then put edge () >> split1 name
else split0)
mn)
split1 name = findEndTag source true false name
>> split0
in split0 >> return ()
xmlElementHavingTagWith :: forall m b. Monad m =>
Splitter m (Markup XMLToken Text) b -> Splitter m (Markup XMLToken Text) b
xmlElementHavingTagWith test =
isolateSplitter $ \ source true false edge ->
let split0 = findStartTag source false
>>= maybe (return ())
(\x-> do (tokens, mn) <- getElementName source (x :)
maybe
(return ())
(\name-> do (rest, hasContent) <- getRestOfStartTag source
let tag = tokens ++ rest
(_, found) <- pipe (putList tag) (findsTrueIn test)
case found of Just mb -> maybe (return ()) (put edge) mb
>> putList tag true
>> split1 hasContent true name
Nothing -> putList tag false
>> split1 hasContent false name)
mn)
split1 hasContent sink name = when hasContent (findEndTag source sink sink name)
>> split0
in split0
xmlAttribute :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttribute = Splitter $
\source true false edge->
let split0 = getWith
(\x-> case x
of Markup (Start AttributeName) ->
do put edge ()
put true x
pourRestOfRegion AttributeName source true true
>>= flip when split1
_ -> put false x >> split0)
source
split1 = getWith
(\x-> case x
of Markup (Start AttributeValue)
-> put true x
>> pourRestOfRegion AttributeValue source true true
>>= flip when split0
_ -> put true x >> split1)
source
in split0
xmlElementName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlElementName = Splitter (splitSimpleRegions ElementName)
xmlAttributeName :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeName = Splitter (splitSimpleRegions AttributeName)
xmlAttributeValue :: Monad m => Splitter m (Markup XMLToken Text) ()
xmlAttributeValue = Splitter (splitSimpleRegions AttributeValue)
splitSimpleRegions :: Monad m => XMLToken -> OpenSplitter m a1 a2 a3 a4 d (Markup XMLToken Text) () ()
splitSimpleRegions token source true false edge = split0
where split0 = getWith consumeOne source
consumeOne x@(Markup (Start token')) | token == token' = put false x
>> put edge ()
>> pourRestOfRegion token source true false
>>= flip when split0
consumeOne x = put false x >> split0
isContent :: Markup b x -> Bool
isContent Content{} = True
isContent _ = False
fromContent :: Markup b x -> x
fromContent (Content x) = x
fromContent _ = error "fromContent expects Content!"