{- Copyright 2009-2010 Mario Blazevic This file is part of the Streaming Component Combinators (SCC) project. The SCC project is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. SCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with SCC. If not, see . -} -- | Module "XML" defines primitives and combinators for parsing and manipulating XML. {-# LANGUAGE PatternGuards, FlexibleContexts, MultiParamTypeClasses, ScopedTypeVariables, Rank2Types #-} {-# OPTIONS_HADDOCK hide #-} module Control.Concurrent.SCC.XML ( -- * Parsing XML xmlTokens, parseXMLTokens, expandXMLEntity, XMLToken(..), -- * XML splitters xmlElement, xmlElementContent, xmlElementName, xmlAttribute, xmlAttributeName, xmlAttributeValue, xmlElementHavingTagWith ) where import Control.Monad (when) import Data.Char import Data.Maybe (mapMaybe) import Data.List (find) import Data.Text (Text) import qualified Data.Text as Text import Numeric (readDec, readHex) import Control.Cofunctor.Ticker (andThen, tickOne, tickWhile) import Control.Monad.Coroutine (Coroutine, sequentialBinder) import Control.Concurrent.SCC.Streams import Control.Concurrent.SCC.Types import Control.Concurrent.SCC.Coercions (coerce) import Control.Concurrent.SCC.Combinators (parseEachNestedRegion, findsTrueIn) data XMLToken = StartTag | EndTag | EmptyTag | ElementName | AttributeName | AttributeValue | EntityReferenceToken | EntityName | ProcessingInstruction | ProcessingInstructionText | Comment | CommentText | StartMarkedSectionCDATA | EndMarkedSection | ErrorToken String deriving (Eq, Show) -- | Converts an XML entity name into the text value it represents: @expandXMLEntity \"lt\" = \"<\"@. expandXMLEntity :: String -> String expandXMLEntity "lt" = "<" expandXMLEntity "gt" = ">" expandXMLEntity "quot" = "\"" expandXMLEntity "apos" = "'" expandXMLEntity "amp" = "&" expandXMLEntity ('#' : 'x' : codePoint) = [chr (fst $ head $ readHex codePoint)] expandXMLEntity ('#' : codePoint) = [chr (fst $ head $ readDec codePoint)] expandXMLEntity e = error ("String \"" ++ e ++ "\" is not a built-in entity name.") -- | This splitter splits XML markup from data content. It is used by 'parseXMLTokens'. xmlTokens :: Monad m => Splitter m Char (Boundary XMLToken) xmlTokens = Splitter $ \source true false edge-> let getContent = pourWhile (\x-> x /= '<' && x /= '&') source false >> getWith contentEnd source contentEnd '<' = get source >>= maybe (put edge (Point errorUnescapedContentLT) >> put false '<') (\x-> tag x >> getContent) contentEnd '&' = entity >> getContent contentEnd _ = error "pourUntil returned early!" tag '?' = put edge (Start ProcessingInstruction) >> putList "> put edge (Start ProcessingInstructionText) >> processingInstruction tag '!' = dispatchOnString source (\other-> put edge (Point (errorBadDeclarationType other))) [("--", const (put edge (Start Comment) >> putList "", \match-> put edge (End CommentText) >> putList match true >> put edge (End Comment) >> getContent)] markedSection = {-# SCC "> dispatchOnString source (\other-> if null other then put edge (Point errorInputEndInMarkedSection) else putList other true >> markedSection) [("]]>", \match-> put edge (Start EndMarkedSection) >> putList match true >> put edge (End EndMarkedSection) >> getContent)] entity = put edge (Start EntityReferenceToken) >> put true '&' >> next errorInputEndInEntityReference (\x-> name EntityName x >> next errorInputEndInEntityReference (\y-> do when (y /= ';') (put edge (Point (errorBadEntityReference y))) put true y)) >> put edge (End EntityReferenceToken) name token x = {-# SCC "name" #-} put edge (Start token) >> nameTail x >> put edge (End token) nameTail x = getWhile isNameChar source >>= \rest-> putList (x:rest) true next errorToken f = get source >>= maybe (put edge (Point errorToken)) f in getContent where errorInputEndInComment = ErrorToken "Unterminated comment" errorInputEndInMarkedSection = ErrorToken "Unterminated marked section" errorInputEndInStartTag = ErrorToken "Missing '>' at the end of start tag." errorInputEndInEndTag = ErrorToken "End of input in end tag" errorInputEndInAttributeValue = ErrorToken "Truncated input after attribute name" errorInputEndInEntityReference = ErrorToken "End of input in entity reference" errorInputEndInProcessingInstruction = ErrorToken "Unterminated processing instruction" errorBadQuoteCharacter q = ErrorToken ("Invalid quote character " ++ show q) errorBadStartTag x = ErrorToken ("Invalid character " ++ show x ++ " in start tag") errorBadEndTag x = ErrorToken ("Invalid character " ++ show x ++ " in end tag") errorBadAttribute x = ErrorToken ("Invalid character " ++ show x ++ " following attribute name") errorBadEntityReference x = ErrorToken ("Invalid character " ++ show x ++ " ends entity name.") errorBadDeclarationType other = ErrorToken ("Expecting Transducer m Char (Markup XMLToken Text) parseXMLTokens = parseEachNestedRegion sequentialBinder xmlTokens coerce dispatchOnString :: forall m a d r. (Monad m, AncestorFunctor a d) => Source m a Char -> (String -> Coroutine d m r) -> [(String, String -> Coroutine d m r)] -> Coroutine d m r dispatchOnString source failure fullCases = dispatch fullCases id where dispatch cases consumed = case find (null . fst) cases of Just (~"", rhs) -> rhs (consumed "") Nothing -> get source >>= maybe (failure (consumed "")) (\x-> case mapMaybe (startingWith x) cases of [] -> failure (consumed [x]) subcases -> dispatch (subcases ++ fullCases) (consumed . (x :))) startingWith x ~(y:rest, rhs) | x == y = Just (rest, rhs) | otherwise = Nothing getElementName :: forall m a d. (Monad m, AncestorFunctor a d) => Source m a (Markup XMLToken Text) -> ([Markup XMLToken Text] -> [Markup XMLToken Text]) -> Coroutine d m ([Markup XMLToken Text], Maybe Text) getElementName source f = get source >>= maybe (return (f [], Nothing)) (\x-> let f' = f . (x:) in case x of Markup (Start ElementName) -> getRestOfRegion ElementName source f' id Markup (Point ErrorToken{}) -> getElementName source f' Content{} -> getElementName source f' _ -> error ("Expected an ElementName, received " ++ show x)) getRestOfRegion :: forall m a d. (Monad m, AncestorFunctor a d) => XMLToken -> Source m a (Markup XMLToken Text) -> ([Markup XMLToken Text] -> [Markup XMLToken Text]) -> (Text -> Text) -> Coroutine d m ([Markup XMLToken Text], Maybe Text) getRestOfRegion token source f g = getWhile isContent source >>= \content-> get source >>= \x-> case x of Just y@(Markup End{}) -> return (f (content ++ [y]), Just (g $ Text.concat $ map fromContent content)) _ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x) pourRestOfRegion :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) => XMLToken -> Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text) -> Coroutine d m Bool pourRestOfRegion token source sink endSink = pourWhile isContent source sink >> get source >>= maybe (return False) (\x-> case x of Markup (End token') | token == token' -> put endSink x >> return True _ -> error ("Expected rest of " ++ show token ++ ", received " ++ show x)) getRestOfStartTag :: forall m a d. (Monad m, AncestorFunctor a d) => Source m a (Markup XMLToken Text) -> Coroutine d m ([Markup XMLToken Text], Bool) getRestOfStartTag source = do rest <- getWhile notEndTag source end <- get source case end of Nothing -> return (rest, False) Just e@(Markup (End StartTag)) -> return (rest ++ [e], True) Just e@(Markup (Point EmptyTag)) -> getRestOfStartTag source >>= \(rest', _)-> return (rest ++ (e: rest'), False) _ -> error "getWhile returned early!" where notEndTag (Markup (End StartTag)) = False notEndTag (Markup (Point EmptyTag)) = False notEndTag _ = True getRestOfEndTag :: forall m a d. (Monad m, AncestorFunctor a d) => Source m a (Markup XMLToken Text) -> Coroutine d m [Markup XMLToken Text] getRestOfEndTag source = getWhile (/= Markup (End EndTag)) source >>= \tokens-> get source >>= maybe (error "No end to the end tag!") (return . (tokens ++) . (:[])) findEndTag :: forall m a1 a2 a3 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d, AncestorFunctor a3 d) => Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Sink m a3 (Markup XMLToken Text) -> Text -> Coroutine d m () findEndTag source sink endSink name = findTag where findTag = pourWhile noTagStart source sink >> get source >>= maybe (return ()) consumeOne noTagStart (Markup (Start StartTag)) = False noTagStart (Markup (Start EndTag)) = False noTagStart _ = True consumeOne x@(Markup (Start EndTag)) = do (tokens, mn) <- getElementName source (x :) maybe (return ()) (\name'-> getRestOfEndTag source >>= \rest-> if name == name' then putList (tokens ++ rest) endSink >> return () else putList (tokens ++ rest) sink >> findTag) mn consumeOne x@(Markup (Start StartTag)) = do (tokens, mn) <- getElementName source (x :) maybe (return ()) (\name'-> do (rest, hasContent) <- getRestOfStartTag source _ <- putList (tokens ++ rest) sink when hasContent (findEndTag source sink sink name') findTag) mn consumeOne _ = error "pourUntil returned early!" findStartTag :: forall m a1 a2 d. (Monad m, AncestorFunctor a1 d, AncestorFunctor a2 d) => Source m a1 (Markup XMLToken Text) -> Sink m a2 (Markup XMLToken Text) -> Coroutine d m (Maybe (Markup XMLToken Text)) findStartTag source sink = pourWhile (/= Markup (Start StartTag)) source sink >> get source -- | Splits all top-level elements with all their content to /true/, all other input to /false/. xmlElement :: Monad m => Splitter m (Markup XMLToken Text) () xmlElement = Splitter $ \source true false edge-> let split0 = findStartTag source false >>= maybe (return []) (\x-> do put edge () put true x (tokens, mn) <- getElementName source id maybe (putList tokens true) (\name-> do (rest, hasContent) <- getRestOfStartTag source _ <- putList (tokens ++ rest) true if hasContent then split1 name else split0) mn) split1 name = findEndTag source true true name >> split0 in split0 >> return () -- | Splits the content of all top-level elements to /true/, their tags and intervening input to /false/. xmlElementContent :: Monad m => Splitter m (Markup XMLToken Text) () xmlElementContent = Splitter $ \source true false edge-> let split0 = findStartTag source false >>= maybe (return []) (\x-> do put false x (tokens, mn) <- getElementName source id maybe (putList tokens false) (\name-> do (rest, hasContent) <- getRestOfStartTag source _ <- putList (tokens ++ rest) false if hasContent then put edge () >> split1 name else split0) mn) split1 name = findEndTag source true false name >> split0 in split0 >> return () -- | Similiar to @('Control.Concurrent.SCC.Combinators.having' 'element')@, except it runs the argument splitter -- only on each element's start tag, not on the entire element with its content. xmlElementHavingTagWith :: forall m b. Monad m => Splitter m (Markup XMLToken Text) b -> Splitter m (Markup XMLToken Text) b xmlElementHavingTagWith test = isolateSplitter $ \ source true false edge -> let split0 = findStartTag source false >>= maybe (return ()) (\x-> do (tokens, mn) <- getElementName source (x :) maybe (return ()) (\name-> do (rest, hasContent) <- getRestOfStartTag source let tag = tokens ++ rest (_, found) <- pipe (putList tag) (findsTrueIn test) case found of Just mb -> maybe (return ()) (put edge) mb >> putList tag true >> split1 hasContent true name Nothing -> putList tag false >> split1 hasContent false name) mn) split1 hasContent sink name = when hasContent (findEndTag source sink sink name) >> split0 in split0 -- | Splits every attribute specification to /true/, everything else to /false/. xmlAttribute :: Monad m => Splitter m (Markup XMLToken Text) () xmlAttribute = Splitter $ \source true false edge-> let split0 = getWith (\x-> case x of Markup (Start AttributeName) -> do put edge () put true x pourRestOfRegion AttributeName source true true >>= flip when split1 _ -> put false x >> split0) source split1 = getWith (\x-> case x of Markup (Start AttributeValue) -> put true x >> pourRestOfRegion AttributeValue source true true >>= flip when split0 _ -> put true x >> split1) source in split0 -- | Splits every element name, including the names of nested elements and names in end tags, to /true/, all the rest of -- input to /false/. xmlElementName :: Monad m => Splitter m (Markup XMLToken Text) () xmlElementName = Splitter (splitSimpleRegions ElementName) -- | Splits every attribute name to /true/, all the rest of input to /false/. xmlAttributeName :: Monad m => Splitter m (Markup XMLToken Text) () xmlAttributeName = Splitter (splitSimpleRegions AttributeName) -- | Splits every attribute value, excluding the quote delimiters, to /true/, all the rest of input to /false/. xmlAttributeValue :: Monad m => Splitter m (Markup XMLToken Text) () xmlAttributeValue = Splitter (splitSimpleRegions AttributeValue) splitSimpleRegions :: Monad m => XMLToken -> OpenSplitter m a1 a2 a3 a4 d (Markup XMLToken Text) () () splitSimpleRegions token source true false edge = split0 where split0 = getWith consumeOne source consumeOne x@(Markup (Start token')) | token == token' = put false x >> put edge () >> pourRestOfRegion token source true false >>= flip when split0 consumeOne x = put false x >> split0 isContent :: Markup b x -> Bool isContent Content{} = True isContent _ = False fromContent :: Markup b x -> x fromContent (Content x) = x fromContent _ = error "fromContent expects Content!"