{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TemplateHaskell #-} -- | Streaming parsers for the RSS 2.0 standard. module Text.RSS.Conduit.Parse ( -- * Top-level rssDocument -- * Elements , rssCategory , rssCloud , rssEnclosure , rssGuid , rssImage , rssItem , rssSkipDays , rssSkipHours , rssSource , rssTextInput ) where -- {{{ Imports import Text.RSS.Types import Control.Applicative import Control.Foldl hiding (mconcat) import Control.Monad hiding (foldM) import Control.Monad.Catch import Data.Conduit.Parser import Data.Conduit.Parser.XML import Data.Maybe import Data.Monoid import Data.MonoTraversable import Data.Set hiding (fold) import Data.Text as Text hiding (cons, last, map, snoc) import Data.Text.Encoding import Data.Time.Clock import Data.Time.LocalTime import Data.Time.RFC822 import Data.Version import Data.XML.Types import Lens.Simple import Prelude hiding (last, lookup) import Text.Parser.Combinators import Text.ParserCombinators.ReadP (readP_to_S) import Text.Read (readMaybe) import URI.ByteString -- }}} -- {{{ Util asRssURI :: (MonadThrow m) => Text -> m RssURI asRssURI t = case (parseURI' t, parseRelativeRef' t) of (Right u, _) -> return $ RssURI u (_, Right u) -> return $ RssURI u (_, Left e) -> throwM $ InvalidURI e where parseURI' = parseURI laxURIParserOptions . encodeUtf8 parseRelativeRef' = parseRelativeRef laxURIParserOptions . encodeUtf8 asInt :: MonadThrow m => Text -> m Int asInt t = maybe (throwM $ InvalidInt t) return . readMaybe $ unpack t asBool :: MonadThrow m => Text -> m Bool asBool "true" = return True asBool "false" = return False asBool t = throwM $ InvalidBool t asVersion :: MonadThrow m => Text -> m Version asVersion t = maybe (throwM $ InvalidVersion t) return . fmap fst . headMay . readP_to_S parseVersion $ unpack t asCloudProtocol :: MonadThrow m => Text -> m CloudProtocol asCloudProtocol "xml-rpc" = return ProtocolXmlRpc asCloudProtocol "soap" = return ProtocolSoap asCloudProtocol "http-post" = return ProtocolHttpPost asCloudProtocol t = throwM $ InvalidProtocol t unknownTag :: (MonadCatch m) => ConduitParser Event m (Endo a) unknownTag = anyTag $ \_ _ -> many (void unknownTag <|> void textContent) >> return mempty -- | Like 'tagName' but ignores the namespace. tagName' :: (MonadCatch m) => Text -> AttrParser a -> (a -> ConduitParser Event m b) -> ConduitParser Event m b tagName' t = tagPredicate (\n -> nameLocalName n == t) -- | Tag which content is a date-time that follows RFC 3339 format. tagDate :: (MonadCatch m) => Name -> ConduitParser Event m UTCTime tagDate name = tagIgnoreAttrs name $ content (fmap zonedTimeToUTC . parseTimeRFC822) lastRequired :: (Monad m, Parsing m) => String -> FoldM m a a lastRequired e = FoldM (\_ a -> return $ Right a) (return $ Left e) (either unexpected return) -- }}} -- | Parse a @\@ element. rssSkipHours :: MonadCatch m => ConduitParser Event m (Set Hour) rssSkipHours = named "Rss element" $ tagIgnoreAttrs "skipHours" $ fromList <$> many (tagIgnoreAttrs "hour" $ content (asInt >=> asHour)) -- | Parse a @\@ element. rssSkipDays :: MonadCatch m => ConduitParser Event m (Set Day) rssSkipDays = named "Rss element" $ tagIgnoreAttrs "skipDays" $ fromList <$> many (tagIgnoreAttrs "day" $ content asDay) data TextInputPiece = TextInputTitle Text | TextInputDescription Text | TextInputName Text | TextInputLink RssURI | TextInputUnknown makeTraversals ''TextInputPiece -- | Parse a @\@ element. rssTextInput :: (MonadCatch m) => ConduitParser Event m RssTextInput rssTextInput = named "Rss element" $ tagIgnoreAttrs "textInput" $ do p <- many piece flip foldM p $ RssTextInput <$> handlesM _TextInputTitle (lastRequired "Missing element.") <*> handlesM _TextInputDescription (lastRequired "Missing <description> element.") <*> handlesM _TextInputName (lastRequired "Missing <name> element.") <*> handlesM _TextInputLink (lastRequired "Missing <link> element.") where piece :: MonadCatch m => ConduitParser Event m TextInputPiece piece = choice [ TextInputTitle <$> tagIgnoreAttrs "title" textContent , TextInputDescription <$> tagIgnoreAttrs "description" textContent , TextInputName <$> tagIgnoreAttrs "name" textContent , TextInputLink <$> tagIgnoreAttrs "link" (content asRssURI) , TextInputUnknown <$ unknownTag ] data ImagePiece = ImageUri RssURI | ImageTitle Text | ImageLink RssURI | ImageWidth Int | ImageHeight Int | ImageDescription Text | ImageUnknown makeTraversals ''ImagePiece -- | Parse an @\<image\>@ element. rssImage :: (MonadCatch m) => ConduitParser Event m RssImage rssImage = named "Rss <image> element" $ tagIgnoreAttrs "image" $ do p <- many piece flip foldM p $ RssImage <$> handlesM _ImageUri (lastRequired "Missing <uri> element.") <*> handlesM _ImageTitle (lastRequired "Missing <title> element.") <*> handlesM _ImageLink (lastRequired "Missing <link> element.") <*> generalize (handles _ImageWidth last) <*> generalize (handles _ImageHeight last) <*> generalize (handles _ImageDescription $ lastDef "") where piece = choice [ ImageUri <$> tagIgnoreAttrs "uri" (content asRssURI) , ImageTitle <$> tagIgnoreAttrs "title" textContent , ImageLink <$> tagIgnoreAttrs "link" (content asRssURI) , ImageWidth <$> tagIgnoreAttrs "width" (content asInt) , ImageHeight <$> tagIgnoreAttrs "height" (content asInt) , ImageDescription <$> tagIgnoreAttrs "description" textContent , ImageUnknown <$ unknownTag ] -- | Parse a @\<category\>@ element. rssCategory :: MonadCatch m => ConduitParser Event m RssCategory rssCategory = named "Rss <category> element" $ tagName' "category" (textAttr "domain") $ \domain -> RssCategory domain <$> textContent -- | Parse a @\<cloud\>@ element. rssCloud :: (MonadCatch m) => ConduitParser Event m RssCloud rssCloud = named "Rss <cloud> element" $ tagName' "cloud" attributes return where attributes = do uri <- fmap RssURI $ RelativeRef <$> fmap Just (Authority Nothing <$> (Host . encodeUtf8 <$> textAttr "domain") <*> (fmap Port <$> optional (attr "port" asInt))) <*> (encodeUtf8 <$> textAttr "path") <*> pure (Query []) <*> pure Nothing RssCloud uri <$> textAttr "registerProcedure" <*> attr "protocol" asCloudProtocol <* ignoreAttrs -- | Parse a @\<guid\>@ element. rssGuid :: MonadCatch m => ConduitParser Event m RssGuid rssGuid = named "RSS <guid> element" $ tagName' "guid" attributes handler where attributes = optional (attr "isPermaLink" asBool) <* ignoreAttrs handler (Just True) = GuidUri <$> content asRssURI handler _ = GuidText <$> textContent -- | Parse an @\<enclosure\>@ element. rssEnclosure :: MonadCatch m => ConduitParser Event m RssEnclosure rssEnclosure = named "Rss <enclosure> element" $ tagName' "enclosure" attributes handler where attributes = (,,) <$> attr "url" asRssURI <*> attr "length" asInt <*> textAttr "type" <* ignoreAttrs handler (uri, length_, type_) = return $ RssEnclosure uri length_ type_ -- | Parse a @\<source\>@ element. rssSource :: MonadCatch m => ConduitParser Event m RssSource rssSource = named "Rss <source> element" $ tagName' "source" attributes handler where attributes = attr "url" asRssURI <* ignoreAttrs handler uri = RssSource uri <$> textContent data ItemPiece = ItemTitle Text | ItemLink RssURI | ItemDescription Text | ItemAuthor Text | ItemCategory RssCategory | ItemComments RssURI | ItemEnclosure RssEnclosure | ItemGuid RssGuid | ItemPubDate UTCTime | ItemSource RssSource | ItemUnknown makeTraversals ''ItemPiece -- | Parse an @\<item\>@ element. rssItem :: MonadCatch m => ConduitParser Event m RssItem rssItem = named "Rss <item> element" $ tagIgnoreAttrs "item" $ do p <- many piece return . flip fold p $ RssItem <$> handles _ItemTitle (lastDef "") <*> handles _ItemLink last <*> handles _ItemDescription (lastDef "") <*> handles _ItemAuthor (lastDef "") <*> handles _ItemCategory list <*> handles _ItemComments last <*> handles _ItemEnclosure list <*> handles _ItemGuid last <*> handles _ItemPubDate last <*> handles _ItemSource last where piece = choice [ ItemTitle <$> tagIgnoreAttrs "title" textContent , ItemLink <$> tagIgnoreAttrs "link" (content asRssURI) , ItemDescription <$> tagIgnoreAttrs "description" textContent , ItemAuthor <$> tagIgnoreAttrs "author" textContent , ItemCategory <$> rssCategory , ItemComments <$> tagIgnoreAttrs "comments" (content asRssURI) , ItemEnclosure <$> rssEnclosure , ItemGuid <$> rssGuid , ItemPubDate <$> tagDate "pubDate" , ItemSource <$> rssSource ] data ChannelPiece = ChannelTitle Text | ChannelLink RssURI | ChannelDescription Text | ChannelItem RssItem | ChannelLanguage Text | ChannelCopyright Text | ChannelManagingEditor Text | ChannelWebmaster Text | ChannelPubDate UTCTime | ChannelLastBuildDate UTCTime | ChannelCategory RssCategory | ChannelGenerator Text | ChannelDocs RssURI | ChannelCloud RssCloud | ChannelTtl Int | ChannelImage RssImage | ChannelRating Text | ChannelTextInput RssTextInput | ChannelSkipHours (Set Hour) | ChannelSkipDays (Set Day) | ChannelUnknown makeTraversals ''ChannelPiece -- | Parse an @\<rss\>@ element. rssDocument :: MonadCatch m => ConduitParser Event m RssDocument rssDocument = named "RSS <rss> element" $ tagName' "rss" attributes $ \version -> tagIgnoreAttrs "channel" $ do p <- many piece flip foldM p $ RssDocument version <$> handlesM _ChannelTitle (lastRequired "Missing <title> element.") <*> handlesM _ChannelLink (lastRequired "Missing <link> element.") <*> handlesM _ChannelDescription (lastRequired "Missing <description> element.") <*> generalize (handles _ChannelItem list) <*> generalize (handles _ChannelLanguage $ lastDef "") <*> generalize (handles _ChannelCopyright $ lastDef "") <*> generalize (handles _ChannelManagingEditor $ lastDef "") <*> generalize (handles _ChannelWebmaster $ lastDef "") <*> generalize (handles _ChannelPubDate last) <*> generalize (handles _ChannelLastBuildDate last) <*> generalize (handles _ChannelCategory list) <*> generalize (handles _ChannelGenerator $ lastDef "") <*> generalize (handles _ChannelDocs last) <*> generalize (handles _ChannelCloud last) <*> generalize (handles _ChannelTtl last) <*> generalize (handles _ChannelImage last) <*> generalize (handles _ChannelRating $ lastDef "") <*> generalize (handles _ChannelTextInput last) <*> generalize (handles _ChannelSkipHours $ lastDef mempty) <*> generalize (handles _ChannelSkipDays $ lastDef mempty) where piece :: MonadCatch m => ConduitParser Event m ChannelPiece piece = choice [ ChannelTitle <$> tagIgnoreAttrs "title" textContent , ChannelLink <$> tagIgnoreAttrs "link" (content asRssURI) , ChannelDescription <$> tagIgnoreAttrs "description" textContent , ChannelItem <$> rssItem , ChannelLanguage <$> tagIgnoreAttrs "language" textContent , ChannelCopyright <$> tagIgnoreAttrs "copyright" textContent , ChannelManagingEditor <$> tagIgnoreAttrs "managingEditor" textContent , ChannelWebmaster <$> tagIgnoreAttrs "webMaster" textContent , ChannelPubDate <$> tagDate "pubDate" , ChannelLastBuildDate <$> tagDate "lastBuildDate" , ChannelCategory <$> rssCategory , ChannelGenerator <$> tagIgnoreAttrs "generator" textContent , ChannelDocs <$> tagIgnoreAttrs "docs" (content asRssURI) , ChannelCloud <$> rssCloud , ChannelTtl <$> tagIgnoreAttrs "ttl" (content asInt) , ChannelImage <$> rssImage , ChannelRating <$> tagIgnoreAttrs "rating" textContent , ChannelTextInput <$> rssTextInput , ChannelSkipHours <$> rssSkipHours , ChannelSkipDays <$> rssSkipDays , ChannelUnknown <$ unknownTag ] attributes = attr "version" asVersion <* ignoreAttrs