module Text.RSS.Conduit.Parse
(
rssDocument
, rssCategory
, rssCloud
, rssEnclosure
, rssGuid
, rssImage
, rssItem
, rssSkipDays
, rssSkipHours
, rssSource
, rssTextInput
) where
import Text.RSS.Types
import Control.Applicative
import Control.Foldl hiding (mconcat)
import Control.Monad hiding (foldM)
import Control.Monad.Catch
import Data.Conduit.Parser
import Data.Conduit.Parser.XML
import Data.Maybe
import Data.Monoid
import Data.MonoTraversable
import Data.Set hiding (fold)
import Data.Text as Text hiding (cons, last, map,
snoc)
import Data.Text.Encoding
import Data.Time.Clock
import Data.Time.LocalTime
import Data.Time.RFC822
import Data.Version
import Data.XML.Types
import Lens.Simple
import Prelude hiding (last, lookup)
import Text.Parser.Combinators
import Text.ParserCombinators.ReadP (readP_to_S)
import Text.Read (readMaybe)
import URI.ByteString
asRssURI :: (MonadThrow m) => Text -> m RssURI
asRssURI t = case (parseURI' t, parseRelativeRef' t) of
(Right u, _) -> return $ RssURI u
(_, Right u) -> return $ RssURI u
(_, Left e) -> throwM $ InvalidURI e
where parseURI' = parseURI laxURIParserOptions . encodeUtf8
parseRelativeRef' = parseRelativeRef laxURIParserOptions . encodeUtf8
asInt :: MonadThrow m => Text -> m Int
asInt t = maybe (throwM $ InvalidInt t) return . readMaybe $ unpack t
asBool :: MonadThrow m => Text -> m Bool
asBool "true" = return True
asBool "false" = return False
asBool t = throwM $ InvalidBool t
asVersion :: MonadThrow m => Text -> m Version
asVersion t = maybe (throwM $ InvalidVersion t) return . fmap fst . headMay . readP_to_S parseVersion $ unpack t
asCloudProtocol :: MonadThrow m => Text -> m CloudProtocol
asCloudProtocol "xml-rpc" = return ProtocolXmlRpc
asCloudProtocol "soap" = return ProtocolSoap
asCloudProtocol "http-post" = return ProtocolHttpPost
asCloudProtocol t = throwM $ InvalidProtocol t
unknownTag :: (MonadCatch m) => ConduitParser Event m (Endo a)
unknownTag = anyTag $ \_ _ -> many (void unknownTag <|> void textContent) >> return mempty
tagName' :: (MonadCatch m) => Text -> AttrParser a -> (a -> ConduitParser Event m b) -> ConduitParser Event m b
tagName' t = tagPredicate (\n -> nameLocalName n == t)
tagDate :: (MonadCatch m) => Name -> ConduitParser Event m UTCTime
tagDate name = tagIgnoreAttrs name $ content (fmap zonedTimeToUTC . parseTimeRFC822)
lastRequired :: (Monad m, Parsing m) => String -> FoldM m a a
lastRequired e = FoldM (\_ a -> return $ Right a) (return $ Left e) (either unexpected return)
rssSkipHours :: MonadCatch m => ConduitParser Event m (Set Hour)
rssSkipHours = named "Rss <skipHours> element" $ tagIgnoreAttrs "skipHours" $
fromList <$> many (tagIgnoreAttrs "hour" $ content (asInt >=> asHour))
rssSkipDays :: MonadCatch m => ConduitParser Event m (Set Day)
rssSkipDays = named "Rss <skipDays> element" $ tagIgnoreAttrs "skipDays" $
fromList <$> many (tagIgnoreAttrs "day" $ content asDay)
data TextInputPiece = TextInputTitle Text | TextInputDescription Text
| TextInputName Text | TextInputLink RssURI
| TextInputUnknown
makeTraversals ''TextInputPiece
rssTextInput :: (MonadCatch m) => ConduitParser Event m RssTextInput
rssTextInput = named "Rss <textInput> element" $ tagIgnoreAttrs "textInput" $ do
p <- many piece
flip foldM p $ RssTextInput
<$> handlesM _TextInputTitle (lastRequired "Missing <title> element.")
<*> handlesM _TextInputDescription (lastRequired "Missing <description> element.")
<*> handlesM _TextInputName (lastRequired "Missing <name> element.")
<*> handlesM _TextInputLink (lastRequired "Missing <link> element.")
where piece :: MonadCatch m => ConduitParser Event m TextInputPiece
piece = choice [ TextInputTitle <$> tagIgnoreAttrs "title" textContent
, TextInputDescription <$> tagIgnoreAttrs "description" textContent
, TextInputName <$> tagIgnoreAttrs "name" textContent
, TextInputLink <$> tagIgnoreAttrs "link" (content asRssURI)
, TextInputUnknown <$ unknownTag
]
data ImagePiece = ImageUri RssURI | ImageTitle Text | ImageLink RssURI | ImageWidth Int
| ImageHeight Int | ImageDescription Text | ImageUnknown
makeTraversals ''ImagePiece
rssImage :: (MonadCatch m) => ConduitParser Event m RssImage
rssImage = named "Rss <image> element" $ tagIgnoreAttrs "image" $ do
p <- many piece
flip foldM p $ RssImage
<$> handlesM _ImageUri (lastRequired "Missing <uri> element.")
<*> handlesM _ImageTitle (lastRequired "Missing <title> element.")
<*> handlesM _ImageLink (lastRequired "Missing <link> element.")
<*> generalize (handles _ImageWidth last)
<*> generalize (handles _ImageHeight last)
<*> generalize (handles _ImageDescription $ lastDef "")
where piece = choice [ ImageUri <$> tagIgnoreAttrs "uri" (content asRssURI)
, ImageTitle <$> tagIgnoreAttrs "title" textContent
, ImageLink <$> tagIgnoreAttrs "link" (content asRssURI)
, ImageWidth <$> tagIgnoreAttrs "width" (content asInt)
, ImageHeight <$> tagIgnoreAttrs "height" (content asInt)
, ImageDescription <$> tagIgnoreAttrs "description" textContent
, ImageUnknown <$ unknownTag
]
rssCategory :: MonadCatch m => ConduitParser Event m RssCategory
rssCategory = named "Rss <category> element" $ tagName' "category" (textAttr "domain") $ \domain ->
RssCategory domain <$> textContent
rssCloud :: (MonadCatch m) => ConduitParser Event m RssCloud
rssCloud = named "Rss <cloud> element" $ tagName' "cloud" attributes return where
attributes = do
uri <- fmap RssURI $ RelativeRef
<$> fmap Just (Authority Nothing <$> (Host . encodeUtf8 <$> textAttr "domain") <*> (fmap Port <$> optional (attr "port" asInt)))
<*> (encodeUtf8 <$> textAttr "path")
<*> pure (Query [])
<*> pure Nothing
RssCloud uri <$> textAttr "registerProcedure" <*> attr "protocol" asCloudProtocol <* ignoreAttrs
rssGuid :: MonadCatch m => ConduitParser Event m RssGuid
rssGuid = named "RSS <guid> element" $ tagName' "guid" attributes handler where
attributes = optional (attr "isPermaLink" asBool) <* ignoreAttrs
handler (Just True) = GuidUri <$> content asRssURI
handler _ = GuidText <$> textContent
rssEnclosure :: MonadCatch m => ConduitParser Event m RssEnclosure
rssEnclosure = named "Rss <enclosure> element" $ tagName' "enclosure" attributes handler where
attributes = (,,) <$> attr "url" asRssURI <*> attr "length" asInt <*> textAttr "type" <* ignoreAttrs
handler (uri, length_, type_) = return $ RssEnclosure uri length_ type_
rssSource :: MonadCatch m => ConduitParser Event m RssSource
rssSource = named "Rss <source> element" $ tagName' "source" attributes handler where
attributes = attr "url" asRssURI <* ignoreAttrs
handler uri = RssSource uri <$> textContent
data ItemPiece = ItemTitle Text | ItemLink RssURI | ItemDescription Text
| ItemAuthor Text | ItemCategory RssCategory | ItemComments RssURI
| ItemEnclosure RssEnclosure | ItemGuid RssGuid | ItemPubDate UTCTime
| ItemSource RssSource | ItemUnknown
makeTraversals ''ItemPiece
rssItem :: MonadCatch m => ConduitParser Event m RssItem
rssItem = named "Rss <item> element" $ tagIgnoreAttrs "item" $ do
p <- many piece
return . flip fold p $ RssItem
<$> handles _ItemTitle (lastDef "")
<*> handles _ItemLink last
<*> handles _ItemDescription (lastDef "")
<*> handles _ItemAuthor (lastDef "")
<*> handles _ItemCategory list
<*> handles _ItemComments last
<*> handles _ItemEnclosure list
<*> handles _ItemGuid last
<*> handles _ItemPubDate last
<*> handles _ItemSource last
where piece = choice [ ItemTitle <$> tagIgnoreAttrs "title" textContent
, ItemLink <$> tagIgnoreAttrs "link" (content asRssURI)
, ItemDescription <$> tagIgnoreAttrs "description" textContent
, ItemAuthor <$> tagIgnoreAttrs "author" textContent
, ItemCategory <$> rssCategory
, ItemComments <$> tagIgnoreAttrs "comments" (content asRssURI)
, ItemEnclosure <$> rssEnclosure
, ItemGuid <$> rssGuid
, ItemPubDate <$> tagDate "pubDate"
, ItemSource <$> rssSource
, ItemUnknown <$ unknownTag
]
data ChannelPiece = ChannelTitle Text | ChannelLink RssURI | ChannelDescription Text
| ChannelItem RssItem | ChannelLanguage Text | ChannelCopyright Text
| ChannelManagingEditor Text | ChannelWebmaster Text | ChannelPubDate UTCTime
| ChannelLastBuildDate UTCTime | ChannelCategory RssCategory
| ChannelGenerator Text | ChannelDocs RssURI | ChannelCloud RssCloud
| ChannelTtl Int | ChannelImage RssImage | ChannelRating Text
| ChannelTextInput RssTextInput | ChannelSkipHours (Set Hour)
| ChannelSkipDays (Set Day) | ChannelUnknown
makeTraversals ''ChannelPiece
rssDocument :: MonadCatch m => ConduitParser Event m RssDocument
rssDocument = named "RSS <rss> element" $ tagName' "rss" attributes $ \version -> tagIgnoreAttrs "channel" $ do
p <- many piece
flip foldM p $ RssDocument version
<$> handlesM _ChannelTitle (lastRequired "Missing <title> element.")
<*> handlesM _ChannelLink (lastRequired "Missing <link> element.")
<*> handlesM _ChannelDescription (lastRequired "Missing <description> element.")
<*> generalize (handles _ChannelItem list)
<*> generalize (handles _ChannelLanguage $ lastDef "")
<*> generalize (handles _ChannelCopyright $ lastDef "")
<*> generalize (handles _ChannelManagingEditor $ lastDef "")
<*> generalize (handles _ChannelWebmaster $ lastDef "")
<*> generalize (handles _ChannelPubDate last)
<*> generalize (handles _ChannelLastBuildDate last)
<*> generalize (handles _ChannelCategory list)
<*> generalize (handles _ChannelGenerator $ lastDef "")
<*> generalize (handles _ChannelDocs last)
<*> generalize (handles _ChannelCloud last)
<*> generalize (handles _ChannelTtl last)
<*> generalize (handles _ChannelImage last)
<*> generalize (handles _ChannelRating $ lastDef "")
<*> generalize (handles _ChannelTextInput last)
<*> generalize (handles _ChannelSkipHours $ lastDef mempty)
<*> generalize (handles _ChannelSkipDays $ lastDef mempty)
where piece :: MonadCatch m => ConduitParser Event m ChannelPiece
piece = choice [ ChannelTitle <$> tagIgnoreAttrs "title" textContent
, ChannelLink <$> tagIgnoreAttrs "link" (content asRssURI)
, ChannelDescription <$> tagIgnoreAttrs "description" textContent
, ChannelItem <$> rssItem
, ChannelLanguage <$> tagIgnoreAttrs "language" textContent
, ChannelCopyright <$> tagIgnoreAttrs "copyright" textContent
, ChannelManagingEditor <$> tagIgnoreAttrs "managingEditor" textContent
, ChannelWebmaster <$> tagIgnoreAttrs "webMaster" textContent
, ChannelPubDate <$> tagDate "pubDate"
, ChannelLastBuildDate <$> tagDate "lastBuildDate"
, ChannelCategory <$> rssCategory
, ChannelGenerator <$> tagIgnoreAttrs "generator" textContent
, ChannelDocs <$> tagIgnoreAttrs "docs" (content asRssURI)
, ChannelCloud <$> rssCloud
, ChannelTtl <$> tagIgnoreAttrs "ttl" (content asInt)
, ChannelImage <$> rssImage
, ChannelRating <$> tagIgnoreAttrs "rating" textContent
, ChannelTextInput <$> rssTextInput
, ChannelSkipHours <$> rssSkipHours
, ChannelSkipDays <$> rssSkipDays
, ChannelUnknown <$ unknownTag
]
attributes = attr "version" asVersion <* ignoreAttrs