{-# LANGUAGE DataKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | Streaming parsers for the RSS 1.0 standard. module Text.RSS1.Conduit.Parse ( -- * Top-level rss1Document -- * Elements , rss1ChannelItems , rss1Image , rss1Item , rss1TextInput ) where -- {{{ Imports import Text.RSS.Extensions import Text.RSS.Types import Conduit hiding (throwM) import Control.Exception.Safe as Exception import Control.Monad import Control.Monad.Fix import Data.Conduit import Data.List.NonEmpty import Data.Singletons.Prelude import Data.Text as Text import Data.Text.Encoding import Data.Time.Clock import Data.Time.LocalTime import Data.Time.RFC3339 import Data.Version import Data.Vinyl.Core import Data.XML.Types import Lens.Simple import Text.XML.Stream.Parse import URI.ByteString -- }}} -- {{{ Util asDate :: (MonadThrow m) => Text -> m UTCTime asDate text = maybe (throw $ InvalidTime text) (return . zonedTimeToUTC) $ parseTimeRFC3339 text asRssURI :: (MonadThrow m) => Text -> m RssURI asRssURI t = case (parseURI' t, parseRelativeRef' t) of (Right u, _) -> return $ RssURI u (_, Right u) -> return $ RssURI u (_, Left e) -> throwM $ InvalidURI e where parseURI' = parseURI laxURIParserOptions . encodeUtf8 parseRelativeRef' = parseRelativeRef laxURIParserOptions . encodeUtf8 nullURI :: RssURI nullURI = RssURI $ RelativeRef Nothing "" (Query []) Nothing headRequiredC :: MonadThrow m => Text -> Consumer a m a headRequiredC e = maybe (throw $ MissingElement e) return =<< headC projectC :: Monad m => Fold a a' b b' -> Conduit a m b projectC prism = fix $ \recurse -> do item <- await case (item, item ^? (_Just . prism)) of (_, Just a) -> yield a >> recurse (Just _, _) -> recurse _ -> return () contentTag :: MonadThrow m => Text -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b) contentTag string = tag' (matching (== contentName string)) dcTag :: MonadThrow m => Text -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b) dcTag string = tag' (matching (== dcName string)) rdfTag :: MonadThrow m => Text -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b) rdfTag string = tag' (matching (== rdfName string)) rss1Tag :: MonadThrow m => Text -> AttrParser a -> (a -> ConduitM Event o m b) -> ConduitM Event o m (Maybe b) rss1Tag string = tag' (matching (== rss1Name string)) contentName :: Text -> Name contentName string = Name string (Just "http://purl.org/rss/1.0/modules/content/") (Just "content") dcName :: Text -> Name dcName string = Name string (Just "http://purl.org/dc/elements/1.1/") (Just "dc") rdfName :: Text -> Name rdfName string = Name string (Just "http://www.w3.org/1999/02/22-rdf-syntax-ns#") (Just "rdf") rss1Name :: Text -> Name rss1Name string = Name string (Just "http://purl.org/rss/1.0/") Nothing -- }}} data TextInputPiece = TextInputTitle Text | TextInputDescription Text | TextInputName Text | TextInputLink RssURI makeTraversals ''TextInputPiece -- | Parse a @\@ element. rss1TextInput :: MonadThrow m => ConduitM Event o m (Maybe RssTextInput) rss1TextInput = rss1Tag "textinput" attributes $ \uri -> (manyYield' (choose piece) =$= parser uri) <* many ignoreAnyTreeContent where parser uri = getZipConduit $ RssTextInput <$> ZipConduit (projectC _TextInputTitle =$= headRequiredC "Missing element") <*> ZipConduit (projectC _TextInputDescription =$= headRequiredC "Missing <description> element") <*> ZipConduit (projectC _TextInputName =$= headRequiredC "Missing <name> element") <*> ZipConduit (projectC _TextInputLink =$= headDefC uri) -- Lenient piece = [ fmap TextInputTitle <$> rss1Tag "title" ignoreAttrs (const content) , fmap TextInputDescription <$> rss1Tag "description" ignoreAttrs (const content) , fmap TextInputName <$> rss1Tag "name" ignoreAttrs (const content) , fmap TextInputLink <$> rss1Tag "link" ignoreAttrs (const $ content >>= asRssURI) ] attributes = (requireAttr (rdfName "about") >>= asRssURI) <* ignoreAttrs data ItemPiece = ItemTitle Text | ItemLink RssURI | ItemDescription Text | ItemCreator Text | ItemDate UTCTime | ItemContent Text | ItemOther (NonEmpty Event) makeTraversals ''ItemPiece -- | Parse an @\<item\>@ element. -- -- RSS extensions are automatically parsed based on the inferred result type. rss1Item :: ParseRssExtensions e => MonadCatch m => ConduitM Event o m (Maybe (RssItem e)) rss1Item = rss1Tag "item" attributes $ \uri -> (manyYield' (choose piece) =$= parser uri) <* many ignoreAnyTreeContent where parser uri = getZipConduit $ RssItem <$> ZipConduit (projectC _ItemTitle =$= headDefC mempty) <*> (Just <$> ZipConduit (projectC _ItemLink =$= headDefC uri)) <*> ZipConduit (projectC _ItemDescription =$= headDefC mempty) <*> ZipConduit (projectC _ItemCreator =$= headDefC mempty) <*> pure mempty <*> pure mzero <*> pure mempty <*> pure mzero <*> ZipConduit (projectC _ItemDate =$= headC) <*> pure mzero <*> ZipConduit (projectC _ItemOther =$= concatC =$= parseRssItemExtensions) piece = [ fmap ItemTitle <$> rss1Tag "title" ignoreAttrs (const content) , fmap ItemLink <$> rss1Tag "link" ignoreAttrs (const $ content >>= asRssURI) , fmap ItemDescription <$> (rss1Tag "description" ignoreAttrs (const content) `orE` contentTag "encoded" ignoreAttrs (const content)) , fmap ItemCreator <$> dcTag "creator" ignoreAttrs (const content) , fmap ItemDate <$> dcTag "date" ignoreAttrs (const $ content >>= asDate) , fmap ItemOther . nonEmpty <$> (void takeAnyTreeContent =$= sinkList) ] attributes = (requireAttr (rdfName "about") >>= asRssURI) <* ignoreAttrs data ImagePiece = ImageUri RssURI | ImageTitle Text | ImageLink RssURI makeTraversals ''ImagePiece -- | Parse an @\<image\>@ element. rss1Image :: (MonadThrow m) => ConduitM Event o m (Maybe RssImage) rss1Image = rss1Tag "image" attributes $ \uri -> (manyYield' (choose piece) =$= parser uri) <* many ignoreAnyTreeContent where parser uri = getZipConduit $ RssImage <$> ZipConduit (projectC _ImageUri =$= headDefC uri) -- Lenient <*> ZipConduit (projectC _ImageTitle =$= headDefC "Unnamed image") -- Lenient <*> ZipConduit (projectC _ImageLink =$= headDefC nullURI) -- Lenient <*> pure mzero <*> pure mzero <*> pure mempty piece = [ fmap ImageUri <$> rss1Tag "url" ignoreAttrs (const $ content >>= asRssURI) , fmap ImageTitle <$> rss1Tag "title" ignoreAttrs (const content) , fmap ImageLink <$> rss1Tag "link" ignoreAttrs (const $ content >>= asRssURI) ] attributes = (requireAttr (rdfName "about") >>= asRssURI) <* ignoreAttrs -- | Parse an @\<items\>@ element. rss1ChannelItems :: MonadThrow m => ConduitM Event o m (Maybe [Text]) rss1ChannelItems = fmap join $ rss1Tag "items" ignoreAttrs $ const $ rdfTag "Seq" ignoreAttrs $ const $ many $ rdfTag "li" attributes return where attributes = requireAttr (rdfName "resource") <* ignoreAttrs data Rss1Channel (extensions :: [*]) = Rss1Channel { channelId' :: RssURI , channelTitle' :: Text , channelLink' :: RssURI , channelDescription' :: Text , channelItems' :: [Text] , channelImage' :: Maybe RssImage , channelTextInput' :: Maybe RssURI , channelExtensions' :: RssChannelExtensions extensions } data ChannelPiece = ChannelTitle Text | ChannelLink RssURI | ChannelDescription Text | ChannelImage RssImage | ChannelItems [Text] | ChannelTextInput RssURI | ChannelOther (NonEmpty Event) makeTraversals ''ChannelPiece -- | Parse a @\<channel\>@ element. -- -- RSS extensions are automatically parsed based on the inferred result type. rss1Channel :: ParseRssExtensions e => MonadThrow m => ConduitM Event o m (Maybe (Rss1Channel e)) rss1Channel = rss1Tag "channel" attributes $ \channelId -> (manyYield' (choose piece) =$= parser channelId) <* many ignoreAnyTreeContent where parser channelId = getZipConduit $ Rss1Channel channelId <$> ZipConduit (projectC _ChannelTitle =$= headRequiredC "Missing <title> element") <*> ZipConduit (projectC _ChannelLink =$= headRequiredC "Missing <link> element") <*> ZipConduit (projectC _ChannelDescription =$= headDefC "") -- Lenient <*> ZipConduit (projectC _ChannelItems =$= concatC =$= sinkList) <*> ZipConduit (projectC _ChannelImage =$= headC) <*> ZipConduit (projectC _ChannelTextInput =$= headC) <*> ZipConduit (projectC _ChannelOther =$= concatC =$= parseRssChannelExtensions) piece = [ fmap ChannelTitle <$> rss1Tag "title" ignoreAttrs (const content) , fmap ChannelLink <$> rss1Tag "link" ignoreAttrs (const $ content >>= asRssURI) , fmap ChannelDescription <$> rss1Tag "description" ignoreAttrs (const content) , fmap ChannelItems <$> rss1ChannelItems , fmap ChannelImage <$> rss1Image , fmap ChannelTextInput <$> rss1Tag "textinput" (requireAttr (rdfName "resource") >>= asRssURI) return , fmap ChannelOther . nonEmpty <$> (void takeAnyTreeContent =$= sinkList) ] attributes = (requireAttr (rdfName "about") >>= asRssURI) <* ignoreAttrs data Rss1Document (e :: [*]) = Rss1Document (Rss1Channel e) (Maybe RssImage) [RssItem e] (Maybe RssTextInput) rss1ToRss2 :: Rss1Document e -> RssDocument e rss1ToRss2 (Rss1Document channel image items textInput) = RssDocument (Version [1] []) (channelTitle' channel) (channelLink' channel) (channelDescription' channel) items mempty mempty mempty mempty mzero mzero mzero mempty mzero mzero mzero image mempty textInput mempty mempty (channelExtensions' channel) data DocumentPiece (e :: [*]) = DocumentChannel (Rss1Channel e) | DocumentImage RssImage | DocumentItem (RssItem e) | DocumentTextInput RssTextInput makeTraversals ''DocumentPiece -- | Parse an @\<RDF\>@ element. -- -- RSS extensions are automatically parsed based on the inferred result type. rss1Document :: ParseRssExtensions e => MonadCatch m => ConduitM Event o m (Maybe (RssDocument e)) rss1Document = fmap (fmap rss1ToRss2) $ rdfTag "RDF" ignoreAttrs $ const $ (manyYield' (choose piece) =$= parser) <* many ignoreAnyTreeContent where parser = getZipConduit $ Rss1Document <$> ZipConduit (projectC _DocumentChannel =$= headRequiredC "Missing <channel> element") <*> ZipConduit (projectC _DocumentImage =$= headC) <*> ZipConduit (projectC _DocumentItem =$= sinkList) <*> ZipConduit (projectC _DocumentTextInput =$= headC) piece = [ fmap DocumentChannel <$> rss1Channel , fmap DocumentImage <$> rss1Image , fmap DocumentItem <$> rss1Item , fmap DocumentTextInput <$> rss1TextInput ]