{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedLists #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | Streaming parsers for the Atom 1.0 standard. module Text.Atom.Conduit.Parse ( -- * Top-level atomFeed -- * Elements , atomEntry , atomContent , atomCategory , atomLink , atomGenerator , atomSource -- * Constructs , atomPerson , atomText ) where -- {{{ Imports import Text.Atom.Types import Control.Applicative import Control.Foldl hiding (mconcat, set) import Control.Monad hiding (foldM) import Control.Monad.Catch import Data.Conduit.Parser import Data.Conduit.Parser.XML import Data.Maybe import Data.Monoid import Data.MonoTraversable import Data.NonNull (NonNull, fromNullable, toNullable) import Data.Text as Text (Text) import Data.Text.Encoding import Data.Time.Clock import Data.Time.LocalTime import Data.Time.RFC3339 import Data.XML.Types import Lens.Simple import Prelude hiding (last, lookup) import Text.Parser.Combinators import URI.ByteString -- }}} -- {{{ Util data AtomException = InvalidURI URIParseError | NullElement deriving instance Eq AtomException deriving instance Show AtomException instance Exception AtomException where displayException (InvalidURI e) = "Invalid URI reference: " ++ show e displayException NullElement = "Null element" asURIReference :: (MonadThrow m) => Text -> m AtomURI asURIReference t = case (parseURI' t, parseRelativeRef' t) of (Right u, _) -> return $ AtomURI u (_, Right u) -> return $ AtomURI u (Left _, Left e) -> throwM $ InvalidURI e where parseURI' = parseURI laxURIParserOptions . encodeUtf8 parseRelativeRef' = parseRelativeRef laxURIParserOptions . encodeUtf8 asNonNull :: (MonoFoldable a, MonadThrow m) => a -> m (NonNull a) asNonNull = maybe (throwM NullElement) return . fromNullable -- | Like 'tagName' but ignores the namespace. tagName' :: (MonadCatch m) => Text -> AttrParser a -> (a -> ConduitParser Event m b) -> ConduitParser Event m b tagName' t = tagPredicate (\n -> nameLocalName n == t) -- | Tag which content is a date-time that follows RFC 3339 format. tagDate :: (MonadCatch m) => Text -> ConduitParser Event m UTCTime tagDate name = tagIgnoreAttrs' name $ content (fmap zonedTimeToUTC . parseTimeRFC3339) -- | Like 'tagName'' but ignores all attributes. tagIgnoreAttrs' :: (MonadCatch m) => Text -> ConduitParser Event m a -> ConduitParser Event m a tagIgnoreAttrs' name handler = tagName' name ignoreAttrs $ const handler unknownTag :: (MonadCatch m) => ConduitParser Event m () unknownTag = anyTag $ \_ _ -> void $ many (void unknownTag <|> void textContent) atomId :: (MonadCatch m) => ConduitParser Event m (NonNull Text) atomId = tagIgnoreAttrs' "id" $ content asNonNull atomIcon, atomLogo :: (MonadCatch m) => ConduitParser Event m AtomURI atomIcon = tagIgnoreAttrs' "icon" $ content asURIReference atomLogo = tagIgnoreAttrs' "logo" $ content asURIReference lastRequired :: (Monad m, Parsing m) => String -> FoldM m a a lastRequired e = FoldM (\_ a -> return $ Right a) (return $ Left e) (either unexpected return) -- }}} data PersonPiece = PersonName (NonNull Text) | PersonEmail Text | PersonUri AtomURI | PersonUnknown makeTraversals ''PersonPiece -- | Parse an Atom person construct. -- Example: -- -- > -- > John Doe -- > JohnDoe@example.com -- > http://example.com/~johndoe -- > atomPerson :: (MonadCatch m) => Text -> ConduitParser Event m AtomPerson atomPerson name = named ("Atom person construct <" <> name <> ">") $ tagIgnoreAttrs' name $ do p <- many piece flip foldM p $ AtomPerson <$> handlesM _PersonName (lastRequired "Missing or invalid element.") <*> generalize (handles _PersonEmail $ lastDef "") <*> generalize (handles _PersonUri last) where piece :: (MonadCatch m) => ConduitParser Event m PersonPiece piece = choice [ PersonName <$> tagIgnoreAttrs' "name" (content asNonNull) , PersonEmail <$> tagIgnoreAttrs' "email" textContent , PersonUri <$> tagIgnoreAttrs' "uri" (content asURIReference) , PersonUnknown <$ unknownTag ] -- | Parse an @atom:category@ element. -- Example: -- -- > atomCategory :: (MonadCatch m) => ConduitParser Event m AtomCategory atomCategory = tagName' "category" categoryAttrs $ \(t, s, l) -> do term <- asNonNull t return $ AtomCategory term s l where categoryAttrs = (,,) <$> textAttr "term" <*> (textAttr "scheme" <|> pure mempty) <*> (textAttr "label" <|> pure mempty) <* ignoreAttrs -- | Parse an @atom:content@ element. atomContent :: (MonadCatch m) => ConduitParser Event m AtomContent atomContent = tagName' "content" contentAttrs handler where contentAttrs = (,) <$> optional (textAttr "type") <*> optional (attr "src" asURIReference) <* ignoreAttrs handler (Just "xhtml", _) = AtomContentInlineXHTML <$> tagIgnoreAttrs' "div" textContent handler (ctype, Just uri) = return $ AtomContentOutOfLine (fromMaybe mempty ctype) uri handler (Just "html", _) = AtomContentInlineText TypeHTML <$> textContent handler (Nothing, _) = AtomContentInlineText TypeText <$> textContent handler (Just ctype, _) = AtomContentInlineOther ctype <$> textContent -- | Parse an @atom:link@ element. -- Examples: -- -- > -- -- > atomLink :: (MonadCatch m) => ConduitParser Event m AtomLink atomLink = tagName' "link" linkAttrs $ \(href, rel, ltype, lang, title, length') -> return $ AtomLink href rel ltype lang title length' where linkAttrs = (,,,,,) <$> attr "href" asURIReference <*> (textAttr "rel" <|> pure mempty) <*> (textAttr "type" <|> pure mempty) <*> (textAttr "hreflang" <|> pure mempty) <*> (textAttr "title" <|> pure mempty) <*> (textAttr "length" <|> pure mempty) <* ignoreAttrs -- | Parse an Atom text construct. -- Examples: -- -- > AT&T bought by SBC! -- -- > -- > AT&amp;T bought <b>by SBC</b>! -- > -- -- > -- > <div xmlns="http://www.w3.org/1999/xhtml"> -- > AT&T bought <b>by SBC</b>! -- > </div> -- > atomText :: (MonadCatch m) => Text -> ConduitParser Event m AtomText atomText name = named ("Atom text construct <" <> name <> ">") $ tagName' name (optional (textAttr "type") <* ignoreAttrs) handler where handler (Just "xhtml") = AtomXHTMLText <$> tagIgnoreAttrs' "div" xhtmlContent handler (Just "html") = AtomPlainText TypeHTML <$> textContent handler _ = AtomPlainText TypeText <$> textContent xhtmlContent :: MonadCatch m => ConduitParser Event m Text xhtmlContent = mconcat <$> many (textContent <|> anyTag (\name attrs -> renderTag name attrs <$> xhtmlContent)) renderTag name attrs content = "<" <> nameLocalName name <> renderAttrs attrs <> ">" <> content <> " nameLocalName name <> ">" renderAttrs [] = "" renderAttrs ((name, content):t) = " " <> nameLocalName name <> "=\"" <> mconcat (renderContent <$> content) <> "\"" <> renderAttrs t renderContent (ContentText t) = t renderContent (ContentEntity t) = t -- | Parse an @atom:generator@ element. -- Example: -- -- > -- > Example Toolkit -- > atomGenerator :: (MonadCatch m) => ConduitParser Event m AtomGenerator atomGenerator = tagName' "generator" generatorAttrs $ \(uri, version) -> AtomGenerator uri version <$> (asNonNull =<< textContent) where generatorAttrs = (,) <$> optional (attr "uri" asURIReference) <*> (textAttr "version" <|> pure mempty) <* ignoreAttrs data SourcePiece = SourceAuthor AtomPerson | SourceCategory AtomCategory | SourceContributor AtomPerson | SourceGenerator AtomGenerator | SourceIcon AtomURI | SourceId Text | SourceLink AtomLink | SourceLogo AtomURI | SourceRights AtomText | SourceSubtitle AtomText | SourceTitle AtomText | SourceUpdated UTCTime | SourceUnknown makeTraversals ''SourcePiece -- | Parse an @atom:source@ element. -- Example: -- -- > -- > http://example.org/ -- > Fourty-Two -- > 2003-12-13T18:30:02Z -- > © 2005 Example, Inc. -- > atomSource :: (MonadCatch m) => ConduitParser Event m AtomSource atomSource = named "Atom element" $ tagIgnoreAttrs' "source" $ do p <- many piece flip foldM p $ AtomSource <$> generalize (handles _SourceAuthor list) <*> generalize (handles _SourceCategory list) <*> generalize (handles _SourceContributor list) <*> generalize (handles _SourceGenerator last) <*> generalize (handles _SourceIcon last) <*> generalize (handles _SourceId $ lastDef "") <*> generalize (handles _SourceLink list) <*> generalize (handles _SourceLogo last) <*> generalize (handles _SourceRights last) <*> generalize (handles _SourceSubtitle last) <*> generalize (handles _SourceTitle last) <*> generalize (handles _SourceUpdated last) where piece :: (MonadCatch m) => ConduitParser Event m SourcePiece piece = choice [ SourceAuthor <$> atomPerson "author" , SourceCategory <$> atomCategory , SourceContributor <$> atomPerson "contributor" , SourceGenerator <$> atomGenerator , SourceIcon <$> atomIcon , SourceId . toNullable <$> atomId , SourceLink <$> atomLink , SourceLogo <$> atomLogo , SourceRights <$> atomText "rights" , SourceSubtitle <$> atomText "subtitle" , SourceTitle <$> atomText "title" , SourceUpdated <$> tagDate "updated" , SourceUnknown <$ unknownTag ] data EntryPiece = EntryAuthor AtomPerson | EntryCategory AtomCategory | EntryContent AtomContent | EntryContributor AtomPerson | EntryId (NonNull Text) | EntryLink AtomLink | EntryPublished UTCTime | EntryRights AtomText | EntrySource AtomSource | EntrySummary AtomText | EntryTitle AtomText | EntryUpdated UTCTime | EntryUnknown makeTraversals ''EntryPiece -- | Parse an @atom:entry@ element. atomEntry :: (MonadCatch m) => ConduitParser Event m AtomEntry atomEntry = named "Atom element" $ tagIgnoreAttrs' "entry" $ do p <- many piece flip foldM p $ AtomEntry <$> generalize (handles _EntryAuthor list) <*> generalize (handles _EntryCategory list) <*> generalize (handles _EntryContent last) <*> generalize (handles _EntryContributor list) <*> handlesM _EntryId (lastRequired "Missing or invalid element.") <*> generalize (handles _EntryLink list) <*> generalize (handles _EntryPublished last) <*> generalize (handles _EntryRights last) <*> generalize (handles _EntrySource last) <*> generalize (handles _EntrySummary last) <*> handlesM _EntryTitle (lastRequired "Missing or invalid element.") <*> handlesM _EntryUpdated (lastRequired "Missing or invalid <updated> element.") where piece :: (MonadCatch m) => ConduitParser Event m EntryPiece piece = choice [ EntryAuthor <$> atomPerson "author" , EntryCategory <$> atomCategory , EntryContent <$> atomContent , EntryContributor <$> atomPerson "contributor" , EntryId <$> atomId , EntryLink <$> atomLink , EntryPublished <$> tagDate "published" , EntryRights <$> atomText "rights" , EntrySource <$> atomSource , EntrySummary <$> atomText "summary" , EntryTitle <$> atomText "title" , EntryUpdated <$> tagDate "updated" , EntryUnknown <$ unknownTag ] data FeedPiece = FeedAuthor AtomPerson | FeedCategory AtomCategory | FeedContributor AtomPerson | FeedEntry AtomEntry | FeedGenerator AtomGenerator | FeedIcon AtomURI | FeedId (NonNull Text) | FeedLink AtomLink | FeedLogo AtomURI | FeedRights AtomText | FeedSubtitle AtomText | FeedTitle AtomText | FeedUpdated UTCTime | FeedUnknown makeTraversals ''FeedPiece -- | Parse an @atom:feed@ element. atomFeed :: (MonadCatch m) => ConduitParser Event m AtomFeed atomFeed = named "Atom <feed> element" $ tagIgnoreAttrs' "feed" $ do p <- many piece flip foldM p $ AtomFeed <$> generalize (handles _FeedAuthor list) <*> generalize (handles _FeedCategory list) <*> generalize (handles _FeedContributor list) <*> generalize (handles _FeedEntry list) <*> generalize (handles _FeedGenerator last) <*> generalize (handles _FeedIcon last) <*> handlesM _FeedId (lastRequired "Missing or empty <id> element.") <*> generalize (handles _FeedLink list) <*> generalize (handles _FeedLogo last) <*> generalize (handles _FeedRights last) <*> generalize (handles _FeedSubtitle last) <*> handlesM _FeedTitle (lastRequired "Missing <title> element.") <*> handlesM _FeedUpdated (lastRequired "Missing <updated> element.") where piece :: MonadCatch m => ConduitParser Event m FeedPiece piece = choice [ FeedAuthor <$> atomPerson "author" , FeedCategory <$> atomCategory , FeedContributor <$> atomPerson "contributor" , FeedEntry <$> atomEntry , FeedGenerator <$> atomGenerator , FeedIcon <$> atomIcon , FeedId <$> atomId , FeedLink <$> atomLink , FeedLogo <$> atomLogo , FeedRights <$> atomText "rights" , FeedSubtitle <$> atomText "subtitle" , FeedTitle <$> atomText "title" , FeedUpdated <$> tagDate "updated" , FeedUnknown <$ unknownTag ]