-- |A simple feed generator -- -- Copyright (c) 2006 Manuel M T Chakravarty -- -- License: -- --- Description --------------------------------------------------------------- -- -- Language: Haskell 98 -- -- Feed source -- ~~~~~~~~~~~ -- The feed source is a directory containing channel descriptors, which are -- files with suffix `.lfc'. Each channel descriptor specifies a channel -- directory, which contains the feed for the channel. These items are -- individual files in those sub-directories with the suffix `.lfi'. -- -- Timestamps -- ~~~~~~~~~~ -- We require all channel items to include a timestamp (aka publication -- date). If an item doesn't contain an explicit `pubDate' field, the last -- modification date of the file containing the item is added as a -- `pubDate'. This holds only for channel items, not for channel -- descriptions. -- -- Text files in stanza format -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- This is the format used for channel descriptors and channel items. A -- stanza file is comprised of a list of stanzas which a separated by one or -- more empty lines (ie, lines containing nothing but white space). Each -- such stanza is an association list of tags and values. Tags start in the -- leftmost column and are terminated by a colon or the end of line. There -- can be no white space within a tag, but there can be trailing white space, -- which is discarded. The value associated with a tag is the string -- following the tag-terminating colon. Such a value string extends to the -- start of the next tag or end of the current stanza, whichever comes first. -- Hence, value strings can span multiple lines, but any line after the -- initial one, ie the one containing the tag, must have white space in the -- leftmost column. If a tag is terminated by the end of line, instead of a -- colon, its associated value string is empty. -- -- A line starting with `--' is a comment line, and hence, ignored, -- -- Channel descriptors and channel items -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- The tags used in channel descriptors and channel items are the same as -- those in the RSS 2.0 specification, but we permit arbitrary use of upper -- and lowercase letters and have the following special cases: -- -- `lastBuildDate': ignored, as it is added by lambdaFeed -- `generator' : ignored, as it is added by lambdaFeed -- `pubDate' : using file modification time if absent (only for items) -- `cloud' : not implemented yet -- `rating' : not implemented yet -- `textInput' : not implemented yet -- `skipHours' : not implemented yet -- `skipDays' : not implemented yet -- -- Elements that have attributes need to be in a stanza of their own. The -- element tag must be the tag of the first line of the stanze. All other -- lines must be attribute tags of that element. -- -- Moreover, we have an extra tag in channel descriptors: -- -- `items' : required element giving the filename of a directory with -- the channel's items -- -- Docs -- ~~~~ -- RSS 2.0 spec: -- --- Todo ---------------------------------------------------------------------- -- -- * Options to truncate news streams during formatting: (1) Maximal number -- of items rendered, (2) cut off date for rendering, and (3) maximum age -- of items to render. We should be able to specify this either for the -- XML and HTML output together or for both separately (as with output -- directory). This could also be realised by synthetic channels being able -- to specify such criteria. -- -- * More elaborate channel formatting and/or synthetic channels: Channels -- that don't have an item directory, but instead instructions on how to -- synthesise them from other channels by eg, merging and filtering. -- -- * Should also have a CGI script mode, where it dumps either the XML or -- HTML to stdout. (Could also do that via a shell script, but Haskell may -- be more portable. However, CGI "scripts" could be separate, small -- Haskell programs.) -- -- * Support cloud specification. -- -- * Don't regenerate a channel if neither its description nor items changed -- and/or some caching. -- -- * Validate the format of some fields; eg, URLs should really look like -- URLs, same for email addresses, but only warnings in case of errors. -- -- * Read channels in RSS format (probably with HaXml). They'd just be .xml -- files in the feed directory. -- -- * Use plugins to do the reading and writing of various formats and for -- formatting the feed. Also for configuration as in Yi. -- -- * It might be nice to generate atom feeds, too. There are good reasons to -- prefer Atom . It seems that a -- good compromise is requiring input to be Atom-ish, but to be able to -- render feeds in either format. -- module LambdaFeed (main) where -- hierachical libraries -- import Control.Monad ( liftM, mplus) import Data.Char ( isDigit, toLower) import Data.List ( sortBy) import Data.Maybe ( isNothing, fromJust) import Numeric ( showFFloat) import System.CPUTime ( getCPUTime) import System.Directory ( getDirectoryContents) import System.IO ( stderr) import System.Time ( ClockTime, getClockTime) -- lambdaFeed import Config ( version, chanSuffix, itemSuffix, htmlSuffix, rssSuffix, Config(..), processArgs) import Date ( Date(..), parseDate) import Error ( elementErrStr, abortWith, abortWithIO, exitSuccess) import Feed ( URL, Feed(..), Channel(..), Image(..), Category(..), Item(..), Enclosure(..), GUID(..), Source(..), Info(..), copyChannelInfo, defaultChannel, defaultImage, defaultCategory, defaultItem, defaultEnclosure, defaultGUID, defaultSource) import HTML ( channelToHTML) import RSS ( channelToRSS) import Stanza ( Stanza, StanzaAssoc, readStanzas, StanzaProcessor, Proc(..), Action(..), parseStanzas) -- Feed processing -- --------------- -- Summary information -- data Summary = Summary { -- number of items for each cannel channelItemsSummary :: [Int] } -- Read all feed data into our internal feed structure. -- readFeed :: Config -> IO (Feed [Item]) readFeed config = do let feedDir = feedDirCfg config sayV config $ "Getting channels from feed directory `" ++ feedDir ++ "'" -- fnames <- getDirectoryContents feedDir let chanFNames = [ feedDir fname | fname <- fnames, suffix fname == chanSuffix] -- sayV config $ "Reading " ++ show (length chanFNames) ++ " channel(s)" -- chanDescs <- mapM (readChanDesc config) chanFNames -- sayD config 1 $ show chanDescs -- liftM Feed $ mapM (readChan config) chanDescs -- Read a channel description from the file of the given name and modify its -- build date to be the current time. -- -- * We interpret the `itemsChan' file path relative to path from which we -- read the channel description (unless it is already absolute). -- readChanDesc :: Config -> FilePath -> IO (Channel FilePath) readChanDesc config fname = do text <- readStanzas fname let (chan, errs) = parseChanDesc fname text mapM (say config) errs currTime <- getClockTime return $ chan { generatorChan = Just version, lastBuildDateChan = Just $ Date currTime, itemsChan = dirname fname itemsChan chan, -- adjust path to item files infoChan = Info { fnameInfo = (stripSuffix . basename) fname }, docsChan = docsChan chan `mplus` Just specUrl } where specUrl = "http://www.rssboard.org/rss-specification" -- Read the items from the items directrory specified by the channel into the -- channel data structure. -- readChan :: Config -> Channel FilePath -> IO (Channel [Item]) readChan config chan = do let itemsDir = itemsChan chan sayV config $ "Getting items for " ++ titleChan chan ++ " from `" ++ itemsDir ++ "'" -- fnames <- getDirectoryContents itemsDir let itemFNames = [ itemsDir fname | fname <- fnames, suffix fname == itemSuffix] -- sayV config $ "Reading " ++ show (length itemFNames) ++ " items(s)" -- items <- mapM (readItem config) itemFNames -- sayD config 2 $ show items -- return $ (copyChannelInfo chan) {itemsChan = items} -- Read an item from the file of the given name. -- readItem :: Config -> FilePath -> IO Item readItem config fname = do text <- readStanzas fname let (item, errs) = parseItem fname text mapM (say config) errs pubDate <- case pubDateItem item of Just pubDate -> return pubDate Nothing -> do currTime <- liftM Date $ getClockTime appendFile fname $ "\n\nPubDate: " ++ show currTime ++ "\n" return currTime return $ item {pubDateItem = Just pubDate} -- Generate the requested feed representation. -- formatFeed :: Config -> Feed [Item] -> IO (Feed [Item]) formatFeed config (Feed chans) = liftM Feed $ mapM (formatChan config) chans formatChan :: Config -> Channel [Item] -> IO (Channel [Item]) formatChan config chan = do sayV config $ "Sorting channel `" ++ titleChan chan ++ "'" return $ chan {itemsChan = take (maxItemsOpt config) . sortBy newer $ itemsChan chan} where item1 `newer` item2 = case pubDateItem item1 `compare` pubDateItem item2 of LT -> GT EQ -> EQ GT -> LT -- Write feed data to HTML and RSS files. -- writeFeed :: Config -> Feed [Item] -> IO Summary writeFeed config (Feed chans) = do mapM generateHTML chans mapM generateRSS chans return $ Summary {channelItemsSummary = map (length . itemsChan) chans} where generateHTML chan = do let fname = htmlDirCfg config (fnameInfo . infoChan) chan <.> htmlSuffix sayV config $ "Writing HTML for channel `" ++ titleChan chan ++ "' to `" ++ fname ++ "'" writeFile fname (channelToHTML config chan) generateRSS chan = do let fname = rssDirCfg config (fnameInfo . infoChan) chan <.> rssSuffix sayV config $ "Writing RSS for channel `" ++ titleChan chan ++ "' to `" ++ fname ++ "'" writeFile fname (channelToRSS config chan) -- Print summary of feed data. -- printSummary :: Config -> Summary -> IO () printSummary config summary = do pico <- getCPUTime let noChannels = length (channelItemsSummary summary) noItems = sum (channelItemsSummary summary) secs = fromInteger (pico `div` 1000000000) / 1000 :: Float sayV config $ "Processed " ++ show noItems ++ " item(s) in " ++ show noChannels ++ " channel(s) in " ++ showFFloat (Just 3) secs "s." -- Stanza parsing -- -------------- -- Parse a channel description, returning our internal channel representation -- in the flavour that the name of the directory containing the channel's -- items. -- -- * The first argument is to identify the source in error messages. -- -- * Missing tags in a channel description lead to a fatal error (i.e., no -- channel description is returned, only some errors). Otherwise, errors -- are not fatal (parsing will still have produced a valid channel -- descriptor, although may be not exactly what the user intended). -- parseChanDesc :: String -> [Stanza] -> (Channel FilePath, [String]) parseChanDesc sname stanzas = case parseStanzas chanDescProc sname stanzas defaultChannel of (Nothing , errs) -> abortWith errs (Just chan, errs) -> (chan, errs) where chanDescProc = -- required elements [ ("title", Proc True $ Simple $ cantFail $ \v c -> c {titleChan = v} ) , ("link", Proc True $ Simple $ cantFail $ \v c -> c {linkChan = v} ) , ("description", Proc True $ Simple $ cantFail $ \v c -> c {descriptionChan = v} ) , ("items", Proc True $ Simple $ cantFail $ \v c -> c {itemsChan = v} ) -- optional elements , ("language", Proc False $ Simple $ cantFail $ \v c -> c {languageChan = Just v} ) , ("copyright", Proc False $ Simple $ cantFail $ \v c -> c {copyrightChan = Just v} ) , ("managingeditor", Proc False $ Simple $ cantFail $ \v c -> c {managingEditorChan = Just v} ) , ("webmaster", Proc False $ Simple $ cantFail $ \v c -> c {webMasterChan = Just v} ) , ("pubdate", Proc False $ Simple $ mayFail parseDate "bad date format (see RSS 2.0 spec)" $ \v c -> c {pubDateChan = Just v} ) , ("lastbuilddate", Proc False $ Simple $ mayFail parseDate "bad date format (see RSS 2.0 spec)" $ \v c -> c {lastBuildDateChan = Just v} ) , ("category", Proc False $ Compound $ cantFailStanza parseCategory $ \v c -> c {categoryChan = categoryChan c ++ [v]} ) , ("generator", Proc False $ Simple $ cantFail $ \v c -> c {generatorChan = Just v} ) , ("docs", Proc False $ Simple $ cantFail $ \v c -> c {docsChan = Just v} ) , ("cloud", Proc False $ Simple $ cantFail $ \v c -> c {cloudChan = ()} -- ignore ) , ("ttl", Proc False $ Simple $ mayFail parseNat "positive integer expected" $ \v c -> c {ttlChan = Just v} ) , ("image", Proc False $ Compound $ mayFailStanza parseImage $ \v c -> c {imageChan = v} ) , ("rating", Proc False $ Simple $ cantFail $ \v c -> c {ratingChan = ()} -- ignore ) , ("textinput", Proc False $ Simple $ cantFail $ \v c -> c {textInputChan = ()} -- ignore ) , ("skiphours", Proc False $ Simple $ cantFail $ \v c -> c {skipDaysChan = ()} -- ignore ) , ("skipdays", Proc False $ Simple $ cantFail $ \v c -> c {skipDaysChan = ()} -- ignore ) ] -- mayFail = mayFailS sname mayFailStanza = mayFailStanzaS sname -- Parse an image element. -- parseImage :: String -> Stanza -> (Maybe Image, [String]) parseImage sname stanza@((_, (start, end), _):_) = parseStanzas imageProc extendedSName [stanza] defaultImage where imageProc = -- required elements [ ("image", Proc True $ Simple $ mayFail parseNull "image tag can have no value" $ \v i -> i -- image tag itself has no data ) , ("url", Proc True $ Simple $ cantFail $ \v i -> i {urlImage = v} ) , ("title", Proc True $ Simple $ cantFail $ \v i -> i {titleImage = v} ) , ("link", Proc True $ Simple $ cantFail $ \v i -> i {linkImage = v} ) -- optional elements , ("width", Proc False $ Simple $ mayFail parseNat "positive integer expected" $ \v c -> c {widthImage = Just v} ) , ("height", Proc False $ Simple $ mayFail parseNat "positive integer expected" $ \v c -> c {heightImage = Just v} ) , ("description", Proc False $ Simple $ cantFail $ \v i -> i {descriptionImage = Just v} ) ] -- mayFail = mayFailS sname -- extendedSName = sname ++ ":" ++ show start ++ "-" ++ show end ++ ": image" -- Parse a category element (can't fail) -- parseCategory :: Stanza -> Category parseCategory stanza = (fromJust . fst) (parseStanzas categoryProc "" [stanza] defaultCategory) where categoryProc = -- required elements [ ("category", Proc True $ Simple $ cantFail $ \v i -> i {categoryCategory = v} ) -- optional elements , ("domain", Proc False $ Simple $ cantFail $ \v i -> i {domainCategory = Just v} ) ] -- Parse an item, returning our internal item representation. -- -- * The first argument is to identify the source in error messages. -- -- * It's a fatal error if there is neither a title nor a description. -- Otherwise, errors are not fatal (parsing will still have produced a valid -- channel descriptor, although may be not exactly what the user intended). -- parseItem :: String -> [Stanza] -> (Item, [String]) parseItem sname stanzas = case parseStanzas itemProc sname stanzas defaultItem of (Nothing , errs) -> error "parseItem: Impossible" -- no required items (Just item, errs) | isNothing (titleItem item) && isNothing (descriptionItem item) -> abortWith [sname ++ ": an item must have a title or description"] | otherwise -> (item, errs) where itemProc = -- optional elements [ ("title", Proc False $ Simple $ cantFail $ \v c -> c {titleItem = Just v} ) , ("link", Proc False $ Simple $ cantFail $ \v c -> c {linkItem = Just v} ) , ("description", Proc False $ Simple $ cantFail $ \v c -> c {descriptionItem = Just v} ) , ("author", Proc False $ Simple $ cantFail $ \v c -> c {authorItem = Just v} ) , ("category", Proc False $ Compound $ cantFailStanza parseCategory $ \v c -> c {categoryItem = categoryItem c ++ [v]} ) , ("comments", Proc False $ Simple $ cantFail $ \v c -> c {commentsItem = Just v} ) , ("enclosure", Proc False $ Compound $ mayFailStanza parseEnclosure $ \v c -> c {enclosureItem = v} ) , ("guid", Proc False $ Compound $ mayFailStanza parseGUID $ \v c -> c {guidItem = v} ) , ("pubdate", Proc False $ Simple $ mayFail parseDate "bad date format (see RSS 2.0 spec)" $ \v c -> c {pubDateItem = Just v} ) , ("source", Proc False $ Compound $ mayFailStanza parseSource $ \v c -> c {sourceItem = v} ) ] -- mayFail = mayFailS sname mayFailStanza = mayFailStanzaS sname -- Parse an enclosure element. -- parseEnclosure :: String -> Stanza -> (Maybe Enclosure, [String]) parseEnclosure sname stanza@((_, (start, end), _):_) = parseStanzas enclosureProc extendedSName [stanza] defaultEnclosure where enclosureProc = -- required elements [ ("enclosure", Proc True $ Simple $ mayFail parseNull "enclosure tag can have no value" $ \v i -> i -- enclosure tag itself has no data ) , ("url", Proc True $ Simple $ cantFail $ \v i -> i {urlEnclosure = v} ) , ("length", Proc True $ Simple $ mayFail parseNat "positive integer expected" $ \v i -> i {lengthEnclosure = v} ) , ("type", Proc True $ Simple $ cantFail $ \v i -> i {typeEnclosure = v} ) ] -- mayFail = mayFailS sname -- extendedSName = sname ++ ":" ++ show start ++ "-" ++ show end ++ ": enclosure" -- Parse a guid element. -- parseGUID :: String -> Stanza -> (Maybe GUID, [String]) parseGUID sname stanza@((_, (start, end), _):_) = parseStanzas guidProc extendedSName [stanza] defaultGUID where guidProc = -- required elements [ ("guid", Proc True $ Simple $ cantFail $ \v i -> i {guidGUID = v} ) -- optional elements , ("ispermalink", Proc False $ Simple $ mayFail parseBool "true or false expected" $ \v i -> i {isPermaLinkGUID = Just v} ) ] -- mayFail = mayFailS sname -- extendedSName = sname ++ ":" ++ show start ++ "-" ++ show end ++ ": guid" -- Parse a source element. -- parseSource :: String -> Stanza -> (Maybe Source, [String]) parseSource sname stanza@((_, (start, end), _):_) = parseStanzas sourceProc extendedSName [stanza] defaultSource where sourceProc = -- required elements [ ("source", Proc True $ Simple $ cantFail $ \v i -> i {sourceSource = v} ) , ("url", Proc True $ Simple $ cantFail $ \v i -> i {urlSource = v} ) ] -- extendedSName = sname ++ ":" ++ show start ++ "-" ++ show end ++ ": source" -- Parse an integer value. -- parseNat :: String -> Maybe Int parseNat str | all isDigit str = Just $ read str | otherwise = Nothing -- Parse a Boolean. -- parseBool :: String -> Maybe Bool parseBool str | strLower == "true" = Just True | strLower == "false" = Just False | otherwise = Nothing where strLower = map toLower str -- Parser that simply asserts that it is being passed an empty string. -- parseNull :: String -> Maybe () parseNull s | null s = Just () | otherwise = Nothing -- Simple action without any special parsing. -- cantFail :: (String -> r -> r) -- record updater -> (StanzaAssoc -> r -> (r, [String])) cantFail upd = \(_, _, v) c -> (upd v c, []) -- Compound action without any special parsing. -- cantFailStanza :: (Stanza -> a) -- stanza parser -> (a -> r -> r) -- record updater -> (Stanza -> r -> (r, [String])) cantFailStanza parse upd = \stanza rec -> (upd (parse stanza) rec, []) -- Action with parsing that may fail. -- mayFailS :: String -- source name -> (String -> Maybe a) -- value parser -> String -- error message -> (a -> r -> r) -- record updater -> (StanzaAssoc -> r -> (r, [String])) mayFailS sname parse err upd = \(_, range, v) rec -> case parse v of Just x -> (upd x rec, []) Nothing -> (rec , [elementErrStr sname range err]) -- Action with parsing that may fail. -- mayFailStanzaS :: String -- source name -> (String -> Stanza -> (a, [String])) -- stanza parser -> (a -> r -> r) -- record updater -> (Stanza -> r -> (r, [String])) mayFailStanzaS sname parse upd = \stanza rec -> let (v, errs) = parse sname stanza in (upd v rec, errs) -- Filename operations (this stuff won't work on win32) -- ------------------- -- Project the suffix from a filename. -- suffix :: FilePath -> String suffix = reverse . takeWhile (/= '.') . reverse -- Remove the suffix from a filename. -- stripSuffix :: FilePath -> String stripSuffix fname | null fname' = fname | otherwise = reverse (tail fname') where fname' = dropWhile (/= '.') . reverse $ fname -- Drop the directory of a path. -- basename :: FilePath -> String basename fname | null fname' = fname | otherwise = fname' where fname' = reverse . takeWhile (/= '/') . reverse $ fname -- Drop the file name of a path. -- dirname :: FilePath -> String dirname = reverse . dropWhile (/= '/') . reverse -- Join a directory and file path. If the file path is already absolute, -- ignore the directory argument. -- () :: FilePath -> FilePath -> FilePath "" fname = fname dir "" = dir dir fname | head fname == '/' = fname | last dir == '/' = dir ++ fname | otherwise = dir ++ "/" ++ fname -- Join a suffix to a file name. -- (<.>) :: FilePath -> String -> FilePath fname <.> "" = fname fname <.> suffix = fname ++ "." ++ suffix -- Logging -- ------- -- Warnings if not quiet. -- say :: Config -> String -> IO () say config str | not (quietCfg config) = putStrLn str | otherwise = return () -- Information messages if in verbose mode. -- sayV :: Config -> String -> IO () sayV config str | verboseCfg config = putStrLn str | otherwise = return () -- Diagnostic messages if in debug mode. -- sayD :: Config -> Int -> String -> IO () sayD config lvl str | debugCfg config >= lvl = putStrLn str | otherwise = return () -- Main -- ---- main :: IO () main = do config <- processArgs ( readFeed config >>= formatFeed config >>= writeFeed config >>= printSummary config) `catch` \e -> abortWithIO [show e] exitSuccess