{-# LANGUAGE PatternGuards,BangPatterns #-} -------------------------------------------------------------------- -- | -- Module: Watch an RSS/Atom feed and write it to an IRC channel -- Copyright : (c) Don Stewart 2008-2009, Simon Michael 2009 -- License : BSD3 -- More info : rss2irc.cabal -- -------------------------------------------------------------------- import Control.Concurrent (forkIO,threadDelay) import Control.Concurrent.Chan (Chan,newChan,writeList2Chan,readChan,unGetChan) import Control.Monad (when,unless,forever) import Data.List (isPrefixOf,foldl',stripPrefix,intercalate) import Data.List.Split (splitOn) import Data.Either (either) import Data.Maybe (fromMaybe,fromJust,isJust) import Data.Time.Clock (getCurrentTime) import Data.Time.LocalTime (LocalTime,getCurrentTimeZone,utcToLocalTime) import Network.HTTP (Response,getRequest,rspBody,rspCode) import Network.Browser (browse,request,setAllowRedirects,setOutHandler) import Network (PortID(PortNumber), connectTo) import qualified Control.Exception.Extensible as E (bracket,bracket_,catch,fromException) import Control.Parallel.Strategies (NFData) --(rnf),($|)) import System.Console.GetOpt (OptDescr(Option), ArgDescr(ReqArg,NoArg), ArgOrder(Permute), getOpt, usageInfo) import System.Environment (getArgs) import System.Exit (exitWith, ExitCode(ExitSuccess), exitFailure) import System.IO (Handle, BufferMode(NoBuffering),stdout,hSetBuffering,hFlush,hClose,hGetLine) import Text.Feed.Import (parseFeedString) import Text.Feed.Query (feedItems ,getItemTitle ,getItemLink ,getItemPublishDate ,getItemDate ,getItemAuthor ,getItemCommentLink ,getItemEnclosure ,getItemFeedLink ,getItemId ,getItemCategories ,getItemRights ,getItemSummary ,getItemDescription ) import Text.Feed.Types (Feed(XMLFeed),Item) import Text.Printf (printf,hPrintf) import Text.RegexPR (splitRegexPR,gsubRegexPR) -- import Debug.Trace -- strace :: Show a => a -> a -- strace a = trace (show a) a defaultport, defaultinterval, defaultmaxitems, maxmessagelength :: Int defaultport = 6667 defaultinterval = 5 -- minutes defaultmaxitems = 5 -- items maxmessagelength = 400 -- characters options :: [OptDescr Opt] options = [Option ['p'] ["port"] (ReqArg Port "PORT") "irc server port (default 6667)" ,Option [] ["ident"] (ReqArg Ident "STR") "set the bot's identity string (useful for contact info)" ,Option [] ["delay"] (ReqArg Delay "N") "wait for N seconds before starting (helps avoid mass joins)" ,Option ['i'] ["interval"] (ReqArg Interval "N") ("polling interval in minutes (default "++(show defaultinterval)++")") ,Option ['r'] ["recent"] (ReqArg Recent "N") "announce up to N recent items at startup (default 0)" ,Option ['m'] ["max-items"] (ReqArg MaxItems "N") ("announce at most N items per polling interval (default "++(show defaultmaxitems)++")") ,Option [] ["no-title"] (NoArg NoTitle) "don't show title (title is announced by default)" ,Option ['a'] ["author"] (NoArg Author) "show author" ,Option ['d'] ["description"] (NoArg Description) "show description" ,Option ['l'] ["link"] (NoArg Link) "show link URL" ,Option ['t'] ["time"] (NoArg Time) "show timestamp" ,Option ['e'] ["email"] (NoArg Email) "show email addresses (these are stripped by default)" ,Option ['h'] ["html"] (NoArg Html) "show HTML tags and entities (these are stripped by default)" ,Option [] ["replace"] (ReqArg Replace "\"OLD/NEW\"") "replace OLD with NEW (regexpr patterns)" ,Option ['n'] ["num-iterations"] (ReqArg NumIterations "N") "exit after N iterations" ,Option ['q'] ["quiet"] (NoArg Quiet) "silence normal console output" ,Option [] ["debug"] (NoArg Debug) "do not connect to irc" ,Option [] ["debug-irc"] (NoArg DebugIrc) "show irc activity" ,Option [] ["debug-feed"] (NoArg DebugFeed) "show feed items and polling stats" ,Option [] ["debug-xml"] (NoArg DebugXml) "show feed content" ,Option [] ["debug-http"] (NoArg DebugHttp) "show feed fetching progress" ] help :: IO a help = do putStrLn "Usage: rss2irc [OPTS] FEEDURL [irc://]BOTNAME@IRCSERVER/#CHANNEL" putStrLn " or: rss2irc [OPTS] FEEDURL (same as --debug)" putStrLn "Options:" putStrLn (usageInfo "" options) exitWith ExitSuccess data Opt = Port {value::String} | Ident {value::String} | Delay {value::String} | Interval {value::String} | MaxItems {value::String} | Recent {value::String} | Author | Description | Time | Link | NoTitle | Email | Html | Replace {value::String} | NumIterations {value::String} | Quiet | Debug | DebugHttp | DebugXml | DebugFeed | DebugIrc deriving (Eq, Show) data Bot = Bot { socket :: Handle , server :: String , port :: !Int , channel :: String , nick :: String , botopts :: ![Opt] , outputqueue :: Chan String } instance NFData Item main :: IO () main = do (opts, args, errs) <- getOpt Permute options `fmap` getArgs let delay = optIntValue Delay 0 opts interval = optIntValue Interval defaultinterval opts errs' = errs ++ if (interval > 0 || (isJust $ numIterations opts)) then [] else ["Eh.. no."] when (not . null $ errs') $ mapM_ putStrLn errs' >> help -- force early failure if there is a bad regexp. XXX Don't know how to catch this yet. seq (applyReplacements opts "") $ return () when (delay > 0) $ threadDelay $ delay * 1000000 q <- newChan (feed,bot) <- case args of [f,nsc] -> case map (splitOn "@") $ splitOn "/" $ maybe nsc id (stripPrefix "irc://" nsc) of [[n,s],[c]] -> return (f, Bot{ socket = stdout , server = s , port = optIntValue Port defaultport opts , channel = c , nick = n , botopts = opts , outputqueue = q }) _ -> help [f] -> return (f, Bot{ socket = stdout , server = "" , port = 0 , channel = "" , nick = "" , botopts = Debug:opts , outputqueue = q }) _ -> help E.bracket (connect bot) (disconnect) (\b -> run b feed `E.catch` exit b) where exit b e = case E.fromException e of Just ExitSuccess -> exitWith ExitSuccess _ -> printf "Error, %s died with: %s\n" (nick b) (show e) >> exitFailure -- | Connect to the irc server. connect :: Bot -> IO Bot connect b | Debug `elem` (botopts b) = do unless (Quiet `elem` botopts b) $ printf "Skipping IRC connection due to --debug\n" return b connect b = do let opts = botopts b E.bracket_ (unless (Quiet `elem` opts) $ getTimeStamp >>= \t -> printf "%s: %s connecting to %s, channel %s... " t (nick b) (server b) (channel b) >> hFlush stdout) (unless (Quiet `elem` opts) $ printf "connected.\n" >> hFlush stdout) (do h <- connectTo (server b) (PortNumber (fromIntegral (port b))) hSetBuffering h NoBuffering return b{socket=h}) -- | Join a channel, and start processing commands. run :: Bot -> String -> IO () run b@(Bot{nick=n,channel=c,botopts=opts}) f = do let ident = optValue Ident "rss2irc gateway" opts ircWrite b $ "NICK " ++ n ircWrite b $ printf "USER %s 0 * :%s" n ident ircWrite b $ "JOIN " ++ c forkIO $ feedReaderThread b f forkIO $ ircWriterThread b ircResponderThread b -- | Disconnect from the irc server. disconnect :: Bot -> IO () disconnect b | Debug `elem` (botopts b) = return () disconnect b = hClose $ socket b -- feed stuff -- | Poll the feed every interval minutes and announce any new items to irc. -- -- Here's the approach that seems most robust for now: -- Feeds are assumed to provide items sorted newest first. -- Announceable items are detected by taking unseen items from the top. -- "Seen" is implemented by keeping a rolling cache of likely-unique item ids. -- feedReaderThread :: Bot -> String -> IO () feedReaderThread b@(Bot{botopts=opts,outputqueue=q}) url = do when (iterations == Just 0) $ exitIterations unless (Quiet `elem` opts) $ printf "Polling %s every %s\n" url m >> hFlush stdout (is,polls,failedpolls) <- pollUntilFetchItems url delay opts 0 0 -- go no further until we have baseline items let seen = map itemId is recent = take (optIntValue Recent 0 opts) is announceable = reverse recent (announcenow,toannounce) = splitAt maxitems announceable numannounced = fromIntegral $ length announcenow when (DebugFeed `elem` opts) $ do getTimeStamp >>= printf ("\n%s: polled " ++ url ++ "\n") printItemDetails "initial feed items, in feed order" is printItemDetails "announceable items, oldest first" announceable printf "total polls, failed polls, items announced: %10d %10d %10d\n" polls failedpolls numannounced hFlush stdout writeList2Chan q $ map (announce b) announcenow go seen toannounce iterations numannounced polls failedpolls where go :: [String] -> [Item] -> Maybe Int -> Integer -> Integer -> Integer -> IO () go !seen !toannounce !iterationsleft !numannounced !polls !failedpolls = do when (iterationsleft == Just 0) $ exitIterations threadDelay delay when (DebugFeed `elem` opts) $ do getTimeStamp >>= \t -> printf "\n%s: polling %s\n" t url fetched <- fetchItems url opts let polls' = polls + 1 failedpolls' | isLeft fetched = failedpolls+1 | otherwise = failedpolls fetched' = either (const []) id fetched seen' = take windowsize $ (map itemId new) ++ seen new = takeWhile (not.(`elem` seen).itemId) fetched' announceable = toannounce ++ reverse new (announcenow,toannounce') = splitAt maxitems announceable numannounced' = numannounced + (fromIntegral $ length announcenow) iterationsleft' = maybe Nothing (Just . pred) iterationsleft when (DebugFeed `elem` opts) $ do printItemDetails "current feed items, in feed order" fetched' printItemDetails "new unseen items, in feed order" new printItemDetails "announceable items, oldest first" announceable printf "total polls, failed polls, items announced: %10d %10d %10d\n" polls' failedpolls' numannounced' hFlush stdout writeList2Chan q $ map (announce b) announcenow go seen' toannounce' iterationsleft' numannounced' polls' failedpolls' iterations = numIterations opts exitIterations = unless (Quiet `elem` opts) (printf "Exiting after %d iterations.\n" (fromJust iterations)) >> exitWith ExitSuccess interval = optIntValue Interval defaultinterval opts m = if interval==1 then "minute" else show interval ++ " minutes" delay = interval * minutes maxitems = optIntValue MaxItems defaultmaxitems opts windowsize = 200 -- | Get the best available unique identifier for a feed item. itemId :: Item -> String itemId i = case getItemId i of Just (_,s) -> s Nothing -> case getItemTitle i of Just s -> s Nothing -> case getItemDate i of Just s -> s Nothing -> show i -- | Like fetchItems, but if it fails with a transient error, keep -- retrying at the specified interval. Returns a tuple of items, poll -- and failed poll counts, or throws an IO error. pollUntilFetchItems :: String -> Int -> [Opt] -> Integer -> Integer -> IO ([Item],Integer,Integer) pollUntilFetchItems url delay opts polls failedpolls = do is <- fetchItems url opts case is of Right is' -> return (is',polls+1,failedpolls) Left _ -> do threadDelay delay pollUntilFetchItems url delay opts (polls+1) (failedpolls+1) -- | Get the items from the feed at the specified url, with redirects -- and authentication allowed, or an error string, or throw an IO -- error if the error looks permanent. fetchItems :: String -> [Opt] -> IO (Either String [Item]) fetchItems url opts = either Left (Right . feedItems) `fmap` readFeed url opts -- | Fetch a feed, with redirects and authentication allowed, or an error string, -- or throw an IO error if the error looks permanent. Also show the raw content -- as debug output if that option is in effect. readFeed :: String -> [Opt] -> IO (Either String Feed) readFeed url opts = do s <- readUri url opts case s of Left e -> return $ Left e Right s' -> do when (DebugXml `elem` opts) $ do getTimeStamp >>= \t -> printf "\n%s: feed content:\n%s\n" t s' case parseFeedString s' of Nothing -> noparse Just (XMLFeed _) -> noparse Just f -> return $ Right f where noparse = fail "could not parse feed" -- fatal or not ? return $ Left -- | Fetch the contents of a uri, with redirects and authentication allowed, or an error string, -- or throw an IO error if the error looks permanent. Also show the http transaction progress -- as debug output if that option is in effect. readUri :: String -> [Opt] -> IO (Either String String) readUri url opts = do (_url',rsp) <- browse $ do if (DebugHttp `elem` opts) then setOutHandler (\s -> getTimeStamp >>= \t -> printf "\n%s: %s\n" t s) else setOutHandler (const $ return ()) setAllowRedirects True request $ getRequest url case rspCode rsp of (2,_,_) -> return $ Right $ rspBody rsp code -> do getTimeStamp >>= \t -> printf "%s: Error fetching %s: %s\n" t url (show code) >> hFlush stdout case permanent code of False -> return $ Left $ show code True -> fail $ show code where permanent c | c `elem` [(4,0,4)] = True -- http://en.wikipedia.org/wiki/List_of_HTTP_status_codes | otherwise = False -- | Dump item details to the console for debugging. printItemDetails :: String -> [Item] -> IO () printItemDetails hdr is = printf "%s: %d\n%s" hdr count items >> hFlush stdout where items = unlines [printf " %-29s%s %-*s" d p twidth t | (d,p,t,_) <- fields] twidth = maximum $ map (length.fromMaybe "".getItemTitle) is -- subhdr = "(date, (publish date if different), title)\n" -- subhdr' = if null is then "" else subhdr count = length is fields = [(d, if p==d then "" else printf " pubdate:%-29s" p, t, i) | item <- is ,let d = fromMaybe "" $ getItemDate item ,let p = fromMaybe "" $ getItemPublishDate item ,let t = fromMaybe "" $ getItemTitle item ,let i = maybe "" show $ getItemId item ] -- deriving instance Eq Item instance Eq Item where (==) a b = let match f = f a == f b in all match [getItemTitle ,getItemLink ,getItemPublishDate ,getItemDate ,getItemAuthor ,getItemCommentLink ,getItemFeedLink ,getItemRights ,getItemSummary ,getItemDescription ] && match getItemCategories && match getItemEnclosure && match getItemId -- | Convert this item to an announcement string for the bot. announce :: Bot -> Item -> String announce (Bot{botopts=opts}) item = msg' where msg' = applyReplacements opts msg msg = printf "%s%s%s%s%s" title desc author date link title = if elem NoTitle opts then "" else clean $ fromMaybe "" (getItemTitle item) author = ifopt Author $ clean $ maybe "" ((" "++).parenthesise) (getItemAuthor item) desc = ifopt Description $ oneline $ clean $ maybe "" (" - "++) (getItemDescription item) date = ifopt Time $ maybe "" (" "++) (getItemDate item) link = ifopt Link $ maybe "" (" "++) (getItemLink item) ifopt o = if elem o opts then id else const "" oneline = intercalate " " . lines clean = striphtml . stripemail striphtml = if elem Html opts then id else stripHtml stripemail = if elem Email opts then id else stripEmails parenthesise = (++")").("("++) -- irc stuff -- | Print strings which appear in the bot's output queue to its irc channel, -- complying with bot and irc server policies. ircWriterThread :: Bot -> IO () ircWriterThread b@(Bot{outputqueue=q}) = forever $ do -- simple policy: no more than one message per 2s, no more than 400 chars per message msg <- readChan q case splitAt maxmessagelength msg of (s,[]) -> ircPrivmsgH b s (s,rest) -> do unGetChan q ("..."++(reverse $ take 3 $ reverse s)++rest) ircPrivmsgH b ((take (length s - 3) s) ++ "...") threadDelay $ 2 * seconds -- | Handle any incoming commands from the bot's irc channel. -- The following commands are supported: PING. ircResponderThread :: Bot -> IO () ircResponderThread b@(Bot{socket=h,botopts=opts}) = forever $ do if (Debug `elem` opts) then threadDelay $ 60 * minutes else do s <- init `fmap` hGetLine h when (DebugIrc `elem` opts) $ getTimeStamp >>= \t -> printf "%s: <-%s\n" t s if isping s then pong s else return () where isping x = "PING :" `isPrefixOf` x pong x = ircWrite b $ "PONG :"++drop 6 x -- | Send a privmsg to the bot's irc server & channel. ircPrivmsgH :: Bot -> String -> IO () ircPrivmsgH b@(Bot{channel=c}) msg = ircWrite b (printf "PRIVMSG %s :%s" c msg::String) -- | Send a string to the bot's irc server unless --debug is in effect, -- and to the console if --debug-irc is in effect. ircWrite :: Bot -> String -> IO () ircWrite (Bot{socket=h,botopts=opts}) s = do when (not $ Debug `elem` opts) $ hPrintf h "%s\r\n" s debugoutput where debugoutput | DebugIrc `elem` opts = getTimeStamp >>= \t -> printf "%s: ->%s\n" t s >> hFlush stdout | not (Quiet `elem` opts) && ("PRIVMSG" `isPrefixOf` s) = (printf "%s\n" $ drop 1 $ dropWhile (/=':') s) >> hFlush stdout | otherwise = return () -- utils -- | Apply all --replace substitutions to a string, in turn. -- Warning, will fail at runtime if there is a bad regexp. applyReplacements :: [Opt] -> String -> String applyReplacements opts = foldl' (.) id (reverse substitutions) where substitutions = map make $ optValues Replace opts make s = case splitRegexPR "(? gsubRegexPR pat sub _ -> id -- | Replace any HTML tags or entities in a string with a single space. stripHtml :: String -> String stripHtml = gsubRegexPR "(&[^ \t]*?;|<.*?>)" " " -- | Remove any email addresses from a string. stripEmails :: String -> String stripEmails = gsubRegexPR "(?i) ?(<|<)?\\b[-._%+a-z0-9]+@[-.a-z0-9]+\\.[a-z]{2,4}\\b(>|>)?" "" optValue :: (String -> Opt) -> String -> [Opt] -> String optValue oc def = head . optValues oc . (++[oc def]) optIntValue :: (String -> Opt) -> Int -> [Opt] -> Int optIntValue oc def opts = fromMaybe def $ maybeRead $ optValue oc "" opts optValues :: (String -> Opt) -> [Opt] -> [String] optValues oc opts = concatMap get opts where get o = if oc v == o then [v] else [] where v = value o numIterations :: [Opt] -> Maybe Int numIterations opts = case optIntValue NumIterations (-1) opts of (-1) -> Nothing n -> Just n maybeRead :: Read a => String -> Maybe a maybeRead s = case reads s of [(x, _)] -> Just x _ -> Nothing getCurrentLocalTime :: IO LocalTime getCurrentLocalTime = do t <- getCurrentTime tz <- getCurrentTimeZone return $ utcToLocalTime tz t getTimeStamp :: IO String getTimeStamp = do t <- getCurrentLocalTime tz <- getCurrentTimeZone return $ printf "%s %s" (take 19 $ show t) (show tz) isLeft :: Either a b -> Bool isLeft (Left _) = True isLeft _ = False -- strict :: NFData a => a -> a -- strict = id $| rnf minutes, seconds :: Int minutes = 60 * seconds seconds = 10^(6::Int)