module Web.Feed.Collect
(
Label
, Url
, Error (..)
, FeedConfig ()
, fcLabel
, fcUrl
, fcActive
, fcDebug
, mkFeed
, fromPairs
, WatcherConfig ()
, wcCollect
, wcCollectMany
, wcLogError
, wcCommandQueue
, wcVisitInterval
, wcMaxItems
, wcFeeds
, wcDebug
, collectorNull
, collectorPrint
, collectorPretty
, collectorLog
, run
, CommandQueue ()
, newCommandQueue
, sendCommand
, sendCommands
, Command ()
, addFeed
, removeFeed
, setFeedActive
, setInterval
, setMaxPerVisit
, showFeed
, showItem
, DebugConfig (..)
, setFeedDebug
, setGeneralDebug
)
where
import Control.Concurrent (threadDelay)
import Control.Concurrent.MVar
import Control.Exception (catch)
import Control.Monad (liftM, when)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Default.Class
import Data.List (partition)
import Data.Maybe (catMaybes, fromMaybe, listToMaybe)
import Data.Time.Clock (UTCTime)
import Data.Time.Format
import Data.Time.Interval
import Data.Time.LocalTime (getZonedTime, zonedTimeToUTC)
import Data.Time.RFC2822 (parseTimeRFC2822)
import Data.Time.RFC3339 (parseTimeRFC3339)
import Data.Time.RFC822 (parseTimeRFC822)
import Data.Time.Units (TimeUnit (..), Minute)
import Network.HTTP.Client
import Network.HTTP.Client.TLS (tlsManagerSettings)
import System.IO
import Text.Feed.Import (parseFeedString)
import Text.Feed.Query
import Text.Feed.Types (Item, Feed (..))
import Text.Show.Functions ()
import qualified Data.ByteString.Lazy.UTF8 as BU
type Label = String
type Url = String
data Command
= AddFeed FeedConfig
| RemoveFeed Label
| SetFeedActive Label Bool
| SetInterval Int
| SetMaxPerVisit Int
| SetFeedDebug Label Bool
| SetGeneralDebug Bool
newtype CommandQueue = CommandQueue { cqMVar :: MVar [Command] }
data FeedConfig = FeedConfig
{
fcLabel :: Label
, fcUrl :: Url
, fcActive :: Bool
, fcDebug :: Bool
}
instance Default FeedConfig where
def = FeedConfig
{ fcLabel = ""
, fcUrl = ""
, fcActive = True
, fcDebug = False
}
data DebugConfig = DebugConfig
{
dcDebugCycle :: Bool
, dcGeneralLog :: FilePath
, dcFeedLog :: Label -> FilePath
}
deriving Show
instance Default DebugConfig where
def = DebugConfig
{ dcDebugCycle = False
, dcGeneralLog = "feed-debug.log"
, dcFeedLog = \ l -> "feed-debug-" ++ l ++ ".log"
}
data WatcherConfig m = WatcherConfig
{
wcCollect :: (Label -> Url -> Feed -> Item -> m ())
, wcCollectMany :: Maybe (Label -> Url -> Feed -> [Item] -> m ())
, wcLogError :: (Label -> Error -> m ())
, wcCommandQueue :: Maybe CommandQueue
, wcVisitInterval :: TimeInterval
, wcMaxItems :: Int
, wcFeeds :: [FeedConfig]
, wcDebug :: DebugConfig
}
instance MonadIO m => Default (WatcherConfig m) where
def = WatcherConfig
{ wcCollect = \ l u f i -> liftIO $ collectorPretty l u f i
, wcCollectMany = Nothing
, wcLogError = \ l e -> liftIO $ putStrLn $ l ++ " : " ++ show e
, wcCommandQueue = Nothing
, wcVisitInterval = time (1 :: Minute)
, wcMaxItems = 3
, wcFeeds = []
, wcDebug = def
}
data ItemID
= ByID String | ByTitle String | ByTime String | BySummary String | Unique
deriving Show
instance Eq ItemID where
(ByID i) == (ByID j) = i == j
(ByTitle i) == (ByTitle j) = i == j
(ByTime i) == (ByTime j) = i == j
(BySummary i) == (BySummary j) = i == j
_ == _ = False
data FeedRecord = FeedRecord
{ feedName :: String
, feedUrl :: String
, feedOn :: Bool
, feedPrevIDs :: [ItemID]
, feedUpdated :: Maybe UTCTime
, feedDebug :: Bool
, feedDebugLog :: FilePath
}
deriving Show
data State = State
{ usecInterval :: Int
, maxItemsPerVisit :: Int
, records :: [FeedRecord]
, stDebug :: Bool
}
deriving Show
data Error
= HttpError HttpException
| FeedParsingFailed Url
deriving Show
mkFeed :: Label -> Url -> FeedConfig
mkFeed l u = def { fcLabel = l, fcUrl = u }
fromPairs :: [(Label, Url)] -> [FeedConfig]
fromPairs = map $ uncurry mkFeed
showFeedKind :: Feed -> String
showFeedKind (AtomFeed _) = "Atom"
showFeedKind (RSSFeed _) = "RSS"
showFeedKind (RSS1Feed _) = "RSS1"
showFeedKind (XMLFeed _) = "XML"
showFeed :: Feed -> String
showFeed feed = unwords [kind, title, "by", author, "at", home]
where
kind = '(' : showFeedKind feed ++ ")"
title = getFeedTitle feed
author = fromMaybe none $ getFeedAuthor feed
home = fromMaybe none $ getFeedHome feed
none = "[?]"
showItem :: Item -> String
showItem item = unwords [title, "by", author, "at", date]
where
title = fromMaybe none $ getItemTitle item
author = fromMaybe none $ getItemAuthor item
date = fromMaybe none $ getItemDate item
none = "[?]"
collectorNull :: Label -> Url -> Feed -> Item -> IO ()
collectorNull _ _ _ _ = return ()
collectorPrint :: Label -> Url -> Feed -> Item -> IO ()
collectorPrint _label _url feed item = print feed >> print item
collectorPretty :: Label -> Url -> Feed -> Item -> IO ()
collectorPretty _label _url feed item = do
putStrLn $ showFeed feed
putStrLn $ showItem item
collectorLog :: (Label -> FilePath) -> Label -> Url -> Feed -> Item -> IO ()
collectorLog getPath label url feed item =
withFile (getPath $ getFeedTitle feed) AppendMode $ \ h -> do
hPutStrLn h $ label ++ " : " ++ url
hPutStrLn h $ showFeed feed
hPutStrLn h $ showItem item
hPutChar h '\n'
maxNumIDs :: Int
maxNumIDs = 200
findJust :: [Maybe a] -> Maybe a
findJust = listToMaybe . catMaybes
itemID :: Item -> ItemID
itemID i = fromMaybe Unique $ findJust
[ fmap (ByID . snd) $ getItemId i
, fmap ByTitle $ getItemTitle i
, fmap ByTime $ getItemDate i
, fmap BySummary $ getItemSummary i
]
itemTime :: Item -> Maybe UTCTime
itemTime item =
let mdate = getItemDate item
dateParsers =
fmap zonedTimeToUTC . parseTimeRFC2822 :
fmap zonedTimeToUTC . parseTimeRFC3339 :
fmap zonedTimeToUTC . parseTimeRFC822 :
map
(parseTimeM True defaultTimeLocale)
[ "%a, %d %b %G %T"
, "%Y-%m-%d"
, "%e %b %Y"
, "%a, %e %b %Y %k:%M:%S %z"
, "%a, %e %b %Y %T %Z"
]
results = maybe [] (\ date -> map ($ date) dateParsers) mdate
in findJust results
newerThan :: Maybe UTCTime -> Maybe UTCTime -> Bool
(Just u) `newerThan` (Just v) = u > v
_ `newerThan` _ = True
showTime :: FormatTime t => t -> String
showTime = formatTime defaultTimeLocale rfc822DateFormat
detectNewItems :: Int -> FeedRecord -> Feed -> ([Item], FeedRecord, IO ())
detectNewItems maxItems rec feed =
let items = feedItems feed
ids = map itemID items
times = map itemTime items
iids = zip3 items ids times
new (_i, iid, t) =
iid `notElem` feedPrevIDs rec && t `newerThan` feedUpdated rec
iidsAllNew = filter new iids
iids' = drop (length iidsAllNew maxItems) iidsAllNew
(items', ids', _times') = unzip3 iids'
rec' = rec
{ feedPrevIDs = take maxNumIDs $ ids' ++ feedPrevIDs rec
, feedUpdated =
case iids' of
(_i, _iid, t) : _ -> t
[] -> feedUpdated rec
}
report = do
let updated' = fmap showTime . feedUpdated
prevIDs = feedPrevIDs rec
prevIDsS = take 5 prevIDs
itemsS = take maxItems items
iidsS = take maxItems iids
prevIDsFinal = feedPrevIDs rec'
prevIDsFinalS = take 5 prevIDsFinal
h <- openFile (feedDebugLog rec) AppendMode
let line = hPutStrLn h
nl = hPutChar h '\n'
printIT (_, i, t) = line $ show t ++ " " ++ show i
t <- getZonedTime
line $ replicate 79 '-'
line $ showTime t
line $ replicate 79 '-'
line $ "Label " ++ feedName rec
line $ "URL " ++ feedUrl rec
line "----------- (1) Before changes ------------"
line $ "Active " ++ show (feedOn rec) ++ " (should be True!)"
line $ "Updated " ++ fromMaybe "[?]" (updated' rec)
line $ "At most " ++ show maxItems ++ " are reported per visit"
line $ showFeed feed
line $ show (length prevIDs) ++ " previous item IDs logged"
nl
line $ "Most recent " ++ show (length prevIDsS) ++ " are:"
mapM_ (hPrint h) prevIDsS
nl
line "------------ (2) While running ------------"
line $ "Feed has " ++ show (length items) ++ " items"
nl
line $ "First " ++ show maxItems ++ " from the top are:"
mapM_ (line . showItem) itemsS
nl
line "Their computed times and IDs are:"
mapM_ printIT iidsS
nl
line "Out of all feed items, the following have newly seen IDs:"
mapM_ printIT $ filter (\ (_, i, _) -> i `notElem` prevIDs) iids
nl
line "Out of them, the following are also newer than last update:"
mapM_ printIT iidsAllNew
nl
line "--------------- (3) Result ----------------"
line "Out of them, the following have been collected:"
mapM_ printIT iids'
nl
line $ "Updated " ++ fromMaybe "[?]" (updated' rec')
line $ show (length prevIDsFinal) ++ " previous item IDs logged"
nl
line $ "Most recent " ++ show (length prevIDsFinalS) ++ " are:"
mapM_ (hPrint h) prevIDsFinalS
nl
hClose h
in (items', rec', report)
fetchRaw :: Manager -> String -> IO (Either HttpException String)
fetchRaw manager url =
let action = do
request <- parseUrl url
response <- httpLbs request manager
return $ Right $ BU.toString $ responseBody response
handler e = return $ Left (e :: HttpException)
in action `catch` handler
fetch :: Manager -> Url -> IO (Either Error Feed)
fetch manager url = do
ebody <- fetchRaw manager url
return $ case ebody of
Left err -> Left $ HttpError err
Right body ->
case parseFeedString body of
Just feed -> Right feed
Nothing -> Left $ FeedParsingFailed url
initRec :: MonadIO m
=> (Label -> Error -> m ())
-> Manager
-> FilePath
-> FeedConfig
-> m FeedRecord
initRec logError manager logfile fc = do
efeed <- liftIO $ fetch manager (fcUrl fc)
let rec = FeedRecord
{ feedName = fcLabel fc
, feedUrl = fcUrl fc
, feedOn = fcActive fc
, feedPrevIDs = []
, feedUpdated = Nothing
, feedDebug = fcDebug fc
, feedDebugLog = logfile
}
case efeed of
Right feed ->
let items = feedItems feed
in return rec
{ feedPrevIDs = map itemID $ take maxNumIDs items
, feedUpdated = listToMaybe items >>= itemTime
}
Left e -> logError (fcLabel fc) e >> return rec
exec :: MonadIO m
=> (Label -> Error -> m ())
-> Manager
-> (Label -> FilePath)
-> Command
-> State
-> m State
exec logError manager mklog command state@State { records = rs } =
case command of
AddFeed fc -> do
rec <- initRec logError manager (mklog $ fcLabel fc) fc
return state { records = rs ++ [rec] }
RemoveFeed label ->
return state { records = filter ((/= label) . feedName) rs }
SetFeedActive label active ->
let update rec =
if feedName rec == label
then rec { feedOn = active }
else rec
in return state { records = map update rs }
SetInterval usec -> return state { usecInterval = usec }
SetMaxPerVisit nitems -> return state { maxItemsPerVisit = nitems }
SetFeedDebug label debug ->
let update rec =
if feedName rec == label
then rec { feedDebug = debug }
else rec
in return state { records = map update rs }
SetGeneralDebug debug -> return state { stDebug = debug }
foldrM :: Monad m => (a -> b -> m b) -> b -> [a] -> m b
foldrM _ v [] = return v
foldrM f v (x:xs) = f x =<< foldrM f v xs
makeLine :: Command -> String
makeLine (AddFeed fc) = "Add feed " ++ fcLabel fc
makeLine (RemoveFeed l) = "Remove feed " ++ l
makeLine (SetFeedActive l True) = "Enable feed " ++ l
makeLine (SetFeedActive l False) = "Disable feed " ++ l
makeLine (SetInterval usec) = "Set interval to " ++ show usec ++ "usec"
makeLine (SetMaxPerVisit n) = "Set max items to " ++ show n
makeLine (SetFeedDebug l True) = "Enable debug for feed " ++ l
makeLine (SetFeedDebug l False) = "Disable debug for feed " ++ l
makeLine (SetGeneralDebug True) = "Enable general debug"
makeLine (SetGeneralDebug False) = "Disable general debug"
execAll :: MonadIO m
=> (Label -> Error -> m ())
-> Manager
-> (Label -> FilePath)
-> CommandQueue
-> State
-> m (State, [String])
execAll logError manager mklog cq state = do
cmds <- liftIO $ modifyMVar (cqMVar cq) $ \ l -> return ([], l)
state' <- foldrM (exec logError manager mklog) state cmds
return (state', map makeLine cmds)
visitFeed
:: MonadIO m
=> Manager
-> (Label -> Url -> Feed -> [Item] -> m ())
-> (Label -> Error -> m ())
-> Int
-> FeedRecord
-> m (FeedRecord, Int)
visitFeed manager collectMany logError maxitems rec = do
efeed <- liftIO $ fetch manager (feedUrl rec)
case efeed of
Right feed -> do
let (items, rec', report) = detectNewItems maxitems rec feed
ritems = reverse items
when (feedDebug rec) $ liftIO report
collectMany (feedName rec) (feedUrl rec) feed ritems
return (rec', length ritems)
Left e -> do
logError (feedName rec) e
return (rec, 1)
runIteration
:: MonadIO m
=> (Int -> FeedRecord -> m (FeedRecord, Int))
-> (State -> m (State, [String]))
-> FilePath
-> State
-> m State
runIteration visit execCmds logfile state = do
liftIO $ threadDelay $ usecInterval state
let (recsOn, recsOff) = partition feedOn $ records state
pairs <- mapM (visit $ maxItemsPerVisit state) recsOn
let recsOn' = map fst pairs
stateCollected = state { records = recsOn' ++ recsOff }
(stateExec, actions) <- execCmds stateCollected
liftIO $ when (stDebug state) $ withFile logfile AppendMode $ \ h -> do
let line = hPutStrLn h
fline label s = line $ label ++ ": " ++ s
nl = hPutChar h '\n'
t <- getZonedTime
line $ showTime t
line $ "Feeds: " ++ show (length $ records stateExec)
let info n
| n == 0 = "No new items"
| n == 1 = "Error"
| otherwise = show n ++ " new items"
mapM_ (\ (r, n) -> fline (feedName r) $ info n) pairs
mapM_ (\ r -> fline (feedName r) "Disabled") recsOff
line $ "Commands: " ++ show (length actions)
mapM_ line actions
nl
return stateExec
run :: MonadIO m => WatcherConfig m -> m ()
run wc = do
let mapCollect l u f = mapM_ $ (wcCollect wc) l u f
collectMany = fromMaybe mapCollect $ wcCollectMany wc
logError = wcLogError wc
dc = wcDebug wc
mklog = dcFeedLog dc
manager <- liftIO $ newManager tlsManagerSettings
let mkRecord fc = initRec logError manager (mklog $ fcLabel fc) fc
initialRecords <- mapM mkRecord $ wcFeeds wc
let initialState = State
{ usecInterval =
fromInteger $ microseconds $ wcVisitInterval wc
, maxItemsPerVisit = wcMaxItems wc
, records = initialRecords
, stDebug = dcDebugCycle dc
}
execCmds st =
case wcCommandQueue wc of
Just cq -> execAll logError manager mklog cq st
Nothing -> return (st, [])
visit = visitFeed manager collectMany logError
iter = runIteration visit execCmds (dcGeneralLog dc)
loop state = iter state >>= loop
loop initialState
newCommandQueue :: IO CommandQueue
newCommandQueue = liftM CommandQueue $ newMVar []
sendCommand :: CommandQueue -> Command -> IO ()
sendCommand cq cmd = modifyMVar_ (cqMVar cq) $ \ l -> return $ cmd : l
sendCommands :: CommandQueue -> [Command] -> IO ()
sendCommands cq cmds =
modifyMVar_ (cqMVar cq) $ \ l -> return $ reverse cmds ++ l
addFeed :: FeedConfig -> Command
addFeed = AddFeed
removeFeed :: Label -> Command
removeFeed = RemoveFeed
setFeedActive :: Label -> Bool -> Command
setFeedActive = SetFeedActive
setInterval :: TimeUnit t => t -> Command
setInterval = SetInterval . fromInteger . toMicroseconds
setMaxPerVisit :: Int -> Command
setMaxPerVisit = SetMaxPerVisit
setFeedDebug :: Label -> Bool -> Command
setFeedDebug = SetFeedDebug
setGeneralDebug :: Bool -> Command
setGeneralDebug = SetGeneralDebug