Copyright | (C) 2015-16 Matthias Herrmann |
---|---|
License | GPL-3 |
Maintainer | matthias.mh.herrmann@gmail.com |
Safe Haskell | None |
Language | Haskell2010 |
Main module, reexports everything you need to use "scrape-changes". Full working example:
{-# LANGUAGE OverloadedStrings #-}

import Data.ByteString (isInfixOf)
import Data.ByteString.Lazy (ByteString, toStrict)
import Data.Text.Lazy.Encoding (decodeUtf8With)
import Data.Foldable (find)
import Data.Maybe (fromMaybe)
import Text.HTML.TagSoup (Tag(..), (~==), (~/=), parseTags, fromAttrib)
import Data.List.NonEmpty (NonEmpty ((:|)))
import qualified System.Log.Logger as Logger
import qualified System.Log.Handler.Syslog as Syslog
import Data.Monoid ((<>))
import Control.Monad (forever)
import Network.ScrapeChanges

-- | Entry point of the example: configures logging, then either logs the
-- validation errors of 'scrapeChangesJobs' (the 'Left' case) or runs the
-- scheduled scrape jobs (the 'Right' case), and finally blocks forever.
main :: IO ()
main = do
  _ <- configureLogging
  -- 'either' used infix: the 'Left' handler logs the shown errors, the
  -- 'Right' handler ('id') runs the job action as is.
  _ <- (Logger.errorM thisLogger . show) `either` id $ scrapeChangesJobs
  putStrLn "scrape-changes examples executable. Just look at the example source code."
  -- Simplest way to block the main thread forever. Good enough for the use cases of 'scrape-changes'
  _ <- forever getLine
  -- Will never be executed in this case
  clearAllScrapeConfigs

-- | Google logo scrape function using the tagsoup library.
--
-- Skips tags until the first @div@ whose @style@ attribute contains
-- @background:url@, takes the first tag from there on that is not a closing
-- @div@, and returns its decoded @style@ attribute. Returns the empty text
-- when no such tag exists.
--
-- NOTE(review): 'Text' is assumed to be in scope through
-- "Network.ScrapeChanges"; confirm, or import it explicitly.
scrapeGoogleLogo :: ByteString -> Text
scrapeGoogleLogo byteString =
  let tags                 = parseTags byteString
      divWithBackgroundUrl = find (~/= TagClose ("div" :: ByteString)) $ dropWhile (not . isDivWithBackgroundUrl) tags
      -- 'find' yields a 'Maybe', so the decoder has to be mapped over it.
      resultMaybe          = decodeUtf8Lenient . styleAttribContent <$> divWithBackgroundUrl
  in fromMaybe "" resultMaybe
  where
    -- Replace every undecodable byte with '?' instead of failing.
    decodeUtf8Lenient = decodeUtf8With $ const . const . Just $ '?'
    isDivWithBackgroundUrl t =
      let containsBackgroundUrl = isInfixOf "background:url" . toStrict
      in t ~== TagOpen ("div" :: ByteString) [] && containsBackgroundUrl (styleAttribContent t)
    styleAttribContent = fromAttrib "style"

-- | The two example schedules passed to 'repeatScrapeAll'. The result is
-- 'Left' with the validation errors per url if any schedule is invalid,
-- otherwise 'Right' with the action that runs the schedules.
scrapeChangesJobs :: Either [(Url, [ValidationError])] (IO ())
scrapeChangesJobs = repeatScrapeAll
  [ -- Checks each minute for changes and sends a mail if there are any
    ScrapeSchedule
      { _scrapeScheduleCron = "* * * * *" -- std cron format
      , _scrapeScheduleConfig =
          mailScrapeConfig
            "http://www.google.co.uk" -- to scrape
            (MailAddr Nothing "max@mustermann.de") -- from
            (MailAddr Nothing "receiver@scrape-changes.com" :| []) -- to
      , _scrapeScheduleScraper = scrapeGoogleLogo -- scrape function
      }
    -- Checks each minute for changes and notifies to syslog if there are any
  , ScrapeSchedule
      { _scrapeScheduleCron = "* * * * *"
      , _scrapeScheduleConfig =
          otherScrapeConfig
            "http://www.google.co.uk"
            (\text -> Logger.infoM thisLogger . show $ "Change detected: " <> text)
      , _scrapeScheduleScraper = scrapeGoogleLogo
      }
  ]

-- | Attaches a syslog handler at DEBUG level to both the library's logger
-- (named after its module) and this example's own logger.
configureLogging :: IO ()
configureLogging = do
  syslogHandler <- Syslog.openlog thisLogger [] Syslog.DAEMON Logger.DEBUG
  let logConfig = flip Logger.updateGlobalLogger (Logger.addHandler syslogHandler . Logger.setLevel Logger.DEBUG)
  -- Logger names are plain strings, hence the quoted module name.
  sequence_ $ logConfig <$> ["Network.ScrapeChanges", thisLogger]

-- | Name of the logger (and syslog identity) used by this example.
thisLogger :: String
thisLogger = "scrape-changes-examples"
- scrape :: ScrapeConfig -> Scraper -> Either [ValidationError] (IO ScrapeResult)
- repeatScrape :: CronScheduleString -> ScrapeConfig -> Scraper -> Either [ValidationError] (IO ())
- repeatScrapeAll :: [ScrapeSchedule] -> Either [(Url, [ValidationError])] (IO ())
- scrapeAll :: [(ScrapeConfig, Scraper)] -> [(Url, Either [ValidationError] (IO ScrapeResult))]
- mailScrapeConfig :: ScrapeInfoUrl -> MailFromAddr -> NonEmpty MailToAddr -> ScrapeConfig
- otherScrapeConfig :: ScrapeInfoUrl -> (Text -> IO ()) -> ScrapeConfig
- clearScrapeConfig :: ScrapeConfig -> IO ()
- clearAllScrapeConfigs :: IO ()
Documentation
scrape :: ScrapeConfig -> Scraper -> Either [ValidationError] (IO ScrapeResult) Source #
The basic scrape function. It fires a GET request against the url defined within the provided `ScrapeConfig`. The body is passed to the provided `Scraper`. The result `Text` of the latter is used to determine whether something has changed on the respective website. If so, the callback configured in `ScrapeConfig` is executed and `CallbackCalled` is returned. Otherwise `CallbackNotCalled` is returned.
repeatScrape :: CronScheduleString -> ScrapeConfig -> Scraper -> Either [ValidationError] (IO ()) Source #
Repeat executing `scrape` by providing a `CronScheduleString`. The returned IO action is non-blocking.
repeatScrapeAll :: [ScrapeSchedule] -> Either [(Url, [ValidationError])] (IO ()) Source #
Execute a list of `ScrapeSchedule` in parallel. If validation of any `ScrapeSchedule` fails, a `Left` containing the `ValidationError`s indexed by the corresponding `Url` is returned.
scrapeAll :: [(ScrapeConfig, Scraper)] -> [(Url, Either [ValidationError] (IO ScrapeResult))] Source #
Execute a list of `ScrapeConfig` in sequence using `scrape` and collect the results in a map containing the respective `Url` as key.
mailScrapeConfig :: ScrapeInfoUrl -> MailFromAddr -> NonEmpty MailToAddr -> ScrapeConfig Source #
Helper constructor for `ScrapeConfig` containing a `MailConfig` callback.
otherScrapeConfig :: ScrapeInfoUrl -> (Text -> IO ()) -> ScrapeConfig Source #
Helper constructor for `ScrapeConfig` containing an `OtherConfig` callback.
clearScrapeConfig :: ScrapeConfig -> IO () Source #
Clear all mutable state associated with the provided ScrapeConfig
clearAllScrapeConfigs :: IO () Source #
Clear all mutable state ever used by "scrape-changes"