| Copyright | (C) 2015-16 Matthias Herrmann |
|---|---|
| License | GPL-3 |
| Maintainer | matthias.mh.herrmann@gmail.com |
| Safe Haskell | None |
| Language | Haskell2010 |
Network.ScrapeChanges
Description
Main module; it reexports everything you need to use "scrape-changes". A full working example:
{-# LANGUAGE OverloadedStrings #-}
import Data.ByteString (isInfixOf)
import Data.ByteString.Lazy (ByteString, toStrict)
import Data.Text.Lazy (Text)
import Data.Text.Lazy.Encoding (decodeUtf8With)
import Data.Foldable (find)
import Data.Maybe (fromMaybe)
import Text.HTML.TagSoup (Tag(..), (~==), (~/=), parseTags, fromAttrib)
import Data.List.NonEmpty (NonEmpty ((:|)))
import qualified System.Log.Logger as Logger
import qualified System.Log.Handler.Syslog as Syslog
import Data.Monoid ((<>))
import Control.Monad (forever)
import Network.ScrapeChanges
main :: IO ()
main = do
_ <- configureLogging
_ <- either (Logger.errorM thisLogger . show) id scrapeChangesJobs
putStrLn "scrape-changes examples executable. Just look at the example source code."
-- |Simplest way to block the main thread forever. Good enough for the use cases of 'scrape-changes'
_ <- forever getLine
-- |Will never be executed in this case
clearAllScrapeConfigs
-- |Google logo scrape function using the tagsoup library
scrapeGoogleLogo :: ByteString -> Text
scrapeGoogleLogo byteString =
let tags = parseTags byteString
divWithBackgroundUrl = find (~/= TagClose ("div" :: ByteString)) $
dropWhile (not . isDivWithBackgroundUrl) tags
resultMaybe = decodeUtf8Lenient . styleAttribContent <$> divWithBackgroundUrl
in fromMaybe "" resultMaybe
where decodeUtf8Lenient = decodeUtf8With $ const . const . Just $ '?'
isDivWithBackgroundUrl t =
let containsBackgroundUrl = isInfixOf "background:url" . toStrict
in t ~== TagOpen ("div" :: ByteString) [] && containsBackgroundUrl (styleAttribContent t)
styleAttribContent = fromAttrib "style"
scrapeChangesJobs :: Either [(Url, [ValidationError])] (IO ())
scrapeChangesJobs = repeatScrapeAll [
-- Checks each minute for changes and sends a mail if there are any
ScrapeSchedule {
_scrapeScheduleCron = "* * * * *" -- std cron format
, _scrapeScheduleConfig = mailScrapeConfig "http://www.google.co.uk" -- to scrape
(MailAddr Nothing "maxmustermann.de") -- from
(MailAddr Nothing "receiverscrape-changes.com" :| []) -- to
, _scrapeScheduleScraper = scrapeGoogleLogo --scrape function
}
-- Checks each minute for changes and notifies to syslog if there are any
, ScrapeSchedule {
_scrapeScheduleCron = "* * * * *"
, _scrapeScheduleConfig = otherScrapeConfig "http://www.google.co.uk"
(\text -> Logger.infoM thisLogger . show $
"Change detected: " <> text)
, _scrapeScheduleScraper = scrapeGoogleLogo
}
]
configureLogging :: IO ()
configureLogging = do
syslogHandler <- Syslog.openlog thisLogger [] Syslog.DAEMON Logger.DEBUG
let logConfig = flip Logger.updateGlobalLogger (Logger.addHandler syslogHandler . Logger.setLevel Logger.DEBUG)
sequence_ $ logConfig <$> ["Network.ScrapeChanges", thisLogger]
thisLogger :: String
thisLogger = "scrape-changes-examples"
Synopsis
- scrape :: ScrapeConfig -> Scraper -> Either [ValidationError] (IO ScrapeResult)
- repeatScrape :: CronScheduleString -> ScrapeConfig -> Scraper -> Either [ValidationError] (IO ())
- repeatScrapeAll :: [ScrapeSchedule] -> Either [(Url, [ValidationError])] (IO ())
- scrapeAll :: [(ScrapeConfig, Scraper)] -> [(Url, Either [ValidationError] (IO ScrapeResult))]
- mailScrapeConfig :: ScrapeInfoUrl -> MailFromAddr -> NonEmpty MailToAddr -> ScrapeConfig
- otherScrapeConfig :: ScrapeInfoUrl -> (Text -> IO ()) -> ScrapeConfig
- clearScrapeConfig :: ScrapeConfig -> IO ()
- clearAllScrapeConfigs :: IO ()
Documentation
scrape :: ScrapeConfig -> Scraper -> Either [ValidationError] (IO ScrapeResult) Source #
The basic scrape function. It fires a GET request against the URL defined in the provided
ScrapeConfig and passes the response body to the provided Scraper. The resulting Text is
used to determine whether something has changed on the respective website. If so, the
callback configured in the ScrapeConfig is executed and CallbackCalled is returned;
otherwise CallbackNotCalled is returned.
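For illustration, a minimal one-shot sketch. The name scrapeOnce and the URL are made up, decodeUtf8 from Data.Text.Lazy.Encoding simply treats the whole response body as the scrape result, and Show instances for ValidationError and ScrapeResult are assumed:
import Data.Text.Lazy.Encoding (decodeUtf8)
import qualified Data.Text.Lazy.IO as TIO
import Network.ScrapeChanges

scrapeOnce :: IO ()
scrapeOnce =
  -- Hypothetical config: print the scraped content whenever it changes
  let config = otherScrapeConfig "http://www.example.org" TIO.putStrLn
  in case scrape config decodeUtf8 of
       Left validationErrors -> print validationErrors
       Right scrapeAction    -> scrapeAction >>= print -- CallbackCalled or CallbackNotCalled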
repeatScrape :: CronScheduleString -> ScrapeConfig -> Scraper -> Either [ValidationError] (IO ()) Source #
Repeatedly execute scrape according to the provided CronScheduleString. The returned
IO action is non-blocking.
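A sketch of a single repeating job, reusing the illustrative scrapeGoogleLogo scraper from the example above; the cron string checks every five minutes:
repeatedJob :: Either [ValidationError] (IO ())
repeatedJob = repeatScrape "*/5 * * * *" config scrapeGoogleLogo
  -- Hypothetical config: log changes via print instead of sending mail
  where config = otherScrapeConfig "http://www.google.co.uk" print
As in main above, the caller has to keep the main thread alive, e.g. with forever getLine, since the scheduled job runs in the background.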
repeatScrapeAll :: [ScrapeSchedule] -> Either [(Url, [ValidationError])] (IO ()) Source #
Execute a list of ScrapeSchedule in parallel. If validation of any ScrapeSchedule fails, a
Left containing the ValidationErrors indexed by the corresponding Url is returned.
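A sketch of consuming that result, assuming Url is a String as the example above suggests; startJobs scrapeChangesJobs would start the jobs defined there:
startJobs :: Either [(Url, [ValidationError])] (IO ()) -> IO ()
-- Report each invalid schedule together with its Url ...
startJobs (Left errorsByUrl) =
  mapM_ (\(url, errors) -> putStrLn (url ++ ": " ++ show errors)) errorsByUrl
-- ... or run all validated jobs in parallel
startJobs (Right runJobs) = runJobs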
scrapeAll :: [(ScrapeConfig, Scraper)] -> [(Url, Either [ValidationError] (IO ScrapeResult))] Source #
Execute a list of ScrapeConfig in sequence using scrape and collect the results in an
association list keyed by the respective Url.
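A sketch that runs every configured scrape once and reports each outcome; the Show instances are assumed as before:
runAll :: [(ScrapeConfig, Scraper)] -> IO ()
runAll jobs = mapM_ report (scrapeAll jobs)
  where report (url, Left errors) = putStrLn (url ++ " is invalid: " ++ show errors)
        report (url, Right scrapeAction) = do
          result <- scrapeAction
          putStrLn (url ++ ": " ++ show result)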
mailScrapeConfig :: ScrapeInfoUrl -> MailFromAddr -> NonEmpty MailToAddr -> ScrapeConfig Source #
Helper constructor for a ScrapeConfig with a MailConfig callback.
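For example, a config that mails two recipients; all addresses are placeholders, and the NonEmpty constructor (:|) comes from Data.List.NonEmpty as in the example above:
import Data.List.NonEmpty (NonEmpty ((:|)))

mailConfig :: ScrapeConfig
mailConfig = mailScrapeConfig "http://www.example.org" -- url to scrape
                              (MailAddr Nothing "noreply@example.org") -- from
                              (MailAddr Nothing "alice@example.org"
                                :| [MailAddr Nothing "bob@example.org"]) -- to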
otherScrapeConfig :: ScrapeInfoUrl -> (Text -> IO ()) -> ScrapeConfig Source #
Helper constructor for a ScrapeConfig with an OtherConfig callback.
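For example, a config whose callback appends every detected change to a local file; the path is illustrative, and Data.Text.Lazy.IO.appendFile matches the lazy Text the callback receives:
import qualified Data.Text.Lazy.IO as TIO

fileLogConfig :: ScrapeConfig
fileLogConfig = otherScrapeConfig "http://www.example.org"
                                  (TIO.appendFile "changes.log")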
clearScrapeConfig :: ScrapeConfig -> IO () Source #
Clear all mutable state associated with the provided ScrapeConfig.
clearAllScrapeConfigs :: IO () Source #
Clear all mutable state ever used by "scrape-changes".