-- |
-- Module      : Network.ScrapeChanges
-- Copyright   : (C) 2015-16 Matthias Herrmann
-- License     : GPL-3
-- Maintainer  : matthias.mh.herrmann@gmail.com
--
-- Main module, reexports everything you need to use "scrape-changes". Full working example:
--
-- @
--
-- {-\# LANGUAGE OverloadedStrings \#-}
--
-- import Data.ByteString (isInfixOf)
-- import Data.ByteString.Lazy (ByteString, toStrict)
-- import Data.Text.Lazy (Text)
-- import Data.Text.Lazy.Encoding (decodeUtf8With)
-- import Data.Foldable (find)
-- import Data.Maybe (fromMaybe)
-- import Text.HTML.TagSoup (Tag(..), (~==), (~/=), parseTags, fromAttrib)
-- import Data.List.NonEmpty (NonEmpty ((:|)))
-- import qualified System.Log.Logger as Logger
-- import qualified System.Log.Handler.Syslog as Syslog
-- import Data.Monoid ((<>))
-- import Control.Monad (forever)
-- import Network.ScrapeChanges
--
-- main :: IO ()
-- main = do
--   _ <- configureLogging
--   _ <- (Logger.errorM thisLogger . show) `either` id $ scrapeChangesJobs
--   putStrLn "scrape-changes examples executable. Just look at the example source code."
--   -- Simplest way to block the main thread forever. Good enough for the use cases of 'scrape-changes'.
--   _ <- forever getLine
--   -- Will never be executed in this case
--   clearAllScrapeConfigs
--
-- -- |Google logo scrape function using the tagsoup library
-- scrapeGoogleLogo :: ByteString -> Text
-- scrapeGoogleLogo byteString =
--   let tags = parseTags byteString
--       divWithBackgroundUrl = find (~/= TagClose ("div" :: ByteString)) $
--                                dropWhile (not . isDivWithBackgroundUrl) tags
--       resultMaybe = decodeUtf8Lenient . styleAttribContent <$> divWithBackgroundUrl
--   in fromMaybe "" resultMaybe
--   where decodeUtf8Lenient = decodeUtf8With $ const . const . Just $ '?'
--         isDivWithBackgroundUrl t =
--           let containsBackgroundUrl = isInfixOf "background:url" . toStrict
--           in t ~== TagOpen ("div" :: ByteString) [] && containsBackgroundUrl (styleAttribContent t)
--         styleAttribContent = fromAttrib "style"
--
-- scrapeChangesJobs :: Either [(Url, [ValidationError])] (IO ())
-- scrapeChangesJobs = repeatScrapeAll [
--     -- Checks each minute for changes and sends a mail if there are any
--     ScrapeSchedule {
--       _scrapeScheduleCron = "* * * * *" -- std cron format
--     , _scrapeScheduleConfig = mailScrapeConfig "http://www.google.co.uk" -- to scrape
--                                                (MailAddr Nothing "max@mustermann.de") -- from
--                                                (MailAddr Nothing "receiver@scrape-changes.com" :| []) -- to
--     , _scrapeScheduleScraper = scrapeGoogleLogo -- scrape function
--     }
--     -- Checks each minute for changes and notifies syslog if there are any
--   , ScrapeSchedule {
--       _scrapeScheduleCron = "* * * * *"
--     , _scrapeScheduleConfig = otherScrapeConfig "http://www.google.co.uk"
--                                                 (\text -> Logger.infoM thisLogger . show $
--                                                   "Change detected: " <> text)
--     , _scrapeScheduleScraper = scrapeGoogleLogo
--     }
--   ]
--
-- configureLogging :: IO ()
-- configureLogging = do
--   syslogHandler <- Syslog.openlog thisLogger [] Syslog.DAEMON Logger.DEBUG
--   let logConfig = flip Logger.updateGlobalLogger (Logger.addHandler syslogHandler .
--                                                   Logger.setLevel Logger.DEBUG)
--   sequence_ $ logConfig <$> ["Network.ScrapeChanges", thisLogger]
--
-- thisLogger :: String
-- thisLogger = "scrape-changes-examples"
-- @
module Network.ScrapeChanges (
    scrape
  , repeatScrape
  , repeatScrapeAll
  , scrapeAll
  , mailScrapeConfig
  , otherScrapeConfig
  , clearScrapeConfig
  , clearAllScrapeConfigs
  , module Domain
) where

import Network.ScrapeChanges.Internal as Internal
import Network.ScrapeChanges.Domain as Domain
import qualified Data.Validation as Validation
import qualified Data.Tuple as TU
import qualified System.Cron.Schedule as CronSchedule
import Control.Lens
import qualified Network.Wreq as Http
import qualified Control.Concurrent.Async as Async
import qualified System.Log.Logger as Log
import qualified Data.Foldable as Foldable
import qualified Data.Maybe as Maybe
import qualified Data.Traversable as Traversable
import qualified Control.Monad as Monad
import qualified Control.Exception as Exception

-- |The basic scrape function. It fires a GET request against the URL defined in the provided
-- 'ScrapeConfig'. The response body is passed to the provided 'Scraper'. The resulting
-- 'Data.Text.Lazy.Text' is used to determine whether something has changed on the respective
-- website. If so, the callback configured in 'ScrapeConfig' is executed and 'CallbackCalled' is
-- returned. Otherwise 'CallbackNotCalled' is returned.
scrape :: ScrapeConfig -> Scraper -> Either [ValidationError] (IO ScrapeResult)
scrape sc s =
  let result = scrapeOrchestration <$ validateScrapeConfig sc
  in result ^. Validation._Either
  where scrapeOrchestration =
          let responseBody = (^. Http.responseBody)
              urlToRequest = sc ^. scrapeInfoUrl
              requestLog = Log.infoM loggerName $ "Requesting " ++ urlToRequest
              request = (s . responseBody <$>) . Http.get
              response = (request urlToRequest <* requestLog) `Exception.catch` httpExceptionHandler sc
          in do (response', latestHashedResponse) <- Async.concurrently response (readLatestHash sc)
                let currentHashedResponse = hash' response'
                let executeCallbackConfig' = executeCallbackConfig sc response'
                let saveHash'' = saveHash' currentHashedResponse urlToRequest
                let saveHashAndExecuteCallbackConfig = Async.concurrently saveHash'' executeCallbackConfig'
                if hashesAreDifferent latestHashedResponse currentHashedResponse
                  then CallbackCalled <$ saveHashAndExecuteCallbackConfig
                  else pure CallbackNotCalled

        hashesAreDifferent :: LatestHash -> CurrentHash -> Bool
        hashesAreDifferent latestHash currentHash =
          Maybe.isNothing latestHash || Foldable.or ((/= currentHash) <$> latestHash)

        saveHash' :: Hash -> Url -> IO ()
        saveHash' h url =
          let saveHashMsg = "Saved new hash for url '" ++ url ++ "'"
              saveHashLog = Log.infoM loggerName saveHashMsg
          in saveHash sc h <* saveHashLog

type LatestHash = Maybe String
type CurrentHash = String
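
-- A minimal usage sketch for 'scrape'. The URL, the mail addresses and the truncating scraper
-- below are illustrative assumptions, not part of this library, and the 'print' calls assume
-- 'Show' instances for 'ValidationError' and 'ScrapeResult':
--
-- @
-- import qualified Data.Text.Lazy as TL
-- import qualified Data.Text.Lazy.Encoding as TLE
-- import Data.List.NonEmpty (NonEmpty ((:|)))
--
-- scrapeOnce :: IO ()
-- scrapeOnce =
--   let config  = mailScrapeConfig "http://example.com"                            -- URL to watch
--                                  (MailAddr Nothing "sender@example.com")         -- from
--                                  (MailAddr Nothing "receiver@example.com" :| []) -- to
--       scraper = TL.take 1000 . TLE.decodeUtf8  -- naive scraper: keep only the first 1000 characters
--   in case scrape config scraper of
--        Left validationErrors -> print validationErrors
--        Right scrapeAction    -> scrapeAction >>= print  -- CallbackCalled or CallbackNotCalled
-- @
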
-- |Repeat executing 'scrape' by providing a 'CronScheduleString'. The returned
-- IO action is non-blocking.
repeatScrape :: CronScheduleString -> ScrapeConfig -> Scraper -> Either [ValidationError] (IO ())
repeatScrape cs sc s =
  let result = repeatScrapeAll [ScrapeSchedule cs sc s]
      resultErrorMapped = (snd . head <$> (result ^. swapped)) ^. swapped
  in resultErrorMapped

-- |Execute a list of 'ScrapeSchedule' in parallel. If validation of any 'ScrapeSchedule' fails,
-- a 'Left' containing the 'ValidationError's indexed by the corresponding 'Url' is returned.
repeatScrapeAll :: [ScrapeSchedule] -> Either [(Url, [ValidationError])] (IO ())
repeatScrapeAll scrapeSchedules =
  let cronSchedules = Traversable.for scrapeSchedules $ \(ScrapeSchedule cronSchedule scrapeConfig scraper) ->
        let scrapeConfigUrl = scrapeConfig ^. scrapeInfoUrl
            cronScheduleValidation = validateCronSchedule cronSchedule
            resultValidation = scrape scrapeConfig scraper ^. Validation._AccValidation
            result = toCronSchedule <$> resultValidation <*> cronScheduleValidation
        in ((\x -> [(scrapeConfigUrl, x)]) <$> (result ^. swapped)) ^. swapped
  in (Monad.void . CronSchedule.execSchedule . Foldable.sequenceA_) <$> cronSchedules ^. Validation._Either
  where toCronSchedule :: IO t -> CronScheduleString -> CronSchedule.Schedule ()
        toCronSchedule scrapeAction = CronSchedule.addJob (Monad.void scrapeAction)
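
-- A sketch of 'repeatScrape' under the same illustrative assumptions as the sketch above
-- ('TL' and 'TLE' again abbreviate Data.Text.Lazy and Data.Text.Lazy.Encoding; the URL, cron
-- string and callback are made up). The returned IO action starts the cron job in the background
-- and returns immediately, so the caller has to keep the main thread alive itself, e.g. with
-- 'forever getLine' as in the module header example:
--
-- @
-- hourlyScrape :: Either [ValidationError] (IO ())
-- hourlyScrape =
--   repeatScrape "0 * * * *"  -- standard cron format: at minute 0 of every hour
--                (otherScrapeConfig "http://example.com"
--                                   (\text -> putStrLn ("Change detected: " ++ show text)))
--                (TL.take 1000 . TLE.decodeUtf8)
-- @
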
-- |Execute a list of 'ScrapeConfig' in sequence using 'scrape' and collect
-- the results in an association list keyed by the respective 'Url'.
scrapeAll :: [(ScrapeConfig, Scraper)] -> [(Url, Either [ValidationError] (IO ScrapeResult))]
scrapeAll infos =
  let responses = TU.uncurry scrape <$> infos
      urls = (^. scrapeInfoUrl) <$> (fst <$> infos)
  in urls `zip` responses

-- |Clear all mutable state associated with the provided 'ScrapeConfig'
clearScrapeConfig :: ScrapeConfig -> IO ()
clearScrapeConfig = removeHash

-- |Clear all mutable state ever used by "scrape-changes"
clearAllScrapeConfigs :: IO ()
clearAllScrapeConfigs = removeHashes
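
-- A sketch of scraping several URLs once with 'scrapeAll' and clearing their stored hashes again
-- with 'clearScrapeConfig'. The URLs, the callback and the scraper are illustrative assumptions
-- ('TL' and 'TLE' as in the sketches above), and 'show' again assumes the usual 'Show' instances:
--
-- @
-- scrapeManyOnce :: IO ()
-- scrapeManyOnce = do
--     let scraper = TL.take 1000 . TLE.decodeUtf8
--         jobs    = [ (otherScrapeConfig url (\_ -> putStrLn (url ++ " changed")), scraper)
--                   | url <- ["http://example.com", "http://example.org"] ]
--     mapM_ report (scrapeAll jobs)          -- run every job once and report the outcome
--     mapM_ (clearScrapeConfig . fst) jobs   -- forget the stored hashes afterwards
--   where
--     report (url, Left errors)        = putStrLn (url ++ ": invalid config: " ++ show errors)
--     report (url, Right scrapeAction) = scrapeAction >>= \result -> putStrLn (url ++ ": " ++ show result)
-- @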