{-|
Description: Fetcher strategy to scrape info from an HTML URI.

This module is the namespace to define a fetcher strategy which generates
entries by scraping the contents requested from an HTML URI. A 'Selector'
must be given in order to know where the information for each entry field
should be taken from.

Be aware that scraping an HTML page offers very few consistency
guarantees. So, depending on the page structure and the selector you give,
you could end up with 5 URIs, 4 titles and 6 descriptions. Keep in mind
that the leading and limiting asset are the URIs, so in the previous
scenario one `Nothing` title would be added and one description would be
discarded.

Here is an example:

@
import Follow
import Follow.Fetchers.WebScraping

selector :: Selector
selector =
  Selector
    { selURI = Just $ Attr ".title a" "href"
    , selGUID = Just $ Attr ".title a" "href"
    , selTitle = Just $ InnerText ".title a"
    , selDescription = Just $ InnerText ".description"
    , selAuthor = Just $ InnerText ".author"
    , selPublishDate = Nothing
    }

result :: IO [Entry]
result = fetch "http://an_url.com" selector
@
-}
module Follow.Fetchers.WebScraping
  ( fetch
  , Selector(..)
  , SelectorItem(..)
  , CSSSelector
  , HTMLAttribute
  ) where

import Control.Monad.Catch (MonadThrow)
import Control.Monad.IO.Class (MonadIO, liftIO)
import qualified Data.ByteString as BS (ByteString)
-- NOTE(review): BL.ByteString and Text appear unused in this module —
-- confirm nothing else (e.g. CPP or re-exports) relies on them before removing.
import qualified Data.ByteString.Lazy as BL (ByteString)
import Data.Text (Text)

import Follow.Fetchers.WebScraping.Internal (CSSSelector, HTMLAttribute,
                                             Selector (..), SelectorItem (..),
                                             htmlToEntries)
import Follow.Types (Fetched)
import HTTP.Follow (getResponseBody, parseUrl)
import Network.HTTP.Req (MonadHttp)

-- | Fetches entries from the given URL using the specified selectors.
--
-- The raw URL is first parsed (which may throw via 'MonadThrow' on a
-- malformed URL), the response body is downloaded, and finally the HTML is
-- turned into entries according to the given 'Selector'.
fetch ::
     (MonadThrow m, MonadIO m, MonadHttp m)
  => BS.ByteString -- ^ URL of the HTML page to scrape.
  -> Selector      -- ^ Selectors locating each entry field within the page.
  -> Fetched m
fetch url selector = do
  url' <- parseUrl url
  response <- getResponseBody url'
  liftIO $ htmlToEntries response selector