{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
    URL
,   Config (..)
,   Decoder
,   defaultDecoder
,   utf8Decoder
,   iso88591Decoder
,   scrapeURL
,   scrapeURLWithConfig
) where

import Text.HTML.Scalpel.Core

import Control.Applicative ((<$>))
import Data.CaseInsensitive ()
import Data.Default (def)
import Data.Maybe (fromMaybe, listToMaybe)

import qualified Data.ByteString.Lazy as LBS
import qualified Data.Default as Default
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup


type URL = String

-- | A method that takes an HTTP response as raw bytes and returns the body as
-- a string type.
type Decoder str = HTTP.Response LBS.ByteString -> str

-- | A record type that determines how 'scrapeURLWithConfig' interacts with the
-- HTTP server and interprets the results.
data Config str = Config {
    decoder :: Decoder str
,   manager :: Maybe HTTP.Manager
}

instance TagSoup.StringLike str => Default.Default (Config str) where
    def = Config {
            decoder = defaultDecoder
        ,   manager = Nothing
        }

-- | The 'scrapeURL' function downloads the contents of the given URL and
-- executes a 'Scraper' on it.
--
-- The default behavior is to use the global manager provided by
-- http-client-tls (via 'HTTP.getGlobalManager'). Any exceptions thrown by
-- http-client are not caught and are bubbled up to the caller.
scrapeURL :: (TagSoup.StringLike str) => URL -> Scraper str a -> IO (Maybe a)
scrapeURL = scrapeURLWithConfig def

-- | The 'scrapeURLWithConfig' function takes a 'Config' record type and
-- downloads the contents of the given URL and executes a 'Scraper' on it.
scrapeURLWithConfig :: (TagSoup.StringLike str)
                    => Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig config url scraper = do
    -- Use the explicitly configured manager if one was given, otherwise fall
    -- back to the global TLS manager.
    manager <- fromMaybe HTTP.getGlobalManager (return <$> manager config)
    tags    <- downloadAsTags (decoder config) manager url
    return (scrape scraper tags)
    where
        downloadAsTags decoder manager url = do
            request  <- HTTP.parseRequest url
            response <- HTTP.httpLbs request manager
            return $ TagSoup.parseTags $ decoder response

-- | The default response decoder. This decoder attempts to infer the character
-- set of the HTTP response body from the `Content-Type` header. If this header
-- is not present, then the character set is assumed to be `ISO-8859-1`.
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder response = TagSoup.castString $ chosenDecoder body
    where
        body        = HTTP.responseBody response
        headers     = HTTP.responseHeaders response

        -- Header names are 'CI' values, so this lookup is case-insensitive
        -- (hence the instances-only Data.CaseInsensitive import).
        contentType = listToMaybe
                    $ map (Text.decodeLatin1 . snd)
                    $ take 1
                    $ dropWhile ((/= "content-type") . fst) headers

        -- True if the Content-Type header declares the given charset.
        isType t
            | Just ct <- contentType =
                ("charset=" `Text.append` t) `Text.isInfixOf` ct
            | otherwise = False

        chosenDecoder | isType "utf-8" = Text.decodeUtf8   . LBS.toStrict
                      | otherwise      = Text.decodeLatin1 . LBS.toStrict

-- | A decoder that will always decode using `UTF-8`.
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder = TagSoup.castString . Text.decodeUtf8 . LBS.toStrict . HTTP.responseBody

-- | A decoder that will always decode using `ISO-8859-1`.
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder = TagSoup.castString . Text.decodeLatin1 . LBS.toStrict . HTTP.responseBody
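
-- The following usage sketch is illustrative only and is not part of this
-- module's API; the URL, selector, and binding name are assumptions. It
-- scrapes the text of every @h1@ tag using the default configuration
-- ('defaultDecoder' plus the global TLS manager).
_exampleScrapeDefault :: IO (Maybe [String])
_exampleScrapeDefault = scrapeURL "https://example.com" (texts "h1")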
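
-- A second illustrative sketch (again, not part of this module's API): it
-- shows how a caller might supply an explicit manager and force UTF-8
-- decoding via 'scrapeURLWithConfig', e.g. for servers that omit the charset
-- from their @Content-Type@ header. 'HTTP.newTlsManager' comes from
-- http-client-tls; the URL, selector, and binding name are assumptions.
_exampleScrapeUtf8 :: IO (Maybe [Text.Text])
_exampleScrapeUtf8 = do
    mgr <- HTTP.newTlsManager
    let config = Config { decoder = utf8Decoder, manager = Just mgr }
    scrapeURLWithConfig config "https://example.com" (texts "h1")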