{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
URL
, Config (..)
, Decoder
, defaultDecoder
, utf8Decoder
, iso88591Decoder
, scrapeURL
, scrapeURLWithConfig
) where
import Text.HTML.Scalpel.Core
import Control.Applicative ((<$>))
import Data.CaseInsensitive ()
import Data.Default (def)
import Data.Maybe (fromMaybe, listToMaybe)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Default as Default
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type URL = String
type Decoder str = HTTP.Response LBS.ByteString -> str
data Config str = Config {
decoder :: Decoder str
, manager :: Maybe HTTP.Manager
}
instance TagSoup.StringLike str => Default.Default (Config str) where
def = Config {
decoder = defaultDecoder
, manager = Nothing
}
scrapeURL :: (TagSoup.StringLike str)
=> URL -> Scraper str a -> IO (Maybe a)
scrapeURL = scrapeURLWithConfig def
scrapeURLWithConfig :: (TagSoup.StringLike str)
=> Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig config url scraper = do
manager <- fromMaybe HTTP.getGlobalManager (return <$> manager config)
tags <- downloadAsTags (decoder config) manager url
return (scrape scraper tags)
where
downloadAsTags decoder manager url = do
request <- HTTP.parseRequest url
response <- HTTP.httpLbs request manager
return $ TagSoup.parseTags $ decoder response
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder response = TagSoup.castString
$ choosenDecoder body
where
body = HTTP.responseBody response
headers = HTTP.responseHeaders response
contentType = listToMaybe
$ map (Text.decodeLatin1 . snd)
$ take 1
$ dropWhile ((/= "content-type") . fst)
headers
isType t | Just ct <- contentType = ("charset=" `Text.append` t) `Text.isInfixOf` ct
| otherwise = False
choosenDecoder | isType "utf-8" = Text.decodeUtf8 . LBS.toStrict
| otherwise = Text.decodeLatin1 . LBS.toStrict
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder = TagSoup.castString . Text.decodeUtf8 . LBS.toStrict . HTTP.responseBody
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder = TagSoup.castString . Text.decodeLatin1 . LBS.toStrict . HTTP.responseBody