{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
    URL
,   Config (..)
,   Decoder

,   defaultDecoder
,   utf8Decoder
,   iso88591Decoder

,   fetchTags
,   fetchTagsWithConfig
,   scrapeURL
,   scrapeURLWithConfig
) where

import Text.HTML.Scalpel.Core

import Control.Monad
import Data.CaseInsensitive ()
import Data.Default (def)
import Data.Maybe (listToMaybe)

import qualified Data.ByteString.Lazy as LBS
import qualified Data.Default as Default
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup


type URL = String

-- | A method that takes a HTTP response as raw bytes and returns the body as a
-- string type.
type Decoder str = HTTP.Response LBS.ByteString -> str

-- | A record type that determines how 'scrapeURLWithConfig' interacts with the
-- HTTP server and interprets the results.
data Config str = Config {
    forall str. Config str -> Decoder str
decoder :: Decoder str
,   forall str. Config str -> Maybe Manager
manager :: Maybe HTTP.Manager
}

instance TagSoup.StringLike str => Default.Default (Config str) where
    def :: Config str
def = Config {
            decoder :: Decoder str
decoder = forall str. StringLike str => Decoder str
defaultDecoder
        ,   manager :: Maybe Manager
manager = forall a. Maybe a
Nothing
        }

-- | The 'scrapeURL' function downloads the contents of the given URL and
-- executes a 'Scraper' on it.
--
-- The default behavior is to use the global manager provided by
-- http-client-tls (via 'HTTP.getGlobalManager'). Any exceptions thrown by
-- http-client are not caught and are bubbled up to the caller.
scrapeURL :: (TagSoup.StringLike str)
          => URL -> Scraper str a -> IO (Maybe a)
scrapeURL :: forall str a.
StringLike str =>
URL -> Scraper str a -> IO (Maybe a)
scrapeURL = forall str a.
StringLike str =>
Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig forall a. Default a => a
def

-- | The 'scrapeURLWithConfig' function takes a 'Config' record type and
-- downloads the contents of the given URL and executes a 'Scraper' on it.
scrapeURLWithConfig :: (TagSoup.StringLike str)
                  => Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig :: forall str a.
StringLike str =>
Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig Config str
config URL
url Scraper str a
scraper = do
    forall str a.
StringLike str =>
Scraper str a -> [Tag str] -> Maybe a
scrape Scraper str a
scraper forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM` forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig Config str
config URL
url

-- | Download and parse the contents of the given URL.
fetchTags :: TagSoup.StringLike str
                => URL -> IO [TagSoup.Tag str]
fetchTags :: forall str. StringLike str => URL -> IO [Tag str]
fetchTags = forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig forall a. Default a => a
def

-- | Download and parse the contents of the given URL with the given 'Config'.
fetchTagsWithConfig :: TagSoup.StringLike str
                  => Config str -> URL -> IO [TagSoup.Tag str]
fetchTagsWithConfig :: forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig Config str
config URL
url = do
    Manager
manager <- forall b a. b -> (a -> b) -> Maybe a -> b
maybe IO Manager
HTTP.getGlobalManager forall (m :: * -> *) a. Monad m => a -> m a
return (forall str. Config str -> Maybe Manager
manager Config str
config)
    Response ByteString
response <- forall a b c. (a -> b -> c) -> b -> a -> c
flip Request -> Manager -> IO (Response ByteString)
HTTP.httpLbs Manager
manager forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall (m :: * -> *). MonadThrow m => URL -> m Request
HTTP.parseRequest URL
url
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall str. StringLike str => str -> [Tag str]
TagSoup.parseTags forall a b. (a -> b) -> a -> b
$ forall str. Config str -> Decoder str
decoder Config str
config forall a b. (a -> b) -> a -> b
$ Response ByteString
response

-- | The default response decoder. This decoder attempts to infer the character
-- set of the HTTP response body from the `Content-Type` header. If this header
-- is not present, then the character set is assumed to be `ISO-8859-1`.
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder :: forall str. StringLike str => Decoder str
defaultDecoder Response ByteString
response = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString
                        forall a b. (a -> b) -> a -> b
$ ByteString -> Text
choosenDecoder ByteString
body
    where
        body :: ByteString
body        = forall body. Response body -> body
HTTP.responseBody Response ByteString
response
        headers :: ResponseHeaders
headers     = forall body. Response body -> ResponseHeaders
HTTP.responseHeaders Response ByteString
response
        contentType :: Maybe Text
contentType = forall a. [a] -> Maybe a
listToMaybe
                    forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (Text -> Text
Text.toLower forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> b
snd)
                    forall a b. (a -> b) -> a -> b
$ forall a. Int -> [a] -> [a]
take Int
1
                    forall a b. (a -> b) -> a -> b
$ forall a. (a -> Bool) -> [a] -> [a]
dropWhile ((forall a. Eq a => a -> a -> Bool
/= HeaderName
"content-type") forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst)
                                ResponseHeaders
headers

        isType :: Text -> Bool
isType Text
t | Just Text
ct <- Maybe Text
contentType = (Text
"charset=" Text -> Text -> Text
`Text.append` Text
t) Text -> Text -> Bool
`Text.isInfixOf` Text
ct
                 | Bool
otherwise              = Bool
False

        choosenDecoder :: ByteString -> Text
choosenDecoder | Text -> Bool
isType Text
"utf-8" = ByteString -> Text
Text.decodeUtf8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict
                       | Bool
otherwise      = ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict

-- | A decoder that will always decode using `UTF-8`.
utf8Decoder ::  TagSoup.StringLike str => Decoder str
utf8Decoder :: forall str. StringLike str => Decoder str
utf8Decoder = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeUtf8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall body. Response body -> body
HTTP.responseBody

-- | A decoder that will always decode using `ISO-8859-1`.
iso88591Decoder ::  TagSoup.StringLike str => Decoder str
iso88591Decoder :: forall str. StringLike str => Decoder str
iso88591Decoder = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall body. Response body -> body
HTTP.responseBody