{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape.URL (
URL
, Config (..)
, Decoder
, defaultDecoder
, utf8Decoder
, iso88591Decoder
, fetchTags
, fetchTagsWithConfig
, scrapeURL
, scrapeURLWithConfig
) where
import Text.HTML.Scalpel.Core
import Control.Monad
import Data.CaseInsensitive ()
import Data.Default (def)
import Data.Maybe (listToMaybe)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Default as Default
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
type URL = String
type Decoder str = HTTP.Response LBS.ByteString -> str
data Config str = Config {
forall str. Config str -> Decoder str
decoder :: Decoder str
, forall str. Config str -> Maybe Manager
manager :: Maybe HTTP.Manager
}
instance TagSoup.StringLike str => Default.Default (Config str) where
def :: Config str
def = Config {
decoder :: Decoder str
decoder = forall str. StringLike str => Decoder str
defaultDecoder
, manager :: Maybe Manager
manager = forall a. Maybe a
Nothing
}
scrapeURL :: (TagSoup.StringLike str)
=> URL -> Scraper str a -> IO (Maybe a)
scrapeURL :: forall str a.
StringLike str =>
URL -> Scraper str a -> IO (Maybe a)
scrapeURL = forall str a.
StringLike str =>
Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig forall a. Default a => a
def
scrapeURLWithConfig :: (TagSoup.StringLike str)
=> Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig :: forall str a.
StringLike str =>
Config str -> URL -> Scraper str a -> IO (Maybe a)
scrapeURLWithConfig Config str
config URL
url Scraper str a
scraper = do
forall str a.
StringLike str =>
Scraper str a -> [Tag str] -> Maybe a
scrape Scraper str a
scraper forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
`liftM` forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig Config str
config URL
url
fetchTags :: TagSoup.StringLike str
=> URL -> IO [TagSoup.Tag str]
fetchTags :: forall str. StringLike str => URL -> IO [Tag str]
fetchTags = forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig forall a. Default a => a
def
fetchTagsWithConfig :: TagSoup.StringLike str
=> Config str -> URL -> IO [TagSoup.Tag str]
fetchTagsWithConfig :: forall str. StringLike str => Config str -> URL -> IO [Tag str]
fetchTagsWithConfig Config str
config URL
url = do
Manager
manager <- forall b a. b -> (a -> b) -> Maybe a -> b
maybe IO Manager
HTTP.getGlobalManager forall (m :: * -> *) a. Monad m => a -> m a
return (forall str. Config str -> Maybe Manager
manager Config str
config)
Response ByteString
response <- forall a b c. (a -> b -> c) -> b -> a -> c
flip Request -> Manager -> IO (Response ByteString)
HTTP.httpLbs Manager
manager forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall (m :: * -> *). MonadThrow m => URL -> m Request
HTTP.parseRequest URL
url
forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall str. StringLike str => str -> [Tag str]
TagSoup.parseTags forall a b. (a -> b) -> a -> b
$ forall str. Config str -> Decoder str
decoder Config str
config forall a b. (a -> b) -> a -> b
$ Response ByteString
response
defaultDecoder :: TagSoup.StringLike str => Decoder str
defaultDecoder :: forall str. StringLike str => Decoder str
defaultDecoder Response ByteString
response = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
choosenDecoder ByteString
body
where
body :: ByteString
body = forall body. Response body -> body
HTTP.responseBody Response ByteString
response
headers :: ResponseHeaders
headers = forall body. Response body -> ResponseHeaders
HTTP.responseHeaders Response ByteString
response
contentType :: Maybe Text
contentType = forall a. [a] -> Maybe a
listToMaybe
forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (Text -> Text
Text.toLower forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> b
snd)
forall a b. (a -> b) -> a -> b
$ forall a. Int -> [a] -> [a]
take Int
1
forall a b. (a -> b) -> a -> b
$ forall a. (a -> Bool) -> [a] -> [a]
dropWhile ((forall a. Eq a => a -> a -> Bool
/= HeaderName
"content-type") forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst)
ResponseHeaders
headers
isType :: Text -> Bool
isType Text
t | Just Text
ct <- Maybe Text
contentType = (Text
"charset=" Text -> Text -> Text
`Text.append` Text
t) Text -> Text -> Bool
`Text.isInfixOf` Text
ct
| Bool
otherwise = Bool
False
choosenDecoder :: ByteString -> Text
choosenDecoder | Text -> Bool
isType Text
"utf-8" = ByteString -> Text
Text.decodeUtf8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict
| Bool
otherwise = ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict
utf8Decoder :: TagSoup.StringLike str => Decoder str
utf8Decoder :: forall str. StringLike str => Decoder str
utf8Decoder = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeUtf8 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall body. Response body -> body
HTTP.responseBody
iso88591Decoder :: TagSoup.StringLike str => Decoder str
iso88591Decoder :: forall str. StringLike str => Decoder str
iso88591Decoder = forall a b. (StringLike a, StringLike b) => a -> b
TagSoup.castString forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Text
Text.decodeLatin1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
LBS.toStrict forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall body. Response body -> body
HTTP.responseBody