{-# Language OverloadedStrings #-}
{-# Language ScopedTypeVariables #-}
module Text.HTML.Scalpel.Search
(
Query,
buildDuckduckgoURL,
buildGoogleURL,
url2Text,
SearchResult (..),
duckduckgoScraper,
googleScraper
)
where
import Text.HTML.Scalpel.Core
import qualified Data.Maybe as M
import qualified Data.Text as T
import qualified Text.StringLike as SL
import qualified Text.URI as U
buildDuckduckgoURL :: Query -> U.URI
buildDuckduckgoURL q =
baseURI { U.uriRegName = Just "html.duckduckgo.com",
U.uriPath = "html",
U.uriQuery = Just query }
where
query = U.pairsToQuery [("q", T.unpack q)]
buildGoogleURL :: Query -> U.URI
buildGoogleURL q =
baseURI { U.uriRegName = Just "www.google.com",
U.uriPath = "search",
U.uriQuery = Just query }
where
query = U.pairsToQuery [("q", T.unpack q),
("no_redirect", "1"),
("skip_disambig", "1"),
("s", "0")]
url2Text :: U.URI -> T.Text
url2Text u = T.pack $ show u
googleScraper :: (Show str, SL.StringLike str, Monad m) =>
ScraperT str m SearchResult
googleScraper = searchScraper ("div", "ZINbbc") ("h3", "zBAuLc")
(attr "href" "a") extractGoogleUrl
duckduckgoScraper :: (SL.StringLike str, Monad m) =>
ScraperT str m SearchResult
duckduckgoScraper =
searchScraper ("div", "web-result") ("h2", "result__title")
us Just
where
us = SL.castString <$> tagClassText "h2" "result__title"
type Query = T.Text
type Title = T.Text
newtype SearchResult = SearchResult { unwrapSR :: [(Title, U.URI)] }
deriving (Show, Eq)
baseURI :: U.URI
baseURI = U.URI {
U.uriScheme = Just "https",
U.uriUserInfo = Nothing,
U.uriRegName = Nothing,
U.uriPort = Nothing,
U.uriPath = "",
U.uriQuery = Nothing,
U.uriFragment = Nothing }
searchScraper :: (SL.StringLike str, Monad m) =>
(TagName, String) -> (TagName, String) ->
ScraperT str m str -> (U.URI -> Maybe U.URI) ->
ScraperT str m SearchResult
searchScraper (et, ec) (tt, tc) ts cf = do
es <- chroots (et @: [hasClass ec]) entry
let es' = M.mapMaybe filFun es
return (SearchResult es')
where
entry = (,) <$> tagClassText tt tc
<*> url ts cf
filFun (a, mb) = mb >>= \b -> return (SL.castString a, b)
tagClassText :: (SL.StringLike str, Monad m) =>
TagName -> String -> ScraperT str m T.Text
tagClassText tt tc = SL.castString . normString <$> text (tt @: [hasClass tc])
url :: (SL.StringLike str, Monad m) =>
ScraperT str m str -> (U.URI -> Maybe U.URI) ->
ScraperT str m (Maybe U.URI)
url ts cf = do
surl <- normString <$> ts
let r = U.parseURI surl >>= cf
return r
extractGoogleUrl :: U.URI -> Maybe U.URI
extractGoogleUrl u
| up /= "/url" ||
M.isNothing q = Nothing
| Just u2 <- q = U.parseURI u2
| otherwise = Nothing
where
up = U.uriPath u
q = (U.queryToPairs <$> U.uriQuery u) >>= lookup "q"
normString :: SL.StringLike str => str -> String
normString cs = unwords . words . SL.castString $ cs