{-# Language OverloadedStrings #-} {-# Language ScopedTypeVariables #-} -------------------------------------------------------------------------------- -- | -- Module : Text.HTML.Scalpel.Search -- Copyright : © 2020 Francesco Ariis -- License : GPLv3 (see LICENSE file) -- -- Maintainer : Francesco Ariis -- Stability : provisional -- Portability : portable -- -- "Text.HTML.Scalpel.Core" scrapers for DuckDuckGo and Google search -- engines. -- -- Example: -- -- > import Text.HTML.Scalpel.Search -- > import Text.HTML.Scalpel -- from package `scalpel` -- > -- > url = buildGoogleURL "Gugliemo Oberdan" -- > sa = scrapeURL (show url) (googleScraper :: Scraper String SearchResult) -- > main = sa >>= print -- -------------------------------------------------------------------------------- module Text.HTML.Scalpel.Search ( -- * URL building -- | Check 'U.URI' to modify standard queries. Query, buildDuckduckgoURL, buildGoogleURL, url2Text, -- * Scrapers SearchResult (..), duckduckgoScraper, googleScraper ) where import Text.HTML.Scalpel.Core -- import qualified Data.Bifunctor as B import qualified Data.Maybe as M import qualified Data.Text as T import qualified Text.StringLike as SL import qualified Text.URI as U -- from https://api.duckduckgo.com/api -- | Build a DuckDuckGo URL from a 'Query' string. buildDuckduckgoURL :: Query -> U.URI buildDuckduckgoURL q = baseURI { U.uriRegName = Just "html.duckduckgo.com", U.uriPath = "html", U.uriQuery = Just query } where query = U.pairsToQuery [("q", T.unpack q)] -- from https://api.duckduckgo.com/api -- | Build a Googlr URL from a 'Query' string. buildGoogleURL :: Query -> U.URI buildGoogleURL q = baseURI { U.uriRegName = Just "www.google.com", U.uriPath = "search", U.uriQuery = Just query } where query = U.pairsToQuery [("q", T.unpack q), ("no_redirect", "1"), ("skip_disambig", "1"), ("s", "0")] -- | Convert 'U.URI' to 'T.Text'. If you prefer 'String', just use `show`. url2Text :: U.URI -> T.Text url2Text u = T.pack $ show u -- | Scrape a Google search page for result titles and links. googleScraper :: (Show str, SL.StringLike str, Monad m) => ScraperT str m SearchResult googleScraper = searchScraper ("div", "ZINbbc") ("h3", "zBAuLc") (attr "href" "a") extractGoogleUrl -- | Scrape a DuckDuckGo search page for result titles and links. -- DuckDuckGo is not fond of bots downloading their pages and will display -- an error if searches are too frequent (≳ 1 search per second). duckduckgoScraper :: (SL.StringLike str, Monad m) => ScraperT str m SearchResult duckduckgoScraper = searchScraper ("div", "web-result") ("h2", "result__title") us Just where us = SL.castString <$> tagClassText "h2" "result__title" ----------- -- TYPES -- ----------- type Query = T.Text type Title = T.Text newtype SearchResult = SearchResult { unwrapSR :: [(Title, U.URI)] } deriving (Show, Eq) ----------------- -- ANCILLARIES -- ----------------- -- uri -- baseURI :: U.URI baseURI = U.URI { U.uriScheme = Just "https", U.uriUserInfo = Nothing, U.uriRegName = Nothing, U.uriPort = Nothing, U.uriPath = "", U.uriQuery = Nothing, U.uriFragment = Nothing } -- scrape -- -- a generic scraper: tag/class of the single entry, tag/class of the title, -- scraper/extractor for the URL searchScraper :: (SL.StringLike str, Monad m) => (TagName, String) -> (TagName, String) -> ScraperT str m str -> (U.URI -> Maybe U.URI) -> ScraperT str m SearchResult searchScraper (et, ec) (tt, tc) ts cf = do es <- chroots (et @: [hasClass ec]) entry let es' = M.mapMaybe filFun es return (SearchResult es') where entry = (,) <$> tagClassText tt tc <*> url ts cf filFun (a, mb) = mb >>= \b -> return (SL.castString a, b) tagClassText :: (SL.StringLike str, Monad m) => TagName -> String -> ScraperT str m T.Text tagClassText tt tc = SL.castString . normString <$> text (tt @: [hasClass tc]) -- text scraper, some URI extractor if needed url :: (SL.StringLike str, Monad m) => ScraperT str m str -> (U.URI -> Maybe U.URI) -> ScraperT str m (Maybe U.URI) url ts cf = do surl <- normString <$> ts let r = U.parseURI surl >>= cf return r -- Nothing: not a link but a «search this instead», embedded, etc. extractGoogleUrl :: U.URI -> Maybe U.URI extractGoogleUrl u | up /= "/url" || M.isNothing q = Nothing | Just u2 <- q = U.parseURI u2 | otherwise = Nothing -- `otherwise` to quiet GHC -Wall where up = U.uriPath u q = (U.queryToPairs <$> U.uriQuery u) >>= lookup "q" normString :: SL.StringLike str => str -> String normString cs = unwords . words . SL.castString $ cs