{-# Language OverloadedStrings #-}
{-# Language ScopedTypeVariables #-}

--------------------------------------------------------------------------------
-- |
-- Module      :  Text.HTML.Scalpel.Search
-- Copyright   :  © 2020 Francesco Ariis
-- License     :  GPLv3 (see LICENSE file)
--
-- Maintainer  :  Francesco Ariis <fa-ml@ariis.it>
-- Stability   :  provisional
-- Portability :  portable
--
-- "Text.HTML.Scalpel.Core" scrapers for DuckDuckGo and Google search
-- engines.
--
-- Example:
--
-- > import Text.HTML.Scalpel.Search
-- > import Text.HTML.Scalpel          -- from package `scalpel`
-- >
-- > url = buildGoogleURL "Gugliemo Oberdan"
-- > sa = scrapeURL (show url) (googleScraper :: Scraper String SearchResult)
-- > main = sa >>= print
--
--------------------------------------------------------------------------------


module Text.HTML.Scalpel.Search
                ( -- * URL building
                  -- | Check 'U.URI' to modify standard queries.
                  Query,
                  buildDuckduckgoURL,
                  buildGoogleURL,
                  url2Text,

                  -- * Scrapers
                  SearchResult (..),
                  duckduckgoScraper,
                  googleScraper
                )
                where

import Text.HTML.Scalpel.Core

-- import qualified Data.Bifunctor as B
import qualified Data.Maybe as M
import qualified Data.Text as T
import qualified Text.StringLike as SL
import qualified Text.URI as U


-- from https://api.duckduckgo.com/api
-- | Build a DuckDuckGo URL from a 'Query' string.
buildDuckduckgoURL :: Query -> U.URI
buildDuckduckgoURL q =
        baseURI { U.uriRegName = Just "html.duckduckgo.com",
                  U.uriPath = "html",
                  U.uriQuery = Just query }
    where
          query = U.pairsToQuery [("q", T.unpack q)]

-- from https://api.duckduckgo.com/api
-- | Build a Googlr URL from a 'Query' string.
buildGoogleURL :: Query -> U.URI
buildGoogleURL q =
        baseURI { U.uriRegName = Just "www.google.com",
                  U.uriPath = "search",
                  U.uriQuery = Just query }
    where
          query = U.pairsToQuery [("q", T.unpack q),
                                  ("no_redirect", "1"),
                                  ("skip_disambig", "1"),
                                  ("s", "0")]

-- | Convert 'U.URI' to 'T.Text'. If you prefer 'String', just use `show`.
url2Text :: U.URI -> T.Text
url2Text u = T.pack $ show u

-- | Scrape a Google search page for result titles and links.
googleScraper :: (Show str, SL.StringLike str, Monad m) =>
                 ScraperT str m SearchResult
googleScraper = searchScraper ("div", "ZINbbc") ("h3", "zBAuLc")
                              (attr "href" "a") extractGoogleUrl

-- | Scrape a DuckDuckGo search page for result titles and links.
-- DuckDuckGo is not fond of bots downloading their pages and will display
-- an error if searches are too frequent (≳ 1 search per second).
duckduckgoScraper :: (SL.StringLike str, Monad m) =>
              ScraperT str m SearchResult
duckduckgoScraper =
            searchScraper ("div", "web-result") ("h2", "result__title")
                          us Just
    where
          us = SL.castString <$> tagClassText "h2" "result__title"


-----------
-- TYPES --
-----------

type Query = T.Text

type Title = T.Text
newtype SearchResult = SearchResult { unwrapSR :: [(Title, U.URI)] }
    deriving (Show, Eq)


-----------------
-- ANCILLARIES --
-----------------

-- uri --

baseURI :: U.URI
baseURI = U.URI {
        U.uriScheme = Just "https",
        U.uriUserInfo = Nothing,
        U.uriRegName = Nothing,
        U.uriPort = Nothing,
        U.uriPath = "",
        U.uriQuery = Nothing,
        U.uriFragment = Nothing }

-- scrape --

-- a generic scraper: tag/class of the single entry, tag/class of the title,
-- scraper/extractor for the URL
searchScraper :: (SL.StringLike str, Monad m) =>
                 (TagName, String) -> (TagName, String) ->
                 ScraperT str m str -> (U.URI -> Maybe U.URI) ->
                 ScraperT str m SearchResult
searchScraper (et, ec) (tt, tc) ts cf = do
            es <- chroots (et @: [hasClass ec]) entry
            let es' = M.mapMaybe filFun es
            return (SearchResult es')
    where
          entry = (,) <$> tagClassText tt tc
                      <*> url ts cf
          filFun (a, mb) = mb >>= \b -> return (SL.castString a, b)

tagClassText :: (SL.StringLike str, Monad m) =>
                TagName -> String -> ScraperT str m T.Text
tagClassText tt tc = SL.castString . normString <$> text (tt @: [hasClass tc])

-- text scraper, some URI extractor if needed
url :: (SL.StringLike str, Monad m) =>
       ScraperT str m str -> (U.URI -> Maybe U.URI) ->
       ScraperT str m (Maybe U.URI)
url ts cf = do
        surl <- normString <$> ts
        let r = U.parseURI surl >>= cf
        return r

-- Nothing: not a link but a «search this instead», embedded, etc.
extractGoogleUrl :: U.URI -> Maybe U.URI
extractGoogleUrl u
            | up /= "/url" ||
              M.isNothing q   = Nothing
            | Just u2 <- q    = U.parseURI u2
            | otherwise       = Nothing
                -- `otherwise` to quiet GHC -Wall
    where
          up = U.uriPath u
          q = (U.queryToPairs <$> U.uriQuery u) >>= lookup "q"

normString :: SL.StringLike str => str -> String
normString cs = unwords . words . SL.castString $ cs