src/Lambdabot/Util/Browser.hs

{-# LANGUAGE PatternGuards #-}

-- | URL Utility Functions

module Lambdabot.Util.Browser
    ( urlPageTitle
    , browseLB
    ) where

import Codec.Binary.UTF8.String
import Control.Applicative
import Control.Monad.Trans
import Lambdabot.Config.Core
import Lambdabot.Monad
import Lambdabot.Util (limitStr)
import Network.Browser
import Network.HTTP
import Network.URI
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match

-- | Run a browser action with some standardized settings
browseLB :: MonadLB m => BrowserAction conn a -> m a
browseLB act = lb $ do
    proxy' <- getConfig proxy
    liftIO . browse $ do
        setOutHandler (const (return ()))
        setErrHandler (const (return ()))
        
        setAllowRedirects True
        setMaxRedirects (Just 5)
        setProxy proxy'
        act

-- | Limit the maximum title length to prevent jokers from spamming
-- the channel with specially crafted HTML pages.
maxTitleLength :: Int
maxTitleLength = 80

-- | Fetches a page title suitable for display.  Ideally, other
-- plugins should make use of this function if the result is to be
-- displayed in an IRC channel because it ensures that a consistent
-- look is used (and also lets the URL plugin effectively ignore
-- contextual URLs that might be generated by another instance of
-- lambdabot; the URL plugin matches on 'urlTitlePrompt').
urlPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
urlPageTitle = fmap (fmap (limitStr maxTitleLength)) . rawPageTitle

-- | Fetches a page title for the specified URL.  This function should
-- only be used by other plugins if and only if the result is not to
-- be displayed in an IRC channel.  Instead, use 'urlPageTitle'.
rawPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
rawPageTitle url = do
    (_, result) <- request (getRequest (takeWhile (/='#') url))
    case rspCode result of
        (2,0,0)   -> do
            case takeWhile (/= ';') <$> lookupHeader HdrContentType (rspHeaders result) of
                Just "text/html"       -> return $ extractTitle (rspBody result)
                Just "application/pdf" -> rawPageTitle (googleCacheURL url)
                _                      -> return $ Nothing
        _         -> return Nothing
    
    where googleCacheURL = (gURL++) . escapeURIString (const False)
          gURL = "http://www.google.com/search?hl=en&q=cache:"

-- | Given a server response (list of Strings), return the text in
-- between the title HTML element, only if it is text/html content.
-- Now supports all(?) HTML entities thanks to TagSoup.
extractTitle :: String -> Maybe String
extractTitle = content . tags . decodeString where
    tags = closing . opening . canonicalizeTags . parseTags
    opening = dropWhile (not . tagOpenLit "title" (const True))
    closing = takeWhile (not . tagCloseLit "title")

    content = maybeText . format . innerText
    format = unwords . words
    maybeText [] = Nothing
    maybeText t  = Just (encodeString t)