module Lambdabot.Util.Browser
( urlPageTitle
, browseLB
) where
import Codec.Binary.UTF8.String
import Control.Applicative
import Control.Monad.Trans
import Lambdabot.Config.Core
import Lambdabot.Monad
import Lambdabot.Util (limitStr)
import Network.Browser
import Network.HTTP
import Network.URI
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
browseLB :: MonadLB m => BrowserAction conn a -> m a
browseLB act = lb $ do
proxy' <- getConfig proxy
liftIO . browse $ do
setOutHandler (const (return ()))
setErrHandler (const (return ()))
setAllowRedirects True
setMaxRedirects (Just 5)
setProxy proxy'
act
maxTitleLength :: Int
maxTitleLength = 80
urlPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
urlPageTitle = fmap (fmap (limitStr maxTitleLength)) . rawPageTitle
rawPageTitle :: String -> BrowserAction (HandleStream String) (Maybe String)
rawPageTitle url = do
(_, result) <- request (getRequest (takeWhile (/='#') url))
case rspCode result of
(2,0,0) -> do
case takeWhile (/= ';') <$> lookupHeader HdrContentType (rspHeaders result) of
Just "text/html" -> return $ extractTitle (rspBody result)
Just "application/pdf" -> rawPageTitle (googleCacheURL url)
_ -> return $ Nothing
_ -> return Nothing
where googleCacheURL = (gURL++) . escapeURIString (const False)
gURL = "http://www.google.com/search?hl=en&q=cache:"
extractTitle :: String -> Maybe String
extractTitle = content . tags . decodeString where
tags = closing . opening . canonicalizeTags . parseTags
opening = dropWhile (not . tagOpenLit "title" (const True))
closing = takeWhile (not . tagCloseLit "title")
content = maybeText . format . innerText
format = unwords . words
maybeText [] = Nothing
maybeText t = Just (encodeString t)