{-# LANGUAGE PatternGuards #-}

-- | URL Utility Functions
module Lambdabot.Url
    ( getHtmlPage
    , getHeader
    , rawPageTitle
    , urlPageTitle
    , urlTitlePrompt
    , runWebReq
    ) where

import Data.List
import Data.Maybe
import Lambdabot.MiniHTTP
import Control.Monad.Reader
import Text.HTML.TagSoup.Parser
import Text.HTML.TagSoup.Match
import Text.HTML.TagSoup
import Codec.Binary.UTF8.String

-- | The string that I prepend to the quoted page title.
urlTitlePrompt :: String
urlTitlePrompt = "Title: "

-- | Limit the maximum title length to prevent jokers from spamming
-- the channel with specially crafted HTML pages.
maxTitleLength :: Int
maxTitleLength = 80

-- | A web request monad transformer for keeping hold of the proxy.
type WebReq a = ReaderT Proxy IO a

runWebReq :: WebReq a -> Proxy -> IO a
runWebReq = runReaderT

-- | Fetches a page title suitable for display.  Ideally, other
-- plugins should make use of this function if the result is to be
-- displayed in an IRC channel, because it ensures that a consistent
-- look is used (and also lets the URL plugin effectively ignore
-- contextual URLs that might be generated by another instance of
-- lambdabot; the URL plugin matches on 'urlTitlePrompt').
urlPageTitle :: String -> WebReq (Maybe String)
urlPageTitle url = do
    title <- rawPageTitle url
    return $ maybe Nothing prettyTitle title
  where
    limitLength s
        | length s > maxTitleLength = take maxTitleLength s ++ " ..."
        | otherwise                 = s
    prettyTitle = Just . (urlTitlePrompt ++) . limitLength

-- | Fetches a page title for the specified URL.  Other plugins
-- should use this function only when the result is not to be
-- displayed in an IRC channel; otherwise, use 'urlPageTitle'.
rawPageTitle :: String -> WebReq (Maybe String)
rawPageTitle url
    | Just uri <- parseURI url' = do
        contents <- getHtmlPage uri
        case contentType contents of
          Just "text/html"       -> return $ extractTitle contents
          Just "application/pdf" -> rawPageTitle (googleCacheURL url)
          _                      -> return Nothing
    | otherwise = return Nothing
    -- URLs containing `#' fail to parse with parseURI, but these
    -- kinds of URLs are commonly pasted, so we ought to try removing
    -- that part of the provided URL.
  where
    url'           = takeWhile (/='#') url
    googleCacheURL = (gURL ++) . escapeURIString (const False)
    gURL           = "http://www.google.com/search?hl=en&q=cache:"
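-- A minimal usage sketch (the hypothetical 'demoTitle' below is not
-- part of the module): callers build a 'WebReq' action from the
-- exported functions and run it with 'runWebReq', supplying their
-- proxy setting.  Passing 'Nothing' assumes 'Proxy' is the
-- 'Maybe'-valued setting from 'Lambdabot.MiniHTTP', as the
-- @request Nothing@ case in 'getURIContents' suggests; adjust it to
-- your own configuration.
--
-- > demoTitle :: IO ()
-- > demoTitle = do
-- >     title <- runWebReq (urlPageTitle "http://haskell.org/") Nothing
-- >     putStrLn (fromMaybe "(no title found)" title)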
-- | Fetch the contents of a URL, following HTTP redirects.  It
-- returns a list of strings comprising the server response, which
-- includes the status line, response headers, and body.
getHtmlPage :: URI -> WebReq [String]
getHtmlPage u = getHtmlPage' u 5
  where
    getHtmlPage' :: URI -> Int -> WebReq [String]
    getHtmlPage' _   0 = return []
    getHtmlPage' uri n = do
        contents <- getURIContents uri
        case responseStatus contents of
          301 -> getHtmlPage' (redirectedUrl contents) (n-1)
          302 -> getHtmlPage' (redirectedUrl contents) (n-1)
          200 -> return contents
          _   -> return []
      where
        -- | Parse the HTTP response code from a status line in the
        -- following format: HTTP/1.1 200 Success.
        responseStatus hdrs = read (words (head hdrs) !! 1) :: Int

        -- | Return the value of the "Location" header in the server
        -- response.
        redirectedUrl hdrs
            | Just loc <- getHeader "Location" hdrs =
                case parseURI loc of
                  Nothing   -> (fromJust . parseURI) $ fullUrl loc
                  Just uri' -> uri'
            | otherwise = error "No Location header found in 3xx response."

        -- | Construct a full absolute URL based on the current URI.
        -- This is used when a Location header violates the HTTP RFC
        -- and does not send an absolute URI in the response; instead,
        -- a relative URI is sent, so we must construct the absolute
        -- URI manually.
        fullUrl loc =
            let auth = fromJust $ uriAuthority uri
            in  uriScheme uri ++ "//" ++ uriRegName auth ++ loc

-- | Fetch the contents of a URL, returning a list of strings
-- comprising the server response, which includes the status line,
-- response headers, and body.
getURIContents :: URI -> WebReq [String]
getURIContents uri = do
    proxy <- ask
    liftIO $ readNBytes 3048 proxy uri (request proxy) ""
  where
    request Nothing = ["GET " ++ abs_path ++ " HTTP/1.1",
                       "host: " ++ host,
                       "Connection: close", ""]
    request _       = ["GET " ++ show uri ++ " HTTP/1.0", ""]

    abs_path = case uriPath uri ++ uriQuery uri ++ uriFragment uri of
                 url@('/':_) -> url
                 url         -> '/':url

    host = uriRegName . fromJust $ uriAuthority uri

-- | Given a server response (list of Strings), return the text inside
-- the title HTML element, but only if it is text/html content.
-- Now supports all(?) HTML entities thanks to TagSoup.
extractTitle :: [String] -> Maybe String
extractTitle = content . tags . decodeString . unlines
  where
    tags    = closing . opening . canonicalizeTags . parseTags
    opening = dropWhile (not . tagOpenLit "title" (const True))
    closing = takeWhile (not . tagCloseLit "title")

    content = maybeText . format . innerText
    format  = unwords . words

    maybeText [] = Nothing
    maybeText t  = Just (encodeString t)

-- | What is the type of the server response?
contentType :: [String] -> Maybe String
contentType []       = Nothing
contentType contents = Just val
  where
    val   = takeWhile (/=';') ctype
    ctype = case getHeader "Content-Type" contents of
              Nothing -> error "Lambdabot.Url.contentType: getHeader failed"
              Just c  -> c

-- | Retrieve the specified header from the server response, being
-- careful to strip the trailing carriage return.  I swiped this code
-- from Search.hs, but had to modify it because it was not properly
-- stripping off the trailing CR (it must not have manifested itself
-- as a bug in that code; however, parseURI will fail against
-- CR-terminated strings).
getHeader :: String -> [String] -> Maybe String
getHeader _   []     = Nothing
getHeader hdr (_:hs) = lookup hdr $ concatMap mkassoc hs
  where
    removeCR  = takeWhile (/='\r')
    mkassoc s = case findIndex (==':') s of
                  Just n  -> [(take n s, removeCR $ drop (n+2) s)]
                  Nothing -> []
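-- A small illustration of the header helpers above, using a
-- hand-written response (the lines below are made up for the example;
-- a real response would come from 'getURIContents'):
--
-- > ghci> let resp = ["HTTP/1.1 200 OK", "Content-Type: text/html; charset=utf-8\r", ""]
-- > ghci> getHeader "Content-Type" resp
-- > Just "text/html; charset=utf-8"
-- > ghci> contentType resp
-- > Just "text/html"
--
-- Note that 'getHeader' skips the first line (the status line) and
-- matches header names by exact string comparison, so the name must
-- be spelled with the same capitalisation the server sends.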