--
-- | Fetch URL page titles of HTML links.
--
module Plugin.Url (theModule) where
import Plugin
import Network.URI
import qualified Text.Regex as R -- legacy
PLUGIN Url
instance Module UrlModule Bool where
moduleHelp _ "url-title" = "url-title . Fetch the page title."
moduleCmds _ = ["url-title", "tiny-url"]
modulePrivs _ = ["url-on", "url-off"]
moduleDefState _ = return True -- url on
moduleSerialize _ = Just stdSerial
process_ _ "url-title" txt = lift $ maybe (return ["Url not valid."])
fetchTitle (containsUrl txt)
process_ _ "tiny-url" txt = lift $ maybe (return ["Url not valid."])
fetchTiny (containsUrl txt)
process_ _ "url-on" _ = writeMS True >> return ["Url enabled"]
process_ _ "url-off" _ = writeMS False >> return ["Url disabled"]
contextual _ _ _ text = do
alive <- readMS
if alive && (not $ areSubstringsOf ignoredStrings text)
then case containsUrl text of
Nothing -> return []
Just url
| length url > 65 -> do
title <- lift $ fetchTitle url
tiny <- lift $ fetchTiny url
return $ zipWith' cat title tiny
| otherwise -> lift $ fetchTitle url
else return []
where cat x y = x ++ ", " ++ y
zipWith' _ [] ys = ys
zipWith' _ xs [] = xs
zipWith' f (x:xs) (y:ys) = f x y : zipWith' f xs ys
------------------------------------------------------------------------
-- | Fetch the title of the specified URL.
fetchTitle :: String -> LB [String]
fetchTitle url = do
title <- io $ urlPageTitle url (proxy config)
return $ maybe [] return title
-- | base url for fetching tiny urls
tinyurl :: String
tinyurl = "http://tinyurl.com/api-create.php?url="
-- | Fetch the title of the specified URL.
fetchTiny :: String -> LB [String]
fetchTiny url
| Just uri <- parseURI (tinyurl ++ url) = do
tiny <- io $ getHtmlPage uri (proxy config)
return $ maybe [] return $ findTiny $ foldl' cat "" tiny
| otherwise = return $ maybe [] return $ Just url
where cat x y = x ++ " " ++ y
-- | Tries to find the start of a tinyurl
findTiny :: String -> Maybe String
findTiny text = do
(_,kind,rest,_) <- R.matchRegexAll begreg text
let url = takeWhile (/=' ') rest
return $ stripSuffixes ignoredUrlSuffixes $ kind ++ url
where
begreg = R.mkRegexWithOpts "http://tinyurl.com/" True False
-- | List of strings that, if present in a contextual message, will
-- prevent the looking up of titles. This list can be used to stop
-- responses to lisppaste for example. Another important use is to
-- another lambdabot looking up a url title that contains another
-- url in it (infinite loop). Ideally, this list could be added to
-- by an admin via a privileged command (TODO).
ignoredStrings :: [String]
ignoredStrings =
["paste", -- Ignore lisppaste, rafb.net
"cpp.sourcforge.net", -- C++ paste bin
"HaskellIrcPastePage", -- Ignore paste page
"title of that page", -- Ignore others like the old me
urlTitlePrompt] -- Ignore others like me
-- | Suffixes that should be stripped off when identifying URLs in
-- contextual messages. These strings may be punctuation in the
-- current sentence vs part of a URL. Included here is the NUL
-- character as well.
ignoredUrlSuffixes :: [String]
ignoredUrlSuffixes = [".", ",", ";", ")", "\"", "\1", "\n"]
-- | Searches a string for an embeddded URL and returns it.
containsUrl :: String -> Maybe String
containsUrl text = do
(_,kind,rest,_) <- R.matchRegexAll begreg text
let url = takeWhile (`notElem` " \n\t\v") rest
return $ stripSuffixes ignoredUrlSuffixes $ kind ++ url
where
begreg = R.mkRegexWithOpts "https?://" True False
-- | Utility function to remove potential suffixes from a string.
-- Note, once a suffix is found, it is stripped and returned, no other
-- suffixes are searched for at that point.
stripSuffixes :: [String] -> String -> String
stripSuffixes [] str = str
stripSuffixes (s:ss) str
| isSuffixOf s str = take (length str - length s) $ str
| otherwise = stripSuffixes ss str
-- | Utility function to check of any of the Strings in the specified
-- list are substrings of the String.
areSubstringsOf :: [String] -> String -> Bool
areSubstringsOf = flip (any . flip isSubstringOf)
where
isSubstringOf s str = any (isPrefixOf s) (tails str)