module HTML where import Control.Exception ( SomeException, catch, evaluate ) import Data.ByteString.Lazy ( toStrict ) import Data.Char import Network.HTTP.Conduit import Data.Text as T ( concat, lines, pack, strip, unpack ) import Data.Text.Encoding ( decodeUtf8 ) import Text.HTML.TagSoup import Text.Regex.PCRE ( (=~) ) htmlTitle :: FilePath -> String -> IO (Maybe String) htmlTitle regPath url = flip catch handleException $ do regexps <- fmap Prelude.lines $ readFile regPath if (safeHost regexps url) then do putStrLn $ url ++ " is safe" if url =~ "https?://www\\.youtube\\.com" then do resp <- simpleHttp ("http://www.getlinkinfo.com/info?link=" ++ url) evaluate $ extractTitle True . unpack . T.concat . T.lines . decodeUtf8 $ toStrict resp else do resp <- simpleHttp url evaluate $ extractTitle False . unpack . T.concat . T.lines . decodeUtf8 $ toStrict resp else pure Nothing where handleException :: SomeException -> IO (Maybe String) handleException e = do putStrLn $ "Exception: " ++ show e pure Nothing extractTitle :: Bool -> String -> Maybe String extractTitle True body = case dropTillTitle True (parseTags body) of (TagText title:TagClose "b":TagClose "dd":_) -> pure ("\ETX7«\ETX6 " ++ chomp title ++ " \ETX7»\SI") _ -> Nothing extractTitle False body = case dropTillTitle False (parseTags body) of (TagText title:TagClose "title":_) -> pure ("\ETX7«\ETX6 " ++ chomp title ++ " \ETX7»\SI") _ -> Nothing dropTillTitle :: Bool -> [Tag String] -> [Tag String] dropTillTitle _ [] = [] dropTillTitle True (TagOpen "dt" [("class","link-title")]:TagText _:TagClose "dt":TagText _:TagOpen "dd" []:TagOpen "b" []:xs) = xs dropTillTitle False (TagOpen "title" _:xs) = xs dropTillTitle b (_:xs) = dropTillTitle b xs chomp :: String -> String chomp = filter (\c -> ord c >= 32 && (isAlphaNum c || isPunctuation c || c == ' ')) . unpack . strip . pack -- Filter an URL so that we don’t make overviews of unknown hosts. Pretty -- cool to prevent people from going onto sensitive websites. -- -- All the regex should be put in a file. One per row. safeHost :: [String] -> String -> Bool safeHost regexps url = any (url =~) regexps