{-# LANGUAGE OverloadedStrings, RecordWildCards #-} module Text.Feed.Crawl where import Text.Feed.Crawl.Common import Text.Feed.Crawl.DetectLink import Network.Connection (TLSSettings(..)) import Network.HTTP.Conduit import qualified Data.Conduit as C import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL import System.Environment import System.IO import Control.Monad.Trans.State.Lazy import Control.Monad.Trans.Class (lift) import Network.HTTP.Types.Status (statusCode) import Network.HTTP.Types.Header import Control.Monad.IO.Class import Data.Maybe (listToMaybe, catMaybes) import qualified Control.Exception as E -- |The main function crawlURL :: String -> IO CrawlResult crawlURL url = do request <- parseUrl url let settings = mkManagerSettings (TLSSettingsSimple True False False) Nothing (mkCrawlResult url =<< withRedirectTracking settings request) `E.catch` (\e -> return . Left . CrawlHttpError $ e) mkCrawlResult :: String -> (Response BL.ByteString, [Status]) -> IO CrawlResult mkCrawlResult firstUrl (resp, statuses) = do let ct = lookup hContentType . responseHeaders $ resp let loc = lookup hLocation . responseHeaders $ resp let urls = catMaybes [ sLocation | Status{..} <- statuses ] if isFeedContentType ct then return . Right $ CrawlSuccess { crawlLastContentType = ct , crawlLastUrl = head (urls ++ [B.pack firstUrl]) , crawlFeedContent = responseBody resp } else do links <- findFeedLinks (BL.unpack . responseBody $ resp) return . Left $ CrawlFoundFeedLinks links -- |Returns a tuple of response and list of redirect locations. -- The first location is the last redirect. withRedirectTracking :: ManagerSettings -> Request -> IO (Response BL.ByteString, [Status]) withRedirectTracking settings request = do m <- newManager settings r <- runStateT (traceRedirects request m) [] return r traceRedirects :: Request -> Manager -> StateT [Status] IO (Response BL.ByteString) traceRedirects req' man = do let req = req' { checkStatus = \_ _ _ -> Nothing } res <- httpLbs req{redirectCount=0} man let req2 = getRedirectedRequest req (responseHeaders res) (responseCookieJar res) (statusCode (responseStatus res)) let location = lookup hLocation . responseHeaders $ res case (req2, location) of (Just req2', Just location') -> do let st = Status { sStatusCode = statusCode (responseStatus res) , sLocation = lookup hLocation . responseHeaders $ res , sContentType = lookup hContentType . responseHeaders $ res } modify (st:) traceRedirects req2' man _ -> return res isFeed :: Status -> Bool isFeed Status{..} = isFeedContentType sContentType