module Text.Feed.Crawl where
import Text.Feed.Crawl.Common
import Text.Feed.Crawl.DetectLink
import Network.Connection (TLSSettings(..))
import Network.HTTP.Conduit
import qualified Data.Conduit as C
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Lazy.Char8 as BL
import System.Environment
import System.IO
import Control.Monad.Trans.State.Lazy
import Control.Monad.Trans.Class (lift)
import Network.HTTP.Types.Status (statusCode)
import Network.HTTP.Types.Header
import Control.Monad.IO.Class
import Data.Maybe (listToMaybe, catMaybes)
import qualified Control.Exception as E
crawlURL :: String
-> IO CrawlResult
crawlURL url = do
request <- parseUrl url
let settings = mkManagerSettings (TLSSettingsSimple True False False) Nothing
(mkCrawlResult url =<< withRedirectTracking settings request)
`E.catch` (\e -> return . Left . CrawlHttpError $ e)
mkCrawlResult :: String -> (Response BL.ByteString, [Status]) -> IO CrawlResult
mkCrawlResult firstUrl (resp, statuses) = do
let ct = lookup hContentType . responseHeaders $ resp
let loc = lookup hLocation . responseHeaders $ resp
let urls = catMaybes [ sLocation | Status{..} <- statuses ]
if isFeedContentType ct
then return . Right $ CrawlSuccess {
crawlLastContentType = ct
, crawlLastUrl = head (urls ++ [B.pack firstUrl])
, crawlFeedContent = responseBody resp
}
else do
links <- findFeedLinks (BL.unpack . responseBody $ resp)
return . Left $ CrawlFoundFeedLinks links
withRedirectTracking :: ManagerSettings
-> Request
-> IO (Response BL.ByteString, [Status])
withRedirectTracking settings request = do
m <- newManager settings
r <- runStateT (traceRedirects request m) []
return r
traceRedirects :: Request
-> Manager
-> StateT [Status] IO (Response BL.ByteString)
traceRedirects req' man = do
let req = req' { checkStatus = \_ _ _ -> Nothing }
res <- httpLbs req{redirectCount=0} man
let req2 = getRedirectedRequest req (responseHeaders res) (responseCookieJar res) (statusCode (responseStatus res))
let location = lookup hLocation . responseHeaders $ res
case (req2, location) of
(Just req2', Just location') -> do
let st = Status {
sStatusCode = statusCode (responseStatus res)
, sLocation = lookup hLocation . responseHeaders $ res
, sContentType = lookup hContentType . responseHeaders $ res
}
modify (st:)
traceRedirects req2' man
_ -> return res
isFeed :: Status -> Bool
isFeed Status{..} = isFeedContentType sContentType