module Crawling ( crawl, crawlAndStore, CrawlActionDescriber, Crawler ) where import Prelude hiding (log) import Control.Concurrent (threadDelay) import Data.Time.Format (formatTime, defaultTimeLocale) import Data.Time.LocalTime (getZonedTime) import Network.HTTP import Network.Stream (Result) import Network.URI import CrawlAction import CrawlResult import HTTPUtil type RequestType = Request_String type Crawler = CrawlAction -> IO CrawlResult type CrawlActionDescriber = CrawlAction -> String crawl :: Crawler crawl action = log "delaying crawl for 1s" >> threadDelay 1000000 >> crawl' action 3 (toRequest action) crawlAndStore :: CrawlActionDescriber -> Crawler crawlAndStore describer = (>>= store) . crawl where store :: CrawlResult -> IO CrawlResult store cr = store' (crawlingResultStatus cr) where store' CrawlingOk = storeResult (crawlingContent cr) store' (CrawlingFailed msg) = storeResult msg store' status = error $ "unexpected status: " ++ (show status) storeResult filecontent'= do writeFile' (describer $ crawlingAction cr) filecontent' return cr where writeFile' n c = do putStrLn $ "writing to " ++ n writeFile n c log :: String -> IO () log msg = printTime >> putStr ("> " ++ msg ++ "\n") where printTime = getZonedTime >>= return . formatTime' >>= putStr where formatTime' = formatTime defaultTimeLocale "%Y-%m-%d_%H:%M:%S" toRequest :: CrawlAction -> RequestType toRequest (GetRequest url) = addStandardHeader $ mkRequest GET (toURI url) toRequest (PostRequest url params postType) = plainPost {rqBody = formParams, rqHeaders = makePostHeaders postType formParams } where plainPost :: RequestType plainPost = addStandardHeader $ mkRequest POST (toURI url) formParams = urlEncodeVars params addStandardHeader :: (HasHeaders h) => h -> h addStandardHeader = insertHeaders [ Header HdrUserAgent "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0" ] makePostHeaders :: PostType -> String -> [Header] makePostHeaders PostForm formParams = [ mkHeader HdrContentType "application/x-www-form-urlencoded", mkHeader HdrContentLength (show $ length formParams) ] makePostHeaders _ _ = [] crawl' :: CrawlAction -> Int -> RequestType -> IO CrawlResult crawl' originalAction maxRedirects request = do response <- simpleHTTP request body <- getResponseBody response code <- getResponseCode response log $ "Crawled " ++ (showRequest request) ++ " with result: " ++ (show code) checkRedirect maxRedirects request (crawlResult response body code) where crawlResult :: (HasHeaders a) => Result a -> String -> ResponseCode -> CrawlResult crawlResult response body code = CrawlResult originalAction body (parseResonseCode code (locationHeaders response)) where locationHeaders :: (HasHeaders a) => Result a -> [Header] locationHeaders = either (\_ -> []) (retrieveHeaders HdrLocation) -- this reinvents the wheel and should be switched to using http-client if problems occur checkRedirect :: Int -> RequestType -> CrawlResult -> IO CrawlResult checkRedirect 0 _ result = return result checkRedirect maxRedirects previousRequest result = maybe (return result) (crawl' (crawlingAction result) (maxRedirects -1)) (extractRedirectAction $ crawlingResultStatus result) where extractRedirectAction :: CrawlingResultStatus -> Maybe (Request String) -- unclean: converts PostRequest to Get, should do something more sensible extractRedirectAction (CrawlingRedirect url) = Just $ previousRequest { rqURI = toURI url } extractRedirectAction _ = Nothing parseResonseCode :: ResponseCode -> [Header] -> CrawlingResultStatus parseResonseCode (2, _, _) _ = CrawlingOk parseResonseCode code@(3, _, _) hdrLoc = maybe (CrawlingFailed (show code)) CrawlingRedirect (extractRedirectUrl hdrLoc) parseResonseCode code _ = CrawlingFailed (show code) extractRedirectUrl :: [Header] -> Maybe String extractRedirectUrl [] = Nothing extractRedirectUrl ((Header _ value):xs) = let parsedHeader = (parseURIReference value) in maybe (extractRedirectUrl xs) (Just . show) parsedHeader showRequest :: RequestType -> String showRequest r = (show $ rqURI r) ++ " - " ++ (show $ rqMethod r) ++ ": " ++ (rqBody r)