module Network.CrawlChain.Crawling (
crawl,
crawlAndStore, CrawlActionDescriber,
Crawler
) where
import Prelude hiding (log)
import Control.Concurrent (threadDelay)
import Data.Time.Format (formatTime, defaultTimeLocale)
import Data.Time.LocalTime (getZonedTime)
import Network.HTTP
import Network.Stream (Result)
import Network.URI
import Network.CrawlChain.CrawlAction
import Network.CrawlChain.CrawlResult
import Network.URI.Util
-- | The concrete request representation used throughout this module:
--   a 'Request' carrying a 'String' body.
type RequestType = Request_String
-- | A crawler executes a 'CrawlAction' and yields its 'CrawlResult' in IO.
type Crawler = CrawlAction -> IO CrawlResult
-- | Derives a name from a 'CrawlAction'; used by 'crawlAndStore' as the
--   file path that the crawl result is written to.
type CrawlActionDescriber = CrawlAction -> String
-- | Execute a single 'CrawlAction' after a fixed one-second delay,
--   allowing up to 3 redirects.
crawl :: Crawler
crawl action = do
  log "delaying crawl for 1s"
  threadDelay 1000000
  crawl' action 3 (toRequest action)
-- | Crawl an action and persist the outcome to a file whose name is
--   produced by the given describer. On success the page content is
--   written, on failure the failure message; any other status aborts
--   with 'error'. The unmodified 'CrawlResult' is returned either way.
crawlAndStore :: CrawlActionDescriber -> Crawler
crawlAndStore describer action = crawl action >>= persist
  where
    persist :: CrawlResult -> IO CrawlResult
    persist result =
      case crawlingResultStatus result of
        CrawlingOk         -> save (crawlingContent result)
        CrawlingFailed msg -> save msg
        status             -> error $ "unexpected status: " ++ (show status)
      where
        save fileContent = do
          let path = describer (crawlingAction result)
          putStrLn $ "writing to " ++ path
          writeFile path fileContent
          return result
-- | Write a timestamped log line to stdout, e.g.
--   @2024-01-01_12:00:00> message@.
log :: String -> IO ()
log msg = do
  now <- getZonedTime
  putStr (formatTime defaultTimeLocale "%Y-%m-%d_%H:%M:%S" now)
  putStr ("> " ++ msg ++ "\n")
-- | Translate an abstract 'CrawlAction' into a concrete HTTP request,
--   attaching the standard User-Agent header. POST parameters are
--   url-encoded into the request body.
toRequest :: CrawlAction -> RequestType
toRequest (GetRequest url) = addStandardHeader $ mkRequest GET (toURI url)
toRequest (PostRequest url params postType) =
  basePost { rqBody    = encodedParams
           , rqHeaders = makePostHeaders postType encodedParams
           }
  where
    basePost :: RequestType
    basePost = addStandardHeader $ mkRequest POST (toURI url)
    encodedParams = urlEncodeVars params
-- | Insert the fixed browser-like User-Agent header into any
--   header-carrying value (request or response).
addStandardHeader :: (HasHeaders h) => h -> h
addStandardHeader = insertHeaders [Header HdrUserAgent userAgent]
  where
    userAgent = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0"
-- | Headers required for a POST body of the given type. Form posts get
--   Content-Type and Content-Length; any other post type gets none.
--   NOTE(review): Content-Length counts Chars, not bytes — correct here
--   because the body is url-encoded ASCII.
makePostHeaders :: PostType -> String -> [Header]
makePostHeaders PostForm body =
  [ mkHeader HdrContentType "application/x-www-form-urlencoded"
  , mkHeader HdrContentLength (show $ length body)
  ]
makePostHeaders _ _ = []
-- | Execute the prepared request for the given action, following HTTP
--   redirects up to the given limit. The original action is recorded in
--   the returned 'CrawlResult' even when redirects were followed.
crawl' :: CrawlAction -> Int -> RequestType -> IO CrawlResult
crawl' originalAction maxRedirects request = do
  response <- simpleHTTP request
  body <- getResponseBody response
  code <- getResponseCode response
  log $ "Crawled " ++ (showRequest request) ++ " with result: " ++ (show code)
  checkRedirect maxRedirects request (crawlResult response body code)
  where
    -- Build the CrawlResult, classifying the response code and any
    -- Location headers into a CrawlingResultStatus.
    crawlResult :: (HasHeaders a) => Result a -> String -> ResponseCode -> CrawlResult
    crawlResult response body code =
      CrawlResult originalAction body (parseResponseCode code (locationHeaders response))
      where
        -- A connection-level failure (Left) carries no headers.
        locationHeaders :: (HasHeaders a) => Result a -> [Header]
        locationHeaders = either (\_ -> []) (retrieveHeaders HdrLocation)
    -- Follow a redirect status by re-issuing the previous request at the
    -- new URI, decrementing the remaining-redirects budget each hop.
    checkRedirect :: Int -> RequestType -> CrawlResult -> IO CrawlResult
    checkRedirect 0 _ result = return result
    checkRedirect redirectsLeft previousRequest result =
      -- FIX: the original wrote (maxRedirects 1), applying an Int as a
      -- function instead of subtracting — the counter never decremented.
      maybe (return result)
            (crawl' (crawlingAction result) (redirectsLeft - 1))
            (extractRedirectAction $ crawlingResultStatus result)
      where
        extractRedirectAction :: CrawlingResultStatus -> Maybe (Request String)
        extractRedirectAction (CrawlingRedirect url) = Just $ previousRequest { rqURI = toURI url }
        extractRedirectAction _ = Nothing
    -- 2xx is success, 3xx a redirect (if a parseable Location header is
    -- present), everything else a failure carrying the code.
    -- (Renamed from the original's misspelled parseResonseCode.)
    parseResponseCode :: ResponseCode -> [Header] -> CrawlingResultStatus
    parseResponseCode (2, _, _) _ = CrawlingOk
    parseResponseCode code@(3, _, _) hdrLoc =
      maybe (CrawlingFailed (show code)) CrawlingRedirect (extractRedirectUrl hdrLoc)
    parseResponseCode code _ = CrawlingFailed (show code)
    -- First Location-style header whose value parses as a URI reference.
    extractRedirectUrl :: [Header] -> Maybe String
    extractRedirectUrl [] = Nothing
    extractRedirectUrl ((Header _ value):xs) =
      maybe (extractRedirectUrl xs) (Just . show) (parseURIReference value)
-- | Render a request as "<uri> - <method>: <body>" for log output.
showRequest :: RequestType -> String
showRequest req =
  concat [show (rqURI req), " - ", show (rqMethod req), ": ", rqBody req]