Safe Haskell: None
- saveCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
- loadCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
- uriProcessed :: URI -> CrawlerAction a r ()
- urisProcessed :: URIs -> CrawlerAction a r ()
- uriToBeProcessed :: URI -> Int -> CrawlerAction a r ()
- urisToBeProcessed :: URIsWithLevel -> CrawlerAction a r ()
- uriAddToRobotsTxt :: URI -> CrawlerAction a r ()
- accumulateRes :: NFData r => (URI, a) -> CrawlerAction a r ()
- crawlDocs :: (NFData a, NFData r, Binary r) => [URI] -> CrawlerAction a r ()
- crawlerLoop :: (NFData a, NFData r, Binary r) => CrawlerAction a r ()
- crawlerResume :: (NFData a, NFData r, Binary r) => String -> CrawlerAction a r ()
- crawlerCheckSaveState :: Binary r => CrawlerAction a r ()
- crawlerSaveState :: Binary r => CrawlerAction a r ()
- type MapFold a r = (a -> IO r) -> (r -> r -> IO r) -> [a] -> IO r
- crawlNextDocs :: NFData r => MapFold URIWithLevel (URIs, URIsWithLevel, r) -> CrawlerAction a r ()
- processDoc' :: URIWithLevel -> CrawlerAction a r (URIs, URIsWithLevel, [(URI, a)])
- combineDocResults' :: NFData r => MergeDocResults r -> (URIs, URIsWithLevel, r) -> (URIs, URIsWithLevel, r) -> IO (URIs, URIsWithLevel, r)
- crawlNextDoc :: (NFData a, NFData r) => CrawlerAction a r ()
- processDoc :: URIWithLevel -> CrawlerAction a r (URI, [URIWithLevel], [(URI, a)])
- isAllowedByRobots :: URI -> CrawlerAction a r Bool
- processDocArrow :: CrawlerConfig c r -> URI -> IOSArrow a (URI, ([URI], [(URI, c)]))
- getLocationReference :: ArrowXml a => a XmlTree String
- getRealDocURI :: ArrowXml a => a XmlTree String
- initCrawler :: CrawlerAction a r ()
- runCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (x, CrawlerState r)
- execCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
Documentation
saveCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
loadCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
uriProcessed :: URI -> CrawlerAction a r ()
urisProcessed :: URIs -> CrawlerAction a r ()
uriToBeProcessed :: URI -> Int -> CrawlerAction a r ()
urisToBeProcessed :: URIsWithLevel -> CrawlerAction a r ()
uriAddToRobotsTxt :: URI -> CrawlerAction a r ()
accumulateRes :: NFData r => (URI, a) -> CrawlerAction a r ()
crawlerLoop :: (NFData a, NFData r, Binary r) => CrawlerAction a r ()
crawlerResume :: (NFData a, NFData r, Binary r) => String -> CrawlerAction a r ()
crawlerCheckSaveState :: Binary r => CrawlerAction a r ()
crawlerSaveState :: Binary r => CrawlerAction a r ()
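A plausible resume workflow, assuming the String argument of crawlerResume names a state file previously written by saveCrawlerState (this page does not confirm that reading; resumeRun is not part of the module):

    resumeRun :: (NFData a, NFData r, Binary r)
              => FilePath -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
    resumeRun stateFile = execCrawler (crawlerResume stateFile)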
crawlNextDocs :: NFData r => MapFold URIWithLevel (URIs, URIsWithLevel, r) -> CrawlerAction a r ()
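crawlNextDocs abstracts over the map-and-combine strategy via its MapFold argument (see the synopsis for the type). As a reference point, a strictly sequential instance can be written in a few lines; a minimal sketch (mapFoldSeq is not part of the module, and the behaviour on an empty input list is chosen arbitrarily here):

    import Control.Monad (foldM)

    -- Map the action over all inputs and fold the results pairwise
    -- from the left with the combining function.
    mapFoldSeq :: MapFold a r
    mapFoldSeq f op (x : xs) = do
        r0 <- f x
        foldM (\acc y -> f y >>= op acc) r0 xs
    mapFoldSeq _ _ []       = error "mapFoldSeq: empty input"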
processDoc' :: URIWithLevel -> CrawlerAction a r (URIs, URIsWithLevel, [(URI, a)])
combineDocResults' :: NFData r => MergeDocResults r -> (URIs, URIsWithLevel, r) -> (URIs, URIsWithLevel, r) -> IO (URIs, URIsWithLevel, r)
crawlNextDoc :: (NFData a, NFData r) => CrawlerAction a r ()
Crawl a single document: mark it as processed, collect the newly found hrefs, and combine the document result with the accumulator in the crawler state.
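The step can be pictured as a composition of the actions documented on this page. A hedged sketch, assuming URIWithLevel is a (URI, level) pair (crawlStep is illustrative only; the real implementation may differ):

    -- One crawl step for the next URI taken from the to-be-processed set.
    crawlStep :: NFData r => URIWithLevel -> CrawlerAction a r ()
    crawlStep uwl@(uri, _level) = do
        uriProcessed uri                          -- mark the requested URI as processed
        (realUri, newUris, results) <- processDoc uwl
        uriProcessed realUri                      -- the transfer URI may differ after a redirect
        mapM_ (uncurry uriToBeProcessed) newUris  -- queue newly found hrefs with their levels
        mapM_ accumulateRes results               -- fold the page results into the accumulator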
processDoc :: URIWithLevel -> CrawlerAction a r (URI, [URIWithLevel], [(URI, a)])
Run the process-document arrow and prepare the results.
isAllowedByRobots :: URI -> CrawlerAction a r Bool
Test whether a URI is allowed by the robots.txt rules; URIs rejected by robots.txt are filtered out with this predicate.
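For example, a list of candidate URIs can be filtered directly in the crawler monad (a small usage sketch; allowedOnly is not part of the module):

    import Control.Monad (filterM)

    allowedOnly :: [URI] -> CrawlerAction a r [URI]
    allowedOnly = filterM isAllowedByRobots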
processDocArrow :: CrawlerConfig c r -> URI -> IOSArrow a (URI, ([URI], [(URI, c)]))
From a document, two results are computed: 1. the list of all hrefs in the contents, and 2. the collected information contained in the page. The result is augmented with the transfer URI, so that subsequent functions know the source of the contents. The transfer URI may differ from the input URI, since the HTTP request may have been redirected.
The two listA arrows make the whole arrow deterministic, so it never fails.
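The listA trick is worth spelling out: wrapping a possibly failing arrow in listA yields an arrow that always delivers exactly one result, an empty list on failure. A self-contained HXT sketch with hypothetical extractors (not the extractors used by this module):

    import Text.XML.HXT.Core

    -- Each sub-arrow may fail or yield many results on a given page;
    -- listA turns both into total arrows delivering (possibly empty)
    -- lists, so the pair is computed deterministically for every input.
    collectBoth :: IOSArrow XmlTree ([String], [String])
    collectBoth = listA getHrefs &&& listA getTitles
      where
        getHrefs  = deep (hasName "a") >>> getAttrValue "href"
        getTitles = deep (hasName "title") >>> getChildren >>> getText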
getLocationReference :: ArrowXml a => a XmlTree String
Compute the real URI in the case of a 301 or 302 response (moved permanently or moved temporarily); otherwise the arrow fails.
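A hedged sketch of such a check, assuming HXT's usual transfer attributes on the document root ("transfer-Status", and "http-location" as named in the description of getRealDocURI below); locationRef is illustrative, not the module's definition:

    import Text.XML.HXT.Core

    locationRef :: ArrowXml a => a XmlTree String
    locationRef =
        ( getAttrValue0 "transfer-Status"       -- fails if the attribute is missing
          >>> isA (`elem` ["301", "302"])       -- succeed only for redirect statuses
        )
        `guards` getAttrValue0 "http-location"  -- then deliver the redirect target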
getRealDocURI :: ArrowXml a => a XmlTree String
Compute the real URI of the document. In the case of a move response it is contained in the "http-location" attribute; otherwise it is the transferURI.
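Continuing the sketch above, the fallback to the transfer URI can be expressed with orElse (assuming HXT's "transfer-URI" attribute name):

    realDocURI :: ArrowXml a => a XmlTree String
    realDocURI = locationRef `orElse` getAttrValue0 "transfer-URI"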
initCrawler :: CrawlerAction a r ()
runCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (x, CrawlerState r)
execCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
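A minimal end-to-end sketch, assuming URI is a synonym for String and a user-supplied configuration and initial state (runExample is illustrative only):

    runExample :: (NFData a, NFData r, Binary r)
               => CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
    runExample conf initial =
        execCrawler (crawlDocs ["http://example.org/"]) conf initial

runCrawler is used in the same way when the action's result is needed alongside the final state.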