Safe Haskell: None
- saveCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
- loadCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
- uriProcessed :: URI -> CrawlerAction a r ()
- urisProcessed :: URIs -> CrawlerAction a r ()
- uriToBeProcessed :: URI -> Int -> CrawlerAction a r ()
- urisToBeProcessed :: URIsWithLevel -> CrawlerAction a r ()
- uriAddToRobotsTxt :: URI -> CrawlerAction a r ()
- accumulateRes :: NFData r => (URI, a) -> CrawlerAction a r ()
- crawlDocs :: (NFData a, NFData r, Binary r) => [URI] -> CrawlerAction a r ()
- crawlerLoop :: (NFData a, NFData r, Binary r) => CrawlerAction a r ()
- crawlerResume :: (NFData a, NFData r, Binary r) => String -> CrawlerAction a r ()
- crawlerCheckSaveState :: Binary r => CrawlerAction a r ()
- crawlerSaveState :: Binary r => CrawlerAction a r ()
- type MapFold a r = (a -> IO r) -> (r -> r -> IO r) -> [a] -> IO r
- crawlNextDocs :: NFData r => MapFold URIWithLevel (URIs, URIsWithLevel, r) -> CrawlerAction a r ()
- processDoc' :: URIWithLevel -> CrawlerAction a r (URIs, URIsWithLevel, [(URI, a)])
- combineDocResults' :: NFData r => MergeDocResults r -> (URIs, URIsWithLevel, r) -> (URIs, URIsWithLevel, r) -> IO (URIs, URIsWithLevel, r)
- crawlNextDoc :: (NFData a, NFData r) => CrawlerAction a r ()
- processDoc :: URIWithLevel -> CrawlerAction a r (URI, [URIWithLevel], [(URI, a)])
- isAllowedByRobots :: URI -> CrawlerAction a r Bool
- processDocArrow :: CrawlerConfig c r -> URI -> IOSArrow a (URI, ([URI], [(URI, c)]))
- getLocationReference :: ArrowXml a => a XmlTree String
- getRealDocURI :: ArrowXml a => a XmlTree String
- initCrawler :: CrawlerAction a r ()
- runCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (x, CrawlerState r)
- execCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
Documentation
saveCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
loadCrawlerState :: Binary r => FilePath -> CrawlerAction a r ()
uriProcessed :: URI -> CrawlerAction a r ()
urisProcessed :: URIs -> CrawlerAction a r ()
uriToBeProcessed :: URI -> Int -> CrawlerAction a r ()
urisToBeProcessed :: URIsWithLevel -> CrawlerAction a r ()
uriAddToRobotsTxt :: URI -> CrawlerAction a r ()
accumulateRes :: NFData r => (URI, a) -> CrawlerAction a r ()
crawlerLoop :: (NFData a, NFData r, Binary r) => CrawlerAction a r ()
crawlerResume :: (NFData a, NFData r, Binary r) => String -> CrawlerAction a r ()
crawlerCheckSaveState :: Binary r => CrawlerAction a r ()
crawlerSaveState :: Binary r => CrawlerAction a r ()
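A plausible resume workflow, assuming the String argument of crawlerResume names a state file previously written by saveCrawlerState (this page does not confirm that reading; resumeRun is not part of the module):

    resumeRun :: (NFData a, NFData r, Binary r)
              => FilePath -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
    resumeRun stateFile = execCrawler (crawlerResume stateFile)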
crawlNextDocs :: NFData r => MapFold URIWithLevel (URIs, URIsWithLevel, r) -> CrawlerAction a r ()
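crawlNextDocs abstracts over the map-and-combine strategy via its MapFold argument (see the synopsis for the type). As a reference point, a strictly sequential instance can be written in a few lines; a minimal sketch (mapFoldSeq is not part of the module, and the behaviour on an empty input list is chosen arbitrarily here):

    import Control.Monad (foldM)

    -- Map the action over all inputs and fold the results pairwise
    -- from the left with the combining function.
    mapFoldSeq :: MapFold a r
    mapFoldSeq f op (x : xs) = do
        r0 <- f x
        foldM (\acc y -> f y >>= op acc) r0 xs
    mapFoldSeq _ _ []       = error "mapFoldSeq: empty input"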
processDoc' :: URIWithLevel -> CrawlerAction a r (URIs, URIsWithLevel, [(URI, a)])
combineDocResults' :: NFData r => MergeDocResults r -> (URIs, URIsWithLevel, r) -> (URIs, URIsWithLevel, r) -> IO (URIs, URIsWithLevel, r)
crawlNextDoc :: (NFData a, NFData r) => CrawlerAction a r ()
Crawl a single document: mark it as processed, collect the newly found hrefs, and combine the document result with the accumulator in the crawler state.
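The step can be pictured as a composition of the actions documented on this page. A hedged sketch, assuming URIWithLevel is a (URI, level) pair (crawlStep is illustrative only; the real implementation may differ):

    -- One crawl step for the next URI taken from the to-be-processed set.
    crawlStep :: NFData r => URIWithLevel -> CrawlerAction a r ()
    crawlStep uwl@(uri, _level) = do
        uriProcessed uri                          -- mark the requested URI as processed
        (realUri, newUris, results) <- processDoc uwl
        uriProcessed realUri                      -- the transfer URI may differ after a redirect
        mapM_ (uncurry uriToBeProcessed) newUris  -- queue newly found hrefs with their levels
        mapM_ accumulateRes results               -- fold the page results into the accumulator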
processDoc :: URIWithLevel -> CrawlerAction a r (URI, [URIWithLevel], [(URI, a)])
Run the process-document arrow and prepare the results.
isAllowedByRobots :: URI -> CrawlerAction a r Bool
Test whether a URI is allowed by the robots.txt rules; URIs rejected by robots.txt are filtered out with this predicate.
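For example, a list of candidate URIs can be filtered directly in the crawler monad (a small usage sketch; allowedOnly is not part of the module):

    import Control.Monad (filterM)

    allowedOnly :: [URI] -> CrawlerAction a r [URI]
    allowedOnly = filterM isAllowedByRobots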
processDocArrow :: CrawlerConfig c r -> URI -> IOSArrow a (URI, ([URI], [(URI, c)]))
From a document, two results are computed: 1. the list of all hrefs in the contents, and 2. the collected information contained in the page. The result is augmented with the transfer URI, so that subsequent functions know the source of the contents. The transfer URI may differ from the input URI, since the HTTP request may have been redirected.
The two listA arrows make the whole arrow deterministic, so it never fails.
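The listA trick is worth spelling out: wrapping a possibly failing arrow in listA yields an arrow that always delivers exactly one result, an empty list on failure. A self-contained HXT sketch with hypothetical extractors (not the extractors used by this module):

    import Text.XML.HXT.Core

    -- Each sub-arrow may fail or yield many results on a given page;
    -- listA turns both into total arrows delivering (possibly empty)
    -- lists, so the pair is computed deterministically for every input.
    collectBoth :: IOSArrow XmlTree ([String], [String])
    collectBoth = listA getHrefs &&& listA getTitles
      where
        getHrefs  = deep (hasName "a") >>> getAttrValue "href"
        getTitles = deep (hasName "title") >>> getChildren >>> getText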
getLocationReference :: ArrowXml a => a XmlTree String
Compute the real URI in the case of a 301 or 302 response (moved permanently or moved temporarily); otherwise the arrow fails.
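A hedged sketch of such a check, assuming HXT's usual transfer attributes on the document root ("transfer-Status", and "http-location" as named in the description of getRealDocURI below); locationRef is illustrative, not the module's definition:

    import Text.XML.HXT.Core

    locationRef :: ArrowXml a => a XmlTree String
    locationRef =
        ( getAttrValue0 "transfer-Status"       -- fails if the attribute is missing
          >>> isA (`elem` ["301", "302"])       -- succeed only for redirect statuses
        )
        `guards` getAttrValue0 "http-location"  -- then deliver the redirect target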
getRealDocURI :: ArrowXml a => a XmlTree String
Compute the real URI of the document. In the case of a move response it is contained in the "http-location" attribute; otherwise it is the transferURI.
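Continuing the sketch above, the fallback to the transfer URI can be expressed with orElse (assuming HXT's "transfer-URI" attribute name):

    realDocURI :: ArrowXml a => a XmlTree String
    realDocURI = locationRef `orElse` getAttrValue0 "transfer-URI"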
initCrawler :: CrawlerAction a r ()
runCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (x, CrawlerState r)
execCrawler :: CrawlerAction a r x -> CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
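A minimal end-to-end sketch, assuming URI is a synonym for String and a user-supplied configuration and initial state (runExample is illustrative only):

    runExample :: (NFData a, NFData r, Binary r)
               => CrawlerConfig a r -> CrawlerState r -> IO (CrawlerState r)
    runExample conf initial =
        execCrawler (crawlDocs ["http://example.org/"]) conf initial

runCrawler is used in the same way when the action's result is needed alongside the final state.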