Safe Haskell: None
- robotsAddHost :: CrawlerConfig a r -> AddRobotsAction
- robotsDontAddHost :: CrawlerConfig a r -> AddRobotsAction
- robotsDisallow :: Robots -> URI -> Bool
- getURIPart :: (URI -> String) -> URI -> String
- getHost :: URI -> URI
- isRobotsScheme :: URI -> Bool
- robotsGetSpec :: CrawlerConfig a r -> URI -> IO (URI, RobotRestriction)
- getRobotsTxt :: CrawlerConfig c r -> URI -> IO String
- evalRobotsTxt :: String -> String -> RobotRestriction
- enableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
- disableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
Documentation
robotsAddHost :: CrawlerConfig a r -> AddRobotsAction
Add a robots.txt description for a given URI, if it is not already there. This is the first of the two main functions of this module.
robotsDisallow :: Robots -> URI -> Bool
Check whether a robot is not allowed to access a page. This is the second of the two main functions of this module.
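These two functions are typically used together: the crawler first makes sure the robots.txt data for a URI's host is available, and then asks whether the URI may be fetched. The following self-contained sketch only mirrors that pattern with a plain Data.Map keyed by host name; the Robots and RobotRestriction types and the fetch action are simplified stand-ins, not the module's real definitions.

```haskell
import qualified Data.Map as M
import           Network.URI (URI, parseURI, uriAuthority, uriRegName, uriPath)
import           Data.List   (isPrefixOf)
import           Data.Maybe  (fromMaybe)

-- Simplified stand-ins for the module's Robots / RobotRestriction types.
type RobotRestriction = [String]                       -- disallowed path prefixes
type Robots           = M.Map String RobotRestriction  -- keyed by host name

host :: URI -> String
host = maybe "" uriRegName . uriAuthority

-- Mirrors robotsAddHost: fetch and insert a restriction only if the host is unknown.
addHost :: (String -> IO RobotRestriction) -> URI -> Robots -> IO Robots
addHost fetchSpec uri robots
  | h `M.member` robots = return robots                -- already there, nothing to do
  | otherwise           = do spec <- fetchSpec h
                             return (M.insert h spec robots)
  where h = host uri

-- Mirrors robotsDisallow: True if some disallowed prefix matches the URI's path.
disallow :: Robots -> URI -> Bool
disallow robots uri =
    any (`isPrefixOf` uriPath uri) (fromMaybe [] (M.lookup (host uri) robots))

main :: IO ()
main = do
    let Just uri = parseURI "http://example.org/private/page.html"
    robots <- addHost (\_ -> return ["/private"]) uri M.empty
    print (disallow robots uri)                        -- True
```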
isRobotsScheme :: URI -> Bool
robotsGetSpec :: CrawlerConfig a r -> URI -> IO (URI, RobotRestriction)
Access, parse and evaluate a robots.txt file for a given URI
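The caller only supplies the URI of some page on the host; the robots.txt location itself has to be derived from that URI (the getHost / getURIPart helpers in the synopsis presumably cover this step). A small, self-contained sketch of such a derivation, with robotsURI as a hypothetical helper rather than the module's own function:

```haskell
import Network.URI (URI, parseURI, uriScheme, uriAuthority, uriRegName)

-- Hypothetical helper: the robots.txt location for a page's host.
robotsURI :: URI -> Maybe URI
robotsURI uri = do
    auth <- uriAuthority uri
    parseURI (uriScheme uri ++ "//" ++ uriRegName auth ++ "/robots.txt")

main :: IO ()
main = print (robotsURI =<< parseURI "http://example.org/a/b.html")
-- Just http://example.org/robots.txt
```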
getRobotsTxt :: CrawlerConfig c r -> URI -> IO String
Try to get the robots.txt file for a given host. If it does not exist or any error occurs during access, the empty string is returned.
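The "empty string on failure" behaviour is what keeps a crawl alive when a host has no robots.txt or is temporarily unreachable. A minimal sketch of that error-handling pattern, assuming a hypothetical fetch action; this is not the module's actual HTTP code:

```haskell
{-# LANGUAGE ScopedTypeVariables #-}
import Control.Exception (SomeException, try)

-- Hypothetical wrapper mirroring the documented behaviour: any exception
-- raised while fetching yields the empty string instead of failing the crawl.
fetchOrEmpty :: IO String -> IO String
fetchOrEmpty fetch = do
    r <- try fetch
    case r of
      Left (_ :: SomeException) -> return ""   -- host down, 404, timeout, ...
      Right txt                 -> return txt

main :: IO ()
main = do
    -- Simulate a failing fetch; the result is the empty string.
    txt <- fetchOrEmpty (ioError (userError "connection refused"))
    print (null txt)                           -- True
```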
evalRobotsTxt :: String -> String -> RobotRestriction
Parse the robots.txt, select the crawler-specific parts and build a robots restriction value.
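The exact parsing rules are not shown on this page, so the following is only a much-simplified, hypothetical evaluator in the same spirit: it walks the file line by line, tracks whether the current User-agent section applies to the given crawler name (or to "*"), and collects the Disallow paths of the applicable sections. The argument order and the shape of the real RobotRestriction type are assumptions.

```haskell
import Data.Char (toLower, isSpace)
import Data.List (isPrefixOf)

-- Simplified stand-in for RobotRestriction: just the disallowed path prefixes.
type Restriction = [String]

-- Hypothetical, much-simplified evaluation in the spirit of evalRobotsTxt.
evalTxt :: String -> String -> Restriction
evalTxt agent = go False . map (dropWhile isSpace) . lines
  where
    lc   = map toLower
    trim = dropWhile isSpace . reverse . dropWhile isSpace . reverse

    go _ [] = []
    go active (l : ls)
      | lc "user-agent:" `isPrefixOf` lc l =
          let ua = trim (drop (length "user-agent:") l)
          in  go (ua == "*" || lc ua == lc agent) ls
      | active && lc "disallow:" `isPrefixOf` lc l =
          let p = trim (drop (length "disallow:") l)
          in  (if null p then id else (p :)) (go active ls)
      | otherwise = go active ls

main :: IO ()
main = print (evalTxt "HolumbusBot"
                "User-agent: *\nDisallow: /tmp\n\nUser-agent: other\nDisallow: /")
-- ["/tmp"]
```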
enableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
Enable the evaluation of robots.txt
disableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
Disable the evaluation of robots.txt
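Since both functions map a CrawlerConfig to a CrawlerConfig, they compose with other configuration combinators, and whichever toggle is applied last wins. A self-contained sketch of that style, using a stand-in Config record with hypothetical fields instead of the real CrawlerConfig:

```haskell
-- Stand-in for the relevant slice of CrawlerConfig; the real record has
-- many more fields.
data Config = Config { evalRobots :: Bool, maxDocs :: Int } deriving Show

enableRobots, disableRobots :: Config -> Config
enableRobots  c = c { evalRobots = True  }
disableRobots c = c { evalRobots = False }

withMaxDocs :: Int -> Config -> Config
withMaxDocs n c = c { maxDocs = n }

main :: IO ()
main = print (disableRobots . enableRobots . withMaxDocs 1000 $ Config False 0)
-- Config {evalRobots = False, maxDocs = 1000}: the toggle applied last wins
```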