module Holumbus.Crawler.Robots
where
import           Control.DeepSeq

import           Data.Function.Selector
import           Data.List
import qualified Data.Map                       as M
import           Data.Maybe

import           Holumbus.Crawler.URIs
import           Holumbus.Crawler.RobotTypes
import           Holumbus.Crawler.Types
import           Holumbus.Crawler.Logger

import qualified Network.URI                    as N

import           Text.XML.HXT.Core
import           Text.XML.HXT.Cache
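
-- | Fetch the robots.txt for the host of the given URI and insert the
-- resulting restrictions into the robots map, unless the host is
-- already present or the URI is not an http(s) URI.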
robotsAddHost :: CrawlerConfig a r -> AddRobotsAction
robotsAddHost conf uri rdm
    | not (isRobotsScheme uri) = return rdm
    | isJust spec              = return rdm
    | otherwise                = do
        (h, r) <- robotsGetSpec conf host
        let rdm' = M.insert h r rdm
        return $! rdm'
  where
    host = getHost uri
    spec = M.lookup host rdm
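
-- | The no-op variant: robots.txt processing is disabled, the robots
-- map is returned unchanged.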
robotsDontAddHost :: CrawlerConfig a r -> AddRobotsAction
robotsDontAddHost = const $ const return
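
-- | Check whether a URI is disallowed by the restrictions stored for
-- its host: a URI is rejected if some rule path is a non-empty prefix
-- of the URI path and that rule is a 'Disallow' rule.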
robotsDisallow :: Robots -> URI -> Bool
robotsDisallow rdm uri
    | not (isRobotsScheme uri) = False
    | isNothing restr          = False
    | otherwise                = evalRestr $ fromJust restr
  where
    host  = getHost uri
    path' = getURIPart N.uriPath uri
    restr = M.lookup host rdm

    evalRestr = foldr isDis False
      where
        isDis (r, a) v
            | r `isPrefixOf` path'
              && not (null r) = a == Disallow
            | otherwise       = v
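
-- | Apply a selector function to a parsed URI; unparsable URIs yield
-- the empty string.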
getURIPart :: (N.URI -> String) -> URI -> String
getURIPart f = maybe "" f . N.parseURIReference
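
-- | Reduce a URI to its scheme and authority part by clearing path,
-- query and fragment.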
getHost :: URI -> URI
getHost = getURIPart h
  where
    h u = show $ u { N.uriPath     = ""
                   , N.uriQuery    = ""
                   , N.uriFragment = ""
                   }
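
-- | robots.txt processing applies to http and https URIs only
-- ('N.uriScheme' includes the trailing colon).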
isRobotsScheme :: URI -> Bool
isRobotsScheme = (`elem` ["http:", "https:"]) . getURIPart N.uriScheme
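
-- | Fetch and evaluate the robots.txt for the host of a URI. The
-- restriction list is forced with 'rnf', so no thunks referring to
-- the whole document are kept alive.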
robotsGetSpec :: CrawlerConfig a r -> URI -> IO (URI, RobotRestriction)
robotsGetSpec conf uri
    | not (isRobotsScheme uri) = return ("", [])
    | null host                = return ("", [])
    | otherwise                = do
        r <- getRobotsTxt conf host
        let s = evalRobotsTxt agent r
        rnf s `seq` return (host, s)
  where
    host  = getHost uri
    agent = getS theCrawlerName conf
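
-- | Read the robots.txt document for a host as plain text. HTTP errors
-- and non-text results yield the empty string.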
getRobotsTxt :: CrawlerConfig c r -> URI -> IO String
getRobotsTxt c uri = fmap concat (runX processRobotsTxt)
  where
    processRobotsTxt =
        hxtSetTraceAndErrorLogger (getS theTraceLevelHxt c)
        >>>
        readDocument [ getS theSysConfig c
                       >>> withParseByMimeType yes          -- choose the parser by mime type
                       >>> withIgnoreNoneXmlContents no     -- keep the plain text contents
                       >>> withAcceptedMimeTypes [text_plain]
                       >>> withRedirect yes                 -- follow redirects
                       >>> withoutCache                     -- robots.txt is always fetched, never cached
                     ] (getHost uri ++ "/robots.txt")
        >>>
        documentStatusOk                                    -- stop on HTTP errors
        >>>
        getChildren
        >>>
        getText
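
-- | Extract the allow/disallow rules for a given user agent from the
-- contents of a robots.txt document. Only the first record whose
-- user-agent is \"*\" or a prefix of the agent name is evaluated.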
evalRobotsTxt :: String -> String -> RobotRestriction
evalRobotsTxt agent t =
    ( lines
      >>> map (takeWhile (/= '#') >>> stringTrim)       -- strip comments and surrounding whitespace
      >>> filter (not . null)                           -- drop empty lines
      >>> filter ( stringToLower                        -- keep only lines with a known field name
                   >>> takeWhile (/= ':')
                   >>> (`elem` [ "disallow"
                               , "allow"
                               , "user-agent"
                               , "crawl-delay"
                               , "request-rate"
                               , "visit-time"
                               , "sitemap"
                               ]
                       )
                 )
      >>> map ( span (/= ':')                           -- split into (field, value) pairs
                >>> ( stringToLower *** (drop 1 >>> stringTrim) )
              )
      >>> dropWhile ( \ (x, y) ->                       -- skip ahead to the record for this agent or "*"
                        x /= "user-agent"
                        ||
                        ( y /= "*" && not (y `isPrefixOf` agent) )
                    )
      >>> drop 1                                        -- drop the matching user-agent line itself
      >>> takeWhile (fst >>> (/= "user-agent"))         -- take the rules up to the next record
      >>> concatMap toRestr
    ) t
  where
    toRestr ("disallow", uri) = [(uri, Disallow)]
    toRestr ("allow",    uri) = [(uri, Allow)]
    toRestr _                 = []
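
-- | Enable robots.txt processing in a crawler configuration.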
enableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
enableRobotsTxt = setS theAddRobotsAction robotsAddHost
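
-- | Disable robots.txt processing in a crawler configuration.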
disableRobotsTxt :: CrawlerConfig a r -> CrawlerConfig a r
disableRobotsTxt = setS theAddRobotsAction robotsDontAddHost
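
-- A minimal usage sketch (the configuration value @myConfig@ is an
-- assumption, not defined in this module):
--
-- > conf = enableRobotsTxt myConfig
--
-- With robots.txt support enabled, 'robotsAddHost' is installed as the
-- crawler's 'AddRobotsAction', and 'robotsDisallow' can be used to
-- filter URIs against the collected restrictions before fetching.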