{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import           Control.Applicative
import           Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import           Data.Attoparsec.Combinator       (eitherP)
import           Data.ByteString.Char8            (ByteString)
import qualified Data.ByteString.Char8            as BS
import           Data.Either                      (partitionEithers)
import           Data.List                        (find)
import           Data.Maybe                       (catMaybes)

-- | A parsed robots.txt: stanzas of user agents paired with the
-- directives that apply to them, plus any lines we could not parse.
type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard
               | Literal ByteString
  deriving (Show, Eq)

type Path = ByteString

data Directive = Allow Path
               | Disallow Path
               | CrawlDelay Int
  deriving (Show, Eq)

-- Strip leading and trailing spaces. ... yeah.
strip :: ByteString -> ByteString
strip = BS.reverse . BS.dropWhile (== ' ') . BS.reverse . BS.dropWhile (== ' ')

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots input = case parsed of
  -- Special case: no parsable lines, only rubbish.
  Right ([], out@(_:_)) -> Left ("no parsable lines: " ++ show out)
  _                     -> parsed
  where
    parsed = parseOnly robotP
           . BS.unlines
           -- Filthy hack to account for the fact that we don't grab
           -- sitemaps properly. People seem to just whack them anywhere,
           -- which makes it hard to write a nice parser for them.
           . filter (not . ("Sitemap:" `BS.isPrefixOf`))
           -- BS.head is safe here: empty lines were dropped just before.
           . filter (\x -> BS.head x /= '#')
           . filter (not . BS.null)
           . map strip
           . BS.lines
           -- Worst way of handling Windows newlines ever.
           . BS.filter (/= '\r')
           $ input

robotP :: Parser Robot
robotP = do
  (dirs, unparsable) <- partitionEithers
                    <$> many (eitherP agentDirectiveP unparsableP) <?> "robot"
  return (dirs, filter (/= "") unparsable)

-- Swallow a whole line we cannot make sense of.
unparsableP :: Parser Unparsable
unparsableP = takeTill (== '\n') <* endOfLine

agentDirectiveP :: Parser ([UserAgent], [Directive])
agentDirectiveP = (,) <$> many1 agentP <*> many1 directiveP <?> "agentDirective"

-- Attoparsec's own skipSpace also swallows newlines, which would wreck
-- the line-oriented parsers below, so we hide it and roll our own.
skipSpace :: Parser ()
skipSpace = skipWhile (\x -> x == ' ' || x == '\t')

directiveP :: Parser Directive
directiveP = choice
  [ Allow <$> (asciiCI "Allow:" >> skipSpace >> tokenP)
  , asciiCI "Disallow:" >> skipSpace >>
      choice [ Disallow <$> tokenP
             -- This requires some explanation. The RFC suggests that an
             -- empty Disallow line means anything is allowed. Being
             -- semantically equivalent to 'Allow: "/"', I have chosen to
             -- change it here rather than carry the bogus distinction
             -- around. The line terminator is consumed by commentsP below.
             , return (Allow "/")
             ]
  , CrawlDelay <$> (asciiCI "Crawl-delay:" >> skipSpace >> decimal)
  ] <* commentsP <?> "directive"

agentP :: Parser UserAgent
agentP = do
  asciiCI "user-agent:"
  skipSpace
  ((string "*" >> return Wildcard) <|> (Literal <$> tokenP))
    <* commentsP <?> "agent"

-- Skip trailing whitespace and an optional '#' comment, then consume the
-- end of the line. Eating the newline here is what lets many1 directiveP
-- carry on to the directive on the next line.
commentsP :: Parser ()
commentsP = do
  skipSpace
  option () (char '#' >> skipWhile (/= '\n'))
  endOfLine <|> endOfInput

tokenP :: Parser ByteString
tokenP = skipSpace >> takeWhile1 (not . isSpace) <* skipSpace

-- | Decide whether the given user agent may fetch the given path.
-- I lack the art to make this prettier.
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess _ _ "/robots.txt" = True -- Special-cased: always fetchable.
canAccess agent (robot, _) path = case stanzas of
    []                  -> True
    ((_, directives):_) -> matchingDirective directives
  where
    -- Prefer a stanza naming this agent; fall back to the wildcard stanza.
    stanzas = catMaybes
      [ find (any (`isLiteralSubstring` agent) . fst) robot
      , find ((Wildcard `elem`) . fst) robot
      ]
    isLiteralSubstring (Literal a) us = a `BS.isInfixOf` us
    isLiteralSubstring _           _  = False
    matchingDirective []     = True
    matchingDirective (x:xs) = case x of
      Allow robot_path    -> robot_path `BS.isPrefixOf` path
                          || matchingDirective xs
      Disallow robot_path -> not (robot_path `BS.isPrefixOf` path)
                          && matchingDirective xs
      _                   -> matchingDirective xs
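
-- A small end-to-end convenience sketch, not part of the original API
-- (the name canAccessFrom is made up here): parse the raw robots.txt
-- bytes and answer a single access question in one go, propagating any
-- parse failure as Left.
canAccessFrom :: ByteString -> ByteString -> Path -> Either String Bool
canAccessFrom agent rawRobotsTxt path =
  fmap (\robot -> canAccess agent robot path) (parseRobots rawRobotsTxt)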
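
-- The examples below are a usage sketch rather than part of the original
-- module: the robots.txt snippet and the agent name "mybot" are invented
-- for illustration, and canAccessFrom is the sketch defined above. With
-- doctest they can be checked mechanically.
--
-- >>> let txt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 10\n"
-- >>> fmap (\robot -> canAccess "mybot" robot "/public/index.html") (parseRobots txt)
-- Right True
-- >>> canAccessFrom "mybot" txt "/private/secrets"
-- Right False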