module Network.HTTP.Robots where
import qualified Data.ByteString.Char8 as BS
import Data.ByteString.Char8(ByteString)
import Data.Attoparsec.Char8 hiding (skipSpace)
import Control.Applicative
import Data.List(find)
import Data.Maybe(catMaybes)
type Robot = [([UserAgent], [Directive])]
data UserAgent = Wildcard | Literal ByteString
deriving (Show,Eq)
type Path = ByteString
data Directive = Allow Path
| Disallow Path
| CrawlDelay Int
deriving (Show,Eq)
parseRobots :: ByteString -> Either String Robot
parseRobots = parseOnly robotP
. BS.unlines
. filter ( (\x -> BS.null x || BS.head x /= '#' ) . BS.dropWhile (==' '))
. BS.lines
robotP :: Parser Robot
robotP = many ((,) <$> many1 agentP <*> many1 directiveP) <?> "robot"
skipSpace :: Parser ()
skipSpace = skipWhile (\x -> x==' ' || x == '\t')
directiveP :: Parser Directive
directiveP = choice [ Allow <$> (string "Allow:" >> skipSpace >> tokenP)
, (string "Disallow:" >> skipSpace >>
(choice [Disallow <$> tokenP,
endOfLine >> return (Allow "/") ] ))
, CrawlDelay <$> (string "Crawl-delay:" >> skipSpace >>decimal)
] <* commentsP <?> "directive"
agentP :: Parser UserAgent
agentP = do
string "User-agent:"
skipSpace
((string "*" >> return Wildcard) <|>
(Literal <$> tokenP)) <* skipSpace <* endOfLine <?> "agent"
commentsP :: Parser ()
commentsP = skipSpace >>
((string "#" >> takeTill (=='\n') >> skipSpace >> endOfLine) <|> return ())
tokenP :: Parser ByteString
tokenP = skipSpace >> takeWhile1 (not . isSpace) <* skipSpace
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess _ _ "/robots.txt" = True
canAccess agent robot path = case stanzas of
[] -> True
((_,directives):_) -> matchingDirective directives
where stanzas = catMaybes [find ((Literal agent `elem`) . fst) robot,
find ((Wildcard `elem`) . fst) robot]
matchingDirective [] = True
matchingDirective (x:xs) = case x of
Allow robot_path -> if robot_path `BS.isPrefixOf` path
then True
else matchingDirective xs
Disallow robot_path ->
if robot_path `BS.isPrefixOf` path
then False
else matchingDirective xs
_ -> matchingDirective xs