module Network.HTTP.Robots where
import qualified Data.ByteString.Char8 as BS
import Data.ByteString.Char8(ByteString)
import Data.Attoparsec.Char8
import Control.Applicative
type Robot = [(UserAgent, [Directive])]
data UserAgent = Wildcard | Literal ByteString
deriving (Show,Eq)
type Path = ByteString
data Directive = Allow Path
| Disallow Path
| CrawlDelay Int
deriving (Show,Eq)
parseRobots :: ByteString -> Either String Robot
parseRobots = parseOnly robotP
. BS.unlines
. filter ( (\x -> BS.null x || BS.head x /= '#' ) . BS.dropWhile (==' '))
. BS.lines
robotP :: Parser Robot
robotP = many ((,) <$> agentP <*> many directiveP) <?> "robot"
directiveP :: Parser Directive
directiveP = choice [ Allow <$> (string "Allow:" >> skipSpace >> tokenP)
, Disallow <$> (string "Disallow:" >> skipSpace >> tokenP)
, CrawlDelay <$> (string "Crawl-delay:" >> skipSpace >>decimal)
] <* skipSpace <?> "directive"
agentP :: Parser UserAgent
agentP = do
string "User-agent:"
skipSpace
((string "*" >> return Wildcard) <|>
(Literal <$> tokenP)) <* skipSpace <?> "agent"
tokenP :: Parser ByteString
tokenP = skipSpace >> takeTill isSpace <* skipSpace