{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import qualified Data.ByteString.Char8 as BS
import           Data.ByteString.Char8(ByteString)
import Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import Control.Applicative
import Data.List(find)
import Data.Maybe(catMaybes)
import Data.Either(partitionEithers)

-- | A parsed robots.txt: a list of stanzas, each pairing the user
-- agents it names with that stanza's directives, plus any lines that
-- could not be parsed.
type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard | Literal ByteString
  deriving (Show,Eq)
type Path = ByteString

data Directive = Allow Path
               | Disallow Path
               | CrawlDelay Int
  deriving (Show,Eq)

-- | parseRobots is the main entry point for parsing a robots.txt file.
-- Whole-line comments are stripped up front; trailing comments are
-- handled later by 'commentsP'.
parseRobots :: ByteString -> Either String Robot
parseRobots = parseOnly robotP
              . BS.unlines
              . filter (not . BS.isPrefixOf "#" . BS.dropWhile (== ' '))
              . BS.lines
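
-- A minimal usage sketch (the robots.txt body is an assumed example):
--
-- >>> parseRobots "User-agent: *\nDisallow: /private\n"
-- Right ([([Wildcard],[Disallow "/private"])],[])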

robotP :: Parser Robot
robotP = do
  (dirs, unparsable) <- partitionEithers <$> many (eitherP agentDirectiveP unparsableP) <?> "robot"
  return (dirs, filter (/= "") unparsable)

-- | Fall back to consuming a whole line we could not otherwise parse,
-- returning it verbatim.
unparsableP :: Parser ByteString
unparsableP = takeTill (== '\n') <* char '\n'

-- | A stanza: one or more User-agent lines followed by their directives.
agentDirectiveP :: Parser ([UserAgent], [Directive])
agentDirectiveP = (,) <$> many1 agentP <*> many1 directiveP <?> "agentDirective"

-- | Skip horizontal whitespace only. Newlines delimit records here, so
-- attoparsec's own 'skipSpace' (which would eat them) is hidden above.
skipSpace :: Parser ()
skipSpace = skipWhile (\c -> c == ' ' || c == '\t')

directiveP :: Parser Directive
directiveP = choice
  [ Allow      <$> (string "Allow:"       >> skipSpace >> tokenP)
  , string "Disallow:" >> skipSpace >>
      choice [ Disallow <$> tokenP
               -- This requires some explanation. The RFC suggests
               -- that an empty Disallow line means anything is
               -- allowed. Being semantically equivalent to
               -- 'Allow: "/"', I have chosen to change it here
               -- rather than carry the bogus distinction around.
               -- 'commentsP' below consumes the rest of the line,
               -- including its newline.
             , return (Allow "/")
             ]
  , CrawlDelay <$> (string "Crawl-delay:" >> skipSpace >> decimal)
  ] <* commentsP <?> "directive"
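
-- The normalisation above is visible directly (a doctest-style sketch
-- with an assumed empty Disallow line):
--
-- >>> parseOnly directiveP "Disallow:\n"
-- Right (Allow "/")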

agentP :: Parser UserAgent
agentP = do
  string "User-agent:"
  skipSpace
  ((string "*" >> return Wildcard) <|>
   (Literal <$> tokenP)) <* commentsP <?> "agent"

-- | Consume trailing whitespace, an optional comment, and the end of
-- line. Every agent and directive line finishes by passing through
-- here, so each line's newline is reliably consumed.
commentsP :: Parser ()
commentsP = skipSpace
         >> option () (char '#' >> skipWhile (/= '\n'))
         >> (endOfLine <|> endOfInput)

-- | One whitespace-delimited token; 'isSpace' includes '\n', so this
-- never runs past the end of the current line.
tokenP :: Parser ByteString
tokenP = skipSpace >> takeWhile1 (not . isSpace) <* skipSpace

-- I lack the art to make this prettier.
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess _ _ "/robots.txt" = True -- special-cased: always fetchable
canAccess agent (stanzas, _) path = case applicable of
    []                  -> True
    ((_, directives):_) -> matchingDirective directives
  where
    -- Prefer a stanza that names this agent; otherwise fall back to
    -- the wildcard stanza, if any.
    applicable = catMaybes
      [ find (any (`isLiteralSubstring` agent) . fst) stanzas
      , find ((Wildcard `elem`) . fst) stanzas
      ]
    isLiteralSubstring (Literal a) ua = a `BS.isInfixOf` ua
    isLiteralSubstring _           _  = False
    -- First matching Allow/Disallow wins; no match means allowed.
    matchingDirective [] = True
    matchingDirective (d:ds) = case d of
      Allow p    -> p `BS.isPrefixOf` path || matchingDirective ds
      Disallow p -> not (p `BS.isPrefixOf` path) && matchingDirective ds
      _          -> matchingDirective ds
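
-- A usage sketch for 'canAccess'; the agent name and paths are assumed
-- example values. The wildcard stanza blocks "/private" for any agent,
-- while other paths remain reachable:
--
-- >>> let Right r = parseRobots "User-agent: *\nDisallow: /private\n"
-- >>> canAccess "somebot" r "/private/photos"
-- False
-- >>> canAccess "somebot" r "/public/index.html"
-- True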