{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import           Control.Applicative
import           Control.Monad
import           Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import qualified Data.Attoparsec.Text             as AT (isEndOfLine)
import           Data.ByteString.Char8            (ByteString)
import qualified Data.ByteString.Char8            as BS
import           Data.Char                        (toUpper)
import           Data.Either                      (partitionEithers)
import           Data.List                        (find)
import           Data.Maybe                       (catMaybes)
import           Data.Ratio
import           Data.Time.Clock
import           Data.Time.LocalTime              ()

type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard | Literal ByteString
  deriving (Int -> UserAgent -> ShowS
[UserAgent] -> ShowS
UserAgent -> String
(Int -> UserAgent -> ShowS)
-> (UserAgent -> String)
-> ([UserAgent] -> ShowS)
-> Show UserAgent
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [UserAgent] -> ShowS
$cshowList :: [UserAgent] -> ShowS
show :: UserAgent -> String
$cshow :: UserAgent -> String
showsPrec :: Int -> UserAgent -> ShowS
$cshowsPrec :: Int -> UserAgent -> ShowS
Show,UserAgent -> UserAgent -> Bool
(UserAgent -> UserAgent -> Bool)
-> (UserAgent -> UserAgent -> Bool) -> Eq UserAgent
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: UserAgent -> UserAgent -> Bool
$c/= :: UserAgent -> UserAgent -> Bool
== :: UserAgent -> UserAgent -> Bool
$c== :: UserAgent -> UserAgent -> Bool
Eq)
type Path = ByteString

-- http://www.conman.org/people/spc/robots2.html
-- This was never actually accepted as a standard,
-- but some sites do use it.
type TimeInterval = (DiffTime, DiffTime)

-- Crawldelay may have a decimal point
-- http://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
-- Added directives NoArchive, NoSnippet, NoTranslate, SiteMap.
-- http://bloganddiscussion.com/anythingcomputer/1/robots-txt-noarchive-nocache-nosnippet/
data Directive = Allow Path
               | Disallow Path
               | CrawlDelay { Directive -> Rational
crawlDelay   :: Rational
                            , Directive -> TimeInterval
timeInterval :: TimeInterval
                            }
               | NoArchive Path
               | NoSnippet Path
               | NoTranslate Path
               -- not used by Google, Yahoo or Live Search/Bing
               -- http://searchengineland.com/a-deeper-look-at-robotstxt-17573
               | NoIndex Path
  deriving (Int -> Directive -> ShowS
[Directive] -> ShowS
Directive -> String
(Int -> Directive -> ShowS)
-> (Directive -> String)
-> ([Directive] -> ShowS)
-> Show Directive
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Directive] -> ShowS
$cshowList :: [Directive] -> ShowS
show :: Directive -> String
$cshow :: Directive -> String
showsPrec :: Int -> Directive -> ShowS
$cshowsPrec :: Int -> Directive -> ShowS
Show,Directive -> Directive -> Bool
(Directive -> Directive -> Bool)
-> (Directive -> Directive -> Bool) -> Eq Directive
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Directive -> Directive -> Bool
$c/= :: Directive -> Directive -> Bool
== :: Directive -> Directive -> Bool
$c== :: Directive -> Directive -> Bool
Eq)

-- For use in the attoparsec monad, allows to reparse a sub expression
subParser :: Parser a -> ByteString -> Parser a
subParser :: Parser a -> ByteString -> Parser a
subParser Parser a
p = (String -> Parser a)
-> (a -> Parser a) -> Either String a -> Parser a
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Parser a -> String -> Parser a
forall a b. a -> b -> a
const Parser a
forall (m :: * -> *) a. MonadPlus m => m a
mzero) a -> Parser a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either String a -> Parser a)
-> (ByteString -> Either String a) -> ByteString -> Parser a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Parser a -> ByteString -> Either String a
forall a. Parser a -> ByteString -> Either String a
parseOnly Parser a
p


-- Seems the rational parser is unsecure in the presence of an exponent
-- but since there is no alternative to parse a rational, we just to refuse
-- to parse numbers with 'e' / exponent
-- https://hackage.haskell.org/package/attoparsec-0.12.1.0/docs/Data-Attoparsec-ByteString-Char8.html#v:rational
safeParseRational :: Parser Rational
safeParseRational :: Parser Rational
safeParseRational = do
  (ByteString
bs,Scientific
_) <- Parser Scientific -> Parser (ByteString, Scientific)
forall a. Parser a -> Parser (ByteString, a)
match Parser Scientific
scientific
  if Char -> ByteString -> Bool
BS.elem Char
'e' ByteString
bs Bool -> Bool -> Bool
|| Char -> ByteString -> Bool
BS.elem Char
'E' ByteString
bs
    then Parser Rational
forall (m :: * -> *) a. MonadPlus m => m a
mzero
    else Parser Rational -> ByteString -> Parser Rational
forall a. Parser a -> ByteString -> Parser a
subParser Parser Rational
forall a. Fractional a => Parser a
rational ByteString
bs

-- Yeah, robots.txt should be ASCII, but some sites
-- include the UTF-8 marker at start.
-- We just drop it, but handle the file as ASCII.
dropUTF8BOM :: ByteString -> ByteString
dropUTF8BOM :: ByteString -> ByteString
dropUTF8BOM ByteString
bs = if Int -> ByteString -> ByteString
BS.take Int
3 ByteString
bs ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ( Char
'\239' Char -> ByteString -> ByteString
`BS.cons`
                                      Char
'\187' Char -> ByteString -> ByteString
`BS.cons`
                                      Char
'\191' Char -> ByteString -> ByteString
`BS.cons` ByteString
BS.empty)
                   then Int -> ByteString -> ByteString
BS.drop Int
3 ByteString
bs
                   else ByteString
bs

parseHourMinute :: Parser (Integer,Integer)
parseHourMinute :: Parser (Integer, Integer)
parseHourMinute = Parser (Integer, Integer)
parseWithColon Parser (Integer, Integer)
-> Parser (Integer, Integer) -> Parser (Integer, Integer)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser (Integer, Integer)
parseWithoutColon
  where
    parseWithColon :: Parser (Integer, Integer)
parseWithColon = do
      Integer
hours <- Parser ()
skipSpace Parser () -> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString Integer
forall a. Integral a => Parser a
decimal
      Parser ByteString Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void         (Parser ByteString Char -> Parser ())
-> Parser ByteString Char -> Parser ()
forall a b. (a -> b) -> a -> b
$ Parser ()
skipSpace Parser () -> Parser ByteString Char -> Parser ByteString Char
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser ByteString Char
char Char
':'
      Integer
mins  <- Parser ()
skipSpace Parser () -> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
hours,Integer
mins)
    parseWithoutColon :: Parser (Integer, Integer)
parseWithoutColon = do
      Integer
h <- Int -> Parser ByteString
Data.Attoparsec.ByteString.Char8.take Int
2 Parser ByteString
-> (ByteString -> Parser ByteString Integer)
-> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser ByteString Integer
-> ByteString -> Parser ByteString Integer
forall a. Parser a -> ByteString -> Parser a
subParser Parser ByteString Integer
forall a. Integral a => Parser a
decimal
      Integer
m <- Int -> Parser ByteString
Data.Attoparsec.ByteString.Char8.take Int
2 Parser ByteString
-> (ByteString -> Parser ByteString Integer)
-> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser ByteString Integer
-> ByteString -> Parser ByteString Integer
forall a. Parser a -> ByteString -> Parser a
subParser Parser ByteString Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
h,Integer
m)

parseTimeInterval :: Parser TimeInterval
parseTimeInterval :: Parser TimeInterval
parseTimeInterval = do
  (Integer
hours_start, Integer
mins_start) <- Parser (Integer, Integer)
parseHourMinute
  Parser () -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void         (Parser () -> Parser ()) -> Parser () -> Parser ()
forall a b. (a -> b) -> a -> b
$ (Parser ()
skipSpace Parser () -> Parser ByteString Char -> Parser ByteString Char
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser ByteString Char
char Char
'-' Parser ByteString Char -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace) Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
skipSpace
  (Integer
hours_end  , Integer
mins_end  ) <- Parser (Integer, Integer)
parseHourMinute
  TimeInterval -> Parser TimeInterval
forall (m :: * -> *) a. Monad m => a -> m a
return ( Integer -> DiffTime
secondsToDiffTime (Integer
hours_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60)
         , Integer -> DiffTime
secondsToDiffTime (Integer
hours_end   Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_end   Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60))

allDay :: TimeInterval
allDay :: TimeInterval
allDay =  ( Integer -> DiffTime
secondsToDiffTime Integer
0
          , Integer -> DiffTime
secondsToDiffTime (Integer
24Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60) -- because of leap seconds
          )

parseRequestRate :: Parser Directive
parseRequestRate :: Parser Directive
parseRequestRate = do
  Parser ByteString -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser ByteString -> Parser ()) -> Parser ByteString -> Parser ()
forall a b. (a -> b) -> a -> b
$ ByteString -> Parser ByteString
stringCI ByteString
"Request-rate:"
  Integer
docs <- Parser ()
skipSpace Parser () -> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString Integer
forall a. Integral a => Parser a
decimal
  Parser ByteString Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser ByteString Char -> Parser ())
-> Parser ByteString Char -> Parser ()
forall a b. (a -> b) -> a -> b
$  Parser ()
skipSpace Parser () -> Parser ByteString Char -> Parser ByteString Char
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser ByteString Char
char Char
'/'
  Integer
ptim <- Parser ()
skipSpace Parser () -> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString Integer
forall a. Integral a => Parser a
decimal
  Integer
units<- Parser ()
skipSpace Parser () -> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>   (  (Char -> Parser ByteString Char
char Char
's' Parser ByteString Char
-> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser ByteString Integer
forall (m :: * -> *) a. Monad m => a -> m a
return (    Integer
1 :: Integer))
                        Parser ByteString Integer
-> Parser ByteString Integer -> Parser ByteString Integer
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser ByteString Char
char Char
'm' Parser ByteString Char
-> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser ByteString Integer
forall (m :: * -> *) a. Monad m => a -> m a
return (   Integer
60 :: Integer))
                        Parser ByteString Integer
-> Parser ByteString Integer -> Parser ByteString Integer
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser ByteString Char
char Char
'h' Parser ByteString Char
-> Parser ByteString Integer -> Parser ByteString Integer
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser ByteString Integer
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
60Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
*Integer
60 :: Integer))
                        Parser ByteString Integer
-> Parser ByteString Integer -> Parser ByteString Integer
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>              Integer -> Parser ByteString Integer
forall (m :: * -> *) a. Monad m => a -> m a
return (    Integer
1 :: Integer)
                         )
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( Parser TimeInterval
parseTimeInterval Parser TimeInterval -> Parser TimeInterval -> Parser TimeInterval
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> TimeInterval -> Parser TimeInterval
forall (m :: * -> *) a. Monad m => a -> m a
return TimeInterval
allDay)
  Directive -> Parser Directive
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay ((Integer
ptim Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
units) Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
docs) TimeInterval
tint

parseVisitTime :: Parser Directive
parseVisitTime :: Parser Directive
parseVisitTime = do
  Parser ByteString -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser ByteString -> Parser ()) -> Parser ByteString -> Parser ()
forall a b. (a -> b) -> a -> b
$ ByteString -> Parser ByteString
stringCI ByteString
"Visit-time:"
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser TimeInterval
parseTimeInterval
  Directive -> Parser Directive
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay ( Integer
0 Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
1) TimeInterval
tint

parseCrawlDelay :: Parser Directive
parseCrawlDelay :: Parser Directive
parseCrawlDelay = do
  Rational
delay <- ByteString -> Parser ByteString
stringCI ByteString
"Crawl-Delay:" Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Rational -> Parser Rational
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Rational
safeParseRational
  Directive -> Parser Directive
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay Rational
delay TimeInterval
allDay

-- ... yeah.
strip :: ByteString -> ByteString
strip :: ByteString -> ByteString
strip = ByteString -> ByteString
BS.reverse (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BS.reverse (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ')

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots :: ByteString -> Either String Robot
parseRobots ByteString
input = case Either String Robot
parsed of
  -- special case no parsable lines and rubbish
  Right ([], out :: [ByteString]
out@(ByteString
_:[ByteString]
_)) ->
    String -> Either String Robot
forall a b. a -> Either a b
Left (String
"no parsable lines: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [ByteString] -> String
forall a. Show a => a -> String
show [ByteString]
out)
  Either String Robot
_ -> Either String Robot
parsed

  where parsed :: Either String Robot
parsed = Parser Robot -> ByteString -> Either String Robot
forall a. Parser a -> ByteString -> Either String a
parseOnly Parser Robot
robotP
              (ByteString -> Either String Robot)
-> (ByteString -> ByteString) -> ByteString -> Either String Robot
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [ByteString] -> ByteString
BS.unlines
  -- Filthy hack to account for the fact we don't grab sitemaps
  -- properly. people seem to just whack them anywhere, which makes it
  -- hard to write a nice parser for them.
              ([ByteString] -> ByteString)
-> (ByteString -> [ByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (ByteString -> Bool) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString -> Bool
BS.isPrefixOf ByteString
"SITEMAP:" (ByteString -> Bool)
-> (ByteString -> ByteString) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> ByteString -> ByteString
BS.map Char -> Char
toUpper)
              ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (ByteString -> Bool) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString -> Bool
BS.isPrefixOf ByteString
"HOST:"    (ByteString -> Bool)
-> (ByteString -> ByteString) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> ByteString -> ByteString
BS.map Char -> Char
toUpper)
              ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (\ByteString
x -> ByteString -> Char
BS.head ByteString
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#' )
              ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (ByteString -> Bool) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Bool
BS.null)
              ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> ByteString) -> [ByteString] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map ByteString -> ByteString
strip
              ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
BS.lines
              -- worst way of handling windows newlines ever
              (ByteString -> [ByteString])
-> (ByteString -> ByteString) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
BS.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\r')
              (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
dropUTF8BOM
              (ByteString -> Either String Robot)
-> ByteString -> Either String Robot
forall a b. (a -> b) -> a -> b
$ ByteString
input

robotP :: Parser Robot
robotP :: Parser Robot
robotP = do
  ([([UserAgent], [Directive])]
dirs, [ByteString]
unparsable) <- [Either ([UserAgent], [Directive]) ByteString] -> Robot
forall a b. [Either a b] -> ([a], [b])
partitionEithers ([Either ([UserAgent], [Directive]) ByteString] -> Robot)
-> Parser ByteString [Either ([UserAgent], [Directive]) ByteString]
-> Parser Robot
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString (Either ([UserAgent], [Directive]) ByteString)
-> Parser ByteString [Either ([UserAgent], [Directive]) ByteString]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser ByteString ([UserAgent], [Directive])
-> Parser ByteString
-> Parser ByteString (Either ([UserAgent], [Directive]) ByteString)
forall (f :: * -> *) a b.
Alternative f =>
f a -> f b -> f (Either a b)
eitherP Parser ByteString ([UserAgent], [Directive])
agentDirectiveP Parser ByteString
unparsableP) Parser Robot -> String -> Parser Robot
forall i a. Parser i a -> String -> Parser i a
<?> String
"robot"
  Robot -> Parser Robot
forall (m :: * -> *) a. Monad m => a -> m a
return ([([UserAgent], [Directive])]
dirs, (ByteString -> Bool) -> [ByteString] -> [ByteString]
forall a. (a -> Bool) -> [a] -> [a]
filter (ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
/= ByteString
"") [ByteString]
unparsable)

unparsableP :: Parser ByteString
unparsableP :: Parser ByteString
unparsableP = (Char -> Bool) -> Parser ByteString
takeTill Char -> Bool
AT.isEndOfLine Parser ByteString -> Parser () -> Parser ByteString
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine -- char '\n'

agentDirectiveP :: Parser ([UserAgent],[Directive])
agentDirectiveP :: Parser ByteString ([UserAgent], [Directive])
agentDirectiveP = (,) ([UserAgent] -> [Directive] -> ([UserAgent], [Directive]))
-> Parser ByteString [UserAgent]
-> Parser ByteString ([Directive] -> ([UserAgent], [Directive]))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString UserAgent -> Parser ByteString [UserAgent]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser ByteString UserAgent
agentP Parser ByteString ([Directive] -> ([UserAgent], [Directive]))
-> Parser ByteString [Directive]
-> Parser ByteString ([UserAgent], [Directive])
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser Directive -> Parser ByteString [Directive]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser Directive
directiveP Parser ByteString ([UserAgent], [Directive])
-> String -> Parser ByteString ([UserAgent], [Directive])
forall i a. Parser i a -> String -> Parser i a
<?> String
"agentDirective"


skipSpace :: Parser ()
skipSpace :: Parser ()
skipSpace = (Char -> Bool) -> Parser ()
skipWhile (\Char
x -> Char
xChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ' Bool -> Bool -> Bool
|| Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\t')

directiveP :: Parser Directive
directiveP :: Parser Directive
directiveP = do
  Parser ()
skipSpace
  [Parser Directive] -> Parser Directive
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice [ ByteString -> Parser ByteString
stringCI ByteString
"Disallow:" Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Directive -> Parser Directive
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                        ((ByteString -> Directive
Disallow (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString
tokenP) Parser Directive -> Parser Directive -> Parser Directive
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
                      -- This requires some explanation.
                      -- The RFC suggests that an empty Disallow line means
                      -- anything is allowed. Being semantically equivalent to
                      -- 'Allow: "/"', I have chosen to change it here
                      -- rather than carry the bogus distinction around.
                             (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Directive
Allow    ByteString
"/")))
                    , ByteString -> Parser ByteString
stringCI ByteString
"Allow:" Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Directive -> Parser Directive
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
                        ((ByteString -> Directive
Allow (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString
tokenP) Parser Directive -> Parser Directive -> Parser Directive
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
                      -- If an empty disallow means 'disallow nothing',
                      -- an empty allow means 'allow nothing'. Right?
                      -- Not sure, actually, but only the americanexpress.com
                      -- has such a case, which in one hand I am tempted
                      -- to consider an error... but for now:
                             (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Directive
Disallow ByteString
"/")))
                    , Parser Directive
parseCrawlDelay
                    , Parser Directive
parseRequestRate
                    , Parser Directive
parseVisitTime
                    , ByteString -> Directive
NoArchive   (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> Parser ByteString
stringCI ByteString
"Noarchive:"  Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString
tokenP)
                    , ByteString -> Directive
NoSnippet   (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> Parser ByteString
stringCI ByteString
"Nosnippet:"  Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString
tokenP)
                    , ByteString -> Directive
NoTranslate (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> Parser ByteString
stringCI ByteString
"Notranslate:"Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString
tokenP)
                    , ByteString -> Directive
NoIndex     (ByteString -> Directive) -> Parser ByteString -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (ByteString -> Parser ByteString
stringCI ByteString
"Noindex:"    Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ByteString
tokenP)
                    ] Parser Directive -> Parser () -> Parser Directive
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
commentsP Parser Directive -> String -> Parser Directive
forall i a. Parser i a -> String -> Parser i a
<?> String
"directive"

agentP :: Parser UserAgent
agentP :: Parser ByteString UserAgent
agentP = do
  Parser ByteString -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser ByteString -> Parser ()) -> Parser ByteString -> Parser ()
forall a b. (a -> b) -> a -> b
$ ByteString -> Parser ByteString
stringCI ByteString
"user-agent:"
  Parser ()
skipSpace
  ((ByteString -> Parser ByteString
string ByteString
"*" Parser ByteString
-> Parser ByteString UserAgent -> Parser ByteString UserAgent
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> UserAgent -> Parser ByteString UserAgent
forall (m :: * -> *) a. Monad m => a -> m a
return UserAgent
Wildcard) Parser ByteString UserAgent
-> Parser ByteString UserAgent -> Parser ByteString UserAgent
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
   (ByteString -> UserAgent
Literal  (ByteString -> UserAgent)
-> Parser ByteString -> Parser ByteString UserAgent
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser ByteString
tokenWithSpacesP)) Parser ByteString UserAgent
-> Parser () -> Parser ByteString UserAgent
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace Parser ByteString UserAgent
-> Parser () -> Parser ByteString UserAgent
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine Parser ByteString UserAgent
-> String -> Parser ByteString UserAgent
forall i a. Parser i a -> String -> Parser i a
<?> String
"agent"


commentsP :: Parser ()
commentsP :: Parser ()
commentsP = Parser ()
skipSpace Parser () -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>>
            (   (ByteString -> Parser ByteString
string ByteString
"#" Parser ByteString -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser ByteString
takeTill Char -> Bool
AT.isEndOfLine Parser ByteString -> Parser () -> Parser ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
endOfLine)
            Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
endOfLine
            Parser () -> Parser () -> Parser ()
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall (m :: * -> *) a. Monad m => a -> m a
return ())


tokenP :: Parser ByteString
tokenP :: Parser ByteString
tokenP = Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser ByteString
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) Parser ByteString -> Parser () -> Parser ByteString
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace
tokenWithSpacesP :: Parser ByteString
tokenWithSpacesP :: Parser ByteString
tokenWithSpacesP = Parser ()
skipSpace Parser () -> Parser ByteString -> Parser ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser ByteString
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' Bool -> Bool -> Bool
|| Char -> Bool
AT.isEndOfLine Char
c))
                             Parser ByteString -> Parser ByteString -> Parser ByteString
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* (Char -> Bool) -> Parser ByteString
takeTill Char -> Bool
AT.isEndOfLine

-- I lack the art to make this prettier.
-- Currently does not take into account the CrawlDelay / Request Rate directives
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess :: ByteString -> Robot -> ByteString -> Bool
canAccess ByteString
_ Robot
_ ByteString
"/robots.txt" = Bool
True -- special-cased
canAccess ByteString
agent ([([UserAgent], [Directive])]
robot,[ByteString]
_) ByteString
path = case [([UserAgent], [Directive])]
stanzas of
  [] -> Bool
True
  (([UserAgent]
_,[Directive]
directives):[([UserAgent], [Directive])]
_) -> [Directive] -> Bool
matchingDirective [Directive]
directives
  where stanzas :: [([UserAgent], [Directive])]
stanzas = [Maybe ([UserAgent], [Directive])] -> [([UserAgent], [Directive])]
forall a. [Maybe a] -> [a]
catMaybes [(([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent -> Bool) -> [UserAgent] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (UserAgent -> ByteString -> Bool
`isLiteralSubstring` ByteString
agent)  ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot,
                             (([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent
Wildcard UserAgent -> [UserAgent] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot]


        isLiteralSubstring :: UserAgent -> ByteString -> Bool
isLiteralSubstring (Literal ByteString
a) ByteString
us = ByteString
a ByteString -> ByteString -> Bool
`BS.isInfixOf` ByteString
us
        isLiteralSubstring UserAgent
_ ByteString
_ = Bool
False
        matchingDirective :: [Directive] -> Bool
matchingDirective [] = Bool
True
        matchingDirective (Directive
x:[Directive]
xs) = case Directive
x of
          Allow ByteString
robot_path ->
            ByteString
robot_path ByteString -> ByteString -> Bool
`BS.isPrefixOf` ByteString
path Bool -> Bool -> Bool
|| [Directive] -> Bool
matchingDirective [Directive]
xs
          Disallow ByteString
robot_path ->
            Bool -> Bool
not (ByteString
robot_path ByteString -> ByteString -> Bool
`BS.isPrefixOf` ByteString
path) Bool -> Bool -> Bool
&& [Directive] -> Bool
matchingDirective [Directive]
xs

          Directive
_ -> [Directive] -> Bool
matchingDirective [Directive]
xs