{-# LANGUAGE OverloadedStrings #-}

module Network.HTTP.Robots where

import Control.Applicative
import Control.Monad
import Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import qualified Data.Attoparsec.Text as AT (isEndOfLine)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS
import Data.Char (toUpper)
import Data.Either (partitionEithers)
import Data.List (find)
import Data.Maybe (catMaybes)
import Data.Ratio
import Data.Time.Clock
import Data.Time.LocalTime ()
import Debug.Trace

type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard | Literal ByteString
  deriving (Int -> UserAgent -> ShowS
[UserAgent] -> ShowS
UserAgent -> String
(Int -> UserAgent -> ShowS)
-> (UserAgent -> String)
-> ([UserAgent] -> ShowS)
-> Show UserAgent
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> UserAgent -> ShowS
showsPrec :: Int -> UserAgent -> ShowS
$cshow :: UserAgent -> String
show :: UserAgent -> String
$cshowList :: [UserAgent] -> ShowS
showList :: [UserAgent] -> ShowS
Show, UserAgent -> UserAgent -> Bool
(UserAgent -> UserAgent -> Bool)
-> (UserAgent -> UserAgent -> Bool) -> Eq UserAgent
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: UserAgent -> UserAgent -> Bool
== :: UserAgent -> UserAgent -> Bool
$c/= :: UserAgent -> UserAgent -> Bool
/= :: UserAgent -> UserAgent -> Bool
Eq)

type Path = ByteString

-- http://www.conman.org/people/spc/robots2.html
-- This was never actually accepted as a standard,
-- but some sites do use it.
type TimeInterval = (DiffTime, DiffTime)

-- Crawldelay may have a decimal point
-- http://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
-- Added directives NoArchive, NoSnippet, NoTranslate, SiteMap.
-- http://bloganddiscussion.com/anythingcomputer/1/robots-txt-noarchive-nocache-nosnippet/
data Directive
  = Allow Path
  | Disallow Path
  | CrawlDelay
      { Directive -> Rational
crawlDelay :: Rational,
        Directive -> TimeInterval
timeInterval :: TimeInterval
      }
  | NoArchive Path
  | NoSnippet Path
  | NoTranslate Path
  | -- not used by Google, Yahoo or Live Search/Bing
    -- http://searchengineland.com/a-deeper-look-at-robotstxt-17573
    NoIndex Path
  deriving (Int -> Directive -> ShowS
[Directive] -> ShowS
Directive -> String
(Int -> Directive -> ShowS)
-> (Directive -> String)
-> ([Directive] -> ShowS)
-> Show Directive
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Directive -> ShowS
showsPrec :: Int -> Directive -> ShowS
$cshow :: Directive -> String
show :: Directive -> String
$cshowList :: [Directive] -> ShowS
showList :: [Directive] -> ShowS
Show, Directive -> Directive -> Bool
(Directive -> Directive -> Bool)
-> (Directive -> Directive -> Bool) -> Eq Directive
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Directive -> Directive -> Bool
== :: Directive -> Directive -> Bool
$c/= :: Directive -> Directive -> Bool
/= :: Directive -> Directive -> Bool
Eq)

-- For use in the attoparsec monad, allows to reparse a sub expression
subParser :: Parser a -> ByteString -> Parser a
subParser :: forall a. Parser a -> Path -> Parser a
subParser Parser a
p = (String -> Parser a)
-> (a -> Parser a) -> Either String a -> Parser a
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Parser a -> String -> Parser a
forall a b. a -> b -> a
const Parser a
forall a. Parser Path a
forall (m :: * -> *) a. MonadPlus m => m a
mzero) a -> Parser a
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Either String a -> Parser a)
-> (Path -> Either String a) -> Path -> Parser a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Parser a -> Path -> Either String a
forall a. Parser a -> Path -> Either String a
parseOnly Parser a
p

-- Seems the rational parser is unsecure in the presence of an exponent
-- but since there is no alternative to parse a rational, we just to refuse
-- to parse numbers with 'e' / exponent
-- https://hackage.haskell.org/package/attoparsec-0.12.1.0/docs/Data-Attoparsec-ByteString-Char8.html#v:rational
safeParseRational :: Parser Rational
safeParseRational :: Parser Rational
safeParseRational = do
  (Path
bs, Scientific
_) <- Parser Scientific -> Parser (Path, Scientific)
forall a. Parser a -> Parser (Path, a)
match Parser Scientific
scientific
  if Char -> Path -> Bool
BS.elem Char
'e' Path
bs Bool -> Bool -> Bool
|| Char -> Path -> Bool
BS.elem Char
'E' Path
bs
    then Parser Rational
forall a. Parser Path a
forall (m :: * -> *) a. MonadPlus m => m a
mzero
    else Parser Rational -> Path -> Parser Rational
forall a. Parser a -> Path -> Parser a
subParser Parser Rational
forall a. Fractional a => Parser a
rational Path
bs

-- Yeah, robots.txt should be ASCII, but some sites
-- include the UTF-8 marker at start.
-- We just drop it, but handle the file as ASCII.
dropUTF8BOM :: ByteString -> ByteString
dropUTF8BOM :: Path -> Path
dropUTF8BOM Path
bs =
  if Int -> Path -> Path
BS.take Int
3 Path
bs
    Path -> Path -> Bool
forall a. Eq a => a -> a -> Bool
== ( Char
'\239'
           Char -> Path -> Path
`BS.cons` Char
'\187'
           Char -> Path -> Path
`BS.cons` Char
'\191'
           Char -> Path -> Path
`BS.cons` Path
BS.empty
       )
    then Int -> Path -> Path
BS.drop Int
3 Path
bs
    else Path
bs

parseHourMinute :: Parser (Integer, Integer)
parseHourMinute :: Parser (Integer, Integer)
parseHourMinute = Parser (Integer, Integer)
parseWithColon Parser (Integer, Integer)
-> Parser (Integer, Integer) -> Parser (Integer, Integer)
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser (Integer, Integer)
parseWithoutColon
  where
    parseWithColon :: Parser (Integer, Integer)
parseWithColon = do
      Integer
hours <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
      Parser Path Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path Char -> Parser ()) -> Parser Path Char -> Parser ()
forall a b. (a -> b) -> a -> b
$ Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
':'
      Integer
mins <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
hours, Integer
mins)
    parseWithoutColon :: Parser (Integer, Integer)
parseWithoutColon = do
      Integer
h <- Int -> Parser Path
Data.Attoparsec.ByteString.Char8.take Int
2 Parser Path -> (Path -> Parser Path Integer) -> Parser Path Integer
forall a b. Parser Path a -> (a -> Parser Path b) -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser Path Integer -> Path -> Parser Path Integer
forall a. Parser a -> Path -> Parser a
subParser Parser Path Integer
forall a. Integral a => Parser a
decimal
      Integer
m <- Int -> Parser Path
Data.Attoparsec.ByteString.Char8.take Int
2 Parser Path -> (Path -> Parser Path Integer) -> Parser Path Integer
forall a b. Parser Path a -> (a -> Parser Path b) -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Parser Path Integer -> Path -> Parser Path Integer
forall a. Parser a -> Path -> Parser a
subParser Parser Path Integer
forall a. Integral a => Parser a
decimal
      (Integer, Integer) -> Parser (Integer, Integer)
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
h, Integer
m)

parseTimeInterval :: Parser TimeInterval
parseTimeInterval :: Parser TimeInterval
parseTimeInterval = do
  (Integer
hours_start, Integer
mins_start) <- Parser (Integer, Integer)
parseHourMinute
  Parser () -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser () -> Parser ()) -> Parser () -> Parser ()
forall a b. (a -> b) -> a -> b
$ (Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
'-' Parser Path Char -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace) Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
skipSpace
  (Integer
hours_end, Integer
mins_end) <- Parser (Integer, Integer)
parseHourMinute
  TimeInterval -> Parser TimeInterval
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return
    ( Integer -> DiffTime
secondsToDiffTime (Integer
hours_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_start Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60),
      Integer -> DiffTime
secondsToDiffTime (Integer
hours_end Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
+ Integer
mins_end Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60)
    )

allDay :: TimeInterval
allDay :: TimeInterval
allDay =
  ( Integer -> DiffTime
secondsToDiffTime Integer
0,
    Integer -> DiffTime
secondsToDiffTime (Integer
24 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60) -- because of leap seconds
  )

parseRequestRate :: Parser Directive
parseRequestRate :: Parser Directive
parseRequestRate = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"Request-rate:"
  Integer
docs <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
  Parser Path Char -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path Char -> Parser ()) -> Parser Path Char -> Parser ()
forall a b. (a -> b) -> a -> b
$ Parser ()
skipSpace Parser () -> Parser Path Char -> Parser Path Char
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Char -> Parser Path Char
char Char
'/'
  Integer
ptim <- Parser ()
skipSpace Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path Integer
forall a. Integral a => Parser a
decimal
  Integer
units <-
    Parser ()
skipSpace
      Parser () -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( (Char -> Parser Path Char
char Char
's' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
1 :: Integer))
             Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Path Char
char Char
'm' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
60 :: Integer))
             Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> Parser Path Char
char Char
'h' Parser Path Char -> Parser Path Integer -> Parser Path Integer
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
60 Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
60 :: Integer))
             Parser Path Integer -> Parser Path Integer -> Parser Path Integer
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Integer -> Parser Path Integer
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Integer
1 :: Integer)
         )
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Parser TimeInterval
parseTimeInterval Parser TimeInterval -> Parser TimeInterval -> Parser TimeInterval
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> TimeInterval -> Parser TimeInterval
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return TimeInterval
allDay)
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay ((Integer
ptim Integer -> Integer -> Integer
forall a. Num a => a -> a -> a
* Integer
units) Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
docs) TimeInterval
tint

parseVisitTime :: Parser Directive
parseVisitTime :: Parser Directive
parseVisitTime = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"Visit-time:"
  TimeInterval
tint <- Parser ()
skipSpace Parser () -> Parser TimeInterval -> Parser TimeInterval
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser TimeInterval
parseTimeInterval
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay (Integer
0 Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% Integer
1) TimeInterval
tint

parseCrawlDelay :: Parser Directive
parseCrawlDelay :: Parser Directive
parseCrawlDelay = do
  Rational
delay <- Path -> Parser Path
stringCI Path
"Crawl-Delay:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Rational -> Parser Rational
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Rational
safeParseRational
  Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Directive -> Parser Directive) -> Directive -> Parser Directive
forall a b. (a -> b) -> a -> b
$ Rational -> TimeInterval -> Directive
CrawlDelay Rational
delay TimeInterval
allDay

-- ... yeah.
strip :: ByteString -> ByteString
strip :: Path -> Path
strip = Path -> Path
BS.reverse (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ') (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path
BS.reverse (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ')

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots :: Path -> Either String Robot
parseRobots Path
input = case Either String Robot
parsed of
  -- special case no parsable lines and rubbish
  Right ([], out :: [Path]
out@(Path
_ : [Path]
_)) ->
    String -> Either String Robot
forall a b. a -> Either a b
Left (String
"no parsable lines: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Path] -> String
forall a. Show a => a -> String
show [Path]
out)
  Either String Robot
_ -> Either String Robot
parsed
  where
    parsed :: Either String Robot
parsed =
      Parser Robot -> Path -> Either String Robot
forall a. Parser a -> Path -> Either String a
parseOnly Parser Robot
robotP
        (Path -> Either String Robot)
-> (Path -> Path) -> Path -> Either String Robot
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Path] -> Path
BS.unlines
        -- Filthy hack to account for the fact we don't grab sitemaps
        -- properly. people seem to just whack them anywhere, which makes it
        -- hard to write a nice parser for them.
        ([Path] -> Path) -> (Path -> [Path]) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path -> Bool
BS.isPrefixOf Path
"SITEMAP:" (Path -> Bool) -> (Path -> Path) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> Path -> Path
BS.map Char -> Char
toUpper)
        ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path -> Bool
BS.isPrefixOf Path
"HOST:" (Path -> Bool) -> (Path -> Path) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> Path -> Path
BS.map Char -> Char
toUpper)
        ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Path
x -> Path -> Char
BS.head Path
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#')
        ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Path -> Bool) -> Path -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Bool
BS.null)
        ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Path -> Path) -> [Path] -> [Path]
forall a b. (a -> b) -> [a] -> [b]
map Path -> Path
strip
        ([Path] -> [Path]) -> (Path -> [Path]) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> [Path]
BS.lines
        -- worst way of handling windows newlines ever
        (Path -> [Path]) -> (Path -> Path) -> Path -> [Path]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Path -> Path
BS.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\r')
        (Path -> Path) -> (Path -> Path) -> Path -> Path
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Path -> Path
dropUTF8BOM
        (Path -> Either String Robot) -> Path -> Either String Robot
forall a b. (a -> b) -> a -> b
$ Path
input

robotP :: Parser Robot
robotP :: Parser Robot
robotP = do
  ([([UserAgent], [Directive])]
dirs, [Path]
unparsable) <- [Either ([UserAgent], [Directive]) Path] -> Robot
forall a b. [Either a b] -> ([a], [b])
partitionEithers ([Either ([UserAgent], [Directive]) Path] -> Robot)
-> Parser Path [Either ([UserAgent], [Directive]) Path]
-> Parser Robot
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path (Either ([UserAgent], [Directive]) Path)
-> Parser Path [Either ([UserAgent], [Directive]) Path]
forall a. Parser Path a -> Parser Path [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many (Parser Path ([UserAgent], [Directive])
-> Parser Path
-> Parser Path (Either ([UserAgent], [Directive]) Path)
forall (f :: * -> *) a b.
Alternative f =>
f a -> f b -> f (Either a b)
eitherP Parser Path ([UserAgent], [Directive])
agentDirectiveP Parser Path
unparsableP) Parser Robot -> String -> Parser Robot
forall i a. Parser i a -> String -> Parser i a
<?> String
"robot"
  Robot -> Parser Robot
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return ([([UserAgent], [Directive])]
dirs, (Path -> Bool) -> [Path] -> [Path]
forall a. (a -> Bool) -> [a] -> [a]
filter (Path -> Path -> Bool
forall a. Eq a => a -> a -> Bool
/= Path
"") [Path]
unparsable)

unparsableP :: Parser ByteString
unparsableP :: Parser Path
unparsableP = (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine Parser Path -> Parser () -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine -- char '\n'

agentDirectiveP :: Parser ([UserAgent], [Directive])
agentDirectiveP :: Parser Path ([UserAgent], [Directive])
agentDirectiveP = (,) ([UserAgent] -> [Directive] -> ([UserAgent], [Directive]))
-> Parser Path [UserAgent]
-> Parser Path ([Directive] -> ([UserAgent], [Directive]))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path UserAgent -> Parser Path [UserAgent]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser Path UserAgent
agentP Parser Path ([Directive] -> ([UserAgent], [Directive]))
-> Parser Path [Directive]
-> Parser Path ([UserAgent], [Directive])
forall a b. Parser Path (a -> b) -> Parser Path a -> Parser Path b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Parser Directive -> Parser Path [Directive]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many1 Parser Directive
directiveP Parser Path ([UserAgent], [Directive])
-> String -> Parser Path ([UserAgent], [Directive])
forall i a. Parser i a -> String -> Parser i a
<?> String
"agentDirective"

skipSpace :: Parser ()
skipSpace :: Parser ()
skipSpace = (Char -> Bool) -> Parser ()
skipWhile (\Char
x -> Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ' Bool -> Bool -> Bool
|| Char
x Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'\t')

directiveP :: Parser Directive
directiveP :: Parser Directive
directiveP = do
  Parser ()
skipSpace
  [Parser Directive] -> Parser Directive
forall (f :: * -> *) a. Alternative f => [f a] -> f a
choice
    [ Path -> Parser Path
stringCI Path
"Disallow:"
        Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace
        Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( (Path -> Directive
Disallow (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenP)
               Parser Directive -> Parser Directive -> Parser Directive
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
               -- This requires some explanation.
               -- The RFC suggests that an empty Disallow line means
               -- anything is allowed. Being semantically equivalent to
               -- 'Allow: "/"', I have chosen to change it here
               -- rather than carry the bogus distinction around.
               (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Path -> Directive
Allow Path
"/"))
           ),
      Path -> Parser Path
stringCI Path
"Allow:"
        Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace
        Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( (Path -> Directive
Allow (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenP)
               Parser Directive -> Parser Directive -> Parser Directive
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
               -- If an empty disallow means 'disallow nothing',
               -- an empty allow means 'allow nothing'. Right?
               -- Not sure, actually, but only the americanexpress.com
               -- has such a case, which in one hand I am tempted
               -- to consider an error... but for now:
               (Parser ()
endOfLine Parser () -> Parser Directive -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Directive -> Parser Directive
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return (Path -> Directive
Disallow Path
"/"))
           ),
      Parser Directive
parseCrawlDelay,
      Parser Directive
parseRequestRate,
      Parser Directive
parseVisitTime,
      Path -> Directive
NoArchive (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Noarchive:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP),
      Path -> Directive
NoSnippet (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Nosnippet:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP),
      Path -> Directive
NoTranslate (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Notranslate:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP),
      Path -> Directive
NoIndex (Path -> Directive) -> Parser Path -> Parser Directive
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Path -> Parser Path
stringCI Path
"Noindex:" Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser Path
tokenP)
    ]
    Parser Directive -> Parser () -> Parser Directive
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
commentsP
    Parser Directive -> String -> Parser Directive
forall i a. Parser i a -> String -> Parser i a
<?> String
"directive"

agentP :: Parser UserAgent
agentP :: Parser Path UserAgent
agentP = do
  Parser Path -> Parser ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser Path -> Parser ()) -> Parser Path -> Parser ()
forall a b. (a -> b) -> a -> b
$ Path -> Parser Path
stringCI Path
"user-agent:"
  Parser ()
skipSpace
  ( (Path -> Parser Path
string Path
"*" Parser Path -> Parser Path UserAgent -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> UserAgent -> Parser Path UserAgent
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return UserAgent
Wildcard)
      Parser Path UserAgent
-> Parser Path UserAgent -> Parser Path UserAgent
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Path -> UserAgent
Literal (Path -> UserAgent) -> Parser Path -> Parser Path UserAgent
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Path
tokenWithSpacesP)
    )
    Parser Path UserAgent -> Parser () -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace
    Parser Path UserAgent -> Parser () -> Parser Path UserAgent
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
endOfLine
    Parser Path UserAgent -> String -> Parser Path UserAgent
forall i a. Parser i a -> String -> Parser i a
<?> String
"agent"

commentsP :: Parser ()
commentsP :: Parser ()
commentsP =
  Parser ()
skipSpace
    Parser () -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ( (Path -> Parser Path
string Path
"#" Parser Path -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine Parser Path -> Parser () -> Parser ()
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Parser ()
endOfLine)
           Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser ()
endOfLine
           Parser () -> Parser () -> Parser ()
forall a. Parser Path a -> Parser Path a -> Parser Path a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> () -> Parser ()
forall a. a -> Parser Path a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
       )

tokenP :: Parser ByteString
tokenP :: Parser Path
tokenP = Parser ()
skipSpace Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) Parser Path -> Parser () -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ()
skipSpace

tokenWithSpacesP :: Parser ByteString
tokenWithSpacesP :: Parser Path
tokenWithSpacesP =
  Parser ()
skipSpace
    Parser () -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> (Char -> Bool) -> Parser Path
takeWhile1 (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\Char
c -> Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' Bool -> Bool -> Bool
|| Char -> Bool
AT.isEndOfLine Char
c))
      Parser Path -> Parser Path -> Parser Path
forall a b. Parser Path a -> Parser Path b -> Parser Path a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* (Char -> Bool) -> Parser Path
takeTill Char -> Bool
AT.isEndOfLine

-- I lack the art to make this prettier.
-- Currently does not take into account the CrawlDelay / Request Rate directives
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess :: Path -> Robot -> Path -> Bool
canAccess Path
_ Robot
_ Path
"/robots.txt" = Bool
True -- special-cased
canAccess Path
agent ([([UserAgent], [Directive])]
robot, [Path]
_) Path
path = case (String, [([UserAgent], [Directive])])
-> [([UserAgent], [Directive])] -> [([UserAgent], [Directive])]
forall a b. Show a => a -> b -> b
traceShow (String
"stanzas", [([UserAgent], [Directive])]
stanzas) [([UserAgent], [Directive])]
stanzas of
  [] -> Bool
True
  (([UserAgent]
_, [Directive]
directives) : [([UserAgent], [Directive])]
_) -> [Directive] -> Bool
matchingDirective [Directive]
directives
  where
    stanzas :: [([UserAgent], [Directive])]
stanzas =
      [Maybe ([UserAgent], [Directive])] -> [([UserAgent], [Directive])]
forall a. [Maybe a] -> [a]
catMaybes
        [ (([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent -> Bool) -> [UserAgent] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (UserAgent -> Path -> Bool
`isLiteralSubstring` Path
agent) ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot,
          (([UserAgent], [Directive]) -> Bool)
-> [([UserAgent], [Directive])] -> Maybe ([UserAgent], [Directive])
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find ((UserAgent
Wildcard UserAgent -> [UserAgent] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`) ([UserAgent] -> Bool)
-> (([UserAgent], [Directive]) -> [UserAgent])
-> ([UserAgent], [Directive])
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([UserAgent], [Directive]) -> [UserAgent]
forall a b. (a, b) -> a
fst) [([UserAgent], [Directive])]
robot
        ]

    isLiteralSubstring :: UserAgent -> Path -> Bool
isLiteralSubstring (Literal Path
a) Path
us = Path
a Path -> Path -> Bool
`BS.isInfixOf` Path
us
    isLiteralSubstring UserAgent
_ Path
_ = Bool
False
    matchingDirective :: [Directive] -> Bool
matchingDirective [] = String -> Bool -> Bool
forall a. String -> a -> a
trace String
"empty, so true" Bool
True
    matchingDirective (Directive
x : [Directive]
xs) = case Directive
x of
      Allow Path
robot_path ->
        (String, Path, Path) -> Bool -> Bool
forall a b. Show a => a -> b -> b
traceShow
          (String
"allow?", Path
robot_path, Path
path)
          (Bool -> Bool) -> Bool -> Bool
forall a b. (a -> b) -> a -> b
$ Path
robot_path
            Path -> Path -> Bool
`BS.isPrefixOf` Path
path
            Bool -> Bool -> Bool
|| [Directive] -> Bool
matchingDirective [Directive]
xs
      Disallow Path
robot_path ->
        (String, Path, Path) -> (Bool -> Bool) -> Bool -> Bool
forall a b. Show a => a -> b -> b
traceShow
          (String
"disallow?", Path
robot_path, Path
path)
          Bool -> Bool
not
          (Path
robot_path Path -> Path -> Bool
`BS.isPrefixOf` Path
path)
          Bool -> Bool -> Bool
&& [Directive] -> Bool
matchingDirective [Directive]
xs
      Directive
_ -> [Directive] -> Bool
matchingDirective [Directive]
xs