{-# LANGUAGE OverloadedStrings #-}
module Network.HTTP.Robots where

import           Control.Applicative
import           Control.Monad
import           Data.Attoparsec.ByteString.Char8 hiding (skipSpace)
import qualified Data.Attoparsec.Text             as AT (isEndOfLine)
import           Data.ByteString.Char8            (ByteString)
import qualified Data.ByteString.Char8            as BS
import           Data.Char                        (toUpper)
import           Data.Either                      (partitionEithers)
import           Data.List                        (find)
import           Data.Maybe                       (catMaybes)
import           Data.Ratio
import           Data.Time.Clock
import           Data.Time.LocalTime              ()

type Robot = ([([UserAgent], [Directive])], [Unparsable])

type Unparsable = ByteString

data UserAgent = Wildcard | Literal ByteString
  deriving (Show,Eq)
type Path = ByteString

-- http://www.conman.org/people/spc/robots2.html
-- This was never actually accepted as a standard,
-- but some sites do use it.
type TimeInterval = (DiffTime, DiffTime)

-- Crawldelay may have a decimal point
-- http://help.yandex.com/webmaster/controlling-robot/robots-txt.xml
-- Added directives NoArchive, NoSnippet, NoTranslate, SiteMap.
-- http://bloganddiscussion.com/anythingcomputer/1/robots-txt-noarchive-nocache-nosnippet/
data Directive = Allow Path
               | Disallow Path
               | CrawlDelay { crawlDelay   :: Rational
                            , timeInterval :: TimeInterval
                            }
               | NoArchive Path
               | NoSnippet Path
               | NoTranslate Path
               -- not used by Google, Yahoo or Live Search/Bing
               -- http://searchengineland.com/a-deeper-look-at-robotstxt-17573
               | NoIndex Path
  deriving (Show,Eq)
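
-- For illustration, a few (made-up) robots.txt lines and the directives the
-- parsers below should map them to (allDay, defined further down, is the
-- whole 00:00-24:00 interval):
--
--   Disallow: /private      ~>  Disallow "/private"
--   Crawl-delay: 2.5        ~>  CrawlDelay (5 % 2) allDay
--   Request-rate: 1/5s      ~>  CrawlDelay (5 % 1) allDay
--   Visit-time: 0600-0845   ~>  CrawlDelay (0 % 1) (the 06:00-08:45 interval)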

-- For use in the attoparsec monad: lets us re-parse a sub-expression with another parser.
subParser :: Parser a -> ByteString -> Parser a
subParser p = either (const mzero) return . parseOnly p
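
-- For example, a rough GHCi sketch of the intended use: re-parse a chunk of
-- the input with a second parser.
--
--   parseOnly (Data.Attoparsec.ByteString.Char8.take 2 >>= subParser decimal)
--             "12:30"
--     == Right 12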


-- The rational parser seems unsafe in the presence of an exponent,
-- but since there is no alternative way to parse a Rational, we simply
-- refuse to parse numbers containing an 'e' / exponent.
-- https://hackage.haskell.org/package/attoparsec-0.12.1.0/docs/Data-Attoparsec-ByteString-Char8.html#v:rational
safeParseRational :: Parser Rational
safeParseRational = do
  (bs,_) <- match scientific
  if BS.elem 'e' bs || BS.elem 'E' bs
    then mzero
    else subParser rational bs
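
-- For example (a rough sketch):
--
--   parseOnly safeParseRational "2.5"  ==  Right (5 % 2)
--   parseOnly safeParseRational "1e9"  yields a Left, rejected on purpose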

-- Yeah, robots.txt should be ASCII, but some sites
-- include the UTF-8 byte-order mark (BOM) at the start.
-- We just drop it and handle the rest of the file as ASCII.
dropUTF8BOM :: ByteString -> ByteString
dropUTF8BOM bs = if BS.take 3 bs == ( '\239' `BS.cons`
                                      '\187' `BS.cons`
                                      '\191' `BS.cons` BS.empty)
                   then BS.drop 3 bs
                   else bs
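
-- For example:
--
--   dropUTF8BOM "\239\187\191User-agent: *" == "User-agent: *"
--   dropUTF8BOM "User-agent: *"             == "User-agent: *"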

parseHourMinute :: Parser (Integer,Integer)
parseHourMinute = parseWithColon <|> parseWithoutColon
  where
    parseWithColon = do
      hours <- skipSpace >> decimal
      void         $ skipSpace >> char ':'
      mins  <- skipSpace >> decimal
      return (hours,mins)
    parseWithoutColon = do
      h <- Data.Attoparsec.ByteString.Char8.take 2 >>= subParser decimal
      m <- Data.Attoparsec.ByteString.Char8.take 2 >>= subParser decimal
      return (h,m)

parseTimeInterval :: Parser TimeInterval
parseTimeInterval = do
  (hours_start, mins_start) <- parseHourMinute
  void         $ (skipSpace >> char '-' >> skipSpace) <|> skipSpace
  (hours_end  , mins_end  ) <- parseHourMinute
  return ( secondsToDiffTime (hours_start * 60 * 60 + mins_start * 60)
         , secondsToDiffTime (hours_end   * 60 * 60 + mins_end   * 60))
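
-- For example, both the colon and the colon-less forms should parse
-- (a rough sketch):
--
--   parseOnly parseTimeInterval "10:00-17:45"
--     == Right (secondsToDiffTime 36000, secondsToDiffTime 63900)
--   parseOnly parseTimeInterval "1000-1745"    (same result)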

allDay :: TimeInterval
allDay =  ( secondsToDiffTime 0
          , secondsToDiffTime (24*60*60) -- a nominal 24-hour day; leap seconds are ignored
          )

parseRequestRate :: Parser Directive
parseRequestRate = do
  void $ stringCI "Request-rate:"
  docs <- skipSpace >> decimal
  void $  skipSpace >> char '/'
  ptim <- skipSpace >> decimal
  units<- skipSpace >>   (  (char 's' >> return (    1 :: Integer))
                        <|> (char 'm' >> return (   60 :: Integer))
                        <|> (char 'h' >> return (60*60 :: Integer))
                        <|>              return (    1 :: Integer)
                         )
  tint <- skipSpace >> ( parseTimeInterval <|> return allDay)
  return $ CrawlDelay ((ptim * units) % docs) tint
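
-- For example, "Request-rate: 1/5s" means at most one document every five
-- seconds, which we fold into a crawl delay (a rough sketch):
--
--   parseOnly parseRequestRate "Request-rate: 1/5s"
--     == Right (CrawlDelay (5 % 1) allDay)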

parseVisitTime :: Parser Directive
parseVisitTime = do
  void $ stringCI "Visit-time:"
  tint <- skipSpace >> parseTimeInterval
  return $ CrawlDelay ( 0 % 1) tint
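
-- For example (a rough sketch):
--
--   parseOnly parseVisitTime "Visit-time: 0600-0845"
--     == Right (CrawlDelay (0 % 1) ( secondsToDiffTime (6*60*60)
--                                  , secondsToDiffTime (8*60*60 + 45*60)))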

parseCrawlDelay :: Parser Directive
parseCrawlDelay = do
  delay <- stringCI "Crawl-Delay:" >> skipSpace >> safeParseRational
  return $ CrawlDelay delay allDay

-- Trim leading and trailing spaces. Not pretty, but it does the job.
strip :: ByteString -> ByteString
strip = BS.reverse . BS.dropWhile (==' ') . BS.reverse . BS.dropWhile (==' ')

-- | parseRobots is the main entry point for parsing a robots.txt file.
parseRobots :: ByteString -> Either String Robot
parseRobots input = case parsed of
  -- special case: no parsable lines at all, just rubbish
  Right ([], out@(_:_)) ->
    Left ("no parsable lines: " ++ show out)
  _ -> parsed

  where parsed = parseOnly robotP
                 . BS.unlines
                 -- Filthy hack to account for the fact we don't grab sitemaps
                 -- properly. people seem to just whack them anywhere, which
                 -- makes it hard to write a nice parser for them.
                 . filter (not . BS.isPrefixOf "SITEMAP:" . BS.map toUpper)
                 . filter (not . BS.isPrefixOf "HOST:"    . BS.map toUpper)
                 . filter (\x -> BS.head x /= '#')
                 . filter (not . BS.null)
                 . map strip
                 . BS.lines
                 -- worst way of handling windows newlines ever
                 . BS.filter (/= '\r')
                 . dropUTF8BOM
                 $ input
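
-- A rough usage sketch (the robots.txt content is made up):
--
--   parseRobots "User-agent: *\nDisallow: /private\nCrawl-delay: 2\n"
--     == Right ( [([Wildcard], [Disallow "/private", CrawlDelay (2 % 1) allDay])]
--              , [] )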

robotP :: Parser Robot
robotP = do
  (dirs, unparsable) <- partitionEithers <$> many (eitherP agentDirectiveP unparsableP) <?> "robot"
  return (dirs, filter (/= "") unparsable)

unparsableP :: Parser ByteString
unparsableP = takeTill AT.isEndOfLine <* endOfLine -- char '\n'

agentDirectiveP :: Parser ([UserAgent],[Directive])
agentDirectiveP = (,) <$> many1 agentP <*> many1 directiveP <?> "agentDirective"


skipSpace :: Parser ()
skipSpace = skipWhile (\x -> x==' ' || x == '\t')

directiveP :: Parser Directive
directiveP = do
  skipSpace
  choice [ stringCI "Disallow:" >> skipSpace >>
             ((Disallow <$> tokenP) <|>
              -- This requires some explanation.
              -- The RFC suggests that an empty Disallow line means
              -- anything is allowed. Being semantically equivalent to
              -- 'Allow: "/"', I have chosen to change it here
              -- rather than carry the bogus distinction around.
              (endOfLine >> return (Allow    "/")))
         , stringCI "Allow:" >> skipSpace >>
             ((Allow <$> tokenP) <|>
              -- If an empty Disallow means 'disallow nothing',
              -- an empty Allow should mean 'allow nothing'. Right?
              -- Not sure, actually, but only americanexpress.com
              -- has such a case, which on the one hand I am tempted
              -- to consider an error... but for now:
              (endOfLine >> return (Disallow "/")))
         , parseCrawlDelay
         , parseRequestRate
         , parseVisitTime
         , NoArchive   <$> (stringCI "Noarchive:"   >> skipSpace >> tokenP)
         , NoSnippet   <$> (stringCI "Nosnippet:"   >> skipSpace >> tokenP)
         , NoTranslate <$> (stringCI "Notranslate:" >> skipSpace >> tokenP)
         , NoIndex     <$> (stringCI "Noindex:"     >> skipSpace >> tokenP)
         ] <* commentsP <?> "directive"
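
-- In particular, the empty-line normalisation above means (a rough sketch):
--
--   parseOnly directiveP "Disallow:\n" == Right (Allow    "/")
--   parseOnly directiveP "Allow:\n"    == Right (Disallow "/")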

agentP :: Parser UserAgent
agentP = do
  void $ stringCI "user-agent:"
  skipSpace
  ((string "*" >> return Wildcard) <|>
   (Literal  <$> tokenWithSpacesP)) <* skipSpace <* endOfLine <?> "agent"


commentsP :: Parser ()
commentsP = skipSpace >>
            (   (string "#" >> takeTill AT.isEndOfLine >> endOfLine)
            <|> endOfLine
            <|> return ())


tokenP :: Parser ByteString
tokenP = skipSpace >> takeWhile1 (not . isSpace) <* skipSpace

tokenWithSpacesP :: Parser ByteString
tokenWithSpacesP = skipSpace >> takeWhile1 (not . (\c -> c == '#' || AT.isEndOfLine c))
                             <* takeTill AT.isEndOfLine

-- I lack the art to make this prettier.
-- Currently does not take into account the CrawlDelay / Request Rate directives
canAccess :: ByteString -> Robot -> Path -> Bool
canAccess _ _ "/robots.txt" = True -- special-cased
canAccess agent (robot,_) path = case stanzas of
  [] -> True
  ((_,directives):_) -> matchingDirective directives
  where stanzas = catMaybes [find (any (`isLiteralSubstring` agent)  . fst) robot,
                             find ((Wildcard `elem`) . fst) robot]


        isLiteralSubstring (Literal a) us = a `BS.isInfixOf` us
        isLiteralSubstring _ _ = False
        matchingDirective [] = True
        matchingDirective (x:xs) = case x of
          Allow robot_path ->
            robot_path `BS.isPrefixOf` path || matchingDirective xs
          Disallow robot_path ->
            not (robot_path `BS.isPrefixOf` path) && matchingDirective xs

          _ -> matchingDirective xs
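
-- A small end-to-end sketch (the robots.txt content is made up):
--
--   case parseRobots "User-agent: *\nDisallow: /private\n" of
--     Left  err   -> error err
--     Right robot -> canAccess "SomeBot" robot "/private/data"
--
-- which should evaluate to False, while a path like "/index.html" would be
-- allowed.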