module Holumbus.Crawler.RobotTypes
where
import Control.DeepSeq
import Data.Binary ( Binary )
import qualified Data.Binary as B
import Data.Char
import qualified Data.Map as M
import Holumbus.Crawler.URIs
import Text.XML.HXT.Core
type Robots = M.Map URI RobotRestriction
type RobotRestriction = [RobotSpec]
type RobotSpec = (URI, RobotAction)
data RobotAction = Disallow | Allow
deriving (Eq, Show, Read, Enum)
type AddRobotsAction = URI -> Robots -> IO Robots
instance Binary RobotAction where
put = B.put . fromEnum
get = B.get >>= return . toEnum
instance NFData RobotAction where
rnf x = x `seq` ()
instance XmlPickler RobotAction where
xpickle = xpPrim
xpRobots :: PU Robots
xpRobots = xpElem "robots" $
xpMap "robot" "host" xpText xpRobotRestriction
xpRobotRestriction :: PU RobotRestriction
xpRobotRestriction = xpList $
xpElem "restriction" $
xpPair ( xpAttr "href" $ xpText )
( xpAttr "access" $ xpickle )
emptyRobots :: Robots
emptyRobots = M.singleton "" []
robotsExtend :: String -> AddRobotsAction
robotsExtend _robotName _uri robots
= return robots
robotsIndex :: URI -> Robots -> Bool
robotsIndex _uri _robots
= True
robotsFollow :: URI -> Robots -> Bool
robotsFollow _uri _robots
= True
robotsNo :: String -> LA XmlTree XmlTree
robotsNo what = none
`when`
( this
/> hasName "html"
/> hasName "head"
/> hasName "meta"
>>>
hasAttrValue "name" ( map toUpper
>>>
(== "ROBOTS")
)
>>>
getAttrValue0 "content"
>>>
isA ( map (toUpper
>>>
(\ x -> if isLetter x then x else ' ')
)
>>>
words
>>>
(what `elem`)
)
)
robotsNoIndex :: ArrowXml a => a XmlTree XmlTree
robotsNoIndex = fromLA $ robotsNo "NOINDEX"
robotsNoFollow :: ArrowXml a => a XmlTree XmlTree
robotsNoFollow = fromLA $ robotsNo "NOFOLLOW"