{-# OPTIONS #-}

-- ------------------------------------------------------------

module Holumbus.Crawler.Types
where

import           Control.DeepSeq
import           Control.Monad.Reader
import           Control.Monad.State
import           Control.Monad.ReaderStateIO
-- import        Control.Monad.State

import           Data.Binary                    ( Binary )
import qualified Data.Binary                    as B    -- else naming conflict with put and get from Monad.State
import           Data.Function.Selector

import           Holumbus.Crawler.Constants
import           Holumbus.Crawler.URIs
import           Holumbus.Crawler.RobotTypes
import           Holumbus.Crawler.XmlArrows     ( checkDocumentStatus )

import           Text.XML.HXT.Core
import           Text.XML.HXT.Curl

import qualified Text.XML.HXT.Arrow.XmlState.RunIOStateArrow as HXT ( theSysConfigComp )
import qualified Text.XML.HXT.Arrow.XmlState.TypeDefs        as HXT ( theInputOptions )

import           System.Log.Logger              ( Priority(..) )

-- ------------------------------------------------------------

-- | The action to combine the result of a single document with the accumulator for the overall crawler result.
-- This combining function runs in the IO monad to enable storing parts of the result externally,
-- but it is not a CrawlerAction, otherwise parallel crawling with forkIO would no longer be applicable.

type AccumulateDocResult a r    = (URI, a) -> r -> IO r

-- | The folding operator for merging partial results when working with mapFold and parallel crawling

type MergeDocResults       r    = r -> r -> IO r

-- | The operator for saving intermediate results

type SavePartialResults    r    = FilePath -> r -> IO r

-- | The extractor function for a single document

type ProcessDocument       a    = IOSArrow XmlTree a

-- | The crawler action monad

type CrawlerAction a r          = ReaderStateIO (CrawlerConfig a r) (CrawlerState r)

-- | The crawler configuration record

data CrawlerConfig a r
    = CrawlerConfig
      { cc_sysConfig        :: SysConfig
      , cc_preRefsFilter    :: IOSArrow XmlTree XmlTree
      , cc_processRefs      :: IOSArrow XmlTree URI
      , cc_preDocFilter     :: IOSArrow XmlTree XmlTree
      , cc_processDoc       :: ProcessDocument a
      , cc_accumulate       :: AccumulateDocResult a r          -- result accumulation runs in the IO monad to allow storing parts externally
      , cc_fold             :: MergeDocResults r
      , cc_followRef        :: URI -> Bool
      , cc_addRobotsTxt     :: CrawlerConfig a r -> AddRobotsAction
      , cc_clickLevel       :: ! Int
      , cc_maxNoOfDocs      :: ! Int
      , cc_maxParDocs       :: ! Int
      , cc_maxParThreads    :: ! Int
      , cc_saveIntervall    :: ! Int
      , cc_savePathPrefix   :: ! String
      , cc_savePreAction    :: FilePath -> CrawlerAction a r () -- SavePartialResults r
      , cc_traceLevel       :: ! Priority
      , cc_traceLevelHxt    :: ! Priority
      }

-- | The crawler state record

data CrawlerState r
    = CrawlerState
      { cs_toBeProcessed    :: ! URIsWithLevel
      , cs_alreadyProcessed :: ! URIs
      , cs_robots           :: ! Robots             -- is part of the state, it will grow during crawling
      , cs_noOfDocs         :: ! Int                -- stop crawling when this counter reaches 0, (-1) means unlimited # of docs
      , cs_noOfDocsSaved    :: ! Int
      , cs_listOfDocsSaved  :: ! [Int]
      , cs_resultAccu       :: ! r                  -- evaluate accumulated result, else memory leaks show up
      , cs_resultInit       :: ! r                  -- the initial value for folding results
      }
      deriving (Show)
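-- ------------------------------------------------------------

-- Illustrative sketch, not part of the original module: a possible pair of
-- accumulate and merge operations matching the AccumulateDocResult and
-- MergeDocResults types above, for a crawler whose per-document result is a
-- title string and whose overall result is a list of (URI, title) pairs.
-- The names collectTitle and mergeTitles are hypothetical.

collectTitle            :: AccumulateDocResult String [(URI, String)]
collectTitle (uri, title) acc
                        = return ((uri, title) : acc)

mergeTitles             :: MergeDocResults [(URI, String)]
mergeTitles xs ys       = return (xs ++ ys)

-- ------------------------------------------------------------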
instance (NFData r) => NFData (CrawlerState r) where
    rnf CrawlerState { cs_toBeProcessed    = a
                     , cs_alreadyProcessed = b
                     , cs_robots           = c
                     , cs_noOfDocs         = d
                     , cs_noOfDocsSaved    = e
                     , cs_listOfDocsSaved  = f
                     , cs_resultAccu       = g
                     , cs_resultInit       = h
                     }          = rnf a `seq` rnf b `seq` rnf c `seq` rnf d `seq`
                                  rnf e `seq` rnf f `seq` rnf g `seq` rnf h

instance (XmlPickler r) => XmlPickler (CrawlerState r) where
    xpickle             = xpElem "crawler-state" $
                          xpWrap ( \ ((d, e, f), (a, b, c, g, h)) -> CrawlerState a b c d e f g h
                                 , \ (CrawlerState a b c d e f g h) -> ((d, e, f), (a, b, c, g, h))
                                 ) $
                          xpPair ( xpTriple ( xpAttr "no-of-docs"         xpPrim )
                                            ( xpAttr "no-of-docs-saved"   xpPrim )
                                            ( xpAttr "list-of-docs-saved" $ xpList $ xpElem "saved-at" $ xpPrim )
                                 )
                                 ( xp5Tuple ( xpElem "to-be-processed"   $ xpURIsWithLevel )
                                            ( xpElem "already-processed" $ xpURIs )
                                            xpRobots
                                            xpickle
                                            xpickle
                                 )
        where
          xpURIs            = xpWrap ( fromListURIs, toListURIs ) $
                              xpList $ xpElem "doc" $ xpAttr "href" $ xpText
          xpURIsWithLevel   = xpWrap ( fromListURIs', toListURIs' ) $
                              xpList $ xpElem "doc" $
                              xpPair ( xpAttr "href" $ xpText )
                                     ( xpAttr "clicklevel" xpInt )

-- ------------------------------------------------------------

-- | selector functions for CrawlerState

theToBeProcessed        :: Selector (CrawlerState r) URIsWithLevel
theToBeProcessed        = S cs_toBeProcessed    (\ x s -> s {cs_toBeProcessed = x})

theAlreadyProcessed     :: Selector (CrawlerState r) URIs
theAlreadyProcessed     = S cs_alreadyProcessed (\ x s -> s {cs_alreadyProcessed = x})

theRobots               :: Selector (CrawlerState r) Robots
theRobots               = S cs_robots           (\ x s -> s {cs_robots = x})

theNoOfDocs             :: Selector (CrawlerState r) Int
theNoOfDocs             = S cs_noOfDocs         (\ x s -> s {cs_noOfDocs = x})

theNoOfDocsSaved        :: Selector (CrawlerState r) Int
theNoOfDocsSaved        = S cs_noOfDocsSaved    (\ x s -> s {cs_noOfDocsSaved = x})

theListOfDocsSaved      :: Selector (CrawlerState r) [Int]
theListOfDocsSaved      = S cs_listOfDocsSaved  (\ x s -> s {cs_listOfDocsSaved = x})

theResultAccu           :: Selector (CrawlerState r) r
theResultAccu           = S cs_resultAccu       (\ x s -> s {cs_resultAccu = x})

theResultInit           :: Selector (CrawlerState r) r
theResultInit           = S cs_resultInit       (\ x s -> s {cs_resultInit = x})
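-- ------------------------------------------------------------

-- Illustrative sketch, not part of the original module: the selectors above
-- are plain Data.Function.Selector values, so they compose with getS, setS
-- and chgS. The helper incrNoOfDocs is hypothetical.

incrNoOfDocs            :: CrawlerState r -> CrawlerState r
incrNoOfDocs            = chgS theNoOfDocs (+ 1)

-- ------------------------------------------------------------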
-- | selector functions for CrawlerConfig

theSysConfig            :: Selector (CrawlerConfig a r) SysConfig
theSysConfig            = S cc_sysConfig        (\ x s -> s {cc_sysConfig = x})

theTraceLevel           :: Selector (CrawlerConfig a r) Priority
theTraceLevel           = S cc_traceLevel       (\ x s -> s {cc_traceLevel = x})

theTraceLevelHxt        :: Selector (CrawlerConfig a r) Priority
theTraceLevelHxt        = S cc_traceLevelHxt    (\ x s -> s {cc_traceLevelHxt = x})

theClickLevel           :: Selector (CrawlerConfig a r) Int
theClickLevel           = S cc_clickLevel       (\ x s -> s {cc_clickLevel = x})

theMaxNoOfDocs          :: Selector (CrawlerConfig a r) Int
theMaxNoOfDocs          = S cc_maxNoOfDocs      (\ x s -> s {cc_maxNoOfDocs = x})

theMaxParDocs           :: Selector (CrawlerConfig a r) Int
theMaxParDocs           = S cc_maxParDocs       (\ x s -> s {cc_maxParDocs = x})

theMaxParThreads        :: Selector (CrawlerConfig a r) Int
theMaxParThreads        = S cc_maxParThreads    (\ x s -> s {cc_maxParThreads = x})

theSaveIntervall        :: Selector (CrawlerConfig a r) Int
theSaveIntervall        = S cc_saveIntervall    (\ x s -> s {cc_saveIntervall = x})

theSavePathPrefix       :: Selector (CrawlerConfig a r) String
theSavePathPrefix       = S cc_savePathPrefix   (\ x s -> s {cc_savePathPrefix = x})

theSavePreAction        :: Selector (CrawlerConfig a r) (FilePath -> CrawlerAction a r ())    -- (SavePartialResults r)
theSavePreAction        = S cc_savePreAction    (\ x s -> s {cc_savePreAction = x})

theFollowRef            :: Selector (CrawlerConfig a r) (URI -> Bool)
theFollowRef            = S cc_followRef        (\ x s -> s {cc_followRef = x})

theAddRobotsAction      :: Selector (CrawlerConfig a r) (CrawlerConfig a r -> AddRobotsAction)
theAddRobotsAction      = S cc_addRobotsTxt     (\ x s -> s {cc_addRobotsTxt = x})

theAccumulateOp         :: Selector (CrawlerConfig a r) (AccumulateDocResult a r)
theAccumulateOp         = S cc_accumulate       (\ x s -> s {cc_accumulate = x})

theFoldOp               :: Selector (CrawlerConfig a r) (MergeDocResults r)
theFoldOp               = S cc_fold             (\ x s -> s {cc_fold = x})

thePreRefsFilter        :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
thePreRefsFilter        = S cc_preRefsFilter    (\ x s -> s {cc_preRefsFilter = x})

theProcessRefs          :: Selector (CrawlerConfig a r) (IOSArrow XmlTree URI)
theProcessRefs          = S cc_processRefs      (\ x s -> s {cc_processRefs = x})

thePreDocFilter         :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
thePreDocFilter         = S cc_preDocFilter     (\ x s -> s {cc_preDocFilter = x})

theProcessDoc           :: Selector (CrawlerConfig a r) (IOSArrow XmlTree a)
theProcessDoc           = S cc_processDoc       (\ x s -> s {cc_processDoc = x})

-- ------------------------------------------------------------

-- | A rather boring default crawler configuration

defaultCrawlerConfig    :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
defaultCrawlerConfig op op2
    = CrawlerConfig
      { cc_sysConfig        = ( withCurl [ (curl_user_agent,      defaultCrawlerName)
                                         , (curl_max_time,        show $ (60 * 1000::Int))  -- whole transaction for reading a document must complete within 60,000 milliseconds
                                         , (curl_connect_timeout, show $ (10::Int))         -- connection must be established within 10 seconds
                                         ]
                              )
      , cc_preRefsFilter    = this                      -- no preprocessing for refs extraction
      , cc_processRefs      = none                      -- don't extract refs
      , cc_preDocFilter     = checkDocumentStatus       -- default: in case of errors throw away any contents
      , cc_processDoc       = none                      -- no document processing at all
      , cc_accumulate       = op                        -- combining function for result accumulating
      , cc_fold             = op2
      , cc_followRef        = const False               -- do not follow any refs
      , cc_addRobotsTxt     = const $ const return      -- do not add robots.txt evaluation
      , cc_saveIntervall    = (-1)                      -- never save an intermediate state
      , cc_savePathPrefix   = "/tmp/hc-"                -- the prefix for filenames into which intermediate states are saved
      , cc_savePreAction    = const $ return ()         -- no action before saving state
      , cc_clickLevel       = maxBound                  -- click level set to infinity
      , cc_maxNoOfDocs      = (-1)                      -- maximum # of docs to be crawled, -1 means unlimited
      , cc_maxParDocs       = 20                        -- maximum # of docs crawled in parallel
      , cc_maxParThreads    = 5                         -- maximum # of threads running in parallel
      , cc_traceLevel       = NOTICE                    -- trace level
      , cc_traceLevelHxt    = WARNING                   -- trace level for hxt
      }

theInputOptions         :: Selector (CrawlerConfig a r) Attributes
theInputOptions         = theSysConfig
                          >>>
                          HXT.theSysConfigComp HXT.theInputOptions

theCrawlerName          :: Selector (CrawlerConfig a r) String
theCrawlerName          = theInputOptions
                          >>>
                          S { getS = lookupDef defaultCrawlerName curl_user_agent
                            , setS = addEntry curl_user_agent
                            }

theMaxTime              :: Selector (CrawlerConfig a r) Int
theMaxTime              = theInputOptions
                          >>>
                          S { getS = read . lookupDef "0" curl_max_time
                            , setS = addEntry curl_max_time . show . (`max` 1)
                            }

theConnectTimeout       :: Selector (CrawlerConfig a r) Int
theConnectTimeout       = theInputOptions
                          >>>
                          S { getS = read . lookupDef "0" curl_connect_timeout
                            , setS = addEntry curl_connect_timeout . show . (`max` 1)
                            }
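-- ------------------------------------------------------------

-- Illustrative sketch, not part of the original module: the composite
-- selectors above read and update entries inside the curl input options,
-- so the user agent and the timeouts can be changed like plain fields.
-- The helper setAgentAndTimeouts is hypothetical.

setAgentAndTimeouts     :: String -> Int -> Int -> CrawlerConfig a r -> CrawlerConfig a r
setAgentAndTimeouts agent maxT connT
                        = setS theCrawlerName agent
                          >>>
                          setS theMaxTime maxT
                          >>>
                          setS theConnectTimeout connT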
-- ------------------------------------------------------------

-- | Add attributes for accessing documents

addSysConfig            :: SysConfig -> CrawlerConfig a r -> CrawlerConfig a r
addSysConfig cf         = chgS theSysConfig ( >>> cf )

-- | Insert a robots no follow filter before thePreRefsFilter

addRobotsNoFollow       :: CrawlerConfig a r -> CrawlerConfig a r
addRobotsNoFollow       = chgS thePreRefsFilter ( robotsNoFollow >>> )

-- | Insert a robots no index filter before thePreDocFilter

addRobotsNoIndex        :: CrawlerConfig a r -> CrawlerConfig a r
addRobotsNoIndex        = chgS thePreDocFilter ( robotsNoIndex >>> )

-- | Set the log level

setCrawlerTraceLevel    :: Priority -> Priority -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerTraceLevel l lx
                        = setS theTraceLevel l
                          >>>
                          setS theTraceLevelHxt lx

-- | Set save interval in config

setCrawlerSaveConf      :: Int -> String -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerSaveConf i f  = setS theSaveIntervall i
                          >>>
                          setS theSavePathPrefix f

-- | Set action performed before saving crawler state

setCrawlerSaveAction    :: (FilePath -> CrawlerAction a r ()) -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerSaveAction f  = setS theSavePreAction f

-- | Set max # of steps (clicks) to reach a document

setCrawlerClickLevel    :: Int -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerClickLevel mcl
                        = setS theClickLevel mcl

-- | Set max # of documents to be crawled,
-- max # of documents crawled in parallel,
-- and max # of parallel threads

setCrawlerMaxDocs       :: Int -> Int -> Int -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerMaxDocs mxd mxp mxt
                        = setS theMaxNoOfDocs mxd
                          >>>
                          setS theMaxParDocs mxp
                          >>>
                          setS theMaxParThreads mxt

-- | Set the pre hook filter executed before the hrefs are collected

setCrawlerPreRefsFilter :: IOSArrow XmlTree XmlTree -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerPreRefsFilter f
                        = setS thePreRefsFilter f
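-- ------------------------------------------------------------

-- Illustrative sketch, not part of the original module: a typical
-- configuration is built by refining defaultCrawlerConfig with the
-- setters above. All concrete values here are hypothetical.

exampleCrawlerConfig    :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
exampleCrawlerConfig op op2
                        = ( setCrawlerMaxDocs 10000 20 5
                            >>>
                            setCrawlerSaveConf 500 "/tmp/hc-"
                            >>>
                            setCrawlerTraceLevel NOTICE WARNING
                          ) $ defaultCrawlerConfig op op2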
-- ------------------------------------------------------------

instance (Binary r) => Binary (CrawlerState r) where
    put s               = do
                          B.put (getS theToBeProcessed s)
                          B.put (getS theAlreadyProcessed s)
                          B.put (getS theRobots s)
                          B.put (getS theNoOfDocs s)
                          B.put (getS theNoOfDocsSaved s)
                          B.put (getS theListOfDocsSaved s)
                          B.put (getS theResultAccu s)
                          B.put (getS theResultInit s)
    get                 = do
                          tbp <- B.get
                          alp <- B.get
                          rbt <- B.get
                          mxd <- B.get
                          mxs <- B.get
                          lsd <- B.get
                          acc <- B.get
                          ini <- B.get
                          return $ CrawlerState
                                   { cs_toBeProcessed    = tbp
                                   , cs_alreadyProcessed = alp
                                   , cs_robots           = rbt
                                   , cs_noOfDocs         = mxd
                                   , cs_noOfDocsSaved    = mxs
                                   , cs_listOfDocsSaved  = lsd
                                   , cs_resultAccu       = acc
                                   , cs_resultInit       = ini
                                   }

putCrawlerState         :: (Binary r) => CrawlerState r -> B.Put
putCrawlerState         = B.put

getCrawlerState         :: (Binary r) => B.Get (CrawlerState r)
getCrawlerState         = B.get

initCrawlerState        :: r -> CrawlerState r
initCrawlerState r      = CrawlerState
                          { cs_toBeProcessed    = emptyURIs
                          , cs_alreadyProcessed = emptyURIs
                          , cs_robots           = emptyRobots
                          , cs_noOfDocs         = 0
                          , cs_noOfDocsSaved    = 0
                          , cs_listOfDocsSaved  = []
                          , cs_resultAccu       = r
                          , cs_resultInit       = r
                          }

-- ------------------------------------------------------------
--
-- basic crawler actions

-- | Load a component from the crawler configuration

getConf                 :: Selector (CrawlerConfig a r) v -> CrawlerAction a r v
getConf                 = asks . getS

getState                :: Selector (CrawlerState r) v -> CrawlerAction a r v
getState                = gets . getS

putState                :: Selector (CrawlerState r) v -> v -> CrawlerAction a r ()
putState sel            = modify . setS sel

modifyState             :: Selector (CrawlerState r) v -> (v -> v) -> CrawlerAction a r ()
modifyState sel         = modify . chgS sel

modifyStateIO           :: Selector (CrawlerState r) v -> (v -> IO v) -> CrawlerAction a r ()
modifyStateIO sel       = modifyIO . chgM sel

-- ------------------------------------------------------------
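-- Illustrative sketch, not part of the original module: a small crawler
-- action built from the primitives above. It bumps the document counter in
-- the state and reports whether the configured maximum has not yet been
-- exceeded. The name countProcessedDoc is hypothetical.

countProcessedDoc       :: CrawlerAction a r Bool
countProcessedDoc       = do
                          modifyState theNoOfDocs (+ 1)
                          mxd <- getConf theMaxNoOfDocs
                          n   <- getState theNoOfDocs
                          return (mxd < 0 || n <= mxd)

-- ------------------------------------------------------------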