Safe Haskell | None |
---|
- type AccumulateDocResult a r = (URI, a) -> r -> IO r
- type MergeDocResults r = r -> r -> IO r
- type SavePartialResults r = FilePath -> r -> IO r
- type ProcessDocument a = IOSArrow XmlTree a
- type CrawlerAction a r = ReaderStateIO (CrawlerConfig a r) (CrawlerState r)
- data CrawlerConfig a r = CrawlerConfig {
- cc_sysConfig :: SysConfig
- cc_preRefsFilter :: IOSArrow XmlTree XmlTree
- cc_processRefs :: IOSArrow XmlTree URI
- cc_preDocFilter :: IOSArrow XmlTree XmlTree
- cc_processDoc :: ProcessDocument a
- cc_accumulate :: AccumulateDocResult a r
- cc_fold :: MergeDocResults r
- cc_followRef :: URI -> Bool
- cc_addRobotsTxt :: CrawlerConfig a r -> AddRobotsAction
- cc_clickLevel :: !Int
- cc_maxNoOfDocs :: !Int
- cc_maxParDocs :: !Int
- cc_maxParThreads :: !Int
- cc_saveIntervall :: !Int
- cc_savePathPrefix :: !String
- cc_savePreAction :: FilePath -> CrawlerAction a r ()
- cc_traceLevel :: !Priority
- cc_traceLevelHxt :: !Priority
- data CrawlerState r = CrawlerState {
- cs_toBeProcessed :: !URIsWithLevel
- cs_alreadyProcessed :: !URIs
- cs_robots :: !Robots
- cs_noOfDocs :: !Int
- cs_noOfDocsSaved :: !Int
- cs_listOfDocsSaved :: ![Int]
- cs_resultAccu :: !r
- cs_resultInit :: !r
- theToBeProcessed :: Selector (CrawlerState r) URIsWithLevel
- theAlreadyProcessed :: Selector (CrawlerState r) URIs
- theRobots :: Selector (CrawlerState r) Robots
- theNoOfDocs :: Selector (CrawlerState r) Int
- theNoOfDocsSaved :: Selector (CrawlerState r) Int
- theListOfDocsSaved :: Selector (CrawlerState r) [Int]
- theResultAccu :: Selector (CrawlerState r) r
- theResultInit :: Selector (CrawlerState r) r
- theSysConfig :: Selector (CrawlerConfig a r) SysConfig
- theTraceLevel :: Selector (CrawlerConfig a r) Priority
- theTraceLevelHxt :: Selector (CrawlerConfig a r) Priority
- theClickLevel :: Selector (CrawlerConfig a r) Int
- theMaxNoOfDocs :: Selector (CrawlerConfig a r) Int
- theMaxParDocs :: Selector (CrawlerConfig a r) Int
- theMaxParThreads :: Selector (CrawlerConfig a r) Int
- theSaveIntervall :: Selector (CrawlerConfig a r) Int
- theSavePathPrefix :: Selector (CrawlerConfig a r) String
- theSavePreAction :: Selector (CrawlerConfig a r) (FilePath -> CrawlerAction a r ())
- theFollowRef :: Selector (CrawlerConfig a r) (URI -> Bool)
- theAddRobotsAction :: Selector (CrawlerConfig a r) (CrawlerConfig a r -> AddRobotsAction)
- theAccumulateOp :: Selector (CrawlerConfig a r) (AccumulateDocResult a r)
- theFoldOp :: Selector (CrawlerConfig a r) (MergeDocResults r)
- thePreRefsFilter :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
- theProcessRefs :: Selector (CrawlerConfig a r) (IOSArrow XmlTree URI)
- thePreDocFilter :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
- theProcessDoc :: Selector (CrawlerConfig a r) (IOSArrow XmlTree a)
- defaultCrawlerConfig :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
- theInputOptions :: Selector (CrawlerConfig a r) Attributes
- theCrawlerName :: Selector (CrawlerConfig a r) String
- theMaxTime :: Selector (CrawlerConfig a r) Int
- theConnectTimeout :: Selector (CrawlerConfig a r) Int
- addSysConfig :: SysConfig -> CrawlerConfig a r -> CrawlerConfig a r
- addRobotsNoFollow :: CrawlerConfig a r -> CrawlerConfig a r
- addRobotsNoIndex :: CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerTraceLevel :: Priority -> Priority -> CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerSaveConf :: Int -> String -> CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerSaveAction :: (FilePath -> CrawlerAction a r ()) -> CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerClickLevel :: Int -> CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerMaxDocs :: Int -> Int -> Int -> CrawlerConfig a r -> CrawlerConfig a r
- setCrawlerPreRefsFilter :: IOSArrow XmlTree XmlTree -> CrawlerConfig a r -> CrawlerConfig a r
- putCrawlerState :: Binary r => CrawlerState r -> Put
- getCrawlerState :: Binary r => Get (CrawlerState r)
- initCrawlerState :: r -> CrawlerState r
- getConf :: Selector (CrawlerConfig a r) v -> CrawlerAction a r v
- getState :: Selector (CrawlerState r) v -> CrawlerAction a r v
- putState :: Selector (CrawlerState r) v -> v -> CrawlerAction a r ()
- modifyState :: Selector (CrawlerState r) v -> (v -> v) -> CrawlerAction a r ()
- modifyStateIO :: Selector (CrawlerState r) v -> (v -> IO v) -> CrawlerAction a r ()
Documentation
type AccumulateDocResult a r = (URI, a) -> r -> IO r
The action to combine the result of a single document with the accumulator for the overall crawler result. This combining function runs in the IO monad so that parts of the result can be stored externally, but it is not a CrawlerAction; otherwise parallel crawling with forkIO would no longer be possible.
type MergeDocResults r = r -> r -> IO r
The folding operator for merging partial results when working with mapFold and parallel crawling
type SavePartialResults r = FilePath -> r -> IO r
The operator for saving intermediate results
type ProcessDocument a = IOSArrow XmlTree a
The extractor function for a single document
type CrawlerAction a r = ReaderStateIO (CrawlerConfig a r) (CrawlerState r)
The crawler action monad
data CrawlerConfig a r
The crawler configuration record
CrawlerConfig | |
|
data CrawlerState r
The crawler state record
CrawlerState | |
|
Show r => Show (CrawlerState r) | |
Binary r => Binary (CrawlerState r) | |
NFData r => NFData (CrawlerState r) | |
XmlPickler r => XmlPickler (CrawlerState r) |
theToBeProcessed :: Selector (CrawlerState r) URIsWithLevel
Selector functions for CrawlerState
theNoOfDocs :: Selector (CrawlerState r) Int
theListOfDocsSaved :: Selector (CrawlerState r) [Int]
theResultAccu :: Selector (CrawlerState r) r
theResultInit :: Selector (CrawlerState r) r
theSysConfig :: Selector (CrawlerConfig a r) SysConfig
Selector functions for CrawlerConfig
theTraceLevel :: Selector (CrawlerConfig a r) Priority
theClickLevel :: Selector (CrawlerConfig a r) Int
theMaxNoOfDocs :: Selector (CrawlerConfig a r) Int
theMaxParDocs :: Selector (CrawlerConfig a r) Int
theMaxParThreads :: Selector (CrawlerConfig a r) Int
theSaveIntervall :: Selector (CrawlerConfig a r) Int
theSavePreAction :: Selector (CrawlerConfig a r) (FilePath -> CrawlerAction a r ())
theFollowRef :: Selector (CrawlerConfig a r) (URI -> Bool)
theAddRobotsAction :: Selector (CrawlerConfig a r) (CrawlerConfig a r -> AddRobotsAction)
theAccumulateOp :: Selector (CrawlerConfig a r) (AccumulateDocResult a r)
theFoldOp :: Selector (CrawlerConfig a r) (MergeDocResults r)
theProcessRefs :: Selector (CrawlerConfig a r) (IOSArrow XmlTree URI)
thePreDocFilter :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
theProcessDoc :: Selector (CrawlerConfig a r) (IOSArrow XmlTree a)
defaultCrawlerConfig :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
theCrawlerName :: Selector (CrawlerConfig a r) String
theMaxTime :: Selector (CrawlerConfig a r) Int
theConnectTimeout :: Selector (CrawlerConfig a r) Int
addSysConfig :: SysConfig -> CrawlerConfig a r -> CrawlerConfig a r
Add attributes for accessing documents
addRobotsNoFollow :: CrawlerConfig a r -> CrawlerConfig a r
Insert a robots no follow filter before thePreRefsFilter
addRobotsNoIndex :: CrawlerConfig a r -> CrawlerConfig a r
Insert a robots no index filter before thePreDocFilter
setCrawlerTraceLevel :: Priority -> Priority -> CrawlerConfig a r -> CrawlerConfig a r
Set the log level
setCrawlerSaveConf :: Int -> String -> CrawlerConfig a r -> CrawlerConfig a r
Set save interval in config
setCrawlerSaveAction :: (FilePath -> CrawlerAction a r ()) -> CrawlerConfig a r -> CrawlerConfig a r
Set action performed before saving crawler state
setCrawlerClickLevel :: Int -> CrawlerConfig a r -> CrawlerConfig a r
Set max # of steps (clicks) to reach a document
setCrawlerMaxDocs :: Int -> Int -> Int -> CrawlerConfig a r -> CrawlerConfig a r
Set max # of documents to be crawled and max # of documents crawled in parallel
setCrawlerPreRefsFilter :: IOSArrow XmlTree XmlTree -> CrawlerConfig a r -> CrawlerConfig a r
Set the pre hook filter executed before the hrefs are collected
putCrawlerState :: Binary r => CrawlerState r -> Put
getCrawlerState :: Binary r => Get (CrawlerState r)
initCrawlerState :: r -> CrawlerState r
getConf :: Selector (CrawlerConfig a r) v -> CrawlerAction a r v
Load a component from the crawler configuration
getState :: Selector (CrawlerState r) v -> CrawlerAction a r v
putState :: Selector (CrawlerState r) v -> v -> CrawlerAction a r ()
modifyState :: Selector (CrawlerState r) v -> (v -> v) -> CrawlerAction a r ()
modifyStateIO :: Selector (CrawlerState r) v -> (v -> IO v) -> CrawlerAction a r ()