module Holumbus.Crawler.CacheCore
where
import Control.DeepSeq
import Data.Binary ( Binary(..) )
import Data.Function.Selector
import Holumbus.Crawler
import Text.XML.HXT.Core
import Text.XML.HXT.Curl
-- | Crawler configuration specialised for the cache crawler: the per-document
--   result type is @()@ (no data is extracted from pages), the accumulated
--   result type is the empty 'CacheState'.
type CacheCrawlerConfig = CrawlerConfig () CacheState

-- | Crawler state specialised to the cache crawler's 'CacheState'.
type CacheCrawlerState = CrawlerState CacheState
-- | The cache crawler accumulates no state of its own: a crawl run is useful
--   purely for its side effect of filling the page cache, so the state is a
--   newtype around @()@.
newtype CacheState = CS ()
-- | Empty instance: the default 'rnf' suffices because a 'CacheState' wraps
--   only @()@ and carries no data worth forcing.
instance NFData CacheState where
-- | Trivial serialisation: nothing is written to the stream, and
--   deserialisation always reconstructs the one empty state value.
instance Binary CacheState where
    put _ = return ()
    get   = return emptyCacheState
-- | XML (de)serialisation: the state is represented by an empty
--   @\<cacheState/\>@ element; unpickling always yields 'emptyCacheState'.
instance XmlPickler CacheState where
    xpickle = xpElem "cacheState" $
              -- wrap the unit pickler: forget the CS wrapper when writing,
              -- rebuild the empty state when reading
              xpWrap (const emptyCacheState, const ()) $
              xpUnit
-- | The one and only inhabitant of 'CacheState'.
emptyCacheState :: CacheState
emptyCacheState = CS ()
-- | Merge two partial crawler states. Since the cache crawler keeps no data
--   in its state, the merge ignores both arguments and yields the empty state.
unionCacheStatesM :: (Monad m) => CacheState -> CacheState -> m CacheState
unionCacheStatesM _ _ = return emptyCacheState
-- | Record a processed document in the crawler state. Nothing is recorded:
--   the interesting work (storing the page) happens as a side effect of the
--   HTTP cache configured elsewhere, so the state stays empty.
insertCacheM :: (Monad m) => (URI, ()) -> CacheState -> m CacheState
insertCacheM _uriAndResult _state = return emptyCacheState
-- | Build the crawler configuration for a cache-filling run.
--
-- The per-document processing arrow is trivial (@'constA' ()@): the crawler
-- is executed purely for the side effect of the HTTP cache set up via the
-- 'SysConfig' input options.
cacheCrawlerConfig :: SysConfig         -- ^ HXT input options, merged over the defaults below
                   -> (URI -> Bool)     -- ^ predicate selecting which references to follow
                   -> CacheCrawlerConfig
cacheCrawlerConfig opts followRef
    = addSysConfig (defaultOpts >>> opts)             -- user options override the defaults
      >>>
      ( setS theFollowRef followRef )
      >>>
      ( setS theProcessRefs getHtmlReferences )
      >>>
      ( setS thePreDocFilter checkDocumentStatus )    -- only accept documents transferred OK
      >>>
      ( setS theProcessDoc $ constA ())               -- no per-document result: caching is the whole point
      >>>
      enableRobotsTxt
      >>>
      addRobotsNoFollow
      >>>
      addRobotsNoIndex
      $
      defaultCrawlerConfig insertCacheM unionCacheStatesM
    where
    defaultOpts = withCurl [ (curl_max_filesize, "1000000")     -- skip anything larger than ~1MB
                           , (curl_location, v_1)               -- follow HTTP redirects
                           , (curl_max_redirects, "3")          -- but at most 3 hops
                           ]
                  >>>
                  withRedirect yes
                  >>>
                  -- BUGFIX: the registered MIME types for XHTML and PDF are
                  -- "application/xhtml+xml" (RFC 3236) and "application/pdf"
                  -- (RFC 3778); the former "text/xhtml" / "text/pdf" entries
                  -- never match what real servers send, so those documents
                  -- were silently rejected. The old entries are kept for
                  -- backward compatibility with any non-standard servers.
                  withAcceptedMimeTypes [ "text/html"
                                        , "text/xhtml", "application/xhtml+xml"
                                        , "text/plain"
                                        , "text/pdf",   "application/pdf"
                                        ]
                  >>>
                  withInputEncoding isoLatin1
                  >>>
                  withEncodingErrors no
                  >>>
                  withValidate no
                  >>>
                  withParseHTML yes
                  >>>
                  withWarnings no
-- | Run a cache-filling crawl with the standard configuration.
--
-- Assembles the configuration from 'cacheCrawlerConfig', applies resource
-- limits, save/trace settings and any further user tweaks, then executes the
-- crawl, optionally resuming from a previously saved crawler state.
stdCacher :: (Int, Int, Int)                            -- ^ (max docs, max parallel docs, max threads)
          -> (Int, String)                              -- ^ (save interval, path for intermediate state dumps)
          -> (Priority, Priority)                       -- ^ (crawler trace level, hxt trace level)
          -> SysConfig                                  -- ^ additional HXT input options
          -> (CacheCrawlerConfig -> CacheCrawlerConfig) -- ^ further configuration transformations
          -> Maybe String                               -- ^ 'Just' a state file to resume from
          -> [URI]                                      -- ^ start URIs (ignored when resuming)
          -> (URI -> Bool) -> IO CacheCrawlerState      -- ^ predicate for URIs to follow
stdCacher (maxDocs, maxParDocs, maxParThreads)
          (saveIntervall, savePath)
          (trc, trcx)
          inpOptions
          furtherConfigs
          resumeLoc
          startUris
          followRef = execCrawler action config initState
    where
    initState = initCrawlerState emptyCacheState
    action    = do
                noticeC "cacheCore" ["cache update started"]
                -- resume from the saved state if a location is given,
                -- otherwise start a fresh crawl from the start URIs
                maybe (crawlDocs startUris) crawlerResume $ resumeLoc
                noticeC "cacheCore" ["cache update finished"]
    config    = setCrawlerMaxDocs maxDocs maxParDocs maxParThreads
                >>>
                setCrawlerSaveConf saveIntervall savePath
                >>>
                setCrawlerTraceLevel trc trcx
                >>>
                -- NOTE(review): enableRobotsTxt is already applied inside
                -- 'cacheCrawlerConfig'; repeating it here looks redundant
                -- (presumably harmless, it sets the same flag) -- confirm
                -- before removing.
                enableRobotsTxt
                >>>
                furtherConfigs
                $
                cacheCrawlerConfig inpOptions followRef