{-# OPTIONS -XFlexibleContexts -XBangPatterns #-} -- ------------------------------------------------------------ module Holumbus.Crawler.IndexerCore ( RawDoc , RawContexts , RawContext , RawWords , RawWord , RawTitle , IndexCrawlerConfig , IndexContextConfig(..) , IndexerState(..) , emptyIndexerState , indexCrawlerConfig , stdIndexer , unionIndexerStatesM , insertRawDocM ) where -- ------------------------------------------------------------ import Control.DeepSeq import Control.Monad ( foldM ) import Control.Monad.Trans -- ( MonadIO ) import Data.Binary ( Binary ) import qualified Data.Binary as B import Data.Function.Selector import Data.Maybe import Holumbus.Crawler import Holumbus.Index.Common hiding ( URI ) import Text.XML.HXT.Core -- ------------------------------------------------------------ type RawDoc c = (RawContexts, RawTitle, Maybe c) -- c is the user defined custom info type RawContexts = [RawContext] type RawContext = (Context, RawWords) type RawWords = [RawWord] type RawWord = (Word, Position) type RawTitle = String type IndexCrawlerConfig i d c = CrawlerConfig (RawDoc c) (IndexerState i d c) type IndexCrawlerState i d c = CrawlerState (IndexerState i d c) data IndexContextConfig = IndexContextConfig { ixc_name :: String , ixc_collectText :: IOSArrow XmlTree String , ixc_textToWords :: String -> [String] , ixc_boringWord :: String -> Bool } data IndexerState i d c = IndexerState { ixs_index :: ! i -- the index type , ixs_documents :: ! (d c) -- the type for document descriptions } deriving (Show) -- ------------------------------------------------------------ instance (NFData i, NFData (d c)) => NFData (IndexerState i d c) where rnf IndexerState { ixs_index = i , ixs_documents = d } = rnf i `seq` rnf d -- ------------------------------------------------------------ instance (Binary i, Binary (d c)) => Binary (IndexerState i d c) where put s = B.put (ixs_index s) >> B.put (ixs_documents s) get = do ix <- B.get dm <- B.get return $ IndexerState { ixs_index = ix , ixs_documents = dm } -- ------------------------------------------------------------ instance (XmlPickler i, XmlPickler (d c)) => XmlPickler (IndexerState i d c) where xpickle = xpElem "index-state" $ xpWrap ( uncurry IndexerState , \ ix -> (ixs_index ix, ixs_documents ix) ) $ xpPair xpickle xpickle -- ------------------------------------------------------------ emptyIndexerState :: i -> d c -> IndexerState i d c emptyIndexerState eix edm = IndexerState { ixs_index = eix , ixs_documents = edm } -- ------------------------------------------------------------ stdIndexer :: ( Binary i , Binary (d c) , Binary c , HolIndexM IO i , HolDocuments d c , NFData i , NFData (d c) , NFData c) => IndexCrawlerConfig i d c -- ^ adapt configuration to special needs, -- use id if default is ok -> Maybe String -- ^ resume from interrupted index run with state -- stored in file -> [URI] -- ^ start indexing with this set of uris -> IndexerState i d c -- ^ the initial empty indexer state -> IO (IndexCrawlerState i d c) -- ^ result is a state consisting of the index and the map of indexed documents stdIndexer config resumeLoc startUris eis = execCrawler action config (initCrawlerState eis) where action = do noticeC "indexerCore" ["indexer started"] res <- maybe (crawlDocs startUris) crawlerResume $ resumeLoc noticeC "indexerCore" ["indexer finished"] return res -- ------------------------------------------------------------ -- general HolIndexM IO i version, for old specialized version see code at end of this file indexCrawlerConfig :: ( HolIndexM IO i , HolDocuments d c , HolDocIndex d c i , NFData i , NFData c , NFData (d c) ) => SysConfig -- ^ document read options -> (URI -> Bool) -- ^ the filter for deciding, whether the URI shall be processed -> Maybe (IOSArrow XmlTree String) -- ^ the document href collection filter, default is 'Holumbus.Crawler.Html.getHtmlReferences' -> Maybe (IOSArrow XmlTree XmlTree) -- ^ the pre document filter, default is the this arrow -> Maybe (IOSArrow XmlTree String) -- ^ the filter for computing the document title, default is empty string -> Maybe (IOSArrow XmlTree c) -- ^ the filter for the cutomized doc info, default Nothing -> [IndexContextConfig] -- ^ the configuration of the various index parts -> IndexCrawlerConfig i d c -- ^ result is a crawler config indexCrawlerConfig opts followRef getHrefF preDocF titleF0 customF0 contextCs = addSysConfig (defaultOpts >>> opts) -- install the default read options >>> ( setS theFollowRef followRef ) >>> ( setS theProcessRefs $ fromMaybe getHtmlReferences getHrefF ) >>> ( setS thePreDocFilter $ fromMaybe checkDocumentStatus preDocF ) -- in case of errors throw away any contents >>> ( setS theProcessDoc rawDocF ) -- rawDocF is build up by the context config, text, title and custom >>> enableRobotsTxt -- add the robots stuff at the end >>> -- the filter wrap the other filters addRobotsNoFollow >>> addRobotsNoIndex $ defaultCrawlerConfig insertRawDocM unionIndexerStatesM -- take the default crawler config -- and set the result combining functions where rawDocF = ( listA contextFs &&& titleF &&& customF ) >>^ (\ (x3, (x2, x1)) -> (x3, x2, x1)) titleF = ( fromMaybe (constA "") titleF0 ) >. concat customF = ( fromMaybe none customF0 ) >. listToMaybe contextFs :: IOSArrow XmlTree RawContext contextFs = catA . map contextF $ contextCs -- collect all contexts contextF :: IndexContextConfig -> IOSArrow XmlTree RawContext contextF ixc = constA (ixc_name ixc) -- the name of the raw context &&& ( ixc_collectText ixc >. processText ) -- the list of words and positions of the collected text where -- this arrow is deterministic, it always delivers a single pair processText :: [String] -> RawWords processText = concat >>> ixc_textToWords ixc >>> flip zip [1..] >>> filter (fst >>> ixc_boringWord ixc >>> not) defaultOpts = withRedirect yes >>> withAcceptedMimeTypes ["text/html", "text/xhtml"] >>> withInputEncoding isoLatin1 >>> withEncodingErrors no -- encoding errors and parser warnings are boring >>> withValidate no >>> withParseHTML yes >>> withWarnings no -- ------------------------------------------------------------ unionIndexerStatesM :: ( MonadIO m , HolIndexM m i , HolDocuments d c , HolDocIndex d c i ) => IndexerState i d c -> IndexerState i d c -> m (IndexerState i d c) unionIndexerStatesM ixs1 ixs2 = return $! IndexerState { ixs_index = ix , ixs_documents = dt } where (! dt, ! ix) = unionDocIndex dt1 ix1 dt2 ix2 ix1 = ixs_index ixs1 ix2 = ixs_index ixs2 dt1 = ixs_documents ixs1 dt2 = ixs_documents ixs2 -- ------------------------------------------------------------ insertRawDocM :: ( MonadIO m , HolIndexM m i , HolDocuments d c , NFData i , NFData c , NFData (d c) ) => (URI, RawDoc c) -- ^ extracted URI and doc info -> IndexerState i d c -- ^ old indexer state -> m (IndexerState i d c) -- ^ new indexer state insertRawDocM (rawUri, (rawContexts, rawTitle, rawCustom)) ixs | nullContexts = return ixs -- no words found in document, -- so there are no refs in index -- and document is thrown away | otherwise = do newIx <- foldM (insertRawContextM did) (ixs_index ixs) $ rawContexts newIxs <- return $ IndexerState { ixs_index = newIx , ixs_documents = newDocs } rnf newIxs `seq` return newIxs where nullContexts = and . map (null . snd) $ rawContexts (did, newDocs) = insertDoc (ixs_documents ixs) doc doc = Document { title = rawTitle , uri = rawUri , custom = rawCustom } insertRawContextM :: (Monad m, HolIndexM m i) => DocId -> i -> (Context, [(Word, Position)]) -> m i insertRawContextM did ix (cx, ws) = foldM (insWordM cx did) ix ws insWordM :: (Monad m, HolIndexM m i) => Context -> DocId -> i -> (Word, Position) -> m i insWordM cx' did' ix' (w', p') = insertPositionM cx' w' did' p' ix' -- ------------------------------------------------------------