{-# OPTIONS #-} -- ------------------------------------------------------------ module Holumbus.Index.CompactIndex ( Document , Documents , SmallDocuments , Inverted , emptyInverted , removeDocIdsInverted , CompactInverted , emptyCompactInverted , inverted2compactInverted , HolumbusState , HolumbusConfig , emptyHolumbusState , defragmentHolumbusState , emptyIndexerState , emptyDocuments , mergeAndWritePartialRes' , writeXml , writeBin , writeSearchBin , writePartialIndex ) where import Control.Arrow import Control.DeepSeq import Control.Monad.Reader import Data.Binary import Data.Function.Selector ( (.&&&.) ) import Holumbus.Crawler.Types import Holumbus.Crawler.IndexerCore import Holumbus.Crawler.Logger import Holumbus.Index.Common ( Document(..) , Occurrences , defragmentDocIndex , fromList , toList , unionDocs , mergeIndexes ) import Holumbus.Index.CompactDocuments ( Documents(..) , emptyDocuments ) import Holumbus.Index.CompactSmallDocuments ( SmallDocuments(..) , docTable2smallDocTable ) import qualified Holumbus.Index.CompactSmallDocuments as CSD import Text.XML.HXT.Core -- ------------------------------------------------------------ {- .1: direct use of prefix tree with simple-9 encoded occurences concerning efficiency this implementation is about the same as the 2. one, space and time are minimally better, the reason could be less code working with classes import Holumbus.Index.Inverted.PrefixMem -- -} -- ------------------------------------------------------------ {- .2: indirect use of prefix tree with simple-9 encoded occurences via InvertedCompressed minimal overhead compared to .1 but less efficient in time (1598s / 1038s) and space total mem use (2612MB / 2498MB) than .3 import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM type Inverted = PM.InvertedCompressed emptyInverted :: Inverted emptyInverted = PM.emptyInvertedCompressed -- -} -- ------------------------------------------------------------ -- {- {- .3: indirect prefix tree without compression of position sets best of these 3 implementations implementations with serializations become much more inefficient in runtime and are not worth to be considered -} import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM type Inverted = PM.Inverted0 emptyInverted :: Inverted emptyInverted = PM.emptyInverted0 removeDocIdsInverted :: Occurrences -> Inverted -> Inverted removeDocIdsInverted = PM.removeDocIdsInverted type CompactInverted = PM.InvertedOSerialized emptyCompactInverted :: CompactInverted emptyCompactInverted = PM.emptyInvertedOSerialized inverted2compactInverted :: Inverted -> CompactInverted inverted2compactInverted = fromList PM.emptyInvertedOSerialized . toList -- -} -- ------------------------------------------------------------ type HolumbusState di = IndexerState Inverted Documents di type HolumbusConfig di = IndexCrawlerConfig Inverted Documents di emptyHolumbusState :: HolumbusState di emptyHolumbusState = emptyIndexerState emptyInverted emptyDocuments flushHolumbusState :: HolumbusState di -> HolumbusState di flushHolumbusState hs = hs { ixs_index = emptyInverted , ixs_documents = emptyDocuments { lastDocId = lastDocId . ixs_documents $ hs } } emptySmallDocuments :: SmallDocuments a emptySmallDocuments = CSD.emptyDocuments -- ------------------------------------------------------------ defragmentHolumbusState :: (Binary di) => HolumbusState di -> HolumbusState di defragmentHolumbusState IndexerState { ixs_index = ix , ixs_documents = dt } = IndexerState { ixs_index = ix' , ixs_documents = dt' } where (dt', ix') = defragmentDocIndex dt ix -- ------------------------------------------------------------ mergeAndWritePartialRes' :: (MonadIO m, NFData i, Binary i) => (SmallDocuments i -> SmallDocuments i) -> [String] -> String -> m () mergeAndWritePartialRes' id' pxs out = do notice $ ["merge partial doctables from"] ++ pxs mdocs <- mergeSmallDocs $ map (++ ".doc") pxs notice $ ["write merged doctable to", out ++ ".doc"] liftIO $ encodeFile (out ++ ".doc") (id' mdocs) notice $ ["merge partial indexes from"] ++ pxs mixs <- mergeCompactIxs $ map (++ ".idx") pxs notice $ ["write merged indexes to", out ++ ".idx"] liftIO $ encodeFile (out ++ ".idx") mixs notice $ ["merge partial doctables and indexes done"] mergeSmallDocs :: (MonadIO m, NFData i, Binary i) => [String] -> m (SmallDocuments i) mergeSmallDocs [] = return emptySmallDocuments mergeSmallDocs (x : xs) = do docs <- mergeSmallDocs xs notice ["merge small documents from file", x] doc1 <- liftIO $ decodeFile x rnf doc1 `seq` (return $ unionDocs docs doc1) mergeCompactIxs :: (MonadIO m) => [String] -> m CompactInverted mergeCompactIxs [] = return emptyCompactInverted mergeCompactIxs (x : xs) = do ixs <- mergeCompactIxs xs notice ["merge compact index from file", x] ix1 <- liftIO $ decodeFile x rnf ix1 `seq` (return $ mergeIndexes ix1 ixs) -- ------------------------------------------------------------ writeXml :: (MonadIO m, XmlPickler a) => FilePath -> a -> m () writeXml xf v | xmlOut = do notice ["writing into XML file", xmlFile] liftIO $ runX (constA v >>> hxtSetTraceAndErrorLogger WARNING >>> xpickleDocument xpickle [withIndent yes] xmlFile ) >> return () notice ["writing XML finished"] | otherwise = notice ["no XML output"] where (xmlOut, xmlFile) | null xf = (False, xf) | xf == "-" = (True, "") | otherwise = (True, xf) writeBin :: (MonadIO m, Binary a) => FilePath -> a -> m () writeBin out v | null out = notice ["no binary output"] | otherwise = do notice ["writing into binary file", out] liftIO $ encodeFile out v notice ["writing binary data finished"] writeSearchBin :: (Binary c, MonadIO m) => FilePath -> HolumbusState c -> m () writeSearchBin out state | null out = notice ["no search index written"] | otherwise = do notice ["writing small document table into binary file", docFile] liftIO $ encodeFile docFile (docTable2smallDocTable . ixs_documents $ state) notice ["writing compressed inverted index into binary file", idxFile] liftIO $ encodeFile idxFile (inverted2compactInverted . ixs_index $ state) notice ["writing search index files finished"] where docFile = out ++ ".doc" idxFile = out ++ ".idx" -- ------------------------------------------------------------ writePartialIndex :: (NFData c, XmlPickler c, Binary c) => Bool -> FilePath -> CrawlerAction a (HolumbusState c) () writePartialIndex xout fn = modifyStateIO (theResultAccu .&&&. theResultInit) (\ (r, _i) -> do r' <- writePartialIndex' xout fn r return (r', r') ) {- the above code is a bit tricky: when crawling is done in parallel, then initial result is used as a unit value, when merging results. When a partial index is written out, the document id count must not be set back to its initial value, to avoid renumbering when merging then partial indexes. As a consequence, not only the result accu must be changed but also the initial value. When this is not done, the indexer runs fine when using the sequential merge, but when running the parallel one, the index ids will overlap. -} writePartialIndex' :: (NFData c, XmlPickler c, Binary c) => Bool -> FilePath -> HolumbusState c -> IO (HolumbusState c) writePartialIndex' xout out ixs = do writeSearchBin out ixs if xout then writeXml (out ++ ".xml") ixs else return () let ixs' = flushHolumbusState ixs rnf ixs' `seq` return ixs' -- ------------------------------------------------------------ notice :: MonadIO m => [String] -> m () notice = noticeC "compactIndex" -- ------------------------------------------------------------