{-# OPTIONS #-} -- ------------------------------------------------------------ module Holumbus.Index.HashedIndex ( Document , Documents , SmallDocuments , Inverted , emptyInverted , removeDocIdsInverted , CompactInverted , emptyCompactInverted , inverted2compactInverted , HolumbusState , HolumbusConfig , emptyHolumbusState , defragmentHolumbusState , emptyIndexerState , emptyDocuments , mergeAndWritePartialRes' , writeXml , writeBin , writeSearchBin , writePartialIndex ) where import Control.Arrow import Control.DeepSeq import Control.Monad.Reader import Data.Binary import Holumbus.Crawler.Types import Holumbus.Crawler.IndexerCore import Holumbus.Crawler.Logger import Holumbus.Index.Common ( Document(..) , Occurrences , fromList , toList , unionDocs , mergeIndexes ) import Holumbus.Index.HashedDocuments ( Documents(..) , emptyDocuments ) import Text.XML.HXT.Core -- ------------------------------------------------------------ {- .1: direct use of prefix tree with simple-9 encoded occurences concerning efficiency this implementation is about the same as the 2. one, space and time are minimally better, the reason could be less code working with classes import Holumbus.Index.Inverted.PrefixMem -- -} -- ------------------------------------------------------------ {- .2: indirect use of prefix tree with simple-9 encoded occurences via InvertedCompressed minimal overhead compared to .1 but less efficient in time (1598s / 1038s) and space total mem use (2612MB / 2498MB) than .3 import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM type Inverted = PM.InvertedCompressed emptyInverted :: Inverted emptyInverted = PM.emptyInvertedCompressed -- -} -- ------------------------------------------------------------ -- {- remove "--" to coment out region {- .3: indirect prefix tree without compression of position sets best of these 3 implementations implementations with serializations become much more inefficient in runtime and are not worth to be considered -} import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM type Inverted = PM.Inverted0 emptyInverted :: Inverted emptyInverted = PM.emptyInverted0 removeDocIdsInverted :: Occurrences -> Inverted -> Inverted removeDocIdsInverted = PM.removeDocIdsInverted type CompactInverted = PM.InvertedOSerialized emptyCompactInverted :: CompactInverted emptyCompactInverted = PM.emptyInvertedOSerialized inverted2compactInverted :: Inverted -> CompactInverted inverted2compactInverted = fromList PM.emptyInvertedOSerialized . toList -- -} -- ------------------------------------------------------------ type HolumbusState di = IndexerState Inverted Documents di type HolumbusConfig di = IndexCrawlerConfig Inverted Documents di emptyHolumbusState :: HolumbusState di emptyHolumbusState = emptyIndexerState emptyInverted emptyDocuments type SmallDocuments = Documents emptySmallDocuments :: SmallDocuments a emptySmallDocuments = emptyDocuments -- ------------------------------------------------------------ defragmentHolumbusState :: (Binary di) => HolumbusState di -> HolumbusState di defragmentHolumbusState = id -- ------------------------------------------------------------ mergeAndWritePartialRes' :: (MonadIO m, NFData i, Binary i) => (SmallDocuments i -> SmallDocuments i) -> [String] -> String -> m () mergeAndWritePartialRes' id' pxs out = do notice $ ["merge hashed partial doctables from"] ++ pxs mdocs <- mergeSmallDocs $ map (++ ".doc") pxs notice $ ["write merged hashed doctable to", out ++ ".doc"] liftIO $ encodeFile (out ++ ".doc") (id' mdocs) notice $ ["merge hashed partial indexes from"] ++ pxs mixs <- mergeCompactIxs $ map (++ ".idx") pxs notice $ ["write merged hashed indexes to", out ++ ".idx"] liftIO $ encodeFile (out ++ ".idx") mixs notice $ ["merge partial hashed doctables and indexes done"] mergeSmallDocs :: (MonadIO m, NFData i, Binary i) => [String] -> m (SmallDocuments i) mergeSmallDocs [] = return emptySmallDocuments mergeSmallDocs (x : xs) = do docs <- mergeSmallDocs xs notice ["merge hashed documents from file", x] doc1 <- liftIO $ decodeFile x rnf doc1 `seq` (return $ unionDocs docs doc1) mergeCompactIxs :: (MonadIO m) => [String] -> m CompactInverted mergeCompactIxs [] = return emptyCompactInverted mergeCompactIxs (x : xs) = do ixs <- mergeCompactIxs xs notice ["merge compact hashed index from file", x] ix1 <- liftIO $ decodeFile x rnf ix1 `seq` (return $ mergeIndexes ix1 ixs) -- ------------------------------------------------------------ writeXml :: (MonadIO m, XmlPickler a) => FilePath -> a -> m () writeXml xf v | xmlOut = do notice ["writing into XML file", xmlFile] liftIO $ runX (constA v >>> hxtSetTraceAndErrorLogger WARNING >>> xpickleDocument xpickle [withIndent yes] xmlFile ) >> return () notice ["writing XML finished"] | otherwise = notice ["no XML output"] where (xmlOut, xmlFile) | null xf = (False, xf) | xf == "-" = (True, "") | otherwise = (True, xf) writeBin :: (MonadIO m, Binary a) => FilePath -> a -> m () writeBin out v | null out = notice ["no binary output"] | otherwise = do notice ["writing into binary file", out] liftIO $ encodeFile out v notice ["writing binary data finished"] writeSearchBin :: (Binary c, MonadIO m) => FilePath -> HolumbusState c -> m () writeSearchBin out state | null out = notice ["no search index written"] | otherwise = do notice ["writing hashed document table into binary file", docFile] liftIO $ encodeFile docFile (ixs_documents state) notice ["writing hashed compressed inverted index into binary file", idxFile] liftIO $ encodeFile idxFile (inverted2compactInverted . ixs_index $ state) notice ["writing hashed search index files finished"] where docFile = out ++ ".doc" idxFile = out ++ ".idx" -- ------------------------------------------------------------ writePartialIndex :: (NFData c, XmlPickler c, Binary c) => Bool -> FilePath -> CrawlerAction a (HolumbusState c) () writePartialIndex xout fn = modifyStateIO theResultAccu (writePartialIndex' xout fn) writePartialIndex' :: (NFData c, XmlPickler c, Binary c) => Bool -> FilePath -> HolumbusState c -> IO (HolumbusState c) writePartialIndex' xout out ixs = do writeSearchBin out ixs if xout then writeXml (out ++ ".xml") ixs else return () return emptyHolumbusState -- ------------------------------------------------------------ notice :: MonadIO m => [String] -> m () notice = noticeC "hashedIndex" -- ------------------------------------------------------------