module Holumbus.Index.CompactIndex
( Document
, Documents
, SmallDocuments
, Inverted
, emptyInverted
, removeDocIdsInverted
, CompactInverted
, emptyCompactInverted
, inverted2compactInverted
, HolumbusState
, HolumbusConfig
, emptyHolumbusState
, defragmentHolumbusState
, emptyIndexerState
, emptyDocuments
, mergeAndWritePartialRes'
, writeXml
, writeBin
, writeSearchBin
, writePartialIndex
)
where
import Control.Arrow
import Control.DeepSeq
import Control.Monad.Reader
import Data.Binary
import Data.Function.Selector ( (.&&&.) )
import Holumbus.Crawler.Types
import Holumbus.Crawler.IndexerCore
import Holumbus.Crawler.Logger
import Holumbus.Index.Common ( Document(..)
, Occurrences
, defragmentDocIndex
, fromList
, toList
, unionDocs
, mergeIndexes
)
import Holumbus.Index.CompactDocuments
( Documents(..)
, emptyDocuments
)
import Holumbus.Index.CompactSmallDocuments
( SmallDocuments(..)
, docTable2smallDocTable
)
import qualified Holumbus.Index.CompactSmallDocuments
as CSD
import Text.XML.HXT.Core
import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM
type Inverted = PM.Inverted0
emptyInverted :: Inverted
emptyInverted = PM.emptyInverted0
removeDocIdsInverted :: Occurrences -> Inverted -> Inverted
removeDocIdsInverted = PM.removeDocIdsInverted
type CompactInverted = PM.InvertedOSerialized
emptyCompactInverted :: CompactInverted
emptyCompactInverted = PM.emptyInvertedOSerialized
inverted2compactInverted :: Inverted -> CompactInverted
inverted2compactInverted = fromList PM.emptyInvertedOSerialized . toList
type HolumbusState di = IndexerState Inverted Documents di
type HolumbusConfig di = IndexCrawlerConfig Inverted Documents di
emptyHolumbusState :: HolumbusState di
emptyHolumbusState = emptyIndexerState emptyInverted emptyDocuments
flushHolumbusState :: HolumbusState di -> HolumbusState di
flushHolumbusState hs = hs { ixs_index = emptyInverted
, ixs_documents = emptyDocuments
{ lastDocId = lastDocId . ixs_documents $ hs }
}
emptySmallDocuments :: SmallDocuments a
emptySmallDocuments = CSD.emptyDocuments
defragmentHolumbusState :: (Binary di) =>
HolumbusState di -> HolumbusState di
defragmentHolumbusState IndexerState
{ ixs_index = ix
, ixs_documents = dt
} = IndexerState
{ ixs_index = ix'
, ixs_documents = dt'
}
where
(dt', ix') = defragmentDocIndex dt ix
mergeAndWritePartialRes' :: (MonadIO m, NFData i, Binary i) =>
(SmallDocuments i -> SmallDocuments i) -> [String] -> String -> m ()
mergeAndWritePartialRes' id' pxs out
= do notice $ ["merge partial doctables from"] ++ pxs
mdocs <- mergeSmallDocs $ map (++ ".doc") pxs
notice $ ["write merged doctable to", out ++ ".doc"]
liftIO $ encodeFile (out ++ ".doc") (id' mdocs)
notice $ ["merge partial indexes from"] ++ pxs
mixs <- mergeCompactIxs $ map (++ ".idx") pxs
notice $ ["write merged indexes to", out ++ ".idx"]
liftIO $ encodeFile (out ++ ".idx") mixs
notice $ ["merge partial doctables and indexes done"]
mergeSmallDocs :: (MonadIO m, NFData i, Binary i) => [String] -> m (SmallDocuments i)
mergeSmallDocs []
= return emptySmallDocuments
mergeSmallDocs (x : xs)
= do docs <- mergeSmallDocs xs
notice ["merge small documents from file", x]
doc1 <- liftIO $ decodeFile x
rnf doc1 `seq`
(return $ unionDocs docs doc1)
mergeCompactIxs :: (MonadIO m) => [String] -> m CompactInverted
mergeCompactIxs []
= return emptyCompactInverted
mergeCompactIxs (x : xs)
= do ixs <- mergeCompactIxs xs
notice ["merge compact index from file", x]
ix1 <- liftIO $ decodeFile x
rnf ix1 `seq`
(return $ mergeIndexes ix1 ixs)
writeXml :: (MonadIO m, XmlPickler a) => FilePath -> a -> m ()
writeXml xf v
| xmlOut
= do notice ["writing into XML file", xmlFile]
liftIO $ runX (constA v
>>> hxtSetTraceAndErrorLogger WARNING
>>> xpickleDocument xpickle [withIndent yes] xmlFile
)
>> return ()
notice ["writing XML finished"]
| otherwise
= notice ["no XML output"]
where
(xmlOut, xmlFile)
| null xf = (False, xf)
| xf == "-" = (True, "")
| otherwise = (True, xf)
writeBin :: (MonadIO m, Binary a) => FilePath -> a -> m ()
writeBin out v
| null out
= notice ["no binary output"]
| otherwise
= do notice ["writing into binary file", out]
liftIO $ encodeFile out v
notice ["writing binary data finished"]
writeSearchBin :: (Binary c, MonadIO m) => FilePath -> HolumbusState c -> m ()
writeSearchBin out state
| null out
= notice ["no search index written"]
| otherwise
= do notice ["writing small document table into binary file", docFile]
liftIO $ encodeFile docFile (docTable2smallDocTable . ixs_documents $ state)
notice ["writing compressed inverted index into binary file", idxFile]
liftIO $ encodeFile idxFile (inverted2compactInverted . ixs_index $ state)
notice ["writing search index files finished"]
where
docFile = out ++ ".doc"
idxFile = out ++ ".idx"
writePartialIndex :: (NFData c, XmlPickler c, Binary c) =>
Bool -> FilePath -> CrawlerAction a (HolumbusState c) ()
writePartialIndex xout fn
= modifyStateIO
(theResultAccu .&&&. theResultInit)
(\ (r, _i) -> do r' <- writePartialIndex' xout fn r
return (r', r')
)
writePartialIndex' :: (NFData c, XmlPickler c, Binary c) =>
Bool -> FilePath -> HolumbusState c -> IO (HolumbusState c)
writePartialIndex' xout out ixs
= do writeSearchBin out ixs
if xout
then writeXml (out ++ ".xml") ixs
else return ()
let ixs' = flushHolumbusState ixs
rnf ixs' `seq`
return ixs'
notice :: MonadIO m => [String] -> m ()
notice = noticeC "compactIndex"