module Holumbus.Index.HashedIndex
( Document
, Documents
, SmallDocuments
, Inverted
, emptyInverted
, removeDocIdsInverted
, CompactInverted
, emptyCompactInverted
, inverted2compactInverted
, HolumbusState
, HolumbusConfig
, emptyHolumbusState
, defragmentHolumbusState
, emptyIndexerState
, emptyDocuments
, mergeAndWritePartialRes'
, writeXml
, writeBin
, writeSearchBin
, writePartialIndex
)
where
import Control.Arrow
import Control.DeepSeq
import Control.Monad.Reader
import Data.Binary
import Holumbus.Crawler.Types
import Holumbus.Crawler.IndexerCore
import Holumbus.Crawler.Logger
import Holumbus.Index.Common ( Document(..)
, Occurrences
, fromList
, toList
, unionDocs
, mergeIndexes
)
import Holumbus.Index.HashedDocuments ( Documents(..)
, emptyDocuments
)
import Text.XML.HXT.Core
import qualified Holumbus.Index.Inverted.CompressedPrefixMem as PM
type Inverted = PM.Inverted0
emptyInverted :: Inverted
emptyInverted = PM.emptyInverted0
removeDocIdsInverted :: Occurrences -> Inverted -> Inverted
removeDocIdsInverted = PM.removeDocIdsInverted
type CompactInverted = PM.InvertedOSerialized
emptyCompactInverted :: CompactInverted
emptyCompactInverted = PM.emptyInvertedOSerialized
inverted2compactInverted :: Inverted -> CompactInverted
inverted2compactInverted = fromList PM.emptyInvertedOSerialized . toList
type HolumbusState di = IndexerState Inverted Documents di
type HolumbusConfig di = IndexCrawlerConfig Inverted Documents di
emptyHolumbusState :: HolumbusState di
emptyHolumbusState = emptyIndexerState emptyInverted emptyDocuments
type SmallDocuments = Documents
emptySmallDocuments :: SmallDocuments a
emptySmallDocuments = emptyDocuments
defragmentHolumbusState :: (Binary di) => HolumbusState di -> HolumbusState di
defragmentHolumbusState = id
mergeAndWritePartialRes' :: (MonadIO m, NFData i, Binary i) =>
(SmallDocuments i -> SmallDocuments i) -> [String] -> String -> m ()
mergeAndWritePartialRes' id' pxs out
= do notice $ ["merge hashed partial doctables from"] ++ pxs
mdocs <- mergeSmallDocs $ map (++ ".doc") pxs
notice $ ["write merged hashed doctable to", out ++ ".doc"]
liftIO $ encodeFile (out ++ ".doc") (id' mdocs)
notice $ ["merge hashed partial indexes from"] ++ pxs
mixs <- mergeCompactIxs $ map (++ ".idx") pxs
notice $ ["write merged hashed indexes to", out ++ ".idx"]
liftIO $ encodeFile (out ++ ".idx") mixs
notice $ ["merge partial hashed doctables and indexes done"]
mergeSmallDocs :: (MonadIO m, NFData i, Binary i) => [String] -> m (SmallDocuments i)
mergeSmallDocs []
= return emptySmallDocuments
mergeSmallDocs (x : xs)
= do docs <- mergeSmallDocs xs
notice ["merge hashed documents from file", x]
doc1 <- liftIO $ decodeFile x
rnf doc1 `seq`
(return $ unionDocs docs doc1)
mergeCompactIxs :: (MonadIO m) => [String] -> m CompactInverted
mergeCompactIxs []
= return emptyCompactInverted
mergeCompactIxs (x : xs)
= do ixs <- mergeCompactIxs xs
notice ["merge compact hashed index from file", x]
ix1 <- liftIO $ decodeFile x
rnf ix1 `seq`
(return $ mergeIndexes ix1 ixs)
writeXml :: (MonadIO m, XmlPickler a) => FilePath -> a -> m ()
writeXml xf v
| xmlOut
= do notice ["writing into XML file", xmlFile]
liftIO $ runX (constA v
>>> hxtSetTraceAndErrorLogger WARNING
>>> xpickleDocument xpickle [withIndent yes] xmlFile
)
>> return ()
notice ["writing XML finished"]
| otherwise
= notice ["no XML output"]
where
(xmlOut, xmlFile)
| null xf = (False, xf)
| xf == "-" = (True, "")
| otherwise = (True, xf)
writeBin :: (MonadIO m, Binary a) => FilePath -> a -> m ()
writeBin out v
| null out
= notice ["no binary output"]
| otherwise
= do notice ["writing into binary file", out]
liftIO $ encodeFile out v
notice ["writing binary data finished"]
writeSearchBin :: (Binary c, MonadIO m) => FilePath -> HolumbusState c -> m ()
writeSearchBin out state
| null out
= notice ["no search index written"]
| otherwise
= do notice ["writing hashed document table into binary file", docFile]
liftIO $ encodeFile docFile (ixs_documents state)
notice ["writing hashed compressed inverted index into binary file", idxFile]
liftIO $ encodeFile idxFile (inverted2compactInverted . ixs_index $ state)
notice ["writing hashed search index files finished"]
where
docFile = out ++ ".doc"
idxFile = out ++ ".idx"
writePartialIndex :: (NFData c, XmlPickler c, Binary c) =>
Bool -> FilePath -> CrawlerAction a (HolumbusState c) ()
writePartialIndex xout fn
= modifyStateIO
theResultAccu
(writePartialIndex' xout fn)
writePartialIndex' :: (NFData c, XmlPickler c, Binary c) =>
Bool -> FilePath -> HolumbusState c -> IO (HolumbusState c)
writePartialIndex' xout out ixs
= do writeSearchBin out ixs
if xout
then writeXml (out ++ ".xml") ixs
else return ()
return emptyHolumbusState
notice :: MonadIO m => [String] -> m ()
notice = noticeC "hashedIndex"