{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE TypeFamilies #-} -- ---------------------------------------------------------------------------- {- | Module : Hunt.Index.HashedDocTable Copyright : Copyright (C) 2012 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental A more space efficient substitute for Hunt.Index.Documents and a more flexible implementation than Hunt.Index.CompactDocuments. DocIds are computed by a hash function, so the inverse map from 'URI's to 'DocId's is substituted by the hash function. MurmurHash2 64-bit is used as the hash function. https://sites.google.com/site/murmurhash/ It is a fast non-cryptographic hash function with good performance and hash distribution properties. http://programmers.stackexchange.com/a/145633 -} -- ---------------------------------------------------------------------------- module Hunt.DocTable.HashedDocTable ( -- * Documents type Documents (..) , DocMap -- * Conversion , fromMap ) where import Data.Binary (Binary (..)) import Control.DeepSeq import Hunt.Common.BasicTypes import Hunt.Common.DocId (DocId, mkDocId) import Hunt.Common.DocIdMap (DocIdMap) import qualified Hunt.Common.DocIdMap as DM import Hunt.Common.DocIdSet (DocIdSet) import Hunt.Common.Document (Document (..), DocumentWrapper (..)) import Hunt.DocTable import Hunt.Utility -- ------------------------------------------------------------ -- | The table which is used to map a document to an artificial id and vice versa. type DocMap e = DocIdMap e -- | The 'DocTable' implementation. Maps 'DocId's to 'Document's. newtype Documents e = Documents { idToDoc :: DocMap e } -- ^ A mapping from a document id to -- the document itself. deriving (Eq, Show) mkDocuments :: NFData e => DocMap e -> Documents e mkDocuments m = Documents $! m -- ------------------------------------------------------------ instance NFData e => NFData (Documents e) where rnf (Documents e) = rnf e instance (DocumentWrapper e, Binary e) => Binary (Documents e) where put = put . idToDoc get = get >>= return . mkDocuments --- ------------------------------------------------------------ -- | An empty document table. empty' :: (DocTable (Documents e), DocumentWrapper e) => Documents e empty' = mkDocuments DM.empty -- | Build a 'DocTable' from a 'DocIdMap' (maps 'DocId's to 'Document's) fromMap :: (DocTable (Documents e), DocumentWrapper e) => (Document -> e) -> DocIdMap Document -> Documents e fromMap = fromMap' -- ------------------------------------------------------------ instance (DocumentWrapper e) => DocTable (Documents e) where type DValue (Documents e) = e null = return . null' -- Returns the number of unique documents in the table. size = return . size' -- Lookup a document by its id. lookup = return .:: lookup' -- Lookup the id of a document by an URI. lookupByURI = return .:: lookupByURI' -- Union of two disjoint document tables. It is assumed, that the DocIds and the document uris -- of both indexes are disjoint. If only the sets of uris are disjoint, the DocIds can be made -- disjoint by adding maxDocId of one to the DocIds of the second, e.g. with editDocIds union = return .:: unionDocs' -- Test whether the doc ids of both tables are disjoint. disjoint = return .:: disjoint' -- Insert a document into the table. Returns a tuple of the id for that document and the -- new table. If a document with the same URI is already present, its id will be returned -- and the table is returned unchanged. insert = return .:: insert' -- Update a document with a certain DocId. update = return .::: update' -- Removes the document with the specified id from the table. delete = return .:: delete' -- Deletes a set of Docs by Id from the table. difference = return .:: difference' -- Update documents (through mapping over all documents). map = return .:: map' -- Filters all documents that satisfy the predicate. filter = return .:: filter' -- Convert document table to a single map. toMap = return . toMap' -- | Empty 'DocTable'. empty = empty' -- ------------------------------------------------------------ null' :: (DocumentWrapper e) => Documents e -> Bool null' = DM.null . idToDoc size' :: (DocumentWrapper e) => Documents e -> Int size' = DM.size . idToDoc lookup' :: (Monad m, DocumentWrapper e) => DocId -> Documents e -> m e lookup' i d = maybe (fail "") return . DM.lookup i . idToDoc $ d lookupByURI' :: (Monad m, DocumentWrapper e) => URI -> Documents e -> m DocId lookupByURI' u d = maybe (fail "") (const $ return i) . DM.lookup i . idToDoc $ d where i = mkDocId u disjoint' :: (DocumentWrapper e) => Documents e -> Documents e -> Bool disjoint' dt1 dt2 = DM.null $ DM.intersection (idToDoc dt1) (idToDoc dt2) unionDocs' :: (DocumentWrapper e) => Documents e -> Documents e -> Documents e unionDocs' dt1 dt2 | disjoint' dt1 dt2 = unionDocs'' dt1 dt2 | otherwise = error "HashedDocTable.unionDocs: doctables are not disjoint" where unionDocs'' :: (DocumentWrapper e) => Documents e -> Documents e -> Documents e unionDocs'' dt1' dt2' = mkDocuments $ idToDoc dt1' `DM.union` idToDoc dt2' insert' :: (DocumentWrapper e) => e -> Documents e -> (DocId, Documents e) insert' d ds = maybe reallyInsert (const (newId, ds)) (lookup' newId ds) where newId = mkDocId . uri . unwrap $ d reallyInsert = (newId, mkDocuments $ DM.insert newId d $ idToDoc ds) update' :: (DocumentWrapper e) => DocId -> e -> Documents e -> Documents e update' i d ds = mkDocuments $ DM.insert i d $ idToDoc ds delete' :: (DocumentWrapper e) => DocId -> Documents e -> Documents e delete' d ds = mkDocuments $ DM.delete d $ idToDoc ds difference' :: (DocumentWrapper e) => DocIdSet -> Documents e -> Documents e difference' s ds = mkDocuments $ idToDoc ds `DM.diffWithSet` s map' :: (DocumentWrapper e) => (e -> e) -> Documents e -> Documents e map' f d = mkDocuments $ DM.map f (idToDoc d) filter' :: (DocumentWrapper e) => (e -> Bool) -> Documents e -> Documents e filter' p d = mkDocuments $ DM.filter p (idToDoc d) fromMap' :: (DocumentWrapper e) => (Document -> e) -> DocIdMap Document -> Documents e fromMap' f itd = mkDocuments $ DM.map f itd toMap' :: Documents e -> DocIdMap e toMap' = idToDoc -- ------------------------------------------------------------