module Holumbus.Index.HashedDocuments
(
Documents (..)
, CompressedDoc(..)
, DocMap
, emptyDocuments
, singleton
, toDocument
, fromDocument
, fromDocMap
, toDocMap
)
where
import qualified Codec.Compression.BZip as BZ
import Control.DeepSeq
import Data.Binary ( Binary )
import qualified Data.Binary as B
import Data.ByteString.Lazy ( ByteString )
import qualified Data.ByteString.Lazy as BS
import Data.Digest.Pure.SHA
import Holumbus.Index.Common
import Text.XML.HXT.Core
type DocMap a
= DocIdMap (CompressedDoc a)
newtype CompressedDoc a
= CDoc { unCDoc :: ByteString }
deriving (Eq, Show)
newtype Documents a
= Documents { idToDoc :: DocMap a }
deriving (Eq, Show, NFData)
docToId :: URI -> DocId
docToId = mkDocId . integerDigest . sha1 . B.encode
instance (Binary a, HolIndex i) => HolDocIndex Documents a i where
unionDocIndex dt1 ix1 dt2 ix2
| s1 == 0
= (dt2, ix2)
| s2 == 0
= (dt1, ix1)
| s1 < s2
= unionDocIndex dt2 ix2 dt1 ix1
| otherwise
= (dt, ix)
where
dt = unionDocs' dt1 dt2
ix = mergeIndexes ix1 ix2
s1 = sizeDocs dt1
s2 = sizeDocs dt2
toDocument :: (Binary a) => CompressedDoc a -> Document a
toDocument = B.decode . BZ.decompress . unCDoc
fromDocument :: (Binary a) => Document a -> CompressedDoc a
fromDocument = CDoc . BZ.compress . B.encode
mapDocument :: (Binary a) => (Document a -> Document a) -> CompressedDoc a -> CompressedDoc a
mapDocument f = fromDocument . f . toDocument
toDocMap :: (Binary a) => DocIdMap (Document a) -> DocMap a
toDocMap = mapDocIdMap fromDocument
fromDocMap :: (Binary a) => DocMap a -> DocIdMap (Document a)
fromDocMap = mapDocIdMap toDocument
instance Binary a => HolDocuments Documents a where
sizeDocs
= sizeDocIdMap . idToDoc
lookupById d i
= maybe (fail "") return
. fmap toDocument
. lookupDocIdMap i
. idToDoc
$ d
lookupByURI d u
= maybe (fail "") (const $ return i)
. lookupDocIdMap i
. idToDoc
$ d
where
i = docToId u
disjointDocs dt1 dt2
= nullDocIdMap $ intersectionDocIdMap (idToDoc dt1) (idToDoc dt2)
unionDocs dt1 dt2
| disjointDocs dt1 dt2
= unionDocs' dt1 dt2
| otherwise
= error $
"HashedDocuments.unionDocs: doctables are not disjoint"
makeEmpty _
= emptyDocuments
insertDoc ds d
= maybe reallyInsert (const (newId, ds)) (lookupById ds newId)
where
newId
= docToId . uri $ d
d' = fromDocument d
reallyInsert
= rnf d' `seq`
(newId, Documents {idToDoc = insertDocIdMap newId d' $ idToDoc ds})
updateDoc ds i d
= rnf d' `seq`
Documents {idToDoc = insertDocIdMap i d' $ idToDoc ds}
where
d' = fromDocument d
removeById ds d
= Documents {idToDoc = deleteDocIdMap d $ idToDoc ds}
updateDocuments f d
= Documents {idToDoc = mapDocIdMap (mapDocument f) (idToDoc d)}
filterDocuments p d
= Documents {idToDoc = filterDocIdMap (p . toDocument) (idToDoc d)}
fromMap itd'
= Documents {idToDoc = toDocMap itd'}
toMap
= fromDocMap . idToDoc
instance (Binary a, XmlPickler a) => XmlPickler (Documents a) where
xpickle
= xpElem "documents" $
xpWrap convertDoctable $
xpWrap (fromListDocIdMap, toListDocIdMap) $
xpList xpDocumentWithId
where
convertDoctable
= (Documents, idToDoc)
xpDocumentWithId
= xpElem "doc" $
xpPair (xpAttr "id" xpDocId) xpickle
instance Binary a => Binary (Documents a) where
put = B.put . idToDoc
get = fmap Documents B.get
instance (Binary a, XmlPickler a) => XmlPickler (CompressedDoc a) where
xpickle
= xpWrap (fromDocument , toDocument) $
xpickle
instance Binary a => Binary (CompressedDoc a) where
put = B.put . unCDoc
get = B.get >>= return . CDoc
instance NFData (CompressedDoc a) where
rnf (CDoc s)
= BS.length s `seq` ()
emptyDocuments :: Documents a
emptyDocuments
= Documents emptyDocIdMap
unionDocs' :: Documents a -> Documents a -> Documents a
unionDocs' dt1 dt2
= Documents
{ idToDoc = unionDocIdMap (idToDoc dt1) (idToDoc dt2) }
singleton :: (Binary a) => Document a -> Documents a
singleton d
= rnf d' `seq`
Documents {idToDoc = singletonDocIdMap (docToId . uri $ d) d'}
where
d' = fromDocument d