module Holumbus.Index.CompactDocuments
(
Documents (..)
, CompressedDoc(..)
, DocMap
, URIMap
, emptyDocuments
, singleton
, simplify
, toDocument
, fromDocument
, fromDocMap
, toDocMap
)
where
import qualified Codec.Compression.BZip as BZ
import Control.DeepSeq
import Data.Binary ( Binary )
import qualified Data.Binary as B
import Data.ByteString.Lazy ( ByteString )
import qualified Data.ByteString.Lazy as BS
import Data.Maybe ( fromJust )
import qualified Holumbus.Data.PrefixTree as M
import Holumbus.Index.Common
import Text.XML.HXT.Core
type URIMap = M.PrefixTree DocId
type DocMap a = DocIdMap (CompressedDoc a)
newtype CompressedDoc a = CDoc { unCDoc :: ByteString }
deriving (Eq, Show)
data Documents a = Documents
{ idToDoc :: ! (DocMap a)
, docToId :: ! URIMap
, lastDocId :: ! DocId
}
deriving (Show)
instance (Binary a, HolIndex i) => HolDocIndex Documents a i where
defragmentDocIndex dt ix = (dt1, ix1)
where
dt1 = editDocIds editId dt
ix1 = updateDocIds' editId ix
editId i = fromJust . lookupDocIdMap i $ idMap
idMap = fromListDocIdMap . flip zip (map mkDocId [1..]) . keysDocIdMap . toMap $ dt
unionDocIndex dt1 ix1 dt2 ix2
| s1 == 0 = (dt2, ix2)
| s2 == 0 = (dt1, ix1)
| s1 < s2 = unionDocIndex dt2 ix2 dt1 ix1
| otherwise = (dt, ix)
where
dt = unionDocs dt1 dt2s
ix = mergeIndexes ix1 ix2s
dt2s = editDocIds add1 dt2
ix2s = updateDocIds' add1 ix2
add1 = addDocId disp
max1 = maxKeyDocIdMap . toMap $ dt1
min2 = minKeyDocIdMap . toMap $ dt2
disp = incrDocId $ subDocId max1 min2
s1 = sizeDocs dt1
s2 = sizeDocs dt2
toDocument :: (Binary a) => CompressedDoc a -> Document a
toDocument = B.decode . BZ.decompress . unCDoc
fromDocument :: (Binary a) => Document a -> CompressedDoc a
fromDocument = CDoc . BZ.compress . B.encode
mapDocument :: (Binary a) => (Document a -> Document a) -> CompressedDoc a -> CompressedDoc a
mapDocument f = fromDocument . f . toDocument
toDocMap :: (Binary a) => DocIdMap (Document a) -> DocMap a
toDocMap = mapDocIdMap fromDocument
fromDocMap :: (Binary a) => DocMap a -> DocIdMap (Document a)
fromDocMap = mapDocIdMap toDocument
instance Binary a => HolDocuments Documents a where
sizeDocs d = sizeDocIdMap (idToDoc d)
lookupById d i = maybe (fail "") return
. fmap toDocument
. lookupDocIdMap i
$ idToDoc d
lookupByURI d u = maybe (fail "") return
. M.lookup u
$ docToId d
disjointDocs dt1 dt2
| nullDocs dt1
||
nullDocs dt2 = True
| otherwise = disjoint ( minKeyDocIdMap . idToDoc $ dt1
, maxKeyDocIdMap . idToDoc $ dt1
)
( minKeyDocIdMap . idToDoc $ dt2
, maxKeyDocIdMap . idToDoc $ dt2
)
where
disjoint p1@(x1, y1) p2@(x2, _y2)
| x1 <= x2 = y1 < x2
| otherwise = disjoint p2 p1
unionDocs dt1 dt2
| disjointDocs dt1 dt2 = Documents
{ idToDoc = unionDocIdMap (idToDoc dt1) (idToDoc dt2)
, docToId = M.union (docToId dt1) (docToId dt2)
, lastDocId = lastDocId dt1 `max` lastDocId dt2
}
| otherwise = error $
"unionDocs: doctables are not disjoint: " ++
show (didIntervall dt1) ++ ", " ++ show (didIntervall dt2)
where
didIntervall dt = ( minKeyDocIdMap . idToDoc $ dt
, maxKeyDocIdMap . idToDoc $ dt
)
makeEmpty _ = emptyDocuments
insertDoc ds d = maybe reallyInsert (\oldId -> (oldId, ds)) (lookupByURI ds (uri d))
where
d' = fromDocument d
reallyInsert = rnf d' `seq`
(newId, Documents newIdToDoc newDocToId newId)
where
newIdToDoc = insertDocIdMap newId d' (idToDoc ds)
newDocToId = M.insert (uri d) newId (docToId ds)
newId = incrDocId (lastDocId ds)
updateDoc ds i d = rnf d' `seq`
ds
{ idToDoc = insertDocIdMap i d' (idToDoc ds)
, docToId = M.insert (uri d) i (docToId (removeById ds i))
}
where
d' = fromDocument d
removeById ds d = maybe ds reallyRemove (lookupById ds d)
where
reallyRemove (Document _ u _)
= Documents
(deleteDocIdMap d (idToDoc ds))
(M.delete u (docToId ds))
(lastDocId ds)
updateDocuments f d = Documents updated (idToDoc2docToId updated) (lastId updated)
where
updated = mapDocIdMap (mapDocument f) (idToDoc d)
filterDocuments p d = Documents filtered (idToDoc2docToId filtered) (lastId filtered)
where
filtered = filterDocIdMap (p . toDocument) (idToDoc d)
fromMap itd' = Documents itd (idToDoc2docToId itd) (lastId itd)
where
itd = toDocMap itd'
toMap = fromDocMap . idToDoc
editDocIds f d = Documents
{ idToDoc = newIdToDoc
, docToId = M.map f $ docToId d
, lastDocId = lastId newIdToDoc
}
where
newIdToDoc = foldWithKeyDocIdMap (insertDocIdMap . f) emptyDocIdMap
$ idToDoc d
instance Eq a => Eq (Documents a)
where
(==) (Documents i2da d2ia _) (Documents i2db d2ib _)
= (i2da == i2db)
&&
(d2ia == d2ib)
instance NFData a => NFData (Documents a)
where
rnf (Documents i2d d2i lid) = rnf i2d `seq` rnf d2i `seq` rnf lid
instance (Binary a, XmlPickler a) =>
XmlPickler (Documents a)
where
xpickle = xpElem "documents" $
xpWrap convertDoctable $
xpWrap (fromListDocIdMap, toListDocIdMap) $
xpList xpDocumentWithId
where
convertDoctable = ( \ itd -> Documents itd (idToDoc2docToId itd) (lastId itd)
, \ (Documents itd _ _) -> itd
)
xpDocumentWithId = xpElem "doc" $
xpPair (xpAttr "id" xpDocId) xpickle
instance Binary a => Binary (Documents a)
where
put (Documents i2d _ lid) = B.put lid >> B.put i2d
get = do lid <- B.get
i2d <- B.get
return (Documents i2d (idToDoc2docToId i2d) lid)
instance (Binary a, XmlPickler a) =>
XmlPickler (CompressedDoc a)
where
xpickle = xpWrap (fromDocument , toDocument) $
xpickle
instance Binary a => Binary (CompressedDoc a)
where
put = B.put . unCDoc
get = B.get >>= return . CDoc
instance NFData (CompressedDoc a)
where
rnf (CDoc s) = BS.length s `seq` ()
emptyDocuments :: Documents a
emptyDocuments = Documents emptyDocIdMap M.empty nullDocId
singleton :: (Binary a) => Document a -> Documents a
singleton d = rnf d' `seq`
Documents
(singletonDocIdMap firstDocId d')
(M.singleton (uri d) firstDocId)
firstDocId
where
d' = fromDocument d
simplify :: (Binary a, Show a) => Documents a -> Documents String
simplify dt = Documents (simple (idToDoc dt)) (docToId dt) (lastDocId dt)
where
simple i2d = mapDocIdMap
( fromDocument
. (\d -> Document (title d) (uri d) (maybe Nothing (Just . show) (custom d)))
. toDocument
) i2d
idToDoc2docToId :: Binary a => DocMap a -> URIMap
idToDoc2docToId = foldWithKeyDocIdMap
(\i d r -> M.insert (uri . toDocument $ d) i r)
M.empty
lastId :: DocMap a -> DocId
lastId = maxKeyDocIdMap