module Holumbus.Index.Inverted.CompressedPrefixMem
( Inverted(..)
, Parts
, Part
, Inverted0
, InvertedCompressed
, InvertedSerialized
, InvertedCSerialized
, InvertedOSerialized
, ComprOccurrences(..)
, emptyInverted0
, emptyInvertedCompressed
, emptyInvertedSerialized
, emptyInvertedCSerialized
, emptyInvertedOSerialized
, Sizeof(..)
, sizeofAttrsInverted
, mapOcc
, zipOcc
, emptyOcc
, theOcc
, nullOcc
, unionOcc
, diffOcc
, insertPosOcc
, deletePosOcc
, updateDocIdOcc
, deleteDocIds
, removeDocIdsInverted
)
where
import qualified Codec.Compression.BZip as BZ
import Control.DeepSeq
import qualified Data.ByteString.Lazy as BS
import qualified Data.Binary as B
import Data.Function ( on )
import Data.Int
import Data.List ( foldl', sortBy )
import qualified Data.Map as M
import Data.Maybe
import Holumbus.Index.Common
import Holumbus.Index.Compression
import qualified Holumbus.Data.PrefixTree as PT
import Text.XML.HXT.Core
class ComprOccurrences s where
toOccurrences :: s -> Occurrences
fromOccurrences :: Occurrences -> s
mapOcc :: (ComprOccurrences s) => (Occurrences -> Occurrences) -> s -> s
mapOcc f = fromOccurrences . f . toOccurrences
zipOcc :: (ComprOccurrences s) => (Occurrences -> Occurrences -> Occurrences) -> s -> s -> s
zipOcc op x y = fromOccurrences $ op (toOccurrences x)(toOccurrences y)
emptyOcc :: (ComprOccurrences s) => s
emptyOcc = fromOccurrences $ emptyOccurrences
theOcc :: (ComprOccurrences s) => s -> Occurrences
theOcc = toOccurrences
nullOcc :: (ComprOccurrences s) => s -> Bool
nullOcc = (== emptyOccurrences) . toOccurrences
unionOcc :: (ComprOccurrences s) => Occurrences -> s -> s
unionOcc os = mapOcc $ mergeOccurrences os
diffOcc :: (ComprOccurrences s) => Occurrences -> s -> s
diffOcc os = mapOcc $ substractOccurrences os
insertPosOcc :: (ComprOccurrences s) => DocId -> Position -> s -> s
insertPosOcc d p = mapOcc $ insertOccurrence d p
deletePosOcc :: (ComprOccurrences s) => DocId -> Position -> s -> s
deletePosOcc d p = mapOcc $ deleteOccurrence d p
updateDocIdOcc :: (ComprOccurrences s) => (DocId -> DocId) -> s -> s
updateDocIdOcc f = mapOcc $ updateOccurrences f
deleteDocIds :: (ComprOccurrences s) => Occurrences -> s -> s
deleteDocIds ids = mapOcc $ flip diffOccurrences ids
class Sizeof a where
sizeof :: a -> Int64
newtype ByteString = Bs { unBs :: BS.ByteString }
deriving (Eq, Show)
instance NFData ByteString where
rnf s = BS.length (unBs s) `seq` ()
instance B.Binary ByteString where
put = B.put . unBs
get = B.get >>= return . Bs
instance Sizeof ByteString where
sizeof = (8 + ) . BS.length . unBs
newtype Occ0 = Occ0 { unOcc0 :: Occurrences }
instance ComprOccurrences Occ0 where
fromOccurrences = Occ0
toOccurrences = unOcc0
instance NFData Occ0 where
rnf = rnf . unOcc0
instance B.Binary Occ0 where
put = B.put . unOcc0
get = B.get >>= return . Occ0
instance Sizeof Occ0 where
sizeof = BS.length . B.encode . unOcc0
newtype OccCompressed = OccCp { unOccCp :: CompressedOccurrences }
deriving (Eq, Show)
instance ComprOccurrences OccCompressed where
fromOccurrences = OccCp . deflateOcc
toOccurrences = inflateOcc . unOccCp
instance NFData OccCompressed where
rnf = rnf . unOccCp
instance B.Binary OccCompressed where
put = B.put . unOccCp
get = B.get >>= return . OccCp
instance Sizeof OccCompressed where
sizeof = BS.length . B.encode . unOccCp
newtype OccSerialized = OccBs { unOccBs :: ByteString }
deriving (Eq, Show)
instance ComprOccurrences OccSerialized where
fromOccurrences = OccBs . Bs . B.encode . deflateOcc
toOccurrences = inflateOcc . B.decode . unBs . unOccBs
instance NFData OccSerialized where
rnf = rnf . unOccBs
instance B.Binary OccSerialized where
put = B.put . unOccBs
get = B.get >>= return . OccBs
instance Sizeof OccSerialized where
sizeof = sizeof . unOccBs
newtype OccCSerialized = OccCBs { unOccCBs :: ByteString }
deriving (Eq, Show)
instance ComprOccurrences OccCSerialized where
fromOccurrences = OccCBs . Bs . BZ.compress . B.encode . deflateOcc
toOccurrences = inflateOcc . B.decode . BZ.decompress . unBs . unOccCBs
instance NFData OccCSerialized where
rnf = rnf . unOccCBs
instance B.Binary OccCSerialized where
put = B.put . unOccCBs
get = B.get >>= return . OccCBs
instance Sizeof OccCSerialized where
sizeof = sizeof . unOccCBs
newtype OccOSerialized = OccOBs { unOccOBs :: ByteString }
deriving (Eq, Show)
instance ComprOccurrences OccOSerialized where
fromOccurrences = OccOBs . Bs . BZ.compress . B.encode
toOccurrences = B.decode . BZ.decompress . unBs . unOccOBs
instance NFData OccOSerialized where
rnf = rnf . unOccOBs
instance B.Binary OccOSerialized where
put = B.put . unOccOBs
get = B.get >>= return . OccOBs
instance Sizeof OccOSerialized where
sizeof = sizeof . unOccOBs
newtype Inverted occ = Inverted
{ unInverted :: Parts occ
}
deriving (Show, Eq)
type Parts occ = M.Map Context (Part occ)
type Part occ = PT.PrefixTree occ
instance (NFData occ) => NFData (Inverted occ) where
rnf = rnf . unInverted
instance (ComprOccurrences occ) => XmlPickler (Inverted occ) where
xpickle = xpElem "indexes" $
xpWrap (Inverted, unInverted) xpParts
xpParts :: (ComprOccurrences occ) => PU (Parts occ)
xpParts = xpWrap (M.fromList, M.toList) (xpList xpContext)
where
xpContext = xpElem "part" (xpPair (xpAttr "id" xpText) xpPart)
xpPart :: (ComprOccurrences occ) => PU (Part occ)
xpPart = xpElem "index" (xpWrap (PT.fromList, PT.toList) (xpList xpWord))
where
xpWord = xpElem "word" $
xpPair (xpAttr "w" xpText)
(xpWrap (fromOccurrences, toOccurrences) xpOccurrences)
instance (B.Binary occ) => B.Binary (Inverted occ) where
put = B.put . unInverted
get = B.get >>= return . Inverted
instance (B.Binary occ, ComprOccurrences occ) => HolIndex (Inverted occ) where
sizeWords = M.fold ((+) . PT.size) 0 . unInverted
contexts = fmap fst . M.toList . unInverted
allWords i c = fmap (second toOccurrences) . PT.toList . getPart c $ i
prefixCase i c q = fmap (second toOccurrences) . PT.prefixFindWithKeyBF q . getPart c $ i
prefixNoCase i c q = fmap (second toOccurrences) . PT.prefixFindNoCaseWithKeyBF q . getPart c $ i
lookupCase i c q = fmap ((,) q . toOccurrences) . maybeToList . PT.lookup q . getPart c $ i
lookupNoCase i c q = fmap (second toOccurrences) . PT.lookupNoCase q . getPart c $ i
mergeIndexes = zipInverted $ M.unionWith $ PT.unionWith (zipOcc mergeOccurrences)
substractIndexes = zipInverted $ M.differenceWith $ substractPart
insertOccurrences c w o i = mergeIndexes i (singletonInverted c w o)
deleteOccurrences c w o i = substractIndexes i (singletonInverted c w o)
toList = concatMap (uncurry convertPart) . toListInverted
where
convertPart c = map (\(w, o) -> (c, w, toOccurrences o)) . PT.toList
splitByContexts = splitInverted
. map ( (\ i -> (sizeWords i, i))
. Inverted
. uncurry M.singleton
)
. toListInverted
splitByDocuments i = splitInverted ( map (uncurry convert) $
toListDocIdMap $
unionsWithDocIdMap (M.unionWith (M.unionWith unionPos)) docResults
)
where
docResults = map (\c -> resultByDocument c (allWords i c)) (contexts i)
convert d cs = foldl' makeIndex (0, emptyInverted) (M.toList cs)
where
makeIndex r (c, ws) = foldl' makeOcc r (M.toList ws)
where
makeOcc (rs, ri) (w, p) = (sizePos p + rs, insertOccurrences c w (singletonDocIdMap d p) ri)
splitByWords i = splitInverted ( map (uncurry convert) .
M.toList .
M.unionsWith (M.unionWith mergeOccurrences) $ wordResults
)
where
wordResults = map (\c -> resultByWord c (allWords i c)) (contexts i)
convert w cs = foldl' makeIndex (0, emptyInverted) (M.toList cs)
where
makeIndex (rs, ri) (c, o) = (rs + sizeOccurrences o, insertOccurrences c w o ri)
updateDocIds f = mapInverted (M.mapWithKey updatePart)
where
updatePart c = PT.mapWithKey (updateOcc (f c))
updateOcc f' w = mapOcc $ updateOccurrences (f' w)
getPart :: Context -> Inverted i -> Part i
getPart c = fromMaybe PT.empty . M.lookup c . unInverted
substractPart :: (ComprOccurrences i) => Part i -> Part i -> Maybe (Part i)
substractPart p1 p2
| PT.null res = Nothing
| otherwise = Just res
where
res = diffPart p1 p2
diffPart :: (ComprOccurrences i) => Part i -> Part i -> Part i
diffPart = PT.differenceWith subtractDiffLists
where
subtractDiffLists o1 o2
| nullOcc res = Nothing
| otherwise = Just res
where
res = zipOcc substractOccurrences o1 o2
removeDocIdsPart :: (ComprOccurrences i) => Occurrences -> Part i -> Part i
removeDocIdsPart ids = PT.foldWithKey removeDocIds PT.empty
where
removeDocIds k occ acc
| nullOcc occ' = acc
| otherwise = PT.insert k occ' acc
where
occ' = deleteDocIds ids occ
mapInverted :: (Parts i -> Parts i) -> Inverted i -> Inverted i
mapInverted f = Inverted . f . unInverted
zipInverted :: (Parts i -> Parts i -> Parts i) -> Inverted i -> Inverted i -> Inverted i
zipInverted op i1 i2 = Inverted $ op (unInverted i1) (unInverted i2)
emptyInverted :: Inverted i
emptyInverted = Inverted M.empty
toListInverted :: Inverted i -> [(Context, Part i)]
toListInverted = M.toList . unInverted
singletonInverted :: (ComprOccurrences i) => Context -> String -> Occurrences -> Inverted i
singletonInverted c w o = Inverted . M.singleton c . PT.singleton w . fromOccurrences $ o
sizeofAttrsInverted :: (Sizeof i) => Inverted i -> Int64
sizeofAttrsInverted = M.fold ((+) . sizeofPT) 0 . unInverted
where
sizeofPT = PT.fold ((+) . sizeof) 0
removeDocIdsInverted :: (ComprOccurrences i) => Occurrences -> Inverted i -> Inverted i
removeDocIdsInverted ids = mapInverted $ M.map (removeDocIdsPart ids)
splitInverted :: (B.Binary i, ComprOccurrences i) => [(Int, Inverted i)] -> Int -> [Inverted i]
splitInverted inp n = allocate mergeIndexes stack buckets
where
buckets = zipWith const (createBuckets n) stack
stack = reverse (sortBy (compare `on` fst) inp)
allocate :: (a -> a -> a) -> [(Int, a)] -> [(Int, a)] -> [a]
allocate _ _ [] = []
allocate _ [] ys = map snd ys
allocate f (x:xs) (y:ys) = allocate f xs (sortBy (compare `on` fst) ((combine x y):ys))
where
combine (s1, v1) (s2, v2) = (s1 + s2, f v1 v2)
createBuckets :: Int -> [(Int, Inverted i)]
createBuckets n = (replicate n (0, emptyInverted))
type Inverted0 = Inverted Occ0
type InvertedCompressed = Inverted OccCompressed
type InvertedSerialized = Inverted OccSerialized
type InvertedCSerialized = Inverted OccCSerialized
type InvertedOSerialized = Inverted OccOSerialized
emptyInverted0 :: Inverted0
emptyInverted0 = emptyInverted
emptyInvertedCompressed :: InvertedCompressed
emptyInvertedCompressed = emptyInverted
emptyInvertedSerialized :: InvertedSerialized
emptyInvertedSerialized = emptyInverted
emptyInvertedCSerialized :: InvertedCSerialized
emptyInvertedCSerialized = emptyInverted
emptyInvertedOSerialized :: InvertedOSerialized
emptyInvertedOSerialized = emptyInverted