module Holumbus.Index.Inverted.PrefixMem
(
Inverted (..)
, Parts
, Part
, singleton
, emptyInverted
)
where
import Control.DeepSeq
import Data.Binary hiding (Word)
import Data.Function
import Data.List
import Data.Map (Map)
import qualified Data.Map as M
import Data.Maybe
import qualified Holumbus.Data.PrefixTree as PT
import Holumbus.Index.Common
import Holumbus.Index.Compression
import Text.XML.HXT.Core
newtype Inverted = Inverted { indexParts :: Parts }
deriving (Show, Eq)
type Parts = Map Context Part
type Part = PT.PrefixTree CompressedOccurrences
instance HolIndex Inverted where
sizeWords = M.fold ((+) . PT.size) 0 . indexParts
contexts = map fst . M.toList . indexParts
allWords i c = map (\(w, o) -> (w, inflateOcc o)) $ PT.toList $ getPart c i
prefixCase i c q = map (\(w, o) -> (w, inflateOcc o)) $ PT.prefixFindWithKey q $ getPart c i
prefixNoCase i c q = map (\(w, o) -> (w, inflateOcc o)) $ PT.prefixFindNoCaseWithKey q $ getPart c i
lookupCase i c q = map (\ o -> (q, inflateOcc o)) $ maybeToList (PT.lookup q $ getPart c i)
lookupNoCase i c q = map (\(w, o) -> (w, inflateOcc o)) $ PT.lookupNoCase q $ getPart c i
mergeIndexes i1 i2 = Inverted (mergeParts (indexParts i1) (indexParts i2))
substractIndexes i1 i2 = Inverted (substractParts (indexParts i1) (indexParts i2))
insertOccurrences c w o i = mergeIndexes (singleton c w o) i
deleteOccurrences c w o i = substractIndexes i (singleton c w o)
splitByContexts (Inverted ps) = splitInternal (map (uncurry annotate) . M.toList $ ps)
where
annotate c p = let
i = Inverted (M.singleton c p)
in
(sizeWords i, i)
splitByDocuments i = splitInternal ( map convert $
toListDocIdMap $
unionsWithDocIdMap unionDocs' docResults
)
where
unionDocs' = M.unionWith (M.unionWith unionPos)
docResults = map (\c -> resultByDocument c (allWords i c)) (contexts i)
convert (d, cs) = foldl' makeIndex (0, emptyInverted) (M.toList cs)
where
makeIndex r (c, ws) = foldl' makeOcc r (M.toList ws)
where
makeOcc (rs, ri) (w, p) = (sizePos p + rs , insertOccurrences c w (singletonDocIdMap d p) ri)
splitByWords i = splitInternal indexes
where
indexes = map convert $
M.toList $
M.unionsWith (M.unionWith mergeOccurrences) wordResults
where
wordResults = map (\c -> resultByWord c (allWords i c)) (contexts i)
convert (w, cs) = foldl' makeIndex (0, emptyInverted) (M.toList cs)
where
makeIndex (rs, ri) (c, o) = (rs + sizeOccurrences o, insertOccurrences c w o ri)
updateDocIds f (Inverted parts)
= Inverted (M.mapWithKey updatePart parts)
where
updatePart c p = PT.mapWithKey
(\w o -> foldWithKeyDocIdMap (updateDocument c w) emptyDocIdMap o) p
updateDocument c w d p r = insertWithDocIdMap mergePositions (f c w d) p r
where
mergePositions p1 p2 = deflatePos $ unionPos (inflatePos p1) (inflatePos p2)
updateDocIds' f
= Inverted . M.map updatePart . indexParts
where
updatePart = PT.map updateOcc
updateOcc = foldWithKeyDocIdMap updateId emptyDocIdMap
updateId = insertDocIdMap . f
toList i = concat $ map convertPart $ M.toList (indexParts i)
where convertPart (c,p) = map (\(w, o) -> (c, w, inflateOcc o)) $
PT.toList $
p
instance NFData Inverted where
rnf = rnf . indexParts
instance XmlPickler Inverted where
xpickle = xpElem "indexes" $
xpWrap (\p -> Inverted p, \(Inverted p) -> p) xpParts
xpParts :: PU Parts
xpParts = xpWrap (M.fromList, M.toList) (xpList xpContext)
where
xpContext = xpElem "part" (xpPair (xpAttr "id" xpText) xpPart)
xpPart :: PU Part
xpPart = xpElem "index" (xpWrap (PT.fromList, PT.toList) (xpList xpWord))
where
xpWord = xpElem "word" $
xpPair (xpAttr "w" xpText)
(xpWrap (deflateOcc, inflateOcc) xpOccurrences)
instance Binary Inverted where
put = put . indexParts
get = get >>= return . Inverted
emptyInverted :: Inverted
emptyInverted = Inverted M.empty
singleton :: Context -> String -> Occurrences -> Inverted
singleton c w o = Inverted (M.singleton c (PT.singleton w (deflateOcc o)))
mergeParts :: Parts -> Parts -> Parts
mergeParts = M.unionWith mergePart
mergePart :: Part -> Part -> Part
mergePart = PT.unionWith mergeDiffLists
where
mergeDiffLists o1 o2 = deflateOcc $
mergeOccurrences (inflateOcc o1) (inflateOcc o2)
substractParts :: Parts -> Parts -> Parts
substractParts = M.differenceWith substractPart
substractPart :: Part -> Part -> Maybe Part
substractPart p1 p2 = if PT.null diffPart then Nothing else Just diffPart
where
diffPart = PT.differenceWith substractDiffLists p1 p2
where
substractDiffLists o1 o2 = if diffOcc == emptyOccurrences then Nothing else Just (deflateOcc diffOcc)
where
diffOcc = substractOccurrences (inflateOcc o1) (inflateOcc o2)
splitInternal :: [(Int, Inverted)] -> Int -> [Inverted]
splitInternal inp n = allocate mergeIndexes stack buckets
where
buckets = zipWith const (createBuckets n) stack
stack = reverse (sortBy (compare `on` fst) inp)
allocate :: (a -> a -> a) -> [(Int, a)] -> [(Int, a)] -> [a]
allocate _ _ [] = []
allocate _ [] ys = map snd ys
allocate f (x:xs) (y:ys) = allocate f xs (sortBy (compare `on` fst) ((combine x y):ys))
where
combine (s1, v1) (s2, v2) = (s1 + s2, f v1 v2)
createBuckets :: Int -> [(Int, Inverted)]
createBuckets n = (replicate n (0, emptyInverted))
getPart :: Context -> Inverted -> Part
getPart c i = fromMaybe PT.empty (M.lookup c $ indexParts i)