src/Holumbus/Index/Inverted/PrefixMem.hs

{-# OPTIONS #-}

-- ----------------------------------------------------------------------------

{- |
  Module     : Holumbus.Index.Inverted.PrefixMem
  Copyright  : Copyright (C) 2007 - 2009 Sebastian M. Schlatt, Timo B. Huebel, Uwe Schmidt
  License    : MIT
  
  Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
  Stability  : experimental
  Portability: portable
  
  A variant of the Inverted.Memory index with an optimized prefix tree
  instead of a trie as central data structure. This version should be
  more space efficient as the trie and more runtime efficient when combining
  whole indexes.

  For switching from Memory to this module, only the import has to be modified

-}

-- ----------------------------------------------------------------------------

module Holumbus.Index.Inverted.PrefixMem 
    (
     -- * Inverted index types
     Inverted (..)
    , Parts
    , Part
  
    -- * Construction
    , singleton
    , emptyInverted
    )
where

import           Control.DeepSeq

import           Data.Binary                    hiding (Word)
import           Data.Function
import           Data.List
import           Data.Map                       (Map)
import qualified Data.Map                       as M
import           Data.Maybe

import qualified Holumbus.Data.PrefixTree       as PT

import           Holumbus.Index.Common
import           Holumbus.Index.Compression

import           Text.XML.HXT.Core

-- ----------------------------------------------------------------------------

-- | The index consists of a table which maps documents to ids and a number of index parts.

newtype Inverted        = Inverted { indexParts :: Parts } 
                          deriving (Show, Eq)

-- | The index parts are identified by a name, which should denote the context of the words.

type Parts              = Map Context Part

-- | The index part is the real inverted index. Words are mapped to their occurrences.

type Part               = PT.PrefixTree CompressedOccurrences

-- ----------------------------------------------------------------------------

instance HolIndex Inverted where
  sizeWords                     = M.fold ((+) . PT.size) 0 . indexParts
  contexts                      = map fst . M.toList . indexParts

  allWords i c                  = map (\(w, o) -> (w, inflateOcc o)) $ PT.toList $ getPart c i
  prefixCase i c q              = map (\(w, o) -> (w, inflateOcc o)) $ PT.prefixFindWithKey q $ getPart c i
  prefixNoCase i c q            = map (\(w, o) -> (w, inflateOcc o)) $ PT.prefixFindNoCaseWithKey q $ getPart c i
  lookupCase i c q              = map (\    o  -> (q, inflateOcc o)) $ maybeToList (PT.lookup q $ getPart c i)
  lookupNoCase i c q            = map (\(w, o) -> (w, inflateOcc o)) $ PT.lookupNoCase q $ getPart c i

  mergeIndexes i1 i2            = Inverted (mergeParts (indexParts i1) (indexParts i2))
  substractIndexes i1 i2        = Inverted (substractParts (indexParts i1) (indexParts i2))

  insertOccurrences c w o i     = mergeIndexes (singleton c w o) i
  deleteOccurrences c w o i     = substractIndexes i (singleton c w o)

  splitByContexts (Inverted ps) = splitInternal (map (uncurry annotate) . M.toList $ ps)
    where
    annotate c p                = let
                                  i = Inverted (M.singleton c p)
                                  in
                                  (sizeWords i, i)

  splitByDocuments i            = splitInternal ( map convert $
                                                  toListDocIdMap $
                                                  unionsWithDocIdMap unionDocs' docResults
                                                )
    where
    unionDocs'                  = M.unionWith (M.unionWith unionPos)
    docResults                  = map (\c -> resultByDocument c (allWords i c)) (contexts i)
    convert (d, cs)             = foldl' makeIndex (0, emptyInverted) (M.toList cs)
      where
      makeIndex r (c, ws)       = foldl' makeOcc r (M.toList ws)
        where
        makeOcc (rs, ri) (w, p) = (sizePos p + rs , insertOccurrences c w (singletonDocIdMap d p) ri)

  splitByWords i                = splitInternal indexes
    where
    indexes                     = map convert $
                                  M.toList $
                                  M.unionsWith (M.unionWith mergeOccurrences) wordResults
      where
      wordResults               = map (\c -> resultByWord c (allWords i c)) (contexts i)
      convert (w, cs)           = foldl' makeIndex (0, emptyInverted) (M.toList cs)
        where
        makeIndex (rs, ri) (c, o) = (rs + sizeOccurrences o, insertOccurrences c w o ri)

  updateDocIds f (Inverted parts)
                                = Inverted (M.mapWithKey updatePart parts)
    where
    updatePart c p              = PT.mapWithKey
                                  (\w o -> foldWithKeyDocIdMap (updateDocument c w) emptyDocIdMap o) p
    updateDocument c w d p r    = insertWithDocIdMap mergePositions (f c w d) p r
      where
      mergePositions p1 p2      = deflatePos $ unionPos (inflatePos p1) (inflatePos p2)

  updateDocIds' f
                                = Inverted . M.map updatePart . indexParts
    where
    updatePart                  = PT.map updateOcc
    updateOcc                   = foldWithKeyDocIdMap updateId emptyDocIdMap
    updateId                    = insertDocIdMap . f

  toList i                      = concat $ map convertPart $ M.toList (indexParts i) 
    where convertPart (c,p)     = map (\(w, o) -> (c, w, inflateOcc o)) $
                                  PT.toList $
                                  p

-- ----------------------------------------------------------------------------

instance NFData Inverted where
    rnf                         = rnf . indexParts

-- ----------------------------------------------------------------------------

instance XmlPickler Inverted where
  xpickle                       =  xpElem "indexes" $
                                   xpWrap (\p -> Inverted p, \(Inverted p) -> p) xpParts

-- | The XML pickler for the index parts.
xpParts                         :: PU Parts
xpParts                         = xpWrap (M.fromList, M.toList) (xpList xpContext)
  where
  xpContext                     = xpElem "part" (xpPair (xpAttr "id" xpText) xpPart)

-- | The XML pickler for a single part.
xpPart                          :: PU Part
xpPart                          = xpElem "index" (xpWrap (PT.fromList, PT.toList) (xpList xpWord))
  where
  xpWord                        = xpElem "word" $
                                  xpPair (xpAttr "w" xpText)
                                         (xpWrap (deflateOcc, inflateOcc) xpOccurrences)

-- ----------------------------------------------------------------------------

instance Binary Inverted where
    put                         = put . indexParts
    get                         = get >>= return . Inverted

-- ----------------------------------------------------------------------------

-- | Create an empty index.
emptyInverted                   :: Inverted
emptyInverted                   = Inverted M.empty
                  
-- | Create an index with just one word in one context.
singleton                       :: Context -> String -> Occurrences -> Inverted
singleton c w o                 = Inverted (M.singleton c (PT.singleton w (deflateOcc o)))

-- | Merge two sets of index parts.
mergeParts                      :: Parts -> Parts -> Parts
mergeParts                      = M.unionWith mergePart

-- | Merge two index parts.
mergePart                       :: Part -> Part -> Part
mergePart                       = PT.unionWith mergeDiffLists
  where
  mergeDiffLists o1 o2          = deflateOcc $
                                  mergeOccurrences (inflateOcc o1) (inflateOcc o2)

-- | Substract a set of index parts from another.
substractParts                  :: Parts -> Parts -> Parts
substractParts                  = M.differenceWith substractPart

-- | Substract one index part from another.
substractPart                   :: Part -> Part -> Maybe Part
substractPart p1 p2             = if PT.null diffPart then Nothing else Just diffPart
  where
  diffPart                      = PT.differenceWith substractDiffLists p1 p2
    where
    substractDiffLists o1 o2    = if diffOcc == emptyOccurrences then Nothing else Just (deflateOcc diffOcc)
      where
      diffOcc                   = substractOccurrences (inflateOcc o1) (inflateOcc o2)

-- | Internal split function used by the split functions from the HolIndex interface (above).
splitInternal                   :: [(Int, Inverted)] -> Int -> [Inverted]
splitInternal inp n             = allocate mergeIndexes stack buckets
  where
  buckets                       = zipWith const (createBuckets n) stack
  stack                         = reverse (sortBy (compare `on` fst) inp)

-- | Allocates values from the first list to the buckets in the second list.
allocate                        :: (a -> a -> a) -> [(Int, a)] -> [(Int, a)] -> [a]
allocate _ _ []                 = []
allocate _ [] ys                = map snd ys
allocate f (x:xs) (y:ys)        = allocate f xs (sortBy (compare `on` fst) ((combine x y):ys))
  where
  combine (s1, v1) (s2, v2)     = (s1 + s2, f v1 v2)

-- | Create empty buckets for allocating indexes.  
createBuckets                   :: Int -> [(Int, Inverted)]
createBuckets n                 = (replicate n (0, emptyInverted))
  
-- | Return a part of the index for a given context.
getPart                         :: Context -> Inverted -> Part
getPart c i                     = fromMaybe PT.empty (M.lookup c $ indexParts i)

-- ----------------------------------------------------------------------------