src/Hunt/ContextIndex.hs

{-# LANGUAGE FlexibleContexts      #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE Rank2Types            #-}
{-# LANGUAGE TypeFamilies          #-}
{-# LANGUAGE UndecidableInstances  #-}

-- ----------------------------------------------------------------------------
{- |
  The context index introduces contexts and combines the index, document table and schema.
-}
-- ----------------------------------------------------------------------------

module Hunt.ContextIndex
  (
    -- * Construction
    empty

    -- * Contexts and Schema
  , insertContext
  , deleteContext
  , foreachContext
  , contexts
  , contextsM
  , hasContext
  , hasContextM

    -- * Queries
  , lookupRangeCx
  , lookupAllWithCx
  , searchWithCx
  , searchWithCxsNormalized
  , searchWithCxSc
  , lookupRangeCxSc

    -- * Insert\/Delete Documents
  , insertList
                                 -- XXX: these functions should be internal
                                 -- we export them to be able to test them
                                 -- is there a bedder approach to achieve this?
  , createDocTableFromPartition  -- only used in tests
  , unionDocTables               -- only used in tests
  , modifyWithDescription
  , delete
  , deleteDocsByURI
  , decodeCxIx
  , member

    -- * Types
  , ContextIndex (..)
  , ContextMap (..)
  , IndexRep
  , mkContextMap
  , mapToSchema
  )
where
{-
import           Debug.Trace                   (traceShow)
-- -}
import           Prelude
import qualified Prelude                       as P

import           Control.Applicative           (Applicative, (<$>), (<*>))
import           Control.Arrow
import           Control.Monad
import qualified Control.Monad.Parallel        as Par

import           Data.Binary                   (Binary (..))
import           Data.Binary.Get
import           Data.ByteString.Lazy          (ByteString)
import           Data.Map                      (Map)
import qualified Data.Map                      as M
import           Data.Maybe
import           Data.Set                      (Set)
import qualified Data.Set                      as S
import           Data.Text                     (Text)

import           Hunt.Common
import qualified Hunt.Common.DocDesc           as DocDesc
import qualified Hunt.Common.DocIdSet          as DS
import qualified Hunt.Common.Document          as Doc
import           Hunt.Common.IntermediateValue
import qualified Hunt.Common.Occurrences       as Occ

import           Hunt.DocTable                 (DocTable)
import qualified Hunt.DocTable                 as Dt
import qualified Hunt.Index                    as Ix
import           Hunt.Index.IndexImpl          (IndexImpl)
import qualified Hunt.Index.IndexImpl          as Impl
import           Hunt.Utility

-- ------------------------------------------------------------

-- | Context index introduces contexts and combines the major components of Hunt.

data ContextIndex dt = ContextIndex
  { ciIndex :: !ContextMap -- ^ Indexes associated to contexts.
  , ciDocs  :: !dt         -- ^ Document table.
  }

empty :: DocTable dt => ContextIndex dt
empty = ContextIndex emptyContextMap Dt.empty

-- | Contexts with associated heterogeneous index implementations.

type IndexRep = (ContextSchema, Impl.IndexImpl)

newtype ContextMap
  = ContextMap { cxMap :: Map Context IndexRep }
  deriving (Show)

-- | Empty context map.
emptyContextMap :: ContextMap
emptyContextMap = mkContextMap $ M.empty


-- | Strict smart constructor for the 'ContextMap'.

mkContextMap :: Map Context IndexRep -> ContextMap
mkContextMap x = ContextMap $! x

-- | Get 'Schema' from 'ContextMap'
mapToSchema :: ContextMap -> Schema
mapToSchema (ContextMap m) = M.map fst m

-- ------------------------------------------------------------
-- Binary / Serialization
-- ------------------------------------------------------------

getContextMap :: [IndexImpl] -> Get ContextMap
getContextMap ts
  = do
    impls <- Impl.gets' ts
    s     <- get
    return . mkContextMap . M.fromDistinctAscList $ map (mergeGets s) impls
    where
      mergeGets s (c, i) = (c, (getS c s, i))
      getS c s = fromMaybe (error "deserializating failed: context schema is missing")
               $ lookup c s

instance Binary ContextMap where
  put = put . cxMap
  get = get >>= return . mkContextMap


-- | Deserialize a 'ContextIndex' with the list of available index implementations and a
--   map of available 'ContextSchema'.
--
--   /Note/: The serialized index implementations have to  be in the list of available types,
--           otherwise this will fail. The serialized schemas have to be in the list of
--           available 'ContextSchema', otherwise this will fail as well.

decodeCxIx :: (Binary dt, DocTable dt) => [IndexImpl] -> ByteString -> ContextIndex dt
decodeCxIx ts = runGet (get' ts)

get' :: Binary dt => [IndexImpl] -> Get (ContextIndex dt)
get' ts = ContextIndex <$> (getContextMap ts) <*> get

instance Binary dt => Binary (ContextIndex dt) where
  get = error "existential types cannot be deserialized this way. Use special get' functions"
  put (ContextIndex (ContextMap a) b)
    = put (M.map snd a) >>  -- convert to 'IndexImpl' and serialize
      put (M.map fst a) >>  -- convert to 'Schema' and serialize
      put b                 -- put 'DocTable'

-- ------------------------------------------------------------

{-
-- | Insert a Document and Words.
--
--   /Note/: For multiple inserts, use the more efficient 'insertList'.
insert :: (Par.MonadParallel m, Applicative m, DocTable dt)
       => Dt.DValue dt -> Words -> ContextIndex dt -> m (ContextIndex dt)
insert doc wrds ix = insertList [(doc,wrds)] ix
-}

--   This is more efficient than using fold and with 'insert'.
-- | Insert multiple documents and words.

insertList :: (Par.MonadParallel m, Applicative m, DocTable dt) =>
              [(Dt.DValue dt, Words)] ->
              ContextIndex dt -> m (ContextIndex dt)

insertList docAndWords (ContextIndex ix docTable)
    = do -- insert to doctable and generate docId
         tablesAndWords <- Par.mapM createDocTableFromPartition
                         $ partitionListByLength 20 docAndWords
         -- union doctables and docid-words pairs
         (newDt, docIdsAndWords) <- unionDocTables tablesAndWords docTable
         -- insert words to index
         newIx <- batchAddWordsM docIdsAndWords ix
         return $! ContextIndex newIx newDt

-- takes list of documents with wordlist. creates new 'DocTable' and
-- inserts each document of the list into it.
createDocTableFromPartition :: (Par.MonadParallel m, DocTable dt) =>
                               [(Dt.DValue dt, Words)] -> m (dt, [(DocId, Words)])
createDocTableFromPartition ds
    = foldM toDocTable (Dt.empty, []) ds
    where
      toDocTable (dt, resIdsAndWords) (doc, ws)
        = do (dId, dt') <- Dt.insert doc dt
             return (dt', (dId, ws):resIdsAndWords)

-- takes list of doctables with lists of docid-words pairs attached
-- unions the doctables to one big doctable and concats the docid-words
-- pairs to one list
unionDocTables :: (DocTable dt, Par.MonadParallel m) =>
                  [(dt, [(DocId, Words)])] -> dt -> m (dt, [(DocId, Words)])
unionDocTables tablesAndWords oldDt
    = do step <- Par.mapM unionDtsAndWords $ mkPairs tablesAndWords
         case step of
                 []      -> return (Dt.empty, [])
                 [(d,w)] -> do n <- Dt.union oldDt d
                               return (n, w)
                 xs      -> unionDocTables xs oldDt
    where
      unionDtsAndWords ((dt1, ws1), (dt2, ws2))
        = do dt <- Dt.union dt1 dt2
             return (dt, ws1 ++ ws2)

      mkPairs []       = []
      mkPairs (a:[])   = [(a,(Dt.empty,[]))]
      mkPairs (a:b:xs) = (a,b):mkPairs xs

{-
-- XXX: this should not work well atm
-- | Modify documents and index data.

modify :: (Par.MonadParallel m, Applicative m, DocTable dt)
       => (Dt.DValue dt -> m (Dt.DValue dt))
       -> Words -> DocId -> ContextIndex dt -> m (ContextIndex dt)
modify f wrds dId (ContextIndex ii dt s) = do
  newDocTable <- Dt.adjust f dId dt
  newIndex    <- addWordsM wrds dId ii
  return $ ContextIndex newIndex newDocTable s
-- -}

-- | Delete a set of documents by 'URI'.
deleteDocsByURI :: (Par.MonadParallel m, Applicative m, DocTable dt)
                => Set URI -> ContextIndex dt -> m (ContextIndex dt)
deleteDocsByURI us ixx@(ContextIndex _ix dt) = do
  docIds <- liftM (DS.fromList . catMaybes) . mapM (flip Dt.lookupByURI dt) . S.toList $ us
  delete docIds ixx


-- | Delete a set of documents by 'DocId'.
delete :: (Par.MonadParallel m, Applicative m, DocTable dt)
       => DocIdSet -> ContextIndex dt -> m (ContextIndex dt)
delete dIds cix@(ContextIndex ix dt)
    | DS.null dIds
        = return cix
    | otherwise
        = do newIx <- delete' dIds ix
             newDt <- Dt.difference dIds dt
             return $ ContextIndex newIx newDt


-- | Is the document part of the index?
member :: (Monad m, Applicative m, DocTable dt)
       => URI -> ContextIndex dt -> m Bool
member u (ContextIndex _ii dt) = do
  mem <- Dt.lookupByURI u dt
  return $ isJust mem

-- ------------------------------------------------------------

-- | Modify the description of a document and add words
--   (occurrences for that document) to the index.

modifyWithDescription :: (Par.MonadParallel m, Applicative m, DocTable dt) =>
                         Score -> Description -> Words -> DocId -> ContextIndex dt ->
                         m (ContextIndex dt)
modifyWithDescription weight descr wrds dId (ContextIndex ii dt)
    = do newDocTable <- Dt.adjust mergeDescr dId dt
         newIndex    <- batchAddWordsM [(dId,wrds)] ii
         return $ ContextIndex newIndex newDocTable
    where
      -- M.union is left-biased
      -- flip to use new values for existing keys
      -- no flip to keep old values
      --
      -- Null values in new descr will remove associated attributes
      mergeDescr
          = return . Doc.update (updateWeight . updateDescr)
          where
            updateWeight d
                | weight == noScore = d
                | otherwise         = d {wght = weight}

            updateDescr d           = -- trc "updateDescr res=" $
                                      d {desc = DocDesc.deleteNull $
                                                flip DocDesc.union d' descr'
                                        }
                                      where
                                        d'     = -- trc "updateDescr old=" $
                                                 desc d
                                        descr' = -- trc "updateDescr new=" $
                                                 descr

-- trc :: Show a => String -> a -> a
-- trc msg x = traceShow (msg, x) x

-- ------------------------------------------------------------
-- Helper
-- ------------------------------------------------------------

-- | Adds words associated to a document to the index.
--

-- | Add words for a document to the 'Index'.
--
--   /Note/: Adds words to /existing/ 'Context's.

batchAddWordsM :: (Functor m, Par.MonadParallel m) =>
                  [(DocId, Words)] -> ContextMap -> m ContextMap
batchAddWordsM [] ix
    = return ix

batchAddWordsM vs (ContextMap m)
    = mkContextMap <$>
      mapWithKeyMP ( \cx (s, impl) -> do i <- foldinsertList cx impl
                                         return (s, i)
                   ) m
    where
      foldinsertList :: (Functor m, Monad m) =>
                        Context -> IndexImpl -> m IndexImpl
      foldinsertList cx (Impl.IndexImpl impl)
          = Impl.mkIndex <$>
            Ix.insertListM (contentForCx cx vs) impl

-- | Computes the words and occurrences out of a list for one context

contentForCx :: Context -> [(DocId, Words)] -> [(Word, IntermediateValue)]
contentForCx cx vs
    = concatMap (invert . second (getWlForCx cx)) $ vs
          where
            invert (did, wl)
                = map (second (toIntermediate . Occ.singleton' did)) $ M.toList wl
            getWlForCx cx' ws'
                = fromMaybe M.empty (M.lookup cx' ws')

----------------------------------------------------------------------------
-- addWords/batchAddWords functions
----------------------------------------------------------------------------

mapWithKeyMP :: (Par.MonadParallel m, Ord k) => (k -> a -> m b) -> M.Map k a -> m (M.Map k b)
mapWithKeyMP f m =
  (Par.mapM (\(k, a) -> do
                  b <- f k a
                  return (k, b)
                ) $ M.toList m) >>=
    return . M.fromList

----------------------------------------------------------------------------

-- | Inserts a new context.
--
insertContext :: Context -> Impl.IndexImpl -> ContextSchema
              -> ContextIndex dt -> ContextIndex dt
insertContext c ix schema (ContextIndex m dt)
    = ContextIndex  m' dt
    where
    m' = insertContext' c schema ix m

--   /Note/: Does nothing if the context already exists.
insertContext' :: Context -> ContextSchema -> Impl.IndexImpl -> ContextMap -> ContextMap
insertContext' c s ix (ContextMap m) = mkContextMap $ M.insertWith (const id) c (s, ix) m

-- | Removes context (including the index and the schema).
deleteContext :: Context -> ContextIndex dt -> ContextIndex dt
deleteContext c (ContextIndex ix dt) = ContextIndex (deleteContext' c ix) dt

-- | Removes context (includes the index, but not the schema).
deleteContext' :: Context -> ContextMap -> ContextMap
deleteContext' cx (ContextMap m) = mkContextMap $ M.delete cx m

delete' :: Par.MonadParallel m => DocIdSet -> ContextMap -> m ContextMap
delete' dIds (ContextMap m)
  = mapWithKeyMP (\_ impl -> adjust' impl) m >>= return . mkContextMap
--  = TV.mapM adjust' m >>= return . mkContextMap
  where
  adjust' (s, Impl.IndexImpl ix) = Ix.deleteDocsM dIds ix >>= \i -> return (s, Impl.mkIndex i)

{- not yet used

-- | Search query in all context.
search :: Monad m => TextSearchOp -> Text -> ContextMap -> m [(Context, [(Text, v)])]
search op k (ContextMap m)
  = liftM M.toList $ TV.mapM search' m
  where
  search' (Impl.IndexImpl ix) = Ix.searchM op k ix
-- -}

-- | Range query in a context between first and second key.
lookupRangeCx :: Monad m => Context -> Text -> Text -> ContextMap -> m [(Text, IntermediateValue)]
lookupRangeCx c k1 k2 cm
    = lookupIndex c cm $ Ix.lookupRangeM k1 k2

-- | Dump a context
lookupAllWithCx :: Monad m => Context -> ContextMap -> m [(Text, IntermediateValue)]
lookupAllWithCx cx cm
    = lookupIndex cx cm $ Ix.toListM

-- | Search query in a context.
searchWithCx :: Monad m => TextSearchOp -> Context -> Text -> ContextMap -> m [(Text, IntermediateValue)]
searchWithCx op cx w cm
    = lookupIndex cx cm $ Ix.searchM op w


-- | Search over a list of contexts and words
searchWithCxsNormalized :: (Functor m, Monad m) =>
                           TextSearchOp -> [(Context, Text)] -> ContextMap ->
                           m [(Context, [(Text, IntermediateValue)])]
searchWithCxsNormalized op cxws cm
    = P.mapM (uncurry search') cxws
    where
      search' cx w
          = (\ x -> (cx, x))
            <$> (lookupIndex cx cm $ Ix.searchM op w)

-- | Search query with scored results
-- XXX TODO: this function should return intermediates and query processor should work with those
searchWithCxSc :: Monad m =>
                  TextSearchOp -> Context -> Text -> ContextMap -> m [(Text, (Score, Occurrences))]
searchWithCxSc op cx w cm
    = (lookupIndex cx cm $ Ix.searchMSc op w) >>= return . fromScoredIntermediates

-- | Range query in a context between first and second key.
-- XXX TODO: this function should return intermediates and query processor should work with those
lookupRangeCxSc :: Monad m => Context -> Text -> Text -> ContextMap -> m [(Text, (Score, Occurrences))]
lookupRangeCxSc c k1 k2 cm
    = (lookupIndex c cm $ Ix.lookupRangeMSc k1 k2) >>= return . fromScoredIntermediates

-- ------------------------------------------------------------

-- | lookup an index by a context and then search this index for a word
-- result is always a list of values.
--
-- This pattern is used in all search variants

lookupIndex :: Monad m =>
               Context -> ContextMap ->
               (forall i . Impl.IndexImplCon i => i -> m [r]) ->
               m [r]
lookupIndex cx (ContextMap m) search
    = case M.lookup cx m of
        Just (_, Impl.IndexImpl cm)
            -> search cm
        Nothing
            -> return []

foreachContext :: (Functor m, Monad m) =>
                  [Context] ->
                  (Context -> m res) ->
                  m [(Context, res)]
foreachContext cxs action
    = P.mapM action' cxs
      where
        action' cx
            = (\ r -> (cx, r)) <$> action cx

-- ------------------------------------------------------------

-- | All contexts of the index.
contextsM :: (Monad m, DocTable dt)
         => ContextIndex dt -> m [Context]
contextsM (ContextIndex ix _) = return $ contexts ix

-- | Contexts/keys of 'ContextMap'.
contexts :: ContextMap -> [Context]
contexts (ContextMap m) = M.keys m

-- | Check if the context exists.
hasContext :: Context -> ContextMap -> Bool
hasContext c (ContextMap m) = M.member c m

-- | Does the context exist?
hasContextM :: (Monad m, DocTable dt)
           => Context -> ContextIndex dt -> m Bool
hasContextM c (ContextIndex ix _) = return $ hasContext c ix