src/Holumbus/Index/Common.hs

{-# OPTIONS -fno-warn-orphans #-}

{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE TypeSynonymInstances #-}

-- ----------------------------------------------------------------------------

{- |
  Module     : Holumbus.Index.Common
  Copyright  : Copyright (C) 2007-2012 Sebastian M. Schlatt, Timo B. Huebel, Uwe Schmidt
  License    : MIT

  Maintainer : Timo B. Huebel (tbh@holumbus.org)
  Stability  : experimental
  Portability: non-portable

  Common data types shared by all index types and a unified interface for
  all different index types. This module defines the common interfaces of
  indexes and their document tables as well as full-text caches.

-}

-- ----------------------------------------------------------------------------

module Holumbus.Index.Common 
  (
  -- * Common index types and classes
  HolIndex (..)
  , HolIndexM (..)
  , HolDocuments (..)
  , HolDocIndex (..)
  , HolCache (..)

  -- * Indexes and Documents
  -- , mergeAll

  , module Holumbus.Index.Common.BasicTypes
  , module Holumbus.Index.Common.Document
  , module Holumbus.Index.Common.DocId
  , module Holumbus.Index.Common.DocIdMap
  , module Holumbus.Index.Common.Occurences
  , module Holumbus.Index.Common.RawResult
  , module Holumbus.Index.Common.LoadStore

  )
where

import Control.Monad                    ( foldM )

-- import Data.Binary                   ( Binary (..) )
import Data.List                        ( foldl' )
-- import Data.Maybe

import Holumbus.Index.Common.BasicTypes
import Holumbus.Index.Common.Document
import Holumbus.Index.Common.DocId
import Holumbus.Index.Common.DocIdMap
import Holumbus.Index.Common.Occurences
import Holumbus.Index.Common.RawResult
import Holumbus.Index.Common.LoadStore

-- ------------------------------------------------------------

-- | This class provides a generic interface to different types of index implementations.

class HolIndex i where
  -- | Returns the number of unique words in the index.
  sizeWords                     :: i -> Int

  -- | Returns a list of all contexts available in the index.
  contexts                      :: i -> [Context]

  -- | Returns the occurrences for every word. A potentially expensive operation.
  allWords                      :: i -> Context -> RawResult

  -- | Searches for words beginning with the prefix in a given context (case-sensitive).
  prefixCase                    :: i -> Context -> String -> RawResult

  -- | Searches for words beginning with the prefix in a given context (case-insensitive).
  prefixNoCase                  :: i -> Context -> String -> RawResult

  -- | Searches for an exact word in a given context (case-sensitive).
  lookupCase                    :: i -> Context -> String -> RawResult

  -- | Searches for an exact word in a given context (case-insensitive).
  lookupNoCase                  :: i -> Context -> String -> RawResult

  -- | Insert occurrences.
  insertOccurrences             :: Context -> Word -> Occurrences -> i -> i

  -- | Delete occurrences.
  deleteOccurrences             :: Context -> Word -> Occurrences -> i -> i

  -- | Insert a position for a single document.
  --
  -- The default wraps document id and position with 'singletonOccurrence' and
  -- delegates to 'insertOccurrences'.
  insertPosition                :: Context -> Word -> DocId -> Position -> i -> i
  insertPosition c w d p i      = insertOccurrences c w (singletonOccurrence d p) i

  -- | Delete a position for a single document.
  --
  -- The default wraps document id and position with 'singletonOccurrence' and
  -- delegates to 'deleteOccurrences'.
  deletePosition                :: Context -> Word -> DocId -> Position -> i -> i
  deletePosition c w d p i      = deleteOccurrences c w (singletonOccurrence d p) i

  -- | Merges two indexes.
  mergeIndexes                  :: i -> i -> i

  -- | Subtract one index from another.
  substractIndexes              :: i -> i -> i

  -- | Splitting an index by its contexts.
  splitByContexts               :: i -> Int -> [i]

  -- | Splitting an index by its documents.
  splitByDocuments              :: i -> Int -> [i]

  -- | Splitting an index by its words.
  splitByWords                  :: i -> Int -> [i]

  -- | Update document id's (e.g. for renaming documents). If the function maps two different id's
  -- to the same new id, the two sets of word positions will be merged if both old id's are present
  -- in the occurrences for a word in a specific context.
  updateDocIds                  :: (Context -> Word -> DocId -> DocId) -> i -> i

  -- | Update document id's with a simple injective editing function.
  --
  -- The default ignores context and word and delegates to 'updateDocIds'.
  updateDocIds'                 :: (DocId -> DocId) -> i -> i
  updateDocIds' f               = updateDocIds (const . const $ f)

  -- | Convert an index to a list. Can be used for easy conversion between different index
  -- implementations.
  toList                        :: i -> [(Context, Word, Occurrences)]

  -- | Create an index from a list. Can be used for easy conversion between different index
  -- implementations. Needs an empty index as first argument.
  --
  -- The default inserts the entries from left to right with 'insertOccurrences'. A strict
  -- left fold is used, so the intermediate index is evaluated (to weak head normal form)
  -- after every insertion instead of piling up a chain of pending insertions for long lists.
  fromList                      :: i -> [(Context, Word, Occurrences)] -> i
  fromList e                    = foldl' (\i (c,w,o) -> insertOccurrences c w o i) e

-- ------------------------------------------------------------

-- | This class provides a generic interface to different monadic types of index implementations.

class (Monad m) => HolIndexM m i where
  -- | Returns the number of unique words in the index.
  sizeWordsM                    :: i -> m Int

  -- | Returns a list of all contexts available in the index.
  contextsM                     :: i -> m [Context]

  -- | Returns the occurrences for every word. A potentially expensive operation.
  allWordsM                     :: i -> Context -> m RawResult

  -- | Searches for words beginning with the prefix in a given context (case-sensitive).
  prefixCaseM                   :: i -> Context -> String -> m RawResult

  -- | Searches for words beginning with the prefix in a given context (case-insensitive).
  prefixNoCaseM                 :: i -> Context -> String -> m RawResult

  -- | Searches for an exact word in a given context (case-sensitive).
  lookupCaseM                   :: i -> Context -> String -> m RawResult

  -- | Searches for an exact word in a given context (case-insensitive).
  lookupNoCaseM                 :: i -> Context -> String -> m RawResult

  -- | Insert occurrences.
  insertOccurrencesM            :: Context -> Word -> Occurrences -> i -> m i

  -- | Delete occurrences.
  deleteOccurrencesM            :: Context -> Word -> Occurrences -> i -> m i

  -- | Insert a position for a single document.
  --
  -- The default wraps document id and position with 'singletonOccurrence' and
  -- delegates to 'insertOccurrencesM'.
  insertPositionM               :: Context -> Word -> DocId -> Position -> i -> m i
  insertPositionM c w d p i     = insertOccurrencesM c w (singletonOccurrence d p) i

  -- | Delete a position for a single document.
  --
  -- The default wraps document id and position with 'singletonOccurrence' and
  -- delegates to 'deleteOccurrencesM'.
  deletePositionM               :: Context -> Word -> DocId -> Position -> i -> m i
  deletePositionM c w d p i     = deleteOccurrencesM c w (singletonOccurrence d p) i

  -- | Merges two indexes.
  mergeIndexesM                 :: i -> i -> m i

  -- | Update document id's (e.g. for renaming documents). If the function maps two different id's
  -- to the same new id, the two sets of word positions will be merged if both old id's are present
  -- in the occurrences for a word in a specific context.
  updateDocIdsM                 :: (Context -> Word -> DocId -> DocId) -> i -> m i

  -- | Update document id's with a simple injective editing function.
  --
  -- The default ignores context and word and delegates to 'updateDocIdsM',
  -- mirroring the default of @updateDocIds'@ in the non-monadic index class.
  updateDocIdsM'                :: (DocId -> DocId) -> i -> m i
  updateDocIdsM' f              = updateDocIdsM (const . const $ f)

  -- | Convert an index to a list. Can be used for easy conversion between different index
  -- implementations.
  toListM                       :: i -> m [(Context, Word, Occurrences)]

  -- | Create an index from a list. Can be used for easy conversion between different index
  -- implementations. Needs an empty index as first argument.
  --
  -- The default inserts the entries from left to right with 'insertOccurrencesM'.
  fromListM                     :: i -> [(Context, Word, Occurrences)] -> m i
  fromListM e                   = foldM (\i (c,w,o) -> insertOccurrencesM c w o i) e

-- ------------------------------------------------------------

-- Every 'HolIndex' can be used as a monadic index in the IO monad: each
-- monadic operation simply returns the result of its pure counterpart.
--
-- Do not generalize IO to an arbitrary monad m here: this leads to ambiguities
-- and error messages whenever a context (HolIndexM m i) is used.
--
-- NOT: instance (Monad m, HolIndex i) => HolIndexM m i where

instance (HolIndex i) => HolIndexM IO i where
    sizeWordsM                  = return . sizeWords
    contextsM                   = return . contexts
    allWordsM i                 = return . allWords i
    prefixCaseM i c             = return . prefixCase i c
    prefixNoCaseM i c           = return . prefixNoCase i c
    lookupCaseM i c             = return . lookupCase i c
    lookupNoCaseM i c           = return . lookupNoCase i c
    insertOccurrencesM c w o    = return . insertOccurrences c w o
    deleteOccurrencesM c w o    = return . deleteOccurrences c w o
    mergeIndexesM i1            = return . mergeIndexes i1
    updateDocIdsM u             = return . updateDocIds u
    -- Delegate to the pure updateDocIds' (instead of re-building its default
    -- via updateDocIds), so that an index type overriding updateDocIds' is
    -- honoured here as well.
    updateDocIdsM' f            = return . updateDocIds' f
    toListM                     = return . toList
    -- insertPositionM, deletePositionM and fromListM use the class defaults.

-- ------------------------------------------------------------

-- | Generic interface to document tables @d@ holding documents with
-- custom information of type @a@.

class HolDocuments d a where
  -- | Is the document table empty? The default compares 'sizeDocs' with zero.
  nullDocs                      :: d a -> Bool
  nullDocs dt                   = sizeDocs dt == 0

  -- | Returns the number of unique documents in the table.
  sizeDocs                      :: d a -> Int

  -- | Lookup a document by its id.
  lookupById                    :: Monad m => d a -> DocId -> m (Document a)

  -- | Lookup the id of a document by a URI.
  lookupByURI                   :: Monad m => d a -> URI -> m DocId

  -- | Union of two disjoint document tables. It is assumed that the DocIds and the document URIs
  -- of both tables are disjoint. If only the sets of URIs are disjoint, the DocIds can be made
  -- disjoint by adding maxDocId of one to the DocIds of the second, e.g. with 'editDocIds'.
  --
  -- The default folds over the map of the second table ('toMap') and inserts every document
  -- into the first table with 'insertDoc'; the id returned by 'insertDoc' is discarded.

  unionDocs                     :: d a -> d a -> d a
  unionDocs dt1 dt2             = foldDocIdMap insertInto dt1 (toMap dt2)
      where
      -- keep only the extended table, not the id handed out for the document
      insertInto doc acc        = snd (insertDoc acc doc)

  -- | Test whether the doc ids of both tables are disjoint.
  disjointDocs                  :: d a -> d a -> Bool

  -- | Return an empty document table. The input parameter is only taken to
  -- identify the type class instance.
  makeEmpty                     :: d a -> d a

  -- | Insert a document into the table. Returns a tuple of the id for that document and the
  -- new table. If a document with the same URI is already present, its id will be returned
  -- and the table is returned unchanged.

  insertDoc                     :: d a -> (Document a) -> (DocId, d a)

  -- | Update a document with a certain DocId.
  updateDoc                     :: d a -> DocId -> (Document a) -> d a

  -- | Removes the document with the specified id from the table.
  removeById                    :: d a -> DocId -> d a

  -- | Removes the document with the specified URI from the table.
  --
  -- The default looks up the id with 'lookupByURI' (in the 'Maybe' monad) and removes the
  -- document by that id; if the lookup yields nothing, the table is returned unchanged.
  removeByURI                   :: d a -> URI -> d a
  removeByURI ds u              = case lookupByURI ds u of
                                    Just docId -> removeById ds docId
                                    Nothing    -> ds

  -- | Update documents (through mapping over all documents).
  updateDocuments               :: (Document a -> Document a) -> d a -> d a

  -- | Filter the documents of a table with a predicate on documents.
  filterDocuments               :: (Document a -> Bool) -> d a -> d a

  -- | Create a document table from a single map.
  fromMap                       :: DocIdMap (Document a) -> d a

  -- | Convert document table to a single map.
  toMap                         :: d a -> DocIdMap (Document a)

  -- | Edit document ids.
  --
  -- The default converts the table to a map, re-inserts every entry under its edited id
  -- into an initially empty map and converts the result back with 'fromMap'.
  editDocIds                    :: (DocId -> DocId) -> d a -> d a
  editDocIds f dt               = fromMap (foldWithKeyDocIdMap reinsert emptyDocIdMap (toMap dt))
      where
      -- insert under the edited id instead of the original one
      reinsert docId            = insertDocIdMap (f docId)

-- ------------------------------------------------------------

-- | Interface of full-text caches: storing and retrieving the text of a
-- document per context. All operations run in the IO monad.
class HolCache c where
  -- | Retrieves the full text of a document for a given context. Will never throw any exception,
  -- upon failure or if no text found for the document, @Nothing@ is returned.
  getDocText  :: c -> Context -> DocId -> IO (Maybe Content)

  -- | Store the full text of a document for a given context. May throw an exception if the
  -- storage of the text failed.
  putDocText  :: c -> Context -> DocId -> Content -> IO ()

  -- | Merge two caches in the way that everything that is in the second cache is inserted into the
  -- first one.
  mergeCaches :: c -> c -> IO c

-- ------------------------------------------------------------

-- | Operations working on a document table together with its index.
class (HolDocuments d a, HolIndex i) => HolDocIndex d a i where

    -- | Merge two pairs of doctable and index into a single doctable and a single index.
    unionDocIndex               :: d a -> i -> d a -> i -> (d a, i)

    -- | Defragment a doctable and index, useful when the doc ids are
    -- organized as an interval of ints.
    --
    -- The default implementation is the identity: doctable and index are
    -- returned unchanged as a pair.

    defragmentDocIndex          :: d a -> i -> (d a, i)
    defragmentDocIndex dt ix    = (dt, ix)

-- ------------------------------------------------------------