-- | A small in-memory inverted index: documents are added under a name with
-- 'addDocument' and later retrieved with 'search' by the terms found in
-- their content.
module Condor.Index
( DocName
, Index
, DocContent
, addDocument
, emptyIndex
, search
, termCount
) where
import qualified Data.Map as Map
import qualified Data.List as List
import Data.Binary
import Condor.Text
import Condor.Language.English.StopWords (isStopWord)
import Condor.Language.English.Porter (stem)
-- | Name under which a document is stored in the index and reported back
-- by 'search'.
type DocName = String
-- | Text of a document passed to 'addDocument', or of a query passed to
-- 'search'.
type DocContent = String
-- | Text-processing hooks applied by 'splitWords' to every token.
data IndexParams = IndexParams
    { ignore  :: String -> Bool
      -- ^ Predicate selecting the tokens to drop before stemming.
    , stemmer :: String -> String
      -- ^ Transformation applied to each token that is kept.
    }
-- | Nothing of the parameter functions themselves is serialized: encoding
-- writes a single zero byte as a placeholder. Decoding consumes one byte,
-- ignores its value, and always rebuilds the parameters from 'isStopWord'
-- and 'stem'.
instance Binary IndexParams where
    put _ = putWord8 0
    get   = getWord8 >> return (IndexParams isStopWord stem)
-- | Inverted index together with the text-processing parameters used to
-- build and query it.
data Index = Index
    { terms  :: Map.Map String [DocName]
      -- ^ For each processed term, the names of the documents added under it.
    , params :: IndexParams
      -- ^ Token filter and stemmer applied to documents and queries alike.
    }
-- | Serializes the term map followed by the parameters; decoding reads the
-- two parts back in the same order.
instance Binary Index where
    put ix = put (terms ix) >> put (params ix)
    get    = get >>= \ts ->
             get >>= \ps ->
             return (Index ts ps)
-- | An index holding no terms, configured with 'isStopWord' as the token
-- filter and 'stem' as the stemmer.
emptyIndex :: Index
emptyIndex = Index
    { terms  = Map.empty
    , params = IndexParams { ignore = isStopWord, stemmer = stem }
    }
-- | Add a document to the index.
--
-- The content is run through 'splitWords' using the index's own parameters,
-- and the document name is prepended to the list stored under every
-- resulting term. A term that occurs several times in the token list gets
-- the name prepended once per occurrence. The parameters are carried over
-- unchanged.
addDocument :: DocName -> DocContent -> Index -> Index
addDocument d c ix = Index (List.foldl' record (terms ix) ws) (params ix)
  where
    ws = splitWords (params ix) c
    -- The strict left fold applies each map update as soon as its token is
    -- consumed instead of deferring them all in a chain of thunks.
    --
    -- Lookup-then-insert is kept (rather than 'Map.insertWith') because
    -- with the lazy "Data.Map" API 'Map.insertWith' would store an
    -- unevaluated @[d] ++ old@ for every occurrence.
    record acc t = case Map.lookup t acc of
        Just names -> Map.insert t (d : names) acc
        Nothing    -> Map.insert t [d] acc
-- | Find the documents matching a query.
--
-- The query is processed with 'splitWords' exactly like document content.
-- The result lists every document recorded under at least one of the
-- resulting terms, each name once, in order of first appearance.
search :: Index -> DocContent -> [DocName]
search ix s = List.nub (concatMap (searchTerm ix) ws)
  where
    -- 'concatMap' appends the per-term lists right-nested, so the hits
    -- gathered so far are not re-copied for every further term.
    ws = splitWords (params ix) s
-- | Documents recorded under a single term, looked up verbatim in the term
-- map; the empty list when the term is not in the index.
searchTerm :: Index -> String -> [DocName]
searchTerm ix s = Map.findWithDefault [] s (terms ix)
-- | Number of distinct terms stored in the index.
termCount :: Index -> Int
termCount = Map.size . terms
-- | Turn text into index terms: split it with 'tokenize', discard the tokens
-- accepted by the 'ignore' predicate, and pass the remaining ones through
-- 'stemmer'.
splitWords :: IndexParams -> String -> [String]
splitWords p s = [stemmer p w | w <- tokenize s, not (ignore p w)]