module Condor.Index
( DocName
, Index
, addDocument
, addDocTerms
, emptyIndex
, search
, searchTerms
, termCount
) where
import qualified Data.Map as Map
import qualified Data.List as List
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import Data.Binary
import Condor.Text
import Condor.DataTypes (DocName, Document(..), docName, docText)
import Condor.Language.English.StopWords (isStopWord)
import Condor.Language.English.Porter (stem)
-- | A single indexed term, represented as strict 'T.Text'. It is the key type
-- of the term map stored in 'Index'.
type Term = T.Text
-- | Inverted index.
--
-- * 'terms' maps each term to a posting list of document ordinals. An ordinal
--   is the number of documents that were already in the index when the
--   document was added (see 'addDocTerms'), so the first document has
--   ordinal 0.
--
-- * 'docs' holds the names of all indexed documents, most recently added
--   first; 'findDocs' reverses it before resolving ordinals.
data Index = Index { terms :: Map.Map Term [Int]
, docs :: [DocName]
}
-- | Binary (de)serialisation of an 'Index'. The term map is written first and
-- the document-name list second; decoding reads them back in that same order.
instance Binary Index where
  put ix = put (terms ix) >> put (docs ix)
  get =
    get >>= \termMap ->
      get >>= \names ->
        return (Index termMap names)
-- | Serialise strict 'T.Text' as its UTF-8 encoded 'ByteString'.
--
-- Decoding uses 'E.decodeUtf8'' so that malformed bytes are reported as a
-- decoder failure (via 'fail' in the 'Get' monad) instead of an exception
-- thrown from pure code when the text is eventually forced.
--
-- NOTE(review): this is an orphan instance; newer @text@ releases appear to
-- ship their own @Binary Text@ instance, which would clash with this one --
-- confirm against the package versions this project builds with.
instance Binary T.Text where
  put = put . E.encodeUtf8
  get = do
    bytes <- get
    case E.decodeUtf8' bytes of
      Left err -> fail ("Condor.Index: invalid UTF-8 in serialised Text: " ++ show err)
      Right txt -> return txt
-- | An index containing no terms and no documents.
emptyIndex :: Index
emptyIndex = Index { terms = Map.empty, docs = [] }
-- | Add a document to the index: its text is turned into terms with
-- 'splitTerms' and the result is recorded under the document's name via
-- 'addDocTerms'.
addDocument :: Document -> Index -> Index
addDocument d idx = addDocTerms name termList idx
  where
    name = docName d
    termList = splitTerms (docText d)
-- | Record an already-tokenised document in the index.
--
-- The document is assigned the next ordinal (the number of documents already
-- present), its name is prepended to 'docs', and the ordinal is pushed onto
-- the posting list of every term in the given list.
--
-- A term that occurs several times in the same document is recorded only
-- once: while this document is being added its ordinal can only sit at the
-- head of a posting list, so checking the head is enough to detect a repeat.
-- The fold is strict ('List.foldl'') so that a long term list does not build
-- a chain of suspended map updates.
addDocTerms :: DocName -> [Term] -> Index -> Index
addDocTerms d c ix = Index (List.foldl' record (terms ix) c) (d : docs ix)
  where
    -- Ordinal of the document being added.
    ordinal = length (docs ix)

    record acc t = case Map.lookup t acc of
      Just (p : _) | p == ordinal -> acc
      Just postings -> Map.insert t (ordinal : postings) acc
      Nothing -> Map.insert t [ordinal] acc
-- | Search the index with a free-text query. The query string is packed into
-- 'T.Text', converted to terms with 'splitTerms', and handed to 'searchTerms'.
search :: Index -> String -> [DocName]
search ix = searchTerms ix . splitTerms . T.pack
-- | Return the names of all documents containing at least one of the given
-- terms. Matches are concatenated in term order and duplicates are removed,
-- keeping the first occurrence of each name.
--
-- 'concatMap' is used instead of a left fold over @(++)@: the result is the
-- same, but the accumulated prefix is not re-copied for every term.
searchTerms :: Index -> [Term] -> [DocName]
searchTerms ix s = List.nub (concatMap (findDocs ix) s)
-- | Resolve the posting list of a single term to document names; a term that
-- is not in the index yields the empty list.
--
-- Postings hold document ordinals (oldest document is 0), whereas 'docs' is
-- kept newest-first, so the list is reversed before ordinals are resolved.
-- Ordinals are looked up in a temporary map rather than with @(!!)@: each
-- posting then costs O(log n) instead of O(n), and an ordinal that has no
-- matching document (for example in an index decoded from damaged data) is
-- skipped instead of raising an exception.
findDocs :: Index -> Term -> [DocName]
findDocs ix s = case Map.lookup s (terms ix) of
  Just postings -> [name | p <- postings, Just name <- [Map.lookup p byOrdinal]]
  Nothing -> []
  where
    -- Keys 0, 1, 2, ... are strictly ascending, so the linear-time builder
    -- is safe to use here.
    byOrdinal = Map.fromDistinctAscList (zip [0 ..] (reverse (docs ix)))
-- | Number of distinct terms stored in the index.
termCount :: Index -> Int
termCount = Map.size . terms
-- | Turn raw text into index terms: tokenise it, drop the tokens that
-- 'isStopWord' accepts, and stem the remaining ones. Stop-word filtering
-- happens on the unstemmed token.
splitTerms :: T.Text -> [Term]
splitTerms s = [stem token | token <- tokenize s, not (isStopWord token)]