module Text.Snowball
( Algorithm(..)
, stem
)
where
import Control.Exception (finally)
import Control.Monad (forM)
import Data.ByteString.Char8 (ByteString, packCString,
packCStringLen, useAsCString)
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Foreign (Ptr)
import Foreign.C (CInt (..), CString)
import System.IO.Unsafe (unsafePerformIO)
data Algorithm
= Danish
| Dutch
| English
| Finnish
| French
| German
| Hungarian
| Italian
| Norwegian
| Portuguese
| Romanian
| Russian
| Spanish
| Swedish
| Turkish
| Porter
stem :: Algorithm -> Text -> Text
stem algorithm word = let [a] = stems algorithm [word] in a
stems :: Algorithm -> [Text] -> [Text]
stems algorithm words =
unsafePerformIO $ withStemmer algorithm $ \stemmer ->
forM words $ \word ->
useAsCString (encodeUtf8 word) $ \word' ->
do ptr <- sb_stemmer_stem stemmer word' (fromIntegral $ Text.length word)
len <- sb_stemmer_length stemmer
bytes <- packCStringLen (ptr,fromIntegral len)
return $ either (const word) id $ decodeUtf8' bytes
data Stemmer
foreign import ccall unsafe "libstemmer.h sb_stemmer_new"
sb_stemmer_new :: CString -> CString -> IO (Ptr Stemmer)
foreign import ccall unsafe "libstemmer.h sb_stemmer_delete"
sb_stemmer_delete :: Ptr Stemmer -> IO ()
foreign import ccall unsafe "libstemmer.h sb_stemmer_stem"
sb_stemmer_stem :: Ptr Stemmer -> CString -> CInt -> IO (CString)
foreign import ccall unsafe "libstemmer.h sb_stemmer_length"
sb_stemmer_length :: Ptr Stemmer -> IO CInt
withStemmer :: Algorithm -> (Ptr Stemmer -> IO a) -> IO a
withStemmer algorithm action =
useAsCString (algorithmName algorithm) $ \name ->
useAsCString "UTF_8" $ \utf8 ->
do stemmer <- sb_stemmer_new name utf8
action stemmer `finally` sb_stemmer_delete stemmer
algorithmName :: Algorithm -> ByteString
algorithmName algorithm =
case algorithm of
Danish -> "da"
Dutch -> "nl"
English -> "en"
Finnish -> "fi"
French -> "fr"
German -> "de"
Hungarian -> "hu"
Italian -> "it"
Norwegian -> "no"
Portuguese -> "pt"
Romanian -> "ro"
Russian -> "ru"
Spanish -> "es"
Swedish -> "sv"
Turkish -> "tr"
Porter -> "porter"