module NLP.Snowball
(
Algorithm(..)
, stem
, stems
, Stemmer
, newStemmer
, stemWith
, stemsWith
)
where
import Control.Concurrent (MVar, newMVar, withMVar)
import Control.Monad (forM, when)
import Data.ByteString.Char8 (ByteString, pack, packCStringLen,
useAsCString)
import Data.Text (Text)
import qualified Data.Text as Text
import Data.Text.ICU.Convert (Converter, fromUnicode, open,
toUnicode)
import Foreign (ForeignPtr, FunPtr, Ptr, newForeignPtr,
nullPtr, withForeignPtr)
import Foreign.C (CInt (..), CString)
import System.IO.Unsafe (unsafePerformIO)
data Algorithm
= Danish
| Dutch
| English
| Finnish
| French
| German
| Hungarian
| Italian
| Norwegian
| Portuguese
| Romanian
| Russian
| Spanish
| Swedish
| Turkish
| Porter
stem :: Algorithm -> Text -> Text
stem algorithm word = let [a] = stems algorithm [word] in a
stems :: Algorithm -> [Text] -> [Text]
stems algorithm ws =
unsafePerformIO $
do stemmer <- newStemmer algorithm
stemsWith stemmer ws
newtype Stemmer = Stemmer (MVar (ForeignPtr Struct,Converter))
newStemmer :: Algorithm -> IO Stemmer
newStemmer algorithm =
useAsCString (algorithmName algorithm) $ \name ->
useAsCString (encodingName encoding) $ \utf8 ->
do struct <- sb_stemmer_new name utf8
when (struct == nullPtr) $
error "Text.Snowball.newStemmer: nullPtr"
structPtr <- newForeignPtr sb_stemmer_delete struct
converter <- open (converterName encoding) Nothing
mvar <- newMVar (structPtr,converter)
return $ Stemmer mvar
where
encoding = algorithmEncoding algorithm
stemWith :: Stemmer -> Text -> IO Text
stemWith stemmer word = do
[a] <- stemsWith stemmer [word]
return a
stemsWith :: Stemmer -> [Text] -> IO [Text]
stemsWith (Stemmer mvar) ws =
withMVar mvar $ \(structPtr,converter) ->
withForeignPtr structPtr $ \struct ->
forM ws $ \word ->
useAsCString (fromUnicode converter word) $ \word' ->
do ptr <- sb_stemmer_stem struct word' $
fromIntegral $ Text.length word
len <- sb_stemmer_length struct
bytes <- packCStringLen (ptr,fromIntegral len)
return $ toUnicode converter bytes
data Struct
foreign import ccall unsafe "libstemmer.h sb_stemmer_new"
sb_stemmer_new :: CString -> CString -> IO (Ptr Struct)
foreign import ccall unsafe "libstemmer.h &sb_stemmer_delete"
sb_stemmer_delete :: FunPtr (Ptr Struct -> IO ())
foreign import ccall unsafe "libstemmer.h sb_stemmer_stem"
sb_stemmer_stem :: Ptr Struct -> CString -> CInt -> IO (CString)
foreign import ccall unsafe "libstemmer.h sb_stemmer_length"
sb_stemmer_length :: Ptr Struct -> IO CInt
algorithmName :: Algorithm -> ByteString
algorithmName algorithm =
case algorithm of
Danish -> pack "da"
Dutch -> pack "nl"
English -> pack "en"
Finnish -> pack "fi"
French -> pack "fr"
German -> pack "de"
Hungarian -> pack "hu"
Italian -> pack "it"
Norwegian -> pack "no"
Portuguese -> pack "pt"
Romanian -> pack "ro"
Russian -> pack "ru"
Spanish -> pack "es"
Swedish -> pack "sv"
Turkish -> pack "tr"
Porter -> pack "porter"
data Encoding = UTF_8 | ISO_8859_1 | ISO_8859_2 | KOI8_R
encodingName :: Encoding -> ByteString
encodingName encoding =
case encoding of
UTF_8 -> pack "UTF_8"
ISO_8859_1 -> pack "ISO_8859_1"
ISO_8859_2 -> pack "ISO_8859_2"
KOI8_R -> pack "KOI8_R"
converterName :: Encoding -> String
converterName encoding =
case encoding of
UTF_8 -> "UTF-8"
ISO_8859_1 -> "ISO-8859-1"
ISO_8859_2 -> "ISO-8859-2"
KOI8_R -> "KOI8-R"
algorithmEncoding :: Algorithm -> Encoding
algorithmEncoding algorithm =
case algorithm of
Romanian -> ISO_8859_2
Russian -> KOI8_R
Turkish -> UTF_8
_ -> ISO_8859_1