{-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE OverloadedStrings #-} -- | Bindings to the Snowball library. module Text.Snowball ( Algorithm(..) , stem ) where ------------------------------------------------------------------------------- import Control.Exception (finally) import Control.Monad (forM) ------------------------------------------------------------------------------- import Data.ByteString.Char8 (ByteString, packCString, packCStringLen, useAsCString) import Data.Text (Text) import qualified Data.Text as Text import Data.Text.Encoding (decodeUtf8', encodeUtf8) ------------------------------------------------------------------------------- import Foreign (Ptr) import Foreign.C (CInt (..), CString) ------------------------------------------------------------------------------- import System.IO.Unsafe (unsafePerformIO) ------------------------------------------------------------------------------- -- | Snowball algorithm used for stemming words. data Algorithm = Danish | Dutch | English | Finnish | French | German | Hungarian | Italian | Norwegian | Portuguese | Romanian | Russian | Spanish | Swedish | Turkish | Porter -- ^ Use 'English' instead. -- | Compute the stem of a word using the specified algorithm. -- -- >>> stem English "fantastically" -- "fantast" stem :: Algorithm -> Text -> Text stem algorithm word = let [a] = stems algorithm [word] in a stems :: Algorithm -> [Text] -> [Text] stems algorithm words = unsafePerformIO $ withStemmer algorithm $ \stemmer -> forM words $ \word -> useAsCString (encodeUtf8 word) $ \word' -> do ptr <- sb_stemmer_stem stemmer word' (fromIntegral $ Text.length word) len <- sb_stemmer_length stemmer bytes <- packCStringLen (ptr,fromIntegral len) return $ either (const word) id $ decodeUtf8' bytes {-# RULES "map/stem" forall a xs. map (stem a) xs = stems a xs #-} ------------------------------------------------------------------------------- data Stemmer foreign import ccall "libstemmer.h sb_stemmer_new" sb_stemmer_new :: CString -> CString -> IO (Ptr Stemmer) foreign import ccall "libstemmer.h sb_stemmer_delete" sb_stemmer_delete :: Ptr Stemmer -> IO () foreign import ccall "libstemmer.h sb_stemmer_stem" sb_stemmer_stem :: Ptr Stemmer -> CString -> CInt -> IO (CString) foreign import ccall "libstemmer.h sb_stemmer_length" sb_stemmer_length :: Ptr Stemmer -> IO CInt withStemmer :: Algorithm -> (Ptr Stemmer -> IO a) -> IO a withStemmer algorithm action = useAsCString (algorithmName algorithm) $ \name -> useAsCString "UTF_8" $ \utf8 -> do stemmer <- sb_stemmer_new name utf8 action stemmer `finally` sb_stemmer_delete stemmer algorithmName :: Algorithm -> ByteString algorithmName algorithm = case algorithm of Danish -> "da" Dutch -> "nl" English -> "en" Finnish -> "fi" French -> "fr" German -> "de" Hungarian -> "hu" Italian -> "it" Norwegian -> "no" Portuguese -> "pt" Romanian -> "ro" Russian -> "ru" Spanish -> "es" Swedish -> "sv" Turkish -> "tr" Porter -> "porter"