src/NLP/Snowball.hs

-- |
--   Portability: Haskell2010
--
--   Bindings to the Snowball library.
module NLP.Snowball
    ( -- * Pure interface
      Algorithm(..)
    , stem
    , stems
      -- * IO interface
    , Stemmer
    , newStemmer
    , stemWith
    , stemsWith
    )
  where

-------------------------------------------------------------------------------
import           Control.Concurrent    (MVar, newMVar, withMVar)
import           Control.Monad         (forM, when)
-------------------------------------------------------------------------------
import           Data.ByteString.Char8 (ByteString, pack, packCStringLen,
                                        useAsCString)
import           Data.Text             (Text)
import qualified Data.Text             as Text
import           Data.Text.ICU.Convert (Converter, fromUnicode, open,
                                        toUnicode)
-------------------------------------------------------------------------------
import           Foreign               (ForeignPtr, FunPtr, Ptr, newForeignPtr,
                                        nullPtr, withForeignPtr)
import           Foreign.C             (CInt (..), CString)
-------------------------------------------------------------------------------
import           System.IO.Unsafe      (unsafePerformIO)
-------------------------------------------------------------------------------


-- | Snowball algorithm used for stemming words.
data Algorithm
    = Danish
    | Dutch
    | English
    | Finnish
    | French
    | German
    | Hungarian
    | Italian
    | Norwegian
    | Portuguese
    | Romanian
    | Russian
    | Spanish
    | Swedish
    | Turkish
    | Porter -- ^ Use 'English' instead.

-- | Compute the stem of a word using the specified algorithm.
--
--   >>> stem English "fantastically"
--   "fantast"
stem :: Algorithm -> Text -> Text
stem algorithm word = let [a] = stems algorithm [word] in a

-- | Compute the stems of several words in one go.  This can be more
--   efficient than @'map' 'stem'@ because it uses a single 'Stemmer'
--   instance, however the @map@ version is rewritten to use this function
--   with a rewrite rule.  You can still use this function though if you
--   want to make sure it is used or if you find it more convenient.
stems :: Algorithm -> [Text] -> [Text]
stems algorithm ws =
    unsafePerformIO $
      do stemmer <- newStemmer algorithm
         stemsWith stemmer ws

{-# RULES "map/stem" forall a xs. map (stem a) xs = stems a xs #-}


-------------------------------------------------------------------------------

-- | A thread and memory safe Snowball stemmer instance.
newtype Stemmer = Stemmer (MVar (ForeignPtr Struct,Converter))

-- | Create a new reusable 'Stemmer' instance.
newStemmer :: Algorithm  -> IO Stemmer
newStemmer algorithm =
    useAsCString (algorithmName algorithm) $ \name ->
      useAsCString (encodingName encoding) $ \utf8 ->
        do struct <- sb_stemmer_new name utf8
           when (struct == nullPtr) $
             error "Text.Snowball.newStemmer: nullPtr"
           structPtr <- newForeignPtr sb_stemmer_delete struct
           converter <- open (converterName encoding) Nothing
           mvar <- newMVar (structPtr,converter)
           return $ Stemmer mvar
  where
    encoding = algorithmEncoding algorithm

-- | Use a 'Stemmer' to stem a word.  This can be used more efficiently
--   than 'stem' because you can keep a stemmer around and reuse it, but it
--   requires 'IO' to ensure thread safety.
--
--   In my benchmarks, this (and 'stemsWith') is faster than 'stem' for
--   a few hundred words, but slower for larger number of words.  I don't
--   know if this is a problem with my benchmarks, with these bindings or
--   with the Snowball library itself, so make sure to benchmark yourself
--   if speed is a concern, and consider caching stems with e.g.
--   a @HashMap@.
stemWith :: Stemmer -> Text -> IO Text
stemWith stemmer word = do
    [a] <- stemsWith stemmer [word]
    return a

-- | Use a 'Stemmer' to stem multiple words in one go.  This can be more
--   efficient than @'mapM' 'stemWith'@ because the 'Stemmer' is only
--   locked once.
stemsWith :: Stemmer -> [Text] -> IO [Text]
stemsWith (Stemmer mvar) ws =
    withMVar mvar $ \(structPtr,converter) ->
      withForeignPtr structPtr $ \struct ->
        forM ws $ \word ->
          useAsCString (fromUnicode converter word) $ \word' ->
            do ptr <- sb_stemmer_stem struct word' $
                        fromIntegral $ Text.length word
               len <- sb_stemmer_length struct
               bytes <- packCStringLen (ptr,fromIntegral len)
               return $ toUnicode converter bytes


-------------------------------------------------------------------------------

data Struct

foreign import ccall unsafe "libstemmer.h sb_stemmer_new"
    sb_stemmer_new :: CString -> CString -> IO (Ptr Struct)

foreign import ccall unsafe "libstemmer.h &sb_stemmer_delete"
    sb_stemmer_delete :: FunPtr (Ptr Struct -> IO ())

foreign import ccall unsafe "libstemmer.h sb_stemmer_stem"
    sb_stemmer_stem :: Ptr Struct -> CString -> CInt -> IO (CString)

foreign import ccall unsafe "libstemmer.h sb_stemmer_length"
    sb_stemmer_length :: Ptr Struct -> IO CInt

algorithmName :: Algorithm -> ByteString
algorithmName algorithm =
    case algorithm of
      Danish     -> pack "da"
      Dutch      -> pack "nl"
      English    -> pack "en"
      Finnish    -> pack "fi"
      French     -> pack "fr"
      German     -> pack "de"
      Hungarian  -> pack "hu"
      Italian    -> pack "it"
      Norwegian  -> pack "no"
      Portuguese -> pack "pt"
      Romanian   -> pack "ro"
      Russian    -> pack "ru"
      Spanish    -> pack "es"
      Swedish    -> pack "sv"
      Turkish    -> pack "tr"
      Porter     -> pack "porter"

data Encoding = UTF_8 | ISO_8859_1 | ISO_8859_2 | KOI8_R

encodingName :: Encoding -> ByteString
encodingName encoding =
    case encoding of
      UTF_8      -> pack "UTF_8"
      ISO_8859_1 -> pack "ISO_8859_1"
      ISO_8859_2 -> pack "ISO_8859_2"
      KOI8_R     -> pack "KOI8_R"

converterName :: Encoding -> String
converterName encoding =
    case encoding of
      UTF_8      -> "UTF-8"
      ISO_8859_1 -> "ISO-8859-1"
      ISO_8859_2 -> "ISO-8859-2"
      KOI8_R     -> "KOI8-R"

algorithmEncoding :: Algorithm -> Encoding
algorithmEncoding algorithm =
    case algorithm of
      Romanian -> ISO_8859_2
      Russian  -> KOI8_R
      Turkish  -> UTF_8
      _        -> ISO_8859_1