{-# OPTIONS  -XEmptyDataDecls #-}
{-# LANGUAGE ForeignFunctionInterface #-}

-- | Haskell bindings for the Snowball stemming library.
-- This module contains all the low-level functions and are more or less direct
-- translations of the foreign function calls.  The 'stem' function expects
-- strings to use UTF-8 encoding.
module NLP.Stemmer.C (
    -- * Types
      Algorithm(..)
    , Stemmer
    -- * Low level functions
    , new
    , delete
    , stem
    -- * Wrapper
    , withStemmer
    ) where

import Data.Char        (toLower)
import Foreign.C        (CString, CInt, peekCStringLen, newCString)
import Foreign.Ptr      (Ptr)

data StemmerStruct

-- | Pointer to a stemmer instance
type Stemmer = Ptr StemmerStruct

-- | Available algorithms.  For English, 'English' is recommended over 'Porter'.
data Algorithm = Danish
               | Dutch
               | English
               | Finnish
               | French
               | German
               | Hungarian
               | Italian
               | Norwegian
               | Portuguese
               | Romanian
               | Russian
               | Spanish
               | Swedish
               | Turkish
               | Porter
               deriving Show

foreign import ccall "libstemmer.h sb_stemmer_new"    sb_stemmer_new    :: CString -> CString -> IO Stemmer
foreign import ccall "libstemmer.h sb_stemmer_delete" sb_stemmer_delete :: Stemmer -> IO ()
foreign import ccall "libstemmer.h sb_stemmer_stem"   sb_stemmer_stem   :: Stemmer -> CString -> CInt -> IO (CString)
foreign import ccall "libstemmer.h sb_stemmer_length" sb_stemmer_length :: Stemmer -> IO CInt

-- | Create a new stemmer instance.  When you're done using the stemmer, you
-- should 'delete' it, freeing the memory.
new :: Algorithm -> IO Stemmer
new algorithm = do
    algorithm' <- algorithmCString algorithm
    encoding   <- utf8
    sb_stemmer_new algorithm' encoding

-- | Delete a stemmer instance.  Don't use it after deleting.
delete :: Stemmer -> IO ()
delete = sb_stemmer_delete

-- | Stem a word using the stemmer instance.
stem :: Stemmer -> String -> IO String
stem stemmer word = do
    word'  <- newCString word
    strPtr <- sb_stemmer_stem   stemmer word' (fromIntegral $ length word)
    strLen <- sb_stemmer_length stemmer
    peekCStringLen (strPtr, fromIntegral strLen)

-----------------------------------------------------------

-- | Perform stemming using a wrapper handling the low-level creation and
-- deletion of the stemmer instance.
withStemmer :: Algorithm -> (Stemmer -> IO a) -> IO a
withStemmer algorithm action = do
    stemmer <- new algorithm
    result  <- action stemmer
    delete stemmer
    return result

-----------------------------------------------------------

utf8 :: IO CString
utf8 = newCString "UTF_8"

algorithmCString :: Algorithm -> IO CString
algorithmCString = newCString . firstLower . show
    where firstLower (first:rest) = (toLower first) : rest