{-# OPTIONS  -XEmptyDataDecls #-}
{-# LANGUAGE ForeignFunctionInterface #-}

-- | Haskell bindings for the Snowball stemming library
module NLP.Stemmer ( new
                   , delete
                   , stem
                   , unsafeStem
                   , Algorithm(..)
                   , Stemmer
                   ) where

import Data.Char        (toLower)
import Foreign          (unsafePerformIO)
import Foreign.C        (CString, CInt, peekCStringLen, newCString)
import Foreign.Ptr      (Ptr)

data StemmerStruct

-- | Pointer to a stemmer instance
type Stemmer = Ptr StemmerStruct

-- | Algorithms to create a stemmer instance for
data Algorithm = Danish
               | Dutch
               | English
               | Finnish
               | French
               | German
               | Hungarian
               | Italian
               | Norwegian
               | Portuguese
               | Romanian
               | Russian
               | Spanish
               | Swedish
               | Turkish
               | Porter
               deriving Show

foreign import ccall "libstemmer.h sb_stemmer_new"    sb_stemmer_new    :: CString -> CString -> IO Stemmer
foreign import ccall "libstemmer.h sb_stemmer_delete" sb_stemmer_delete :: Stemmer -> IO ()
foreign import ccall "libstemmer.h sb_stemmer_stem"   sb_stemmer_stem   :: Stemmer -> CString -> CInt -> IO (CString)
foreign import ccall "libstemmer.h sb_stemmer_length" sb_stemmer_length :: Stemmer -> IO CInt

-- | Create a new stemmer instance.  When you're done using the stemmer, you should 'delete' it 
new :: Algorithm -> IO Stemmer
new algorithm = do
    algorithm' <- algorithmCString algorithm
    encoding   <- utf8
    sb_stemmer_new algorithm' encoding

-- | Delete a stemmer instance.  Don't use it after deleting.
delete :: Stemmer -> IO ()
delete = sb_stemmer_delete

-- | Stem a word using the stemmer instance.
stem :: Stemmer -> String -> IO String
stem stemmer word = do
    word'  <- newCString word
    strPtr <- sb_stemmer_stem   stemmer word' (fromIntegral $ length word)
    strLen <- sb_stemmer_length stemmer
    peekCStringLen (strPtr, fromIntegral strLen)

{-# NOINLINE unsafeStem #-}
-- | Unsafe stemming, use with care.
unsafeStem :: Stemmer -> String -> String
unsafeStem s = unsafePerformIO . stem s

-----------------------------------------------------------

utf8 :: IO CString
utf8 = newCString "UTF_8"

algorithmCString :: Algorithm -> IO CString
algorithmCString = newCString . firstLower . show
    where firstLower (first:rest) = (toLower first) : rest