module NLP.Morfeusz
(
DAG
, Edge (..)
, Token (..)
, Interp (..)
, KeepSpaces
, analyse
, paths
) where
import System.IO.Unsafe (unsafePerformIO)
import Control.Applicative ((<$>), (<*>))
import Control.Monad (when)
import qualified Data.Map as M
import qualified Data.ByteString as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Foreign hiding (unsafePerformIO)
import Foreign.C.Types
import Foreign.C.String (CString)
import NLP.Morfeusz.Lock (lock)
newtype MorfOption = MorfOption { unMorfOption :: CInt }
deriving (Eq, Show)
newtype Encoding = Encoding { unEncoding :: CInt }
deriving (Eq, Show)
newtype WhiteSpace = WhiteSpace { unWhiteSpace :: CInt }
deriving (Eq, Show)
encoding :: MorfOption
encoding = MorfOption 1
whitespace :: MorfOption
whitespace = MorfOption 2
utf8 :: Encoding
utf8 = Encoding 8
iso8859_2 :: Encoding
iso8859_2 = Encoding 88592
cp1250 :: Encoding
cp1250 = Encoding 1250
cp852 :: Encoding
cp852 = Encoding 852
skip_whitespace :: WhiteSpace
skip_whitespace = WhiteSpace 0
keep_whitespace :: WhiteSpace
keep_whitespace = WhiteSpace 2
setEncoding :: Encoding -> IO Bool
setEncoding enc = (1 ==) <$>
c_morfeusz_set_option (unMorfOption encoding) (unEncoding enc)
setSpace :: WhiteSpace -> IO Bool
setSpace spc = (1 ==) <$>
c_morfeusz_set_option (unMorfOption whitespace) (unWhiteSpace spc)
data Edge a = Edge
{ from :: Int
, to :: Int
, label :: a }
deriving (Eq, Ord, Show, Functor)
data RawInterp = RawInterp
{ _orth :: T.Text
, _base :: Maybe T.Text
, _msd :: Maybe T.Text }
deriving (Eq, Ord, Show)
data Token = Token
{ orth :: T.Text
, interps :: [Interp] }
deriving (Show)
data Interp = Interp
{ base :: T.Text
, msd :: T.Text }
deriving (Show)
instance Storable (Edge RawInterp) where
sizeOf _ = ((20))
alignment _ = alignment (undefined :: CString)
peek ptr = do
from <- getInt (((\hsc_ptr -> peekByteOff hsc_ptr 0)) ptr)
to <- getInt (((\hsc_ptr -> peekByteOff hsc_ptr 4)) ptr)
(orth, base, msd) <- if from == 1
then return ("", Nothing, Nothing)
else (,,)
<$> getText (((\hsc_ptr -> peekByteOff hsc_ptr 8)) ptr)
<*> getTextMaybe (((\hsc_ptr -> peekByteOff hsc_ptr 12)) ptr)
<*> getTextMaybe (((\hsc_ptr -> peekByteOff hsc_ptr 16)) ptr)
return $ Edge from to (RawInterp orth base msd)
where
getInt = fmap fromIntegral :: IO CInt -> IO Int
getText cStrIO = peekText =<< cStrIO
getTextMaybe cStrIO = cStrIO >>= \cStr -> do
if cStr == nullPtr
then return Nothing
else Just <$> peekText cStr
peekText xs = T.decodeUtf8 <$> B.packCString xs
foreign import ccall unsafe "morfeusz_analyse"
c_morfeusz_analyse :: CString -> IO (Ptr (Edge RawInterp))
foreign import ccall unsafe "morfeusz_set_option"
c_morfeusz_set_option :: CInt -> CInt -> IO CInt
type DAG a = [Edge a]
type KeepSpaces = Bool
analyseRaw :: KeepSpaces -> T.Text -> DAG RawInterp
analyseRaw keepSp word = run $ \cword -> lock $ do
_ <- setEncoding utf8
_ <- setSpace $ if keepSp then keep_whitespace else skip_whitespace
interp_ptr <- c_morfeusz_analyse cword
when (interp_ptr == nullPtr) (fail $ "analyseRaw: null pointer")
retrieve 0 interp_ptr
where
run = unsafePerformIO . B.useAsCString (T.encodeUtf8 word)
retrieve k ptr = do
x <- peekElemOff ptr k
if from x == 1
then return []
else (:) <$> return x <*> retrieve (k + 1) ptr
properDAG :: DAG RawInterp -> DAG Token
properDAG dag =
[Edge p q t | ((p, q), t) <- M.toAscList m]
where
m = M.fromListWith (<>) [((p, q), fromRaw r) | Edge p q r <- dag]
fromRaw (RawInterp o (Just b) (Just m)) = Token o [Interp b m]
fromRaw (RawInterp o _ _) = Token o []
Token orth xs <> Token _ ys = Token orth (xs ++ ys)
analyse :: KeepSpaces -> T.Text -> DAG Token
analyse keepSp = properDAG . analyseRaw keepSp
paths :: DAG a -> [[a]]
paths dag =
doIt .fst . M.findMin $ m
where
m = M.fromListWith (++) [(from e, [e]) | e <- dag]
doIt p = case M.lookup p m of
Just es -> [(label e : path) | e <- es, path <- doIt (to e)]
Nothing -> [[]]