{-# LINE 1 "NLP/Morfeusz.hsc" #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LINE 2 "NLP/Morfeusz.hsc" #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE DeriveFunctor #-}

-- | The module provides the 'analyse' wrapper function which uses the
-- Morfeusz library for morphosyntactic analysis.  The result is represented
-- as a directed acylic graph (DAG) with 'Token' labeled edges.
-- The DAG representation is needed when the input word has multiple
-- correct segmentations.
--
-- >>> :m NLP.Morfeusz
-- >>> :set -XOverloadedStrings
-- >>> mapM_ print . analyse False $ "miałem"
-- Edge {from = 0, to = 1, label = Token {orth = "mia\322", interps = [Interp {base = "mie\263", msd = "praet:sg:m1.m2.m3:imperf"}]}}
-- Edge {from = 0, to = 2, label = Token {orth = "mia\322em", interps = [Interp {base = "mia\322", msd = "subst:sg:inst:m3"}]}}
-- Edge {from = 1, to = 2, label = Token {orth = "em", interps = [Interp {base = "by\263", msd = "aglt:sg:pri:imperf:wok"}]}}
--
-- You can use the 'paths' function to extract all paths from the resultant
-- DAG and, if you are not interested in all possible segmentations, just
-- take the first of possible paths:
--
-- >>> mapM_ print . paths . analyse False $ "miałem"
-- [Token {orth = "mia\322em", interps = [Interp {base = "mia\322", msd = "subst:sg:inst:m3"}]}]
-- [Token {orth = "mia\322", interps = [Interp {base = "mie\263", msd = "praet:sg:m1.m2.m3:imperf"}]},Token {orth = "em", interps = [Interp {base = "by\263", msd = "aglt:sg:pri:imperf:wok"}]}]
-- >>> mapM_ print . head . paths . analyse False $ "miałem"
-- Token {orth = "mia\322em", interps = [Interp {base = "mia\322", msd = "subst:sg:inst:m3"}]}

module NLP.Morfeusz
(
-- * Types
  DAG
, Edge (..)
, Token (..)
, Interp (..)

-- * Sentence analysis
, KeepSpaces
, analyse

-- * Utilities
, paths
) where

import System.IO.Unsafe (unsafePerformIO)
import Control.Applicative ((<$>), (<*>))
import Control.Monad (when)
import qualified Data.Map as M
import qualified Data.ByteString as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as T

import Foreign hiding (unsafePerformIO)
import Foreign.C.Types
import Foreign.C.String (CString)

import NLP.Morfeusz.Lock (lock)


{-# LINE 61 "NLP/Morfeusz.hsc" #-}

-- | Morfeusz options
newtype MorfOption = MorfOption { unMorfOption :: CInt }
    deriving (Eq, Show)

newtype Encoding = Encoding { unEncoding :: CInt }
    deriving (Eq, Show)

newtype WhiteSpace = WhiteSpace { unWhiteSpace :: CInt }
    deriving (Eq, Show)

encoding    :: MorfOption
encoding    = MorfOption 1
whitespace  :: MorfOption
whitespace  = MorfOption 2

{-# LINE 75 "NLP/Morfeusz.hsc" #-}

utf8          :: Encoding
utf8          = Encoding 8
iso8859_2     :: Encoding
iso8859_2     = Encoding 88592
cp1250        :: Encoding
cp1250        = Encoding 1250
cp852         :: Encoding
cp852         = Encoding 852

{-# LINE 81 "NLP/Morfeusz.hsc" #-}

skip_whitespace  :: WhiteSpace
skip_whitespace  = WhiteSpace 0
keep_whitespace  :: WhiteSpace
keep_whitespace  = WhiteSpace 2

{-# LINE 85 "NLP/Morfeusz.hsc" #-}

-- | Set the encoding.
setEncoding :: Encoding -> IO Bool
setEncoding enc = (1 ==) <$>
    c_morfeusz_set_option (unMorfOption encoding) (unEncoding enc)

-- | Set the Morfeusz whitespace option.
setSpace :: WhiteSpace -> IO Bool
setSpace spc = (1 ==) <$>
    c_morfeusz_set_option (unMorfOption whitespace) (unWhiteSpace spc)

-- | A directed edge with label of type @a@ between nodes of type 'Int'.
data Edge a = Edge
    { from  :: Int
    , to    :: Int
    , label :: a }
    deriving (Eq, Ord, Show, Functor)

-- | Raw morphosyntactic interpretation as presented by the Morfeusz.
data RawInterp = RawInterp
    { _orth :: T.Text
    , _base :: Maybe T.Text
    , _msd  :: Maybe T.Text }
    deriving (Eq, Ord, Show)

-- | A token with a list of recognized interpretations.  If the list of
-- interpretations is empty, the token is unknown to the Morfeusz.
data Token = Token
    { orth      :: T.Text
    , interps   :: [Interp] }
    deriving (Show)

-- | An interpretation of the word.
data Interp = Interp
    { base :: T.Text
    , msd  :: T.Text }
    deriving (Show)

-- | We only provide the peek functionality.
instance Storable (Edge RawInterp) where
    sizeOf    _ = ((20))
{-# LINE 126 "NLP/Morfeusz.hsc" #-}
    alignment _ = alignment (undefined :: CString)  -- or CInt ?
    peek ptr = do
        from <- getInt (((\hsc_ptr -> peekByteOff hsc_ptr 0)) ptr)
{-# LINE 129 "NLP/Morfeusz.hsc" #-}
        to <- getInt (((\hsc_ptr -> peekByteOff hsc_ptr 4)) ptr)
{-# LINE 130 "NLP/Morfeusz.hsc" #-}
        (orth, base, msd) <- if from == -1
            then return ("", Nothing, Nothing)
            else (,,)
                <$> getText (((\hsc_ptr -> peekByteOff hsc_ptr 8)) ptr)
{-# LINE 134 "NLP/Morfeusz.hsc" #-}
                <*> getTextMaybe (((\hsc_ptr -> peekByteOff hsc_ptr 12)) ptr)
{-# LINE 135 "NLP/Morfeusz.hsc" #-}
                <*> getTextMaybe (((\hsc_ptr -> peekByteOff hsc_ptr 16)) ptr)
{-# LINE 136 "NLP/Morfeusz.hsc" #-}
        return $ Edge from to (RawInterp orth base msd)
      where
        getInt = fmap fromIntegral :: IO CInt -> IO Int
        getText cStrIO = peekText =<< cStrIO
        getTextMaybe cStrIO = cStrIO >>= \cStr -> do
            if cStr == nullPtr
                then return Nothing
                else Just <$> peekText cStr
        peekText xs = T.decodeUtf8 <$> B.packCString xs

foreign import ccall unsafe "morfeusz_analyse"
    -- InterpMorf *morfeusz_analyse(char *tekst)
    c_morfeusz_analyse :: CString -> IO (Ptr (Edge RawInterp))

foreign import ccall unsafe "morfeusz_set_option"
    -- int morfeusz_set_option(int option, int value)
    c_morfeusz_set_option :: CInt -> CInt -> IO CInt

-- | A DAG with annotated edges. 
type DAG a = [Edge a]

-- | Keep spaces in the analysis output.
type KeepSpaces = Bool

-- | Analyse the word and output raw Morfeusz results.
analyseRaw :: KeepSpaces -> T.Text -> DAG RawInterp
analyseRaw keepSp word = run $ \cword -> lock $ do
    _ <- setEncoding utf8
    _ <- setSpace $ if keepSp then keep_whitespace else skip_whitespace
    interp_ptr <- c_morfeusz_analyse cword
    when (interp_ptr == nullPtr) (fail $ "analyseRaw: null pointer")
    retrieve 0 interp_ptr
  where
    run = unsafePerformIO . B.useAsCString (T.encodeUtf8 word)
    retrieve k ptr = do
        x <- peekElemOff ptr k
        if from x == -1
            then return []
            else (:) <$> return x <*> retrieve (k + 1) ptr

-- | Translate the DAG of raw Morfeusz interpretations to
-- DAG labeled with tokens.
properDAG :: DAG RawInterp -> DAG Token
properDAG dag =
    [Edge p q t | ((p, q), t) <- M.toAscList m]
  where
    m = M.fromListWith (<>) [((p, q), fromRaw r) | Edge p q r <- dag]
    fromRaw (RawInterp o (Just b) (Just m)) = Token o [Interp b m] 
    fromRaw (RawInterp o _ _)               = Token o [] 
    Token orth xs <> Token _ ys = Token orth (xs ++ ys)

-- | Analyse the input sentence and return the result as a DAG of tokens.
analyse :: KeepSpaces -> T.Text -> DAG Token
analyse keepSp = properDAG . analyseRaw keepSp

-- | Retrieve all paths from DAG root to leaves.
paths :: DAG a -> [[a]]
paths dag =
    doIt .fst . M.findMin $ m
  where
    m = M.fromListWith (++) [(from e, [e]) | e <- dag]
    doIt p = case M.lookup p m of
        Just es -> [(label e : path) | e <- es, path <- doIt (to e)]
        Nothing -> [[]]