module NLP.Concraft.Polish.Maca
(
MacaPool
, newMacaPool
, macaPar
) where
import Control.Applicative ((<$>))
import Control.Monad (void, forever, guard, replicateM, unless)
import Control.Concurrent
import Control.Exception
import System.Process
import System.IO
import Data.Function (on)
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Control.Monad.State.Strict as S
import qualified Control.Monad.Trans.Maybe as M
import Control.Monad.Trans.Class (lift)
import NLP.Concraft.Polish.Morphosyntax hiding (restore)
import qualified NLP.Concraft.Polish.Format.Plain as Plain
type In = Chan T.Text
type Out = Chan [Sent Tag]
newtype Maca = Maca (In, Out)
newMaca :: IO Maca
newMaca = do
inCh <- newChan
outCh <- newChan
void $ runMacaOn inCh outCh
return $ Maca (inCh, outCh)
runMacaOn :: In -> Out -> IO ThreadId
runMacaOn inCh outCh = forkIO . mask $ \restore -> do
let cmd = "maca-analyse"
args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"]
(Just inh, Just outh, Just errh, pid) <-
createProcess (proc cmd args){ std_in = CreatePipe
, std_out = CreatePipe
, std_err = CreatePipe }
let excHandler = do
let tryIO = try :: IO a -> IO (Either IOException a)
void $ tryIO $ do
err <- hGetContents errh
unless (all C.isSpace err) $ do
putStr "Maca error: "
putStrLn err
hClose inh; hClose outh; hClose errh
terminateProcess pid
waitForProcess pid
hSetBuffering outh LineBuffering
flip onException excHandler $ restore $ forever $ do
txt <- readChan inCh
T.hPutStr inh txt; hFlush inh
writeChan outCh =<< readMacaResponse outh (textWeight txt)
readMacaResponse :: Handle -> Int -> IO [Sent Tag]
readMacaResponse h n
| n <= 0 = return []
| otherwise = do
x <- readMacaSent h
xs <- readMacaResponse h (n sentWeight x)
return (x : xs)
readMacaSent :: Handle -> IO (Sent Tag)
readMacaSent h =
Plain.parseSent . L.unlines <$> getTxt
where
getTxt = do
x <- L.hGetLine h
if L.null x
then return []
else (x:) <$> getTxt
doMacaPar :: Maca -> T.Text -> IO [Sent Tag]
doMacaPar (Maca (inCh, outCh)) par = do
let par' = T.intercalate " " (T.lines par) `T.append` "\n"
writeChan inCh par'
restoreSpaces par <$> readChan outCh
restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag]
restoreSpaces par sents =
S.evalState (mapM onSent sents) (0, chunks)
where
parts = T.groupBy ((==) `on` C.isSpace) par
weights = scanl1 (+) (map textWeight parts)
chunks = filter (T.any C.isSpace . fst) (zip parts weights)
onSent = mapM onWord
onWord seg = do
n <- addWeight seg
s <- popSpace n
let word' = (word seg) { space = s }
return $ seg { word = word' }
addWeight seg = S.state $ \(n, xs) ->
let m = n + segWeight seg
in (m, (m, xs))
popSpace n = fmap (maybe None id) . M.runMaybeT $ do
spaces <- lift $ S.gets snd
(sp, m) <- liftMaybe $ maybeHead spaces
guard $ m < n
lift $ S.modify $ \(n', xs) -> (n', tail xs)
return $ toSpace sp
liftMaybe = M.MaybeT . return
maybeHead xs = case xs of
(x:_) -> Just x
[] -> Nothing
toSpace x
| has '\n' = NewLine
| has ' ' = Space
| otherwise = None
where has c = maybe False (const True) (T.find (==c) x)
newtype MacaPool = MacaPool (Chan Maca)
newMacaPool
:: Int
-> IO MacaPool
newMacaPool n = do
chan <- newChan
macas <- replicateM n newMaca
writeList2Chan chan macas
return $ MacaPool chan
popMaca :: MacaPool -> IO Maca
popMaca (MacaPool c) = readChan c
putMaca :: Maca -> MacaPool -> IO ()
putMaca x (MacaPool c) = writeChan c x
macaPar :: MacaPool -> T.Text -> IO [Sent Tag]
macaPar pool par = do
maca <- popMaca pool
doMacaPar maca par `finally` putMaca maca pool
textWeight :: T.Text -> Int
textWeight = T.length . T.filter (not . C.isSpace)
segWeight :: Seg t -> Int
segWeight = textWeight . orth . word
sentWeight :: Sent t -> Int
sentWeight = sum . map segWeight