-- GenI surface realiser -- Copyright (C) 2005 Carlos Areces and Eric Kow -- -- This program is free software; you can redistribute it and/or -- modify it under the terms of the GNU General Public License -- as published by the Free Software Foundation; either version 2 -- of the License, or (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. {-| This module handles mostly everything to do with morphology in Geni. There are two basic tasks: morphological input and output. GenI farms out morphology to whatever third party program you specify on the command line. Note that a simple and stupid ``sillymorph'' realiser is provided either in the GenI repository or on hackage. -} {-# OPTIONS_GHC -fno-warn-orphans #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE OverloadedStrings #-} module NLP.GenI.Morphology ( module NLP.GenI.Morphology.Types -- * Morphological predicates , readMorph, stripMorphSem, attachMorph, setMorphAnchor -- * Morphological realisation , inflectSentencesUsingCmd, sansMorph ) where import Control.Applicative import Control.Concurrent (forkIO) import Control.Exception (IOException, bracket, catch, evaluate) import qualified Data.Map as Map import Data.Maybe (fromMaybe, isNothing) import Data.Text (Text) import qualified Data.Text as T import Data.Tree import Data.Typeable import Prelude hiding (catch) import System.Exit import System.IO import System.Process import System.Log.Logger import Text.JSON import qualified Text.JSON as J import Text.JSON.Pretty hiding ((<+>), (<>)) import NLP.GenI.FeatureStructure import NLP.GenI.General import NLP.GenI.GeniVal (GeniVal, mkGAnon, replace) import NLP.GenI.Morphology.Types import NLP.GenI.Parser import NLP.GenI.Pretty import NLP.GenI.Semantics (Literal (..), Sem) import NLP.GenI.Tag import NLP.GenI.TreeSchema (GNode (..), GType (..)) -- ---------------------------------------------------------------------- -- Morphological input -- ---------------------------------------------------------------------- -- | Converts information from a morphological information file into GenI's -- internal format. readMorph :: [(Text,[AvPair GeniVal])] -> MorphInputFn readMorph minfo lit = Map.lookup key fm where fm = Map.fromList minfo key = pretty (lPredicate lit) -- | Filters away from an input semantics any literals whose realisation is -- strictly morphological. The first argument tells us helps identify the -- morphological literals -- it associates literals with morphological stuff; -- if it returns 'Nothing', then it is non-morphological stripMorphSem :: MorphInputFn -> Sem -> Sem stripMorphSem morphfn tsem = [ l | l <- tsem, (isNothing.morphfn) l ] -- | 'attachMorph' @morphfn sem cands@ does the bulk of the morphological -- input processing. We use @morphfn@ to determine which literals in -- @sem@ contain morphological information and what information they contain. -- Then we attach this morphological information to the relevant trees in -- @cand@. A tree is considered relevant w.r.t to a morphological -- literal if its semantics contains at least one literal whose first index -- is the same as the first index of the morphological literal. attachMorph :: MorphInputFn -> Sem -> [TagElem] -> [TagElem] attachMorph morphfn sem cands = let -- relevance of a tree wrt to an index relTree i = not.null.relfilt.tsemantics where relfilt = filter (relLit i) relLit i l = case lArgs l of [] -> False (x:_) -> x == i -- perform the attachment for a tree if it is relevant attachHelper :: GeniVal -> Flist GeniVal -> TagElem -> TagElem attachHelper i mfs t = if relTree i t then attachMorphHelper mfs t else t -- perform all attachments for a literal attach :: Literal GeniVal -> [TagElem] -> [TagElem] attach l cs = case morphfn l of Nothing -> cs Just mfs -> map (attachHelper i mfs) cs where i = case lArgs l of [] -> mkGAnon (x:_) -> x in foldr attach cands sem -- | Actually unify the morphological features into the anchor node -- -- FIXME: we'll need to make sure this still works as promised -- when we implement co-anchors. attachMorphHelper :: Flist GeniVal -> TagElem -> TagElem attachMorphHelper mfs te = let -- unification with anchor tt = ttree te anchor = head $ filterTree fn tt where fn a = (ganchor a && gtype a == Lex) in case unifyFeat mfs (gup anchor) of Left err -> error . T.unpack $ "Morphological unification failure on" <+> idname te <> ":" <+> err Right (unf,subst) -> let -- perform replacements te2 = replace subst te tt2 = ttree te2 -- replace the anchor with the unification results newgdown = replace subst (gdown anchor) newa = anchor { gup = unf, gdown = newgdown } in te2 { ttree = setMorphAnchor newa tt2 } -- | @setMorphAnchor n t@ replaces the anchor node of a tree with @n@ -- -- We assume the tree has exactly one anchor node. If it has none, -- this explodes; if it has more than one, they all get replaced. setMorphAnchor :: GNode GeniVal -> Tree (GNode GeniVal) -> Tree (GNode GeniVal) setMorphAnchor n t = fromMaybe (error oops) $ repNode fn filt t where filt (Node a _) = gtype a == Lex && ganchor a fn (Node _ l) = Node n l oops = "NLP.GenI.Morphology.setMorphAnchor did not anticipate failure was possible" -- ---------------------------------------------------------------------- -- Morphological realisation -- ---------------------------------------------------------------------- -- | Extracts the lemmas from a list of uninflected sentences. This is used -- when the morphological generator is unavailable, doesn't work, etc. sansMorph :: LemmaPlusSentence -> MorphOutput sansMorph = MorphOutput [] . singleton . T.unwords . map lem where lem (LemmaPlus l _) = l -- | Converts a list of uninflected sentences into inflected ones by calling --- the third party software. -- FIXME: this doesn't actually support lists-of-results per input -- will need to work it out -- HUH? What makes me say that? inflectSentencesUsingCmd :: String -> [LemmaPlusSentence] -> IO [(LemmaPlusSentence,MorphOutput)] inflectSentencesUsingCmd morphcmd sentences = doit `catch` \e -> let _ = e :: IOException in (fallback (show e)) where hCloseSloppy h = hClose h `catch` \err -> let _ = err :: IOException in warningM logname (show err) doit = bracket (do debugM logname $ "Starting morph generator: " ++ morphcmd runInteractiveCommand morphcmd) (\(inh,outh,errh,_) -> do debugM logname $ "Closing output handles from morph generator" mapM hCloseSloppy [inh, outh, errh]) $ \(toP,fromP,errP,pid) -> do debugM logname $ "Sending " ++ show (length sentences) ++ " sentences to morph generator" hPutStrLn toP . render . pp_value . showJSON $ sentences debugM logname $ "Closing input handle to morph generator" hClose toP -- see http://www.haskell.org/pipermail/haskell-cafe/2008-May/042994.html -- fork off a thread to pull on the stderr -- so if the process writes to stderr we do not block. -- NB. do the hGetContents synchronously, otherwise the outer -- bracket can exit before this thread has run, and hGetContents -- will fail. err <- hGetContents errP _ <- forkIO (evaluate (length err) >> warningM logname err) -- wait for all the output output <- hGetContents fromP _ <- evaluate (length output) -- wait for the program to terminate exitcode <- waitForProcess pid debugM logname $ "Morph command exited" -- on failure, throw the exit code as an exception if exitcode == ExitSuccess then case resultToEither (decode output) of Left jerr -> fallback $ "Could not parse morphological generator output: " ++ jerr Right res -> do let lenResults = length res lenSentences = length sentences if lenResults == lenSentences then return $ zip sentences res else fallback $ "Morphological generator returned " ++ show lenResults ++ " results for " ++ show lenSentences ++ " inputs" `catch` \e -> let _ = e :: IOException in fallback ("Error calling morphological generator:\n" ++ show e) else fallback "Morph generator failed" fallback err = do errorM logname err return $ map (\x -> (x, sansMorph x)) sentences -- --------------------------------------------------------------------- -- parsers -- --------------------------------------------------------------------- instance JSON MorphOutput where readJSON j = case fromJSObject `fmap` readJSON j of J.Error _ -> MorphOutput [] <$> readJSON j J.Ok jo -> do let field x = maybe (fail $ "Could not find: " ++ x) readJSON $ lookup x jo warnings = maybe (return []) readJSON (lookup "warnings" jo) MorphOutput <$> warnings <*> field "realisations" showJSON _ = error "Don't know how to render MorphOutput" instance JSON LemmaPlus where readJSON j = do jo <- fromJSObject `fmap` readJSON j let field x = maybe (fail $ "Could not find: " ++ x) readJSON $ lookup x jo tfield = fmap T.pack . field LemmaPlus <$> field "lemma" <*> (parsecToJSON "lemma-features" geniFeats =<< tfield "lemma-features") showJSON (LemmaPlus l fs) = JSObject . toJSObject $ [ ("lemma" , showJSON l) , ("lemma-features", showJSON $ prettyStr fs) ] parsecToJSON :: Monad m => String -> Parser b -> Text -> m b parsecToJSON description p str = case runParser p () "" str of Left err -> fail $ "Couldn't parse " ++ description ++ " because " ++ show err Right res -> return res -- ---------------------------------------------------------------------- -- odds and ends -- ---------------------------------------------------------------------- singleton :: a -> [a] singleton x = [x] data MNAME = MNAME deriving Typeable logname :: String logname = mkLogname MNAME