% GenI surface realiser
% Copyright (C) 2005 Carlos Areces and Eric Kow
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
\chapter{Geni}
\label{cha:Geni}
Geni is the interface between the front and backends of the generator. The GUI
and the console interface both talk to this module, and in turn, this module
talks to the input file parsers and the surface realisation engine. This
module also does lexical selection and anchoring because these processes might
involve some messy IO performance tricks.
\begin{code}
module NLP.GenI.Geni (
ProgState(..), ProgStateRef, emptyProgState,
initGeni,
runGeni, runGeniWithSelector,
GeniResult(..), ResultType(..),
lemmaSentenceString, prettyResult,
showRealisations, groupAndCount,
getTraces, Selector,
loadEverything, loadLexicon, loadGeniMacros,
loadTestSuite, loadTargetSemStr,
loadRanking, readRanking,
combine,
chooseLexCand,
)
where
\end{code}
\ignore{
\begin{code}
import Control.Applicative ((<$>),(<*>))
import Control.Arrow ((&&&))
import Control.Monad.Error
import Control.Monad (unless)
import Data.Binary (Binary, decodeFile)
import Data.Function ( on )
import Data.IORef (IORef, readIORef, modifyIORef)
import Data.List
import Data.List.Split ( wordsBy )
import qualified Data.Map as Map
import Data.Maybe (mapMaybe, fromMaybe, isJust)
import Data.Tree (Tree(Node))
import Data.Typeable (Typeable)
import qualified System.IO.UTF8 as UTF8
import System.IO.Unsafe (unsafePerformIO)
import Text.JSON
import NLP.GenI.General(filterTree, repAllNode,
groupAndCount, multiGroupByFM,
geniBug,
repNodeByNode,
fst3,
ePutStr, ePutStrLn, eFlush,
)
import NLP.GenI.Btypes
(Macros, MTtree, ILexEntry, Lexicon,
replace, replaceList,
Sem, SemInput, TestCase(..), sortSem, subsumeSem, params,
GeniVal(GConst), fromGVar, AvPair(..),
GNode(ganchor, gnname, gup, gdown, gaconstr, gtype, gorigin),
GType(Subs, Other),
isemantics, ifamname, iword, iparams, iequations,
iinterface, ifilters,
isempols,
toKeys,
showLexeme, showSem,
pidname, pfamily, pinterface, ptype, psemantics, ptrace,
setAnchor, setLexeme, tree, unifyFeat,
alphaConvert,
)
import NLP.GenI.BtypesBinary ()
import NLP.GenI.Tags (Tags, TagElem, emptyTE,
idname, ttreename,
ttype, tsemantics, ttree, tsempols,
tinterface, ttrace,
setTidnums)
import NLP.GenI.Configuration
( Params, getFlagP, hasFlagP, hasOpt, Optimisation(NoConstraints)
, MacrosFlg(..), LexiconFlg(..), TestSuiteFlg(..), TestCaseFlg(..)
, MorphInfoFlg(..), MorphCmdFlg(..)
, RankingConstraintsFlg(..)
, PartialFlg(..)
, FromStdinFlg(..), VerboseModeFlg(..)
, NoLoadTestSuiteFlg(..)
, TracesFlg(..)
, grammarType
, GrammarType(..) )
import qualified NLP.GenI.Builder as B
import NLP.GenI.GeniParsers (geniMacros, geniTagElems,
geniLexicon, geniTestSuite,
geniTestSuiteString, geniSemanticInput,
geniMorphInfo,
parseFromFile, runParser, Parser,
)
import NLP.GenI.Morphology
import NLP.GenI.OptimalityTheory
import NLP.GenI.Statistics (Statistics)
\end{code}
}
\begin{code}
-- | Sentinel key used for lexicon entries that have an empty semantics;
--   see 'mapBySemKeys' and 'chooseLexCand', which file and look up
--   null-semantic entries under this key.
myEMPTY :: String
myEMPTY = "MYEMPTY"
\end{code}
%
\section{ProgState}
%
\begin{code}
-- | The global state of the generator, threaded through the program as
--   an 'IORef' (see 'ProgStateRef').
data ProgState = ST
  { pa       :: Params     -- ^ configuration \/ command line parameters
  , gr       :: Macros     -- ^ the tree (macro) grammar
  , le       :: Lexicon    -- ^ the lexicon, keyed by semantic keys
  , morphinf :: MorphFn    -- ^ morphological info lookup for a literal
  , ts       :: SemInput   -- ^ the current target semantics
  , tcase    :: String     -- ^ name of the current test case
  , tsuite   :: [TestCase] -- ^ the loaded test suite
  , ranking  :: OtRanking  -- ^ optimality-theory ranking constraints
  , traces   :: [String]   -- ^ trace names to filter on ([] = no filtering)
  , warnings :: [String]   -- ^ accumulated warnings, most recent first
  }

-- | Mutable reference to the program state.
type ProgStateRef = IORef ProgState
-- | An initial program state built from the given parameters; every
--   other field starts out blank and is filled in by the loaders below.
emptyProgState :: Params -> ProgState
emptyProgState args =
  ST { pa = args
     , gr = []
     , le = Map.empty
     , morphinf = const Nothing
     , ts = ([],[],[])
     , tcase = []
     , tsuite = []
     , traces = []
     , ranking = []
     , warnings = []
     }
-- | Prepend a warning message to the state's warning list.
--   NOTE(review): 'modifyIORef' is lazy, so repeated warnings build a
--   chain of thunks in the IORef until 'warnings' is demanded; the list
--   appears to be read rarely, but worth confirming if this runs hot.
addWarning :: ProgStateRef -> String -> IO ()
addWarning pstRef s = modifyIORef pstRef $ \p -> p { warnings = s : warnings p }
\end{code}
%
\section{Interface}
\subsection{Loading and parsing}
%
We have one master function that loads all the files GenI is expected to
use. This just calls the subloaders below, some of which are exported
for use by the graphical interface. The master function also makes sure
to complain intelligently if some of the required files are missing.
\begin{code}
-- | Load every input file GenI needs, calling the individual loaders
--   below.  All missing mandatory files are reported in one combined
--   error message; which files are mandatory depends on the grammar
--   type and on whether a test suite is in use.
loadEverything :: ProgStateRef -> IO()
loadEverything pstRef = do
  pst <- readIORef pstRef
  let config = pa pst
      isMissing f = not $ hasFlagP f config
      isNotPreanchored = grammarType config /= PreAnchored
      isNotPrecompiled = grammarType config /= PreCompiled
      useTestSuite = isMissing FromStdinFlg && isMissing NoLoadTestSuiteFlg
  -- complain about every missing required file at once
  let missing = [ msg | (required, msg) <- requirements, required ]
      requirements =
        [ (isNotPrecompiled && isMissing MacrosFlg,  "a tree file")
        , (isNotPreanchored && isMissing LexiconFlg, "a lexicon file")
        , (useTestSuite && isMissing TestSuiteFlg,   "a test suite") ]
      errormsg = concat (intersperse ", " missing)
  unless (null errormsg) $ fail ("Please specify: " ++ errormsg)
  -- macros are only loaded for grammars that are not pre-built
  case grammarType config of
    PreAnchored -> return ()
    PreCompiled -> return ()
    _           -> loadGeniMacros pstRef
  when isNotPreanchored $ loadLexicon pstRef
  loadMorphInfo pstRef
  when useTestSuite $ loadTestSuite pstRef >> return ()
  loadTraces pstRef
  loadRanking pstRef
\end{code}
The file loading functions all work the same way: we load the file,
and try to parse it. If this doesn't work, we just fail in IO, and
GenI dies. If we succeed, we update the program state passed in as
an IORef.
\begin{code}
-- | Load and parse the lexicon file, sorting each entry's semantics
--   and indexing the result by semantic key before storing it.
loadLexicon :: ProgStateRef -> IO ()
loadLexicon pstRef = do
  entries <- loadThingOrDie LexiconFlg "lexicon" pstRef
               (parseFromFileOrFail geniLexicon)
  let sortEntry e = e { isemantics = sortSem (isemantics e) }
      organise   = mapBySemKeys isemantics . map sortEntry
  modifyIORef pstRef (\p -> p { le = organise entries })
-- | Load and parse the tree (macro) file; also accepts a precompiled
--   binary file (see 'parseFromFileMaybeBinary').
loadGeniMacros :: ProgStateRef -> IO ()
loadGeniMacros pstRef = do
  ms <- loadThingOrDie MacrosFlg "trees" pstRef
          (parseFromFileMaybeBinary geniMacros)
  modifyIORef pstRef $ \p -> p { gr = ms }
-- | Load the (optional) morphological information file, if configured.
loadMorphInfo :: ProgStateRef -> IO ()
loadMorphInfo pstRef = do
  minfo <- loadThingOrIgnore MorphInfoFlg "morphological info" pstRef
             (parseFromFileOrFail geniMorphInfo)
  modifyIORef pstRef $ \p -> p { morphinf = readMorph minfo }
-- | Load the (optional) traces file: one trace name per line.
loadTraces :: ProgStateRef -> IO ()
loadTraces pstRef = do
  ls <- loadThingOrIgnore TracesFlg "traces" pstRef
          (fmap lines . readFile)
  modifyIORef pstRef $ \p -> p { traces = ls }
-- | Load the (optional) OT ranking constraints file, if configured.
loadRanking :: ProgStateRef -> IO ()
loadRanking pstRef = do
  config <- fmap pa (readIORef pstRef)
  let beVerbose = hasFlagP VerboseModeFlg config
  case getFlagP RankingConstraintsFlg config of
    Nothing -> return ()
    Just f  -> do
      r <- readRanking beVerbose f
      modifyIORef pstRef $ \p -> p { ranking = r }
-- | Read and decode an OT ranking from a UTF-8 JSON file, failing in
--   IO on a decoding error.
readRanking :: Bool -- ^ verbose
            -> FilePath -> IO OtRanking
readRanking beVerbose f = do
  when beVerbose $ do
    ePutStr (unwords [ "Loading OT constraints", f ++ "... " ])
    eFlush
  parsed <- fmap (resultToEither . decode) (UTF8.readFile f)
  when beVerbose $ ePutStr "done"
  either fail return parsed
\end{code}
\subsubsection{Target semantics}
Reading in the target semantics (or test suite) is a little more
complicated. It follows the same general schema as above, except
that we parse the file twice: once for our internal representation,
and once to get a string representation of each test case. The
string representation is for the graphical interface; it avoids us
figuring out how to prettyprint things because we can assume the
user will format it the way s/he wants.
\begin{code}
-- | Load the test suite.  The file is parsed twice: once into the
--   internal representation and once into one raw string per case
--   (the raw strings are kept for display by the graphical interface).
--   Each case's semantics is sorted before it is stored.
loadTestSuite :: ProgStateRef -> IO [TestCase]
loadTestSuite pstRef = do
  config <- fmap pa (readIORef pstRef)
  let doubleParse f = do
        cases   <- parseFromFileOrFail geniTestSuite f
        rawStrs <- parseFromFileOrFail geniTestSuiteString f
        return (zip cases rawStrs)
      tidy (tc, raw) =
        let (sm, sr, lc) = tcSem tc
        in tc { tcSem = (sortSem sm, sort sr, lc)
              , tcSemString = raw }
  suite <- fmap (map tidy)
                (loadThingOrDie TestSuiteFlg "test suite" pstRef doubleParse)
  modifyIORef pstRef $ \p ->
    p { tsuite = suite
      , tcase  = fromMaybe "" (getFlagP TestCaseFlg config) }
  return suite
\end{code}
Sometimes, the target semantics does not come from a file, but from
the graphical interface, so we also provide the ability to parse an
arbitrary string as the semantics.
\begin{code}
-- | Parse a string (e.g. typed into the GUI) as the target semantics
--   and store it in the program state; fails in IO on a parse error.
loadTargetSemStr :: ProgStateRef -> String -> IO ()
loadTargetSemStr pstRef str =
  case runParser geniSemanticInput () "" str of
    Left err -> fail (show err)
    Right sr -> modifyIORef pstRef $ \x -> x { ts = smooth sr }
  where
    smooth (s, r, l) = (sortSem s, sort r, l)
\end{code}
\subsubsection{Helpers for loading files}
\begin{code}
-- | Load a file named by a configuration flag via 'loadThing'.
--   'loadThingOrIgnore' yields the empty list when the flag is unset;
--   'loadThingOrDie' fails in IO instead.
loadThingOrIgnore, loadThingOrDie :: forall f a . (Eq f, Show f, Typeable f)
                  => (FilePath -> f)       -- ^ flag constructor naming the file
                  -> String                -- ^ description for messages
                  -> ProgStateRef
                  -> (FilePath -> IO [a])  -- ^ parser
                  -> IO [a]
loadThingOrIgnore flag description pstRef parser = do
  config <- fmap pa (readIORef pstRef)
  case getFlagP flag config of
    Nothing -> return []
    Just f  -> loadThing f description pstRef parser

loadThingOrDie flag description pstRef parser = do
  config <- fmap pa (readIORef pstRef)
  case getFlagP flag config of
    Nothing -> fail ("Please specify a " ++ description ++ "!")
    Just f  -> loadThing f description pstRef parser
-- | Run a parser on a file, printing progress messages in verbose mode,
--   and return the parsed entries.
loadThing :: FilePath             -- ^ file to read
          -> String               -- ^ description for messages
          -> ProgStateRef
          -> (FilePath -> IO [a]) -- ^ parser
          -> IO [a]
loadThing filename description pstRef parser = do
  config <- fmap pa (readIORef pstRef)
  let chatty = hasFlagP VerboseModeFlg config
  when chatty $ do
    ePutStr (unwords [ "Loading", description, filename ++ "... " ])
    eFlush
  entries <- parser filename
  when chatty $ ePutStr (show (length entries) ++ " entries\n")
  return entries
-- | Run a parser on a file, dying in IO with the shown parse error on
--   failure.
parseFromFileOrFail :: Parser a -> FilePath -> IO a
parseFromFileOrFail p f = either (fail . show) return =<< parseFromFile p f
-- | Like 'parseFromFileOrFail', but files ending in \".genib\" are
--   read as precompiled binary instead of being parsed as text.
parseFromFileMaybeBinary :: Binary a
                         => Parser a
                         -> FilePath
                         -> IO a
parseFromFileMaybeBinary p f
  | ".genib" `isSuffixOf` f = decodeFile f
  | otherwise               = parseFromFileOrFail p f
\end{code}
%
\subsection{Surface realisation entry point}
%
This is your basic entry point. You call this if the only thing you want to do
is run the surface realiser.
\begin{enumerate}
\item It initialises the realiser (lexical selection, among other things),
via \fnref{initGeni}
\item It runs the builder (the surface realisation engine proper)
\item It unpacks the builder results
\item It finalises the results (morphological generation)
\end{enumerate}
\begin{code}
-- | One realisation produced by the surface realiser, together with
--   its derivation, lexical selection and OT ranking information.
data GeniResult = GeniResult
  { grLemmaSentence :: B.LemmaPlusSentence -- ^ the uninflected sentence
  , grRealisations :: [String]             -- ^ morphological realisations
  , grDerivation :: B.Derivation           -- ^ how the result was built
  , grLexSelection :: [ GeniLexSel ]       -- ^ lexical items used
  , grRanking :: Int                       -- ^ OT rank (see 'finaliseResults')
  , grViolations :: [ OtViolation ]        -- ^ OT constraint violations
  , grResultType :: ResultType
  } deriving (Ord, Eq)

-- | A lexically selected item, with its trace information.
data GeniLexSel = GeniLexSel
  { nlTree :: String    -- ^ name of the anchored tree
  , nlTrace :: [String] -- ^ traces (see 'getTraces')
  } deriving (Ord, Eq)

-- | Whether the result covers the whole target semantics or is only a
--   partial result (cf. the PartialFlg fallback in 'runGeniWithSelector').
data ResultType = CompleteResult | PartialResult deriving (Ord, Eq)
-- | Basic entry point: run the surface realiser end-to-end using the
--   default (regular) lexical selection.
runGeni :: ProgStateRef -> B.Builder st it Params -> IO ([GeniResult], Statistics, st)
runGeni pstRef = runGeniWithSelector pstRef defaultSelector
-- | Run the realiser with a custom lexical selector: initialise, run
--   the builder, unpack its results (falling back to partial results
--   when PartialFlg is set and nothing complete was found), and
--   finalise (morphology + ranking).
runGeniWithSelector :: ProgStateRef -> Selector -> B.Builder st it Params -> IO ([GeniResult], Statistics, st)
runGeniWithSelector pstRef selector builder = do
  pst <- readIORef pstRef
  let config = pa pst
  initStuff <- initGeniWithSelector pstRef selector
  let (finalSt, stats) = B.run builder initStuff config
      uninflected = B.unpack builder finalSt
      usePartial  = null uninflected && hasFlagP PartialFlg config
      rawResults | usePartial = B.partial builder finalSt
                 | otherwise  = uninflected
      resultTy   | usePartial = PartialResult
                 | otherwise  = CompleteResult
  results <- finaliseResults pstRef resultTy rawResults
  return (results, stats, finalSt)
\end{code}
%
\subsection{Surface realisation sub steps}
%
Below are the initial and final steps of \fnreflite{runGeni}. These functions
are separated out so that they may be individually called from the graphical
debugger. The middle steps (running and unpacking the builder) depend on your
builder implementation.
\begin{code}
-- | Initialise the realiser (lexical selection etc.) using the default
--   selector; see 'initGeniWithSelector'.
initGeni :: ProgStateRef -> IO (B.Input)
initGeni pstRef = initGeniWithSelector pstRef defaultSelector
-- | Initialise the realiser with a custom lexical selector: apply the
--   NoConstraints optimisation (which drops all constraints from the
--   semantic input), perform lexical selection, strip morphological
--   literals from the target semantics, and package up builder input.
initGeniWithSelector :: ProgStateRef -> Selector -> IO (B.Input)
initGeniWithSelector pstRef lexSelector = do
  modifyIORef pstRef $ \p ->
    if hasOpt NoConstraints (pa p)
      then p { ts = (fst3 (ts p), [], []) }
      else p
  (cand, lexonly) <- lexSelector pstRef
  pst <- readIORef pstRef
  let (tsem, tres, lc) = ts pst
      tsem2 = stripMorphSem (morphinf pst) tsem
  return B.Input
    { B.inSemInput = (tsem2, tres, lc)
    , B.inLex      = lexonly
    , B.inCands    = map (\c -> (c, 1)) cand
    }
-- | Post-processing of builder output: morphological realisation of
--   each uninflected sentence, then OT ranking of the results.
finaliseResults :: ProgStateRef -> ResultType -> [B.Output] -> IO [GeniResult]
finaliseResults pstRef ty os =
  do pst <- readIORef pstRef
     -- inflect via external morph command if configured, else fallback
     mss <- case getFlagP MorphCmdFlg (pa pst) of
              Nothing  -> return $ map sansMorph sentences
              Just cmd -> map snd `fmap` inflectSentencesUsingCmd cmd sentences
     -- build unranked results, then let the OT machinery rank them and
     -- attach violations
     let unranked = zipWith (sansRanking pst) os mss
         rank = rankResults (getTraces pst) grDerivation (ranking pst)
     return . map addRanking . rank $ unranked
  where
    sentences = map fst os
    -- a result with a placeholder rank of 1 and no violations yet
    sansRanking pst (l,d) rs =
      GeniResult { grLemmaSentence = l
                 , grRealisations = rs
                 , grDerivation = d
                 , grLexSelection = map (\x -> GeniLexSel x (getTraces pst x)) (B.lexicalSelection d)
                 , grRanking = 1
                 , grViolations = []
                 , grResultType = ty
                 }
    -- overwrite the placeholder rank/violations with the OT verdict
    addRanking (i,res,vs) = res { grViolations = vs, grRanking = i }
\end{code}
%
\subsection{Displaying results}
%
\begin{code}
-- | Format realised sentences for display, collapsing duplicates into
--   one line with an instance count; \"(none)\" if there are no
--   sentences at all.
showRealisations :: [String] -> String
showRealisations [] = "(none)"
showRealisations sentences =
  unlines (map display (groupAndCount sentences))
  where
    display (s, c)
      | c > 1     = s ++ " (" ++ show c ++ " instances)"
      | otherwise = s
-- | The result's lemma sentence as a whitespace-separated string.
lemmaSentenceString :: GeniResult -> String
lemmaSentenceString res = unwords (map lpLemma (grLemmaSentence res))
-- | Pretty-print a result: each realisation on its own line, prefixed
--   by its rank and followed by its OT violations.
prettyResult :: ProgState -> GeniResult -> String
prettyResult pst res =
  intercalate "\n" (map showOne (grRealisations res))
  where
    showOne str  = show (grRanking res) ++ ". " ++ str ++ "\n" ++ violations
    violations   = prettyViolations (getTraces pst) beVerbose (grViolations res)
    beVerbose    = hasFlagP VerboseModeFlg (pa pst)
-- | Trace strings for the tree with the given name, restricted to the
--   trace names configured in the program state (no restriction when
--   that list is empty).
getTraces :: ProgState -> String -> [String]
getTraces pst tname = restrict allTraces
  where
    allTraces = concat [ ptrace t | t <- gr pst, pidname t == readPidname tname ]
    restrict = case traces pst of
                 []     -> id
                 wanted -> filter (`elem` wanted)
-- | Extract the tree (pidname) component from a colon-delimited
--   anchored-tree name of the form lexeme:family:pidname.
readPidname :: String -> String
readPidname n
  | (_ : _ : p : _) <- wordsBy (== ':') n = p
  | otherwise = geniBug "readPidname or combineName are broken"
\end{code}
%
\section{Lexical selection}
\label{sec:candidate_selection} \label{sec:lexical_selecetion} \label{par:lexSelection}
%
\paragraph{runLexSelection} \label{fn:runLexSelection} determines which
candidate trees will be used to generate the current target semantics.
In addition to the anchored candidate trees, we also return the lexical items
themselves. This list of lexical items is useful for debugging a grammar;
it lets us know if GenI managed to lexically select something, but did not
succeed in anchoring it.
\begin{code}
-- | Determine the candidate trees for the current target semantics.
--   Returns the anchored trees together with the bare lexical items
--   that were selected (the latter is useful for grammar debugging:
--   it shows entries that were selected but failed to anchor).
--   Warnings about anchoring failures are accumulated in the state.
runLexSelection :: ProgStateRef -> IO ([TagElem], [ILexEntry])
runLexSelection pstRef =
 do pst <- readIORef pstRef
    let (tsem,_,litConstrs) = ts pst
        lexicon = le pst
        lexCand = chooseLexCand lexicon tsem
        config = pa pst
        verbose = hasFlagP VerboseModeFlg config
    -- anchor one lexical entry to every tree of its family, warning
    -- about combination errors and missing co-anchors along the way
    let grammar = gr pst
        combineWithGr l =
         do let (lexCombineErrors, res) = combineList grammar l
                familyMembers = [ p | p <- grammar, pfamily p == ifamname l ]
            mapM_ (addWarning pstRef . showErr) $ compressLexCombineErrors
                                                $ lexCombineErrors
            let lexeme = showLexeme.iword $ l
                _outOfFamily n = show n ++ "/" ++ (show $ length familyMembers)
                              ++ " instances of " ++ lexeme ++ ":" ++ ifamname l
            case concatMap (missingCoanchors l) familyMembers of
              [] -> return ()
              cs -> mapM_ showWarning . group . sort $ cs
                where showWarning [] = geniBug "silly error in Geni.runLexSelection"
                      showWarning xs@(x0:_) = addWarning pstRef $ "Missing co-anchor '" ++ x0 ++ "'" ++ " in " ++ _outOfFamily (length xs) ++ "."
            return res
    cand <- case grammarType config of
              PreAnchored -> readPreAnchored pst
              _           -> concat `liftM` mapM combineWithGr lexCand
    -- filter the anchored trees: morphology, literal constraints,
    -- semantic coherency, and non-empty semantics
    let considerMorph = attachMorph (morphinf pst) tsem
    let matchesLc t = all (`elem` myTrace) constrs
          where constrs = concat [ cs | (l,cs) <- litConstrs, l `elem` mySem ]
                mySem   = tsemantics t
                myTrace = ttrace t
        considerLc = filter matchesLc
    let considerCoherency = filter (all (`elem` tsem) . tsemantics)
        considerHasSem    = filter (not . null . tsemantics)
    let candFinal = setTidnums . considerCoherency . considerHasSem
                  . considerLc . considerMorph $ cand
        indent x = ' ' : x
        unlinesIndentAnd :: (x -> String) -> [x] -> String
        unlinesIndentAnd f = unlines . map (indent . f)
    when verbose $
      do ePutStrLn $ "Lexical items selected:\n" ++ (unlinesIndentAnd (showLexeme.iword) lexCand)
         ePutStrLn $ "Trees anchored (family) :\n" ++ (unlinesIndentAnd idname candFinal)
    -- warn about uncovered literals and entries that anchored to nothing
    let missedSem = tsem \\ (nub $ concatMap tsemantics candFinal)
        hasTree l = isJust $ find (\t -> tsemantics t == lsem) cand
          where lsem = isemantics l
        missedLex = filter (not.hasTree) lexCand
    unless (null missedSem) $ addWarning pstRef $ "no lexical selection for " ++ showSem missedSem
    unless (null missedLex) $ forM_ missedLex $ \l -> addWarning pstRef $
        "'" ++ showLex l ++ "' was lexically selected, but not anchored to any trees"
    return (candFinal, lexCand)
 where
   showLex l = (showLexeme $ iword l) ++ "-" ++ (ifamname l)
   showErr (c, e) = show e ++ " (" ++ show c ++ " times)"
-- | Collapse each run of equivalent enrichment errors (same tree
--   family and same lexical word) into a single error with a count.
compressLexCombineErrors :: [LexCombineError] -> [(Int, LexCombineError)]
compressLexCombineErrors errs =
  [ (length grp, e) | grp@(e:_) <- groupBy sameError errs ]
  where
    sameError (EnrichError m1 l1 _) (EnrichError m2 l2 _) =
      pfamily m1 == pfamily m2 && iword l1 == iword l2
    sameError _ _ = False
-- | Lexical entries whose semantic keys occur in the target semantics
--   (plus the null-semantics entries filed under 'myEMPTY'),
--   instantiated and filtered by 'chooseCandI'.
chooseLexCand :: Lexicon -> Sem -> [ILexEntry]
chooseLexCand slex tsem = chooseCandI tsem candidates
  where
    candidates  = concatMap lookupKey (myEMPTY : toKeys tsem)
    lookupKey k = Map.findWithDefault [] k slex
\end{code}
With a helper function, we refine the candidate selection by
instantiating the semantics, at the same time filtering those which
do not stay within the target semantics, and finally eliminating
the duplicates.
\begin{code}
-- | Instantiate each candidate's semantics against the target
--   semantics via subsumption, dropping candidates that do not fit and
--   eliminating duplicates.  Entries with an empty semantics pass
--   through unchanged.
chooseCandI :: Sem -> [ILexEntry] -> [ILexEntry]
chooseCandI tsem cand = nub (concatMap instantiate cand)
  where
    instantiate lexEntry
      | null sem  = [lexEntry]
      | otherwise = [ (replace sub lexEntry) { isemantics = s }
                    | (s, sub) <- subsumeSem tsem sem ]
      where sem = isemantics lexEntry
\end{code}
A semantic key is a semantic literal boiled down to predicate plus arity
(see section \ref{btypes_semantics}).
\begin{code}
-- | Group items by the semantic keys of their semantics; items with an
--   empty semantics are filed under the 'myEMPTY' sentinel key.
mapBySemKeys :: (a -> Sem) -> [a] -> Map.Map String [a]
mapBySemKeys semfn = multiGroupByFM keysOf
  where
    keysOf x
      | null s    = [myEMPTY]
      | otherwise = toKeys s
      where s = semfn x
\end{code}
\fnlabel{mergeSynonyms} is a factorisation technique that uses
atomic disjunction to merge all synonyms into a single lexical
entry. Two lexical entries are considered synonyms if their
semantics match and they point to the same tree families.
FIXME: 20061011 note that this is no longer being used,
because it breaks the case where two lexical entries differ
only by their use of path equations. Perhaps it's worthwhile
just to add a check that the path equations match exactly.
\begin{code}
\end{code}
%
\subsection{Basic anchoring}
\label{sec:combine_macros}
%
This section of the code helps you to combine a selected lexical item with
a macro or a list of macros. This is a process that can fail for any
number of reasons, so we try to record the possible failures for bookkeeping.
\begin{code}
-- | Ways in which combining a lexical item with a tree can fail.
data LexCombineError =
   BoringError String                       -- ^ a generic failure message
 | EnrichError { eeMacro    :: MTtree       -- ^ the tree being enriched
               , eeLexEntry :: ILexEntry    -- ^ the lexical entry involved
               , eeLocation :: PathEqLhs }  -- ^ where the path equation pointed
 | OtherError MTtree ILexEntry String       -- ^ any other tree/lexeme failure

-- | Allows 'LexCombineError' to be used with 'catchError'/'throwError'.
instance Error LexCombineError where
  noMsg    = strMsg "error combining items"
  strMsg s = BoringError s

instance Show LexCombineError where
  show (BoringError s)    = s
  show (OtherError t l s) = s ++ " on " ++ pfamily t ++ " (" ++ (showLexeme $ iword l) ++ ")"
  show (EnrichError t l _) = show (OtherError t l "enrichment error")
\end{code}
The first step in lexical selection is to collect all the features and
parameters that we want to combine.
\begin{code}
-- | Anchor every lexical entry in the lexicon to the trees of its
--   family, silently discarding failed combinations.
combine :: Macros -> Lexicon -> Tags
combine gram lexicon = Map.map (concatMap anchorEntry) lexicon
  where
    anchorEntry li =
      mapEither (combineOne li) [ t | t <- gram, pfamily t == ifamname li ]
-- | Like 'mapMaybe' for 'Either': keep the 'Right' results and drop
--   the 'Left's.
mapEither :: (a -> Either l r) -> [a] -> [r]
mapEither fn xs = [ r | Right r <- map fn xs ]
\end{code}
\begin{code}
-- | Anchor one lexical entry to every tree of its family, returning
--   the failures alongside the successes; a single 'BoringError' is
--   produced when the family is absent from the grammar.
combineList :: Macros -> ILexEntry
            -> ([LexCombineError],[TagElem])
combineList gram lexitem =
  case familyTrees of
    []   -> ([BoringError ("Family " ++ tn ++ " not found in Macros")], [])
    macs -> unzipEither (map (combineOne lexitem) macs)
  where
    tn          = ifamname lexitem
    familyTrees = [ t | t <- gram, pfamily t == tn ]
-- | Partition a list of 'Either' values into lefts and rights (both
--   accumulated in reverse order of appearance).
unzipEither :: (Error e, Show b) => [Either e b] -> ([e], [b])
unzipEither = foldl step ([], [])
  where
    step (ls, rs) (Left l)  = (l : ls, rs)
    step (ls, rs) (Right r) = (ls, r : rs)
\end{code}
\begin{code}
-- | Anchor one lexical entry to one tree schema.  Combination can fail
--   ('Left') on parameter-length mismatch, interface/filter unification
--   failure, or an enrichment error.
combineOne :: ILexEntry -> MTtree -> Either LexCombineError TagElem
combineOne lexRaw eRaw =
 do -- rename variables apart ("-l" vs "-t" suffixes) before unifying
    let l1 = alphaConvert "-l" lexRaw
        e1 = alphaConvert "-t" eRaw
    (l,e) <- unifyParamsWithWarning (l1,e1)
             >>= unifyInterfaceUsing iinterface
             >>= unifyInterfaceUsing ifilters
             >>= enrichWithWarning
    -- the anchored tree is named lexeme:family:treename
    let name = concat $ intersperse ":" $ filter (not.null)
                 [ head (iword l) , pfamily e , pidname e ]
    return $ emptyTE
      { idname = name
      , ttreename = pfamily e
      , ttype = ptype e
      , ttree = setOrigin name . setLemAnchors . setAnchor (iword l) $ tree e
      , tsemantics =
          -- the tree's own semantics, if any, wins over the lexical entry's
          sortSem $ case psemantics e of
                      Nothing -> isemantics l
                      Just s  -> s
      , tsempols = isempols l
      , tinterface = pinterface e
      , ttrace = ptrace e
      }
 where
  -- pair up the tree's parameter variables with the lexical parameters
  unifyParamsWithWarning (l,t) =
    let lp = iparams l
        tp = map fromGVar $ params t
        psubst = zip tp lp
    in if (length lp) /= (length tp)
       then Left $ OtherError t l $ "Parameter length mismatch"
       else Right $ (replaceList psubst l, replaceList psubst t)
  -- unify one of the lexical entry's feature lists (interface or
  -- filters) against the tree's interface, propagating substitutions
  unifyInterfaceUsing ifn (l,e) =
    case unifyFeat (ifn l) (pinterface e) of
      Nothing -> Left $ OtherError e l $ "Interface unification error"
      Just (int2, fsubst) -> Right $ (replace fsubst l, e2)
        where e2 = (replace fsubst e) { pinterface = int2 }
  enrichWithWarning (l,e) =
    do e2 <- enrich l e
       return (l,e2)
\end{code}
\subsubsection{Enrichment}
Enrichment is a process which adds features to either the interface, an
explicitly named node or the coanchor of a lexically selected tree. The
enrichment information comes from the lexicon in the form of path equations
which specify
\begin{enumerate}
\item the location
\item top or bottom
\item the attribute
\item what value to associate with it
\end{enumerate}
The conventions taken by GenI for path equations are:
\begin{tabular}{|l|p{8cm}|}
\hline
\verb!interface.foo=bar! &
\fs{foo=bar} is unified into the interface (not the tree) \\
\hline
\verb!anchor.bot.foo=bar! &
\fs{foo=bar} is unified into the bottom feature of the node
which is marked anchor. \\
\hline
\verb!toto.top.foo=bar! &
\fs{foo=bar} is unified into the top feature of node named toto \\
\hline
\verb!toto.bot.foo=bar! &
\fs{foo=bar} is unified into the bot feature of node named toto \\
\hline
\verb!anchor.foo=bar! &
same as \verb!anchor.bot.foo=bar! \\
\hline
\verb!anc.whatever...! &
same as \verb!anchor.whatever...! \\
\hline
\verb!top.foo=bar! &
same as \verb!anchor.top.foo=bar! \\
\hline
\verb!bot.foo=bar! &
same as \verb!anchor.bot.foo=bar! \\
\hline
\verb!foo=bar! &
same as \verb!anchor.bot.foo=bar! \\
\hline
\verb!toto.foo=bar! &
same as \verb!toto.top.foo=bar! (creates a warning) \\
\hline
\end{tabular}
\begin{code}
-- | Location part of a path equation: (node name, is-top?, attribute).
type PathEqLhs  = (String, Bool, String)
-- | A parsed path equation: location plus the value to unify in.
type PathEqPair = (PathEqLhs, GeniVal)

-- | Apply all of a lexical entry's path equations to a tree:
--   interface equations are unified into the tree's interface, and the
--   node-targeted ones are applied via 'enrichBy'.
enrich :: ILexEntry -> MTtree -> Either LexCombineError MTtree
enrich l t =
 do -- separate interface equations from node-targeted ones
    let (intE, namedE) = lexEquations l
    t2 <- foldM enrichInterface t intE
    foldM (enrichBy l) t2 namedE
 where
   toAvPair ((_,_,a),v) = AvPair a v
   -- unify a single equation into the tree's interface, turning any
   -- unification failure into an EnrichError
   enrichInterface tx en =
     do (i2, isubs) <- unifyFeat [toAvPair en] (pinterface tx)
                       `catchError` (\_ -> throwError $ ifaceEnrichErr en)
        return $ (replace isubs tx) { pinterface = i2 }
   ifaceEnrichErr (loc,_) = EnrichError
     { eeMacro = t
     , eeLexEntry = l
     , eeLocation = loc }
-- | Apply one node-targeted path equation to a tree: unify the
--   attribute\/value into the named node's top or bottom feature
--   structure.  Equations whose node does not exist in this tree are
--   silently skipped (the tree is returned unchanged; missing
--   co-anchors are reported separately via 'missingCoanchors').
enrichBy :: ILexEntry -- ^ lexeme (for error reporting)
         -> MTtree
         -> (PathEqLhs, GeniVal)
         -> Either LexCombineError MTtree
enrichBy lexEntry t (eqLhs, eqVal) =
  case seekCoanchor eqName t of
    Nothing -> return t
    Just a  ->
      do let tfeat = (if eqTop then gup else gdown) a
         (newfeat, sub) <- unifyFeat [AvPair eqAtt eqVal] tfeat
                           `catchError` (\_ -> throwError enrichErr)
         let newnode = if eqTop then a {gup = newfeat}
                                else a {gdown = newfeat}
         -- splice the updated node back in after applying the
         -- substitutions produced by unification
         return $ fixNode newnode $ replace sub t
  where
    (eqName, eqTop, eqAtt) = eqLhs
    fixNode n mt = mt { tree = repNodeByNode (matchNodeName eqName) n (tree mt) }
    enrichErr = EnrichError { eeMacro = t
                            , eeLexEntry = lexEntry
                            , eeLocation = eqLhs }
-- | The location (node name) component of a path equation pair.
pathEqName :: PathEqPair -> String
pathEqName = fst3.fst
-- | Names of co-anchor nodes mentioned in the lexical entry's path
--   equations (each name reported once) that are absent from the tree.
missingCoanchors :: ILexEntry -> MTtree -> [String]
missingCoanchors lexEntry t =
  [ name
  | eq <- nubBy ((==) `on` pathEqName) (snd (lexEquations lexEntry))
  , let name = pathEqName eq
  , not (isJust (seekCoanchor name t))
  ]
-- | Parse a lexical entry's path equations and split them into
--   interface equations and equations on named nodes (in that order).
lexEquations :: ILexEntry -> ([PathEqPair], [PathEqPair])
lexEquations =
  partition (nameIs "interface") . map parseAv . iequations
  where
    parseAv (AvPair a v) =
      case parsePathEq a of
        -- NOTE(review): unsafePerformIO is used only to print the
        -- parser's warning while keeping this function pure; the
        -- fallback interpretation peq is still used as the result.
        Left (err,peq) -> unsafePerformIO $ do putStrLn err
                                               return (peq,v)
        Right peq -> (peq, v)
    nameIs n x = pathEqName x == n
-- | Find the unique node matching a path-equation location in a tree:
--   'Nothing' if absent; a bug is reported if several nodes match.
seekCoanchor :: String -> MTtree -> Maybe GNode
seekCoanchor eqName t =
  case filterTree (matchNodeName eqName) (tree t) of
    []  -> Nothing
    [a] -> Just a
    _   -> geniBug $ "Tree with multiple matches in enrichBy. " ++
                     "\nTree: " ++ pidname t ++ "\nFamily: " ++ pfamily t ++
                     "\nMatching on: " ++ eqName
-- | Node predicate for a path-equation location: the special name
--   \"anchor\" matches the anchor node, anything else matches by
--   node name.
matchNodeName :: String -> GNode -> Bool
matchNodeName name
  | name == "anchor" = ganchor
  | otherwise        = (== name) . gnname
-- | Parse the left-hand side of a path equation according to the
--   conventions documented above.  'Left' carries a warning message
--   together with a best-guess interpretation of the equation.
parsePathEq :: String -> Either (String,PathEqLhs) (PathEqLhs)
parsePathEq e =
  case wordsBy (== '.') e of
    (n:"top":rest)     -> Right (n, True, rejoin rest)
    (n:"bot":rest)     -> Right (n, False, rejoin rest)
    ("top":rest)       -> Right ("anchor", True, rejoin rest)
    ("bot":rest)       -> Right ("anchor", False, rejoin rest)
    ("anc":rest)       -> parsePathEq (rejoin ("anchor" : rest))
    ("anchor":rest)    -> Right ("anchor", False, rejoin rest)
    ("interface":rest) -> Right ("interface", False, rejoin rest)
    (n:rest)           -> Left (topWarning n, (n, True, rejoin rest))
    _                  -> Left (emptyWarning, ("", True, e))
  where
    rejoin = concat . intersperse "."
    topWarning n = "Warning: Interpreting path equation " ++ e ++
                   " as applying to top of " ++ n ++ "."
    emptyWarning = "Warning: could not interpret path equation " ++ e
\end{code}
\subsubsection{Lemanchor mechanism}
One problem in building reversible grammars is the treatment of coanchors.
In the French language, for example, we have some structures like
\natlang{C'est Jean qui regarde Marie}
\natlang{It is John who looks at Mary}
One might be tempted to hard code the ce (it) and the être (is) into the tree
for regarder (look at), something like \texttt{s(ce, être, n$\downarrow$, qui,
v(regarder), n$\downarrow$)}. Indeed, this would work just fine for
generation, but not for parsing. When you parse, you would encounter inflected
forms for these items for example \natlang{c'} for \natlang{ce} or
\natlang{sont} or \natlang{est} for \natlang{être}. Hardcoding the \natlang{ce}
into such trees would break parsing.
To work around this, we propose a mechanism to have our coanchors and parsing
too. Coanchors that are susceptible to morphological variation should be
\begin{itemize}
\item marked in a substitution site (this is to keep parsers happy)
\item have a feature \texttt{bot.lemanchor:foo} where foo is the
coanchor you want
\end{itemize}
GenI will convert these into nonsubstitution sites with a lexical item
leaf node.
\begin{code}
-- | Convert substitution leaf nodes that carry a lemanchor feature
--   into ordinary nodes anchored by the named lexical item.
setLemAnchors :: Tree GNode -> Tree GNode
setLemAnchors = repAllNode replaceNode isLemSubstLeaf
  where
    isLemSubstLeaf (Node a []) = gtype a == Subs && isJust (lemAnchor a)
    isLemSubstLeaf _           = False
    replaceNode (Node x kids) =
      setLexeme (lemAnchorMaybeFake x) $
        Node (x { gtype = Other, gaconstr = False }) kids
-- | The node's lemanchor lemma(s), or a conspicuous error marker if
--   the feature turns out to be unset after all.
lemAnchorMaybeFake :: GNode -> [String]
lemAnchorMaybeFake = fromMaybe ["ERR_UNSET_LEMMANCHOR"] . lemAnchor
-- | The lemanchor value from a node's bottom feature structure, if it
--   is present exactly once and is a constant.
lemAnchor :: GNode -> Maybe [String]
lemAnchor n
  | [GConst l] <- matches = Just l
  | otherwise             = Nothing
  where
    matches = [ v | AvPair a v <- gdown n, a == _lemanchor ]
-- | Name of the bottom feature GenI looks for to trigger the
--   lemanchor mechanism (see 'lemAnchor').
_lemanchor :: String
_lemanchor = "lemanchor"
\end{code}
\subsubsection{Node origins}
After lexical selection, we label each tree node with its origin, most
likely the name and id of its elementary tree. This is useful for
building derivation trees
\begin{code}
-- | Stamp every node in a tree with the name of its elementary tree;
--   this is used later when building derivation trees.
setOrigin :: String -> Tree GNode -> Tree GNode
setOrigin treeName = fmap (\node -> node { gorigin = treeName })
\end{code}
%
\subsection{Preselection and preanchoring}
\label{sec:preanchor}
%
For testing purposes, we can perform lexical selection ahead of time and store
it somewhere else.
\begin{code}
-- | A lexical selection strategy: given the program state, produce the
--   anchored trees together with the bare lexical entries selected.
type Selector = ProgStateRef -> IO ([TagElem],[ILexEntry])

-- | The regular lexical selection, 'runLexSelection'.
defaultSelector :: Selector
defaultSelector = runLexSelection
\end{code}
For debugging purposes, it is often useful to perform lexical selection and
surface realisation separately. Preanchored mode allows the user to just
pass the lexical selection in as a file of anchored trees associated with a
semantics.
\begin{code}
-- | Read pre-anchored trees from the macros file (preanchored mode);
--   fails in IO when no macros file is configured.
readPreAnchored :: ProgState -> IO [TagElem]
readPreAnchored pst =
  maybe (fail "No macros file specified (preanchored mode)")
        (parseFromFileOrFail geniTagElems)
        (getFlagP MacrosFlg (pa pst))
\end{code}
%
% Boring utility code
%
\ignore{
\begin{code}
-- | JSON (de)serialisation for results; the field names mirror the
--   record fields, with the lemma sentence stored under \"raw\".
instance JSON GeniResult where
  readJSON j =
    do jo <- fromJSObject `fmap` readJSON j
       let field x = maybe (fail $ "Could not find: " ++ x) readJSON
                   $ lookup x jo
       GeniResult <$> field "raw"
                  <*> field "realisations"
                  <*> field "derivation"
                  <*> field "lexical-selection"
                  <*> field "ranking"
                  <*> field "violations"
                  <*> field "result-type"
  showJSON nr =
    JSObject . toJSObject $ [ ("raw",               showJSON  $ grLemmaSentence nr)
                            , ("realisations",      showJSONs $ grRealisations nr)
                            , ("derivation",        showJSONs $ grDerivation nr)
                            , ("lexical-selection", showJSONs $ grLexSelection nr)
                            , ("ranking",           showJSON  $ grRanking nr)
                            , ("violations",        showJSONs $ grViolations nr)
                            , ("result-type",       showJSON  $ grResultType nr)
                            ]
-- | Serialised as the JSON strings \"complete\" \/ \"partial\".
instance JSON ResultType where
  readJSON j = do
    js <- fromJSString `fmap` readJSON j
    case js of
      "partial"  -> return PartialResult
      "complete" -> return CompleteResult
      ty         -> fail ("unknown result type: " ++ ty)
  showJSON CompleteResult = JSString (toJSString "complete")
  showJSON PartialResult  = JSString (toJSString "partial")
-- | JSON (de)serialisation for a lexical selection entry.
instance JSON GeniLexSel where
  readJSON j =
    do jo <- fromJSObject `fmap` readJSON j
       let field x = maybe (fail $ "Could not find: " ++ x) readJSON
                   $ lookup x jo
       GeniLexSel <$> field "lex-item"
                  <*> field "trace"
  showJSON x =
    JSObject . toJSObject $ [ ("lex-item", showJSON  $ nlTree x)
                            , ("trace",    showJSONs $ nlTrace x)
                            ]
\end{code}
}