% GenI surface realiser % Copyright (C) 2005 Carlos Areces and Eric Kow % % This program is free software; you can redistribute it and/or % modify it under the terms of the GNU General Public License % as published by the Free Software Foundation; either version 2 % of the License, or (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program; if not, write to the Free Software % Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. \chapter{Tags} \label{cha:Tags} This module provides basic datatypes specific to Tree Adjoining Grammar (TAG) and some low-level operations. Note that we don't handle substitution and adjunction here; see sections \ref{sec:substitution} and \ref{sec:adjunction} instead. \begin{code} {-# LANGUAGE TemplateHaskell #-} module NLP.GenI.Tags( -- Main Datatypes Tags, TagElem(..), TagItem(..), TagSite(..), TagDerivation, DerivationStep(..), emptyTE, ts_synIncomplete, ts_semIncomplete, ts_tbUnificationFailure, ts_rootFeatureMismatch, -- Functions from Tags addToTags, tagLeaves, -- Functions from TagElem setTidnums, -- General functions mapBySem, subsumedBy, showTagSites, collect, detectSites ) where \end{code} \ignore{ \begin{code} import Control.Applicative ( (<$>), (<*>) ) import qualified Data.Map as Map import Data.Maybe (fromMaybe, listToMaybe, mapMaybe) import Data.List (intersperse) import Data.Tree import Data.Generics (Data) import Data.Generics.PlateDirect import Data.Typeable (Typeable) import Text.JSON import NLP.GenI.Btypes (Ptype(Initial, Auxiliar), SemPols, GeniVal(GConst), AvPair(..), GNode(gup, glexeme, gnname, gaconstr, gdown, gtype, gorigin), GType(Subs), Flist, DescendGeniVal(..), Collectable(..), Idable(..), Sem, Pred, emptyPred, emptyGNode, showFlist, showPairs, showSem, lexemeAttributes, ) import NLP.GenI.General (groupByFM, preTerminals) import NLP.GenI.PolarityTypes (PolarityKey(..)) \end{code} } % ---------------------------------------------------------------------- \section{Tags} % ---------------------------------------------------------------------- \begin{code} -- | An anchored grammar. -- The grammar associates a set of semantic predicates to a list of trees each. type Tags = Map.Map String [TagElem] -- | 'addTags' @tags key elem@ adds @elem@ to the the list of elements associated -- to the key addToTags :: Tags -> String -> TagElem -> Tags addToTags t k e = Map.insertWith (++) k [e] t \end{code} % ---------------------------------------------------------------------- \section{TagElem} % ---------------------------------------------------------------------- Final types used for the combined macros + lexicon. We assume that a two trees are the same iff they have the same tidnum. To make this work, we assign each tree with a unique id during the process of combining macros with lexicon (see section \ref{sec:combine_macros}). \begin{code} data TagSite = TagSite { tsName :: String , tsUp :: Flist , tsDown :: Flist , tsOrigin :: String } deriving (Show, Eq, Ord, Data, Typeable) instance Biplate TagSite GeniVal where biplate (TagSite x1 zu zd x2) = plate TagSite |- x1 ||+ zu ||+ zd |- x2 instance Biplate (Maybe TagSite) GeniVal where biplate (Just x1) = plate Just |+ x1 biplate Nothing = plate Nothing data TagElem = TE { idname :: String, ttreename :: String, tidnum :: Integer, ttype :: Ptype, ttree :: Tree GNode, tsemantics :: Sem, -- optimisation stuff -- (polarity key to charge interval) tpolarities :: Map.Map PolarityKey (Int,Int), tinterface :: Flist, -- for idxconstraints (pol) ttrace :: [String], tsempols :: [SemPols] } deriving (Show, Eq, Data, Typeable) instance Biplate TagElem GeniVal where biplate (TE x1 x2 x3 x4 zt zsem x5 zint x6 x7) = plate TE |- x1 |- x2 |- x3 |- x4 |+ zt ||+ zsem |- x5 ||+ zint |- x6 |- x7 \end{code} A TAG derivation history consists of a list of 3-tuples representing the operation (s for substitution, a for adjunction), the name of the child tree, the name of the parent tree and the node affected. \begin{code} type TagDerivation = [ DerivationStep ] data DerivationStep = DerivationStep { dsOp :: Char , dsChild :: String , dsParent :: String , dsParentSite :: String } deriving (Show, Ord, Eq) instance JSON DerivationStep where readJSON j = do jo <- fromJSObject `fmap` readJSON j let field x = maybe (fail $ "Could not find: " ++ x) readJSON $ lookup x jo DerivationStep <$> field "op" <*> field "child" <*> field "parent" <*> field "parent-node" showJSON x = JSObject . toJSObject $ [ ("op", showJSON $ dsOp x) , ("child", showJSON $ dsChild x) , ("parent", showJSON $ dsParent x) , ("parent-node", showJSON $ dsParentSite x) ] \end{code} \begin{code} instance Ord TagElem where compare t1 t2 = case (ttype t1, ttype t2) of (Initial, Initial) -> compareId (Initial, Auxiliar) -> LT (Auxiliar, Initial) -> GT (Auxiliar, Auxiliar) -> compareId _ -> error "TagElem compare not exhaustively defined" where compareId = compare (tidnum t1) (tidnum t2) instance DescendGeniVal TagElem where descendGeniVal s te = te { tinterface = descendGeniVal s (tinterface te) , ttree = descendGeniVal s (ttree te) , tsemantics = descendGeniVal s (tsemantics te) } instance DescendGeniVal TagSite where descendGeniVal s (TagSite n fu fd o) = TagSite n (descendGeniVal s fu) (descendGeniVal s fd) o instance Collectable TagElem where collect t = (collect $ tinterface t) . (collect $ ttree t) . (collect $ tsemantics t) instance Idable TagElem where idOf = tidnum \end{code} \begin{code} emptyTE :: TagElem emptyTE = TE { idname = "", ttreename = "", tidnum = -1, ttype = Initial, ttree = Node emptyGNode [], tsemantics = [], tpolarities = Map.empty, tsempols = [], tinterface = [], ttrace = [] } -- | Given a tree(GNode) returns a list of substitution or adjunction -- nodes, as well as remaining nodes with a null adjunction constraint. detectSites :: Tree GNode -> ([TagSite], [TagSite], [TagSite]) detectSites t = ( sites isSub -- for substitution , sites (not.gaconstr) -- for adjunction , sites constrButNotSub -- for neither ) where ns = flatten t sites match = [ TagSite (gnname n) (gup n) (gdown n) (gorigin n) | n <- ns, match n ] isSub n = gtype n == Subs constrButNotSub n = gaconstr n && (not $ isSub n) \end{code} \subsection{Unique ID} TagElem comparison relies exclusively on \fnparam{tidnum}, so you must ensure that every TagElem you use has a unique ID. We provide two helpful functions for this. These are most likely useful \emph{between} lexical selection and generation proper, because during generation proper, you can simply keep a counter within a State monad to assign unique IDs to new TagElems. Note that we also label each node of the tree with its elementary tree name and with the unique ID. This helps us to build derivation trees correctly \begin{code} -- | Assigns a unique id to each element of this list, that is, an integer -- between 1 and the size of the list. setTidnums :: [TagElem] -> [TagElem] setTidnums xs = zipWith (\c i -> setOrigin $ c {tidnum = i}) xs [1..] setOrigin :: TagElem -> TagElem setOrigin te = te { ttree = fmap setLabel . ttree $ te } where setLabel g = g { gorigin = idname te ++ ":" ++ (show.tidnum) te } \end{code} % ---------------------------------------------------------------------- \section{TAG Item} % ---------------------------------------------------------------------- \begin{code} -- | 'TagItem' is a generalisation of 'TagElem'. class TagItem t where tgIdName :: t -> String tgIdNum :: t -> Integer tgSemantics :: t -> Sem instance TagItem TagElem where tgIdName = idname tgIdNum = tidnum tgSemantics = tsemantics \end{code} % ---------------------------------------------------------------------- \section{Map by sem} % ---------------------------------------------------------------------- \begin{code} -- | Sorts trees into a Map.Map organised by the first literal of their -- semantics. This is useful in at least three places: the polarity -- optimisation, the gui display code, and code for measuring the efficiency -- of GenI. Note: trees with a null semantics are filed under an empty -- predicate, if any. mapBySem :: (TagItem t) => [t] -> Map.Map Pred [t] mapBySem ts = let gfn t = case tgSemantics t of [] -> emptyPred (x:_) -> x in groupByFM gfn ts -- | 'subsumedBy' @cs ts@ determines if the candidate semantics @cs@ is -- subsumed by the proposition semantics @ts@. Notice how the proposition -- semantics is only a single item where as the candidate semantics is a -- list. -- -- We assume -- -- * most importantly that @cs@ has already its semantics instatiated -- (all variables assigned) -- -- * @cs@ and @ts@ are sorted -- -- * the list in each element of cs and ts is itself sorted subsumedBy :: Sem -> Pred -> Bool subsumedBy [] _ = False subsumedBy ((ch, cp, cla):cl) (th, tp,tla) | (ch == th) && (cp == tp) && (cla == tla) = True -- if we haven't yet overshot, try for the next one | cp < tp = subsumedBy cl (th, tp, tla) | otherwise = False \end{code} % ---------------------------------------------------------------------- \section{Extracting sentences} % ---------------------------------------------------------------------- Normally, extracting the sentences from a TAG tree would just consist of reading its leaves. But if you want the generator to return inflected forms instead of just lemmas, you also need to return the relevant features for each leaf. In TAG, or at least our use of it, the features come from the \emph{pre-terminal} nodes, that is, not the leaves themselves but their parents. Another bit of trickiness: because of atomic disjunction, leaves might have more than one value, so we can't just return a String lemma but a list of String, one for each possibility. \begin{code} type UninflectedDisjunction = ([String], Flist) tagLeaves :: TagElem -> [ (String, UninflectedDisjunction) ] tagLeaves te = [ (gnname pt, (getLexeme t, gup pt)) | (pt,t) <- preTerminals . ttree $ te ] -- | Try in order: lexeme, lexeme attributes, node name getLexeme :: GNode -> [String] getLexeme node = case glexeme node of [] -> fromMaybe [gnname node] $ firstMaybe grab lexemeAttributes lexs -> lexs where grab la = let match (AvPair a (GConst v)) | a == la = Just v match _ = Nothing in firstMaybe match guppy guppy = gup node firstMaybe :: (a -> Maybe b) -> [a] -> Maybe b firstMaybe fn = listToMaybe . mapMaybe fn \end{code} % ---------------------------------------------------------------------- \section{Debugging} % ---------------------------------------------------------------------- \begin{code} -- Useful for debugging adjunction and substitution nodes showTagSites :: [TagSite] -> String showTagSites sites = concat $ intersperse "\n " $ map fn sites where fn (TagSite n t b o) = concat . intersperse "/" $ [ n, showPairs t, showPairs b, o ] \end{code} % ---------------------------------------------------------------------- \section{Diagnostic messages} % ---------------------------------------------------------------------- Diagnostic messages let us know why a TAG tree is not returned as a result. Whenever GenI decides to discard a tree, it sets the tdiagnostic field of the TagElem so that the person using a debugger can find out what went wrong. \begin{code} ts_synIncomplete, ts_tbUnificationFailure :: String ts_synIncomplete = "syntactically incomplete" ts_tbUnificationFailure = "top/bot unification failure" ts_rootFeatureMismatch :: Flist -> String ts_rootFeatureMismatch good = "root feature does not unify with " ++ showFlist good ts_semIncomplete :: [Pred] -> String ts_semIncomplete sem = "semantically incomplete - missing: " ++ showSem sem \end{code} % ---------------------------------------------------------------------- % Performance % ----------------------------------------------------------------------