-- GenI surface realiser -- Copyright (C) 2005 Carlos Areces and Eric Kow -- -- This program is free software; you can redistribute it and/or -- modify it under the terms of the GNU General Public License -- as published by the Free Software Foundation; either version 2 -- of the License, or (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -- | This module performs the core of lexical selection and anchoring. {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TupleSections #-} module NLP.GenI.LexicalSelection where import Control.Applicative ((<$>)) import Control.Arrow ((***)) import Control.Monad.Trans.Maybe import Control.Monad.Writer import Data.Function (on) import Data.List import qualified Data.Map as Map import Data.Text (Text) import qualified Data.Text as T import Data.Tree (Tree (Node)) import Control.Error import Data.FullList hiding (head, tail, (++)) import qualified Data.FullList as FL import NLP.GenI.FeatureStructure (AvPair (..), Flist, unifyFeat) import NLP.GenI.General (filterTree, geniBug, histogram, repAllNode, repNodeByNode) import NLP.GenI.GeniVal import NLP.GenI.LexicalSelection.Types import NLP.GenI.Lexicon (LexEntry (..), Lexicon) import NLP.GenI.Semantics (LitConstr, Sem, SemInput, subsumeSem, unifySem) import NLP.GenI.Tag (TagElem (..), idname) import NLP.GenI.TestSuite import NLP.GenI.TreeSchema (GNode (..), GType (..), AdjunctionConstraint(..), Macros, SchemaTree, Ttree (..), crushTreeGNode, setAnchor, setLexeme, tree) import NLP.GenI.Warning -- ---------------------------------------------------------------------- -- * Lexical selection algorithms -- ---------------------------------------------------------------------- -- | This aims to support users who want to do lexical selection -- directly from an input other than GenI style flat semantics. -- -- The requirement here is for you to provide some means of -- converting the custom semantics to a GenI semantics data CustomSem sem = CustomSem { -- | Conversion from custom semantics to GenI semantic input fromCustomSemInput :: sem -> Either Text SemInput -- | Lexical selection function , customSelector :: LexicalSelector sem , customSemParser :: Text -> Either Text (TestCase sem) -- | List of named inputs intended to act as a substitute for -- test suites -- ('FilePath' argument is for reporting error messages only) , customSuiteParser :: FilePath -> Text -> Either Text [TestCase sem] , customRenderSem :: sem -> Text } -- | See 'NLP.GenI.Configuration' if you want to use GenI with a custom -- lexical selection function. type LexicalSelector sem = Macros -> Lexicon -> sem -> IO LexicalSelection -- | The result of the lexical selection process data LexicalSelection = LexicalSelection { -- | the main result: a set of elementary trees (ie. anchored trees) lsAnchored :: [TagElem] -- | if available, lexical entries that were used to produce anchored -- trees (useful for identifying anchoring failure) , lsLexEntries :: [LexEntry] -- | HINT: use 'Data.Monoid.mempty' to initialise to empty , lsWarnings :: GeniWarnings } -- | Performs standard GenI lexical selection as described in -- -- -- This is just 'defaultLexicalSelection' lifted into IO defaultLexicalSelector :: Macros -> Lexicon -> SemInput -> IO LexicalSelection defaultLexicalSelector g l t = return (defaultLexicalSelection g l t) -- | Helper for 'defaultLexicalSelector' -- (Standard GenI lexical selection is actually pure) -- -- This is just -- -- * 'defaultLexicalChoice' -- -- * 'defaultAnchoring' -- -- * 'defaultPostProcessing' defaultLexicalSelection :: Macros -> Lexicon -> SemInput -> LexicalSelection defaultLexicalSelection grammar lexicon sem = defaultPostProcessing sem $ defaultAnchoring sem grammar $ defaultLexicalChoice lexicon sem -- | @missingLexEntries ts lexs@ returns any of the lexical candidates -- @lexs@ that were apparently not anchored succesfully. -- -- TODO: it does this by (wrongly) checking for each lexical item -- to see if any of the anchored trees in @ts@ have identical -- semantics to that lexical item. The better way to do this would -- be to throw a subsumption check on top of items reported missing, -- because it's possible for the trees to add semantics through -- unification. missingLexEntries :: [TagElem] -> [LexEntry] -> [LexEntry] missingLexEntries cands = filter treeless where treeless l = isNothing $ find (\t -> tsemantics t == isemantics l) cands -- ---------------------------------------------------------------------- -- * Selecting candidate lemmas -- ---------------------------------------------------------------------- -- | Select and returns the set of entries from the lexicon whose semantics -- subsumes the input semantics. defaultLexicalChoice :: Lexicon -> SemInput -> [LexEntry] defaultLexicalChoice slex (tsem,_,_) = chooseCandI tsem slex -- | 'chooseCandI' @sem l@ attempts to unify the semantics of @l@ with @sem@ -- If this succeeds, we use return the result(s); if it fails, we reject -- @l@ as a lexical selection candidate. chooseCandI :: Sem -> [LexEntry] -> [LexEntry] chooseCandI tsem cand = nub $ concatMap helper cand where replaceLex i (sem,sub) = (replace sub i) { isemantics = sem } -- helper :: LexEntry -> [LexEntry] helper l = if null sem then [l] else map (replaceLex l) psubsem where psubsem = sem `subsumeSem` tsem sem = isemantics l -- | `mergeSynonyms' is a factorisation technique that uses -- atomic disjunction to merge all synonyms into a single lexical -- entry. Two lexical entries are considered synonyms if their -- semantics match and they point to the same tree families. -- -- FIXME: 2006-10-11 - note that this is no longer being used, -- because it breaks the case where two lexical entries differ -- only by their use of path equations. Perhaps it's worthwhile -- just to add a check that the path equations match exactly. mergeSynonyms :: [LexEntry] -> [LexEntry] mergeSynonyms lexEntry = Map.elems synMap where mergeFn l1 l2 = l1 { iword = (FL.++) (iword l1) (iword l2) } keyFn l = (ifamname l, isemantics l) synMap = foldr helper Map.empty lexEntry where helper x acc = Map.insertWith mergeFn (keyFn x) x acc -- -------------------------------------------------------------------- -- * Anchoring -- -------------------------------------------------------------------- -- | The 'LexCombine' monad supports warnings during lexical selection -- and also failure via Maybe type LexCombine a = MaybeT (Writer [LexCombineError]) a runLexCombine :: LexCombine a -> (Maybe a, [LexCombineError]) runLexCombine = runWriter . runMaybeT -- | Note an anchoring error lexTell :: LexCombineError -> LexCombine () lexTell x = lift (tell [x]) -- | @defaultAnchoring schemata lex sem@ implements the later half of lexical -- selection (tree anchoring and enrichement). It assumes that @lex@ consists -- just of the lexical items that have been selected, and tries to combine them -- with the tree schemata. -- -- This function may be useful if you are implementing your own lexical selection -- functions, and you want GenI to take over after you've given it a @[LexEntry]@ defaultAnchoring :: SemInput -> Macros -> [LexEntry] -> LexicalSelection defaultAnchoring (tsem,_,_) grammar lexCands = LexicalSelection { lsAnchored = cands , lsLexEntries = lexCands , lsWarnings = mconcat [ lexWarnings, coanchorWarnings, errs ] } where combinations = map (combineList tsem grammar) lexCands cands = concatMap snd combinations errs = mkGeniWarnings . concat $ zipWith mkWarnings lexCands (map fst combinations) mkWarnings l = map (LexWarning [l] . LexCombineOneSchemaFailed) coanchorWarnings = mkGeniWarnings $ do -- list monad l <- lexCands let xs = filter (\p -> pfamily p == ifamname l) grammar (c,n) <- Map.toList . histogram $ concatMap (missingCoanchors l) xs return (LexWarning [l] (MissingCoanchors c n)) lexWarnings = mkGeniWarnings $ case missingLexEntries cands lexCands of [] -> [] xs -> [LexWarning xs LexCombineAllSchemataFailed] -- ---------------------------------------------------------------------- -- ** Combination -- ---------------------------------------------------------------------- -- | Given a lexical item, looks up the tree families for that item, and -- anchor the item to the trees. combineList :: Sem -> Macros -> LexEntry -> ([LexCombineError],[TagElem]) -- ^ any warnings, plus the results combineList tsem gram lexitem = case [ t | t <- gram, pfamily t == tn ] of [] -> ([FamilyNotFoundError tn],[]) macs -> squish . swap . unzip $ map (\m -> runLexCombine $ combineOne tsem lexitem m) macs where tn = ifamname lexitem swap (x,y) = (y,x) squish = (compressLexCombineErrors . concat) *** (concat . catMaybes) -- | Combine a single tree with its lexical item to form a bonafide TagElem. -- This process can fail, however, because of filtering or enrichement combineOne :: Sem -> LexEntry -> SchemaTree -> LexCombine [TagElem] combineOne tsem lexRaw eRaw = do -- Maybe monad -- trace ("\n" ++ (show wt)) $ let l1 = finaliseVars "-l" lexRaw e1 = finaliseVars "-t" eRaw (l,e) <- unifyParamsWithWarning (l1,e1) >>= unifyInterfaceUsing iinterface >>= unifyInterfaceUsing ifilters -- filtering >>= enrichWithWarning -- enrichment tree2 <- case crushTreeGNode (tree e) of Nothing -> do lexTell (SchemaError [pidname e] (StringError "Could not flatten disjunction")) fail "" Just x -> return x let name = T.intercalate ":" $ filter (not . T.null) [ FL.head (iword l) , pfamily e , pidname e ] template = TE { idname = name , ttreename = pfamily e , tidnum = -1 -- provisional id , ttype = ptype e , ttree = setOrigin name . setLemAnchors . setAnchor (iword l) $ tree2 , tsemantics = [] , tsempols = isempols l , tpolarities = Map.empty , tinterface = pinterface e , ttrace = ptrace e } semUnifications <- case unifySem (isemantics l) (fromMaybe [] $ psemantics e) of [] -> croak e "could not unify lemma and schema semantics" xs -> return xs return $ concatMap (anonymiseSingletons . finaliseSemantics template) semUnifications where croak :: SchemaTree -> Text -> LexCombine a croak t msg = do lexTell (SchemaError [pidname t] (StringError msg)) mzero finaliseSemantics template (sem,sub) = do let template2 = replace sub template (sem2,sub2) <- sem `subsumeSem` replace sub tsem return $ replace sub2 $ template2 { tsemantics = sem2 } unifyParamsWithWarning (l,t) = if length lp /= length tp then croak t "Parameter length mismatch" else case unify lp tp of Left msg -> croak t ("Parameter unification error: " <> msg) Right (ps2, subst) -> let t2 = (replace subst t) { params = ps2 } in return (replace subst l, t2) where lp = iparams l tp = params t unifyInterfaceUsing ifn (l,e) = -- trace ("unify interface" ++ wt) $ case unifyFeat (ifn l) (pinterface e) of Left msg -> croak e ("Interface unification error: " <> msg) Right (int2, fsubst) -> let e2 = (replace fsubst e) { pinterface = int2 } in return (replace fsubst l, e2) -- enrichWithWarning (l,e) = (l,) <$> enrich l e -- ---------------------------------------------------------------------- -- ** Enrichment -- ---------------------------------------------------------------------- -- | See -- on enrichement enrich :: LexEntry -> SchemaTree -> LexCombine SchemaTree enrich l t = do -- separate into interface/anchor/named (intE, namedE) <- lift $ lexEquations l -- enrich the interface and everything else t2 <- foldM enrichInterface t intE -- enrich everything else foldM enrichBy t2 namedE where enrichInterface tx en = case hush (unifyFeat [en] (pinterface tx)) of Nothing -> lexTell (ifaceEnrichErr en) >> fail "" Just (i2, isubs) -> return $ (replace isubs tx) { pinterface = i2 } ifaceEnrichErr (AvPair loc _) = SchemaError [pidname t] (EnrichError (PeqInterface loc)) -- *** 'enrich' helpers -- | Helper for 'enrich' (enrich by single path equation) enrichBy :: SchemaTree -> PathEqPair -> LexCombine SchemaTree enrichBy t eq@(eqLhs, _) = case maybeEnrichBy t eq of Nothing -> lexTell enrichErr >> return t Just (t2,_) -> return t2 where enrichErr = SchemaError [pidname t] (EnrichError (PeqJust eqLhs)) -- | Helper for 'enrichBy' maybeEnrichBy :: SchemaTree -> PathEqPair -> Maybe (SchemaTree, Subst) maybeEnrichBy t (eqLhs, eqVal) = do node <- seekCoanchor eqLhs t case eqLhs of PeqFeat _ eqTop eqAtt -> do let (get, set) = case eqTop of Top -> (gup, \n x -> n { gup = x }) Bottom -> (gdown, \n x -> n { gdown = x}) (fs, sub) <- hush $ enrichFeat (AvPair eqAtt eqVal) (get node) let t2 = fixNode (set node fs) (replace sub t) return (t2, sub) PeqLex _ -> do vs <- gConstraints eqVal let node2 = node { glexeme = FL.fromFL vs } t2 = fixNode node2 t return (t2, Map.empty) where fixNode n mt = mt { tree = repNodeByNode (matchNodeName eqLhs) n (tree mt) } -- | @enrichFeat av fs@ attempts to unify @av@ with @fs@ -- -- Note here that @fs@ is an @Flist [GeniVal]@ rather than the usual -- @Flist GeniVal@ you may expect. This is because it comes from -- 'SchemaTree' which allows non-atomic disjunctions of @GeniVal@ -- which have to be flatten down to at most atomic disjunctions once -- lexical selection is complete. enrichFeat :: MonadUnify m => AvPair GeniVal -> Flist SchemaVal -> m (Flist SchemaVal, Subst) enrichFeat (AvPair a v) fs = case span (\x -> avAtt x < a) fs of (before,here:after) | avMatch here -> do let (AvPair _ (SchemaVal fv)) = here (v2,sub) <- unify fv (replicate (length fv) v) let av2 = AvPair a (SchemaVal v2) fs2 = replace sub before ++ (av2 : replace sub after) return (fs2, sub) (before,after) -> do let av2 = AvPair a (SchemaVal [v]) fs2 = before ++ (av2 : after) return (fs2, Map.empty) where avMatch (AvPair fa _) = fa == a -- | @missingCoanchors l t@ returns the list of coanchor node names from @l@ -- that were not found in @t@ missingCoanchors :: LexEntry -> SchemaTree -> [Text] missingCoanchors lexEntry t = [ name eqLhs | eqLhs <- nubBy ((==) `on` name) equations, missing eqLhs ] where equations = map fst . snd . fst . runWriter $ lexEquations lexEntry name (PeqFeat n _ _) = n name (PeqLex n) = n missing eqLhs = isNothing (seekCoanchor eqLhs t) -- | Split a lex entry's path equations into interface enrichement equations -- or (co-)anchor modifiers lexEquations :: LexEntry -> Writer [LexCombineError] ([AvPair GeniVal],[PathEqPair]) lexEquations = fmap myPartition . mapM parseAv . iequations where myPartition xs = ( [ AvPair a v | (PeqInterface a, v) <- xs ] , [ (n,v) | (PeqJust n, v) <- xs ] ) parseAv (AvPair a v) = fmap (\a2 -> (a2,v)) (parsePathEq a) -- | @seekCoanchor lhs t@ returns @Just node@ if @t@ contains exactly one -- node that can be identified by @lhs@, @Nothing@ if it contains none. -- -- It crashes if there is more than one such node, because this should -- have been caught earlier by GenI. seekCoanchor :: NodePathEqLhs -> SchemaTree -> Maybe (GNode SchemaVal) seekCoanchor eqLhs t = case filterTree (matchNodeName eqLhs) (tree t) of [a] -> Just a [] -> Nothing _ -> geniBug . T.unpack . T.intercalate "\n" $ [ "NLP.GenI.LexicalSelection.seekCoanchor:" , "Did not expect to see a tree with multiple matches in enrichBy." , "Tree: " `T.append` pidname t , "Family: " `T.append` pfamily t , "Matching on: " `T.append` showPathEqLhs (PeqJust eqLhs) ] -- | @matchNodeName lhs n@ is @True@ if the @lhs@ refers to the node @n@ matchNodeName :: NodePathEqLhs -> GNode SchemaVal -> Bool matchNodeName (PeqFeat n _ _) = matchNodeNameHelper n matchNodeName (PeqLex n) = matchNodeNameHelper n -- | @matchNodeNameHelper@ recognises “anchor“ by convention; otherwise, -- it does a name match matchNodeNameHelper :: Text -> GNode SchemaVal -> Bool matchNodeNameHelper "anchor" = ganchor matchNodeNameHelper n = (== n) . gnname -- ---------------------------------------------------------------------- -- ** Lemanchor mechanism -- ---------------------------------------------------------------------- -- | The lemanchor mechanism is described in -- setLemAnchors :: Tree (GNode GeniVal) -> Tree (GNode GeniVal) setLemAnchors t = repAllNode fn filt t where filt (Node a []) = gtype a == Subs && (isJust. lemAnchor) a filt _ = False fn (Node x k) = setLexeme (lemAnchorMaybeFake x) $ Node (x { gtype = Other, gaconstr = MaybeAdj }) k -- lemAnchorMaybeFake :: GNode GeniVal -> [Text] lemAnchorMaybeFake n = fromMaybe ["ERR_UNSET_LEMMANCHOR"] (lemAnchor n) lemAnchor :: GNode GeniVal -> Maybe [Text] lemAnchor n = case [ v | AvPair a v <- gup n, a == _lemanchor ] of [l] -> fromFL <$> gConstraints l _ -> Nothing -- | The name of the lemanchor attribute (by convention; see source) _lemanchor :: Text _lemanchor = "lemanchor" -- | @setOrigin n t@ marks the nodes in @t@ as having come from -- a tree named @n@ setOrigin :: Text -> Tree (GNode v) -> Tree (GNode v) setOrigin t = fmap (\g -> g { gorigin = t }) -- ---------------------------------------------------------------------- -- * Post-processing -- ---------------------------------------------------------------------- -- | Standard post-processing/filtering steps that can take place -- after lexical selection. Right now, this only consists of -- paraphrase selection defaultPostProcessing :: SemInput -> LexicalSelection -> LexicalSelection defaultPostProcessing (_,_,lc) sel = sel { lsAnchored = preselectParaphrases lc (lsAnchored sel) } -- ---------------------------------------------------------------------- -- ** Paraphrase selection -- ---------------------------------------------------------------------- -- | Rule out lexical selection results that violate trace constraints preselectParaphrases :: [LitConstr] -> [TagElem] -> [TagElem] preselectParaphrases litContrs = filter (respectsConstraints litContrs) -- | 'True' if the tree fulfills the supplied trace constraints respectsConstraints :: [LitConstr] -> TagElem -> Bool respectsConstraints lc t = all (`elem` ttrace t) constrs where constrs = concat [ cs | (l,cs) <- lc, l `elem` tsemantics t ]