% GenI surface realiser
% Copyright (C) 2005 Carlos Areces and Eric Kow
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

\chapter{File formats (GeniParsers)}
\label{cha:GeniParsers}

This chapter is a description of the file format used by GenI.  You might
also have to look at the LORIA wiki for documentation on this.  See
\url{http://wiki.loria.fr/wiki/GenI/Input_format}.

If the descriptions here sound a little weird to you, it's likely because
they used to be source code comments, and are being converted into actual
documentation.

\ignore{
\begin{code}
module NLP.GenI.GeniParsers (
  -- test suite stuff
  geniTestSuite, geniSemanticInput, geniTestSuiteString,
  geniDerivations,
  toSemInputString,
  -- macros
  geniMacros,
  -- lexicons
  geniLexicon, geniMorphLexicon, geniMorphInfo,
  -- features and polarities
  geniFeats, geniPolarities,
  -- TagElem,
  geniTagElems,
  -- things used by external scripts
  geniSemantics, geniValue, geniWords,
  geniLanguageDef, tillEof,
) where

import NLP.GenI.General ((!+!), Interval, ival)
import NLP.GenI.Btypes
import NLP.GenI.Tags (TagElem(..), emptyTE, setTidnums)
import NLP.GenI.GeniShow (GeniShow(geniShow))
import Control.Monad (liftM, when)
import Data.List (sort)
import qualified Data.Map  as Map
import qualified Data.Tree as T
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Language (emptyDef)
import Text.ParserCombinators.Parsec.Token (TokenParser,
    LanguageDef(..), makeTokenParser)
import qualified Text.ParserCombinators.Parsec.Token as P

-- reserved words
#define SEMANTICS       "semantics"
#define SENTENCE        "sentence"
#define OUTPUT          "output"
#define TRACE           "trace"
#define ANCHOR          "anchor"
#define SUBST           "subst"
#define FOOT            "foot"
#define LEX             "lex"
#define TYPE            "type"
#define ACONSTR_NOADJ   "aconstr:noadj"
#define INITIAL         "initial"
#define AUXILIARY       "auxiliary"
#define IDXCONSTRAINTS  "idxconstraints"
#define BEGIN           "begin"
#define END             "end"
\end{code}
}

\section{Test suites}

The test suite format consists of arbitrarily many test cases:

\begin{code}
-- | A whole test suite: zero or more test cases, up to end of input.
geniTestSuite :: Parser [TestCase]
geniTestSuite = tillEof $ many geniTestCase

-- | Just the String representations of the semantics
--   in the test suite
geniTestSuiteString :: Parser [String]
geniTestSuiteString = tillEof $ many geniTestCaseString

-- | Zero or more @output@ entries, up to end of input.
--   This is only used by the script genimakesuite
geniDerivations :: Parser [TestCaseOutput]
geniDerivations = tillEof (many geniOutput)
\end{code}

A test case is composed of an optional test id, some semantic input
\fnref{geniSemanticInput}, followed by any number of
sentences and optionally followed by a list of outputs.  The sentences
can either be known good sentences (optionally preceded by the keyword
'sentence' -- perhaps this should be mandatory one day).
The outputs are used directly by users.  The field is useful for noting
what outputs were actually produced, say, in a script that generates
test suites from GenI output.  This field doesn't have much use for GenI
per se, just its satellite scripts.

\begin{code}
-- | One test case: an optional name (empty string when absent), the
--   semantic input, any number of sentences and any number of outputs.
geniTestCase :: Parser TestCase
geniTestCase =
  do name      <- option "" (identifier <?> "a test case name")
     seminput  <- geniSemanticInput
     sentences <- many geniSentence
     outputs   <- many geniOutput
     return $ TestCase name "" seminput sentences outputs

-- | A sentence paired with its traces, keyed by a pair of words.
type TestCaseOutput = (String, Map.Map (String,String) [String])

-- | An @output:@ entry followed by any number of @trace:@ entries.
--   Note that the keyword is NOT optional.
geniOutput :: Parser TestCaseOutput
geniOutput =
  do ws <- keyword OUTPUT >> squares geniWords
     ds <- Map.fromList `fmap` many geniTraces
     return (ws, ds)

-- | A @trace:@ entry: inside square brackets, two key words, a bang,
--   then one or more trace words.
geniTraces :: Parser ((String,String), [String])
geniTraces =
  do keyword TRACE
     squares $ do
       k1 <- withWhite geniWord
       k2 <- withWhite geniWord
       whiteSpace >> char '!' >> whiteSpace
       traces <- sepEndBy1 geniWord whiteSpace
       return ((k1,k2), traces)

-- | Run a parser, then skip any whitespace that follows it.
withWhite :: Parser a -> Parser a
withWhite p = p >>= (\a -> whiteSpace >> return a)

-- | A bracketed sentence; the @sentence:@ keyword is optional.
geniSentence :: Parser String
geniSentence = optional (keyword SENTENCE) >> squares geniWords

-- | One or more whitespace-separated words, rejoined with single spaces.
geniWords :: Parser String
geniWords =
  unwords `fmap` (sepEndBy1 geniWord whiteSpace <?> "a sentence")

-- | A non-empty run of characters other than square brackets and
--   whitespace.
geniWord :: Parser String
geniWord = many1 (noneOf "[]\v\f\t\r\n ")

-- | The original string representation of a test case semantics
--   (for gui)
geniTestCaseString :: Parser String
geniTestCaseString =
  do option "" (identifier <?> "a test case name")
     s <- geniSemanticInputString
     -- sentences and outputs are consumed but deliberately discarded
     many geniSentence
     many geniOutput
     return s
\end{code}

\section{Semantics}

\fnlabel{geniSemanticInput} consists of a semantics, and optionally a
set of index constraints.
The semantics may contain literal based constraints as described in
section \ref{sec:fixme}.  These constraints are just a space-delimited
list of String.  When returning the results, we separate them out from
the semantics proper so that they can be treated separately.

Index constraints are represented as feature structures.  For more
details about them, see \fnref{detectIdxConstraints}.

\begin{code}
-- | The semantic input: the semantics (with anonymous handles replaced
--   by generated constants), the index constraints (empty when absent),
--   and the non-empty literal constraints paired with their literal.
geniSemanticInput :: Parser (Sem,Flist,[LitConstr])
geniSemanticInput =
  do keywordSemantics
     (literals, constraints) <-
        liftM unzip $ squares $ many literalAndConstraint
     idxC <- option [] geniIdxConstraints
     --
     let handled    = createHandles literals
         litConstrs = filter (not . null . snd) (zip handled constraints)
     return (handled, idxC, litConstrs)
  where
     -- set all anonymous handles to some unique value
     -- this is to simplify checking if a result is
     -- semantically complete
     createHandles :: Sem -> Sem
     createHandles = zipWith setHandle ([1..] :: [Int])
     -- number the handle only when it is anonymous
     setHandle i (h, pred_, par)
       | h == GAnon = (GConst ["genihandle" ++ (show i)], pred_, par)
       | otherwise  = (h, pred_, par)
     -- a literal optionally followed by a bracketed list of identifiers
     literalAndConstraint :: Parser (Pred, [String])
     literalAndConstraint =
       do lit  <- geniLiteral
          cstr <- option [] $ squares $ many identifier
          return (lit, cstr)

-- | The original string representation of the semantics (for gui)
geniSemanticInputString :: Parser String
geniSemanticInputString =
  do keywordSemantics
     semStr <- squaresString
     whiteSpace >> optional geniIdxConstraints
     return semStr

-- | The @idxconstraints:@ keyword followed by a feature structure.
geniIdxConstraints :: Parser Flist
geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats

-- | A square-bracketed span, returned verbatim (brackets included);
--   nested bracketed spans are handled recursively.
squaresString :: Parser String
squaresString =
  do inner <- between (char '[') (char ']') (many chunk)
     return $ "[" ++ concat inner ++ "]"
  where
    chunk = many1 (noneOf "[]") <|> squaresString

-- the output end of things
-- displaying preformatted semantic input

-- | A preformatted semantics string together with index constraints.
data SemInputString = SemInputString String Flist

instance GeniShow SemInputString where
 geniShow (SemInputString semStr idxC) =
     SEMANTICS ++ ":" ++ semStr ++ idxPart
   where
     -- the index constraints go on their own line, and only if present
     idxPart
       | null idxC = ""
       | otherwise = "\n" ++ IDXCONSTRAINTS ++ ": " ++ showFlist idxC

-- | Pair a preformatted semantics string with the second component of
--   a 'SemInput' (shown by 'geniShow' as the index constraints).
toSemInputString :: SemInput -> String -> SemInputString
toSemInputString (_, idxC, _) semStr = SemInputString semStr idxC
\end{code}

\section{Lexicon}

A lexicon is just a whitespace separated list of lexical entries.
Each lexical entry is
\begin{enumerate}
\item A lemma
\item The family name of things this lemma anchors to
\item The interface to the tree.  Here's the complicated bit.
      Either you provide :
  \begin{itemize}
  \item A list of parameters and an interface, as defined in
        \fnref{geniParams}.  The interface is meant to be unified
        with the tree interface.
  \item A feature structure which is to be unified with the tree
        interface.  This is equivalent to the attribute-value pairs
        above; the only difference is that we don't do any parameters,
        and we use square brackets instead of parentheses.
  \item Optionally: a set of path equations for enrichment.  This
        feature structure can consist of path equations of the form
        node.att:val, because they will be unified with the entire tree
        and not just the tree interface.  To force something to unify
        with a tree interface in XMG, you should supply ``interface.''
        as a node name.
  \end{itemize}
\item Optionally: a set of filters.  This is to be used in conjunction
      with XMG's SelectTAG.  Note that you must explicitly include
      family as an attribute, even if it's already declared in the
      lexical entry.
\end{enumerate}

\begin{code}
-- | A lexicon: one or more lexical entries, up to end of input.
geniLexicon :: Parser [ILexEntry]
geniLexicon = tillEof $ many1 geniLexicalEntry

-- | One lexical entry: lemma, family name, an optional parenthesised
--   parameters/interface section, optional @equations:@ and @filters:@
--   feature structures, then a mandatory bracketed lexical semantics.
geniLexicalEntry :: Parser ILexEntry
geniLexicalEntry =
  do lemma  <- (looseIdentifier <|> stringLiteral) <?> "a lemma"
     family <- identifier <?> "a tree family"
     -- same syntax as the tree parameters, so share the parser
     (pars, interface) <- option ([],[]) geniParams
     equations <- option [] $ do keyword "equations"
                                 geniFeats <?> "path equations"
     filters   <- option [] $ do keyword "filters"
                                 geniFeats
     keywordSemantics
     (sem,pols) <- squares geniLexSemantics
     --
     return emptyLE { iword = [lemma]
                    , ifamname = family
                    , iparams = pars
                    , iinterface = sortFlist interface
                    , iequations = equations
                    , ifilters = filters
                    , isemantics = sem
                    , isempols = pols }
\end{code}

\section{Trees}

\subsection{Macros}

A macro library is basically a list of trees.

\begin{code}
-- | A macro library: zero or more tree definitions, up to end of input.
geniMacros :: Parser [MTtree]
geniMacros = tillEof $ many geniTreeDef

-- | The reserved words @initial@ and @auxiliary@ respectively.
initType, auxType :: Parser Ptype
initType = do { reserved INITIAL ; return Initial  }
auxType  = do { reserved AUXILIARY ; return Auxiliar }
\end{code}

\subsection{Tree definitions}

A tree definition consists of
\begin{enumerate}
\item a family name, followed by an optional tree id
      (separated from the family name by a colon)
\item the tree parameters/interface as defined in \fnref{geniParams}
\item a tree type specification: either \verb!initial! or
      \verb!auxiliary!
\item the tree itself
\item (optional) a semantics, then (optional) a trace, which is a
      bracketed list of identifiers
\end{enumerate}

\begin{code}
-- | One tree definition.  After parsing the tree we check that it has
--   exactly one anchor, at most one foot node, and no foot node at all
--   if it is an initial tree.
geniTreeDef :: Parser MTtree
geniTreeDef =
  do sourcePos    <- getPosition
     family       <- identifier
     tname        <- option "" $ do { colon; identifier }
     (pars,iface) <- geniParams
     theTtype     <- (initType <|> auxType)
     theTree      <- geniTree
     -- sanity checks?
     let treeFail x =
          do setPosition sourcePos -- FIXME does not do what I expect
             fail $ "In tree " ++ family ++ ":" ++ tname ++ " "
                    ++ show sourcePos ++ ": " ++ x
     let theNodes   = T.flatten theTree
         numFeet    = length [ x | x <- theNodes, gtype x == Foot ]
         numAnchors = length [ x | x <- theNodes, ganchor x ]
     when (numAnchors == 0) $
       treeFail "At least one node in an LTAG tree must be an anchor"
     when (numAnchors > 1) $
       treeFail "There can be no more than 1 anchor node in a tree"
     when (numFeet > 1) $
       treeFail "There can be no more than 1 foot node in a tree"
     when (theTtype == Initial && numFeet > 0) $
       treeFail "Initial trees may not have foot nodes"
     --
     psem <- option Nothing $ do { keywordSemantics
                                 ; liftM Just (squares geniSemantics) }
     ptrc <- option [] $ do { keyword TRACE; squares (many identifier) }
     --
     return TT{ params = pars
              , pfamily = family
              , pidname = tname
              , pinterface = sortFlist iface
              , ptype = theTtype
              , tree = theTree
              , ptrace = ptrc
              , psemantics = psem
              }
\end{code}

\subsection{Tree structure}

A tree is recursively defined as a node followed by an optional list of
child nodes.  If there are any child nodes, they appear between curly
brackets.

A node consists of
\begin{enumerate}
\item A node name
\item (optionally) a node type (anchor, lexeme, foot, subst).
\item (if node type is lexeme) a lexeme
\item (optionally) an adjunction constraint
      (Notes: We only know about null adjunction constraints.
If the node has a type, it is assumed as having a null adjunction constraint) \end{enumerate} Example of a tree: \begin{verbatim} n2 type:subst [cat:np idx:?Agent]![] n3[cat:vp idx:?Event]![] { n4 aconstr:noadj [cat:v idx:?Event]![] { n5 anchor } \end{verbatim} \begin{code} geniTree :: Parser (T.Tree GNode) geniTree = do node <- geniNode kids <- option [] (braces $ many geniTree) "child nodes" -- sanity checks let noKidsAllowed t c = when (c node && (not.null $ kids)) $ fail $ t ++ " nodes may *not* have any children" noKidsAllowed "Anchor" $ ganchor noKidsAllowed "Substitution" $ (== Subs) . gtype noKidsAllowed "Foot" $ (== Foot) . gtype -- return (T.Node node kids) geniNode :: Parser GNode geniNode = do name <- identifier nodeType <- option "" ( (keyword TYPE >> typeParser) <|> reserved ANCHOR) lex_ <- if nodeType == LEX then (sepBy (stringLiteral<|>identifier) (symbol "|") "some lexemes") else return [] constr <- case nodeType of "" -> adjConstraintParser ANCHOR -> adjConstraintParser _ -> return True (top_,bot_) <- -- features only obligatory for non-lex nodes if nodeType == LEX then option ([],[]) $ try topbotParser else topbotParser -- let top = sort top_ bot = sort bot_ nodeType2 = case nodeType of ANCHOR -> Lex LEX -> Lex FOOT -> Foot SUBST -> Subs "" -> Other other -> error ("unknown node type: " ++ other) return $ GN { gnname = name, gtype = nodeType2 , gup = top, gdown = bot , glexeme = lex_ , ganchor = (nodeType == ANCHOR) , gaconstr = constr , gorigin = "" } where typeParser = choice $ map (try.symbol) [ ANCHOR, FOOT, SUBST, LEX ] adjConstraintParser = option False $ reserved ACONSTR_NOADJ >> return True topbotParser = do top <- geniFeats "top features" symbol "!" bot <- geniFeats "bot features" return (top,bot) \end{code} \subsection{TagElem} For debugging purposes, it is often useful to be able to read TagElem's directly. Note that this shares a lot of code with the macros above. Hopefully, it is reasonably refactored. 
FIXME: note that this is very rudimentary; we do not set id numbers,
parse polarities.  You'll have to call some of our helper functions if
you want that functionality.

\begin{code}
-- | Zero or more 'TagElem's up to end of input, passed through
--   'setTidnums'.
geniTagElems :: Parser [TagElem]
geniTagElems = tillEof (liftM setTidnums (many geniTagElem))

-- | One 'TagElem': family name, optional tree id, an interface (taken
--   from a parameter section or else a feature structure), tree type,
--   the tree and a mandatory semantics.
geniTagElem :: Parser TagElem
geniTagElem =
  do famName  <- identifier
     treeId   <- option "" (colon >> identifier)
     iface    <- liftM snd geniParams <|> geniFeats
     treeType <- initType <|> auxType
     body     <- geniTree
     sem      <- keywordSemantics >> squares geniSemantics
     --
     return $ emptyTE { idname     = treeId
                      , ttreename  = famName
                      , tinterface = iface
                      , ttype      = treeType
                      , ttree      = body
                      , tsemantics = sem }
\end{code}

\section{Polarities}

The polarities parser is used for parsing extra polarity input from the
user.  For more information, see chapter \ref{cha:Polarity}.

\begin{code}
-- | Zero or more polarity-prefixed identifiers up to end of input;
--   intervals of repeated identifiers are combined with '!+!'.
geniPolarities :: Parser (Map.Map String Interval)
geniPolarities = tillEof (liftM (Map.fromListWith (!+!)) (many entry))
  where
    entry = do charge <- geniPolarity
               key    <- identifier
               return (key, ival charge)
\end{code}

\fnlabel{geniPolarity} associates a numerical value to a polarity
symbol, that is, '+' or '-'.

\begin{code}
-- | @+@ is 1, @-@ is -1, and no symbol at all is 0.
geniPolarity :: Parser Int
geniPolarity = option 0 (positive <|> negative)
  where
    positive = char '+' >> return 1
    negative = char '-' >> return (-1)
\end{code}

\section{Morphology}

GenI has two types of morphological input.

\paragraph{morphinfo}
A morphinfo file associates predicates with morphological feature
structures.  Each morphological entry consists of a predicate followed
by a feature structure.  For more information, see chapter
\ref{cha:Morphology}.
\begin{code}
-- | Zero or more morphinfo entries, up to end of input.
geniMorphInfo :: Parser [(String,Flist)]
geniMorphInfo = tillEof (many morphEntry)

-- | A predicate name followed by its feature structure.
morphEntry :: Parser (String,Flist)
morphEntry =
  do predName <- identifier
     fs       <- geniFeats
     return (predName, fs)
\end{code}

\paragraph{morphlexicon}
A morphological lexicon is a table where each entry is an inflected form
followed by the lemma and the feature structure to which it is
associated.  The table is whitespace-delimited.

\begin{code}
-- | Zero or more morphological lexicon entries, up to end of input.
geniMorphLexicon :: Parser [MorphLexEntry]
geniMorphLexicon = tillEof (many morphLexiconEntry)

-- | Inflected form, lemma and feature structure, in that order.
morphLexiconEntry :: Parser (String, String, Flist)
morphLexiconEntry =
  do inflected <- cell
     lemma     <- cell
     feats     <- geniFeats
     return (inflected, lemma, feats)
  where
    -- a string literal or a bare word, then any trailing whitespace
    cell = do w <- try stringLiteral <|> geniWord
              whiteSpace
              return w
\end{code}

\section{Generic GenI stuff}

\subsection{Lexer}

Some preliminaries about GenI formats in general - comments start with
\verb!%!.  There is also the option of using \verb'/* */' for embedded
comments.

\begin{code}
-- | The token parser built from 'geniLanguageDef'.
lexer :: TokenParser ()
lexer = makeTokenParser geniLanguageDef

-- | The lexical conventions of the GenI formats: comment syntax,
--   reserved words and the characters allowed in identifiers.
geniLanguageDef :: LanguageDef ()
geniLanguageDef = emptyDef
         { commentStart = "/*"
         , commentEnd = "*/"
         , commentLine = "%"
         , identStart = identChar
         , identLetter = identChar
         , opLetter = oneOf ""
         , reservedOpNames = [""]
         , reservedNames =
             [ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE
             , ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR_NOADJ
             , INITIAL , AUXILIARY
             , BEGIN , END ]
         }
  where
    -- the same character class starts and continues an identifier
    identChar = alphaNum <|> oneOf "_'+-."

-- | Whitespace and comments as defined by 'lexer'.
whiteSpace :: CharParser () ()
whiteSpace = P.whiteSpace lexer

looseIdentifier, identifier, stringLiteral, colon :: CharParser () String
identifier    = P.identifier lexer

-- stolen from Parsec code (ident)
-- | Like 'identifier' but allows for reserved words too
looseIdentifier =
 do { i <- ident ; whiteSpace; return i }
 where
  ident =
   do { c <- identStart geniLanguageDef
      ; cs <- many (identLetter geniLanguageDef)
      ; return (c:cs) }
   <?> "identifier"

stringLiteral = P.stringLiteral lexer
colon         = P.colon lexer

squares, braces, parens :: CharParser () a -> CharParser () a
squares = P.squares lexer
braces  = P.braces  lexer
parens  = P.parens  lexer

-- | 'reserved' parses a reserved word and returns it; 'symbol' is the
--   lexer's symbol parser.
reserved, symbol :: String -> CharParser () String
reserved s = P.reserved lexer s >> return s
symbol = P.symbol lexer
\end{code}

\subsection{Keyword}

A keyword is nothing more than a reserved word followed by a colon.
We factor this into a separate function to account for whitespace.

\begin{code}
{-# INLINE keyword #-}
-- | The reserved word @k@ followed by a colon.  The pair is wrapped in
--   'try', and a failure is reported as expecting @k:@.
keyword :: String -> Parser String
keyword k =
  do let helper = try $ do { reserved k; colon; return k }
     helper <?> k ++ ":"

{-# INLINE keywordSemantics #-}
-- | The @semantics:@ keyword.
keywordSemantics :: Parser String
keywordSemantics = keyword SEMANTICS
\end{code}

\subsection{Feature structures}

Feature structures take the form \verb!att : val! with only whitespace
to separate each attval pair.  See \fnref{geniValue} for details about
what the values look like.

\begin{code}
-- | A square-bracketed list of attribute-value pairs; yields the empty
--   list if no bracketed list is present.
geniFeats :: Parser Flist
geniFeats = option [] $ squares $ many geniAttVal

-- | An attribute name, a colon, then a GenI value.
geniAttVal :: Parser AvPair
geniAttVal =
  do att <- identifier <?> "an attribute"; colon
     val <- geniValue <?> "a GenI value"
     return (att, val)
\end{code}

\fnlabel{geniParams} recognises a list of parameters optionally followed
by a bang (\verb$!$) and a list of attribute-value pairs.  This whole
thing is to be wrapped in parentheses.

\textbf{Note:} sometimes people prefer not to use parameters - instead
they stick to using the interface.  This is fine, but they should not
forget the bang separator.
\begin{code}
-- | Parenthesised parameters, optionally followed by a bang and a list
--   of attribute-value pairs (the interface).
geniParams :: Parser ([GeniVal], Flist)
geniParams = parens $ do
  pars <- many geniValue <?> "some parameters"
  interface <- option [] $ do { symbol "!"; many geniAttVal }
  return (pars, interface)
\end{code}

\subsection{Semantics}

A semantics is simply a list of literals.

A literal can take one of two forms:
\begin{verbatim}
  handle:predicate(arguments)
         predicate(arguments)
\end{verbatim}

The arguments are space-delimited.  Not providing a handle is equivalent
to providing an anonymous one.

\begin{code}
-- | Zero or more literals, returned in the order given by 'sortSem'.
geniSemantics :: Parser Sem
geniSemantics =
  do sem <- many (geniLiteral <?> "a literal")
     return (sortSem sem)

-- | One literal: an optional @handle:@ prefix ('GAnon' when absent),
--   a predicate, then parenthesised arguments.
geniLiteral :: Parser Pred
geniLiteral =
  do handle    <- option GAnon handleParser <?> "a handle"
     predicate <- geniValue <?> "a predicate"
     pars      <- parens (many geniValue) <?> "some parameters"
     --
     return (handle, predicate, pars)
  where
    -- 'try' so that a value which turns out not to be followed by a
    -- colon can be re-read as the predicate
    handleParser =
      try $ do { h <- geniValue ; char ':' ; return h }
\end{code}

\subsection{Lexical semantics}

A lexical semantics is almost exactly the same as a regular semantics,
except that each variable may be preceded by a polarity symbol.  When we
figure out how to automate the detection of lexical semantic polarities,
we can start using a regular semantics again.

\begin{code}
-- | Zero or more lexical literals, split into the literals themselves
--   and, for each one, its list of polarities.
geniLexSemantics :: Parser (Sem, [[Int]])
geniLexSemantics =
  do litpols <- many (geniLexLiteral <?> "a literal")
     return $ unzip litpols

-- | Like 'geniLiteral', but the handle and every argument may carry a
--   polarity.  The polarity list starts with the handle's polarity
--   (0 when there is no handle), followed by one entry per argument.
geniLexLiteral :: Parser (Pred, [Int])
geniLexLiteral =
  do (handle, hpol) <- option (GAnon,0) (handleParser <?> "a handle")
     predicate  <- geniValue <?> "a predicate"
     paramsPols <- parens (many geniPolValue) <?> "some parameters"
     --
     let (pars, pols) = unzip paramsPols
         literal = (handle, predicate, pars)
     return (literal, hpol:pols)
  where
    handleParser =
      try $ do { h <- geniPolValue; colon; return h }

-- | A GenI value preceded by an optional polarity symbol.
geniPolValue :: Parser (GeniVal, Int)
geniPolValue =
  do p <- geniPolarity
     v <- geniValue
     return (v,p)
\end{code}

\subsection{Miscellaneous}

\fnlabel{geniValue} is recognised both in feature structures and in the
GenI semantics.
\begin{enumerate}
\item As of geni 0.8, variables are prefixed with a question mark.
\item The underscore, \verb!_!, and \verb!?_! are treated as anonymous
      variables.
\item Atomic disjunctions are separated with a pipe, \verb!|!.  Only
      constants may be separated by atomic disjunction
\item Anything else is just a constant
\end{enumerate}

\begin{code}
-- | A GenI value.  The alternatives are tried in this order: an
--   anonymous variable, a constant or atomic disjunction, a variable.
geniValue :: Parser GeniVal
geniValue =   ((try $ anonymous) <?> "_ or ?_")
          <|> (constants  <?> "a constant or atomic disjunction")
          <|> (variable   <?> "a variable")
  where
    question = "?"
    -- one or more pipe-separated identifiers or string literals
    constants :: Parser GeniVal
    constants =
      do c <- sepBy1 (looseIdentifier <|> stringLiteral) (symbol "|")
         return (GConst c)
    -- a question mark followed by an identifier
    variable :: Parser GeniVal
    variable =
      do symbol question
         v <- identifier
         return (GVar v)
    -- an underscore, optionally preceded by a question mark
    anonymous :: Parser GeniVal
    anonymous =
      do optional $ symbol question
         symbol "_"
         return GAnon
\end{code}

\begin{code}
-- | Skip leading whitespace, run the parser, then insist that the
--   input ends there.
tillEof :: Parser a -> Parser a
tillEof p =
  do whiteSpace
     r <- p
     eof
     return r
\end{code}