% GenI surface realiser
% Copyright (C) 2005 Carlos Areces and Eric Kow
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License
% as published by the Free Software Foundation; either version 2
% of the License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
\chapter{File formats (GeniParsers)}
\label{cha:GeniParsers}
This chapter is a description of the file format used by GenI. You
might also have to look at the LORIA wiki for documentation on this.
See \url{http://wiki.loria.fr/wiki/GenI/Input_format}. If the
descriptions here sound a little weird to you, it's likely because
they used to be source code comments, and are being converted into
actual documentation.
\ignore{
\begin{code}
module NLP.GenI.GeniParsers (
geniTestSuite, geniSemanticInput, geniTestSuiteString,
geniDerivations,
toSemInputString,
geniMacros,
geniLexicon, geniMorphLexicon, geniMorphInfo,
geniFeats, geniPolarities,
geniTagElems,
geniSemantics, geniValue, geniWords, geniLanguageDef, tillEof,
) where
import NLP.GenI.General ((!+!), Interval, ival)
import NLP.GenI.Btypes
import NLP.GenI.Tags (TagElem(..), emptyTE, setTidnums)
import NLP.GenI.GeniShow (GeniShow(geniShow))
import Control.Monad (liftM, when)
import Data.List (sort)
import qualified Data.Map as Map
import qualified Data.Tree as T
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Language (emptyDef)
import Text.ParserCombinators.Parsec.Token (TokenParser,
LanguageDef(..), makeTokenParser)
import qualified Text.ParserCombinators.Parsec.Token as P
-- Keyword strings of the GenI file formats, defined as preprocessor
-- macros so that the same spelling can be used as an expression, in the
-- list of reserved names, and as a string pattern in case expressions.
#define SEMANTICS "semantics"
#define SENTENCE "sentence"
#define OUTPUT "output"
#define TRACE "trace"
#define ANCHOR "anchor"
#define SUBST "subst"
#define FOOT "foot"
#define LEX "lex"
#define TYPE "type"
#define ACONSTR_NOADJ "aconstr:noadj"
#define INITIAL "initial"
#define AUXILIARY "auxiliary"
#define IDXCONSTRAINTS "idxconstraints"
#define BEGIN "begin"
#define END "end"
\end{code}
}
\section{Test suites}
The test suite format consists of arbitrarily many test cases:
\begin{code}
-- | Parse a whole test suite: zero or more test cases, followed by the
--   end of the input.
geniTestSuite :: Parser [TestCase]
geniTestSuite = tillEof $ many geniTestCase
-- | Like 'geniTestSuite', but each test case is reduced to the string
--   returned by 'geniTestCaseString'.
geniTestSuiteString :: Parser [String]
geniTestSuiteString = tillEof $ many geniTestCaseString
-- | Parse a file consisting only of output entries (see 'geniOutput'),
--   up to the end of the input.
geniDerivations :: Parser [TestCaseOutput]
geniDerivations = tillEof (many geniOutput)
\end{code}
A test case is composed of an optional test id, some semantic input
\fnref{geniSemanticInput}, followed by any number of sentences
and optionally followed by a list of outputs.
The sentences are known good sentences (each optionally preceded by the
keyword 'sentence' -- perhaps this should be mandatory one day). The outputs
are used directly by users. The field is useful for noting what outputs were
actually produced, say, in a script that generates test suites from GenI
output. This field doesn't have much use for GenI per se, just its satellite
scripts.
\begin{code}
-- | Parse a single test case: an optional name (the empty string when
--   absent), the semantic input, any number of sentences and any number
--   of outputs.  The second field of the resulting 'TestCase' is always
--   set to the empty string here.
geniTestCase :: Parser TestCase
geniTestCase = do
  tcName   <- option "" (identifier <?> "a test case name")
  semInput <- geniSemanticInput
  sents    <- many geniSentence
  outs     <- many geniOutput
  return (TestCase tcName "" semInput sents outs)
-- | An output sentence paired with its traces.  The map is keyed by the
--   two words that open each trace entry (see 'geniTraces').
type TestCaseOutput = (String, Map.Map (String,String) [String])

-- | Parse the output keyword, a bracketed sentence, and then any number
--   of trace entries, which are collected into a map.
geniOutput :: Parser TestCaseOutput
geniOutput = do
  _        <- keyword OUTPUT
  sentence <- squares geniWords
  traceMap <- fmap Map.fromList (many geniTraces)
  return (sentence, traceMap)
-- | Parse one trace entry: the trace keyword followed by square brackets
--   holding two key words, a bang, and one or more trace words.
geniTraces :: Parser ((String,String), [String])
geniTraces = keyword TRACE >> squares traceBody
 where
  traceBody = do
    key1 <- withWhite geniWord
    key2 <- withWhite geniWord
    _    <- whiteSpace >> char '!' >> whiteSpace
    ts   <- sepEndBy1 geniWord whiteSpace
    return ((key1, key2), ts)
-- | Run a parser, then skip any trailing whitespace (as defined by the
--   lexer), returning the result of the parser.
withWhite :: Parser a -> Parser a
withWhite p = do
  x <- p
  whiteSpace
  return x
-- | Parse a bracketed sentence, optionally introduced by the sentence
--   keyword.
geniSentence :: Parser String
geniSentence = do
  optional (keyword SENTENCE)
  squares geniWords
-- | Parse one or more whitespace-separated words and join them with
--   single spaces.
geniWords :: Parser String
geniWords = fmap unwords (sepEndBy1 geniWord whiteSpace <?> "a sentence")
-- | Parse a word: a non-empty run of characters that are neither square
--   brackets nor whitespace.  Trailing whitespace is not consumed.
geniWord :: Parser String
geniWord = many1 wordChar
 where
  wordChar = noneOf "[]\v\f\t\r\n "
-- | Parse a test case in the same shape as 'geniTestCase', but keep only
--   the string returned by 'geniSemanticInputString'; the name,
--   sentences and outputs are parsed and thrown away.
geniTestCaseString :: Parser String
geniTestCaseString = do
  _      <- option "" (identifier <?> "a test case name")
  semStr <- geniSemanticInputString
  _      <- many geniSentence
  _      <- many geniOutput
  return semStr
\end{code}
\section{Semantics}
\fnlabel{geniSemanticInput} consists of a semantics, and optionally a
set of index constraints.
The semantics may contain literal based constraints as described in
section \ref{sec:fixme}. These constraints are just a space-delimited
list of String. When returning the results, we separate them out from
the semantics proper so that they can be treated separately.
Index constraints are represented as feature structures. For more
details about them, see \fnref{detectIdxConstraints}.
\begin{code}
-- | Parse a semantic input: the semantics keyword, a bracketed list of
--   literals (each optionally followed by bracketed constraints), and
--   optional index constraints.
--
--   Literals with an anonymous handle are given a generated constant
--   handle built from their 1-based position in the list.  Only literals
--   with a non-empty constraint list appear in the third component.
geniSemanticInput :: Parser (Sem,Flist,[LitConstr])
geniSemanticInput = do
  _     <- keywordSemantics
  pairs <- squares (many literalAndConstraint)
  idxC  <- option [] geniIdxConstraints
  let (rawSem, litCs) = unzip pairs
      sem             = addHandles rawSem
      constrained     = filter (not . null . snd) (zip sem litCs)
  return (sem, idxC, constrained)
 where
  addHandles :: Sem -> Sem
  addHandles = zipWith fillHandle [(1 :: Int) ..]
  -- replace an anonymous handle by a numbered constant
  fillHandle n (h, p, args)
    | h == GAnon = (GConst ["genihandle" ++ show n], p, args)
    | otherwise  = (h, p, args)
-- | Parse a literal, optionally followed by a bracketed list of
--   identifiers (its constraints; empty when the brackets are absent).
literalAndConstraint :: Parser (Pred, [String])
literalAndConstraint = do
  lit         <- geniLiteral
  constraints <- option [] (squares (many identifier))
  return (lit, constraints)
-- | Parse a semantic input but return only the raw text of the bracketed
--   semantics (brackets included).  Index constraints, if present, are
--   parsed and discarded.
geniSemanticInputString :: Parser String
geniSemanticInputString = do
  _      <- keywordSemantics
  semStr <- withWhite squaresString
  optional geniIdxConstraints
  return semStr
-- | Parse the index constraints keyword followed by a feature structure.
geniIdxConstraints :: Parser Flist
geniIdxConstraints = do
  _ <- keyword IDXCONSTRAINTS
  geniFeats
-- | Parse a square-bracketed span, allowing nested brackets, and return
--   its text verbatim with the outer brackets included.  Whitespace
--   after the closing bracket is not consumed.
squaresString :: Parser String
squaresString = do
  _      <- char '['
  chunks <- many (plainText <|> squaresString)
  _      <- char ']'
  return ("[" ++ concat chunks ++ "]")
 where
  -- a non-empty run of characters containing no brackets
  plainText = many1 (noneOf "[]")
-- | A semantic input whose semantics is kept as an unparsed string,
--   together with its index constraints.
data SemInputString = SemInputString String Flist

-- | Shows the semantics keyword, a colon and the semantics string; when
--   the index constraints are non-empty they follow on a new line.
instance GeniShow SemInputString where
  geniShow (SemInputString semStr idxC)
    | null idxC = semPart
    | otherwise = semPart ++ "\n" ++ IDXCONSTRAINTS ++ ": " ++ showFlist idxC
    where
      semPart = SEMANTICS ++ ":" ++ semStr
-- | Build a 'SemInputString' from the given string and the second
--   component of a semantic input; the other two components are ignored.
toSemInputString :: SemInput -> String -> SemInputString
toSemInputString (_, idxConstraints, _) semStr =
  SemInputString semStr idxConstraints
\end{code}
\section{Lexicon}
A lexicon is just a whitespace-separated list of lexical entries.
Each lexical entry is
\begin{enumerate}
\item A lemma
\item The family name of things this lemma anchors to
\item The interface to the tree. Here's the complicated bit.
Either you provide :
\begin{itemize}
\item A list of parameters and an interface, as defined in
\fnref{geniParams}. The interface is meant to be unified with
the tree interface.
\item A feature structure which is to be unified with the tree interface.
This is equivalent to the attribute-value pairs above; the only
difference is that we don't do any parameters, and we use square
brackets instead of parentheses.
\item Optionally: a set of path equations for enrichment.
This feature structure can consist of
path equations of the form node.att:val, because they will be
unified with the entire tree and not just the tree interface. To
force something to unify with a tree interface in XMG, you should
supply ``interface.'' as a node name.
\end{itemize}
\item Optionally: a set of filters. This is to be used in conjunction
with XMG's SelectTAG. Note that you must explicitly include
family as an attribute, even if it's already declared in the
lexical entry.
\end{enumerate}
\begin{code}
-- | Parse a lexicon file: one or more lexical entries, up to the end of
--   the input.
geniLexicon :: Parser [ILexEntry]
geniLexicon = tillEof (many1 geniLexicalEntry)
-- | Parse one lexical entry: a lemma, a tree family, optional
--   parenthesised parameters and interface, optional path equations,
--   optional filters, and finally the lexical semantics with its
--   polarities.  The interface is sorted before being stored.
geniLexicalEntry :: Parser ILexEntry
geniLexicalEntry = do
  lemma       <- (looseIdentifier <|> stringLiteral) <?> "a lemma"
  fam         <- identifier <?> "a tree family"
  -- the parenthesised part has exactly the syntax accepted by geniParams
  (ps, iface) <- option ([],[]) geniParams
  eqs         <- option [] (keyword "equations" >> (geniFeats <?> "path equations"))
  flts        <- option [] (keyword "filters" >> geniFeats)
  _           <- keywordSemantics
  (sem, pols) <- squares geniLexSemantics
  return emptyLE
    { iword      = [lemma]
    , ifamname   = fam
    , iparams    = ps
    , iinterface = sortFlist iface
    , iequations = eqs
    , ifilters   = flts
    , isemantics = sem
    , isempols   = pols
    }
\end{code}
\section{Trees}
\subsection{Macros}
A macro library is basically a list of trees.
\begin{code}
-- | Parse a macro library: zero or more tree definitions, up to the end
--   of the input.
geniMacros :: Parser [MTtree]
geniMacros = tillEof (many geniTreeDef)
-- | Parsers for the two tree type keywords; each yields the matching
--   'Ptype' constructor.
initType, auxType :: Parser Ptype
initType = reserved INITIAL   >> return Initial
auxType  = reserved AUXILIARY >> return Auxiliar
\end{code}
\subsection{Tree definitions}
A tree definition consists of
\begin{enumerate}
\item a family name, followed by an optional tree id
\item the tree parameters/interface as defined in \fnref{geniParams}
\item a tree type specification, either \verb!initial! or
\verb!auxiliary!
\item the tree itself
\end{enumerate}
\begin{code}
-- | Parse one tree definition: family name, optional tree id (after a
--   colon), parameters and interface, tree type, the tree itself, then
--   an optional semantics and an optional trace.
--
--   After the tree is read, it is checked to have exactly one anchor
--   node and at most one foot node, and an initial tree is checked to
--   have no foot node; any violation makes the parser fail.
geniTreeDef :: Parser MTtree
geniTreeDef = do
  startPos    <- getPosition
  fam         <- identifier
  treeId      <- option "" (colon >> identifier)
  (ps, iface) <- geniParams
  treeType    <- initType <|> auxType
  body        <- geniTree
  let nodes       = T.flatten body
      feetCount   = length (filter ((== Foot) . gtype) nodes)
      anchorCount = length (filter ganchor nodes)
      -- fail with a message naming the tree and where it started
      complain msg = do
        _ <- setPosition startPos
        fail $ "In tree " ++ fam ++ ":" ++ treeId ++ " "
               ++ show startPos ++ ": " ++ msg
  when (anchorCount == 0) $
    complain "At least one node in an LTAG tree must be an anchor"
  when (anchorCount > 1) $
    complain "There can be no more than 1 anchor node in a tree"
  when (feetCount > 1) $
    complain "There can be no more than 1 foot node in a tree"
  when (theTypeIsInitial treeType && feetCount > 0) $
    complain "Initial trees may not have foot nodes"
  sem <- option Nothing (keywordSemantics >> fmap Just (squares geniSemantics))
  trc <- option [] (keyword TRACE >> squares (many identifier))
  return TT
    { params     = ps
    , pfamily    = fam
    , pidname    = treeId
    , pinterface = sortFlist iface
    , ptype      = treeType
    , tree       = body
    , ptrace     = trc
    , psemantics = sem
    }
 where
  theTypeIsInitial t = t == Initial
\end{code}
\subsection{Tree structure}
A tree is recursively defined as a node followed by an optional list of child
nodes. If there are any child nodes, they appear between curly brackets.
A node consists of
\begin{enumerate}
\item A node name
\item (optionally) a node type (anchor, lexeme, foot, subst).
\item (if node type is lexeme) a lexeme
\item (optionally) an adjunction constraint
(Notes: We only know about null adjunction constraints.
If the node has a type, it is assumed as having
a null adjunction constraint)
\end{enumerate}
Example of a tree:
\begin{verbatim}
n2 type:subst [cat:np idx:?Agent]![]
n3[cat:vp idx:?Event]![]
{
n4 aconstr:noadj [cat:v idx:?Event]![]
{
n5 anchor
}
\end{verbatim}
\begin{code}
-- | Parse a tree: a node optionally followed by its children between
--   curly braces.  Fails if an anchor, substitution or foot node has
--   any children.
geniTree :: Parser (T.Tree GNode)
geniTree = do
  node <- geniNode
  kids <- option [] (braces $ many geniTree) <?> "child nodes"
  let hasKids = not (null kids)
      forbidKids label isThatKind =
        when (isThatKind node && hasKids) $
          fail (label ++ " nodes may *not* have any children")
  forbidKids "Anchor" ganchor
  forbidKids "Substitution" ((== Subs) . gtype)
  forbidKids "Foot" ((== Foot) . gtype)
  return (T.Node node kids)
-- | Parse a single tree node: its name, an optional node type, the
--   lexemes (only for lex nodes), an optional null-adjunction constraint
--   (only for untyped and anchor nodes; every other node type gets the
--   constraint automatically), and the top and bottom feature structures
--   separated by a bang.  The features may be omitted on lex nodes.
--   Both feature lists are sorted before being stored.
geniNode :: Parser GNode
geniNode = do
  name    <- identifier
  kind    <- option "" ((keyword TYPE >> typeParser) <|> reserved ANCHOR)
  let isLex = kind == LEX
  lexemes <- if isLex
               then sepBy (stringLiteral <|> identifier) (symbol "|") <?> "some lexemes"
               else return []
  noAdj   <- if kind `elem` ["", ANCHOR]
               then adjConstraintParser
               else return True
  (rawTop, rawBot) <- if isLex
                        then option ([],[]) (try topbotParser)
                        else topbotParser
  return GN
    { gnname   = name
    , gtype    = toGType kind
    , gup      = sort rawTop
    , gdown    = sort rawBot
    , glexeme  = lexemes
    , ganchor  = (kind == ANCHOR)
    , gaconstr = noAdj
    , gorigin  = ""
    }
 where
  typeParser = choice [ try (symbol t) | t <- [ ANCHOR, FOOT, SUBST, LEX ] ]
  adjConstraintParser = option False (reserved ACONSTR_NOADJ >> return True)
  topbotParser = do
    top <- geniFeats <?> "top features"
    _   <- symbol "!"
    bot <- geniFeats <?> "bot features"
    return (top, bot)
  -- map the textual node type onto the node type constructor
  toGType k
    | k == ANCHOR || k == LEX = Lex
    | k == FOOT               = Foot
    | k == SUBST              = Subs
    | null k                  = Other
    | otherwise               = error ("unknown node type: " ++ k)
\end{code}
\subsection{TagElem}
For debugging purposes, it is often useful to be able to read TagElem's
directly. Note that this shares a lot of code with the macros above.
Hopefully, it is reasonably refactored.
FIXME: note that this is very rudimentary; we do not set id numbers,
parse polarities. You'll have to call
some of our helper functions if you want that functionality.
\begin{code}
-- | Parse zero or more tag elements up to the end of the input, and pass
--   the resulting list through 'setTidnums'.
geniTagElems :: Parser [TagElem]
geniTagElems = tillEof (fmap setTidnums (many geniTagElem))
-- | Parse one tag element: family name, optional tree id, an interface
--   (either the interface part of a parenthesised parameter list or a
--   plain feature structure), the tree type, the tree and a mandatory
--   semantics.  All other fields keep their values from 'emptyTE'.
geniTagElem :: Parser TagElem
geniTagElem = do
  fam   <- identifier
  tid   <- option "" (colon >> identifier)
  iface <- fmap snd geniParams <|> geniFeats
  ttyp  <- initType <|> auxType
  ttr   <- geniTree
  sem   <- keywordSemantics >> squares geniSemantics
  return emptyTE
    { idname     = tid
    , ttreename  = fam
    , tinterface = iface
    , ttype      = ttyp
    , ttree      = ttr
    , tsemantics = sem
    }
\end{code}
\section{Polarities}
The polarities parser is used for parsing extra polarity input from the
user. For more information, see chapter \ref{cha:Polarity}.
\begin{code}
-- | Parse a list of polarity entries (an optional polarity sign followed
--   by an identifier) up to the end of the input.  Entries sharing the
--   same identifier are combined with '!+!'.
geniPolarities :: Parser (Map.Map String Interval)
geniPolarities = tillEof (fmap collect (many entry))
 where
  collect = Map.fromListWith (!+!)
  entry = do
    charge <- geniPolarity
    key    <- identifier
    return (key, ival charge)
\end{code}
\fnlabel{geniPolarity} associates a numerical value to a polarity symbol,
that is, '+' or '-'.
\begin{code}
-- | Parse an optional polarity sign and return its numerical value:
--   1 for a plus sign, -1 for a minus sign, and 0 when no sign is
--   present (in which case no input is consumed).
geniPolarity :: Parser Int
geniPolarity = option 0 (plus <|> minus)
  where
    plus  = char '+' >> return 1
    -- a minus sign must yield a negative value; it previously yielded 1,
    -- which made it indistinguishable from a plus sign
    minus = char '-' >> return (-1)
\end{code}
\section{Morphology}
GenI has two types of morphological input.
\paragraph{morphinfo} A morphinfo file associates predicates with
morphological feature structures. Each morphological entry consists of
a predicate followed by a feature structure. For more information, see
chapter \ref{cha:Morphology}.
\begin{code}
-- | Parse a morphinfo file: zero or more entries, up to the end of the
--   input.
geniMorphInfo :: Parser [(String,Flist)]
geniMorphInfo = tillEof (many morphEntry)
-- | Parse one morphinfo entry: a predicate name followed by a feature
--   structure.
morphEntry :: Parser (String,Flist)
morphEntry =
  identifier >>= \predName ->
  geniFeats  >>= \fs ->
  return (predName, fs)
\end{code}
\paragraph{morphlexicon} A morphological lexicon is a table where each
entry is an inflected form followed by the lemma and the feature
structure to which it is associated. The table is whitespace-delimited.
\begin{code}
-- | Parse a morphological lexicon: zero or more entries, up to the end
--   of the input.
geniMorphLexicon :: Parser [MorphLexEntry]
geniMorphLexicon = tillEof (many morphLexiconEntry)
-- | Parse one morphological lexicon entry: the inflected form, the lemma
--   and a feature structure.  Each of the first two may be written as a
--   string literal or as a bare word.
morphLexiconEntry :: Parser (String, String, Flist)
morphLexiconEntry = do
  inflected <- withWhite wordOrLiteral
  lemma     <- withWhite wordOrLiteral
  feats     <- geniFeats
  return (inflected, lemma, feats)
 where
  wordOrLiteral = try stringLiteral <|> geniWord
\end{code}
\section{Generic GenI stuff}
\subsection{Lexer}
Some preliminaries about GenI formats in general - comments start with
\verb!%! There is also the option of using \verb'/* */' for embedded
comments.
\begin{code}
-- | The token parser shared by all GenI parsers, built from
--   'geniLanguageDef'.
lexer :: TokenParser ()
lexer = makeTokenParser geniLanguageDef
-- | The lexical conventions of the GenI formats: the comment markers,
--   the reserved names, and the characters allowed in identifiers
--   (alphanumerics plus a few punctuation characters, with the same set
--   for the first and for the following characters).
geniLanguageDef :: LanguageDef ()
geniLanguageDef = emptyDef
  { identStart      = identChar
  , identLetter     = identChar
  , commentLine     = "%"
  , commentStart    = "/*"
  , commentEnd      = "*/"
  , opLetter        = oneOf ""
  , reservedOpNames = [""]
  , reservedNames   =
      [ SEMANTICS, SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE
      , ANCHOR, SUBST, FOOT, LEX, TYPE, ACONSTR_NOADJ
      , INITIAL, AUXILIARY
      , BEGIN, END ]
  }
  where
    identChar = alphaNum <|> oneOf "_'+-."
-- | The whitespace parser of 'lexer'.
whiteSpace :: CharParser () ()
whiteSpace = P.whiteSpace lexer
-- | Token-level parsers.  'identifier', 'stringLiteral' and 'colon' come
--   straight from 'lexer'.  'looseIdentifier' reads identifier
--   characters directly from 'geniLanguageDef' without going through the
--   lexer identifier parser, and then skips trailing whitespace.
looseIdentifier, identifier, stringLiteral, colon :: CharParser () String
identifier    = P.identifier lexer
stringLiteral = P.stringLiteral lexer
colon         = P.colon lexer
looseIdentifier = do
  name <- rawIdent
  whiteSpace
  return name
 where
  rawIdent =
    ( identStart geniLanguageDef         >>= \hd ->
      many (identLetter geniLanguageDef) >>= \tl ->
      return (hd : tl)
    ) <?> "identifier"
-- | Bracketing combinators taken from 'lexer': run the given parser
--   between square brackets, curly braces or parentheses respectively.
squares, braces, parens :: CharParser () a -> CharParser () a
squares = P.squares lexer
braces = P.braces lexer
parens = P.parens lexer
-- | 'reserved' parses the given reserved word with 'lexer' and returns
--   that same word; 'symbol' is the symbol parser of 'lexer'.
reserved, symbol :: String -> CharParser () String
reserved name = do
  P.reserved lexer name
  return name
symbol = P.symbol lexer
\end{code}
\subsection{Keyword}
A keyword is nothing more than a reserved word followed by a colon.
We factor this into a separate function to account for whitespace.
\begin{code}
-- | Parse a reserved word immediately followed by a colon token, and
--   return the word.  The whole parser backtracks on failure, and is
--   labelled with the word and a colon in error messages.
keyword :: String -> Parser String
keyword k = try (reserved k >> colon >> return k) <?> (k ++ ":")
-- | The semantics keyword (see 'keyword').
keywordSemantics :: Parser String
keywordSemantics = keyword SEMANTICS
\end{code}
\subsection{Feature structures}
Feature structures take the form \verb!att:val! with only
whitespace to separate each attval pair. See \fnref{geniValue} for
details about what the values look like.
\begin{code}
-- | Parse a feature structure: square brackets around zero or more
--   attribute-value pairs.  Yields the empty list when no bracketed
--   structure is present.
geniFeats :: Parser Flist
geniFeats = option [] (squares (many geniAttVal))
-- | Parse one attribute-value pair: an attribute name, a colon and a
--   value.
geniAttVal :: Parser AvPair
geniAttVal = do
  att <- identifier <?> "an attribute"
  _   <- colon
  val <- geniValue <?> "a GenI value"
  return (att, val)
\end{code}
\fnlabel{geniParams} recognises a list of parameters optionally followed by a
bang (\verb$!$) and a list of attribute-value pairs. The whole thing is
wrapped in parentheses.
\textbf{Note:} sometimes people prefer not to use parameters - instead they
stick to using the interface. This is fine, but they should not forget the
bang separator.
\begin{code}
-- | Parse a parenthesised list of parameters, optionally followed by a
--   bang and a list of attribute-value pairs (the interface, empty when
--   the bang is absent).
geniParams :: Parser ([GeniVal], Flist)
geniParams = parens paramsAndInterface
 where
  paramsAndInterface = do
    ps    <- many geniValue <?> "some parameters"
    iface <- option [] (symbol "!" >> many geniAttVal)
    return (ps, iface)
\end{code}
\subsection{Semantics}
A semantics is simply a list of literals. A literal can take one of two
forms:
\begin{verbatim}
handle:predicate(arguments)
predicate(arguments)
\end{verbatim}
The arguments are space-delimited. Not providing a handle is
equivalent to providing an anonymous one.
\begin{code}
-- | Parse zero or more literals and return them sorted with 'sortSem'.
geniSemantics :: Parser Sem
geniSemantics = fmap sortSem (many (geniLiteral <?> "a literal"))
-- | Parse a literal: an optional handle (anonymous when absent), a
--   predicate, and a parenthesised list of arguments.
geniLiteral :: Parser Pred
geniLiteral = do
  h    <- option GAnon handleParser <?> "a handle"
  p    <- geniValue <?> "a predicate"
  args <- parens (many geniValue) <?> "some parameters"
  return (h, p, args)
 where
  -- a value directly followed by a colon character; backtracks so that
  -- a predicate without a handle can still be parsed
  handleParser = try (geniValue >>= \h -> char ':' >> return h)
\end{code}
\subsection{Lexical semantics}
A lexical semantics is almost exactly the same as a regular semantics,
except that each variable may be preceded by a polarity symbol. When
we figure out how to automate the detection of lexical semantic
polarities, we can start using a regular semantics again.
\begin{code}
-- | Parse zero or more lexical literals, returning the literals and
--   their polarity lists as two parallel lists.
geniLexSemantics :: Parser (Sem, [[Int]])
geniLexSemantics = fmap unzip (many (geniLexLiteral <?> "a literal"))
-- | Parse a literal whose handle and arguments may each carry a polarity
--   sign.  The result pairs the plain literal with the list of
--   polarities: the handle polarity first (0 when there is no handle),
--   then one per argument.
geniLexLiteral :: Parser (Pred, [Int])
geniLexLiteral = do
  (h, hPol) <- option (GAnon, 0) (polHandle <?> "a handle")
  p         <- geniValue <?> "a predicate"
  polArgs   <- parens (many geniPolValue) <?> "some parameters"
  return ((h, p, map fst polArgs), hPol : map snd polArgs)
 where
  polHandle = try (geniPolValue >>= \h -> colon >> return h)
-- | Parse an optional polarity sign followed by a value; the value comes
--   first in the result.
geniPolValue :: Parser (GeniVal, Int)
geniPolValue =
  geniPolarity >>= \pol ->
  geniValue    >>= \val ->
  return (val, pol)
\end{code}
\subsection{Miscellaneous}
\fnlabel{geniValue} is recognised both in feature structures and in the
GenI semantics.
\begin{enumerate}
\item As of geni 0.8, variables are prefixed with a question
mark.
\item The underscore, \verb!_!, and \verb!?_! are treated as anonymous
variables.
\item Atomic disjunctions are separated with a pipe, \verb!|!. Only
constants may be separated by atomic disjunction
\item Anything else is just a constant
\end{enumerate}
\begin{code}
-- | Parse a GenI value.  The alternatives are tried in this order: an
--   anonymous variable (an underscore, optionally preceded by a question
--   mark), a constant or pipe-separated disjunction of constants, and a
--   variable (a question mark followed by an identifier).
geniValue :: Parser GeniVal
geniValue = (try anonymous <?> "_ or ?_")
        <|> (constants <?> "a constant or atomic disjunction")
        <|> (variable <?> "a variable")
 where
  qmark = "?"
  constants :: Parser GeniVal
  constants =
    fmap GConst (sepBy1 (looseIdentifier <|> stringLiteral) (symbol "|"))
  variable :: Parser GeniVal
  variable = symbol qmark >> fmap GVar identifier
  anonymous :: Parser GeniVal
  anonymous = optional (symbol qmark) >> symbol "_" >> return GAnon
\end{code}
\begin{code}
-- | Skip leading whitespace, run the given parser, and then require the
--   end of the input.  Returns the result of the given parser.
tillEof :: Parser a -> Parser a
tillEof p =
  whiteSpace >> p >>= \result -> eof >> return result
\end{code}