# Copyright (C) 2011, Peter Ljunglof. All rights reserved.
# This file is part of the FraCaS Treebank.
#
# The FraCaS Treebank is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# The FraCaS Treebank is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with the FraCaS Treebank. If not, see <https://www.gnu.org/licenses/>.
"""
Usage: python build_fracasbank.py (xml/pl) FraCaSBankI.gf FraCaSBank{Lang}.gf ...
This script compiles the FraCaS treebank into XML or Prolog format.
The result is printed to standard output.
"""
import re
def read_treebank(treefile):
    """Load the abstract trees of the FraCaS treebank from a GF source file.

    Only lines of the form ``lin <id> = <rhs> ;`` are used; every other
    line is ignored.  The result maps each sentence ID to a one-entry dict
    ``{'tree': value}`` where ``value`` is

    * the right-hand side itself (a string) when it is a single word,
    * ``None`` when it is ``variants {}``,
    * the result of ``GFTree.parse`` for anything else.

    A later definition of the same ID replaces an earlier one.
    """
    lin_rule = re.compile(r"^lin +(\w+) *= *(.+?) *; *$")
    single_word = re.compile(r"^\w+$")
    empty_variants = re.compile(r"^ *variants *\{ *\} *$")

    trees = {}
    with open(treefile) as source:
        for text in source:
            found = lin_rule.match(text)
            if found is None:
                continue
            sid = found.group(1)
            rhs = found.group(2)
            if single_word.match(rhs):
                value = rhs
            elif empty_variants.match(rhs):
                value = None
            else:
                value = GFTree.parse(rhs)
            trees[sid] = {'tree': value}
    return trees
def linearize_language(treebank, langfile, lang):
    """Linearize every sentence of the treebank in one language using GF.

    Runs ``gf --quiet --run <langfile>`` and sends it, for each sentence ID
    in sorted order, a ``ps`` command that echoes the marker ``@ <id>``
    followed by an ``l`` command that linearizes ``<id>``.  The output is
    then cut at every ``@``: the first word of a chunk is taken as the
    sentence ID and the stripped remainder is stored as
    ``treebank[sid][lang]``.  Chunks with nothing after the ID (no
    linearization was produced) are skipped.

    The treebank is modified in place and nothing is returned.  A chunk
    whose first word is not a key of the treebank raises KeyError; this
    would happen if a linearization itself contained an ``@``.
    """
    from subprocess import Popen, PIPE

    # Build the whole command script in one go instead of repeated +=.
    commands = "".join(
        'ps "@ %s"\nl %s\n' % (sid, sid) for sid in sorted(treebank)
    )

    # universal_newlines=True opens the pipes in text mode, so the same
    # str-based code works on Python 2 and 3 and line endings are normalised.
    gf = Popen(
        ["gf", "--quiet", "--run", langfile],
        stdin=PIPE,
        stdout=PIPE,
        universal_newlines=True,
    )
    output, _error = gf.communicate(commands)

    for chunk in output.split("@"):
        try:
            sid, lin = chunk.split(None, 1)
        except ValueError:
            # Fewer than two fields: leading junk or an ID without text.
            continue
        treebank[sid][lang] = lin.strip()
def print_treebank(treebank, outformat):
    """Write the treebank to standard output as XML or as Prolog facts.

    ``treebank`` maps sentence IDs to dicts that hold the entry ``'tree'``
    (a string, a GFTree or None) plus one entry per language name whose
    value is the linearized sentence.  ``outformat`` is ``'xml'`` or
    ``'pl'`` (case-insensitive); any other value raises ValueError.

    Prolog output is a ``discontiguous`` directive, two comment lines, and
    then per sentence the ``tree/2`` fact (if there is a tree), the
    ``sent/3`` facts in language order, and a blank line.

    XML output is a ``<treebank>`` document with one ``<sentence id="...">``
    per sentence, containing ``<tree ref="..."/>`` for a string tree or
    ``<tree>...</tree>`` wrapping ``GFTree.xmlstr()``, followed by one
    ``<lin lang="...">`` element per language.  Attribute values and
    sentence text are XML-escaped.

    NOTE(review): the XML templates in this function had lost their markup
    and placeholders (formatting them raised TypeError).  The element and
    attribute names used here are a reconstruction -- confirm them against
    whatever consumes the XML.
    """
    import sys
    from xml.sax.saxutils import escape, quoteattr

    fmt = outformat.lower()
    pro = fmt == "pl"
    xml = fmt == "xml"
    if not (pro or xml):
        # Explicit raise: an assert would be stripped under ``python -O``.
        raise ValueError("Unknown output format: '%s'" % outformat)

    # Byte and unicode strings on Python 2, plain str on Python 3.
    string_types = (str, type(u""))

    def emit(line=""):
        sys.stdout.write(line + "\n")

    if xml:
        # assumes the GF linearizations are UTF-8 encoded -- TODO confirm
        emit('<?xml version="1.0" encoding="UTF-8"?>')
        emit("<treebank>")
    if pro:
        emit(":- discontiguous tree/2, sent/3.")
        emit("%% tree(?SentenceID, ?Tree)")
        emit("%% sent(?SentenceID, ?Language, ?Sentence)")

    for sid, item in sorted(treebank.items()):
        tree = item['tree']
        if xml:
            emit("<sentence id=%s>" % quoteattr(sid))

        if isinstance(tree, string_types):
            if xml:
                emit("<tree ref=%s/>" % quoteattr(tree))
            if pro:
                emit("tree(%s, %s)." % (plquote(sid), plquote(tree)))
        elif isinstance(tree, GFTree):
            if xml:
                emit("<tree>%s</tree>" % tree.xmlstr())
            if pro:
                emit("tree(%s, %s)." % (plquote(sid), tree.prologstr()))
        # A tree of None (``variants {}``) produces no tree output.

        for lang, sent in sorted(item.items()):
            if lang == 'tree':
                continue
            if xml:
                emit("<lin lang=%s>%s</lin>" % (quoteattr(lang), escape(sent)))
            if pro:
                emit("sent(%s, %s, %s)." % (
                    plquote(sid), plquote(lang.lower()), plquote(sent)))

        if xml:
            emit("</sentence>")
        if pro:
            emit()

    if xml:
        emit("</treebank>")
def plquote(atom):
    """Return *atom* as Prolog source text, adding '...' quotes if needed.

    A string made only of digits, or of a lowercase letter a-z followed by
    letters, digits and underscores, is returned unchanged.  Anything else
    (including the empty string) is wrapped in single quotes, with every
    backslash and single quote preceded by a backslash.
    """
    # \Z instead of $: "$" also matches just before a trailing newline, which
    # would let a string such as "abc\n" through unquoted.
    if re.match(r"(\d+|[a-z][a-zA-Z0-9_]*)\Z", atom):
        return atom
    return "'" + atom.replace("\\", "\\\\").replace("'", "\\'") + "'"
class GFTree(object):
    """A GF abstract syntax tree: a node label plus a list of child trees."""

    def __init__(self, node, children=()):
        """Create a tree labelled *node*; *children* is copied into a new list."""
        self.node = node
        self.children = list(children)

    @classmethod
    def parse(cls, descr):
        """Parse a tree written as ``f (g x) y`` into a tree object.

        Parentheses are treated as separate tokens and everything else is
        split on whitespace.  If the first and last tokens are ``(`` and
        ``)`` that outer pair is dropped.  Within each group the first
        token is the node label; every other bare token becomes a leaf
        child and every parenthesized group becomes a subtree.

        Trees are built with ``cls``, so calling this on a subclass yields
        instances of that subclass.  Raises ValueError if the description
        is empty, has unbalanced parentheses, contains an empty group, or
        has a group that starts with a parenthesized subtree.
        """
        tokens = descr.replace("(", " ( ").replace(")", " ) ").split()
        if not tokens:
            raise ValueError("empty tree description: %r" % (descr,))
        if tokens[0] == "(" and tokens[-1] == ")":
            tokens = tokens[1:-1]

        # Stack of groups under construction; the bottom one is the root.
        stack = [[]]
        for token in tokens:
            if token == "(":
                stack.append([])
            elif token == ")":
                if len(stack) < 2:
                    raise ValueError(
                        "unbalanced ')' in tree description: %r" % (descr,))
                group = stack.pop()
                stack[-1].append(cls._from_group(group, descr))
            elif not stack[-1]:
                # First token of a group is the label, kept as a string.
                stack[-1].append(token)
            else:
                stack[-1].append(cls(token))
        if len(stack) != 1:
            raise ValueError(
                "unbalanced '(' in tree description: %r" % (descr,))
        return cls._from_group(stack[0], descr)

    @classmethod
    def _from_group(cls, group, descr):
        """Build a tree from one parsed group ``[label, child, ...]``."""
        if not group:
            raise ValueError("empty group in tree description: %r" % (descr,))
        if isinstance(group[0], GFTree):
            raise ValueError(
                "group does not start with a node label: %r" % (descr,))
        return cls(group[0], group[1:])

    def __str__(self):
        return "(" + " ".join(map(str, [self.node] + self.children)) + ")"

    def __repr__(self):
        return (type(self).__name__ + "(" +
                ", ".join(map(repr, [self.node] + self.children)) + ")")

    def xmlstr(self):
        """Return the tree as nested XML elements.

        Each tree becomes ``<node><fun>LABEL</fun>CHILDREN</node>`` with the
        label XML-escaped.

        NOTE(review): the markup literals of this method had been reduced
        to empty strings, so it returned the labels run together with no
        structure.  The element names here are a reconstruction -- confirm
        them against whatever consumes the XML.
        """
        from xml.sax.saxutils import escape

        return ("<node><fun>" + escape(self.node) + "</fun>" +
                "".join(child.xmlstr() for child in self.children) +
                "</node>")

    def prologstr(self):
        """Return the tree as the Prolog term ``t(Label, [Child, ...])``."""
        return ("t(" + plquote(self.node) + ", [" +
                ", ".join(child.prologstr() for child in self.children) +
                "])")
if __name__ == '__main__':
    # Command-line entry point:
    #   build_fracasbank.py (xml|pl) <treefile> [<langfile> ...]
    import os.path
    import sys

    # The output format and the treebank file are mandatory; otherwise show
    # the usage text.  sys.exit is used rather than the bare ``exit`` helper,
    # which is injected by the ``site`` module and is not always available.
    if len(sys.argv) < 3:
        sys.exit(__doc__)

    outformat, treefile = sys.argv[1:3]
    languages = sys.argv[3:]

    # The language name of each file is whatever follows the longest prefix
    # (compared character by character) shared by the treebank file and all
    # language files, minus a trailing ".gf".
    basename = os.path.commonprefix(sys.argv[2:])

    treebank = read_treebank(treefile)
    for langfile in languages:
        lang = langfile[len(basename):]
        if lang.endswith(".gf"):
            lang = lang[:-3]
        linearize_language(treebank, langfile, lang)
    print_treebank(treebank, outformat)