#! /usr/bin/perl -w # # Perl script to process OALD machine-readable ASCII file # into a GF lexicon # # Usage: ./asc2gf < ascii_0710-1.txt # # Bjorn Bringert 2008, # based on asc2lex by # Matthew Purver, 11/2001 use strict; my %irregular_verbs = (); my %words = (); my $irreg_eng = "../../english/IrregEng.gf"; open(IRREG_ENG,"$irreg_eng") or die "Could not open $irreg_eng\n"; while () { if (s/\s*([a-z\d]+)_V\s*=.*/$1/) { chomp; $irregular_verbs{$_} = 1; } } close IRREG_ENG; print "Known irregular verbs from $irreg_eng:\n"; print join(",", keys %irregular_verbs) . "\n"; # skip header section while ( ) { last if /<\/TEIHEADER>/; } # read a line from stdin while ( my $line = ) { # remove SGML tags $line =~ s/<[^<>]+>//g; # split line into fields according to spec (line may be empty now) if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) { my ( $word, $pos, $cat ) = ( $1, $2, $3 ); # trim white space for ( ( $word, $pos, $cat ) ) { s/\s*$//; } # make word lower-case $word =~ tr/A-Z/a-z/; # lower case # translate OALD diacritics $word =~ s/~n/ñ/g; $word =~ s/ _ $name =~ s/-/_/g; # - -> _ $name =~ s/\./_/g; # . -> _ $name =~ s/^'//; # drop initial ' # get PoS & subcat info my @pos = split( /,/, $pos ); $cat =~ s/,/\',\'/g; ( $cat = "\'$cat\'" ) unless ( $cat eq '' ); foreach ( @pos ) { my ( $pcode, $infl, $freq )=split(//); # for verbs, get inflected forms if ( $pcode =~ /^[GHIJ]/ ) { $pos = 'verb'; my ($vbz, $vbg, $vbd); # if this is a root form, work out the inflected forms if ( $infl =~ /^\d/ ) { if ( $infl == 0 ) { ( $vbz = $word ) =~ s/$/s/; ( $vbg = $word ) =~ s/$/ing/; ( $vbd = $word ) =~ s/$/ed/; } elsif ( $infl == 1 ) { ( $vbz = $word ) =~ s/$/es/; ( $vbg = $word ) =~ s/$/ing/; ( $vbd = $word ) =~ s/$/ed/; } elsif ( $infl == 2 ) { ( $vbz = $word ) =~ s/e$/es/; ( $vbg = $word ) =~ s/e$/ing/; ( $vbd = $word ) =~ s/e$/ed/; } elsif ( $infl == 3 ) { ( $vbz = $word ) =~ s/y$/ies/; ( $vbg = $word ) =~ s/y$/ying/; ( $vbd = $word ) =~ s/y$/ied/; } elsif ( $infl == 4 ) { ( $vbz = $word ) =~ s/$/s/; ( $vbg = $word ) =~ s/(\w)$/$1$1ing/; ( $vbd = $word ) =~ s/(\w)$/$1$1ed/; } elsif ( $infl == 5 ) { # for irregulars, just mark as such for now, we'll guess later $vbz = 'IRREG'; $vbg = 'IRREG'; $vbd = 'IRREG'; } my $lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\""; # try to use a verb from IrregEng if ( $infl == 5 ) { for (my $i = 0; $i < length($word) - 1; $i++) { my $suffix = substr($word, $i); if ($irregular_verbs{$suffix}) { if ($i == 0) { $lin = "IrregEng.${name}_V"; } else { my $prefix = substr($word, 0, $i); $lin = "mkV \"$prefix\" IrregEng.${suffix}_V"; } last; } } } if ($pcode eq 'G') { #add_word("${name}_VX", "mkVX ($lin)"); print STDERR "Ignoring anomalous verb: $name\n"; } if ($pcode eq 'I' || $pcode eq 'J') { add_word("${name}_V", "$lin"); } if ($pcode eq 'H' || $pcode eq 'J') { add_word("${name}_V2", "mkV2 ($lin)"); } } # if this is an inflected form, save for guessing irregulars later elsif ( $infl =~ /^a/ ) { #push( @vbz, $word ); } elsif ( $infl =~ /^b/ ) { #push( @vbg, $word ); } elsif ( $infl =~ /^c/ ) { #push( @vbd, $word ); } elsif ( $infl =~ /^d/ ) { #push( @vbn, $word ); } } # for nouns, get plural form elsif( $pcode =~ /^[KLMNY]/ ) { $pos = 'noun'; $pcode =~ s/^K/count/; $pcode =~ s/^L/mass/; $pcode =~ s/^M/both/; $pcode =~ s/^N/proper/; if ( $pcode =~ /^Y/ ) { $pcode = 'count' if $infl =~ /^[>\)\]]/; $pcode = 'mass' if $infl =~ /^\}/; $pcode = 'proper' if $infl =~ /^[:=~]/; } # if this is a singular form, work out plural form unless ( $infl =~ /^j/ ) { my $pl = '-'; if ( $infl eq '6' ) { ( $pl = $word ) =~ s/$/s/; } elsif ( $infl eq '7' ) { ( $pl = $word ) =~ s/$/es/; } elsif ( $infl eq '8' ) { ( $pl = $word ) =~ s/y$/ies/; } elsif ( $infl =~ /^[9k\]]/ ) { $pl = $word; } elsif ( $infl =~ /^i/ ) { # for irregulars, let's just make a guess and mark with '*' # this could be done better, as for verbs, but I can't be bothered now $pl = $word; ( $pl =~ s/^((wo)?m)an/$1en\*/ ) or ( $pl =~ s/man(-|$)/men$1\*/ ) or ( $pl =~ s/-in-law/s-in-law\*/ ) or ( $pl =~ s/um$/a\*/ ) or ( $pl =~ s/us$/i\*/ ) or ( $pl =~ s/a$/ae\*/ ) or ( $pl =~ s/on$/a\*/ ) or ( $pl =~ s/is$/es\*/ ) or ( $pl =~ s/o$/i\*/ ) or ( $pl =~ s/child$/children\*/ ) or ( $pl =~ s/oot$/eet\*/ ) or ( $pl =~ s/ooth$/eeth\*/ ) or ( $pl =~ s/([lm])ouse$/$1ice\*/ ) or ( $pl =~ s/f(e)?$/ves\*/ ) or ( $pl =~ s/[ei]x$/ices\*/ ) or ( $pl =~ s/eau$/eaux\*/ ) or ( $pl = 'IRREG' ); } # if plural-only, swap root form & plural elsif ( $infl =~ /^\)/ ) { $pl = $word; $word = '-'; } ( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' ); my $comment = ""; if ( $word eq '-' ) { $comment .= " {- FIXME: no singular form -}"; } if ( $pl eq '-' ) { $comment .= " {- FIXME: no plural form -}"; } if ( $pl =~ s/\*$// ) { $comment .= " {- FIXME: guessed plural form -}"; } if ( $pcode eq 'proper' ) { add_word("${name}_PN", "mkPN \"$word\""); } else { add_word("${name}_N", "mkN \"$word\" \"$pl\"$comment"); } } } # for adjectives, get comparative & superlative forms elsif( $pcode =~ /^O/ ) { $pos = 'adj'; # if this is root form, work out inflected forms unless ( $infl =~ /^[rs]/ ) { my ($comp, $sup); if ( $infl =~ /^[Apqt]/ ) { $comp = $sup = '-'; } elsif ( $infl =~ /^B/ ) { ( $comp = $word ) =~ s/$/r/; ( $sup = $word ) =~ s/$/st/; } elsif ( $infl =~ /^C/ ) { ( $comp = $word ) =~ s/$/er/; ( $sup = $word ) =~ s/$/est/; } elsif ( $infl =~ /^D/ ) { ( $comp = $word ) =~ s/y$/ier/; ( $sup = $word ) =~ s/y$/iest/; } elsif ( $infl =~ /^E/ ) { # for irregulars, let's just have a guess and mark with '*' # (there aren't very many of these) ( $comp = $word ) =~ s/(\w)$/$1$1er\*/; ( $sup = $word ) =~ s/(\w)$/$1$1est\*/; } $infl =~ s/^[ABCDE]/normal/; $infl =~ s/^p/pred/; $infl =~ s/^q/attr/; $infl =~ s/^t/affix/; if ( $comp eq '-' ) { add_word("${name}_A", "compoundA (mkA \"$word\")"); } else { add_word("${name}_A", "mkA \"$word\" \"$comp\""); } } } # adverb elsif( $pcode =~ /^P/ ) { $pos = 'adv'; $infl =~ s/^[u\+]/normal/; $infl =~ s/^w/whrel/; $infl =~ s/^v/whq/; add_word("${name}_Adv", "mkAdv \"$word\""); } # pronoun elsif( $pcode =~ s/^Q/_/ ) { $pos = 'pron'; $infl =~ s/^x/normal/; $infl =~ s/^y/whq/; $infl =~ s/^z/whrel/; my $class = '_'; # reflexive pronouns if ( ( $word =~ /self$/ ) or ( $word =~ /selves$/ ) ) { $pcode = 'acc'; } # accusative personal pronouns if ( ( $word =~ /^him/ ) or ( $word =~ /^her/ ) or ( $word =~ /^them/ ) or ( $word eq 'us' ) or ( $word eq 'thee' ) or ( $word eq 'me' ) ) { $pcode = 'acc'; $class = 'per'; } # nominative personal pronouns if ( ( $word eq 'he' ) or ( $word eq 'she' ) or ( $word eq 'they' ) or ( $word eq 'we' ) or ( $word eq 'thou' ) or ( $word eq 'i' ) ) { $pcode = 'nom'; $class = 'per'; } # other personal pronouns if ( ( $word =~ /.+one/ ) or ( $word =~ /one.+/ ) or ( $word =~ /body/ ) or ( $word =~ /^you/ ) or ( $word =~ /^who/ ) ) { $class = 'per'; } # non-personal pronouns if ( $word =~ /thing/ ) { $class = 'nper'; } # otherwise case/person info will be '_' (anon variable) # add full spec to @pron array #push( @pron, "$pos( \'$word\', $pcode, $infl, $class ).\n" ); } # for determiners, leave anon variable as placeholder for semantics elsif( $pcode =~ /^[RS]/ ) { $pos = 'det'; $pcode =~ s/^R/def/; $pcode =~ s/^S/indef/; #add_word("${name}_Det","mkDeterminer \"$word\""); } # for prepositions - nothing to say elsif( $pcode =~ s/^T/prep/ ) { $pos = 'prep'; add_word("${name}_Prep","mkPrep \"$word\""); } # for conjunctions - nothing to say elsif( $pcode =~ s/^V/conj/ ) { $pos = 'conj'; add_word("${name}_Conj","mkConj \"$word\""); } # for miscellaneous, leave '-' as placeholder for illocutionary info elsif( $pcode =~ /^[UWXZ]/ ) { $pos = 'misc'; #push( @prefix, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^U/prefix/ ); #push( @interj, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^W/interj/ ); #push( @partcl, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^X/partcl/ ); #push( @unknown, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^Z/unknown/ ); } } } } my $absfile = "Oald.gf"; my $cncfile = "OaldEng.gf"; my $abs_structfile = "OaldStructural.gf"; my $cnc_structfile = "OaldStructuralEng.gf"; open (ABS, '>', $absfile); open (CNC, '>', $cncfile); open (ABS_STRUCTURAL, '>', $abs_structfile); open (CNC_STRUCTURAL, '>', $cnc_structfile); # print a nice comment at the top my $header = "-- English lexicon for GF, produced from:\n" . "-- Oxford advanced learner's dictionary of current English:\n" . "-- expanded 'computer usable' version compiled by Roger Mitton\n" . "-- The computer usable version is transcribed from:\n" . "-- Oxford advanced learner's dictionary of current English\n" . "-- A.S. Hornby ; with the assistance of A.P. Cowie [and] J. Windsor Lewis.\n" . "-- 3rd. ed., London : Oxford University Press, 1974.\n" . "-- Distributed as 'dict0710' by:\n" . "-- Oxford Text Archive\n" . "-- Oxford University Computing Services\n" . "-- 13 Banbury Road\n" . "-- Oxford\n" . "-- OX2 6NN\n" . "-- Under these conditions:\n" . "-- Freely available for non-commercial use provided that this header is\n" . "-- included in its entirety with any copy distributed.\n" . "--\n" . "-- GF version generated by asc2gf, Bjorn Bringert Nov 2008\n" . "-- based on asc2lex, Matthew Purver Nov 2001\n" . "-- http://www.stanford.edu/~mpurver/software.html\n" . "\n"; print ABS $header; print ABS "abstract Oald = Cat ** {\n"; print CNC $header; print CNC "--# -path=.:alltenses\n"; print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng, IrregEng in {\n"; print ABS_STRUCTURAL $header; print ABS_STRUCTURAL "abstract OaldStructural = Cat ** {\n"; print CNC_STRUCTURAL $header; print CNC_STRUCTURAL "--# -path=.:alltenses\n"; print CNC_STRUCTURAL "concrete OaldStructuralEng of OaldStructural = CatEng ** open ParadigmsEng in {\n"; foreach my $name (sort (keys %words)) { (my $cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/; my $lin = $words{$name}; if ( $cat =~ /^(A)|(N)|(V)|(V2)$/ ) { print ABS "fun $name : $cat;\n"; print CNC "lin $name = $lin;\n"; } else { print ABS_STRUCTURAL "fun $name : $cat;\n"; print CNC_STRUCTURAL "lin $name = $lin;\n"; } } print ABS "}"; print CNC "}"; print ABS_STRUCTURAL "}"; print CNC_STRUCTURAL "}"; close(ABS_STRUCTURAL); close(CNC_STRUCTURAL); close(ABS); close(CNC); print "\nWrote open lexicon to $absfile and $cncfile\n"; print "Wrote closed lexicon to $abs_structfile and $cnc_structfile\n"; sub add_word { my ($name,$lin) = @_; if (exists $words{$name}) { print STDERR "Duplicate word: $name\n"; } else { $words{$name} = $lin; } }