blob: 03c3ccda374c3b509947f1bf7e6272b4d8d4e5ee [file] [log] [blame]
#!/usr/localbin/perl
use Getopt::Long;
use File::Basename;
use lib dirname($0);
# ---------------------------------------------------------------------------
# Command-line handling and construction of the list of result files.
#
# Options (parsed by Getopt::Long):
#   --add FIELD    append FIELD to the columns written to the .utd file
#                  (may be given more than once)
#   --invocab      treat every utterance as in-vocabulary
#   --quiet        sets $quiet (only referenced by a commented-out progress
#                  message in the visible source)
#   --semantic     also accept a result when its first MEANING equals the
#                  reference "parsed_ortho"; adds a parsed_ortho column
#   --altsem FILE  load alternative CSem tags from FILE
#
# Remaining arguments are result files, or a single "@listfile" argument
# naming a file that contains one result file per line.
# ---------------------------------------------------------------------------
$assume_invocab = 0;    # _when_semantics_missing
$rc = GetOptions(
    "add=s"    => \@additional_fields,
    "invocab"  => \$assume_invocab,
    "quiet"    => \$quiet,
    "semantic" => \$try_semantic_validation,
    "altsem=s" => \$altsemfile,
);

# Columns written for every utterance, in output order.  This stays a
# file-level lexical because the subs defined further down refer to it.
my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");
if ($try_semantic_validation) {
    push(@additional_fields, "parsed_ortho");
}
push(@fields, @additional_fields);

# Lookup table of the extra columns, used to decide which optional values
# have to be computed per utterance.
foreach my $additional_field (@additional_fields) {
    $additional_fieldh{$additional_field}++;
}

load_altsemfile($altsemfile) if ($altsemfile);
$| = 1;    # unbuffered STDOUT so the per-file summary appears immediately

if (@ARGV && $ARGV[0] =~ /^@/) {
    # "@listfile": read the result file names from the list file directly
    # instead of shelling out to cat(1).
    $flist = substr($ARGV[0], 1);
    open(my $flist_fh, '<', $flist) or die "error opening $flist: $!\n";
    @resfiles = <$flist_fh>;
    close($flist_fh);
    s/\s+$// for @resfiles;                      # strip newlines / trailing blanks
    @resfiles = grep { length($_) } @resfiles;   # ignore blank lines
} else {
    @resfiles = @ARGV;
}
# ---------------------------------------------------------------------------
# Main loop: for every result file, parse it line by line, write one record
# per utterance to "<base>.utd" and print a recognition-rate summary line.
#
# An utterance starts at a "D: <file>" line; the record collected so far is
# scored (process) and written (dump_record) when the next "D:" line or the
# end of the file is reached.  The column header is written at the first
# "D:" line of each file.
#
# %token, %results, $hUTD and the UTD handle are deliberately left as package
# variables (as in the original script) so that code outside this loop which
# refers to them by name keeps working.
# ---------------------------------------------------------------------------
foreach $resfile (@resfiles) {
    ($base = $resfile) =~ s/\.[a-z]+$//i;
    $utdfile = "$base.utd";
    # Opening the output truncates it; refuse to destroy the input.
    die "error: $resfile would be overwritten by its own output\n" if ($utdfile eq $resfile);
    # print "processing $resfile to $utdfile\n" unless($quiet);
    open(my $res_fh, '<', $resfile) or die "error opening $resfile: $!\n";
    open(UTD, '>', $utdfile) or die "error opening $utdfile: $!\n";
    $hUTD = \*UTD;
    undef %results;
    while (<$res_fh>) {
        s/\s+$//;
        s/^\s+//;
        if (/^D:\s+(\S+)\s*$/) {    # same as CREC
            my $file = $1;
            # Flush the previous utterance, or emit the header if this is
            # the first one.  (A plain truth test replaces "defined %token",
            # which newer perls reject at compile time.)
            if (%token) {
                process(\%token, \%results);
                dump_record($hUTD, \%token);
            } else {
                dump_header($hUTD);
            }
            undef %token;
            $token{file} = $file;
            # Only consult %gender when the speaker id is really present;
            # otherwise $1 would still hold the capture of the "D:" match.
            # NOTE(review): %gender is not populated anywhere in the visible
            # source -- confirm where it is expected to come from.
            $token{gender} = ($file =~ /ENU-(\d\d\d)-/) ? $gender{$1} : undef;
            # NOTE(review): get_snr() is not defined in the visible source.
            $token{"snr"} = get_snr($file) if ($additional_fieldh{"snr"});
            $token{"snrr"} = sprintf("%.2d", int(get_snr($file) / 5 + 0.5) * 5) if ($additional_fieldh{"snrr"});
        } elsif (/^C:\s+(.*)$/) {    # same as CREC
            $token{ortho} = normalize($1);
        } elsif (/^\s*(\S+) = (.*)$/) {
            my ($augkey, $augval) = ($1, $2);
            if ($augkey eq "feedback") {
                $token{parsed_ortho} = $augval;
                $token{invocab}++;
            }
        } elsif (/^R:\s+(.*)$/) {    # same as CREC
            if (/<rejected/i || /<FAILED/i) {
                $token{rejrslt} = "f";
            } else {
                # $token{topchoice} = $1;
                $token{rejrslt} = "a";
            }
        } elsif (/^Sem[^:]+: invocab=(\d)/) {    # same as CREC
            # NOTE(review): the captured digit is ignored and invocab is
            # always set to 1 -- confirm this is intended.
            $token{invocab} = 1;
        } elsif (/^CSem:\s+([a-z]+.*)\s*$/i) {
            $token{parsed_ortho} = $1;
        } elsif (/^Sem:(\s+)(\S+)/) {    # same as CREC
            $token{invocab} = 0;
        } elsif (/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
            $token{choices}[0] = $1;
        } elsif (/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            # Use the captures of this match directly; re-matching the line
            # with a stricter pattern could fail and leave the index in $1.
            $token{choices}[$1] = $2;
        } elsif (/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
            my ($i, $meaning) = ($1, $2);
            $meaning =~ s/\s+$//;
            $token{meanings}[$i] = $meaning;
        } elsif (/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
            # The second bracketed number is stored as the score of choice $1.
            my ($i, $score) = ($1, $2);
            $token{scores}[$i] = $score;
        } elsif (/^RAW SCORE\s+:\s+\'(.*)\'/) {
            $token{topscore} = $1;
        } elsif (/^gdiff\s+(.*)$/) {
            $token{gdiff} = $1;
        } elsif (/^sd13\s+(.*)$/) {
            $token{sd13} = $1;
        } elsif (/^spf\s+(.*)$/) {
            $token{spf} = $1;
        } elsif (/^abs\s+(.*)$/) {
            $token{abs} = $1;
        } elsif (/^gdiffpf\s+(.*)$/) {
            $token{gdiffpf} = $1;
        } elsif (/^sd\s+(.*)$/) {
            $token{sd} = $1;
        } elsif (/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
            $token{conf} = $1;
        }
    }
    # Flush the last utterance of the file.
    if (%token) {
        process(\%token, \%results);
        dump_record($hUTD, \%token);
    }
    close(UTD) or warn "error closing $utdfile: $!\n";
    close($res_fh);
    undef %token;

    # Recognition rate in percent, rounded to one decimal.  A zero/unset
    # total is replaced by 1 to avoid dividing by zero.
    $results{total} ||= 1;
    my $rr = $results{correct} / $results{total} * 100;
    $rr = int($rr * 10 + 0.5) / 10;
    printf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct}, $results{total}, $results{numoovs});
}
# process(\%token, \%results)
#
# Scores one utterance record in place and updates the running totals.
#
#   $token   - hash ref for the utterance.  Reads ortho, choices, meanings,
#              invocab, parsed_ortho, file and the optional topchoice /
#              topscore overrides; sets correct ("0"/"1"), rankc and gender.
#   $results - hash ref of counters; total, correct and numoovs are
#              incremented as appropriate.
#
# Also reads the package variables $assume_invocab, $altsemfile,
# $try_semantic_validation and %csemtags.
#
# Text comparison is case-insensitive, treats "_" as a space and collapses
# runs of whitespace.
sub process
{
    my $token   = shift(@_);
    my $results = shift(@_);

    $token->{invocab} = 1 if ($assume_invocab);

    # Optional overrides for the first choice / score.  These are read from
    # the record that was passed in, not from a global %token.
    if (defined $token->{topchoice}) {
        $token->{choices}[0] = $token->{topchoice};
    }
    if (defined $token->{topscore}) {
        $token->{scores}[0] = $token->{topscore};
    }

    # One normalisation shared by the reference, the top choice and the
    # rank search below, so all three compare consistently.
    my $canon = sub {
        my $text = lc($_[0]);
        $text =~ s/_/ /g;
        $text =~ s/\s\s+/ /g;
        return $text;
    };
    my $ortho = $canon->($token->{ortho});
    my $topch = $canon->($token->{choices}[0]);

    if ($token->{invocab} == 0) {
        # Out-of-vocabulary (or unknown): never correct, not part of total.
        $token->{correct} = "0";
        $results->{numoovs}++;
    } elsif ($topch eq $ortho) {
        $results->{total}++;
        $results->{correct}++;
        $token->{correct} = "1";
    } else {
        $results->{total}++;
        # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
        if ($altsemfile) {
            # Prefer the reference semantics from the alternative file.
            if ($token->{parsed_ortho} ne $csemtags{$token->{file}}) {
                $token->{parsed_ortho} = $csemtags{$token->{file}};
            }
        }
        if (not $try_semantic_validation) {
            $token->{correct} = "0";
        } else {
            # Text differs: still correct if the first meaning equals a
            # non-empty reference meaning.
            if ($token->{meanings}[0] eq $token->{parsed_ortho} && length($token->{parsed_ortho}) > 0) {
                $token->{correct} = "1";
                $results->{correct}++;
            } else {
                $token->{correct} = "0";
            }
        }
    }

    # rankc: 1-based position of the first choice equal to the reference,
    # or 0 if none matches.
    $token->{rankc} = 0;
    my $nchoices = scalar(@{$token->{choices}});
    for (my $i = 0; $i < $nchoices; $i++) {
        if ($canon->($token->{choices}[$i]) eq $ortho) {
            $token->{rankc} = $i + 1;
            last;
        }
    }

    # NOTE(review): gender is unconditionally overwritten with "?" here,
    # discarding whatever the caller stored -- confirm this is intended.
    $token->{gender} = "?";
}
# dump_record($handle, \%token)
#
# Writes one utterance as a single line to $handle: the value of every
# column named in @fields, each followed by ":", then a newline.
#
# A column whose name ends in a number N (e.g. "choice1", "score2") is read
# from element N-1 of the array stored under the pluralised base name
# ("choices", "scores").  "sd13" is the exception: it is an ordinary scalar
# column despite its numeric suffix.
#
# If no handle is passed, the UTD handle is used.
sub dump_record
{
    my $HH    = shift(@_);
    my $token = shift(@_);
    $HH = \*UTD unless defined $HH;

    foreach my $field (@fields) {
        my $value;
        if ($field eq "sd13") {
            $value = $token->{$field};
        } elsif ($field =~ /^(\S*\D)(\d+)$/) {
            # The base name must end in a non-digit so that the whole
            # numeric suffix ("choice10" -> 10) becomes the index.
            my ($name, $num) = ("${1}s", $2 - 1);
            $value = $token->{$name}[$num];
        } else {
            $value = $token->{$field};
        }
        print {$HH} "$value", ":";
    }
    print {$HH} "\n";
}
# dump_header($handle)
#
# Writes the column header line to $handle: every name in @fields followed
# by ":", then a newline.  If no handle is passed, the UTD handle is used.
sub dump_header
{
    my $HH = shift(@_);
    $HH = \*UTD unless defined $HH;

    foreach my $field (@fields) {
        print {$HH} "$field", ":";
    }
    print {$HH} "\n";
}
# normalize($text)
#
# Cleans up a reference transcription and returns the result:
#   - bracketed tags such as "[noise]" are removed,
#   - ":" is replaced by ";" (":" is the column separator of the output),
#   - runs of two or more whitespace characters become a single space,
#   - leading and trailing whitespace is stripped.
#
# Tags are removed before whitespace is collapsed so that "a [x] b" yields
# "a b" rather than "a  b".
sub normalize
{
    my $k = shift(@_);
    $k =~ s/\[[^\]]+\]//g;
    $k =~ s/\:/\;/g;
    $k =~ s/\s\s+/ /g;
    $k =~ s/^\s+//;
    $k =~ s/\s+$//;
    return $k;
}
# load_altsemfile($semfile)
#
# Reads alternative semantic tags from $semfile into the package hashes
# %csemtags and %semtags, both keyed by the file name given on the most
# recent "D: <file>" line:
#   CSem: <text>          -> $csemtags{file} = <text>
#   Sem<x>: invocab=<d>   -> $semtags{file}  = 1
#   Sem: <text>           -> $semtags{file}  = 0
#
# Lines are trimmed and the "D:" line is matched with the same pattern the
# main result-file parser uses, so the keys agree with the file names of the
# utterance records (also for CRLF files or extra blanks).  Tag lines that
# appear before the first "D:" line are ignored.
#
# Dies if $semfile cannot be opened.
sub load_altsemfile
{
    my $semfile = shift(@_);
    open(my $sm, '<', $semfile) or die "error: opening $semfile: $!\n";

    my $file;    # file name of the utterance the following tags belong to
    while (my $line = <$sm>) {
        $line =~ s/\s+$//;
        $line =~ s/^\s+//;
        if ($line =~ /^D:\s+(\S+)\s*$/) {
            $file = $1;
            next;
        }
        next unless defined $file;

        if ($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
            # Trailing whitespace was already stripped from the line.
            $csemtags{$file} = $1;
        } elsif ($line =~ /^Sem[^:]+: invocab=(\d)/) {    # same as CREC
            # NOTE(review): the captured digit is ignored -- confirm.
            $semtags{$file} = 1;
        } elsif ($line =~ /^Sem:(\s+)(\S+)/) {            # same as CREC
            $semtags{$file} = 0;
        }
    }
    close($sm);
}