tools/cmd/srecres2utd.pl - platform/external/srec.git - Git at Google

 #!/usr/localbin/perl

 use Getopt::Long;
 use File::Basename;
 use lib dirname($0);

 # Command-line driver: converts each SREC result log named on the command
 # line into a colon-separated "<base>.utd" table and prints one
 # recognition-rate summary line per input file on stdout.
 #
 # Options:
 #   --add FIELD    append FIELD to the output columns (repeatable)
 #   --invocab      sets $assume_invocab, which process() consults
 #   --quiet        accepted; nothing in this section reads it
 #   --semantic     sets $try_semantic_validation and adds a parsed_ortho column
 #   --altsem FILE  passes FILE to load_altsemfile()
 # Arguments are result files, or a single @LIST naming a file that holds one
 # result-file path per line.
 $assume_invocab = 0; # _when_semantics_missing
 $rc = GetOptions("add=s" => \@additional_fields,
                  "invocab" => \$assume_invocab,
                  "quiet" => \$quiet,
                  "semantic" => \$try_semantic_validation,
                  "altsem=s" => \$altsemfile,
                  );
 die "usage: $0 [--add FIELD] [--invocab] [--quiet] [--semantic] [--altsem FILE] RESFILE... | \@LISTFILE\n"
     unless($rc);

 # Must stay a file-scoped lexical: dump_record() and dump_header() read it.
 my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");

 if($try_semantic_validation) {
     push(@additional_fields,"parsed_ortho");
 }
 push(@fields, @additional_fields);
 # Lookup table of the extra columns; consulted below for "snr" and "snrr".
 $additional_fieldh{$_}++ foreach @additional_fields;

 load_altsemfile($altsemfile) if($altsemfile);

 # Unbuffered stdout, so each summary line appears as soon as it is printed.
 $| = 1;

 if(@ARGV && $ARGV[0] =~ /^@/) {
     # Read the list file directly rather than through `cat`, so its name is
     # never handed to a shell.
     my $flist = substr($ARGV[0],1);
     open(my $list_fh, "<", $flist) or die "error opening $flist: $!\n";
     @resfiles = <$list_fh>;
     close($list_fh);
     s/\s+$// foreach @resfiles;
     @resfiles = grep { length($_) } @resfiles;
 } else {
     @resfiles = @ARGV;
 }

 foreach my $resfile (@resfiles) {
     (my $base = $resfile) =~ s/\.[a-z]+$//i;
     my $utdfile = "$base.utd";
     die "error: $resfile would be overwritten by its own output\n" if($utdfile eq $resfile);

     # print "processing $resfile to $utdfile\n" unless($quiet);
     # UTD stays a bareword handle so that code printing to the UTD handle by
     # name keeps working.
     open(RES, "<", $resfile) or die "error opening $resfile: $!\n";
     open(UTD, ">", $utdfile) or die "error opening $utdfile: $!\n";
     my $hUTD = \*UTD;
     %results = ();
     while(<RES>) {
         s/\s+$//;
         s/^\s+//;
         if(/^D:\s+(\S+)\s*$/) { # same as CREC
             # A "D:" line starts a new utterance: flush the pending one, or
             # emit the column header if nothing is pending yet.
             my $file = $1;
             if(%token) {
                 process(\%token, \%results);
                 dump_record($hUTD, \%token);
             } else {
                 dump_header($hUTD);
             }
             %token = ();
             $token{file} = $file;
             # Only use the capture when the pattern really matched; a failed
             # match would leave a stale $1 from the "D:" pattern above.
             my ($speaker) = $file =~ /ENU-(\d\d\d)-/;
             $token{gender} = defined($speaker) ? $gender{$speaker} : undef;
             # NOTE(review): get_snr() is not defined in this file - confirm
             # where it comes from before relying on --add snr / --add snrr.
             $token{"snr"} = get_snr($file) if($additional_fieldh{"snr"});
             $token{"snrr"} = sprintf("%.2d",int(get_snr($file)/5+0.5)*5) if($additional_fieldh{"snrr"});
         } elsif(/^C:\s+(.*)$/) { # same as CREC
             $token{ortho} = normalize($1);
         } elsif(/^\s*(\S+) = (.*)$/) {
             my ($augkey,$augval) = ($1,$2);
             if($augkey eq "feedback") {
                 $token{parsed_ortho} = $augval;
                 $token{invocab}++;
             }
         } elsif(/^R:\s+(.*)$/) { # same as CREC
             if(/<rejected/i || /<FAILED/i) {
                 $token{rejrslt} = "f";
             } else {
                 # $token{topchoice} = $1;
                 $token{rejrslt} = "a";
             }
         } elsif(/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
             # NOTE(review): the captured digit is not used; any value marks
             # the utterance in-vocabulary - confirm this is intended.
             $token{invocab} = 1;
         } elsif(/^CSem:\s+([a-z]+.*)\s*$/i) {
             $token{parsed_ortho} = $1;
         } elsif(/^Sem:(\s+)(\S+)/) { # same as CREC
             $token{invocab} = 0;
         } elsif(/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
             $token{choices}[0] = $1;
         } elsif(/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
             # Take the text from this match directly; re-matching the line
             # could fail and leave the index in $1.
             $token{choices}[$1] = $2;
         } elsif(/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
             my ($slot, $meaning) = ($1, $2);
             $meaning =~ s/\s+$//;
             $token{meanings}[$slot] = $meaning;
         } elsif(/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
             # The second bracketed number is what gets stored as the score.
             $token{scores}[$1] = $2;
         } elsif(/^RAW SCORE\s+:\s+\'(.*)\'/) {
             $token{topscore} = $1;
         } elsif(/^gdiff\s+(.*)$/){
             $token{gdiff} = $1;
         } elsif(/^sd13\s+(.*)$/){
             $token{sd13} = $1;
         } elsif(/^spf\s+(.*)$/){
             $token{spf} = $1;
         } elsif(/^abs\s+(.*)$/){
             $token{abs} = $1;
         } elsif(/^gdiffpf\s+(.*)$/){
             $token{gdiffpf} = $1;
         } elsif(/^sd\s+(.*)$/){
             $token{sd} = $1;
         } elsif(/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
             $token{conf} = $1;
         }
     }
     # Flush the last utterance of the file.
     if(%token) {
         process(\%token, \%results);
         dump_record($hUTD, \%token);
     }
     close(UTD) or die "error closing $utdfile: $!\n";
     close(RES);
     %token = ();
     # Avoid dividing by zero when no in-vocabulary utterance was scored.
     $results{total} ||= 1;
     my $rr = ($results{correct} || 0)/$results{total} * 100;
     $rr = int($rr*10 + 0.5)/10;
     printf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct} || 0, $results{total}, $results{numoovs} || 0);
 }


 # process(\%token, \%results)
 #
 # Scores one utterance record in place and updates the per-file counters.
 #   $token   - hash ref for the utterance. Reads ortho, choices, meanings,
 #              parsed_ortho, invocab, topchoice, topscore and file; writes
 #              correct, rankc and gender (and slot 0 of choices/scores).
 #   $results - hash ref of counters: total, correct, numoovs.
 # Also consults the script-level $assume_invocab, $altsemfile and
 # $try_semantic_validation settings and the %csemtags table.
 sub process
 {
     my ($token, $results) = @_;

     $token->{invocab} = 1 if($assume_invocab);

     # Take the overrides from the record that was passed in rather than from
     # the script-level %token hash.
     $token->{choices}[0] = $token->{topchoice} if(defined $token->{topchoice});
     $token->{scores}[0] = $token->{topscore} if(defined $token->{topscore});
     $token->{choices} ||= [];

     # Comparison form: lower case, underscores as blanks, whitespace runs
     # collapsed to a single blank.
     my $fold = sub {
         my $text = lc(defined($_[0]) ? $_[0] : "");
         $text =~ s/_/ /g;
         $text =~ s/\s\s+/ /g;
         return $text;
     };
     my $ortho = $fold->($token->{ortho});
     my $topch = $fold->($token->{choices}[0]);

     if(($token->{invocab} || 0) == 0) {
         # Out-of-vocabulary utterances are tallied separately and never
         # count towards the total.
         $token->{correct} = "0";
         $results->{numoovs}++;
     } elsif($topch eq $ortho) {
         $results->{total}++;
         $results->{correct}++;
         $token->{correct} = "1";
     } else {
         $results->{total}++;
         # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
         if($altsemfile) {
             # Replace the parsed tag with the alternate one when they differ.
             my $alt_tag = $csemtags{$token->{file}};
             my $current = defined($token->{parsed_ortho}) ? $token->{parsed_ortho} : "";
             if($current ne (defined($alt_tag) ? $alt_tag : "")) {
                 $token->{parsed_ortho} = $alt_tag;
             }
         }

         # With --semantic a literal mismatch still counts as correct when the
         # top meaning equals a non-empty parsed transcription.
         if($try_semantic_validation
            && defined($token->{parsed_ortho})
            && length($token->{parsed_ortho}) > 0
            && defined($token->{meanings}[0])
            && $token->{meanings}[0] eq $token->{parsed_ortho}) {
             $token->{correct} = "1";
             $results->{correct}++;
         } else {
             $token->{correct} = "0";
         }
     }

     # rankc is the 1-based position of the first choice equal to the
     # transcription, or 0 when none matches. Choices are folded exactly like
     # the top choice above so both comparisons agree.
     $token->{rankc} = 0;
     my $choices = $token->{choices};
     for my $i (0 .. $#{$choices}) {
         if($fold->($choices->[$i]) eq $ortho) {
             $token->{rankc} = $i+1;
             last;
         }
     }
     $token->{gender} = "?";
     return;
 }

 # dump_record($HH, \%token)
 #
 # Writes one row for the utterance to the filehandle $HH: the value of every
 # column in @fields, each followed by ":", then a newline. When no handle is
 # passed the row goes to the UTD handle.
 #
 # A column named <name><N> (for example "choice2") is taken from element N-1
 # of the array stored under "<name>s". "sd13" is a plain column whose name
 # merely ends in digits, so it is excluded from that rule.
 sub dump_record
 {
     my ($HH, $token) = @_;
     my $out = defined($HH) ? $HH : \*UTD;
     my $row = "";
     foreach my $field (@fields) {
         my $value;
         # The name part must end in a non-digit so that all trailing digits
         # form the index ("choice12" is choice number 12, not "choice1" + 2).
         if($field ne "sd13" && $field =~ /^(\S*?[^\d\s])(\d+)$/) {
             $value = $token->{"${1}s"}[$2 - 1];
         } else {
             $value = $token->{$field};
         }
         $row .= (defined($value) ? $value : "") . ":";
     }
     print {$out} $row, "\n";
     return;
 }

 # dump_header($HH)
 #
 # Writes the header row to the filehandle $HH: every column name in @fields
 # followed by ":", then a newline. When no handle is passed the row goes to
 # the UTD handle.
 sub dump_header
 {
     my ($HH) = @_;
     my $out = defined($HH) ? $HH : \*UTD;
     print {$out} join("", map { "$_:" } @fields), "\n";
     return;
 }

 # normalize($text)
 #
 # Returns a cleaned copy of a transcription string. In order: runs of two or
 # more whitespace characters become one blank, every ":" becomes ";" (":" is
 # the column separator of the output rows), bracketed tags such as "[noise]"
 # are removed, and leading/trailing whitespace is stripped.
 sub normalize
 {
     my ($text) = @_;
     # Alias $_ to the copy so the substitutions read as a pipeline.
     for ($text) {
         s/\s\s+/ /g;
         s/\:/\;/g;
         s/\[[^\]]+\]//g;
         s/^\s+//g;
         s/\s+$//g;
     }
     return $text;
 }

 # load_altsemfile($semfile)
 #
 # Reads $semfile and fills two script-level tables keyed by the name on the
 # most recent "D:" line:
 #   %csemtags - text of the "CSem:" line
 #   %semtags  - 1 for a "Sem...:  invocab=N" line, 0 for a plain "Sem:" line
 # Dies when the file cannot be opened.
 sub load_altsemfile
 {
     my ($semfile) = @_;
     open(my $sm, "<", $semfile) or die "error: opening $semfile: $!\n";
     my $current = "";
     while(my $line = <$sm>) {
         # Drop the line ending and trailing blanks first, so CRLF files and
         # padded lines still match the patterns below.
         $line =~ s/\s+$//;
         if($line =~ /D: (\S+)$/) {
             $current = $1;
         } elsif($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
             $csemtags{$current} = $1;
             $csemtags{$current} =~ s/\s+$//;
         } elsif($line =~ /^Sem[^:]+:  invocab=(\d)/) { # same as CREC
             $semtags{$current} = 1;
         } elsif($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
             $semtags{$current} = 0;
         }
     }
     close($sm);
     return;
 }
	#!/usr/localbin/perl

	use Getopt::Long;
	use File::Basename;
	use lib dirname($0);

	# Command-line driver: converts each SREC result log named on the command
	# line into a colon-separated "<base>.utd" table and prints one
	# recognition-rate summary line per input file on stdout.
	#
	# Options:
	#   --add FIELD    append FIELD to the output columns (repeatable)
	#   --invocab      sets $assume_invocab, which process() consults
	#   --quiet        accepted; nothing in this section reads it
	#   --semantic     sets $try_semantic_validation and adds a parsed_ortho column
	#   --altsem FILE  passes FILE to load_altsemfile()
	# Arguments are result files, or a single @LIST naming a file that holds one
	# result-file path per line.
	$assume_invocab = 0; # _when_semantics_missing
	$rc = GetOptions("add=s" => \@additional_fields,
	                 "invocab" => \$assume_invocab,
	                 "quiet" => \$quiet,
	                 "semantic" => \$try_semantic_validation,
	                 "altsem=s" => \$altsemfile,
	                 );
	die "usage: $0 [--add FIELD] [--invocab] [--quiet] [--semantic] [--altsem FILE] RESFILE... | \@LISTFILE\n"
	    unless($rc);

	# Must stay a file-scoped lexical: dump_record() and dump_header() read it.
	my @fields = ("file", "correct", "invocab", "gdiff", "sd", "sd13", "spf", "abs", "gdiffpf", "rejrslt", "rankc", "match", "ortho", "choice1", "choice2", "score1", "conf", "gender");

	if($try_semantic_validation) {
	    push(@additional_fields,"parsed_ortho");
	}
	push(@fields, @additional_fields);
	# Lookup table of the extra columns; consulted below for "snr" and "snrr".
	$additional_fieldh{$_}++ foreach @additional_fields;

	load_altsemfile($altsemfile) if($altsemfile);

	# Unbuffered stdout, so each summary line appears as soon as it is printed.
	$| = 1;

	if(@ARGV && $ARGV[0] =~ /^@/) {
	    # Read the list file directly rather than through `cat`, so its name is
	    # never handed to a shell.
	    my $flist = substr($ARGV[0],1);
	    open(my $list_fh, "<", $flist) or die "error opening $flist: $!\n";
	    @resfiles = <$list_fh>;
	    close($list_fh);
	    s/\s+$// foreach @resfiles;
	    @resfiles = grep { length($_) } @resfiles;
	} else {
	    @resfiles = @ARGV;
	}

	foreach my $resfile (@resfiles) {
	    (my $base = $resfile) =~ s/\.[a-z]+$//i;
	    my $utdfile = "$base.utd";
	    die "error: $resfile would be overwritten by its own output\n" if($utdfile eq $resfile);

	    # print "processing $resfile to $utdfile\n" unless($quiet);
	    # UTD stays a bareword handle so that code printing to the UTD handle by
	    # name keeps working.
	    open(RES, "<", $resfile) or die "error opening $resfile: $!\n";
	    open(UTD, ">", $utdfile) or die "error opening $utdfile: $!\n";
	    my $hUTD = \*UTD;
	    %results = ();
	    while(<RES>) {
	        s/\s+$//;
	        s/^\s+//;
	        if(/^D:\s+(\S+)\s*$/) { # same as CREC
	            # A "D:" line starts a new utterance: flush the pending one, or
	            # emit the column header if nothing is pending yet.
	            my $file = $1;
	            if(%token) {
	                process(\%token, \%results);
	                dump_record($hUTD, \%token);
	            } else {
	                dump_header($hUTD);
	            }
	            %token = ();
	            $token{file} = $file;
	            # Only use the capture when the pattern really matched; a failed
	            # match would leave a stale $1 from the "D:" pattern above.
	            my ($speaker) = $file =~ /ENU-(\d\d\d)-/;
	            $token{gender} = defined($speaker) ? $gender{$speaker} : undef;
	            # NOTE(review): get_snr() is not defined in this file - confirm
	            # where it comes from before relying on --add snr / --add snrr.
	            $token{"snr"} = get_snr($file) if($additional_fieldh{"snr"});
	            $token{"snrr"} = sprintf("%.2d",int(get_snr($file)/5+0.5)*5) if($additional_fieldh{"snrr"});
	        } elsif(/^C:\s+(.*)$/) { # same as CREC
	            $token{ortho} = normalize($1);
	        } elsif(/^\s*(\S+) = (.*)$/) {
	            my ($augkey,$augval) = ($1,$2);
	            if($augkey eq "feedback") {
	                $token{parsed_ortho} = $augval;
	                $token{invocab}++;
	            }
	        } elsif(/^R:\s+(.*)$/) { # same as CREC
	            if(/<rejected/i || /<FAILED/i) {
	                $token{rejrslt} = "f";
	            } else {
	                # $token{topchoice} = $1;
	                $token{rejrslt} = "a";
	            }
	        } elsif(/^Sem[^:]+:  invocab=(\d)/) { # same as CREC
	            # NOTE(review): the captured digit is not used; any value marks
	            # the utterance in-vocabulary - confirm this is intended.
	            $token{invocab} = 1;
	        } elsif(/^CSem:\s+([a-z]+.*)\s*$/i) {
	            $token{parsed_ortho} = $1;
	        } elsif(/^Sem:(\s+)(\S+)/) { # same as CREC
	            $token{invocab} = 0;
	        } elsif(/^LITERAL\[\s*0\]\s*:\s*\'(.*)\'/) {
	            $token{choices}[0] = $1;
	        } elsif(/^LITERAL\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
	            # Take the text from this match directly; re-matching the line
	            # could fail and leave the index in $1.
	            $token{choices}[$1] = $2;
	        } elsif(/^MEANING\[\s*(\d+)\]\s+:\s+\'(.*)\'/) {
	            my ($slot, $meaning) = ($1, $2);
	            $meaning =~ s/\s+$//;
	            $token{meanings}[$slot] = $meaning;
	        } elsif(/^LITERAL\[(\d+)\]\[(\d+)\]\s+:\s+\'(.*)\'/) {
	            # The second bracketed number is what gets stored as the score.
	            $token{scores}[$1] = $2;
	        } elsif(/^RAW SCORE\s+:\s+\'(.*)\'/) {
	            $token{topscore} = $1;
	        } elsif(/^gdiff\s+(.*)$/){
	            $token{gdiff} = $1;
	        } elsif(/^sd13\s+(.*)$/){
	            $token{sd13} = $1;
	        } elsif(/^spf\s+(.*)$/){
	            $token{spf} = $1;
	        } elsif(/^abs\s+(.*)$/){
	            $token{abs} = $1;
	        } elsif(/^gdiffpf\s+(.*)$/){
	            $token{gdiffpf} = $1;
	        } elsif(/^sd\s+(.*)$/){
	            $token{sd} = $1;
	        } elsif(/^CONFIDENCE SCORE\s+:\s+\'(.*)\'/) {
	            $token{conf} = $1;
	        }
	    }
	    # Flush the last utterance of the file.
	    if(%token) {
	        process(\%token, \%results);
	        dump_record($hUTD, \%token);
	    }
	    close(UTD) or die "error closing $utdfile: $!\n";
	    close(RES);
	    %token = ();
	    # Avoid dividing by zero when no in-vocabulary utterance was scored.
	    $results{total} ||= 1;
	    my $rr = ($results{correct} || 0)/$results{total} * 100;
	    $rr = int($rr*10 + 0.5)/10;
	    printf("%-45s RR %4.1f %d/%d (%d oovs)\n", $base, $rr, $results{correct} || 0, $results{total}, $results{numoovs} || 0);
	}


	# process(\%token, \%results)
	#
	# Scores one utterance record in place and updates the per-file counters.
	#   $token   - hash ref for the utterance. Reads ortho, choices, meanings,
	#              parsed_ortho, invocab, topchoice, topscore and file; writes
	#              correct, rankc and gender (and slot 0 of choices/scores).
	#   $results - hash ref of counters: total, correct, numoovs.
	# Also consults the script-level $assume_invocab, $altsemfile and
	# $try_semantic_validation settings and the %csemtags table.
	sub process
	{
	    my ($token, $results) = @_;

	    $token->{invocab} = 1 if($assume_invocab);

	    # Take the overrides from the record that was passed in rather than from
	    # the script-level %token hash.
	    $token->{choices}[0] = $token->{topchoice} if(defined $token->{topchoice});
	    $token->{scores}[0] = $token->{topscore} if(defined $token->{topscore});
	    $token->{choices} ||= [];

	    # Comparison form: lower case, underscores as blanks, whitespace runs
	    # collapsed to a single blank.
	    my $fold = sub {
	        my $text = lc(defined($_[0]) ? $_[0] : "");
	        $text =~ s/_/ /g;
	        $text =~ s/\s\s+/ /g;
	        return $text;
	    };
	    my $ortho = $fold->($token->{ortho});
	    my $topch = $fold->($token->{choices}[0]);

	    if(($token->{invocab} || 0) == 0) {
	        # Out-of-vocabulary utterances are tallied separately and never
	        # count towards the total.
	        $token->{correct} = "0";
	        $results->{numoovs}++;
	    } elsif($topch eq $ortho) {
	        $results->{total}++;
	        $results->{correct}++;
	        $token->{correct} = "1";
	    } else {
	        $results->{total}++;
	        # print "$token->{file} MEANINGCMP: ==$token->{meanings}[0]== ==$token->{parsed_ortho}==\n";
	        if($altsemfile) {
	            # Replace the parsed tag with the alternate one when they differ.
	            my $alt_tag = $csemtags{$token->{file}};
	            my $current = defined($token->{parsed_ortho}) ? $token->{parsed_ortho} : "";
	            if($current ne (defined($alt_tag) ? $alt_tag : "")) {
	                $token->{parsed_ortho} = $alt_tag;
	            }
	        }

	        # With --semantic a literal mismatch still counts as correct when the
	        # top meaning equals a non-empty parsed transcription.
	        if($try_semantic_validation
	           && defined($token->{parsed_ortho})
	           && length($token->{parsed_ortho}) > 0
	           && defined($token->{meanings}[0])
	           && $token->{meanings}[0] eq $token->{parsed_ortho}) {
	            $token->{correct} = "1";
	            $results->{correct}++;
	        } else {
	            $token->{correct} = "0";
	        }
	    }

	    # rankc is the 1-based position of the first choice equal to the
	    # transcription, or 0 when none matches. Choices are folded exactly like
	    # the top choice above so both comparisons agree.
	    $token->{rankc} = 0;
	    my $choices = $token->{choices};
	    for my $i (0 .. $#{$choices}) {
	        if($fold->($choices->[$i]) eq $ortho) {
	            $token->{rankc} = $i+1;
	            last;
	        }
	    }
	    $token->{gender} = "?";
	    return;
	}

	# dump_record($HH, \%token)
	#
	# Writes one row for the utterance to the filehandle $HH: the value of every
	# column in @fields, each followed by ":", then a newline. When no handle is
	# passed the row goes to the UTD handle.
	#
	# A column named <name><N> (for example "choice2") is taken from element N-1
	# of the array stored under "<name>s". "sd13" is a plain column whose name
	# merely ends in digits, so it is excluded from that rule.
	sub dump_record
	{
	    my ($HH, $token) = @_;
	    my $out = defined($HH) ? $HH : \*UTD;
	    my $row = "";
	    foreach my $field (@fields) {
	        my $value;
	        # The name part must end in a non-digit so that all trailing digits
	        # form the index ("choice12" is choice number 12, not "choice1" + 2).
	        if($field ne "sd13" && $field =~ /^(\S*?[^\d\s])(\d+)$/) {
	            $value = $token->{"${1}s"}[$2 - 1];
	        } else {
	            $value = $token->{$field};
	        }
	        $row .= (defined($value) ? $value : "") . ":";
	    }
	    print {$out} $row, "\n";
	    return;
	}

	# dump_header($HH)
	#
	# Writes the header row to the filehandle $HH: every column name in @fields
	# followed by ":", then a newline. When no handle is passed the row goes to
	# the UTD handle.
	sub dump_header
	{
	    my ($HH) = @_;
	    my $out = defined($HH) ? $HH : \*UTD;
	    print {$out} join("", map { "$_:" } @fields), "\n";
	    return;
	}

	# normalize($text)
	#
	# Returns a cleaned copy of a transcription string. In order: runs of two or
	# more whitespace characters become one blank, every ":" becomes ";" (":" is
	# the column separator of the output rows), bracketed tags such as "[noise]"
	# are removed, and leading/trailing whitespace is stripped.
	sub normalize
	{
	    my ($text) = @_;
	    # Alias $_ to the copy so the substitutions read as a pipeline.
	    for ($text) {
	        s/\s\s+/ /g;
	        s/\:/\;/g;
	        s/\[[^\]]+\]//g;
	        s/^\s+//g;
	        s/\s+$//g;
	    }
	    return $text;
	}

	# load_altsemfile($semfile)
	#
	# Reads $semfile and fills two script-level tables keyed by the name on the
	# most recent "D:" line:
	#   %csemtags - text of the "CSem:" line
	#   %semtags  - 1 for a "Sem...:  invocab=N" line, 0 for a plain "Sem:" line
	# Dies when the file cannot be opened.
	sub load_altsemfile
	{
	    my ($semfile) = @_;
	    open(my $sm, "<", $semfile) or die "error: opening $semfile: $!\n";
	    my $current = "";
	    while(my $line = <$sm>) {
	        # Drop the line ending and trailing blanks first, so CRLF files and
	        # padded lines still match the patterns below.
	        $line =~ s/\s+$//;
	        if($line =~ /D: (\S+)$/) {
	            $current = $1;
	        } elsif($line =~ /^CSem:\s+([a-z]+.*)\s*$/i) {
	            $csemtags{$current} = $1;
	            $csemtags{$current} =~ s/\s+$//;
	        } elsif($line =~ /^Sem[^:]+:  invocab=(\d)/) { # same as CREC
	            $semtags{$current} = 1;
	        } elsif($line =~ /^Sem:(\s+)(\S+)/) { # same as CREC
	            $semtags{$current} = 0;
	        }
	    }
	    close($sm);
	    return;
	}