blob: 9483b6bdb089ef9d1ce8290ed14cee8aa1f4f50b [file] [log] [blame]
#############################################################################
# Perl script genlingware.pl --- composes a lingware resource from
# pico knowledge base binary files (pkb)
# according to given configuration
#
# Copyright (C) 2009 SVOX AG. All Rights Reserved.
#
# type perl genlingware.pl -help to get help
#
#############################################################################
eval "exec perl -S \$0 \${1+\"\$@\"}"
if 0;
$resource_structure = <<EOSTRUCT
the resource file structure is as follows:
------------------------------
1. optional foreign header (4-byte aligned), externally added
------------------------------
A) Pico header (4-byte aligned)
2. SVOX PICO header (signature)
3. length of header (2 byte, excluding length itself)
4. number of fields (1 byte)
5. header fields (space separated) (key/value pairs)
6. filler (0-3)
------------------------------
B) length of content
7. length of the remaining content in 4-byte, excluding length itself
------------------------------
C) Index
8. summary: number of kbs
9. id names of kb (strings of max 15 chars plus closing space)
10. directory (kb id (1 byte), offset (4 bytes), size (4 bytes))
11. filler (0-3)
------------------------------
D) knowledge bases
12. sequence of knowledge bases (byte arrays), each 4-byte aligned
all numbers are little endian
EOSTRUCT
;
###################################################################
##
## Imports
##
###################################################################
#use File::DosGlob 'glob';
#use File::Copy;
#use File::Path;
#use File::Basename;
#use Filehandle;
#use Time::Local;
use Getopt::Long;
###################################################################
##
## Default values
##
###################################################################
$VALUE = 1;
$NAME = "name";
$DEST = ".";
###################################################################
##
## Options
##
###################################################################
GetOptions(
"value=i" => \$VALUE, # numeric
"name=s" => \$NAME, # string
"help" => \$HELP
);
###################################################################
##
## Help
##
###################################################################
$help = <<EOHELP
$0 -- composes a lingware resource from pico knowledge base
binary files (pkb) according to given configuration
Usage:
$0 -help
print this help
$0 <config> <resource>
reads in configuration file <config> and creates resource <resource>
Arguments:
<config> : configuration file (input)
<resource> : platform-independent resource file (output)
(For more details see the source of this script)
EOHELP
;
die $help if $HELP;
$config_example = <<EOCONFIGEXAMPLE
Example of a config:
-------------------------------------------------------------------
\# collection of de-DE textana knowledge bases
\# header fields:
NAME de-DE_ta_1.0.0.0-0-2
VERSION 1.0.0.0-0-2
DATE 2009-01-15
TIME 17:00:00.000
CONTENT_TYPE TEXTANA
\# pico knowledge bases:
TPP_MAIN \"../pkb/de-DE/de-DE_kpr.pkb\"
TAB_GRAPHS \"../pkb/de-DE/de-DE_ktab_graphs.pkb\"
...
-------------------------------------------------------------------
for all recognized pkb tags, see %picoknow_kb_id below
EOCONFIGEXAMPLE
;
###################################################################
##
## Initialization
##
###################################################################
$svoxheader = " (C) SVOX AG ";
%header_field = (
"NAME" => 1,
"VERSION" => 2,
"DATE" => 3,
"TIME" => 4,
"CONTENT_TYPE" => 5,
);
%picoknow_kb_id = (
NULL => 0,
#base
TAB_GRAPHS => 2,
TAB_PHONES => 3,
TAB_POS => 4,
# FIXED_IDS = 7,
#dbg
DBG => 8,
#textana
TPP_MAIN => 1,
LEX_MAIN => 9,
DT_POSP => 10,
DT_POSD => 11,
DT_G2P => 12,
FST_WPHO_1 => 13,
FST_WPHO_2 => 14,
FST_WPHO_3 => 15,
FST_WPHO_4 => 16,
FST_WPHO_5 => 17,
DT_PHR => 18,
DT_ACC => 19,
FST_SPHO_1 => 20,
FST_SPHO_2 => 21,
FST_SPHO_3 => 22,
FST_SPHO_4 => 23,
FST_SPHO_5 => 24,
FST_XSPA_PARSE => 25,
FST_SVPA_PARSE => 26,
FST_XS2SVPA => 27,
FST_SPHO_6 => 28,
FST_SPHO_7 => 29,
FST_SPHO_8 => 30,
FST_SPHO_9 => 31,
FST_SPHO_10 => 32,
#siggen
DT_DUR => 34,
DT_LFZ1 => 35,
DT_LFZ2 => 36,
DT_LFZ3 => 37,
DT_LFZ4 => 38,
DT_LFZ5 => 39,
DT_MGC1 => 40,
DT_MGC2 => 41,
DT_MGC3 => 42,
DT_MGC4 => 43,
DT_MGC5 => 44,
PDF_DUR => 45,
PDF_LFZ => 46,
PDF_MGC => 47,
PDF_PHS => 48,
#user tpp
TPP_USER1 => 49,
TPP_USER2 => 50,
#user lex
LEX_USER1 => 57,
LEX_USER2 => 58,
DUMMY => 127
);
###################################################################
##
## Get Parameters
##
###################################################################
($in,$out) = (shift,shift);
unless ($in && $out) {
print "*** error: incorrect number of parameters\n";
die $help;
}
###################################################################
##
## Work
##
###################################################################
#get description of kbs
unless (open IN, $in) {
print "*** error: can't open $in\n";
die "can't open $in";
}
while (<IN>) {
# skip comments
next if /^\s*#/;
($key, $value) = split;
next unless $key;
$value =~ s/^\s*\"(.*)\"\s*$/\1/;
if ($field = $header_field{$key}) {
$fields[$field] = {key => "$key", value => "$value"};
next;
}
#print "$key -> $value\n";
unless ($id = $picoknow_kb_id{$key}) {
print "*** error: not a valid knowledge base name $key\n";
die "not a valid knowledge base name $key" ;
}
push @kb, {name => $key, file => $value, id => $id};
}
close IN;
#open output lingware file and write header
unless (open OUT, ">$out") {
print "*** error: can't open $out for writing\n";
die "can't open $out for writing";
}
binmode(OUT);
$offs = 0;
###################################################################
##
## A) PICO HEADER
##
###################################################################
# 1. SVOX HEADER
foreach $ch (split //, $svoxheader) {
push @svoxheader, chr(ord($ch)-ord(' '));
}
$offs += print_offs(@svoxheader);
print "offset after svoxheader: $offs\n";
# fields header
$fieldheader = "";
foreach $field (@fields) {
$fieldheader .= $field->{key} . " " . $field->{value} . " ";
}
#print size of fields header
# length of the fields plus preceding number of fields (1)
$len = length($fieldheader)+1;
#fill should make the whole header 4-byte aligned, i.e. the current offs
# (svoxheader) plus the length of the header (2) plus the fields
$fill = ($offs + 2 + $len) % 4;
$fill = $fill ? 4-$fill : 0;
$len += $fill;
print "filled length of header : $len\n";
# 2. length of header
$offs += &print_uint16($len); #write little-endian 16-bit cardinal
print "offset after length of header: $offs\n";
# 3. print number of fields
$offs += &print_uint8(@fields+0);
print "offset after number of fields: $offs\n";
# 4. print fields
$offs += print_offs($fieldheader);
print "offset after fields: $offs\n";
# 5. print magic number (not yet)
# 6. print filler
$offs += &opt_fill($fill);
print "offset after fill: $offs\n";
###################################################################
##
## CONTENT (that is actually saved in RAM)
##
###################################################################
# open kb files to get sizes and calculate fillers
foreach $kb (@kb) {
if ($kb->{file}) {
unless (open IN, $kb->{file}) {
print "*** error: can't open " . $kb->{file} ."\n";
die("can't open PKB " . $kb->{file});
}
binmode(IN);
#slurp in the whole file
@content = <IN>;
close IN;
$kb->{size} = length(join '',@content);
$fill = ($kb->{size} % 4);
$kb->{fill} = $fill ? 4-$fill : 0;
print "fill of ", $kb->{name} , " is ", $kb->{fill}+0, "\n";
$totalcont += $kb->{size} + $kb->{fill};
} else {
$kb->{size} = 0;
}
print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
}
# calculate total content size (B):
$totalcont = 0;
# size of number of kbs (7.)
$totalcont += 1;
#size of names of kbs (8.)
foreach $kb (@kb) {
$totalcont += &size_offs($kb->{name}, " ");
}
# size of directory (9.)
$totalcont += 9 * @kb;
# size of filler (10.)
$xfill = $totalcont % 4;
$xfill = $xfill ? 4 - $xfill : 0;
$totalcont += $xfill;
#set root of first kb
$offs1 = $totalcont;
print "totalcont before kbs: $totalcont\n";
# size of actual kbs
foreach $kb (@kb) {
$totalcont += $kb->{size} + $kb->{fill};
}
print "totalcont after kbs: $totalcont\n";
# B) print the total size
$offs += &print_uint32($totalcont);
print "offset after size of content (to be loaded): $offs\n";
# here starts the part that is stored in "permament memory"
$offs = 0;
print "offset after reset: $offs\n";
# 7. print number of kbs:
$offs += &print_uint8(@kb+0);
print "offset after number of kbs: $offs\n";
# 8. print names of kbs
foreach $kb (@kb) {
$offs += &print_offs($kb->{name}, " ");
}
print "offset after descriptive kb names: $offs\n";
# 9. print directory (ids and offsets of kbs)
print "first kb should start at $offs1\n";
foreach $kb (@kb) {
$offs += &print_uint8($kb->{id});
print "kb id: $kb->{id}, offs=$offs\n";
if ($kb->{size}) {
print "real kb (size $kb->{size})\n";
$offs += &print_uint32($offs1);
print "kb offs: $offs1, offs=$offs\n";
$offs += &print_uint32($kb->{size});
print "kb size: $kb->{size}, offs=$offs\n";
$offs1 += $kb->{size} + $kb->{fill};
} else {
print "dummy kb (size 0)\n";
$offs += &print_uint32(0);
print "kb offs: 0, offs=$offs\n";
$offs += &print_uint32(0);
print "kb size: $kb->{size}, offs=$offs\n";
}
print "offset after directory entry for kb $kb->{name} (id $kb->{id}): $offs\n";
}
# 10. print filler
$offs += &opt_fill($xfill);
print "offset after fill: $offs\n";
# print kbs themselves
foreach $kb (@kb) {
if ($kb->{file}) {
unless (open IN, $kb->{file}) {
print "*** error: can't open " . $kb->{file} ."\n";
die "can't open " . $kb->{file};
}
binmode(IN);
#slurp in the whole file
@content = <IN>;
close IN;
print OUT join '', @content;
$offs += $kb->{size};
$offs += &opt_fill($kb->{fill});
print "offset after kb $kb->{file}: $offs\n";
} else {
}
#print $kb->{name}, " -> ", $kb->{file}, " -> ", $kb->{size}, "\n";
}
close OUT;
# optional filler
# use for alignment if filler doesn't need to be parsed
sub opt_fill() {
my ($fill) = @_;
my $size = 0;
if ($fill) {
$size += &print_uint8($fill);
for (my $i = 1; $i < $fill; $i++) {
$size += &print_uint8(0);
}
}
return $size;
}
# mandatory filler
# use for alignment if filler needs to be parsed (at least one byte)
sub mand_fill() {
my ($fill) = @_;
my $size = 0;
#force fill to be 4 if there is no fill
$fill = 4 unless $fill;
$size += &print_uint8($fill);
for (my $i = 1; $i < $fill; $i++) {
$size += &print_uint8(0);
}
return $size;
}
sub print_offs() {
my (@cont) = @_;
my $size = 0;
foreach my $cont (@cont) {
$size += length($cont);
print OUT $cont;
}
return $size;
}
sub size_offs() {
my (@cont) = @_;
my $size = 0;
foreach my $cont (@cont) {
$size += length($cont);
}
return $size;
}
sub print_uint_n() {
#little-endian n-byte cardinal (no check, though!)
my ($num, $n) = @_;
my (@out) = ();
for (my $i=0; $i < $n;$i++) {
$out[$i] = $num % 256;
$num = int($num / 256);
}
print OUT pack("c*",@out);
return $n;
}
sub print_uint32() {
#little-endian 4-byte cardinal (no check, though!)
my ($num) = @_;
return &print_uint_n($num,4);
}
sub print_uint16() {
#little-endian 2-byte cardinal (no check, though!)
my ($num) = @_;
return &print_uint_n($num,2);
}
sub print_uint8() {
my ($num) = @_;
return &print_uint_n($num,1);
}
sub numerically {$x < $y}
# @x = map {chr(48+$_) } (0..9);
# foreach $x (@x) {
# print $x, "\n";
# }
# @x = map +(chr(65+$_)), (0..9);
# foreach $x (@x) {
# print $x, "\n";
# }