| #!/usr/bin/perl |
| # |
| # Generate a subset of the UnicodeData.txt file, available from |
| # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt |
| # |
| # Usage: |
| # gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt |
| # |
| |
| %need_these = (); |
| |
| # Mark as needed all the characters mentioned in the relevant files |
| foreach $file (@ARGV) { |
| open(F, '<', $file) or die; |
| while (defined($line = <F>)) { |
| $line =~ s/\s*(\#.*|)$//; # Remove comments and final blanks |
| @f = split(/\s+/, $line); |
| next if (scalar @f != 2); |
| $need_these{hex $f[1]}++; |
| } |
| close(F); |
| } |
| |
| # Also mark as needed any case variants of those |
| # (Note: this doesn't necessarily provide the full transitive closure, |
| # but we shouldn't need it.) |
| while (defined($line = <STDIN>)) { |
| @f = split(/;/, $line); |
| if ($f[0] =~ /^([0-9a-f]+)$/i) { |
| $r = hex $f[0]; |
| if ($need_these{$r}) { |
| $need_these{hex $f[12]}++ if ($f[12] ne ''); |
| $need_these{hex $f[13]}++ if ($f[13] ne ''); |
| $need_these{hex $f[14]}++ if ($f[14] ne ''); |
| } |
| } |
| } |
| |
| # Finally, write out the subset |
| seek(STDIN, 0, 0); |
| while (defined($line = <STDIN>)) { |
| ($v, $l) = split(/;/, $line, 2); |
| if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) { |
| # This isn't actually the format... fix that if it ever matters |
| $r1 = hex $1; |
| $r2 = hex $2; |
| } elsif ($v =~ /^([0-9a-f]+)$/i) { |
| $r1 = $r2 = hex $1; |
| } else { |
| next; |
| } |
| for ($r = $r1; $r <= $r2; $r++) { |
| printf "%04X;%s", $r, $l if ($need_these{$r}); |
| } |
| } |
| |
| |