#!/usr/bin/perl -w | |
# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved. | |
# | |
# Redistribution and use in source and binary forms, with or without | |
# modification, are permitted provided that the following conditions | |
# are met: | |
# | |
# 1. Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# 2. Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of | |
# its contributors may be used to endorse or promote products derived | |
# from this software without specific prior written permission. | |
# | |
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY | |
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY | |
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | |
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
use strict; | |
# Script-wide state shared by the subroutines below.

# Charset name => reference to the sorted list of every name found in the same
# character-sets.txt entry (filled in by process_iana_charset).
my %aliasesFromCharsetsFile = ();

# Names that already have a row in $output; emit_line uses it to spot duplicates.
my %namesWritten = ();

# Accumulated table rows, interpolated into the generated source at the end.
my $output = '';

# Set to 1 by error(); when set, the script exits with status 1 and prints nothing.
my $error = 0;
# Report a problem on STDERR and remember that the run has failed.
#
# The message is printed followed by a newline, and the file-level $error flag
# is set to 1 so that the program body exits with a failure status.
#
# The ($) prototype makes this a named unary operator taking one scalar
# argument; callers in this file invoke it without parentheses, so it is kept.
sub error ($)
{
    my @message = @_;
    print STDERR @message, "\n";
    return $error = 1;
}
# Append one row of the generated table to $output.
#
# Arguments:
#   $name     - charset name, written as the quoted string of the row
#   $prefix   - text placed immediately before $encoding
#   $encoding - platform encoding identifier
# Callers also pass a fourth argument (the encoding flags); it is not used.
#
# A name that already has a row is reported through error(); the row is still
# appended in that case.
sub emit_line
{
    my ($name, $prefix, $encoding) = @_;

    if ($namesWritten{$name}) {
        error("$name shows up twice in output");
    }
    $namesWritten{$name} = 1;

    $output .= qq( { "$name", $prefix$encoding },\n);
}
# Read the platform encodings file and emit one table row per charset name.
#
# Arguments:
#   $filename       - path of the platform encodings file
#   $PlatformPrefix - text placed before every platform encoding name in the output
#
# Line shapes recognized after "#" comments and trailing whitespace are removed:
#   PlatformName: first-name, alias, alias, ...
#   PlatformName, flags: first-name, alias, alias, ...
#   PlatformName                 (optionally followed by ", flags"; only
#                                 records the platform name as seen)
# Any other non-empty line is reported as a syntax error.
#
# For each entry with names, a row is emitted for the first name exactly as
# spelled, then one row for every normalized name and every alias known from
# character-sets.txt (in sorted order), skipping the normalized first name.
#
# Dies if the file cannot be opened. Every other problem is reported through
# error(), which lets processing continue so all problems are listed at once.
sub process_platform_encodings
{
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;

    my %seenPlatformNames;
    my %seenIANANames;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and the handle is not a package global.
    open my $platformEncodings, '<', $filename
        or die "Cannot open $filename: $!";

    while (<$platformEncodings>) {
        chomp;
        s/\#.*$//;
        s/\s+$//;
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
            my %aliases;

            # The same platform name may appear again with different flags,
            # so uniqueness is checked on the name/flags combination.
            my $PlatformNameWithFlags = $PlatformName;
            if ($flags) {
                $PlatformNameWithFlags .= ", " . $flags;
            } else {
                $flags = "NoEncodingFlags";
            }
            error("Platform encoding name $PlatformName is mentioned twice in $baseFilename") if $seenPlatformNames{$PlatformNameWithFlags};
            $seenPlatformNames{$PlatformNameWithFlags} = 1;

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
            my @IANANames = split ", ", $IANANames;
            my $firstName = "";
            my $canonicalFirstName = "";
            my $prevName = "";
            for my $name (@IANANames) {
                if ($firstName eq "") {
                    # The first valid name may use mixed case, '-' and '_'.
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
                        error("$name, in $baseFilename, has illegal characters in it");
                        next;
                    }
                    $firstName = $name;
                } else {
                    # Later names must already be in normalized form and sorted.
                    if ($name !~ /^[a-z0-9]+$/) {
                        error("$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)");
                        next;
                    }
                    if ($name le $prevName) {
                        error("$name comes after $prevName in $baseFilename, but everything must be in alphabetical order");
                    }
                    $prevName = $name;
                }

                # Normalize: lowercase with '-' and '_' removed.
                my $canonicalName = lc $name;
                $canonicalName =~ tr/-_//d;

                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                error("$name is mentioned twice in $baseFilename") if $seenIANANames{$canonicalName};
                $seenIANANames{$canonicalName} = 1;

                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};

                # Pull in every alias character-sets.txt lists for this name.
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                    $aliases{$alias} = 1;
                }

                # Two names share a character-sets.txt entry when they map to
                # the same array reference; the "le" test reports each such
                # pair only once.
                # NOTE(review): $otherName is the raw spelling from the file,
                # while the hash is looked up by normalized name elsewhere in
                # this loop, so a first name containing uppercase letters, '-'
                # or '_' is presumably never found here -- confirm whether
                # that is intended.
                for my $otherName (@IANANames) {
                    next if $canonicalName eq $otherName;
                    if ($aliasesFromCharsetsFile{$otherName}
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
                        && $canonicalName le $otherName) {
                        error("$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt");
                    }
                }
            }

            # write out
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
            for my $alias (sort keys %aliases) {
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
            }
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
            # A platform name with no charset names: only record it as seen.
            my $PlatformName = $1;
            error("Platform encoding name $PlatformName is mentioned twice in $baseFilename") if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
        } elsif (/./) {
            # $. is the line number of the handle read most recently.
            error("syntax error in $baseFilename, line $.");
        }
    }

    close $platformEncodings;
}
# Record one character-sets.txt entry in %aliasesFromCharsetsFile.
#
# Arguments:
#   $canonical_name - primary name of the entry; nothing is recorded when it
#                     is false (for example undef)
#   @aliases        - the remaining names of the entry
#
# Every name of the entry is mapped to the SAME array reference, which holds
# all names of the entry in sorted order. process_platform_encodings compares
# those references to tell whether two names belong to one entry, so the
# sharing must be preserved.
sub process_iana_charset
{
    my ($canonical_name, @aliases) = @_;
    return unless $canonical_name;

    my $entry_names = [ sort { $a cmp $b } $canonical_name, @aliases ];
    $aliasesFromCharsetsFile{$_} = $entry_names for @$entry_names;
}
# Parse the IANA character-sets.txt file and register every entry.
#
# Arguments:
#   $filename - path of the character sets file
#
# A line containing "Name: X" starts a new entry; lines containing "Alias: X"
# add names to the current entry. Names are normalized by lowercasing and
# removing every character other than a-z and 0-9. Each finished entry is
# handed to process_iana_charset.
#
# Dies if the file cannot be opened. Duplicate names are reported through
# error().
sub process_iana_charsets
{
    my ($filename) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or pipe, and the handle is not a package global.
    open my $charsets, '<', $filename
        or die "Cannot open $filename: $!";

    # Normalized name => canonical name of the entry it was first seen in.
    my %seen;
    my $canonical_name;
    my @aliases;

    # Aliases exempt from the "seen twice" check below.
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );

    while (<$charsets>) {
        chomp;
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
            $new_canonical_name = lc $new_canonical_name;
            $new_canonical_name =~ tr/a-z0-9//cd;

            error("saw $new_canonical_name twice in character-sets.txt") if $seen{$new_canonical_name};
            $seen{$new_canonical_name} = $new_canonical_name;

            # A new "Name:" line ends the previous entry; flush it first.
            # On the very first entry $canonical_name is still undefined and
            # process_iana_charset returns without recording anything.
            process_iana_charset($canonical_name, @aliases);

            $canonical_name = $new_canonical_name;
            @aliases = ();
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
            $new_alias = lc $new_alias;
            $new_alias =~ tr/a-z0-9//cd;

            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

            # An alias repeated inside its own entry is tolerated; one that
            # belongs to a different entry is an error unless exempted.
            error("saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name") if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};

            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;
        }
    }

    # Flush the final entry, which has no following "Name:" line.
    process_iana_charset($canonical_name, @aliases);

    close $charsets;
}
# Program body
#
# Command-line arguments, in order:
#   1. path of the IANA character sets file
#   2. path of the platform encodings file
#   3. prefix placed before every platform encoding name in the output
my ($charsetsFile, $platformEncodingsFile, $platformPrefix) = @ARGV[0 .. 2];

process_iana_charsets($charsetsFile);
process_platform_encodings($platformEncodingsFile, $platformPrefix);

# Any problem reported through error() suppresses the output entirely.
if ($error) {
    exit 1;
}

print <<"EOF";
// File generated by make-charset-table.pl. Do not edit!
#include "config.h"
#include "CharsetData.h"
namespace WebCore {
const CharsetEntry CharsetTable[] = {
$output
{ 0, 0 }
};
}
EOF