| # $Id: encoding.pm,v 2.9 2011/08/09 07:49:44 dankogai Exp dankogai $ |
| package encoding; |
| our $VERSION = '2.6_01'; |
| |
| use Encode; |
| use strict; |
| use warnings; |
| |
| use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; |
| |
| BEGIN { |
| if ( ord("A") == 193 ) { |
| require Carp; |
| Carp::croak("encoding: pragma does not support EBCDIC platforms"); |
| } |
| } |
| |
| our $HAS_PERLIO = 0; |
| eval { require PerlIO::encoding }; |
| unless ($@) { |
| $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 ); |
| } |
| |
| sub _exception { |
| my $name = shift; |
| $] > 5.008 and return 0; # 5.8.1 or higher then no |
| my %utfs = map { $_ => 1 } |
| qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE |
| UTF-32 UTF-32BE UTF-32LE); |
| $utfs{$name} or return 0; # UTFs or no |
| require Config; |
| Config->import(); |
| our %Config; |
| return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no |
| } |
| |
| sub in_locale { $^H & ( $locale::hint_bits || 0 ) } |
| |
| sub _get_locale_encoding { |
| my $locale_encoding; |
| |
| # I18N::Langinfo isn't available everywhere |
| eval { |
| require I18N::Langinfo; |
| I18N::Langinfo->import(qw(langinfo CODESET)); |
| $locale_encoding = langinfo( CODESET() ); |
| }; |
| |
| my $country_language; |
| |
| no warnings 'uninitialized'; |
| |
| if ( (not $locale_encoding) && in_locale() ) { |
| if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { |
| ( $country_language, $locale_encoding ) = ( $1, $2 ); |
| } |
| elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) { |
| ( $country_language, $locale_encoding ) = ( $1, $2 ); |
| } |
| |
| # LANGUAGE affects only LC_MESSAGES only on glibc |
| } |
| elsif ( not $locale_encoding ) { |
| if ( $ENV{LC_ALL} =~ /\butf-?8\b/i |
| || $ENV{LANG} =~ /\butf-?8\b/i ) |
| { |
| $locale_encoding = 'utf8'; |
| } |
| |
| # Could do more heuristics based on the country and language |
| # parts of LC_ALL and LANG (the parts before the dot (if any)), |
| # since we have Locale::Country and Locale::Language available. |
| # TODO: get a database of Language -> Encoding mappings |
| # (the Estonian database at http://www.eki.ee/letter/ |
| # would be excellent!) --jhi |
| } |
| if ( defined $locale_encoding |
| && lc($locale_encoding) eq 'euc' |
| && defined $country_language ) |
| { |
| if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { |
| $locale_encoding = 'euc-jp'; |
| } |
| elsif ( $country_language =~ /^ko_KR|korean?$/i ) { |
| $locale_encoding = 'euc-kr'; |
| } |
| elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { |
| $locale_encoding = 'euc-cn'; |
| } |
| elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { |
| $locale_encoding = 'euc-tw'; |
| } |
| else { |
| require Carp; |
| Carp::croak( |
| "encoding: Locale encoding '$locale_encoding' too ambiguous" |
| ); |
| } |
| } |
| |
| return $locale_encoding; |
| } |
| |
| sub import { |
| my $class = shift; |
| my $name = shift; |
| if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm |
| my $caller = caller(); |
| { |
| no strict 'refs'; |
| *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; |
| } |
| return; |
| } |
| $name = _get_locale_encoding() if $name eq ':locale'; |
| my %arg = @_; |
| $name = $ENV{PERL_ENCODING} unless defined $name; |
| my $enc = find_encoding($name); |
| unless ( defined $enc ) { |
| require Carp; |
| Carp::croak("encoding: Unknown encoding '$name'"); |
| } |
| $name = $enc->name; # canonize |
| unless ( $arg{Filter} ) { |
| DEBUG and warn "_exception($name) = ", _exception($name); |
| _exception($name) or ${^ENCODING} = $enc; |
| $HAS_PERLIO or return 1; |
| } |
| else { |
| defined( ${^ENCODING} ) and undef ${^ENCODING}; |
| |
| # implicitly 'use utf8' |
| require utf8; # to fetch $utf8::hint_bits; |
| $^H |= $utf8::hint_bits; |
| eval { |
| require Filter::Util::Call; |
| Filter::Util::Call->import; |
| filter_add( |
| sub { |
| my $status = filter_read(); |
| if ( $status > 0 ) { |
| $_ = $enc->decode( $_, 1 ); |
| DEBUG and warn $_; |
| } |
| $status; |
| } |
| ); |
| }; |
| $@ eq '' and DEBUG and warn "Filter installed"; |
| } |
| defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; |
| for my $h (qw(STDIN STDOUT)) { |
| if ( $arg{$h} ) { |
| unless ( defined find_encoding( $arg{$h} ) ) { |
| require Carp; |
| Carp::croak( |
| "encoding: Unknown encoding for $h, '$arg{$h}'"); |
| } |
| eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; |
| } |
| else { |
| unless ( exists $arg{$h} ) { |
| eval { |
| no warnings 'uninitialized'; |
| binmode( $h, ":raw :encoding($name)" ); |
| }; |
| } |
| } |
| if ($@) { |
| require Carp; |
| Carp::croak($@); |
| } |
| } |
| return 1; # I doubt if we need it, though |
| } |
| |
| sub unimport { |
| no warnings; |
| undef ${^ENCODING}; |
| if ($HAS_PERLIO) { |
| binmode( STDIN, ":raw" ); |
| binmode( STDOUT, ":raw" ); |
| } |
| else { |
| binmode(STDIN); |
| binmode(STDOUT); |
| } |
| if ( $INC{"Filter/Util/Call.pm"} ) { |
| eval { filter_del() }; |
| } |
| } |
| |
| 1; |
| __END__ |
| |
| =pod |
| |
| =head1 NAME |
| |
| encoding - allows you to write your script in non-ascii or non-utf8 |
| |
| =head1 SYNOPSIS |
| |
| use encoding "greek"; # Perl like Greek to you? |
| use encoding "euc-jp"; # Jperl! |
| |
| # or you can even do this if your shell supports your native encoding |
| |
| perl -Mencoding=latin2 -e'...' # Feeling centrally European? |
| perl -Mencoding=euc-kr -e'...' # Or Korean? |
| |
| # more control |
| |
| # A simple euc-cn => utf-8 converter |
| use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; |
| |
| # "no encoding;" supported (but not scoped!) |
| no encoding; |
| |
| # an alternate way, Filter |
| use encoding "euc-jp", Filter=>1; |
| # now you can use kanji identifiers -- in euc-jp! |
| |
| # switch on locale - |
| # note that this probably means that unless you have a complete control |
| # over the environments the application is ever going to be run, you should |
| # NOT use the feature of encoding pragma allowing you to write your script |
| # in any recognized encoding because changing locale settings will wreck |
| # the script; you can of course still use the other features of the pragma. |
| use encoding ':locale'; |
| |
| =head1 ABSTRACT |
| |
| Let's start with a bit of history: Perl 5.6.0 introduced Unicode |
| support. You could apply C<substr()> and regexes even to complex CJK |
| characters -- so long as the script was written in UTF-8. But back |
| then, text editors that supported UTF-8 were still rare and many users |
| instead chose to write scripts in legacy encodings, giving up a whole |
| new feature of Perl 5.6. |
| |
| Rewind to the future: starting from perl 5.8.0 with the B<encoding> |
| pragma, you can write your script in any encoding you like (so long |
| as the C<Encode> module supports it) and still enjoy Unicode support. |
| This pragma achieves that by doing the following: |
| |
| =over |
| |
| =item * |
| |
| Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from |
| the encoding specified to utf8. In Perl 5.8.1 and later, literals in |
| C<tr///> and C<DATA> pseudo-filehandle are also converted. |
| |
| =item * |
| |
| Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding |
| specified. |
| |
| =back |
| |
| =head2 Literal Conversions |
| |
| You can write code in EUC-JP as follows: |
| |
| my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
| #<-char-><-char-> # 4 octets |
| s/\bCamel\b/$Rakuda/; |
| |
| And with C<use encoding "euc-jp"> in effect, it is the same thing as |
| the code in UTF-8: |
| |
| my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters |
| s/\bCamel\b/$Rakuda/; |
| |
| =head2 PerlIO layers for C<STD(IN|OUT)> |
| |
| The B<encoding> pragma also modifies the filehandle layers of |
| STDIN and STDOUT to the specified encoding. Therefore, |
| |
| use encoding "euc-jp"; |
| my $message = "Camel is the symbol of perl.\n"; |
| my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji |
| $message =~ s/\bCamel\b/$Rakuda/; |
| print $message; |
| |
| Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", |
| not "\x{99F1}\x{99DD} is the symbol of perl.\n". |
| |
| You can override this by giving extra arguments; see below. |
| |
| =head2 Implicit upgrading for byte strings |
| |
| By default, if strings operating under byte semantics and strings |
| with Unicode character data are concatenated, the new string will |
| be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. |
| |
| The B<encoding> pragma changes this to use the specified encoding |
| instead. For example: |
| |
| use encoding 'utf8'; |
| my $string = chr(20000); # a Unicode string |
| utf8::encode($string); # now it's a UTF-8 encoded byte string |
| # concatenate with another Unicode string |
| print length($string . chr(20000)); |
| |
| Will print C<2>, because C<$string> is upgraded as UTF-8. Without |
| C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> |
| is three octets when interpreted as Latin-1. |
| |
| =head2 Side effects |
| |
| If the C<encoding> pragma is in scope then the lengths returned are |
| calculated from the length of C<$/> in Unicode characters, which is not |
| always the same as the length of C<$/> in the native encoding. |
| |
| This pragma affects utf8::upgrade, but not utf8::downgrade. |
| |
| =head1 FEATURES THAT REQUIRE 5.8.1 |
| |
| Some of the features offered by this pragma requires perl 5.8.1. Most |
| of these are done by Inaba Hiroto. Any other features and changes |
| are good for 5.8.0. |
| |
| =over |
| |
| =item "NON-EUC" doublebyte encodings |
| |
| Because perl needs to parse script before applying this pragma, such |
| encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; |
| \x5c) in the second byte fails because the second byte may |
| accidentally escape the quoting character that follows. Perl 5.8.1 |
| or later fixes this problem. |
| |
| =item tr// |
| |
| C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 |
| See the section below for details. |
| |
| =item DATA pseudo-filehandle |
| |
| Another feature that was overlooked was C<DATA>. |
| |
| =back |
| |
| =head1 USAGE |
| |
| =over 4 |
| |
| =item use encoding [I<ENCNAME>] ; |
| |
| Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} |
| exists and non-zero, PerlIO layers of STDIN and STDOUT are set to |
| ":encoding(I<ENCNAME>)". |
| |
| Note that STDERR WILL NOT be changed. |
| |
| Also note that non-STD file handles remain unaffected. Use C<use |
| open> or C<binmode> to change layers of those. |
| |
| If no encoding is specified, the environment variable L<PERL_ENCODING> |
| is consulted. If no encoding can be found, the error C<Unknown encoding |
| 'I<ENCNAME>'> will be thrown. |
| |
| =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; |
| |
| You can also individually set encodings of STDIN and STDOUT via the |
| C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the |
| first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding |
| completely off. |
| |
| When ${^UNICODE} exists and non-zero, these options will completely |
| ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See |
| L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for |
| details (perl 5.8.1 and later). |
| |
| =item use encoding I<ENCNAME> Filter=E<gt>1; |
| |
| This turns the encoding pragma into a source filter. While the |
| default approach just decodes interpolated literals (in qq() and |
| qr()), this will apply a source filter to the entire source code. See |
| L</"The Filter Option"> below for details. |
| |
| =item no encoding; |
| |
| Unsets the script encoding. The layers of STDIN, STDOUT are |
| reset to ":raw" (the default unprocessed raw stream of bytes). |
| |
| =back |
| |
| =head1 The Filter Option |
| |
| The magic of C<use encoding> is not applied to the names of |
| identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human |
| is a single Han ideograph) work, you still need to write your script |
| in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. |
| |
| What does this mean? Your source code behaves as if it is written in |
| UTF-8 with 'use utf8' in effect. So even if your editor only supports |
| Shift_JIS, for example, you can still try examples in Chapter 15 of |
| C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 |
| identifiers. |
| |
| This option is significantly slower and (as of this writing) non-ASCII |
| identifiers are not very stable WITHOUT this option and with the |
| source code written in UTF-8. |
| |
| =head2 Filter-related changes at Encode version 1.87 |
| |
| =over |
| |
| =item * |
| |
| The Filter option now sets STDIN and STDOUT like non-filter options. |
| And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like |
| non-filter version. |
| |
| =item * |
| |
| C<use utf8> is implicitly declared so you no longer have to C<use |
| utf8> to C<${"\x{4eba}"}++>. |
| |
| =back |
| |
| =head1 CAVEATS |
| |
| =head2 NOT SCOPED |
| |
| The pragma is a per script, not a per block lexical. Only the last |
| C<use encoding> or C<no encoding> matters, and it affects |
| B<the whole script>. However, the <no encoding> pragma is supported and |
| B<use encoding> can appear as many times as you want in a given script. |
| The multiple use of this pragma is discouraged. |
| |
| By the same reason, the use this pragma inside modules is also |
| discouraged (though not as strongly discouraged as the case above. |
| See below). |
| |
| If you still have to write a module with this pragma, be very careful |
| of the load order. See the codes below; |
| |
| # called module |
| package Module_IN_BAR; |
| use encoding "bar"; |
| # stuff in "bar" encoding here |
| 1; |
| |
| # caller script |
| use encoding "foo" |
| use Module_IN_BAR; |
| # surprise! use encoding "bar" is in effect. |
| |
| The best way to avoid this oddity is to use this pragma RIGHT AFTER |
| other modules are loaded. i.e. |
| |
| use Module_IN_BAR; |
| use encoding "foo"; |
| |
| =head2 DO NOT MIX MULTIPLE ENCODINGS |
| |
| Notice that only literals (string or regular expression) having only |
| legacy code points are affected: if you mix data like this |
| |
| \xDF\x{100} |
| |
| the data is assumed to be in (Latin 1 and) Unicode, not in your native |
| encoding. In other words, this will match in "greek": |
| |
| "\xDF" =~ /\x{3af}/ |
| |
| but this will not |
| |
| "\xDF\x{100}" =~ /\x{3af}\x{100}/ |
| |
| since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on |
| the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL |
| LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You |
| should not be mixing your legacy data and Unicode in the same string. |
| |
| This pragma also affects encoding of the 0x80..0xFF code point range: |
| normally characters in that range are left as eight-bit bytes (unless |
| they are combined with characters with code points 0x100 or larger, |
| in which case all characters need to become UTF-8 encoded), but if |
| the C<encoding> pragma is present, even the 0x80..0xFF range always |
| gets UTF-8 encoded. |
| |
| After all, the best thing about this pragma is that you don't have to |
| resort to \x{....} just to spell your name in a native encoding. |
| So feel free to put your strings in your encoding in quotes and |
| regexes. |
| |
| =head2 tr/// with ranges |
| |
| The B<encoding> pragma works by decoding string literals in |
| C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this |
| does not apply to C<tr///>. Therefore, |
| |
| use encoding 'euc-jp'; |
| #.... |
| $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; |
| # -------- -------- -------- -------- |
| |
| Does not work as |
| |
| $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; |
| |
| =over |
| |
| =item Legend of characters above |
| |
| utf8 euc-jp charnames::viacode() |
| ----------------------------------------- |
| \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A |
| \x{3093} \xA4\xF3 HIRAGANA LETTER N |
| \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A |
| \x{30f3} \xA5\xF3 KATAKANA LETTER N |
| |
| =back |
| |
| This counterintuitive behavior has been fixed in perl 5.8.1. |
| |
| =head3 workaround to tr///; |
| |
| In perl 5.8.0, you can work around as follows; |
| |
| use encoding 'euc-jp'; |
| # .... |
| eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; |
| |
| Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind |
| is the same as classic idiom that makes C<tr///> 'interpolate'. |
| |
| tr/$from/$to/; # wrong! |
| eval qq{ tr/$from/$to/ }; # workaround. |
| |
| Nevertheless, in case of B<encoding> pragma even C<q//> is affected so |
| C<tr///> not being decoded was obviously against the will of Perl5 |
| Porters so it has been fixed in Perl 5.8.1 or later. |
| |
| =head1 EXAMPLE - Greekperl |
| |
| use encoding "iso 8859-7"; |
| |
| # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. |
| |
| $a = "\xDF"; |
| $b = "\x{100}"; |
| |
| printf "%#x\n", ord($a); # will print 0x3af, not 0xdf |
| |
| $c = $a . $b; |
| |
| # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". |
| |
| # chr() is affected, and ... |
| |
| print "mega\n" if ord(chr(0xdf)) == 0x3af; |
| |
| # ... ord() is affected by the encoding pragma ... |
| |
| print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; |
| |
| # ... as are eq and cmp ... |
| |
| print "peta\n" if "\x{3af}" eq pack("C", 0xdf); |
| print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; |
| |
| # ... but pack/unpack C are not affected, in case you still |
| # want to go back to your native encoding |
| |
| print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; |
| |
| =head1 KNOWN PROBLEMS |
| |
| =over |
| |
| =item literals in regex that are longer than 127 bytes |
| |
| For native multibyte encodings (either fixed or variable length), |
| the current implementation of the regular expressions may introduce |
| recoding errors for regular expression literals longer than 127 bytes. |
| |
| =item EBCDIC |
| |
| The encoding pragma is not supported on EBCDIC platforms. |
| (Porters who are willing and able to remove this limitation are |
| welcome.) |
| |
| =item format |
| |
| This pragma doesn't work well with format because PerlIO does not |
| get along very well with it. When format contains non-ascii |
| characters it prints funny or gets "wide character warnings". |
| To understand it, try the code below. |
| |
| # Save this one in utf8 |
| # replace *non-ascii* with a non-ascii string |
| my $camel; |
| format STDOUT = |
| *non-ascii*@>>>>>>> |
| $camel |
| . |
| $camel = "*non-ascii*"; |
| binmode(STDOUT=>':encoding(utf8)'); # bang! |
| write; # funny |
| print $camel, "\n"; # fine |
| |
| Without binmode this happens to work but without binmode, print() |
| fails instead of write(). |
| |
| At any rate, the very use of format is questionable when it comes to |
| unicode characters since you have to consider such things as character |
| width (i.e. double-width for ideographs) and directions (i.e. BIDI for |
| Arabic and Hebrew). |
| |
| =item Thread safety |
| |
| C<use encoding ...> is not thread-safe (i.e., do not use in threaded |
| applications). |
| |
| =back |
| |
| =head2 The Logic of :locale |
| |
| The logic of C<:locale> is as follows: |
| |
| =over 4 |
| |
| =item 1. |
| |
| If the platform supports the langinfo(CODESET) interface, the codeset |
| returned is used as the default encoding for the open pragma. |
| |
| =item 2. |
| |
| If 1. didn't work but we are under the locale pragma, the environment |
| variables LC_ALL and LANG (in that order) are matched for encodings |
| (the part after C<.>, if any), and if any found, that is used |
| as the default encoding for the open pragma. |
| |
| =item 3. |
| |
| If 1. and 2. didn't work, the environment variables LC_ALL and LANG |
| (in that order) are matched for anything looking like UTF-8, and if |
| any found, C<:utf8> is used as the default encoding for the open |
| pragma. |
| |
| =back |
| |
| If your locale environment variables (LC_ALL, LC_CTYPE, LANG) |
| contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), |
| the default encoding of your STDIN, STDOUT, and STDERR, and of |
| B<any subsequent file open>, is UTF-8. |
| |
| =head1 HISTORY |
| |
| This pragma first appeared in Perl 5.8.0. For features that require |
| 5.8.1 and better, see above. |
| |
| The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. |
| |
| =head1 SEE ALSO |
| |
| L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, |
| |
| Ch. 15 of C<Programming Perl (3rd Edition)> |
| by Larry Wall, Tom Christiansen, Jon Orwant; |
| O'Reilly & Associates; ISBN 0-596-00027-8 |
| |
| =cut |