perl-5.16.2/cpan/Text-Soundex/Soundex.pm - toolchain/perl - Git at Google

 # -*- perl -*-

 # (c) Copyright 1998-2007 by Mark Mielke
 #
 # Freedom to use these sources for whatever you want, as long as credit
 # is given where credit is due, is hereby granted. You may make modifications
 # where you see fit but leave this copyright somewhere visible. As well, try
 # to initial any changes you make so that if I like the changes I can
 # incorporate them into later versions.
 #
 #      - Mark Mielke <mark@mielke.cc>
 #

 package Text::Soundex;
 require 5.006;

 use Exporter ();
 use XSLoader ();

 use strict;

 our $VERSION   = '3.03_01';
 our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
                     $soundex_nocode);
 our @EXPORT    = qw(soundex soundex_nara $soundex_nocode);
 our @ISA       = qw(Exporter);

 our $nocode;

 # Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
 # For now, this part of the interface is exported and maintained.
 # In the feature, $soundex_nocode will be deprecated.
 *Text::Soundex::soundex_nocode = \$nocode;

 sub soundex_noxs
 {
     # Original Soundex algorithm

     my @results = map {
         my $code = uc($_);
         $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

 	if (length($code)) {
             my $firstchar = substr($code, 0, 1);
 	    $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
                        [0000000000000000111111112222222222222222333344555566]s;
 	    ($code = substr($code, 1)) =~ tr/0//d;
 	    substr($firstchar . $code . '000', 0, 4);
 	} else {
 	    $nocode;
 	}
     } @_;

     wantarray ? @results : $results[0];
 }

 sub soundex_nara
 {
     # US census (NARA) algorithm.

     my @results = map {
 	my $code = uc($_);
         $code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

 	if (length($code)) {
             my $firstchar = substr($code, 0, 1);
 	    $code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
                        [0000990000009900111111112222222222222222333344555566]s;
             $code =~ s/(.)9\1/$1/gs;
 	    ($code = substr($code, 1)) =~ tr/09//d;
 	    substr($firstchar . $code . '000', 0, 4);
 	} else {
 	    $nocode
 	}
     } @_;

     wantarray ? @results : $results[0];
 }

 sub soundex_unicode
 {
     require Text::Unidecode unless defined &Text::Unidecode::unidecode;
     soundex(Text::Unidecode::unidecode(@_));
 }

 sub soundex_nara_unicode
 {
     require Text::Unidecode unless defined &Text::Unidecode::unidecode;
     soundex_nara(Text::Unidecode::unidecode(@_));
 }

 eval { XSLoader::load(__PACKAGE__, $VERSION) };

 if (defined(&soundex_xs)) {
     *soundex = \&soundex_xs;
 } else {
     *soundex = \&soundex_noxs;
     *soundex_xs = sub {
         require Carp;
         Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
                     "could not be loaded");
     };
 }

 1;

 __END__

 # Implementation of the soundex algorithm.
 #
 # Some of this documention was written by Mike Stok.
 #
 # Examples:
 #
 # Euler, Ellery -> E460
 # Gauss, Ghosh -> G200
 # Hilbert, Heilbronn -> H416
 # Knuth, Kant -> K530
 # Lloyd, Ladd -> L300
 # Lukasiewicz, Lissajous -> L222
 #

 =head1 NAME

 Text::Soundex - Implementation of the soundex algorithm.

 =head1 SYNOPSIS

   use Text::Soundex;

   # Original algorithm.
   $code = soundex($name);    # Get the soundex code for a name.
   @codes = soundex(@names);  # Get the list of codes for a list of names.

   # American Soundex variant (NARA) - Used for US census data.
   $code = soundex_nara($name);    # Get the soundex code for a name.
   @codes = soundex_nara(@names);  # Get the list of codes for a list of names.

   # Redefine the value that soundex() will return if the input string
   # contains no identifiable sounds within it.
   $Text::Soundex::nocode = 'Z000';

 =head1 DESCRIPTION

 Soundex is a phonetic algorithm for indexing names by sound, as
 pronounced in English. The goal is for names with the same
 pronunciation to be encoded to the same representation so that they
 can be matched despite minor differences in spelling. Soundex is the
 most widely known of all phonetic algorithms and is often used
 (incorrectly) as a synonym for "phonetic algorithm". Improvements to
 Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
 2007)

 This module implements the original soundex algorithm developed by
 Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
 as a variation called "American Soundex" used for US census data, and
 current maintained by the National Archives and Records Administration
 (NARA).

 The soundex algorithm may be recognized from Donald Knuth's
 B<The Art of Computer Programming>. The algorithm described by
 Knuth is the NARA algorithm.

 The value returned for strings which have no soundex encoding is
 defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
 however values such as C<'Z000'> are commonly used alternatives.

 For backward compatibility with older versions of this module the
 C<$Text::Soundex::nocode> is exported into the caller's namespace as
 C<$soundex_nocode>.

 In scalar context, C<soundex()> returns the soundex code of its first
 argument. In list context, a list is returned in which each element is the
 soundex code for the corresponding argument passed to C<soundex()>. For
 example, the following code assigns @codes the value C<('M200', 'S320')>:

    @codes = soundex qw(Mike Stok);

 To use C<Text::Soundex> to generate codes that can be used to search one
 of the publically available US Censuses, a variant of the soundex
 algorithm must be used:

     use Text::Soundex;
     $code = soundex_nara($name);

 An example of where these algorithm differ follows:

     use Text::Soundex;
     print soundex("Ashcraft"), "\n";       # prints: A226
     print soundex_nara("Ashcraft"), "\n";  # prints: A261

 =head1 EXAMPLES

 Donald Knuth's examples of names and the soundex codes they map to
 are listed below:

   Euler, Ellery -> E460
   Gauss, Ghosh -> G200
   Hilbert, Heilbronn -> H416
   Knuth, Kant -> K530
   Lloyd, Ladd -> L300
   Lukasiewicz, Lissajous -> L222

 so:

   $code = soundex 'Knuth';         # $code contains 'K530'
   @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'

 =head1 LIMITATIONS

 As the soundex algorithm was originally used a B<long> time ago in the US
 it considers only the English alphabet and pronunciation. In particular,
 non-ASCII characters will be ignored. The recommended method of dealing
 with characters that have accents, or other unicode characters, is to use
 the Text::Unidecode module available from CPAN. Either use the module
 explicitly:

     use Text::Soundex;
     use Text::Unidecode;

     print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"

 Or use the convenient wrapper routine:

     use Text::Soundex 'soundex_unicode';

     print soundex_unicode("Fran\xE7ais"), "\n";    # Prints "F652\n"

 Since the soundex algorithm maps a large space (strings of arbitrary
 length) onto a small space (single letter plus 3 digits) no inference
 can be made about the similarity of two strings which end up with the
 same soundex code.  For example, both C<Hilbert> and C<Heilbronn> end
 up with a soundex code of C<H416>.

 =head1 MAINTAINER

 This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).

 =head1 HISTORY

 Version 3 is a significant update to provide support for versions of
 Perl later than Perl 5.004. Specifically, the XS version of the
 soundex() subroutine understands strings that are encoded using UTF-8
 (unicode strings).

 Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
 to improve the speed of the subroutines. The XS version of the soundex()
 subroutine was introduced in 2.00.

 Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
 and was included into the Perl core library set.

 Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
 algorithm to be included. The NARA soundex page can be viewed at:
 C<http://www.nara.gov/genealogy/soundex/soundex.html>

 Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
 supplied ideas and spotted mistakes for v1.x.

 =cut
	# -- perl --

	# (c) Copyright 1998-2007 by Mark Mielke
	#
	# Freedom to use these sources for whatever you want, as long as credit
	# is given where credit is due, is hereby granted. You may make modifications
	# where you see fit but leave this copyright somewhere visible. As well, try
	# to initial any changes you make so that if I like the changes I can
	# incorporate them into later versions.
	#
	# - Mark Mielke <mark@mielke.cc>
	#

	package Text::Soundex;
	require 5.006;

	use Exporter ();
	use XSLoader ();

	use strict;

	our $VERSION = '3.03_01';
	our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
	$soundex_nocode);
	our @EXPORT = qw(soundex soundex_nara $soundex_nocode);
	our @ISA = qw(Exporter);

	our $nocode;

	# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
	# For now, this part of the interface is exported and maintained.
	# In the feature, $soundex_nocode will be deprecated.
	*Text::Soundex::soundex_nocode = \$nocode;

	sub soundex_noxs
	{
	# Original Soundex algorithm

	my @results = map {
	my $code = uc($_);
	$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

	if (length($code)) {
	my $firstchar = substr($code, 0, 1);
	$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
	[0000000000000000111111112222222222222222333344555566]s;
	($code = substr($code, 1)) =~ tr/0//d;
	substr($firstchar . $code . '000', 0, 4);
	} else {
	$nocode;
	}
	} @_;

	wantarray ? @results : $results[0];
	}

	sub soundex_nara
	{
	# US census (NARA) algorithm.

	my @results = map {
	my $code = uc($_);
	$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;

	if (length($code)) {
	my $firstchar = substr($code, 0, 1);
	$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
	[0000990000009900111111112222222222222222333344555566]s;
	$code =~ s/(.)9\1/$1/gs;
	($code = substr($code, 1)) =~ tr/09//d;
	substr($firstchar . $code . '000', 0, 4);
	} else {
	$nocode
	}
	} @_;

	wantarray ? @results : $results[0];
	}

	sub soundex_unicode
	{
	require Text::Unidecode unless defined &Text::Unidecode::unidecode;
	soundex(Text::Unidecode::unidecode(@_));
	}

	sub soundex_nara_unicode
	{
	require Text::Unidecode unless defined &Text::Unidecode::unidecode;
	soundex_nara(Text::Unidecode::unidecode(@_));
	}

	eval { XSLoader::load(__PACKAGE__, $VERSION) };

	if (defined(&soundex_xs)) {
	*soundex = \&soundex_xs;
	} else {
	*soundex = \&soundex_noxs;
	*soundex_xs = sub {
	require Carp;
	Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
	"could not be loaded");
	};
	}

	1;

	__END__

	# Implementation of the soundex algorithm.
	#
	# Some of this documention was written by Mike Stok.
	#
	# Examples:
	#
	# Euler, Ellery -> E460
	# Gauss, Ghosh -> G200
	# Hilbert, Heilbronn -> H416
	# Knuth, Kant -> K530
	# Lloyd, Ladd -> L300
	# Lukasiewicz, Lissajous -> L222
	#

	=head1 NAME

	Text::Soundex - Implementation of the soundex algorithm.

	=head1 SYNOPSIS

	use Text::Soundex;

	# Original algorithm.
	$code = soundex($name); # Get the soundex code for a name.
	@codes = soundex(@names); # Get the list of codes for a list of names.

	# American Soundex variant (NARA) - Used for US census data.
	$code = soundex_nara($name); # Get the soundex code for a name.
	@codes = soundex_nara(@names); # Get the list of codes for a list of names.

	# Redefine the value that soundex() will return if the input string
	# contains no identifiable sounds within it.
	$Text::Soundex::nocode = 'Z000';

	=head1 DESCRIPTION

	Soundex is a phonetic algorithm for indexing names by sound, as
	pronounced in English. The goal is for names with the same
	pronunciation to be encoded to the same representation so that they
	can be matched despite minor differences in spelling. Soundex is the
	most widely known of all phonetic algorithms and is often used
	(incorrectly) as a synonym for "phonetic algorithm". Improvements to
	Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
	2007)

	This module implements the original soundex algorithm developed by
	Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
	as a variation called "American Soundex" used for US census data, and
	current maintained by the National Archives and Records Administration
	(NARA).

	The soundex algorithm may be recognized from Donald Knuth's
	B<The Art of Computer Programming>. The algorithm described by
	Knuth is the NARA algorithm.

	The value returned for strings which have no soundex encoding is
	defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
	however values such as C<'Z000'> are commonly used alternatives.

	For backward compatibility with older versions of this module the
	C<$Text::Soundex::nocode> is exported into the caller's namespace as
	C<$soundex_nocode>.

	In scalar context, C<soundex()> returns the soundex code of its first
	argument. In list context, a list is returned in which each element is the
	soundex code for the corresponding argument passed to C<soundex()>. For
	example, the following code assigns @codes the value C<('M200', 'S320')>:

	@codes = soundex qw(Mike Stok);

	To use C<Text::Soundex> to generate codes that can be used to search one
	of the publically available US Censuses, a variant of the soundex
	algorithm must be used:

	use Text::Soundex;
	$code = soundex_nara($name);

	An example of where these algorithm differ follows:

	use Text::Soundex;
	print soundex("Ashcraft"), "\n"; # prints: A226
	print soundex_nara("Ashcraft"), "\n"; # prints: A261

	=head1 EXAMPLES

	Donald Knuth's examples of names and the soundex codes they map to
	are listed below:

	Euler, Ellery -> E460
	Gauss, Ghosh -> G200
	Hilbert, Heilbronn -> H416
	Knuth, Kant -> K530
	Lloyd, Ladd -> L300
	Lukasiewicz, Lissajous -> L222

	so:

	$code = soundex 'Knuth'; # $code contains 'K530'
	@list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'

	=head1 LIMITATIONS

	As the soundex algorithm was originally used a B<long> time ago in the US
	it considers only the English alphabet and pronunciation. In particular,
	non-ASCII characters will be ignored. The recommended method of dealing
	with characters that have accents, or other unicode characters, is to use
	the Text::Unidecode module available from CPAN. Either use the module
	explicitly:

	use Text::Soundex;
	use Text::Unidecode;

	print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"

	Or use the convenient wrapper routine:

	use Text::Soundex 'soundex_unicode';

	print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n"

	Since the soundex algorithm maps a large space (strings of arbitrary
	length) onto a small space (single letter plus 3 digits) no inference
	can be made about the similarity of two strings which end up with the
	same soundex code. For example, both C<Hilbert> and C<Heilbronn> end
	up with a soundex code of C<H416>.

	=head1 MAINTAINER

	This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).

	=head1 HISTORY

	Version 3 is a significant update to provide support for versions of
	Perl later than Perl 5.004. Specifically, the XS version of the
	soundex() subroutine understands strings that are encoded using UTF-8
	(unicode strings).

	Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
	to improve the speed of the subroutines. The XS version of the soundex()
	subroutine was introduced in 2.00.

	Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
	and was included into the Perl core library set.

	Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
	algorithm to be included. The NARA soundex page can be viewed at:
	C<http://www.nara.gov/genealogy/soundex/soundex.html>

	Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
	supplied ideas and spotted mistakes for v1.x.

	=cut