gnulib/lib/propername.c - toolchain/make - Git at Google

 /* Localization of proper names.
    Copyright (C) 2006-2020 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2006.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 /* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
    the proper_name function might be candidate for attribute 'const'  */
 #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
 # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
 #endif

 #include <config.h>

 /* Specification.  */
 #include "propername.h"

 #include <ctype.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #if HAVE_ICONV
 # include <iconv.h>
 #endif

 #include "trim.h"
 #include "mbchar.h"
 #include "mbuiter.h"
 #include "localcharset.h"
 #include "c-strcase.h"
 #include "xstriconv.h"
 #include "xalloc.h"
 #include "gettext.h"


 /* Tests whether STRING contains trim (SUB), starting and ending at word
    boundaries.
    Here, instead of implementing Unicode Standard Annex #29 for determining
    word boundaries, we assume that trim (SUB) starts and ends with words and
    only test whether the part before it ends with a non-word and the part
    after it starts with a non-word.  */
 static bool
 mbsstr_trimmed_wordbounded (const char *string, const char *sub)
 {
   char *tsub = trim (sub);
   bool found = false;

   for (; *string != '\0';)
     {
       const char *tsub_in_string = mbsstr (string, tsub);
       if (tsub_in_string == NULL)
         break;
       else
         {
           if (MB_CUR_MAX > 1)
             {
               mbui_iterator_t string_iter;
               bool word_boundary_before;
               bool word_boundary_after;

               mbui_init (string_iter, string);
               word_boundary_before = true;
               if (mbui_cur_ptr (string_iter) < tsub_in_string)
                 {
                   mbchar_t last_char_before_tsub;
                   do
                     {
                       if (!mbui_avail (string_iter))
                         abort ();
                       last_char_before_tsub = mbui_cur (string_iter);
                       mbui_advance (string_iter);
                     }
                   while (mbui_cur_ptr (string_iter) < tsub_in_string);
                   if (mb_isalnum (last_char_before_tsub))
                     word_boundary_before = false;
                 }

               mbui_init (string_iter, tsub_in_string);
               {
                 mbui_iterator_t tsub_iter;

                 for (mbui_init (tsub_iter, tsub);
                      mbui_avail (tsub_iter);
                      mbui_advance (tsub_iter))
                   {
                     if (!mbui_avail (string_iter))
                       abort ();
                     mbui_advance (string_iter);
                   }
               }
               word_boundary_after = true;
               if (mbui_avail (string_iter))
                 {
                   mbchar_t first_char_after_tsub = mbui_cur (string_iter);
                   if (mb_isalnum (first_char_after_tsub))
                     word_boundary_after = false;
                 }

               if (word_boundary_before && word_boundary_after)
                 {
                   found = true;
                   break;
                 }

               mbui_init (string_iter, tsub_in_string);
               if (!mbui_avail (string_iter))
                 break;
               string = tsub_in_string + mb_len (mbui_cur (string_iter));
             }
           else
             {
               bool word_boundary_before;
               const char *p;
               bool word_boundary_after;

               word_boundary_before = true;
               if (string < tsub_in_string)
                 if (isalnum ((unsigned char) tsub_in_string[-1]))
                   word_boundary_before = false;

               p = tsub_in_string + strlen (tsub);
               word_boundary_after = true;
               if (*p != '\0')
                 if (isalnum ((unsigned char) *p))
                   word_boundary_after = false;

               if (word_boundary_before && word_boundary_after)
                 {
                   found = true;
                   break;
                 }

               if (*tsub_in_string == '\0')
                 break;
               string = tsub_in_string + 1;
             }
         }
     }
   free (tsub);
   return found;
 }

 /* Return the localization of NAME.  NAME is written in ASCII.  */

 const char *
 proper_name (const char *name)
 {
   /* See whether there is a translation.   */
   const char *translation = gettext (name);

   if (translation != name)
     {
       /* See whether the translation contains the original name.  */
       if (mbsstr_trimmed_wordbounded (translation, name))
         return translation;
       else
         {
           /* Return "TRANSLATION (NAME)".  */
           char *result =
             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);

           sprintf (result, "%s (%s)", translation, name);
           return result;
         }
     }
   else
     return name;
 }

 /* Return the localization of a name whose original writing is not ASCII.
    NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
    escape sequences.  NAME_ASCII is a fallback written only with ASCII
    characters.  */

 const char *
 proper_name_utf8 (const char *name_ascii, const char *name_utf8)
 {
   /* See whether there is a translation.   */
   const char *translation = gettext (name_ascii);

   /* Try to convert NAME_UTF8 to the locale encoding.  */
   const char *locale_code = locale_charset ();
   char *alloc_name_converted = NULL;
   char *alloc_name_converted_translit = NULL;
   const char *name_converted = NULL;
   const char *name_converted_translit = NULL;
   const char *name;

   if (c_strcasecmp (locale_code, "UTF-8") != 0)
     {
 #if HAVE_ICONV
       name_converted = alloc_name_converted =
         xstr_iconv (name_utf8, "UTF-8", locale_code);

 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
       && !defined __UCLIBC__) \
      || _LIBICONV_VERSION >= 0x0105
       {
         char *converted_translit;

         size_t len = strlen (locale_code);
         char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
         memcpy (locale_code_translit, locale_code, len);
         memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);

         converted_translit =
           xstr_iconv (name_utf8, "UTF-8", locale_code_translit);

         free (locale_code_translit);

         if (converted_translit != NULL)
           {
 #  if !_LIBICONV_VERSION
             /* Don't use the transliteration if it added question marks.
                glibc's transliteration falls back to question marks; libiconv's
                transliteration does not.
                mbschr is equivalent to strchr in this case.  */
             if (strchr (converted_translit, '?') != NULL)
               free (converted_translit);
             else
 #  endif
               name_converted_translit = alloc_name_converted_translit =
                 converted_translit;
           }
       }
 # endif
 #endif
     }
   else
     {
       name_converted = name_utf8;
       name_converted_translit = name_utf8;
     }

   /* The name in locale encoding.  */
   name = (name_converted != NULL ? name_converted :
           name_converted_translit != NULL ? name_converted_translit :
           name_ascii);

   /* See whether we have a translation.  Some translators have not understood
      that they should use the UTF-8 form of the name, if possible.  So if the
      translator provided a no-op translation, we ignore it.  */
   if (strcmp (translation, name_ascii) != 0)
     {
       /* See whether the translation contains the original name.  */
       if (mbsstr_trimmed_wordbounded (translation, name_ascii)
           || (name_converted != NULL
               && mbsstr_trimmed_wordbounded (translation, name_converted))
           || (name_converted_translit != NULL
               && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
         {
           if (alloc_name_converted != NULL)
             free (alloc_name_converted);
           if (alloc_name_converted_translit != NULL)
             free (alloc_name_converted_translit);
           return translation;
         }
       else
         {
           /* Return "TRANSLATION (NAME)".  */
           char *result =
             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);

           sprintf (result, "%s (%s)", translation, name);

           if (alloc_name_converted != NULL)
             free (alloc_name_converted);
           if (alloc_name_converted_translit != NULL)
             free (alloc_name_converted_translit);
           return result;
         }
     }
   else
     {
       if (alloc_name_converted != NULL && alloc_name_converted != name)
         free (alloc_name_converted);
       if (alloc_name_converted_translit != NULL
           && alloc_name_converted_translit != name)
         free (alloc_name_converted_translit);
       return name;
     }
 }

 #ifdef TEST1
 # include <locale.h>
 int
 main (int argc, char *argv[])
 {
   setlocale (LC_ALL, "");
   if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
     printf("found\n");
   return 0;
 }
 #endif

 #ifdef TEST2
 # include <locale.h>
 # include <stdio.h>
 int
 main (int argc, char *argv[])
 {
   setlocale (LC_ALL, "");
   printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));
   return 0;
 }
 #endif
	/* Localization of proper names.
	Copyright (C) 2006-2020 Free Software Foundation, Inc.
	Written by Bruno Haible <bruno@clisp.org>, 2006.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
	the proper_name function might be candidate for attribute 'const' */
	#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) \|\| 4 < __GNUC__
	# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
	#endif

	#include <config.h>

	/* Specification. */
	#include "propername.h"

	#include <ctype.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#if HAVE_ICONV
	# include <iconv.h>
	#endif

	#include "trim.h"
	#include "mbchar.h"
	#include "mbuiter.h"
	#include "localcharset.h"
	#include "c-strcase.h"
	#include "xstriconv.h"
	#include "xalloc.h"
	#include "gettext.h"


	/* Tests whether STRING contains trim (SUB), starting and ending at word
	boundaries.
	Here, instead of implementing Unicode Standard Annex #29 for determining
	word boundaries, we assume that trim (SUB) starts and ends with words and
	only test whether the part before it ends with a non-word and the part
	after it starts with a non-word. */
	static bool
	mbsstr_trimmed_wordbounded (const char string, const char sub)
	{
	char *tsub = trim (sub);
	bool found = false;

	for (; *string != '\0';)
	{
	const char *tsub_in_string = mbsstr (string, tsub);
	if (tsub_in_string == NULL)
	break;
	else
	{
	if (MB_CUR_MAX > 1)
	{
	mbui_iterator_t string_iter;
	bool word_boundary_before;
	bool word_boundary_after;

	mbui_init (string_iter, string);
	word_boundary_before = true;
	if (mbui_cur_ptr (string_iter) < tsub_in_string)
	{
	mbchar_t last_char_before_tsub;
	do
	{
	if (!mbui_avail (string_iter))
	abort ();
	last_char_before_tsub = mbui_cur (string_iter);
	mbui_advance (string_iter);
	}
	while (mbui_cur_ptr (string_iter) < tsub_in_string);
	if (mb_isalnum (last_char_before_tsub))
	word_boundary_before = false;
	}

	mbui_init (string_iter, tsub_in_string);
	{
	mbui_iterator_t tsub_iter;

	for (mbui_init (tsub_iter, tsub);
	mbui_avail (tsub_iter);
	mbui_advance (tsub_iter))
	{
	if (!mbui_avail (string_iter))
	abort ();
	mbui_advance (string_iter);
	}
	}
	word_boundary_after = true;
	if (mbui_avail (string_iter))
	{
	mbchar_t first_char_after_tsub = mbui_cur (string_iter);
	if (mb_isalnum (first_char_after_tsub))
	word_boundary_after = false;
	}

	if (word_boundary_before && word_boundary_after)
	{
	found = true;
	break;
	}

	mbui_init (string_iter, tsub_in_string);
	if (!mbui_avail (string_iter))
	break;
	string = tsub_in_string + mb_len (mbui_cur (string_iter));
	}
	else
	{
	bool word_boundary_before;
	const char *p;
	bool word_boundary_after;

	word_boundary_before = true;
	if (string < tsub_in_string)
	if (isalnum ((unsigned char) tsub_in_string[-1]))
	word_boundary_before = false;

	p = tsub_in_string + strlen (tsub);
	word_boundary_after = true;
	if (*p != '\0')
	if (isalnum ((unsigned char) *p))
	word_boundary_after = false;

	if (word_boundary_before && word_boundary_after)
	{
	found = true;
	break;
	}

	if (*tsub_in_string == '\0')
	break;
	string = tsub_in_string + 1;
	}
	}
	}
	free (tsub);
	return found;
	}

	/* Return the localization of NAME. NAME is written in ASCII. */

	const char *
	proper_name (const char *name)
	{
	/* See whether there is a translation. */
	const char *translation = gettext (name);

	if (translation != name)
	{
	/* See whether the translation contains the original name. */
	if (mbsstr_trimmed_wordbounded (translation, name))
	return translation;
	else
	{
	/* Return "TRANSLATION (NAME)". */
	char *result =
	XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);

	sprintf (result, "%s (%s)", translation, name);
	return result;
	}
	}
	else
	return name;
	}

	/* Return the localization of a name whose original writing is not ASCII.
	NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
	escape sequences. NAME_ASCII is a fallback written only with ASCII
	characters. */

	const char *
	proper_name_utf8 (const char name_ascii, const char name_utf8)
	{
	/* See whether there is a translation. */
	const char *translation = gettext (name_ascii);

	/* Try to convert NAME_UTF8 to the locale encoding. */
	const char *locale_code = locale_charset ();
	char *alloc_name_converted = NULL;
	char *alloc_name_converted_translit = NULL;
	const char *name_converted = NULL;
	const char *name_converted_translit = NULL;
	const char *name;

	if (c_strcasecmp (locale_code, "UTF-8") != 0)
	{
	#if HAVE_ICONV
	name_converted = alloc_name_converted =
	xstr_iconv (name_utf8, "UTF-8", locale_code);

	# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) \|\| __GLIBC__ > 2) \
	&& !defined __UCLIBC__) \
	\|\| _LIBICONV_VERSION >= 0x0105
	{
	char *converted_translit;

	size_t len = strlen (locale_code);
	char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
	memcpy (locale_code_translit, locale_code, len);
	memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);

	converted_translit =
	xstr_iconv (name_utf8, "UTF-8", locale_code_translit);

	free (locale_code_translit);

	if (converted_translit != NULL)
	{
	# if !_LIBICONV_VERSION
	/* Don't use the transliteration if it added question marks.
	glibc's transliteration falls back to question marks; libiconv's
	transliteration does not.
	mbschr is equivalent to strchr in this case. */
	if (strchr (converted_translit, '?') != NULL)
	free (converted_translit);
	else
	# endif
	name_converted_translit = alloc_name_converted_translit =
	converted_translit;
	}
	}
	# endif
	#endif
	}
	else
	{
	name_converted = name_utf8;
	name_converted_translit = name_utf8;
	}

	/* The name in locale encoding. */
	name = (name_converted != NULL ? name_converted :
	name_converted_translit != NULL ? name_converted_translit :
	name_ascii);

	/* See whether we have a translation. Some translators have not understood
	that they should use the UTF-8 form of the name, if possible. So if the
	translator provided a no-op translation, we ignore it. */
	if (strcmp (translation, name_ascii) != 0)
	{
	/* See whether the translation contains the original name. */
	if (mbsstr_trimmed_wordbounded (translation, name_ascii)
	\|\| (name_converted != NULL
	&& mbsstr_trimmed_wordbounded (translation, name_converted))
	\|\| (name_converted_translit != NULL
	&& mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
	{
	if (alloc_name_converted != NULL)
	free (alloc_name_converted);
	if (alloc_name_converted_translit != NULL)
	free (alloc_name_converted_translit);
	return translation;
	}
	else
	{
	/* Return "TRANSLATION (NAME)". */
	char *result =
	XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);

	sprintf (result, "%s (%s)", translation, name);

	if (alloc_name_converted != NULL)
	free (alloc_name_converted);
	if (alloc_name_converted_translit != NULL)
	free (alloc_name_converted_translit);
	return result;
	}
	}
	else
	{
	if (alloc_name_converted != NULL && alloc_name_converted != name)
	free (alloc_name_converted);
	if (alloc_name_converted_translit != NULL
	&& alloc_name_converted_translit != name)
	free (alloc_name_converted_translit);
	return name;
	}
	}

	#ifdef TEST1
	# include <locale.h>
	int
	main (int argc, char *argv[])
	{
	setlocale (LC_ALL, "");
	if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
	printf("found\n");
	return 0;
	}
	#endif

	#ifdef TEST2
	# include <locale.h>
	# include <stdio.h>
	int
	main (int argc, char *argv[])
	{
	setlocale (LC_ALL, "");
	printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));
	return 0;
	}
	#endif