gnulib/lib/uniname/uniname.c - toolchain/make - Git at Google

 /* Association between Unicode characters and their names.
    Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published
    by the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #include <config.h>

 /* Specification.  */
 #include "uniname.h"

 #include <assert.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>

 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


 /* Table of Unicode character names, derived from UnicodeData.txt.
    This table is generated in a way to minimize the memory footprint:
      1. its compiled size is small (less than 350 KB),
      2. it resides entirely in the text or read-only data segment of the
         executable or shared library: the table contains only immediate
         integers, no pointers, and the functions don't do heap allocation.
  */
 #include "uninames.h"
 /* It contains:
   static const char unicode_name_words[36303] = ...;
   #define UNICODE_CHARNAME_NUM_WORDS 6260
   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
   #define UNICODE_CHARNAME_WORD_HANGUL 3902
   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
   #define UNICODE_CHARNAME_WORD_CJK 417
   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
   static const uint16_t unicode_names[68940] = ...;
   static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
   static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
   #define UNICODE_CHARNAME_MAX_LENGTH 83
   #define UNICODE_CHARNAME_MAX_WORDS 13
   static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
 */

 /* Returns the word with a given index.  */
 static const char *
 unicode_name_word (unsigned int index, unsigned int *lengthp)
 {
   unsigned int i1;
   unsigned int i2;
   unsigned int i;

   assert (index < UNICODE_CHARNAME_NUM_WORDS);

   /* Binary search for i with
        unicode_name_by_length[i].ind_offset <= index
      and
        index < unicode_name_by_length[i+1].ind_offset
    */

   i1 = 0;
   i2 = SIZEOF (unicode_name_by_length) - 1;
   while (i2 - i1 > 1)
     {
       unsigned int i = (i1 + i2) >> 1;
       if (unicode_name_by_length[i].ind_offset <= index)
         i1 = i;
       else
         i2 = i;
     }
   i = i1;
   assert (unicode_name_by_length[i].ind_offset <= index
           && index < unicode_name_by_length[i+1].ind_offset);
   *lengthp = i;
   return &unicode_name_words[unicode_name_by_length[i].extra_offset
                              + (index-unicode_name_by_length[i].ind_offset)*i];
 }

 /* Looks up the index of a word.  */
 static int
 unicode_name_word_lookup (const char *word, unsigned int length)
 {
   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
     {
       /* Binary search among the words of given length.  */
       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
       unsigned int i0 = unicode_name_by_length[length].ind_offset;
       unsigned int i1 = i0;
       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
       while (i2 - i1 > 0)
         {
           unsigned int i = (i1 + i2) >> 1;
           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
           const char *w = word;
           unsigned int n = length;
           for (;;)
             {
               if (*p < *w)
                 {
                   if (i1 == i)
                     return -1;
                   /* Note here: i1 < i < i2.  */
                   i1 = i;
                   break;
                 }
               if (*p > *w)
                 {
                   /* Note here: i1 <= i < i2.  */
                   i2 = i;
                   break;
                 }
               p++; w++; n--;
               if (n == 0)
                 return i;
             }
         }
     }
   return -1;
 }

 #define UNINAME_INVALID_INDEX UINT16_MAX

 /* Looks up the internal index of a Unicode character.  */
 static uint16_t
 unicode_code_to_index (ucs4_t c)
 {
   /* Binary search in unicode_ranges.  */
   unsigned int i1 = 0;
   unsigned int i2 = SIZEOF (unicode_ranges);

   for (;;)
     {
       unsigned int i = (i1 + i2) >> 1;
       ucs4_t start_code =
         unicode_ranges[i].index + unicode_ranges[i].gap;
       ucs4_t end_code =
         start_code + unicode_ranges[i].length - 1;

       if (start_code <= c && c <= end_code)
         return c - unicode_ranges[i].gap;

       if (end_code < c)
         {
           if (i1 == i)
             break;
           /* Note here: i1 < i < i2.  */
           i1 = i;
         }
       else if (c < start_code)
         {
           if (i2 == i)
             break;
           /* Note here: i1 <= i < i2.  */
           i2 = i;
         }
     }
   return UNINAME_INVALID_INDEX;
 }

 /* Looks up the codepoint of a Unicode character, from the given
    internal index.  */
 static ucs4_t
 unicode_index_to_code (uint16_t index)
 {
   /* Binary search in unicode_ranges.  */
   unsigned int i1 = 0;
   unsigned int i2 = SIZEOF (unicode_ranges);

   for (;;)
     {
       unsigned int i = (i1 + i2) >> 1;
       uint16_t start_index = unicode_ranges[i].index;
       uint16_t end_index = start_index + unicode_ranges[i].length - 1;

       if (start_index <= index && index <= end_index)
         return index + unicode_ranges[i].gap;

       if (end_index < index)
         {
           if (i1 == i)
             break;
           /* Note here: i1 < i < i2.  */
           i1 = i;
         }
       else if (index < start_index)
         {
           if (i2 == i)
             break;
           /* Note here: i1 <= i < i2.  */
           i2 = i;
         }
     }
   return UNINAME_INVALID;
 }


 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
    sections 3.11 and 4.4.  */
 static const char jamo_initial_short_name[19][3] =
 {
   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
   "C", "K", "T", "P", "H"
 };
 static const char jamo_medial_short_name[21][4] =
 {
   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 };
 static const char jamo_final_short_name[28][3] =
 {
   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 };

 /* Looks up the name of a Unicode character, in uppercase ASCII.
    Returns the filled buf, or NULL if the character does not have a name.  */
 char *
 unicode_character_name (ucs4_t c, char *buf)
 {
   if (c >= 0xAC00 && c <= 0xD7A3)
     {
       /* Special case for Hangul syllables. Keeps the tables small.  */
       char *ptr;
       unsigned int tmp;
       unsigned int index1;
       unsigned int index2;
       unsigned int index3;
       const char *q;

       /* buf needs to have at least 16 + 7 bytes here.  */
       memcpy (buf, "HANGUL SYLLABLE ", 16);
       ptr = buf + 16;

       tmp = c - 0xAC00;
       index3 = tmp % 28; tmp = tmp / 28;
       index2 = tmp % 21; tmp = tmp / 21;
       index1 = tmp;

       q = jamo_initial_short_name[index1];
       while (*q != '\0')
         *ptr++ = *q++;
       q = jamo_medial_short_name[index2];
       while (*q != '\0')
         *ptr++ = *q++;
       q = jamo_final_short_name[index3];
       while (*q != '\0')
         *ptr++ = *q++;
       *ptr = '\0';
       return buf;
     }
   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
     {
       /* Special case for CJK compatibility ideographs. Keeps the tables
          small.  */
       char *ptr;
       int i;

       /* buf needs to have at least 28 + 5 bytes here.  */
       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
       ptr = buf + 28;

       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
         {
           unsigned int x = (c >> i) & 0xf;
           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
         }
       *ptr = '\0';
       return buf;
     }
   else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
     {
       /* Special case for variation selectors. Keeps the tables
          small.  */

       /* buf needs to have at least 19 + 3 bytes here.  */
       sprintf (buf, "VARIATION SELECTOR-%d",
                c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
       return buf;
     }
   else
     {
       uint16_t index = unicode_code_to_index (c);
       const uint16_t *words = NULL;

       if (index != UNINAME_INVALID_INDEX)
         {
           /* Binary search in unicode_code_to_name.  */
           unsigned int i1 = 0;
           unsigned int i2 = SIZEOF (unicode_index_to_name);
           for (;;)
             {
               unsigned int i = (i1 + i2) >> 1;
               if (unicode_index_to_name[i].index == index)
                 {
                   words = &unicode_names[unicode_index_to_name[i].name];
                   break;
                 }
               else if (unicode_index_to_name[i].index < index)
                 {
                   if (i1 == i)
                     {
                       words = NULL;
                       break;
                     }
                   /* Note here: i1 < i < i2.  */
                   i1 = i;
                 }
               else if (unicode_index_to_name[i].index > index)
                 {
                   if (i2 == i)
                     {
                       words = NULL;
                       break;
                     }
                   /* Note here: i1 <= i < i2.  */
                   i2 = i;
                 }
             }
         }
       if (words != NULL)
         {
           /* Found it in unicode_index_to_name. Now concatenate the words.  */
           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
           char *ptr = buf;
           for (;;)
             {
               unsigned int wordlen;
               const char *word = unicode_name_word (*words>>1, &wordlen);
               do
                 *ptr++ = *word++;
               while (--wordlen > 0);
               if ((*words & 1) == 0)
                 break;
               *ptr++ = ' ';
               words++;
             }
           *ptr = '\0';
           return buf;
         }
       return NULL;
     }
 }

 /* Looks up the Unicode character with a given name, in upper- or lowercase
    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 ucs4_t
 unicode_name_character (const char *name)
 {
   unsigned int len = strlen (name);
   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
     {
       /* Test for "word1 word2 ..." syntax.  */
       char buf[UNICODE_CHARNAME_MAX_LENGTH];
       char *ptr = buf;
       for (;;)
         {
           char c = *name++;
           if (!(c >= ' ' && c <= '~'))
             break;
           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
           if (--len == 0)
             goto filled_buf;
         }
       if (false)
       filled_buf:
         {
           {
             /* Special case for variation selector aliases. Keeps the
                tables small.  */
             const char *p1 = buf;
             if (ptr >= buf + 3 && *p1++ == 'V')
               {
                 if (*p1++ == 'S')
                   {
                     if (*p1 != '0')
                       {
                         unsigned int c = 0;
                         for (;;)
                           {
                             if (*p1 >= '0' && *p1 <= '9')
                               c += (*p1 - '0');
                             p1++;
                             if (p1 == ptr)
                               {
                                 if (c >= 1 && c <= 16)
                                   return c - 1 + 0xFE00;
                                 else if (c >= 17 && c <= 256)
                                   return c - 17 + 0xE0100;
                                 else
                                   break;
                               }
                             c = c * 10;
                           }
                       }
                   }
               }
           }
           {
             /* Convert the constituents to uint16_t words.  */
             uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
             uint16_t *wordptr = words;
             {
               const char *p1 = buf;
               for (;;)
                 {
                   {
                     int word;
                     const char *p2 = p1;
                     while (p2 < ptr && *p2 != ' ')
                       p2++;
                     word = unicode_name_word_lookup (p1, p2 - p1);
                     if (word < 0)
                       break;
                     if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
                       break;
                     *wordptr++ = word;
                     if (p2 == ptr)
                       goto filled_words;
                     p1 = p2 + 1;
                   }
                   /* Special case for Hangul syllables. Keeps the tables small. */
                   if (wordptr == &words[2]
                       && words[0] == UNICODE_CHARNAME_WORD_HANGUL
                       && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
                     {
                       /* Split the last word [p1..ptr) into three parts:
                            1) [BCDGHJKMNPRST]
                            2) [AEIOUWY]
                            3) [BCDGHIJKLMNPST]
                        */
                       const char *p2;
                       const char *p3;
                       const char *p4;

                       p2 = p1;
                       while (p2 < ptr
                              && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
                                  || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
                                  || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
                                  || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
                                  || *p2 == 'T'))
                         p2++;
                       p3 = p2;
                       while (p3 < ptr
                              && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
                                  || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
                                  || *p3 == 'Y'))
                         p3++;
                       p4 = p3;
                       while (p4 < ptr
                              && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
                                  || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
                                  || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
                                  || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
                                  || *p4 == 'S' || *p4 == 'T'))
                         p4++;
                       if (p4 == ptr)
                         {
                           unsigned int n1 = p2 - p1;
                           unsigned int n2 = p3 - p2;
                           unsigned int n3 = p4 - p3;

                           if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
                             {
                               unsigned int index1;

                               for (index1 = 0; index1 < 19; index1++)
                                 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
                                     && jamo_initial_short_name[index1][n1] == '\0')
                                   {
                                     unsigned int index2;

                                     for (index2 = 0; index2 < 21; index2++)
                                       if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
                                           && jamo_medial_short_name[index2][n2] == '\0')
                                         {
                                           unsigned int index3;

                                           for (index3 = 0; index3 < 28; index3++)
                                             if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
                                                 && jamo_final_short_name[index3][n3] == '\0')
                                               {
                                                 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
                                               }
                                           break;
                                         }
                                     break;
                                   }
                             }
                         }
                     }
                   /* Special case for CJK compatibility ideographs. Keeps the
                      tables small.  */
                   if (wordptr == &words[2]
                       && words[0] == UNICODE_CHARNAME_WORD_CJK
                       && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
                       && p1 + 14 <= ptr
                       && p1 + 15 >= ptr
                       && memcmp (p1, "IDEOGRAPH-", 10) == 0)
                     {
                       const char *p2 = p1 + 10;

                       if (*p2 != '0')
                         {
                           unsigned int c = 0;

                           for (;;)
                             {
                               if (*p2 >= '0' && *p2 <= '9')
                                 c += (*p2 - '0');
                               else if (*p2 >= 'A' && *p2 <= 'F')
                                 c += (*p2 - 'A' + 10);
                               else
                                 break;
                               p2++;
                               if (p2 == ptr)
                                 {
                                   if ((c >= 0xF900 && c <= 0xFA2D)
                                       || (c >= 0xFA30 && c <= 0xFA6A)
                                       || (c >= 0xFA70 && c <= 0xFAD9)
                                       || (c >= 0x2F800 && c <= 0x2FA1D))
                                     return c;
                                   else
                                     break;
                                 }
                               c = c << 4;
                             }
                         }
                     }
                   /* Special case for variation selectors. Keeps the
                      tables small.  */
                   if (wordptr == &words[1]
                       && words[0] == UNICODE_CHARNAME_WORD_VARIATION
                       && p1 + 10 <= ptr
                       && p1 + 12 >= ptr
                       && memcmp (p1, "SELECTOR-", 9) == 0)
                     {
                       const char *p2 = p1 + 9;

                       if (*p2 != '0')
                         {
                           unsigned int c = 0;

                           for (;;)
                             {
                               if (*p2 >= '0' && *p2 <= '9')
                                 c += (*p2 - '0');
                               p2++;
                               if (p2 == ptr)
                                 {
                                   if (c >= 1 && c <= 16)
                                     return c - 1 + 0xFE00;
                                   else if (c >= 17 && c <= 256)
                                     return c - 17 + 0xE0100;
                                   else
                                     break;
                                 }
                               c = c * 10;
                             }
                         }
                     }
                 }
             }
             if (false)
             filled_words:
               {
                 /* Multiply by 2, to simplify later comparisons.  */
                 unsigned int words_length = wordptr - words;
                 {
                   int i = words_length - 1;
                   words[i] = 2 * words[i];
                   for (; --i >= 0; )
                     words[i] = 2 * words[i] + 1;
                 }
                 /* Binary search in unicode_name_to_index.  */
                 {
                   unsigned int i1 = 0;
                   unsigned int i2 = SIZEOF (unicode_name_to_index);
                   for (;;)
                     {
                       unsigned int i = (i1 + i2) >> 1;
                       const uint16_t *w = words;
                       const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
                       unsigned int n = words_length;
                       for (;;)
                         {
                           if (*p < *w)
                             {
                               if (i1 == i)
                                 goto name_not_found;
                               /* Note here: i1 < i < i2.  */
                               i1 = i;
                               break;
                             }
                           else if (*p > *w)
                             {
                               if (i2 == i)
                                 goto name_not_found;
                               /* Note here: i1 <= i < i2.  */
                               i2 = i;
                               break;
                             }
                           p++; w++; n--;
                           if (n == 0)
                             return unicode_index_to_code (unicode_name_to_index[i].index);
                         }
                     }
                 }
               name_not_found: ;
               }
           }
         }
     }
   return UNINAME_INVALID;
 }
	/* Association between Unicode characters and their names.
	Copyright (C) 2000-2002, 2005-2007, 2009-2020 Free Software Foundation, Inc.

	This program is free software: you can redistribute it and/or modify it
	under the terms of the GNU Lesser General Public License as published
	by the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	#include <config.h>

	/* Specification. */
	#include "uniname.h"

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


	/* Table of Unicode character names, derived from UnicodeData.txt.
	This table is generated in a way to minimize the memory footprint:
	1. its compiled size is small (less than 350 KB),
	2. it resides entirely in the text or read-only data segment of the
	executable or shared library: the table contains only immediate
	integers, no pointers, and the functions don't do heap allocation.
	*/
	#include "uninames.h"
	/* It contains:
	static const char unicode_name_words[36303] = ...;
	#define UNICODE_CHARNAME_NUM_WORDS 6260
	static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
	#define UNICODE_CHARNAME_WORD_HANGUL 3902
	#define UNICODE_CHARNAME_WORD_SYLLABLE 4978
	#define UNICODE_CHARNAME_WORD_CJK 417
	#define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
	static const uint16_t unicode_names[68940] = ...;
	static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
	static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
	#define UNICODE_CHARNAME_MAX_LENGTH 83
	#define UNICODE_CHARNAME_MAX_WORDS 13
	static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
	*/

	/* Returns the word with a given index. */
	static const char *
	unicode_name_word (unsigned int index, unsigned int *lengthp)
	{
	unsigned int i1;
	unsigned int i2;
	unsigned int i;

	assert (index < UNICODE_CHARNAME_NUM_WORDS);

	/* Binary search for i with
	unicode_name_by_length[i].ind_offset <= index
	and
	index < unicode_name_by_length[i+1].ind_offset
	*/

	i1 = 0;
	i2 = SIZEOF (unicode_name_by_length) - 1;
	while (i2 - i1 > 1)
	{
	unsigned int i = (i1 + i2) >> 1;
	if (unicode_name_by_length[i].ind_offset <= index)
	i1 = i;
	else
	i2 = i;
	}
	i = i1;
	assert (unicode_name_by_length[i].ind_offset <= index
	&& index < unicode_name_by_length[i+1].ind_offset);
	*lengthp = i;
	return &unicode_name_words[unicode_name_by_length[i].extra_offset
	+ (index-unicode_name_by_length[i].ind_offset)*i];
	}

	/* Looks up the index of a word. */
	static int
	unicode_name_word_lookup (const char *word, unsigned int length)
	{
	if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
	{
	/* Binary search among the words of given length. */
	unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
	unsigned int i0 = unicode_name_by_length[length].ind_offset;
	unsigned int i1 = i0;
	unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
	while (i2 - i1 > 0)
	{
	unsigned int i = (i1 + i2) >> 1;
	const char p = &unicode_name_words[extra_offset + (i-i0)length];
	const char *w = word;
	unsigned int n = length;
	for (;;)
	{
	if (p < w)
	{
	if (i1 == i)
	return -1;
	/* Note here: i1 < i < i2. */
	i1 = i;
	break;
	}
	if (p > w)
	{
	/* Note here: i1 <= i < i2. */
	i2 = i;
	break;
	}
	p++; w++; n--;
	if (n == 0)
	return i;
	}
	}
	}
	return -1;
	}

	#define UNINAME_INVALID_INDEX UINT16_MAX

	/* Looks up the internal index of a Unicode character. */
	static uint16_t
	unicode_code_to_index (ucs4_t c)
	{
	/* Binary search in unicode_ranges. */
	unsigned int i1 = 0;
	unsigned int i2 = SIZEOF (unicode_ranges);

	for (;;)
	{
	unsigned int i = (i1 + i2) >> 1;
	ucs4_t start_code =
	unicode_ranges[i].index + unicode_ranges[i].gap;
	ucs4_t end_code =
	start_code + unicode_ranges[i].length - 1;

	if (start_code <= c && c <= end_code)
	return c - unicode_ranges[i].gap;

	if (end_code < c)
	{
	if (i1 == i)
	break;
	/* Note here: i1 < i < i2. */
	i1 = i;
	}
	else if (c < start_code)
	{
	if (i2 == i)
	break;
	/* Note here: i1 <= i < i2. */
	i2 = i;
	}
	}
	return UNINAME_INVALID_INDEX;
	}

	/* Looks up the codepoint of a Unicode character, from the given
	internal index. */
	static ucs4_t
	unicode_index_to_code (uint16_t index)
	{
	/* Binary search in unicode_ranges. */
	unsigned int i1 = 0;
	unsigned int i2 = SIZEOF (unicode_ranges);

	for (;;)
	{
	unsigned int i = (i1 + i2) >> 1;
	uint16_t start_index = unicode_ranges[i].index;
	uint16_t end_index = start_index + unicode_ranges[i].length - 1;

	if (start_index <= index && index <= end_index)
	return index + unicode_ranges[i].gap;

	if (end_index < index)
	{
	if (i1 == i)
	break;
	/* Note here: i1 < i < i2. */
	i1 = i;
	}
	else if (index < start_index)
	{
	if (i2 == i)
	break;
	/* Note here: i1 <= i < i2. */
	i2 = i;
	}
	}
	return UNINAME_INVALID;
	}


	/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
	sections 3.11 and 4.4. */
	static const char jamo_initial_short_name[19][3] =
	{
	"G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
	"C", "K", "T", "P", "H"
	};
	static const char jamo_medial_short_name[21][4] =
	{
	"A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
	"U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
	};
	static const char jamo_final_short_name[28][3] =
	{
	"", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
	"LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
	};

	/* Looks up the name of a Unicode character, in uppercase ASCII.
	Returns the filled buf, or NULL if the character does not have a name. */
	char *
	unicode_character_name (ucs4_t c, char *buf)
	{
	if (c >= 0xAC00 && c <= 0xD7A3)
	{
	/* Special case for Hangul syllables. Keeps the tables small. */
	char *ptr;
	unsigned int tmp;
	unsigned int index1;
	unsigned int index2;
	unsigned int index3;
	const char *q;

	/* buf needs to have at least 16 + 7 bytes here. */
	memcpy (buf, "HANGUL SYLLABLE ", 16);
	ptr = buf + 16;

	tmp = c - 0xAC00;
	index3 = tmp % 28; tmp = tmp / 28;
	index2 = tmp % 21; tmp = tmp / 21;
	index1 = tmp;

	q = jamo_initial_short_name[index1];
	while (*q != '\0')
	ptr++ = q++;
	q = jamo_medial_short_name[index2];
	while (*q != '\0')
	ptr++ = q++;
	q = jamo_final_short_name[index3];
	while (*q != '\0')
	ptr++ = q++;
	*ptr = '\0';
	return buf;
	}
	else if ((c >= 0xF900 && c <= 0xFA2D) \|\| (c >= 0xFA30 && c <= 0xFA6A)
	\|\| (c >= 0xFA70 && c <= 0xFAD9) \|\| (c >= 0x2F800 && c <= 0x2FA1D))
	{
	/* Special case for CJK compatibility ideographs. Keeps the tables
	small. */
	char *ptr;
	int i;

	/* buf needs to have at least 28 + 5 bytes here. */
	memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
	ptr = buf + 28;

	for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
	{
	unsigned int x = (c >> i) & 0xf;
	*ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
	}
	*ptr = '\0';
	return buf;
	}
	else if ((c >= 0xFE00 && c <= 0xFE0F) \|\| (c >= 0xE0100 && c <= 0xE01EF))
	{
	/* Special case for variation selectors. Keeps the tables
	small. */

	/* buf needs to have at least 19 + 3 bytes here. */
	sprintf (buf, "VARIATION SELECTOR-%d",
	c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
	return buf;
	}
	else
	{
	uint16_t index = unicode_code_to_index (c);
	const uint16_t *words = NULL;

	if (index != UNINAME_INVALID_INDEX)
	{
	/* Binary search in unicode_code_to_name. */
	unsigned int i1 = 0;
	unsigned int i2 = SIZEOF (unicode_index_to_name);
	for (;;)
	{
	unsigned int i = (i1 + i2) >> 1;
	if (unicode_index_to_name[i].index == index)
	{
	words = &unicode_names[unicode_index_to_name[i].name];
	break;
	}
	else if (unicode_index_to_name[i].index < index)
	{
	if (i1 == i)
	{
	words = NULL;
	break;
	}
	/* Note here: i1 < i < i2. */
	i1 = i;
	}
	else if (unicode_index_to_name[i].index > index)
	{
	if (i2 == i)
	{
	words = NULL;
	break;
	}
	/* Note here: i1 <= i < i2. */
	i2 = i;
	}
	}
	}
	if (words != NULL)
	{
	/* Found it in unicode_index_to_name. Now concatenate the words. */
	/* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
	char *ptr = buf;
	for (;;)
	{
	unsigned int wordlen;
	const char word = unicode_name_word (words>>1, &wordlen);
	do
	ptr++ = word++;
	while (--wordlen > 0);
	if ((*words & 1) == 0)
	break;
	*ptr++ = ' ';
	words++;
	}
	*ptr = '\0';
	return buf;
	}
	return NULL;
	}
	}

	/* Looks up the Unicode character with a given name, in upper- or lowercase
	ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
	ucs4_t
	unicode_name_character (const char *name)
	{
	unsigned int len = strlen (name);
	if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
	{
	/* Test for "word1 word2 ..." syntax. */
	char buf[UNICODE_CHARNAME_MAX_LENGTH];
	char *ptr = buf;
	for (;;)
	{
	char c = *name++;
	if (!(c >= ' ' && c <= '~'))
	break;
	*ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
	if (--len == 0)
	goto filled_buf;
	}
	if (false)
	filled_buf:
	{
	{
	/* Special case for variation selector aliases. Keeps the
	tables small. */
	const char *p1 = buf;
	if (ptr >= buf + 3 && *p1++ == 'V')
	{
	if (*p1++ == 'S')
	{
	if (*p1 != '0')
	{
	unsigned int c = 0;
	for (;;)
	{
	if (p1 >= '0' && p1 <= '9')
	c += (*p1 - '0');
	p1++;
	if (p1 == ptr)
	{
	if (c >= 1 && c <= 16)
	return c - 1 + 0xFE00;
	else if (c >= 17 && c <= 256)
	return c - 17 + 0xE0100;
	else
	break;
	}
	c = c * 10;
	}
	}
	}
	}
	}
	{
	/* Convert the constituents to uint16_t words. */
	uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
	uint16_t *wordptr = words;
	{
	const char *p1 = buf;
	for (;;)
	{
	{
	int word;
	const char *p2 = p1;
	while (p2 < ptr && *p2 != ' ')
	p2++;
	word = unicode_name_word_lookup (p1, p2 - p1);
	if (word < 0)
	break;
	if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
	break;
	*wordptr++ = word;
	if (p2 == ptr)
	goto filled_words;
	p1 = p2 + 1;
	}
	/* Special case for Hangul syllables. Keeps the tables small. */
	if (wordptr == &words[2]
	&& words[0] == UNICODE_CHARNAME_WORD_HANGUL
	&& words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
	{
	/* Split the last word [p1..ptr) into three parts:
	1) [BCDGHJKMNPRST]
	2) [AEIOUWY]
	3) [BCDGHIJKLMNPST]
	*/
	const char *p2;
	const char *p3;
	const char *p4;

	p2 = p1;
	while (p2 < ptr
	&& (p2 == 'B' \|\| p2 == 'C' \|\| *p2 == 'D'
	\|\| p2 == 'G' \|\| p2 == 'H' \|\| *p2 == 'J'
	\|\| p2 == 'K' \|\| p2 == 'M' \|\| *p2 == 'N'
	\|\| p2 == 'P' \|\| p2 == 'R' \|\| *p2 == 'S'
	\|\| *p2 == 'T'))
	p2++;
	p3 = p2;
	while (p3 < ptr
	&& (p3 == 'A' \|\| p3 == 'E' \|\| *p3 == 'I'
	\|\| p3 == 'O' \|\| p3 == 'U' \|\| *p3 == 'W'
	\|\| *p3 == 'Y'))
	p3++;
	p4 = p3;
	while (p4 < ptr
	&& (p4 == 'B' \|\| p4 == 'C' \|\| *p4 == 'D'
	\|\| p4 == 'G' \|\| p4 == 'H' \|\| *p4 == 'I'
	\|\| p4 == 'J' \|\| p4 == 'K' \|\| *p4 == 'L'
	\|\| p4 == 'M' \|\| p4 == 'N' \|\| *p4 == 'P'
	\|\| p4 == 'S' \|\| p4 == 'T'))
	p4++;
	if (p4 == ptr)
	{
	unsigned int n1 = p2 - p1;
	unsigned int n2 = p3 - p2;
	unsigned int n3 = p4 - p3;

	if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
	{
	unsigned int index1;

	for (index1 = 0; index1 < 19; index1++)
	if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
	&& jamo_initial_short_name[index1][n1] == '\0')
	{
	unsigned int index2;

	for (index2 = 0; index2 < 21; index2++)
	if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
	&& jamo_medial_short_name[index2][n2] == '\0')
	{
	unsigned int index3;

	for (index3 = 0; index3 < 28; index3++)
	if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
	&& jamo_final_short_name[index3][n3] == '\0')
	{
	return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
	}
	break;
	}
	break;
	}
	}
	}
	}
	/* Special case for CJK compatibility ideographs. Keeps the
	tables small. */
	if (wordptr == &words[2]
	&& words[0] == UNICODE_CHARNAME_WORD_CJK
	&& words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
	&& p1 + 14 <= ptr
	&& p1 + 15 >= ptr
	&& memcmp (p1, "IDEOGRAPH-", 10) == 0)
	{
	const char *p2 = p1 + 10;

	if (*p2 != '0')
	{
	unsigned int c = 0;

	for (;;)
	{
	if (p2 >= '0' && p2 <= '9')
	c += (*p2 - '0');
	else if (p2 >= 'A' && p2 <= 'F')
	c += (*p2 - 'A' + 10);
	else
	break;
	p2++;
	if (p2 == ptr)
	{
	if ((c >= 0xF900 && c <= 0xFA2D)
	\|\| (c >= 0xFA30 && c <= 0xFA6A)
	\|\| (c >= 0xFA70 && c <= 0xFAD9)
	\|\| (c >= 0x2F800 && c <= 0x2FA1D))
	return c;
	else
	break;
	}
	c = c << 4;
	}
	}
	}
	/* Special case for variation selectors. Keeps the
	tables small. */
	if (wordptr == &words[1]
	&& words[0] == UNICODE_CHARNAME_WORD_VARIATION
	&& p1 + 10 <= ptr
	&& p1 + 12 >= ptr
	&& memcmp (p1, "SELECTOR-", 9) == 0)
	{
	const char *p2 = p1 + 9;

	if (*p2 != '0')
	{
	unsigned int c = 0;

	for (;;)
	{
	if (p2 >= '0' && p2 <= '9')
	c += (*p2 - '0');
	p2++;
	if (p2 == ptr)
	{
	if (c >= 1 && c <= 16)
	return c - 1 + 0xFE00;
	else if (c >= 17 && c <= 256)
	return c - 17 + 0xE0100;
	else
	break;
	}
	c = c * 10;
	}
	}
	}
	}
	}
	if (false)
	filled_words:
	{
	/* Multiply by 2, to simplify later comparisons. */
	unsigned int words_length = wordptr - words;
	{
	int i = words_length - 1;
	words[i] = 2 * words[i];
	for (; --i >= 0; )
	words[i] = 2 * words[i] + 1;
	}
	/* Binary search in unicode_name_to_index. */
	{
	unsigned int i1 = 0;
	unsigned int i2 = SIZEOF (unicode_name_to_index);
	for (;;)
	{
	unsigned int i = (i1 + i2) >> 1;
	const uint16_t *w = words;
	const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
	unsigned int n = words_length;
	for (;;)
	{
	if (p < w)
	{
	if (i1 == i)
	goto name_not_found;
	/* Note here: i1 < i < i2. */
	i1 = i;
	break;
	}
	else if (p > w)
	{
	if (i2 == i)
	goto name_not_found;
	/* Note here: i1 <= i < i2. */
	i2 = i;
	break;
	}
	p++; w++; n--;
	if (n == 0)
	return unicode_index_to_code (unicode_name_to_index[i].index);
	}
	}
	}
	name_not_found: ;
	}
	}
	}
	}
	return UNINAME_INVALID;
	}