src/pcre2_compile_class.c - platform/external/pcre - Git at Google

 /*************************************************
 *      Perl-Compatible Regular Expressions       *
 *************************************************/

 /* PCRE is a library of functions to support regular expressions whose syntax
 and semantics are as close as possible to those of the Perl 5 language.

                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
           New API code Copyright (c) 2016-2024 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.

     * Neither the name of the University of Cambridge nor the names of its
       contributors may be used to endorse or promote products derived from
       this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif

 #include "pcre2_compile.h"

 #ifdef SUPPORT_WIDE_CHARS

 /* Heapsort algorithm. */

 static void do_heapify(uint32_t *buffer, size_t size, size_t i)
 {
 size_t max;
 size_t left;
 size_t right;
 uint32_t tmp1, tmp2;

 while (TRUE)
   {
   max = i;
   left = (i << 1) + 2;
   right = left + 2;

   if (left < size && buffer[left] > buffer[max]) max = left;
   if (right < size && buffer[right] > buffer[max]) max = right;
   if (i == max) return;

   /* Swap items. */
   tmp1 = buffer[i];
   tmp2 = buffer[i + 1];
   buffer[i] = buffer[max];
   buffer[i + 1] = buffer[max + 1];
   buffer[max] = tmp1;
   buffer[max + 1] = tmp2;
   i = max;
   }
 }

 #ifdef SUPPORT_UNICODE

 #define PARSE_CLASS_CASELESS_UTF      0x1
 #define PARSE_CLASS_RESTRICTED_UTF    0x2

 static const uint32_t*
 get_nocase_range(uint32_t c)
 {
 uint32_t left = 0;
 uint32_t right = PRIV(ucd_nocase_ranges_size);
 uint32_t middle;

 if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;

 while (TRUE)
   {
   /* Range end of the middle element. */
   middle = ((left + right) >> 1) | 0x1;

   if (PRIV(ucd_nocase_ranges)[middle] <= c)
     left = middle + 1;
   else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
     right = middle - 1;
   else
     return PRIV(ucd_nocase_ranges) + (middle - 1);
   }
 }

 static size_t
 utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
   uint32_t *buffer)
 {
 uint32_t new_start = start;
 uint32_t new_end = end;
 uint32_t c = start;
 const uint32_t *list;
 uint32_t tmp[3];
 size_t result = 2;
 const uint32_t *skip_range = get_nocase_range(c);
 uint32_t skip_start = skip_range[0];

 #if PCRE2_CODE_UNIT_WIDTH == 32
 if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
 #endif

 while (c <= end)
   {
   uint32_t co;

   if (c > skip_start)
     {
     c = skip_range[1];
     skip_range += 2;
     skip_start = skip_range[0];
     continue;
     }

   /* Compute caseless set. */
   co = UCD_CASESET(c);

   if (co != 0 && (!(options & PARSE_CLASS_RESTRICTED_UTF)
                   || PRIV(ucd_caseless_sets)[co] > 127))
     list = PRIV(ucd_caseless_sets) + co;
   else
     {
     co = UCD_OTHERCASE(c);
     list = tmp;
     tmp[0] = c;
     tmp[1] = NOTACHAR;

     if (co != c)
       {
       tmp[1] = co;
       tmp[2] = NOTACHAR;
       }
     }
   c++;

   /* Add characters. */
   do
     {
     if (*list < new_start)
       {
       if (*list + 1 == new_start)
         {
         new_start--;
         continue;
         }
       }
     else if (*list > new_end)
       {
       if (*list - 1 == new_end)
         {
         new_end++;
         continue;
         }
       }
     else continue;

     result += 2;
     if (buffer != NULL)
       {
       buffer[0] = *list;
       buffer[1] = *list;
       buffer += 2;
       }
     }
   while (*(++list) != NOTACHAR);
   }

   if (buffer != NULL)
     {
     buffer[0] = new_start;
     buffer[1] = new_end;
     buffer += 2;
     }
   return result;
 }

 #endif

 static size_t
 parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
 {
 size_t total_size = 0;
 uint32_t meta_arg;
 uint32_t start_char;

 (void)options; /* Avoid compiler warning. */

 while (*ptr != META_CLASS_END)
   {
   switch (META_CODE(*ptr))
     {
     case META_ESCAPE:
       meta_arg = META_DATA(*ptr);
       if (meta_arg == ESC_P || meta_arg == ESC_p) ptr++;
       ptr++;
       continue;
     case META_POSIX:
     case META_POSIX_NEG:
       ptr += 2;
       continue;
     case META_BIGVALUE:
       /* Character literal */
       ptr++;
       break;
     default:
       PCRE2_ASSERT(*ptr < META_END);
       break;
     }

     start_char = *ptr;

     if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
       {
       ptr += 2;
       PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);

       if (*ptr == META_BIGVALUE) ptr++;
       }

 #ifdef SUPPORT_UNICODE
     if (options & PARSE_CLASS_CASELESS_UTF)
       {
       size_t size = utf_caseless_extend(start_char, *ptr++, options, buffer);
       if (buffer != NULL) buffer += size;
       total_size += size;
       continue;
       }
 #endif

     if (buffer != NULL)
       {
       buffer[0] = start_char;
       buffer[1] = *ptr;
       buffer += 2;
       }

     ptr++;
     total_size += 2;
   }

   return total_size;
 }

 uint32_t *PRIV(optimize_class)(uint32_t *start_ptr, uint32_t options,
   size_t *buffer_size, compile_block* cb)
 {
 uint32_t *ptr = start_ptr + 1;
 uint32_t *buffer;
 uint32_t *dst;
 size_t size = 0, i;
 uint32_t tmp1, tmp2;

 PCRE2_ASSERT(*start_ptr == META_CLASS || *start_ptr == META_CLASS_NOT);

 #ifdef SUPPORT_UNICODE
 if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
   options = PARSE_CLASS_CASELESS_UTF;
 else
   options = 0;

 if (cb->cx->extra_options & PCRE2_EXTRA_CASELESS_RESTRICT)
   options |= PARSE_CLASS_RESTRICTED_UTF;
 #endif

 /* Compute required space for the range. */

 size = parse_class(start_ptr + 1, options, NULL);

 *buffer_size = size;
 if (size == 0) return NULL;

 /* Allocate and buffer. */

 buffer = (uint32_t*)
   cb->cx->memctl.malloc(size * sizeof(uint32_t), cb->cx->memctl.memory_data);

 if (buffer == NULL) return NULL;

 parse_class(start_ptr + 1, options, buffer);

 if (size == 2) return buffer;

 /* In-place sorting of ranges. */

 i = (((size >> 2) - 1) << 1);
 while (TRUE)
   {
   do_heapify(buffer, size, i);
   if (i == 0) break;
   i -= 2;
   }

 i = size - 2;
 while (TRUE)
   {
   tmp1 = buffer[i];
   tmp2 = buffer[i + 1];
   buffer[i] = buffer[0];
   buffer[i + 1] = buffer[1];
   buffer[0] = tmp1;
   buffer[1] = tmp2;

   do_heapify(buffer, i, 0);
   if (i == 0) break;
   i -= 2;
   }

 /* Merge ranges whenever possible. */
 dst = buffer;
 ptr = buffer + 2;
 size -= 2;

 /* The second condition is a very rare corner case, where the end of the last
 range is the maximum character. This range cannot be extended further. */

 while (size > 0 && dst[1] != ~(uint32_t)0)
   {
   if (dst[1] + 1 < ptr[0])
     {
     dst += 2;
     dst[0] = ptr[0];
     dst[1] = ptr[1];
     }
   else if (dst[1] < ptr[1]) dst[1] = ptr[1];

   ptr += 2;
   size -= 2;
   }

 *buffer_size = (size_t)(dst + 2 - buffer);
 return buffer;
 }

 #endif /* SUPPORT_WIDE_CHARS */

 /* End of pcre2_compile_class.c */
	/*************************************************
	* Perl-Compatible Regular Expressions *
	*************************************************/

	/* PCRE is a library of functions to support regular expressions whose syntax
	and semantics are as close as possible to those of the Perl 5 language.

	Written by Philip Hazel
	Original API code Copyright (c) 1997-2012 University of Cambridge
	New API code Copyright (c) 2016-2024 University of Cambridge

	-----------------------------------------------------------------------------
	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are met:

	* Redistributions of source code must retain the above copyright notice,
	this list of conditions and the following disclaimer.

	* Redistributions in binary form must reproduce the above copyright
	notice, this list of conditions and the following disclaimer in the
	documentation and/or other materials provided with the distribution.

	* Neither the name of the University of Cambridge nor the names of its
	contributors may be used to endorse or promote products derived from
	this software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	POSSIBILITY OF SUCH DAMAGE.
	-----------------------------------------------------------------------------
	*/

	#ifdef HAVE_CONFIG_H
	#include "config.h"
	#endif

	#include "pcre2_compile.h"

	#ifdef SUPPORT_WIDE_CHARS

	/* Heapsort algorithm. */

	static void do_heapify(uint32_t *buffer, size_t size, size_t i)
	{
	size_t max;
	size_t left;
	size_t right;
	uint32_t tmp1, tmp2;

	while (TRUE)
	{
	max = i;
	left = (i << 1) + 2;
	right = left + 2;

	if (left < size && buffer[left] > buffer[max]) max = left;
	if (right < size && buffer[right] > buffer[max]) max = right;
	if (i == max) return;

	/* Swap items. */
	tmp1 = buffer[i];
	tmp2 = buffer[i + 1];
	buffer[i] = buffer[max];
	buffer[i + 1] = buffer[max + 1];
	buffer[max] = tmp1;
	buffer[max + 1] = tmp2;
	i = max;
	}
	}

	#ifdef SUPPORT_UNICODE

	#define PARSE_CLASS_CASELESS_UTF 0x1
	#define PARSE_CLASS_RESTRICTED_UTF 0x2

	static const uint32_t*
	get_nocase_range(uint32_t c)
	{
	uint32_t left = 0;
	uint32_t right = PRIV(ucd_nocase_ranges_size);
	uint32_t middle;

	if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;

	while (TRUE)
	{
	/* Range end of the middle element. */
	middle = ((left + right) >> 1) \| 0x1;

	if (PRIV(ucd_nocase_ranges)[middle] <= c)
	left = middle + 1;
	else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
	right = middle - 1;
	else
	return PRIV(ucd_nocase_ranges) + (middle - 1);
	}
	}

	static size_t
	utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
	uint32_t *buffer)
	{
	uint32_t new_start = start;
	uint32_t new_end = end;
	uint32_t c = start;
	const uint32_t *list;
	uint32_t tmp[3];
	size_t result = 2;
	const uint32_t *skip_range = get_nocase_range(c);
	uint32_t skip_start = skip_range[0];

	#if PCRE2_CODE_UNIT_WIDTH == 32
	if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
	#endif

	while (c <= end)
	{
	uint32_t co;

	if (c > skip_start)
	{
	c = skip_range[1];
	skip_range += 2;
	skip_start = skip_range[0];
	continue;
	}

	/* Compute caseless set. */
	co = UCD_CASESET(c);

	if (co != 0 && (!(options & PARSE_CLASS_RESTRICTED_UTF)
	\|\| PRIV(ucd_caseless_sets)[co] > 127))
	list = PRIV(ucd_caseless_sets) + co;
	else
	{
	co = UCD_OTHERCASE(c);
	list = tmp;
	tmp[0] = c;
	tmp[1] = NOTACHAR;

	if (co != c)
	{
	tmp[1] = co;
	tmp[2] = NOTACHAR;
	}
	}
	c++;

	/* Add characters. */
	do
	{
	if (*list < new_start)
	{
	if (*list + 1 == new_start)
	{
	new_start--;
	continue;
	}
	}
	else if (*list > new_end)
	{
	if (*list - 1 == new_end)
	{
	new_end++;
	continue;
	}
	}
	else continue;

	result += 2;
	if (buffer != NULL)
	{
	buffer[0] = *list;
	buffer[1] = *list;
	buffer += 2;
	}
	}
	while (*(++list) != NOTACHAR);
	}

	if (buffer != NULL)
	{
	buffer[0] = new_start;
	buffer[1] = new_end;
	buffer += 2;
	}
	return result;
	}

	#endif

	static size_t
	parse_class(uint32_t ptr, uint32_t options, uint32_t buffer)
	{
	size_t total_size = 0;
	uint32_t meta_arg;
	uint32_t start_char;

	(void)options; /* Avoid compiler warning. */

	while (*ptr != META_CLASS_END)
	{
	switch (META_CODE(*ptr))
	{
	case META_ESCAPE:
	meta_arg = META_DATA(*ptr);
	if (meta_arg == ESC_P \|\| meta_arg == ESC_p) ptr++;
	ptr++;
	continue;
	case META_POSIX:
	case META_POSIX_NEG:
	ptr += 2;
	continue;
	case META_BIGVALUE:
	/* Character literal */
	ptr++;
	break;
	default:
	PCRE2_ASSERT(*ptr < META_END);
	break;
	}

	start_char = *ptr;

	if (ptr[1] == META_RANGE_LITERAL \|\| ptr[1] == META_RANGE_ESCAPED)
	{
	ptr += 2;
	PCRE2_ASSERT(ptr < META_END \|\| ptr == META_BIGVALUE);

	if (*ptr == META_BIGVALUE) ptr++;
	}

	#ifdef SUPPORT_UNICODE
	if (options & PARSE_CLASS_CASELESS_UTF)
	{
	size_t size = utf_caseless_extend(start_char, *ptr++, options, buffer);
	if (buffer != NULL) buffer += size;
	total_size += size;
	continue;
	}
	#endif

	if (buffer != NULL)
	{
	buffer[0] = start_char;
	buffer[1] = *ptr;
	buffer += 2;
	}

	ptr++;
	total_size += 2;
	}

	return total_size;
	}

	uint32_t PRIV(optimize_class)(uint32_t start_ptr, uint32_t options,
	size_t buffer_size, compile_block cb)
	{
	uint32_t *ptr = start_ptr + 1;
	uint32_t *buffer;
	uint32_t *dst;
	size_t size = 0, i;
	uint32_t tmp1, tmp2;

	PCRE2_ASSERT(start_ptr == META_CLASS \|\| start_ptr == META_CLASS_NOT);

	#ifdef SUPPORT_UNICODE
	if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF\|PCRE2_UCP)))
	options = PARSE_CLASS_CASELESS_UTF;
	else
	options = 0;

	if (cb->cx->extra_options & PCRE2_EXTRA_CASELESS_RESTRICT)
	options \|= PARSE_CLASS_RESTRICTED_UTF;
	#endif

	/* Compute required space for the range. */

	size = parse_class(start_ptr + 1, options, NULL);

	*buffer_size = size;
	if (size == 0) return NULL;

	/* Allocate and buffer. */

	buffer = (uint32_t*)
	cb->cx->memctl.malloc(size * sizeof(uint32_t), cb->cx->memctl.memory_data);

	if (buffer == NULL) return NULL;

	parse_class(start_ptr + 1, options, buffer);

	if (size == 2) return buffer;

	/* In-place sorting of ranges. */

	i = (((size >> 2) - 1) << 1);
	while (TRUE)
	{
	do_heapify(buffer, size, i);
	if (i == 0) break;
	i -= 2;
	}

	i = size - 2;
	while (TRUE)
	{
	tmp1 = buffer[i];
	tmp2 = buffer[i + 1];
	buffer[i] = buffer[0];
	buffer[i + 1] = buffer[1];
	buffer[0] = tmp1;
	buffer[1] = tmp2;

	do_heapify(buffer, i, 0);
	if (i == 0) break;
	i -= 2;
	}

	/* Merge ranges whenever possible. */
	dst = buffer;
	ptr = buffer + 2;
	size -= 2;

	/* The second condition is a very rare corner case, where the end of the last
	range is the maximum character. This range cannot be extended further. */

	while (size > 0 && dst[1] != ~(uint32_t)0)
	{
	if (dst[1] + 1 < ptr[0])
	{
	dst += 2;
	dst[0] = ptr[0];
	dst[1] = ptr[1];
	}
	else if (dst[1] < ptr[1]) dst[1] = ptr[1];

	ptr += 2;
	size -= 2;
	}

	*buffer_size = (size_t)(dst + 2 - buffer);
	return buffer;
	}

	#endif /* SUPPORT_WIDE_CHARS */

	/* End of pcre2_compile_class.c */