blob: e92f4a2eed86c50815bacffa667ed9b7fab1f1ca [file] [log] [blame]
/* Construct a regular expression from a literal string.
Copyright (C) 1995, 2010-2020 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2010.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
/* Specification. */
#include "regex-quote.h"
#include <string.h>
#include "mbuiter.h"
#include "xalloc.h"
/* Characters that are special in a BRE (basic regular expression).
Used as is by regex_quote_spec_posix when cflags is 0, and as the common
starting set that regex_quote_spec_gnu extends according to the syntax
bits. */
static const char bre_special[] = "$^.*[]\\";
/* Characters that are special in an ERE (extended regular expression).
This is the BRE set followed by "+?{}()|". Used by regex_quote_spec_posix
when cflags is non-zero. */
static const char ere_special[] = "$^.*[]\\+?{}()|";
struct regex_quote_spec
regex_quote_spec_posix (int cflags, bool anchored)
{
struct regex_quote_spec result;
strcpy (result.special, cflags != 0 ? ere_special : bre_special);
result.multibyte = true;
result.anchored = anchored;
return result;
}
/* Syntax bit values, defined in GNU <regex.h>. We don't include it here,
otherwise this module would need to depend on gnulib module 'regex'.
Only the bits that regex_quote_spec_gnu tests are replicated here. */
/* When set, regex_quote_spec_gnu does not add '+' and '?' to the special
set. */
#define RE_BK_PLUS_QM 0x00000002
/* Together with RE_NO_BK_BRACES, makes regex_quote_spec_gnu add '{' and '}'
to the special set. */
#define RE_INTERVALS 0x00000200
/* When set, regex_quote_spec_gnu adds none of '+', '?' and '|' to the
special set. */
#define RE_LIMITED_OPS 0x00000400
/* When set, regex_quote_spec_gnu adds the newline character to the special
set. */
#define RE_NEWLINE_ALT 0x00000800
/* Must be set together with RE_INTERVALS for '{' and '}' to be added to the
special set. */
#define RE_NO_BK_BRACES 0x00001000
/* When set, regex_quote_spec_gnu adds '(' and ')' to the special set. */
#define RE_NO_BK_PARENS 0x00002000
/* When set and RE_LIMITED_OPS is clear, regex_quote_spec_gnu adds '|' to the
special set. */
#define RE_NO_BK_VBAR 0x00008000
struct regex_quote_spec
regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored)
{
struct regex_quote_spec result;
char *p;
p = result.special;
memcpy (p, bre_special, sizeof (bre_special) - 1);
p += sizeof (bre_special) - 1;
if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_BK_PLUS_QM) == 0)
{
*p++ = '+';
*p++ = '?';
}
if ((syntax & RE_INTERVALS) != 0 && (syntax & RE_NO_BK_BRACES) != 0)
{
*p++ = '{';
*p++ = '}';
}
if ((syntax & RE_NO_BK_PARENS) != 0)
{
*p++ = '(';
*p++ = ')';
}
if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_NO_BK_VBAR) != 0)
*p++ = '|';
if ((syntax & RE_NEWLINE_ALT) != 0)
*p++ = '\n';
*p = '\0';
result.multibyte = true;
result.anchored = anchored;
return result;
}
/* Characters that are special in a PCRE.
regex_quote_spec_pcre starts from this set and extends it when
PCRE_EXTENDED is given. */
static const char pcre_special[] = "$^.*[]\\+?{}()|";
/* Options bit values, defined in <pcre.h>. We don't include it here, because
it is not a standard header. */
/* When set, regex_quote_spec_pcre clears the 'anchored' flag of its result. */
#define PCRE_ANCHORED 0x00000010
/* When set, regex_quote_spec_pcre additionally treats space, tab, newline,
vertical tab, form feed, carriage return and '#' as special. */
#define PCRE_EXTENDED 0x00000008
struct regex_quote_spec
regex_quote_spec_pcre (int options, bool anchored)
{
struct regex_quote_spec result;
char *p;
p = result.special;
memcpy (p, pcre_special, sizeof (pcre_special) - 1);
p += sizeof (pcre_special) - 1;
if (options & PCRE_EXTENDED)
{
*p++ = ' ';
*p++ = '\t';
*p++ = '\n';
*p++ = '\v';
*p++ = '\f';
*p++ = '\r';
*p++ = '#';
}
*p = '\0';
/* PCRE regular expressions consist of UTF-8 characters of options contains
PCRE_UTF8 and of single bytes otherwise. */
result.multibyte = false;
/* If options contains PCRE_ANCHORED, the anchoring is implicit. */
result.anchored = (options & PCRE_ANCHORED ? 0 : anchored);
return result;
}
/* Returns the number of bytes needed for the quoted form of STRING under
   SPEC, not counting a terminating NUL.
   Each character found in SPEC->special costs one extra byte for the
   backslash, and an anchored specification costs two more bytes.  */
size_t
regex_quote_length (const char *string, const struct regex_quote_spec *spec)
{
  const char *to_escape = spec->special;
  /* An anchored result has '^' at the beginning and '$' at the end.  */
  size_t total = spec->anchored ? 2 : 0;

  if (!spec->multibyte)
    {
      const char *s = string;

      while (*s != '\0')
        {
          total += (strchr (to_escape, *s) != NULL ? 2 : 1);
          s++;
        }
    }
  else
    {
      mbui_iterator_t it;

      mbui_init (it, string);
      while (mbui_avail (it))
        {
          size_t nbytes = mb_len (mbui_cur (it));

          /* We know that the special set contains only ASCII characters,
             so only single-byte characters can match.  */
          if (nbytes == 1 && strchr (to_escape, * mbui_cur_ptr (it)) != NULL)
            total++;
          total += nbytes;
          mbui_advance (it);
        }
    }

  return total;
}
/* Copies the quoted form of STRING under SPEC to the memory starting at P
   and returns a pointer to the byte just after the last byte written.
   No terminating NUL is stored.
   Every character found in SPEC->special is preceded by a backslash; if
   SPEC->anchored is set, the output is wrapped in '^' ... '$'.
   The caller must provide room for regex_quote_length (string, spec)
   bytes.  */
char *
regex_quote_copy (char *p, const char *string, const struct regex_quote_spec *spec)
{
  const char *special = spec->special;

  if (spec->anchored)
    *p++ = '^';
  if (spec->multibyte)
    {
      mbui_iterator_t iter;

      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
        {
          /* We know that special contains only ASCII characters.  */
          if (mb_len (mbui_cur (iter)) == 1
              && strchr (special, * mbui_cur_ptr (iter)))
            *p++ = '\\';
          /* Copy the whole (possibly multibyte) character verbatim.  */
          memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
          p += mb_len (mbui_cur (iter));
        }
    }
  else
    {
      const char *iter;

      for (iter = string; *iter != '\0'; iter++)
        {
          if (strchr (special, *iter))
            *p++ = '\\';
          /* Do not advance ITER here: the loop header already steps it.
             Advancing twice would drop every other byte and could step
             over the terminating NUL, reading past the end of STRING.  */
          *p++ = *iter;
        }
    }
  if (spec->anchored)
    *p++ = '$';

  return p;
}
/* Returns a freshly allocated, NUL-terminated string holding the quoted
   form of STRING under SPEC.
   The buffer is obtained through XNMALLOC and sized from
   regex_quote_length, plus one byte for the terminating NUL.  */
char *
regex_quote (const char *string, const struct regex_quote_spec *spec)
{
  size_t needed = regex_quote_length (string, spec);
  char *quoted = XNMALLOC (needed + 1, char);
  char *end = regex_quote_copy (quoted, string, spec);

  *end = '\0';
  return quoted;
}