Snap for 11151698 from b7a213c11b2b60f1a1be18a29753132f0376d47f to 24Q1-release Change-Id: I6ce1405dccdee1d3b100db984c44280619d335ec

commit: e40c4a72a7db47c7614518690749cf4093b911a0 [log] [tgz]
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Wed Nov 29 00:10:49 2023 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Wed Nov 29 00:10:49 2023 +0000
tree: b4b77bcd6c82504db09e7e1dbf5db683b59e7c45
parent: 13f5893f3addb30e157763ad236e7c2156e4b198 [diff]
parent: b7a213c11b2b60f1a1be18a29753132f0376d47f [diff]
diff --git a/FIXES b/FIXES
index a13ca50..52f49e3 100644
--- a/FIXES
+++ b/FIXES

@@ -25,6 +25,29 @@
 This file lists all bug fixes, changes, etc., made since the 
 second edition of the AWK book was published in September 2023.
 
+Nov 24, 2023:
+        Fix issue #199: gototab improvements. The table is now resized
+        dynamically, and qsort and bsearch speed up lookups as the
+        table gets larger for multibyte input. Thanks to Arnold Robbins.
+
+Nov 23, 2023:
+	Fix Issue #169, related to escape sequences in strings.
+	Thanks to Github user rajeevvp.
+	Fix Issue #147, reported by Github user drawkula, and fixed
+	by Miguel Pineiro Jr.
+
+Nov 20, 2023:
+	rewrite of fnematch to fix a number of issues, including
+	extraneous output, out-of-bounds access, number of bytes
+	to push back after a failed match etc.
+	thanks to Miguel Pineiro Jr.
+
+Nov 15, 2023:
+	Man page edit, regression test fixes. thanks to Arnold Robbins
+	consolidation of sub and gsub into dosub, removing duplicate
+	code. thanks to Miguel Pineiro Jr.
+	gcc replaced with cc everywhere.
+
 Oct 30, 2023:
 	multiple fixes and a minor code cleanup.
 	disabled utf-8 for non-multibyte locales, such as C or POSIX.
@@ -32,7 +55,6 @@
 	systems. also fixed an out-of-bounds read for empty CCL.
 	fixed a buffer overflow in substr with utf-8 strings.
 	many thanks to Todd C Miller.
-	
 
 Sep 24, 2023:
 	fnematch and getrune have been overhauled to solve issues around

diff --git a/METADATA b/METADATA
index 6ea18b5..2b83084 100644
--- a/METADATA
+++ b/METADATA

@@ -9,11 +9,11 @@
     type: GIT
     value: "https://github.com/onetrueawk/awk.git"
   }
-  version: "d801514094d1140dfc9f8571b9821082ddddf107"
+  version: "fbd1d5b712e27a9bb527e39ed6e9bf3b9afbb1df"
   license_type: NOTICE
   last_upgrade_date {
     year: 2023
     month: 11
-    day: 6
+    day: 27
   }
 }

diff --git a/README.md b/README.md
index daace23..84fb06e 100644
--- a/README.md
+++ b/README.md

@@ -21,8 +21,6 @@
 ### Regular expressions ###
 
 Regular expressions may include UTF-8 code points, including `\u`.
-Character classes are likely to be limited to about 256 characters
-when expanded.
 
 ### CSV ###
 
@@ -145,4 +143,4 @@
 
 #### Last Updated
 
-Sun 15 Oct 2023 06:28:36 IDT
+Mon 16 Oct 2023 11:23:08 IDT

diff --git a/awk.1 b/awk.1
index 40ff0d3..ef40a01 100644
--- a/awk.1
+++ b/awk.1

@@ -586,6 +586,9 @@
 .PP
 Input is expected to be UTF-8 encoded. Other multibyte
 character sets are not handled.
+However, in eight-bit locales,
+.I awk
+treats each input byte as a separate character.
 .SH UNUSUAL FLOATING-POINT VALUES
 .I Awk
 was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)

diff --git a/awk.h b/awk.h
index 217319c..76180e4 100644
--- a/awk.h
+++ b/awk.h

@@ -246,14 +246,19 @@
 	int	*lfollow;
 } rrow;
 
-typedef struct gtt { /* gototab entry */
+typedef struct gtte { /* gototab entry: one (character, next state) pair */
 	unsigned int ch;
 	unsigned int state;
+} gtte;
+
+typedef struct gtt {	/* gototab: growable array of entries for one state */
+	size_t	allocated;	/* number of entries the array can hold */
+	size_t	inuse;		/* number of entries currently filled in */
+	gtte	*entries;	/* the entries; b.c keeps them sorted by ch */
 } gtt;
 
 typedef struct fa {
-	gtt	**gototab;
-	int	gototab_len;
+	gtt	*gototab;
 	uschar	*out;
 	uschar	*restr;
 	int	**posns;

diff --git a/b.c b/b.c
index aa07d59..881c052 100644
--- a/b.c
+++ b/b.c

@@ -96,9 +96,8 @@
    mechanism of the goto table used 8-bit byte indices into the
    gototab entries to compute the next state.  Unicode is a lot
    bigger, so the gototab entries are now structs with a character
-   and a next state, and there is a linear search of the characters
-   to find the state.  (Yes, this is slower, by a significant
-   amount.  Tough.)
+   and a next state. These are sorted by code point and binary
+   searched.
 
    Throughout the RE mechanism in b.c, utf-8 characters are
    converted to their utf-32 value.  This mostly shows up in
@@ -113,8 +112,10 @@
 
  */
 
+static int entry_cmp(const void *l, const void *r);
 static int get_gototab(fa*, int, int);
 static int set_gototab(fa*, int, int, int);
+static void clear_gototab(fa*, int);
 extern int u8_rune(int *, const uschar *);
 
 static int *
@@ -142,7 +143,7 @@
 static void
 resize_state(fa *f, int state)
 {
-	gtt **p;
+	gtt *p;
 	uschar *p2;
 	int **p3;
 	int i, new_count;
@@ -152,7 +153,7 @@
 
 	new_count = state + 10; /* needs to be tuned */
 
-	p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
+	p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
 	if (p == NULL)
 		goto out;
 	f->gototab = p;
@@ -168,13 +169,14 @@
 	f->posns = p3;
 
 	for (i = f->state_count; i < new_count; ++i) {
-		f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
-		if (f->gototab[i] == NULL)
+		f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
+		if (f->gototab[i].entries == NULL)
 			goto out;
-		f->out[i]  = 0;
+		f->gototab[i].allocated = NCHARS;
+		f->gototab[i].inuse = 0;
+		f->out[i] = 0;
 		f->posns[i] = NULL;
 	}
-	f->gototab_len = NCHARS; /* should be variable, growable */
 	f->state_count = new_count;
 	return;
 out:
@@ -268,8 +270,7 @@
 	}
 	if ((f->posns[2])[1] == f->accept)
 		f->out[2] = 1;
-	for (i = 0; i < NCHARS; i++)
-		set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
+	clear_gototab(f, 2);
 	f->curstat = cgoto(f, 2, HAT);
 	if (anchor) {
 		*f->posns[2] = k-1;	/* leave out position 0 */
@@ -595,32 +596,104 @@
 	return(0);
 }
 
+static void resize_gototab(fa *f, int state)	/* double the entry capacity of one state's gototab */
+{
+	gtt *tab = &f->gototab[state];
+	size_t old_count = tab->allocated;
+	size_t grown_count = old_count * 2;
+	gtte *grown = (gtte *) realloc(tab->entries, grown_count * sizeof(gtte));
+
+	if (grown == NULL)
+		overflo(__func__);	// realloc could not provide the larger array
+	// realloc does not initialize the added upper half; zero it
+	memset(grown + old_count, 0, old_count * sizeof(gtte));
+
+	tab->allocated = grown_count;
+	tab->entries = grown;
+}
+
 static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
 {
-	int i;
-	for (i = 0; i < f->gototab_len; i++) {
-		if (f->gototab[state][i].ch == 0)
-			break;
-		if (f->gototab[state][i].ch == ch)
-			return f->gototab[state][i].state;
-	}
-	return 0;
+	gtte key;	/* search key handed to bsearch */
+	gtte *item;	/* matching entry, or NULL */
+
+	key.ch = ch;
+	key.state = 0;	/* irrelevant: entry_cmp looks only at ch */
+	item = bsearch(& key, f->gototab[state].entries,
+			f->gototab[state].inuse, sizeof(gtte),	/* search only the filled entries */
+			entry_cmp);
+
+	if (item == NULL)	/* nothing recorded for ch in this state */
+		return 0;
+	else
+		return item->state;
+}
+
+static int entry_cmp(const void *l, const void *r)	/* order gototab entries by ch, for qsort and bsearch */
+{
+	const gtte *left, *right;
+
+	left = (const gtte *) l;
+	right = (const gtte *) r;
+	/* compare rather than subtract: an unsigned difference need not fit in an int */
+
+	return (left->ch > right->ch) - (left->ch < right->ch);
 }
 
 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
 {
-	int i;
-	for (i = 0; i < f->gototab_len; i++) {
-		if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
-			f->gototab[state][i].ch = ch;
-			f->gototab[state][i].state = val;
-			return val;
+	if (f->gototab[state].inuse == 0) {	// empty table: ch becomes the sole entry
+		f->gototab[state].entries[0].ch = ch;
+		f->gototab[state].entries[0].state = val;
+		f->gototab[state].inuse++;
+		return val;
+	} else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
+		// larger than every stored ch: append after the last entry, order is preserved
+		gtt *tab = & f->gototab[state];
+		if (tab->inuse + 1 >= tab->allocated)
+			resize_gototab(f, state);
+
+		f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
+		f->gototab[state].entries[f->gototab[state].inuse].state = val;
+		f->gototab[state].inuse++;
+		return val;
+	} else {
+		// ch sorts among the stored entries: it may or may not be present
+		gtte key;
+		gtte *item;
+
+		key.ch = ch;
+		key.state = 0;	/* irrelevant */
+		item = bsearch(& key, f->gototab[state].entries,
+				f->gototab[state].inuse, sizeof(gtte),
+				entry_cmp);
+
+		if (item != NULL) {
+			// we have it, update state and return
+			item->state = val;
+			return item->state;
 		}
+		// absent: fall through to add it at the end and re-sort.
 	}
-	overflo(__func__);
+
+	gtt *tab = & f->gototab[state];
+	if (tab->inuse + 1 >= tab->allocated)
+		resize_gototab(f, state);
+	f->gototab[state].entries[tab->inuse].ch = ch;
+	f->gototab[state].entries[tab->inuse].state = val;
+	++tab->inuse;	// count the entry only once it is stored, so qsort covers it
+
+	qsort(f->gototab[state].entries,
+		f->gototab[state].inuse, sizeof(gtte), entry_cmp);
+
 	return val; /* not used anywhere at the moment */
 }
 
+static void clear_gototab(fa *f, int state)	/* zero one state's gototab and mark it empty */
+{
+	gtt *tab = &f->gototab[state];
+	memset(tab->entries, 0, tab->allocated * sizeof(gtte));	/* whole capacity, not just inuse */
+	tab->inuse = 0;
+}
+
 int match(fa *f, const char *p0)	/* shortest match ? */
 {
 	int s, ns;
@@ -759,59 +832,6 @@
 
 #define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
 
-// Read one rune at a time from the given FILE*. Return both
-// the bytes and the actual rune.
-
-struct runedata {
-	int rune;
-	size_t len;
-	char bytes[6];
-};
-
-struct runedata getrune(FILE *fp)
-{
-	struct runedata result;
-	int c, next;
-
-	memset(&result, 0, sizeof(result));
-
-	c = getc(fp);
-	if (c == EOF)
-		return result;	// result.rune == 0 --> EOF
-	else if (c < 128 || awk_mb_cur_max == 1) {
-		result.bytes[0] = c;
-		result.len = 1;
-		result.rune = c;
-
-		return result;
-	}
-
-	// need to get bytes and fill things in
-	result.bytes[0] = c;
-	result.len = 1;
-
-	next = 1;
-	for (int i = 1; i < MAX_UTF_BYTES; i++) {
-		c = getc(fp);
-		if (c == EOF)
-			break;
-		result.bytes[next++] = c;
-		result.len++;
-	}
-
-	// put back any extra input bytes
-	int actual_len = u8_nextlen(result.bytes);
-	while (result.len > actual_len) {
-		ungetc(result.bytes[--result.len], fp);
-	}
-
-	result.bytes[result.len] = '\0';
-	(void) u8_rune(& result.rune, (uschar *) result.bytes);
-
-	return result;
-}
-
-
 /*
  * NAME
  *     fnematch
@@ -829,58 +849,76 @@
 
 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 {
-	char *buf = *pbuf;
+	char *i, *j, *k, *buf = *pbuf;
 	int bufsize = *pbufsize;
-	int i, j, k, ns, s;
-	struct runedata r;
+	int c, n, ns, s;
 
 	s = pfa->initstat;
 	patlen = 0;
 
 	/*
-	 * All indices relative to buf.
-	 * i <= j <= k <= bufsize
+	 * buf <= i <= j <= k <= buf+bufsize
 	 *
-	 * i: origin of active substring (first byte of first character)
-	 * j: current character		(last byte of current character)
-	 * k: destination of next getc()
+	 * i: origin of active substring
+	 * j: current character
+	 * k: destination of the next getc
 	 */
-	i = -1, k = 0;
-        do {
-		j = i++;
-		do {
-			r = getrune(f);
-			if ((++j + r.len) >= k) {
-				if (k >= bufsize)
-					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
-						FATAL("stream '%.30s...' too long", buf);
-			}
-			memcpy(buf + k, r.bytes, r.len);
-			j += r.len - 1;	// incremented next time around the loop
-			k += r.len;
 
-			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
-				s = ns;
-			else
-				s = cgoto(pfa, s, r.rune);
+	i = j = k = buf;
 
-			if (pfa->out[s]) {	/* final state */
-				patlen = j - i + 1;
-				if (r.rune == 0)	/* don't count $ */
-					patlen--;
+	do {
+		/*
+		 * Call u8_rune with at least MAX_UTF_BYTES ahead in
+		 * the buffer until EOF interferes.
+		 */
+		if (k - j < MAX_UTF_BYTES) {
+			if (k + MAX_UTF_BYTES > buf + bufsize) {
+				adjbuf((char **) &buf, &bufsize,
+				    bufsize + MAX_UTF_BYTES,
+				    quantum, 0, "fnematch");
 			}
-		} while (buf[j] && s != 1);
+			for (n = MAX_UTF_BYTES ; n > 0; n--) {
+				*k++ = (c = getc(f)) != EOF ? c : 0;
+				if (c == EOF) {
+					if (ferror(f))
+						FATAL("fnematch: getc error");
+					break;
+				}
+			}
+		}
+
+		j += u8_rune(&c, (uschar *)j);
+
+		if ((ns = get_gototab(pfa, s, c)) != 0)
+			s = ns;
+		else
+			s = cgoto(pfa, s, c);
+
+		if (pfa->out[s]) {	/* final state */
+			patbeg = i;
+			patlen = j - i;
+			if (c == 0)	/* don't count $ */
+				patlen--;
+		}
+
+		if (c && s != 1)
+			continue;  /* origin i still viable, next j */
+		if (patlen)
+			break;     /* best match found */
+
+		/* no match at origin i, next i and start over */
+		i += u8_rune(&c, (uschar *)i);
+		if (c == 0)
+			break;    /* no match */
+		j = i;
 		s = 2;
-		if (r.len > 1)
-			i += r.len - 1;	// i incremented around the loop
-	} while (buf[i] && !patlen);
+	} while (1);
 
 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
 	*pbuf = buf;
 	*pbufsize = bufsize;
 
 	if (patlen) {
-		patbeg = (char *) buf + i;
 		/*
 		 * Under no circumstances is the last character fed to
 		 * the automaton part of the match. It is EOF's nullbyte,
@@ -893,11 +931,10 @@
 		 * terminate the buffer.
 		 */
 		do
-			for (int ii = r.len; ii > 0; ii--)
-				if (buf[--k] && ungetc(buf[k], f) == EOF)
-					FATAL("unable to ungetc '%c'", buf[k]);
-		while (k > i + patlen);
-		buf[k] = '\0';
+			if (*--k && ungetc(*k, f) == EOF)
+				FATAL("unable to ungetc '%c'", *k);
+		while (k > patbeg + patlen);
+		*k = '\0';
 		return true;
 	}
 	else
@@ -1486,8 +1523,7 @@
 	/* add tmpset to current set of states */
 	++(f->curstat);
 	resize_state(f, f->curstat);
-	for (i = 0; i < NCHARS; i++)
-		set_gototab(f, f->curstat, 0, 0);
+	clear_gototab(f, f->curstat);
 	xfree(f->posns[f->curstat]);
 	p = intalloc(setcnt + 1, __func__);
 
@@ -1511,7 +1547,8 @@
 	if (f == NULL)
 		return;
 	for (i = 0; i < f->state_count; i++)
-		xfree(f->gototab[i])
+		xfree(f->gototab[i].entries);
+	xfree(f->gototab);
 	for (i = 0; i <= f->curstat; i++)
 		xfree(f->posns[i]);
 	for (i = 0; i <= f->accept; i++) {

diff --git a/bugs-fixed/REGRESS b/bugs-fixed/REGRESS
index 0716003..98d578a 100755
--- a/bugs-fixed/REGRESS
+++ b/bugs-fixed/REGRESS

@@ -1,4 +1,4 @@
-#! /bin/bash
+#! /bin/sh
 
 if [ ! -f ../a.out ]
 then

diff --git a/lex.c b/lex.c
index 675c116..0473a33 100644
--- a/lex.c
+++ b/lex.c

@@ -421,8 +421,12 @@
 			    {
 				int i;
 
+				if (!isxdigit(peek())) {
+					unput(c);
+					break;
+				}
 				n = 0;
-				for (i = 1; i <= 2; i++) {
+				for (i = 0; i < 2; i++) {
 					c = input();
 					if (c == 0)
 						break;
@@ -433,13 +437,13 @@
 							n += (c - '0');
 						else
 							n += 10 + (c - 'a');
-					} else
+					} else {
+						unput(c);
 						break;
+					}
 				}
-				if (n)
+				if (i)
 					*bp++ = n;
-				else
-					unput(c);
 				break;
 			    }
 

diff --git a/main.c b/main.c
index 3a205c8..c478e32 100644
--- a/main.c
+++ b/main.c

@@ -22,7 +22,7 @@
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20231030";
+const char	*version = "version 20231124";
 
 #define DEBUG
 #include <stdio.h>

diff --git a/makefile b/makefile
index df966ef..b47a8af 100644
--- a/makefile
+++ b/makefile

@@ -28,10 +28,10 @@
 CFLAGS = -O2
 
 # compiler options
-#CC = gcc -Wall -g -Wwrite-strings
-#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing
-#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
-HOSTCC = gcc -g -Wall -pedantic -Wcast-qual
+#CC = cc -Wall -g -Wwrite-strings
+#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing
+#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
+HOSTCC = cc -g -Wall -pedantic -Wcast-qual
 CC = $(HOSTCC)  # change this is cross-compiling.
 
 # By fiat, to make our lives easier, yacc is now defined to be bison.

diff --git a/maketab.c b/maketab.c
index 433541e..3747efa 100644
--- a/maketab.c
+++ b/maketab.c

@@ -52,8 +52,8 @@
 	{ ARRAY, "array", NULL },
 	{ INDIRECT, "indirect", "$(" },
 	{ SUBSTR, "substr", "substr" },
-	{ SUB, "sub", "sub" },
-	{ GSUB, "gsub", "gsub" },
+	{ SUB, "dosub", "sub" },
+	{ GSUB, "dosub", "gsub" },
 	{ INDEX, "sindex", "sindex" },
 	{ SPRINTF, "awksprintf", "sprintf " },
 	{ ADD, "arith", " + " },

diff --git a/proto.h b/proto.h
index cb4988e..ed63e78 100644
--- a/proto.h
+++ b/proto.h

@@ -196,8 +196,7 @@
 extern	const char	*filename(FILE *);
 extern	Cell	*closefile(Node **, int);
 extern	void	closeall(void);
-extern	Cell	*sub(Node **, int);
-extern	Cell	*gsub(Node **, int);
+extern	Cell	*dosub(Node **, int);
 
 extern	FILE	*popen(const char *, const char *);
 extern	int	pclose(FILE *);

diff --git a/run.c b/run.c
index a9ef242..7462c38 100644
--- a/run.c
+++ b/run.c

@@ -1540,8 +1540,9 @@
 		if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
 			;	/* self-assignment: leave alone unless it's a field or NF */
 		else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
+			yf = getfval(y);
 			setsval(x, getsval(y));
-			x->fval = getfval(y);
+			x->fval = yf;
 			x->tval |= NUM;
 		}
 		else if (isstr(y))
@@ -2397,169 +2398,143 @@
 
 void backsub(char **pb_ptr, const char **sptr_ptr);
 
-Cell *sub(Node **a, int nnn)	/* substitute command */
+Cell *dosub(Node **a, int subop)        /* sub and gsub; returns a NUM cell holding the match count m */
 {
-	const char *sptr, *q;
-	Cell *x, *y, *result;
-	char *t, *buf, *pb;
 	fa *pfa;
+	int tempstat;	/* pfa->initstat saved at the first match, restored after the loop */
+	char *repl;
+	Cell *x;
+
+	char *buf = NULL;	/* output buffer; stays NULL until the first match */
+	char *pb = NULL;	/* next write position in buf */
 	int bufsz = recsize;
 
-	if ((buf = (char *) malloc(bufsz)) == NULL)
-		FATAL("out of memory in sub");
-	x = execute(a[3]);	/* target string */
-	t = getsval(x);
-	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
-		pfa = (fa *) a[1];	/* regular expression */
-	else {
-		y = execute(a[1]);
-		pfa = makedfa(getsval(y), 1);
-		tempfree(y);
+	const char *r, *s;
+	const char *start;
+	const char *noempty = NULL;      /* empty match disallowed here */
+	size_t m = 0;                    /* match count */
+	size_t whichm;                   /* which match to select, 0 = global */
+	int mtype;                       /* match type */
+
+	if (a[0] == NULL) {	/* 0 => a[1] is already-compiled regexpr */
+		pfa = (fa *) a[1];
+	} else {
+		x = execute(a[1]);
+		pfa = makedfa(getsval(x), 1);
+		tempfree(x);
 	}
-	y = execute(a[2]);	/* replacement string */
-	result = False;
-	if (pmatch(pfa, t)) {
-		sptr = t;
-		adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");
-		pb = buf;
-		while (sptr < patbeg)
-			*pb++ = *sptr++;
-		sptr = getsval(y);
-		while (*sptr != '\0') {
-			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");
-			if (*sptr == '\\') {
-				backsub(&pb, &sptr);
-			} else if (*sptr == '&') {
-				sptr++;
-				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");
-				for (q = patbeg; q < patbeg+patlen; )
-					*pb++ = *q++;
-			} else
-				*pb++ = *sptr++;
-		}
-		*pb = '\0';
-		if (pb > buf + bufsz)
-			FATAL("sub result1 %.30s too big; can't happen", buf);
-		sptr = patbeg + patlen;
-		if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {
-			adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");
-			while ((*pb++ = *sptr++) != '\0')
-				continue;
-		}
-		if (pb > buf + bufsz)
-			FATAL("sub result2 %.30s too big; can't happen", buf);
-		setsval(x, buf);	/* BUG: should be able to avoid copy */
-		result = True;
-	}
+
+	x = execute(a[2]);	/* replacement string */
+	repl = tostring(getsval(x));
 	tempfree(x);
-	tempfree(y);
-	free(buf);
-	return result;
-}
 
-Cell *gsub(Node **a, int nnn)	/* global substitute */
-{
-	Cell *x, *y;
-	char *rptr, *pb;
-	const char *q, *t, *sptr;
-	char *buf;
-	fa *pfa;
-	int mflag, tempstat, num;
-	int bufsz = recsize;
-	int charlen = 0;
-
-	if ((buf = (char *) malloc(bufsz)) == NULL)
-		FATAL("out of memory in gsub");
-	mflag = 0;	/* if mflag == 0, can replace empty string */
-	num = 0;
-	x = execute(a[3]);	/* target string */
-	t = getsval(x);
-	if (a[0] == NULL)	/* 0 => a[1] is already-compiled regexpr */
-		pfa = (fa *) a[1];	/* regular expression */
-	else {
-		y = execute(a[1]);
-		pfa = makedfa(getsval(y), 1);
-		tempfree(y);
+	switch (subop) {
+	case SUB:
+		whichm = 1;
+		x = execute(a[3]);    /* source string */
+		break;
+	case GSUB:
+		whichm = 0;
+		x = execute(a[3]);    /* source string */
+		break;
+	default:
+		FATAL("dosub: unrecognized subop: %d", subop);
 	}
-	y = execute(a[2]);	/* replacement string */
-	if (pmatch(pfa, t)) {
-		tempstat = pfa->initstat;
-		pfa->initstat = 2;
-		pb = buf;
-		rptr = getsval(y);
-		do {
-			if (patlen == 0 && *patbeg != '\0') {	/* matched empty string */
-				if (mflag == 0) {	/* can replace empty */
-					num++;
-					sptr = rptr;
-					while (*sptr != '\0') {
-						adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
-						if (*sptr == '\\') {
-							backsub(&pb, &sptr);
-						} else if (*sptr == '&') {
-							sptr++;
-							adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
-							for (q = patbeg; q < patbeg+patlen; )
-								*pb++ = *q++;
-						} else
-							*pb++ = *sptr++;
-					}
-				}
-				if (*t == '\0')	/* at end */
-					goto done;
-				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
-				charlen = u8_nextlen(t);
-				while (charlen-- > 0)
-					*pb++ = *t++;
-				if (pb > buf + bufsz)	/* BUG: not sure of this test */
-					FATAL("gsub result0 %.30s too big; can't happen", buf);
-				mflag = 0;
+
+	start = getsval(x);
+	while (pmatch(pfa, start)) {
+		if (buf == NULL) {	/* first match: allocate output and save initstat */
+			if ((pb = buf = malloc(bufsz)) == NULL)
+				FATAL("out of memory in dosub");
+			tempstat = pfa->initstat;
+			pfa->initstat = 2;
+		}
+
+		/* match types */
+		#define	MT_IGNORE  0  /* unselected or invalid */
+		#define MT_INSERT  1  /* selected, empty */
+		#define MT_REPLACE 2  /* selected, not empty */
+
+		/* an empty match just after replacement is invalid */
+
+		if (patbeg == noempty && patlen == 0) {
+			mtype = MT_IGNORE;    /* invalid, not counted */
+		} else if (whichm == ++m || whichm == 0) {
+			mtype = patlen ? MT_REPLACE : MT_INSERT;
+		} else {
+			mtype = MT_IGNORE;    /* unselected, but counted */
+		}
+
+		/* leading text: */
+		if (patbeg > start) {
+			adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),
+				recsize, &pb, "dosub");
+			s = start;
+			while (s < patbeg)
+				*pb++ = *s++;
+		}
+
+		if (mtype == MT_IGNORE)
+			goto matching_text;  /* skip replacement text */
+
+		r = repl;
+		while (*r != 0) {
+			adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");
+			if (*r == '\\') {
+				backsub(&pb, &r);
+			} else if (*r == '&') {
+				r++;
+				adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,
+					&pb, "dosub");
+				for (s = patbeg; s < patbeg+patlen; )
+					*pb++ = *s++;
+			} else {
+				*pb++ = *r++;
 			}
-			else {	/* matched nonempty string */
-				num++;
-				sptr = t;
-				adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");
-				while (sptr < patbeg)
-					*pb++ = *sptr++;
-				sptr = rptr;
-				while (*sptr != '\0') {
-					adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");
-					if (*sptr == '\\') {
-						backsub(&pb, &sptr);
-					} else if (*sptr == '&') {
-						sptr++;
-						adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");
-						for (q = patbeg; q < patbeg+patlen; )
-							*pb++ = *q++;
-					} else
-						*pb++ = *sptr++;
-				}
-				t = patbeg + patlen;
-				if (patlen == 0 || *t == '\0' || *(t-1) == '\0')
-					goto done;
-				if (pb > buf + bufsz)
-					FATAL("gsub result1 %.30s too big; can't happen", buf);
-				mflag = 1;
-			}
-		} while (pmatch(pfa,t));
-		sptr = t;
-		adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");
-		while ((*pb++ = *sptr++) != '\0')
-			continue;
-	done:	if (pb < buf + bufsz)
-			*pb = '\0';
-		else if (*(pb-1) != '\0')
-			FATAL("gsub result2 %.30s truncated; can't happen", buf);
-		setsval(x, buf);	/* BUG: should be able to avoid copy + free */
+		}
+
+matching_text:
+		if (mtype == MT_REPLACE || *patbeg == '\0')
+			goto next_search;  /* skip matching text */
+		
+		if (patlen == 0)
+			patlen = u8_nextlen(patbeg);
+		adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");
+		s = patbeg;
+		while (s < patbeg + patlen)
+			*pb++ = *s++;
+
+next_search:
+		start = patbeg + patlen;
+		if (m == whichm || *patbeg == '\0')	/* selected match handled, or match began at the terminating NUL */
+			break;
+		if (mtype == MT_REPLACE)
+			noempty = start;
+
+		#undef MT_IGNORE
+		#undef MT_INSERT
+		#undef MT_REPLACE
+	}
+
+	xfree(repl);
+
+	if (buf != NULL) {
 		pfa->initstat = tempstat;
+
+		/* trailing text */
+		adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");
+		while ((*pb++ = *start++) != '\0')
+			;
+
+		setsval(x, buf);	/* x is the cell from a[3]; it receives the rebuilt string */
+		free(buf);
 	}
+
 	tempfree(x);
-	tempfree(y);
 	x = gettemp();
 	x->tval = NUM;
-	x->fval = num;
-	free(buf);
-	return(x);
+	x->fval = m;
+	return x;
 }
 
 void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */

diff --git a/testdir/Compare.tt b/testdir/Compare.tt
index ca828d2..4b297d7 100755
--- a/testdir/Compare.tt
+++ b/testdir/Compare.tt

@@ -4,7 +4,7 @@
 awk=${awk-../a.out}
 
 echo compiling time.c
-gcc time.c -o time
+cc time.c -o time
 time=./time
 
 echo time command = $time

diff --git a/testdir/REGRESS b/testdir/REGRESS
index 5c3667f..b54ce3f 100755
--- a/testdir/REGRESS
+++ b/testdir/REGRESS

@@ -1,7 +1,7 @@
 #!/bin/sh
 
 uname -a
-gcc echo.c -o echo && echo echo compiled
+cc echo.c -o echo && echo echo compiled
 
 oldawk=${oldawk-awk}
 awk=${awk-../a.out}

diff --git a/testdir/T.csv b/testdir/T.csv
index 10da1ea..79c1510 100755
--- a/testdir/T.csv
+++ b/testdir/T.csv

@@ -77,5 +77,4 @@
 a,	[a][]
 "",	[][]
 ,	[][]
-a"b	[a"b]	
 !!!!

diff --git a/testdir/T.flags b/testdir/T.flags
index 33d7c8d..17ce561 100755
--- a/testdir/T.flags
+++ b/testdir/T.flags

@@ -20,5 +20,6 @@
 $awk -F  >foo 2>&1
 grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator'
 
-$awk -F '' >foo 2>&1
-grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
+### Awk is now like gawk and splits into separate characters if FS = ""
+# $awk -F '' >foo 2>&1
+# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'

diff --git a/testdir/T.misc b/testdir/T.misc
index 1e5c3c5..b8ed3c1 100755
--- a/testdir/T.misc
+++ b/testdir/T.misc
Binary files differ
commit	e40c4a72a7db47c7614518690749cf4093b911a0	[log] [tgz]
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Wed Nov 29 00:10:49 2023 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Wed Nov 29 00:10:49 2023 +0000
tree	b4b77bcd6c82504db09e7e1dbf5db683b59e7c45
parent	13f5893f3addb30e157763ad236e7c2156e4b198 [diff]
parent	b7a213c11b2b60f1a1be18a29753132f0376d47f [diff]