btf_encoder: Represent "."-suffixed functions (".isra.0") in BTF

At gcc optimization level O2 or higher, many function optimizations
occur such as:

- constant propagation (.constprop.0);

- interprocedural scalar replacement of aggregates, removal of
  unused parameters and replacement of parameters passed by
  reference by parameters passed by value (.isra.0)

See [1] for details.

Currently BTF encoding does not handle such optimized functions that get
renamed with a "." suffix such as ".isra.0", ".constprop.0".  This is
safer because such suffixes can often indicate parameters have been
optimized out.  Since we can now spot this, support matching to a "."
suffix and represent the function in BTF if it does not have
optimized-out parameters.  First an attempt to match by exact name is
made; if that fails we fall back to checking for a "."-suffixed name.
The BTF representation will use the original function name "foo" not
"foo.isra.0" for consistency with DWARF representation.

There is a complication however, and this arises because we process each
CU separately and merge BTF when complete.  Different CUs may optimize
differently, so in one CU, a function may have optimized-out parameters
- and thus be ineligible for BTF - while in another it does not have
  optimized-out parameters - making it eligible for BTF.  The NF_HOOK
function is an example of this.

To avoid disrupting BTF generation parallelism, the approach taken is to
save pointers to the function representations in the ELF function table;
it is per-encoder, but the same representation is used across encoders,
so we can use the same array index to find the same function when
merging for efficiency.  Using the ELF function is ideal also as we
already have to carry out a name lookup for matching from DWARF to ELF.

At thread collection time, observations about optimizations are merged
across encoders and we know whether it is safe to add a "."-suffixed
function or not.

The result of this is we add 1180 "."-suffixed functions to the BTF
representation.

Encoding of "."-suffixed functions is done only if the
"--btf_gen_optimized" function is specified.

Because we need to ensure consistency across encoders, there is
a performance cost to the save/merge/apply approach. Comparing
baseline to test times:

$ time LLVM_OBJCOPY=objcopy pahole -J -j4 vmlinux

real	0m7.788s
user	0m17.629s
sys	0m0.652s

$ time LLVM_OBJCOPY=objcopy pahole -J -j4 --btf_gen_optimized vmlinux

real	0m10.839s
user	0m19.473s
sys	0m5.932s

[1] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Tested-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Hao Luo <haoluo@google.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@chromium.org>
Cc: Kui-Feng Lee <sinquersw@gmail.com>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Stanislav Fomichev <sdf@google.com>
Cc: Timo Beckers <timo@incline.eu>
Cc: Yonghong Song <yhs@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/1675790102-23037-6-git-send-email-alan.maguire@oracle.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
diff --git a/btf_encoder.c b/btf_encoder.c
index 74ab61b..cb50401 100644
--- a/btf_encoder.c
+++ b/btf_encoder.c
@@ -30,11 +30,20 @@
 
 #include <errno.h>
 #include <stdint.h>
+#include <search.h> /* for tsearch(), tfind() and tdestroy() */
 #include <pthread.h>
 
+/* state used to do later encoding of saved functions */
+struct btf_encoder_state {
+	uint32_t type_id_off;
+};
+
 struct elf_function {
 	const char	*name;
 	bool		 generated;
+	size_t		prefixlen;
+	struct function	*function;
+	struct btf_encoder_state state;
 };
 
 #define MAX_PERCPU_VAR_CNT 4096
@@ -57,6 +66,7 @@
 	struct elf_symtab *symtab;
 	uint32_t	  type_id_off;
 	uint32_t	  unspecified_type;
+	int		  saved_func_cnt;
 	bool		  has_index_type,
 			  need_index_type,
 			  skip_encoding_vars,
@@ -77,12 +87,15 @@
 		struct elf_function *entries;
 		int		    allocated;
 		int		    cnt;
+		int		    suffix_cnt; /* number of .isra, .part etc */
 	} functions;
 };
 
 static LIST_HEAD(encoders);
 static pthread_mutex_t encoders__lock = PTHREAD_MUTEX_INITIALIZER;
 
+static void btf_encoder__add_saved_funcs(struct btf_encoder *encoder);
+
 /* mutex only needed for add/delete, as this can happen in multiple encoding
  * threads.  Traversal of the list is currently confined to thread collection.
  */
@@ -707,6 +720,11 @@
 	int32_t i, id;
 	struct btf_var_secinfo *vsi;
 
+	if (encoder == other)
+		return 0;
+
+	btf_encoder__add_saved_funcs(other);
+
 	for (i = 0; i < nr_var_secinfo; i++) {
 		vsi = (struct btf_var_secinfo *)var_secinfo_buf->entries + i;
 		type_id = next_type_id + vsi->type - 1; /* Type ID starts from 1 */
@@ -782,6 +800,27 @@
 	return id;
 }
 
+static int32_t btf_encoder__save_func(struct btf_encoder *encoder, struct function *fn, struct elf_function *func)
+{
+	if (func->function) {
+		struct function *existing = func->function;
+
+		/* If saving and we find an existing entry, we want to merge
+		 * observations across both functions, checking that the
+		 * "seen optimized parameters" status is reflected in the func
+		 * entry. If the entry is new, record encoder state required
+		 * to add the local function later (encoder + type_id_off)
+		 * such that we can add the function later.
+		 */
+		existing->proto.optimized_parms |= fn->proto.optimized_parms;
+	} else {
+		func->state.type_id_off = encoder->type_id_off;
+		func->function = fn;
+		encoder->saved_func_cnt++;
+	}
+	return 0;
+}
+
 static int32_t btf_encoder__add_func(struct btf_encoder *encoder, struct function *fn)
 {
 	int btf_fnproto_id, btf_fn_id, tag_type_id;
@@ -807,6 +846,50 @@
 	return 0;
 }
 
+static void btf_encoder__add_saved_funcs(struct btf_encoder *encoder)
+{
+	int i;
+
+	for (i = 0; i < encoder->functions.cnt; i++) {
+		struct function *fn = encoder->functions.entries[i].function;
+		struct btf_encoder *other_encoder;
+
+		if (!fn || fn->proto.processed)
+			continue;
+
+		/* merge optimized-out status across encoders; since each
+		 * encoder has the same elf symbol table we can use the
+		 * same index to access the same elf symbol.
+		 */
+		btf_encoders__for_each_encoder(other_encoder) {
+			struct function *other_fn;
+
+			if (other_encoder == encoder)
+				continue;
+
+			other_fn = other_encoder->functions.entries[i].function;
+			if (!other_fn)
+				continue;
+			fn->proto.optimized_parms |= other_fn->proto.optimized_parms;
+			other_fn->proto.processed = 1;
+		}
+		if (fn->proto.optimized_parms) {
+			if (encoder->verbose) {
+				const char *name = function__name(fn);
+
+				printf("skipping addition of '%s'(%s) due to optimized-out parameters\n",
+				       name, fn->alias ?: name);
+			}
+		} else {
+			struct elf_function *func = &encoder->functions.entries[i];
+
+			encoder->type_id_off = func->state.type_id_off;
+			btf_encoder__add_func(encoder, fn);
+		}
+		fn->proto.processed = 1;
+	}
+}
+
 /*
  * This corresponds to the same macro defined in
  * include/linux/kallsyms.h
@@ -818,6 +901,11 @@
 	const struct elf_function *a = _a;
 	const struct elf_function *b = _b;
 
+	/* if search key allows prefix match, verify target has matching
+	 * prefix len and prefix matches.
+	 */
+	if (a->prefixlen && a->prefixlen == b->prefixlen)
+		return strncmp(a->name, b->name, b->prefixlen);
 	return strcmp(a->name, b->name);
 }
 
@@ -850,14 +938,22 @@
 	}
 
 	encoder->functions.entries[encoder->functions.cnt].name = name;
+	if (strchr(name, '.')) {
+		const char *suffix = strchr(name, '.');
+
+		encoder->functions.suffix_cnt++;
+		encoder->functions.entries[encoder->functions.cnt].prefixlen = suffix - name;
+	}
 	encoder->functions.entries[encoder->functions.cnt].generated = false;
+	encoder->functions.entries[encoder->functions.cnt].function = NULL;
 	encoder->functions.cnt++;
 	return 0;
 }
 
-static struct elf_function *btf_encoder__find_function(const struct btf_encoder *encoder, const char *name)
+static struct elf_function *btf_encoder__find_function(const struct btf_encoder *encoder,
+						       const char *name, size_t prefixlen)
 {
-	struct elf_function key = { .name = name };
+	struct elf_function key = { .name = name, .prefixlen = prefixlen };
 
 	return bsearch(&key, encoder->functions.entries, encoder->functions.cnt, sizeof(key), functions_cmp);
 }
@@ -1187,6 +1283,9 @@
 {
 	int err;
 
+	/* for single-threaded case, saved funcs are added here */
+	btf_encoder__add_saved_funcs(encoder);
+
 	if (gobuffer__size(&encoder->percpu_secinfo) != 0)
 		btf_encoder__add_datasec(encoder, PERCPU_SECTION);
 
@@ -1634,6 +1733,8 @@
 	}
 
 	cu__for_each_function(cu, core_id, fn) {
+		struct elf_function *func = NULL;
+		bool save = false;
 
 		/*
 		 * Skip functions that:
@@ -1647,29 +1748,62 @@
 		if (!ftype__has_arg_names(&fn->proto))
 			continue;
 		if (encoder->functions.cnt) {
-			struct elf_function *func;
 			const char *name;
 
 			name = function__name(fn);
 			if (!name)
 				continue;
 
-			func = btf_encoder__find_function(encoder, name);
-			if (!func || func->generated)
+			/* prefer exact function name match... */
+			func = btf_encoder__find_function(encoder, name, 0);
+			if (func) {
+				if (func->generated)
+					continue;
+				func->generated = true;
+			} else if (encoder->functions.suffix_cnt &&
+				   conf_load->btf_gen_optimized) {
+				/* falling back to name.isra.0 match if no exact
+				 * match is found; only bother if we found any
+				 * .suffix function names.  The function
+				 * will be saved and added once we ensure
+				 * it does not have optimized-out parameters
+				 * in any cu.
+				 */
+				func = btf_encoder__find_function(encoder, name,
+								  strlen(name));
+				if (func) {
+					save = true;
+					if (encoder->verbose)
+						printf("matched function '%s' with '%s'%s\n",
+						       name, func->name,
+						       fn->proto.optimized_parms ?
+						       ", has optimized-out parameters" : "");
+					fn->alias = func->name;
+				}
+			}
+			if (!func)
 				continue;
-			func->generated = true;
 		} else {
 			if (!fn->external)
 				continue;
 		}
 
-		err = btf_encoder__add_func(encoder, fn);
+		if (save)
+			err = btf_encoder__save_func(encoder, fn, func);
+		else
+			err = btf_encoder__add_func(encoder, fn);
 		if (err)
 			goto out;
 	}
 
 	if (!encoder->skip_encoding_vars)
 		err = btf_encoder__encode_cu_variables(encoder);
+
+	/* It is only safe to delete this CU if we have not stashed any static
+	 * functions for later addition.
+	 */
+	if (!err)
+		err = encoder->saved_func_cnt > 0 ? LSK__KEEPIT : LSK__DELETE;
 out:
 	encoder->cu = NULL;
 	return err;
diff --git a/dwarves.h b/dwarves.h
index ddf56f0..e169363 100644
--- a/dwarves.h
+++ b/dwarves.h
@@ -66,6 +66,7 @@
 	bool			skip_missing;
 	bool			skip_encoding_btf_type_tag;
 	bool			skip_encoding_btf_enum64;
+	bool			btf_gen_optimized;
 	uint8_t			hashtable_bits;
 	uint8_t			max_hashtable_bits;
 	uint16_t		kabi_prefix_len;
@@ -832,6 +833,7 @@
 	uint16_t	 nr_parms;
 	uint8_t		 unspec_parms:1; /* just one bit is needed */
 	uint8_t		 optimized_parms:1;
+	uint8_t		 processed:1;
 };
 
 static inline struct ftype *tag__ftype(const struct tag *tag)
@@ -884,6 +886,7 @@
 	struct rb_node	 rb_node;
 	const char	 *name;
 	const char	 *linkage_name;
+	const char	 *alias;	/* name.isra.0 */
 	uint32_t	 cu_total_size_inline_expansions;
 	uint16_t	 cu_total_nr_inline_expansions;
 	uint8_t		 inlined:2;
diff --git a/man-pages/pahole.1 b/man-pages/pahole.1
index 7460104..1a85f6d 100644
--- a/man-pages/pahole.1
+++ b/man-pages/pahole.1
@@ -262,6 +262,11 @@
 information has float types.
 
 .TP
+.B \-\-btf_gen_optimized
+Generate BTF for functions with optimization-related suffixes (.isra, .constprop).
+BTF will only be generated if a function does not optimize out parameters.
+
+.TP
 .B \-\-btf_gen_all
 Allow using all the BTF features supported by pahole.
 
diff --git a/pahole.c b/pahole.c
index 610a672..8effe4a 100644
--- a/pahole.c
+++ b/pahole.c
@@ -1230,6 +1230,7 @@
 #define ARGP_languages_exclude	   336
 #define ARGP_skip_encoding_btf_enum64 337
 #define ARGP_skip_emitting_atomic_typedefs 338
+#define ARGP_btf_gen_optimized  339
 
 static const struct argp_option pahole__options[] = {
 	{
@@ -1643,6 +1644,11 @@
 		.doc  = "Do not emit 'typedef _Atomic int atomic_int' & friends."
 	},
 	{
+		.name = "btf_gen_optimized",
+		.key  = ARGP_btf_gen_optimized,
+		.doc  = "Generate BTF for functions with optimization-related suffixes (.isra, .constprop).  BTF will only be generated if a function does not optimize out parameters."
+	},
+	{
 		.name = NULL,
 	}
 };
@@ -1813,6 +1819,8 @@
 		conf_load.skip_encoding_btf_enum64 = true;	break;
 	case ARGP_skip_emitting_atomic_typedefs:
 		conf.skip_emitting_atomic_typedefs = true;	break;
+	case ARGP_btf_gen_optimized:
+		conf_load.btf_gen_optimized = true;		break;
 	default:
 		return ARGP_ERR_UNKNOWN;
 	}
@@ -2991,20 +2999,20 @@
 		 * Merge content of the btf instances of worker threads to the btf
 		 * instance of the primary btf_encoder.
                 */
-		if (!threads[i]->btf || threads[i]->encoder == btf_encoder)
-			continue; /* The primary btf_encoder */
+		if (!threads[i]->btf)
+			continue;
 		err = btf_encoder__add_encoder(btf_encoder, threads[i]->encoder);
 		if (err < 0)
 			goto out;
-		btf_encoder__delete(threads[i]->encoder);
-		threads[i]->encoder = NULL;
 	}
 	err = 0;
 
 out:
 	for (i = 0; i < nr_threads; i++) {
-		if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
+		if (threads[i]->encoder && threads[i]->encoder != btf_encoder) {
 			btf_encoder__delete(threads[i]->encoder);
+			threads[i]->encoder = NULL;
+		}
 	}
 	free(threads[0]);
 
@@ -3088,11 +3096,11 @@
 			encoder = btf_encoder;
 		}
 
-		if (btf_encoder__encode_cu(encoder, cu, conf_load)) {
+		ret = btf_encoder__encode_cu(encoder, cu, conf_load);
+		if (ret < 0) {
 			fprintf(stderr, "Encountered error while encoding BTF.\n");
 			exit(1);
 		}
-		ret = LSK__DELETE;
 out_btf:
 		return ret;
 	}