minijail: Add support for pre-compiled BPF programs

This change introduces minijail_set_seccomp_filters() and
the --seccomp-bpf-binary CLI flag. This allows loading pre-compiled
seccomp-bpf binaries, so that we can use libseccomp or any other
optimizer without adding a lot of bloat to minijail.

Bug: chromium:856315
Test: ./parse_seccomp_policy --dump=stdin_stdout.bpf \
          test/stdin_stdout.policy && \
      ./minijail0 --seccomp-bpf-binary=stdin_stdout.bpf \
          --preload-library=./libminijailpreload.so -- /bin/ls

Change-Id: I87ff9279da40cf9b6576a5db9c8103b2a8fa4dd7
diff --git a/Android.bp b/Android.bp
index 5185aed..93c4543 100644
--- a/Android.bp
+++ b/Android.bp
@@ -33,6 +33,7 @@
 cc_defaults {
     name: "libminijail_flags",
     cflags: [
+        "-D_FILE_OFFSET_BITS=64",
         "-DALLOW_DEBUG_LOGGING",
         "-DHAVE_SECUREBITS_H",
         "-Wall",
diff --git a/libminijail.c b/libminijail.c
index d2f4945..cd302d1 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -13,6 +13,7 @@
 #include <fcntl.h>
 #include <grp.h>
 #include <linux/capability.h>
+#include <linux/filter.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdbool.h>
@@ -868,7 +869,7 @@
 	j->flags.no_new_privs = 0;
 }
 
-static int seccomp_should_parse_filters(struct minijail *j)
+static int seccomp_should_use_filters(struct minijail *j)
 {
 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
 		/*
@@ -916,10 +917,65 @@
 	return 1;
 }
 
+/* Installs |filter| in |j|, deep-copying it first unless |owned| is set. */
+static int set_seccomp_filters_internal(struct minijail *j,
+					struct sock_fprog *filter, bool owned)
+{
+	struct sock_fprog *prog = filter;
+
+	if (!owned) {
+		/* The caller keeps |filter|, so work on a private copy. */
+		prog = malloc(sizeof(*prog));
+		if (!prog)
+			return -ENOMEM;
+		prog->len = filter->len;
+		const size_t size = sizeof(*prog->filter) * prog->len;
+		prog->filter = malloc(size);
+		if (!prog->filter) {
+			free(prog);
+			return -ENOMEM;
+		}
+		memcpy(prog->filter, filter->filter, size);
+	}
+
+	/* Release whichever program |j| was holding before. */
+	if (j->filter_prog) {
+		free(j->filter_prog->filter);
+		free(j->filter_prog);
+	}
+	j->filter_prog = prog;
+	j->filter_len = prog->len;
+	return 0;
+}
+
+/* Installs a private copy of the pre-compiled BPF program |filter| in |j|. */
+void API minijail_set_seccomp_filters(struct minijail *j,
+				      const struct sock_fprog *filter)
+{
+	if (!seccomp_should_use_filters(j))
+		return;
+
+	if (j->flags.seccomp_filter_logging) {
+		die("minijail_log_seccomp_filter_failures() is incompatible "
+		    "with minijail_set_seccomp_filters()");
+	}
+
+	/*
+	 * Dropping const is safe: with owned == false the helper only reads
+	 * from |filter|, and its sole failure mode is -ENOMEM.
+	 */
+	struct sock_fprog *unowned = (struct sock_fprog *)filter;
+	if (set_seccomp_filters_internal(j, unowned, false) < 0) {
+		die("failed to copy seccomp filter");
+	}
+}
+
 static int parse_seccomp_filters(struct minijail *j, const char *filename,
 				 FILE *policy_file)
 {
 	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
+	if (!fprog)
+		return -ENOMEM;
 	int use_ret_trap =
 	    j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging;
 	int allow_logging = j->flags.seccomp_filter_logging;
@@ -930,14 +986,12 @@
 		return -1;
 	}
 
-	j->filter_len = fprog->len;
-	j->filter_prog = fprog;
-	return 0;
+	return set_seccomp_filters_internal(j, fprog, true);
 }
 
 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
 {
-	if (!seccomp_should_parse_filters(j))
+	if (!seccomp_should_use_filters(j))
 		return;
 
 	FILE *file = fopen(path, "re");
@@ -957,7 +1011,7 @@
 	char *fd_path, *path;
 	FILE *file;
 
-	if (!seccomp_should_parse_filters(j))
+	if (!seccomp_should_use_filters(j))
 		return;
 
 	file = fdopen(fd, "r");
@@ -1228,10 +1282,8 @@
 	for (i = 0; i < j->cgroup_count; ++i)
 		free(j->cgroups[i]);
 bad_mounts:
-	if (j->flags.seccomp_filter && j->filter_len > 0) {
+	if (j->filter_prog && j->filter_prog->filter)
 		free(j->filter_prog->filter);
-		free(j->filter_prog);
-	}
 bad_filter_prog_instrs:
 	if (j->filter_prog)
 		free(j->filter_prog);
@@ -2955,7 +3007,7 @@
 {
 	size_t i;
 
-	if (j->flags.seccomp_filter && j->filter_prog) {
+	if (j->filter_prog) {
 		free(j->filter_prog->filter);
 		free(j->filter_prog);
 	}
diff --git a/libminijail.h b/libminijail.h
index ad17819..da8244d 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -30,6 +30,7 @@
 };
 
 struct minijail;
+struct sock_fprog;
 
 /*
  * A hook that can be used to execute code at various events during minijail
@@ -81,6 +82,9 @@
 void minijail_no_new_privs(struct minijail *j);
 void minijail_use_seccomp_filter(struct minijail *j);
 void minijail_set_seccomp_filter_tsync(struct minijail *j);
+/* Does not take ownership of |filter|. */
+void minijail_set_seccomp_filters(struct minijail *j,
+				  const struct sock_fprog *filter);
 void minijail_parse_seccomp_filters(struct minijail *j, const char *path);
 void minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd);
 void minijail_log_seccomp_filter_failures(struct minijail *j);
diff --git a/minijail0.1 b/minijail0.1
index 676f901..6f85f87 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -216,8 +216,9 @@
 .TP
 \fB-S <arch-specific seccomp_filter policy file>\fR
 Enable \fBseccomp\fR(2) in mode 13 which restricts the child process to a set of
-system calls defined in the policy file. Note that system calls often change
-names based on the architecture or mode. (uname -m is your friend.)
+system calls defined in the policy file. Note that system call names may be
+different based on the runtime environment; see \fBminijail0\fR(5) for more
+details.
 .TP
 \fB-t[size]\fR
 Mounts a tmpfs filesystem on /tmp. /tmp must exist already (e.g. in the chroot).
@@ -286,6 +287,14 @@
 \fB--preload-library <file path>\fR
 Allows overriding the default path of \fI/lib/libminijailpreload.so\fR.  This
 is only really useful for testing.
+.TP
+\fB--seccomp-bpf-binary <arch-specific BPF binary>\fR
+This is similar to \fB-S\fR, but
+instead of using a policy file, \fB--seccomp-bpf-binary\fR expects an
+arch-and-kernel-version-specific pre-compiled BPF binary (such as the ones
+produced by \fBparse_seccomp_policy\fR).  Note that the filter might be
+different based on the runtime environment; see \fBminijail0\fR(5) for more
+details.
 .SH SANDBOXING PROFILES
 The following sandboxing profiles are supported:
 .TP
diff --git a/minijail0_cli.c b/minijail0_cli.c
index ed78022..2e25ca6 100644
--- a/minijail0_cli.c
+++ b/minijail0_cli.c
@@ -6,6 +6,7 @@
 #include <dlfcn.h>
 #include <errno.h>
 #include <getopt.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -14,6 +15,8 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#include <linux/filter.h>
+
 #include "libminijail.h"
 #include "libsyscalls.h"
 
@@ -406,6 +409,47 @@
 	minijail_remount_mode(j, msmode);
 }
 
+/* Loads the BPF binary at |filter_path| into |filter|; exits on any error. */
+static void read_seccomp_filter(const char *filter_path,
+				struct sock_fprog *filter)
+{
+	const size_t insn_size = sizeof(struct sock_filter);
+	FILE *f = fopen(filter_path, "re");
+	if (!f) {
+		fprintf(stderr, "failed to open %s: %m\n", filter_path);
+		exit(1);
+	}
+	off_t filter_size = 0;
+	if (fseeko(f, 0, SEEK_END) == -1 || (filter_size = ftello(f)) == -1) {
+		fprintf(stderr, "failed to get file size of %s: %m\n",
+			filter_path);
+		exit(1);
+	}
+	if (filter_size % insn_size != 0) {
+		fprintf(stderr, "filter size (%" PRId64
+			") of %s is not a multiple of %zu\n",
+			(int64_t)filter_size, filter_path, insn_size);
+		exit(1);
+	}
+	rewind(f);
+	/* |len| is narrower than off_t, so reject sizes that do not fit. */
+	filter->len = filter_size / insn_size;
+	if (filter->len == 0 || filter->len != filter_size / insn_size) {
+		fprintf(stderr, "%s is empty or too large\n", filter_path);
+		exit(1);
+	}
+	filter->filter = malloc(filter_size);
+	if (!filter->filter) {
+		fprintf(stderr, "failed to allocate memory for filter: %m\n");
+		exit(1);
+	}
+	if (fread(filter->filter, insn_size, filter->len, f) != filter->len) {
+		fprintf(stderr, "failed to read %s: %m\n", filter_path);
+		exit(1);
+	}
+	fclose(f);
+}
+
 static void usage(const char *progn)
 {
 	size_t i;
@@ -496,7 +540,12 @@
 	       "                that are typically used together.\n"
 	       "                See the minijail0(1) man page for the full list.\n"
 	       "  --preload-library=<f>:Overrides the path to \"" PRELOADPATH "\".\n"
-	       "                This is only really useful for local testing.\n");
+	       "                This is only really useful for local testing.\n"
+	       "  --seccomp-bpf-binary=<f>:Set a pre-compiled seccomp filter using <f>.\n"
+	       "                E.g., '-S /usr/share/filters/<prog>.$(uname -m).bpf'.\n"
+	       "                Requires -n when not running as root.\n"
+	       "                The user is responsible for ensuring that the binary\n"
+	       "                was compiled for the correct architecture / kernel version.\n");
 	/* clang-format on */
 }
 
@@ -516,7 +565,7 @@
 	       const char **preload_path)
 {
 	int opt;
-	int use_seccomp_filter = 0;
+	int use_seccomp_filter = 0, use_seccomp_filter_binary = 0;
 	int forward = 1;
 	int binding = 0;
 	int chroot = 0, pivot_root = 0;
@@ -524,7 +573,6 @@
 	int inherit_suppl_gids = 0, keep_suppl_gids = 0;
 	int caps = 0, ambient_caps = 0;
 	int seccomp = -1;
-	const size_t path_max = 4096;
 	uid_t uid = 0;
 	gid_t gid = 0;
 	char *uidmap = NULL, *gidmap = NULL;
@@ -544,6 +592,7 @@
 		{"logging", required_argument, 0, 130},
 		{"profile", required_argument, 0, 131},
 		{"preload-library", required_argument, 0, 132},
+		{"seccomp-bpf-binary", required_argument, 0, 133},
 		{0, 0, 0, 0},
 	};
 	/* clang-format on */
@@ -563,7 +612,8 @@
 		case 's':
 			if (seccomp != -1 && seccomp != 1) {
 				fprintf(stderr,
-					"Do not use -s & -S together.\n");
+					"Do not use -s, -S, or "
+					"--seccomp-bpf-binary together.\n");
 				exit(1);
 			}
 			seccomp = 1;
@@ -572,21 +622,13 @@
 		case 'S':
 			if (seccomp != -1 && seccomp != 2) {
 				fprintf(stderr,
-					"Do not use -s & -S together.\n");
+					"Do not use -s, -S, or "
+					"--seccomp-bpf-binary together.\n");
 				exit(1);
 			}
 			seccomp = 2;
 			minijail_use_seccomp_filter(j);
-			if (strlen(optarg) >= path_max) {
-				fprintf(stderr, "Filter path is too long.\n");
-				exit(1);
-			}
-			filter_path = strndup(optarg, path_max);
-			if (!filter_path) {
-				fprintf(stderr,
-					"Could not strndup(3) filter path.\n");
-				exit(1);
-			}
+			filter_path = optarg;
 			use_seccomp_filter = 1;
 			break;
 		case 'l':
@@ -777,6 +819,18 @@
 		case 132: /* PRELOADPATH */
 			*preload_path = optarg;
 			break;
+		case 133: /* seccomp-bpf binary. */
+			if (seccomp != -1 && seccomp != 3) {
+				fprintf(stderr,
+					"Do not use -s, -S, or "
+					"--seccomp-bpf-binary together.\n");
+				exit(1);
+			}
+			seccomp = 3;
+			minijail_use_seccomp_filter(j);
+			filter_path = optarg;
+			use_seccomp_filter_binary = 1;
+			break;
 		default:
 			usage(argv[0]);
 			exit(opt == 'h' ? 0 : 1);
@@ -838,7 +892,11 @@
 	 */
 	if (use_seccomp_filter) {
 		minijail_parse_seccomp_filters(j, filter_path);
-		free((void *)filter_path);
+	} else if (use_seccomp_filter_binary) {
+		struct sock_fprog filter;
+		read_seccomp_filter(filter_path, &filter);
+		minijail_set_seccomp_filters(j, &filter);
+		free((void *)filter.filter);
 	}
 
 	/* Mount a tmpfs under /tmp and set its size. */