/*

   nsjail - CLONE_NEWNS routines
   -----------------------------------------

   Copyright 2014 Google Inc. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

*/

#include "mnt.h"

#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/sched.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <memory>
#include <string>

#include "logs.h"
#include "macros.h"
#include "subproc.h"
#include "util.h"

namespace mnt {

#if !defined(MS_LAZYTIME)
#define MS_LAZYTIME (1 << 25)
#endif /* if !defined(MS_LAZYTIME) */

static const std::string flagsToStr(uintptr_t flags) {
	std::string res;

	struct {
		const uint64_t flag;
		const char* const name;
	} static const mountFlags[] = {
		NS_VALSTR_STRUCT(MS_RDONLY),
		NS_VALSTR_STRUCT(MS_NOSUID),
		NS_VALSTR_STRUCT(MS_NODEV),
		NS_VALSTR_STRUCT(MS_NOEXEC),
		NS_VALSTR_STRUCT(MS_SYNCHRONOUS),
		NS_VALSTR_STRUCT(MS_REMOUNT),
		NS_VALSTR_STRUCT(MS_MANDLOCK),
		NS_VALSTR_STRUCT(MS_DIRSYNC),
		NS_VALSTR_STRUCT(MS_NOATIME),
		NS_VALSTR_STRUCT(MS_NODIRATIME),
		NS_VALSTR_STRUCT(MS_BIND),
		NS_VALSTR_STRUCT(MS_MOVE),
		NS_VALSTR_STRUCT(MS_REC),
		NS_VALSTR_STRUCT(MS_SILENT),
		NS_VALSTR_STRUCT(MS_POSIXACL),
		NS_VALSTR_STRUCT(MS_UNBINDABLE),
		NS_VALSTR_STRUCT(MS_PRIVATE),
		NS_VALSTR_STRUCT(MS_SLAVE),
		NS_VALSTR_STRUCT(MS_SHARED),
		NS_VALSTR_STRUCT(MS_RELATIME),
		NS_VALSTR_STRUCT(MS_KERNMOUNT),
		NS_VALSTR_STRUCT(MS_I_VERSION),
		NS_VALSTR_STRUCT(MS_STRICTATIME),
		NS_VALSTR_STRUCT(MS_LAZYTIME),
#if defined(MS_ACTIVE)
		NS_VALSTR_STRUCT(MS_ACTIVE),
#endif /* defined(MS_ACTIVE) */
#if defined(MS_NOUSER)
		NS_VALSTR_STRUCT(MS_NOUSER),
#endif /* defined(MS_NOUSER) */
	};

	uint64_t knownFlagMask = 0U;
	for (const auto& i : mountFlags) {
		if (flags & i.flag) {
			if (!res.empty()) {
				res.append("|");
			}
			res.append(i.name);
		}
		knownFlagMask |= i.flag;
	}

	if (flags & ~(knownFlagMask)) {
		util::StrAppend(&res, "|%#tx", flags & ~(knownFlagMask));
	}

	return res;
}

static bool isDir(const char* path) {
	/*
	 *  If the source dir is NULL, we assume it's a dir (for /proc and tmpfs)
	 */
	if (path == NULL) {
		return true;
	}
	struct stat st;
	if (stat(path, &st) == -1) {
		PLOG_D("stat('%s')", path);
		return false;
	}
	if (S_ISDIR(st.st_mode)) {
		return true;
	}
	return false;
}

static bool mountPt(mount_t* mpt, const char* newroot, const char* tmpdir) {
	LOG_D("Mounting '%s'", describeMountPt(*mpt).c_str());

	char dstpath[PATH_MAX];
	snprintf(dstpath, sizeof(dstpath), "%s/%s", newroot, mpt->dst.c_str());

	char srcpath[PATH_MAX];
	if (!mpt->src.empty()) {
		snprintf(srcpath, sizeof(srcpath), "%s", mpt->src.c_str());
	} else {
		snprintf(srcpath, sizeof(srcpath), "none");
	}

	if (!util::createDirRecursively(dstpath)) {
		LOG_W("Couldn't create upper directories for '%s'", dstpath);
		return false;
	}

	if (mpt->is_symlink) {
		LOG_D("symlink('%s', '%s')", srcpath, dstpath);
		if (symlink(srcpath, dstpath) == -1) {
			if (mpt->is_mandatory) {
				PLOG_E("symlink('%s', '%s')", srcpath, dstpath);
				return false;
			} else {
				PLOG_W("symlink('%s', '%s'), but it's not mandatory, continuing",
				    srcpath, dstpath);
			}
		}
		return true;
	}

	if (mpt->is_dir) {
		if (mkdir(dstpath, 0711) == -1 && errno != EEXIST) {
			PLOG_W("mkdir('%s')", dstpath);
		}
	} else {
		int fd = TEMP_FAILURE_RETRY(open(dstpath, O_CREAT | O_RDONLY | O_CLOEXEC, 0644));
		if (fd >= 0) {
			close(fd);
		} else {
			PLOG_W("open('%s', O_CREAT|O_RDONLY|O_CLOEXEC, 0644)", dstpath);
		}
	}

	if (!mpt->src_content.empty()) {
		static uint64_t df_counter = 0;
		snprintf(
		    srcpath, sizeof(srcpath), "%s/dynamic_file.%" PRIu64, tmpdir, ++df_counter);
		int fd = TEMP_FAILURE_RETRY(
		    open(srcpath, O_CREAT | O_EXCL | O_CLOEXEC | O_WRONLY, 0644));
		if (fd < 0) {
			PLOG_W("open(srcpath, O_CREAT|O_EXCL|O_CLOEXEC|O_WRONLY, 0644) failed");
			return false;
		}
		if (!util::writeToFd(fd, mpt->src_content.data(), mpt->src_content.length())) {
			LOG_W(
			    "Writing %zu bytes to '%s' failed", mpt->src_content.length(), srcpath);
			close(fd);
			return false;
		}
		close(fd);
		mpt->flags |= (MS_BIND | MS_REC | MS_PRIVATE);
	}

	/*
	 * Initially mount it as RW, it will be remounted later on if needed
	 */
	unsigned long flags = mpt->flags & ~(MS_RDONLY);
	if (mount(srcpath, dstpath, mpt->fs_type.c_str(), flags, mpt->options.c_str()) == -1) {
		if (errno == EACCES) {
			PLOG_W(
			    "mount('%s') src:'%s' dstpath:'%s' failed. "
			    "Try fixing this problem by applying 'chmod o+x' to the '%s' "
			    "directory and its ancestors",
			    describeMountPt(*mpt).c_str(), srcpath, dstpath, srcpath);
		} else {
			PLOG_W("mount('%s') src:'%s' dstpath:'%s' failed",
			    describeMountPt(*mpt).c_str(), srcpath, dstpath);
			if (mpt->fs_type.compare("proc") == 0) {
				PLOG_W(
				    "procfs can only be mounted if the original /proc doesn't have "
				    "any other file-systems mounted on top of it (e.g. /dev/null "
				    "on top of /proc/kcore)");
			}
		}
		return false;
	} else {
		mpt->mounted = true;
	}

	if (!mpt->src_content.empty() && unlink(srcpath) == -1) {
		PLOG_W("unlink('%s')", srcpath);
	}
	return true;
}

static bool remountPt(const mount_t& mpt) {
	if (!mpt.mounted) {
		return true;
	}
	if (mpt.is_symlink) {
		return true;
	}

	struct statvfs vfs;
	if (TEMP_FAILURE_RETRY(statvfs(mpt.dst.c_str(), &vfs)) == -1) {
		PLOG_W("statvfs('%s')", mpt.dst.c_str());
		return false;
	}

	struct {
		const unsigned long mount_flag;
		const unsigned long vfs_flag;
	} static const mountPairs[] = {
	    {MS_NOSUID, ST_NOSUID},
	    {MS_NODEV, ST_NODEV},
	    {MS_NOEXEC, ST_NOEXEC},
	    {MS_SYNCHRONOUS, ST_SYNCHRONOUS},
	    {MS_MANDLOCK, ST_MANDLOCK},
	    {MS_NOATIME, ST_NOATIME},
	    {MS_NODIRATIME, ST_NODIRATIME},
	    {MS_RELATIME, ST_RELATIME},
	};

	const unsigned long per_mountpoint_flags =
	    MS_LAZYTIME | MS_MANDLOCK | MS_NOATIME | MS_NODEV | MS_NODIRATIME | MS_NOEXEC |
	    MS_NOSUID | MS_RELATIME | MS_RDONLY | MS_SYNCHRONOUS;
	unsigned long new_flags = MS_REMOUNT | MS_BIND | (mpt.flags & per_mountpoint_flags);
	for (const auto& i : mountPairs) {
		if (vfs.f_flag & i.vfs_flag) {
			new_flags |= i.mount_flag;
		}
	}

	LOG_D("Re-mounting '%s' (flags:%s)", mpt.dst.c_str(), flagsToStr(new_flags).c_str());
	if (mount(mpt.dst.c_str(), mpt.dst.c_str(), NULL, new_flags, 0) == -1) {
		PLOG_W("mount('%s', flags:%s)", mpt.dst.c_str(), flagsToStr(new_flags).c_str());
		return false;
	}

	return true;
}

static bool mkdirAndTest(const std::string& dir) {
	if (mkdir(dir.c_str(), 0755) == -1 && errno != EEXIST) {
		PLOG_D("Couldn't create '%s' directory", dir.c_str());
		return false;
	}
	if (access(dir.c_str(), R_OK) == -1) {
		PLOG_W("access('%s', R_OK)", dir.c_str());
		return false;
	}
	LOG_D("Created accessible directory in '%s'", dir.c_str());
	return true;
}

static std::unique_ptr<std::string> getDir(nsjconf_t* nsjconf, const char* name) {
	std::unique_ptr<std::string> dir(new std::string);

	dir->assign("/run/user/").append(std::to_string(nsjconf->orig_uid)).append("/nsjail");
	if (mkdirAndTest(*dir)) {
		dir->append("/").append(name);
		if (mkdirAndTest(*dir)) {
			return dir;
		}
	}
	dir->assign("/run/user/")
	    .append("/nsjail.")
	    .append(std::to_string(nsjconf->orig_uid))
	    .append(".")
	    .append(name);
	if (mkdirAndTest(*dir)) {
		return dir;
	}
	dir->assign("/tmp/nsjail.")
	    .append(std::to_string(nsjconf->orig_uid))
	    .append(".")
	    .append(name);
	if (mkdirAndTest(*dir)) {
		return dir;
	}
	const char* tmp = getenv("TMPDIR");
	if (tmp) {
		dir->assign(tmp)
		    .append("/")
		    .append("nsjail.")
		    .append(std::to_string(nsjconf->orig_uid))
		    .append(".")
		    .append(name);
		if (mkdirAndTest(*dir)) {
			return dir;
		}
	}
	dir->assign("/dev/shm/nsjail.")
	    .append(std::to_string(nsjconf->orig_uid))
	    .append(".")
	    .append(name);
	if (mkdirAndTest(*dir)) {
		return dir;
	}
	dir->assign("/tmp/nsjail.")
	    .append(std::to_string(nsjconf->orig_uid))
	    .append(".")
	    .append(name)
	    .append(".")
	    .append(std::to_string(util::rnd64()));
	if (mkdirAndTest(*dir)) {
		return dir;
	}

	LOG_E("Couldn't create tmp directory of type '%s'", name);
	return nullptr;
}

static bool initNoCloneNs(nsjconf_t* nsjconf) {
	/*
	 * If CLONE_NEWNS is not used, we would be changing the global mount namespace, so simply
	 * use --chroot in this case
	 */
	if (nsjconf->chroot.empty()) {
		return true;
	}
	if (chroot(nsjconf->chroot.c_str()) == -1) {
		PLOG_E("chroot('%s')", nsjconf->chroot.c_str());
		return false;
	}
	if (chdir("/") == -1) {
		PLOG_E("chdir('/')");
		return false;
	}
	return true;
}

static bool initCloneNs(nsjconf_t* nsjconf) {
	if (chdir("/") == -1) {
		PLOG_E("chdir('/')");
		return false;
	}

	std::unique_ptr<std::string> destdir = getDir(nsjconf, "root");
	if (!destdir) {
		LOG_E("Couldn't obtain root mount directories");
		return false;
	}

	/* Make changes to / (recursively) private, to avoid changing the global mount ns */
	if (mount("/", "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1) {
		PLOG_E("mount('/', '/', NULL, MS_REC|MS_PRIVATE, NULL)");
		return false;
	}
	if (mount(NULL, destdir->c_str(), "tmpfs", 0, "size=16777216") == -1) {
		PLOG_E("mount('%s', 'tmpfs')", destdir->c_str());
		return false;
	}

	std::unique_ptr<std::string> tmpdir = getDir(nsjconf, "tmp");
	if (!tmpdir) {
		LOG_E("Couldn't obtain temporary mount directories");
		return false;
	}
	if (mount(NULL, tmpdir->c_str(), "tmpfs", 0, "size=16777216") == -1) {
		PLOG_E("mount('%s', 'tmpfs')", tmpdir->c_str());
		return false;
	}

	for (auto& p : nsjconf->mountpts) {
		if (!mountPt(&p, destdir->c_str(), tmpdir->c_str()) && p.is_mandatory) {
			LOG_E("Couldn't mount '%s'", p.dst.c_str());
			return false;
		}
	}

	if (umount2(tmpdir->c_str(), MNT_DETACH) == -1) {
		PLOG_E("umount2('%s', MNT_DETACH)", tmpdir->c_str());
		return false;
	}

	if (!nsjconf->no_pivotroot) {
		/*
		 * This requires some explanation: It's actually possible to pivot_root('/', '/').
		 * After this operation has been completed, the old root is mounted over the new
		 * root, and it's OK to simply umount('/') now, and to have new_root as '/'. This
		 * allows us not care about providing any special directory for old_root, which is
		 * sometimes not easy, given that e.g. /tmp might not always be present inside
		 * new_root
		 */
		if (util::syscall(__NR_pivot_root, (uintptr_t)destdir->c_str(),
			(uintptr_t)destdir->c_str()) == -1) {
			PLOG_E("pivot_root('%s', '%s')", destdir->c_str(), destdir->c_str());
			return false;
		}

		if (umount2("/", MNT_DETACH) == -1) {
			PLOG_E("umount2('/', MNT_DETACH)");
			return false;
		}
	} else {
		/*
		 * pivot_root would normally un-mount the old root, however in certain cases this
		 * operation is forbidden. There are systems (mainly embedded) that keep their root
		 * file system in RAM, when initially loaded by the kernel (e.g. initramfs),
		 * and there is no other file system that is mounted on top of it.In such systems,
		 * there is no option to pivot_root!
		 * For more information, see
		 * kernel.org/doc/Documentation/filesystems/ramfs-rootfs-initramfs.txt. switch_root
		 * alternative: Innstead of un-mounting the old rootfs, it is over mounted by moving
		 * the new root to it.
		 */

		/* NOTE: Using mount move and chroot allows escaping back into the old root when
		 * proper capabilities are kept in the user namespace. It can be acheived by
		 * unmounting the new root and using setns to re-enter the mount namespace.
		 */
		LOG_W(
		    "Using no_pivotroot is escapable when user posseses relevant capabilities, "
		    "Use it with care!");

		if (chdir(destdir->c_str()) == -1) {
			PLOG_E("chdir('%s')", destdir->c_str());
			return false;
		}

		/* mount moving the new root on top of '/'. This operation is atomic and doesn't
		involve un-mounting '/' at any stage */
		if (mount(".", "/", NULL, MS_MOVE, NULL) == -1) {
			PLOG_E("mount('/', %s, NULL, MS_MOVE, NULL)", destdir->c_str());
			return false;
		}

		if (chroot(".") == -1) {
			PLOG_E("chroot('%s')", destdir->c_str());
			return false;
		}
	}

	for (const auto& p : nsjconf->mountpts) {
		if (!remountPt(p) && p.is_mandatory) {
			return false;
		}
	}

	return true;
}

static bool initNsInternal(nsjconf_t* nsjconf) {
	if (nsjconf->clone_newns) {
		if (!initCloneNs(nsjconf)) {
			return false;
		}
	} else {
		if (!initNoCloneNs(nsjconf)) {
			return false;
		}
	}

	if (chdir(nsjconf->cwd.c_str()) == -1) {
		PLOG_E("chdir('%s')", nsjconf->cwd.c_str());
		return false;
	}
	return true;
}

/*
 * With mode MODE_STANDALONE_EXECVE it's required to mount /proc inside a new process,
 * as the current process is still in the original PID namespace (man pid_namespaces)
 */
bool initNs(nsjconf_t* nsjconf) {
	if (nsjconf->mode != MODE_STANDALONE_EXECVE) {
		return initNsInternal(nsjconf);
	}

	pid_t pid = subproc::cloneProc(CLONE_FS, SIGCHLD);
	if (pid == -1) {
		return false;
	}

	if (pid == 0) {
		exit(initNsInternal(nsjconf) ? 0 : 0xff);
	}

	int status;
	while (wait4(pid, &status, 0, NULL) != pid)
		;
	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
		return true;
	}
	return false;
}

static bool addMountPt(mount_t* mnt, const std::string& src, const std::string& dst,
    const std::string& fstype, const std::string& options, uintptr_t flags, isDir_t is_dir,
    bool is_mandatory, const std::string& src_env, const std::string& dst_env,
    const std::string& src_content, bool is_symlink) {
	if (!src_env.empty()) {
		const char* e = getenv(src_env.c_str());
		if (e == NULL) {
			LOG_W("No such envar:'%s'", src_env.c_str());
			return false;
		}
		mnt->src = e;
	}
	mnt->src.append(src);

	if (!dst_env.empty()) {
		const char* e = getenv(dst_env.c_str());
		if (e == NULL) {
			LOG_W("No such envar:'%s'", dst_env.c_str());
			return false;
		}
		mnt->dst = e;
	}
	mnt->dst.append(dst);

	mnt->fs_type = fstype;
	mnt->options = options;
	mnt->flags = flags;
	mnt->is_symlink = is_symlink;
	mnt->is_mandatory = is_mandatory;
	mnt->mounted = false;
	mnt->src_content = src_content;

	switch (is_dir) {
	case NS_DIR_YES:
		mnt->is_dir = true;
		break;
	case NS_DIR_NO:
		mnt->is_dir = false;
		break;
	case NS_DIR_MAYBE: {
		if (!src_content.empty()) {
			mnt->is_dir = false;
		} else if (mnt->src.empty()) {
			mnt->is_dir = true;
		} else if (mnt->flags & MS_BIND) {
			mnt->is_dir = mnt::isDir(mnt->src.c_str());
		} else {
			mnt->is_dir = true;
		}
	} break;
	default:
		LOG_E("Unknown is_dir value: %d", is_dir);
		return false;
	}

	return true;
}

bool addMountPtHead(nsjconf_t* nsjconf, const std::string& src, const std::string& dst,
    const std::string& fstype, const std::string& options, uintptr_t flags, isDir_t is_dir,
    bool is_mandatory, const std::string& src_env, const std::string& dst_env,
    const std::string& src_content, bool is_symlink) {
	mount_t mnt;
	if (!addMountPt(&mnt, src, dst, fstype, options, flags, is_dir, is_mandatory, src_env,
		dst_env, src_content, is_symlink)) {
		return false;
	}
	nsjconf->mountpts.insert(nsjconf->mountpts.begin(), mnt);
	return true;
}

bool addMountPtTail(nsjconf_t* nsjconf, const std::string& src, const std::string& dst,
    const std::string& fstype, const std::string& options, uintptr_t flags, isDir_t is_dir,
    bool is_mandatory, const std::string& src_env, const std::string& dst_env,
    const std::string& src_content, bool is_symlink) {
	mount_t mnt;
	if (!addMountPt(&mnt, src, dst, fstype, options, flags, is_dir, is_mandatory, src_env,
		dst_env, src_content, is_symlink)) {
		return false;
	}
	nsjconf->mountpts.push_back(mnt);
	return true;
}

const std::string describeMountPt(const mount_t& mpt) {
	std::string descr;

	descr.append(mpt.src.empty() ? "" : "'")
	    .append(mpt.src.empty() ? "" : mpt.src)
	    .append(mpt.src.empty() ? "" : "' -> ")
	    .append("'")
	    .append(mpt.dst)
	    .append("' flags:")
	    .append(flagsToStr(mpt.flags))
	    .append(" type:'")
	    .append(mpt.fs_type)
	    .append("' options:'")
	    .append(mpt.options)
	    .append("'");

	if (mpt.is_dir) {
		descr.append(" dir:true");
	} else {
		descr.append(" dir:false");
	}
	if (!mpt.is_mandatory) {
		descr.append(" mandatory:false");
	}
	if (!mpt.src_content.empty()) {
		descr.append(" src_content_len:").append(std::to_string(mpt.src_content.length()));
	}
	if (mpt.is_symlink) {
		descr.append(" symlink:true");
	}

	return descr;
}

}  // namespace mnt
