| // Copyright 2019 Google LLC. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Implementation file for the sandbox2::Monitor class. |
| |
| #include "sandboxed_api/sandbox2/monitor.h" |
| |
| // clang-format off |
| #include <linux/posix_types.h> // NOLINT: Needs to come before linux/ipc.h |
| #include <linux/ipc.h> |
| // clang-format on |
| #include <sched.h> |
| #include <sys/mman.h> |
| #include <sys/ptrace.h> |
| #include <sys/time.h> |
| #include <sys/wait.h> |
| #include <syscall.h> |
| #include <unistd.h> |
| |
| #include <algorithm> |
| #include <atomic> |
| #include <cerrno> |
| #include <csignal> |
| #include <cstdlib> |
| #include <cstring> |
| #include <ctime> |
| #include <fstream> |
| #include <memory> |
| #include <set> |
| #include <sstream> |
| #include <string> |
| |
| #include <glog/logging.h> |
| #include "sandboxed_api/util/flag.h" |
| #include "absl/memory/memory.h" |
| #include "absl/strings/str_cat.h" |
| #include "absl/strings/str_format.h" |
| #include "absl/time/time.h" |
| #include "sandboxed_api/sandbox2/client.h" |
| #include "sandboxed_api/sandbox2/comms.h" |
| #include "sandboxed_api/sandbox2/executor.h" |
| #include "sandboxed_api/sandbox2/limits.h" |
| #include "sandboxed_api/sandbox2/mounts.h" |
| #include "sandboxed_api/sandbox2/namespace.h" |
| #include "sandboxed_api/sandbox2/policy.h" |
| #include "sandboxed_api/sandbox2/regs.h" |
| #include "sandboxed_api/sandbox2/result.h" |
| #include "sandboxed_api/sandbox2/sanitizer.h" |
| #include "sandboxed_api/sandbox2/stack-trace.h" |
| #include "sandboxed_api/sandbox2/syscall.h" |
| #include "sandboxed_api/sandbox2/util.h" |
| #include "sandboxed_api/util/raw_logging.h" |
| |
| ABSL_FLAG(bool, sandbox2_report_on_sandboxee_signal, true, |
| "Report sandbox2 sandboxee deaths caused by signals"); |
| |
| ABSL_FLAG(bool, sandbox2_report_on_sandboxee_timeout, true, |
| "Report sandbox2 sandboxee timeouts"); |
| |
| ABSL_DECLARE_FLAG(bool, sandbox2_danger_danger_permit_all); |
| ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log); |
| |
| namespace sandbox2 { |
| |
| namespace { |
| |
| // We could use the ProcMapsIterator, however we want the full file content. |
| std::string ReadProcMaps(pid_t pid) { |
| std::ifstream input(absl::StrCat("/proc/", pid, "/maps"), |
| std::ios_base::in | std::ios_base::binary); |
| std::ostringstream contents; |
| contents << input.rdbuf(); |
| return contents.str(); |
| } |
| |
| } // namespace |
| |
| Monitor::Monitor(Executor* executor, Policy* policy, Notify* notify) |
| : executor_(executor), |
| notify_(notify), |
| policy_(policy), |
| comms_(executor_->ipc()->comms()), |
| ipc_(executor_->ipc()), |
| setup_counter_(new absl::BlockingCounter(1)), |
| done_(false), |
| wait_for_execve_(executor->enable_sandboxing_pre_execve_) { |
| std::string path = |
| absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log); |
| if (!path.empty()) { |
| log_file_ = std::fopen(path.c_str(), "a+"); |
| PCHECK(log_file_ != nullptr) << "Failed to open log file '" << path << "'"; |
| } |
| } |
| |
| Monitor::~Monitor() { |
| CleanUpTimer(); |
| if (log_file_) { |
| std::fclose(log_file_); |
| } |
| } |
| |
| namespace { |
| |
| void LogContainer(const std::vector<std::string>& container) { |
| for (size_t i = 0; i < container.size(); ++i) { |
| SAPI_RAW_LOG(INFO, "[%4d]=%s", i, container[i]); |
| } |
| } |
| |
| } // namespace |
| |
| void Monitor::Run() { |
| using DecrementCounter = decltype(setup_counter_); |
| std::unique_ptr<DecrementCounter, std::function<void(DecrementCounter*)>> |
| decrement_count{&setup_counter_, [](DecrementCounter* counter) { |
| (*counter)->DecrementCount(); |
| }}; |
| |
| struct MonitorCleanup { |
| ~MonitorCleanup() { |
| getrusage(RUSAGE_THREAD, capture->result_.GetRUsageMonitor()); |
| capture->notify_->EventFinished(capture->result_); |
| capture->ipc_->InternalCleanupFdMap(); |
| absl::MutexLock lock(&capture->done_mutex_); |
| capture->done_.store(true, std::memory_order_release); |
| } |
| Monitor* capture; |
| } monitor_cleanup{this}; |
| |
| if (!InitSetupTimer()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_TIMERS); |
| return; |
| } |
| |
| // It'd be costly to initialize the sigset_t for each sigtimedwait() |
| // invocation, so do it once per Monitor. |
| sigset_t sigtimedwait_sset; |
| if (!InitSetupSignals(&sigtimedwait_sset)) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_SIGNALS); |
| return; |
| } |
| |
| if (SAPI_VLOG_IS_ON(1) && policy_->GetNamespace() != nullptr) { |
| std::vector<std::string> outside_entries; |
| std::vector<std::string> inside_entries; |
| policy_->GetNamespace()->mounts().RecursivelyListMounts( |
| /*outside_entries=*/&outside_entries, |
| /*inside_entries=*/&inside_entries); |
| SAPI_RAW_VLOG(1, "Outside entries mapped to chroot:"); |
| LogContainer(outside_entries); |
| SAPI_RAW_VLOG(1, "Inside entries as they appear in chroot:"); |
| LogContainer(inside_entries); |
| } |
| |
| // Don't trace the child: it will allow to use 'strace -f' with the whole |
| // sandbox master/monitor, which ptrace_attach'es to the child. |
| int clone_flags = CLONE_UNTRACED; |
| |
| // Get PID of the sandboxee. |
| pid_t init_pid = 0; |
| pid_ = executor_->StartSubProcess(clone_flags, policy_->GetNamespace(), |
| policy_->GetCapabilities(), &init_pid); |
| |
| if (init_pid < 0) { |
| // TODO(hamacher): does this require additional handling here? |
| LOG(ERROR) << "Spawning init process failed"; |
| } else if (init_pid > 0) { |
| PCHECK(ptrace(PTRACE_SEIZE, init_pid, 0, PTRACE_O_EXITKILL) == 0); |
| } |
| |
| if (pid_ < 0) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_SUBPROCESS); |
| return; |
| } |
| |
| if (!notify_->EventStarted(pid_, comms_)) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY); |
| return; |
| } |
| if (!InitAcceptConnection()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_CONNECTION); |
| return; |
| } |
| if (!InitSendIPC()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_IPC); |
| return; |
| } |
| if (!InitSendCwd()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_CWD); |
| return; |
| } |
| if (!InitSendPolicy()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_POLICY); |
| return; |
| } |
| if (!WaitForSandboxReady()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_WAIT); |
| return; |
| } |
| if (!InitApplyLimits()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_LIMITS); |
| return; |
| } |
| // This call should be the last in the init sequence, because it can cause the |
| // sandboxee to enter ptrace-stopped state, in which it will not be able to |
| // send any messages over the Comms channel. |
| if (!InitPtraceAttach()) { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_PTRACE); |
| return; |
| } |
| |
| // Tell the parent thread (Sandbox2 object) that we're done with the initial |
| // set-up process of the sandboxee. |
| decrement_count.reset(); |
| |
| MainLoop(&sigtimedwait_sset); |
| |
| // Disarm the timer: it will be deleted in ~Monitor, but the Monitor object |
| // lifetime is controlled by owner of Sandbox2, and we don't want to leave any |
| // timers behind (esp. armed ones) in the meantime. |
| TimerArm(absl::ZeroDuration()); |
| } |
| |
| bool Monitor::IsActivelyMonitoring() { |
| // If we're still waiting for execve(), then we allow all syscalls. |
| return !wait_for_execve_; |
| } |
| |
| void Monitor::SetActivelyMonitoring() { wait_for_execve_ = false; } |
| |
| void Monitor::MainSignals(int signo, siginfo_t* si) { |
| VLOG(3) << "Signal '" << strsignal(signo) << "' (" << signo |
| << ") received from PID: " << si->si_pid; |
| |
| // SIGCHLD is received frequently due to ptrace() events being sent by child |
| // processes; return early to avoid costly syscalls. |
| if (signo == SIGCHLD) { |
| return; |
| } |
| |
| // We should only receive signals from the same process (thread group). Other |
| // signals are suspicious (esp. if coming from a sandboxed process) Using |
| // syscall(__NR_getpid) here because getpid() is cached in glibc, and it |
| // might return previous pid if bare syscall(__NR_fork) was used instead of |
| // fork(). |
| // |
| // The notable exception are signals caused by timer_settime which are sent |
| // by the kernel. |
| if (signo != Monitor::kTimerWallTimeSignal && |
| si->si_pid != util::Syscall(__NR_getpid)) { |
| LOG(ERROR) << "Monitor received signal '" << strsignal(signo) << "' (" |
| << signo << ") from PID " << si->si_pid |
| << " which is not in the current thread group"; |
| return; |
| } |
| |
| switch (signo) { |
| case Monitor::kExternalKillSignal: |
| VLOG(1) << "Will kill the main pid"; |
| ActionProcessKill(pid_, Result::EXTERNAL_KILL, 0); |
| break; |
| case Monitor::kTimerWallTimeSignal: |
| VLOG(1) << "Sandbox process hit timeout due to the walltime timer"; |
| ActionProcessKill(pid_, Result::TIMEOUT, 0); |
| break; |
| case Monitor::kTimerSetSignal: |
| VLOG(1) << "Will set the walltime timer to " << si->si_value.sival_int |
| << " seconds"; |
| TimerArm(absl::Seconds(si->si_value.sival_int)); |
| break; |
| case Monitor::kDumpStackSignal: |
| VLOG(1) << "Dump the main pid's stack"; |
| should_dump_stack_ = true; |
| PidInterrupt(pid_); |
| break; |
| default: |
| LOG(ERROR) << "Unknown signal received: " << signo; |
| break; |
| } |
| } |
| |
// Extracts bits 16-23 of a wait status `x`, shifted down to the low byte; the
// monitor logs this value as the ptrace event. Not defined in glibc.
// The argument is parenthesized so that expression arguments (e.g. `a | b`)
// are masked as a whole instead of being split by operator precedence.
#define __WPTRACEEVENT(x) (((x) & 0xff0000) >> 16)
| bool Monitor::MainWait() { |
| // All possible process status change event must be checked as SIGCHLD |
| // is reported once only for all events that arrived at the same time. |
| for (;;) { |
| int status; |
| // It should be a non-blocking operation (hence WNOHANG), so this function |
| // returns quickly if there are no events to be processed. |
| int ret = waitpid(-1, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG); |
| |
| // No traced processes have changed their status yet. |
| if (ret == 0) { |
| return false; |
| } |
| |
| if (ret == -1 && errno == ECHILD) { |
| LOG(ERROR) << "PANIC(). The main process has not exited yet, " |
| << "yet we haven't seen its exit event"; |
| // We'll simply exit which will kill all remaining processes (if |
| // there are any) because of the PTRACE_O_EXITKILL ptrace() flag. |
| return true; |
| } |
| if (ret == -1 && errno == EINTR) { |
| VLOG(3) << "waitpid() interruped with EINTR"; |
| continue; |
| } |
| if (ret == -1) { |
| PLOG(ERROR) << "waitpid() failed"; |
| continue; |
| } |
| |
| VLOG(3) << "waitpid() returned with PID: " << ret << ", status: " << status; |
| |
| if (WIFEXITED(status)) { |
| VLOG(1) << "PID: " << ret |
| << " finished with code: " << WEXITSTATUS(status); |
| // That's the main process, set the exit code, and exit. It will kill |
| // all remaining processes (if there are any) because of the |
| // PTRACE_O_EXITKILL ptrace() flag. |
| if (ret == pid_) { |
| if (IsActivelyMonitoring()) { |
| result_.SetExitStatusCode(Result::OK, WEXITSTATUS(status)); |
| } else { |
| result_.SetExitStatusCode(Result::SETUP_ERROR, |
| Result::FAILED_MONITOR); |
| } |
| return true; |
| } |
| } else if (WIFSIGNALED(status)) { |
| VLOG(1) << "PID: " << ret << " terminated with signal: " |
| << util::GetSignalName(WTERMSIG(status)); |
| if (ret == pid_) { |
| // That's the main process, depending on the result of the process take |
| // the register content and/or the stack trace. The death of this |
| // process will cause all remaining processes to be killed (if there are |
| // any), see the PTRACE_O_EXITKILL ptrace() flag. |
| |
| // When the process is killed from a signal from within the result |
| // status will be still unset, fix this. |
| // The other cases should either be already handled, or (in the case of |
| // Result::OK) should be impossible to reach. |
| if (result_.final_status() == Result::UNSET) { |
| result_.SetExitStatusCode(Result::SIGNALED, WTERMSIG(status)); |
| } else if (result_.final_status() == Result::OK) { |
| LOG(ERROR) << "Unexpected codepath taken"; |
| } |
| return true; |
| } |
| } else if (WIFSTOPPED(status)) { |
| VLOG(2) << "PID: " << ret |
| << " received signal: " << util::GetSignalName(WSTOPSIG(status)) |
| << " with event: " << __WPTRACEEVENT(status); |
| StateProcessStopped(ret, status); |
| } else if (WIFCONTINUED(status)) { |
| VLOG(2) << "PID: " << ret << " is being continued"; |
| } |
| } |
| } |
| |
| void Monitor::MainLoop(sigset_t* sset) { |
| for (;;) { |
| // Use a time-out, so we can check for missed waitpid() events. It should |
| // not happen during regular operations, so it's a defense-in-depth |
| // mechanism against SIGCHLD signals being lost by the kernel (since these |
| // are not-RT signals - i.e. not queued). |
| static const timespec ts = {kWakeUpPeriodSec, kWakeUpPeriodNSec}; |
| |
| // Wait for any kind of events, e.g. signals sent from the parent process, |
| // or SIGCHLD sent by kernel indicating that state of one of the traced |
| // processes has changed. |
| siginfo_t si; |
| int ret = sigtimedwait(sset, &si, &ts); |
| if (ret > 0) { |
| // Process signals which arrived. |
| MainSignals(ret, &si); |
| } |
| |
| // If CheckWait reported no more traced processes, or that |
| // the main pid had exited, we should break this loop (i.e. our job is |
| // done here). |
| // |
| // MainWait() should use a not-blocking (e.g. WNOHANG with waitpid()) |
| // syntax, so it returns quickly if there are not status changes in |
| // traced processes. |
| if (MainWait()) { |
| return; |
| } |
| } |
| } |
| |
| bool Monitor::InitSetupTimer() { |
| walltime_timer_ = absl::make_unique<timer_t>(); |
| |
| // Set the wall-time timer. |
| sigevent sevp; |
| sevp.sigev_value.sival_ptr = walltime_timer_.get(); |
| sevp.sigev_signo = kTimerWallTimeSignal; |
| sevp.sigev_notify = SIGEV_THREAD_ID | SIGEV_SIGNAL; |
| sevp._sigev_un._tid = static_cast<pid_t>(util::Syscall(__NR_gettid)); |
| // GLibc's implementation seem to mis-behave during timer_delete, as it's |
| // trying to find out whether POSIX TIMERs are available. So, we stick to |
| // syscalls for this class of calls. |
| if (util::Syscall(__NR_timer_create, CLOCK_REALTIME, |
| reinterpret_cast<uintptr_t>(&sevp), |
| reinterpret_cast<uintptr_t>(walltime_timer_.get())) == -1) { |
| walltime_timer_ = nullptr; |
| PLOG(ERROR) << "timer_create(CLOCK_REALTIME, walltime_timer_)"; |
| return false; |
| } |
| return TimerArm(executor_->limits()->wall_time_limit()); |
| } |
| |
| // Can be used from a signal handler. Avoid non-reentrant functions. |
| bool Monitor::TimerArm(absl::Duration duration) { |
| VLOG(2) << (duration == absl::ZeroDuration() ? "Disarming" : "Arming") |
| << " the walltime timer with " << absl::FormatDuration(duration); |
| |
| itimerspec ts; |
| absl::Duration rem; |
| ts.it_value.tv_sec = absl::IDivDuration(duration, absl::Seconds(1), &rem); |
| ts.it_value.tv_nsec = absl::ToInt64Nanoseconds(rem); |
| ts.it_interval.tv_sec = |
| duration != absl::ZeroDuration() ? 1L : 0L; // Re-fire every 1 sec. |
| ts.it_interval.tv_nsec = 0UL; |
| itimerspec* null_ts = nullptr; |
| if (util::Syscall(__NR_timer_settime, |
| reinterpret_cast<uintptr_t>(*walltime_timer_), 0, |
| reinterpret_cast<uintptr_t>(&ts), |
| reinterpret_cast<uintptr_t>(null_ts)) == -1) { |
| PLOG(ERROR) << "timer_settime(): time: " << absl::FormatDuration(duration); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| void Monitor::CleanUpTimer() { |
| if (walltime_timer_) { |
| if (util::Syscall(__NR_timer_delete, |
| reinterpret_cast<uintptr_t>(*walltime_timer_)) == -1) { |
| PLOG(ERROR) << "timer_delete()"; |
| } |
| } |
| } |
| |
| bool Monitor::InitSetupSig(int signo, sigset_t* sset) { |
| // sigtimedwait will react (wake-up) to arrival of this signal. |
| sigaddset(sset, signo); |
| |
| // Block this specific signal, so only sigtimedwait reacts to it. |
| sigset_t block_set; |
| if (sigemptyset(&block_set) == -1) { |
| PLOG(ERROR) << "sigemptyset()"; |
| return false; |
| } |
| if (sigaddset(&block_set, signo) == -1) { |
| PLOG(ERROR) << "sigaddset(" << signo << ")"; |
| return false; |
| } |
| if (pthread_sigmask(SIG_BLOCK, &block_set, nullptr) == -1) { |
| PLOG(ERROR) << "pthread_sigmask(SIG_BLOCK, " << signo << ")"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monitor::InitSetupSignals(sigset_t* sset) { |
| sigemptyset(sset); |
| |
| return Monitor::InitSetupSig(kExternalKillSignal, sset) && |
| Monitor::InitSetupSig(kTimerWallTimeSignal, sset) && |
| Monitor::InitSetupSig(kTimerSetSignal, sset) && |
| Monitor::InitSetupSig(kDumpStackSignal, sset) && |
| // SIGCHLD means that a new children process status change event |
| // has been delivered (e.g. due ptrace notification). |
| Monitor::InitSetupSig(SIGCHLD, sset); |
| } |
| |
| bool Monitor::InitSendPolicy() { |
| if (!policy_->SendPolicy(comms_)) { |
| LOG(ERROR) << "Couldn't send policy"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monitor::InitSendCwd() { |
| if (!comms_->SendString(executor_->cwd_)) { |
| PLOG(ERROR) << "Couldn't send cwd"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monitor::InitApplyLimit(pid_t pid, __rlimit_resource resource, |
| const rlimit64& rlim) const { |
| std::string rlim_name = absl::StrCat("UNKNOWN: ", resource); |
| switch (resource) { |
| case RLIMIT_AS: |
| rlim_name = "RLIMIT_AS"; |
| break; |
| case RLIMIT_FSIZE: |
| rlim_name = "RLIMIT_FSIZE"; |
| break; |
| case RLIMIT_NOFILE: |
| rlim_name = "RLIMIT_NOFILE"; |
| break; |
| case RLIMIT_CPU: |
| rlim_name = "RLIMIT_CPU"; |
| break; |
| case RLIMIT_CORE: |
| rlim_name = "RLIMIT_CORE"; |
| break; |
| default: |
| break; |
| } |
| |
| rlimit64 curr_limit; |
| if (prlimit64(pid, resource, nullptr, &curr_limit) == -1) { |
| PLOG(ERROR) << "prlimit64(" << pid << ", " << rlim_name << ")"; |
| } else { |
| // In such case, don't update the limits, as it will fail. Just stick to the |
| // current ones (which are already lower than intended). |
| if (rlim.rlim_cur > curr_limit.rlim_max) { |
| LOG(ERROR) << rlim_name << ": new.current > current.max (" |
| << rlim.rlim_cur << " > " << curr_limit.rlim_max |
| << "), skipping"; |
| return true; |
| } |
| } |
| if (prlimit64(pid, resource, &rlim, nullptr) == -1) { |
| PLOG(ERROR) << "prlimit64(RLIMIT_AS, " << rlim.rlim_cur << ")"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monitor::InitApplyLimits() { |
| Limits* limits = executor_->limits(); |
| return InitApplyLimit(pid_, RLIMIT_AS, limits->rlimit_as()) && |
| InitApplyLimit(pid_, RLIMIT_CPU, limits->rlimit_cpu()) && |
| InitApplyLimit(pid_, RLIMIT_FSIZE, limits->rlimit_fsize()) && |
| InitApplyLimit(pid_, RLIMIT_NOFILE, limits->rlimit_nofile()) && |
| InitApplyLimit(pid_, RLIMIT_CORE, limits->rlimit_core()); |
| } |
| |
| bool Monitor::InitSendIPC() { return ipc_->SendFdsOverComms(); } |
| |
| bool Monitor::WaitForSandboxReady() { |
| uint32_t tmp; |
| if (!comms_->RecvUint32(&tmp)) { |
| LOG(ERROR) << "Couldn't receive 'Client::kClient2SandboxReady' message"; |
| return false; |
| } |
| if (tmp != Client::kClient2SandboxReady) { |
| LOG(ERROR) << "Received " << tmp << " != Client::kClient2SandboxReady (" |
| << Client::kClient2SandboxReady << ")"; |
| return false; |
| } |
| return true; |
| } |
| |
| bool Monitor::InitPtraceAttach() { |
| sanitizer::WaitForTsan(); |
| |
| // Get a list of tasks. |
| std::set<int> tasks; |
| if (!sanitizer::GetListOfTasks(pid_, &tasks)) { |
| LOG(ERROR) << "Could not get list of tasks"; |
| return false; |
| } |
| |
| // With TSYNC, we can allow threads: seccomp applies to all threads. |
| |
| if (tasks.size() > 1) { |
| LOG(WARNING) << "PID " << pid_ << " has " << tasks.size() << " threads," |
| << " at the time of call to SandboxMeHere. If you are seeing" |
| << " more sandbox violations than expected, this might be" |
| << " the reason why" |
| << "."; |
| } |
| |
| intptr_t ptrace_opts = |
| PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK | |
| PTRACE_O_TRACEVFORKDONE | PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXEC | |
| PTRACE_O_TRACEEXIT | PTRACE_O_TRACESECCOMP | PTRACE_O_EXITKILL; |
| |
| bool main_pid_found = false; |
| for (auto task : tasks) { |
| if (task == pid_) { |
| main_pid_found = true; |
| } |
| |
| // In some situations we allow ptrace to try again when it fails. |
| bool ptrace_succeeded = false; |
| int retries = 0; |
| auto deadline = absl::Now() + absl::Seconds(2); |
| while (absl::Now() < deadline) { |
| int ret = ptrace(PTRACE_SEIZE, task, 0, ptrace_opts); |
| if (ret == 0) { |
| ptrace_succeeded = true; |
| break; |
| } |
| if (ret != 0 && errno == ESRCH) { |
| // A task may have exited since we captured the task list, we will allow |
| // things to continue after we log a warning. |
| PLOG(WARNING) << "ptrace(PTRACE_SEIZE, " << task << ", " |
| << absl::StrCat("0x", absl::Hex(ptrace_opts)) |
| << ") skipping exited task. Continuing with other tasks."; |
| ptrace_succeeded = true; |
| break; |
| } |
| if (ret != 0 && errno == EPERM) { |
| // Sometimes when a task is exiting we can get an EPERM from ptrace. |
| // Let's try again up until the timeout in this situation. |
| PLOG(WARNING) << "ptrace(PTRACE_SEIZE, " << task << ", " |
| << absl::StrCat("0x", absl::Hex(ptrace_opts)) |
| << "), trying again..."; |
| |
| // Exponential Backoff. |
| constexpr auto kInitialRetry = absl::Milliseconds(1); |
| constexpr auto kMaxRetry = absl::Milliseconds(20); |
| const auto retry_interval = |
| kInitialRetry * (1 << std::min(10, retries++)); |
| absl::SleepFor(std::min(retry_interval, kMaxRetry)); |
| continue; |
| } |
| |
| // Any other errno will be considered a failure. |
| PLOG(ERROR) << "ptrace(PTRACE_SEIZE, " << task << ", " |
| << absl::StrCat("0x", absl::Hex(ptrace_opts)) << ") failed."; |
| return false; |
| } |
| |
| if (!ptrace_succeeded) { |
| LOG(ERROR) << "ptrace(PTRACE_SEIZE, " << task << ", " |
| << absl::StrCat("0x", absl::Hex(ptrace_opts)) |
| << ") failed after retrying until the timeout."; |
| return false; |
| } |
| } |
| |
| if (!main_pid_found) { |
| LOG(ERROR) << "The pid " << pid_ << " was not found in its own tasklist."; |
| return false; |
| } |
| |
| // Get a list of tasks after attaching. |
| std::set<int> tasks_after; |
| if (!sanitizer::GetListOfTasks(pid_, &tasks_after)) { |
| LOG(ERROR) << "Could not get list of tasks"; |
| return false; |
| } |
| |
| // Check that no new threads have shown up. Note: tasks_after can have fewer |
| // tasks than before but no new tasks can be added as they would be missing |
| // from the initial task list. |
| if (!std::includes(tasks.begin(), tasks.end(), tasks_after.begin(), |
| tasks_after.end())) { |
| LOG(ERROR) << "The pid " << pid_ |
| << " spawned new threads while we were trying to attach to it."; |
| return false; |
| } |
| |
| // No glibc wrapper for gettid - see 'man gettid'. |
| VLOG(1) << "Monitor (PID: " << getpid() |
| << ", TID: " << util::Syscall(__NR_gettid) |
| << ") attached to PID: " << pid_; |
| |
| // Technically, the sandboxee can be in a ptrace-stopped state right now, |
| // because some signal could have arrived in the meantime. Yet, this |
| // Comms::SendUint32 call shouldn't lock our process, because the underlying |
| // socketpair() channel is buffered, hence it will accept the uint32_t message |
| // no matter what is the current state of the sandboxee, and it will allow for |
| // our process to continue and unlock the sandboxee with the proper ptrace |
| // event handling. |
| if (!comms_->SendUint32(Client::kSandbox2ClientDone)) { |
| LOG(ERROR) << "Couldn't send Client::kSandbox2ClientDone message"; |
| return false; |
| } |
| return true; |
| } |
| |
| bool Monitor::InitAcceptConnection() { |
| // It's a pre-connected Comms channel, no need to accept new connection or |
| // verify the peer (sandboxee). |
| if (comms_->IsConnected()) { |
| return true; |
| } |
| |
| if (!comms_->Accept()) { |
| return false; |
| } |
| |
| // Check whether the PID which has connected to us, is the PID we're |
| // expecting. |
| pid_t cred_pid; |
| uid_t cred_uid; |
| gid_t cred_gid; |
| if (!comms_->RecvCreds(&cred_pid, &cred_uid, &cred_gid)) { |
| LOG(ERROR) << "Couldn't receive credentials"; |
| return false; |
| } |
| |
| if (pid_ != cred_pid) { |
| LOG(ERROR) << "Initial PID (" << pid_ << ") differs from the PID received " |
| << "from the peer (" << cred_pid << ")"; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| void Monitor::ActionProcessContinue(pid_t pid, int signo) { |
| if (ptrace(PTRACE_CONT, pid, 0, signo) == -1) { |
| PLOG(ERROR) << "ptrace(PTRACE_CONT, pid=" << pid << ", sig=" << signo |
| << ")"; |
| } |
| } |
| |
| void Monitor::ActionProcessStop(pid_t pid, int signo) { |
| if (ptrace(PTRACE_LISTEN, pid, 0, signo) == -1) { |
| PLOG(ERROR) << "ptrace(PTRACE_LISTEN, pid=" << pid << ", sig=" << signo |
| << ")"; |
| } |
| } |
| |
| void Monitor::ActionProcessSyscall(Regs* regs, const Syscall& syscall) { |
| // If the sandboxing is not enabled yet, allow the first __NR_execveat. |
| if (syscall.nr() == __NR_execveat && !IsActivelyMonitoring()) { |
| VLOG(1) << "[PERMITTED/BEFORE_EXECVEAT]: " |
| << "SYSCALL ::: PID: " << regs->pid() << ", PROG: '" |
| << util::GetProgName(regs->pid()) |
| << "' : " << syscall.GetDescription(); |
| ActionProcessContinue(regs->pid(), 0); |
| return; |
| } |
| |
| // Notify can decide whether we want to allow this syscall. It could be useful |
| // for sandbox setups in which some syscalls might still need some logging, |
| // but nonetheless be allowed ('permissible syscalls' in sandbox v1). |
| if (notify_->EventSyscallTrap(syscall)) { |
| LOG(WARNING) << "[PERMITTED]: SYSCALL ::: PID: " << regs->pid() |
| << ", PROG: '" << util::GetProgName(regs->pid()) |
| << "' : " << syscall.GetDescription(); |
| |
| ActionProcessContinue(regs->pid(), 0); |
| return; |
| } |
| |
| // TODO(wiktorg): Further clean that up, probably while doing monitor cleanup |
| // log_file_ not null iff FLAGS_sandbox2_danger_danger_permit_all_and_log is |
| // set. |
| if (log_file_) { |
| std::string syscall_description = syscall.GetDescription(); |
| PCHECK(absl::FPrintF(log_file_, "PID: %d %s\n", regs->pid(), |
| syscall_description) >= 0); |
| ActionProcessContinue(regs->pid(), 0); |
| return; |
| } |
| |
| if (absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all)) { |
| ActionProcessContinue(regs->pid(), 0); |
| return; |
| } |
| |
| ActionProcessSyscallViolation(regs, syscall, kSyscallViolation); |
| } |
| |
| void Monitor::ActionProcessSyscallViolation(Regs* regs, const Syscall& syscall, |
| ViolationType violation_type) { |
| pid_t pid = regs->pid(); |
| |
| LogAccessViolation(syscall); |
| notify_->EventSyscallViolation(syscall, violation_type); |
| result_.SetExitStatusCode(Result::VIOLATION, syscall.nr()); |
| result_.SetSyscall(absl::make_unique<Syscall>(syscall)); |
| // Only get the stacktrace if we are not in the libunwind sandbox (avoid |
| // recursion). |
| if (executor_->libunwind_sbox_for_pid_ == 0 && policy_->GetNamespace()) { |
| if (policy_->collect_stacktrace_on_violation_) { |
| result_.SetStackTrace( |
| GetStackTrace(regs, policy_->GetNamespace()->mounts())); |
| LOG(ERROR) << "Stack trace: " << result_.GetStackTrace(); |
| } else { |
| LOG(ERROR) << "Stack traces have been disabled"; |
| } |
| } |
| // We make the result object create its own Reg instance. our regs is a |
| // pointer to a stack variable which might not live long enough. |
| result_.LoadRegs(pid); |
| result_.SetProgName(util::GetProgName(pid)); |
| result_.SetProcMaps(ReadProcMaps(pid_)); |
| |
| // Rewrite the syscall argument to something invalid (-1). The process will |
| // be killed by ActionProcessKill(), so this is just a precaution. |
| auto status = regs->SkipSyscallReturnValue(-ENOSYS); |
| if (!status.ok()) { |
| LOG(ERROR) << status; |
| } |
| |
| ActionProcessKill(pid, Result::VIOLATION, syscall.nr()); |
| } |
| |
| void Monitor::LogAccessViolation(const Syscall& syscall) { |
| // Do not unwind libunwind. |
| if (executor_->libunwind_sbox_for_pid_ != 0) { |
| LOG(ERROR) << "Sandbox violation during execution of libunwind: " |
| << syscall.GetDescription(); |
| return; |
| } |
| |
| uintptr_t syscall_nr = syscall.nr(); |
| uintptr_t arg0 = syscall.args()[0]; |
| |
| // So, this is an invalid syscall. Will be killed by seccomp-bpf policies as |
| // well, but we should be on a safe side here as well. |
| LOG(ERROR) << "SANDBOX VIOLATION : PID: " << syscall.pid() << ", PROG: '" |
| << util::GetProgName(syscall.pid()) |
| << "' : " << syscall.GetDescription(); |
| |
| // This follows policy in Policy::GetDefaultPolicy - keep it in sync. |
| if (syscall.arch() != Syscall::GetHostArch()) { |
| LOG(ERROR) |
| << "This is a violation because the syscall was issued because the" |
| << " sandboxee and executor architectures are different."; |
| return; |
| } |
| |
| if (syscall_nr == __NR_ptrace) { |
| LOG(ERROR) |
| << "This is a violation because the ptrace syscall would be unsafe in" |
| << " sandbox2, so it has been blocked."; |
| return; |
| } |
| if (syscall_nr == __NR_bpf) { |
| LOG(ERROR) |
| << "This is a violation because the bpf syscall would be risky in" |
| << " a sandbox, so it has been blocked."; |
| return; |
| } |
| |
| if (syscall_nr == __NR_clone && ((arg0 & CLONE_UNTRACED) != 0)) { |
| LOG(ERROR) << "This is a violation because calling clone with CLONE_UNTRACE" |
| << " would be unsafe in sandbox2, so it has been blocked."; |
| return; |
| } |
| } |
| |
| void Monitor::ActionProcessKill(pid_t pid, Result::StatusEnum status, |
| uintptr_t code) { |
| // Avoid overwriting result if we set it for instance after a violation. |
| if (result_.final_status() == Result::UNSET) { |
| result_.SetExitStatusCode(status, code); |
| } |
| |
| VLOG(1) << "Sending SIGKILL to the PID: " << pid_; |
| if (kill(pid_, SIGKILL) != 0) { |
| LOG(FATAL) << "Could not send SIGKILL to PID " << pid_; |
| } |
| } |
| |
| void Monitor::EventPtraceSeccomp(pid_t pid, int event_msg) { |
| VLOG(1) << "PID: " << pid << " violation uncovered via the SECCOMP_EVENT"; |
| // If the seccomp-policy is using RET_TRACE, we request that it returns the |
| // syscall architecture identifier in the SECCOMP_RET_DATA. |
| const auto syscall_arch = static_cast<Syscall::CpuArch>(event_msg); |
| Regs regs(pid); |
| auto status = regs.Fetch(); |
| if (!status.ok()) { |
| LOG(ERROR) << status; |
| ActionProcessKill(pid, Result::INTERNAL_ERROR, Result::FAILED_FETCH); |
| return; |
| } |
| |
| Syscall syscall = regs.ToSyscall(syscall_arch); |
| // If the architecture of the syscall used is different that the current host |
| // architecture, report a violation. |
| if (syscall_arch != Syscall::GetHostArch()) { |
| ActionProcessSyscallViolation(®s, syscall, kArchitectureSwitchViolation); |
| return; |
| } |
| |
| ActionProcessSyscall(®s, syscall); |
| } |
| |
| void Monitor::EventPtraceExec(pid_t pid, int event_msg) { |
| if (!IsActivelyMonitoring()) { |
| VLOG(1) << "PTRACE_EVENT_EXEC seen from PID: " << event_msg |
| << ". SANDBOX ENABLED!"; |
| SetActivelyMonitoring(); |
| } |
| ActionProcessContinue(pid, 0); |
| } |
| |
| // Handles a PTRACE_EVENT_EXIT stop for `pid`. |
| // |
| // `event_msg` is inspected with the wait-status macros (WIFEXITED, |
| // WIFSIGNALED, WTERMSIG): |
| //  - a regular exit is simply continued; |
| //  - any status other than "terminated by SIGSYS" is continued as well. If |
| //    `pid` equals `pid_`, registers, program name and process maps are |
| //    recorded in `result_` first, plus a stack trace when the policy enables |
| //    it for the current final status; |
| //  - termination by SIGSYS is reported through |
| //    ActionProcessSyscallViolation() with kSyscallViolation. |
| void Monitor::EventPtraceExit(pid_t pid, int event_msg) { |
|   // A regular exit, let it continue. |
|   if (WIFEXITED(event_msg)) { |
|     ActionProcessContinue(pid, 0); |
|     return; |
|   } |
| |
|   // Everything except the SECCOMP violation can continue. |
|   if (!WIFSIGNALED(event_msg) || WTERMSIG(event_msg) != SIGSYS) { |
|     // Process is dying because it received a signal. |
|     // This can occur in three cases: |
|     // 1) Process was killed from the sandbox, in this case the result status |
|     //    was already set to Result::EXTERNAL_KILL. We do not get the stack |
|     //    trace in this case. |
|     // 2) Process was killed because it hit a timeout. The result status is |
|     //    also already set, however we are interested in the stack trace. |
|     // 3) Regular signal. We need to obtain everything. The status will be set |
|     //    upon the process exit handler. |
|     if (pid == pid_) { |
|       result_.LoadRegs(pid_); |
|       result_.SetProgName(util::GetProgName(pid_)); |
|       result_.SetProcMaps(ReadProcMaps(pid_)); |
|       // A stack trace is only attempted when the policy has a namespace and |
|       // executor_->libunwind_sbox_for_pid_ is 0. |
|       bool stacktrace_collection_possible = |
|           policy_->GetNamespace() && executor_->libunwind_sbox_for_pid_ == 0; |
|       // Must only be invoked when stacktrace_collection_possible is true, as |
|       // it dereferences the namespace pointer. |
|       auto collect_stacktrace = [this]() { |
|         result_.SetStackTrace(GetStackTrace(result_.GetRegs(), |
|                                             policy_->GetNamespace()->mounts())); |
|       }; |
|       switch (result_.final_status()) { |
|         case Result::EXTERNAL_KILL: |
|           if (stacktrace_collection_possible && |
|               policy_->collect_stacktrace_on_kill_) { |
|             collect_stacktrace(); |
|           } |
|           break; |
|         case Result::TIMEOUT: |
|           if (stacktrace_collection_possible && |
|               policy_->collect_stacktrace_on_timeout_) { |
|             collect_stacktrace(); |
|           } |
|           break; |
|         case Result::VIOLATION: |
|           break; |
|         case Result::UNSET: |
|           // Regular signal. |
|           if (stacktrace_collection_possible && |
|               policy_->collect_stacktrace_on_signal_) { |
|             collect_stacktrace(); |
|           } |
|           break; |
|         default: |
|           LOG(ERROR) << "Unexpected codepath taken"; |
|           break; |
|       } |
|     } |
| |
|     ActionProcessContinue(pid, 0); |
|     return; |
|   } |
| |
|   VLOG(1) << "PID: " << pid << " violation uncovered via the EXIT_EVENT"; |
| |
|   // We do not generate the stack trace in the SECCOMP case as it will be |
|   // generated during ActionProcessSyscallViolation anyway. |
|   Regs regs(pid); |
|   auto status = regs.Fetch(); |
|   if (!status.ok()) { |
|     LOG(ERROR) << status; |
|     ActionProcessKill(pid, Result::INTERNAL_ERROR, Result::FAILED_FETCH); |
|     return; |
|   } |
| |
|   auto syscall = regs.ToSyscall(Syscall::GetHostArch()); |
| |
|   ActionProcessSyscallViolation(&regs, syscall, kSyscallViolation); |
| } |
| |
| // Handles a PTRACE_EVENT_STOP for `pid`. |
| // |
| // Only the stop signals SIGSTOP, SIGTSTP, SIGTTIN and SIGTTOU are forwarded |
| // to ActionProcessStop(); for any other `stopsig` the process is handed to |
| // ActionProcessContinue(). |
| void Monitor::EventPtraceStop(pid_t pid, int stopsig) { |
|   switch (stopsig) { |
|     case SIGSTOP: |
|     case SIGTSTP: |
|     case SIGTTIN: |
|     case SIGTTOU: |
|       // It's our PID stop signal. Stop it. |
|       VLOG(2) << "PID: " << pid << " stopped due to " |
|               << util::GetSignalName(stopsig); |
|       ActionProcessStop(pid, 0); |
|       return; |
|     default: |
|       // It's not a real stop signal. For example PTRACE_O_TRACECLONE and |
|       // similar flags to ptrace(PTRACE_SEIZE) might generate this event with |
|       // SIGTRAP. |
|       ActionProcessContinue(pid, 0); |
|       return; |
|   } |
| } |
| |
| // Dispatches a stopped-process notification for `pid`. |
| // |
| // `status` is the wait status of the stop. When it carries no ptrace event, |
| // the stop is treated as a plain signal delivery: the notifier is informed |
| // and the process is continued with that signal. Otherwise the event message |
| // is read with PTRACE_GETEVENTMSG and the matching Event*() handler is |
| // invoked. |
| void Monitor::StateProcessStopped(pid_t pid, int status) { |
|   int stopsig = WSTOPSIG(status); |
|   if (__WPTRACEEVENT(status) == 0) { |
|     // Must be a regular signal delivery. |
|     VLOG(2) << "PID: " << pid |
|             << " received signal: " << util::GetSignalName(stopsig); |
|     notify_->EventSignal(pid, stopsig); |
|     ActionProcessContinue(pid, stopsig); |
|     return; |
|   } |
| |
|   unsigned long event_msg;  // NOLINT |
|   if (ptrace(PTRACE_GETEVENTMSG, pid, 0, &event_msg) == -1) { |
|     if (errno == ESRCH) { |
|       // This happens from time to time, the kernel does not guarantee us that |
|       // we get the event in time. |
|       PLOG(INFO) << "ptrace(PTRACE_GETEVENTMSG, " << pid << ")"; |
|       return; |
|     } |
|     PLOG(ERROR) << "ptrace(PTRACE_GETEVENTMSG, " << pid << ")"; |
|     ActionProcessKill(pid, Result::INTERNAL_ERROR, Result::FAILED_GETEVENT); |
|     return; |
|   } |
| |
|   // One-shot stack dump request: the flag is cleared after the attempt, |
|   // whether or not the registers could be fetched. |
|   if (pid == pid_ && should_dump_stack_ && |
|       executor_->libunwind_sbox_for_pid_ == 0 && policy_->GetNamespace()) { |
|     Regs regs(pid); |
|     auto status = regs.Fetch(); |
|     if (status.ok()) { |
|       VLOG(0) << "SANDBOX STACK : PID: " << pid << ", [" |
|               << GetStackTrace(&regs, policy_->GetNamespace()->mounts()) << "]"; |
|     } else { |
|       LOG(WARNING) << "FAILED TO GET SANDBOX STACK : " << status; |
|     } |
|     should_dump_stack_ = false; |
|   } |
| |
| // Fallback definition for headers that do not provide PTRACE_EVENT_STOP. |
| #if !defined(PTRACE_EVENT_STOP) |
| #define PTRACE_EVENT_STOP 128 |
| #endif |
| |
|   switch (__WPTRACEEVENT(status)) { |
|     case PTRACE_EVENT_FORK: |
|       /* fall through */ |
|     case PTRACE_EVENT_VFORK: |
|       /* fall through */ |
|     case PTRACE_EVENT_CLONE: |
|       /* fall through */ |
|     case PTRACE_EVENT_VFORK_DONE: |
|       ActionProcessContinue(pid, 0); |
|       break; |
|     case PTRACE_EVENT_EXEC: |
|       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_EXEC, PID: " << event_msg; |
|       EventPtraceExec(pid, event_msg); |
|       break; |
|     case PTRACE_EVENT_EXIT: |
|       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_EXIT: " << event_msg; |
|       EventPtraceExit(pid, event_msg); |
|       break; |
|     case PTRACE_EVENT_STOP: |
|       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_STOP: " << event_msg; |
|       EventPtraceStop(pid, stopsig); |
|       break; |
|     case PTRACE_EVENT_SECCOMP: |
|       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_SECCOMP: " << event_msg; |
|       EventPtraceSeccomp(pid, event_msg); |
|       break; |
|     default: |
|       LOG(ERROR) << "Unknown ptrace event: " << __WPTRACEEVENT(status) |
|                  << " with data: " << event_msg; |
|       break; |
|   } |
| } |
| |
| // Issues PTRACE_INTERRUPT for `pid`. A failure is only logged as a warning |
| // (including the errno description); it is not reported to the caller. |
| void Monitor::PidInterrupt(pid_t pid) { |
|   const bool interrupted = ptrace(PTRACE_INTERRUPT, pid, 0, 0) != -1; |
|   if (interrupted) { |
|     return; |
|   } |
|   PLOG(WARNING) << "ptrace(PTRACE_INTERRUPT, pid=" << pid << ")"; |
| } |
| |
| } // namespace sandbox2 |