sandboxed_api/sandbox2/monitor.cc - platform/external/sandboxed-api - Git at Google

 // Copyright 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Implementation file for the sandbox2::Monitor class.

 #include "sandboxed_api/sandbox2/monitor.h"

 // clang-format off
 #include <linux/posix_types.h>  // NOLINT: Needs to come before linux/ipc.h
 #include <linux/ipc.h>
 // clang-format on
 #include <sched.h>
 #include <sys/mman.h>
 #include <sys/ptrace.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <syscall.h>
 #include <unistd.h>

 #include <algorithm>
 #include <atomic>
 #include <cerrno>
 #include <csignal>
 #include <cstdlib>
 #include <cstring>
 #include <ctime>
 #include <fstream>
 #include <memory>
 #include <set>
 #include <sstream>
 #include <string>

 #include <glog/logging.h>
 #include "sandboxed_api/util/flag.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/time/time.h"
 #include "sandboxed_api/sandbox2/client.h"
 #include "sandboxed_api/sandbox2/comms.h"
 #include "sandboxed_api/sandbox2/executor.h"
 #include "sandboxed_api/sandbox2/limits.h"
 #include "sandboxed_api/sandbox2/mounts.h"
 #include "sandboxed_api/sandbox2/namespace.h"
 #include "sandboxed_api/sandbox2/network_proxy/server.h"
 #include "sandboxed_api/sandbox2/policy.h"
 #include "sandboxed_api/sandbox2/regs.h"
 #include "sandboxed_api/sandbox2/result.h"
 #include "sandboxed_api/sandbox2/sanitizer.h"
 #include "sandboxed_api/sandbox2/stack_trace.h"
 #include "sandboxed_api/sandbox2/syscall.h"
 #include "sandboxed_api/sandbox2/util.h"
 #include "sandboxed_api/util/raw_logging.h"

 // Makes plain `string` name std::string; the ABSL_DECLARE_FLAG(string, ...)
 // line below spells its flag type that way.
 using std::string;

 // Boolean flag defined by this file, default true. It is not read anywhere in
 // this file.
 ABSL_FLAG(bool, sandbox2_report_on_sandboxee_signal, true,
           "Report sandbox2 sandboxee deaths caused by signals");

 // Boolean flag defined by this file, default true. It is not read anywhere in
 // this file.
 ABSL_FLAG(bool, sandbox2_report_on_sandboxee_timeout, true,
           "Report sandbox2 sandboxee timeouts");

 // Flags that are only declared here; this file reads them via absl::GetFlag()
 // but does not define them.
 // NOTE(review): presumably defined in another translation unit - confirm.
 ABSL_DECLARE_FLAG(bool, sandbox2_danger_danger_permit_all);
 ABSL_DECLARE_FLAG(bool, sandbox_libunwind_crash_handler);
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);

 namespace sandbox2 {

 namespace {

 // Returns the complete text of /proc/<pid>/maps. The whole file content is
 // wanted here, which is why the ProcMapsIterator is not used. If the file
 // cannot be read, the returned string is empty.
 std::string ReadProcMaps(pid_t pid) {
   const std::string maps_path = absl::StrCat("/proc/", pid, "/maps");
   std::ifstream maps_file(maps_path,
                           std::ios_base::in | std::ios_base::binary);
   std::ostringstream buffer;
   buffer << maps_file.rdbuf();
   return buffer.str();
 }

 // Issues PTRACE_INTERRUPT on `pid`. A failure is only logged (with errno) at
 // WARNING level; nothing is reported back to the caller.
 void InterruptProcess(pid_t pid) {
   const bool interrupted = ptrace(PTRACE_INTERRUPT, pid, 0, 0) != -1;
   if (interrupted) {
     return;
   }
   PLOG(WARNING) << "ptrace(PTRACE_INTERRUPT, pid=" << pid << ")";
 }

 // Resumes `pid` with PTRACE_CONT, passing `signo` as the data argument.
 // Failures are logged only: ESRCH is reported as a warning that the process
 // died, any other errno is logged as an error.
 void ContinueProcess(pid_t pid, int signo) {
   if (ptrace(PTRACE_CONT, pid, 0, signo) != -1) {
     return;
   }
   if (errno != ESRCH) {
     PLOG(ERROR) << "ptrace(PTRACE_CONT, pid=" << pid << ", sig=" << signo
                 << ")";
     return;
   }
   LOG(WARNING) << "Process " << pid
                << " died while trying to PTRACE_CONT it";
 }

 void StopProcess(pid_t pid, int signo) {
   if (ptrace(PTRACE_LISTEN, pid, 0, signo) == -1) {
     if (errno == ESRCH) {
       LOG(WARNING) << "Process " << pid
                    << " died while trying to PTRACE_LISTEN it";
     } else {
       PLOG(ERROR) << "ptrace(PTRACE_CONT, pid=" << pid << ", sig=" << signo
                   << ")";
     }
   }
 }

 }  // namespace

 // Builds a monitor for the sandboxee described by `executor`, `policy` and
 // `notify`. The Comms channel and IPC object are taken from the executor.
 // When --sandbox2_danger_danger_permit_all_and_log names a file, it is opened
 // in append mode and kept in log_file_.
 Monitor::Monitor(Executor* executor, Policy* policy, Notify* notify)
     : executor_(executor),
       notify_(notify),
       policy_(policy),
       comms_(executor_->ipc()->comms()),
       ipc_(executor_->ipc()),
       wait_for_execve_(executor->enable_sandboxing_pre_execve_) {
   // The Comms channel arrives already connected, so there is no connection
   // to accept here.
   CHECK(comms_->IsConnected());

   // Both request flags start out set; MainLoop() acts on a flag only when it
   // finds it cleared.
   external_kill_request_flag_.test_and_set(std::memory_order_relaxed);
   dump_stack_request_flag_.test_and_set(std::memory_order_relaxed);

   const std::string path =
       absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all_and_log);
   if (path.empty()) {
     return;
   }
   log_file_ = std::fopen(path.c_str(), "a+");
   PCHECK(log_file_ != nullptr) << "Failed to open log file '" << path << "'";
 }

 // Closes the syscall log file if one was opened and, when a network proxy
 // server exists, joins its thread.
 Monitor::~Monitor() {
   if (log_file_ != nullptr) {
     std::fclose(log_file_);
   }
   if (network_proxy_server_ != nullptr) {
     network_proxy_thread_.join();
   }
 }

 namespace {

 // Logs every element of `container` at INFO level as "[<index>]=<value>",
 // one raw-log line per element.
 void LogContainer(const std::vector<std::string>& container) {
   for (size_t i = 0; i < container.size(); ++i) {
     // The format is printf-style: pass a C string for %s and use the size_t
     // length modifier for the index instead of handing over a std::string
     // object and a mismatched %d.
     SAPI_RAW_LOG(INFO, "[%4zu]=%s", i, container[i].c_str());
   }
 }

 }  // namespace

 // Runs the whole monitoring session: starts the sandboxee, performs the
 // initialisation handshake (IPC fds, cwd, policy, ready message, rlimits,
 // ptrace attach) and then enters MainLoop(). Any failing setup step records a
 // SETUP_ERROR result with a step-specific reason code and returns early.
 void Monitor::Run() {
   // Scope guard: setup_notification_ is notified when this pointer is reset()
   // below or, on any early return, when it is destroyed.
   std::unique_ptr<absl::Notification, void (*)(absl::Notification*)>
       setup_notify{&setup_notification_, [](absl::Notification* notification) {
                      notification->Notify();
                    }};

   // Scope guard whose destructor runs on every exit path of Run(): it stores
   // this thread's rusage in the result, reports EventFinished, cleans up the
   // IPC fd map and fires done_notification_. Being declared after
   // setup_notify, it is destroyed before it.
   struct MonitorCleanup {
     ~MonitorCleanup() {
       getrusage(RUSAGE_THREAD, capture->result_.GetRUsageMonitor());
       capture->notify_->EventFinished(capture->result_);
       capture->ipc_->InternalCleanupFdMap();
       capture->done_notification_.Notify();
     }
     Monitor* capture;
   } monitor_cleanup{this};

   // A zero wall-time limit leaves deadline_millis_ untouched; otherwise the
   // absolute deadline is stored in Unix milliseconds for MainLoop() to check.
   if (executor_->limits()->wall_time_limit() != absl::ZeroDuration()) {
     auto deadline = absl::Now() + executor_->limits()->wall_time_limit();
     deadline_millis_.store(absl::ToUnixMillis(deadline),
                            std::memory_order_relaxed);
   }

   // It'd be costly to initialize the sigset_t for each sigtimedwait()
   // invocation, so do it once per Monitor.
   sigset_t sigtimedwait_sset;
   if (!InitSetupSignals(&sigtimedwait_sset)) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_SIGNALS);
     return;
   }

   // Verbose-only diagnostics: dump the mount mapping of the namespace.
   if (SAPI_VLOG_IS_ON(1) && policy_->GetNamespace() != nullptr) {
     std::vector<std::string> outside_entries;
     std::vector<std::string> inside_entries;
     policy_->GetNamespace()->mounts().RecursivelyListMounts(
         /*outside_entries=*/&outside_entries,
         /*inside_entries=*/&inside_entries);
     SAPI_RAW_VLOG(1, "Outside entries mapped to chroot:");
     LogContainer(outside_entries);
     SAPI_RAW_VLOG(1, "Inside entries as they appear in chroot:");
     LogContainer(inside_entries);
   }

   // Don't trace the child: it will allow to use 'strace -f' with the whole
   // sandbox master/monitor, which ptrace_attach'es to the child.
   int clone_flags = CLONE_UNTRACED;

   // The network proxy is only started when the policy carries allowed hosts.
   if (policy_->allowed_hosts_) {
     EnableNetworkProxyServer();
   }

   // Get PID of the sandboxee.
   pid_t init_pid = 0;
   Namespace* ns = policy_->GetNamespace();
   // An init process is expected only when the namespace requests CLONE_NEWPID.
   bool should_have_init = ns && (ns->GetCloneFlags() & CLONE_NEWPID);
   pid_ = executor_->StartSubProcess(clone_flags, ns, policy_->GetCapabilities(),
                                     &init_pid);

   // Seize the init process with PTRACE_O_EXITKILL. ESRCH (process already
   // gone) is a setup error; any other failure is fatal.
   if (init_pid > 0) {
     if (ptrace(PTRACE_SEIZE, init_pid, 0, PTRACE_O_EXITKILL) != 0) {
       if (errno == ESRCH) {
         SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_PTRACE);
         return;
       }
       PLOG(FATAL) << "attaching to init process failed";
     }
   }

   if (pid_ <= 0 || (should_have_init && init_pid <= 0)) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_SUBPROCESS);
     return;
   }

   // Initialisation handshake; each step maps to its own failure reason code.
   if (!notify_->EventStarted(pid_, comms_)) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_NOTIFY);
     return;
   }
   if (!InitSendIPC()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_IPC);
     return;
   }
   if (!InitSendCwd()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_CWD);
     return;
   }
   if (!InitSendPolicy()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_POLICY);
     return;
   }
   if (!WaitForSandboxReady()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_WAIT);
     return;
   }
   if (!InitApplyLimits()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_LIMITS);
     return;
   }
   // This call should be the last in the init sequence, because it can cause the
   // sandboxee to enter ptrace-stopped state, in which it will not be able to
   // send any messages over the Comms channel.
   if (!InitPtraceAttach()) {
     SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_PTRACE);
     return;
   }

   // Tell the parent thread (Sandbox2 object) that we're done with the initial
   // set-up process of the sandboxee.
   setup_notify.reset();

   MainLoop(&sigtimedwait_sset);
 }

 // Reports whether syscalls are currently being policed. While the monitor is
 // still waiting for execve() every syscall is allowed, so monitoring counts
 // as inactive.
 bool Monitor::IsActivelyMonitoring() {
   const bool still_waiting_for_execve = wait_for_execve_;
   return !still_waiting_for_execve;
 }

 // Marks the wait for execve() as over, which makes IsActivelyMonitoring()
 // return true from now on.
 void Monitor::SetActivelyMonitoring() {
   wait_for_execve_ = false;
 }

 // Records the final status and reason code in result_. The final status may
 // be set only once: the CHECK fails if it has already been changed from
 // Result::UNSET.
 void Monitor::SetExitStatusCode(Result::StatusEnum final_status,
                                 uintptr_t reason_code) {
   CHECK(result_.final_status() == Result::UNSET);
   result_.SetExitStatusCode(final_status, reason_code);
 }

 // Decides whether a stack trace should be gathered for the current result.
 // Collection is ruled out while this monitor serves the libunwind sandbox
 // (to avoid recursion), and when there is neither a namespace nor a disabled
 // libunwind crash handler flag. Otherwise the per-status policy switch for
 // the recorded final status decides.
 bool Monitor::ShouldCollectStackTrace() {
   const bool in_unwind_sandbox = executor_->libunwind_sbox_for_pid_ != 0;
   const bool unwinding_usable =
       policy_->GetNamespace() ||
       !absl::GetFlag(FLAGS_sandbox_libunwind_crash_handler);
   if (in_unwind_sandbox || !unwinding_usable) {
     LOG(ERROR) << "Cannot collect stack trace. Unwind pid "
                << executor_->libunwind_sbox_for_pid_ << ", namespace "
                << policy_->GetNamespace();
     return false;
   }

   const auto final_status = result_.final_status();
   if (final_status == Result::EXTERNAL_KILL) {
     return policy_->collect_stacktrace_on_kill_;
   }
   if (final_status == Result::TIMEOUT) {
     return policy_->collect_stacktrace_on_timeout_;
   }
   if (final_status == Result::SIGNALED) {
     return policy_->collect_stacktrace_on_signal_;
   }
   if (final_status == Result::VIOLATION) {
     return policy_->collect_stacktrace_on_violation_;
   }
   return false;
 }

 // Fills result_ with diagnostic details about the process that `regs` was
 // fetched from: the registers themselves, its program name, its memory map
 // and, when ShouldCollectStackTrace() allows it, a stack trace.
 // Takes ownership of `regs`.
 void Monitor::SetAdditionalResultInfo(std::unique_ptr<Regs> regs) {
   // Remember the pid before `regs` is moved into the result.
   const pid_t pid = regs->pid();
   result_.SetRegs(std::move(regs));
   result_.SetProgName(util::GetProgName(pid));
   // Read the maps of the same process the registers and program name belong
   // to, so that the recorded details describe one process even when the
   // offender is not the main sandboxee pid.
   result_.SetProcMaps(ReadProcMaps(pid));
   if (ShouldCollectStackTrace()) {
     auto* ns = policy_->GetNamespace();
     // Used in place of the namespace mounts when there is no namespace.
     const Mounts empty_mounts;
     result_.SetStackTrace(
         GetStackTrace(result_.GetRegs(), ns ? ns->mounts() : empty_mounts));
     LOG(INFO) << "Stack trace: " << result_.GetStackTrace();
   } else {
     LOG(INFO) << "Stack traces have been disabled";
   }
 }

 void Monitor::KillSandboxee() {
   VLOG(1) << "Sending SIGKILL to the PID: " << pid_;
   if (kill(pid_, SIGKILL) != 0) {
     LOG(ERROR) << "Could not send SIGKILL to PID " << pid_;
     SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_KILL);
   }
 }

 // Not defined in glibc.
 // Extracts bits 16-23 of a wait status `x`; this file compares the result
 // against the PTRACE_EVENT_* constants. The argument is parenthesized so that
 // an expression argument is masked as a whole.
 #define __WPTRACEEVENT(x) (((x) & 0xff0000) >> 16)

 // Event loop of the monitor. Runs until a final status has been recorded in
 // result_: on each pass it handles timeout, stack-dump, external-kill and
 // network-violation requests, then polls waitpid() and dispatches the
 // reported event. Afterwards, if the main sandboxee was not seen exiting, it
 // is killed and reaped with a short grace period.
 // `sset` is the signal set passed to sigtimedwait() while idle.
 void Monitor::MainLoop(sigset_t* sset) {
   bool sandboxee_exited = false;
   int status;
   // All possible still running children of main process, will be killed due to
   // PTRACE_O_EXITKILL ptrace() flag.
   while (result_.final_status() == Result::UNSET) {
     // A stored deadline of 0 means that no wall-time deadline is in effect.
     int64_t deadline = deadline_millis_.load(std::memory_order_relaxed);
     if (deadline != 0 && absl::Now() >= absl::FromUnixMillis(deadline)) {
       VLOG(1) << "Sandbox process hit timeout due to the walltime timer";
       timed_out_ = true;
       KillSandboxee();
     }

     // test_and_set() returns the previous value, so these branches run only
     // when the flag was found cleared; the flag is set again as a side effect.
     if (!dump_stack_request_flag_.test_and_set(std::memory_order_relaxed)) {
       should_dump_stack_ = true;
       InterruptProcess(pid_);
     }

     if (!external_kill_request_flag_.test_and_set(std::memory_order_relaxed)) {
       external_kill_ = true;
       KillSandboxee();
     }

     // React once to a violation reported by the network proxy server.
     if (network_proxy_server_ &&
         network_proxy_server_->violation_occurred_.load(
             std::memory_order_acquire) &&
         !network_violation_) {
       network_violation_ = true;
       KillSandboxee();
     }

     // It should be a non-blocking operation (hence WNOHANG), so this function
     // returns quickly if there are no events to be processed.
     // Prioritize main pid to avoid resource starvation
     pid_t ret =
         waitpid(pid_, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
     if (ret == 0) {
       ret = waitpid(-1, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
     }

     // Nothing to process: wait for a signal from `sset` or for the wake-up
     // period to elapse, then re-check the requests above.
     if (ret == 0) {
       constexpr timespec ts = {kWakeUpPeriodSec, kWakeUpPeriodNSec};
       int signo = sigtimedwait(sset, nullptr, &ts);
       LOG_IF(ERROR, signo != -1 && signo != SIGCHLD)
           << "Unknown signal received: " << signo;
       continue;
     }

     // ECHILD ends the loop with an internal error; other waitpid() errors
     // are logged and polling continues.
     if (ret == -1) {
       if (errno == ECHILD) {
         LOG(ERROR) << "PANIC(). The main process has not exited yet, "
                    << "yet we haven't seen its exit event";
         SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_CHILD);
       } else {
         PLOG(ERROR) << "waitpid() failed";
       }
       continue;
     }

     VLOG(3) << "waitpid() returned with PID: " << ret << ", status: " << status;

     if (WIFEXITED(status)) {
       VLOG(1) << "PID: " << ret
               << " finished with code: " << WEXITSTATUS(status);
       // That's the main process, set the exit code, and exit. It will kill
       // all remaining processes (if there are any) because of the
       // PTRACE_O_EXITKILL ptrace() flag.
       if (ret == pid_) {
         // Exiting while still waiting for execve() counts as a setup error.
         if (IsActivelyMonitoring()) {
           SetExitStatusCode(Result::OK, WEXITSTATUS(status));
         } else {
           SetExitStatusCode(Result::SETUP_ERROR, Result::FAILED_MONITOR);
         }
         sandboxee_exited = true;
       }
     } else if (WIFSIGNALED(status)) {
       //  This usually does not happen, but might.
       //  Quote from the manual:
       //   A SIGKILL signal may still cause a PTRACE_EVENT_EXIT stop before
       //   actual signal death.  This may be changed in the future;
       VLOG(1) << "PID: " << ret << " terminated with signal: "
               << util::GetSignalName(WTERMSIG(status));
       if (ret == pid_) {
         // Attribute the death to the first matching cause: network
         // violation, external kill, timeout, otherwise the signal itself.
         if (network_violation_) {
           SetExitStatusCode(Result::VIOLATION, Result::VIOLATION_NETWORK);
           result_.SetNetworkViolation(network_proxy_server_->violation_msg_);
         } else if (external_kill_) {
           SetExitStatusCode(Result::EXTERNAL_KILL, 0);
         } else if (timed_out_) {
           SetExitStatusCode(Result::TIMEOUT, 0);
         } else {
           SetExitStatusCode(Result::SIGNALED, WTERMSIG(status));
         }
         sandboxee_exited = true;
       }
     } else if (WIFSTOPPED(status)) {
       VLOG(2) << "PID: " << ret
               << " received signal: " << util::GetSignalName(WSTOPSIG(status))
               << " with event: " << __WPTRACEEVENT(status);
       StateProcessStopped(ret, status);
     } else if (WIFCONTINUED(status)) {
       VLOG(2) << "PID: " << ret << " is being continued";
     }
   }
   // Try to make sure main pid is killed and reaped
   if (!sandboxee_exited) {
     kill(pid_, SIGKILL);
     constexpr auto kGracefullExitTimeout = absl::Milliseconds(200);
     auto deadline = absl::Now() + kGracefullExitTimeout;
     for (;;) {
       // Time left of the grace period; used as the sigtimedwait() timeout.
       auto left = deadline - absl::Now();
       if (absl::Now() >= deadline) {
         LOG(INFO) << "Waiting for sandboxee exit timed out";
         break;
       }
       pid_t ret =
           waitpid(pid_, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
       if (ret == 0) {
         // Sometimes PTRACE_EVENT_EXIT needs to be handled for each child thread
         // in order to observe main thread exit
         ret = waitpid(-1, &status, __WNOTHREAD | __WALL | WUNTRACED | WNOHANG);
       }
       if (ret == -1) {
         PLOG(ERROR) << "waitpid() failed";
         break;
       }
       // Done once the main pid has been reaped.
       if (ret == pid_ && (WIFSIGNALED(status) || WIFEXITED(status))) {
         break;
       }
       if (ret == 0) {
         auto ts = absl::ToTimespec(left);
         sigtimedwait(sset, nullptr, &ts);
       } else if (WIFSTOPPED(status) &&
                  __WPTRACEEVENT(status) == PTRACE_EVENT_EXIT) {
         VLOG(2) << "PID: " << ret << " PTRACE_EVENT_EXIT ";
         ContinueProcess(ret, 0);
       } else {
         // Any other event: send SIGKILL to the main pid again.
         kill(pid_, SIGKILL);
       }
     }
   }
 }

 // Initialises `sset` to contain only SIGCHLD and blocks that set for the
 // calling thread, so that the signal can be consumed with sigtimedwait().
 // Returns false (after logging) if any step fails.
 bool Monitor::InitSetupSignals(sigset_t* sset) {
   if (sigemptyset(sset) == -1) {
     PLOG(ERROR) << "sigemptyset()";
     return false;
   }

   // sigtimedwait will react (wake-up) to arrival of this signal.
   if (sigaddset(sset, SIGCHLD) == -1) {
     PLOG(ERROR) << "sigaddset(SIGCHLD)";
     return false;
   }

   // Unlike the calls above, pthread_sigmask() returns the error number
   // directly (never -1) and does not set errno, so test for non-zero and
   // copy the code into errno for PLOG.
   const int mask_error = pthread_sigmask(SIG_BLOCK, sset, nullptr);
   if (mask_error != 0) {
     errno = mask_error;
     PLOG(ERROR) << "pthread_sigmask(SIG_BLOCK, SIGCHLD)";
     return false;
   }

   return true;
 }

 bool Monitor::InitSendPolicy() {
   if (!policy_->SendPolicy(comms_)) {
     LOG(ERROR) << "Couldn't send policy";
     return false;
   }

   return true;
 }

 // Sends the executor's working directory string over the Comms channel. Logs
 // an error (with errno) and returns false when the transfer fails.
 bool Monitor::InitSendCwd() {
   const bool cwd_sent = comms_->SendString(executor_->cwd_);
   if (!cwd_sent) {
     PLOG(ERROR) << "Couldn't send cwd";
   }
   return cwd_sent;
 }

 // Applies the resource limit `rlim` for `resource` to process `pid`.
 // If the requested soft limit exceeds the process's current hard limit, the
 // update is skipped and the call still counts as success. Returns false only
 // when setting the limit fails; a failure to read the current limit is logged
 // and the update is attempted regardless.
 bool Monitor::InitApplyLimit(pid_t pid, __rlimit_resource resource,
                              const rlimit64& rlim) const {
   rlimit64 existing;
   const bool have_existing =
       prlimit64(pid, resource, nullptr, &existing) != -1;
   if (!have_existing) {
     PLOG(ERROR) << "prlimit64(" << pid << ", " << util::GetRlimitName(resource)
                 << ")";
   }
   if (have_existing && rlim.rlim_cur > existing.rlim_max) {
     // Raising above the hard limit would fail, so keep the current limits
     // (which are already lower than the requested ones).
     LOG(ERROR) << util::GetRlimitName(resource)
                << ": new.current > current.max (" << rlim.rlim_cur << " > "
                << existing.rlim_max << "), skipping";
     return true;
   }

   const bool applied = prlimit64(pid, resource, &rlim, nullptr) != -1;
   if (!applied) {
     PLOG(ERROR) << "prlimit64(" << pid << ", " << util::GetRlimitName(resource)
                 << ", " << rlim.rlim_cur << ")";
   }
   return applied;
 }

 // Applies the executor's AS, CPU, FSIZE, NOFILE and CORE limits to the
 // sandboxee, in that order, stopping at the first one that fails.
 bool Monitor::InitApplyLimits() {
   Limits* limits = executor_->limits();
   if (!InitApplyLimit(pid_, RLIMIT_AS, limits->rlimit_as())) {
     return false;
   }
   if (!InitApplyLimit(pid_, RLIMIT_CPU, limits->rlimit_cpu())) {
     return false;
   }
   if (!InitApplyLimit(pid_, RLIMIT_FSIZE, limits->rlimit_fsize())) {
     return false;
   }
   if (!InitApplyLimit(pid_, RLIMIT_NOFILE, limits->rlimit_nofile())) {
     return false;
   }
   return InitApplyLimit(pid_, RLIMIT_CORE, limits->rlimit_core());
 }

 // Forwards to the IPC object to send its file descriptors over Comms and
 // returns that call's result.
 bool Monitor::InitSendIPC() {
   const bool fds_sent = ipc_->SendFdsOverComms();
   return fds_sent;
 }

 bool Monitor::WaitForSandboxReady() {
   uint32_t tmp;
   if (!comms_->RecvUint32(&tmp)) {
     LOG(ERROR) << "Couldn't receive 'Client::kClient2SandboxReady' message";
     return false;
   }
   if (tmp != Client::kClient2SandboxReady) {
     LOG(ERROR) << "Received " << tmp << " != Client::kClient2SandboxReady ("
                << Client::kClient2SandboxReady << ")";
     return false;
   }
   return true;
 }

 // Attaches (PTRACE_SEIZE) to every task of the sandboxee and then tells the
 // sandboxee that the monitor is ready by sending Client::kSandbox2ClientDone.
 //
 // Tasks that fail with EPERM are retried with a capped exponential backoff
 // until a 2 second deadline expires; tasks that fail with ESRCH are skipped;
 // any other errno aborts. After attaching, the task list is read again and
 // must equal the set of attached tasks.
 //
 // Returns true on success and false (after logging) on any failure.
 bool Monitor::InitPtraceAttach() {
   sanitizer::WaitForTsan();

   // Get a list of tasks.
   std::set<int> tasks;
   if (!sanitizer::GetListOfTasks(pid_, &tasks)) {
     LOG(ERROR) << "Could not get list of tasks";
     return false;
   }

   if (tasks.find(pid_) == tasks.end()) {
     LOG(ERROR) << "The pid " << pid_ << " was not found in its own tasklist.";
     return false;
   }

   // With TSYNC, we can allow threads: seccomp applies to all threads.

   if (tasks.size() > 1) {
     LOG(WARNING) << "PID " << pid_ << " has " << tasks.size() << " threads,"
                  << " at the time of call to SandboxMeHere. If you are seeing"
                  << " more sandbox violations than expected, this might be"
                  << " the reason why"
                  << ".";
   }

   std::set<int> tasks_attached;
   int retries = 0;
   const absl::Time deadline = absl::Now() + absl::Seconds(2);

   // In some situations we allow ptrace to try again when it fails.
   while (!tasks.empty()) {
     std::set<int> tasks_left;
     for (int task : tasks) {
       constexpr intptr_t options =
           PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
           PTRACE_O_TRACEVFORKDONE | PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXEC |
           PTRACE_O_TRACEEXIT | PTRACE_O_TRACESECCOMP | PTRACE_O_EXITKILL;
       int ret = ptrace(PTRACE_SEIZE, task, 0, options);
       if (ret != 0) {
         if (errno == EPERM) {
           // Sometimes when a task is exiting we can get an EPERM from ptrace.
           // Let's try again up until the timeout in this situation.
           PLOG(WARNING) << "ptrace(PTRACE_SEIZE, " << task << ", "
                         << absl::StrCat("0x", absl::Hex(options))
                         << "), trying again...";
           tasks_left.insert(task);
           continue;
         }
         if (errno == ESRCH) {
           // A task may have exited since we captured the task list, we will
           // allow things to continue after we log a warning.
           PLOG(WARNING)
               << "ptrace(PTRACE_SEIZE, " << task << ", "
               << absl::StrCat("0x", absl::Hex(options))
               << ") skipping exited task. Continuing with other tasks.";
           continue;
         }
         // Any other errno will be considered a failure.
         PLOG(ERROR) << "ptrace(PTRACE_SEIZE, " << task << ", "
                     << absl::StrCat("0x", absl::Hex(options)) << ") failed.";
         return false;
       }
       tasks_attached.insert(task);
     }
     if (!tasks_left.empty()) {
       // Give up only once the deadline has actually passed; before that,
       // back off and retry the tasks that returned EPERM.
       if (absl::Now() >= deadline) {
         LOG(ERROR) << "Attaching to sandboxee timed out: could not attach to "
                    << tasks_left.size() << " tasks";
         return false;
       }
       // Exponential Backoff, capped at kMaxRetry and at the time remaining
       // until the deadline.
       constexpr absl::Duration kInitialRetry = absl::Milliseconds(1);
       constexpr absl::Duration kMaxRetry = absl::Milliseconds(20);
       const absl::Duration retry_interval =
           kInitialRetry * (1 << std::min(10, retries++));
       absl::SleepFor(
           std::min({retry_interval, kMaxRetry, deadline - absl::Now()}));
     }
     tasks = std::move(tasks_left);
   }

   // Get a list of tasks after attaching.
   if (!sanitizer::GetListOfTasks(pid_, &tasks)) {
     LOG(ERROR) << "Could not get list of tasks";
     return false;
   }

   // Check that we attached to all the threads
   if (tasks_attached != tasks) {
     LOG(ERROR) << "The pid " << pid_
                << " spawned new threads while we were trying to attach to it.";
     return false;
   }

   // No glibc wrapper for gettid - see 'man gettid'.
   VLOG(1) << "Monitor (PID: " << getpid()
           << ", TID: " << util::Syscall(__NR_gettid)
           << ") attached to PID: " << pid_;

   // Technically, the sandboxee can be in a ptrace-stopped state right now,
   // because some signal could have arrived in the meantime. Yet, this
   // Comms::SendUint32 call shouldn't lock our process, because the underlying
   // socketpair() channel is buffered, hence it will accept the uint32_t message
   // no matter what is the current state of the sandboxee, and it will allow for
   // our process to continue and unlock the sandboxee with the proper ptrace
   // event handling.
   if (!comms_->SendUint32(Client::kSandbox2ClientDone)) {
     LOG(ERROR) << "Couldn't send Client::kSandbox2ClientDone message";
     return false;
   }
   return true;
 }

 // Decides what to do with a syscall that was trapped for process
 // `regs->pid()`. The syscall is let through (the process is continued) when
 // it is the execveat seen before monitoring became active, when the Notify
 // object permits it, or when one of the permit-all flags is in effect.
 // Otherwise it is treated as a syscall violation.
 void Monitor::ActionProcessSyscall(Regs* regs, const Syscall& syscall) {
   const pid_t pid = regs->pid();

   // Sandboxing is not enabled yet: allow the first __NR_execveat.
   if (!IsActivelyMonitoring() && syscall.nr() == __NR_execveat) {
     VLOG(1) << "[PERMITTED/BEFORE_EXECVEAT]: "
             << "SYSCALL ::: PID: " << pid << ", PROG: '"
             << util::GetProgName(pid)
             << "' : " << syscall.GetDescription();
     ContinueProcess(pid, 0);
     return;
   }

   // The Notify object gets a say: useful for setups where some syscalls
   // should be logged but still allowed ('permissible syscalls' in sandbox
   // v1).
   const bool permitted_by_notify = notify_->EventSyscallTrap(syscall);
   if (permitted_by_notify) {
     LOG(WARNING) << "[PERMITTED]: SYSCALL ::: PID: " << pid
                  << ", PROG: '" << util::GetProgName(pid)
                  << "' : " << syscall.GetDescription();

     ContinueProcess(pid, 0);
     return;
   }

   // TODO(wiktorg): Further clean that up, probably while doing monitor cleanup
   // log_file_ is non-null iff FLAGS_sandbox2_danger_danger_permit_all_and_log
   // is set: record the syscall in the file and let it through.
   if (log_file_) {
     std::string syscall_description = syscall.GetDescription();
     PCHECK(absl::FPrintF(log_file_, "PID: %d %s\n", regs->pid(),
                          syscall_description) >= 0);
     ContinueProcess(pid, 0);
     return;
   }

   const bool permit_all =
       absl::GetFlag(FLAGS_sandbox2_danger_danger_permit_all);
   if (permit_all) {
     ContinueProcess(pid, 0);
     return;
   }

   ActionProcessSyscallViolation(regs, syscall, kSyscallViolation);
 }

 // Handles a policy violation: logs it, informs the Notify object, records a
 // VIOLATION result (with the syscall number as reason code, the syscall
 // itself and the additional per-process details) and finally makes the
 // offending syscall be skipped with -ENOSYS as its return value.
 void Monitor::ActionProcessSyscallViolation(Regs* regs, const Syscall& syscall,
                                             ViolationType violation_type) {
   LogSyscallViolation(syscall);
   notify_->EventSyscallViolation(syscall, violation_type);

   // The final status has to be recorded before the additional info is
   // gathered, because ShouldCollectStackTrace() consults it.
   SetExitStatusCode(Result::VIOLATION, syscall.nr());
   result_.SetSyscall(absl::make_unique<Syscall>(syscall));
   SetAdditionalResultInfo(absl::make_unique<Regs>(*regs));

   // Have the syscall skipped and report -ENOSYS to the process. It will be
   // killed anyway, so this is just a precaution.
   const auto skip_status = regs->SkipSyscallReturnValue(-ENOSYS);
   if (skip_status.ok()) {
     return;
   }
   LOG(ERROR) << skip_status;
 }

 // Logs a syscall violation at ERROR level. When this monitor serves the
 // libunwind sandbox only a short message is written; otherwise the violation
 // is logged with pid and program name, followed by an explanation for the
 // well-known cases.
 void Monitor::LogSyscallViolation(const Syscall& syscall) const {
   const bool in_unwind_sandbox = executor_->libunwind_sbox_for_pid_ != 0;
   if (!in_unwind_sandbox) {
     // This is an invalid syscall. It will be killed by the seccomp-bpf
     // policies as well, but we want to be on the safe side here too.
     LOG(ERROR) << "SANDBOX VIOLATION : PID: " << syscall.pid() << ", PROG: '"
                << util::GetProgName(syscall.pid())
                << "' : " << syscall.GetDescription();

     LogSyscallViolationExplanation(syscall);
     return;
   }

   // Do not unwind libunwind.
   LOG(ERROR) << "Sandbox violation during execution of libunwind: "
              << syscall.GetDescription();
 }

 void Monitor::EventPtraceSeccomp(pid_t pid, int event_msg) {
   // If the seccomp-policy is using RET_TRACE, we request that it returns the
   // syscall architecture identifier in the SECCOMP_RET_DATA.
   const auto syscall_arch = static_cast<Syscall::CpuArch>(event_msg);
   Regs regs(pid);
   auto status = regs.Fetch();
   if (!status.ok()) {
     LOG(ERROR) << status;
     SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_FETCH);
     return;
   }

   Syscall syscall = regs.ToSyscall(syscall_arch);
   // If the architecture of the syscall used is different that the current host
   // architecture, report a violation.
   if (syscall_arch != Syscall::GetHostArch()) {
     ActionProcessSyscallViolation(&regs, syscall, kArchitectureSwitchViolation);
     return;
   }

   ActionProcessSyscall(&regs, syscall);
 }

 // Handles a PTRACE_EVENT_EXEC stop of `pid`: if the monitor was still waiting
 // for execve(), monitoring becomes active from here on. The process is
 // continued in either case.
 void Monitor::EventPtraceExec(pid_t pid, int event_msg) {
   const bool was_waiting_for_exec = !IsActivelyMonitoring();
   if (was_waiting_for_exec) {
     VLOG(1) << "PTRACE_EVENT_EXEC seen from PID: " << event_msg
             << ". SANDBOX ENABLED!";
     SetActivelyMonitoring();
   }
   ContinueProcess(pid, 0);
 }

 void Monitor::EventPtraceExit(pid_t pid, int event_msg) {
   // A regular exit, let it continue (fast-path).
   if (WIFEXITED(event_msg)) {
     ContinueProcess(pid, 0);
     return;
   }

   // Fetch the registers as we'll need them to fill the result in any case
   auto regs = absl::make_unique<Regs>(pid);
   auto status = regs->Fetch();
   if (!status.ok()) {
     LOG(ERROR) << "failed to fetch regs: " << status;
     SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_FETCH);
     return;
   }

   // Process signaled due to seccomp violation.
   if (WIFSIGNALED(event_msg) && WTERMSIG(event_msg) == SIGSYS) {
     VLOG(1) << "PID: " << pid << " violation uncovered via the EXIT_EVENT";
     ActionProcessSyscallViolation(
         regs.get(), regs->ToSyscall(Syscall::GetHostArch()), kSyscallViolation);
     return;
   }

   // This can be reached in three cases:
   // 1) Process was killed from the sandbox.
   // 2) Process was killed because it hit a timeout.
   // 3) Regular signal/other exit cause.
   if (pid == pid_) {
     VLOG(1) << "PID: " << pid << " main special exit";
     if (network_violation_) {
       SetExitStatusCode(Result::VIOLATION, Result::VIOLATION_NETWORK);
       result_.SetNetworkViolation(network_proxy_server_->violation_msg_);
     } else if (external_kill_) {
       SetExitStatusCode(Result::EXTERNAL_KILL, 0);
     } else if (timed_out_) {
       SetExitStatusCode(Result::TIMEOUT, 0);
     } else {
       SetExitStatusCode(Result::SIGNALED, WTERMSIG(event_msg));
     }
     SetAdditionalResultInfo(std::move(regs));
   }
   VLOG(1) << "Continuing";
   ContinueProcess(pid, 0);
 }

 // Handles a PTRACE_EVENT_STOP of `pid`. Only SIGSTOP, SIGTSTP, SIGTTIN and
 // SIGTTOU count as real stop signals, for which StopProcess() is called.
 // Everything else (for example the SIGTRAP that PTRACE_O_TRACECLONE and
 // similar PTRACE_SEIZE options may generate) just continues the process.
 void Monitor::EventPtraceStop(pid_t pid, int stopsig) {
   switch (stopsig) {
     case SIGSTOP:
     case SIGTSTP:
     case SIGTTIN:
     case SIGTTOU:
       // A genuine stop signal for this pid: stop it.
       VLOG(2) << "PID: " << pid << " stopped due to "
               << util::GetSignalName(stopsig);
       StopProcess(pid, 0);
       return;
     default:
       // Not a real stop signal.
       ContinueProcess(pid, 0);
       return;
   }
 }

 // Handles a waitpid() report that `pid` is stopped, `status` being the raw
 // wait status. A stop without a ptrace event is a plain signal delivery: the
 // Notify object is informed and the signal is passed on. Otherwise the event
 // message is fetched and the event is dispatched to the matching EventPtrace*
 // handler.
 void Monitor::StateProcessStopped(pid_t pid, int status) {
   int stopsig = WSTOPSIG(status);
   if (__WPTRACEEVENT(status) == 0) {
     // Must be a regular signal delivery.
     VLOG(2) << "PID: " << pid
             << " received signal: " << util::GetSignalName(stopsig);
     notify_->EventSignal(pid, stopsig);
     ContinueProcess(pid, stopsig);
     return;
   }

   unsigned long event_msg;  // NOLINT
   if (ptrace(PTRACE_GETEVENTMSG, pid, 0, &event_msg) == -1) {
     if (errno == ESRCH) {
       // This happens from time to time, the kernel does not guarantee us that
       // we get the event in time.
       PLOG(INFO) << "ptrace(PTRACE_GETEVENTMSG, " << pid << ")";
       return;
     }
     PLOG(ERROR) << "ptrace(PTRACE_GETEVENTMSG, " << pid << ")";
     SetExitStatusCode(Result::INTERNAL_ERROR, Result::FAILED_GETEVENT);
     return;
   }

   // Serve a pending stack-dump request (see MainLoop()) on the first ptrace
   // event of the main pid. This is skipped when this monitor serves the
   // libunwind sandbox or when there is no namespace.
   if (ABSL_PREDICT_FALSE(pid == pid_ && should_dump_stack_ &&
                          executor_->libunwind_sbox_for_pid_ == 0 &&
                          policy_->GetNamespace())) {
     Regs regs(pid);
     // Note: this local shadows the `status` parameter within this block only.
     auto status = regs.Fetch();
     if (status.ok()) {
       VLOG(0) << "SANDBOX STACK : PID: " << pid << ", ["
               << GetStackTrace(&regs, policy_->GetNamespace()->mounts()) << "]";
     } else {
       LOG(WARNING) << "FAILED TO GET SANDBOX STACK : " << status;
     }
     should_dump_stack_ = false;
   }

 // Fallback definition for headers that do not provide PTRACE_EVENT_STOP.
 #if !defined(PTRACE_EVENT_STOP)
 #define PTRACE_EVENT_STOP 128
 #endif

   switch (__WPTRACEEVENT(status)) {
     case PTRACE_EVENT_FORK:
       /* fall through */
     case PTRACE_EVENT_VFORK:
       /* fall through */
     case PTRACE_EVENT_CLONE:
       /* fall through */
     case PTRACE_EVENT_VFORK_DONE:
       // Process-creation events need no special handling: just resume.
       ContinueProcess(pid, 0);
       break;
     case PTRACE_EVENT_EXEC:
       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_EXEC, PID: " << event_msg;
       EventPtraceExec(pid, event_msg);
       break;
     case PTRACE_EVENT_EXIT:
       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_EXIT: " << event_msg;
       EventPtraceExit(pid, event_msg);
       break;
     case PTRACE_EVENT_STOP:
       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_STOP: " << event_msg;
       EventPtraceStop(pid, stopsig);
       break;
     case PTRACE_EVENT_SECCOMP:
       VLOG(2) << "PID: " << pid << " PTRACE_EVENT_SECCOMP: " << event_msg;
       EventPtraceSeccomp(pid, event_msg);
       break;
     default:
       // Unknown events are only logged; the process is not continued here.
       LOG(ERROR) << "Unknown ptrace event: " << __WPTRACEEVENT(status)
                  << " with data: " << event_msg;
       break;
   }
 }

 // Logs a human-readable reason for the violation in the cases that can be
 // explained: an architecture mismatch, ptrace, bpf, or clone with
 // CLONE_UNTRACED. Nothing is logged for any other syscall.
 // This follows the policy in Policy::GetDefaultPolicy - keep it in sync.
 void Monitor::LogSyscallViolationExplanation(const Syscall& syscall) const {
   if (syscall.arch() != Syscall::GetHostArch()) {
     LOG(ERROR)
         << "This is a violation because the syscall was issued because the"
         << " sandboxee and executor architectures are different.";
     return;
   }

   switch (syscall.nr()) {
     case __NR_ptrace:
       LOG(ERROR)
           << "This is a violation because the ptrace syscall would be unsafe in"
           << " sandbox2, so it has been blocked.";
       return;
     case __NR_bpf:
       LOG(ERROR)
           << "This is a violation because the bpf syscall would be risky in"
           << " a sandbox, so it has been blocked.";
       return;
     case __NR_clone: {
       const uintptr_t clone_flags = syscall.args()[0];
       if ((clone_flags & CLONE_UNTRACED) != 0) {
         LOG(ERROR) << "This is a violation because calling clone with CLONE_UNTRACE"
                    << " would be unsafe in sandbox2, so it has been blocked.";
       }
       return;
     }
     default:
       return;
   }
 }

 // Creates the network proxy server on the fd obtained from the IPC object
 // under NetworkProxyClient::kFDName, handing it the policy's allowed hosts
 // and this thread's pthread id, and starts NetworkProxyServer::Run on a
 // separate thread. The caller must ensure policy_->allowed_hosts_ holds a
 // value.
 void Monitor::EnableNetworkProxyServer() {
   const int proxy_fd = ipc_->ReceiveFd(NetworkProxyClient::kFDName);
   auto* allowed_hosts = &policy_->allowed_hosts_.value();

   network_proxy_server_ = absl::make_unique<NetworkProxyServer>(
       proxy_fd, allowed_hosts, pthread_self());

   network_proxy_thread_ =
       std::thread(&NetworkProxyServer::Run, network_proxy_server_.get());
 }

 }  // namespace sandbox2