Allow threads to be marked as unsuspendable by kForUserCode

There was a possible deadlock between jit-threads being suspended and
another thread waiting for the jit-thread to finish their work. If the
jit-thread hit a suspend-point the process would deadlock. This fixes
this by allowing threads to be marked as unsuspendable by user-code.
This prevents the issue by marking jit and gc threads as unsuspendable
by user code. Agents attempting to suspend them using JVMTI will
succeed (and see the thread as suspended) but internal runtime methods
will not see the thread as suspended and the thread will not be
prevented from moving into the kRunnable state. A thread that is
unsuspendable trying to suspend itself using JVMTI will get
ERR(INTERNAL) and a log message.

Doing this requires that we rewrite the JVMTI thread suspension code
somewhat so it will now perform an unconditional kInternal suspension
prior to trying to suspend the thread kForUserCode. The kInternal
suspension is then lifted. This ensures that everything is done
atomically even if the kForUserCode won't stop the thread.

Test: ./test.py --host
Test: ./art/tools/run-libjdwp-tests.sh --mode=host
Test: ./art/tools/run-libjdwp-tests.sh \
        --mode=host                    \
        --variant=x64                  \
        --test org.apache.harmony.jpda.tests.jdwp.EventModifiers.InstanceOnlyModifierTest

Bug: 70838465
Bug: 111348762

Change-Id: I91211641b82416664bf5abd8546efebf4f672f12
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index 369eb76..949b566 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -818,6 +818,37 @@
   return ERR(NONE);
 }
 
+class ScopedSuspendByPeer {
+ public:
+  explicit ScopedSuspendByPeer(jthread jtarget)
+      : thread_list_(art::Runtime::Current()->GetThreadList()),
+        timeout_(false),
+        target_(thread_list_->SuspendThreadByPeer(jtarget,
+                                                  /* suspend_thread */ true,
+                                                  art::SuspendReason::kInternal,
+                                                  &timeout_)) { }
+  ~ScopedSuspendByPeer() {
+    if (target_ != nullptr) {
+      if (!thread_list_->Resume(target_, art::SuspendReason::kInternal)) {
+        LOG(ERROR) << "Failed to resume " << target_ << "!";
+      }
+    }
+  }
+
+  art::Thread* GetTargetThread() const {
+    return target_;
+  }
+
+  bool TimedOut() const {
+    return timeout_;
+  }
+
+ private:
+  art::ThreadList* thread_list_;
+  bool timeout_;
+  art::Thread* target_;
+};
+
 jvmtiError ThreadUtil::SuspendOther(art::Thread* self,
                                     jthread target_jthread) {
   // Loop since we need to bail out and try again if we would end up getting suspended while holding
@@ -845,29 +876,27 @@
       if (!GetAliveNativeThread(target_jthread, soa, &target, &err)) {
         return err;
       }
-      art::ThreadState state = target->GetState();
-      if (state == art::ThreadState::kStarting || target->IsStillStarting()) {
-        return ERR(THREAD_NOT_ALIVE);
-      } else {
-        art::MutexLock thread_suspend_count_mu(self, *art::Locks::thread_suspend_count_lock_);
-        if (target->GetUserCodeSuspendCount() != 0) {
-          return ERR(THREAD_SUSPENDED);
-        }
-      }
     }
-    bool timeout = true;
-    art::Thread* ret_target = art::Runtime::Current()->GetThreadList()->SuspendThreadByPeer(
-        target_jthread,
-        /* request_suspension */ true,
-        art::SuspendReason::kForUserCode,
-        &timeout);
-    if (ret_target == nullptr && !timeout) {
+    // Get the actual thread in a suspended state so we can change the user-code suspend count.
+    ScopedSuspendByPeer ssbp(target_jthread);
+    if (ssbp.GetTargetThread() == nullptr && !ssbp.TimedOut()) {
       // TODO It would be good to get more information about why exactly the thread failed to
       // suspend.
       return ERR(INTERNAL);
-    } else if (!timeout) {
-      // we didn't time out and got a result.
-      return OK;
+    } else if (!ssbp.TimedOut()) {
+      art::ThreadState state = ssbp.GetTargetThread()->GetState();
+      if (state == art::ThreadState::kStarting || ssbp.GetTargetThread()->IsStillStarting()) {
+        return ERR(THREAD_NOT_ALIVE);
+      }
+      // we didn't time out and got a result. Suspend the thread by usercode and return. It's
+      // already suspended internal so we don't need to do anything but increment the count.
+      art::MutexLock thread_suspend_count_mu(self, *art::Locks::thread_suspend_count_lock_);
+      if (ssbp.GetTargetThread()->GetUserCodeSuspendCount() != 0) {
+        return ERR(THREAD_SUSPENDED);
+      }
+      bool res = ssbp.GetTargetThread()->ModifySuspendCount(
+          self, +1, nullptr, art::SuspendReason::kForUserCode);
+      return res ? OK : ERR(INTERNAL);
     }
     // We timed out. Just go around and try again.
   } while (true);
@@ -876,6 +905,17 @@
 
 jvmtiError ThreadUtil::SuspendSelf(art::Thread* self) {
   CHECK(self == art::Thread::Current());
+  if (!self->CanBeSuspendedByUserCode()) {
+    // TODO This is really undesirable. As far as I can tell this is can only come about because of
+    // class-loads in the jit-threads (through either VMObjectAlloc or the ClassLoad/ClassPrepare
+    // events that we send). It's unlikely that anyone would be suspending themselves there since
+    // it's almost guaranteed to cause a deadlock but it is technically allowed. Ideally we'd want
+    // to put a CHECK here (or in the event-dispatch code) that we are only in this situation when
+    // sending the GC callbacks but the jit causing events means we cannot do this.
+    LOG(WARNING) << "Attempt to self-suspend on a thread without suspension enabled. Thread is "
+                 << *self;
+    return ERR(INTERNAL);
+  }
   {
     art::MutexLock mu(self, *art::Locks::user_code_suspension_lock_);
     art::MutexLock thread_list_mu(self, *art::Locks::thread_suspend_count_lock_);
@@ -923,7 +963,6 @@
     return ERR(NULL_POINTER);
   }
   art::Thread* self = art::Thread::Current();
-  art::Thread* target;
   // Retry until we know we won't get suspended by user code while resuming something.
   do {
     SuspendCheck(self);
@@ -934,36 +973,37 @@
       continue;
     }
     // From now on we know we cannot get suspended by user-code.
-    {
-      // NB This does a SuspendCheck (during thread state change) so we need to make sure we don't
-      // have the 'suspend_lock' locked here.
-      art::ScopedObjectAccess soa(self);
-      art::MutexLock tll_mu(self, *art::Locks::thread_list_lock_);
-      jvmtiError err = ERR(INTERNAL);
-      if (!GetAliveNativeThread(thread, soa, &target, &err)) {
-        return err;
-      } else if (target == self) {
-        // We would have paused until we aren't suspended anymore due to the ScopedObjectAccess so
-        // we can just return THREAD_NOT_SUSPENDED. Unfortunately we cannot do any real DCHECKs
-        // about current state since it's all concurrent.
-        return ERR(THREAD_NOT_SUSPENDED);
-      }
-      // The JVMTI spec requires us to return THREAD_NOT_SUSPENDED if it is alive but we really
-      // cannot tell why resume failed.
-      {
-        art::MutexLock thread_suspend_count_mu(self, *art::Locks::thread_suspend_count_lock_);
-        if (target->GetUserCodeSuspendCount() == 0) {
-          return ERR(THREAD_NOT_SUSPENDED);
-        }
-      }
+    // NB This does a SuspendCheck (during thread state change) so we need to make sure we don't
+    // have the 'suspend_lock' locked here.
+    art::ScopedObjectAccess soa(self);
+    if (thread == nullptr) {
+      // The thread is the current thread.
+      return ERR(THREAD_NOT_SUSPENDED);
+    } else if (!soa.Env()->IsInstanceOf(thread, art::WellKnownClasses::java_lang_Thread)) {
+      // Not a thread object.
+      return ERR(INVALID_THREAD);
+    } else if (self->GetPeer() == soa.Decode<art::mirror::Object>(thread)) {
+      // The thread is the current thread.
+      return ERR(THREAD_NOT_SUSPENDED);
     }
-    // It is okay that we don't have a thread_list_lock here since we know that the thread cannot
-    // die since it is currently held suspended by a SuspendReason::kForUserCode suspend.
-    DCHECK(target != self);
-    if (!art::Runtime::Current()->GetThreadList()->Resume(target,
-                                                          art::SuspendReason::kForUserCode)) {
+    ScopedSuspendByPeer ssbp(thread);
+    if (ssbp.TimedOut()) {
+      // Unknown error. Couldn't suspend thread!
+      return ERR(INTERNAL);
+    } else if (ssbp.GetTargetThread() == nullptr) {
+      // Thread must not be alive.
+      return ERR(THREAD_NOT_ALIVE);
+    }
+    // We didn't time out and got a result. Check the thread is suspended by usercode, unsuspend it
+    // and return. It's already suspended internal so we don't need to do anything but decrement the
+    // count.
+    art::MutexLock thread_list_mu(self, *art::Locks::thread_suspend_count_lock_);
+    if (ssbp.GetTargetThread()->GetUserCodeSuspendCount() == 0) {
+      return ERR(THREAD_NOT_SUSPENDED);
+    } else if (!ssbp.GetTargetThread()->ModifySuspendCount(
+        self, -1, nullptr, art::SuspendReason::kForUserCode)) {
       // TODO Give a better error.
-      // This is most likely THREAD_NOT_SUSPENDED but we cannot really be sure.
+      // This should not really be possible and is probably some race.
       return ERR(INTERNAL);
     } else {
       return OK;
diff --git a/runtime/suspend_reason.h b/runtime/suspend_reason.h
index 289a1a4..4e75a4f 100644
--- a/runtime/suspend_reason.h
+++ b/runtime/suspend_reason.h
@@ -22,6 +22,8 @@
 namespace art {
 
 // The various reasons that we might be suspending a thread.
+// TODO Once kForDebugger is removed by removing the old debugger we should make the kForUserCode
+//      just a basic count for bookkeeping instead of linking it as directly with internal suspends.
 enum class SuspendReason {
   // Suspending for internal reasons (e.g. GC, stack trace, etc.).
   // TODO Split this into more descriptive sections.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index cd6c834..18dc0e8 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1228,6 +1228,34 @@
   LOG(FATAL) << ss.str();
 }
 
+void Thread::SetCanBeSuspendedByUserCode(bool can_be_suspended_by_user_code) {
+  CHECK_EQ(this, Thread::Current()) << "This function may only be called on the current thread. "
+                                    << *Thread::Current() << " tried to modify the suspendability "
+                                    << "of " << *this;
+  // NB This checks the new value! This ensures that we can only set can_be_suspended_by_user_code
+  // to false if !CanCallIntoJava().
+  DCHECK(!CanCallIntoJava() || can_be_suspended_by_user_code)
+      << "Threads able to call into java may not be marked as unsuspendable!";
+  if (can_be_suspended_by_user_code == CanBeSuspendedByUserCode()) {
+    // Don't need to do anything if nothing is changing.
+    return;
+  }
+  art::MutexLock mu(this, *Locks::user_code_suspension_lock_);
+  art::MutexLock thread_list_mu(this, *Locks::thread_suspend_count_lock_);
+
+  // We want to add the user-code suspend count if we are newly allowing user-code suspends and
+  // remove them if we are disabling them.
+  int adj = can_be_suspended_by_user_code ? GetUserCodeSuspendCount() : -GetUserCodeSuspendCount();
+  // Adjust the global suspend count appropriately. Use kInternal to not change the ForUserCode
+  // count.
+  if (adj != 0) {
+    bool suspend = ModifySuspendCountInternal(this, adj, nullptr, SuspendReason::kInternal);
+    CHECK(suspend) << this << " was unable to modify it's own suspend count!";
+  }
+  // Mark thread as accepting user-code suspensions.
+  can_be_suspended_by_user_code_ = can_be_suspended_by_user_code;
+}
+
 bool Thread::ModifySuspendCountInternal(Thread* self,
                                         int delta,
                                         AtomicInteger* suspend_barrier,
@@ -1249,6 +1277,17 @@
       LOG(ERROR) << "attempting to modify suspend count in an illegal way.";
       return false;
     }
+    DCHECK(this == self || this->IsSuspended())
+        << "Only self kForUserCode suspension on an unsuspended thread is allowed: " << this;
+    if (UNLIKELY(!CanBeSuspendedByUserCode())) {
+      VLOG(threads) << this << " is being requested to suspend for user code but that is disabled "
+                    << "the thread will not actually go to sleep.";
+      // Having the user_code_suspend_count still be around is useful but we don't need to actually
+      // do anything since we aren't going to 'really' suspend. Just adjust the
+      // user_code_suspend_count and return.
+      tls32_.user_code_suspend_count += delta;
+      return true;
+    }
   }
   if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {
     UnsafeLogFatalForSuspendCount(self, this);
@@ -2109,7 +2148,8 @@
 Thread::Thread(bool daemon)
     : tls32_(daemon),
       wait_monitor_(nullptr),
-      can_call_into_java_(true) {
+      can_call_into_java_(true),
+      can_be_suspended_by_user_code_(true) {
   wait_mutex_ = new Mutex("a thread wait mutex");
   wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
   tlsPtr_.instrumentation_stack = new std::deque<instrumentation::InstrumentationStackFrame>;
diff --git a/runtime/thread.h b/runtime/thread.h
index edc429b..d169a62 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -989,6 +989,17 @@
     --tls32_.disable_thread_flip_count;
   }
 
+  // Returns true if the thread is subject to user_code_suspensions.
+  bool CanBeSuspendedByUserCode() const {
+    return can_be_suspended_by_user_code_;
+  }
+
+  // Sets CanBeSuspenededByUserCode and adjusts the suspend-count as needed. This may only be called
+  // when running on the current thread. It is **absolutely required** that this be called only on
+  // the Thread::Current() thread.
+  void SetCanBeSuspendedByUserCode(bool can_be_suspended_by_user_code)
+      REQUIRES(!Locks::thread_suspend_count_lock_, !Locks::user_code_suspension_lock_);
+
   // Returns true if the thread is allowed to call into java.
   bool CanCallIntoJava() const {
     return can_call_into_java_;
@@ -1552,8 +1563,9 @@
     // critical section enter.
     uint32_t disable_thread_flip_count;
 
-    // How much of 'suspend_count_' is by request of user code, used to distinguish threads
-    // suspended by the runtime from those suspended by user code.
+    // If CanBeSuspendedByUserCode, how much of 'suspend_count_' is by request of user code, used to
+    // distinguish threads suspended by the runtime from those suspended by user code. Otherwise
+    // this is just a count of how many user-code suspends have been attempted (but were ignored).
     // This should have GUARDED_BY(Locks::user_code_suspension_lock_) but auto analysis cannot be
     // told that AssertHeld should be good enough.
     int user_code_suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
@@ -1772,6 +1784,10 @@
   // By default this is true.
   bool can_call_into_java_;
 
+  // True if the thread is subject to user-code suspension. By default this is true. This can only
+  // be false for threads where '!can_call_into_java_'.
+  bool can_be_suspended_by_user_code_;
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
   friend class Runtime;  // For CreatePeer.
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index b2be549..ba333f6 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -902,6 +902,8 @@
                                         bool request_suspension,
                                         SuspendReason reason,
                                         bool* timed_out) {
+  CHECK_NE(reason, SuspendReason::kForUserCode) << "Cannot suspend for user-code by peer. Must be "
+                                                << "done directly on the thread.";
   const uint64_t start_time = NanoTime();
   useconds_t sleep_us = kThreadSuspendInitialSleepUs;
   *timed_out = false;
diff --git a/runtime/thread_pool.cc b/runtime/thread_pool.cc
index bec1150..26ca190 100644
--- a/runtime/thread_pool.cc
+++ b/runtime/thread_pool.cc
@@ -100,8 +100,13 @@
   worker->thread_ = Thread::Current();
   // Thread pool workers cannot call into java.
   worker->thread_->SetCanCallIntoJava(false);
+  // Thread pool workers should not be getting paused by user-code.
+  worker->thread_->SetCanBeSuspendedByUserCode(false);
   // Do work until its time to shut down.
   worker->Run();
+  // Thread pool worker is finished. We want to allow suspension during shutdown.
+  worker->thread_->SetCanBeSuspendedByUserCode(true);
+  // Thread shuts down.
   runtime->DetachCurrentThread();
   return nullptr;
 }