Revert^2 "Wait for thread termination in PreZygoteFork()"

Revert of 02285d918d3ab1e95fe1849842661bdf194294c9

PS1 is identical to aosp/2782117

PS2 no longer erroneously passes null as self.

Bug: 299221079
Bug: 304183774
Test: Build and boot AOSP, TreeHugger
Change-Id: Iaf8c26208c71c6a5dcb46a2eb0472f572cf4099a
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 84b2236..b1c40ec 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -746,10 +746,77 @@
   DISALLOW_COPY_AND_ASSIGN(UpdateMethodsPreFirstForkVisitor);
 };
 
+// Spin until the kernel reports this process as single-threaded (all other threads fully reaped).
+static void WaitUntilSingleThreaded() {
+#if defined(__linux__)
+  // Read the num_threads field from /proc/self/stat using raw syscalls, avoiding higher-level IO
+  // libraries that may break atomicity of the read.
+  static constexpr size_t kNumTries = 1000;  // Combined with the 1ms sleep below: ~1s before we give up.
+  static constexpr size_t kNumThreadsIndex = 20;  // 1-based index of num_threads in /proc/self/stat; see proc(5).
+  for (size_t tries = 0; tries < kNumTries; ++tries) {
+    static constexpr int BUF_SIZE = 500;
+    char buf[BUF_SIZE];
+    int stat_fd = open("/proc/self/stat", O_RDONLY | O_CLOEXEC);
+    CHECK(stat_fd >= 0) << strerror(errno);
+    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(stat_fd, buf, BUF_SIZE));
+    CHECK(bytes_read >= 0) << strerror(errno);
+    int ret = close(stat_fd);
+    DCHECK(ret == 0) << strerror(errno);
+    ssize_t pos = 0;
+    while (pos < bytes_read && buf[pos++] != ')') {}  // Skip past the ')' closing the comm field.
+    ++pos;  // And the blank following it.
+    // We're now positioned at the beginning of the third field. Don't count blanks embedded in
+    // second (command) field.
+    size_t blanks_seen = 2;
+    while (pos < bytes_read && blanks_seen < kNumThreadsIndex - 1) {
+      if (buf[pos++] == ' ') {
+        ++blanks_seen;
+      }
+    }
+    CHECK(pos < bytes_read - 2);  // We must have found the field, with at least two readable chars.
+    // pos now indexes the first character of the num_threads field.
+    CHECK_EQ(buf[pos + 1], ' ');  // We never have more than single-digit threads here.
+    if (buf[pos] == '1') {
+      return;  // num_threads == 1; success.
+    }
+    usleep(1000);  // Give the kernel time to reap the exiting thread(s), then re-check.
+  }
+  LOG(FATAL) << "Failed to reach single-threaded state";
+#else  // Not Linux; shouldn't matter, but this has a high probability of working slowly.
+  usleep(20'000);
+#endif
+}
+
 void Runtime::PreZygoteFork() {
   if (GetJit() != nullptr) {
     GetJit()->PreZygoteFork();
   }
+  // All other threads have already been joined, but they may not have finished
+  // removing themselves from the thread list. Wait until the other threads have completely
+  // finished, and are no longer in the thread list.
+  // TODO: Since the threads Unregister() themselves before exiting, the first wait should be
+  // unnecessary. But since we're reading from a /proc entry that's concurrently changing, for
+  // now we play this as safe as possible.
+  ThreadList* tl = GetThreadList();
+  {
+    Thread* self = Thread::Current();
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    tl->WaitForUnregisterToComplete(self);
+    if (kIsDebugBuild) {
+      auto list = tl->GetList();
+      if (list.size() != 1) {
+        for (Thread* t : list) {
+          std::string name;
+          t->GetThreadName(name);
+          LOG(ERROR) << "Remaining pre-fork thread: " << name;
+        }
+      }
+    }
+    CHECK_EQ(tl->Size(), 1u);
+    // And then wait until the kernel thinks the threads are gone.
+    WaitUntilSingleThreaded();
+  }
+
   if (!heap_->HasZygoteSpace()) {
     Thread* self = Thread::Current();
     // This is the first fork. Update ArtMethods in the boot classpath now to
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 5c71324..326737b 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1409,6 +1409,14 @@
   }
 }
 
+void ThreadList::WaitForUnregisterToComplete(Thread* self) {
+  // Precondition: we hold thread_list_lock_ (REQUIRES in the header); re-check the count after each wakeup.
+  while (unregistering_count_ != 0) {
+    LOG(WARNING) << "Waiting for a thread to finish unregistering";
+    Locks::thread_exit_cond_->Wait(self);
+  }
+}
+
 void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) {
   Thread* const self = Thread::Current();
   std::vector<Thread*> threads_to_visit;
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 51fac4a..db06611 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -158,6 +158,10 @@
                !Locks::thread_list_lock_,
                !Locks::thread_suspend_count_lock_);
 
+  // Wait until there are no Unregister() requests in flight. Only makes sense when we know that
+  // no new calls can be made, e.g. because we're the last thread.
+  void WaitForUnregisterToComplete(Thread* self) REQUIRES(Locks::thread_list_lock_);
+
   void VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const
       REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -175,6 +179,8 @@
     return list_;
   }
 
+  size_t Size() REQUIRES(Locks::thread_list_lock_) { return list_.size(); }  // Number of registered threads.
+
   void DumpNativeStacks(std::ostream& os)
       REQUIRES(!Locks::thread_list_lock_);