Implement ClassStatus::kVisiblyInitialized.

Previously, all class initialization checks involved a memory
barrier to ensure appropriate memory visibility. We change
that by introducing the kVisiblyInitialized status, which can
be checked without a memory barrier. Before we mark a class
as visibly initialized, we run a checkpoint on all threads
to ensure memory visibility. This is done in batches of up
to 32 classes to reduce the overhead.
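
The handshake itself is a counter rendezvous: the requesting
thread adds the number of threads that must pass the
checkpoint, each checkpoint run subtracts one, and whichever
adjustment brings the counter to zero knows that every thread
has synchronized and may publish the batch. A minimal
standalone sketch of that pattern (plain C++; illustrative
only, the real logic being AdjustThreadVisibilityCounter in
class_linker.cc below):

  #include <atomic>
  #include <cstdio>
  #include <thread>
  #include <vector>

  std::atomic<long> visibility_counter{0};

  void Adjust(long adjustment) {
    long old = visibility_counter.fetch_add(adjustment, std::memory_order_relaxed);
    if (old + adjustment == 0) {
      // Exactly one adjustment observes zero, whether the final
      // decrement or the requester's increment; that caller may
      // now mark the batched classes as visibly initialized.
      // (In ART, memory visibility comes from the checkpoint
      // mechanism itself, not from this counter.)
      std::puts("all threads checkpointed; publish the batch");
    }
  }

  int main() {
    constexpr long kThreads = 4;
    std::vector<std::thread> threads;
    for (long i = 0; i != kThreads; ++i) {
      threads.emplace_back([] { Adjust(-1); });  // Each thread's Run().
    }
    Adjust(kThreads);  // The requester's adjustment for all threads.
    for (std::thread& t : threads) { t.join(); }
    return 0;
  }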

Avoiding memory barriers in the compiled code reduces code
size and improves performance. This is also the first step
toward fixing the long-standing synchronization bug 18161648.
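
For reference, the barrier-free fast path relies on the status
encoding: the ClassStatus occupies the 4 most significant bits
of the 32-bit status field (the low bits hold SubtypeCheckBits)
and kVisiblyInitialized (15) is the only status with all four
bits set, so a plain load and compare suffices. A standalone
illustration of the checks emitted below (plain C++, mirroring
but not copied from the runtime code):

  #include <cassert>
  #include <cstdint>

  constexpr uint32_t kStatusLsbPosition = 28u;  // 32 - 4 status bits.
  constexpr uint32_t kInitialized = 14u;
  constexpr uint32_t kVisiblyInitialized = 15u;

  constexpr bool IsVisiblyInitialized(uint32_t status_field) {
    // No barrier needed. Arm64 emits Mvn(Operand(temp, ASR, 28)) + Cbnz:
    // the arithmetic shift replicates the top status bit, so the negated
    // result is zero only if all four status bits are set.
    return (status_field >> kStatusLsbPosition) == kVisiblyInitialized;
  }

  int main() {
    constexpr uint32_t kSubtypeCheckBits = 0x1234u;  // Arbitrary low bits.
    assert(!IsVisiblyInitialized((kInitialized << 28) | kSubtypeCheckBits));
    assert(IsVisiblyInitialized((kVisiblyInitialized << 28) | kSubtypeCheckBits));
    return 0;
  }

On x86 and x86-64 the check is a single cmpb against the most
significant status byte, and those architectures skip the
kInitialized state entirely (see MarkClassInitialized below)
since no fence was ever needed there.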

Prebuilt sizes for aosp_taimen-userdebug:
 - before:
   arm/boot*.oat: 19150696
   arm64/boot*.oat: 22574336
   oat/arm64/services.odex: 21929800
 - after:
   arm/boot*.oat: 19134508 (-16KiB)
   arm64/boot*.oat: 22553664 (-20KiB)
   oat/arm64/services.odex: 21888760 (-40KiB)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots
Test: run-gtests.sh -j4
Test: testrunner.py --target --optimizing
Test: Manually diff `m dump-oat-boot` output from before
      this CL with output after this CL (minus the codegen
      changes), using `sed` replacements for class status.
      Check that only checksums and the oatdump runtime
      values of DexCache.dexFile differ.
Bug: 18161648
Bug: 36692143
Change-Id: Ida10439d347e680a0abf4674546923374ffaa957
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index cf596c7..43d466b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1756,17 +1756,14 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp = temps.AcquireW();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  const size_t status_byte_offset =
-      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t visibly_initialized = enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
+  static_assert(visibly_initialized == MaxInt<uint32_t>(32u - status_lsb_position),
+                "kVisiblyInitialized must have all bits set");
 
-  // Even if the initialized flag is set, we need to ensure consistent memory ordering.
-  // TODO(vixl): Let the MacroAssembler handle MemOperand.
-  __ Add(temp, class_reg, status_byte_offset);
-  __ Ldarb(temp, HeapOperand(temp));
-  __ Cmp(temp, shifted_initialized_value);
-  __ B(lo, slow_path->GetEntryLabel());
+  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+  __ Ldr(temp, HeapOperand(class_reg, status_offset));
+  __ Mvn(temp, Operand(temp, ASR, status_lsb_position));  // Were all the bits of the status set?
+  __ Cbnz(temp, slow_path->GetEntryLabel());              // If not, go to slow path.
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 49e7695..b72a1a0 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -7182,17 +7182,13 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   vixl32::Register temp = temps.Acquire();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  const size_t status_byte_offset =
-      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << status_lsb_position;
 
-  GetAssembler()->LoadFromOffset(kLoadUnsignedByte, temp, class_reg, status_byte_offset);
-  __ Cmp(temp, shifted_initialized_value);
+  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+  GetAssembler()->LoadFromOffset(kLoadWord, temp, class_reg, status_offset);
+  __ Cmp(temp, shifted_visibly_initialized_value);
   __ B(lo, slow_path->GetEntryLabel());
-  // Even if the initialized flag is set, we may be in a situation where caches are not synced
-  // properly. Therefore, we do a memory fence.
-  __ Dmb(ISH);
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7f7e3a5..5159553 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6709,13 +6709,12 @@
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
   const size_t status_byte_offset =
       mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_initialized_value));
+  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_visibly_initialized_value));
   __ j(kBelow, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
-  // No need for memory fence, thanks to the X86 memory model.
 }
 
 void InstructionCodeGeneratorX86::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 8067b9c..8c8b5e6 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5837,13 +5837,12 @@
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
   const size_t status_byte_offset =
       mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_initialized_value));
+  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_visibly_initialized_value));
   __ j(kBelow, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
-  // No need for memory fence, thanks to the x86-64 memory model.
 }
 
 void InstructionCodeGeneratorX86_64::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
diff --git a/dex2oat/driver/compiler_driver.cc b/dex2oat/driver/compiler_driver.cc
index c0b2f3e..9eaf17d 100644
--- a/dex2oat/driver/compiler_driver.cc
+++ b/dex2oat/driver/compiler_driver.cc
@@ -1961,7 +1961,12 @@
 
       // Class has a meaningful status for the compiler now, record it.
       ClassReference ref(manager_->GetDexFile(), class_def_index);
-      manager_->GetCompiler()->RecordClassStatus(ref, klass->GetStatus());
+      ClassStatus status = klass->GetStatus();
+      if (status == ClassStatus::kInitialized) {
+        // Initialized classes shall be visibly initialized when loaded from the image.
+        status = ClassStatus::kVisiblyInitialized;
+      }
+      manager_->GetCompiler()->RecordClassStatus(ref, status);
 
       // It is *very* problematic if there are resolution errors in the boot classpath.
       //
@@ -2017,6 +2022,9 @@
                               : verifier::HardFailLogMode::kLogWarning;
   VerifyClassVisitor visitor(&context, log_level);
   context.ForAll(0, dex_file.NumClassDefs(), &visitor, thread_count);
+
+  // Make initialized classes visibly initialized.
+  class_linker->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
 }
 
 class SetVerifiedClassVisitor : public CompilationVisitor {
@@ -2285,6 +2293,10 @@
         soa.Self()->AssertNoPendingException();
       }
     }
+    if (old_status == ClassStatus::kInitialized) {
+      // Initialized classes shall be visibly initialized when loaded from the image.
+      old_status = ClassStatus::kVisiblyInitialized;
+    }
     // Record the final class status if necessary.
     ClassReference ref(&dex_file, klass->GetDexClassDefIndex());
     // Back up the status before doing initialization for static encoded fields,
@@ -2496,6 +2508,9 @@
   }
   InitializeClassVisitor visitor(&context);
   context.ForAll(0, dex_file.NumClassDefs(), &visitor, init_thread_count);
+
+  // Make initialized classes visibly initialized.
+  class_linker->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
 }
 
 class InitializeArrayClassesAndCreateConflictTablesVisitor : public ClassVisitor {
@@ -2759,7 +2774,7 @@
     case ClassStatus::kRetryVerificationAtRuntime:
     case ClassStatus::kVerified:
     case ClassStatus::kSuperclassValidated:
-    case ClassStatus::kInitialized:
+    case ClassStatus::kVisiblyInitialized:
       break;  // Expected states.
     default:
       LOG(FATAL) << "Unexpected class status for class "
diff --git a/dex2oat/driver/compiler_driver_test.cc b/dex2oat/driver/compiler_driver_test.cc
index 81262d3..8ec7003 100644
--- a/dex2oat/driver/compiler_driver_test.cc
+++ b/dex2oat/driver/compiler_driver_test.cc
@@ -354,7 +354,8 @@
     const ClassStatus expected_status = enum_cast<ClassStatus>(i);
     // Skip unsupported status that are not supposed to be ever recorded.
     if (expected_status == ClassStatus::kVerifyingAtRuntime ||
-        expected_status == ClassStatus::kInitializing) {
+        expected_status == ClassStatus::kInitializing ||
+        expected_status == ClassStatus::kInitialized) {
       continue;
     }
     compiler_driver_->RecordClassStatus(ref, expected_status);
diff --git a/dex2oat/linker/image_writer.cc b/dex2oat/linker/image_writer.cc
index 8a8ef4d..1f1bf33 100644
--- a/dex2oat/linker/image_writer.cc
+++ b/dex2oat/linker/image_writer.cc
@@ -822,7 +822,7 @@
       if (dirty_image_objects_ != nullptr &&
           dirty_image_objects_->find(klass->PrettyDescriptor()) != dirty_image_objects_->end()) {
         bin = Bin::kKnownDirty;
-      } else if (klass->GetStatus() == ClassStatus::kInitialized) {
+      } else if (klass->GetStatus() == ClassStatus::kVisiblyInitialized) {
         bin = Bin::kClassInitialized;
 
         // If the class's static fields are all final, put it into a separate bin
@@ -888,7 +888,8 @@
   }
   ObjPtr<mirror::Class> declaring_class = m->GetDeclaringClass();
   // Initialized is highly unlikely to dirty since there's no entry points to mutate.
-  return declaring_class == nullptr || declaring_class->GetStatus() != ClassStatus::kInitialized;
+  return declaring_class == nullptr ||
+         declaring_class->GetStatus() != ClassStatus::kVisiblyInitialized;
 }
 
 bool ImageWriter::IsImageBinSlotAssigned(mirror::Object* object) const {
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index fca40c5..731dfe7 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -20,6 +20,8 @@
 
 #include <algorithm>
 #include <deque>
+#include <forward_list>
 #include <iostream>
 #include <map>
 #include <memory>
+#include <optional>
@@ -35,11 +36,13 @@
 
 #include "art_field-inl.h"
 #include "art_method-inl.h"
+#include "barrier.h"
 #include "base/arena_allocator.h"
 #include "base/casts.h"
 #include "base/file_utils.h"
 #include "base/leb128.h"
 #include "base/logging.h"
+#include "base/mutex-inl.h"
 #include "base/os.h"
 #include "base/quasi_atomic.h"
 #include "base/scoped_arena_containers.h"
@@ -233,6 +236,189 @@
   }
 }
 
+// Callback responsible for making a batch of classes visibly initialized
+// after all threads have called it from a checkpoint, ensuring visibility.
+class ClassLinker::VisiblyInitializedCallback final
+    : public Closure, public IntrusiveForwardListNode<VisiblyInitializedCallback> {
+ public:
+  explicit VisiblyInitializedCallback(ClassLinker* class_linker)
+      : class_linker_(class_linker),
+        num_classes_(0u),
+        thread_visibility_counter_(0),
+        barriers_() {
+    std::fill_n(classes_, kMaxClasses, nullptr);
+  }
+
+  bool IsEmpty() const {
+    DCHECK_LE(num_classes_, kMaxClasses);
+    return num_classes_ == 0u;
+  }
+
+  bool IsFull() const {
+    DCHECK_LE(num_classes_, kMaxClasses);
+    return num_classes_ == kMaxClasses;
+  }
+
+  void AddClass(Thread* self, ObjPtr<mirror::Class> klass) REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_EQ(klass->GetStatus(), ClassStatus::kInitialized);
+    DCHECK(!IsFull());
+    classes_[num_classes_] = self->GetJniEnv()->GetVm()->AddWeakGlobalRef(self, klass);
+    ++num_classes_;
+  }
+
+  void AddBarrier(Barrier* barrier) {
+    barriers_.push_front(barrier);
+  }
+
+  std::forward_list<Barrier*> GetAndClearBarriers() {
+    std::forward_list<Barrier*> result;
+    result.swap(barriers_);
+    result.reverse();  // Return barriers in insertion order.
+    return result;
+  }
+
+  void MakeVisible(Thread* self) {
+    DCHECK_EQ(thread_visibility_counter_.load(std::memory_order_relaxed), 0);
+    size_t count = Runtime::Current()->GetThreadList()->RunCheckpoint(this);
+    AdjustThreadVisibilityCounter(self, count);
+  }
+
+  void Run(Thread* self) override {
+    self->ClearMakeVisiblyInitializedCounter();
+    AdjustThreadVisibilityCounter(self, -1);
+  }
+
+ private:
+  void AdjustThreadVisibilityCounter(Thread* self, ssize_t adjustment) {
+    ssize_t old = thread_visibility_counter_.fetch_add(adjustment, std::memory_order_relaxed);
+    if (old + adjustment == 0) {
+      // All threads passed the checkpoint. Mark classes as visibly initialized.
+      {
+        ScopedObjectAccess soa(self);
+        StackHandleScope<1u> hs(self);
+        MutableHandle<mirror::Class> klass = hs.NewHandle<mirror::Class>(nullptr);
+        JavaVMExt* vm = self->GetJniEnv()->GetVm();
+        for (size_t i = 0, num = num_classes_; i != num; ++i) {
+          klass.Assign(ObjPtr<mirror::Class>::DownCast(self->DecodeJObject(classes_[i])));
+          vm->DeleteWeakGlobalRef(self, classes_[i]);
+          if (klass != nullptr) {
+            ObjectLock<mirror::Class> lock(self, klass);
+            mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
+          }
+        }
+        num_classes_ = 0u;
+      }
+      class_linker_->VisiblyInitializedCallbackDone(self, this);
+    }
+  }
+
+  static constexpr size_t kMaxClasses = 32;
+
+  ClassLinker* const class_linker_;
+  size_t num_classes_;
+  jweak classes_[kMaxClasses];
+
+  // The thread visibility counter starts at 0 and it is incremented by the number of
+  // threads that need to run this callback (by the thread that requests the callback
+  // to be run) and decremented once for each `Run()` execution. When it reaches 0,
+  // whether after the increment or after a decrement, we know that `Run()` was executed
+  // for all threads and therefore we can mark the classes as visibly initialized.
+  std::atomic<ssize_t> thread_visibility_counter_;
+
+  // List of barriers to `Pass()` for threads that wait for the callback to complete.
+  std::forward_list<Barrier*> barriers_;
+};
+
+void ClassLinker::MakeInitializedClassesVisiblyInitialized(Thread* self, bool wait) {
+  if (kRuntimeISA == InstructionSet::kX86 || kRuntimeISA == InstructionSet::kX86_64) {
+    return;  // Nothing to do. Thanks to the x86 memory model, classes skip the initialized status.
+  }
+  std::optional<Barrier> maybe_barrier;  // Avoid constructing the Barrier for `wait == false`.
+  if (wait) {
+    maybe_barrier.emplace(0);
+  }
+  int wait_count = 0;
+  VisiblyInitializedCallback* callback = nullptr;
+  {
+    MutexLock lock(self, visibly_initialized_callback_lock_);
+    if (visibly_initialized_callback_ != nullptr && !visibly_initialized_callback_->IsEmpty()) {
+      callback = visibly_initialized_callback_.release();
+      running_visibly_initialized_callbacks_.push_front(*callback);
+    }
+    if (wait) {
+      DCHECK(maybe_barrier.has_value());
+      Barrier* barrier = std::addressof(*maybe_barrier);
+      for (VisiblyInitializedCallback& cb : running_visibly_initialized_callbacks_) {
+        cb.AddBarrier(barrier);
+        ++wait_count;
+      }
+    }
+  }
+  if (callback != nullptr) {
+    callback->MakeVisible(self);
+  }
+  if (wait_count != 0) {
+    DCHECK(maybe_barrier.has_value());
+    maybe_barrier->Increment(self, wait_count);
+  }
+}
+
+void ClassLinker::VisiblyInitializedCallbackDone(Thread* self,
+                                                 VisiblyInitializedCallback* callback) {
+  MutexLock lock(self, visibly_initialized_callback_lock_);
+  // Pass the barriers if requested.
+  for (Barrier* barrier : callback->GetAndClearBarriers()) {
+    barrier->Pass(self);
+  }
+  // Remove the callback from the list of running callbacks.
+  auto before = running_visibly_initialized_callbacks_.before_begin();
+  auto it = running_visibly_initialized_callbacks_.begin();
+  DCHECK(it != running_visibly_initialized_callbacks_.end());
+  while (std::addressof(*it) != callback) {
+    before = it;
+    ++it;
+    DCHECK(it != running_visibly_initialized_callbacks_.end());
+  }
+  running_visibly_initialized_callbacks_.erase_after(before);
+  // Reuse or destroy the callback object.
+  if (visibly_initialized_callback_ == nullptr) {
+    visibly_initialized_callback_.reset(callback);
+  } else {
+    delete callback;
+  }
+}
+
+ClassLinker::VisiblyInitializedCallback* ClassLinker::MarkClassInitialized(
+    Thread* self, Handle<mirror::Class> klass) {
+  if (kRuntimeISA == InstructionSet::kX86 || kRuntimeISA == InstructionSet::kX86_64) {
+    // Thanks to the x86 memory model, we do not need any memory fences and
+    // we can immediately mark the class as visibly initialized.
+    mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
+    return nullptr;
+  }
+  if (Runtime::Current()->IsActiveTransaction()) {
+    // Transactions are single-threaded, so we can mark the class as visibly initialized.
+    // (Otherwise we'd need to track the callback's entry in the transaction for rollback.)
+    mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
+    return nullptr;
+  }
+  mirror::Class::SetStatus(klass, ClassStatus::kInitialized, self);
+  MutexLock lock(self, visibly_initialized_callback_lock_);
+  if (visibly_initialized_callback_ == nullptr) {
+    visibly_initialized_callback_.reset(new VisiblyInitializedCallback(this));
+  }
+  DCHECK(!visibly_initialized_callback_->IsFull());
+  visibly_initialized_callback_->AddClass(self, klass.Get());
+
+  if (visibly_initialized_callback_->IsFull()) {
+    VisiblyInitializedCallback* callback = visibly_initialized_callback_.release();
+    running_visibly_initialized_callbacks_.push_front(*callback);
+    return callback;
+  } else {
+    return nullptr;
+  }
+}
+
 void ClassLinker::ThrowEarlierClassFailure(ObjPtr<mirror::Class> c,
                                            bool wrap_in_no_class_def,
                                            bool log) {
@@ -418,6 +604,8 @@
       quick_generic_jni_trampoline_(nullptr),
       quick_to_interpreter_bridge_trampoline_(nullptr),
       image_pointer_size_(kRuntimePointerSize),
+      visibly_initialized_callback_lock_("visibly initialized callback lock"),
+      visibly_initialized_callback_(nullptr),
       cha_(Runtime::Current()->IsAotCompiler() ? nullptr : new ClassHierarchyAnalysis()) {
   // For CHA disabled during Aot, see b/34193647.
 
@@ -2479,6 +2667,11 @@
     DeleteClassLoader(self, data, /*cleanup_cha=*/ false);
   }
   class_loaders_.clear();
+  while (!running_visibly_initialized_callbacks_.empty()) {
+    std::unique_ptr<VisiblyInitializedCallback> callback(
+        std::addressof(running_visibly_initialized_callbacks_.front()));
+    running_visibly_initialized_callbacks_.pop_front();
+  }
 }
 
 void ClassLinker::DeleteClassLoader(Thread* self, const ClassLoaderData& data, bool cleanup_cha) {
@@ -2646,7 +2839,9 @@
 
   array_class->SetAccessFlags(access_flags);
 
-  array_class->SetStatusForPrimitiveOrArray(ClassStatus::kInitialized);
+  // Array classes are fully initialized either during single-threaded startup,
+  // or from a pre-fence visitor, so visibly initialized.
+  array_class->SetStatusForPrimitiveOrArray(ClassStatus::kVisiblyInitialized);
 }
 
 void ClassLinker::FinishCoreArrayClassSetup(ClassRoot array_root) {
@@ -4159,7 +4354,8 @@
   // the kAccVerificationAttempted flag was added above, and there are no
   // methods that need the kAccSkipAccessChecks flag.
   DCHECK_EQ(primitive_class->NumMethods(), 0u);
-  primitive_class->SetStatusForPrimitiveOrArray(ClassStatus::kInitialized);
+  // Primitive classes are initialized during single-threaded startup, so visibly initialized.
+  primitive_class->SetStatusForPrimitiveOrArray(ClassStatus::kVisiblyInitialized);
   const char* descriptor = Primitive::Descriptor(type);
   ObjPtr<mirror::Class> existing = InsertClass(descriptor,
                                                primitive_class,
@@ -5003,11 +5199,16 @@
     // TODO: Avoid taking subtype_check_lock_ if SubtypeCheck for j.l.r.Proxy is already assigned.
   }
 
+  VisiblyInitializedCallback* callback = nullptr;
   {
     // Lock on klass is released. Lock new class object.
     ObjectLock<mirror::Class> initialization_lock(self, klass);
     EnsureSkipAccessChecksMethods(klass, image_pointer_size_);
-    mirror::Class::SetStatus(klass, ClassStatus::kInitialized, self);
+    // Conservatively go through the ClassStatus::kInitialized state.
+    callback = MarkClassInitialized(self, klass);
+  }
+  if (callback != nullptr) {
+    callback->MakeVisible(self);
   }
 
   // sanity checks
@@ -5421,6 +5622,7 @@
   self->AllowThreadSuspension();
   uint64_t t1 = NanoTime();
 
+  VisiblyInitializedCallback* callback = nullptr;
   bool success = true;
   {
     ObjectLock<mirror::Class> lock(self, klass);
@@ -5446,7 +5648,7 @@
       global_stats->class_init_time_ns += (t1 - t0 - t_sub);
       thread_stats->class_init_time_ns += (t1 - t0 - t_sub);
       // Set the class as initialized except if failed to initialize static fields.
-      mirror::Class::SetStatus(klass, ClassStatus::kInitialized, self);
+      callback = MarkClassInitialized(self, klass);
       if (VLOG_IS_ON(class_linker)) {
         std::string temp;
         LOG(INFO) << "Initialized class " << klass->GetDescriptor(&temp) << " from " <<
@@ -5456,6 +5658,9 @@
       FixupStaticTrampolines(klass.Get());
     }
   }
+  if (callback != nullptr) {
+    callback->MakeVisible(self);
+  }
   return success;
 }
 
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index a724685..dd9f56f 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -26,6 +26,8 @@
 #include <vector>
 
 #include "base/enums.h"
+#include "base/mutex.h"
+#include "base/intrusive_forward_list.h"
 #include "base/locks.h"
 #include "base/macros.h"
 #include "dex/class_accessor.h"
@@ -719,6 +721,8 @@
     return cha_.get();
   }
 
+  void MakeInitializedClassesVisiblyInitialized(Thread* self, bool wait);
+
   struct DexCacheData {
     // Construct an invalid data object.
     DexCacheData()
@@ -764,6 +768,7 @@
 
  private:
   class LinkInterfaceMethodsHelper;
+  class VisiblyInitializedCallback;
 
   struct ClassLoaderData {
     jweak weak_root;  // Weak root to enable class unloading.
@@ -771,6 +776,10 @@
     LinearAlloc* allocator;
   };
 
+  void VisiblyInitializedCallbackDone(Thread* self, VisiblyInitializedCallback* callback);
+  VisiblyInitializedCallback* MarkClassInitialized(Thread* self, Handle<mirror::Class> klass)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Ensures that the supertype of 'klass' ('supertype') is verified. Returns false and throws
   // appropriate exceptions if verification failed hard. Returns true for successful verification or
   // soft-failures.
@@ -1393,6 +1402,13 @@
   // Image pointer size.
   PointerSize image_pointer_size_;
 
+  // Classes to transition from ClassStatus::kInitialized to ClassStatus::kVisiblyInitialized.
+  Mutex visibly_initialized_callback_lock_;
+  std::unique_ptr<VisiblyInitializedCallback> visibly_initialized_callback_
+      GUARDED_BY(visibly_initialized_callback_lock_);
+  IntrusiveForwardList<VisiblyInitializedCallback> running_visibly_initialized_callbacks_
+      GUARDED_BY(visibly_initialized_callback_lock_);
+
   std::unique_ptr<ClassHierarchyAnalysis> cha_;
 
   class FindVirtualMethodHolderVisitor;
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index 0ffbc4a..4377d7e 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -92,7 +92,7 @@
     EXPECT_TRUE(primitive->GetSuperClass() == nullptr);
     EXPECT_FALSE(primitive->HasSuperClass());
     EXPECT_TRUE(primitive->GetClassLoader() == nullptr);
-    EXPECT_EQ(ClassStatus::kInitialized, primitive->GetStatus());
+    EXPECT_EQ(ClassStatus::kVisiblyInitialized, primitive->GetStatus());
     EXPECT_FALSE(primitive->IsErroneous());
     EXPECT_TRUE(primitive->IsLoaded());
     EXPECT_TRUE(primitive->IsResolved());
@@ -131,7 +131,8 @@
     EXPECT_TRUE(JavaLangObject->GetSuperClass() == nullptr);
     EXPECT_FALSE(JavaLangObject->HasSuperClass());
     EXPECT_TRUE(JavaLangObject->GetClassLoader() == nullptr);
-    EXPECT_EQ(ClassStatus::kInitialized, JavaLangObject->GetStatus());
+    class_linker_->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
+    EXPECT_EQ(ClassStatus::kVisiblyInitialized, JavaLangObject->GetStatus());
     EXPECT_FALSE(JavaLangObject->IsErroneous());
     EXPECT_TRUE(JavaLangObject->IsLoaded());
     EXPECT_TRUE(JavaLangObject->IsResolved());
@@ -207,7 +208,7 @@
     EXPECT_TRUE(array->HasSuperClass());
     ASSERT_TRUE(array->GetComponentType() != nullptr);
     ASSERT_GT(strlen(array->GetComponentType()->GetDescriptor(&temp)), 0U);
-    EXPECT_EQ(ClassStatus::kInitialized, array->GetStatus());
+    EXPECT_EQ(ClassStatus::kVisiblyInitialized, array->GetStatus());
     EXPECT_FALSE(array->IsErroneous());
     EXPECT_TRUE(array->IsLoaded());
     EXPECT_TRUE(array->IsResolved());
diff --git a/runtime/class_status.h b/runtime/class_status.h
index ada2863..de64ab7 100644
--- a/runtime/class_status.h
+++ b/runtime/class_status.h
@@ -87,7 +87,8 @@
   kSuperclassValidated = 12,  // Superclass validation part of init done.
   kInitializing = 13,  // Class init in progress.
   kInitialized = 14,  // Ready to go.
-  kLast = kInitialized
+  kVisiblyInitialized = 15,  // Initialized and visible to all threads.
+  kLast = kVisiblyInitialized
 };
 
 std::ostream& operator<<(std::ostream& os, const ClassStatus& rhs);
diff --git a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
index 838b5b5..0b286e3 100644
--- a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
@@ -140,6 +140,14 @@
   ScopedQuickEntrypointChecks sqec(self);
   DCHECK(klass != nullptr);
   ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+  // Compiled code checks for the visibly initialized status, so this entrypoint can be
+  // reached for a class that is initialized but not yet visibly initialized.
+  if (UNLIKELY(klass->IsInitialized())) {
+    if (self->IncrementMakeVisiblyInitializedCounter()) {
+      class_linker->MakeInitializedClassesVisiblyInitialized(self, /*wait=*/ false);
+    }
+    return klass;
+  }
   StackHandleScope<1> hs(self);
   Handle<mirror::Class> h_klass = hs.NewHandle(klass);
   bool success = class_linker->EnsureInitialized(
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 2078186..7919621 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -769,22 +769,31 @@
     BuildQuickShadowFrameVisitor shadow_frame_builder(sp, method->IsStatic(), shorty, shorty_len,
                                                       shadow_frame, first_arg_reg);
     shadow_frame_builder.VisitArguments();
-    const bool needs_initialization =
-        method->IsStatic() && !method->GetDeclaringClass()->IsInitialized();
     // Push a transition back into managed code onto the linked list in thread.
     self->PushManagedStackFragment(&fragment);
     self->PushShadowFrame(shadow_frame);
     self->EndAssertNoThreadSuspension(old_cause);
 
-    if (needs_initialization) {
-      // Ensure static method's class is initialized.
-      StackHandleScope<1> hs(self);
-      Handle<mirror::Class> h_class(hs.NewHandle(shadow_frame->GetMethod()->GetDeclaringClass()));
-      if (!Runtime::Current()->GetClassLinker()->EnsureInitialized(self, h_class, true, true)) {
-        DCHECK(Thread::Current()->IsExceptionPending())
-            << shadow_frame->GetMethod()->PrettyMethod();
-        self->PopManagedStackFragment(fragment);
-        return 0;
+    if (method->IsStatic()) {
+      ObjPtr<mirror::Class> declaring_class = method->GetDeclaringClass();
+      if (LIKELY(declaring_class->IsVisiblyInitialized())) {
+        // Visibly initialized, nothing to do.
+      } else if (!declaring_class->IsInitialized()) {
+        // Ensure static method's class is initialized.
+        StackHandleScope<1> hs(self);
+        Handle<mirror::Class> h_class(hs.NewHandle(shadow_frame->GetMethod()->GetDeclaringClass()));
+        if (!Runtime::Current()->GetClassLinker()->EnsureInitialized(self, h_class, true, true)) {
+          DCHECK(Thread::Current()->IsExceptionPending())
+              << shadow_frame->GetMethod()->PrettyMethod();
+          self->PopManagedStackFragment(fragment);
+          return 0;
+        }
+      } else {
+        // Initialized but not visibly initialized.
+        if (self->IncrementMakeVisiblyInitializedCounter()) {
+          Runtime::Current()->GetClassLinker()->MakeInitializedClassesVisiblyInitialized(
+              self, /*wait=*/ false);
+        }
       }
     }
 
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 422625d..846517b 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -201,7 +201,11 @@
   // Setting the object size alloc fast path needs to be after the status write so that if the
   // alloc path sees a valid object size, we would know that it's initialized as long as it has a
   // load-acquire/fake dependency.
-  if (new_status == ClassStatus::kInitialized && !h_this->IsVariableSize()) {
+  // TODO: Update the object size alloc fast path only for ClassStatus::kVisiblyInitialized
+  // and take advantage of this in allocation entrypoints. b/36692143
+  if (new_status >= ClassStatus::kInitialized &&
+      old_status < ClassStatus::kInitialized &&
+      !h_this->IsVariableSize()) {
     DCHECK_EQ(h_this->GetObjectSizeAllocFastPath(), std::numeric_limits<uint32_t>::max());
     // Finalizable objects must always go slow path.
     if (!h_this->IsFinalizable()) {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 7c4c1d5..960f49c 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -89,12 +89,16 @@
   static constexpr uint32_t kPrimitiveTypeSizeShiftShift = 16;
   static constexpr uint32_t kPrimitiveTypeMask = (1u << kPrimitiveTypeSizeShiftShift) - 1;
 
-  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           bool kWithSynchronizationBarrier = true>
   ClassStatus GetStatus() REQUIRES_SHARED(Locks::mutator_lock_) {
+    // Reading the field without a barrier is used exclusively for IsVisiblyInitialized().
+    int32_t field_value = kWithSynchronizationBarrier
+        ? GetField32Volatile<kVerifyFlags>(StatusOffset())
+        : GetField32<kVerifyFlags>(StatusOffset());
     // Avoid including "subtype_check_bits_and_status.h" to get the field.
     // The ClassStatus is always in the 4 most-significant bits of status_.
-    return enum_cast<ClassStatus>(
-        static_cast<uint32_t>(GetField32Volatile<kVerifyFlags>(StatusOffset())) >> (32 - 4));
+    return enum_cast<ClassStatus>(static_cast<uint32_t>(field_value) >> (32 - 4));
   }
 
   // This is static because 'this' may be moved by GC.
@@ -176,7 +180,15 @@
   // Returns true if the class is initialized.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool IsInitialized() REQUIRES_SHARED(Locks::mutator_lock_) {
-    return GetStatus<kVerifyFlags>() == ClassStatus::kInitialized;
+    return GetStatus<kVerifyFlags>() >= ClassStatus::kInitialized;
+  }
+
+  // Returns true if the class is visibly initialized.
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool IsVisiblyInitialized() REQUIRES_SHARED(Locks::mutator_lock_) {
+    // Note: Avoiding the synchronization barrier for the visibly initialized check.
+    ClassStatus status = GetStatus<kVerifyFlags, /*kWithSynchronizationBarrier=*/ false>();
+    return status == ClassStatus::kVisiblyInitialized;
   }
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index f1b14dc..91a3c45 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -2384,6 +2384,10 @@
 void Runtime::EnterTransactionMode() {
   DCHECK(IsAotCompiler());
   DCHECK(!IsActiveTransaction());
+  // Make initialized classes visibly initialized now. If that happened during the transaction
+  // and then the transaction was aborted, we would roll back the status update but not the
+  // ClassLinker's bookkeeping structures, so these classes would never be visibly initialized.
+  GetClassLinker()->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
   preinitialization_transactions_.push_back(std::make_unique<Transaction>());
 }
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 98aaa92..dbb811b 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1229,6 +1229,15 @@
     return tls32_.force_interpreter_count != 0;
   }
 
+  bool IncrementMakeVisiblyInitializedCounter() {
+    tls32_.make_visibly_initialized_counter += 1u;
+    return tls32_.make_visibly_initialized_counter == kMakeVisiblyInitializedCounterTriggerCount;
+  }
+
+  void ClearMakeVisiblyInitializedCounter() {
+    tls32_.make_visibly_initialized_counter = 0u;
+  }
+
   void PushVerifier(verifier::MethodVerifier* verifier);
   void PopVerifier(verifier::MethodVerifier* verifier);
 
@@ -1473,6 +1482,8 @@
   // Stores the jit sensitive thread (which for now is the UI thread).
   static Thread* jit_sensitive_thread_;
 
+  static constexpr uint32_t kMakeVisiblyInitializedCounterTriggerCount = 1024;
+
   /***********************************************************************************************/
   // Thread local storage. Fields are grouped by size to enable 32 <-> 64 searching to account for
   // pointer size differences. To encourage shorter encoding, more frequently used values appear
@@ -1484,14 +1495,26 @@
     // to be 4-byte quantities.
     typedef uint32_t bool32_t;
 
-    explicit tls_32bit_sized_values(bool is_daemon) :
-      suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
-      daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
-      thread_exit_check_count(0), handling_signal_(false),
-      is_transitioning_to_runnable(false), ready_for_debug_invoke(false),
-      debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true),
-      disable_thread_flip_count(0), user_code_suspend_count(0), force_interpreter_count(0) {
-    }
+    explicit tls_32bit_sized_values(bool is_daemon)
+        : suspend_count(0),
+          debug_suspend_count(0),
+          thin_lock_thread_id(0),
+          tid(0),
+          daemon(is_daemon),
+          throwing_OutOfMemoryError(false),
+          no_thread_suspension(0),
+          thread_exit_check_count(0),
+          handling_signal_(false),
+          is_transitioning_to_runnable(false),
+          ready_for_debug_invoke(false),
+          debug_method_entry_(false),
+          is_gc_marking(false),
+          weak_ref_access_enabled(true),
+          disable_thread_flip_count(0),
+          user_code_suspend_count(0),
+          force_interpreter_count(0),
+          use_mterp(0),
+          make_visibly_initialized_counter(0) {}
 
     union StateAndFlags state_and_flags;
     static_assert(sizeof(union StateAndFlags) == sizeof(int32_t),
@@ -1581,6 +1604,14 @@
     // True if everything is in the ideal state for fast interpretation.
     // False if we need to switch to the C++ interpreter to handle special cases.
     std::atomic<bool32_t> use_mterp;
+
+    // Counter for calls to initialize a class that's initialized but not visibly initialized.
+    // When this reaches kMakeVisiblyInitializedCounterTriggerCount, we call the runtime to
+    // make initialized classes visibly initialized. This is needed because we usually make
+    // classes visibly initialized in batches, but we do not want a class to remain initialized
+    // but not visibly initialized for a long time when no further class initializations come
+    // along to fill up the batch.
+    uint32_t make_visibly_initialized_counter;
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {