Revert "Use the "GC is marking" information in compiler read barriers (ARM, ARM64)."

This reverts commit 1372c9f40df1e47bf775f1466bbb96f472b6b9ed.

This change (along with https://android-review.googlesource.com/#/c/342429/)
creates null pointer dereferences.

Bug: 35780827
Bug: 29516974
Change-Id: I2a9c4d0ad8d2ab870c2e0ddbff32152933c77abe
(cherry picked from commit 35345a555bd7928582a7ffa6369b374b3ddc379d)
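
For context, this patch trades one Baker read barrier fast-path strategy
for another. The sketch below condenses the pseudocode from the comment
hunks in the diff; it is an outline of the two schemes, not generated code.

  Strategy removed by this revert ("GC is marking"): preload the
  per-register mark entrypoint; the runtime sets it to null while the
  GC is not marking, so a single null check selects the slow path.

    temp = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
    HeapReference<mirror::Object> ref = *src;  // Original reference load.
    if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
      ref = temp(ref);  // Runtime entry point call.
    }

  Strategy restored by this revert (lock-word check): load the object's
  lock word first and test its read barrier state bit.

    uint32_t rb_state = LockWord(obj->monitor_).ReadBarrierState();
    lfence;  // Load fence or artificial data dependency.
    HeapReference<Object> ref = *src;  // Original reference load.
    if (rb_state == ReadBarrier::GrayState()) {
      ref = ReadBarrier::Mark(ref);  // Runtime entrypoint slow path.
    }
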
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 710ca7a..7b84ef8 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -650,7 +650,7 @@
 //
 // If `entrypoint` is a valid location it is assumed to already be
 // holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
+// is for the GcRoot read barrier.
 class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM {
  public:
   ReadBarrierMarkSlowPathARM(HInstruction* instruction,
@@ -715,7 +715,6 @@
       arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
       __ blx(entrypoint_.AsRegister<Register>());
     } else {
-      // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
           CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
       // This runtime call does not require a stack map.
@@ -744,10 +743,6 @@
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
 // hold the same to-space reference (unless another thread installed
 // another object reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
 class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM {
  public:
   ReadBarrierMarkAndUpdateFieldSlowPathARM(HInstruction* instruction,
@@ -755,15 +750,13 @@
                                            Register obj,
                                            Location field_offset,
                                            Register temp1,
-                                           Register temp2,
-                                           Location entrypoint = Location::NoLocation())
+                                           Register temp2)
       : SlowPathCodeARM(instruction),
         ref_(ref),
         obj_(obj),
         field_offset_(field_offset),
         temp1_(temp1),
-        temp2_(temp2),
-        entrypoint_(entrypoint) {
+        temp2_(temp2) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -816,16 +809,10 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    if (entrypoint_.IsValid()) {
-      arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
-      __ blx(entrypoint_.AsRegister<Register>());
-    } else {
-      // Entrypoint is not already loaded, load from the thread.
-      int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
-      // This runtime call does not require a stack map.
-      arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
-    }
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
+    // This runtime call does not require a stack map.
+    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
 
     // If the new reference is different from the old reference,
     // update the field in the holder (`*(obj_ + field_offset_)`).
@@ -915,9 +902,6 @@
   const Register temp1_;
   const Register temp2_;
 
-  // The location of the entrypoint if already loaded.
-  const Location entrypoint_;
-
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM);
 };
 
@@ -7201,35 +7185,14 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used.
+      // Baker's read barriers are used:
       //
-      // Note that we do not actually check the value of
-      // `GetIsGcMarking()` to decide whether to mark the loaded GC
-      // root or not.  Instead, we load into `temp` the read barrier
-      // mark entry point corresponding to register `root`. If `temp`
-      // is null, it means that `GetIsGcMarking()` is false, and vice
-      // versa.
-      //
+      //   root = obj.field;
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-      //     // Slow path.
-      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+      //   if (temp != null) {
+      //     root = temp(root)
       //   }
 
-      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-      Location temp = Location::RegisterLocation(LR);
-      SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
-          instruction, root, /* entrypoint */ temp);
-      codegen_->AddSlowPath(slow_path);
-
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
-
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
       static_assert(
@@ -7240,6 +7203,21 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
+      // Slow path marking the GC root `root`.
+      Location temp = Location::RegisterLocation(LR);
+      SlowPathCodeARM* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
+              instruction,
+              root,
+              /*entrypoint*/ temp);
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
       // The entrypoint is null when the GC is not marking, this prevents one load compared to
       // checking GetIsGcMarking.
       __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
@@ -7310,79 +7288,51 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // After loading the reference from `obj.field` into `ref`, query
-  // `art::Thread::Current()->GetIsGcMarking()` to decide whether we
-  // need to enter the slow path to mark the reference. This
-  // optimistic strategy (we expect the GC to not be marking most of
-  // the time) does not check `obj`'s lock word (to see if it is a
-  // gray object or not), so may sometimes mark an already marked
-  // object.
+  // In slow path based read barriers, the read barrier call is
+  // inserted after the original load. However, in fast path based
+  // Baker's read barriers, we need to perform the load of
+  // mirror::Object::monitor_ *before* the original reference load.
+  // This load-load ordering is required by the read barrier.
+  // The fast path/slow path (for Baker's algorithm) should look like:
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp3` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp3` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //   if (temp3 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //   uint32_t rb_state = LockWord(obj->monitor_).ReadBarrierState();
+  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //   HeapReference<Object> ref = *src;  // Original reference load.
+  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //   if (is_gray) {
+  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
   //   }
+  //
+  // Note: the original implementation in ReadBarrier::Barrier is
+  // slightly more complex as it performs additional checks that we do
+  // not do here for performance reasons.
 
-  // TODO: This temp register is only necessary when
-  // `always_update_field` is true; make it optional (like `temp2`).
-  Register temp_reg = temp.AsRegister<Register>();
-
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp3`.
-  Location temp3 = Location::RegisterLocation(LR);
-  SlowPathCodeARM* slow_path;
-  if (always_update_field) {
-    DCHECK(temp2 != nullptr);
-    // ReadBarrierMarkAndUpdateFieldSlowPathARM only supports address
-    // of the form `obj + field_offset`, where `obj` is a register and
-    // `field_offset` is a register pair (of which only the lower half
-    // is used). Thus `offset` and `scale_factor` above are expected
-    // to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM(
-        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2, /* entrypoint */ temp3);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(
-        instruction, ref, /* entrypoint */ temp3);
-  }
-  AddSlowPath(slow_path);
-
-  // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset);
-  // The reference load.
-  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel());
-  __ Bind(slow_path->GetExitLabel());
-}
-
-void CodeGeneratorARM::GenerateRawReferenceLoad(HInstruction* instruction,
-                                                Location ref,
-                                                Register obj,
-                                                uint32_t offset,
-                                                Location index,
-                                                ScaleFactor scale_factor,
-                                                bool needs_null_check) {
   Register ref_reg = ref.AsRegister<Register>();
+  Register temp_reg = temp.AsRegister<Register>();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
+  // /* int32_t */ monitor = obj->monitor_
+  __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset);
+  if (needs_null_check) {
+    MaybeRecordImplicitNullCheck(instruction);
+  }
+  // /* LockWord */ lock_word = LockWord(monitor)
+  static_assert(sizeof(LockWord) == sizeof(int32_t),
+                "art::LockWord and int32_t have different sizes.");
+
+  // Introduce a dependency on the lock_word including the rb_state,
+  // which shall prevent load-load reordering without using
+  // a memory barrier (which would be more expensive).
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp_reg`.
+  __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32));
+
+  // The actual reference load.
   if (index.IsValid()) {
     // Load types involving an "index": ArrayGet,
     // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
     // intrinsics.
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
+    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
     if (index.IsConstant()) {
       size_t computed_offset =
           (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset;
@@ -7399,16 +7349,41 @@
       __ LoadFromOffset(kLoadWord, ref_reg, IP, offset);
     }
   } else {
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
+    // /* HeapReference<Object> */ ref = *(obj + offset)
     __ LoadFromOffset(kLoadWord, ref_reg, obj, offset);
   }
 
-  if (needs_null_check) {
-    MaybeRecordImplicitNullCheck(instruction);
-  }
-
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
+
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCodeARM* slow_path;
+  if (always_update_field) {
+    DCHECK(temp2 != nullptr);
+    // ReadBarrierMarkAndUpdateFieldSlowPathARM only supports address
+    // of the form `obj + field_offset`, where `obj` is a register and
+    // `field_offset` is a register pair (of which only the lower half
+    // is used). Thus `offset` and `scale_factor` above are expected
+    // to be null in this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM(
+        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref);
+  }
+  AddSlowPath(slow_path);
+
+  // if (rb_state == ReadBarrier::GrayState())
+  //   ref = ReadBarrier::Mark(ref);
+  // Given the numeric representation, it's enough to check the low bit of the
+  // rb_state. We do that by shifting the bit out of the lock word with LSRS
+  // which can be a 16-bit instruction unlike the TST immediate.
+  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
+  __ b(slow_path->GetEntryLabel(), CS);  // Carry flag is the last bit shifted out by LSRS.
+  __ Bind(slow_path->GetExitLabel());
 }
 
 void CodeGeneratorARM::GenerateReadBarrierSlow(HInstruction* instruction,
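
A note on the Lsrs + b(..., CS) pair restored above: shifting the lock
word right by kReadBarrierStateShift + 1 pushes the read barrier state
bit out last, so it lands in the carry flag, letting a 16-bit LSRS
replace a TST with an immediate. A minimal C++ sketch of the predicate
this computes (the helper name is hypothetical):

  // Equivalent of the LSRS/B.CS sequence: test the single read barrier
  // state bit of the lock word; LSRS #(shift + 1) leaves bit `shift`
  // in the carry flag, so branching on CS branches when it is set.
  inline bool LockWordIsGray(uint32_t lock_word, uint32_t rb_state_shift) {
    return ((lock_word >> rb_state_shift) & 1u) != 0u;
  }
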
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 1f68777..df2dbc7 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -520,6 +520,6 @@
                                              Location index,
                                              Location temp,
                                              bool needs_null_check);
-  // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
-  // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
+  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
+  // and GenerateArrayLoadWithBakerReadBarrier.
   //
@@ -542,15 +542,6 @@
                                                  bool always_update_field = false,
                                                  Register* temp2 = nullptr);
 
-  // Generate a heap reference load (with no read barrier).
-  void GenerateRawReferenceLoad(HInstruction* instruction,
-                                Location ref,
-                                Register obj,
-                                uint32_t offset,
-                                Location index,
-                                ScaleFactor scale_factor,
-                                bool needs_null_check);
-
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 5bdaac2..edccbd4 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -647,7 +647,7 @@
 //
 // If `entrypoint` is a valid location it is assumed to already be
 // holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
+// is for the GcRoot read barrier.
 class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 {
  public:
   ReadBarrierMarkSlowPathARM64(HInstruction* instruction,
@@ -743,24 +743,18 @@
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
 // hold the same to-space reference (unless another thread installed
 // another object reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
 class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 {
  public:
   ReadBarrierMarkAndUpdateFieldSlowPathARM64(HInstruction* instruction,
                                              Location ref,
                                              Register obj,
                                              Location field_offset,
-                                             Register temp,
-                                             Location entrypoint = Location::NoLocation())
+                                             Register temp)
       : SlowPathCodeARM64(instruction),
         ref_(ref),
         obj_(obj),
         field_offset_(field_offset),
-        temp_(temp),
-        entrypoint_(entrypoint) {
+        temp_(temp) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -816,16 +810,10 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    if (entrypoint_.IsValid()) {
-      arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
-      __ Blr(XRegisterFrom(entrypoint_));
-    } else {
-      // Entrypoint is not already loaded, load from the thread.
-      int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
-      // This runtime call does not require a stack map.
-      arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
-    }
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
+    // This runtime call does not require a stack map.
+    arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
 
     // If the new reference is different from the old reference,
     // update the field in the holder (`*(obj_ + field_offset_)`).
@@ -908,9 +896,6 @@
 
   const Register temp_;
 
-  // The location of the entrypoint if it is already loaded.
-  const Location entrypoint_;
-
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM64);
 };
 
@@ -5629,35 +5614,14 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used.
+      // Baker's read barriers are used:
       //
-      // Note that we do not actually check the value of
-      // `GetIsGcMarking()` to decide whether to mark the loaded GC
-      // root or not.  Instead, we load into `temp` the read barrier
-      // mark entry point corresponding to register `root`. If `temp`
-      // is null, it means that `GetIsGcMarking()` is false, and vice
-      // versa.
-      //
+      //   root = obj.field;
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-      //     // Slow path.
-      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+      //   if (temp != null) {
+      //     root = temp(root)
       //   }
 
-      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-      Register temp = lr;
-      SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(
-          instruction, root, /* entrypoint */ LocationFrom(temp));
-      codegen_->AddSlowPath(slow_path);
-
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ Ldr(temp, MemOperand(tr, entry_point_offset));
-
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       if (fixup_label == nullptr) {
         __ Ldr(root_reg, MemOperand(obj, offset));
@@ -5672,6 +5636,20 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
+      Register temp = lr;
+
+      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
+      SlowPathCodeARM64* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction,
+                                                                    root,
+                                                                    LocationFrom(temp));
+      codegen_->AddSlowPath(slow_path);
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      __ Ldr(temp, MemOperand(tr, entry_point_offset));
       // The entrypoint is null when the GC is not marking, this prevents one load compared to
       // checking GetIsGcMarking.
       __ Cbnz(temp, slow_path->GetEntryLabel());
@@ -5773,77 +5751,54 @@
   // `instruction->IsArrayGet()` => `!use_load_acquire`.
   DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
 
-  // After loading the reference from `obj.field` into `ref`, query
-  // `art::Thread::Current()->GetIsGcMarking()` to decide whether we
-  // need to enter the slow path to mark the reference. This
-  // optimistic strategy (we expect the GC to not be marking most of
-  // the time) does not check `obj`'s lock word (to see if it is a
-  // gray object or not), so may sometimes mark an already marked
-  // object.
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope temps(masm);
+
+  // In slow path based read barriers, the read barrier call is
+  // inserted after the original load. However, in fast path based
+  // Baker's read barriers, we need to perform the load of
+  // mirror::Object::monitor_ *before* the original reference load.
+  // This load-load ordering is required by the read barrier.
+  // The fast path/slow path (for Baker's algorithm) should look like:
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp2` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp2` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //   uint32_t rb_state = LockWord(obj->monitor_).ReadBarrierState();
+  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //   HeapReference<Object> ref = *src;  // Original reference load.
+  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //   if (is_gray) {
+  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
   //   }
+  //
+  // Note: the original implementation in ReadBarrier::Barrier is
+  // slightly more complex as it performs additional checks that we do
+  // not do here for performance reasons.
 
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp2`.
-  Register temp2 = lr;
-  Location temp2_loc = LocationFrom(temp2);
-  SlowPathCodeARM64* slow_path;
-  if (always_update_field) {
-    // ReadBarrierMarkAndUpdateFieldSlowPathARM64 only supports
-    // address of the form `obj + field_offset`, where `obj` is a
-    // register and `field_offset` is a register. Thus `offset` and
-    // `scale_factor` above are expected to be null in this code path.
-    DCHECK_EQ(offset, 0u);
-    DCHECK_EQ(scale_factor, 0u);  /* "times 1" */
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM64(
-        instruction, ref, obj, /* field_offset */ index, temp, /* entrypoint */ temp2_loc);
-  } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(
-        instruction, ref, /* entrypoint */ temp2_loc);
-  }
-  AddSlowPath(slow_path);
-
-  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  __ Ldr(temp2, MemOperand(tr, entry_point_offset));
-  // The reference load.
-  GenerateRawReferenceLoad(
-      instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire);
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ Cbnz(temp2, slow_path->GetEntryLabel());
-  __ Bind(slow_path->GetExitLabel());
-}
-
-void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction,
-                                                  Location ref,
-                                                  Register obj,
-                                                  uint32_t offset,
-                                                  Location index,
-                                                  size_t scale_factor,
-                                                  bool needs_null_check,
-                                                  bool use_load_acquire) {
-  DCHECK(obj.IsW());
   Primitive::Type type = Primitive::kPrimNot;
   Register ref_reg = RegisterFrom(ref, type);
+  DCHECK(obj.IsW());
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // If needed, vixl::EmissionCheckScope guards are used to ensure
-  // that no pools are emitted between the load (macro) instruction
-  // and MaybeRecordImplicitNullCheck.
+  {
+    // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted.
+    EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
+    // /* int32_t */ monitor = obj->monitor_
+    __ Ldr(temp, HeapOperand(obj, monitor_offset));
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+  }
+  // /* LockWord */ lock_word = LockWord(monitor)
+  static_assert(sizeof(LockWord) == sizeof(int32_t),
+                "art::LockWord and int32_t have different sizes.");
 
+  // Introduce a dependency on the lock_word including rb_state,
+  // to prevent load-load reordering, and without using
+  // a memory barrier (which would be more expensive).
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp`.
+  __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32));
+
+  // The actual reference load.
   if (index.IsValid()) {
     // Load types involving an "index": ArrayGet,
     // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
@@ -5858,50 +5813,59 @@
           << instruction->AsInvoke()->GetIntrinsic();
       DCHECK_EQ(offset, 0u);
       DCHECK_EQ(scale_factor, 0u);
-      DCHECK_EQ(needs_null_check, false);
-      // /* HeapReference<mirror::Object> */ ref = *(obj + index)
+      DCHECK_EQ(needs_null_check, 0u);
+      // /* HeapReference<Object> */ ref = *(obj + index)
       MemOperand field = HeapOperand(obj, XRegisterFrom(index));
       LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false);
     } else {
-      // ArrayGet and UnsafeGetObject and UnsafeCASObject intrinsics cases.
-      // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
+      // ArrayGet and UnsafeGetObject intrinsics cases.
+      // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
       if (index.IsConstant()) {
         uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor);
-        EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
         Load(type, ref_reg, HeapOperand(obj, computed_offset));
-        if (needs_null_check) {
-          MaybeRecordImplicitNullCheck(instruction);
-        }
       } else {
-        UseScratchRegisterScope temps(GetVIXLAssembler());
-        Register temp = temps.AcquireW();
-        __ Add(temp, obj, offset);
-        {
-          EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-          Load(type, ref_reg, HeapOperand(temp, XRegisterFrom(index), LSL, scale_factor));
-          if (needs_null_check) {
-            MaybeRecordImplicitNullCheck(instruction);
-          }
-        }
+        Register temp3 = temps.AcquireW();
+        __ Add(temp3, obj, offset);
+        Load(type, ref_reg, HeapOperand(temp3, XRegisterFrom(index), LSL, scale_factor));
+        temps.Release(temp3);
       }
     }
   } else {
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
+    // /* HeapReference<Object> */ ref = *(obj + offset)
     MemOperand field = HeapOperand(obj, offset);
     if (use_load_acquire) {
-      // Implicit null checks are handled by CodeGeneratorARM64::LoadAcquire.
-      LoadAcquire(instruction, ref_reg, field, needs_null_check);
+      LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false);
     } else {
-      EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
       Load(type, ref_reg, field);
-      if (needs_null_check) {
-        MaybeRecordImplicitNullCheck(instruction);
-      }
     }
   }
 
   // Object* ref = ref_addr->AsMirrorPtr()
   GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCodeARM64* slow_path;
+  if (always_update_field) {
+    // ReadBarrierMarkAndUpdateFieldSlowPathARM64 only supports
+    // address of the form `obj + field_offset`, where `obj` is a
+    // register and `field_offset` is a register. Thus `offset` and
+    // `scale_factor` above are expected to be null in this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, 0u);  /* "times 1" */
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM64(
+        instruction, ref, obj, /* field_offset */ index, temp);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref);
+  }
+  AddSlowPath(slow_path);
+
+  // if (rb_state == ReadBarrier::GrayState())
+  //   ref = ReadBarrier::Mark(ref);
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+  __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 void CodeGeneratorARM64::GenerateReadBarrierSlow(HInstruction* instruction,
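
A note on the Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)) restored
above: temp was loaded as a 32-bit value, so its upper 32 bits are zero
and the shifted operand is always 0; the add leaves obj unchanged. Its
only purpose is to make the address of the following reference load
depend on the lock-word load, which the ARM memory model orders, so no
dmb is needed. In outline:

    lock_word = obj->monitor_;      // First load.
    obj = obj + (lock_word >> 32);  // Arithmetically obj + 0, but creates
                                    // an address dependency on lock_word.
    ref = *(obj + offset);          // Second load, ordered after the first.
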
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 231fb05..5faf29a 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -616,8 +616,8 @@
                                              Location index,
                                              vixl::aarch64::Register temp,
                                              bool needs_null_check);
-  // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
-  // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
+  // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
+  // and GenerateArrayLoadWithBakerReadBarrier.
   //
   // Load the object reference located at the address
   // `obj + offset + (index << scale_factor)`, held by object `obj`, into
@@ -636,16 +636,6 @@
                                                  bool use_load_acquire,
                                                  bool always_update_field = false);
 
-  // Generate a heap reference load (with no read barrier).
-  void GenerateRawReferenceLoad(HInstruction* instruction,
-                                Location ref,
-                                vixl::aarch64::Register obj,
-                                uint32_t offset,
-                                Location index,
-                                size_t scale_factor,
-                                bool needs_null_check,
-                                bool use_load_acquire);
-
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index c92a056..6bfbe4a 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -668,10 +668,6 @@
 // probably still be a from-space reference (unless it gets updated by
 // another thread, or if another thread installed another object
 // reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// when the decision to mark is based on whether the GC is marking.
 class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL {
  public:
   ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction,
@@ -736,7 +732,6 @@
       arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
       __ Blx(RegisterFrom(entrypoint_));
     } else {
-      // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
           CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
       // This runtime call does not require a stack map.
@@ -765,10 +760,6 @@
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
 // hold the same to-space reference (unless another thread installed
 // another object reference (different from `ref`) in `obj.field`).
-//
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// when the decision to mark is based on whether the GC is marking.
 class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL {
  public:
   ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction,
@@ -776,15 +767,13 @@
                                                vixl32::Register obj,
                                                Location field_offset,
                                                vixl32::Register temp1,
-                                               vixl32::Register temp2,
-                                               Location entrypoint = Location::NoLocation())
+                                               vixl32::Register temp2)
       : SlowPathCodeARMVIXL(instruction),
         ref_(ref),
         obj_(obj),
         field_offset_(field_offset),
         temp1_(temp1),
-        temp2_(temp2),
-        entrypoint_(entrypoint) {
+        temp2_(temp2) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -839,16 +828,10 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    if (entrypoint_.IsValid()) {
-      arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
-      __ Blx(RegisterFrom(entrypoint_));
-    } else {
-      // Entrypoint is not already loaded, load from the thread.
-      int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
-      // This runtime call does not require a stack map.
-      arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
-    }
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
+    // This runtime call does not require a stack map.
+    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
 
     // If the new reference is different from the old reference,
     // update the field in the holder (`*(obj_ + field_offset_)`).
@@ -945,9 +928,6 @@
   const vixl32::Register temp1_;
   const vixl32::Register temp2_;
 
-  // The location of the entrypoint if already loaded.
-  const Location entrypoint_;
-
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL);
 };
 
@@ -7283,35 +7263,14 @@
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
-      // Baker's read barrier are used.
+      // Baker's read barriers are used:
       //
-      // Note that we do not actually check the value of
-      // `GetIsGcMarking()` to decide whether to mark the loaded GC
-      // root or not.  Instead, we load into `temp` the read barrier
-      // mark entry point corresponding to register `root`. If `temp`
-      // is null, it means that `GetIsGcMarking()` is false, and vice
-      // versa.
-      //
+      //   root = obj.field;
       //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-      //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-      //     // Slow path.
-      //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+      //   if (temp != null) {
+      //     root = temp(root)
       //   }
 
-      // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-      Location temp = LocationFrom(lr);
-      SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-              instruction, root, /* entrypoint */ temp);
-      codegen_->AddSlowPath(slow_path);
-
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
-
       // /* GcRoot<mirror::Object> */ root = *(obj + offset)
       GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
       static_assert(
@@ -7322,6 +7281,21 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
+      // Slow path marking the GC root `root`.
+      Location temp = LocationFrom(lr);
+      SlowPathCodeARMVIXL* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
+              instruction,
+              root,
+              /*entrypoint*/ temp);
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
       // The entrypoint is null when the GC is not marking, this prevents one load compared to
       // checking GetIsGcMarking.
       __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
@@ -7392,33 +7366,77 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // After loading the reference from `obj.field` into `ref`, query
-  // `art::Thread::Current()->GetIsGcMarking()` to decide whether we
-  // need to enter the slow path to mark the reference. This
-  // optimistic strategy (we expect the GC to not be marking most of
-  // the time) does not check `obj`'s lock word (to see if it is a
-  // gray object or not), so may sometimes mark an already marked
-  // object.
+  // In slow path based read barriers, the read barrier call is
+  // inserted after the original load. However, in fast path based
+  // Baker's read barriers, we need to perform the load of
+  // mirror::Object::monitor_ *before* the original reference load.
+  // This load-load ordering is required by the read barrier.
+  // The fast path/slow path (for Baker's algorithm) should look like:
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp3` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp3` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //   if (temp3 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //   uint32_t rb_state = LockWord(obj->monitor_).ReadBarrierState();
+  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //   HeapReference<Object> ref = *src;  // Original reference load.
+  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //   if (is_gray) {
+  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
   //   }
+  //
+  // Note: the original implementation in ReadBarrier::Barrier is
+  // slightly more complex as it performs additional checks that we do
+  // not do here for performance reasons.
 
-  // TODO: This temp register is only necessary when
-  // `always_update_field` is true; make it optional (like `temp2`).
+  vixl32::Register ref_reg = RegisterFrom(ref);
   vixl32::Register temp_reg = RegisterFrom(temp);
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp3`.
-  Location temp3 = LocationFrom(lr);
+  // /* int32_t */ monitor = obj->monitor_
+  GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset);
+  if (needs_null_check) {
+    MaybeRecordImplicitNullCheck(instruction);
+  }
+  // /* LockWord */ lock_word = LockWord(monitor)
+  static_assert(sizeof(LockWord) == sizeof(int32_t),
+                "art::LockWord and int32_t have different sizes.");
+
+  // Introduce a dependency on the lock_word including the rb_state,
+  // which shall prevent load-load reordering without using
+  // a memory barrier (which would be more expensive).
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp_reg`.
+  __ Add(obj, obj, Operand(temp_reg, ShiftType::LSR, 32));
+
+  // The actual reference load.
+  if (index.IsValid()) {
+    // Load types involving an "index": ArrayGet,
+    // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
+    // intrinsics.
+    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+    if (index.IsConstant()) {
+      size_t computed_offset =
+          (Int32ConstantFrom(index) << scale_factor) + offset;
+      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
+    } else {
+      // Handle the special case of the
+      // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
+      // intrinsics, which use a register pair as index ("long
+      // offset"), of which only the low part contains data.
+      vixl32::Register index_reg = index.IsRegisterPair()
+          ? LowRegisterFrom(index)
+          : RegisterFrom(index);
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      const vixl32::Register temp3 = temps.Acquire();
+      __ Add(temp3, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
+      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp3, offset);
+    }
+  } else {
+    // /* HeapReference<Object> */ ref = *(obj + offset)
+    GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset);
+  }
+
+  // Object* ref = ref_addr->AsMirrorPtr()
+  GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+
+  // Slow path marking the object `ref` when it is gray.
   SlowPathCodeARMVIXL* slow_path;
   if (always_update_field) {
     DCHECK(temp2 != nullptr);
@@ -7430,86 +7448,24 @@
     DCHECK_EQ(offset, 0u);
     DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
     slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(
-        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2, /* entrypoint */ temp3);
+        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2);
   } else {
-    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-        instruction, ref, /* entrypoint */ temp3);
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, ref);
   }
   AddSlowPath(slow_path);
 
-  // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset);
-  // The reference load.
-  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel());
+  // if (rb_state == ReadBarrier::GrayState())
+  //   ref = ReadBarrier::Mark(ref);
+  // Given the numeric representation, it's enough to check the low bit of the
+  // rb_state. We do that by shifting the bit out of the lock word with LSRS
+  // which can be a 16-bit instruction unlike the TST immediate.
+  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
+  __ B(cs, slow_path->GetEntryLabel());  // Carry flag is the last bit shifted out by LSRS.
   __ Bind(slow_path->GetExitLabel());
 }
 
-void CodeGeneratorARMVIXL::GenerateRawReferenceLoad(HInstruction* instruction,
-                                                    Location ref,
-                                                    vixl::aarch32::Register obj,
-                                                    uint32_t offset,
-                                                    Location index,
-                                                    ScaleFactor scale_factor,
-                                                    bool needs_null_check) {
-  Primitive::Type type = Primitive::kPrimNot;
-  vixl32::Register ref_reg = RegisterFrom(ref, type);
-
-  // If needed, vixl::EmissionCheckScope guards are used to ensure
-  // that no pools are emitted between the load (macro) instruction
-  // and MaybeRecordImplicitNullCheck.
-
-  if (index.IsValid()) {
-    // Load types involving an "index": ArrayGet,
-    // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
-    // intrinsics.
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
-    if (index.IsConstant()) {
-      size_t computed_offset =
-          (Int32ConstantFrom(index) << scale_factor) + offset;
-      vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
-      if (needs_null_check) {
-        MaybeRecordImplicitNullCheck(instruction);
-      }
-    } else {
-      // Handle the special case of the
-      // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
-      // intrinsics, which use a register pair as index ("long
-      // offset"), of which only the low part contains data.
-      vixl32::Register index_reg = index.IsRegisterPair()
-          ? LowRegisterFrom(index)
-          : RegisterFrom(index);
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      vixl32::Register temp = temps.Acquire();
-      __ Add(temp, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
-      {
-        vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-        GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp, offset);
-        if (needs_null_check) {
-          MaybeRecordImplicitNullCheck(instruction);
-        }
-      }
-    }
-  } else {
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
-    vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-    GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset);
-    if (needs_null_check) {
-      MaybeRecordImplicitNullCheck(instruction);
-    }
-  }
-
-  // Object* ref = ref_addr->AsMirrorPtr()
-  GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
-}
-
 void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction,
                                                    Location out,
                                                    Location ref,
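
The pReadBarrierMarkReg ## reg entrypoints used throughout this patch
follow the custom calling convention noted in the slow-path comments
above: the reference is passed and returned in the same register
(rX <- ReadBarrierMarkRegX(rX)), so the caller saves nothing else. For
the GC-root barrier, the preloaded thread-local slot also doubles as the
marking flag. A sketch under that reading (the typedef and helper are
illustrative, not ART declarations):

  // One mark entrypoint per core register; the reference goes in and
  // comes back in that same register.
  using ReadBarrierMarkFn = mirror::Object* (*)(mirror::Object*);

  // For GC roots the preloaded slot doubles as the marking flag:
  // the runtime nulls it while the GC is not marking.
  inline mirror::Object* MaybeMarkGcRoot(mirror::Object* root,
                                         ReadBarrierMarkFn entrypoint) {
    return (entrypoint != nullptr) ? entrypoint(root) : root;
  }
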
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 2a636dbd..3f52c72 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -45,11 +45,6 @@
 namespace art {
 namespace arm {
 
-// This constant is used as an approximate margin when emission of veneer and literal pools
-// must be blocked.
-static constexpr int kMaxMacroInstructionSizeInBytes =
-    15 * vixl::aarch32::kMaxInstructionSizeInBytes;
-
 static const vixl::aarch32::Register kParameterCoreRegistersVIXL[] = {
     vixl::aarch32::r1,
     vixl::aarch32::r2,
@@ -630,15 +625,6 @@
                                                  bool always_update_field = false,
                                                  vixl::aarch32::Register* temp2 = nullptr);
 
-  // Generate a heap reference load (with no read barrier).
-  void GenerateRawReferenceLoad(HInstruction* instruction,
-                                Location ref,
-                                vixl::aarch32::Register obj,
-                                uint32_t offset,
-                                Location index,
-                                ScaleFactor scale_factor,
-                                bool needs_null_check);
-
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
   //
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 751623c..c262cf9 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -1946,8 +1946,6 @@
   }
 
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
     // The base destination address is computed later, as `temp2` is
     // used for intermediate computations.
 
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index f386422..86e5429 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2732,8 +2732,6 @@
     const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
 
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-      // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
       // SystemArrayCopy implementation for Baker read barriers (see
       // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
       //
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index cc4889b..70a3d38 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -2264,8 +2264,6 @@
   }
 
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // TODO: Also convert this intrinsic to the IsGcMarking strategy?
-
     // The base destination address is computed later, as `temp2` is
     // used for intermediate computations.
 
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index edc64f3..2f2565b 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -57,8 +57,7 @@
  *  |10|9|87654321098765432109876543210|
  *  |11|0| ForwardingAddress           |
  *
- * The `r` bit stores the read barrier state.
- * The `m` bit stores the mark state.
+ * The rb bits store the read barrier state.
  */
 class LockWord {
  public:
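
The rb bits documented above are consumed by the code generators as a
single gray/white value. A sketch of the extraction, with illustrative
constants (the real shift and width are LockWord's own definitions, not
these values):

  // Assumed bit position and width, for illustration only.
  constexpr uint32_t kReadBarrierStateShift = 28;
  constexpr uint32_t kReadBarrierStateMask = 1u;

  inline uint32_t ReadBarrierState(uint32_t lock_word) {
    return (lock_word >> kReadBarrierStateShift) & kReadBarrierStateMask;
  }
  // GrayState() == 1 means the reference may need marking;
  // WhiteState() == 0 means it does not (see the static_asserts in the
  // code generator hunks above).
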
diff --git a/runtime/type_lookup_table.h b/runtime/type_lookup_table.h
index fd68deb..3f6f76f 100644
--- a/runtime/type_lookup_table.h
+++ b/runtime/type_lookup_table.h
@@ -148,7 +148,7 @@
     return mask_;
   }
 
-  // Attempt to set an entry on its hash's slot. If there is already something there, return false.
+  // Attempt to set an entry on its hash's slot. If there is already something there, return false.
   // Otherwise return true.
   bool SetOnInitialPos(const Entry& entry, uint32_t hash);
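
For completeness, the behaviour documented by the SetOnInitialPos comment
amounts to a try-insert at the hash's home slot. A minimal sketch,
assuming an entries_ array indexed through mask_ (the emptiness test is
an assumption, not the real implementation):

  bool SetOnInitialPos(const Entry& entry, uint32_t hash) {
    const uint32_t pos = hash & mask_;  // Home slot for this hash.
    if (!entries_[pos].IsEmpty()) {
      return false;  // Slot already taken; the caller must probe elsewhere.
    }
    entries_[pos] = entry;
    return true;
  }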