ARM/ARM64: Introspection Baker RB for intrinsics.

Namely, Unsafe.getObject() and Unsafe.getObjectVolatile().

Test: Additional tests in 160-read-barrier-stress.
Test: m test-art-host-gtest
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing
Test: ART_HEAP_POISONING=true testrunner.py --target --optimizing
Bug: 36141117
Change-Id: I7305d75ab0ae8c9621843f9a382ad3a5e0aefa0b
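
For reference, the logic both architectures implement is the one spelled out in
the in-source comments below; here it is consolidated into one self-contained
C++ sketch (all names are illustrative stand-ins, not ART APIs):

    #include <cstdint>

    // Decision logic of the Baker read barrier for a reference field load.
    // `is_gc_marking` models the Marking Register (mr); `rb_state` models the
    // gray bit from the holder's lock word, which is loaded *before* the
    // reference load, with a fence or address dependency in between.
    struct Obj;
    Obj* Mark(Obj* ref) { return ref; }  // Placeholder for the runtime entry point.

    Obj* LoadRef(bool is_gc_marking, uint32_t rb_state, Obj** field) {
      if (!is_gc_marking) {
        return *field;  // Fast path: just the original reference load.
      }
      Obj* ref = *field;       // Original reference load (slow path / thunk).
      return (rb_state == 1u)  // ReadBarrier::GrayState() == 1.
          ? Mark(ref)          // Runtime entry point call.
          : ref;
    }
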
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 7aaa7bf..415c6bf 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -93,16 +93,6 @@
 // the offset explicitly.
 constexpr uint32_t kReferenceLoadMinFarOffset = 16 * KB;
 
-// Some instructions have special requirements for a temporary, for example
-// LoadClass/kBssEntry and LoadString/kBssEntry for Baker read barrier require
-// temp that's not an R0 (to avoid an extra move) and Baker read barrier field
-// loads with large offsets need a fixed register to limit the number of link-time
-// thunks we generate. For these and similar cases, we want to reserve a specific
-// register that's neither callee-save nor an argument register. We choose x15.
-inline Location FixedTempLocation() {
-  return Location::RegisterLocation(x15.GetCode());
-}
-
 inline Condition ARM64Condition(IfCondition cond) {
   switch (cond) {
     case kCondEQ: return eq;
@@ -678,170 +668,9 @@
 // Slow path loading `obj`'s lock word, loading a reference from
 // object `*(obj + offset + (index << scale_factor))` into `ref`, and
 // marking `ref` if `obj` is gray according to the lock word (Baker
-// read barrier). The field `obj.field` in the object `obj` holding
-// this reference does not get updated by this slow path after marking
-// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
-// below for that).
-//
-// This means that after the execution of this slow path, `ref` will
-// always be up-to-date, but `obj.field` may not; i.e., after the
-// flip, `ref` will be a to-space reference, but `obj.field` will
-// probably still be a from-space reference (unless it gets updated by
-// another thread, or if another thread installed another object
-// reference (different from `ref`) in `obj.field`).
-//
-// Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked or an empty
-// location; in the latter case, the read barrier marking runtime
-// entry point will be loaded by the slow path code itself.
-class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 {
- public:
-  LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction,
-                                                 Location ref,
-                                                 Register obj,
-                                                 uint32_t offset,
-                                                 Location index,
-                                                 size_t scale_factor,
-                                                 bool needs_null_check,
-                                                 bool use_load_acquire,
-                                                 Register temp,
-                                                 Location entrypoint = Location::NoLocation())
-      : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint),
-        obj_(obj),
-        offset_(offset),
-        index_(index),
-        scale_factor_(scale_factor),
-        needs_null_check_(needs_null_check),
-        use_load_acquire_(use_load_acquire),
-        temp_(temp) {
-    DCHECK(kEmitCompilerReadBarrier);
-    DCHECK(kUseBakerReadBarrier);
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return "LoadReferenceWithBakerReadBarrierSlowPathARM64";
-  }
-
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
-    DCHECK(locations->CanCall());
-    DCHECK(ref_.IsRegister()) << ref_;
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg();
-    DCHECK(obj_.IsW());
-    DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg());
-    DCHECK(instruction_->IsInstanceFieldGet() ||
-           instruction_->IsStaticFieldGet() ||
-           instruction_->IsArrayGet() ||
-           instruction_->IsArraySet() ||
-           instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
-           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking slow path: "
-        << instruction_->DebugName();
-    // The read barrier instrumentation of object ArrayGet
-    // instructions does not support the HIntermediateAddress
-    // instruction.
-    DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
-
-    // Temporary register `temp_`, used to store the lock word, must
-    // not be IP0 nor IP1, as we may use them to emit the reference
-    // load (in the call to GenerateRawReferenceLoad below), and we
-    // need the lock word to still be in `temp_` after the reference
-    // load.
-    DCHECK_NE(LocationFrom(temp_).reg(), IP0);
-    DCHECK_NE(LocationFrom(temp_).reg(), IP1);
-
-    __ Bind(GetEntryLabel());
-
-    // When using MaybeGenerateReadBarrierSlow, the read barrier call is
-    // inserted after the original load. However, in fast path based
-    // Baker's read barriers, we need to perform the load of
-    // mirror::Object::monitor_ *before* the original reference load.
-    // This load-load ordering is required by the read barrier.
-    // The slow path (for Baker's algorithm) should look like:
-    //
-    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-    //   if (is_gray) {
-    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-    //   }
-    //
-    // Note: the original implementation in ReadBarrier::Barrier is
-    // slightly more complex as it performs additional checks that we do
-    // not do here for performance reasons.
-
-    // /* int32_t */ monitor = obj->monitor_
-    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
-    __ Ldr(temp_, HeapOperand(obj_, monitor_offset));
-    if (needs_null_check_) {
-      codegen->MaybeRecordImplicitNullCheck(instruction_);
-    }
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
-
-    // Introduce a dependency on the lock_word including rb_state,
-    // to prevent load-load reordering, and without using
-    // a memory barrier (which would be more expensive).
-    // `obj` is unchanged by this operation, but its value now depends
-    // on `temp`.
-    __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32));
-
-    // The actual reference load.
-    // A possible implicit null check has already been handled above.
-    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
-    arm64_codegen->GenerateRawReferenceLoad(instruction_,
-                                            ref_,
-                                            obj_,
-                                            offset_,
-                                            index_,
-                                            scale_factor_,
-                                            /* needs_null_check */ false,
-                                            use_load_acquire_);
-
-    // Mark the object `ref` when `obj` is gray.
-    //
-    //   if (rb_state == ReadBarrier::GrayState())
-    //     ref = ReadBarrier::Mark(ref);
-    //
-    // Given the numeric representation, it's enough to check the low bit of the rb_state.
-    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-    __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel());
-    GenerateReadBarrierMarkRuntimeCall(codegen);
-
-    __ B(GetExitLabel());
-  }
-
- private:
-  // The register containing the object holding the marked object reference field.
-  Register obj_;
-  // The offset, index and scale factor to access the reference in `obj_`.
-  uint32_t offset_;
-  Location index_;
-  size_t scale_factor_;
-  // Is a null check required?
-  bool needs_null_check_;
-  // Should this reference load use Load-Acquire semantics?
-  bool use_load_acquire_;
-  // A temporary register used to hold the lock word of `obj_`.
-  Register temp_;
-
-  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM64);
-};
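
A note on the address-dependency trick in the slow path removed above (the
`Add(obj, obj, Operand(temp, LSR, 32))`): the lock word is 32 bits, so shifting
it right by 32 always contributes zero, leaving `obj` unchanged while making
the subsequent reference load address-dependent on the lock-word load. A
minimal sketch of the invariant (plain C++, illustrative only):

    #include <cstdint>

    // Always returns `obj` unchanged, but in the generated code the ADD makes
    // the load address data-dependent on `lock_word`, ordering the two loads
    // without a memory barrier.
    uint64_t MakeAddressDependent(uint64_t obj, uint32_t lock_word) {
      return obj + (static_cast<uint64_t>(lock_word) >> 32);  // obj + 0.
    }
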
-
-// Slow path loading `obj`'s lock word, loading a reference from
-// object `*(obj + offset + (index << scale_factor))` into `ref`, and
-// marking `ref` if `obj` is gray according to the lock word (Baker
 // read barrier). If needed, this slow path also atomically updates
 // the field `obj.field` in the object `obj` holding this reference
-// after marking (contrary to
-// LoadReferenceWithBakerReadBarrierSlowPathARM64 above, which never
-// tries to update `obj.field`).
+// after marking.
 //
 // This means that after the execution of this slow path, both `ref`
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
@@ -914,7 +743,7 @@
 
     __ Bind(GetEntryLabel());
 
-    // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARM64's:
+    // The implementation is:
     //
     //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
     //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -6288,9 +6117,8 @@
 
 void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                Location ref,
-                                                               Register obj,
-                                                               uint32_t offset,
-                                                               Location maybe_temp,
+                                                               vixl::aarch64::Register obj,
+                                                               const vixl::aarch64::MemOperand& src,
                                                                bool needs_null_check,
                                                                bool use_load_acquire) {
   DCHECK(kEmitCompilerReadBarrier);
@@ -6317,6 +6145,53 @@
   //     HeapReference<mirror::Object> reference = *(obj+offset);
   //   gray_return_address:
 
+  DCHECK(src.GetAddrMode() == vixl::aarch64::Offset);
+  DCHECK_ALIGNED(src.GetOffset(), sizeof(mirror::HeapReference<mirror::Object>));
+
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  DCHECK(temps.IsAvailable(ip0));
+  DCHECK(temps.IsAvailable(ip1));
+  temps.Exclude(ip0, ip1);
+  uint32_t custom_data = use_load_acquire
+      ? EncodeBakerReadBarrierAcquireData(src.GetBaseRegister().GetCode(), obj.GetCode())
+      : EncodeBakerReadBarrierFieldData(src.GetBaseRegister().GetCode(), obj.GetCode());
+
+  {
+    ExactAssemblyScope guard(GetVIXLAssembler(),
+                             (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
+    vixl::aarch64::Label return_address;
+    __ adr(lr, &return_address);
+    EmitBakerReadBarrierCbnz(custom_data);
+    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Field LDR must be 1 instruction (4B) before the return address label; "
+                  " 2 instructions (8B) for heap poisoning.");
+    Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
+    if (use_load_acquire) {
+      DCHECK_EQ(src.GetOffset(), 0);
+      __ ldar(ref_reg, src);
+    } else {
+      __ ldr(ref_reg, src);
+    }
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    // Unpoison the reference explicitly if needed. MaybeUnpoisonHeapReference() uses
+    // macro instructions disallowed in ExactAssemblyScope.
+    if (kPoisonHeapReferences) {
+      __ neg(ref_reg, Operand(ref_reg));
+    }
+    __ bind(&return_address);
+  }
+  MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__, /* temp_loc */ LocationFrom(ip1));
+}
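
The overload above intentionally emits a fixed-shape sequence (hence the
ExactAssemblyScope and the static_assert on the LDR's distance from the return
address), so that the link-time thunk selected by `custom_data` can locate and
decode the load. Schematically (register names illustrative, not the literal
output):

    // adr  lr, return_address      ; LR = label after the load sequence.
    // cbnz mr, <thunk>             ; Branch patched at link time (custom_data).
    // ldr  wREF, [xBASE, #offset]  ; Original load; LDAR for the volatile case.
    // neg  wREF, wREF              ; Emitted only with kPoisonHeapReferences.
    // return_address:
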
+
+void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                               Location ref,
+                                                               Register obj,
+                                                               uint32_t offset,
+                                                               Location maybe_temp,
+                                                               bool needs_null_check,
+                                                               bool use_load_acquire) {
   DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
   Register base = obj;
   if (use_load_acquire) {
@@ -6331,41 +6206,9 @@
     __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
     offset &= (kReferenceLoadMinFarOffset - 1u);
   }
-  UseScratchRegisterScope temps(GetVIXLAssembler());
-  DCHECK(temps.IsAvailable(ip0));
-  DCHECK(temps.IsAvailable(ip1));
-  temps.Exclude(ip0, ip1);
-  uint32_t custom_data = use_load_acquire
-      ? EncodeBakerReadBarrierAcquireData(base.GetCode(), obj.GetCode())
-      : EncodeBakerReadBarrierFieldData(base.GetCode(), obj.GetCode());
-
-  {
-    ExactAssemblyScope guard(GetVIXLAssembler(),
-                             (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
-    vixl::aarch64::Label return_address;
-    __ adr(lr, &return_address);
-    EmitBakerReadBarrierCbnz(custom_data);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 instruction (4B) before the return address label; "
-                  " 2 instructions (8B) for heap poisoning.");
-    Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
-    if (use_load_acquire) {
-      DCHECK_EQ(offset, 0u);
-      __ ldar(ref_reg, MemOperand(base.X()));
-    } else {
-      __ ldr(ref_reg, MemOperand(base.X(), offset));
-    }
-    if (needs_null_check) {
-      MaybeRecordImplicitNullCheck(instruction);
-    }
-    // Unpoison the reference explicitly if needed. MaybeUnpoisonHeapReference() uses
-    // macro instructions disallowed in ExactAssemblyScope.
-    if (kPoisonHeapReferences) {
-      __ neg(ref_reg, Operand(ref_reg));
-    }
-    __ bind(&return_address);
-  }
-  MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__, /* temp_loc */ LocationFrom(ip1));
+  MemOperand src(base.X(), offset);
+  GenerateFieldLoadWithBakerReadBarrier(
+      instruction, ref, obj, src, needs_null_check, use_load_acquire);
 }
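
A worked example of the offset split performed by the delegating overload
above (kReferenceLoadMinFarOffset is 16 KiB; the field offset 0x5120 is
hypothetical):

    #include <cstdint>

    constexpr uint32_t kReferenceLoadMinFarOffset = 16 * 1024;
    constexpr uint32_t kFieldOffset = 0x5120u;  // >= kReferenceLoadMinFarOffset.
    // base = obj + 0x4000; the LDR then uses the small remainder 0x1120.
    constexpr uint32_t kBaseAdd = kFieldOffset & ~(kReferenceLoadMinFarOffset - 1u);
    constexpr uint32_t kLdrOff = kFieldOffset & (kReferenceLoadMinFarOffset - 1u);
    static_assert(kBaseAdd == 0x4000u && kLdrOff == 0x1120u, "split as expected");

Because far-offset loads all go through the fixed temp from FixedTempLocation()
as the base register, they share link-time thunks instead of needing one per
arbitrary base register.
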
 
 void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(Location ref,
@@ -6435,65 +6278,6 @@
   MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__, /* temp_loc */ LocationFrom(ip1));
 }
 
-void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                                   Location ref,
-                                                                   Register obj,
-                                                                   uint32_t offset,
-                                                                   Location index,
-                                                                   size_t scale_factor,
-                                                                   Register temp,
-                                                                   bool needs_null_check,
-                                                                   bool use_load_acquire) {
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
-  // If we are emitting an array load, we should not be using a
-  // Load Acquire instruction.  In other words:
-  // `instruction->IsArrayGet()` => `!use_load_acquire`.
-  DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
-
-  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
-  // Marking Register) to decide whether we need to enter the slow
-  // path to mark the reference. Then, in the slow path, check the
-  // gray bit in the lock word of the reference's holder (`obj`) to
-  // decide whether to mark `ref` or not.
-  //
-  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //     if (is_gray) {
-  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-  //     }
-  //   } else {
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //   }
-
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will be loaded by the slow path code.
-  SlowPathCodeARM64* slow_path =
-      new (GetScopedAllocator()) LoadReferenceWithBakerReadBarrierSlowPathARM64(
-          instruction,
-          ref,
-          obj,
-          offset,
-          index,
-          scale_factor,
-          needs_null_check,
-          use_load_acquire,
-          temp);
-  AddSlowPath(slow_path);
-
-  __ Cbnz(mr, slow_path->GetEntryLabel());
-  // Fast path: the GC is not marking: just load the reference.
-  GenerateRawReferenceLoad(
-      instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire);
-  __ Bind(slow_path->GetExitLabel());
-  MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__);
-}
-
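
With the Unsafe intrinsics converted (see intrinsics_arm64.cc below), the
helper removed above had no remaining callers. The call-site change, quoted
from the intrinsics diff for contrast:

    // Before: per-site slow path, with offset/index/scale plumbing.
    //   codegen->GenerateReferenceLoadWithBakerReadBarrier(
    //       invoke, trg_loc, base, /* offset */ 0u, /* index */ offset_loc,
    //       /* scale_factor */ 0u, temp, /* needs_null_check */ false, is_volatile);
    // After: pre-compute the address, then take the introspection field path.
    //   __ Add(temp, base, offset.W());
    //   codegen->GenerateFieldLoadWithBakerReadBarrier(
    //       invoke, trg_loc, base, MemOperand(temp.X()),
    //       /* needs_null_check */ false, is_volatile);
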
 void CodeGeneratorARM64::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
                                                                   Location ref,
                                                                   Register obj,
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 6a358a4..f3f5079 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -92,6 +92,16 @@
         ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) ? mr : vixl::aarch64::NoCPUReg),
         vixl::aarch64::lr);
 
+// Some instructions have special requirements for a temporary. For example,
+// LoadClass/kBssEntry and LoadString/kBssEntry for Baker read barrier require
+// a temp that is not R0 (to avoid an extra move), and Baker read barrier field
+// loads with large offsets need a fixed register to limit the number of
+// link-time thunks we generate. For these and similar cases, we reserve a
+// specific register that's neither callee-save nor an argument register: x15.
+inline Location FixedTempLocation() {
+  return Location::RegisterLocation(vixl::aarch64::x15.GetCode());
+}
+
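
Moving FixedTempLocation() into the header lets intrinsic location builders
reserve the same fixed register. Usage as in the intrinsics_arm64.cc change
below:

    if (can_call && kUseBakerReadBarrier) {
      locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());
      locations->AddTemp(FixedTempLocation());  // Reserve x15 for the field load.
    }
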
 // Callee-save registers AAPCS64, without x19 (Thread Register) (nor
 // x20 (Marking Register) when emitting Baker read barriers).
 const vixl::aarch64::CPURegList callee_saved_core_registers(
@@ -663,6 +673,15 @@
                                ReadBarrierOption read_barrier_option);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
+  // Overload suitable for Unsafe.getObject/-Volatile() intrinsic.
+  void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
+                                             Location ref,
+                                             vixl::aarch64::Register obj,
+                                             const vixl::aarch64::MemOperand& src,
+                                             bool needs_null_check,
+                                             bool use_load_acquire);
+  // Fast path implementation of ReadBarrier::Barrier for a heap
+  // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                              Location ref,
                                              vixl::aarch64::Register obj,
@@ -678,21 +697,6 @@
                                              Location index,
                                              vixl::aarch64::Register temp,
                                              bool needs_null_check);
-  // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
-  // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
-  //
-  // Load the object reference located at the address
-  // `obj + offset + (index << scale_factor)`, held by object `obj`, into
-  // `ref`, and mark it if needed.
-  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                 Location ref,
-                                                 vixl::aarch64::Register obj,
-                                                 uint32_t offset,
-                                                 Location index,
-                                                 size_t scale_factor,
-                                                 vixl::aarch64::Register temp,
-                                                 bool needs_null_check,
-                                                 bool use_load_acquire);
 
  // Generate code checking whether the reference field at the
   // address `obj + field_offset`, held by object `obj`, needs to be
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 3e63c26..ce93afc 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -786,160 +786,9 @@
 // Slow path loading `obj`'s lock word, loading a reference from
 // object `*(obj + offset + (index << scale_factor))` into `ref`, and
 // marking `ref` if `obj` is gray according to the lock word (Baker
-// read barrier). The field `obj.field` in the object `obj` holding
-// this reference does not get updated by this slow path after marking
-// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
-// below for that).
-//
-// This means that after the execution of this slow path, `ref` will
-// always be up-to-date, but `obj.field` may not; i.e., after the
-// flip, `ref` will be a to-space reference, but `obj.field` will
-// probably still be a from-space reference (unless it gets updated by
-// another thread, or if another thread installed another object
-// reference (different from `ref`) in `obj.field`).
-//
-// Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked or an empty
-// location; in the latter case, the read barrier marking runtime
-// entry point will be loaded by the slow path code itself.
-class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL {
- public:
-  LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(HInstruction* instruction,
-                                                   Location ref,
-                                                   vixl32::Register obj,
-                                                   uint32_t offset,
-                                                   Location index,
-                                                   ScaleFactor scale_factor,
-                                                   bool needs_null_check,
-                                                   vixl32::Register temp,
-                                                   Location entrypoint = Location::NoLocation())
-      : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
-        obj_(obj),
-        offset_(offset),
-        index_(index),
-        scale_factor_(scale_factor),
-        needs_null_check_(needs_null_check),
-        temp_(temp) {
-    DCHECK(kEmitCompilerReadBarrier);
-    DCHECK(kUseBakerReadBarrier);
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return "LoadReferenceWithBakerReadBarrierSlowPathARMVIXL";
-  }
-
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
-    vixl32::Register ref_reg = RegisterFrom(ref_);
-    DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
-    DCHECK(instruction_->IsInstanceFieldGet() ||
-           instruction_->IsStaticFieldGet() ||
-           instruction_->IsArrayGet() ||
-           instruction_->IsArraySet() ||
-           instruction_->IsInstanceOf() ||
-           instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
-           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking slow path: "
-        << instruction_->DebugName();
-    // The read barrier instrumentation of object ArrayGet
-    // instructions does not support the HIntermediateAddress
-    // instruction.
-    DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
-
-    // Temporary register `temp_`, used to store the lock word, must
-    // not be IP, as we may use it to emit the reference load (in the
-    // call to GenerateRawReferenceLoad below), and we need the lock
-    // word to still be in `temp_` after the reference load.
-    DCHECK(!temp_.Is(ip));
-
-    __ Bind(GetEntryLabel());
-
-    // When using MaybeGenerateReadBarrierSlow, the read barrier call is
-    // inserted after the original load. However, in fast path based
-    // Baker's read barriers, we need to perform the load of
-    // mirror::Object::monitor_ *before* the original reference load.
-    // This load-load ordering is required by the read barrier.
-    // The slow path (for Baker's algorithm) should look like:
-    //
-    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-    //   if (is_gray) {
-    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-    //   }
-    //
-    // Note: the original implementation in ReadBarrier::Barrier is
-    // slightly more complex as it performs additional checks that we do
-    // not do here for performance reasons.
-
-    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
-
-    // /* int32_t */ monitor = obj->monitor_
-    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
-    arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset);
-    if (needs_null_check_) {
-      codegen->MaybeRecordImplicitNullCheck(instruction_);
-    }
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
-
-    // Introduce a dependency on the lock_word including the rb_state,
-    // which shall prevent load-load reordering without using
-    // a memory barrier (which would be more expensive).
-    // `obj` is unchanged by this operation, but its value now depends
-    // on `temp`.
-    __ Add(obj_, obj_, Operand(temp_, ShiftType::LSR, 32));
-
-    // The actual reference load.
-    // A possible implicit null check has already been handled above.
-    arm_codegen->GenerateRawReferenceLoad(
-        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
-
-    // Mark the object `ref` when `obj` is gray.
-    //
-    //   if (rb_state == ReadBarrier::GrayState())
-    //     ref = ReadBarrier::Mark(ref);
-    //
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with LSRS
-    // which can be a 16-bit instruction unlike the TST immediate.
-    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-    __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1);
-    __ B(cc, GetExitLabel());  // Carry flag is the last bit shifted out by LSRS.
-    GenerateReadBarrierMarkRuntimeCall(codegen);
-
-    __ B(GetExitLabel());
-  }
-
- private:
-  // The register containing the object holding the marked object reference field.
-  vixl32::Register obj_;
-  // The offset, index and scale factor to access the reference in `obj_`.
-  uint32_t offset_;
-  Location index_;
-  ScaleFactor scale_factor_;
-  // Is a null check required?
-  bool needs_null_check_;
-  // A temporary register used to hold the lock word of `obj_`.
-  vixl32::Register temp_;
-
-  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARMVIXL);
-};
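
The gray-bit test in the slow path removed above used a Thumb-friendly trick:
LSRS by (kReadBarrierStateShift + 1) shifts the rb_state bit out into the
carry flag, so a 16-bit flag-setting shift plus B(cc) replaces a TST with an
immediate. An equivalent C++ predicate (sketch; the shift value comes from
LockWord and is not restated here):

    #include <cstdint>

    // True iff the lock word's read barrier state bit is set (object is gray).
    // Mirrors `lsrs tmp, lock_word, #(shift + 1)` + a branch on the carry flag.
    bool IsGray(uint32_t lock_word, uint32_t read_barrier_state_shift) {
      return ((lock_word >> read_barrier_state_shift) & 1u) != 0u;
    }
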
-
-// Slow path loading `obj`'s lock word, loading a reference from
-// object `*(obj + offset + (index << scale_factor))` into `ref`, and
-// marking `ref` if `obj` is gray according to the lock word (Baker
 // read barrier). If needed, this slow path also atomically updates
 // the field `obj.field` in the object `obj` holding this reference
-// after marking (contrary to
-// LoadReferenceWithBakerReadBarrierSlowPathARMVIXL above, which never
-// tries to update `obj.field`).
+// after marking.
 //
 // This means that after the execution of this slow path, both `ref`
 // and `obj.field` will be up-to-date; i.e., after the flip, both will
@@ -1006,7 +855,7 @@
 
     __ Bind(GetEntryLabel());
 
-    // The implementation is similar to LoadReferenceWithBakerReadBarrierSlowPathARMVIXL's:
+    // The implementation is:
     //
     //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
     //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -8796,8 +8645,7 @@
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  vixl32::Register obj,
-                                                                 uint32_t offset,
-                                                                 Location temp,
+                                                                 const vixl32::MemOperand& src,
                                                                  bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -8823,23 +8671,15 @@
   //     HeapReference<mirror::Object> reference = *(obj+offset);
   //   gray_return_address:
 
-  DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+  DCHECK(src.GetAddrMode() == vixl32::Offset);
+  DCHECK_ALIGNED(src.GetOffsetImmediate(), sizeof(mirror::HeapReference<mirror::Object>));
   vixl32::Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
-  bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
-  vixl32::Register base = obj;
-  if (offset >= kReferenceLoadMinFarOffset) {
-    base = RegisterFrom(temp);
-    static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
-    __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
-    offset &= (kReferenceLoadMinFarOffset - 1u);
-    // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
-    // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
-    // increase the overall code size when taking the generated thunks into account.
-    DCHECK(!narrow);
-  }
+  bool narrow = CanEmitNarrowLdr(ref_reg, src.GetBaseRegister(), src.GetOffsetImmediate());
+
   UseScratchRegisterScope temps(GetVIXLAssembler());
   temps.Exclude(ip);
-  uint32_t custom_data = EncodeBakerReadBarrierFieldData(base.GetCode(), obj.GetCode(), narrow);
+  uint32_t custom_data =
+      EncodeBakerReadBarrierFieldData(src.GetBaseRegister().GetCode(), obj.GetCode(), narrow);
 
   {
     vixl::EmissionCheckScope guard(
@@ -8850,7 +8690,7 @@
     __ cmp(mr, Operand(0));
     EmitBakerReadBarrierBne(custom_data);
     ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
-    __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset));
+    __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, src);
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
@@ -8871,6 +8711,24 @@
   MaybeGenerateMarkingRegisterCheck(/* code */ 20, /* temp_loc */ LocationFrom(ip));
 }
 
+void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                                 Location ref,
+                                                                 vixl32::Register obj,
+                                                                 uint32_t offset,
+                                                                 Location temp,
+                                                                 bool needs_null_check) {
+  DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+  vixl32::Register base = obj;
+  if (offset >= kReferenceLoadMinFarOffset) {
+    base = RegisterFrom(temp);
+    static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+    __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
+    offset &= (kReferenceLoadMinFarOffset - 1u);
+  }
+  GenerateFieldLoadWithBakerReadBarrier(
+      instruction, ref, obj, MemOperand(base, offset), needs_null_check);
+}
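
Note that the new overloads keep two registers on purpose: `obj` is the holder
whose lock word gets the gray-bit check, while the load address comes from
`src`, whose base may be a different register. The Unsafe intrinsic below
relies on exactly that (call shape quoted from the intrinsics diff):

    // Address = base + offset, materialized in a temp; `base` (the holder)
    // is still passed separately for the lock-word check.
    __ Add(RegisterFrom(temp), base, Operand(offset));
    codegen->GenerateFieldLoadWithBakerReadBarrier(
        invoke, trg_loc, /* obj */ base, MemOperand(RegisterFrom(temp), 0),
        /* needs_null_check */ false);
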
+
 void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(Location ref,
                                                                  vixl32::Register obj,
                                                                  uint32_t data_offset,
@@ -8938,53 +8796,6 @@
   MaybeGenerateMarkingRegisterCheck(/* code */ 21, /* temp_loc */ LocationFrom(ip));
 }
 
-void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                                     Location ref,
-                                                                     vixl32::Register obj,
-                                                                     uint32_t offset,
-                                                                     Location index,
-                                                                     ScaleFactor scale_factor,
-                                                                     Location temp,
-                                                                     bool needs_null_check) {
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
-
-  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
-  // Marking Register) to decide whether we need to enter the slow
-  // path to mark the reference. Then, in the slow path, check the
-  // gray bit in the lock word of the reference's holder (`obj`) to
-  // decide whether to mark `ref` or not.
-  //
-  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //     if (is_gray) {
-  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-  //     }
-  //   } else {
-  //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
-  //   }
-
-  vixl32::Register temp_reg = RegisterFrom(temp);
-
-  // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will be loaded by the slow path code.
-  SlowPathCodeARMVIXL* slow_path =
-      new (GetScopedAllocator()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(
-          instruction, ref, obj, offset, index, scale_factor, needs_null_check, temp_reg);
-  AddSlowPath(slow_path);
-
-  __ CompareAndBranchIfNonZero(mr, slow_path->GetEntryLabel());
-  // Fast path: the GC is not marking: just load the reference.
-  GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
-  __ Bind(slow_path->GetExitLabel());
-  MaybeGenerateMarkingRegisterCheck(/* code */ 22);
-}
-
 void CodeGeneratorARMVIXL::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
                                                                     Location ref,
                                                                     vixl32::Register obj,
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 0106236..2fd18ca 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -624,6 +624,14 @@
                                ReadBarrierOption read_barrier_option);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
+  // Overload suitable for Unsafe.getObject/-Volatile() intrinsic.
+  void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
+                                             Location ref,
+                                             vixl::aarch32::Register obj,
+                                             const vixl::aarch32::MemOperand& src,
+                                             bool needs_null_check);
+  // Fast path implementation of ReadBarrier::Barrier for a heap
+  // reference field load when Baker's read barriers are used.
   void GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                              Location ref,
                                              vixl::aarch32::Register obj,
@@ -638,20 +646,6 @@
                                              Location index,
                                              Location temp,
                                              bool needs_null_check);
-  // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
-  // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
-  //
-  // Load the object reference located at the address
-  // `obj + offset + (index << scale_factor)`, held by object `obj`, into
-  // `ref`, and mark it if needed.
-  void GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
-                                                 Location ref,
-                                                 vixl::aarch32::Register obj,
-                                                 uint32_t offset,
-                                                 Location index,
-                                                 ScaleFactor scale_factor,
-                                                 Location temp,
-                                                 bool needs_null_check);
 
  // Generate code checking whether the reference field at the
   // address `obj + field_offset`, held by object `obj`, needs to be
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 4b2bcc8..e95cc6e 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -745,15 +745,15 @@
   if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
     Register temp = WRegisterFrom(locations->GetTemp(0));
-    codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke,
-                                                       trg_loc,
-                                                       base,
-                                                       /* offset */ 0u,
-                                                       /* index */ offset_loc,
-                                                       /* scale_factor */ 0u,
-                                                       temp,
-                                                       /* needs_null_check */ false,
-                                                       is_volatile);
+    MacroAssembler* masm = codegen->GetVIXLAssembler();
+    // Piggy-back on the field load path using introspection for the Baker read barrier.
+    __ Add(temp, base, offset.W());  // Offset should not exceed 32 bits.
+    codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                   trg_loc,
+                                                   base,
+                                                   MemOperand(temp.X()),
+                                                   /* needs_null_check */ false,
+                                                   is_volatile);
   } else {
     // Other cases.
     MemOperand mem_op(base.X(), offset);
@@ -782,9 +782,9 @@
                                       kIntrinsified);
   if (can_call && kUseBakerReadBarrier) {
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    // We need a temporary register for the read barrier load in order to use
+    // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier().
+    locations->AddTemp(FixedTempLocation());
   }
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 2963308..b920750 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -638,8 +638,11 @@
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
           Location temp = locations->GetTemp(0);
-          codegen->GenerateReferenceLoadWithBakerReadBarrier(
-              invoke, trg_loc, base, 0U, offset_loc, TIMES_1, temp, /* needs_null_check */ false);
+          // Piggy-back on the field load path using introspection for the Baker read barrier.
+          __ Add(RegisterFrom(temp), base, Operand(offset));
+          MemOperand src(RegisterFrom(temp), 0);
+          codegen->GenerateFieldLoadWithBakerReadBarrier(
+              invoke, trg_loc, base, src, /* needs_null_check */ false);
           if (is_volatile) {
             __ Dmb(vixl32::ISH);
           }
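
The volatile handling differs by architecture: ARM64 folds acquire semantics
into the load itself (LDAR, hence the separate EncodeBakerReadBarrierAcquireData
encoding), while ARM32 issues a plain load followed by `Dmb(ISH)` as above. A
rough C++ analogue of the two mappings (illustrative; the real codegen does not
go through std::atomic):

    #include <atomic>
    #include <cstdint>

    uint32_t LoadAcquireLikeArm64(std::atomic<uint32_t>* f) {
      return f->load(std::memory_order_acquire);            // Typically ldar.
    }

    uint32_t LoadAcquireLikeArm32(std::atomic<uint32_t>* f) {
      uint32_t v = f->load(std::memory_order_relaxed);      // Plain ldr...
      std::atomic_thread_fence(std::memory_order_acquire);  // ...then dmb ish.
      return v;
    }
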
diff --git a/test/160-read-barrier-stress/src/Main.java b/test/160-read-barrier-stress/src/Main.java
index 7e130ce..04880c5 100644
--- a/test/160-read-barrier-stress/src/Main.java
+++ b/test/160-read-barrier-stress/src/Main.java
@@ -14,8 +14,11 @@
  * limitations under the License.
  */
 
+import java.lang.reflect.Field;
+import sun.misc.Unsafe;
+
 public class Main {
-    public static void main(String[] args) {
+    public static void main(String[] args) throws Exception {
         // Initialize local variables for comparison.
         Object f0000 = manyFields.testField0000;
         Object f1024 = manyFields.testField1024;
@@ -44,6 +47,16 @@
             testString2 = "testString2";
             testString3 = "testString3";
         }
+        // Initialize Unsafe.
+        Unsafe unsafe = getUnsafe();
+        long f0000Offset =
+            unsafe.objectFieldOffset(ManyFields.class.getField("testField0000"));
+        long f1024Offset =
+            unsafe.objectFieldOffset(ManyFields.class.getField("testField1024"));
+        long f4444Offset =
+            unsafe.objectFieldOffset(ManyFields.class.getField("testField4444"));
+        long f4999Offset =
+            unsafe.objectFieldOffset(ManyFields.class.getField("testField4999"));
 
         // Continually check reads from `manyFields` and `largeArray` while allocating
         // over 64MiB memory (with heap size limited to 16MiB), ensuring we run GC and
@@ -74,7 +87,17 @@
               assertSameObject(testString2, "testString2");
               assertSameObject(testString3, "testString3");
             }
-            // TODO: Stress GC roots (const-string/const-class, kBssEntry/kReferrersClass).
+            // TODO: Stress GC roots (const-class, kBssEntry/kReferrersClass).
+            // Test Unsafe.getObject().
+            assertSameObject(f0000, unsafe.getObject(mf, f0000Offset));
+            assertSameObject(f1024, unsafe.getObject(mf, f1024Offset));
+            assertSameObject(f4444, unsafe.getObject(mf, f4444Offset));
+            assertSameObject(f4999, unsafe.getObject(mf, f4999Offset));
+            // Test Unsafe.compareAndSwapObject().
+            assertEqual(false, unsafe.compareAndSwapObject(mf, f4444Offset, f1024, f4444));
+            assertEqual(true, unsafe.compareAndSwapObject(mf, f1024Offset, f1024, f4444));
+            assertEqual(true, unsafe.compareAndSwapObject(mf, f1024Offset, f4444, f1024));
+            assertEqual(false, unsafe.compareAndSwapObject(mf, f1024Offset, f4444, f1024));
         }
     }
 
@@ -84,6 +107,19 @@
         }
     }
 
+    public static void assertEqual(boolean expected, boolean actual) {
+        if (expected != actual) {
+            throw new Error("Expected " + expected + ", got " + actual);
+        }
+    }
+
+    public static Unsafe getUnsafe() throws Exception {
+        Class<?> unsafeClass = Class.forName("sun.misc.Unsafe");
+        Field f = unsafeClass.getDeclaredField("theUnsafe");
+        f.setAccessible(true);
+        return (Unsafe) f.get(null);
+    }
+
     public static void allocateAtLeast1KiB() {
         // Give GC more work by allocating Object arrays.
         memory[allocationIndex] = new Object[1024 / 4];