Introduce a Marking Register in ARM code generation.

When generating code for ARM, maintain the value of
Thread::Current()->GetIsGcMarking() in register R8,
dubbed MR (Marking Register), and have read barriers
check that register directly, instead of loading and
checking a read barrier marking entrypoint.

Test: m test-art-target
Test: m test-art-target with tree built with ART_USE_READ_BARRIER=false
Test: m test-art-host-gtest
Test: ARM device boot test
Bug: 37707231
Change-Id: I30b44254460d0bbb9f1b2adc65eca52ca3de3f53
diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc
index 23106e5..b552a6e 100644
--- a/compiler/jni/jni_cfi_test.cc
+++ b/compiler/jni/jni_cfi_test.cc
@@ -110,8 +110,13 @@
   }
 
 #ifdef ART_ENABLE_CODEGEN_arm
+// Run the tests for ARM only with Baker read barriers, as the
+// expected generated code contains a Marking Register refresh
+// instruction.
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
 TEST_ISA(kThumb2)
 #endif
+#endif
 
 #ifdef ART_ENABLE_CODEGEN_arm64
 // Run the tests for ARM64 only with Baker read barriers, as the
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index acb8a57..d641fe4 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -1,7 +1,8 @@
 static constexpr uint8_t expected_asm_kThumb2[] = {
     0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x89, 0xB0, 0x00, 0x90,
     0x21, 0x91, 0x8D, 0xED, 0x22, 0x0A, 0x23, 0x92, 0x24, 0x93, 0x88, 0xB0,
-    0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x8D,
+    0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D,
+    0xD9, 0xF8, 0x34, 0x80, 0x70, 0x47,
 };
 static constexpr uint8_t expected_cfi_kThumb2[] = {
     0x44, 0x0E, 0x1C, 0x85, 0x07, 0x86, 0x06, 0x87, 0x05, 0x88, 0x04, 0x8A,
@@ -13,10 +14,10 @@
     0x4E, 0x0E, 0xA0, 0x01, 0x42, 0x0E, 0x80, 0x01, 0x0A, 0x42, 0x0E, 0x5C,
     0x44, 0x0E, 0x1C, 0x06, 0x50, 0x06, 0x51, 0x06, 0x52, 0x06, 0x53, 0x06,
     0x54, 0x06, 0x55, 0x06, 0x56, 0x06, 0x57, 0x06, 0x58, 0x06, 0x59, 0x06,
-    0x5A, 0x06, 0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x44,
+    0x5A, 0x06, 0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x4A,
     0x0B, 0x0E, 0x80, 0x01,
 };
-// 0x00000000: push {r5, r6, r7, r8, r10, r11, lr}
+// 0x00000000: push {r5,r6,r7,r8,r10,r11,lr}
 // 0x00000004: .cfi_def_cfa_offset: 28
 // 0x00000004: .cfi_offset: r5 at cfa-28
 // 0x00000004: .cfi_offset: r6 at cfa-24
@@ -25,7 +26,7 @@
 // 0x00000004: .cfi_offset: r10 at cfa-12
 // 0x00000004: .cfi_offset: r11 at cfa-8
 // 0x00000004: .cfi_offset: r14 at cfa-4
-// 0x00000004: vpush.f32 {s16-s31}
+// 0x00000004: vpush {s16-s31}
 // 0x00000008: .cfi_def_cfa_offset: 92
 // 0x00000008: .cfi_offset_extended: r80 at cfa-92
 // 0x00000008: .cfi_offset_extended: r81 at cfa-88
@@ -43,21 +44,21 @@
 // 0x00000008: .cfi_offset_extended: r93 at cfa-40
 // 0x00000008: .cfi_offset_extended: r94 at cfa-36
 // 0x00000008: .cfi_offset_extended: r95 at cfa-32
-// 0x00000008: sub sp, sp, #36
+// 0x00000008: sub sp, #36
 // 0x0000000a: .cfi_def_cfa_offset: 128
-// 0x0000000a: str r0, [sp, #0]
+// 0x0000000a: str r0, [sp]
 // 0x0000000c: str r1, [sp, #132]
-// 0x0000000e: vstr.f32 s0, [sp, #136]
+// 0x0000000e: vstr s0, [sp, #136]
 // 0x00000012: str r2, [sp, #140]
 // 0x00000014: str r3, [sp, #144]
-// 0x00000016: sub sp, sp, #32
+// 0x00000016: sub sp, #32
 // 0x00000018: .cfi_def_cfa_offset: 160
-// 0x00000018: add sp, sp, #32
+// 0x00000018: add sp, #32
 // 0x0000001a: .cfi_def_cfa_offset: 128
 // 0x0000001a: .cfi_remember_state
-// 0x0000001a: add sp, sp, #36
+// 0x0000001a: add sp, #36
 // 0x0000001c: .cfi_def_cfa_offset: 92
-// 0x0000001c: vpop.f32 {s16-s31}
+// 0x0000001c: vpop {s16-s31}
 // 0x00000020: .cfi_def_cfa_offset: 28
 // 0x00000020: .cfi_restore_extended: r80
 // 0x00000020: .cfi_restore_extended: r81
@@ -75,9 +76,11 @@
 // 0x00000020: .cfi_restore_extended: r93
 // 0x00000020: .cfi_restore_extended: r94
 // 0x00000020: .cfi_restore_extended: r95
-// 0x00000020: pop {r5, r6, r7, r8, r10, r11, pc}
-// 0x00000024: .cfi_restore_state
-// 0x00000024: .cfi_def_cfa_offset: 128
+// 0x00000020: pop {r5,r6,r7,r8,r10,r11,lr}
+// 0x00000024: ldr r8, [tr, #52] ; is_gc_marking
+// 0x00000028: bx lr
+// 0x0000002a: .cfi_restore_state
+// 0x0000002a: .cfi_def_cfa_offset: 128
 
 static constexpr uint8_t expected_asm_kArm64[] = {
     0xFF, 0x03, 0x03, 0xD1, 0xF3, 0x53, 0x06, 0xA9, 0xF5, 0x5B, 0x07, 0xA9,
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index aa5a945..18d6b9a 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -199,6 +199,24 @@
   // Note: The fake dependency is unnecessary for the slow path.
 }
 
+// Load the read barrier introspection entrypoint into register `entrypoint`.
+static void LoadReadBarrierMarkIntrospectionEntrypoint(arm::ArmVIXLAssembler& assembler,
+                                                       vixl::aarch32::Register entrypoint) {
+  using vixl::aarch32::MemOperand;
+  using vixl::aarch32::ip;
+  // Thread Register.
+  const vixl::aarch32::Register tr = vixl::aarch32::r9;
+
+  // The register where the read barrier introspection entrypoint is loaded
+  // is fixed: `Thumb2RelativePatcher::kBakerCcEntrypointRegister` (R4).
+  DCHECK_EQ(entrypoint.GetCode(), Thumb2RelativePatcher::kBakerCcEntrypointRegister);
+  // entrypoint = Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
+  DCHECK_EQ(ip.GetCode(), 12u);
+  const int32_t entry_point_offset =
+      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+  __ Ldr(entrypoint, MemOperand(tr, entry_point_offset));
+}
+
 void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& assembler,
                                                          uint32_t encoded_data) {
   using namespace vixl::aarch32;  // NOLINT(build/namespaces)
@@ -233,6 +251,7 @@
       const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
                                  raw_ldr_offset;
       Register ep_reg(kBakerCcEntrypointRegister);
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
       if (width == BakerReadBarrierWidth::kWide) {
         MemOperand ldr_half_address(lr, ldr_offset + 2);
         __ Ldrh(ip, ldr_half_address);        // Load the LDR immediate half-word with "Rt | imm12".
@@ -278,8 +297,10 @@
       MemOperand ldr_address(lr, ldr_offset + 2);
       __ Ldrb(ip, ldr_address);               // Load the LDR (register) byte with "00 | imm2 | Rm",
                                               // i.e. Rm+32 because the scale in imm2 is 2.
-      Register ep_reg(kBakerCcEntrypointRegister);  // Insert ip to the entrypoint address to create
-      __ Bfi(ep_reg, ip, 3, 6);               // a switch case target based on the index register.
+      Register ep_reg(kBakerCcEntrypointRegister);
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
+      __ Bfi(ep_reg, ip, 3, 6);               // Insert ip into the entrypoint address to create
+                                              // a switch case target based on the index register.
       __ Mov(ip, base_reg);                   // Move the base register to ip.
       __ Bx(ep_reg);                          // Jump to the entrypoint's array switch case.
       break;
@@ -309,9 +330,10 @@
                     " the highest bits and the 'forwarding address' state to have all bits set");
       __ Cmp(ip, Operand(0xc0000000));
       __ B(hs, &forwarding_address);
+      Register ep_reg(kBakerCcEntrypointRegister);
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
       // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister
       // to art_quick_read_barrier_mark_introspection_gc_roots.
-      Register ep_reg(kBakerCcEntrypointRegister);
       int32_t entrypoint_offset = (width == BakerReadBarrierWidth::kWide)
           ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
           : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET;
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 7334678..d7e0f51 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -740,7 +740,9 @@
 // `ref`.
 //
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL {
  protected:
   ReadBarrierMarkSlowPathBaseARMVIXL(HInstruction* instruction, Location ref, Location entrypoint)
@@ -813,9 +815,10 @@
 // another thread, or if another thread installed another object
 // reference (different from `ref`) in `obj.field`).
 //
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class ReadBarrierMarkSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL {
  public:
   ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction,
@@ -861,7 +864,9 @@
 // reference (different from `ref`) in `obj.field`).
 //
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL {
  public:
   LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(HInstruction* instruction,
@@ -872,7 +877,7 @@
                                                    ScaleFactor scale_factor,
                                                    bool needs_null_check,
                                                    vixl32::Register temp,
-                                                   Location entrypoint)
+                                                   Location entrypoint = Location::NoLocation())
       : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
         obj_(obj),
         offset_(offset),
@@ -1006,22 +1011,24 @@
 // hold the same to-space reference (unless another thread installed
 // another object reference (different from `ref`) in `obj.field`).
 //
-//
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
     : public ReadBarrierMarkSlowPathBaseARMVIXL {
  public:
-  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction,
-                                                                 Location ref,
-                                                                 vixl32::Register obj,
-                                                                 uint32_t offset,
-                                                                 Location index,
-                                                                 ScaleFactor scale_factor,
-                                                                 bool needs_null_check,
-                                                                 vixl32::Register temp1,
-                                                                 vixl32::Register temp2,
-                                                                 Location entrypoint)
+  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
+      HInstruction* instruction,
+      Location ref,
+      vixl32::Register obj,
+      uint32_t offset,
+      Location index,
+      ScaleFactor scale_factor,
+      bool needs_null_check,
+      vixl32::Register temp1,
+      vixl32::Register temp2,
+      Location entrypoint = Location::NoLocation())
       : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
         obj_(obj),
         offset_(offset),
@@ -2310,7 +2317,8 @@
   }
 }
 
-static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARMVIXL* codegen) {
+static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond,
+                                                    CodeGeneratorARMVIXL* codegen) {
   const Primitive::Type type = cond->GetLeft()->GetType();
 
   DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type;
@@ -2576,6 +2584,11 @@
   blocked_core_registers_[LR] = true;
   blocked_core_registers_[PC] = true;
 
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Reserve marking register.
+    blocked_core_registers_[MR] = true;
+  }
+
   // Reserve thread register.
   blocked_core_registers_[TR] = true;
 
@@ -8531,20 +8544,17 @@
       // Baker's read barrier are used.
       if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
           !Runtime::Current()->UseJitCompilation()) {
-        // Note that we do not actually check the value of `GetIsGcMarking()`
-        // to decide whether to mark the loaded GC root or not.  Instead, we
-        // load into `temp` (actually kBakerCcEntrypointRegister) the read
-        // barrier mark introspection entrypoint. If `temp` is null, it means
-        // that `GetIsGcMarking()` is false, and vice versa.
+        // Query `art::Thread::Current()->GetIsGcMarking()` (stored in
+        // the Marking Register) to decide whether we need to enter
+        // the slow path to mark the GC root.
         //
         // We use link-time generated thunks for the slow path. That thunk
         // checks the reference and jumps to the entrypoint if needed.
         //
-        //     temp = Thread::Current()->pReadBarrierMarkIntrospection
         //     lr = &return_address;
         //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-        //     if (temp != nullptr) {
-        //        goto gc_root_thunk<root_reg>(lr)
+        //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+        //       goto gc_root_thunk<root_reg>(lr)
         //     }
         //   return_address:
 
@@ -8555,18 +8565,10 @@
             root_reg.GetCode(), narrow);
         vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
-        // entrypoint_reg =
-        //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
-        DCHECK_EQ(ip.GetCode(), 12u);
-        const int32_t entry_point_offset =
-            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
-        __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
-
-        vixl::EmissionCheckScope guard(GetVIXLAssembler(),
-                                       4 * vixl32::kMaxInstructionSizeInBytes);
+        vixl::EmissionCheckScope guard(GetVIXLAssembler(), 4 * vixl32::kMaxInstructionSizeInBytes);
         vixl32::Label return_address;
         EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
-        __ cmp(kBakerCcEntrypointRegister, Operand(0));
+        __ cmp(mr, Operand(0));
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
@@ -8578,34 +8580,23 @@
                   narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
                          : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
-        // Note that we do not actually check the value of
-        // `GetIsGcMarking()` to decide whether to mark the loaded GC
-        // root or not.  Instead, we load into `temp` the read barrier
-        // mark entry point corresponding to register `root`. If `temp`
-        // is null, it means that `GetIsGcMarking()` is false, and vice
-        // versa.
+        // Query `art::Thread::Current()->GetIsGcMarking()` (stored in
+        // the Marking Register) to decide whether we need to enter
+        // the slow path to mark the GC root.
         //
-        //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
         //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-        //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+        //   if (mr) {  // Thread::Current()->GetIsGcMarking()
         //     // Slow path.
-        //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+        //     entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //     root = entrypoint(root);  // root = ReadBarrier::Mark(root);  // Entry point call.
         //   }
 
-        // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-        Location temp = LocationFrom(lr);
+        // Slow path marking the GC root `root`. The entrypoint will
+        // be loaded by the slow path code.
         SlowPathCodeARMVIXL* slow_path =
-            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
-                instruction, root, /* entrypoint */ temp);
+            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, root);
         codegen_->AddSlowPath(slow_path);
 
-        // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-        const int32_t entry_point_offset =
-            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
-        // Loading the entrypoint does not require a load acquire since it is only changed when
-        // threads are suspended or running a checkpoint.
-        GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
-
         // /* GcRoot<mirror::Object> */ root = *(obj + offset)
         GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
         static_assert(
@@ -8616,9 +8607,7 @@
                       "art::mirror::CompressedReference<mirror::Object> and int32_t "
                       "have different sizes.");
 
-        // The entrypoint is null when the GC is not marking, this prevents one load compared to
-        // checking GetIsGcMarking.
-        __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
+        __ CompareAndBranchIfNonZero(mr, slow_path->GetEntryLabel());
         __ Bind(slow_path->GetExitLabel());
       }
     } else {
@@ -8659,20 +8648,19 @@
 
   if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
       !Runtime::Current()->UseJitCompilation()) {
-    // Note that we do not actually check the value of `GetIsGcMarking()`
-    // to decide whether to mark the loaded reference or not.  Instead, we
-    // load into `temp` (actually kBakerCcEntrypointRegister) the read
-    // barrier mark introspection entrypoint. If `temp` is null, it means
-    // that `GetIsGcMarking()` is false, and vice versa.
+    // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+    // Marking Register) to decide whether we need to enter the slow
+    // path to mark the reference. Then, in the slow path, check the
+    // gray bit in the lock word of the reference's holder (`obj`) to
+    // decide whether to mark `ref` or not.
     //
     // We use link-time generated thunks for the slow path. That thunk checks
     // the holder and jumps to the entrypoint if needed. If the holder is not
     // gray, it creates a fake dependency and returns to the LDR instruction.
     //
-    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
     //     lr = &gray_return_address;
-    //     if (temp != nullptr) {
-    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+    //       goto field_thunk<holder_reg, base_reg>(lr)
     //     }
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
@@ -8701,19 +8689,12 @@
         base.GetCode(), obj.GetCode(), narrow);
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
-    // entrypoint_reg =
-    //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
-    DCHECK_EQ(ip.GetCode(), 12u);
-    const int32_t entry_point_offset =
-        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
-    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
-
     vixl::EmissionCheckScope guard(
         GetVIXLAssembler(),
         (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
     vixl32::Label return_address;
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
-    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    __ cmp(mr, Operand(0));
     EmitPlaceholderBne(this, bne_label);
     ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
     __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset));
@@ -8760,20 +8741,19 @@
 
   if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
       !Runtime::Current()->UseJitCompilation()) {
-    // Note that we do not actually check the value of `GetIsGcMarking()`
-    // to decide whether to mark the loaded reference or not.  Instead, we
-    // load into `temp` (actually kBakerCcEntrypointRegister) the read
-    // barrier mark introspection entrypoint. If `temp` is null, it means
-    // that `GetIsGcMarking()` is false, and vice versa.
+    // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+    // Marking Register) to decide whether we need to enter the slow
+    // path to mark the reference. Then, in the slow path, check the
+    // gray bit in the lock word of the reference's holder (`obj`) to
+    // decide whether to mark `ref` or not.
     //
     // We use link-time generated thunks for the slow path. That thunk checks
     // the holder and jumps to the entrypoint if needed. If the holder is not
     // gray, it creates a fake dependency and returns to the LDR instruction.
     //
-    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
     //     lr = &gray_return_address;
-    //     if (temp != nullptr) {
-    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+    //       goto array_thunk<base_reg>(lr)
     //     }
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
@@ -8793,20 +8773,13 @@
         linker::Thumb2RelativePatcher::EncodeBakerReadBarrierArrayData(data_reg.GetCode());
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
-    // entrypoint_reg =
-    //     Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
-    DCHECK_EQ(ip.GetCode(), 12u);
-    const int32_t entry_point_offset =
-        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
-    __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
     __ Add(data_reg, obj, Operand(data_offset));
-
     vixl::EmissionCheckScope guard(
         GetVIXLAssembler(),
         (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
     vixl32::Label return_address;
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
-    __ cmp(kBakerCcEntrypointRegister, Operand(0));
+    __ cmp(mr, Operand(0));
     EmitPlaceholderBne(this, bne_label);
     ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
     __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
@@ -8838,26 +8811,21 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
-  // whether we need to enter the slow path to mark the reference.
-  // Then, in the slow path, check the gray bit in the lock word of
-  // the reference's holder (`obj`) to decide whether to mark `ref` or
-  // not.
+  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+  // Marking Register) to decide whether we need to enter the slow
+  // path to mark the reference. Then, in the slow path, check the
+  // gray bit in the lock word of the reference's holder (`obj`) to
+  // decide whether to mark `ref` or not.
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp2` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp2` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
   //     // Slow path.
   //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
   //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
   //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
-  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
   //     }
   //   } else {
   //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
@@ -8866,30 +8834,13 @@
   vixl32::Register temp_reg = RegisterFrom(temp);
 
   // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp2`.
-  Location temp2 = LocationFrom(lr);
+  // entrypoint will be loaded by the slow path code.
   SlowPathCodeARMVIXL* slow_path =
       new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(
-          instruction,
-          ref,
-          obj,
-          offset,
-          index,
-          scale_factor,
-          needs_null_check,
-          temp_reg,
-          /* entrypoint */ temp2);
+          instruction, ref, obj, offset, index, scale_factor, needs_null_check, temp_reg);
   AddSlowPath(slow_path);
 
-  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp2), tr, entry_point_offset);
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ CompareAndBranchIfNonZero(RegisterFrom(temp2), slow_path->GetEntryLabel());
+  __ CompareAndBranchIfNonZero(mr, slow_path->GetEntryLabel());
   // Fast path: the GC is not marking: just load the reference.
   GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check);
   __ Bind(slow_path->GetExitLabel());
@@ -8905,19 +8856,14 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
-  // whether we need to enter the slow path to update the reference
-  // field within `obj`.  Then, in the slow path, check the gray bit
-  // in the lock word of the reference's holder (`obj`) to decide
-  // whether to mark `ref` and update the field or not.
+  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+  // Marking Register) to decide whether we need to enter the slow
+  // path to update the reference field within `obj`. Then, in the
+  // slow path, check the gray bit in the lock word of the reference's
+  // holder (`obj`) to decide whether to mark `ref` and update the
+  // field or not.
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp3` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp3` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   if (temp3 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
   //     // Slow path.
   //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
   //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -8925,7 +8871,8 @@
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
   //       old_ref = ref;
-  //       ref = temp3(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
   //       compareAndSwapObject(obj, field_offset, old_ref, ref);
   //     }
   //   }
@@ -8933,8 +8880,7 @@
   vixl32::Register temp_reg = RegisterFrom(temp);
 
   // Slow path updating the object reference at address `obj + field_offset`
-  // when the GC is marking. The entrypoint will already be loaded in `temp3`.
-  Location temp3 = LocationFrom(lr);
+  // when the GC is marking. The entrypoint will be loaded by the slow path code.
   SlowPathCodeARMVIXL* slow_path =
       new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
           instruction,
@@ -8945,19 +8891,10 @@
           /* scale_factor */ ScaleFactor::TIMES_1,
           needs_null_check,
           temp_reg,
-          temp2,
-          /* entrypoint */ temp3);
+          temp2);
   AddSlowPath(slow_path);
 
-  // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset);
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel());
+  __ CompareAndBranchIfNonZero(mr, slow_path->GetEntryLabel());
   // Fast path: the GC is not marking: nothing to do (the field is
   // up-to-date, and we don't need to load the reference).
   __ Bind(slow_path->GetExitLabel());
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index ad3283a..5584723 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -80,12 +80,16 @@
 
 static const vixl::aarch32::Register kCoreAlwaysSpillRegister = vixl::aarch32::r5;
 
-// Callee saves core registers r5, r6, r7, r8, r10, r11, and lr.
+// Callee saves core registers r5, r6, r7, r8 (except when emitting Baker
+// read barriers, where it is used as Marking Register), r10, r11, and lr.
 static const vixl::aarch32::RegisterList kCoreCalleeSaves = vixl::aarch32::RegisterList::Union(
     vixl::aarch32::RegisterList(vixl::aarch32::r5,
                                 vixl::aarch32::r6,
-                                vixl::aarch32::r7,
-                                vixl::aarch32::r8),
+                                vixl::aarch32::r7),
+    // Do not consider r8 as a callee-save register with Baker read barriers.
+    ((kEmitCompilerReadBarrier && kUseBakerReadBarrier)
+         ? vixl::aarch32::RegisterList()
+         : vixl::aarch32::RegisterList(vixl::aarch32::r8)),
     vixl::aarch32::RegisterList(vixl::aarch32::r10,
                                 vixl::aarch32::r11,
                                 vixl::aarch32::lr));
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
index eb3f870..af3b447 100644
--- a/compiler/utils/arm/assembler_arm_vixl.cc
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -37,7 +37,10 @@
 #define ___   vixl_masm_.
 #endif
 
+// Thread register definition.
 extern const vixl32::Register tr(TR);
+// Marking register definition.
+extern const vixl32::Register mr(MR);
 
 void ArmVIXLAssembler::FinalizeCode() {
   vixl_masm_.FinalizeCode();
diff --git a/compiler/utils/arm/assembler_arm_vixl.h b/compiler/utils/arm/assembler_arm_vixl.h
index e81e7675..66b22ea 100644
--- a/compiler/utils/arm/assembler_arm_vixl.h
+++ b/compiler/utils/arm/assembler_arm_vixl.h
@@ -241,6 +241,8 @@
 
 // Thread register declaration.
 extern const vixl32::Register tr;
+// Marking register declaration.
+extern const vixl32::Register mr;
 
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index d07c047..bebe64c 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -120,8 +120,8 @@
   CHECK_ALIGNED(frame_size, kStackAlignment);
   cfi().RememberState();
 
-  // Compute callee saves to pop and PC.
-  RegList core_spill_mask = 1 << PC;
+  // Compute callee saves to pop and LR.
+  RegList core_spill_mask = 1 << LR;
   uint32_t fp_spill_mask = 0;
   for (const ManagedRegister& reg : callee_save_regs) {
     if (reg.AsArm().IsCoreRegister()) {
@@ -136,6 +136,7 @@
   CHECK_GT(frame_size, pop_values * kFramePointerSize);
   DecreaseFrameSize(frame_size - (pop_values * kFramePointerSize));  // handles CFI as well.
 
+  // Pop FP callee saves.
   if (fp_spill_mask != 0) {
     uint32_t first = CTZ(fp_spill_mask);
     // Check that list is contiguous.
@@ -146,9 +147,18 @@
     cfi().RestoreMany(DWARFReg(s0), fp_spill_mask);
   }
 
-  // Pop callee saves and PC.
+  // Pop core callee saves and LR.
   ___ Pop(RegisterList(core_spill_mask));
 
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Refresh Marking Register.
+    // TODO: Refresh MR only if suspend is taken.
+    ___ Ldr(mr, MemOperand(tr, Thread::IsGcMarkingOffset<kArmPointerSize>().Int32Value()));
+  }
+
+  // Return to LR.
+  ___ Bx(vixl32::lr);
+
   // The CFI should be restored for any code that follows the exit block.
   cfi().RestoreState();
   cfi().DefCFAOffset(frame_size);
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index c436fd9..bab84be 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -782,7 +782,7 @@
   // Decrease frame size to start of callee saved regs.
   DecreaseFrameSize(frame_size);
 
-  // Pop callee saved and return to LR.
+  // Return to LR.
   ___ Ret();
 
   // The CFI should be restored for any code that follows the exit block.
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 4e9b619..759ed38 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -1643,6 +1643,10 @@
 #define __ assembler.
 
 TEST_F(ArmVIXLAssemblerTest, VixlJniHelpers) {
+  // Run the test only with Baker read barriers, as the expected
+  // generated code contains a Marking Register refresh instruction.
+  TEST_DISABLED_WITHOUT_BAKER_READ_BARRIERS();
+
   const bool is_static = true;
   const bool is_synchronized = false;
   const bool is_critical_native = false;
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index eaaf815..563d135 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -5595,7 +5595,7 @@
   " 1dc:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
   " 1e0:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
   " 1e4:	f000 b802 	b.w	1ec <VixlJniHelpers+0x1ec>\n",
-  " 1e8:	f000 b818 	b.w	21c <VixlJniHelpers+0x21c>\n",
+  " 1e8:	f000 b81b 	b.w	222 <VixlJniHelpers+0x222>\n",
   " 1ec:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
   " 1f0:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
   " 1f4:	f8cd c7ff 	str.w	ip, [sp, #2047]	; 0x7ff\n",
@@ -5608,10 +5608,12 @@
   " 210:	b008      	add	sp, #32\n",
   " 212:	b009      	add	sp, #36	; 0x24\n",
   " 214:	ecbd 8a10 	vpop	{s16-s31}\n",
-  " 218:	e8bd 8de0 	ldmia.w	sp!, {r5, r6, r7, r8, sl, fp, pc}\n",
-  " 21c:	4660      	mov	r0, ip\n",
-  " 21e:	f8d9 c2c0 	ldr.w	ip, [r9, #704]	; 0x2c0\n",
-  " 222:	47e0      	blx	ip\n",
+  " 218:	e8bd 4de0 	ldmia.w	sp!, {r5, r6, r7, r8, sl, fp, lr}\n",
+  " 21c:	f8d9 8034 	ldr.w	r8, [r9, #52]	; 0x34\n",
+  " 220:	4770      	bx	lr\n",
+  " 222:	4660      	mov	r0, ip\n",
+  " 224:	f8d9 c2c0 	ldr.w	ip, [r9, #704]	; 0x2c0\n",
+  " 228:	47e0      	blx	ip\n",
   nullptr
 };
 
diff --git a/runtime/arch/arm/asm_support_arm.S b/runtime/arch/arm/asm_support_arm.S
index 9eca862..eeac743 100644
--- a/runtime/arch/arm/asm_support_arm.S
+++ b/runtime/arch/arm/asm_support_arm.S
@@ -26,6 +26,13 @@
 // Register holding Thread::Current().
 #define rSELF r9
 
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+// Marking Register, holding Thread::Current()->GetIsGcMarking().
+// Only used with the Concurrent Copying (CC) garbage
+// collector, with the Baker read barrier configuration.
+#define rMR r8
+#endif
+
 .syntax unified
 .arch armv7-a
 .thumb
@@ -121,14 +128,14 @@
     END \name
 .endm
 
-// Macros to poison (negate) the reference for heap poisoning.
+// Macro to poison (negate) the reference for heap poisoning.
 .macro POISON_HEAP_REF rRef
 #ifdef USE_HEAP_POISONING
     rsb \rRef, \rRef, #0
 #endif  // USE_HEAP_POISONING
 .endm
 
-// Macros to unpoison (negate) the reference for heap poisoning.
+// Macro to unpoison (negate) the reference for heap poisoning.
 .macro UNPOISON_HEAP_REF rRef
 #ifdef USE_HEAP_POISONING
     rsb \rRef, \rRef, #0
diff --git a/runtime/arch/arm/context_arm.cc b/runtime/arch/arm/context_arm.cc
index 0db14fb..711452c 100644
--- a/runtime/arch/arm/context_arm.cc
+++ b/runtime/arch/arm/context_arm.cc
@@ -108,7 +108,9 @@
   for (size_t i = 0; i < kNumberOfSRegisters; ++i) {
     fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : ArmContext::kBadFprBase + i;
   }
+  // Ensure the Thread Register contains the address of the current thread.
   DCHECK_EQ(reinterpret_cast<uintptr_t>(Thread::Current()), gprs[TR]);
+  // The Marking Register will be updated by art_quick_do_long_jump.
   art_quick_do_long_jump(gprs, fprs);
 }
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index b909bda..b4002f0 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -67,6 +67,9 @@
      * Runtime::CreateCalleeSaveMethod(kSaveRefsOnly).
      */
 .macro SETUP_SAVE_REFS_ONLY_FRAME rTemp
+    // Note: We could avoid saving R8 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     push {r5-r8, r10-r11, lr}                     @ 7 words of callee saves
     .cfi_adjust_cfa_offset 28
     .cfi_rel_offset r5, 0
@@ -93,6 +96,9 @@
 .macro RESTORE_SAVE_REFS_ONLY_FRAME
     add sp, #4               @ bottom word holds Method*
     .cfi_adjust_cfa_offset -4
+    // Note: Likewise, we could avoid restoring R8 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     pop {r5-r8, r10-r11, lr} @ 7 words of callee saves
     .cfi_restore r5
     .cfi_restore r6
@@ -104,16 +110,14 @@
     .cfi_adjust_cfa_offset -28
 .endm
 
-.macro RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    bx  lr                   @ return
-.endm
-
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
      */
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+    // Note: We could avoid saving R8 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
     .cfi_adjust_cfa_offset 40
     .cfi_rel_offset r1, 0
@@ -156,6 +160,9 @@
     .cfi_adjust_cfa_offset -8
     vpop {s0-s15}
     .cfi_adjust_cfa_offset -64
+    // Note: Likewise, we could avoid restoring R8 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     pop {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves
     .cfi_restore r1
     .cfi_restore r2
@@ -263,6 +270,17 @@
     .cfi_adjust_cfa_offset -52
 .endm
 
+// Macro to refresh the Marking Register (R8).
+//
+// This macro must be called at the end of functions implementing
+// entrypoints that possibly (directly or indirectly) perform a
+// suspend check (before they return).
+.macro REFRESH_MARKING_REGISTER
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+    ldr rMR, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
+#endif
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz   r0, 1f              @ result non-zero branch over
     bx     lr                  @ return
@@ -359,6 +377,7 @@
     mov    r1, r9                        @ pass Thread::Current
     bl     \entrypoint                   @ (uint32_t field_idx, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -370,6 +389,7 @@
     mov    r2, r9                        @ pass Thread::Current
     bl     \entrypoint                   @ (field_idx, Object*, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -381,6 +401,7 @@
     mov    r3, r9                        @ pass Thread::Current
     bl     \entrypoint                   @ (field_idx, Object*, new_val, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME         @ TODO: we can clearly save an add here
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -464,6 +485,8 @@
      *
      * On success this wrapper will restore arguments and *jump* to the target, leaving the lr
      * pointing back to the original caller.
+     *
+     * Clobbers IP (R12).
      */
 .macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
@@ -473,6 +496,7 @@
     bl     \cxx_name                      @ (method_idx, this, Thread*, SP)
     mov    r12, r1                        @ save Method*->code_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     cbz    r0, 1f                         @ did we find the target? if not go to exception delivery
     bx     r12                            @ tail call to target
 1:
@@ -549,6 +573,8 @@
     mov    r4, #SUSPEND_CHECK_INTERVAL     @ reset r4 to suspend check interval
 #endif
 
+    REFRESH_MARKING_REGISTER
+
     ldr    ip, [r0, #ART_METHOD_QUICK_CODE_OFFSET_32]  @ get pointer to the code
     blx    ip                              @ call the method
 
@@ -580,7 +606,8 @@
     mov    r11, sp                         @ Save the stack pointer
     mov    r10, r1                         @ Save size of stack
     ldr    r9, [r11, #40]                  @ Move managed thread pointer into r9
-    mov    r8, r2                          @ Save the pc to call
+    REFRESH_MARKING_REGISTER
+    mov    r6, r2                          @ Save the pc to call
     sub    r7, sp, #12                     @ Reserve space for stack pointer,
                                            @    JValue* result, and ArtMethod* slot.
     and    r7, #0xFFFFFFF0                 @ Align stack pointer
@@ -612,7 +639,7 @@
 .Losr_entry:
     sub r10, r10, #4
     str lr, [sp, r10]                     @ Store link register per the compiler ABI
-    bx r8
+    bx r6
 END art_quick_osr_stub
 
     /*
@@ -624,6 +651,7 @@
     ldr  r14, [r0, #56]   @ (LR from gprs_ 56=4*14)
     add  r0, r0, #12      @ increment r0 to skip gprs_[0..2] 12=4*3
     ldm  r0, {r3-r13}     @ load remaining gprs from argument gprs_
+    REFRESH_MARKING_REGISTER
     ldr  r0, [r0, #-12]   @ load r0 value
     mov  r1, #0           @ clear result register r1
     bx   r2               @ do long jump
@@ -677,6 +705,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     artLockObjectFromCode      @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_lock_object
@@ -686,6 +715,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     artLockObjectFromCode      @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_lock_object_no_inline
@@ -743,6 +773,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     artUnlockObjectFromCode    @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_unlock_object
@@ -753,6 +784,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     artUnlockObjectFromCode    @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_unlock_object_no_inline
@@ -921,6 +953,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     \entrypoint     @ (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -933,6 +966,7 @@
     mov    r2, r9                     @ pass Thread::Current
     bl     \entrypoint     @ (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -946,6 +980,7 @@
     @ (uint32_t type_idx, Method* method, int32_t component_count, Thread*)
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -961,6 +996,7 @@
     add    sp, #16                    @ strip the extra frame
     .cfi_adjust_cfa_offset -16
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -975,6 +1011,7 @@
     cbz    r0, 1f                     @ If result is null, deliver the OOME.
     .cfi_remember_state
     RESTORE_SAVE_EVERYTHING_FRAME_KEEP_R0
+    REFRESH_MARKING_REGISTER
     bx     lr
     .cfi_restore_state
 1:
@@ -987,6 +1024,9 @@
 ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode
 ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_resolve_string, artResolveStringFromCode
 
+// Note: Functions `art{Get,Set}<Kind>{Static,Instance}FromCompiledCode` are
+// defined by macros in runtime/entrypoints/quick/quick_field_entrypoints.cc.
+
     /*
      * Called by managed code to resolve a static field and load a non-wide value.
      */
@@ -1006,6 +1046,7 @@
     bl     artGet64StaticFromCompiledCode        @ (uint32_t field_idx, Thread*)
     ldr    r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbnz   r2, 1f                        @ success if no exception pending
     bx     lr                            @ return on success
 1:
@@ -1031,6 +1072,7 @@
     bl     artGet64InstanceFromCompiledCode      @ (field_idx, Object*, Thread*)
     ldr    r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbnz   r2, 1f                        @ success if no exception pending
     bx     lr                            @ return on success
 1:
@@ -1066,6 +1108,7 @@
     add    sp, #16                       @ release out args
     .cfi_adjust_cfa_offset -16
     RESTORE_SAVE_REFS_ONLY_FRAME         @ TODO: we can clearly save an add here
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_set64_instance
@@ -1080,6 +1123,7 @@
     add    sp, #16                        @ release out args
     .cfi_adjust_cfa_offset -16
     RESTORE_SAVE_REFS_ONLY_FRAME          @ TODO: we can clearly save an add here
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
 END art_quick_set64_static
@@ -1223,6 +1267,7 @@
     mov    r1, r9                     @ pass Thread::Current
     bl     \cxx_name                  @ (mirror::Class* cls, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \c_name
 .endm
@@ -1315,6 +1360,7 @@
     mov    r1, r9                                             // Pass Thread::Current.
     bl     \entrypoint                                        // (mirror::Class* klass, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \name
 .endm
@@ -1386,6 +1432,7 @@
     mov    r2, r9                  // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \name
 .endm
@@ -1462,8 +1509,8 @@
     add    r2, r2, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
 .endm
 
-# TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm, remove
-# the entrypoint once all backends have been updated to use the size variants.
+// TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm, remove
+// the entrypoint once all backends have been updated to use the size variants.
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
@@ -1492,6 +1539,7 @@
     mov    r0, rSELF
     bl     artTestSuspendFromCode               @ (Thread*)
     RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
     bx     lr
 END art_quick_test_suspend
 
@@ -1499,7 +1547,9 @@
     mov    r0, rSELF
     SETUP_SAVE_REFS_ONLY_FRAME r1             @ save callee saves for stack crawl
     bl     artTestSuspendFromCode             @ (Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
+    bx     lr
 END art_quick_implicit_suspend
 
     /*
@@ -1518,6 +1568,7 @@
     add     sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     .cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbnz    r2, 1f                 @ success if no exception is pending
     vmov    d0, r0, r1             @ store into fpr, for when it's a fpr return...
     bx      lr                     @ return on success
@@ -1567,8 +1618,9 @@
     blx     artQuickResolutionTrampoline  @ (Method* called, receiver, Thread*, SP)
     cbz     r0, 1f                 @ is code pointer null? goto exception
     mov     r12, r0
-    ldr  r0, [sp, #0]              @ load resolved method in r0
+    ldr     r0, [sp, #0]           @ load resolved method in r0
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     bx      r12                    @ tail-call into actual code
 1:
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
@@ -1649,6 +1701,7 @@
     add     sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS-FRAME_SIZE_SAVE_REFS_ONLY
     .cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS-FRAME_SIZE_SAVE_REFS_ONLY)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
 
     // store into fpr, for when it's a fpr return...
     vmov d0, r0, r1
@@ -1675,6 +1728,7 @@
     add     sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     .cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbnz    r2, 1f                 @ success if no exception is pending
     vmov    d0, r0, r1             @ store into fpr, for when it's a fpr return...
     bx      lr                     @ return on success
@@ -1705,6 +1759,7 @@
     mov   r12, r0        @ r12 holds reference to code
     ldr   r0, [sp, #4]   @ restore r0
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     blx   r12            @ call method with lr set to art_quick_instrumentation_exit
 @ Deliberate fall-through into art_quick_instrumentation_exit.
     .type art_quick_instrumentation_exit, #function
@@ -1734,6 +1789,7 @@
     .cfi_restore r0
     .cfi_restore r1
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbz   r2, .Ldo_deliver_instrumentation_exception
                          @ Deliver exception if we got nullptr as function.
     bx    r2             @ Otherwise, return
@@ -1787,7 +1843,7 @@
      */
     /* mul-long vAA, vBB, vCC */
 ENTRY art_quick_mul_long
-    push    {r9 - r10}
+    push    {r9-r10}
     .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r9, 0
     .cfi_rel_offset r10, 4
@@ -1797,7 +1853,7 @@
     add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
     mov     r0,r9
     mov     r1,r10
-    pop     {r9 - r10}
+    pop     {r9-r10}
     .cfi_adjust_cfa_offset -8
     .cfi_restore r9
     .cfi_restore r10
@@ -2544,6 +2600,7 @@
     add     sp, #8
     .cfi_adjust_cfa_offset -8
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_OR_DELIVER_PENDING_EXCEPTION_REG r2
 
 .macro HANDLER_TABLE_OFFSET handler_label
diff --git a/runtime/arch/arm/registers_arm.h b/runtime/arch/arm/registers_arm.h
index 932095d..d39a2a2 100644
--- a/runtime/arch/arm/registers_arm.h
+++ b/runtime/arch/arm/registers_arm.h
@@ -40,7 +40,8 @@
   R13 = 13,
   R14 = 14,
   R15 = 15,
-  TR  = 9,  // thread register
+  MR  = 8,  // ART Marking Register
+  TR  = 9,  // ART Thread Register
   FP  = 11,
   IP  = 12,
   SP  = 13,
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index 5893573..fcf3a31 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -247,6 +247,12 @@
     return; \
   }
 
+#define TEST_DISABLED_WITHOUT_BAKER_READ_BARRIERS() \
+  if (!kEmitCompilerReadBarrier || !kUseBakerReadBarrier) { \
+    printf("WARNING: TEST DISABLED FOR GC WITHOUT BAKER READ BARRIER\n"); \
+    return; \
+  }
+
 #define TEST_DISABLED_FOR_NON_STATIC_HOST_BUILDS() \
   if (!kHostStaticBuildEnabled) { \
     printf("WARNING: TEST DISABLED FOR NON-STATIC HOST BUILDS\n"); \