Add read barrier support to the entrypoints.

Also remove "THIS_LOAD_REQUIRES_READ_BARRIER" since reading
an ArtMethod* no longer needs read barrier.

stub_test should also work with read barriers now.

Change-Id: I3fba18042de2f867a18dbdc38519986212bd9769
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index c98a5f8..88dc29e 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -183,7 +183,7 @@
   EXPECT_EQ(72U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(28U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(112 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(113 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 2f2654d..be9af98 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -171,6 +171,7 @@
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
+  qpoints->pReadBarrierSlow = artReadBarrierSlow;
 }
 
 }  // namespace art
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 2000110..f6d954f 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -51,7 +51,6 @@
     sub sp, #12                                   @ 3 words of space, bottom word will hold Method*
     .cfi_adjust_cfa_offset 12
     RUNTIME_CURRENT1 \rTemp1, \rTemp2             @ Load Runtime::Current into rTemp1.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ldr \rTemp1, [\rTemp1, #RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET] @ rTemp1 is kSaveAll Method*.
     str \rTemp1, [sp, #0]                         @ Place Method* at bottom of stack.
     str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
@@ -79,7 +78,6 @@
     sub sp, #4                                    @ bottom word will hold Method*
     .cfi_adjust_cfa_offset 4
     RUNTIME_CURRENT2 \rTemp1, \rTemp2             @ Load Runtime::Current into rTemp1.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ldr \rTemp1, [\rTemp1, #RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET] @ rTemp1 is kRefsOnly Method*.
     str \rTemp1, [sp, #0]                         @ Place Method* at bottom of stack.
     str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
@@ -139,7 +137,6 @@
 .macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     RUNTIME_CURRENT3 \rTemp1, \rTemp2  @ Load Runtime::Current into rTemp1.
-    THIS_LOAD_REQUIRES_READ_BARRIER
      @ rTemp1 is kRefsAndArgs Method*.
     ldr \rTemp1, [\rTemp1, #RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET]
     str \rTemp1, [sp, #0]                         @ Place Method* at bottom of stack.
@@ -171,7 +168,6 @@
     .cfi_adjust_cfa_offset -40
 .endm
 
-
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz   r0, 1f              @ result non-zero branch over
     bx     lr                  @ return
@@ -588,6 +584,59 @@
     bkpt
 END art_quick_check_cast
 
+// Restore rReg's value from [sp, #offset] if rReg is not the same as rExclude.
+.macro POP_REG_NE rReg, offset, rExclude
+    .ifnc \rReg, \rExclude
+        ldr \rReg, [sp, #\offset]   @ restore rReg
+        .cfi_restore \rReg
+    .endif
+.endm
+
+    /*
+     * Macro to insert read barrier, only used in art_quick_aput_obj.
+     * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     */
+.macro READ_BARRIER rDest, rObj, offset
+#ifdef USE_READ_BARRIER
+    push {r0-r3, ip, lr}            @ 6 words for saved registers (used in art_quick_aput_obj)
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset ip, 16
+    .cfi_rel_offset lr, 20
+    sub sp, #8                      @ push padding
+    .cfi_adjust_cfa_offset 8
+    @ mov r0, r0                    @ pass ref in r0 (no-op for now since parameter ref is unused)
+    .ifnc \rObj, r1
+        mov r1, \rObj               @ pass rObj
+    .endif
+    mov r2, #\offset                @ pass offset
+    bl artReadBarrierSlow           @ artReadBarrierSlow(ref, rObj, offset)
+    @ No need to unpoison return value in r0, artReadBarrierSlow() would do the unpoisoning.
+    .ifnc \rDest, r0
+        mov \rDest, r0              @ save return value in rDest
+    .endif
+    add sp, #8                      @ pop padding
+    .cfi_adjust_cfa_offset -8
+    POP_REG_NE r0, 0, \rDest        @ conditionally restore saved registers
+    POP_REG_NE r1, 4, \rDest
+    POP_REG_NE r2, 8, \rDest
+    POP_REG_NE r3, 12, \rDest
+    POP_REG_NE ip, 16, \rDest
+    add sp, #20
+    .cfi_adjust_cfa_offset -20
+    pop {lr}                        @ restore lr
+    .cfi_adjust_cfa_offset -4
+    .cfi_restore lr
+#else
+    ldr \rDest, [\rObj, #\offset]
+    UNPOISON_HEAP_REF \rDest
+#endif  // USE_READ_BARRIER
+.endm
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
@@ -609,15 +658,21 @@
     b art_quick_throw_array_bounds
 END art_quick_aput_obj_with_bound_check
 
+#ifdef USE_READ_BARRIER
+    .extern artReadBarrierSlow
+#endif
     .hidden art_quick_aput_obj
 ENTRY art_quick_aput_obj
+#ifdef USE_READ_BARRIER
+    @ The offset to .Ldo_aput_null is too large to use cbz due to expansion from READ_BARRIER macro.
+    tst r2, r2
+    beq .Ldo_aput_null
+#else
     cbz r2, .Ldo_aput_null
-    ldr r3, [r0, #MIRROR_OBJECT_CLASS_OFFSET]
-    UNPOISON_HEAP_REF r3
-    ldr ip, [r2, #MIRROR_OBJECT_CLASS_OFFSET]
-    UNPOISON_HEAP_REF ip
-    ldr r3, [r3, #MIRROR_CLASS_COMPONENT_TYPE_OFFSET]
-    UNPOISON_HEAP_REF r3
+#endif  // USE_READ_BARRIER
+    READ_BARRIER r3, r0, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER ip, r2, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER r3, r3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET
     cmp r3, ip  @ value's type == array's component type - trivial assignability
     bne .Lcheck_assignability
 .Ldo_aput:
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 2ce2a29..0f06727 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -155,6 +155,7 @@
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
+  qpoints->pReadBarrierSlow = artReadBarrierSlow;
 };
 
 }  // namespace art
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 6d9b44a..548ab47 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -31,8 +31,6 @@
     ldr xIP0, [xIP0]  // xIP0 = & (art::Runtime * art::Runtime.instance_) .
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kRefAndArgs]  .
-    THIS_LOAD_REQUIRES_READ_BARRIER
-
     // Loads appropriate callee-save-method.
     ldr xIP0, [xIP0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET ]
 
@@ -95,8 +93,6 @@
     ldr xIP0, [xIP0]  // xIP0 = & (art::Runtime * art::Runtime.instance_) .
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kRefOnly]  .
-    THIS_LOAD_REQUIRES_READ_BARRIER
-
     // Loads appropriate callee-save-method.
     ldr xIP0, [xIP0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET ]
 
@@ -251,7 +247,6 @@
     ldr xIP0, [xIP0]  // xIP0 = & (art::Runtime * art::Runtime.instance_) .
 
     // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kRefAndArgs]  .
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ldr xIP0, [xIP0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET ]
 
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_INTERNAL
@@ -1119,6 +1114,62 @@
     brk 0                             // We should not return here...
 END art_quick_check_cast
 
+// Restore xReg's value from [sp, #offset] if xReg is not the same as xExclude.
+.macro POP_REG_NE xReg, offset, xExclude
+    .ifnc \xReg, \xExclude
+        ldr \xReg, [sp, #\offset]     // restore xReg
+        .cfi_restore \xReg
+    .endif
+.endm
+
+    /*
+     * Macro to insert read barrier, only used in art_quick_aput_obj.
+     * xDest, wDest and xObj are registers, offset is a defined literal such as
+     * MIRROR_OBJECT_CLASS_OFFSET. Dest needs both x and w versions of the same register to handle
+     * name mismatch between instructions. This macro uses the lower 32b of register when possible.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     */
+.macro READ_BARRIER xDest, wDest, xObj, offset
+#ifdef USE_READ_BARRIER
+    // Store registers used in art_quick_aput_obj (x0-x4, LR), stack is 16B aligned.
+    stp x0, x1, [sp, #-48]!
+    .cfi_adjust_cfa_offset 48
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
+    stp x2, x3, [sp, #16]
+    .cfi_rel_offset x2, 16
+    .cfi_rel_offset x3, 24
+    stp x4, xLR, [sp, #32]
+    .cfi_rel_offset x4, 32
+    .cfi_rel_offset x30, 40
+
+    // mov x0, x0                   // pass ref in x0 (no-op for now since parameter ref is unused)
+    .ifnc \xObj, x1
+        mov x1, \xObj               // pass xObj
+    .endif
+    mov w2, #\offset                // pass offset
+    bl artReadBarrierSlow           // artReadBarrierSlow(ref, xObj, offset)
+    // No need to unpoison return value in w0, artReadBarrierSlow() would do the unpoisoning.
+    .ifnc \wDest, w0
+        mov \wDest, w0              // save return value in wDest
+    .endif
+
+    // Conditionally restore saved registers
+    POP_REG_NE x0, 0, \xDest
+    POP_REG_NE x1, 8, \xDest
+    POP_REG_NE x2, 16, \xDest
+    POP_REG_NE x3, 24, \xDest
+    POP_REG_NE x4, 32, \xDest
+    ldr xLR, [sp, #40]
+    .cfi_restore x30
+    add sp, sp, #48
+    .cfi_adjust_cfa_offset -48
+#else
+    ldr \wDest, [\xObj, #\offset]   // Heap reference = 32b. This also zero-extends to \xDest.
+    UNPOISON_HEAP_REF \wDest
+#endif  // USE_READ_BARRIER
+.endm
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
@@ -1146,17 +1197,17 @@
     b art_quick_throw_array_bounds
 END art_quick_aput_obj_with_bound_check
 
+#ifdef USE_READ_BARRIER
+    .extern artReadBarrierSlow
+#endif
 ENTRY art_quick_aput_obj
     cbz x2, .Ldo_aput_null
-    ldr w3, [x0, #MIRROR_OBJECT_CLASS_OFFSET]            // Heap reference = 32b
+    READ_BARRIER x3, w3, x0, MIRROR_OBJECT_CLASS_OFFSET     // Heap reference = 32b
                                                          // This also zero-extends to x3
-    UNPOISON_HEAP_REF w3
-    ldr w4, [x2, #MIRROR_OBJECT_CLASS_OFFSET]            // Heap reference = 32b
+    READ_BARRIER x4, w4, x2, MIRROR_OBJECT_CLASS_OFFSET     // Heap reference = 32b
                                                          // This also zero-extends to x4
-    UNPOISON_HEAP_REF w4
-    ldr w3, [x3, #MIRROR_CLASS_COMPONENT_TYPE_OFFSET]    // Heap reference = 32b
+    READ_BARRIER x3, w3, x3, MIRROR_CLASS_COMPONENT_TYPE_OFFSET // Heap reference = 32b
                                                          // This also zero-extends to x3
-    UNPOISON_HEAP_REF w3
     cmp w3, w4  // value's type == array's component type - trivial assignability
     bne .Lcheck_assignability
 .Ldo_aput:
diff --git a/runtime/arch/mips/entrypoints_direct_mips.h b/runtime/arch/mips/entrypoints_direct_mips.h
index b1aa3ee..f9c5315 100644
--- a/runtime/arch/mips/entrypoints_direct_mips.h
+++ b/runtime/arch/mips/entrypoints_direct_mips.h
@@ -44,7 +44,8 @@
       entrypoint == kQuickCmpgDouble ||
       entrypoint == kQuickCmpgFloat ||
       entrypoint == kQuickCmplDouble ||
-      entrypoint == kQuickCmplFloat;
+      entrypoint == kQuickCmplFloat ||
+      entrypoint == kQuickReadBarrierSlow;
 }
 
 }  // namespace art
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 09a018e..4e4b91f 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -279,6 +279,8 @@
 
   qpoints->pReadBarrierJni = ReadBarrierJni;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierJni), "Non-direct C stub marked direct.");
+  qpoints->pReadBarrierSlow = artReadBarrierSlow;
+  static_assert(IsDirectEntrypoint(kQuickReadBarrierSlow), "Direct C stub not marked direct.");
 };
 
 }  // namespace art
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 2819f92..4d5004f 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -79,7 +79,6 @@
 
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     lw $t0, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -127,7 +126,6 @@
 
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     lw $t0, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -219,7 +217,6 @@
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
     lw $t0, 0($t0)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     lw $t0, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t0)
     sw $t0, 0($sp)                                # Place Method* at bottom of stack.
     sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -627,6 +624,76 @@
 END art_quick_check_cast
 
     /*
+     * Restore rReg's value from offset($sp) if rReg is not the same as rExclude.
+     * nReg is the register number for rReg.
+     */
+.macro POP_REG_NE rReg, nReg, offset, rExclude
+    .ifnc \rReg, \rExclude
+        lw \rReg, \offset($sp)      # restore rReg
+        .cfi_restore \nReg
+    .endif
+.endm
+
+    /*
+     * Macro to insert read barrier, only used in art_quick_aput_obj.
+     * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     */
+.macro READ_BARRIER rDest, rObj, offset
+#ifdef USE_READ_BARRIER
+    # saved registers used in art_quick_aput_obj: a0-a2, t0-t1, t9, ra. 8 words for 16B alignment.
+    addiu  $sp, $sp, -32
+    .cfi_adjust_cfa_offset 32
+    sw     $ra, 28($sp)
+    .cfi_rel_offset 31, 28
+    sw     $t9, 24($sp)
+    .cfi_rel_offset 25, 24
+    sw     $t1, 20($sp)
+    .cfi_rel_offset 9, 20
+    sw     $t0, 16($sp)
+    .cfi_rel_offset 8, 16
+    sw     $a2, 8($sp)              # padding slot at offset 12 (padding can be any slot in the 32B)
+    .cfi_rel_offset 6, 8
+    sw     $a1, 4($sp)
+    .cfi_rel_offset 5, 4
+    sw     $a0, 0($sp)
+    .cfi_rel_offset 4, 0
+
+    # move $a0, $a0                 # pass ref in a0 (no-op for now since parameter ref is unused)
+    .ifnc \rObj, $a1
+        move $a1, \rObj             # pass rObj
+    .endif
+    addiu $a2, $zero, \offset       # pass offset
+    jal artReadBarrierSlow          # artReadBarrierSlow(ref, rObj, offset)
+    addiu  $sp, $sp, -16            # Use branch delay slot to reserve argument slots on the stack
+                                    # before the call to artReadBarrierSlow.
+    addiu  $sp, $sp, 16             # restore stack after call to artReadBarrierSlow
+    # No need to unpoison return value in v0, artReadBarrierSlow() would do the unpoisoning.
+    move \rDest, $v0                # save return value in rDest
+                                    # (rDest cannot be v0 in art_quick_aput_obj)
+
+    lw     $a0, 0($sp)              # restore registers except rDest
+                                    # (rDest can only be t0 or t1 in art_quick_aput_obj)
+    .cfi_restore 4
+    lw     $a1, 4($sp)
+    .cfi_restore 5
+    lw     $a2, 8($sp)
+    .cfi_restore 6
+    POP_REG_NE $t0, 8, 16, \rDest
+    POP_REG_NE $t1, 9, 20, \rDest
+    lw     $t9, 24($sp)
+    .cfi_restore 25
+    lw     $ra, 28($sp)             # restore $ra
+    .cfi_restore 31
+    addiu  $sp, $sp, 32
+    .cfi_adjust_cfa_offset -32
+#else
+    lw     \rDest, \offset(\rObj)
+    UNPOISON_HEAP_REF \rDest
+#endif  // USE_READ_BARRIER
+.endm
+
+    /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
      * a0 = array, a1 = index, a2 = value
@@ -648,15 +715,15 @@
     move $a1, $t0
 END art_quick_aput_obj_with_bound_check
 
+#ifdef USE_READ_BARRIER
+    .extern artReadBarrierSlow
+#endif
 ENTRY art_quick_aput_obj
     beqz $a2, .Ldo_aput_null
     nop
-    lw $t0, MIRROR_OBJECT_CLASS_OFFSET($a0)
-    UNPOISON_HEAP_REF $t0
-    lw $t1, MIRROR_OBJECT_CLASS_OFFSET($a2)
-    UNPOISON_HEAP_REF $t1
-    lw $t0, MIRROR_CLASS_COMPONENT_TYPE_OFFSET($t0)
-    UNPOISON_HEAP_REF $t0
+    READ_BARRIER $t0, $a0, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER $t1, $a2, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER $t0, $t0, MIRROR_CLASS_COMPONENT_TYPE_OFFSET
     bne $t1, $t0, .Lcheck_assignability  # value's type == array's component type - trivial assignability
     nop
 .Ldo_aput:
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index 4904af9..ec02d5a 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -186,6 +186,7 @@
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
+  qpoints->pReadBarrierSlow = artReadBarrierSlow;
 };
 
 }  // namespace art
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index abca70b..c30e6ca 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -89,7 +89,6 @@
     # load appropriate callee-save-method
     ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $t1, 0($t1)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ld      $t1, RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET($t1)
     sd      $t1, 0($sp)                                # Place ArtMethod* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -132,7 +131,6 @@
     # load appropriate callee-save-method
     ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $t1, 0($t1)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ld      $t1, RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET($t1)
     sd      $t1, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -255,7 +253,6 @@
     # load appropriate callee-save-method
     ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
     ld      $t1, 0($t1)
-    THIS_LOAD_REQUIRES_READ_BARRIER
     ld      $t1, RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET($t1)
     sd      $t1, 0($sp)                                # Place Method* at bottom of stack.
     sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
@@ -888,6 +885,77 @@
     move $a2, rSELF                 # pass Thread::Current
 END art_quick_check_cast
 
+
+    /*
+     * Restore rReg's value from offset($sp) if rReg is not the same as rExclude.
+     * nReg is the register number for rReg.
+     */
+.macro POP_REG_NE rReg, nReg, offset, rExclude
+    .ifnc \rReg, \rExclude
+        ld \rReg, \offset($sp)      # restore rReg
+        .cfi_restore \nReg
+    .endif
+.endm
+
+    /*
+     * Macro to insert read barrier, only used in art_quick_aput_obj.
+     * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     */
+.macro READ_BARRIER rDest, rObj, offset
+#ifdef USE_READ_BARRIER
+    # saved registers used in art_quick_aput_obj: a0-a2, t0-t1, t9, ra. 16B-aligned.
+    daddiu  $sp, $sp, -64
+    .cfi_adjust_cfa_offset 64
+    sd     $ra, 56($sp)
+    .cfi_rel_offset 31, 56
+    sd     $t9, 48($sp)
+    .cfi_rel_offset 25, 48
+    sd     $t1, 40($sp)
+    .cfi_rel_offset 13, 40
+    sd     $t0, 32($sp)
+    .cfi_rel_offset 12, 32
+    sd     $a2, 16($sp)             # padding slot at offset 24 (padding can be any slot in the 64B)
+    .cfi_rel_offset 6, 16
+    sd     $a1, 8($sp)
+    .cfi_rel_offset 5, 8
+    sd     $a0, 0($sp)
+    .cfi_rel_offset 4, 0
+
+    # move $a0, $a0                 # pass ref in a0 (no-op for now since parameter ref is unused)
+    .ifnc \rObj, $a1
+        move $a1, \rObj             # pass rObj
+    .endif
+    daddiu $a2, $zero, \offset      # pass offset
+    jal artReadBarrierSlow          # artReadBarrierSlow(ref, rObj, offset)
+    .cpreturn                       # Restore gp from t8 in branch delay slot.
+                                    # t8 may be clobbered in artReadBarrierSlow.
+    # No need to unpoison return value in v0, artReadBarrierSlow() would do the unpoisoning.
+    move \rDest, $v0                # save return value in rDest
+                                    # (rDest cannot be v0 in art_quick_aput_obj)
+
+    ld     $a0, 0($sp)              # restore registers except rDest
+                                    # (rDest can only be t0 or t1 in art_quick_aput_obj)
+    .cfi_restore 4
+    ld     $a1, 8($sp)
+    .cfi_restore 5
+    ld     $a2, 16($sp)
+    .cfi_restore 6
+    POP_REG_NE $t0, 12, 32, \rDest
+    POP_REG_NE $t1, 13, 40, \rDest
+    ld     $t9, 48($sp)
+    .cfi_restore 25
+    ld     $ra, 56($sp)             # restore $ra
+    .cfi_restore 31
+    daddiu  $sp, $sp, 64
+    .cfi_adjust_cfa_offset -64
+    SETUP_GP                        # set up gp because we are not returning
+#else
+    lwu     \rDest, \offset(\rObj)
+    UNPOISON_HEAP_REF \rDest
+#endif  // USE_READ_BARRIER
+.endm
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
@@ -913,12 +981,9 @@
 ENTRY art_quick_aput_obj
     beq  $a2, $zero, .Ldo_aput_null
     nop
-    lwu $t0, MIRROR_OBJECT_CLASS_OFFSET($a0)
-    UNPOISON_HEAP_REF $t0
-    lwu $t1, MIRROR_OBJECT_CLASS_OFFSET($a2)
-    UNPOISON_HEAP_REF $t1
-    lwu $t0, MIRROR_CLASS_COMPONENT_TYPE_OFFSET($t0)
-    UNPOISON_HEAP_REF $t0
+    READ_BARRIER $t0, $a0, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER $t1, $a2, MIRROR_OBJECT_CLASS_OFFSET
+    READ_BARRIER $t0, $t0, MIRROR_CLASS_COMPONENT_TYPE_OFFSET
     bne $t1, $t0, .Lcheck_assignability  # value's type == array's component type - trivial assignability
     nop
 .Ldo_aput:
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 0831c26..cf7db34 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1124,8 +1124,6 @@
 
 
 TEST_F(StubTest, APutObj) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
   Thread* self = Thread::Current();
@@ -1258,8 +1256,6 @@
 }
 
 TEST_F(StubTest, AllocObject) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
   // This will lead to OOM  error messages in the log.
@@ -1385,8 +1381,6 @@
 }
 
 TEST_F(StubTest, AllocObjectArray) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
   // TODO: Check the "Unresolved" allocation stubs
@@ -1474,8 +1468,6 @@
 
 
 TEST_F(StubTest, StringCompareTo) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || (defined(__x86_64__) && !defined(__APPLE__))
   // TODO: Check the "Unresolved" allocation stubs
 
@@ -2152,8 +2144,6 @@
 }
 
 TEST_F(StubTest, Fields8) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   self->TransitionFromSuspendedToRunnable();
@@ -2166,8 +2156,6 @@
 }
 
 TEST_F(StubTest, Fields16) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   self->TransitionFromSuspendedToRunnable();
@@ -2180,8 +2168,6 @@
 }
 
 TEST_F(StubTest, Fields32) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   self->TransitionFromSuspendedToRunnable();
@@ -2193,8 +2179,6 @@
 }
 
 TEST_F(StubTest, FieldsObj) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   self->TransitionFromSuspendedToRunnable();
@@ -2206,8 +2190,6 @@
 }
 
 TEST_F(StubTest, Fields64) {
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   self->TransitionFromSuspendedToRunnable();
@@ -2221,8 +2203,6 @@
 TEST_F(StubTest, IMT) {
 #if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
 
   ScopedObjectAccess soa(self);
@@ -2342,8 +2322,6 @@
 
 TEST_F(StubTest, StringIndexOf) {
 #if defined(__arm__) || defined(__aarch64__)
-  TEST_DISABLED_FOR_READ_BARRIER();
-
   Thread* self = Thread::Current();
   ScopedObjectAccess soa(self);
   // garbage is created during ClassLinker::Init
@@ -2416,4 +2394,40 @@
 #endif
 }
 
+TEST_F(StubTest, ReadBarrier) {
+#if defined(ART_USE_READ_BARRIER) && (defined(__i386__) || defined(__arm__) || \
+      defined(__aarch64__) || defined(__mips__) || (defined(__x86_64__) && !defined(__APPLE__)))
+  Thread* self = Thread::Current();
+
+  const uintptr_t readBarrierSlow = StubTest::GetEntrypoint(self, kQuickReadBarrierSlow);
+
+  // Create an object
+  ScopedObjectAccess soa(self);
+  // garbage is created during ClassLinker::Init
+
+  StackHandleScope<2> hs(soa.Self());
+  Handle<mirror::Class> c(
+      hs.NewHandle(class_linker_->FindSystemClass(soa.Self(), "Ljava/lang/Object;")));
+
+  // Build an object instance
+  Handle<mirror::Object> obj(hs.NewHandle(c->AllocObject(soa.Self())));
+
+  EXPECT_FALSE(self->IsExceptionPending());
+
+  size_t result = Invoke3(0U, reinterpret_cast<size_t>(obj.Get()),
+                          mirror::Object::ClassOffset().SizeValue(), readBarrierSlow, self);
+
+  EXPECT_FALSE(self->IsExceptionPending());
+  EXPECT_NE(reinterpret_cast<size_t>(nullptr), result);
+  mirror::Class* klass = reinterpret_cast<mirror::Class*>(result);
+  EXPECT_EQ(klass, obj->GetClass());
+
+  // Tests done.
+#else
+  LOG(INFO) << "Skipping read_barrier_slow";
+  // Force-print to std::cout so it's also outside the logcat.
+  std::cout << "Skipping read_barrier_slow" << std::endl;
+#endif
+}
+
 }  // namespace art
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 737f4d1..e2632c1 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -28,6 +28,9 @@
 extern "C" uint32_t art_quick_is_assignable(const mirror::Class* klass,
                                             const mirror::Class* ref_class);
 
+// Read barrier entrypoints.
+extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
+
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      QuickEntryPoints* qpoints) {
   // Interpreter
@@ -141,6 +144,7 @@
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
+  qpoints->pReadBarrierSlow = art_quick_read_barrier_slow;
 };
 
 }  // namespace art
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index ebfb3fa..1da5a2f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -33,7 +33,6 @@
     movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
     movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
     // Push save all callee-save method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     pushl RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
     // Store esp as the top quick frame.
@@ -60,7 +59,6 @@
     movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
     movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
     // Push save all callee-save method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     pushl RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
     // Store esp as the top quick frame.
@@ -106,7 +104,6 @@
     movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
     movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
     // Push save all callee-save method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     pushl RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
     // Store esp as the stop quick frame.
@@ -1126,6 +1123,53 @@
     UNREACHABLE
 END_FUNCTION art_quick_check_cast
 
+// Restore reg's value if reg is not the same as exclude_reg, otherwise just adjust stack.
+MACRO2(POP_REG_NE, reg, exclude_reg)
+    .ifc RAW_VAR(reg), RAW_VAR(exclude_reg)
+      addl MACRO_LITERAL(4), %esp
+      CFI_ADJUST_CFA_OFFSET(-4)
+    .else
+      POP RAW_VAR(reg)
+    .endif
+END_MACRO
+
+    /*
+     * Macro to insert read barrier, only used in art_quick_aput_obj.
+     * obj_reg and dest_reg are registers, offset is a defined literal such as
+     * MIRROR_OBJECT_CLASS_OFFSET.
+     * pop_eax is a boolean flag, indicating if eax is popped after the call.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     */
+MACRO4(READ_BARRIER, obj_reg, offset, dest_reg, pop_eax)
+#ifdef USE_READ_BARRIER
+    PUSH eax                        // save registers used in art_quick_aput_obj
+    PUSH ebx
+    PUSH edx
+    PUSH ecx
+    // Outgoing argument set up
+    pushl MACRO_LITERAL((RAW_VAR(offset)))  // pass offset, double parentheses are necessary
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH RAW_VAR(obj_reg)           // pass obj_reg
+    PUSH eax                        // pass ref, just pass eax for now since parameter ref is unused
+    call SYMBOL(artReadBarrierSlow) // artReadBarrierSlow(ref, obj_reg, offset)
+    // No need to unpoison return value in eax, artReadBarrierSlow() would do the unpoisoning.
+    .ifnc RAW_VAR(dest_reg), eax
+      movl %eax, REG_VAR(dest_reg)  // save loaded ref in dest_reg
+    .endif
+    addl MACRO_LITERAL(12), %esp    // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-12)
+    POP_REG_NE ecx, RAW_VAR(dest_reg) // Restore args except dest_reg
+    POP_REG_NE edx, RAW_VAR(dest_reg)
+    POP_REG_NE ebx, RAW_VAR(dest_reg)
+    .ifc RAW_VAR(pop_eax), true
+      POP_REG_NE eax, RAW_VAR(dest_reg)
+    .endif
+#else
+    movl RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg)
+    UNPOISON_HEAP_REF RAW_VAR(dest_reg)
+#endif  // USE_READ_BARRIER
+END_MACRO
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
@@ -1149,17 +1193,20 @@
 DEFINE_FUNCTION art_quick_aput_obj
     test %edx, %edx              // store of null
     jz .Ldo_aput_null
-    movl MIRROR_OBJECT_CLASS_OFFSET(%eax), %ebx
-    UNPOISON_HEAP_REF ebx
-    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%ebx), %ebx
-    UNPOISON_HEAP_REF ebx
+    READ_BARRIER eax, MIRROR_OBJECT_CLASS_OFFSET, ebx, true
+    READ_BARRIER ebx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ebx, true
     // value's type == array's component type - trivial assignability
-#ifdef USE_HEAP_POISONING
-    PUSH eax  // save eax
+#if defined(USE_READ_BARRIER)
+    READ_BARRIER edx, MIRROR_OBJECT_CLASS_OFFSET, eax, false
+    cmpl %eax, %ebx
+    POP eax                      // restore eax from the push in the beginning of READ_BARRIER macro
+#elif defined(USE_HEAP_POISONING)
+    PUSH eax                     // save eax
+    // Cannot call READ_BARRIER macro here, because the above push messes up stack alignment.
     movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax
     UNPOISON_HEAP_REF eax
     cmpl %eax, %ebx
-    POP  eax  // restore eax
+    POP eax                      // restore eax
 #else
     cmpl MIRROR_OBJECT_CLASS_OFFSET(%edx), %ebx
 #endif
@@ -1181,6 +1228,8 @@
     subl LITERAL(8), %esp         // alignment padding
     CFI_ADJUST_CFA_OFFSET(8)
 #ifdef USE_HEAP_POISONING
+    // This load does not need read barrier, since edx is unchanged and there's no GC safe point
+    // from last read of MIRROR_OBJECT_CLASS_OFFSET(%edx).
     movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax  // pass arg2 - type of the value to be stored
     UNPOISON_HEAP_REF eax
     PUSH eax
@@ -1696,5 +1745,15 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
+DEFINE_FUNCTION art_quick_read_barrier_slow
+    PUSH edx                        // pass arg3 - offset
+    PUSH ecx                        // pass arg2 - obj
+    PUSH eax                        // pass arg1 - ref
+    call SYMBOL(artReadBarrierSlow) // artReadBarrierSlow(ref, obj, offset)
+    addl LITERAL(12), %esp          // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-12)
+    ret
+END_FUNCTION art_quick_read_barrier_slow
+
     // TODO: implement these!
 UNIMPLEMENTED art_quick_memcmp16
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index 706ae58..cf0039c 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -24,6 +24,7 @@
 #define MACRO1(macro_name, macro_arg1) .macro macro_name macro_arg1
 #define MACRO2(macro_name, macro_arg1, macro_arg2) .macro macro_name macro_arg1, macro_arg2
 #define MACRO3(macro_name, macro_arg1, macro_arg2, macro_arg3) .macro macro_name macro_arg1, macro_arg2, macro_arg3
+#define MACRO4(macro_name, macro_arg1, macro_arg2, macro_arg3, macro_arg4) .macro macro_name macro_arg1, macro_arg2, macro_arg3, macro_arg4
 #define END_MACRO .endm
 
 #if defined(__clang__)
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index d0ab9d5..ef1bb5f 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -29,6 +29,9 @@
 extern "C" uint32_t art_quick_assignable_from_code(const mirror::Class* klass,
                                                    const mirror::Class* ref_class);
 
+// Read barrier entrypoints.
+extern "C" mirror::Object* art_quick_read_barrier_slow(mirror::Object*, mirror::Object*, uint32_t);
+
 void InitEntryPoints(InterpreterEntryPoints* ipoints, JniEntryPoints* jpoints,
                      QuickEntryPoints* qpoints) {
 #if defined(__APPLE__)
@@ -145,6 +148,7 @@
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
+  qpoints->pReadBarrierSlow = art_quick_read_barrier_slow;
 #endif  // __APPLE__
 };
 
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0eeb03a..f4c9488 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -66,7 +66,6 @@
     movq %xmm14, 24(%rsp)
     movq %xmm15, 32(%rsp)
     // R10 := ArtMethod* for save all callee save frame method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     movq RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
@@ -109,7 +108,6 @@
     movq %xmm14, 24(%rsp)
     movq %xmm15, 32(%rsp)
     // R10 := ArtMethod* for refs only callee save frame method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     movq RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
@@ -168,7 +166,6 @@
     subq MACRO_LITERAL(80 + 4 * 8), %rsp
     CFI_ADJUST_CFA_OFFSET(80 + 4 * 8)
     // R10 := ArtMethod* for ref and args callee save frame method.
-    THIS_LOAD_REQUIRES_READ_BARRIER
     movq RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Save FPRs.
     movq %xmm0, 16(%rsp)
@@ -920,8 +917,12 @@
     // Fast path tlab allocation.
     // RDI: uint32_t type_idx, RSI: ArtMethod*
     // RDX, RCX, R8, R9: free. RAX: return val.
+    // TODO: Add read barrier when this function is used.
+    // Might need a special macro since rsi and edx is 32b/64b mismatched.
     movl ART_METHOD_DEX_CACHE_TYPES_OFFSET(%rsi), %edx  // Load dex cache resolved types array
     UNPOISON_HEAP_REF edx
+    // TODO: Add read barrier when this function is used.
+    // Might need to break down into multiple instructions to get the base address in a register.
                                                                // Load the class
     movl MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdx, %rdi, MIRROR_OBJECT_ARRAY_COMPONENT_SIZE), %edx
     UNPOISON_HEAP_REF edx
@@ -1153,6 +1154,60 @@
 END_FUNCTION art_quick_check_cast
 
 
+// Restore reg's value if reg is not the same as exclude_reg, otherwise just adjust stack.
+MACRO2(POP_REG_NE, reg, exclude_reg)
+    .ifc RAW_VAR(reg), RAW_VAR(exclude_reg)
+      addq MACRO_LITERAL(8), %rsp
+      CFI_ADJUST_CFA_OFFSET(-8)
+    .else
+      POP RAW_VAR(reg)
+    .endif
+END_MACRO
+
+    /*
+     * Macro to insert read barrier, used in art_quick_aput_obj and art_quick_alloc_object_tlab.
+     * obj_reg and dest_reg{32|64} are registers, offset is a defined literal such as
+     * MIRROR_OBJECT_CLASS_OFFSET. dest_reg needs two versions to handle the mismatch between
+     * 64b PUSH/POP and 32b argument.
+     * TODO: When read barrier has a fast path, add heap unpoisoning support for the fast path.
+     *
+     * As with art_quick_aput_obj* functions, the 64b versions are in comments.
+     */
+MACRO4(READ_BARRIER, obj_reg, offset, dest_reg32, dest_reg64)
+#ifdef USE_READ_BARRIER
+    PUSH rax                            // save registers that might be used
+    PUSH rdi
+    PUSH rsi
+    PUSH rdx
+    PUSH rcx
+    SETUP_FP_CALLEE_SAVE_FRAME
+    // Outgoing argument set up
+    // movl %edi, %edi                  // pass ref, no-op for now since parameter ref is unused
+    // // movq %rdi, %rdi
+    movl REG_VAR(obj_reg), %esi         // pass obj_reg
+    // movq REG_VAR(obj_reg), %rsi
+    movl MACRO_LITERAL((RAW_VAR(offset))), %edx // pass offset, double parentheses are necessary
+    // movq MACRO_LITERAL((RAW_VAR(offset))), %rdx
+    call SYMBOL(artReadBarrierSlow)     // artReadBarrierSlow(ref, obj_reg, offset)
+    // No need to unpoison return value in rax, artReadBarrierSlow() would do the unpoisoning.
+    .ifnc RAW_VAR(dest_reg32), eax
+    // .ifnc RAW_VAR(dest_reg64), rax
+      movl %eax, REG_VAR(dest_reg32)    // save loaded ref in dest_reg
+      // movq %rax, REG_VAR(dest_reg64)
+    .endif
+    RESTORE_FP_CALLEE_SAVE_FRAME
+    POP_REG_NE rcx, RAW_VAR(dest_reg64) // Restore registers except dest_reg
+    POP_REG_NE rdx, RAW_VAR(dest_reg64)
+    POP_REG_NE rsi, RAW_VAR(dest_reg64)
+    POP_REG_NE rdi, RAW_VAR(dest_reg64)
+    POP_REG_NE rax, RAW_VAR(dest_reg64)
+#else
+    movl RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg32)
+    // movq RAW_VAR(offset)(REG_VAR(obj_reg)), REG_VAR(dest_reg64)
+    UNPOISON_HEAP_REF RAW_VAR(dest_reg32) // UNPOISON_HEAP_REF only takes a 32b register
+#endif  // USE_READ_BARRIER
+END_MACRO
+
     /*
      * Entry from managed code for array put operations of objects where the value being stored
      * needs to be checked for compatibility.
@@ -1197,15 +1252,13 @@
     testl %edx, %edx                // store of null
 //  test %rdx, %rdx
     jz .Ldo_aput_null
-    movl MIRROR_OBJECT_CLASS_OFFSET(%edi), %ecx
-//  movq MIRROR_OBJECT_CLASS_OFFSET(%rdi), %rcx
-    UNPOISON_HEAP_REF ecx
-    movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%ecx), %ecx
-//  movq MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%rcx), %rcx
-    UNPOISON_HEAP_REF ecx
-#ifdef USE_HEAP_POISONING
-    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %eax  // rax is free.
-    UNPOISON_HEAP_REF eax
+    READ_BARRIER edi, MIRROR_OBJECT_CLASS_OFFSET, ecx, rcx
+    // READ_BARRIER rdi, MIRROR_OBJECT_CLASS_OFFSET, ecx, rcx
+    READ_BARRIER ecx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ecx, rcx
+    // READ_BARRIER rcx, MIRROR_CLASS_COMPONENT_TYPE_OFFSET, ecx, rcx
+#if defined(USE_HEAP_POISONING) || defined(USE_READ_BARRIER)
+    READ_BARRIER edx, MIRROR_OBJECT_CLASS_OFFSET, eax, rax  // rax is free.
+    // READ_BARRIER rdx, MIRROR_OBJECT_CLASS_OFFSET, eax, rax
     cmpl %eax, %ecx  // value's type == array's component type - trivial assignability
 #else
     cmpl MIRROR_OBJECT_CLASS_OFFSET(%edx), %ecx // value's type == array's component type - trivial assignability
@@ -1232,9 +1285,14 @@
     PUSH rdx
     SETUP_FP_CALLEE_SAVE_FRAME
 
-                                  // "Uncompress" = do nothing, as already zero-extended on load.
-    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %esi // Pass arg2 = value's class.
-    UNPOISON_HEAP_REF esi
+#if defined(USE_HEAP_POISONING) || defined(USE_READ_BARRIER)
+    // The load of MIRROR_OBJECT_CLASS_OFFSET(%edx) is redundant, eax still holds the value.
+    movl %eax, %esi               // Pass arg2 = value's class.
+    // movq %rax, %rsi
+#else
+                                     // "Uncompress" = do nothing, as already zero-extended on load.
+    movl MIRROR_OBJECT_CLASS_OFFSET(%edx), %esi  // Pass arg2 = value's class.
+#endif
     movq %rcx, %rdi               // Pass arg1 = array's component type.
 
     call SYMBOL(artIsAssignableFromCode)  // (Class* a, Class* b)
@@ -1735,3 +1793,14 @@
     call PLT_SYMBOL(longjmp)
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
+
+DEFINE_FUNCTION art_quick_read_barrier_slow
+    SETUP_FP_CALLEE_SAVE_FRAME
+    subq LITERAL(8), %rsp           // Alignment padding.
+    CFI_ADJUST_CFA_OFFSET(8)
+    call SYMBOL(artReadBarrierSlow) // artReadBarrierSlow(ref, obj, offset)
+    addq LITERAL(8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8)
+    RESTORE_FP_CALLEE_SAVE_FRAME
+    ret
+END_FUNCTION art_quick_read_barrier_slow
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index f4f8eaf..350a0d4 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -109,7 +109,7 @@
             art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.thread_local_pos.
-#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 150 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 151 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
             art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_end.
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index cef2510..3d3f7a1 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -20,6 +20,7 @@
 #include <jni.h>
 
 #include "base/macros.h"
+#include "base/mutex.h"
 #include "offsets.h"
 
 #define QUICK_ENTRYPOINT_OFFSET(ptr_size, x) \
@@ -71,6 +72,16 @@
                            Thread* self)
     NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
 
+// Read barrier entrypoints.
+// Compilers for ARM, ARM64, MIPS, MIPS64 can insert a call to this function directly.
+// For x86 and x86_64, compilers need a wrapper assembly function, to handle mismatch in ABI.
+// This is the read barrier slow path for instance and static fields and reference-type arrays.
+// TODO: Currently the read barrier does not have a fast path for compilers to directly generate.
+// Ideally the slow path should only take one parameter "ref".
+extern "C" mirror::Object* artReadBarrierSlow(mirror::Object* ref, mirror::Object* obj,
+                                              uint32_t offset)
+    SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR;
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_H_
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 60bbf4a..73d8ae7 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -145,7 +145,8 @@
   V(NewStringFromStringBuffer, void) \
   V(NewStringFromStringBuilder, void) \
 \
-  V(ReadBarrierJni, void, mirror::CompressedReference<mirror::Object>*, Thread*)
+  V(ReadBarrierJni, void, mirror::CompressedReference<mirror::Object>*, Thread*) \
+  V(ReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t)
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_
 #undef ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_   // #define is only for lint.
diff --git a/runtime/entrypoints/quick/quick_field_entrypoints.cc b/runtime/entrypoints/quick/quick_field_entrypoints.cc
index 25a943a..0a1d806 100644
--- a/runtime/entrypoints/quick/quick_field_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_field_entrypoints.cc
@@ -557,4 +557,16 @@
   return -1;  // failure
 }
 
+// TODO: Currently the read barrier does not have a fast path. Ideally the slow path should only
+// take one parameter "ref", which is generated by the fast path.
+extern "C" mirror::Object* artReadBarrierSlow(mirror::Object* ref ATTRIBUTE_UNUSED,
+                                              mirror::Object* obj, uint32_t offset) {
+  DCHECK(kUseReadBarrier);
+  uint8_t* raw_addr = reinterpret_cast<uint8_t*>(obj) + offset;
+  mirror::HeapReference<mirror::Object>* ref_addr =
+      reinterpret_cast<mirror::HeapReference<mirror::Object>*>(raw_addr);
+  return ReadBarrier::Barrier<mirror::Object, kWithReadBarrier, true>(obj, MemberOffset(offset),
+                                                                      ref_addr);
+}
+
 }  // namespace art
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index c05c935..f7a3cd5 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -311,8 +311,9 @@
                          sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pNewStringFromStringBuilder, pReadBarrierJni,
                          sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierJni, pReadBarrierSlow, sizeof(void*));
 
-    CHECKED(OFFSETOF_MEMBER(QuickEntryPoints, pReadBarrierJni)
+    CHECKED(OFFSETOF_MEMBER(QuickEntryPoints, pReadBarrierSlow)
             + sizeof(void*) == sizeof(QuickEntryPoints), QuickEntryPoints_all);
   }
 };
diff --git a/runtime/oat.h b/runtime/oat.h
index ee2f3f6..29dd76c 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '7', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '6', '8', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/read_barrier_c.h b/runtime/read_barrier_c.h
index 4f408dd..710c21f 100644
--- a/runtime/read_barrier_c.h
+++ b/runtime/read_barrier_c.h
@@ -47,9 +47,4 @@
 #error "Only one of Baker or Brooks can be enabled at a time."
 #endif
 
-// A placeholder marker to indicate places to add read barriers in the
-// assembly code. This is a development time aid and to be removed
-// after read barriers are added.
-#define THIS_LOAD_REQUIRES_READ_BARRIER
-
 #endif  // ART_RUNTIME_READ_BARRIER_C_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 6949b0b..2b977af 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2304,6 +2304,7 @@
   QUICK_ENTRY_POINT_INFO(pNewStringFromStringBuffer)
   QUICK_ENTRY_POINT_INFO(pNewStringFromStringBuilder)
   QUICK_ENTRY_POINT_INFO(pReadBarrierJni)
+  QUICK_ENTRY_POINT_INFO(pReadBarrierSlow)
 #undef QUICK_ENTRY_POINT_INFO
 
   os << offset;