Add forwarding address checks for X86, arm, arm64

Added the check to the READ_BARRIER_MARK_REG entrypoints for arm, arm64
and x86. x86_64 already had it; there only the label is renamed and the
comment clarified.
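
The fast path now returns early in two cases: the mark bit is set, or
the lock word state is a forwarding address (both state bits set), in
which case the forwarding address itself is returned. arm and arm64
detect the forwarding state by inverting the lock word and testing the
two state bits; x86 and x86_64 add
LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW and branch on the carry.
Rough C++ sketch of the logic the assembly implements (constants are
the asm_support ones; GetLockWord32 and MarkSlowPath are illustrative
stand-ins, not runtime code):

  mirror::Object* ReadBarrierMarkFastPath(mirror::Object* obj) {
    uint32_t lw = GetLockWord32(obj);  // load MIRROR_OBJECT_LOCK_WORD_OFFSET
    if ((lw & LOCK_WORD_MARK_BIT_MASK_SHIFTED) != 0) {
      return obj;  // Already marked: return the reference unchanged.
    }
    if ((lw >> LOCK_WORD_STATE_SHIFT) == LOCK_WORD_STATE_FORWARDING_ADDRESS) {
      // Both state bits are set: shifting the 32-bit lock word left by the
      // forwarding address shift drops them and recovers the address.
      return reinterpret_cast<mirror::Object*>(
          lw << LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT);
    }
    return MarkSlowPath(obj);  // Save caller-save registers, call the runtime.
  }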

Bug: 30162165

Test: test-art-host, test-art-target

Change-Id: I15cf0d51ed3d22fa401e80ffac3877d61593527c
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index bf70c55..0135260 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1999,11 +1999,17 @@
     // Check lock word for mark bit, if marked return. Use IP for scratch since it is blocked.
     ldr ip, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET]
     tst ip, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
-    beq .Lslow_rb_\name
+    beq .Lnot_marked_rb_\name
     // Already marked, return right away.
 .Lret_rb_\name:
     bx lr
 
+.Lnot_marked_rb_\name:
+    // Check if both lock word state bits (the top two bits) are set: that is the forwarding
+    // address state. After the mvn below, the tst is eq exactly in that case.
+    mvn ip, ip
+    tst ip, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
+    beq .Lret_forwarding_address\name
+
 .Lslow_rb_\name:
     // Save IP: the kSaveEverything entrypoint art_quick_resolve_string makes a tail call here.
     push  {r0-r4, r9, ip, lr}           @ save return address, core caller-save registers and ip
@@ -2064,6 +2070,12 @@
     .cfi_restore ip
     .cfi_restore lr
     bx lr
+.Lret_forwarding_address\name:
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
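+    // ip was inverted by the check above; invert it back to recover the original lock word.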
+    mvn ip, ip
+    lsl \reg, ip, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    bx lr
 END \name
 .endm
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 483cee3..d806715 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2539,10 +2539,17 @@
      */
     // Use wIP0 as temp and check the mark bit of the reference. wIP0 is not used by the compiler.
     ldr   wIP0, [\xreg, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_rb_\name
+    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lnot_marked_rb_\name
 .Lret_rb_\name:
     ret
+.Lnot_marked_rb_\name:
+    // Check if the top two (state) bits are both one; if so, the lock word holds a forwarding
+    // address. After the mvn below, bits 31:30 are zero exactly in that case.
+    mvn wIP0, wIP0
+    cmp wzr, wIP0, lsr #30
+    b.eq .Lret_forwarding_address\name
 .Lslow_rb_\name:
+    // We must not clobber IP0 since art_quick_resolve_string makes a tail call here and relies on
+    // IP0 being restored.
     // Save all potentially live caller-save core registers.
     SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 368
     SAVE_TWO_REGS  x2,  x3, 16
@@ -2608,6 +2615,12 @@
     RESTORE_REG xLR, 360
     DECREASE_FRAME 368
     ret
+.Lret_forwarding_address\name:
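+    // wIP0 was inverted by the check above; invert it back to recover the original lock word.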
+    mvn wIP0, wIP0
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    lsl \wreg, wIP0, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    ret
 END \name
 .endm
 
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index f4f9a68..98739d3 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -2155,8 +2155,15 @@
     jz .Lslow_rb_\name
     ret
 .Lslow_rb_\name:
-    // Save all potentially live caller-save core registers.
     PUSH eax
+    mov MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg)), %eax
+    add LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), %eax
+    // Jump if the add caused eax to unsigned overflow. The only case where it overflows is the
+    // forwarding address one.
+    // Taken ~25% of the time.
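+    // A forwarding address lock word has both state bits set, so its value is at least
+    // 0xC0000000; adding 0x40000000 then wraps past 2^32 and sets CF. No other state sets CF.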
+    jnae .Lret_forwarding_address\name
+
+    // Save all potentially live caller-save core registers.
+    mov 0(%esp), %eax
     PUSH ecx
     PUSH edx
     PUSH ebx
@@ -2204,6 +2211,12 @@
     POP_REG_NE eax, RAW_VAR(reg)
 .Lret_rb_\name:
     ret
+.Lret_forwarding_address\name:
+    // The overflow cleared the top bits.
+    sall LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), %eax
+    mov %eax, REG_VAR(reg)
+    POP_REG_NE eax, RAW_VAR(reg)
+    ret
     END_FUNCTION VAR(name)
 END_MACRO
 
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index fc549ec..185e55e 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2279,9 +2279,10 @@
     PUSH rax
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg)), %eax
     addl LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), %eax
-    // Jump if overflow, the only case where it overflows should be the forwarding address one.
+    // Jump if the addl caused eax to unsigned overflow. The only case where it overflows is the
+    // forwarding address one.
     // Taken ~25% of the time.
-    jnae .Lret_overflow\name
+    jnae .Lret_forwarding_address\name
 
     // Save all potentially live caller-save core registers.
     movq 0(%rsp), %rax
@@ -2349,7 +2350,7 @@
     POP_REG_NE rax, RAW_VAR(reg)
 .Lret_rb_\name:
     ret
-.Lret_overflow\name:
+.Lret_forwarding_address\name:
     // The overflow cleared the top bits.
     sall LITERAL(LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT), %eax
     movq %rax, REG_VAR(reg)
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 6b6a12d..7c64952 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -149,8 +149,6 @@
 
 inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
   mirror::Object* ret;
-  // TODO: Delete GetMarkBit check when all of the callers properly check the bit. Remaining caller
-  // is array allocations.
   if (from_ref == nullptr) {
     return from_ref;
   }
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index f938c9f..2c95fe9 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -98,6 +98,8 @@
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
 #define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_STATE_FORWARDING_ADDRESS 0x3
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS), (static_cast<uint32_t>(art::LockWord::kStateForwardingAddress)))
 #define LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW 0x40000000
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW), (static_cast<uint32_t>(art::LockWord::kStateForwardingAddressOverflow)))
 #define LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT 0x3
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index dea301c..2f2565b 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -265,6 +265,9 @@
         static_cast<uint64_t>(kStateForwardingAddressOverflow);
     constexpr bool is_larger = overflow > static_cast<uint64_t>(0xFFFFFFFF);
     static_assert(is_larger, "should have overflowed");
+    static_assert(
+        (~kStateForwardingAddress & kStateMask) == 0,
+        "READ_BARRIER_MARK_REG relies on the forwarding address state being all ones");
     CheckReadBarrierState();
   }
 
diff --git a/tools/cpp-define-generator/constant_lockword.def b/tools/cpp-define-generator/constant_lockword.def
index f9b6b19..08d5885 100644
--- a/tools/cpp-define-generator/constant_lockword.def
+++ b/tools/cpp-define-generator/constant_lockword.def
@@ -30,6 +30,7 @@
 DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK_TOGGLED, uint32_t, kReadBarrierStateMaskShiftedToggled)
 DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE,       int32_t,  kThinLockCountOne)
 
+DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS, uint32_t, kStateForwardingAddress)
 DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_OVERFLOW, uint32_t, kStateForwardingAddressOverflow)
 DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_SHIFT, uint32_t, kForwardingAddressShift)