Assembly TLAB allocation fast path for x86_64.

TODO: Add fast paths for the resolved/initialized entrypoints and for other architectures.
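
The fast path loads the class from the method's dex cache resolved types
array and falls back to artAllocObjectFromCodeTLAB if the class is null,
not yet initialized, or finalizable. Otherwise it bumps thread_local_pos
by the 8-byte-aligned object size, increments thread_local_objects, and
stores the class pointer (a 32-bit heap reference) into the new object's
header; no fence is needed on x86. Roughly equivalent C++ sketch of the
fast path, using simplified stand-in types (the struct layouts and helper
name below are illustrative, not the real runtime API):

  #include <cstddef>
  #include <cstdint>

  // Simplified stand-ins for the runtime types (illustrative only).
  struct Class {
    uint32_t status;        // fast path requires kStatusInitialized (10)
    uint32_t access_flags;  // bit 0x80000000 = kAccClassIsFinalizable
    uint32_t object_size;   // instance size in bytes
  };

  struct Thread {
    uint8_t* thread_local_pos;    // next free byte in the TLAB
    uint8_t* thread_local_end;    // one past the last usable TLAB byte
    size_t thread_local_objects;  // objects allocated from this TLAB
  };

  // Returns nullptr where the assembly would jump to the slow path.
  inline void* AllocObjectTLABFastPath(Thread* self, Class* klass) {
    if (klass == nullptr) return nullptr;                     // unresolved type
    if (klass->status != 10) return nullptr;                  // not initialized
    if (klass->access_flags & 0x80000000u) return nullptr;    // finalizable
    size_t size = (klass->object_size + 7) & ~size_t(7);      // align to 8 bytes
    uint8_t* pos = self->thread_local_pos;
    if (pos + size > self->thread_local_end) return nullptr;  // TLAB exhausted
    self->thread_local_pos = pos + size;                      // bump the pointer
    ++self->thread_local_objects;
    *reinterpret_cast<Class**>(pos) = klass;                  // class into header
    return pos;                                               // new object address
  }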

Bug: 9986565
Change-Id: If6df3449a3b2f5074d11babdda0fd2791fd54946
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 0f874a4..0629369 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -883,7 +883,44 @@
 GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_bump_pointer_instrumented, BumpPointerInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_bump_pointer_instrumented, BumpPointerInstrumented)
 
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+DEFINE_FUNCTION art_quick_alloc_object_tlab
+    // Fast path tlab allocation.
+    // RDI: uint32_t type_idx, RSI: ArtMethod*
+    // RDX, RCX, R8, R9: free. RAX: return val.
+    movl MIRROR_ART_METHOD_DEX_CACHE_TYPES_OFFSET(%rsi), %edx  // Load dex cache resolved types array
+                                                               // Load the class
+    movl MIRROR_OBJECT_ARRAY_DATA_OFFSET(%rdx, %rdi, MIRROR_OBJECT_ARRAY_COMPONENT_SIZE), %edx
+    testl %edx, %edx                                           // Check null class
+    jz   .Lart_quick_alloc_object_tlab_slow_path
+                                                               // Check class status.
+    cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
+    jne  .Lart_quick_alloc_object_tlab_slow_path
+                                                               // Check if the access flags have kAccClassIsFinalizable set.
+    testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
+    jnz  .Lart_quick_alloc_object_tlab_slow_path
+    movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx           // Load the object size.
+    addl LITERAL(OBJECT_ALIGNMENT_MASK), %ecx                  // Align the size to 8 bytes: (size + 7) & ~7.
+    andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %ecx
+    movq %gs:THREAD_SELF_OFFSET, %r8                           // r8 = thread
+    movq THREAD_LOCAL_POS_OFFSET(%r8), %rax                    // Load thread_local_pos.
+    addq %rax, %rcx                                            // Add the object size.
+    cmpq THREAD_LOCAL_END_OFFSET(%r8), %rcx                    // Check if it fits.
+    ja   .Lart_quick_alloc_object_tlab_slow_path
+    movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8)                    // Update thread_local_pos.
+    addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8)          // Increment thread_local_objects.
+                                                               // Store the class pointer in the header.
+                                                               // No fence needed for x86.
+    movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%rax)
+    ret                                                        // Fast path succeeded.
+.Lart_quick_alloc_object_tlab_slow_path:
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    // save ref containing registers for GC
+    // Outgoing argument set up
+    movq %gs:THREAD_SELF_OFFSET, %rdx    // pass Thread::Current()
+    call SYMBOL(artAllocObjectFromCodeTLAB)  // cxx_name(arg0, arg1, Thread*)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
+    RETURN_IF_RESULT_IS_NON_ZERO         // return or deliver exception
+END_FUNCTION art_quick_alloc_object_tlab
+
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB)
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 92f4ebe..b1dbf6f 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -103,6 +103,16 @@
 ADD_TEST_EQ(THREAD_SELF_OFFSET,
             art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
 
+#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 125 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
+            art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
+#define THREAD_LOCAL_END_OFFSET (THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_END_OFFSET,
+            art::Thread::ThreadLocalEndOffset<__SIZEOF_POINTER__>().Int32Value())
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
+            art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
+
 // Offsets within java.lang.Object.
 #define MIRROR_OBJECT_CLASS_OFFSET 0
 ADD_TEST_EQ(MIRROR_OBJECT_CLASS_OFFSET, art::mirror::Object::ClassOffset().Int32Value())
@@ -120,6 +130,22 @@
 #define MIRROR_CLASS_COMPONENT_TYPE_OFFSET (4 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_CLASS_COMPONENT_TYPE_OFFSET,
             art::mirror::Class::ComponentTypeOffset().Int32Value())
+#define MIRROR_CLASS_ACCESS_FLAGS_OFFSET (52 + MIRROR_OBJECT_HEADER_SIZE)
+ADD_TEST_EQ(MIRROR_CLASS_ACCESS_FLAGS_OFFSET,
+            art::mirror::Class::AccessFlagsOffset().Int32Value())
+#define MIRROR_CLASS_OBJECT_SIZE_OFFSET (80 + MIRROR_OBJECT_HEADER_SIZE)
+ADD_TEST_EQ(MIRROR_CLASS_OBJECT_SIZE_OFFSET,
+            art::mirror::Class::ObjectSizeOffset().Int32Value())
+#define MIRROR_CLASS_STATUS_OFFSET (92 + MIRROR_OBJECT_HEADER_SIZE)
+ADD_TEST_EQ(MIRROR_CLASS_STATUS_OFFSET,
+            art::mirror::Class::StatusOffset().Int32Value())
+
+#define MIRROR_CLASS_STATUS_INITIALIZED 10
+ADD_TEST_EQ(static_cast<uint32_t>(MIRROR_CLASS_STATUS_INITIALIZED),
+            static_cast<uint32_t>(art::mirror::Class::kStatusInitialized))
+#define ACCESS_FLAGS_CLASS_IS_FINALIZABLE 0x80000000
+ADD_TEST_EQ(static_cast<uint32_t>(ACCESS_FLAGS_CLASS_IS_FINALIZABLE),
+            static_cast<uint32_t>(kAccClassIsFinalizable))
 
 // Array offsets.
 #define MIRROR_ARRAY_LENGTH_OFFSET      MIRROR_OBJECT_HEADER_SIZE
@@ -134,6 +160,10 @@
     art::mirror::Array::DataOffset(
         sizeof(art::mirror::HeapReference<art::mirror::Object>)).Int32Value())
 
+#define MIRROR_OBJECT_ARRAY_COMPONENT_SIZE 4
+ADD_TEST_EQ(static_cast<size_t>(MIRROR_OBJECT_ARRAY_COMPONENT_SIZE),
+            sizeof(art::mirror::HeapReference<art::mirror::Object>))
+
 // Offsets within java.lang.String.
 #define MIRROR_STRING_VALUE_OFFSET  MIRROR_OBJECT_HEADER_SIZE
 ADD_TEST_EQ(MIRROR_STRING_VALUE_OFFSET, art::mirror::String::ValueOffset().Int32Value())
@@ -149,6 +179,10 @@
 ADD_TEST_EQ(MIRROR_ART_METHOD_DEX_CACHE_METHODS_OFFSET,
             art::mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value())
 
+#define MIRROR_ART_METHOD_DEX_CACHE_TYPES_OFFSET (8 + MIRROR_OBJECT_HEADER_SIZE)
+ADD_TEST_EQ(MIRROR_ART_METHOD_DEX_CACHE_TYPES_OFFSET,
+            art::mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value())
+
 #define MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32        (36 + MIRROR_OBJECT_HEADER_SIZE)
 ADD_TEST_EQ(MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32,
             art::mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(4).Int32Value())
@@ -178,6 +212,13 @@
 #define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
 ADD_TEST_EQ(LOCK_WORD_THIN_LOCK_COUNT_ONE, static_cast<int32_t>(art::LockWord::kThinLockCountOne))
 
+#define OBJECT_ALIGNMENT_MASK 7
+ADD_TEST_EQ(static_cast<size_t>(OBJECT_ALIGNMENT_MASK), art::kObjectAlignment - 1)
+
+#define OBJECT_ALIGNMENT_MASK_TOGGLED 0xFFFFFFF8
+ADD_TEST_EQ(static_cast<uint32_t>(OBJECT_ALIGNMENT_MASK_TOGGLED),
+            ~static_cast<uint32_t>(art::kObjectAlignment - 1))
+
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 97052f1..c368dc6 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -45,7 +45,7 @@
     bool is_variable_size = IsVariableSize<kVerifyFlags, kReadBarrierOption>();
     CHECK(!is_variable_size) << " class=" << PrettyTypeOf(this);
   }
-  return GetField32(OFFSET_OF_OBJECT_MEMBER(Class, object_size_));
+  return GetField32(ObjectSizeOffset());
 }
 
 inline Class* Class::GetSuperClass() {
@@ -523,7 +523,7 @@
       << " IsArtField=" << (this == ArtField::GetJavaLangReflectArtField())
       << " IsArtMethod=" << (this == ArtMethod::GetJavaLangReflectArtMethod())
       << " descriptor=" << PrettyDescriptor(this);
-  return GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(Class, access_flags_));
+  return GetField32<kVerifyFlags>(AccessFlagsOffset());
 }
 
 inline String* Class::GetName() {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index e7f7c6e..2dff383 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -204,6 +204,9 @@
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   ALWAYS_INLINE uint32_t GetAccessFlags() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static MemberOffset AccessFlagsOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(Class, access_flags_);
+  }
 
   void SetAccessFlags(uint32_t new_access_flags) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -527,6 +530,9 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
            ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   uint32_t GetObjectSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  static MemberOffset ObjectSizeOffset() {
+    return OFFSET_OF_OBJECT_MEMBER(Class, object_size_);
+  }
 
   void SetObjectSize(uint32_t new_object_size) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK(!IsVariableSize());
diff --git a/runtime/thread.h b/runtime/thread.h
index da7af83..9d4d89d 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -573,6 +573,21 @@
         OFFSETOF_MEMBER(tls_ptr_sized_values, suspend_trigger));
   }
 
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalPosOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_pos));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_end));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalObjectsOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_objects));
+  }
+
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
     return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);