Revert "Revert "Assembly RegionTLAB allocation fast path for x86_64""
This reverts commit 5ef46671f22640ae1df548aef2fc8f9a61f5ddb4.
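
For context, the re-landed fast path below is standard TLAB bump-pointer
allocation. A minimal C++ sketch of the same logic, with illustrative
types and field names (these are assumptions for readability and do not
match the actual runtime API):

    #include <cstddef>
    #include <cstdint>

    struct Class {                 // stands in for mirror::Class fields
      bool initialized;            // MIRROR_CLASS_STATUS_INITIALIZED
      bool finalizable;            // kAccClassIsFinalizable
      uint32_t object_size;        // MIRROR_CLASS_OBJECT_SIZE_OFFSET
    };

    struct Thread {                // stands in for the TLAB fields
      uint8_t* tlab_pos;           // THREAD_LOCAL_POS_OFFSET
      uint8_t* tlab_end;           // THREAD_LOCAL_END_OFFSET
      uint64_t tlab_objects;       // THREAD_LOCAL_OBJECTS_OFFSET
    };

    static constexpr size_t kAlignMask = 7;  // OBJECT_ALIGNMENT_MASK

    // Returns the object address, or nullptr to fall back to the slow path.
    void* TlabAllocFastPath(Class* klass, Thread* self) {
      if (klass == nullptr || !klass->initialized || klass->finalizable) {
        return nullptr;
      }
      size_t size = klass->object_size;
      // OK to compare before rounding up: tlab_end - tlab_pos is already
      // a multiple of the object alignment.
      if (size > static_cast<size_t>(self->tlab_end - self->tlab_pos)) {
        return nullptr;
      }
      size = (size + kAlignMask) & ~kAlignMask;  // round up to 8 bytes
      uint8_t* addr = self->tlab_pos;
      self->tlab_pos += size;                    // bump the pointer
      self->tlab_objects++;
      // The assembly then stores the class pointer into the object header
      // (no fence needed on x86) and returns the address.
      return addr;
    }

The region TLAB variant additionally passes the loaded class through the
read barrier slow path when concurrent marking is active.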
Change-Id: Ied2ed491f1ab675db300829f2bf259dbb24cdd20
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 26e668e..3553c86 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -894,57 +894,107 @@
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
END_FUNCTION art_quick_alloc_object_rosalloc
-// A handle-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+//
+// RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value.
+// RCX: scratch, R8: Thread::Current().
+MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel)
+ testl %edx, %edx // Check null class
+ jz RAW_VAR(slowPathLabel)
+ // Check class status.
+ cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
+ jne RAW_VAR(slowPathLabel)
+ // No fake dependence is needed on x86
+ // between the status and flags loads:
+ // each load is a load-acquire, so the
+ // loads are not reordered.
+ // Check whether the access flags
+ // include kAccClassIsFinalizable.
+ testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
+ jnz RAW_VAR(slowPathLabel)
+ movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread
+ movq THREAD_LOCAL_END_OFFSET(%r8), %rax // Load thread_local_end.
+ subq THREAD_LOCAL_POS_OFFSET(%r8), %rax // Compute the remaining buffer size.
+ movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx // Load the object size.
+ cmpq %rax, %rcx // Check if it fits. OK to do this
+ // before rounding up the object size,
+ // as the buffer size is aligned.
+ ja RAW_VAR(slowPathLabel)
+ addl LITERAL(OBJECT_ALIGNMENT_MASK), %ecx // Align the size to 8: (size + 7) & ~7.
+ andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %ecx
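+ // E.g. a 13-byte object rounds up to
+ // (13 + 7) & ~7 = 16 bytes.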
+ movq THREAD_LOCAL_POS_OFFSET(%r8), %rax // Load thread_local_pos as the
+ // allocated object's address.
+ addq %rax, %rcx // Add the object size.
+ movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8) // Update thread_local_pos.
+ addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8) // Increment thread_local_objects.
+ // Store the class pointer in the header.
+ // No fence needed for x86.
+ POISON_HEAP_REF edx
+ movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%rax)
+ ret // Fast path succeeded.
+END_MACRO
+
+// The common slow path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name)
+ SETUP_REFS_ONLY_CALLEE_SAVE_FRAME // save ref containing registers for GC
+ // Outgoing argument set up
+ movq %gs:THREAD_SELF_OFFSET, %rdx // pass Thread::Current()
+ call VAR(cxx_name) // cxx_name(arg0, arg1, Thread*)
+ RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address
+ RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
+END_MACRO
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
DEFINE_FUNCTION art_quick_alloc_object_tlab
// Fast path tlab allocation.
// RDI: uint32_t type_idx, RSI: ArtMethod*
// RDX, RCX, R8, R9: free. RAX: return val.
- // TODO: Add read barrier when this function is used.
- // Note this function can/should implement read barrier fast path only
- // (no read barrier slow path) because this is the fast path of tlab allocation.
- // We can fall back to the allocation slow path to do the read barrier slow path.
#if defined(USE_READ_BARRIER)
int3
int3
#endif
// Might need a special macro since rsi and edx are 32b/64b mismatched.
movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array
- // TODO: Add read barrier when this function is used.
// Might need to break this down into multiple instructions to get the base address into a register.
// Load the class
movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
- testl %edx, %edx // Check null class
- jz .Lart_quick_alloc_object_tlab_slow_path
- // Check class status.
- cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx)
- jne .Lart_quick_alloc_object_tlab_slow_path
- // Check access flags has kAccClassIsFinalizable
- testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx)
- jnz .Lart_quick_alloc_object_tlab_slow_path
- movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx // Load the object size.
- addl LITERAL(OBJECT_ALIGNMENT_MASK), %ecx // Align the size by 8. (addr + 7) & ~7.
- andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %ecx
- movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread
- movq THREAD_LOCAL_POS_OFFSET(%r8), %rax // Load thread_local_pos.
- addq %rax, %rcx // Add the object size.
- cmpq THREAD_LOCAL_END_OFFSET(%r8), %rcx // Check if it fits.
- ja .Lart_quick_alloc_object_tlab_slow_path
- movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8) // Update thread_local_pos.
- addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8) // Increment thread_local_objects.
- // Store the class pointer in the header.
- // No fence needed for x86.
- movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%rax)
- ret // Fast path succeeded.
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_tlab_slow_path
.Lart_quick_alloc_object_tlab_slow_path:
- SETUP_REFS_ONLY_CALLEE_SAVE_FRAME // save ref containing registers for GC
- // Outgoing argument set up
- movq %gs:THREAD_SELF_OFFSET, %rdx // pass Thread::Current()
- call SYMBOL(artAllocObjectFromCodeTLAB) // cxx_name(arg0, arg1, Thread*)
- RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME // restore frame up to return address
- RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB
END_FUNCTION art_quick_alloc_object_tlab
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB).
+DEFINE_FUNCTION art_quick_alloc_object_region_tlab
+ // Fast path region tlab allocation.
+ // RDI: uint32_t type_idx, RSI: ArtMethod*
+ // RDX, RCX, R8, R9: free. RAX: return val.
+#if !defined(USE_READ_BARRIER)
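+ // Trap: the region TLAB allocator is
+ // only used with read barriers enabled.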
+ int3
+ int3
+#endif
+ // Might need a special macro since rsi and edx are 32b/64b mismatched.
+ movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array
+ // Might need to break this down into multiple instructions to get the base address into a register.
+ // Load the class
+ movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
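+ // If the GC is concurrently marking,
+ // the class loaded above may be a
+ // from-space reference. Take the slow
+ // path to mark it (Baker read barrier).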
+ cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
+ jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+ ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
+.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+ // The read barrier slow path. Mark the class.
+ PUSH rdi
+ PUSH rsi
+ // Outgoing argument set up
+ movq %rdx, %rdi // Pass the class as the first param.
+ call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj)
+ movq %rax, %rdx
+ POP rsi
+ POP rdi
+ jmp .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+.Lart_quick_alloc_object_region_tlab_slow_path:
+ ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB
+END_FUNCTION art_quick_alloc_object_region_tlab
ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER