#
# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
# TODO-AARCH64
# NOTE WELL! The _Copy functions are called directly
# from server-compiler-generated code via CallLeafNoFP,
# which means that they *must* either not use floating
# point or use it in the same manner as does the server
# compiler.
.globl _Copy_conjoint_bytes
.type _Copy_conjoint_bytes, %function
.globl _Copy_arrayof_conjoint_bytes
.type _Copy_arrayof_conjoint_bytes, %function
.globl _Copy_disjoint_words
.type _Copy_disjoint_words, %function
.globl _Copy_conjoint_words
.type _Copy_conjoint_words, %function
.globl _Copy_conjoint_jshorts_atomic
.type _Copy_conjoint_jshorts_atomic, %function
.globl _Copy_arrayof_conjoint_jshorts
.type _Copy_arrayof_conjoint_jshorts, %function
.globl _Copy_conjoint_jints_atomic
.type _Copy_conjoint_jints_atomic, %function
.globl _Copy_arrayof_conjoint_jints
.type _Copy_arrayof_conjoint_jints, %function
.globl _Copy_conjoint_jlongs_atomic
.type _Copy_conjoint_jlongs_atomic, %function
.globl _Copy_arrayof_conjoint_jlongs
.type _Copy_arrayof_conjoint_jlongs, %function
.text
.globl SpinPause
.type SpinPause, %function
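# yield is an architectural hint and may execute as a NOP on cores that
# do not implement it; it tells the core this thread is spin-waiting.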
SpinPause:
yield
ret
# Support for void Copy::conjoint_bytes(void* from,
# void* to,
# size_t count)
_Copy_conjoint_bytes:
hlt 1002
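# Editor's note: the hlt guards in this file (1002, 1003, 1007, 1009) mark
# entry points that are expected to be unreachable on this port; if one is
# ever reached, it traps immediately with a distinct immediate.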
# Support for void Copy::arrayof_conjoint_bytes(void* from,
# void* to,
# size_t count)
_Copy_arrayof_conjoint_bytes:
hlt 1003
# Support for void Copy::disjoint_words(void* from,
# void* to,
# size_t count)
_Copy_disjoint_words:
# This and the following memory prefetches may touch addresses outside the array ranges.
# Experiments showed that prefetching inaccessible memory does not result in exceptions
# (prfm is an architectural hint and never faults).
prfm pldl1keep, [x0, #0]
prfm pstl1keep, [x1, #0]
prfm pldl1keep, [x0, #64]
prfm pstl1keep, [x1, #64]
subs x18, x2, #128
b.ge dw_large
dw_lt_128:
# Copy [x0, x0 + x2) to [x1, x1 + x2)
adr x15, dw_tail_table_base
and x16, x2, #~8
# Calculate address to jump and store it to x15:
# Each pair of instructions before dw_tail_table_base copies 16 bytes.
# x16 is the byte count aligned down to 16 (x2 is a multiple of wordSize,
# so clearing bit 3 is enough). Hence x16/16 pairs of instructions must execute.
# Each pair occupies 8 bytes of code, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
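# Worked example (editor's sketch, x2 = 40): x16 = 40 & ~8 = 32, so
# x15 = dw_tail_table_base - 16, i.e. only the last two ldp/stp pairs run,
# copying bytes [x17 - 32, x17); the tbz below first copies the odd word,
# since bit 3 of 40 is set. Note that x15/x16 are safely reused as data
# registers in the final pair, as the br has already consumed them.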
sub x15, x15, x16, lsr #1
prfm plil1keep, [x15]
add x17, x0, x2
add x18, x1, x2
# If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
# Otherwise x2 = x16, so proceed to copy x16 bytes.
tbz x2, #3, dw_lt_128_even
ldr x3, [x0]
str x3, [x1]
dw_lt_128_even:
# Copy [x17 - x16, x17) to [x18 - x16, x18)
# x16 is aligned by 16 and less than 128
# Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
br x15
ldp x3, x4, [x17, #-112]
stp x3, x4, [x18, #-112]
ldp x5, x6, [x17, #-96]
stp x5, x6, [x18, #-96]
ldp x7, x8, [x17, #-80]
stp x7, x8, [x18, #-80]
ldp x9, x10, [x17, #-64]
stp x9, x10, [x18, #-64]
ldp x11, x12, [x17, #-48]
stp x11, x12, [x18, #-48]
ldp x13, x14, [x17, #-32]
stp x13, x14, [x18, #-32]
ldp x15, x16, [x17, #-16]
stp x15, x16, [x18, #-16]
dw_tail_table_base:
ret
.p2align 6
.rept 12
nop
.endr
dw_large:
# x18 >= 0;
# Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
ldp x3, x4, [x0], #64
ldp x5, x6, [x0, #-48]
ldp x7, x8, [x0, #-32]
ldp x9, x10, [x0, #-16]
# Before and after each iteration of the loop, registers x3-x10 hold the
# bytes at [x0 - 64, x0) and x1 is the destination for this data;
# x18 contains the number of bytes still to be stored, minus 128.
# Exactly 16 instructions from the p2align above, so dw_loop starts on a
# cache line boundary; this is checked explicitly by padding with
# "hlt 1000" instructions (0xd4407d00), which trap if ever executed.
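# Rough C equivalent of the pipelined loop below (editor's sketch;
# load64/store64 are illustrative names for the four ldp/stp pairs):
#   do {                       // 64 bytes already buffered in x3-x10
#       x18 -= 64;
#       store64(x1, buf);      // drain the previous block
#       load64(buf, x0);       // fetch the next block
#       x0 += 64; x1 += 64;
#   } while ((intptr_t)x18 >= 0);   // epilogue at dw_loop_end drains buf
# Interleaving each stp with the next ldp hides load latency.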
.p2alignl 6, 0xd4407d00
dw_loop:
prfm pldl1keep, [x0, #64]
# The prefetch below actually hurt memory copy performance (for the interpreter) - JDK-8078120
# prfm pstl1keep, [x1, #64]
subs x18, x18, #64
stp x3, x4, [x1, #0]
ldp x3, x4, [x0, #0]
stp x5, x6, [x1, #16]
ldp x5, x6, [x0, #16]
stp x7, x8, [x1, #32]
ldp x7, x8, [x0, #32]
stp x9, x10, [x1, #48]
ldp x9, x10, [x0, #48]
add x1, x1, #64
add x0, x0, #64
b.ge dw_loop
# 13 instructions from dw_loop, so the loop body fits into one cache line
dw_loop_end:
adds x2, x18, #64
stp x3, x4, [x1], #64
stp x5, x6, [x1, #-48]
stp x7, x8, [x1, #-32]
stp x9, x10, [x1, #-16]
# x18 was increased by 64, but 64 bytes were also stored, so x2 now holds
# the exact number of bytes left to store; if it is not zero, copy them
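# Worked example (editor's sketch, x2 = 200 on entry): x18 = 72 after subs;
#   pass 1: x18 = 8,   store bytes [0, 64),  load [64, 128)
#   pass 2: x18 = -56, store [64, 128),      load [128, 192), exit loop
#   epilogue: x2 = -56 + 64 = 8, store [128, 192)
#   b.ne then lets dw_lt_128 copy the final 8 bytes [192, 200)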
b.ne dw_lt_128
ret
# Support for void Copy::conjoint_words(void* from,
# void* to,
# size_t count)
_Copy_conjoint_words:
subs x3, x1, x0
# hi condition is met <=> from < to
ccmp x2, x3, #0, hi
# hi condition is met <=> (from < to) and (to - from < count)
# otherwise _Copy_disjoint_words can be used: it copies forward,
# so it is also correct when the ranges overlap but to <= from
b.ls _Copy_disjoint_words
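# Equivalent C for the test above (editor's sketch, count in bytes):
#   if (!(to > from && (size_t)(to - from) < count))
#       return _Copy_disjoint_words(from, to, count);  // forward copy is safe
# The ccmp compares count with to - from only when "hi" (to > from)
# already holds; otherwise it forces the flags to 0, so "ls" is taken.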
# The overlapping case should be the rare one; it is not worth optimizing
ands x3, x2, #~8
# x3 is count aligned down by 2*wordSize
add x0, x0, x2
add x1, x1, x2
sub x3, x3, #16
# Skip loop if 0 or 1 words
b.eq cw_backward_loop_end
# x3 >= 0
# Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
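# Example (editor's sketch, x2 = 40): x3 = 32 - 16 = 16 at this point;
#   pass 1: x3 = 0,   copy 16 bytes backward
#   pass 2: x3 = -16, copy 16 bytes backward, exit loop
# and the tbz below copies one last word, since bit 3 of 40 is set.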
cw_backward_loop:
subs x3, x3, #16
ldp x4, x5, [x0, #-16]!
stp x4, x5, [x1, #-16]!
b.ge cw_backward_loop
cw_backward_loop_end:
# Copy remaining 0 or 1 words
tbz x2, #3, cw_finish
ldr x3, [x0, #-8]
str x3, [x1, #-8]
cw_finish:
ret
# Support for void Copy::conjoint_jshorts_atomic(void* from,
# void* to,
# size_t count)
_Copy_conjoint_jshorts_atomic:
add x17, x0, x2
add x18, x1, x2
subs x3, x1, x0
# hi condition is met <=> from < to
ccmp x2, x3, #0, hi
# hi condition is met <=> (from < to) and (to - from < count)
b.hi cs_backward
subs x3, x2, #14
b.ge cs_forward_loop
# Copy x2 < 14 bytes from x0 to x1
cs_forward_lt14:
ands x7, x2, #7
tbz x2, #3, cs_forward_lt8
ldrh w3, [x0, #0]
ldrh w4, [x0, #2]
ldrh w5, [x0, #4]
ldrh w6, [x0, #6]
strh w3, [x1, #0]
strh w4, [x1, #2]
strh w5, [x1, #4]
strh w6, [x1, #6]
# Copy x7 < 8 bytes from x17 - x7 to x18 - x7
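# Dispatch sketch (editor's note): x7 = x2 & 7 is always even here,
# since jshort counts keep x2 a multiple of 2, so x7 is 0, 2, 4 or 6:
#   x7 == 0: b.eq (flags still set by the ands above) returns directly
#   x7 == 2: b.lt reaches cs_forward_2, copying one halfword
#   x7 == 4: b.eq reaches cs_forward_4, copying two halfwords
#   x7 == 6: falls through cs_forward_6, copying all three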
cs_forward_lt8:
b.eq cs_forward_0
cmp x7, #4
b.lt cs_forward_2
b.eq cs_forward_4
cs_forward_6:
ldrh w3, [x17, #-6]
strh w3, [x18, #-6]
cs_forward_4:
ldrh w4, [x17, #-4]
strh w4, [x18, #-4]
cs_forward_2:
ldrh w5, [x17, #-2]
strh w5, [x18, #-2]
cs_forward_0:
ret
# Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
# x3 >= 0
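# Loop accounting (editor's example, x2 = 30 bytes): x3 starts at 16;
#   pass 1: x3 = 2,   copy 14 bytes
#   pass 2: x3 = -12, copy 14 bytes, exit loop
#   adds: x2 = -12 + 14 = 2, so cs_forward_lt14 copies the last halfword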
.p2align 6
cs_forward_loop:
subs x3, x3, #14
ldrh w4, [x0], #14
ldrh w5, [x0, #-12]
ldrh w6, [x0, #-10]
ldrh w7, [x0, #-8]
ldrh w8, [x0, #-6]
ldrh w9, [x0, #-4]
ldrh w10, [x0, #-2]
strh w4, [x1], #14
strh w5, [x1, #-12]
strh w6, [x1, #-10]
strh w7, [x1, #-8]
strh w8, [x1, #-6]
strh w9, [x1, #-4]
strh w10, [x1, #-2]
b.ge cs_forward_loop
# Exactly 16 instructions from cs_forward_loop, so the loop fits into one cache line
adds x2, x3, #14
# x2 bytes should be copied from x0 to x1
b.ne cs_forward_lt14
ret
# Very similar to forward copying
cs_backward:
subs x3, x2, #14
b.ge cs_backward_loop
cs_backward_lt14:
ands x7, x2, #7
tbz x2, #3, cs_backward_lt8
ldrh w3, [x17, #-8]
ldrh w4, [x17, #-6]
ldrh w5, [x17, #-4]
ldrh w6, [x17, #-2]
strh w3, [x18, #-8]
strh w4, [x18, #-6]
strh w5, [x18, #-4]
strh w6, [x18, #-2]
cs_backward_lt8:
b.eq cs_backward_0
cmp x7, #4
b.lt cs_backward_2
b.eq cs_backward_4
cs_backward_6:
ldrh w3, [x0, #4]
strh w3, [x1, #4]
cs_backward_4:
ldrh w4, [x0, #2]
strh w4, [x1, #2]
cs_backward_2:
ldrh w5, [x0, #0]
strh w5, [x1, #0]
cs_backward_0:
ret
.p2align 6
cs_backward_loop:
subs x3, x3, #14
ldrh w4, [x17, #-14]!
ldrh w5, [x17, #2]
ldrh w6, [x17, #4]
ldrh w7, [x17, #6]
ldrh w8, [x17, #8]
ldrh w9, [x17, #10]
ldrh w10, [x17, #12]
strh w4, [x18, #-14]!
strh w5, [x18, #2]
strh w6, [x18, #4]
strh w7, [x18, #6]
strh w8, [x18, #8]
strh w9, [x18, #10]
strh w10, [x18, #12]
b.ge cs_backward_loop
adds x2, x3, #14
b.ne cs_backward_lt14
ret
# Support for void Copy::arrayof_conjoint_jshorts(void* from,
# void* to,
# size_t count)
_Copy_arrayof_conjoint_jshorts:
hlt 1007
# Support for void Copy::conjoint_jlongs_atomic(jlong* from,
# jlong* to,
# size_t count)
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
hlt 1009
# Support for void Copy::conjoint_jints_atomic(void* from,
# void* to,
# size_t count)
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
# This and the following memory prefetches may touch addresses outside the array ranges.
# Experiments showed that prefetching inaccessible memory does not result in exceptions.
prfm pldl1keep, [x0, #0]
prfm pstl1keep, [x1, #0]
prfm pldl1keep, [x0, #32]
prfm pstl1keep, [x1, #32]
subs x3, x1, x0
# hi condition is met <=> from < to
ccmp x2, x3, #0, hi
# hi condition is met <=> (from < to) and (to - from < count)
b.hi ci_backward
subs x18, x2, #64
b.ge ci_forward_large
ci_forward_lt_64:
# Copy [x0, x0 + x2) to [x1, x1 + x2)
adr x15, ci_forward_tail_table_base
and x16, x2, #~4
# Calculate address to jump and store it to x15:
# Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
# x16 is the byte count aligned down to 8 (x2 is a multiple of jintSize,
# so clearing bit 2 is enough). Hence x16/8 pairs of instructions must execute.
# Each pair occupies 8 bytes of code, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
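# Same jump-table scheme as dw_lt_128 above, with jint-sized steps;
# e.g. (editor's sketch) x2 = 20 gives x16 = 16 and x15 = base - 16,
# so the last two ldp/stp pairs run after tbz copies the odd jint.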
sub x15, x15, x16
prfm plil1keep, [x15]
add x17, x0, x2
add x18, x1, x2
# If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
# Otherwise x2 = x16, so proceed to copy x16 bytes.
tbz x2, #2, ci_forward_lt_64_even
ldr w3, [x0]
str w3, [x1]
ci_forward_lt_64_even:
# Copy [x17 - x16, x17) to [x18 - x16, x18)
# x16 is aligned by 8 and less than 64
# Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
br x15
ldp w3, w4, [x17, #-56]
stp w3, w4, [x18, #-56]
ldp w5, w6, [x17, #-48]
stp w5, w6, [x18, #-48]
ldp w7, w8, [x17, #-40]
stp w7, w8, [x18, #-40]
ldp w9, w10, [x17, #-32]
stp w9, w10, [x18, #-32]
ldp w11, w12, [x17, #-24]
stp w11, w12, [x18, #-24]
ldp w13, w14, [x17, #-16]
stp w13, w14, [x18, #-16]
ldp w15, w16, [x17, #-8]
stp w15, w16, [x18, #-8]
ci_forward_tail_table_base:
ret
.p2align 6
.rept 12
nop
.endr
ci_forward_large:
# x18 >= 0;
# Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
ldp w3, w4, [x0], #32
ldp w5, w6, [x0, #-24]
ldp w7, w8, [x0, #-16]
ldp w9, w10, [x0, #-8]
# Before and after each iteration of the loop, registers w3-w10 hold the
# bytes at [x0 - 32, x0) and x1 is the destination for this data;
# x18 contains the number of bytes still to be stored, minus 64.
# Exactly 16 instructions from the p2align above, so ci_forward_loop starts
# on a cache line boundary; this is checked explicitly by padding with
# "hlt 1000" instructions (0xd4407d00), which trap if ever executed.
.p2alignl 6, 0xd4407d00
ci_forward_loop:
prfm pldl1keep, [x0, #32]
prfm pstl1keep, [x1, #32]
subs x18, x18, #32
stp w3, w4, [x1, #0]
ldp w3, w4, [x0, #0]
stp w5, w6, [x1, #8]
ldp w5, w6, [x0, #8]
stp w7, w8, [x1, #16]
ldp w7, w8, [x0, #16]
stp w9, w10, [x1, #24]
ldp w9, w10, [x0, #24]
add x1, x1, #32
add x0, x0, #32
b.ge ci_forward_loop
# 14 instructions from ci_forward_loop, so the loop body fits into one cache line
ci_forward_loop_end:
adds x2, x18, #32
stp w3, w4, [x1], #32
stp w5, w6, [x1, #-24]
stp w7, w8, [x1, #-16]
stp w9, w10, [x1, #-8]
# x18 was increased by 32, but 32 bytes were also stored, so x2 now holds
# the exact number of bytes left to store; if it is not zero, copy them
b.ne ci_forward_lt_64
ret
ci_backward:
# The overlapping case should be the rare one; it is not worth optimizing
ands x3, x2, #~4
# x3 is count aligned down by 2*jintSize
add x0, x0, x2
add x1, x1, x2
sub x3, x3, #8
# Skip loop if 0 or 1 jints
b.eq ci_backward_loop_end
# x3 >= 0
# Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
ci_backward_loop:
subs x3, x3, #8
ldp w4, w5, [x0, #-8]!
stp w4, w5, [x1, #-8]!
b.ge ci_backward_loop
ci_backward_loop_end:
# Copy remaining 0 or 1 jints
tbz x2, #2, ci_backward_finish
ldr w3, [x0, #-4]
str w3, [x1, #-4]
ci_backward_finish:
ret