/* libc/arch-arm/bionic/memcpy.a15.S - platform/bionic - Git at Google */

 /*
  * Copyright (c) 2013 ARM Ltd
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

     /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */

         /* Use the version of memcpy implemented using LDRD and STRD.
            This version is tuned for Cortex-A15.
            This might not be the best for other ARMv7-A CPUs,
            but there is no predefine to distinguish between
            different CPUs in the same architecture,
            and this version is better than the plain memcpy provided in newlib.

            Therefore, we use this version for all ARMv7-A CPUs.  */

         /* To make the same code compile for both ARM and Thumb instruction
        sets, switch to unified syntax at the beginning of this function.
            However, by using the same code, we may be missing optimization
        opportunities.  For instance, in LDRD/STRD instructions, the first
        destination register must be even and the second consecutive in
        ARM state, but not in Thumb state.  */

 #include <machine/cpu-features.h>
 #include <machine/asm.h>

         .syntax         unified

 ENTRY(memcpy)

        /* Register roles
           (prototype: void *memcpy (void *dst, const void *src, size_t count)):
             r0   dst cursor; the incoming value is pushed below and popped
                  back at `return`, so r0 is also the result.
             r1   src cursor.
             r2   remaining byte count minus a bias that changes from stage
                  to stage (stated at each label).
             r3-r7, ip, lr   scratch.  r4-r7 are saved here and reloaded at
                  `return`; the saved lr is popped straight into pc.  */

        /* Assumes that n >= 0, and dst, src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When less than 8 left, copy a word and then byte by byte.  */

        /* Save registers (r0 holds the return value):
           optimized push {r0, r4, r5, r6, r7, lr}.
           To try and improve performance, stack layout changed,
           i.e., not keeping the stack looking like users expect
           (highest numbered register at highest address).
           Resulting layout, from sp upwards: r6, r7, r4, r5, r0, lr.  */
         .save   {r0, lr}
         push    {r0, lr}
         .save   {r4, r5}
         strd    r4, r5, [sp, #-8]!
         .save   {r6, r7}
         strd    r6, r7, [sp, #-8]!

        /* TODO: Add debug frame directives.
           We don't need exception unwind directives, because the code below
           does not throw any exceptions and does not call any other functions.
           Generally, newlib functions like this lack debug information for
           assembler source.  */

         /* Get copying of tiny blocks out of the way first.  */
         /* Are there at least 4 bytes to copy?  From here on r2 = n - 4.  */
         subs    r2, r2, #4
         blt     copy_less_than_4                 /* If n < 4.  */

         /* Check word alignment.  */
         ands    ip, r0, #3                       /* ip = last 2 bits of dst.  */
         bne     dst_not_word_aligned             /* If dst is not word-aligned.  */

         /* Get here if dst is word-aligned.  */
         ands    ip, r1, #3                      /* ip = last 2 bits of src.  */
         bne     src_not_word_aligned            /* If src is not word-aligned.  */
 word_aligned:
         /* Get here if source and dst both are word-aligned.
            The number of bytes remaining to copy is r2+4.  */

         /* Are there at least 64 bytes to copy?
            After this subtraction r2 = remaining - 64.  */
         subs    r2, r2, #60
         blt     copy_less_than_64                /* If r2 + 4 < 64.  */

         /* First, align the destination buffer to 8-bytes,
            to make sure double loads and stores don't cross cache line boundary,
            as they are then more expensive even if the data is in the cache
            (require two load/store issue cycles instead of one).
            If only one of the buffers is not 8-bytes aligned,
            then it's more important to align dst than src,
            because there is more penalty for stores
            than loads that cross cacheline boundary.
            This check and realignment are only worth doing
            if there is a lot to copy.  */

         /* Get here if dst is word aligned,
            i.e., the 2 least significant bits are 0.
            If dst is not 2w aligned (i.e., bit 2, the 3rd least significant
            bit, is set in dst), then copy 1 word (4 bytes).  */
         ands    r3, r0, #4
         beq     11f                  /* If dst already two-word aligned.  */
         ldr     r3, [r1], #4
         str     r3, [r0], #4
         /* Account for the word just copied; r2 is still remaining - 64.  */
         subs    r2, r2, #4
         blt     copy_less_than_64

 11:
         /* TODO: Align to cacheline (useful for PLD optimization).  */

         /* Every loop iteration copies 64 bytes.
            The .irp block expands to eight LDRD/STRD pairs at fixed offsets;
            both cursors are advanced once, after the whole group.  */
 1:
         .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
         ldrd    r4, r5, [r1, \offset]
         strd    r4, r5, [r0, \offset]
         .endr

         add     r0, r0, #64
         add     r1, r1, #64
         subs    r2, r2, #64
         bge     1b                            /* If there is more to copy.  */

 copy_less_than_64:

         /* Get here if less than 64 bytes to copy, -64 <= r2 < 0
            (r2 = remaining - 64).
            Rebias the count to remaining - 8; a negative result means
            fewer than 8 bytes are left.  */
         adds    r2, r2, #56
         blt     copy_less_than_8

         /* Copy 8 bytes at a time.  */
 2:
         ldrd    r4, r5, [r1], #8
         strd    r4, r5, [r0], #8
         subs    r2, r2, #8
         bge     2b                            /* If there is more to copy.  */

 copy_less_than_8:

         /* Get here if less than 8 bytes to copy, -8 <= r2 < 0
            (r2 = remaining - 8).
            Check if there is more to copy.  */
         cmn     r2, #8
         beq     return                          /* If r2 + 8 == 0.  */

         /* Rebias the count to remaining - 4; a negative result means
            fewer than 4 bytes are left.  */
         adds    r2, r2, #4
         blt     copy_less_than_4

         /* Copy 4 bytes.  r2 is deliberately not adjusted here: the code
            below only looks at the two low bits of r2 + 4, which are the
            same as those of the 0..3 bytes that now remain.  */
         ldr     r3, [r1], #4
         str     r3, [r0], #4

 copy_less_than_4:
         /* Get here if less than 4 bytes to copy.  Normally -4 <= r2 < 0
            (r2 = remaining - 4); on some paths r2 is off by a multiple
            of 4, which is harmless because only bits 0 and 1 of the
            restored count are examined.  */

         /* Restore the count, check if there is more to copy.  */
         adds    r2, r2, #4
         beq     return                          /* If r2 == 0.  */

         /* Get here with the low two bits of r2 in {00,01,10,11}.  */
         /* Logical shift left r2, insert 0s, update flags:
            bit 0 of r2 moves to bit 31 of the result and bit 1 of r2 is
            the last bit shifted out, so it lands in the carry flag.  */
         lsls    r2, r2, #31

         /* Copy byte by byte.
            Condition ne means the result is non-zero, i.e. the last bit of
            r2 was 1 (count is 1 or 3): copy one byte.
            Condition cs means the second to last bit of r2 was set
            (count is 2 or 3): copy two bytes.
            LDRB/STRB do not change the flags, so both IT blocks test the
            flags produced by the LSLS above.  */
         itt     ne
         ldrbne  r3, [r1], #1
         strbne  r3, [r0], #1

         itttt   cs
         ldrbcs  r4, [r1], #1
         ldrbcs  r5, [r1]
         strbcs  r4, [r0], #1
         strbcs  r5, [r0]

 return:
         /* Restore registers: optimized pop {r0, r4, r5, r6, r7, pc}   */
         /* This is the only return point of memcpy.
            Reloads mirror the saves at entry in reverse order; popping the
            saved lr into pc performs the return, with r0 = original dst.  */
         ldrd r6, r7, [sp], #8
         ldrd r4, r5, [sp], #8
         pop {r0, pc}

 #ifndef __ARM_FEATURE_UNALIGNED

        /* The following assembly macro implements misaligned copy in software.
           Assumes that dst is word aligned, src is at offset "pull" bits from
           word, push = 32 - pull, and the number of bytes that remain to copy
           is r2 + 4, r2 >= 0.

           Parameters:
             pull        bit offset of src within its word (8, 16 or 24 at
                         the expansion sites).
             push        32 - pull.
             shiftleft   shift mnemonic applied, by "pull" bits, to the word
                         already held in the shift queue.
             shiftright  shift mnemonic applied, by "push" bits, to the
                         newly loaded word.

           Uses r3-r7 as scratch; r5 is the shift queue (the most recently
           loaded source word).  Every path leaves through `return' or
           `copy_less_than_4'; control never falls out of the expansion.  */

        /* In the code below, r2 is the number of bytes that remain to be
           written (minus a stage-dependent bias).  The number of bytes read
           is always larger, because we have partial words in the shift
           queue.  */

         .macro  miscopy pull push shiftleft shiftright

         /* Align src to the previous word boundary.  */
         bic     r1, r1, #3

         /* Initialize the shift queue.  This reads the whole aligned word
            that contains the first source bytes; r1 then points at the
            next aligned word.  */
         ldr     r5, [r1], #4                   /* Load a word from source.  */

         /* r2 becomes remaining - 8.  */
         subs    r2, r2, #4
         blt     6f          /* Go to misaligned copy of less than 8 bytes.  */

        /* Get here if there are at least 8 bytes to copy.
           The number of bytes to copy is r2+8, r2 >= 0.  */

        /* r2 becomes remaining - 64.  */
        subs     r2, r2, #56
        blt      4f         /* Go to misaligned copy of less than 64 bytes.  */

 3:
        /* Get here if there are at least 64 bytes to copy.
           The number of bytes to copy is r2+64, r2 >= 0.  */

        /* Copy 64 bytes in every iteration.
           Use a partial word from the shift queue.
           Each expansion builds two dst words: r6 from the queued word
           and the first loaded word (r4), r7 from r4 and the second loaded
           word (r5), which becomes the new queue entry.  */
         .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
         mov     r6, r5, \shiftleft #\pull
         ldrd    r4, r5, [r1, \offset]
         orr     r6, r6, r4, \shiftright #\push
         mov     r7, r4, \shiftleft #\pull
         orr     r7, r7, r5, \shiftright #\push
         strd    r6, r7, [r0, \offset]
         .endr

         add     r1, r1, #64
         add     r0, r0, #64
         subs    r2, r2, #64
         bge     3b

 4:
        /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
           and they are misaligned.  */

        /* Rebias the count to remaining - 8; a negative result means fewer
           than 8 bytes are left.  */
         adds    r2, r2, #56

         blt     6f          /* Go to misaligned copy of less than 8 bytes.  */

 5:
         /* Copy 8 bytes at a time.
            Use a partial word from the shift queue.  */
         mov     r6, r5, \shiftleft #\pull
         ldrd    r4, r5, [r1], #8
         orr     r6, r6, r4, \shiftright #\push
         mov     r7, r4, \shiftleft #\pull
         orr     r7, r7, r5, \shiftright #\push
         strd    r6, r7, [r0], #8

         subs    r2, r2, #8
         bge     5b                        /* If there is more to copy.  */

 6:
         /* Get here if there is less than 8 bytes to copy (-8 <= r2 < 0,
            r2 = remaining - 8) and they are misaligned.  */

         /* Check if there is more to copy.  */
         cmn     r2, #8
         beq     return

         /* Check if there is less than 4 bytes to copy.  */
         cmn     r2, #4

         /* r1 points one aligned word past the queued word, so the true
            src position is r1 - 4 + pull/8 = r1 - push/8.
            copy_less_than_4 is entered here with r2 = remaining - 8 rather
            than remaining - 4; that code only examines the two low bits of
            r2 + 4, so the extra -4 does not matter.  */
         itt     lt
         /* Restore src offset from word-align.  */
         sublt   r1, r1, #(\push / 8)
         blt     copy_less_than_4

         /* Use a partial word from the shift queue.  */
         mov     r3, r5, \shiftleft #\pull
         /* Load a word from src, but without writeback
            (this word is not fully written to dst).  */
         ldr     r5, [r1]

         /* Restore src offset from word-align: after the 4 bytes stored
            below, the true src position is r1 + pull/8.  */
         add     r1, r1, #(\pull / 8)

         /* Shift bytes to create one dst word and store it.  */
         orr     r3, r3, r5, \shiftright #\push
         str     r3, [r0], #4

         /* Use single byte copying of the remaining bytes
            (r2 now equals remaining - 4, as copy_less_than_4 expects).  */
         b       copy_less_than_4

         .endm

 #endif /* not __ARM_FEATURE_UNALIGNED  */

 dst_not_word_aligned:

        /* Get here when dst is not aligned and ip has the last 2 bits of dst,
           i.e., ip is the offset of dst from word.
           The number of bytes that remains to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned.  */

        /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill up to align dst to the next
           word.  */
         rsb     ip, ip, #4                        /* ip = #4 - ip.  */
         cmp     ip, #2

        /* Copy byte by byte with conditionals.
           gt (ip == 3) copies the first byte, ge (ip >= 2) the second, and
           the last byte is copied unconditionally, giving ip bytes in total.
           LDRB/STRB leave the flags from the CMP untouched.  */
         itt     gt
         ldrbgt  r3, [r1], #1
         strbgt  r3, [r0], #1

         itt     ge
         ldrbge  r4, [r1], #1
         strbge  r4, [r0], #1

         /* lr is free to use as scratch: the return address was pushed at
            entry and is popped directly into pc at `return'.  */
         ldrb    lr, [r1], #1
         strb    lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied.  */
         subs    r2, r2, ip                        /* r2 = r2 - ip.  */
         blt     copy_less_than_4                  /* If r2 < ip.  */

        /* Get here if there are at least 4 bytes left to copy (r2 >= 0).
           Check if src is aligned.  If beforehand src and dst were not word
           aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting).  */
         ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
         beq     word_aligned                  /* If r1 is word-aligned.  */

 src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remains to copy is r2+4.  */

 #ifdef __ARM_FEATURE_UNALIGNED
        /* Copy word by word using LDR when alignment can be done in hardware,
           i.e., when LDR and STR support unaligned access.
           NOTE(review): this comment used to say "SCTLR.A is set"; in the
           ARMv7-A architecture SCTLR.A = 1 enables alignment faults, so this
           path presumably relies on SCTLR.A being clear -- confirm.  */
         /* r2 becomes remaining - 64.  */
         subs    r2, r2, #60
         blt     8f

 7:
         /* Copy 64 bytes in every loop iteration
            (sixteen LDR/STR pairs at fixed offsets).  */
         .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
         ldr     r3, [r1, \offset]
         str     r3, [r0, \offset]
         .endr

         add     r0, r0, #64
         add     r1, r1, #64
         subs    r2, r2, #64
         bge     7b

 8:
         /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
            Rebias the count to remaining - 4 and check if there are
            more than 3 bytes to copy.  */
         adds    r2, r2, #60
         blt     copy_less_than_4

 9:
        /* Get here if there is less than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2+4.  */
         ldr     r3, [r1], #4
         str     r3, [r0], #4
         subs    r2, r2, #4
         bge     9b

         /* Fewer than 4 bytes remain and r2 = remaining - 4.  */
         b       copy_less_than_4

 #else /* not __ARM_FEATURE_UNALIGNED  */

        /* ip has last 2 bits of src,
           i.e., ip is the offset of src from word, and ip > 0.
           Compute shifts needed to copy from src to dst.  */
         cmp     ip, #2
         beq     miscopy_16_16             /* If ip == 2.  */
         bge     miscopy_24_8              /* If ip == 3.  */

         /* Get here if ip == 1: fall through into the miscopy_8_24
            expansion, which is placed first below.  */

         /* Endian independent macros for shifting bytes within registers.
            Each miscopy expansion ends with an unconditional branch, so the
            three expansions do not run into one another.  */

 /* Little-endian: the bytes still needed from the queued word are moved
    down with LSR and the new word's bytes are moved up with LSL.  */
 #ifndef __ARMEB__
 miscopy_8_24:   miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
 miscopy_16_16:  miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
 miscopy_24_8:   miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
 #else  /* __ARMEB__ (big-endian): shift directions are swapped.  */
 miscopy_8_24:   miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
 miscopy_16_16:  miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
 miscopy_24_8:   miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
 #endif  /* __ARMEB__ */

 #endif  /* not __ARM_FEATURE_UNALIGNED  */

 END(memcpy)
	/*
	* Copyright (c) 2013 ARM Ltd
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 3. The name of the company may not be used to endorse or promote
	* products derived from this software without specific prior written
	* permission.
	*
	* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
	* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	/* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */

	/* Use the version of memcpy implemented using LDRD and STRD.
	This version is tuned for Cortex-A15.
	This might not be the best for other ARMv7-A CPUs,
	but there is no predefine to distinguish between
	different CPUs in the same architecture,
	and this version is better than the plain memcpy provided in newlib.

	Therefore, we use this version for all ARMv7-A CPUs. */

	/* To make the same code compile for both ARM and Thumb instruction
	sets, switch to unified syntax at the beginning of this function.
	However, by using the same code, we may be missing optimization
	opportunities. For instance, in LDRD/STRD instructions, the first
	destination register must be even and the second consecutive in
	ARM state, but not in Thumb state. */

	#include <machine/cpu-features.h>
	#include <machine/asm.h>

	.syntax unified

	ENTRY(memcpy)

	/* NOTE(review): this file already contains an ENTRY(memcpy) ...
	END(memcpy) definition earlier on.  This tab-indented copy appears to
	be a verbatim duplicate; labels such as `return' and
	`copy_less_than_4' are therefore defined twice, which will presumably
	fail at assembly time -- confirm and keep only one copy. */

	/* Register roles
	(prototype: void *memcpy (void *dst, const void *src, size_t count)):
	r0 - dst cursor; the incoming value is pushed below and popped back at
	`return', so r0 is also the result.
	r1 - src cursor.
	r2 - remaining byte count minus a bias that changes from stage to
	stage (stated at each label).
	r3-r7, ip, lr - scratch. r4-r7 are saved here and reloaded at
	`return'; the saved lr is popped straight into pc. */

	/* Assumes that n >= 0, and dst, src are valid pointers.
	If there are at least 8 bytes to copy, use LDRD/STRD.
	If src and dst are misaligned with different offsets,
	first copy byte by byte until dst is aligned,
	and then copy using LDRD/STRD and shift if needed.
	When less than 8 left, copy a word and then byte by byte. */

	/* Save registers (r0 holds the return value):
	optimized push {r0, r4, r5, r6, r7, lr}.
	To try and improve performance, stack layout changed,
	i.e., not keeping the stack looking like users expect
	(highest numbered register at highest address).
	Resulting layout, from sp upwards: r6, r7, r4, r5, r0, lr. */
	.save {r0, lr}
	push {r0, lr}
	.save {r4, r5}
	strd r4, r5, [sp, #-8]!
	.save {r6, r7}
	strd r6, r7, [sp, #-8]!

	/* TODO: Add debug frame directives.
	We don't need exception unwind directives, because the code below
	does not throw any exceptions and does not call any other functions.
	Generally, newlib functions like this lack debug information for
	assembler source. */

	/* Get copying of tiny blocks out of the way first. */
	/* Are there at least 4 bytes to copy? From here on r2 = n - 4. */
	subs r2, r2, #4
	blt copy_less_than_4 /* If n < 4. */

	/* Check word alignment. */
	ands ip, r0, #3 /* ip = last 2 bits of dst. */
	bne dst_not_word_aligned /* If dst is not word-aligned. */

	/* Get here if dst is word-aligned. */
	ands ip, r1, #3 /* ip = last 2 bits of src. */
	bne src_not_word_aligned /* If src is not word-aligned. */
	word_aligned:
	/* Get here if source and dst both are word-aligned.
	The number of bytes remaining to copy is r2+4. */

	/* Are there at least 64 bytes to copy?
	After this subtraction r2 = remaining - 64. */
	subs r2, r2, #60
	blt copy_less_than_64 /* If r2 + 4 < 64. */

	/* First, align the destination buffer to 8-bytes,
	to make sure double loads and stores don't cross cache line boundary,
	as they are then more expensive even if the data is in the cache
	(require two load/store issue cycles instead of one).
	If only one of the buffers is not 8-bytes aligned,
	then it's more important to align dst than src,
	because there is more penalty for stores
	than loads that cross cacheline boundary.
	This check and realignment are only worth doing
	if there is a lot to copy. */

	/* Get here if dst is word aligned,
	i.e., the 2 least significant bits are 0.
	If dst is not 2w aligned (i.e., bit 2, the 3rd least significant
	bit, is set in dst), then copy 1 word (4 bytes). */
	ands r3, r0, #4
	beq 11f /* If dst already two-word aligned. */
	ldr r3, [r1], #4
	str r3, [r0], #4
	/* Account for the word just copied; r2 is still remaining - 64. */
	subs r2, r2, #4
	blt copy_less_than_64

	11:
	/* TODO: Align to cacheline (useful for PLD optimization). */

	/* Every loop iteration copies 64 bytes.
	The .irp block expands to eight LDRD/STRD pairs at fixed offsets;
	both cursors are advanced once, after the whole group. */
	1:
	.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
	ldrd r4, r5, [r1, \offset]
	strd r4, r5, [r0, \offset]
	.endr

	add r0, r0, #64
	add r1, r1, #64
	subs r2, r2, #64
	bge 1b /* If there is more to copy. */

	copy_less_than_64:

	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0
	(r2 = remaining - 64).
	Rebias the count to remaining - 8; a negative result means
	fewer than 8 bytes are left. */
	adds r2, r2, #56
	blt copy_less_than_8

	/* Copy 8 bytes at a time. */
	2:
	ldrd r4, r5, [r1], #8
	strd r4, r5, [r0], #8
	subs r2, r2, #8
	bge 2b /* If there is more to copy. */

	copy_less_than_8:

	/* Get here if less than 8 bytes to copy, -8 <= r2 < 0
	(r2 = remaining - 8).
	Check if there is more to copy. */
	cmn r2, #8
	beq return /* If r2 + 8 == 0. */

	/* Rebias the count to remaining - 4; a negative result means
	fewer than 4 bytes are left. */
	adds r2, r2, #4
	blt copy_less_than_4

	/* Copy 4 bytes. r2 is deliberately not adjusted here: the code
	below only looks at the two low bits of r2 + 4, which are the
	same as those of the 0..3 bytes that now remain. */
	ldr r3, [r1], #4
	str r3, [r0], #4

	copy_less_than_4:
	/* Get here if less than 4 bytes to copy. Normally -4 <= r2 < 0
	(r2 = remaining - 4); on some paths r2 is off by a multiple
	of 4, which is harmless because only bits 0 and 1 of the
	restored count are examined. */

	/* Restore the count, check if there is more to copy. */
	adds r2, r2, #4
	beq return /* If r2 == 0. */

	/* Get here with the low two bits of r2 in {00,01,10,11}. */
	/* Logical shift left r2, insert 0s, update flags:
	bit 0 of r2 moves to bit 31 of the result and bit 1 of r2 is
	the last bit shifted out, so it lands in the carry flag. */
	lsls r2, r2, #31

	/* Copy byte by byte.
	Condition ne means the result is non-zero, i.e. the last bit of
	r2 was 1 (count is 1 or 3): copy one byte.
	Condition cs means the second to last bit of r2 was set
	(count is 2 or 3): copy two bytes.
	LDRB/STRB do not change the flags, so both IT blocks test the
	flags produced by the LSLS above. */
	itt ne
	ldrbne r3, [r1], #1
	strbne r3, [r0], #1

	itttt cs
	ldrbcs r4, [r1], #1
	ldrbcs r5, [r1]
	strbcs r4, [r0], #1
	strbcs r5, [r0]

	return:
	/* Restore registers: optimized pop {r0, r4, r5, r6, r7, pc} */
	/* This is the only return point of memcpy.
	Reloads mirror the saves at entry in reverse order; popping the
	saved lr into pc performs the return, with r0 = original dst. */
	ldrd r6, r7, [sp], #8
	ldrd r4, r5, [sp], #8
	pop {r0, pc}

	#ifndef __ARM_FEATURE_UNALIGNED

	/* The following assembly macro implements misaligned copy in software.
	Assumes that dst is word aligned, src is at offset "pull" bits from
	word, push = 32 - pull, and the number of bytes that remain to copy
	is r2 + 4, r2 >= 0.

	Parameters:
	pull - bit offset of src within its word (8, 16 or 24 at the
	expansion sites).
	push - 32 - pull.
	shiftleft - shift mnemonic applied, by "pull" bits, to the word
	already held in the shift queue.
	shiftright - shift mnemonic applied, by "push" bits, to the newly
	loaded word.

	Uses r3-r7 as scratch; r5 is the shift queue (the most recently
	loaded source word). Every path leaves through `return' or
	`copy_less_than_4'; control never falls out of the expansion. */

	/* In the code below, r2 is the number of bytes that remain to be
	written (minus a stage-dependent bias). The number of bytes read is
	always larger, because we have partial words in the shift queue. */

	.macro miscopy pull push shiftleft shiftright

	/* Align src to the previous word boundary. */
	bic r1, r1, #3

	/* Initialize the shift queue. This reads the whole aligned word
	that contains the first source bytes; r1 then points at the next
	aligned word. */
	ldr r5, [r1], #4 /* Load a word from source. */

	/* r2 becomes remaining - 8. */
	subs r2, r2, #4
	blt 6f /* Go to misaligned copy of less than 8 bytes. */

	/* Get here if there are at least 8 bytes to copy.
	The number of bytes to copy is r2+8, r2 >= 0. */

	/* r2 becomes remaining - 64. */
	subs r2, r2, #56
	blt 4f /* Go to misaligned copy of less than 64 bytes. */

	3:
	/* Get here if there are at least 64 bytes to copy.
	The number of bytes to copy is r2+64, r2 >= 0. */

	/* Copy 64 bytes in every iteration.
	Use a partial word from the shift queue.
	Each expansion builds two dst words: r6 from the queued word and
	the first loaded word (r4), r7 from r4 and the second loaded word
	(r5), which becomes the new queue entry. */
	.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
	mov r6, r5, \shiftleft #\pull
	ldrd r4, r5, [r1, \offset]
	orr r6, r6, r4, \shiftright #\push
	mov r7, r4, \shiftleft #\pull
	orr r7, r7, r5, \shiftright #\push
	strd r6, r7, [r0, \offset]
	.endr

	add r1, r1, #64
	add r0, r0, #64
	subs r2, r2, #64
	bge 3b

	4:
	/* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
	and they are misaligned. */

	/* Rebias the count to remaining - 8; a negative result means fewer
	than 8 bytes are left. */
	adds r2, r2, #56

	blt 6f /* Go to misaligned copy of less than 8 bytes. */

	5:
	/* Copy 8 bytes at a time.
	Use a partial word from the shift queue. */
	mov r6, r5, \shiftleft #\pull
	ldrd r4, r5, [r1], #8
	orr r6, r6, r4, \shiftright #\push
	mov r7, r4, \shiftleft #\pull
	orr r7, r7, r5, \shiftright #\push
	strd r6, r7, [r0], #8

	subs r2, r2, #8
	bge 5b /* If there is more to copy. */

	6:
	/* Get here if there is less than 8 bytes to copy (-8 <= r2 < 0,
	r2 = remaining - 8) and they are misaligned. */

	/* Check if there is more to copy. */
	cmn r2, #8
	beq return

	/* Check if there is less than 4 bytes to copy. */
	cmn r2, #4

	/* r1 points one aligned word past the queued word, so the true
	src position is r1 - 4 + pull/8 = r1 - push/8.
	copy_less_than_4 is entered here with r2 = remaining - 8 rather
	than remaining - 4; that code only examines the two low bits of
	r2 + 4, so the extra -4 does not matter. */
	itt lt
	/* Restore src offset from word-align. */
	sublt r1, r1, #(\push / 8)
	blt copy_less_than_4

	/* Use a partial word from the shift queue. */
	mov r3, r5, \shiftleft #\pull
	/* Load a word from src, but without writeback
	(this word is not fully written to dst). */
	ldr r5, [r1]

	/* Restore src offset from word-align: after the 4 bytes stored
	below, the true src position is r1 + pull/8. */
	add r1, r1, #(\pull / 8)

	/* Shift bytes to create one dst word and store it. */
	orr r3, r3, r5, \shiftright #\push
	str r3, [r0], #4

	/* Use single byte copying of the remaining bytes
	(r2 now equals remaining - 4, as copy_less_than_4 expects). */
	b copy_less_than_4

	.endm

	#endif /* not __ARM_FEATURE_UNALIGNED */

	dst_not_word_aligned:

	/* Get here when dst is not aligned and ip has the last 2 bits of dst,
	i.e., ip is the offset of dst from word.
	The number of bytes that remains to copy is r2 + 4,
	i.e., there are at least 4 bytes to copy.
	Write a partial word (1 to 3 bytes), such that dst becomes
	word-aligned. */

	/* If dst is at ip bytes offset from a word (with 0 < ip < 4),
	then there are (4 - ip) bytes to fill up to align dst to the next
	word. */
	rsb ip, ip, #4 /* ip = #4 - ip. */
	cmp ip, #2

	/* Copy byte by byte with conditionals.
	gt (ip == 3) copies the first byte, ge (ip >= 2) the second, and
	the last byte is copied unconditionally, giving ip bytes in total.
	LDRB/STRB leave the flags from the CMP untouched. */
	itt gt
	ldrbgt r3, [r1], #1
	strbgt r3, [r0], #1

	itt ge
	ldrbge r4, [r1], #1
	strbge r4, [r0], #1

	/* lr is free to use as scratch: the return address was pushed at
	entry and is popped directly into pc at `return'. */
	ldrb lr, [r1], #1
	strb lr, [r0], #1

	/* Update the count.
	ip holds the number of bytes we have just copied. */
	subs r2, r2, ip /* r2 = r2 - ip. */
	blt copy_less_than_4 /* If r2 < ip. */

	/* Get here if there are at least 4 bytes left to copy (r2 >= 0).
	Check if src is aligned. If beforehand src and dst were not word
	aligned but congruent (same offset), then now they are both
	word-aligned, and we can copy the rest efficiently (without
	shifting). */
	ands ip, r1, #3 /* ip = last 2 bits of src. */
	beq word_aligned /* If r1 is word-aligned. */

	src_not_word_aligned:
	/* Get here when src is not word-aligned, but dst is word-aligned.
	The number of bytes that remains to copy is r2+4. */

	#ifdef __ARM_FEATURE_UNALIGNED
	/* Copy word by word using LDR when alignment can be done in hardware,
	i.e., when LDR and STR support unaligned access.
	NOTE(review): this comment used to say "SCTLR.A is set"; in the
	ARMv7-A architecture SCTLR.A = 1 enables alignment faults, so this
	path presumably relies on SCTLR.A being clear -- confirm. */
	/* r2 becomes remaining - 64. */
	subs r2, r2, #60
	blt 8f

	7:
	/* Copy 64 bytes in every loop iteration
	(sixteen LDR/STR pairs at fixed offsets). */
	.irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
	ldr r3, [r1, \offset]
	str r3, [r0, \offset]
	.endr

	add r0, r0, #64
	add r1, r1, #64
	subs r2, r2, #64
	bge 7b

	8:
	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
	Rebias the count to remaining - 4 and check if there are
	more than 3 bytes to copy. */
	adds r2, r2, #60
	blt copy_less_than_4

	9:
	/* Get here if there is less than 64 but at least 4 bytes to copy,
	where the number of bytes to copy is r2+4. */
	ldr r3, [r1], #4
	str r3, [r0], #4
	subs r2, r2, #4
	bge 9b

	/* Fewer than 4 bytes remain and r2 = remaining - 4. */
	b copy_less_than_4

	#else /* not __ARM_FEATURE_UNALIGNED */

	/* ip has last 2 bits of src,
	i.e., ip is the offset of src from word, and ip > 0.
	Compute shifts needed to copy from src to dst. */
	cmp ip, #2
	beq miscopy_16_16 /* If ip == 2. */
	bge miscopy_24_8 /* If ip == 3. */

	/* Get here if ip == 1: fall through into the miscopy_8_24
	expansion, which is placed first below. */

	/* Endian independent macros for shifting bytes within registers.
	Each miscopy expansion ends with an unconditional branch, so the
	three expansions do not run into one another. */

	/* Little-endian: the bytes still needed from the queued word are
	moved down with LSR and the new word's bytes are moved up with LSL. */
	#ifndef __ARMEB__
	miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
	miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
	miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
	#else /* __ARMEB__ (big-endian): shift directions are swapped. */
	miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
	miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
	miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
	#endif /* __ARMEB__ */

	#endif /* not __ARM_FEATURE_UNALIGNED */

	END(memcpy)