/* libc/arch-arm/bionic/memcmp.S - platform/bionic - Git at Google */

 /*
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 #include <machine/cpu-features.h>
 #include <machine/asm.h>

 /*
  * Optimized memcmp() for ARM9.
  * This would not be optimal on XScale or ARM11, where more prefetching
  * and use of PLD will be needed.
  * The 2 major optimizations here are
  * (1) The main (congruent) loop compares 32 bytes at a time
  * (2) The loads are scheduled in a way they won't stall
  */

 /*
  * memcmp: compare two byte buffers.
  *
  * In:    r0 = first buffer, r1 = second buffer, r2 = length in bytes
  * Out:   r0 = 0 when the pointers are identical, the length is 0, or all
  *        bytes match; otherwise (first byte - second byte) at the first
  *        mismatching position
  * Regs:  r4 = cursor into the first buffer, r3/ip/lr = scratch,
  *        r5-r7 = shift amounts and scratch in the offset 1/3 path.
  *        r4 and lr are pushed on entry; r5-r7 are pushed only around the
  *        offset 1/3 path.
  *
  * Numeric local labels are reused several times; every reference binds
  * to the nearest definition in the given direction (b = backward,
  * f = forward), so the order of the code below matters.
  */
 ENTRY(memcmp)
         PLD         (r0, #0)
         PLD         (r1, #0)

         /* take care of the case where length is 0 or the buffers are the same */
         cmp         r0, r1
         cmpne       r2, #0
         moveq       r0, #0
         bxeq        lr

         .save {r4, lr}
         /* save registers */
         stmfd       sp!, {r4, lr}

         PLD         (r0, #32)
         PLD         (r1, #32)

         /* since r0 holds the result, move the first source
          * pointer somewhere else
          */
         mov         r4, r0

         /* make sure we have at least 8+4 bytes, this simplifies things below
          * and avoids some overhead for small blocks
          */
         cmp         r2, #(8+4)
         bmi         8f

         /* align first pointer to word boundary
          * offset = -src & 3
          */
         rsb         r3, r4, #0
         ands        r3, r3, #3
         beq         0f

         /* align first pointer: compare r3 (1..3) bytes one at a time.
          * r2 is reduced up front, so it stays correct even when we leave
          * early through 9f with the byte difference in r0.
          */
         sub         r2, r2, r3
 1:      ldrb        r0, [r4], #1
         ldrb        ip, [r1], #1
         subs        r0, r0, ip
         bne         9f
         subs        r3, r3, #1
         bne         1b


 0:      /* here the first pointer is aligned, and we have at least 4 bytes
          * to process.
          */

         /* see if the pointers are congruent (same offset within a word) */
         eor         r0, r4, r1
         ands        r0, r0, #3
         bne         5f

         /* congruent case, 32 bytes per iteration
          * We need to make sure there are at least 32+4 bytes left
          * because we effectively read ahead one word, and we could
          * read past the buffer (and segfault) if we're not careful.
          */

         ldr         ip, [r1]
         subs        r2, r2, #(32 + 4)
         bmi         1f

         /* ip and lr alternate as the read-ahead word of the second buffer.
          * Every step after the first is conditional on EQ: once a word
          * differs, the rest of the iteration is skipped and the final bne
          * leaves the loop.
          */
 0:      PLD         (r4, #64)
         PLD         (r1, #64)
         ldr         r0, [r4], #4
         ldr         lr, [r1, #4]!
         eors        r0, r0, ip
         ldreq       r0, [r4], #4
         ldreq       ip, [r1, #4]!
         eoreqs      r0, r0, lr
         ldreq       r0, [r4], #4
         ldreq       lr, [r1, #4]!
         eoreqs      r0, r0, ip
         ldreq       r0, [r4], #4
         ldreq       ip, [r1, #4]!
         eoreqs      r0, r0, lr
         ldreq       r0, [r4], #4
         ldreq       lr, [r1, #4]!
         eoreqs      r0, r0, ip
         ldreq       r0, [r4], #4
         ldreq       ip, [r1, #4]!
         eoreqs      r0, r0, lr
         ldreq       r0, [r4], #4
         ldreq       lr, [r1, #4]!
         eoreqs      r0, r0, ip
         ldreq       r0, [r4], #4
         ldreq       ip, [r1, #4]!
         eoreqs      r0, r0, lr
         bne         2f
         subs        r2, r2, #32
         bhs         0b

         /* do we have at least 4 bytes left?
          * r2 is biased by -(32+4) here; adding 32 leaves it biased by -4.
          */
 1:      adds        r2, r2, #(32 - 4 + 4)
         bmi         4f

         /* finish off 4 bytes at a time */
 3:      ldr         r0, [r4], #4
         ldr         ip, [r1], #4
         eors        r0, r0, ip
         bne         2f
         subs        r2, r2, #4
         bhs         3b

         /* are we done? (undo the -4 bias; r2 = bytes still to compare) */
 4:      adds        r2, r2, #4
         moveq       r0, #0
         beq         9f

         /* finish off the remaining bytes */
         b           8f

 2:      /* the last 4 bytes are different, restart them byte by byte so
          * that r0 ends up holding the difference of the first bad byte
          */
         sub         r4, r4, #4
         sub         r1, r1, #4
         mov         r2, #4

         /* process the last few bytes (r2 must be non-zero on entry) */
 8:      ldrb        r0, [r4], #1
         ldrb        ip, [r1], #1
         // stall
         subs        r0, r0, ip
         bne         9f
         subs        r2, r2, #1
         bne         8b

 9:      /* restore registers and return; r0 holds the result */
         ldmfd       sp!, {r4, lr}
         bx          lr


 5:      /*************** non-congruent case ***************/
         /* r4 is word aligned, r1 is not; r0 = offset of r1 within its word */
         and         r0, r1, #3
         cmp         r0, #2
         bne         4f

         /* here, offset is 2 (16-bits aligned, special cased) */

         /* make sure we have at least 16 bytes to process */
         subs        r2, r2, #16
         addmi       r2, r2, #16
         bmi         8b

         /* align the unaligned pointer */
         bic         r1, r1, #3
         ldr         lr, [r1], #4

         /* lr carries the previously loaded aligned word of the second
          * buffer; each word of the first buffer is compared against
          * (previous >> 16) | (next << 16). 16 bytes per iteration.
          */
 6:      PLD         (r1, #64)
         PLD         (r4, #64)
         mov         ip, lr, lsr #16
         ldr         lr, [r1], #4
         ldr         r0, [r4], #4
         orr         ip, ip, lr, lsl #16
         eors        r0, r0, ip
         moveq       ip, lr, lsr #16
         ldreq       lr, [r1], #4
         ldreq       r0, [r4], #4
         orreq       ip, ip, lr, lsl #16
         eoreqs      r0, r0, ip
         moveq       ip, lr, lsr #16
         ldreq       lr, [r1], #4
         ldreq       r0, [r4], #4
         orreq       ip, ip, lr, lsl #16
         eoreqs      r0, r0, ip
         moveq       ip, lr, lsr #16
         ldreq       lr, [r1], #4
         ldreq       r0, [r4], #4
         orreq       ip, ip, lr, lsl #16
         eoreqs      r0, r0, ip
         bne         7f
         subs        r2, r2, #16
         bhs         6b
         /* r1 was read one aligned word ahead; step back to the real
          * (offset 2) byte position
          */
         sub         r1, r1, #2
         /* are we done? */
         adds        r2, r2, #16
         moveq       r0, #0
         beq         9b
         /* finish off the remaining bytes */
         b           8b

 7:      /* fix up the 2 pointers and redo the mismatching word bytewise */
         sub         r1, r1, #(4+2)
         sub         r4, r4, #4
         mov         r2, #4
         b           8b


 4:      /*************** offset is 1 or 3 (less optimized) ***************/

         stmfd       sp!, {r5, r6, r7}

         // r5 = rhs
         // r6 = lhs
         // r7 = scratch

         mov         r5, r0, lsl #3      /* r5 = right shift = offset * 8 */
         rsb         r6, r5, #32         /* r6 = left shift = 32 - r5 */

         /* align the unaligned pointer */
         bic         r1, r1, #3
         ldr         r7, [r1], #4
         sub         r2, r2, #8

         /* r7 carries the previously loaded aligned word; 8 bytes per
          * iteration, second word compared only if the first matched
          */
 6:      mov         ip, r7, lsr r5
         ldr         r7, [r1], #4
         ldr         r0, [r4], #4
         orr         ip, ip, r7, lsl r6
         eors        r0, r0, ip
         moveq       ip, r7, lsr r5
         ldreq       r7, [r1], #4
         ldreq       r0, [r4], #4
         orreq       ip, ip, r7, lsl r6
         eoreqs      r0, r0, ip
         bne         7f
         subs        r2, r2, #8
         bhs         6b

         /* r1 was read one aligned word ahead; r6 / 8 = 4 - offset bytes
          * brings it back to the real byte position
          */
         sub         r1, r1, r6, lsr #3
         ldmfd       sp!, {r5, r6, r7}

         /* are we done? */
         adds        r2, r2, #8
         moveq       r0, #0
         beq         9b

         /* finish off the remaining bytes */
         b           8b

 7:      /* fix up the 2 pointers and redo the mismatching word bytewise */
         sub         r1, r1, #4
         sub         r1, r1, r6, lsr #3
         sub         r4, r4, #4
         mov         r2, #4
         ldmfd       sp!, {r5, r6, r7}
         b           8b
 /* END() is placed after the last instruction so that the out-of-line
  * non-congruent paths above belong to memcmp as well (assumes the macro
  * from <machine/asm.h> closes the region opened by ENTRY(); its
  * definition is not visible in this file).
  */
 END(memcmp)
	/*
	* Copyright (C) 2008 The Android Open Source Project
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	#include <machine/cpu-features.h>
	#include <machine/asm.h>

	/*
	* Optimized memcmp() for ARM9.
	* This would not be optimal on XScale or ARM11, where more prefetching
	* and use of PLD will be needed.
	* The 2 major optimizations here are
	* (1) The main (congruent) loop compares 32 bytes at a time
	* (2) The loads are scheduled in a way they won't stall
	*/

	/*
	* memcmp: compare two byte buffers.
	*
	* In:   r0 = first buffer, r1 = second buffer, r2 = length in bytes
	* Out:  r0 = 0 when the pointers are identical, the length is 0, or
	*       all bytes match; otherwise (first byte - second byte) at the
	*       first mismatching position
	* Regs: r4 = cursor into the first buffer, r3/ip/lr = scratch,
	*       r5-r7 = shift amounts and scratch in the offset 1/3 path.
	*       r4 and lr are pushed on entry; r5-r7 are pushed only around
	*       the offset 1/3 path.
	*
	* Numeric local labels are reused several times; every reference
	* binds to the nearest definition in the given direction
	* (b = backward, f = forward), so the order of the code matters.
	*/
	ENTRY(memcmp)
	PLD (r0, #0)
	PLD (r1, #0)

	/* take care of the case where length is 0 or the buffers are the same */
	cmp r0, r1
	cmpne r2, #0
	moveq r0, #0
	bxeq lr

	.save {r4, lr}
	/* save registers */
	stmfd sp!, {r4, lr}

	PLD (r0, #32)
	PLD (r1, #32)

	/* since r0 holds the result, move the first source
	* pointer somewhere else
	*/

	mov r4, r0

	/* make sure we have at least 8+4 bytes, this simplifies things below
	* and avoids some overhead for small blocks
	*/
	cmp r2, #(8+4)
	bmi 8f

	/* align first pointer to word boundary
	* offset = -src & 3
	*/
	rsb r3, r4, #0
	ands r3, r3, #3
	beq 0f

	/* align first pointer: compare r3 (1..3) bytes one at a time.
	* r2 is reduced up front, so it stays correct even when we leave
	* early through 9f with the byte difference in r0.
	*/
	sub r2, r2, r3
	1: ldrb r0, [r4], #1
	ldrb ip, [r1], #1
	subs r0, r0, ip
	bne 9f
	subs r3, r3, #1
	bne 1b


	0: /* here the first pointer is aligned, and we have at least 4 bytes
	* to process.
	*/

	/* see if the pointers are congruent (same offset within a word) */
	eor r0, r4, r1
	ands r0, r0, #3
	bne 5f

	/* congruent case, 32 bytes per iteration
	* We need to make sure there are at least 32+4 bytes left
	* because we effectively read ahead one word, and we could
	* read past the buffer (and segfault) if we're not careful.
	*/

	ldr ip, [r1]
	subs r2, r2, #(32 + 4)
	bmi 1f

	/* ip and lr alternate as the read-ahead word of the second buffer.
	* Every step after the first is conditional on EQ: once a word
	* differs, the rest of the iteration is skipped and the final bne
	* leaves the loop.
	*/
	0: PLD (r4, #64)
	PLD (r1, #64)
	ldr r0, [r4], #4
	ldr lr, [r1, #4]!
	eors r0, r0, ip
	ldreq r0, [r4], #4
	ldreq ip, [r1, #4]!
	eoreqs r0, r0, lr
	ldreq r0, [r4], #4
	ldreq lr, [r1, #4]!
	eoreqs r0, r0, ip
	ldreq r0, [r4], #4
	ldreq ip, [r1, #4]!
	eoreqs r0, r0, lr
	ldreq r0, [r4], #4
	ldreq lr, [r1, #4]!
	eoreqs r0, r0, ip
	ldreq r0, [r4], #4
	ldreq ip, [r1, #4]!
	eoreqs r0, r0, lr
	ldreq r0, [r4], #4
	ldreq lr, [r1, #4]!
	eoreqs r0, r0, ip
	ldreq r0, [r4], #4
	ldreq ip, [r1, #4]!
	eoreqs r0, r0, lr
	bne 2f
	subs r2, r2, #32
	bhs 0b

	/* do we have at least 4 bytes left?
	* r2 is biased by -(32+4) here; adding 32 leaves it biased by -4.
	*/
	1: adds r2, r2, #(32 - 4 + 4)
	bmi 4f

	/* finish off 4 bytes at a time */
	3: ldr r0, [r4], #4
	ldr ip, [r1], #4
	eors r0, r0, ip
	bne 2f
	subs r2, r2, #4
	bhs 3b

	/* are we done? (undo the -4 bias; r2 = bytes still to compare) */
	4: adds r2, r2, #4
	moveq r0, #0
	beq 9f

	/* finish off the remaining bytes */
	b 8f

	2: /* the last 4 bytes are different, restart them byte by byte so
	* that r0 ends up holding the difference of the first bad byte
	*/
	sub r4, r4, #4
	sub r1, r1, #4
	mov r2, #4

	/* process the last few bytes (r2 must be non-zero on entry) */
	8: ldrb r0, [r4], #1
	ldrb ip, [r1], #1
	// stall
	subs r0, r0, ip
	bne 9f
	subs r2, r2, #1
	bne 8b

	9: /* restore registers and return; r0 holds the result */
	ldmfd sp!, {r4, lr}
	bx lr


	5: /************* non-congruent case *************/
	/* r4 is word aligned, r1 is not; r0 = offset of r1 within its word */
	and r0, r1, #3
	cmp r0, #2
	bne 4f

	/* here, offset is 2 (16-bits aligned, special cased) */

	/* make sure we have at least 16 bytes to process */
	subs r2, r2, #16
	addmi r2, r2, #16
	bmi 8b

	/* align the unaligned pointer */
	bic r1, r1, #3
	ldr lr, [r1], #4

	/* lr carries the previously loaded aligned word of the second
	* buffer; each word of the first buffer is compared against
	* (previous >> 16) | (next << 16). 16 bytes per iteration.
	*/
	6: PLD (r1, #64)
	PLD (r4, #64)
	mov ip, lr, lsr #16
	ldr lr, [r1], #4
	ldr r0, [r4], #4
	orr ip, ip, lr, lsl #16
	eors r0, r0, ip
	moveq ip, lr, lsr #16
	ldreq lr, [r1], #4
	ldreq r0, [r4], #4
	orreq ip, ip, lr, lsl #16
	eoreqs r0, r0, ip
	moveq ip, lr, lsr #16
	ldreq lr, [r1], #4
	ldreq r0, [r4], #4
	orreq ip, ip, lr, lsl #16
	eoreqs r0, r0, ip
	moveq ip, lr, lsr #16
	ldreq lr, [r1], #4
	ldreq r0, [r4], #4
	orreq ip, ip, lr, lsl #16
	eoreqs r0, r0, ip
	bne 7f
	subs r2, r2, #16
	bhs 6b
	/* r1 was read one aligned word ahead; step back to the real
	* (offset 2) byte position
	*/
	sub r1, r1, #2
	/* are we done? */
	adds r2, r2, #16
	moveq r0, #0
	beq 9b
	/* finish off the remaining bytes */
	b 8b

	7: /* fix up the 2 pointers and redo the mismatching word bytewise */
	sub r1, r1, #(4+2)
	sub r4, r4, #4
	mov r2, #4
	b 8b


	4: /************* offset is 1 or 3 (less optimized) *************/

	stmfd sp!, {r5, r6, r7}

	// r5 = rhs
	// r6 = lhs
	// r7 = scratch

	mov r5, r0, lsl #3 /* r5 = right shift = offset * 8 */
	rsb r6, r5, #32 /* r6 = left shift = 32 - r5 */

	/* align the unaligned pointer */
	bic r1, r1, #3
	ldr r7, [r1], #4
	sub r2, r2, #8

	/* r7 carries the previously loaded aligned word; 8 bytes per
	* iteration, second word compared only if the first matched
	*/
	6: mov ip, r7, lsr r5
	ldr r7, [r1], #4
	ldr r0, [r4], #4
	orr ip, ip, r7, lsl r6
	eors r0, r0, ip
	moveq ip, r7, lsr r5
	ldreq r7, [r1], #4
	ldreq r0, [r4], #4
	orreq ip, ip, r7, lsl r6
	eoreqs r0, r0, ip
	bne 7f
	subs r2, r2, #8
	bhs 6b

	/* r1 was read one aligned word ahead; r6 / 8 = 4 - offset bytes
	* brings it back to the real byte position
	*/
	sub r1, r1, r6, lsr #3
	ldmfd sp!, {r5, r6, r7}

	/* are we done? */
	adds r2, r2, #8
	moveq r0, #0
	beq 9b

	/* finish off the remaining bytes */
	b 8b

	7: /* fix up the 2 pointers and redo the mismatching word bytewise */
	sub r1, r1, #4
	sub r1, r1, r6, lsr #3
	sub r4, r4, #4
	mov r2, #4
	ldmfd sp!, {r5, r6, r7}
	b 8b
	/* END() is placed after the last instruction so that the out-of-line
	* non-congruent paths above belong to memcmp as well (assumes the
	* macro from <machine/asm.h> closes the region opened by ENTRY();
	* its definition is not visible in this file).
	*/
	END(memcmp)