// blob: 2629251e430a8eea9df6a4370d0843bd6b5827c5 (source-browser page header, kept as a comment so the file assembles)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
// Helper names for x-form (indexed) loads in BE ordering.
//
// _LDBEX, _LWBEX and _LHBEX load a doubleword, word and halfword
// respectively. On ppc64le they map to the byte-reversed loads and on
// big-endian ppc64 to the plain loads, so that on both targets the first
// byte in memory ends up as the most significant byte of the loaded value.
// The scalar paths of cmpbody rely on this to order byte strings with a
// single unsigned register compare (CMPU).
#ifdef GOARCH_ppc64le
#define _LDBEX MOVDBR
#define _LWBEX MOVWBR
#define _LHBEX MOVHBR
#else
#define _LDBEX MOVD
#define _LWBEX MOVW
#define _LHBEX MOVH
#endif
// SETB_* helpers turn the outcome of a compare held in a CR field into
// an integer in rout: -1 for "less than", 0 for "equal", 1 otherwise.
//
// SETB_INIT()       loads the constants the emulation needs (no-op on POWER9).
// SETB_CR0(rout)    rout = -1/0/1 from CR0.
// SETB_CR1(rout)    rout = -1/0/1 from CR1.
// SETB_CR0_NE(rout) rout = -1/1 from CR0; only valid when CR0 is known
//                   not to be "equal".
#ifdef GOPPC64_power9
// POWER9 and later: use the SETB instruction directly, no setup required.
#define SETB_CR0(rout) SETB CR0, rout
#define SETB_CR1(rout) SETB CR1, rout
#define SETB_INIT()
#define SETB_CR0_NE(rout) SETB_CR0(rout)
#else
// A helper macro to emulate SETB on P8. This assumes
// -1 is in R20, and 1 is in R21. crxlt and crxeq must
// also be the same CR field.
// The first ISEL yields 0 when equal and 1 otherwise; the second
// overrides that with -1 when less than.
#define _SETB(crxlt, crxeq, rout) \
ISEL crxeq,R0,R21,rout \
ISEL crxlt,R20,rout,rout
// A special case when it is known the comparison
// will always be not equal. The result must be -1 or 1.
// A single ISEL suffices because the "equal" case cannot occur.
#define SETB_CR0_NE(rout) \
ISEL CR0LT,R20,R21,rout
#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
// SETB_INIT must run before any other SETB_* macro on this path:
// it places -1 in R20 and 1 in R21.
#define SETB_INIT() \
MOVD $-1,R20 \
MOVD $1,R21
#endif
// Compare is the register-ABI entry point for comparing two byte slices.
// It computes the result that applies when the common prefix is equal
// (-1, 0 or 1 from comparing the lengths), returns it immediately when
// both slices start at the same address, and otherwise tail-branches to
// cmpbody to compare min(len(a),len(b)) bytes.
// The result is returned in R3.
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// incoming:
// R3 a addr
// R4 a len
// R6 b addr
// R7 b len
//
// on entry to cmpbody:
// R3 return value if len(a) == len(b)
// R5 a addr
// R6 b addr
// R9 min(len(a),len(b))
SETB_INIT()
MOVD R3,R5 // save a addr; R3 is reused for the result below
CMP R4,R7,CR0 // CR0 = len(a) vs len(b)
CMP R3,R6,CR7 // CR7 = a addr vs b addr
ISEL CR0LT,R4,R7,R9 // R9 = len(a) < len(b) ? len(a) : len(b)
SETB_CR0(R3) // R3 = -1/0/1 from the length comparison
BC $12,30,LR // beqlr cr7: same start address, lengths alone decide
BR cmpbody<>(SB) // tail branch; cmpbody returns to our caller
// cmpstring is the register-ABI entry point for comparing two strings.
// It shuffles the incoming arguments into the registers cmpbody expects,
// precomputes the result that applies when the common prefix is equal
// (-1, 0 or 1 from comparing the lengths), returns it immediately when
// both strings start at the same address, and otherwise tail-branches to
// cmpbody. The result is returned in R3.
//
// The instruction order matters: R5 and R6 are each both an incoming
// argument and a cmpbody input, so every source is read before it is
// overwritten.
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// incoming:
// R3 a addr -> R5
// R4 a len -> R3
// R5 b addr -> R6
// R6 b len -> R4
//
// on entry to cmpbody:
// R3 compare value if compared length is same.
// R5 a addr
// R6 b addr
// R9 min(len(a),len(b))
SETB_INIT()
CMP R4,R6,CR0 // CR0 = len(a) vs len(b)
CMP R3,R5,CR7 // CR7 = a addr vs b addr
ISEL CR0LT,R4,R6,R9 // R9 = len(a) < len(b) ? len(a) : len(b); last use of b len in R6
MOVD R5,R6 // R6 = b addr
MOVD R3,R5 // R5 = a addr
SETB_CR0(R3) // R3 = -1/0/1 from the length comparison
BC $12,30,LR // beqlr cr7: same start address, lengths alone decide
BR cmpbody<>(SB) // tail branch; cmpbody returns to our caller
#ifdef GOARCH_ppc64le
// byteswap is a 16-byte read-only permute control table, used only on
// ppc64le. The "different" path of cmpbody loads it into SWAP and uses it
// with VPERM to reorder the bytes of the mismatching 16-byte chunks
// before they are moved to GPRs and compared.
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
// SWAP names the vector register that holds the byteswap table.
#define SWAP V21
#endif
// cmpbody compares the first R9 bytes of a and b and leaves the result
// in R3. It is reached by a tail branch from ·Compare and
// runtime·cmpstring, so its RET returns to their caller.
//
// Inputs:
// R3 result to return when the first R9 bytes are all equal
// R5 a addr
// R6 b addr
// R9 number of bytes to compare, min(len(a),len(b))
// On targets without GOPPC64_power9 the SETB_* macros also read
// R20 (-1) and R21 (1), which the entry points load via SETB_INIT().
//
// Output:
// R3 -1 or 1 decided by the first differing bytes (bytes are compared
// as unsigned values in memory order), otherwise the incoming R3.
//
// Registers written in this body: R3-R6, R9-R12, R14, R16, CTR,
// CR0-CR3, CR6, V1, V3, V4, and SWAP (V21) on ppc64le.
//
// Every path other than the 64-byte loop handles a length range with at
// most two loads per operand: one at offset 0 and one ending exactly at
// the last byte. The two loads may overlap; re-comparing bytes that are
// already known to be equal does not change the outcome.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
start:
// Classify the length once; CR0/CR1/CR2 select the size-specific path.
CMP R9,$16,CR0
CMP R9,$32,CR1
CMP R9,$64,CR2
MOVD $16,R10 // R10 = 16, index of the second 16-byte chunk in the vector paths
BLT cmp8 // len < 16
BLT CR1,cmp16 // 16 <= len < 32
BLT CR2,cmp32 // 32 <= len < 64
cmp64: // >= 64B
DCBT (R5) // optimize for size>=64
DCBT (R6) // cache hint
SRD $6,R9,R14 // There is at least one iteration.
MOVD R14,CTR // CTR = len / 64 loop iterations
ANDCC $63,R9,R9 // R9 = tail length (len % 64); CR0 EQ when there is no tail
CMP R9,$16,CR1 // Do setup for tail check early on.
CMP R9,$32,CR2
CMP R9,$48,CR3
ADD $-16,R9,R9 // R9 = tail - 16: offset of the final 16 bytes once R5/R6 are past the loop
MOVD $32,R11 // R11 = 32, index of the third 16-byte chunk
MOVD $48,R12 // R12 = 48, index of the fourth 16-byte chunk
PCALIGN $16
cmp64_loop:
// Compare 64 bytes per iteration as four 16-byte chunks. VCMPEQUDCC
// records in CR6 whether every doubleword matched; BGE CR6 leaves the
// loop as soon as a chunk contains a mismatch, with that chunk still in
// V3 (from a) and V4 (from b) for the code at "different".
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different // jump out if the chunks differ
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
BDNZ cmp64_loop
BC $12,2,LR // beqlr: no tail (CR0 EQ from the ANDCC above), R3 already holds the result
// Finish out tail with minimal overlapped checking.
// Note, 0 tail is handled by beqlr above.
BLE CR1,cmp64_tail_gt0 // tail <= 16
BLE CR2,cmp64_tail_gt16 // tail <= 32
BLE CR3,cmp64_tail_gt32 // tail <= 48
cmp64_tail_gt48: // 49 - 63 B
// Three full chunks at offsets 0, 16 and 32, then the final chunk.
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3
LXVD2X (R6)(R10),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3
LXVD2X (R6)(R11),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt32: // 33 - 48B
// Two full chunks at offsets 0 and 16, then the final chunk.
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3
LXVD2X (R6)(R10),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt16: // 17 - 32B
// One full chunk at offset 0, then the final chunk.
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BR cmp64_tail_gt0
PCALIGN $16
cmp64_tail_gt0: // 1 - 16B
// Final chunk: the 16 bytes ending at the last byte to compare. R9 is
// tail - 16, so this load may start before R5/R6 and re-read bytes the
// loop or the tail code above has already compared.
LXVD2X (R5)(R9),V3
LXVD2X (R6)(R9),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET // all compared bytes equal: return the incoming R3
PCALIGN $16
cmp32: // 32 - 63B
ANDCC $31,R9,R9 // R9 = len - 32; CR0 EQ when len is exactly 32
// First 32 bytes: chunks at offsets 0 and 16.
LXVD2X (R0)(R5),V3
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R10)(R5),V3
LXVD2X (R10)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BC $12,2,LR // beqlr: len == 32, nothing left to compare
ADD R9,R10,R10 // R10 = (len - 32) + 16
// Last 32 bytes: chunks at offsets len-32 and len-16.
LXVD2X (R9)(R5),V3
LXVD2X (R9)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R10)(R5),V3
LXVD2X (R10)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET // all compared bytes equal: return the incoming R3
PCALIGN $16
cmp16: // 16 - 31B
ANDCC $15,R9,R9 // R9 = len - 16; CR0 EQ when len is exactly 16
LXVD2X (R0)(R5),V3 // first 16 bytes
LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
BC $12,2,LR // beqlr: len == 16, nothing left to compare
LXVD2X (R9)(R5),V3 // last 16 bytes
LXVD2X (R9)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
RET // all compared bytes equal: return the incoming R3
PCALIGN $16
different:
// Reached from the vector paths with V3 (from a) and V4 (from b) holding
// 16-byte chunks that are not equal. Decide the order by comparing the
// chunks as two unsigned 64-bit values, upper doubleword first.
#ifdef GOARCH_ppc64le
MOVD $byteswap<>+00(SB),R16 // R16 = address of the byteswap table
LXVD2X (R16)(R0),SWAP // Set up swap string
VPERM V3,V3,SWAP,V3 // reorder the bytes of both chunks with the table
VPERM V4,V4,SWAP,V4
#endif
MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
MFVSRD VS36,R10
CMPU R16,R10
BEQ lower // upper halves match, so the difference is in the lower halves
SETB_CR0_NE(R3) // R3 = -1 if a < b, else 1
RET
PCALIGN $16
lower:
VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
MFVSRD VS35,R16
VSLDOI $8,V4,V4,V4
MFVSRD VS36,R10
CMPU R16,R10
SETB_CR0_NE(R3) // R3 = -1 if a < b, else 1; equality is not expected here
RET
PCALIGN $16
cmp8: // 8 - 15B (0 - 15B if GOPPC64_power10)
#ifdef GOPPC64_power10
// POWER10: one length-controlled vector load per operand handles every
// length below 16, including 0.
SLD $56,R9,R9 // move the byte count into the top byte of R9 for LXVLL
LXVLL R5,R9,V3 // Load bytes starting from MSB to LSB, unused are zero filled.
LXVLL R6,R9,V4
VCMPUQ V3,V4,CR0 // Compare as a 128b integer.
SETB_CR0(R6) // R6 = -1/0/1 from the byte comparison
ISEL CR0EQ,R3,R6,R3 // If equal, length determines the return value.
RET
#else
CMP R9,$8
BLT cmp4 // len < 8
ANDCC $7,R9,R9 // R9 = len - 8, offset of the last 8 bytes
_LDBEX (R0)(R5),R10 // first 8 bytes of a
_LDBEX (R0)(R6),R11 // first 8 bytes of b
_LDBEX (R9)(R5),R12 // last 8 bytes of a
_LDBEX (R9)(R6),R14 // last 8 bytes of b
CMPU R10,R11,CR0
SETB_CR0(R5) // R5 = -1/0/1 for the first 8 bytes (a addr no longer needed)
CMPU R12,R14,CR1
SETB_CR1(R6) // R6 = -1/0/1 for the last 8 bytes (b addr no longer needed)
CRAND CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
ISEL CR0EQ,R6,R5,R4 // R4 = first halves equal ? result of last halves : result of first halves
ISEL CR1EQ,R3,R4,R3 // R3 = everything equal ? incoming R3 : R4
RET
PCALIGN $16
cmp4: // 4 - 7B
CMP R9,$4
BLT cmp2 // len < 4
ANDCC $3,R9,R9 // R9 = len - 4, offset of the last 4 bytes
_LWBEX (R0)(R5),R10 // first 4 bytes of a
_LWBEX (R0)(R6),R11 // first 4 bytes of b
_LWBEX (R9)(R5),R12 // last 4 bytes of a
_LWBEX (R9)(R6),R14 // last 4 bytes of b
RLDIMI $32,R10,$0,R12 // R12 = first word of a in the high half, last word in the low half
RLDIMI $32,R11,$0,R14 // R14 = same combination for b
CMPU R12,R14 // one unsigned compare orders both words at once
BR cmp0
PCALIGN $16
cmp2: // 2 - 3B
CMP R9,$2
BLT cmp1 // len < 2
ANDCC $1,R9,R9 // R9 = len - 2, offset of the last 2 bytes
_LHBEX (R0)(R5),R10 // first 2 bytes of a
_LHBEX (R0)(R6),R11 // first 2 bytes of b
_LHBEX (R9)(R5),R12 // last 2 bytes of a
_LHBEX (R9)(R6),R14 // last 2 bytes of b
RLDIMI $32,R10,$0,R12 // R12 = first halfword of a in the high half, last halfword in the low half
RLDIMI $32,R11,$0,R14 // R14 = same combination for b
CMPU R12,R14 // one unsigned compare orders both halfwords at once
BR cmp0
PCALIGN $16
cmp1: // 0 - 1B
CMP R9,$0
BEQ cmp0 // len == 0: CR0 is EQ, so cmp0 returns the incoming R3
MOVBZ (R5),R10 // the single byte of a
MOVBZ (R6),R11 // the single byte of b
CMPU R10,R11
cmp0:
// Common exit for the scalar paths: CR0 holds the byte comparison.
SETB_CR0(R6) // R6 = -1/0/1 from the byte comparison
ISEL CR0EQ,R3,R6,R3 // bytes equal ? incoming R3 (length result) : R6
RET
#endif