// blob: 55e02ce8a187ec99364e9a08abd0b9ae70caa15b [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64le || ppc64
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// Byte-slice entry point. Sets up the registers that the shared
	// countbytebody routine expects, then branches to it.
	//
	// Incoming registers:
	//   R3 = byte array pointer
	//   R4 = length
	//   R6 = byte to count
	//
	// countbytebody wants the byte in R5 and replicated across V1.
	MOVD	R6, R5			// place the byte to count in R5 for the shared body
	MTVRD	R6, V1			// copy the compare byte into the vector register
	VSPLTB	$7, V1, V1		// replicate that byte across every byte of V1
	BR	countbytebody<>(SB)	// tail-branch; the shared body performs the RET
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
// String entry point. The byte to count already arrives in R5, which is
// where countbytebody expects it, so only the splatted copy in V1 has to
// be prepared before branching to the shared body.
//
// R3 = string data pointer
// R4 = length
// R5 = byte to count
MTVRD R5, V1 // move compare byte into the vector register
VSPLTB $7, V1, V1 // replicate byte across V1
BR countbytebody<>(SB) // tail-branch; the shared body performs the RET
// countbytebody counts how many bytes in a buffer equal a given byte.
// It is the shared body branched to by ·Count and ·CountString.
//
// On entry:
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
//
// Strategy:
//   - Inputs of 32 bytes or more run a vector loop that handles 32 bytes per
//     iteration and accumulates popcounts of the compare masks. A matching
//     byte contributes 8 set bits, so the running total in R18 is 8x the
//     real count until it is shifted right by 3.
//   - The remaining 0-31 bytes (or the whole input when it is shorter than
//     32 bytes) are handled by one of two tail implementations, selected at
//     build time by GOPPC64_power10.
//
// Registers written here besides the result: R4, R14, R18, R20, R21, CTR,
// V0, V2, V4, V5, plus R6 (power10 build) or R5 and R12 (other builds).
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
MOVD $0, R18 // byte count
// The non-power10 tail compares with CMPB against R5, so R5 must hold the
// byte to count in all 8 of its bytes. The power10 tail uses V1 instead.
#ifndef GOPPC64_power10
RLDIMI $8, R5, $48, R5 // low 2 bytes of R5 now hold the byte
RLDIMI $16, R5, $32, R5 // low 4 bytes of R5 now hold the byte
RLDIMI $32, R5, $0, R5 // fill reg with the byte to count
#endif
CMPU R4, $32 // Check if it's a small string (<32 bytes)
BLT tail // Jump to the small string case
SRD $5, R4, R20 // R20 = len / 32, the number of 32B loop iterations
MOVD R20, CTR // loop counter consumed by BDNZ
MOVD $16, R21 // offset of the second 16B vector within each iteration
XXLXOR V4, V4, V4 // zero the first accumulator
XXLXOR V5, V5, V5 // zero the second accumulator
PCALIGN $16
cmploop:
LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
LXVD2X (R21)(R3), V2
VCMPEQUB V2, V1, V2
VCMPEQUB V0, V1, V0
VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
VPOPCNTD V0, V0
VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
ADD $32, R3
BDNZ cmploop
// Reduce the two vector accumulators to one scalar in R18 (still 8x).
VADDUDM V4, V5, V5 // merge the two accumulators
MFVSRD V5, R18 // extract one doubleword of the sum
VSLDOI $8, V5, V5, V5 // rotate V5 by 8 bytes to expose the other doubleword
MFVSRD V5, R21 // extract the other doubleword (R21 is free to reuse now)
ADD R21, R18, R18
ANDCC $31, R4, R4 // R4 = len % 32, the bytes left for the tail; sets CR0 for BEQ
// Skip the tail processing if no bytes remaining.
BEQ tail_0
#ifdef GOPPC64_power10
// The power10 tail adds exact (not 8x) counts, so convert R18 to an exact
// count first. When the tail is entered directly for a short input, R18 is
// still 0 and no conversion is needed.
SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
tail: // Count the last 0 - 31 bytes.
CMP R4, $16
BLE small_tail_p10 // 16 or fewer bytes need only the single partial load below
LXV 0(R3), V0
VCMPEQUB V0, V1, V0
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R18
ADD $16, R3, R3
ANDCC $15, R4, R4 // R4 = bytes left after the full 16B vector
small_tail_p10:
SLD $56, R4, R6 // LXVLL takes its byte length from the top byte of the register
LXVLL R3, R6, V0 // load only the R4 remaining bytes
VCMPEQUB V0, V1, V0
VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R3 // R18 is already an exact count here, so return directly
RET
#else
// Non-power10 tail: peel off 16, 8, 4, 2 and 1 bytes in turn using scalar
// loads. CMPB yields 0xFF in every byte position that matches R5, and
// POPCNTD of that mask adds 8 per match, keeping R18 at 8x the count.
tail: // Count the last 0 - 31 bytes.
CMP R4, $16
BLT tail_8
MOVD (R3), R12
MOVD 8(R3), R14
CMPB R12, R5, R12
CMPB R14, R5, R14
POPCNTD R12, R12
POPCNTD R14, R14
ADD R12, R18, R18
ADD R14, R18, R18
ADD $16, R3, R3
ADD $-16, R4, R4
tail_8: // Count the remaining 0 - 15 bytes.
CMP R4, $8
BLT tail_4
MOVD (R3), R12
CMPB R12, R5, R12
POPCNTD R12, R12
ADD R12, R18, R18
ADD $8, R3, R3
ADD $-8, R4, R4
tail_4: // Count the remaining 0 - 7 bytes.
CMP R4, $4
BLT tail_2
MOVWZ (R3), R12 // zero-extending load: the upper 4 bytes of R12 are 0
CMPB R12, R5, R12
// The zero-filled upper bytes would match when the byte to count is 0,
// so shift them out before counting.
SLD $32, R12, R12 // Remove non-participating matches.
POPCNTD R12, R12
ADD R12, R18, R18
ADD $4, R3, R3
ADD $-4, R4, R4
tail_2: // Count the remaining 0 - 3 bytes.
CMP R4, $2
BLT tail_1
MOVHZ (R3), R12 // zero-extending load: the upper 6 bytes of R12 are 0
CMPB R12, R5, R12
SLD $48, R12, R12 // Remove non-participating matches.
POPCNTD R12, R12
ADD R12, R18, R18
ADD $2, R3, R3
ADD $-2, R4, R4
tail_1: // Count the remaining 0 - 1 bytes.
CMP R4, $1
BLT tail_0
MOVBZ (R3), R12
CMPB R12, R5, R12
// A match makes the low byte 0xFF. Keeping only the bit worth 8 adds
// exactly 8 for a match (matching the 8x scaling) and discards any
// matches in the zero-extended upper bytes.
ANDCC $0x8, R12, R12
ADD R12, R18, R18
#endif
tail_0: // No remaining tail to count.
SRD $3, R18, R3 // Fixup count, it is off by 8x.
RET