Implement vfred{[ou]sum,min,max}.vs instructions.
Test: m berberis_all
Change-Id: I8e44fc31fdc90341a8d2c71ac69caf17301a3601
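For context, a minimal model of the .vs reduction semantics these opcodes implement (illustrative C++ only; the names below are not from this codebase): vd[0] = op(vs1[0], vs2[0], ..., vs2[vl-1]), where masked-off elements are replaced by the identity element of the operation. This is why the interpreter changes below feed ±0.0 / ±inf filler elements.

#include <cstddef>
#include <vector>

// Illustrative model of vfredosum.vs; the real interpreter folds whole
// SIMD128Register groups and routes every addition through FAdd with the
// frm CSR, but the observable semantics are an ordered left-to-right fold.
float VfredosumModel(float init /* vs1[0] */, const std::vector<float>& vs2,
                     const std::vector<bool>& mask) {
  float acc = init;
  for (size_t index = 0; index < vs2.size(); ++index) {
    // Skipping a masked-off element is equivalent to adding the additive
    // identity: -0.0 normally, but +0.0 under round-down (RDN), because
    // +0.0 + -0.0 rounds to -0.0 under RDN.
    if (mask[index]) acc += vs2[index];
  }
  return acc;  // written to vd[0]
}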
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h
index 084aff9..52cc3d8 100644
--- a/decoder/include/berberis/decoder/riscv64/decoder.h
+++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -284,13 +284,13 @@
enum class VOpFVvOpcode : uint8_t {
kVfaddvv = 0b000000,
- kVfredusumvv = 0b000001,
+ kVfredusumvs = 0b000001,
kVfsubvv = 0b000010,
- kVfredosumvv = 0b000011,
+ kVfredosumvs = 0b000011,
kVfminvv = 0b000100,
- kVfredminvv = 0b000101,
+ kVfredminvs = 0b000101,
kVfmaxvv = 0b000110,
- kVfredmaxvv = 0b000111,
+ kVfredmaxvs = 0b000111,
kVfsgnjvv = 0b001000,
kVfsgnjnvv = 0b001001,
kVfsgnjxvv = 0b001010,
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index 56e5994..99a983d 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -1270,7 +1270,23 @@
template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
+ case Decoder::VOpFVfOpcode::kVfminvf:
+ return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfmaxvf:
+ return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfsgnjvf:
+ return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfsgnjnvf:
+ return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
+ case Decoder::VOpFVfOpcode::kVfsgnjxvf:
+ return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, arg2);
case Decoder::VOpFVfOpcode::kVfmvsf:
if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
return Unimplemented();
@@ -1295,21 +1311,6 @@
InactiveProcessing::kUndisturbed>(
args.dst, arg2, /*dst_mask=*/args.src1);
}
- case Decoder::VOpFVfOpcode::kVfminvf:
- return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, arg2);
- case Decoder::VOpFVfOpcode::kVfmaxvf:
- return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, arg2);
- case Decoder::VOpFVfOpcode::kVfsgnjvf:
- return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, arg2);
- case Decoder::VOpFVfOpcode::kVfsgnjnvf:
- return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, arg2);
- case Decoder::VOpFVfOpcode::kVfsgnjxvf:
- return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, arg2);
case Decoder::VOpFVfOpcode::kVmfeqvf:
return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
args.dst, args.src1, arg2);
@@ -1374,6 +1375,7 @@
// double-width floats use these encodings to produce regular Float32 types.
if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
using WideElementType = typename TypeTraits<ElementType>::Wide;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpFVvOpcode::kVFUnary0:
switch (args.vfunary0_opcode) {
@@ -1446,6 +1448,7 @@
using WideElementType = WideType<ElementType>;
using WideSignedType = WideType<SignedType>;
using WideUnsignedType = WideType<UnsignedType>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpFVvOpcode::kVFUnary0:
switch (args.vfunary0_opcode) {
@@ -1532,21 +1535,75 @@
// If our ElementType is Float16 then “straight” operations are unsupported and we shouldn't try to
// instantiate any functions since this would lead to a compile-time error.
if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
+ // The IEEE 754 floating-point value -0.0 has only the top bit set and all other bits clear:
+ // https://en.wikipedia.org/wiki/Signed_zero#Representations
+ // This is exactly the representation of the most negative integer in two's complement:
+ // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
+ // Note: we pass the filler elements as integers because `Float32`/`Float64` can't be template
+ // parameters.
+ constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
+ // Floating point IEEE 754 value +0.0 includes only zero bits, same as integer zero.
+ constexpr SignedType kPositiveZero{};
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
- case Decoder::VOpFVvOpcode::kVfmvfs:
- if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
- return Unimplemented();
+ case Decoder::VOpFVvOpcode::kVfredusumvs:
+ // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
+ // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
+ if (GetCsr<kFrm>() != FPFlags::RDN) {
+ return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+ } else {
+ return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
}
- if (args.src2 != 0) {
- return Unimplemented();
+ case Decoder::VOpFVvOpcode::kVfredosumvs:
+ // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
+ // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
+ if (GetCsr<kFrm>() != FPFlags::RDN) {
+ return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+ } else {
+ return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
+ ElementType,
+ vlmul,
+ vta,
+ vma,
+ kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
}
- return OpVectorVmvfs<ElementType>(args.dst, args.src1);
- case Decoder::VOpFVvOpcode::kVfmaxvv:
- return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
- args.dst, args.src1, args.src2);
case Decoder::VOpFVvOpcode::kVfminvv:
return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfredminvs:
+ // For Vfredmin the identity element is +inf.
+ return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst,
+ args.src1,
+ Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
+ : 0x7ff0'0000'0000'0000}>{
+ args.src2});
+ case Decoder::VOpFVvOpcode::kVfmaxvv:
+ return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst, args.src1, args.src2);
+ case Decoder::VOpFVvOpcode::kVfredmaxvs:
+ // For Vfredmax the identity element is -inf.
+ return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
+ args.dst,
+ args.src1,
+ Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
+ : 0xfff0'0000'0000'0000}>{
+ args.src2});
case Decoder::VOpFVvOpcode::kVfsgnjvv:
return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
args.dst, args.src1, args.src2);
@@ -1616,6 +1673,14 @@
break; // Make compiler happy.
}
break;
+ case Decoder::VOpFVvOpcode::kVfmvfs:
+ if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ return Unimplemented();
+ }
+ if (args.src2 != 0) {
+ return Unimplemented();
+ }
+ return OpVectorVmvfs<ElementType>(args.dst, args.src1);
case Decoder::VOpFVvOpcode::kVmfeqvv:
return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
args.dst, args.src1, args.src2);
@@ -1669,6 +1734,7 @@
using UnsignedType = berberis::UnsignedType<ElementType>;
using SaturatingSignedType = SaturatingType<SignedType>;
using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpIViOpcode::kVaddvi:
return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
@@ -1795,6 +1861,7 @@
using UnsignedType = berberis::UnsignedType<ElementType>;
using SaturatingSignedType = SaturatingType<SignedType>;
using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpIVvOpcode::kVaddvv:
return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
@@ -1918,6 +1985,7 @@
using UnsignedType = berberis::UnsignedType<ElementType>;
using SaturatingSignedType = SaturatingType<SignedType>;
using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpIVxOpcode::kVaddvx:
return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
@@ -2049,6 +2117,7 @@
using SignedType = berberis::SignedType<ElementType>;
using UnsignedType = berberis::UnsignedType<ElementType>;
if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpMVvOpcode::kVmandnmm:
return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
@@ -2077,6 +2146,7 @@
default:; // Do nothing: handled in next switch.
}
}
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpMVvOpcode::kVredsumvs:
return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
@@ -2277,6 +2347,7 @@
void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
using SignedType = berberis::SignedType<ElementType>;
using UnsignedType = berberis::UnsignedType<ElementType>;
+ // Keep cases sorted in opcode order to match the RISC-V V manual.
switch (args.opcode) {
case Decoder::VOpMVxOpcode::kVslide1upvx:
return OpVectorslide1up<SignedType, vlmul, vta, vma>(
@@ -2822,10 +2893,15 @@
VectorRegisterGroupMultiplier vlmul,
TailProcessing vta,
auto vma,
+ CsrName... kExtraCsrs,
auto kDefaultElement>
void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
- return OpVectorvs<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
- dst, src1, src2);
+ return OpVectorvs<Intrinsic,
+ ElementType,
+ NumberOfRegistersInvolved(vlmul),
+ vta,
+ vma,
+ kExtraCsrs...>(dst, src1, src2);
}
template <auto Intrinsic,
@@ -2833,6 +2909,7 @@
size_t kRegistersInvolved,
TailProcessing vta,
auto vma,
+ CsrName... kExtraCsrs,
auto kDefaultElement>
void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
if (!IsAligned<kRegistersInvolved>(dst | src2.start_no)) {
@@ -2852,7 +2929,9 @@
ElementType arg1 = SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0);
for (size_t index = 0; index < kRegistersInvolved; ++index) {
arg1 = std::get<0>(
- Intrinsic(arg1, GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask)));
+ Intrinsic(GetCsr<kExtraCsrs>()...,
+ arg1,
+ GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask)));
}
SIMD128Register result{state_->cpu.v[dst]};
result.Set(arg1, 0);
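The identity-element handling above leans on exact bit patterns. A standalone check (illustrative, not part of the patch) that the integer constants used as fillers really are the IEEE 754 values the comments claim, assuming a C++20 compiler for std::bit_cast:

#include <bit>
#include <cstdint>
#include <limits>

static_assert(std::bit_cast<uint32_t>(-0.0f) == 0x8000'0000u);  // same pattern as INT32_MIN
static_assert(std::bit_cast<uint32_t>(std::numeric_limits<float>::infinity()) == 0x7f80'0000u);
static_assert(std::bit_cast<uint32_t>(-std::numeric_limits<float>::infinity()) == 0xff80'0000u);
static_assert(std::bit_cast<uint64_t>(std::numeric_limits<double>::infinity()) ==
              0x7ff0'0000'0000'0000u);
static_assert(std::bit_cast<uint64_t>(-std::numeric_limits<double>::infinity()) ==
              0xfff0'0000'0000'0000u);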
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index fdd9a81..53a97c1 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -1667,6 +1667,21 @@
}
void TestVectorReductionInstruction(uint32_t insn_bytes,
+ const uint32_t (&expected_result_vd0_int32)[8],
+ const uint64_t (&expected_result_vd0_int64)[8],
+ const uint32_t (&expected_result_vd0_with_mask_int32)[8],
+ const uint64_t (&expected_result_vd0_with_mask_int64)[8],
+ const __v2du (&source)[16]) {
+ TestVectorReductionInstruction(
+ insn_bytes,
+ source,
+ std::tuple<const uint32_t(&)[8], const uint32_t(&)[8]>{expected_result_vd0_int32,
+ expected_result_vd0_with_mask_int32},
+ std::tuple<const uint64_t(&)[8], const uint64_t(&)[8]>{
+ expected_result_vd0_int64, expected_result_vd0_with_mask_int64});
+ }
+
+ void TestVectorReductionInstruction(uint32_t insn_bytes,
const uint8_t (&expected_result_vd0_int8)[8],
const uint16_t (&expected_result_vd0_int16)[8],
const uint32_t (&expected_result_vd0_int32)[8],
@@ -8265,6 +8280,44 @@
kVectorCalculationsSource);
}
+TEST_F(Riscv64InterpreterTest, TestVfredosum) {
+ TestVectorReductionInstruction(
+ 0xd0c1457, // vfredosum.vs v8, v16, v24, v0.t
+ // expected_result_vd0_int32
+ {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9e0c'9a8e},
+ // expected_result_vd0_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ // expected_result_vd0_with_mask_int32
+ {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9604'9200},
+ // expected_result_vd0_with_mask_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ kVectorCalculationsSource);
+}
+
+// Currently Vfredusum is implemented as Vfredosum (as explicitly permitted by RVV 1.0).
+// If we ever implement speedups that change the results, these tests may need to be updated.
+TEST_F(Riscv64InterpreterTest, TestVfredusum) {
+ TestVectorReductionInstruction(
+ 0x50c1457, // vfredusum.vs v8, v16, v24, v0.t
+ // expected_result_vd0_int32
+ {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9e0c'9a8e},
+ // expected_result_vd0_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ // expected_result_vd0_with_mask_int32
+ {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9604'9200},
+ // expected_result_vd0_with_mask_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ kVectorCalculationsSource);
+}
+
TEST_F(Riscv64InterpreterTest, TestVredand) {
TestVectorReductionInstruction(
0x50c2457, // vredand.vs v8,v16,v24,v0.t
@@ -8395,6 +8448,24 @@
kVectorCalculationsSource);
}
+TEST_F(Riscv64InterpreterTest, TestVfredmin) {
+ TestVectorReductionInstruction(
+ 0x150c1457, // vfredmin.vs v8, v16, v24, v0.t
+ // expected_result_vd0_int32
+ {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9e0c'9a09},
+ // expected_result_vd0_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ // expected_result_vd0_with_mask_int32
+ {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
+ 0x9604'9200, 0x9604'9200},
+ // expected_result_vd0_with_mask_int64
+ {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+ kVectorCalculationsSource);
+}
+
TEST_F(Riscv64InterpreterTest, TestVredmaxu) {
TestVectorReductionInstruction(
0x190c2457, // vredmaxu.vs v8,v16,v24,v0.t
@@ -8447,6 +8518,24 @@
kVectorCalculationsSource);
}
+TEST_F(Riscv64InterpreterTest, TestVfredmax) {
+ TestVectorReductionInstruction(
+ 0x1d0c1457, // vfredmax.vs v8, v16, v24, v0.t
+ // expected_result_vd0_int32
+ {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
+ 0x8302'8100, 0x8302'8100},
+ // expected_result_vd0_int64
+ {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x7eec'7ae9'76e4'72e0,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+ // expected_result_vd0_with_mask_int32
+ {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
+ 0x8302'8100, 0x8302'8100},
+ // expected_result_vd0_with_mask_int64
+ {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x6efc'6af8'66f4'62f1,
+ /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+ kVectorCalculationsSource);
+}
+
// Note that the expected test outputs for v[f]merge.vXm are identical to those for v[f]mv.v.X.
// This happens because v[f]merge.vXm is just a v[f]mv.v.X with mask (second operand is not used
// by v[f]mv.v.X but the difference between v[f]merge.vXm and v[f]mv.v.X is captured in masking
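The insn_bytes constants in these tests are raw OPFVV encodings. A throwaway decoder (illustrative only, not part of the test fixture) shows how, e.g., 0xd0c1457 maps to vfredosum.vs v8, v16, v24, v0.t:

#include <cstdint>
#include <cstdio>

// OPFVV instruction layout: funct6 | vm | vs2 | vs1 | funct3=001 | vd | opcode=1010111.
void DumpVOpFVv(uint32_t insn) {
  std::printf("funct6=0x%x vm=%u vs2=v%u vs1=v%u vd=v%u\n",
              insn >> 26, (insn >> 25) & 1, (insn >> 20) & 0x1f,
              (insn >> 15) & 0x1f, (insn >> 7) & 0x1f);
}
// DumpVOpFVv(0xd0c1457) prints: funct6=0x3 vm=0 vs2=v16 vs1=v24 vd=v8,
// i.e. kVfredosumvs (0b000011), masked (v0.t), vd=v8, vs2=v16, vs1=v24.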
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
index 383dd10..5d155e4 100644
--- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
+++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -760,6 +760,15 @@
DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Vred##name##vs, return ({ __VA_ARGS__; }); \
, (ResultType init, SIMD128Register src), (), (init, src))
+#define DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(name, ...) \
+ DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Vfred##name##vs, return ({ __VA_ARGS__; }); \
+ , (ResultType init, SIMD128Register src), (), (init, src))
+
+#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
+ DEFINE_ARITHMETIC_REDUCE_INTRINSIC( \
+ Vfred##name##vs, return ({ __VA_ARGS__; }); \
+ , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))
+
#define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, arguments) \
template <typename ElementType, \
enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> \
@@ -815,6 +824,8 @@
DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(xor, (args ^ ...))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(add, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(add, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
+DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
+DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(mul, std::get<0>(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(mul, std::get<0>(FMul(FPFlags::DYN, csr, args...)))
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(div, std::get<0>(FDiv(FPFlags::DYN, csr, args...)))
@@ -900,10 +911,12 @@
(-(arg2 * arg3) + arg1))
DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
(-(arg2 * arg3) + arg1))
-DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
-DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
+DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
+DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(min, std::get<0>(FMin(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
+DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
+DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(max, std::get<0>(FMax(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
@@ -954,12 +967,14 @@
#undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
+#undef DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
#undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
+#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
#undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
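For readers who don't want to chase the DEFINE_ARITHMETIC_REDUCE_INTRINSIC machinery (its body is outside this diff), here is a rough hand-expansion of what the new VS macro generates for osum. This is a sketch that assumes SIMD128Register::Get and FAdd behave as they are used elsewhere in this file:

#include <tuple>

// Sketch only; the real code is generated by DEFINE_ARITHMETIC_REDUCE_INTRINSIC.
template <typename ElementType>
std::tuple<ElementType> VfredosumvsSketch(int8_t csr, ElementType init, SIMD128Register src) {
  ElementType acc = init;
  for (size_t index = 0; index < sizeof(SIMD128Register) / sizeof(ElementType); ++index) {
    acc = std::get<0>(FAdd(FPFlags::DYN, csr, acc, src.Get<ElementType>(index)));
  }
  return {acc};
}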
diff --git a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
index 0a51a30..a7a20c9 100644
--- a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
+++ b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
@@ -37,16 +37,22 @@
template <>
extern const int32_t kVectorConst<int32_t{static_cast<int32_t>(-0x8000'0000)}>;
template <>
+extern const int32_t kVectorConst<int32_t{-0x0080'0000}>;
+template <>
extern const int32_t kVectorConst<int32_t{0x3f80'0000}>;
template <>
+extern const int32_t kVectorConst<int32_t{0x7f80'0000}>;
+template <>
extern const int32_t kVectorConst<int32_t{0x7fff'ffff}>;
template <>
extern const int32_t kVectorConst<int64_t{static_cast<int64_t>(-0x8000'0000'0000'0000)}>;
template <>
-extern const int32_t kVectorConst<int64_t{0x7ff8'0000'0000'0000}>;
+extern const int32_t kVectorConst<int64_t{0x7ff0'0000'0000'0000}>;
template <>
extern const int32_t kVectorConst<int64_t{0x7fff'ffff'ffff'ffff}>;
template <>
+extern const int32_t kVectorConst<int64_t{-0x0010'0000'0000'0000}>;
+template <>
extern const int32_t kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
template <>
inline const int32_t& kVectorConst<int8_t{0x00}> = kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
@@ -58,19 +64,44 @@
inline const int32_t& kVectorConst<uint16_t{0x0000}> =
kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
template <>
+inline const int32_t& kVectorConst<uint8_t{127}> = kVectorConst<int8_t{127}>;
+template <>
+inline const int32_t& kVectorConst<uint8_t{128}> = kVectorConst<int8_t{-128}>;
+template <>
+inline const int32_t& kVectorConst<uint16_t{0x7fff}> = kVectorConst<int16_t{0x7fff}>;
+template <>
+inline const int32_t& kVectorConst<uint16_t{0x8000}> = kVectorConst<int16_t{-0x8000}>;
+template <>
inline const int32_t& kVectorConst<int32_t{0x0000'0000}> =
kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
template <>
inline const int32_t& kVectorConst<uint32_t{0x0000'0000}> =
kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
template <>
+inline const int32_t& kVectorConst<uint32_t{0x3f80'0000}> = kVectorConst<int32_t{0x3f80'0000}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x7f80'0000}> = kVectorConst<int32_t{0x7f80'0000}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x7fff'ffff}> = kVectorConst<int32_t{0x7fff'ffff}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x8000'0000}> =
+ kVectorConst<int32_t{static_cast<int32_t>(-0x8000'0000)}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0xff80'0000}> = kVectorConst<int32_t{-0x0080'0000}>;
+template <>
inline const int32_t& kVectorConst<int64_t{0x0000'0000'0000'0000}> =
kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
template <>
extern const int32_t kVectorConst<uint64_t{0x7fc'00000'7fc'00000}>;
template <>
+inline const int32_t& kVectorConst<uint64_t{0x7ff0'0000'0000'0000}> =
+ kVectorConst<int64_t{0x7ff0'0000'0000'0000}>;
+template <>
extern const int32_t kVectorConst<uint64_t{0x7ff8'0000'0000'0000}>;
template <>
+inline const int32_t& kVectorConst<uint64_t{0xfff0'0000'0000'0000}> =
+ kVectorConst<int64_t{-0x0010'0000'0000'0000}>;
+template <>
extern const int32_t kVectorConst<uint64_t{0xffff'ffff'0000'0000}>;
template <>
extern const int32_t kVectorConst<uint64_t{0xffff'ffff'7fc0'0000}>;
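A note on the odd-looking negative hex in the new -inf aliases: the kVectorConst key is a signed integer, and in two's complement the -inf bit patterns come out as small negative values, e.g.:

#include <cstdint>

// 0x1'0000'0000 - 0xff80'0000 == 0x0080'0000, so binary32 -inf (0xff80'0000)
// is int32_t{-0x0080'0000}; likewise for binary64 -inf below.
static_assert(static_cast<uint32_t>(int32_t{-0x0080'0000}) == 0xff80'0000u);
static_assert(static_cast<uint64_t>(int64_t{-0x0010'0000'0000'0000}) == 0xfff0'0000'0000'0000u);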
diff --git a/intrinsics/riscv64_to_x86_64/macro_assembler.cc b/intrinsics/riscv64_to_x86_64/macro_assembler.cc
index 597ebc9..edbf615 100644
--- a/intrinsics/riscv64_to_x86_64/macro_assembler.cc
+++ b/intrinsics/riscv64_to_x86_64/macro_assembler.cc
@@ -42,6 +42,10 @@
0x7ff8'0000'0000'0000};
alignas(16) const uint32_t kFloat32One[4] = {0x3f80'0000, 0x3f80'0000, 0x3f80'0000, 0x3f80'0000};
alignas(16) const uint64_t kFloat64One[2] = {0x3ff0'0000'0000'0000, 0x3ff0'0000'0000'0000};
+ alignas(16) const uint32_t kFloat32PInf[4] = {0x7f80'0000, 0x7f80'0000, 0x7f80'0000, 0x7f80'0000};
+ alignas(16) const uint32_t kFloat32NInf[4] = {0xff80'0000, 0xff80'0000, 0xff80'0000, 0xff80'0000};
+ alignas(16) const uint64_t kFloat64PInf[2] = {0x7ff0'0000'0000'0000, 0x7ff0'0000'0000'0000};
+ alignas(16) const uint64_t kFloat64NInf[2] = {0xfff0'0000'0000'0000, 0xfff0'0000'0000'0000};
alignas(16) const int8_t kMinInt8[16] = {
-128,
-128,
@@ -262,36 +266,40 @@
};
// Make sure Layout is the same in 32-bit mode and 64-bit mode.
-CHECK_STRUCT_LAYOUT(MacroAssemblerConstants, 27008, 128);
+CHECK_STRUCT_LAYOUT(MacroAssemblerConstants, 27520, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kNanBoxFloat32, 0, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kNanBoxedNansFloat32, 128, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kCanonicalNansFloat32, 256, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kCanonicalNansFloat64, 384, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32One, 512, 128);
CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64One, 640, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt8, 768, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt8, 896, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt16, 1024, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt16, 1152, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt32, 1280, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt32, 1408, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt64, 1536, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt64, 1664, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt64, 1792, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits64, 1856, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt32, 1920, 32);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits32, 1952, 32);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, k0x8000_0000_0000_00ff, 1984, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kRiscVToX87Exceptions, 2432, 256);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kX87ToRiscVExceptions, 2688, 512);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTable, 3200, 2048);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid64Bit, 5248, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid32Bit, 6272, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid16Bit, 7296, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid8Bit, 8320, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo32bitMask, 9344, 256);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo16bitMask, 9600, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo8bitMask, 10624, 16384);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32PInf, 768, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32NInf, 896, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64PInf, 1024, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64NInf, 1152, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt8, 1280, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt8, 1408, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt16, 1536, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt16, 1664, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt32, 1792, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt32, 1920, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt64, 2048, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt64, 2176, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt64, 2304, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits64, 2368, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt32, 2432, 32);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits32, 2464, 32);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, k0x8000_0000_0000_00ff, 2496, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kRiscVToX87Exceptions, 2944, 256);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kX87ToRiscVExceptions, 3200, 512);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTable, 3712, 2048);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid64Bit, 5760, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid32Bit, 6784, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid16Bit, 7808, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid8Bit, 8832, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo32bitMask, 9856, 256);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo16bitMask, 10112, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo8bitMask, 11136, 16384);
// Note: because we have aligned fields and thus padding in that data structure
// value-initialization is both slower and larger than copy-initialization for
@@ -342,17 +350,29 @@
extern const int32_t kVectorConst<int32_t{0x3f80'0000}> =
GetConstants() + offsetof(MacroAssemblerConstants, kFloat32One);
template <>
+extern const int32_t kVectorConst<int32_t{0x7f80'0000}> =
+ GetConstants() + offsetof(MacroAssemblerConstants, kFloat32PInf);
+template <>
extern const int32_t kVectorConst<int32_t{0x7fff'ffff}> =
GetConstants() + offsetof(MacroAssemblerConstants, kMaxInt32);
template <>
+extern const int32_t kVectorConst<int32_t{-0x0080'0000}> =
+ GetConstants() + offsetof(MacroAssemblerConstants, kFloat32NInf);
+template <>
extern const int32_t kVectorConst<int64_t{static_cast<int64_t>(-0x8000'0000'0000'0000)}> =
GetConstants() + offsetof(MacroAssemblerConstants, kMinInt64);
template <>
+extern const int32_t kVectorConst<int64_t{0x3ff0'0000'0000'0000}> =
+ GetConstants() + offsetof(MacroAssemblerConstants, kFloat64One);
+template <>
+extern const int32_t kVectorConst<int64_t{0x7ff0'0000'0000'0000}> =
+ GetConstants() + offsetof(MacroAssemblerConstants, kFloat64PInf);
+template <>
extern const int32_t kVectorConst<int64_t{0x7fff'ffff'ffff'ffff}> =
GetConstants() + offsetof(MacroAssemblerConstants, kMaxInt64);
template <>
-extern const int32_t kVectorConst<int64_t{0x3ff0'0000'0000'0000}> =
- GetConstants() + offsetof(MacroAssemblerConstants, kFloat64One);
+extern const int32_t kVectorConst<int64_t{-0x0010'0000'0000'0000}> =
+ GetConstants() + offsetof(MacroAssemblerConstants, kFloat64NInf);
template <>
const int32_t kVectorConst<uint64_t{0x0000'0000'0000'0000}> =
GetConstants() + offsetof(MacroAssemblerConstants, kBitMaskTable);
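On the layout churn in this file: the CHECK_FIELD_LAYOUT values appear to be bit offsets and bit sizes (the existing entries are consistent with that reading), so inserting four 128-bit constants after kFloat64One shifts every subsequent field, and the struct size, by 4 * 128 = 512 bits:

// Old kMinInt8 offset plus the four new 128-bit fields gives its new offset.
static_assert(768 + 4 * 128 == 1280);
// Old struct size plus the same 512 bits gives the new size checked above.
static_assert(27008 + 512 == 27520);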