Revert^2 "Enable [vzext|vsext].vf[8|4|2] instructions"

This reverts commit 1bcd5b52b2a37f15d8f0bb96bcffd617cd189cb8.

Reason for revert: Should work now

Change-Id: I834fef1449168fc2e9ecbd315ba54dacb9b9e164
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h index 483445a..0a3cdde 100644 --- a/decoder/include/berberis/decoder/riscv64/decoder.h +++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -363,6 +363,7 @@ kVmxnormm = 0b011111, kVXmXXs = 0b010000, kVmsXf = 0b010100, + kVxunary0 = 0b010010, kVmulhuvv = 0b100100, kVmulvv = 0b100101, kVmulhsuvv = 0b100110, @@ -459,6 +460,15 @@ kVidv = 0b10001, }; + enum class Vxunary0Opcode : uint8_t { + kVzextvf8m = 0b00010, + kVsextvf8m = 0b00011, + kVzextvf4m = 0b00100, + kVsextvf4m = 0b00101, + kVzextvf2m = 0b00110, + kVsextvf2m = 0b00111, + }; + // Load/Store instruction include 3bit “width” field while all other floating-point instructions // include 2bit “fmt” field. // @@ -748,6 +758,7 @@ union { VXmXXsOpcode vXmXXs_opcode; VmsXfOpcode vmsXf_opcode; + Vxunary0Opcode vxunary0_opcode; uint8_t src2; }; };
diff --git a/interpreter/riscv64/interpreter.cc b/interpreter/riscv64/interpreter.cc index 450341a..bd11a43 100644 --- a/interpreter/riscv64/interpreter.cc +++ b/interpreter/riscv64/interpreter.cc
@@ -983,6 +983,60 @@ default: return Unimplemented(); } + case Decoder::VOpMVvOpcode::kVxunary0: + switch (args.vxunary0_opcode) { + case Decoder::Vxunary0Opcode::kVzextvf2m: + if constexpr (sizeof(UnsignedType) >= 2) { + return OpVectorExtend<intrinsics::Vextf2<UnsignedType>, + UnsignedType, + 2, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::Vxunary0Opcode::kVsextvf2m: + if constexpr (sizeof(SignedType) >= 2) { + return OpVectorExtend<intrinsics::Vextf2<SignedType>, SignedType, 2, vlmul, vta, vma>( + args.dst, args.src1); + } + break; + case Decoder::Vxunary0Opcode::kVzextvf4m: + if constexpr (sizeof(UnsignedType) >= 4) { + return OpVectorExtend<intrinsics::Vextf4<UnsignedType>, + UnsignedType, + 4, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::Vxunary0Opcode::kVsextvf4m: + if constexpr (sizeof(SignedType) >= 4) { + return OpVectorExtend<intrinsics::Vextf4<SignedType>, SignedType, 4, vlmul, vta, vma>( + args.dst, args.src1); + } + break; + case Decoder::Vxunary0Opcode::kVzextvf8m: + if constexpr (sizeof(UnsignedType) >= 8) { + return OpVectorExtend<intrinsics::Vextf8<UnsignedType>, + UnsignedType, + 8, + vlmul, + vta, + vma>(args.dst, args.src1); + } + break; + case Decoder::Vxunary0Opcode::kVsextvf8m: + if constexpr (sizeof(SignedType) >= 8) { + return OpVectorExtend<intrinsics::Vextf8<SignedType>, SignedType, 8, vlmul, vta, vma>( + args.dst, args.src1); + } + break; + default: + return Unimplemented(); + } + return Unimplemented(); case Decoder::VOpMVvOpcode::kVmsXf: switch (args.vmsXf_opcode) { case Decoder::VmsXfOpcode::kVmsbfm: @@ -1713,6 +1767,40 @@ } template <auto Intrinsic, + typename DestElementType, + const uint8_t kFactor, + VectorRegisterGroupMultiplier vlmul, + TailProcessing vta, + auto vma> + void OpVectorExtend(uint8_t dst, uint8_t src) { + static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8); + constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul); + constexpr 
size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1; + if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) { + return Unimplemented(); + } + int vstart = GetCsr<CsrName::kVstart>(); + int vl = GetCsr<CsrName::kVl>(); + auto mask = GetMaskForVectorOperations<vma>(); + for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) { + size_t src_index = dst_index / kFactor; + size_t src_elem = dst_index % kFactor; + SIMD128Register result{state_->cpu.v[dst + dst_index]}; + SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)}; + + result = std::get<0>(intrinsics::VectorMasking<DestElementType, vta, vma>( + result, + std::get<0>(Intrinsic(arg)), + result, + vstart - dst_index * (16 / sizeof(DestElementType)), + vl - dst_index * (16 / sizeof(DestElementType)), + std::get<0>(intrinsics::MaskForRegisterInSequence<DestElementType>(mask, dst_index)))); + state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>(); + } + SetCsr<CsrName::kVstart>(0); + } + + template <auto Intrinsic, typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta,
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc index 241651c..3bba652 100644 --- a/interpreter/riscv64/interpreter_test.cc +++ b/interpreter/riscv64/interpreter_test.cc
@@ -517,6 +517,125 @@ Verify(insn_bytes | (1 << 25), 2, 8, expected_result_int32, kNoMask); } + void TestExtendingVectorInstruction(uint32_t insn_bytes, + const __v8hu (&expected_result_int16)[8], + const __v4su (&expected_result_int32)[8], + const __v2du (&expected_result_int64)[8], + const __v2du (&source)[16], + const uint8_t factor) { + auto Verify = [this, &source, &factor](uint32_t insn_bytes, + uint8_t vsew, + uint8_t vlmul_max, + const auto& expected_result, + auto mask) { + CHECK((factor == 2) || (factor == 4) || (factor == 8)); + // Mask register is, unconditionally, v0, and we need 8, 16, or 24 to handle full 8-registers + // inputs thus we use v8..v15 for destination and place sources into v16..v23 and v24..v31. + state_.cpu.v[0] = SIMD128Register{kMask}.Get<__uint128_t>(); + for (uint8_t index = 0; index < (8 / factor); ++index) { + state_.cpu.v[16 + index] = SIMD128Register{source[index]}.Get<__uint128_t>(); + } + for (uint8_t vlmul = 0; vlmul < vlmul_max; ++vlmul) { + if (vlmul == 3) { + continue; + } + for (uint8_t vta = 0; vta < 2; ++vta) { + for (uint8_t vma = 0; vma < 2; ++vma) { + auto [vlmax, vtype] = + intrinsics::Vsetvl(~0ULL, (vma << 7) | (vta << 6) | (vsew << 3) | vlmul); + // Incompatible vsew and vlmax. Skip it. + if (vlmax == 0) { + continue; + } + // To make tests quick enough we don't test vstart and vl change with small register + // sets. Only with vlmul == 2 (4 registers) we set vstart and vl to skip half of + // first + // register and half of last register. + // Don't use vlmul == 3 because that one may not be supported if instruction widens + // the result. + if (vlmul == 2) { + state_.cpu.vstart = vlmax / 8; + state_.cpu.vl = (vlmax * 5) / 8; + } else { + state_.cpu.vstart = 0; + state_.cpu.vl = vlmax; + } + state_.cpu.vtype = vtype; + + // Set expected_result vector registers into 0b01010101… pattern. 
+ for (size_t index = 0; index < 8; ++index) { + state_.cpu.v[8 + index] = SIMD128Register{kUndisturbedResult}.Get<__uint128_t>(); + } + + state_.cpu.insn_addr = ToGuestAddr(&insn_bytes); + EXPECT_TRUE(RunOneInstruction(&state_, state_.cpu.insn_addr + 4)); + + // Values for inactive elements (i.e. corresponding mask bit is 0). + const size_t n = std::size(source) * 2; + __m128i expected_inactive[n]; + // For most instructions, follow basic inactive processing rules based on vma flag. + std::fill_n(expected_inactive, n, (vma ? kAgnosticResult : kUndisturbedResult)); + + if (vlmul < 4) { + for (size_t index = 0; index < 1 << vlmul; ++index) { + if (index == 0 && vlmul == 2) { + EXPECT_EQ(state_.cpu.v[8 + index], + SIMD128Register{ + (kUndisturbedResult & kFractionMaskInt8[3]) | + (expected_result[index] & mask[index] & ~kFractionMaskInt8[3]) | + (expected_inactive[index] & ~mask[index] & ~kFractionMaskInt8[3])} + .Get<__uint128_t>()); + } else if (index == 2 && vlmul == 2) { + EXPECT_EQ( + state_.cpu.v[8 + index], + SIMD128Register{ + (expected_result[index] & mask[index] & kFractionMaskInt8[3]) | + (expected_inactive[index] & ~mask[index] & kFractionMaskInt8[3]) | + ((vta ? kAgnosticResult : kUndisturbedResult) & ~kFractionMaskInt8[3])} + .Get<__uint128_t>()); + } else if (index == 3 && vlmul == 2 && vta) { + EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kAgnosticResult}); + } else if (index == 3 && vlmul == 2) { + EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kUndisturbedResult}); + } else { + EXPECT_EQ(state_.cpu.v[8 + index], + SIMD128Register{(expected_result[index] & mask[index]) | + (expected_inactive[index] & ~mask[index])} + .Get<__uint128_t>()); + } + } + } else { + EXPECT_EQ( + state_.cpu.v[8], + SIMD128Register{(expected_result[0] & mask[0] & kFractionMaskInt8[vlmul - 4]) | + (expected_inactive[0] & ~mask[0] & kFractionMaskInt8[vlmul - 4]) | + ((vta ? 
kAgnosticResult : kUndisturbedResult) & + ~kFractionMaskInt8[vlmul - 4])} + .Get<__uint128_t>()); + } + + if (vlmul == 2) { + // Every vector instruction must set vstart to 0, but shouldn't touch vl. + EXPECT_EQ(state_.cpu.vstart, 0); + EXPECT_EQ(state_.cpu.vl, (vlmax * 5) / 8); + } + } + } + } + }; + + if (factor == 2) { + Verify(insn_bytes, 1, 8, expected_result_int16, kMaskInt16); + Verify(insn_bytes | (1 << 25), 1, 8, expected_result_int16, kNoMask); + } + if (factor == 2 || factor == 4) { + Verify(insn_bytes, 2, 8, expected_result_int32, kMaskInt32); + Verify(insn_bytes | (1 << 25), 2, 8, expected_result_int32, kNoMask); + } + Verify(insn_bytes, 3, 8, expected_result_int64, kMaskInt64); + Verify(insn_bytes | (1 << 25), 3, 8, expected_result_int64, kNoMask); + } + void TestVectorMaskInstruction(uint8_t max_vstart, intrinsics::InactiveProcessing vma, uint32_t insn_bytes, @@ -3580,6 +3699,137 @@ kVectorCalculationsSource[0]); } +TEST_F(Riscv64InterpreterTest, TestVXext) { + TestExtendingVectorInstruction( + 0x49012457, // vzext.vf8 v8,v16,v0.t + {}, {}, + {{0x0000'0000'0000'0000, 0x0000'0000'0000'0081}, + {0x0000'0000'0000'0002, 0x0000'0000'0000'0083}, + {0x0000'0000'0000'0004, 0x0000'0000'0000'0085}, + {0x0000'0000'0000'0006, 0x0000'0000'0000'0087}, + {0x0000'0000'0000'0008, 0x0000'0000'0000'0089}, + {0x0000'0000'0000'000a, 0x0000'0000'0000'008b}, + {0x0000'0000'0000'000c, 0x0000'0000'0000'008d}, + {0x0000'0000'0000'000e, 0x0000'0000'0000'008f}}, + kVectorCalculationsSource, + 8); + + TestExtendingVectorInstruction( + 0x4901a457, // vsext.vf8 v8,v16,v0.t + {}, {}, + {{0x0000'0000'0000'0000, 0xffff'ffff'ffff'ff81}, + {0x0000'0000'0000'0002, 0xffff'ffff'ffff'ff83}, + {0x0000'0000'0000'0004, 0xffff'ffff'ffff'ff85}, + {0x0000'0000'0000'0006, 0xffff'ffff'ffff'ff87}, + {0x0000'0000'0000'0008, 0xffff'ffff'ffff'ff89}, + {0x0000'0000'0000'000a, 0xffff'ffff'ffff'ff8b}, + {0x0000'0000'0000'000c, 0xffff'ffff'ffff'ff8d}, + {0x0000'0000'0000'000e, 0xffff'ffff'ffff'ff8f}}, + 
kVectorCalculationsSource, + 8); + + TestExtendingVectorInstruction( + 0x49022457, // vzext.vf4 v8, v16, v0.t + {}, + {{0x0000'0000, 0x0000'0081, 0x0000'0002, 0x0000'0083}, + {0x0000'0004, 0x0000'0085, 0x0000'0006, 0x0000'0087}, + {0x0000'0008, 0x0000'0089, 0x0000'000a, 0x0000'008b}, + {0x0000'000c, 0x0000'008d, 0x0000'000e, 0x0000'008f}, + {0x0000'0010, 0x0000'0091, 0x0000'0012, 0x0000'0093}, + {0x0000'0014, 0x0000'0095, 0x0000'0016, 0x0000'0097}, + {0x0000'0018, 0x0000'0099, 0x0000'001a, 0x0000'009b}, + {0x0000'001c, 0x0000'009d, 0x0000'001e, 0x0000'009f}}, + {{0x0000'0000'0000'8100, 0x0000'0000'0000'8302}, + {0x0000'0000'0000'8504, 0x0000'0000'0000'8706}, + {0x0000'0000'0000'8908, 0x0000'0000'0000'8b0a}, + {0x0000'0000'0000'8d0c, 0x0000'0000'0000'8f0e}, + {0x0000'0000'0000'9110, 0x0000'0000'0000'9312}, + {0x0000'0000'0000'9514, 0x0000'0000'0000'9716}, + {0x0000'0000'0000'9918, 0x0000'0000'0000'9b1a}, + {0x0000'0000'0000'9d1c, 0x0000'0000'0000'9f1e}}, + kVectorCalculationsSource, + 4); + + TestExtendingVectorInstruction( + 0x4902a457, // vsext.vf4 v8,v16,v0.t + {}, + {{0x0000'0000, 0xffff'ff81, 0x0000'0002, 0xffff'ff83}, + {0x0000'0004, 0xffff'ff85, 0x0000'0006, 0xffff'ff87}, + {0x0000'0008, 0xffff'ff89, 0x0000'000a, 0xffff'ff8b}, + {0x0000'000c, 0xffff'ff8d, 0x0000'000e, 0xffff'ff8f}, + {0x0000'0010, 0xffff'ff91, 0x0000'0012, 0xffff'ff93}, + {0x0000'0014, 0xffff'ff95, 0x0000'0016, 0xffff'ff97}, + {0x0000'0018, 0xffff'ff99, 0x0000'001a, 0xffff'ff9b}, + {0x0000'001c, 0xffff'ff9d, 0x0000'001e, 0xffff'ff9f}}, + {{0xffff'ffff'ffff'8100, 0xffff'ffff'ffff'8302}, + {0xffff'ffff'ffff'8504, 0xffff'ffff'ffff'8706}, + {0xffff'ffff'ffff'8908, 0xffff'ffff'ffff'8b0a}, + {0xffff'ffff'ffff'8d0c, 0xffff'ffff'ffff'8f0e}, + {0xffff'ffff'ffff'9110, 0xffff'ffff'ffff'9312}, + {0xffff'ffff'ffff'9514, 0xffff'ffff'ffff'9716}, + {0xffff'ffff'ffff'9918, 0xffff'ffff'ffff'9b1a}, + {0xffff'ffff'ffff'9d1c, 0xffff'ffff'ffff'9f1e}}, + kVectorCalculationsSource, + 4); + + 
TestExtendingVectorInstruction( + 0x49032457, // vzext.vf2 v8,v16,v0.t + {{0x0000, 0x0081, 0x0002, 0x0083, 0x0004, 0x0085, 0x0006, 0x0087}, + {0x0008, 0x0089, 0x000a, 0x008b, 0x000c, 0x008d, 0x000e, 0x008f}, + {0x0010, 0x0091, 0x0012, 0x0093, 0x0014, 0x0095, 0x0016, 0x0097}, + {0x0018, 0x0099, 0x001a, 0x009b, 0x001c, 0x009d, 0x001e, 0x009f}, + {0x0020, 0x00a1, 0x0022, 0x00a3, 0x0024, 0x00a5, 0x0026, 0x00a7}, + {0x0028, 0x00a9, 0x002a, 0x00ab, 0x002c, 0x00ad, 0x002e, 0x00af}, + {0x0030, 0x00b1, 0x0032, 0x00b3, 0x0034, 0x00b5, 0x0036, 0x00b7}, + {0x0038, 0x00b9, 0x003a, 0x00bb, 0x003c, 0x00bd, 0x003e, 0x00bf}}, + {{0x0000'8100, 0x0000'8302, 0x0000'8504, 0x0000'8706}, + {0x0000'8908, 0x0000'8b0a, 0x0000'8d0c, 0x0000'8f0e}, + {0x0000'9110, 0x0000'9312, 0x0000'9514, 0x0000'9716}, + {0x0000'9918, 0x0000'9b1a, 0x0000'9d1c, 0x0000'9f1e}, + {0x0000'a120, 0x0000'a322, 0x0000'a524, 0x0000'a726}, + {0x0000'a928, 0x0000'ab2a, 0x0000'ad2c, 0x0000'af2e}, + {0x0000'b130, 0x0000'b332, 0x0000'b534, 0x0000'b736}, + {0x0000'b938, 0x0000'bb3a, 0x0000'bd3c, 0x0000'bf3e}}, + {{0x0000'0000'8302'8100, 0x0000'0000'8706'8504}, + {0x0000'0000'8b0a'8908, 0x0000'0000'8f0e'8d0c}, + {0x0000'0000'9312'9110, 0x0000'0000'9716'9514}, + {0x0000'0000'9b1a'9918, 0x0000'0000'9f1e'9d1c}, + {0x0000'0000'a322'a120, 0x0000'0000'a726'a524}, + {0x0000'0000'ab2a'a928, 0x0000'0000'af2e'ad2c}, + {0x0000'0000'b332'b130, 0x0000'0000'b736'b534}, + {0x0000'0000'bb3a'b938, 0x0000'0000'bf3e'bd3c}}, + kVectorCalculationsSource, + 2); + + TestExtendingVectorInstruction( + 0x4903a457, // vsext.vf2 v8,v16,v0.t + {{0x0000, 0xff81, 0x0002, 0xff83, 0x0004, 0xff85, 0x0006, 0xff87}, + {0x0008, 0xff89, 0x000a, 0xff8b, 0x000c, 0xff8d, 0x000e, 0xff8f}, + {0x0010, 0xff91, 0x0012, 0xff93, 0x0014, 0xff95, 0x0016, 0xff97}, + {0x0018, 0xff99, 0x001a, 0xff9b, 0x001c, 0xff9d, 0x001e, 0xff9f}, + {0x0020, 0xffa1, 0x0022, 0xffa3, 0x0024, 0xffa5, 0x0026, 0xffa7}, + {0x0028, 0xffa9, 0x002a, 0xffab, 0x002c, 0xffad, 0x002e, 0xffaf}, + {0x0030, 
0xffb1, 0x0032, 0xffb3, 0x0034, 0xffb5, 0x0036, 0xffb7}, + {0x0038, 0xffb9, 0x003a, 0xffbb, 0x003c, 0xffbd, 0x003e, 0xffbf}}, + {{0xffff'8100, 0xffff'8302, 0xffff'8504, 0xffff'8706}, + {0xffff'8908, 0xffff'8b0a, 0xffff'8d0c, 0xffff'8f0e}, + {0xffff'9110, 0xffff'9312, 0xffff'9514, 0xffff'9716}, + {0xffff'9918, 0xffff'9b1a, 0xffff'9d1c, 0xffff'9f1e}, + {0xffff'a120, 0xffff'a322, 0xffff'a524, 0xffff'a726}, + {0xffff'a928, 0xffff'ab2a, 0xffff'ad2c, 0xffff'af2e}, + {0xffff'b130, 0xffff'b332, 0xffff'b534, 0xffff'b736}, + {0xffff'b938, 0xffff'bb3a, 0xffff'bd3c, 0xffff'bf3e}}, + {{0xffff'ffff'8302'8100, 0xffff'ffff'8706'8504}, + {0xffff'ffff'8b0a'8908, 0xffff'ffff'8f0e'8d0c}, + {0xffff'ffff'9312'9110, 0xffff'ffff'9716'9514}, + {0xffff'ffff'9b1a'9918, 0xffff'ffff'9f1e'9d1c}, + {0xffff'ffff'a322'a120, 0xffff'ffff'a726'a524}, + {0xffff'ffff'ab2a'a928, 0xffff'ffff'af2e'ad2c}, + {0xffff'ffff'b332'b130, 0xffff'ffff'b736'b534}, + {0xffff'ffff'bb3a'b938, 0xffff'ffff'bf3e'bd3c}}, + kVectorCalculationsSource, + 2); +} } // namespace } // namespace berberis
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h index 7a2a334..d243699 100644 --- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h +++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -303,6 +303,39 @@ return result; } +template <typename ElementType> +SIMD128Register VectorExtend(SIMD128Register src) { + SIMD128Register result; + constexpr int kElementsCount = static_cast<int>(8 / sizeof(ElementType)); + for (int index = 0; index < kElementsCount; ++index) { + result.Set(Widen(VectorElement<ElementType>(src, index)), index); + } + return result; +} + +template <typename ElementType, + enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> +inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) { + using SourceElementType = decltype(Narrow(ElementType{0})); + return {VectorExtend<SourceElementType>(src)}; +} + +template <typename ElementType, + enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> +inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) { + using WideSourceElementType = decltype(Narrow(ElementType{0})); + using SourceElementType = decltype(Narrow(WideSourceElementType{0})); + return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))}; +} + +template <typename ElementType, + enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible> +inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) { + using WideWideSourceElementType = decltype(Narrow(ElementType{0})); + return { + VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))}; +} + // SEW = 2*SEW op SEW // TODO(b/260725458): Pass lambda as template argument after C++20 would become available. template <typename ElementType, typename Lambda, typename... ParameterType> @@ -376,9 +409,8 @@ DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments); \ } -#define DEFINE_1OP_ARITHMETIC_INTRINSIC_M(name, ...) \ - DEFINE_ARITHMETIC_INTRINSIC(V##name##m, return ({ __VA_ARGS__; }); \ - , (Int128 src), (src)) +#define DEFINE_1OP_ARITHMETIC_INTRINSIC_M(name, ...) 
\ + DEFINE_ARITHMETIC_INTRINSIC(V##name##m, return ({ __VA_ARGS__; });, (Int128 src), (src)) #define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...) \ DEFINE_ARITHMETIC_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \ , (ElementType src1, ElementType src2), (src1, src2))