Implement vfred{[ou]sum,min,max}.vs instructions.

Test: m berberis_all

Change-Id: I8e44fc31fdc90341a8d2c71ac69caf17301a3601
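
For context: an RVV `.vs` reduction folds every active element of vs2 into a
scalar seeded from element 0 of vs1 and writes the result to element 0 of vd.
A minimal sketch of that semantic in plain C++ (illustrative only, not
Berberis code; `Op`, `mask`, and the raw-array types are assumptions):

    #include <cstddef>

    // vd[0] = op(vs1[0], vs2[0], ..., vs2[vl-1]); op is FAdd/FMin/FMax here.
    template <typename T, typename Op>
    T ReduceVs(Op op, T vs1_elem0, const T* vs2, const bool* mask, size_t vl) {
      T acc = vs1_elem0;  // Scalar accumulator seeded from vs1[0].
      for (size_t i = 0; i < vl; ++i) {
        if (mask[i]) acc = op(acc, vs2[i]);  // Fold active elements in order.
      }
      return acc;  // Stored to vd[0]; the tail of vd follows vta.
    }

The interpreter below takes a slightly different route: instead of skipping
inactive elements it substitutes the operation's identity element for them
(via Vec<kDefaultElement>), which is why the patch has to pick the right
filler: ±0.0 for the sums, ±inf for min/max.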
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h
index 084aff9..52cc3d8 100644
--- a/decoder/include/berberis/decoder/riscv64/decoder.h
+++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -284,13 +284,13 @@
 
   enum class VOpFVvOpcode : uint8_t {
     kVfaddvv = 0b000000,
-    kVfredusumvv = 0b000001,
+    kVfredusumvs = 0b000001,
     kVfsubvv = 0b000010,
-    kVfredosumvv = 0b000011,
+    kVfredosumvs = 0b000011,
     kVfminvv = 0b000100,
-    kVfredminvv = 0b000101,
+    kVfredminvs = 0b000101,
     kVfmaxvv = 0b000110,
-    kVfredmaxvv = 0b000111,
+    kVfredmaxvs = 0b000111,
     kVfsgnjvv = 0b001000,
     kVfsgnjnvv = 0b001001,
     kVfsgnjxvv = 0b001010,
diff --git a/interpreter/riscv64/interpreter.h b/interpreter/riscv64/interpreter.h
index 56e5994..99a983d 100644
--- a/interpreter/riscv64/interpreter.h
+++ b/interpreter/riscv64/interpreter.h
@@ -1270,7 +1270,23 @@
   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
   void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
     using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
+      case Decoder::VOpFVfOpcode::kVfminvf:
+        return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
+            args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfmaxvf:
+        return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
+            args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfsgnjvf:
+        return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
+            args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfsgnjnvf:
+        return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
+            args.dst, args.src1, arg2);
+      case Decoder::VOpFVfOpcode::kVfsgnjxvf:
+        return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
+            args.dst, args.src1, arg2);
       case Decoder::VOpFVfOpcode::kVfmvsf:
         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
           return Unimplemented();
@@ -1295,21 +1311,6 @@
                            InactiveProcessing::kUndisturbed>(
               args.dst, arg2, /*dst_mask=*/args.src1);
         }
-      case Decoder::VOpFVfOpcode::kVfminvf:
-        return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, arg2);
-      case Decoder::VOpFVfOpcode::kVfmaxvf:
-        return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, arg2);
-      case Decoder::VOpFVfOpcode::kVfsgnjvf:
-        return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, arg2);
-      case Decoder::VOpFVfOpcode::kVfsgnjnvf:
-        return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, arg2);
-      case Decoder::VOpFVfOpcode::kVfsgnjxvf:
-        return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
-            args.dst, args.src1, arg2);
       case Decoder::VOpFVfOpcode::kVmfeqvf:
         return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
             args.dst, args.src1, arg2);
@@ -1374,6 +1375,7 @@
     // double-width floats use these encodings to produce regular Float32 types.
     if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
       using WideElementType = typename TypeTraits<ElementType>::Wide;
+      // Keep cases sorted in opcode order to match the RISC-V V manual.
       switch (args.opcode) {
         case Decoder::VOpFVvOpcode::kVFUnary0:
           switch (args.vfunary0_opcode) {
@@ -1446,6 +1448,7 @@
       using WideElementType = WideType<ElementType>;
       using WideSignedType = WideType<SignedType>;
       using WideUnsignedType = WideType<UnsignedType>;
+      // Keep cases sorted in opcode order to match the RISC-V V manual.
       switch (args.opcode) {
         case Decoder::VOpFVvOpcode::kVFUnary0:
           switch (args.vfunary0_opcode) {
@@ -1532,21 +1535,75 @@
    // If our ElementType is Float16 then “straight” operations are unsupported and we shouldn't
    // try to instantiate any functions since this would lead to a compile-time error.
     if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
+      // The IEEE 754 floating-point value -0.0 has the top bit set and all other bits clear:
+      // https://en.wikipedia.org/wiki/Signed_zero#Representations
+      // This is exactly the representation the minimum negative integer has in two's complement:
+      // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
+      // Note: we pass filler elements as integers because `Float32`/`Float64` can't be used as
+      // template parameters.
+      constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
+      // The IEEE 754 floating-point value +0.0 is all-zero bits, same as integer zero.
+      constexpr SignedType kPositiveZero{};
+      // Keep cases sorted in opcode order to match the RISC-V V manual.
       switch (args.opcode) {
-        case Decoder::VOpFVvOpcode::kVfmvfs:
-          if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
-            return Unimplemented();
+        case Decoder::VOpFVvOpcode::kVfredusumvs:
+          // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
+          // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
+          if (GetCsr<kFrm>() != FPFlags::RDN) {
+            return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
+                              ElementType,
+                              vlmul,
+                              vta,
+                              vma,
+                              kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+          } else {
+            return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
+                              ElementType,
+                              vlmul,
+                              vta,
+                              vma,
+                              kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
           }
-          if (args.src2 != 0) {
-            return Unimplemented();
+        case Decoder::VOpFVvOpcode::kVfredosumvs:
+          // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
+          // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
+          if (GetCsr<kFrm>() != FPFlags::RDN) {
+            return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
+                              ElementType,
+                              vlmul,
+                              vta,
+                              vma,
+                              kFrm>(args.dst, args.src1, Vec<kNegativeZero>{args.src2});
+          } else {
+            return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
+                              ElementType,
+                              vlmul,
+                              vta,
+                              vma,
+                              kFrm>(args.dst, args.src1, Vec<kPositiveZero>{args.src2});
           }
-          return OpVectorVmvfs<ElementType>(args.dst, args.src1);
-        case Decoder::VOpFVvOpcode::kVfmaxvv:
-          return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
-              args.dst, args.src1, args.src2);
         case Decoder::VOpFVvOpcode::kVfminvv:
           return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst, args.src1, args.src2);
+        case Decoder::VOpFVvOpcode::kVfredminvs:
+          // For Vfredmin the identity element is +inf.
+          return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
+              args.dst,
+              args.src1,
+              Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
+                                                                        : 0x7ff0'0000'0000'0000}>{
+                  args.src2});
+        case Decoder::VOpFVvOpcode::kVfmaxvv:
+          return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
+              args.dst, args.src1, args.src2);
+        case Decoder::VOpFVvOpcode::kVfredmaxvs:
+          // For Vfredmax the identity element is -inf.
+          return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
+              args.dst,
+              args.src1,
+              Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
+                                                                        : 0xfff0'0000'0000'0000}>{
+                  args.src2});
         case Decoder::VOpFVvOpcode::kVfsgnjvv:
           return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
               args.dst, args.src1, args.src2);
@@ -1616,6 +1673,14 @@
               break;  // Make compiler happy.
           }
           break;
+        case Decoder::VOpFVvOpcode::kVfmvfs:
+          if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+            return Unimplemented();
+          }
+          if (args.src2 != 0) {
+            return Unimplemented();
+          }
+          return OpVectorVmvfs<ElementType>(args.dst, args.src1);
         case Decoder::VOpFVvOpcode::kVmfeqvv:
           return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
               args.dst, args.src1, args.src2);
@@ -1669,6 +1734,7 @@
     using UnsignedType = berberis::UnsignedType<ElementType>;
     using SaturatingSignedType = SaturatingType<SignedType>;
     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
       case Decoder::VOpIViOpcode::kVaddvi:
         return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
@@ -1795,6 +1861,7 @@
     using UnsignedType = berberis::UnsignedType<ElementType>;
     using SaturatingSignedType = SaturatingType<SignedType>;
     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
       case Decoder::VOpIVvOpcode::kVaddvv:
         return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
@@ -1918,6 +1985,7 @@
     using UnsignedType = berberis::UnsignedType<ElementType>;
     using SaturatingSignedType = SaturatingType<SignedType>;
     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
       case Decoder::VOpIVxOpcode::kVaddvx:
         return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
@@ -2049,6 +2117,7 @@
     using SignedType = berberis::SignedType<ElementType>;
     using UnsignedType = berberis::UnsignedType<ElementType>;
     if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
+      // Keep cases sorted in opcode order to match the RISC-V V manual.
       switch (args.opcode) {
         case Decoder::VOpMVvOpcode::kVmandnmm:
           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
@@ -2077,6 +2146,7 @@
         default:;  // Do nothing: handled in next switch.
       }
     }
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
       case Decoder::VOpMVvOpcode::kVredsumvs:
         return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
@@ -2277,6 +2347,7 @@
   void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
     using SignedType = berberis::SignedType<ElementType>;
     using UnsignedType = berberis::UnsignedType<ElementType>;
+    // Keep cases sorted in opcode order to match the RISC-V V manual.
     switch (args.opcode) {
       case Decoder::VOpMVxOpcode::kVslide1upvx:
         return OpVectorslide1up<SignedType, vlmul, vta, vma>(
@@ -2822,10 +2893,15 @@
             VectorRegisterGroupMultiplier vlmul,
             TailProcessing vta,
             auto vma,
+            CsrName... kExtraCsrs,
             auto kDefaultElement>
   void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
-    return OpVectorvs<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
-        dst, src1, src2);
+    return OpVectorvs<Intrinsic,
+                      ElementType,
+                      NumberOfRegistersInvolved(vlmul),
+                      vta,
+                      vma,
+                      kExtraCsrs...>(dst, src1, src2);
   }
 
   template <auto Intrinsic,
@@ -2833,6 +2909,7 @@
             size_t kRegistersInvolved,
             TailProcessing vta,
             auto vma,
+            CsrName... kExtraCsrs,
             auto kDefaultElement>
   void OpVectorvs(uint8_t dst, uint8_t src1, Vec<kDefaultElement> src2) {
     if (!IsAligned<kRegistersInvolved>(dst | src2.start_no)) {
@@ -2852,7 +2929,9 @@
     ElementType arg1 = SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0);
     for (size_t index = 0; index < kRegistersInvolved; ++index) {
       arg1 = std::get<0>(
-          Intrinsic(arg1, GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask)));
+          Intrinsic(GetCsr<kExtraCsrs>()...,
+                    arg1,
+                    GetVectorArgument<ElementType, vta, vma>(src2, vstart, vl, index, mask)));
     }
     SIMD128Register result{state_->cpu.v[dst]};
     result.Set(arg1, 0);
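
Why the sum identity depends on the rounding mode: IEEE 754 addition of zeros
with opposite signs yields +0.0 in every rounding mode except round-toward
negative, where it yields -0.0; so -0.0 is the additive identity everywhere
except under RDN, where +0.0 is. A quick host-FPU demonstration (illustrative
only; strictly conforming code would also need `#pragma STDC FENV_ACCESS ON`):

    #include <cassert>
    #include <cfenv>
    #include <cmath>

    int main() {
      volatile double pz = +0.0, nz = -0.0;  // volatile defeats constant folding.
      std::fesetround(FE_TONEAREST);         // Any mode other than RDN.
      assert(!std::signbit(pz + nz));        // +0.0: -0.0 acted as the identity.
      std::fesetround(FE_DOWNWARD);          // frm == RDN.
      assert(std::signbit(pz + nz));         // -0.0: the identity must be +0.0.
    }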
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index fdd9a81..53a97c1 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -1667,6 +1667,21 @@
   }
 
   void TestVectorReductionInstruction(uint32_t insn_bytes,
+                                      const uint32_t (&expected_result_vd0_int32)[8],
+                                      const uint64_t (&expected_result_vd0_int64)[8],
+                                      const uint32_t (&expected_result_vd0_with_mask_int32)[8],
+                                      const uint64_t (&expected_result_vd0_with_mask_int64)[8],
+                                      const __v2du (&source)[16]) {
+    TestVectorReductionInstruction(
+        insn_bytes,
+        source,
+        std::tuple<const uint32_t(&)[8], const uint32_t(&)[8]>{expected_result_vd0_int32,
+                                                               expected_result_vd0_with_mask_int32},
+        std::tuple<const uint64_t(&)[8], const uint64_t(&)[8]>{
+            expected_result_vd0_int64, expected_result_vd0_with_mask_int64});
+  }
+
+  void TestVectorReductionInstruction(uint32_t insn_bytes,
                                       const uint8_t (&expected_result_vd0_int8)[8],
                                       const uint16_t (&expected_result_vd0_int16)[8],
                                       const uint32_t (&expected_result_vd0_int32)[8],
@@ -8265,6 +8280,44 @@
       kVectorCalculationsSource);
 }
 
+TEST_F(Riscv64InterpreterTest, TestVfredosum) {
+  TestVectorReductionInstruction(
+      0xd0c1457,  // vfredosum.vs v8, v16, v24, v0.t
+      // expected_result_vd0_int32
+      {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9e0c'9a8e},
+      // expected_result_vd0_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      // expected_result_vd0_with_mask_int32
+      {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9604'9200},
+      // expected_result_vd0_with_mask_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      kVectorCalculationsSource);
+}
+
+// Currently Vfredusum is implemented as Vfredosum (as explicitly permitted by RVV 1.0).
+// If we later implement optimizations that change the results, these tests may need updating.
+TEST_F(Riscv64InterpreterTest, TestVfredusum) {
+  TestVectorReductionInstruction(
+      0x50c1457,  // vfredusum.vs v8, v16, v24, v0.t
+      // expected_result_vd0_int32
+      {0x9e0c'9a8e, 0xbe2c'bace, 0xfe6c'fb4e, 0x7e6b'fc4d, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9e0c'9a8e},
+      // expected_result_vd0_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0x7eec'5def'0cee'0dee,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      // expected_result_vd0_with_mask_int32
+      {0x9604'929d, 0xbe2c'ba29, 0xfe6c'fb4e, 0x7e6b'fa84, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9604'9200},
+      // expected_result_vd0_with_mask_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0x6efc'4e0d'ee0d'ee0f,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      kVectorCalculationsSource);
+}
+
 TEST_F(Riscv64InterpreterTest, TestVredand) {
   TestVectorReductionInstruction(
       0x50c2457,  // vredand.vs v8,v16,v24,v0.t
@@ -8395,6 +8448,24 @@
       kVectorCalculationsSource);
 }
 
+TEST_F(Riscv64InterpreterTest, TestVfredmin) {
+  TestVectorReductionInstruction(
+      0x150c1457,  // vfredmin.vs v8, v16, v24, v0.t
+      // expected_result_vd0_int32
+      {0x9e0c'9a09, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9e0c'9a09},
+      // expected_result_vd0_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xfe6c'fa69'f664'f260, 0xfe6c'fa69'f664'f260,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      // expected_result_vd0_with_mask_int32
+      {0x9604'9200, 0xbe2c'ba29, 0xfe6c'fa69, 0xfe6c'fa69, /* unused */ 0, /* unused */ 0,
+       0x9604'9200, 0x9604'9200},
+      // expected_result_vd0_with_mask_int64
+      {0x9e0c'9a09'9604'9200, 0xbe2c'ba29'b624'b220, 0xee7c'ea78'e674'e271, 0xee7c'ea78'e674'e271,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x9e0c'9a09'9604'9200},
+      kVectorCalculationsSource);
+}
+
 TEST_F(Riscv64InterpreterTest, TestVredmaxu) {
   TestVectorReductionInstruction(
       0x190c2457,  // vredmaxu.vs v8,v16,v24,v0.t
@@ -8447,6 +8518,24 @@
       kVectorCalculationsSource);
 }
 
+TEST_F(Riscv64InterpreterTest, TestVfredmax) {
+  TestVectorReductionInstruction(
+      0x1d0c1457,  // vfredmax.vs v8, v16, v24, v0.t
+      // expected_result_vd0_int32
+      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
+       0x8302'8100, 0x8302'8100},
+      // expected_result_vd0_int64
+      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x7eec'7ae9'76e4'72e0,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      // expected_result_vd0_with_mask_int32
+      {0x8302'8100, 0x8302'8100, 0x8302'8100, 0x7eec'7ae9, /* unused */ 0, /* unused */ 0,
+       0x8302'8100, 0x8302'8100},
+      // expected_result_vd0_with_mask_int64
+      {0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x8706'8504'8302'8100, 0x6efc'6af8'66f4'62f1,
+       /* unused */ 0, /* unused */ 0, /* unused */ 0, 0x8706'8504'8302'8100},
+      kVectorCalculationsSource);
+}
+
 // Note that the expected test outputs for v[f]merge.vXm are identical to those for v[f]mv.v.X.
 // This happens because v[f]merge.vXm is just a v[f]mv.v.X with mask (second operand is not used
 // by v[f]mv.v.X but the difference between v[f]merge.vXm and v[f]mv.v.X is captured in masking
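
The insn_bytes constants in these tests follow the standard RVV OP-V layout
(funct6 | vm | vs2 | vs1 | funct3 | vd | opcode). A small decoding sketch with
the field positions from the RVV 1.0 spec (the helper itself is hypothetical):

    #include <cstdint>
    #include <cstdio>

    void DecodeOpV(uint32_t insn) {
      std::printf("funct6=%#x vm=%u vs2=v%u vs1=v%u funct3=%u vd=v%u opcode=%#x\n",
                  insn >> 26,           // funct6, e.g. 0b000111 for vfredmax
                  (insn >> 25) & 1,     // vm; 0 means "masked by v0.t"
                  (insn >> 20) & 0x1f,  // vs2
                  (insn >> 15) & 0x1f,  // vs1
                  (insn >> 12) & 0x7,   // funct3; 0b001 is OPFVV
                  (insn >> 7) & 0x1f,   // vd
                  insn & 0x7f);         // 0b1010111 is OP-V
    }

For example, DecodeOpV(0x1d0c1457) prints funct6=0x7 vm=0 vs2=v16 vs1=v24
funct3=1 vd=v8 opcode=0x57, i.e. vfredmax.vs v8, v16, v24, v0.t.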
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
index 383dd10..5d155e4 100644
--- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
+++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -760,6 +760,15 @@
   DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Vred##name##vs, return ({ __VA_ARGS__; }); \
                                      , (ResultType init, SIMD128Register src), (), (init, src))
 
+#define DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(name, ...)                      \
+  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(Vfred##name##vs, return ({ __VA_ARGS__; }); \
+                                     , (ResultType init, SIMD128Register src), (), (init, src))
+
+#define DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(name, ...) \
+  DEFINE_ARITHMETIC_REDUCE_INTRINSIC(                     \
+      Vfred##name##vs, return ({ __VA_ARGS__; });         \
+      , (int8_t csr, ResultType init, SIMD128Register src), (csr), (init, src))
+
 #define DEFINE_W_ARITHMETIC_INTRINSIC(Name, Pattern, arithmetic, parameters, arguments)           \
   template <typename ElementType,                                                                 \
             enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>       \
@@ -815,6 +824,8 @@
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(xor, (args ^ ...))
 DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(add, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
 DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(add, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
+DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(osum, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
+DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum, std::get<0>(FAdd(FPFlags::DYN, csr, args...)))
 DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(mul, std::get<0>(FMul(FPFlags::DYN, csr, args...)))
 DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV(mul, std::get<0>(FMul(FPFlags::DYN, csr, args...)))
 DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF(div, std::get<0>(FDiv(FPFlags::DYN, csr, args...)))
@@ -900,10 +911,12 @@
                                    (-(arg2 * arg3) + arg1))
 DEFINE_3OP_ARITHMETIC_INTRINSIC_VX(nmsub, auto [arg1, arg2, arg3] = std::tuple{args...};
                                    (-(arg2 * arg3) + arg1))
-DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
-DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmin, std::get<0>(FMin(args...)))
+DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmin, std::get<0>(FMin(args...)))
+DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(min, std::get<0>(FMin(args...)))
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fmax, std::get<0>(FMax(args...)))
+DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fmax, std::get<0>(FMax(args...)))
+DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS(max, std::get<0>(FMax(args...)))
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnj, std::get<0>(FSgnj(args...)))
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VX(fsgnj, std::get<0>(FSgnj(args...)))
 DEFINE_2OP_ARITHMETIC_INTRINSIC_VV(fsgnjn, std::get<0>(FSgnjn(args...)))
@@ -954,12 +967,14 @@
 #undef DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS
 #undef DEFINE_1OP_ARITHMETIC_INTRINSIC_V
 #undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VS
+#undef DEFINE_2OP_FLOAT_ARITHMETIC_INTRINSIC_VS
 #undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VV
 #undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VV
 #undef DEFINE_2OP_ARITHMETIC_INTRINSIC_VX
 #undef DEFINE_3OP_ARITHMETIC_INTRINSIC_VX
 #undef DEFINE_1OP_ARITHMETIC_INTRINSIC_X
 #undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VF
+#undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS
 #undef DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VV
 #undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WV
 #undef DEFINE_2OP_NARROW_ARITHMETIC_INTRINSIC_WX
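
For reference, a hand-written approximation of what the new
DEFINE_2OP_1CSR_ARITHMETIC_INTRINSIC_VS(usum, ...) invocation should produce
(not the actual macro expansion; it reuses this header's SIMD128Register,
FPFlags, and FAdd, and simply mirrors the reduction shape, threading the frm
CSR through every partial sum):

    template <typename ElementType>
    std::tuple<ElementType> VfredusumvsSketch(int8_t csr, ElementType init,
                                              SIMD128Register src) {
      ElementType acc = init;
      constexpr size_t kElements = sizeof(SIMD128Register) / sizeof(ElementType);
      for (size_t index = 0; index < kElements; ++index) {
        // Each partial sum honors the dynamic rounding mode carried in csr.
        acc = std::get<0>(FAdd(FPFlags::DYN, csr, acc, src.Get<ElementType>(index)));
      }
      return {acc};
    }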
diff --git a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
index 0a51a30..a7a20c9 100644
--- a/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
+++ b/intrinsics/riscv64_to_x86_64/include/berberis/intrinsics/macro_assembler_constants_pool.h
@@ -37,16 +37,22 @@
 template <>
 extern const int32_t kVectorConst<int32_t{static_cast<int32_t>(-0x8000'0000)}>;
 template <>
+extern const int32_t kVectorConst<int32_t{-0x0080'0000}>;
+template <>
 extern const int32_t kVectorConst<int32_t{0x3f80'0000}>;
 template <>
+extern const int32_t kVectorConst<int32_t{0x7f80'0000}>;
+template <>
 extern const int32_t kVectorConst<int32_t{0x7fff'ffff}>;
 template <>
 extern const int32_t kVectorConst<int64_t{static_cast<int64_t>(-0x8000'0000'0000'0000)}>;
 template <>
-extern const int32_t kVectorConst<int64_t{0x7ff8'0000'0000'0000}>;
+extern const int32_t kVectorConst<int64_t{0x7ff0'0000'0000'0000}>;
 template <>
 extern const int32_t kVectorConst<int64_t{0x7fff'ffff'ffff'ffff}>;
 template <>
+extern const int32_t kVectorConst<int64_t{-0x0010'0000'0000'0000}>;
+template <>
 extern const int32_t kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
 template <>
 inline const int32_t& kVectorConst<int8_t{0x00}> = kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
@@ -58,19 +64,44 @@
 inline const int32_t& kVectorConst<uint16_t{0x0000}> =
     kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
 template <>
+inline const int32_t& kVectorConst<uint8_t{127}> = kVectorConst<int8_t{127}>;
+template <>
+inline const int32_t& kVectorConst<uint8_t{128}> = kVectorConst<int8_t{-128}>;
+template <>
+inline const int32_t& kVectorConst<uint16_t{0x7fff}> = kVectorConst<int16_t{0x7fff}>;
+template <>
+inline const int32_t& kVectorConst<uint16_t{0x8000}> = kVectorConst<int16_t{-0x8000}>;
+template <>
 inline const int32_t& kVectorConst<int32_t{0x0000'0000}> =
     kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
 template <>
 inline const int32_t& kVectorConst<uint32_t{0x0000'0000}> =
     kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
 template <>
+inline const int32_t& kVectorConst<uint32_t{0x3f80'0000}> = kVectorConst<int32_t{0x3f80'0000}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x7f80'0000}> = kVectorConst<int32_t{0x7f80'0000}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x7fff'ffff}> = kVectorConst<int32_t{0x7fff'ffff}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0x8000'0000}> =
+    kVectorConst<int32_t{static_cast<int32_t>(-0x8000'0000)}>;
+template <>
+inline const int32_t& kVectorConst<uint32_t{0xff80'0000}> = kVectorConst<int32_t{-0x0080'0000}>;
+template <>
 inline const int32_t& kVectorConst<int64_t{0x0000'0000'0000'0000}> =
     kVectorConst<uint64_t{0x0000'0000'0000'0000}>;
 template <>
 extern const int32_t kVectorConst<uint64_t{0x7fc'00000'7fc'00000}>;
 template <>
+inline const int32_t& kVectorConst<uint64_t{0x7ff0'0000'0000'0000}> =
+    kVectorConst<int64_t{0x7ff0'0000'0000'0000}>;
+template <>
 extern const int32_t kVectorConst<uint64_t{0x7ff8'0000'0000'0000}>;
 template <>
+inline const int32_t& kVectorConst<uint64_t{0xfff0'0000'0000'0000}> =
+    kVectorConst<int64_t{-0x0010'0000'0000'0000}>;
+template <>
 extern const int32_t kVectorConst<uint64_t{0xffff'ffff'0000'0000}>;
 template <>
 extern const int32_t kVectorConst<uint64_t{0xffff'ffff'7fc0'0000}>;
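
Each alias above pairs a signed and an unsigned template argument naming the
same bit pattern, so both resolve to a single pool entry. A compile-time
sanity check of the bit patterns involved (plain C++, illustrative only):

    #include <cstdint>

    // Float32 +inf / -inf and Float64 -inf as two's-complement integers.
    static_assert(static_cast<uint32_t>(int32_t{0x7f80'0000}) == 0x7f80'0000u);
    static_assert(static_cast<uint32_t>(int32_t{-0x0080'0000}) == 0xff80'0000u);
    static_assert(static_cast<uint64_t>(int64_t{-0x0010'0000'0000'0000}) ==
                  0xfff0'0000'0000'0000u);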
diff --git a/intrinsics/riscv64_to_x86_64/macro_assembler.cc b/intrinsics/riscv64_to_x86_64/macro_assembler.cc
index 597ebc9..edbf615 100644
--- a/intrinsics/riscv64_to_x86_64/macro_assembler.cc
+++ b/intrinsics/riscv64_to_x86_64/macro_assembler.cc
@@ -42,6 +42,10 @@
                                                          0x7ff8'0000'0000'0000};
   alignas(16) const uint32_t kFloat32One[4] = {0x3f80'0000, 0x3f80'0000, 0x3f80'0000, 0x3f80'0000};
   alignas(16) const uint64_t kFloat64One[2] = {0x3ff0'0000'0000'0000, 0x3ff0'0000'0000'0000};
+  alignas(16) const uint32_t kFloat32PInf[4] = {0x7f80'0000, 0x7f80'0000, 0x7f80'0000, 0x7f80'0000};
+  alignas(16) const uint32_t kFloat32NInf[4] = {0xff80'0000, 0xff80'0000, 0xff80'0000, 0xff80'0000};
+  alignas(16) const uint64_t kFloat64PInf[2] = {0x7ff0'0000'0000'0000, 0x7ff0'0000'0000'0000};
+  alignas(16) const uint64_t kFloat64NInf[2] = {0xfff0'0000'0000'0000, 0xfff0'0000'0000'0000};
   alignas(16) const int8_t kMinInt8[16] = {
       -128,
       -128,
@@ -262,36 +266,40 @@
 };
 
 // Make sure Layout is the same in 32-bit mode and 64-bit mode.
-CHECK_STRUCT_LAYOUT(MacroAssemblerConstants, 27008, 128);
+CHECK_STRUCT_LAYOUT(MacroAssemblerConstants, 27520, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kNanBoxFloat32, 0, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kNanBoxedNansFloat32, 128, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kCanonicalNansFloat32, 256, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kCanonicalNansFloat64, 384, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32One, 512, 128);
 CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64One, 640, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt8, 768, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt8, 896, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt16, 1024, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt16, 1152, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt32, 1280, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt32, 1408, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt64, 1536, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt64, 1664, 128);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt64, 1792, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits64, 1856, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt32, 1920, 32);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits32, 1952, 32);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, k0x8000_0000_0000_00ff, 1984, 64);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kRiscVToX87Exceptions, 2432, 256);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kX87ToRiscVExceptions, 2688, 512);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTable, 3200, 2048);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid64Bit, 5248, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid32Bit, 6272, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid16Bit, 7296, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid8Bit, 8320, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo32bitMask, 9344, 256);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo16bitMask, 9600, 1024);
-CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo8bitMask, 10624, 16384);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32PInf, 768, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat32NInf, 896, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64PInf, 1024, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kFloat64NInf, 1152, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt8, 1280, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt8, 1408, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt16, 1536, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt16, 1664, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt32, 1792, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt32, 1920, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMinInt64, 2048, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kMaxInt64, 2176, 128);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt64, 2304, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits64, 2368, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBsrToClzInt32, 2432, 32);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kWidthInBits32, 2464, 32);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, k0x8000_0000_0000_00ff, 2496, 64);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kRiscVToX87Exceptions, 2944, 256);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kX87ToRiscVExceptions, 3200, 512);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTable, 3712, 2048);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid64Bit, 5760, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid32Bit, 6784, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid16Bit, 7808, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kVid8Bit, 8832, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo32bitMask, 9856, 256);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo16bitMask, 10112, 1024);
+CHECK_FIELD_LAYOUT(MacroAssemblerConstants, kBitMaskTo8bitMask, 11136, 16384);
 
 // Note: because we have aligned fields and thus padding in that data structure
 // value-initialization is both slower and larger than copy-initialization for
@@ -342,17 +350,29 @@
 extern const int32_t kVectorConst<int32_t{0x3f80'0000}> =
     GetConstants() + offsetof(MacroAssemblerConstants, kFloat32One);
 template <>
+extern const int32_t kVectorConst<int32_t{0x7f80'0000}> =
+    GetConstants() + offsetof(MacroAssemblerConstants, kFloat32PInf);
+template <>
 extern const int32_t kVectorConst<int32_t{0x7fff'ffff}> =
     GetConstants() + offsetof(MacroAssemblerConstants, kMaxInt32);
 template <>
+extern const int32_t kVectorConst<int32_t{-0x0080'0000}> =
+    GetConstants() + offsetof(MacroAssemblerConstants, kFloat32NInf);
+template <>
 extern const int32_t kVectorConst<int64_t{static_cast<int64_t>(-0x8000'0000'0000'0000)}> =
     GetConstants() + offsetof(MacroAssemblerConstants, kMinInt64);
 template <>
+extern const int32_t kVectorConst<int64_t{0x3ff0'0000'0000'0000}> =
+    GetConstants() + offsetof(MacroAssemblerConstants, kFloat64One);
+template <>
+extern const int32_t kVectorConst<int64_t{0x7ff0'0000'0000'0000}> =
+    GetConstants() + offsetof(MacroAssemblerConstants, kFloat64PInf);
+template <>
 extern const int32_t kVectorConst<int64_t{0x7fff'ffff'ffff'ffff}> =
     GetConstants() + offsetof(MacroAssemblerConstants, kMaxInt64);
 template <>
-extern const int32_t kVectorConst<int64_t{0x3ff0'0000'0000'0000}> =
-    GetConstants() + offsetof(MacroAssemblerConstants, kFloat64One);
+extern const int32_t kVectorConst<int64_t{-0x0010'0000'0000'0000}> =
+    GetConstants() + offsetof(MacroAssemblerConstants, kFloat64NInf);
 template <>
 const int32_t kVectorConst<uint64_t{0x0000'0000'0000'0000}> =
     GetConstants() + offsetof(MacroAssemblerConstants, kBitMaskTable);