Revert^2 "Enable [vzext|vsext].vf[8|4|2] instructions"

This reverts commit 1bcd5b52b2a37f15d8f0bb96bcffd617cd189cb8.

Reason for revert: Should work now

Change-Id: I834fef1449168fc2e9ecbd315ba54dacb9b9e164
diff --git a/decoder/include/berberis/decoder/riscv64/decoder.h b/decoder/include/berberis/decoder/riscv64/decoder.h
index 483445a..0a3cdde 100644
--- a/decoder/include/berberis/decoder/riscv64/decoder.h
+++ b/decoder/include/berberis/decoder/riscv64/decoder.h
@@ -363,6 +363,7 @@
     kVmxnormm = 0b011111,
     kVXmXXs = 0b010000,
     kVmsXf = 0b010100,
+    kVxunary0 = 0b010010,
     kVmulhuvv = 0b100100,
     kVmulvv = 0b100101,
     kVmulhsuvv = 0b100110,
@@ -459,6 +460,18 @@
     kVidv = 0b10001,
   };
 
+  // Secondary opcodes of the kVxunary0 group: zero-extension (vzext) and sign-extension (vsext)
+  // by a factor of 8, 4 or 2. The value is decoded from the field that shares a union with src2
+  // in the decoded vector arguments.
+  enum class Vxunary0Opcode : uint8_t {
+    kVzextvf8m = 0b00010,
+    kVsextvf8m = 0b00011,
+    kVzextvf4m = 0b00100,
+    kVsextvf4m = 0b00101,
+    kVzextvf2m = 0b00110,
+    kVsextvf2m = 0b00111,
+  };
+
   // Load/Store instruction include 3bit “width” field while all other floating-point instructions
   // include 2bit “fmt” field.
   //
@@ -748,6 +758,7 @@
     union {
       VXmXXsOpcode vXmXXs_opcode;
       VmsXfOpcode vmsXf_opcode;
+      Vxunary0Opcode vxunary0_opcode;
       uint8_t src2;
     };
   };
diff --git a/interpreter/riscv64/interpreter.cc b/interpreter/riscv64/interpreter.cc
index 450341a..bd11a43 100644
--- a/interpreter/riscv64/interpreter.cc
+++ b/interpreter/riscv64/interpreter.cc
@@ -983,6 +983,64 @@
           default:
               return Unimplemented();
         }
+      case Decoder::VOpMVvOpcode::kVxunary0:
+        // UnsignedType/SignedType are passed as the destination element type, so extension by a
+        // factor of N is only instantiated when that type is at least N bytes wide. Otherwise
+        // the case breaks out of the switch and reaches the Unimplemented() call that follows it.
+        switch (args.vxunary0_opcode) {
+          case Decoder::Vxunary0Opcode::kVzextvf2m:
+              if constexpr (sizeof(UnsignedType) >= 2) {
+              return OpVectorExtend<intrinsics::Vextf2<UnsignedType>,
+                                    UnsignedType,
+                                    2,
+                                    vlmul,
+                                    vta,
+                                    vma>(args.dst, args.src1);
+              }
+              break;
+          case Decoder::Vxunary0Opcode::kVsextvf2m:
+              if constexpr (sizeof(SignedType) >= 2) {
+              return OpVectorExtend<intrinsics::Vextf2<SignedType>, SignedType, 2, vlmul, vta, vma>(
+                  args.dst, args.src1);
+              }
+              break;
+          case Decoder::Vxunary0Opcode::kVzextvf4m:
+              if constexpr (sizeof(UnsignedType) >= 4) {
+              return OpVectorExtend<intrinsics::Vextf4<UnsignedType>,
+                                    UnsignedType,
+                                    4,
+                                    vlmul,
+                                    vta,
+                                    vma>(args.dst, args.src1);
+              }
+              break;
+          case Decoder::Vxunary0Opcode::kVsextvf4m:
+              if constexpr (sizeof(SignedType) >= 4) {
+              return OpVectorExtend<intrinsics::Vextf4<SignedType>, SignedType, 4, vlmul, vta, vma>(
+                  args.dst, args.src1);
+              }
+              break;
+          case Decoder::Vxunary0Opcode::kVzextvf8m:
+              if constexpr (sizeof(UnsignedType) >= 8) {
+              return OpVectorExtend<intrinsics::Vextf8<UnsignedType>,
+                                    UnsignedType,
+                                    8,
+                                    vlmul,
+                                    vta,
+                                    vma>(args.dst, args.src1);
+              }
+              break;
+          case Decoder::Vxunary0Opcode::kVsextvf8m:
+              if constexpr (sizeof(SignedType) >= 8) {
+              return OpVectorExtend<intrinsics::Vextf8<SignedType>, SignedType, 8, vlmul, vta, vma>(
+                  args.dst, args.src1);
+              }
+              break;
+          default:
+              return Unimplemented();
+        }
+        // Reached when the requested factor is too large for the current element type.
+        return Unimplemented();
       case Decoder::VOpMVvOpcode::kVmsXf:
         switch (args.vmsXf_opcode) {
           case Decoder::VmsXfOpcode::kVmsbfm:
@@ -1713,6 +1767,49 @@
   }
 
+  // Implements the vzext.vf<kFactor> / vsext.vf<kFactor> family: every destination register is
+  // produced by Intrinsic from a slice of a source register group that is kFactor times smaller.
+  // Masking, tail handling, vstart and vl are applied per destination register. Calls
+  // Unimplemented() when dst or src is not aligned to the size of its register group.
   template <auto Intrinsic,
+            typename DestElementType,
+            const uint8_t kFactor,
+            VectorRegisterGroupMultiplier vlmul,
+            TailProcessing vta,
+            auto vma>
+  void OpVectorExtend(uint8_t dst, uint8_t src) {
+    static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
+    constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
+    // The source group never shrinks below one register. `a ?: b` is the GNU extension meaning
+    // `a ? a : b`.
+    constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1;
+    if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
+      return Unimplemented();
+    }
+    int vstart = GetCsr<CsrName::kVstart>();
+    int vl = GetCsr<CsrName::kVl>();
+    auto mask = GetMaskForVectorOperations<vma>();
+    for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
+      // Destination register dst_index consumes a (128 / kFactor)-bit slice of one source
+      // register; the shift moves that slice down to bit 0 before it is handed to Intrinsic.
+      size_t src_index = dst_index / kFactor;
+      size_t src_elem = dst_index % kFactor;
+      SIMD128Register result{state_->cpu.v[dst + dst_index]};
+      SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
+
+      // vstart and vl are rebased to be relative to the first element of this register.
+      result = std::get<0>(intrinsics::VectorMasking<DestElementType, vta, vma>(
+          result,
+          std::get<0>(Intrinsic(arg)),
+          result,
+          vstart - dst_index * (16 / sizeof(DestElementType)),
+          vl - dst_index * (16 / sizeof(DestElementType)),
+          std::get<0>(intrinsics::MaskForRegisterInSequence<DestElementType>(mask, dst_index))));
+      state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
+    }
+    SetCsr<CsrName::kVstart>(0);
+  }
+
+  template <auto Intrinsic,
             typename ElementType,
             VectorRegisterGroupMultiplier vlmul,
             TailProcessing vta,
diff --git a/interpreter/riscv64/interpreter_test.cc b/interpreter/riscv64/interpreter_test.cc
index 241651c..3bba652 100644
--- a/interpreter/riscv64/interpreter_test.cc
+++ b/interpreter/riscv64/interpreter_test.cc
@@ -517,6 +517,128 @@
     Verify(insn_bytes | (1 << 25), 2, 8, expected_result_int32, kNoMask);
   }
 
+  // Runs insn_bytes masked and unmasked (bit 25 set) for every destination element width that
+  // `factor` allows and compares the destination registers, starting at v8, with the matching
+  // expected_result_* table.
+  void TestExtendingVectorInstruction(uint32_t insn_bytes,
+                                      const __v8hu (&expected_result_int16)[8],
+                                      const __v4su (&expected_result_int32)[8],
+                                      const __v2du (&expected_result_int64)[8],
+                                      const __v2du (&source)[16],
+                                      const uint8_t factor) {
+    auto Verify = [this, &source, &factor](uint32_t insn_bytes,
+                                           uint8_t vsew,
+                                           uint8_t vlmul_max,
+                                           const auto& expected_result,
+                                           auto mask) {
+      CHECK((factor == 2) || (factor == 4) || (factor == 8));
+      // The mask register is always v0. Results are checked in v8..v15; the source is narrower
+      // than the destination, so only 8 / factor registers starting at v16 are loaded.
+      state_.cpu.v[0] = SIMD128Register{kMask}.Get<__uint128_t>();
+      for (uint8_t index = 0; index < (8 / factor); ++index) {
+        state_.cpu.v[16 + index] = SIMD128Register{source[index]}.Get<__uint128_t>();
+      }
+      for (uint8_t vlmul = 0; vlmul < vlmul_max; ++vlmul) {
+        if (vlmul == 3) {
+          continue;
+        }
+        for (uint8_t vta = 0; vta < 2; ++vta) {
+          for (uint8_t vma = 0; vma < 2; ++vma) {
+            auto [vlmax, vtype] =
+                intrinsics::Vsetvl(~0ULL, (vma << 7) | (vta << 6) | (vsew << 3) | vlmul);
+            // Incompatible vsew and vlmax. Skip it.
+            if (vlmax == 0) {
+              continue;
+            }
+            // To make tests quick enough we don't test vstart and vl change with small register
+            // sets. Only with vlmul == 2 (4 registers) we set vstart to skip the first half of
+            // the first register and vl to end in the middle of the third one, which leaves the
+            // fourth register entirely in the tail.
+            // Don't use vlmul == 3 because that one may not be supported if instruction widens
+            // the result.
+            if (vlmul == 2) {
+              state_.cpu.vstart = vlmax / 8;
+              state_.cpu.vl = (vlmax * 5) / 8;
+            } else {
+              state_.cpu.vstart = 0;
+              state_.cpu.vl = vlmax;
+            }
+            state_.cpu.vtype = vtype;
+
+            // Fill v8..v15 with the 0b01010101… pattern (kUndisturbedResult).
+            for (size_t index = 0; index < 8; ++index) {
+              state_.cpu.v[8 + index] = SIMD128Register{kUndisturbedResult}.Get<__uint128_t>();
+            }
+
+            state_.cpu.insn_addr = ToGuestAddr(&insn_bytes);
+            EXPECT_TRUE(RunOneInstruction(&state_, state_.cpu.insn_addr + 4));
+
+            // Values for inactive elements (i.e. corresponding mask bit is 0).
+            const size_t n = std::size(source) * 2;
+            __m128i expected_inactive[n];
+            // For most instructions, follow basic inactive processing rules based on vma flag.
+            std::fill_n(expected_inactive, n, (vma ? kAgnosticResult : kUndisturbedResult));
+
+            if (vlmul < 4) {
+              for (size_t index = 0; index < 1 << vlmul; ++index) {
+                if (index == 0 && vlmul == 2) {
+                  EXPECT_EQ(state_.cpu.v[8 + index],
+                            SIMD128Register{
+                                (kUndisturbedResult & kFractionMaskInt8[3]) |
+                                    (expected_result[index] & mask[index] & ~kFractionMaskInt8[3]) |
+                                    (expected_inactive[index] & ~mask[index] & ~kFractionMaskInt8[3])}
+                                .Get<__uint128_t>());
+                } else if (index == 2 && vlmul == 2) {
+                  EXPECT_EQ(
+                      state_.cpu.v[8 + index],
+                      SIMD128Register{
+                          (expected_result[index] & mask[index] & kFractionMaskInt8[3]) |
+                              (expected_inactive[index] & ~mask[index] & kFractionMaskInt8[3]) |
+                              ((vta ? kAgnosticResult : kUndisturbedResult) & ~kFractionMaskInt8[3])}
+                          .Get<__uint128_t>());
+                } else if (index == 3 && vlmul == 2 && vta) {
+                  EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kAgnosticResult});
+                } else if (index == 3 && vlmul == 2) {
+                  EXPECT_EQ(state_.cpu.v[8 + index], SIMD128Register{kUndisturbedResult});
+                } else {
+                  EXPECT_EQ(state_.cpu.v[8 + index],
+                            SIMD128Register{(expected_result[index] & mask[index]) |
+                                (expected_inactive[index] & ~mask[index])}
+                                .Get<__uint128_t>());
+                }
+              }
+            } else {
+              EXPECT_EQ(
+                  state_.cpu.v[8],
+                  SIMD128Register{(expected_result[0] & mask[0] & kFractionMaskInt8[vlmul - 4]) |
+                      (expected_inactive[0] & ~mask[0] & kFractionMaskInt8[vlmul - 4]) |
+                      ((vta ? kAgnosticResult : kUndisturbedResult) &
+                          ~kFractionMaskInt8[vlmul - 4])}
+                      .Get<__uint128_t>());
+            }
+
+            if (vlmul == 2) {
+              // Every vector instruction must set vstart to 0, but shouldn't touch vl.
+              EXPECT_EQ(state_.cpu.vstart, 0);
+              EXPECT_EQ(state_.cpu.vl, (vlmax * 5) / 8);
+            }
+          }
+        }
+      }
+    };
+
+    if (factor == 2) {
+      Verify(insn_bytes, 1, 8, expected_result_int16, kMaskInt16);
+      Verify(insn_bytes | (1 << 25), 1, 8, expected_result_int16, kNoMask);
+    }
+    if (factor == 2 || factor == 4) {
+      Verify(insn_bytes, 2, 8, expected_result_int32, kMaskInt32);
+      Verify(insn_bytes | (1 << 25), 2, 8, expected_result_int32, kNoMask);
+    }
+    Verify(insn_bytes, 3, 8, expected_result_int64, kMaskInt64);
+    Verify(insn_bytes | (1 << 25), 3, 8, expected_result_int64, kNoMask);
+  }
+
   void TestVectorMaskInstruction(uint8_t max_vstart,
                                  intrinsics::InactiveProcessing vma,
                                  uint32_t insn_bytes,
@@ -3580,6 +3699,140 @@
       kVectorCalculationsSource[0]);
 }
 
+// Covers all six Vxunary0 encodings. Expected tables are listed in the order 16-bit, 32-bit,
+// 64-bit destination elements; tables for widths that the given factor does not exercise are
+// passed as {}.
+TEST_F(Riscv64InterpreterTest, TestVXext) {
+  TestExtendingVectorInstruction(
+      0x49012457,  // vzext.vf8 v8,v16,v0.t
+      {}, {},
+      {{0x0000'0000'0000'0000, 0x0000'0000'0000'0081},
+       {0x0000'0000'0000'0002, 0x0000'0000'0000'0083},
+       {0x0000'0000'0000'0004, 0x0000'0000'0000'0085},
+       {0x0000'0000'0000'0006, 0x0000'0000'0000'0087},
+       {0x0000'0000'0000'0008, 0x0000'0000'0000'0089},
+       {0x0000'0000'0000'000a, 0x0000'0000'0000'008b},
+       {0x0000'0000'0000'000c, 0x0000'0000'0000'008d},
+       {0x0000'0000'0000'000e, 0x0000'0000'0000'008f}},
+      kVectorCalculationsSource,
+      8);
+
+  TestExtendingVectorInstruction(
+      0x4901a457,  // vsext.vf8 v8,v16,v0.t
+      {}, {},
+      {{0x0000'0000'0000'0000, 0xffff'ffff'ffff'ff81},
+       {0x0000'0000'0000'0002, 0xffff'ffff'ffff'ff83},
+       {0x0000'0000'0000'0004, 0xffff'ffff'ffff'ff85},
+       {0x0000'0000'0000'0006, 0xffff'ffff'ffff'ff87},
+       {0x0000'0000'0000'0008, 0xffff'ffff'ffff'ff89},
+       {0x0000'0000'0000'000a, 0xffff'ffff'ffff'ff8b},
+       {0x0000'0000'0000'000c, 0xffff'ffff'ffff'ff8d},
+       {0x0000'0000'0000'000e, 0xffff'ffff'ffff'ff8f}},
+      kVectorCalculationsSource,
+      8);
+
+  TestExtendingVectorInstruction(
+      0x49022457,  // vzext.vf4 v8,v16,v0.t
+      {},
+      {{0x0000'0000, 0x0000'0081, 0x0000'0002, 0x0000'0083},
+       {0x0000'0004, 0x0000'0085, 0x0000'0006, 0x0000'0087},
+       {0x0000'0008, 0x0000'0089, 0x0000'000a, 0x0000'008b},
+       {0x0000'000c, 0x0000'008d, 0x0000'000e, 0x0000'008f},
+       {0x0000'0010, 0x0000'0091, 0x0000'0012, 0x0000'0093},
+       {0x0000'0014, 0x0000'0095, 0x0000'0016, 0x0000'0097},
+       {0x0000'0018, 0x0000'0099, 0x0000'001a, 0x0000'009b},
+       {0x0000'001c, 0x0000'009d, 0x0000'001e, 0x0000'009f}},
+      {{0x0000'0000'0000'8100, 0x0000'0000'0000'8302},
+       {0x0000'0000'0000'8504, 0x0000'0000'0000'8706},
+       {0x0000'0000'0000'8908, 0x0000'0000'0000'8b0a},
+       {0x0000'0000'0000'8d0c, 0x0000'0000'0000'8f0e},
+       {0x0000'0000'0000'9110, 0x0000'0000'0000'9312},
+       {0x0000'0000'0000'9514, 0x0000'0000'0000'9716},
+       {0x0000'0000'0000'9918, 0x0000'0000'0000'9b1a},
+       {0x0000'0000'0000'9d1c, 0x0000'0000'0000'9f1e}},
+      kVectorCalculationsSource,
+      4);
+
+  TestExtendingVectorInstruction(
+      0x4902a457,  // vsext.vf4 v8,v16,v0.t
+      {},
+      {{0x0000'0000, 0xffff'ff81, 0x0000'0002, 0xffff'ff83},
+       {0x0000'0004, 0xffff'ff85, 0x0000'0006, 0xffff'ff87},
+       {0x0000'0008, 0xffff'ff89, 0x0000'000a, 0xffff'ff8b},
+       {0x0000'000c, 0xffff'ff8d, 0x0000'000e, 0xffff'ff8f},
+       {0x0000'0010, 0xffff'ff91, 0x0000'0012, 0xffff'ff93},
+       {0x0000'0014, 0xffff'ff95, 0x0000'0016, 0xffff'ff97},
+       {0x0000'0018, 0xffff'ff99, 0x0000'001a, 0xffff'ff9b},
+       {0x0000'001c, 0xffff'ff9d, 0x0000'001e, 0xffff'ff9f}},
+      {{0xffff'ffff'ffff'8100, 0xffff'ffff'ffff'8302},
+       {0xffff'ffff'ffff'8504, 0xffff'ffff'ffff'8706},
+       {0xffff'ffff'ffff'8908, 0xffff'ffff'ffff'8b0a},
+       {0xffff'ffff'ffff'8d0c, 0xffff'ffff'ffff'8f0e},
+       {0xffff'ffff'ffff'9110, 0xffff'ffff'ffff'9312},
+       {0xffff'ffff'ffff'9514, 0xffff'ffff'ffff'9716},
+       {0xffff'ffff'ffff'9918, 0xffff'ffff'ffff'9b1a},
+       {0xffff'ffff'ffff'9d1c, 0xffff'ffff'ffff'9f1e}},
+      kVectorCalculationsSource,
+      4);
+
+  TestExtendingVectorInstruction(
+      0x49032457,  // vzext.vf2 v8,v16,v0.t
+      {{0x0000, 0x0081, 0x0002, 0x0083, 0x0004, 0x0085, 0x0006, 0x0087},
+       {0x0008, 0x0089, 0x000a, 0x008b, 0x000c, 0x008d, 0x000e, 0x008f},
+       {0x0010, 0x0091, 0x0012, 0x0093, 0x0014, 0x0095, 0x0016, 0x0097},
+       {0x0018, 0x0099, 0x001a, 0x009b, 0x001c, 0x009d, 0x001e, 0x009f},
+       {0x0020, 0x00a1, 0x0022, 0x00a3, 0x0024, 0x00a5, 0x0026, 0x00a7},
+       {0x0028, 0x00a9, 0x002a, 0x00ab, 0x002c, 0x00ad, 0x002e, 0x00af},
+       {0x0030, 0x00b1, 0x0032, 0x00b3, 0x0034, 0x00b5, 0x0036, 0x00b7},
+       {0x0038, 0x00b9, 0x003a, 0x00bb, 0x003c, 0x00bd, 0x003e, 0x00bf}},
+      {{0x0000'8100, 0x0000'8302, 0x0000'8504, 0x0000'8706},
+       {0x0000'8908, 0x0000'8b0a, 0x0000'8d0c, 0x0000'8f0e},
+       {0x0000'9110, 0x0000'9312, 0x0000'9514, 0x0000'9716},
+       {0x0000'9918, 0x0000'9b1a, 0x0000'9d1c, 0x0000'9f1e},
+       {0x0000'a120, 0x0000'a322, 0x0000'a524, 0x0000'a726},
+       {0x0000'a928, 0x0000'ab2a, 0x0000'ad2c, 0x0000'af2e},
+       {0x0000'b130, 0x0000'b332, 0x0000'b534, 0x0000'b736},
+       {0x0000'b938, 0x0000'bb3a, 0x0000'bd3c, 0x0000'bf3e}},
+      {{0x0000'0000'8302'8100, 0x0000'0000'8706'8504},
+       {0x0000'0000'8b0a'8908, 0x0000'0000'8f0e'8d0c},
+       {0x0000'0000'9312'9110, 0x0000'0000'9716'9514},
+       {0x0000'0000'9b1a'9918, 0x0000'0000'9f1e'9d1c},
+       {0x0000'0000'a322'a120, 0x0000'0000'a726'a524},
+       {0x0000'0000'ab2a'a928, 0x0000'0000'af2e'ad2c},
+       {0x0000'0000'b332'b130, 0x0000'0000'b736'b534},
+       {0x0000'0000'bb3a'b938, 0x0000'0000'bf3e'bd3c}},
+      kVectorCalculationsSource,
+      2);
+
+  TestExtendingVectorInstruction(
+      0x4903a457,  // vsext.vf2 v8,v16,v0.t
+      {{0x0000, 0xff81, 0x0002, 0xff83, 0x0004, 0xff85, 0x0006, 0xff87},
+       {0x0008, 0xff89, 0x000a, 0xff8b, 0x000c, 0xff8d, 0x000e, 0xff8f},
+       {0x0010, 0xff91, 0x0012, 0xff93, 0x0014, 0xff95, 0x0016, 0xff97},
+       {0x0018, 0xff99, 0x001a, 0xff9b, 0x001c, 0xff9d, 0x001e, 0xff9f},
+       {0x0020, 0xffa1, 0x0022, 0xffa3, 0x0024, 0xffa5, 0x0026, 0xffa7},
+       {0x0028, 0xffa9, 0x002a, 0xffab, 0x002c, 0xffad, 0x002e, 0xffaf},
+       {0x0030, 0xffb1, 0x0032, 0xffb3, 0x0034, 0xffb5, 0x0036, 0xffb7},
+       {0x0038, 0xffb9, 0x003a, 0xffbb, 0x003c, 0xffbd, 0x003e, 0xffbf}},
+      {{0xffff'8100, 0xffff'8302, 0xffff'8504, 0xffff'8706},
+       {0xffff'8908, 0xffff'8b0a, 0xffff'8d0c, 0xffff'8f0e},
+       {0xffff'9110, 0xffff'9312, 0xffff'9514, 0xffff'9716},
+       {0xffff'9918, 0xffff'9b1a, 0xffff'9d1c, 0xffff'9f1e},
+       {0xffff'a120, 0xffff'a322, 0xffff'a524, 0xffff'a726},
+       {0xffff'a928, 0xffff'ab2a, 0xffff'ad2c, 0xffff'af2e},
+       {0xffff'b130, 0xffff'b332, 0xffff'b534, 0xffff'b736},
+       {0xffff'b938, 0xffff'bb3a, 0xffff'bd3c, 0xffff'bf3e}},
+      {{0xffff'ffff'8302'8100, 0xffff'ffff'8706'8504},
+       {0xffff'ffff'8b0a'8908, 0xffff'ffff'8f0e'8d0c},
+       {0xffff'ffff'9312'9110, 0xffff'ffff'9716'9514},
+       {0xffff'ffff'9b1a'9918, 0xffff'ffff'9f1e'9d1c},
+       {0xffff'ffff'a322'a120, 0xffff'ffff'a726'a524},
+       {0xffff'ffff'ab2a'a928, 0xffff'ffff'af2e'ad2c},
+       {0xffff'ffff'b332'b130, 0xffff'ffff'b736'b534},
+       {0xffff'ffff'bb3a'b938, 0xffff'ffff'bf3e'bd3c}},
+      kVectorCalculationsSource,
+      2);
+}
 }  // namespace
 
 }  // namespace berberis
diff --git a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
index 7a2a334..d243699 100644
--- a/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
+++ b/intrinsics/riscv64/include/berberis/intrinsics/riscv64/vector_intrinsics.h
@@ -303,6 +303,50 @@
   return result;
 }
 
+// Widens the first 8 / sizeof(ElementType) elements of src (its low 64 bits) with Widen() and
+// stores them at the same indexes of the result. NOTE(review): this relies on Widen() doubling
+// the element size, so that the stores cover the whole result register — confirm.
+template <typename ElementType>
+SIMD128Register VectorExtend(SIMD128Register src) {
+  SIMD128Register result;
+  constexpr int kElementsCount = static_cast<int>(8 / sizeof(ElementType));
+  for (int index = 0; index < kElementsCount; ++index) {
+    result.Set(Widen(VectorElement<ElementType>(src, index)), index);
+  }
+  return result;
+}
+
+// Extension by a factor of 2 for one register. ElementType is the destination element type; the
+// source element type is derived from it with Narrow(). Callers pass an unsigned ElementType for
+// vzext and a signed one for vsext. The unnamed PreferredIntrinsicsImplementation parameter is
+// not used in the body.
+template <typename ElementType,
+          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
+inline std::tuple<SIMD128Register> Vextf2(SIMD128Register src) {
+  using SourceElementType = decltype(Narrow(ElementType{0}));
+  return {VectorExtend<SourceElementType>(src)};
+}
+
+// Extension by a factor of 4, done as two VectorExtend steps: quarter-width -> half-width ->
+// ElementType.
+template <typename ElementType,
+          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
+inline std::tuple<SIMD128Register> Vextf4(SIMD128Register src) {
+  using WideSourceElementType = decltype(Narrow(ElementType{0}));
+  using SourceElementType = decltype(Narrow(WideSourceElementType{0}));
+  return {VectorExtend<WideSourceElementType>(VectorExtend<SourceElementType>(src))};
+}
+
+// Extension by a factor of 8: Vextf4 up to the half-width type, then one more VectorExtend step
+// to ElementType.
+template <typename ElementType,
+          enum PreferredIntrinsicsImplementation = kUseAssemblerImplementationIfPossible>
+inline std::tuple<SIMD128Register> Vextf8(SIMD128Register src) {
+  using WideWideSourceElementType = decltype(Narrow(ElementType{0}));
+  return {
+      VectorExtend<WideWideSourceElementType>(std::get<0>(Vextf4<WideWideSourceElementType>(src)))};
+}
+
 // SEW = 2*SEW op SEW
 // TODO(b/260725458): Pass lambda as template argument after C++20 would become available.
 template <typename ElementType, typename Lambda, typename... ParameterType>
@@ -376,9 +409,8 @@
         DEFINE_ARITHMETIC_PARAMETERS_OR_ARGUMENTS arguments);                                     \
   }
 
-#define DEFINE_1OP_ARITHMETIC_INTRINSIC_M(name, ...)                 \
-  DEFINE_ARITHMETIC_INTRINSIC(V##name##m, return ({ __VA_ARGS__; }); \
-                              , (Int128 src), (src))
+#define DEFINE_1OP_ARITHMETIC_INTRINSIC_M(name, ...) \
+  DEFINE_ARITHMETIC_INTRINSIC(V##name##m, return ({ __VA_ARGS__; });, (Int128 src), (src))
 #define DEFINE_2OP_ARITHMETIC_INTRINSIC_VS(name, ...)                 \
   DEFINE_ARITHMETIC_INTRINSIC(V##name##vs, return ({ __VA_ARGS__; }); \
                               , (ElementType src1, ElementType src2), (src1, src2))