Implement vst1_8 and fix vst1_32 encoding

PiperOrigin-RevId: 416404695
diff --git a/src/jit/aarch32-assembler.cc b/src/jit/aarch32-assembler.cc
index d907169..20816a3 100644
--- a/src/jit/aarch32-assembler.cc
+++ b/src/jit/aarch32-assembler.cc
@@ -325,40 +325,41 @@
   return emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xB << 8);
 }
 
-Assembler& Assembler::vst1_32(DRegisterList regs, MemOperand op) {
-  uint8_t type = encode_regs_length_to_type(regs);
+Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op) {
+  const uint8_t type = encode_regs_length_to_type(regs);
   if (!type) {
     error_ = Error::kInvalidRegisterListLength;
     return *this;
   }
 
   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
-  return emit32(0xF400'0080 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | rm);
+  return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm);
 }
 
-Assembler& Assembler::vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) {
+Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
   if (rm.code == 0b1101 || rm.code == 0b1111) {
     error_ = Error::kInvalidOperand;
     return *this;
   }
 
-  uint8_t type = encode_regs_length_to_type(regs);
+  const uint8_t type = encode_regs_length_to_type(regs);
   if (!type) {
     error_ = Error::kInvalidRegisterListLength;
     return *this;
   }
 
-  return emit32(0xF400'0080 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | rm.code);
+  return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
 }
 
-Assembler& Assembler::vst1_32(DRegisterLane dd, MemOperand op) {
-  if (dd.lane > 1) {
+Assembler& Assembler::vst1(DataSize size, DRegisterLane dd, MemOperand op) {
+  if ((size == k8 && dd.lane > 7) || (size == k32 && dd.lane > 1)) {
     error_ = Error::kInvalidLaneIndex;
     return *this;
   }
 
+  const uint8_t shift = size == k8 ? 5 : 7;
   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
-  return emit32(0xF480'0800 | encode(dd, 22, 12) | op.base().code << 16 | dd.lane << 5 | rm);
+  return emit32(0xF480'0000 | encode(dd, 22, 12) | op.base().code << 16 | size << 10 | dd.lane << shift | rm);
 }
 
 void* Assembler::finalize() {
diff --git a/src/xnnpack/aarch32-assembler.h b/src/xnnpack/aarch32-assembler.h
index bb66850..cf1847a 100644
--- a/src/xnnpack/aarch32-assembler.h
+++ b/src/xnnpack/aarch32-assembler.h
@@ -326,6 +326,11 @@
   kInvalidRegisterListLength,
 };
 
+enum DataSize {
+  k8 = 0,
+  k32 = 2,
+};
+
 // A simple AAarch32 assembler.
 class Assembler {
  public:
@@ -391,12 +396,18 @@
   Assembler& vpop(DRegisterList regs);
   Assembler& vpush(SRegisterList regs);
   Assembler& vpush(DRegisterList regs);
+  // VST1.8 <list>, [<Rn>]{!} (multiple single elements).
+  Assembler& vst1_8(DRegisterList regs, MemOperand op) { return vst1(k8, regs, op); }
+  // VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
+  Assembler& vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k8, regs, op, rm); }
+  // VST1.8 <list>, [<Rn>]{!} (single element from one lane).
+  Assembler& vst1_8(DRegisterLane dd, MemOperand op) { return vst1(k8, dd, op); }
   // VST1.32 <list>, [<Rn>]{!} (multiple single elements).
-  Assembler& vst1_32(DRegisterList regs, MemOperand op);
+  Assembler& vst1_32(DRegisterList regs, MemOperand op) { return vst1(k32, regs, op); }
   // VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
-  Assembler& vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm);
+  Assembler& vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k32, regs, op, rm); }
   // VST1.32 <list>, [<Rn>]{!} (single element form one lane).
-  Assembler& vst1_32(DRegisterLane dd, MemOperand op);
+  Assembler& vst1_32(DRegisterLane dd, MemOperand op) { return vst1(k32, dd, op); }
 
   // Binds Label l to the current location in the code buffer.
   Assembler& bind(Label& l);
@@ -419,6 +430,9 @@
   Assembler& emit32(uint32_t value);
   Assembler& mov(Condition c, CoreRegister rd, CoreRegister rm);
   Assembler& b(Condition c, Label& l);
+  Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op);
+  Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm);
+  Assembler& vst1(DataSize size, DRegisterLane dd, MemOperand op);
 
   // Pointer to start of code buffer.
   uint32_t* buffer_;
diff --git a/test/aarch32-assembler.cc b/test/aarch32-assembler.cc
index 18b73a8..d4fe268 100644
--- a/test/aarch32-assembler.cc
+++ b/test/aarch32-assembler.cc
@@ -112,6 +112,11 @@
   CHECK_ENCODING(0xED2D8B10, a.vpush({d8, d15}));
   CHECK_ENCODING(0xED6D4B08, a.vpush({d20, d23}));
 
+  CHECK_ENCODING(0xF40B0707, a.vst1_8({d0}, mem[r11], r7));
+  CHECK_ENCODING(0xF48B000F, a.vst1_8({d0[0]}, mem[r11]));
+  CHECK_ENCODING(0xF48B00EF, a.vst1_8({d0[7]}, mem[r11]));
+  EXPECT_ERROR(Error::kInvalidLaneIndex, a.vst1_8(d0[8], mem[r11]));
+
   CHECK_ENCODING(0xF44B0280, a.vst1_32({d16, d19}, mem[r11], r0));
   EXPECT_ERROR(Error::kInvalidRegisterListLength, a.vst1_32({d0, d4}, mem[r11], r0));
   EXPECT_ERROR(Error::kInvalidOperand, a.vst1_32({d16, d19}, mem[r11], sp));
@@ -121,6 +126,7 @@
   CHECK_ENCODING(0xF4CB080F, a.vst1_32({d16[0]}, mem[r11]));
   // The surrounding braces are optional, but makes it look closer to native assembly.
   CHECK_ENCODING(0xF4CB080F, a.vst1_32(d16[0], mem[r11]));
+  CHECK_ENCODING(0xF4CB088F, a.vst1_32(d16[1], mem[r11]));
   EXPECT_ERROR(Error::kInvalidLaneIndex, a.vst1_32(d16[2], mem[r11]));
   CHECK_ENCODING(0xF4C6C80D, a.vst1_32({d28[0]}, mem[r6]++));