Implement vst1_8 and fix vst1_32 encoding
PiperOrigin-RevId: 416404695
diff --git a/src/jit/aarch32-assembler.cc b/src/jit/aarch32-assembler.cc
index d907169..20816a3 100644
--- a/src/jit/aarch32-assembler.cc
+++ b/src/jit/aarch32-assembler.cc
@@ -325,40 +325,41 @@
return emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xB << 8);
}
-Assembler& Assembler::vst1_32(DRegisterList regs, MemOperand op) {
- uint8_t type = encode_regs_length_to_type(regs);
+Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op) {
+ const uint8_t type = encode_regs_length_to_type(regs);
if (!type) {
error_ = Error::kInvalidRegisterListLength;
return *this;
}
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
- return emit32(0xF400'0080 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | rm);
+ return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm);
}
-Assembler& Assembler::vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) {
+Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
if (rm.code == 0b1101 || rm.code == 0b1111) {
error_ = Error::kInvalidOperand;
return *this;
}
- uint8_t type = encode_regs_length_to_type(regs);
+ const uint8_t type = encode_regs_length_to_type(regs);
if (!type) {
error_ = Error::kInvalidRegisterListLength;
return *this;
}
- return emit32(0xF400'0080 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | rm.code);
+ return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
}
-Assembler& Assembler::vst1_32(DRegisterLane dd, MemOperand op) {
- if (dd.lane > 1) {
+Assembler& Assembler::vst1(DataSize size, DRegisterLane dd, MemOperand op) {
+ if ((size == k8 && dd.lane > 7) || (size == k32 && dd.lane > 1)) {
error_ = Error::kInvalidLaneIndex;
return *this;
}
+ const uint8_t shift = size == k8 ? 5 : 7;
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
- return emit32(0xF480'0800 | encode(dd, 22, 12) | op.base().code << 16 | dd.lane << 5 | rm);
+ return emit32(0xF480'0000 | encode(dd, 22, 12) | op.base().code << 16 | size << 10 | dd.lane << shift | rm);
}
void* Assembler::finalize() {
diff --git a/src/xnnpack/aarch32-assembler.h b/src/xnnpack/aarch32-assembler.h
index bb66850..cf1847a 100644
--- a/src/xnnpack/aarch32-assembler.h
+++ b/src/xnnpack/aarch32-assembler.h
@@ -326,6 +326,11 @@
kInvalidRegisterListLength,
};
+enum DataSize {
+ k8 = 0,
+ k32 = 2,
+};
+
// A simple AAarch32 assembler.
class Assembler {
public:
@@ -391,12 +396,18 @@
Assembler& vpop(DRegisterList regs);
Assembler& vpush(SRegisterList regs);
Assembler& vpush(DRegisterList regs);
+ // VST1.8 <list>, [<Rn>]{!} (multiple single elements).
+ Assembler& vst1_8(DRegisterList regs, MemOperand op) { return vst1(k8, regs, op); }
+ // VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
+ Assembler& vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k8, regs, op, rm); }
+ // VST1.8 <list>, [<Rn>]{!} (single element from one lane).
+ Assembler& vst1_8(DRegisterLane dd, MemOperand op) { return vst1(k8, dd, op); }
// VST1.32 <list>, [<Rn>]{!} (multiple single elements).
- Assembler& vst1_32(DRegisterList regs, MemOperand op);
+ Assembler& vst1_32(DRegisterList regs, MemOperand op) { return vst1(k32, regs, op); }
// VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
- Assembler& vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm);
+ Assembler& vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { return vst1(k32, regs, op, rm); }
// VST1.32 <list>, [<Rn>]{!} (single element form one lane).
- Assembler& vst1_32(DRegisterLane dd, MemOperand op);
+ Assembler& vst1_32(DRegisterLane dd, MemOperand op) { return vst1(k32, dd, op); }
// Binds Label l to the current location in the code buffer.
Assembler& bind(Label& l);
@@ -419,6 +430,9 @@
Assembler& emit32(uint32_t value);
Assembler& mov(Condition c, CoreRegister rd, CoreRegister rm);
Assembler& b(Condition c, Label& l);
+ Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op);
+ Assembler& vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm);
+ Assembler& vst1(DataSize size, DRegisterLane dd, MemOperand op);
// Pointer to start of code buffer.
uint32_t* buffer_;
diff --git a/test/aarch32-assembler.cc b/test/aarch32-assembler.cc
index 18b73a8..d4fe268 100644
--- a/test/aarch32-assembler.cc
+++ b/test/aarch32-assembler.cc
@@ -112,6 +112,11 @@
CHECK_ENCODING(0xED2D8B10, a.vpush({d8, d15}));
CHECK_ENCODING(0xED6D4B08, a.vpush({d20, d23}));
+ CHECK_ENCODING(0xF40B0707, a.vst1_8({d0}, mem[r11], r7));
+ CHECK_ENCODING(0xF48B000F, a.vst1_8({d0[0]}, mem[r11]));
+ CHECK_ENCODING(0xF48B00EF, a.vst1_8({d0[7]}, mem[r11]));
+ EXPECT_ERROR(Error::kInvalidLaneIndex, a.vst1_8(d0[8], mem[r11]));
+
CHECK_ENCODING(0xF44B0280, a.vst1_32({d16, d19}, mem[r11], r0));
EXPECT_ERROR(Error::kInvalidRegisterListLength, a.vst1_32({d0, d4}, mem[r11], r0));
EXPECT_ERROR(Error::kInvalidOperand, a.vst1_32({d16, d19}, mem[r11], sp));
@@ -121,6 +126,7 @@
CHECK_ENCODING(0xF4CB080F, a.vst1_32({d16[0]}, mem[r11]));
// The surrounding braces are optional, but makes it look closer to native assembly.
CHECK_ENCODING(0xF4CB080F, a.vst1_32(d16[0], mem[r11]));
+ CHECK_ENCODING(0xF4CB088F, a.vst1_32(d16[1], mem[r11]));
EXPECT_ERROR(Error::kInvalidLaneIndex, a.vst1_32(d16[2], mem[r11]));
CHECK_ENCODING(0xF4C6C80D, a.vst1_32({d28[0]}, mem[r6]++));