blob: 20816a3d9c1be5151d9d17ee95ad5be8657dd664 [file] [log] [blame]
#include "xnnpack/aarch32-assembler.h"
#include <cmath>
namespace xnnpack {
namespace aarch32 {
static const int DEFAULT_BUFFER_SIZE = 4096;
// PC register contains current address of instruction + 8 (2 instructions).
constexpr ptrdiff_t kPCDelta = 2;
// Constants used for checking branch offsets bounds.
constexpr ptrdiff_t kInt24Max = 8388607;
constexpr ptrdiff_t kInt24Min = -8388608;
// Check if a branch offset is valid, it must fit in 24 bits.
bool branch_offset_valid(ptrdiff_t offset) {
return offset < kInt24Max && offset > kInt24Min;
}
uint32_t encode(SRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
return r.d() << single_bit_pos | r.vd() << four_bits_pos;
}
uint32_t encode(DRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
return r.d() << single_bit_pos | r.vd() << four_bits_pos;
}
uint32_t encode(DRegisterLane r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
return r.d() << single_bit_pos | r.vd() << four_bits_pos;
}
uint32_t encode(QRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
return r.d() << single_bit_pos | r.vd() << four_bits_pos;
}
uint32_t encode(SRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
const SRegister r = regs.start;
return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length;
}
uint32_t encode(DRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
const DRegister r = regs.start;
return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length * 2;
}
// Return value of 0 is invalid, indicates error.
uint32_t encode_regs_length_to_type(DRegisterList regs) {
switch (regs.length) {
case 1:
return 0x7;
case 2:
return 0xA;
case 3:
return 0x6;
case 4:
return 0x2;
}
return 0;
}
Assembler::Assembler(xnn_code_buffer* buf) {
buffer_ = reinterpret_cast<uint32_t*>(buf->code);
cursor_ = buffer_;
top_ = buffer_ + (buf->capacity / 4);
xnn_buffer = buf;
}
Assembler& Assembler::emit32(uint32_t value) {
if (error_ != Error::kNoError) {
return *this;
}
if (cursor_ == top_) {
error_ = Error::kOutOfMemory;
return *this;
}
*cursor_++ = value;
return *this;
}
Assembler& Assembler::add(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
return emit32(kAL | 0x8 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
}
Assembler& Assembler::add(CoreRegister rd, CoreRegister rn, uint8_t imm) {
// Rotation = 0, since imm is limited to 8 bits and fits in encoding.
return emit32(kAL | 0x28 << 20 | rn.code << 16 | rd.code << 12 | imm);
}
Assembler& Assembler::b(Condition c, Label& l) {
if (l.bound) {
// Offset is relative to after this b instruction + kPCDelta.
const ptrdiff_t offset = l.offset - cursor_ - kPCDelta;
if (!branch_offset_valid(offset)) {
error_ = Error::kLabelOffsetOutOfBounds;
return *this;
}
// No need to shift by 2 since our offset is already in terms of uint32_t.
return emit32(c | 0xA << 24 | (offset & 0x00FFFFFF));
} else {
if (!l.add_use(cursor_)) {
error_ = Error::kLabelHasTooManyUsers;
return *this;
}
// Emit 0 offset first, will patch it up when label is bound later.
return emit32(c | 0xA << 24);
}
}
Assembler& Assembler::bind(Label& l) {
if (l.bound) {
error_ = Error::kLabelAlreadyBound;
return *this;
}
l.bound = true;
l.offset = cursor_;
// Patch all users.
for (size_t i = 0; i < l.num_users; i++) {
uint32_t* user = l.users[i];
const ptrdiff_t offset = l.offset - user - kPCDelta;
if (!branch_offset_valid(offset)) {
error_ = Error::kLabelOffsetOutOfBounds;
return *this;
}
*user = (*user | (offset & 0x00FFFFFF));
}
return *this;
}
Assembler& Assembler::bx(CoreRegister rm) {
return emit32(kAL | 0x12fff10 | rm.code);
}
Assembler& Assembler::cmp(CoreRegister rn, uint8_t imm) {
return emit32(kAL | 0x35 << 20 | rn.code << 16 | imm);
}
Assembler& Assembler::ldr(CoreRegister rt, MemOperand op, int32_t offset) {
return ldr(rt, MemOperand(op.base(), offset, AddressingMode::kPostIndexed));
}
Assembler& Assembler::ldr(CoreRegister rt, MemOperand op) {
const int32_t offset = op.offset();
constexpr int32_t max_imm12 = 4095;
if (std::abs(offset) > max_imm12) {
error_ = Error::kInvalidOperand;
return *this;
}
return emit32(kAL | 0x41 << 20 | op.p() << 24 | op.u() << 23 | op.w() << 21 | op.base().code << 16 | rt.code << 12 |
offset);
}
Assembler& Assembler::mov(CoreRegister rd, CoreRegister rm) {
return mov(kAL, rd, rm);
}
Assembler& Assembler::movlo(CoreRegister rd, CoreRegister rm) {
return mov(kLO, rd, rm);
}
Assembler& Assembler::movls(CoreRegister rd, CoreRegister rm) {
return mov(kLS, rd, rm);
}
Assembler& Assembler::mov(Condition c, CoreRegister Rd, CoreRegister Rm) {
return emit32(c | 0x1A << 20 | Rd.code << 12 | Rm.code);
}
Assembler& Assembler::pld(MemOperand op) {
return emit32(0xF550F000 | op.u() << 23 | op.base().code << 16 | op.offset());
}
Assembler& Assembler::pop(CoreRegisterList regs) {
if (!regs.has_more_than_one_register()) {
// TODO(zhin): there is a different valid encoding for single register.
error_ = Error::kInvalidOperand;
return *this;
}
return emit32(kAL | 0x8BD << 16 | regs.list);
}
Assembler& Assembler::push(CoreRegisterList regs) {
if (!regs.has_more_than_one_register()) {
// TODO(zhin): there is a different valid encoding for single register.
error_ = Error::kInvalidOperand;
return *this;
}
return emit32(kAL | 0x92D << 16 | regs.list);
}
Assembler& Assembler::sub(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
return emit32(kAL | 0x4 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
}
Assembler& Assembler::subs(CoreRegister rd, CoreRegister rn, uint8_t imm) {
// Rotation = 0, since imm is limited to 8 bits and fits in encoding.
return emit32(kAL | 0x25 << 20 | rn.code << 16 | rd.code << 12 | imm);
}
Assembler& Assembler::tst(CoreRegister rn, uint8_t imm) {
// Rotation = 0, since imm is limited to 8 bits and fits in encoding.
return emit32(kAL | 0x31 << 20 | rn.code << 16 | imm);
}
Assembler& Assembler::vld1_8(DRegisterList regs, MemOperand op) {
if (regs.length != 1) {
// Unimplemented since only length 1 is used in microkernels.
error_ = Error::kInvalidOperand;
return *this;
}
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
return emit32(0xF4200700 | encode(regs.start, 22, 12) | op.base().code << 16 | rm);
}
Assembler& Assembler::vld1_32(DRegisterList regs, MemOperand op) {
if (regs.length != 1) {
// Unimplemented since only length 1 is used in microkernels.
error_ = Error::kInvalidOperand;
return *this;
}
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
return emit32(0xF4200780 | encode(regs.start, 22, 12) | op.base().code << 16 | rm);
}
Assembler& Assembler::vld1r_32(DRegisterList regs, MemOperand op) {
if (regs.length != 2) {
// Unimplemented since only length 2 used in microkernels.
error_ = Error::kInvalidOperand;
return *this;
}
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
return emit32(0xF4A00CA0 | encode(regs.start, 22, 12) | op.base().code << 16 | rm);
}
Assembler& Assembler::vldm(CoreRegister rn, SRegisterList regs, bool wb) {
return emit32(kAL | 0x0C900A00 | wb << 21 | rn.code << 16 | encode(regs, 22, 12));
}
Assembler& Assembler::vldm(CoreRegister rn, DRegisterList regs, bool wb) {
return emit32(kAL | 0x0C900B00 | wb << 21 | rn.code << 16 | encode(regs, 22, 12));
}
Assembler& Assembler::vldr(DRegister dd, MemOperand op) {
// TOOD(zhin): post-increment not used in any microkernels, so not implemented yet.
if (op.mode() != AddressingMode::kOffset || std::abs(op.offset()) > UINT8_MAX) {
error_ = Error::kInvalidOperand;
return *this;
}
return emit32(kAL | 0x0D100B00 | op.u() << 23 | encode(dd, 22, 12) | op.base().code << 16 | op.offset() >> 2);
}
Assembler& Assembler::vmax_f32(QRegister qd, QRegister qn, QRegister qm) {
return emit32(kAL | 0xF2000F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
}
Assembler& Assembler::vmin_f32(QRegister qd, QRegister qn, QRegister qm) {
return emit32(kAL | 0xF2200F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
}
Assembler& Assembler::vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm) {
if (dm.lane > 1) {
error_ = Error::kInvalidLaneIndex;
return *this;
}
return emit32(0xF3A00140 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
}
Assembler& Assembler::vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm) {
if (dm.lane > 3) {
error_ = Error::kInvalidLaneIndex;
return *this;
}
if (dm.code > 7) {
error_ = Error::kInvalidOperand;
return *this;
}
uint8_t lane_top = dm.lane >> 1;
uint8_t lane_bot = dm.lane & 1;
return emit32(0xF2900240 | encode(qd, 22, 12) | encode(dn, 7, 16) | lane_top << 5 | lane_bot << 3 | dm.code);
}
Assembler& Assembler::vmov(SRegister sd, SRegister sm) {
return emit32(kAL | 0x0EB00A40 | encode(sd, 22, 12) | encode(sm, 5, 0));
}
Assembler& Assembler::vmov(DRegister dm, CoreRegister rt, CoreRegister rt2) {
return emit32(kAL | 0x0C400B10 | rt2.code << 16 | rt.code << 12 | encode(dm, 5, 0));
}
Assembler& Assembler::vmov(DRegister dd, DRegister dm) {
return emit32(0xF2600110 | encode(dd, 22, 12) | encode(dm, 7, 16) | encode(dm, 5, 0));
}
Assembler& Assembler::vmov(QRegister qd, QRegister qm) {
return emit32(0xF2200150 | encode(qd, 22, 12) | encode(qm, 7, 16) | encode(qm, 5, 0));
}
Assembler& Assembler::vmovl_s8(QRegister qd, DRegister dm) {
return emit32(0xF2880A10 | encode(qd, 22, 12) | encode(dm, 5, 0));
}
Assembler& Assembler::vpop(DRegisterList regs) {
return emit32(kAL | encode(regs, 22, 12) | 0xCBD << 16 | 0xB << 8);
}
Assembler& Assembler::vpush(SRegisterList regs) {
return emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xA << 8);
}
Assembler& Assembler::vpush(DRegisterList regs) {
return emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xB << 8);
}
Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op) {
const uint8_t type = encode_regs_length_to_type(regs);
if (!type) {
error_ = Error::kInvalidRegisterListLength;
return *this;
}
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm);
}
Assembler& Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
if (rm.code == 0b1101 || rm.code == 0b1111) {
error_ = Error::kInvalidOperand;
return *this;
}
const uint8_t type = encode_regs_length_to_type(regs);
if (!type) {
error_ = Error::kInvalidRegisterListLength;
return *this;
}
return emit32(0xF400'0000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
}
Assembler& Assembler::vst1(DataSize size, DRegisterLane dd, MemOperand op) {
if ((size == k8 && dd.lane > 7) || (size == k32 && dd.lane > 1)) {
error_ = Error::kInvalidLaneIndex;
return *this;
}
const uint8_t shift = size == k8 ? 5 : 7;
const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
return emit32(0xF480'0000 | encode(dd, 22, 12) | op.base().code << 16 | size << 10 | dd.lane << shift | rm);
}
void* Assembler::finalize() {
xnn_buffer->size = code_size_in_bytes();
xnn_finalize_code_memory(xnn_buffer);
return reinterpret_cast<void*>(buffer_);
}
void Assembler::reset() {
cursor_ = buffer_;
error_ = Error::kNoError;
}
} // namespace aarch32
} // namespace xnnpack