/* -*- mesa-c++ -*-
* Copyright 2022 Collabora LTD
* Author: Gert Wollny <gert.wollny@collabora.com>
* SPDX-License-Identifier: MIT
*/
#include "sfn_assembler.h"
#include "../eg_sq.h"
#include "../r600_asm.h"
#include "sfn_callstack.h"
#include "sfn_conditionaljumptracker.h"
#include "sfn_debug.h"
#include "sfn_instr_alugroup.h"
#include "sfn_instr_controlflow.h"
#include "sfn_instr_export.h"
#include "sfn_instr_fetch.h"
#include "sfn_instr_mem.h"
#include "sfn_instr_tex.h"
namespace r600 {
Assembler::Assembler(r600_shader *sh, const r600_shader_key& key):
m_sh(sh),
m_key(key)
{
}
extern const std::map<ESDOp, int> ds_opcode_map;
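/* Visitor that translates the IR instructions into r600 bytecode.
 * It keeps track of assembler state that spans instructions: pending
 * vertex and texture fetch results, the currently loaded address
 * register, the open jump targets, and the call stack. Any failure is
 * recorded in m_result. */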
class AssamblerVisitor : public ConstInstrVisitor {
public:
AssamblerVisitor(r600_shader *sh, const r600_shader_key& key, bool legacy_math_rules);
void visit(const AluInstr& instr) override;
void visit(const AluGroup& instr) override;
void visit(const TexInstr& instr) override;
void visit(const ExportInstr& instr) override;
void visit(const FetchInstr& instr) override;
void visit(const Block& instr) override;
void visit(const IfInstr& instr) override;
void visit(const ControlFlowInstr& instr) override;
void visit(const ScratchIOInstr& instr) override;
void visit(const StreamOutInstr& instr) override;
void visit(const MemRingOutInstr& instr) override;
void visit(const EmitVertexInstr& instr) override;
void visit(const GDSInstr& instr) override;
void visit(const WriteTFInstr& instr) override;
void visit(const LDSAtomicInstr& instr) override;
void visit(const LDSReadInstr& instr) override;
void visit(const RatInstr& instr) override;
void finalize();
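/* State flags used with clear_states() */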
const uint32_t sf_vtx = 1;
const uint32_t sf_tex = 2;
const uint32_t sf_alu = 4;
const uint32_t sf_addr_register = 8;
const uint32_t sf_all = 0xf;
void clear_states(const uint32_t& states);
bool copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write);
PVirtualValue copy_src(r600_bytecode_alu_src& src, const VirtualValue& s);
EBufferIndexMode emit_index_reg(const VirtualValue& addr, unsigned idx);
void emit_endif();
void emit_else();
void emit_loop_begin(bool vpm);
void emit_loop_end();
void emit_loop_break();
void emit_loop_cont();
void emit_alu_op(const AluInstr& ai);
void emit_lds_op(const AluInstr& lds);
auto translate_for_mathrules(EAluOp op) -> EAluOp;
void emit_wait_ack();
/* Start initialized in constructor */
const r600_shader_key& m_key;
r600_shader *m_shader;
r600_bytecode *m_bc;
ConditionalJumpTracker m_jump_tracker;
CallStack m_callstack;
bool ps_alpha_to_one;
/* End initialized in constructor */
std::set<uint32_t> m_nliterals_in_group;
std::set<int> vtx_fetch_results;
std::set<int> tex_fetch_results;
const VirtualValue *m_last_addr{nullptr};
unsigned m_max_color_exports{0};
int m_loop_nesting{0};
bool m_ack_suggested{false};
bool m_has_param_output{false};
bool m_has_pos_output{false};
bool m_last_op_was_barrier{false};
bool m_result{true};
bool m_legacy_math_rules{false};
};
bool
Assembler::lower(Shader *shader)
{
AssamblerVisitor ass(m_sh, m_key, shader->has_flag(Shader::sh_legacy_math_rules));
auto& blocks = shader->func();
for (auto b : blocks) {
b->accept(ass);
if (!ass.m_result)
return false;
}
ass.finalize();
return ass.m_result;
}
AssamblerVisitor::AssamblerVisitor(r600_shader *sh, const r600_shader_key& key,
bool legacy_math_rules):
m_key(key),
m_shader(sh),
m_bc(&sh->bc),
m_callstack(sh->bc),
ps_alpha_to_one(key.ps.alpha_to_one),
m_legacy_math_rules(legacy_math_rules)
{
if (m_shader->processor_type == PIPE_SHADER_FRAGMENT)
m_max_color_exports = MAX2(m_key.ps.nr_cbufs, 1);
if (m_shader->processor_type == PIPE_SHADER_VERTEX && m_shader->ninput > 0)
r600_bytecode_add_cfinst(m_bc, CF_OP_CALL_FS);
}
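/* Terminate the program: if the last CF instruction can't carry the
 * end-of-program bit, append a NOP that can; on Cayman an explicit
 * CF_END instruction is emitted instead. */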
void
AssamblerVisitor::finalize()
{
const struct cf_op_info *last = nullptr;
if (m_bc->cf_last)
last = r600_isa_cf(m_bc->cf_last->op);
/* alu clause instructions don't have EOP bit, so add NOP */
if (m_shader->bc.gfx_level < CAYMAN &&
(!last || last->flags & CF_ALU || m_bc->cf_last->op == CF_OP_LOOP_END ||
m_bc->cf_last->op == CF_OP_POP))
r600_bytecode_add_cfinst(m_bc, CF_OP_NOP);
/* A lone fetch shader call must not carry the EOP bit (this results in a
* hang), but we can replace it with a NOP */
else if (last && m_bc->cf_last->op == CF_OP_CALL_FS)
m_bc->cf_last->op = CF_OP_NOP;
if (m_shader->bc.gfx_level != CAYMAN)
m_bc->cf_last->end_of_program = 1;
else
cm_bytecode_add_cf_end(m_bc);
}
extern const std::map<EAluOp, int> opcode_map;
void
AssamblerVisitor::visit(const AluInstr& ai)
{
assert(vtx_fetch_results.empty());
assert(tex_fetch_results.empty());
if (unlikely(ai.has_alu_flag(alu_is_lds)))
emit_lds_op(ai);
else
emit_alu_op(ai);
}
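/* LDS operations are encoded as ALU instructions. Map the abstract LDS
 * opcode to the hardware opcode, and count the operations that return a
 * value so that reads from the LDS return queue can be tracked. */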
void
AssamblerVisitor::emit_lds_op(const AluInstr& lds)
{
struct r600_bytecode_alu alu;
memset(&alu, 0, sizeof(alu));
alu.is_lds_idx_op = true;
alu.op = lds.lds_opcode();
bool has_lds_fetch = false;
switch (alu.op) {
case LDS_WRITE:
alu.op = LDS_OP2_LDS_WRITE;
break;
case LDS_WRITE_REL:
alu.op = LDS_OP3_LDS_WRITE_REL;
alu.lds_idx = 1;
break;
case DS_OP_READ_RET:
alu.op = LDS_OP1_LDS_READ_RET;
FALLTHROUGH;
case LDS_ADD_RET:
case LDS_AND_RET:
case LDS_OR_RET:
case LDS_MAX_INT_RET:
case LDS_MAX_UINT_RET:
case LDS_MIN_INT_RET:
case LDS_MIN_UINT_RET:
case LDS_XOR_RET:
case LDS_XCHG_RET:
case LDS_CMP_XCHG_RET:
has_lds_fetch = true;
break;
case LDS_ADD:
case LDS_AND:
case LDS_OR:
case LDS_MAX_INT:
case LDS_MAX_UINT:
case LDS_MIN_INT:
case LDS_MIN_UINT:
case LDS_XOR:
break;
default:
std::cerr << "\nR600: unhandled LDS op: " << lds << "\n";
unreachable("Unhandled LDS op");
}
copy_src(alu.src[0], lds.src(0));
if (lds.n_sources() > 1)
copy_src(alu.src[1], lds.src(1));
else
alu.src[1].sel = V_SQ_ALU_SRC_0;
if (lds.n_sources() > 2)
copy_src(alu.src[2], lds.src(2));
else
alu.src[2].sel = V_SQ_ALU_SRC_0;
alu.last = lds.has_alu_flag(alu_last_instr);
int r = r600_bytecode_add_alu(m_bc, &alu);
if (has_lds_fetch)
m_bc->cf_last->nlds_read++;
if (r)
m_result = false;
}
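/* With legacy math rules requested, replace the IEEE multiplication
 * opcodes by their legacy counterparts that guarantee 0 * x == 0. */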
auto AssamblerVisitor::translate_for_mathrules(EAluOp op) -> EAluOp
{
switch (op) {
case op2_dot_ieee: return op2_dot;
case op2_dot4_ieee: return op2_dot4;
case op2_mul_ieee: return op2_mul;
case op3_muladd_ieee: return op3_muladd;
default:
return op;
}
}
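/* Emit one ALU instruction: translate the opcode, encode the destination
 * and the sources (including the kcache index mode for indirectly
 * addressed constant buffers), and emit it into a CF clause of the type
 * requested by the instruction. */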
void
AssamblerVisitor::emit_alu_op(const AluInstr& ai)
{
sfn_log << SfnLog::assembly << "Emit ALU op " << ai << "\n";
struct r600_bytecode_alu alu;
memset(&alu, 0, sizeof(alu));
auto opcode = ai.opcode();
if (unlikely(ai.opcode() == op1_mova_int &&
(m_bc->gfx_level < CAYMAN || alu.dst.sel == 0))) {
m_last_addr = ai.psrc(0);
m_bc->ar_chan = m_last_addr->chan();
m_bc->ar_reg = m_last_addr->sel();
}
if (m_legacy_math_rules)
opcode = translate_for_mathrules(opcode);
auto hw_opcode = opcode_map.find(opcode);
if (hw_opcode == opcode_map.end()) {
std::cerr << "Opcode not handled for " << ai << "\n";
m_result = false;
return;
}
// skip multiple barriers
if (m_last_op_was_barrier && opcode == op0_group_barrier)
return;
m_last_op_was_barrier = opcode == op0_group_barrier;
alu.op = hw_opcode->second;
auto dst = ai.dest();
if (dst) {
if (ai.opcode() != op1_mova_int) {
if (!copy_dst(alu.dst, *dst, ai.has_alu_flag(alu_write))) {
m_result = false;
return;
}
alu.dst.write = ai.has_alu_flag(alu_write);
alu.dst.clamp = ai.has_alu_flag(alu_dst_clamp);
alu.dst.rel = dst->addr() ? 1 : 0;
} else if (m_bc->gfx_level == CAYMAN && ai.dest()->sel() > 0) {
alu.dst.sel = ai.dest()->sel() + 1;
}
}
alu.is_op3 = ai.n_sources() == 3;
EBufferIndexMode kcache_index_mode = bim_none;
PVirtualValue buffer_offset = nullptr;
for (unsigned i = 0; i < ai.n_sources(); ++i) {
buffer_offset = copy_src(alu.src[i], ai.src(i));
alu.src[i].neg = ai.has_source_mod(i, AluInstr::mod_neg);
if (!alu.is_op3)
alu.src[i].abs = ai.has_source_mod(i, AluInstr::mod_abs);
if (buffer_offset && kcache_index_mode == bim_none) {
auto idx_reg = buffer_offset->as_register();
if (idx_reg && idx_reg->has_flag(Register::addr_or_idx)) {
switch (idx_reg->sel()) {
case 1: kcache_index_mode = bim_zero; break;
case 2: kcache_index_mode = bim_one; break;
default:
unreachable("Unsupported index mode");
}
} else {
kcache_index_mode = bim_zero;
}
alu.src[i].kc_rel = kcache_index_mode;
}
if (ai.has_lds_queue_read()) {
assert(m_bc->cf_last->nlds_read > 0);
m_bc->cf_last->nlds_read--;
}
}
if (ai.bank_swizzle() != alu_vec_unknown)
alu.bank_swizzle_force = ai.bank_swizzle();
alu.last = ai.has_alu_flag(alu_last_instr);
alu.execute_mask = ai.has_alu_flag(alu_update_exec);
/* If the destination register is equal to the last loaded address register
* then clear the latter one, because the values will no longer be
* identical */
if (m_last_addr)
sfn_log << SfnLog::assembly << " Current address register is " << *m_last_addr
<< "\n";
if (dst)
sfn_log << SfnLog::assembly << " Current dst register is " << *dst << "\n";
auto cf_op = ai.cf_type();
unsigned type = 0;
switch (cf_op) {
case cf_alu:
type = CF_OP_ALU;
break;
case cf_alu_push_before:
type = CF_OP_ALU_PUSH_BEFORE;
break;
case cf_alu_pop_after:
type = CF_OP_ALU_POP_AFTER;
break;
case cf_alu_pop2_after:
type = CF_OP_ALU_POP2_AFTER;
break;
case cf_alu_break:
type = CF_OP_ALU_BREAK;
break;
case cf_alu_else_after:
type = CF_OP_ALU_ELSE_AFTER;
break;
case cf_alu_continue:
type = CF_OP_ALU_CONTINUE;
break;
case cf_alu_extended:
type = CF_OP_ALU_EXT;
break;
default:
assert(0 && "cf_alu_undefined should have been replaced");
}
if (alu.last)
m_nliterals_in_group.clear();
m_result = !r600_bytecode_add_alu_type(m_bc, &alu, type);
if (unlikely(ai.opcode() == op1_mova_int)) {
if (m_bc->gfx_level < CAYMAN || alu.dst.sel == 0) {
m_bc->ar_loaded = 1;
} else if (m_bc->gfx_level == CAYMAN) {
int idx = alu.dst.sel - 2;
m_bc->index_loaded[idx] = 1;
m_bc->index_reg[idx] = -1;
}
}
if (alu.dst.sel >= g_clause_local_start && alu.dst.sel < g_clause_local_end) {
int clidx = 4 * (alu.dst.sel - g_clause_local_start) + alu.dst.chan;
m_bc->cf_last->clause_local_written |= 1 << clidx;
}
if (ai.opcode() == op1_set_cf_idx0) {
m_bc->index_loaded[0] = 1;
m_bc->index_reg[0] = -1;
}
if (ai.opcode() == op1_set_cf_idx1) {
m_bc->index_loaded[1] = 1;
m_bc->index_reg[1] = -1;
}
}
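/* Check that the whole group still fits into the current ALU clause.
 * Running out of slots here indicates a scheduling bug (hence the
 * asserts); as a fallback a new CF instruction is forced, which is only
 * safe when no LDS queue reads are pending. */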
void
AssamblerVisitor::visit(const AluGroup& group)
{
clear_states(sf_vtx | sf_tex);
if (group.slots() == 0)
return;
static const unsigned slot_limit = 256;
if (m_bc->cf_last && !m_bc->force_add_cf) {
if (group.has_lds_group_start()) {
if (m_bc->cf_last->ndw + 2 * (*group.begin())->required_slots() > slot_limit) {
assert(m_bc->cf_last->nlds_read == 0);
assert(0 && "Not allowed to start new alu group here");
m_bc->force_add_cf = 1;
m_last_addr = nullptr;
}
} else {
if (m_bc->cf_last->ndw + 2 * group.slots() > slot_limit) {
std::cerr << "m_bc->cf_last->ndw = " << m_bc->cf_last->ndw
<< " group.slots() = " << group.slots()
<< " -> " << m_bc->cf_last->ndw + 2 * group.slots()
<< "> slot_limit = " << slot_limit << "\n";
assert(m_bc->cf_last->nlds_read == 0);
assert(0 && "Not allowed to start new alu group here");
m_bc->force_add_cf = 1;
m_last_addr = nullptr;
} else {
auto instr = *group.begin();
if (instr && !instr->has_alu_flag(alu_is_lds) &&
instr->opcode() == op0_group_barrier && m_bc->cf_last->ndw + 14 > slot_limit) {
assert(0 && "Not allowed to start new alu group here");
assert(m_bc->cf_last->nlds_read == 0);
m_bc->force_add_cf = 1;
m_last_addr = nullptr;
}
}
}
}
auto [addr, is_index] = group.addr();
assert(!addr || addr->has_flag(Register::addr_or_idx));
for (auto& i : group) {
if (i)
i->accept(*this);
}
}
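/* Encode a texture instruction. If the source register was written by an
 * earlier TEX fetch in the same clause, force a new CF instruction so the
 * result is available before it is read. */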
void
AssamblerVisitor::visit(const TexInstr& tex_instr)
{
clear_states(sf_vtx | sf_alu);
if (tex_fetch_results.find(tex_instr.src().sel()) != tex_fetch_results.end()) {
m_bc->force_add_cf = 1;
tex_fetch_results.clear();
}
r600_bytecode_tex tex;
memset(&tex, 0, sizeof(struct r600_bytecode_tex));
tex.op = tex_instr.opcode();
tex.sampler_id = tex_instr.sampler_id();
tex.resource_id = tex_instr.resource_id();
tex.src_gpr = tex_instr.src().sel();
tex.dst_gpr = tex_instr.dst().sel();
tex.dst_sel_x = tex_instr.dest_swizzle(0);
tex.dst_sel_y = tex_instr.dest_swizzle(1);
tex.dst_sel_z = tex_instr.dest_swizzle(2);
tex.dst_sel_w = tex_instr.dest_swizzle(3);
tex.src_sel_x = tex_instr.src()[0]->chan();
tex.src_sel_y = tex_instr.src()[1]->chan();
tex.src_sel_z = tex_instr.src()[2]->chan();
tex.src_sel_w = tex_instr.src()[3]->chan();
tex.coord_type_x = !tex_instr.has_tex_flag(TexInstr::x_unnormalized);
tex.coord_type_y = !tex_instr.has_tex_flag(TexInstr::y_unnormalized);
tex.coord_type_z = !tex_instr.has_tex_flag(TexInstr::z_unnormalized);
tex.coord_type_w = !tex_instr.has_tex_flag(TexInstr::w_unnormalized);
tex.offset_x = tex_instr.get_offset(0);
tex.offset_y = tex_instr.get_offset(1);
tex.offset_z = tex_instr.get_offset(2);
tex.resource_index_mode = tex_instr.resource_index_mode();
tex.sampler_index_mode = tex_instr.sampler_index_mode();
if (tex.dst_sel_x < 4 && tex.dst_sel_y < 4 && tex.dst_sel_z < 4 && tex.dst_sel_w < 4)
tex_fetch_results.insert(tex.dst_gpr);
if (tex_instr.opcode() == TexInstr::get_gradient_h ||
tex_instr.opcode() == TexInstr::get_gradient_v)
tex.inst_mod = tex_instr.has_tex_flag(TexInstr::grad_fine) ? 1 : 0;
else
tex.inst_mod = tex_instr.inst_mode();
if (r600_bytecode_add_tex(m_bc, &tex)) {
R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
m_result = false;
}
}
void
AssamblerVisitor::visit(const ExportInstr& exi)
{
const auto& value = exi.value();
r600_bytecode_output output;
memset(&output, 0, sizeof(output));
output.gpr = value.sel();
output.elem_size = 3;
output.swizzle_x = value[0]->chan();
output.swizzle_y = value[1]->chan();
output.swizzle_z = value[2]->chan();
output.burst_count = 1;
output.op = exi.is_last_export() ? CF_OP_EXPORT_DONE : CF_OP_EXPORT;
output.type = exi.export_type();
clear_states(sf_all);
switch (exi.export_type()) {
case ExportInstr::pixel:
output.swizzle_w = ps_alpha_to_one ? 5 : exi.value()[3]->chan();
output.array_base = exi.location();
break;
case ExportInstr::pos:
output.swizzle_w = exi.value()[3]->chan();
output.array_base = 60 + exi.location();
break;
case ExportInstr::param:
output.swizzle_w = exi.value()[3]->chan();
output.array_base = exi.location();
break;
default:
R600_ASM_ERR("shader_from_nir: export %d type not yet supported\n",
exi.export_type());
m_result = false;
}
/* If all register elements are pinned to fixed values
* we can override the gpr (the register allocator doesn't see
* this because it doesn't take these channels into account). */
if (output.swizzle_x > 3 && output.swizzle_y > 3 && output.swizzle_z > 3 &&
output.swizzle_w > 3)
output.gpr = 0;
int r = 0;
if ((r = r600_bytecode_add_output(m_bc, &output))) {
R600_ASM_ERR("Error adding export at location %d : err: %d\n", exi.location(), r);
m_result = false;
}
}
void
AssamblerVisitor::visit(const ScratchIOInstr& instr)
{
clear_states(sf_all);
struct r600_bytecode_output cf;
memset(&cf, 0, sizeof(struct r600_bytecode_output));
cf.op = CF_OP_MEM_SCRATCH;
cf.elem_size = 3;
cf.gpr = instr.value().sel();
cf.mark = !instr.is_read();
cf.comp_mask = instr.is_read() ? 0xf : instr.write_mask();
cf.swizzle_x = 0;
cf.swizzle_y = 1;
cf.swizzle_z = 2;
cf.swizzle_w = 3;
cf.burst_count = 1;
assert(!instr.is_read() || m_bc->gfx_level < R700);
if (instr.address()) {
cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 3 : 1;
cf.index_gpr = instr.address()->sel();
/* The documentation seems to be wrong here: with indirect addressing the
* address_base seems to hold the array_size */
cf.array_size = instr.array_size();
} else {
cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 2 : 0;
cf.array_base = instr.location();
}
if (r600_bytecode_add_output(m_bc, &cf)) {
R600_ASM_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n");
m_result = false;
}
}
void
AssamblerVisitor::visit(const StreamOutInstr& instr)
{
struct r600_bytecode_output output;
memset(&output, 0, sizeof(struct r600_bytecode_output));
output.gpr = instr.value().sel();
output.elem_size = instr.element_size();
output.array_base = instr.array_base();
output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
output.burst_count = instr.burst_count();
output.array_size = instr.array_size();
output.comp_mask = instr.comp_mask();
output.op = instr.op(m_shader->bc.gfx_level);
if (r600_bytecode_add_output(m_bc, &output)) {
R600_ASM_ERR("shader_from_nir: Error creating stream output instruction\n");
m_result = false;
}
}
void
AssamblerVisitor::visit(const MemRingOutInstr& instr)
{
struct r600_bytecode_output output;
memset(&output, 0, sizeof(struct r600_bytecode_output));
output.gpr = instr.value().sel();
output.type = instr.type();
output.elem_size = 3;
output.comp_mask = 0xf;
output.burst_count = 1;
output.op = instr.op();
if (instr.type() == MemRingOutInstr::mem_write_ind ||
instr.type() == MemRingOutInstr::mem_write_ind_ack) {
output.index_gpr = instr.index_reg();
output.array_size = 0xfff;
}
output.array_base = instr.array_base();
if (r600_bytecode_add_output(m_bc, &output)) {
R600_ASM_ERR("shader_from_nir: Error creating mem ring write instruction\n");
m_result = false;
}
}
void
AssamblerVisitor::visit(const EmitVertexInstr& instr)
{
int r = r600_bytecode_add_cfinst(m_bc, instr.op());
if (!r)
m_bc->cf_last->count = instr.stream();
else
m_result = false;
assert(m_bc->cf_last->count < 4);
}
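/* Encode a fetch through either the texture cache (TC) or the vertex
 * cache; Cayman has no vertex cache, so there all fetches go through
 * the TC. */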
void
AssamblerVisitor::visit(const FetchInstr& fetch_instr)
{
bool use_tc =
fetch_instr.has_fetch_flag(FetchInstr::use_tc) || (m_bc->gfx_level == CAYMAN);
auto clear_flags = use_tc ? sf_vtx : sf_tex;
clear_states(clear_flags | sf_alu);
if (fetch_instr.has_fetch_flag(FetchInstr::wait_ack))
emit_wait_ack();
if (!use_tc &&
vtx_fetch_results.find(fetch_instr.src().sel()) != vtx_fetch_results.end()) {
m_bc->force_add_cf = 1;
vtx_fetch_results.clear();
}
if (fetch_instr.has_fetch_flag(FetchInstr::use_tc) &&
tex_fetch_results.find(fetch_instr.src().sel()) != tex_fetch_results.end()) {
m_bc->force_add_cf = 1;
tex_fetch_results.clear();
}
if (use_tc)
tex_fetch_results.insert(fetch_instr.dst().sel());
else
vtx_fetch_results.insert(fetch_instr.dst().sel());
struct r600_bytecode_vtx vtx;
memset(&vtx, 0, sizeof(vtx));
vtx.op = fetch_instr.opcode();
vtx.buffer_id = fetch_instr.resource_id();
vtx.fetch_type = fetch_instr.fetch_type();
vtx.src_gpr = fetch_instr.src().sel();
vtx.src_sel_x = fetch_instr.src().chan();
vtx.mega_fetch_count = fetch_instr.mega_fetch_count();
vtx.dst_gpr = fetch_instr.dst().sel();
vtx.dst_sel_x = fetch_instr.dest_swizzle(0); /* SEL_X */
vtx.dst_sel_y = fetch_instr.dest_swizzle(1); /* SEL_Y */
vtx.dst_sel_z = fetch_instr.dest_swizzle(2); /* SEL_Z */
vtx.dst_sel_w = fetch_instr.dest_swizzle(3); /* SEL_W */
vtx.use_const_fields = fetch_instr.has_fetch_flag(FetchInstr::use_const_field);
vtx.data_format = fetch_instr.data_format();
vtx.num_format_all = fetch_instr.num_format(); /* NUM_FORMAT_SCALED */
vtx.format_comp_all = fetch_instr.has_fetch_flag(FetchInstr::format_comp_signed);
vtx.endian = fetch_instr.endian_swap();
vtx.buffer_index_mode = fetch_instr.resource_index_mode();
vtx.offset = fetch_instr.src_offset();
vtx.indexed = fetch_instr.has_fetch_flag(FetchInstr::indexed);
vtx.uncached = fetch_instr.has_fetch_flag(FetchInstr::uncached);
vtx.elem_size = fetch_instr.elm_size();
vtx.array_base = fetch_instr.array_base();
vtx.array_size = fetch_instr.array_size();
vtx.srf_mode_all = fetch_instr.has_fetch_flag(FetchInstr::srf_mode);
if (fetch_instr.has_fetch_flag(FetchInstr::use_tc)) {
if ((r600_bytecode_add_vtx_tc(m_bc, &vtx))) {
R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
m_result = false;
}
} else {
if ((r600_bytecode_add_vtx(m_bc, &vtx))) {
R600_ASM_ERR("shader_from_nir: Error creating vertex fetch assembly instruction\n");
m_result = false;
}
}
m_bc->cf_last->vpm =
(m_bc->type == PIPE_SHADER_FRAGMENT) && fetch_instr.has_fetch_flag(FetchInstr::vpm);
m_bc->cf_last->barrier = 1;
}
void
AssamblerVisitor::visit(const WriteTFInstr& instr)
{
struct r600_bytecode_gds gds;
auto& value = instr.value();
memset(&gds, 0, sizeof(struct r600_bytecode_gds));
gds.src_gpr = value.sel();
gds.src_sel_x = value[0]->chan();
gds.src_sel_y = value[1]->chan();
gds.src_sel_z = 4;
gds.dst_sel_x = 7;
gds.dst_sel_y = 7;
gds.dst_sel_z = 7;
gds.dst_sel_w = 7;
gds.op = FETCH_OP_TF_WRITE;
if (r600_bytecode_add_gds(m_bc, &gds) != 0) {
m_result = false;
return;
}
if (value[2]->chan() != 7) {
memset(&gds, 0, sizeof(struct r600_bytecode_gds));
gds.src_gpr = value.sel();
gds.src_sel_x = value[2]->chan();
gds.src_sel_y = value[3]->chan();
gds.src_sel_z = 4;
gds.dst_sel_x = 7;
gds.dst_sel_y = 7;
gds.dst_sel_z = 7;
gds.dst_sel_w = 7;
gds.op = FETCH_OP_TF_WRITE;
if (r600_bytecode_add_gds(m_bc, &gds)) {
m_result = false;
return;
}
}
}
void
AssamblerVisitor::visit(const RatInstr& instr)
{
struct r600_bytecode_gds gds;
/* The instruction writes to the return buffer location, and
* the value will actually be read back, so make sure all previous writes
* have finished */
if (m_ack_suggested /*&& instr.has_instr_flag(Instr::ack_rat_return_write)*/)
emit_wait_ack();
int rat_idx = instr.resource_id();
memset(&gds, 0, sizeof(struct r600_bytecode_gds));
r600_bytecode_add_cfinst(m_bc, instr.cf_opcode());
auto cf = m_bc->cf_last;
cf->rat.id = rat_idx + m_shader->rat_base;
cf->rat.inst = instr.rat_op();
cf->rat.index_mode = instr.resource_index_mode();
cf->output.type = instr.need_ack() ? 3 : 1;
cf->output.gpr = instr.data_gpr();
cf->output.index_gpr = instr.index_gpr();
cf->output.comp_mask = instr.comp_mask();
cf->output.burst_count = instr.burst_count();
assert(instr.data_swz(0) == PIPE_SWIZZLE_X);
if (cf->rat.inst != RatInstr::STORE_TYPED) {
assert(instr.data_swz(1) == PIPE_SWIZZLE_Y ||
instr.data_swz(1) == PIPE_SWIZZLE_MAX);
assert(instr.data_swz(2) == PIPE_SWIZZLE_Z ||
instr.data_swz(2) == PIPE_SWIZZLE_MAX);
}
cf->vpm = m_bc->type == PIPE_SHADER_FRAGMENT;
cf->barrier = 1;
cf->mark = instr.need_ack();
cf->output.elem_size = instr.elm_size();
m_ack_suggested |= instr.need_ack();
}
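/* Invalidate the cached assembler state selected by the given flags. */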
void
AssamblerVisitor::clear_states(const uint32_t& states)
{
if (states & sf_vtx)
vtx_fetch_results.clear();
if (states & sf_tex)
tex_fetch_results.clear();
if (states & sf_alu) {
m_last_op_was_barrier = false;
m_last_addr = nullptr;
}
}
void
AssamblerVisitor::visit(const Block& block)
{
if (block.empty())
return;
if (block.has_instr_flag(Instr::force_cf)) {
m_bc->force_add_cf = 1;
m_bc->ar_loaded = 0;
m_last_addr = nullptr;
}
sfn_log << SfnLog::assembly << "Translate block size: " << block.size()
<< " new_cf:" << m_bc->force_add_cf << "\n";
for (const auto& i : block) {
sfn_log << SfnLog::assembly << "Translate " << *i << " ";
i->accept(*this);
sfn_log << SfnLog::assembly << (m_result ? "good" : "fail") << "\n";
if (!m_result)
break;
}
}
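/* Emit the predicate and the conditional jump of an 'if'. Some Evergreen
 * and Cayman parts need a workaround that emits an explicit PUSH followed
 * by a plain ALU clause instead of using ALU_PUSH_BEFORE. */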
void
AssamblerVisitor::visit(const IfInstr& instr)
{
int elems = m_callstack.push(FC_PUSH_VPM);
bool needs_workaround = false;
if (m_bc->gfx_level == CAYMAN && m_bc->stack.loop > 1)
needs_workaround = true;
if (m_bc->gfx_level == EVERGREEN && m_bc->family != CHIP_HEMLOCK &&
m_bc->family != CHIP_CYPRESS && m_bc->family != CHIP_JUNIPER) {
unsigned dmod1 = (elems - 1) % m_bc->stack.entry_size;
unsigned dmod2 = (elems) % m_bc->stack.entry_size;
if (elems && (!dmod1 || !dmod2))
needs_workaround = true;
}
auto pred = instr.predicate();
auto [addr, dummy0, dummy1] = pred->indirect_addr();
assert(!dummy1);
if (addr) {
if (!m_last_addr || !m_bc->ar_loaded || !m_last_addr->equal_to(*addr)) {
m_bc->ar_reg = addr->sel();
m_bc->ar_chan = addr->chan();
m_last_addr = addr;
m_bc->ar_loaded = 0;
r600_load_ar(m_bc, true);
}
}
if (needs_workaround) {
r600_bytecode_add_cfinst(m_bc, CF_OP_PUSH);
m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
r600_bytecode_add_cfinst(m_bc, CF_OP_ALU);
pred->set_cf_type(cf_alu);
}
clear_states(sf_tex | sf_vtx);
pred->accept(*this);
r600_bytecode_add_cfinst(m_bc, CF_OP_JUMP);
clear_states(sf_all);
m_jump_tracker.push(m_bc->cf_last, jt_if);
}
void
AssamblerVisitor::visit(const ControlFlowInstr& instr)
{
clear_states(sf_all);
switch (instr.cf_type()) {
case ControlFlowInstr::cf_else:
emit_else();
break;
case ControlFlowInstr::cf_endif:
emit_endif();
break;
case ControlFlowInstr::cf_loop_begin: {
bool use_vpm = m_shader->processor_type == PIPE_SHADER_FRAGMENT &&
instr.has_instr_flag(Instr::vpm) &&
!instr.has_instr_flag(Instr::helper);
emit_loop_begin(use_vpm);
break;
}
case ControlFlowInstr::cf_loop_end:
emit_loop_end();
break;
case ControlFlowInstr::cf_loop_break:
emit_loop_break();
break;
case ControlFlowInstr::cf_loop_continue:
emit_loop_cont();
break;
case ControlFlowInstr::cf_wait_ack: {
int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
if (!r) {
m_bc->cf_last->cf_addr = 0;
m_bc->cf_last->barrier = 1;
m_ack_suggested = false;
} else {
m_result = false;
}
} break;
default:
unreachable("Unknown CF instruction type");
}
}
void
AssamblerVisitor::visit(const GDSInstr& instr)
{
struct r600_bytecode_gds gds;
memset(&gds, 0, sizeof(struct r600_bytecode_gds));
gds.op = ds_opcode_map.at(instr.opcode());
gds.uav_id = instr.resource_id();
gds.uav_index_mode = instr.resource_index_mode();
gds.src_gpr = instr.src().sel();
gds.src_sel_x = instr.src()[0]->chan() < 7 ? instr.src()[0]->chan() : 4;
gds.src_sel_y = instr.src()[1]->chan() < 7 ? instr.src()[1]->chan() : 4;
gds.src_sel_z = instr.src()[2]->chan() < 7 ? instr.src()[2]->chan() : 4;
gds.dst_sel_x = 7;
gds.dst_sel_y = 7;
gds.dst_sel_z = 7;
gds.dst_sel_w = 7;
if (instr.dest()) {
gds.dst_gpr = instr.dest()->sel();
switch (instr.dest()->chan()) {
case 0:
gds.dst_sel_x = 0;
break;
case 1:
gds.dst_sel_y = 0;
break;
case 2:
gds.dst_sel_z = 0;
break;
case 3:
gds.dst_sel_w = 0;
}
}
gds.src_gpr2 = 0;
gds.alloc_consume = m_bc->gfx_level < CAYMAN ? 1 : 0; /* not available on Cayman */
int r = r600_bytecode_add_gds(m_bc, &gds);
if (r) {
m_result = false;
return;
}
m_bc->cf_last->vpm = PIPE_SHADER_FRAGMENT == m_bc->type;
m_bc->cf_last->barrier = 1;
}
void
AssamblerVisitor::visit(const LDSAtomicInstr& instr)
{
(void)instr;
unreachable("LDSAtomicInstr must be lowered to ALUInstr");
}
void
AssamblerVisitor::visit(const LDSReadInstr& instr)
{
(void)instr;
unreachable("LDSReadInstr must be lowered to ALUInstr");
}
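/* Load one of the CF index registers (IDX0/IDX1) used for indirect
 * resource addressing. The register is reloaded unless it is already
 * known to hold the requested value; inside loops it is always reloaded
 * because the tracked value may differ between iterations. */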
EBufferIndexMode
AssamblerVisitor::emit_index_reg(const VirtualValue& addr, unsigned idx)
{
assert(idx < 2);
if (!m_bc->index_loaded[idx] || m_loop_nesting ||
m_bc->index_reg[idx] != (unsigned)addr.sel() ||
m_bc->index_reg_chan[idx] != (unsigned)addr.chan()) {
struct r600_bytecode_alu alu;
// Make sure MOVA is not last instr in clause
if (!m_bc->cf_last || (m_bc->cf_last->ndw >> 1) >= 110)
m_bc->force_add_cf = 1;
if (m_bc->gfx_level != CAYMAN) {
EAluOp idxop = idx ? op1_set_cf_idx1 : op1_set_cf_idx0;
memset(&alu, 0, sizeof(alu));
alu.op = opcode_map.at(op1_mova_int);
alu.dst.chan = 0;
alu.src[0].sel = addr.sel();
alu.src[0].chan = addr.chan();
alu.last = 1;
sfn_log << SfnLog::assembly << " mova_int, ";
int r = r600_bytecode_add_alu(m_bc, &alu);
if (r)
return bim_invalid;
alu.op = opcode_map.at(idxop);
alu.dst.chan = 0;
alu.src[0].sel = 0;
alu.src[0].chan = 0;
alu.last = 1;
sfn_log << SfnLog::assembly << "op1_set_cf_idx" << idx;
r = r600_bytecode_add_alu(m_bc, &alu);
if (r)
return bim_invalid;
} else {
memset(&alu, 0, sizeof(alu));
alu.op = opcode_map.at(op1_mova_int);
alu.dst.sel = idx == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
alu.dst.chan = 0;
alu.src[0].sel = addr.sel();
alu.src[0].chan = addr.chan();
alu.last = 1;
sfn_log << SfnLog::assembly << " mova_int, ";
int r = r600_bytecode_add_alu(m_bc, &alu);
if (r)
return bim_invalid;
}
m_bc->ar_loaded = 0;
m_bc->index_reg[idx] = addr.sel();
m_bc->index_reg_chan[idx] = addr.chan();
m_bc->index_loaded[idx] = true;
m_bc->force_add_cf = 1;
sfn_log << SfnLog::assembly << "\n";
}
return idx == 0 ? bim_zero : bim_one;
}
void
AssamblerVisitor::emit_else()
{
r600_bytecode_add_cfinst(m_bc, CF_OP_ELSE);
m_bc->cf_last->pop_count = 1;
m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_if);
}
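/* Close an 'if': when the preceding CF instruction is a plain ALU clause
 * it can absorb the pop as ALU_POP_AFTER, otherwise an explicit POP is
 * emitted. */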
void
AssamblerVisitor::emit_endif()
{
m_callstack.pop(FC_PUSH_VPM);
unsigned force_pop = m_bc->force_add_cf;
if (!force_pop) {
int alu_pop = 3;
if (m_bc->cf_last) {
if (m_bc->cf_last->op == CF_OP_ALU)
alu_pop = 0;
else if (m_bc->cf_last->op == CF_OP_ALU_POP_AFTER)
alu_pop = 1;
}
alu_pop += 1;
if (alu_pop == 1) {
m_bc->cf_last->op = CF_OP_ALU_POP_AFTER;
m_bc->force_add_cf = 1;
} else {
force_pop = 1;
}
}
if (force_pop) {
r600_bytecode_add_cfinst(m_bc, CF_OP_POP);
m_bc->cf_last->pop_count = 1;
m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
}
m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_if);
}
void
AssamblerVisitor::emit_loop_begin(bool vpm)
{
r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_START_DX10);
m_bc->cf_last->vpm = vpm && m_bc->type == PIPE_SHADER_FRAGMENT;
m_jump_tracker.push(m_bc->cf_last, jt_loop);
m_callstack.push(FC_LOOP);
++m_loop_nesting;
}
void
AssamblerVisitor::emit_loop_end()
{
if (m_ack_suggested) {
emit_wait_ack();
m_ack_suggested = false;
}
r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_END);
m_callstack.pop(FC_LOOP);
assert(m_loop_nesting);
--m_loop_nesting;
m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_loop);
}
void
AssamblerVisitor::emit_loop_break()
{
r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_BREAK);
m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
}
void
AssamblerVisitor::emit_loop_cont()
{
r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_CONTINUE);
m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
}
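/* Encode the ALU destination register and invalidate the cached address
 * register when it is overwritten. */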
bool
AssamblerVisitor::copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write)
{
if (write && d.sel() > g_clause_local_end) {
R600_ASM_ERR("shader_from_nir: Don't support more then 123 GPRs + 4 clause "
"local, but try using %d\n",
d.sel());
m_result = false;
return false;
}
dst.sel = d.sel();
dst.chan = d.chan();
if (m_last_addr && m_last_addr->equal_to(d))
m_last_addr = nullptr;
return true;
}
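/* Emit a WAIT_ACK that waits until all outstanding write acknowledgments
 * have arrived. */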
void
AssamblerVisitor::emit_wait_ack()
{
int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
if (!r) {
m_bc->cf_last->cf_addr = 0;
m_bc->cf_last->barrier = 1;
m_ack_suggested = false;
} else
m_result = false;
}
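/* Encodes a single ALU source operand. For uniform values the visitor
 * also records the buffer offset that is needed when the constant buffer
 * is addressed indirectly. */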
class EncodeSourceVisitor : public ConstRegisterVisitor {
public:
EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc);
void visit(const Register& value) override;
void visit(const LocalArray& value) override;
void visit(const LocalArrayValue& value) override;
void visit(const UniformValue& value) override;
void visit(const LiteralConstant& value) override;
void visit(const InlineConstant& value) override;
r600_bytecode_alu_src& src;
r600_bytecode *m_bc;
PVirtualValue m_buffer_offset{nullptr};
};
PVirtualValue
AssamblerVisitor::copy_src(r600_bytecode_alu_src& src, const VirtualValue& s)
{
EncodeSourceVisitor visitor(src, m_bc);
src.sel = s.sel();
src.chan = s.chan();
if (s.sel() >= g_clause_local_start && s.sel() < g_clause_local_end) {
assert(m_bc->cf_last);
int clidx = 4 * (s.sel() - g_clause_local_start) + s.chan();
/* Ensure that the clause local register was already written */
assert(m_bc->cf_last->clause_local_written & (1 << clidx));
}
s.accept(visitor);
return visitor.m_buffer_offset;
}
EncodeSourceVisitor::EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc):
src(s),
m_bc(bc)
{
}
void
EncodeSourceVisitor::visit(const Register& value)
{
assert(value.sel() < g_clause_local_end && "Only have 123 registers + 4 clause local");
}
void
EncodeSourceVisitor::visit(const LocalArray& value)
{
(void)value;
unreachable("An array can't be a source register");
}
void
EncodeSourceVisitor::visit(const LocalArrayValue& value)
{
src.rel = value.addr() ? 1 : 0;
}
void
EncodeSourceVisitor::visit(const UniformValue& value)
{
assert(value.sel() >= 512 && "Uniform values must have a sel >= 512");
m_buffer_offset = value.buf_addr();
src.kc_bank = value.kcache_bank();
}
void
EncodeSourceVisitor::visit(const LiteralConstant& value)
{
src.value = value.value();
}
void
EncodeSourceVisitor::visit(const InlineConstant& value)
{
(void)value;
}
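/* Translation table from the IR ALU opcodes to the r600 ISA opcodes. */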
const std::map<EAluOp, int> opcode_map = {
{op2_add, ALU_OP2_ADD },
{op2_mul, ALU_OP2_MUL },
{op2_mul_ieee, ALU_OP2_MUL_IEEE },
{op2_max, ALU_OP2_MAX },
{op2_min, ALU_OP2_MIN },
{op2_max_dx10, ALU_OP2_MAX_DX10 },
{op2_min_dx10, ALU_OP2_MIN_DX10 },
{op2_sete, ALU_OP2_SETE },
{op2_setgt, ALU_OP2_SETGT },
{op2_setge, ALU_OP2_SETGE },
{op2_setne, ALU_OP2_SETNE },
{op2_sete_dx10, ALU_OP2_SETE_DX10 },
{op2_setgt_dx10, ALU_OP2_SETGT_DX10 },
{op2_setge_dx10, ALU_OP2_SETGE_DX10 },
{op2_setne_dx10, ALU_OP2_SETNE_DX10 },
{op1_fract, ALU_OP1_FRACT },
{op1_trunc, ALU_OP1_TRUNC },
{op1_ceil, ALU_OP1_CEIL },
{op1_rndne, ALU_OP1_RNDNE },
{op1_floor, ALU_OP1_FLOOR },
{op2_ashr_int, ALU_OP2_ASHR_INT },
{op2_lshr_int, ALU_OP2_LSHR_INT },
{op2_lshl_int, ALU_OP2_LSHL_INT },
{op1_mov, ALU_OP1_MOV },
{op0_nop, ALU_OP0_NOP },
{op2_mul_64, ALU_OP2_MUL_64 },
{op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32 },
{op1v_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64 },
{op2_prede_int, ALU_OP2_PRED_SETE_INT },
{op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT },
{op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT },
{op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT },
{op2_pred_setgt_uint, ALU_OP2_PRED_SETGT_UINT },
{op2_pred_setge_uint, ALU_OP2_PRED_SETGE_UINT },
{op2_pred_sete, ALU_OP2_PRED_SETE },
{op2_pred_setgt, ALU_OP2_PRED_SETGT },
{op2_pred_setge, ALU_OP2_PRED_SETGE },
{op2_pred_setne, ALU_OP2_PRED_SETNE },
{op0_pred_set_clr, ALU_OP0_PRED_SET_CLR },
{op1_pred_set_restore, ALU_OP1_PRED_SET_RESTORE },
{op2_pred_sete_push, ALU_OP2_PRED_SETE_PUSH },
{op2_pred_setgt_push, ALU_OP2_PRED_SETGT_PUSH },
{op2_pred_setge_push, ALU_OP2_PRED_SETGE_PUSH },
{op2_pred_setne_push, ALU_OP2_PRED_SETNE_PUSH },
{op2_kille, ALU_OP2_KILLE },
{op2_killgt, ALU_OP2_KILLGT },
{op2_killge, ALU_OP2_KILLGE },
{op2_killne, ALU_OP2_KILLNE },
{op2_and_int, ALU_OP2_AND_INT },
{op2_or_int, ALU_OP2_OR_INT },
{op2_xor_int, ALU_OP2_XOR_INT },
{op1_not_int, ALU_OP1_NOT_INT },
{op2_add_int, ALU_OP2_ADD_INT },
{op2_sub_int, ALU_OP2_SUB_INT },
{op2_max_int, ALU_OP2_MAX_INT },
{op2_min_int, ALU_OP2_MIN_INT },
{op2_max_uint, ALU_OP2_MAX_UINT },
{op2_min_uint, ALU_OP2_MIN_UINT },
{op2_sete_int, ALU_OP2_SETE_INT },
{op2_setgt_int, ALU_OP2_SETGT_INT },
{op2_setge_int, ALU_OP2_SETGE_INT },
{op2_setne_int, ALU_OP2_SETNE_INT },
{op2_setgt_uint, ALU_OP2_SETGT_UINT },
{op2_setge_uint, ALU_OP2_SETGE_UINT },
{op2_killgt_uint, ALU_OP2_KILLGT_UINT },
{op2_killge_uint, ALU_OP2_KILLGE_UINT },
{op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT },
{op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT },
{op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT },
{op2_kille_int, ALU_OP2_KILLE_INT },
{op2_killgt_int, ALU_OP2_KILLGT_INT },
{op2_killge_int, ALU_OP2_KILLGE_INT },
{op2_killne_int, ALU_OP2_KILLNE_INT },
{op2_pred_sete_push_int, ALU_OP2_PRED_SETE_PUSH_INT },
{op2_pred_setgt_push_int, ALU_OP2_PRED_SETGT_PUSH_INT },
{op2_pred_setge_push_int, ALU_OP2_PRED_SETGE_PUSH_INT },
{op2_pred_setne_push_int, ALU_OP2_PRED_SETNE_PUSH_INT },
{op2_pred_setlt_push_int, ALU_OP2_PRED_SETLT_PUSH_INT },
{op2_pred_setle_push_int, ALU_OP2_PRED_SETLE_PUSH_INT },
{op1_flt_to_int, ALU_OP1_FLT_TO_INT },
{op1_bfrev_int, ALU_OP1_BFREV_INT },
{op2_addc_uint, ALU_OP2_ADDC_UINT },
{op2_subb_uint, ALU_OP2_SUBB_UINT },
{op0_group_barrier, ALU_OP0_GROUP_BARRIER },
{op0_group_seq_begin, ALU_OP0_GROUP_SEQ_BEGIN },
{op0_group_seq_end, ALU_OP0_GROUP_SEQ_END },
{op2_set_mode, ALU_OP2_SET_MODE },
{op1_set_cf_idx0, ALU_OP0_SET_CF_IDX0 },
{op1_set_cf_idx1, ALU_OP0_SET_CF_IDX1 },
{op2_set_lds_size, ALU_OP2_SET_LDS_SIZE },
{op1_exp_ieee, ALU_OP1_EXP_IEEE },
{op1_log_clamped, ALU_OP1_LOG_CLAMPED },
{op1_log_ieee, ALU_OP1_LOG_IEEE },
{op1_recip_clamped, ALU_OP1_RECIP_CLAMPED },
{op1_recip_ff, ALU_OP1_RECIP_FF },
{op1_recip_ieee, ALU_OP1_RECIP_IEEE },
{op1_recipsqrt_clamped, ALU_OP1_RECIPSQRT_CLAMPED },
{op1_recipsqrt_ff, ALU_OP1_RECIPSQRT_FF },
{op1_recipsqrt_ieee1, ALU_OP1_RECIPSQRT_IEEE },
{op1_sqrt_ieee, ALU_OP1_SQRT_IEEE },
{op1_sin, ALU_OP1_SIN },
{op1_cos, ALU_OP1_COS },
{op2_mullo_int, ALU_OP2_MULLO_INT },
{op2_mulhi_int, ALU_OP2_MULHI_INT },
{op2_mullo_uint, ALU_OP2_MULLO_UINT },
{op2_mulhi_uint, ALU_OP2_MULHI_UINT },
{op1_recip_int, ALU_OP1_RECIP_INT },
{op1_recip_uint, ALU_OP1_RECIP_UINT },
{op1_recip_64, ALU_OP2_RECIP_64 },
{op1_recip_clamped_64, ALU_OP2_RECIP_CLAMPED_64 },
{op1_recipsqrt_64, ALU_OP2_RECIPSQRT_64 },
{op1_recipsqrt_clamped_64, ALU_OP2_RECIPSQRT_CLAMPED_64 },
{op1_sqrt_64, ALU_OP2_SQRT_64 },
{op1_flt_to_uint, ALU_OP1_FLT_TO_UINT },
{op1_int_to_flt, ALU_OP1_INT_TO_FLT },
{op1_uint_to_flt, ALU_OP1_UINT_TO_FLT },
{op2_bfm_int, ALU_OP2_BFM_INT },
{op1_flt32_to_flt16, ALU_OP1_FLT32_TO_FLT16 },
{op1_flt16_to_flt32, ALU_OP1_FLT16_TO_FLT32 },
{op1_ubyte0_flt, ALU_OP1_UBYTE0_FLT },
{op1_ubyte1_flt, ALU_OP1_UBYTE1_FLT },
{op1_ubyte2_flt, ALU_OP1_UBYTE2_FLT },
{op1_ubyte3_flt, ALU_OP1_UBYTE3_FLT },
{op1_bcnt_int, ALU_OP1_BCNT_INT },
{op1_ffbh_uint, ALU_OP1_FFBH_UINT },
{op1_ffbl_int, ALU_OP1_FFBL_INT },
{op1_ffbh_int, ALU_OP1_FFBH_INT },
{op1_flt_to_uint4, ALU_OP1_FLT_TO_UINT4 },
{op2_dot_ieee, ALU_OP2_DOT_IEEE },
{op1_flt_to_int_rpi, ALU_OP1_FLT_TO_INT_RPI },
{op1_flt_to_int_floor, ALU_OP1_FLT_TO_INT_FLOOR },
{op2_mulhi_uint24, ALU_OP2_MULHI_UINT24 },
{op1_mbcnt_32hi_int, ALU_OP1_MBCNT_32HI_INT },
{op1_offset_to_flt, ALU_OP1_OFFSET_TO_FLT },
{op2_mul_uint24, ALU_OP2_MUL_UINT24 },
{op1_bcnt_accum_prev_int, ALU_OP1_BCNT_ACCUM_PREV_INT },
{op1_mbcnt_32lo_accum_prev_int, ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT},
{op2_sete_64, ALU_OP2_SETE_64 },
{op2_setne_64, ALU_OP2_SETNE_64 },
{op2_setgt_64, ALU_OP2_SETGT_64 },
{op2_setge_64, ALU_OP2_SETGE_64 },
{op2_min_64, ALU_OP2_MIN_64 },
{op2_max_64, ALU_OP2_MAX_64 },
{op2_dot4, ALU_OP2_DOT4 },
{op2_dot4_ieee, ALU_OP2_DOT4_IEEE },
{op2_cube, ALU_OP2_CUBE },
{op1_max4, ALU_OP1_MAX4 },
{op1_frexp_64, ALU_OP1_FREXP_64 },
{op1_ldexp_64, ALU_OP2_LDEXP_64 },
{op1_fract_64, ALU_OP1_FRACT_64 },
{op2_pred_setgt_64, ALU_OP2_PRED_SETGT_64 },
{op2_pred_sete_64, ALU_OP2_PRED_SETE_64 },
{op2_pred_setge_64, ALU_OP2_PRED_SETGE_64 },
{op2_add_64, ALU_OP2_ADD_64 },
{op1_mova_int, ALU_OP1_MOVA_INT },
{op1_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64 },
{op2_sad_accum_prev_uint, ALU_OP2_SAD_ACCUM_PREV_UINT },
{op2_dot, ALU_OP2_DOT },
{op1_mul_prev, ALU_OP1_MUL_PREV },
{op1_mul_ieee_prev, ALU_OP1_MUL_IEEE_PREV },
{op1_add_prev, ALU_OP1_ADD_PREV },
{op2_muladd_prev, ALU_OP2_MULADD_PREV },
{op2_muladd_ieee_prev, ALU_OP2_MULADD_IEEE_PREV },
{op2_interp_xy, ALU_OP2_INTERP_XY },
{op2_interp_zw, ALU_OP2_INTERP_ZW },
{op2_interp_x, ALU_OP2_INTERP_X },
{op2_interp_z, ALU_OP2_INTERP_Z },
{op0_store_flags, ALU_OP1_STORE_FLAGS },
{op1_load_store_flags, ALU_OP1_LOAD_STORE_FLAGS },
{op0_lds_1a, ALU_OP2_LDS_1A },
{op0_lds_1a1d, ALU_OP2_LDS_1A1D },
{op0_lds_2a, ALU_OP2_LDS_2A },
{op1_interp_load_p0, ALU_OP1_INTERP_LOAD_P0 },
{op1_interp_load_p10, ALU_OP1_INTERP_LOAD_P10 },
{op1_interp_load_p20, ALU_OP1_INTERP_LOAD_P20 },
{op3_bfe_uint, ALU_OP3_BFE_UINT },
{op3_bfe_int, ALU_OP3_BFE_INT },
{op3_bfi_int, ALU_OP3_BFI_INT },
{op3_fma, ALU_OP3_FMA },
{op3_cndne_64, ALU_OP3_CNDNE_64 },
{op3_fma_64, ALU_OP3_FMA_64 },
{op3_lerp_uint, ALU_OP3_LERP_UINT },
{op3_bit_align_int, ALU_OP3_BIT_ALIGN_INT },
{op3_byte_align_int, ALU_OP3_BYTE_ALIGN_INT },
{op3_sad_accum_uint, ALU_OP3_SAD_ACCUM_UINT },
{op3_sad_accum_hi_uint, ALU_OP3_SAD_ACCUM_HI_UINT },
{op3_muladd_uint24, ALU_OP3_MULADD_UINT24 },
{op3_lds_idx_op, ALU_OP3_LDS_IDX_OP },
{op3_muladd, ALU_OP3_MULADD },
{op3_muladd_m2, ALU_OP3_MULADD_M2 },
{op3_muladd_m4, ALU_OP3_MULADD_M4 },
{op3_muladd_d2, ALU_OP3_MULADD_D2 },
{op3_muladd_ieee, ALU_OP3_MULADD_IEEE },
{op3_cnde, ALU_OP3_CNDE },
{op3_cndgt, ALU_OP3_CNDGT },
{op3_cndge, ALU_OP3_CNDGE },
{op3_cnde_int, ALU_OP3_CNDE_INT },
{op3_cndgt_int, ALU_OP3_CNDGT_INT },
{op3_cndge_int, ALU_OP3_CNDGE_INT },
{op3_mul_lit, ALU_OP3_MUL_LIT },
};
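/* Translation table from the data-share (GDS) opcodes to the
 * corresponding fetch opcodes. */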
const std::map<ESDOp, int> ds_opcode_map = {
{DS_OP_ADD, FETCH_OP_GDS_ADD },
{DS_OP_SUB, FETCH_OP_GDS_SUB },
{DS_OP_RSUB, FETCH_OP_GDS_RSUB },
{DS_OP_INC, FETCH_OP_GDS_INC },
{DS_OP_DEC, FETCH_OP_GDS_DEC },
{DS_OP_MIN_INT, FETCH_OP_GDS_MIN_INT },
{DS_OP_MAX_INT, FETCH_OP_GDS_MAX_INT },
{DS_OP_MIN_UINT, FETCH_OP_GDS_MIN_UINT },
{DS_OP_MAX_UINT, FETCH_OP_GDS_MAX_UINT },
{DS_OP_AND, FETCH_OP_GDS_AND },
{DS_OP_OR, FETCH_OP_GDS_OR },
{DS_OP_XOR, FETCH_OP_GDS_XOR },
{DS_OP_MSKOR, FETCH_OP_GDS_MSKOR },
{DS_OP_WRITE, FETCH_OP_GDS_WRITE },
{DS_OP_WRITE_REL, FETCH_OP_GDS_WRITE_REL },
{DS_OP_WRITE2, FETCH_OP_GDS_WRITE2 },
{DS_OP_CMP_STORE, FETCH_OP_GDS_CMP_STORE },
{DS_OP_CMP_STORE_SPF, FETCH_OP_GDS_CMP_STORE_SPF },
{DS_OP_BYTE_WRITE, FETCH_OP_GDS_BYTE_WRITE },
{DS_OP_SHORT_WRITE, FETCH_OP_GDS_SHORT_WRITE },
{DS_OP_ADD_RET, FETCH_OP_GDS_ADD_RET },
{DS_OP_SUB_RET, FETCH_OP_GDS_SUB_RET },
{DS_OP_RSUB_RET, FETCH_OP_GDS_RSUB_RET },
{DS_OP_INC_RET, FETCH_OP_GDS_INC_RET },
{DS_OP_DEC_RET, FETCH_OP_GDS_DEC_RET },
{DS_OP_MIN_INT_RET, FETCH_OP_GDS_MIN_INT_RET },
{DS_OP_MAX_INT_RET, FETCH_OP_GDS_MAX_INT_RET },
{DS_OP_MIN_UINT_RET, FETCH_OP_GDS_MIN_UINT_RET },
{DS_OP_MAX_UINT_RET, FETCH_OP_GDS_MAX_UINT_RET },
{DS_OP_AND_RET, FETCH_OP_GDS_AND_RET },
{DS_OP_OR_RET, FETCH_OP_GDS_OR_RET },
{DS_OP_XOR_RET, FETCH_OP_GDS_XOR_RET },
{DS_OP_MSKOR_RET, FETCH_OP_GDS_MSKOR_RET },
{DS_OP_XCHG_RET, FETCH_OP_GDS_XCHG_RET },
{DS_OP_XCHG_REL_RET, FETCH_OP_GDS_XCHG_REL_RET },
{DS_OP_XCHG2_RET, FETCH_OP_GDS_XCHG2_RET },
{DS_OP_CMP_XCHG_RET, FETCH_OP_GDS_CMP_XCHG_RET },
{DS_OP_CMP_XCHG_SPF_RET, FETCH_OP_GDS_CMP_XCHG_SPF_RET },
{DS_OP_READ_RET, FETCH_OP_GDS_READ_RET },
{DS_OP_READ_REL_RET, FETCH_OP_GDS_READ_REL_RET },
{DS_OP_READ2_RET, FETCH_OP_GDS_READ2_RET },
{DS_OP_READWRITE_RET, FETCH_OP_GDS_READWRITE_RET },
{DS_OP_BYTE_READ_RET, FETCH_OP_GDS_BYTE_READ_RET },
{DS_OP_UBYTE_READ_RET, FETCH_OP_GDS_UBYTE_READ_RET },
{DS_OP_SHORT_READ_RET, FETCH_OP_GDS_SHORT_READ_RET },
{DS_OP_USHORT_READ_RET, FETCH_OP_GDS_USHORT_READ_RET },
{DS_OP_ATOMIC_ORDERED_ALLOC_RET, FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC},
{DS_OP_INVALID, 0 },
};
} // namespace r600