aco: use Builder::copy more
fossil-db (Navi):
Totals from 6973 (5.07% of 137413) affected shaders:
SGPRs: 381768 -> 381776 (+0.00%)
VGPRs: 306092 -> 306096 (+0.00%); split: -0.00%, +0.00%
CodeSize: 24440844 -> 24421196 (-0.08%); split: -0.09%, +0.01%
MaxWaves: 86581 -> 86583 (+0.00%)
Instrs: 4682161 -> 4679578 (-0.06%); split: -0.06%, +0.00%
Cycles: 68793116 -> 68261648 (-0.77%); split: -0.83%, +0.05%
fossil-db (Polaris):
Totals from 8154 (5.87% of 138881) affected shaders:
VGPRs: 338916 -> 338920 (+0.00%); split: -0.00%, +0.00%
CodeSize: 23540428 -> 23540488 (+0.00%); split: -0.00%, +0.00%
MaxWaves: 49090 -> 49091 (+0.00%)
Instrs: 4576085 -> 4576101 (+0.00%); split: -0.00%, +0.00%
Cycles: 51720704 -> 51720888 (+0.00%); split: -0.00%, +0.00%
Most of the Navi cycle/instruction improvements are in 8/16-bit parallel-rdp
shaders. They appear to improve because the p_create_vector emitted by
lower_subdword_phis() was blocking constant propagation.
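For illustration only (a sketch, not part of the patch), the idiom being
adopted looks roughly like this: Builder::copy emits a generic copy
(p_parallelcopy) and leaves the choice of the concrete move
(s_mov_b32/s_movk_i32, s_mov_b64, v_mov_b32, ...) to later lowering, based on
the definition's register class, instead of the caller hard-coding it:

    Builder bld(program, block);
    /* sketch: generic copies, lowered to moves in lower_to_hw_instr */
    Temp szero = bld.copy(bld.def(s1), Operand(0u));            /* -> s_mov_b32 */
    Temp vone  = bld.copy(bld.def(v1), Operand(0x3FF00000u));   /* -> v_mov_b32 */
    bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2)); /* wave32/wave64 aware */

Keeping these as generic copies until lowering also keeps them visible to the
optimizer's copy/constant propagation, which, per the note above, is where the
parallel-rdp wins come from.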
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7216>
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 30c408a..638157c 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -381,7 +381,7 @@
/* exec seems to need to be manually initialized with combined shaders */
if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
- bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+ bld.copy(bld.exec(Definition(exec_mask)), Operand(UINT32_MAX, bld.lm == s2));
instructions[0]->definitions.pop_back();
}
@@ -653,15 +653,15 @@
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
- Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
+ Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));
if (offset.isLiteral())
- offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
+ offset = bld.copy(bld.def(s1), offset);
offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
offset, Operand(UINT32_MAX), bld.scc(nonempty));
} else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
- offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset);
+ offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
}
if (!offset.isConstant())
offset.setFixed(m0);
@@ -1076,7 +1076,7 @@
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
- ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
+ ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1105,7 +1105,7 @@
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
- ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
+ ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 4de0fe6..6b82f82 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -279,7 +279,7 @@
bool post_shift = info.post_shift != 0;
if (!pre_shift && !increment && !multiply && !post_shift) {
- bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
+ bld.copy(Definition(dst), a);
return;
}
@@ -299,7 +299,7 @@
if (multiply) {
multiply_dst = post_shift ? bld.tmp(v1) : dst;
bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
- bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
+ bld.copy(bld.def(v1), Operand((uint32_t)info.multiplier)));
}
if (post_shift) {
@@ -1007,7 +1007,7 @@
then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
if (cond.id() == els.id())
- bld.sop1(Builder::s_mov, Definition(dst), then);
+ bld.copy(Definition(dst), then);
else
bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
@@ -1200,26 +1200,12 @@
}
case nir_op_mov: {
Temp src = get_alu_src(ctx, instr->src[0]);
- aco_ptr<Instruction> mov;
- if (dst.type() == RegType::sgpr) {
- if (src.type() == RegType::vgpr)
- bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
- else if (src.regClass() == s1)
- bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
- else if (src.regClass() == s2)
- bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
- else
- unreachable("wrong src register class for nir_op_imov");
- } else {
- if (dst.regClass() == v1)
- bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
- else if (dst.regClass() == v1b ||
- dst.regClass() == v2b ||
- dst.regClass() == v2)
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
- else
- unreachable("wrong src register class for nir_op_imov");
- }
+ if (src.bytes() != dst.bytes())
+ unreachable("wrong src or dst register class for nir_op_mov");
+ if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr)
+ bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+ else
+ bld.copy(Definition(dst), src);
break;
}
case nir_op_inot: {
@@ -2113,11 +2099,11 @@
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
} else if (dst.regClass() == v2) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
- Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+ Temp tmp = bld.copy(bld.def(v1), Operand(0x3FF00000u));
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
- tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
+ tmp = bld.copy(bld.def(v1), Operand(0xBFF00000u));
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
@@ -2492,7 +2478,7 @@
src = bool_to_scalar_condition(ctx, src);
bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
} else if (dst.regClass() == v2) {
- Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+ Temp one = bld.copy(bld.def(v1), Operand(0x3FF00000u));
Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
} else {
@@ -2902,13 +2888,12 @@
assert(dst.regClass() == bld.lm);
int val = instr->value[0].b ? -1 : 0;
Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
- bld.sop1(Builder::s_mov, Definition(dst), op);
+ bld.copy(Definition(dst), op);
} else if (instr->def.bit_size == 8) {
- /* ensure that the value is correctly represented in the low byte of the register */
- bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
+ bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].u8));
} else if (instr->def.bit_size == 16) {
- /* ensure that the value is correctly represented in the low half of the register */
- bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
+ /* sign-extend to use s_movk_i32 instead of a literal */
+ bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].i16));
} else if (dst.size() == 1) {
bld.copy(Definition(dst), Operand(instr->value[0].u32));
} else {
@@ -3209,7 +3194,7 @@
Operand load_lds_size_m0(Builder& bld)
{
/* TODO: m0 does not need to be initialized on GFX9+ */
- return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
+ return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0xffffffffu)));
}
Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
@@ -4601,7 +4586,7 @@
index = bld.vadd32(bld.def(v1), start_instance, instance_id);
}
} else {
- index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
+ index = bld.copy(bld.def(v1), start_instance);
}
} else {
index = bld.vadd32(bld.def(v1),
@@ -5582,7 +5567,7 @@
bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
- Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
+ Temp sample_index_v = bld.copy(bld.def(v1), sample_index);
/* Replace the MSAA sample index. */
return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
@@ -5985,7 +5970,7 @@
/* LOD */
assert(nir_src_as_uint(instr->src[1]) == 0);
- Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+ Temp lod = bld.copy(bld.def(v1), Operand(0u));
/* Resource */
Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
@@ -6802,7 +6787,7 @@
Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
- Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
+ Temp ps_iter_mask = bld.copy(bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
@@ -7167,15 +7152,10 @@
Builder bld(ctx->program, ctx->block);
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
assert(dst.regClass().type() != RegType::vgpr);
- if (src.regClass().type() == RegType::vgpr) {
+ if (src.regClass().type() == RegType::vgpr)
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
- } else if (src.regClass() == s1) {
- bld.sop1(aco_opcode::s_mov_b32, dst, src);
- } else if (src.regClass() == s2) {
- bld.sop1(aco_opcode::s_mov_b64, dst, src);
- } else {
- isel_err(&instr->instr, "Unimplemented NIR instr bit size");
- }
+ else
+ bld.copy(dst, src);
}
void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, nir_src src, Temp count)
@@ -7768,7 +7748,7 @@
bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
} else {
- bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
}
break;
}
@@ -7781,7 +7761,7 @@
bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
get_arg(ctx, ctx->args->ac.tg_size));
else
- bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
break;
}
case nir_intrinsic_ballot: {
@@ -7882,12 +7862,8 @@
Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
- } else if (src.regClass() == s1) {
- bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
- } else if (src.regClass() == s2) {
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
} else {
- isel_err(&instr->instr, "Unimplemented NIR instr bit size");
+ bld.copy(Definition(dst), src);
}
break;
}
@@ -8705,7 +8681,7 @@
pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
}
if (pack_const && pack == Temp())
- offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
+ offset = bld.copy(bld.def(v1), Operand(pack_const));
else if (pack == Temp())
has_offset = false;
else
@@ -8809,7 +8785,7 @@
aco_ptr<MIMG_instruction> tex;
if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
if (!has_lod)
- lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+ lod = bld.copy(bld.def(v1), Operand(0u));
bool div_by_6 = instr->op == nir_texop_txs &&
instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
@@ -8861,7 +8837,7 @@
tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
tex->operands[0] = Operand(resource);
tex->operands[1] = Operand(s4); /* no sampler */
- tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+ tex->operands[2] = bld.copy(bld.def(v1), Operand(0u));
tex->dim = dim;
tex->dmask = 0x3;
tex->da = da;
@@ -9259,10 +9235,8 @@
if (num_defined == 0) {
Builder bld(ctx->program, ctx->block);
- if (dst.regClass() == s1) {
- bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
- } else if (dst.regClass() == v1) {
- bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
+ if (dst.bytes() == 4) {
+ bld.copy(Definition(dst), Operand(0u));
} else {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
for (unsigned i = 0; i < dst.size(); i++)
@@ -11385,7 +11359,7 @@
bld.reset(ctx->block);
Temp gds_addr = bld.copy(bld.def(v1), Operand(0u));
- Operand m = bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0x100));
+ Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u)));
bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true);
begin_divergent_if_else(ctx, &ic_last_lane);
diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp
index f21d954..42e2e89 100644
--- a/src/amd/compiler/aco_lower_phis.cpp
+++ b/src/amd/compiler/aco_lower_phis.cpp
@@ -128,7 +128,7 @@
bld.reset(&block->instructions, std::prev(it.base()));
if (prev.isUndefined()) {
- bld.sop1(Builder::s_mov, dst, cur);
+ bld.copy(dst, cur);
return;
}
@@ -150,16 +150,16 @@
if (!cur_is_constant)
bld.sop2(Builder::s_orn2, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm));
else if (cur.constantValue64(true))
- bld.sop1(Builder::s_mov, dst, program->wave_size == 64 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+ bld.copy(dst, Operand(UINT32_MAX, bld.lm == s2));
else
bld.sop1(Builder::s_not, dst, bld.def(s1, scc), Operand(exec, bld.lm));
} else {
if (!cur_is_constant)
bld.sop2(Builder::s_and, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm));
else if (cur.constantValue64(true))
- bld.sop1(Builder::s_mov, dst, Operand(exec, bld.lm));
+ bld.copy(dst, Operand(exec, bld.lm));
else
- bld.sop1(Builder::s_mov, dst, program->wave_size == 64 ? Operand((uint64_t)0u) : Operand(0u));
+ bld.copy(dst, Operand(0u, bld.lm == s2));
}
}
@@ -266,7 +266,7 @@
assert(phi_src.regClass().type() == RegType::sgpr);
Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size()));
- insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), phi_src).get_ptr());
+ insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr());
Temp new_phi_src = bld.tmp(phi->definitions[0].regClass());
insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr());
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 3c4898e..8c6766b 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1003,9 +1003,10 @@
}
}
+ if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
+ op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
+
if (dst.regClass() == s1) {
- if (op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
- op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
bld.sop1(aco_opcode::s_mov_b32, dst, op);
} else if (dst.regClass() == s2) {
bld.sop1(aco_opcode::s_mov_b64, dst, op);
@@ -1066,8 +1067,14 @@
if (def.physReg() == scc) {
bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
*preserve_scc = true;
- } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
- bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
+ } else if (op.isConstant()) {
+ copy_constant(ctx, bld, def, op);
+ } else if (def.regClass() == v1) {
+ bld.vop1(aco_opcode::v_mov_b32, def, op);
+ } else if (def.regClass() == s1) {
+ bld.sop1(aco_opcode::s_mov_b32, def, op);
+ } else if (def.regClass() == s2) {
+ bld.sop1(aco_opcode::s_mov_b64, def, op);
} else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
if (op.physReg().byte()) {
assert(def.physReg().byte() == 0);
@@ -1098,14 +1105,6 @@
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);
}
- } else if (op.isConstant()) {
- copy_constant(ctx, bld, def, op);
- } else if (def.regClass() == v1) {
- bld.vop1(aco_opcode::v_mov_b32, def, op);
- } else if (def.regClass() == s1) {
- bld.sop1(aco_opcode::s_mov_b32, def, op);
- } else if (def.regClass() == s2) {
- bld.sop1(aco_opcode::s_mov_b64, def, op);
} else if (def.regClass().is_subdword()) {
bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
} else {