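aco: use VOP2 version of v_cvt_pkrtz_f16_f32 on GFX_6_7_10

On GFX6, GFX7 and GFX10, v_cvt_pkrtz_f16_f32 has a VOP2 encoding
(opcode 0x2f), which is shorter than the VOP3 encoding that was emitted
before. The VOP3 form is kept under the name v_cvt_pkrtz_f16_f32_e64 and
is still used on GFX8/GFX9, which have no VOP2 opcode for it.
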
Totals from 767 (0.56% of 136546) affected shaders (NAVI):
CodeSize: 2862208 -> 2850036 (-0.43%)
Instrs: 561572 -> 561574 (+0.00%)
Cycles: 6455420 -> 6455428 (+0.00%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6777>
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index fb989eb..a7745d4 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2141,7 +2141,10 @@
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 64)
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
- bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
+ if (ctx->block->fp_mode.round16_64 == fp_round_tz)
+ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
+ else
+ bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand(0u));
break;
}
case nir_op_f2f32: {
@@ -2615,7 +2618,10 @@
/* upper bits zero on GFX6-GFX9 */
bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else if (!ctx->block->fp_mode.care_about_round16_64 || ctx->block->fp_mode.round16_64 == fp_round_tz) {
- emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst);
+ if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
+ emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
+ else
+ emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
} else {
Temp src0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
Temp src1 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
@@ -10343,7 +10349,7 @@
case V_028714_SPI_SHADER_FP16_ABGR:
enabled_channels = 0x5;
- compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+ compr_op = aco_opcode::v_cvt_pkrtz_f16_f32_e64;
if (is_16bit) {
if (ctx->options->chip_class >= GFX9) {
/* Pack the FP16 values together instead of converting them to
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 22daeff..97be2ad 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -682,6 +682,7 @@
( -1, -1, -1, -1, 0x2b, "v_fmac_f32", True),
( -1, -1, -1, -1, 0x2c, "v_fmamk_f32", True),
( -1, -1, -1, -1, 0x2d, "v_fmaak_f32", True),
+ (0x2f, 0x2f, -1, -1, 0x2f, "v_cvt_pkrtz_f16_f32", True),
( -1, -1, 0x1f, 0x1f, 0x32, "v_add_f16", True),
( -1, -1, 0x20, 0x20, 0x33, "v_sub_f16", True),
( -1, -1, 0x21, 0x21, 0x34, "v_subrev_f16", True),
@@ -1051,7 +1052,7 @@
(0x11e, 0x11e, 0x293, 0x293, 0x363, "v_bfm_b32", False, False),
(0x12d, 0x12d, 0x294, 0x294, 0x368, "v_cvt_pknorm_i16_f32", True, False),
(0x12e, 0x12e, 0x295, 0x295, 0x369, "v_cvt_pknorm_u16_f32", True, False),
- (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32", True, False), # GFX6_7_10 is VOP2 with opcode 0x02f
+ (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32_e64", True, False), # GFX6_7_10 is VOP2 with opcode 0x02f
(0x130, 0x130, 0x297, 0x297, 0x36a, "v_cvt_pk_u16_u32", False, False),
(0x131, 0x131, 0x298, 0x298, 0x36b, "v_cvt_pk_i16_i32", False, False),
( -1, -1, -1, 0x299, 0x312, "v_cvt_pknorm_i16_f16", True, False),