radv: move all NIR passes outside of ACO
This has several advantages:
- it generates roughly the same NIR for both compiler backends
(this might help for debugging purposes)
- it might allow moving some NIR passes around to improve compile times
- it might help with RadeonSI support
- it improves fossil-db stats for RADV/LLVM (this shouldn't matter
much, but it's a win for free)
fossil-db (Navi/LLVM):
Totals from 80732 (59.18% of 136420) affected shaders:
SGPRs: 5390036 -> 5382843 (-0.13%); split: -3.38%, +3.24%
VGPRs: 3910932 -> 3890320 (-0.53%); split: -2.38%, +1.85%
SpillSGPRs: 319212 -> 283149 (-11.30%); split: -17.69%, +6.39%
SpillVGPRs: 14668 -> 14324 (-2.35%); split: -7.53%, +5.18%
CodeSize: 265360860 -> 267572132 (+0.83%); split: -0.47%, +1.30%
Scratch: 5338112 -> 6134784 (+14.92%); split: -2.65%, +17.57%
MaxWaves: 1077230 -> 1086902 (+0.90%); split: +2.79%, -1.90%
No fossil-db changes on RADV/ACO.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7077>
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 9c2c567..cc5b78f 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -377,49 +377,6 @@
return RegClass::get(type, components * bitsize / 8u);
}
-bool
-mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
- unsigned bit_size,
- unsigned num_components,
- nir_intrinsic_instr *low, nir_intrinsic_instr *high)
-{
- if (num_components > 4)
- return false;
-
- /* >128 bit loads are split except with SMEM */
- if (bit_size * num_components > 128)
- return false;
-
- uint32_t align;
- if (align_offset)
- align = 1 << (ffs(align_offset) - 1);
- else
- align = align_mul;
-
- switch (low->intrinsic) {
- case nir_intrinsic_load_global:
- case nir_intrinsic_store_global:
- case nir_intrinsic_store_ssbo:
- case nir_intrinsic_load_ssbo:
- case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_push_constant:
- return align % (bit_size == 8 ? 2 : 4) == 0;
- case nir_intrinsic_load_deref:
- case nir_intrinsic_store_deref:
- assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
- /* fallthrough */
- case nir_intrinsic_load_shared:
- case nir_intrinsic_store_shared:
- if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
- return align % 16 == 0;
- else
- return align % (bit_size == 8 ? 2 : 4) == 0;
- default:
- return false;
- }
- return false;
-}
-
void
setup_vs_output_info(isel_context *ctx, nir_shader *nir,
bool export_prim_id, bool export_clip_dists,
@@ -633,108 +590,12 @@
}
}
-unsigned
-lower_bit_size_callback(const nir_alu_instr *alu, void *_)
-{
- if (nir_op_is_vec(alu->op))
- return 0;
-
- unsigned bit_size = alu->dest.dest.ssa.bit_size;
- if (nir_alu_instr_is_comparison(alu))
- bit_size = nir_src_bit_size(alu->src[0].src);
-
- if (bit_size >= 32 || bit_size == 1)
- return 0;
-
- if (alu->op == nir_op_bcsel)
- return 0;
-
- const nir_op_info *info = &nir_op_infos[alu->op];
-
- if (info->is_conversion)
- return 0;
-
- bool is_integer = info->output_type & (nir_type_uint | nir_type_int);
- for (unsigned i = 0; is_integer && (i < info->num_inputs); i++)
- is_integer = info->input_types[i] & (nir_type_uint | nir_type_int);
-
- return is_integer ? 32 : 0;
-}
-
void
setup_nir(isel_context *ctx, nir_shader *nir)
{
/* the variable setup has to be done before lower_io / CSE */
setup_variables(ctx, nir);
- bool lower_to_scalar = false;
- bool lower_pack = false;
- nir_variable_mode robust_modes = (nir_variable_mode)0;
-
- if (ctx->options->robust_buffer_access) {
- robust_modes = nir_var_mem_ubo |
- nir_var_mem_ssbo |
- nir_var_mem_global |
- nir_var_mem_push_const;
- }
-
- if (nir_opt_load_store_vectorize(nir,
- nir_var_mem_ssbo | nir_var_mem_ubo |
- nir_var_mem_push_const | nir_var_mem_shared |
- nir_var_mem_global,
- mem_vectorize_callback, robust_modes)) {
- lower_to_scalar = true;
- lower_pack = true;
- }
-
- lower_to_scalar |= nir_opt_shrink_vectors(nir);
-
- if (lower_to_scalar)
- nir_lower_alu_to_scalar(nir, NULL, NULL);
- if (lower_pack)
- nir_lower_pack(nir);
-
- /* lower ALU operations */
- nir_lower_int64(nir);
-
- if (nir_lower_bit_size(nir, lower_bit_size_callback, NULL))
- nir_copy_prop(nir); /* allow nir_opt_idiv_const() to optimize lowered divisions */
-
- nir_opt_idiv_const(nir, 32);
- nir_lower_idiv(nir, nir_lower_idiv_precise);
-
- /* optimize the lowered ALU operations */
- bool more_algebraic = true;
- while (more_algebraic) {
- more_algebraic = false;
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_dce);
- NIR_PASS_V(nir, nir_opt_constant_folding);
- NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
- }
-
- /* Do late algebraic optimization to turn add(a, neg(b)) back into
- * subs, then the mandatory cleanup after algebraic. Note that it may
- * produce fnegs, and if so then we need to keep running to squash
- * fneg(fneg(a)).
- */
- bool more_late_algebraic = true;
- while (more_late_algebraic) {
- more_late_algebraic = false;
- NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
- NIR_PASS_V(nir, nir_opt_constant_folding);
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_dce);
- NIR_PASS_V(nir, nir_opt_cse);
- }
-
- /* cleanup passes */
- nir_lower_load_const_to_scalar(nir);
- nir_move_options move_opts = (nir_move_options)(
- nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
- nir_move_comparisons | nir_move_copies);
- nir_opt_sink(nir, move_opts);
- nir_opt_move(nir, move_opts);
nir_convert_to_lcssa(nir, true, false);
nir_lower_phis_to_scalar(nir);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 9751ff0..9efb7db 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2902,6 +2902,77 @@
(cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
}
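+/* Callback for nir_opt_load_store_vectorize(): decides whether two adjacent
+ * memory accesses may be merged into one, based on the combined size and the
+ * alignment requirements of the access type.
+ */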
+static bool
+mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
+ unsigned bit_size,
+ unsigned num_components,
+ nir_intrinsic_instr *low, nir_intrinsic_instr *high)
+{
+ if (num_components > 4)
+ return false;
+
+ /* >128 bit loads are split except with SMEM */
+ if (bit_size * num_components > 128)
+ return false;
+
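+ /* NIR guarantees the address is congruent to align_offset modulo
+ * align_mul, so the effective alignment is the lowest set bit of
+ * align_offset (or align_mul itself when the offset is zero). */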
+ uint32_t align;
+ if (align_offset)
+ align = 1 << (ffs(align_offset) - 1);
+ else
+ align = align_mul;
+
+ switch (low->intrinsic) {
+ case nir_intrinsic_load_global:
+ case nir_intrinsic_store_global:
+ case nir_intrinsic_store_ssbo:
+ case nir_intrinsic_load_ssbo:
+ case nir_intrinsic_load_ubo:
+ case nir_intrinsic_load_push_constant:
+ return align % (bit_size == 8 ? 2 : 4) == 0;
+ case nir_intrinsic_load_deref:
+ case nir_intrinsic_store_deref:
+ assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
+ /* fallthrough */
+ case nir_intrinsic_load_shared:
+ case nir_intrinsic_store_shared:
+ if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
+ return align % 16 == 0;
+ else
+ return align % (bit_size == 8 ? 2 : 4) == 0;
+ default:
+ return false;
+ }
+ return false;
+}
+
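+/* Callback for nir_lower_bit_size(): sub-32-bit integer ALU operations are
+ * lowered to 32 bits; returning 0 leaves the instruction untouched.
+ */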
+static unsigned
+lower_bit_size_callback(const nir_alu_instr *alu, void *_)
+{
+ if (nir_op_is_vec(alu->op))
+ return 0;
+
+ unsigned bit_size = alu->dest.dest.ssa.bit_size;
+ if (nir_alu_instr_is_comparison(alu))
+ bit_size = nir_src_bit_size(alu->src[0].src);
+
+ if (bit_size >= 32 || bit_size == 1)
+ return 0;
+
+ if (alu->op == nir_op_bcsel)
+ return 0;
+
+ const nir_op_info *info = &nir_op_infos[alu->op];
+
+ if (info->is_conversion)
+ return 0;
+
+ bool is_integer = info->output_type & (nir_type_uint | nir_type_int);
+ for (unsigned i = 0; is_integer && (i < info->num_inputs); i++)
+ is_integer = info->input_types[i] & (nir_type_uint | nir_type_int);
+
+ return is_integer ? 32 : 0;
+}
+
VkResult radv_create_shaders(struct radv_pipeline *pipeline,
struct radv_device *device,
struct radv_pipeline_cache *cache,
@@ -3029,6 +3100,80 @@
NIR_PASS_V(nir[i], nir_lower_memory_model);
radv_lower_io(device, nir[i]);
+
+ bool lower_to_scalar = false;
+ bool lower_pack = false;
+ nir_variable_mode robust_modes = (nir_variable_mode)0;
+
+ if (device->robust_buffer_access) {
+ robust_modes = nir_var_mem_ubo |
+ nir_var_mem_ssbo |
+ nir_var_mem_global |
+ nir_var_mem_push_const;
+ }
+
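+ /* Vectorizing loads/stores may produce vectors and pack operations
+ * that the backends do not handle directly, so re-scalarize and lower
+ * packing below if the pass made progress. */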
+ if (nir_opt_load_store_vectorize(nir[i],
+ nir_var_mem_ssbo | nir_var_mem_ubo |
+ nir_var_mem_push_const | nir_var_mem_shared |
+ nir_var_mem_global,
+ mem_vectorize_callback, robust_modes)) {
+ lower_to_scalar = true;
+ lower_pack = true;
+ }
+
+ lower_to_scalar |= nir_opt_shrink_vectors(nir[i]);
+
+ if (lower_to_scalar)
+ nir_lower_alu_to_scalar(nir[i], NULL, NULL);
+ if (lower_pack)
+ nir_lower_pack(nir[i]);
+
+ /* lower ALU operations */
+ /* TODO: Some 64-bit tests crash inside LLVM. */
+ if (!radv_use_llvm_for_stage(device, i))
+ nir_lower_int64(nir[i]);
+
+ if (nir_lower_bit_size(nir[i], lower_bit_size_callback, NULL))
+ nir_copy_prop(nir[i]); /* allow nir_opt_idiv_const() to optimize lowered divisions */
+
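+ /* nir_opt_idiv_const() replaces integer division by constants with
+ * multiply/shift sequences; nir_lower_idiv() lowers whatever dynamic
+ * divisions remain. */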
+ /* TODO: Implement nir_op_uadd_sat with LLVM. */
+ if (!radv_use_llvm_for_stage(device, i))
+ nir_opt_idiv_const(nir[i], 32);
+ nir_lower_idiv(nir[i], nir_lower_idiv_precise);
+
+ /* optimize the lowered ALU operations */
+ bool more_algebraic = true;
+ while (more_algebraic) {
+ more_algebraic = false;
+ NIR_PASS_V(nir[i], nir_copy_prop);
+ NIR_PASS_V(nir[i], nir_opt_dce);
+ NIR_PASS_V(nir[i], nir_opt_constant_folding);
+ NIR_PASS(more_algebraic, nir[i], nir_opt_algebraic);
+ }
+
+ /* Do late algebraic optimization to turn add(a,
+ * neg(b)) back into subs, then the mandatory cleanup
+ * after algebraic. Note that it may produce fnegs,
+ * and if so then we need to keep running to squash
+ * fneg(fneg(a)).
+ */
+ bool more_late_algebraic = true;
+ while (more_late_algebraic) {
+ more_late_algebraic = false;
+ NIR_PASS(more_late_algebraic, nir[i], nir_opt_algebraic_late);
+ NIR_PASS_V(nir[i], nir_opt_constant_folding);
+ NIR_PASS_V(nir[i], nir_copy_prop);
+ NIR_PASS_V(nir[i], nir_opt_dce);
+ NIR_PASS_V(nir[i], nir_opt_cse);
+ }
+
+ /* cleanup passes */
+ nir_lower_load_const_to_scalar(nir[i]);
+ nir_move_options move_opts = (nir_move_options)(
+ nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
+ nir_move_comparisons | nir_move_copies);
+ nir_opt_sink(nir[i], move_opts);
+ nir_opt_move(nir[i], move_opts);
}
}