radv: move all NIR passes out of ACO

This has several advantages:
- it generates roughly the same NIR for both compiler backends
  (which might help with debugging)
- it might allow moving some NIR passes around to improve compile time
- it might help with RadeonSI support
- it improves fossil-db stats for RADV/LLVM (this shouldn't matter
  much, but it's a win for free)

fossil-db (Navi/LLVM):
Totals from 80732 (59.18% of 136420) affected shaders:
SGPRs: 5390036 -> 5382843 (-0.13%); split: -3.38%, +3.24%
VGPRs: 3910932 -> 3890320 (-0.53%); split: -2.38%, +1.85%
SpillSGPRs: 319212 -> 283149 (-11.30%); split: -17.69%, +6.39%
SpillVGPRs: 14668 -> 14324 (-2.35%); split: -7.53%, +5.18%
CodeSize: 265360860 -> 267572132 (+0.83%); split: -0.47%, +1.30%
Scratch: 5338112 -> 6134784 (+14.92%); split: -2.65%, +17.57%
MaxWaves: 1077230 -> 1086902 (+0.90%); split: +2.79%, -1.90%

No fossil-db changes for RADV/ACO.
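
For reference, the alignment handling in mem_vectorize_callback (moved
verbatim in the diff below) can be re-expressed as a minimal standalone
sketch. effective_align and shared_mem_vectorize_ok are hypothetical
names, but the logic mirrors the shared-memory case of the callback:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <strings.h> /* ffs() */

static uint32_t
effective_align(uint32_t align_mul, uint32_t align_offset)
{
   /* With a non-zero offset into the align_mul block, the access is
    * only guaranteed to be aligned to the lowest set bit of
    * align_offset; otherwise align_mul itself applies. */
   return align_offset ? 1u << (ffs(align_offset) - 1) : align_mul;
}

static bool
shared_mem_vectorize_ok(uint32_t align_mul, uint32_t align_offset,
                        unsigned bit_size, unsigned num_components)
{
   uint32_t align = effective_align(align_mul, align_offset);

   /* 96/128-bit shared-memory accesses need 16-byte alignment;
    * smaller ones need 4 bytes (2 bytes for 8-bit accesses). */
   if (bit_size * num_components > 64)
      return align % 16 == 0;
   return align % (bit_size == 8 ? 2 : 4) == 0;
}

int main(void)
{
   assert(effective_align(16, 4) == 4);           /* offset limits alignment */
   assert(shared_mem_vectorize_ok(16, 0, 32, 4)); /* vec4 32-bit, 16-byte aligned */
   assert(!shared_mem_vectorize_ok(8, 0, 32, 4)); /* only 8-byte aligned: split */
   assert(shared_mem_vectorize_ok(4, 0, 32, 2));  /* 64-bit, 4-byte aligned: OK */
   return 0;
}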

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7077>
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 9c2c567..cc5b78f 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -377,49 +377,6 @@
       return RegClass::get(type, components * bitsize / 8u);
 }
 
-bool
-mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
-                       unsigned bit_size,
-                       unsigned num_components,
-                       nir_intrinsic_instr *low, nir_intrinsic_instr *high)
-{
-   if (num_components > 4)
-      return false;
-
-   /* >128 bit loads are split except with SMEM */
-   if (bit_size * num_components > 128)
-      return false;
-
-   uint32_t align;
-   if (align_offset)
-      align = 1 << (ffs(align_offset) - 1);
-   else
-      align = align_mul;
-
-   switch (low->intrinsic) {
-   case nir_intrinsic_load_global:
-   case nir_intrinsic_store_global:
-   case nir_intrinsic_store_ssbo:
-   case nir_intrinsic_load_ssbo:
-   case nir_intrinsic_load_ubo:
-   case nir_intrinsic_load_push_constant:
-      return align % (bit_size == 8 ? 2 : 4) == 0;
-   case nir_intrinsic_load_deref:
-   case nir_intrinsic_store_deref:
-      assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
-      /* fallthrough */
-   case nir_intrinsic_load_shared:
-   case nir_intrinsic_store_shared:
-      if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
-         return align % 16 == 0;
-      else
-         return align % (bit_size == 8 ? 2 : 4) == 0;
-   default:
-      return false;
-   }
-   return false;
-}
-
 void
 setup_vs_output_info(isel_context *ctx, nir_shader *nir,
                      bool export_prim_id, bool export_clip_dists,
@@ -633,108 +590,12 @@
    }
 }
 
-unsigned
-lower_bit_size_callback(const nir_alu_instr *alu, void *_)
-{
-   if (nir_op_is_vec(alu->op))
-      return 0;
-
-   unsigned bit_size = alu->dest.dest.ssa.bit_size;
-   if (nir_alu_instr_is_comparison(alu))
-      bit_size = nir_src_bit_size(alu->src[0].src);
-
-   if (bit_size >= 32 || bit_size == 1)
-      return 0;
-
-   if (alu->op == nir_op_bcsel)
-      return 0;
-
-   const nir_op_info *info = &nir_op_infos[alu->op];
-
-   if (info->is_conversion)
-      return 0;
-
-   bool is_integer = info->output_type & (nir_type_uint | nir_type_int);
-   for (unsigned i = 0; is_integer && (i < info->num_inputs); i++)
-      is_integer = info->input_types[i] & (nir_type_uint | nir_type_int);
-
-   return is_integer ? 32 : 0;
-}
-
 void
 setup_nir(isel_context *ctx, nir_shader *nir)
 {
    /* the variable setup has to be done before lower_io / CSE */
    setup_variables(ctx, nir);
 
-   bool lower_to_scalar = false;
-   bool lower_pack = false;
-   nir_variable_mode robust_modes = (nir_variable_mode)0;
-
-   if (ctx->options->robust_buffer_access) {
-      robust_modes = nir_var_mem_ubo |
-                     nir_var_mem_ssbo |
-                     nir_var_mem_global |
-                     nir_var_mem_push_const;
-   }
-
-   if (nir_opt_load_store_vectorize(nir,
-                                    nir_var_mem_ssbo | nir_var_mem_ubo |
-                                    nir_var_mem_push_const | nir_var_mem_shared |
-                                    nir_var_mem_global,
-                                    mem_vectorize_callback, robust_modes)) {
-      lower_to_scalar = true;
-      lower_pack = true;
-   }
-
-   lower_to_scalar |= nir_opt_shrink_vectors(nir);
-
-   if (lower_to_scalar)
-      nir_lower_alu_to_scalar(nir, NULL, NULL);
-   if (lower_pack)
-      nir_lower_pack(nir);
-
-   /* lower ALU operations */
-   nir_lower_int64(nir);
-
-   if (nir_lower_bit_size(nir, lower_bit_size_callback, NULL))
-      nir_copy_prop(nir); /* allow nir_opt_idiv_const() to optimize lowered divisions */
-
-   nir_opt_idiv_const(nir, 32);
-   nir_lower_idiv(nir, nir_lower_idiv_precise);
-
-   /* optimize the lowered ALU operations */
-   bool more_algebraic = true;
-   while (more_algebraic) {
-      more_algebraic = false;
-      NIR_PASS_V(nir, nir_copy_prop);
-      NIR_PASS_V(nir, nir_opt_dce);
-      NIR_PASS_V(nir, nir_opt_constant_folding);
-      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
-   }
-
-   /* Do late algebraic optimization to turn add(a, neg(b)) back into
-    * subs, then the mandatory cleanup after algebraic.  Note that it may
-    * produce fnegs, and if so then we need to keep running to squash
-    * fneg(fneg(a)).
-    */
-   bool more_late_algebraic = true;
-   while (more_late_algebraic) {
-      more_late_algebraic = false;
-      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
-      NIR_PASS_V(nir, nir_opt_constant_folding);
-      NIR_PASS_V(nir, nir_copy_prop);
-      NIR_PASS_V(nir, nir_opt_dce);
-      NIR_PASS_V(nir, nir_opt_cse);
-   }
-
-   /* cleanup passes */
-   nir_lower_load_const_to_scalar(nir);
-   nir_move_options move_opts = (nir_move_options)(
-      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
-      nir_move_comparisons | nir_move_copies);
-   nir_opt_sink(nir, move_opts);
-   nir_opt_move(nir, move_opts);
    nir_convert_to_lcssa(nir, true, false);
    nir_lower_phis_to_scalar(nir);
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 9751ff0..9efb7db 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2902,6 +2902,77 @@
 	                   (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
 }
 
+static bool
+mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
+                       unsigned bit_size,
+                       unsigned num_components,
+                       nir_intrinsic_instr *low, nir_intrinsic_instr *high)
+{
+	if (num_components > 4)
+		return false;
+
+	/* >128 bit loads are split except with SMEM */
+	if (bit_size * num_components > 128)
+		return false;
+
+	uint32_t align;
+	if (align_offset)
+		align = 1 << (ffs(align_offset) - 1);
+	else
+		align = align_mul;
+
+	switch (low->intrinsic) {
+	case nir_intrinsic_load_global:
+	case nir_intrinsic_store_global:
+	case nir_intrinsic_store_ssbo:
+	case nir_intrinsic_load_ssbo:
+	case nir_intrinsic_load_ubo:
+	case nir_intrinsic_load_push_constant:
+		return align % (bit_size == 8 ? 2 : 4) == 0;
+	case nir_intrinsic_load_deref:
+	case nir_intrinsic_store_deref:
+		assert(nir_src_as_deref(low->src[0])->mode == nir_var_mem_shared);
+		/* fallthrough */
+	case nir_intrinsic_load_shared:
+	case nir_intrinsic_store_shared:
+		if (bit_size * num_components > 64) /* 96 and 128 bit loads require 128 bit alignment and are split otherwise */
+			return align % 16 == 0;
+		else
+			return align % (bit_size == 8 ? 2 : 4) == 0;
+	default:
+		return false;
+	}
+	return false;
+}
+
+static unsigned
+lower_bit_size_callback(const nir_alu_instr *alu, void *_)
+{
+	if (nir_op_is_vec(alu->op))
+		return 0;
+
+	unsigned bit_size = alu->dest.dest.ssa.bit_size;
+	if (nir_alu_instr_is_comparison(alu))
+		bit_size = nir_src_bit_size(alu->src[0].src);
+
+	if (bit_size >= 32 || bit_size == 1)
+		return 0;
+
+	if (alu->op == nir_op_bcsel)
+		return 0;
+
+	const nir_op_info *info = &nir_op_infos[alu->op];
+
+	if (info->is_conversion)
+		return 0;
+
+	bool is_integer = info->output_type & (nir_type_uint | nir_type_int);
+	for (unsigned i = 0; is_integer && (i < info->num_inputs); i++)
+		is_integer = info->input_types[i] & (nir_type_uint | nir_type_int);
+
+	return is_integer ? 32 : 0;
+}
+
 VkResult radv_create_shaders(struct radv_pipeline *pipeline,
                              struct radv_device *device,
                              struct radv_pipeline_cache *cache,
@@ -3029,6 +3100,80 @@
 			NIR_PASS_V(nir[i], nir_lower_memory_model);
 
 			radv_lower_io(device, nir[i]);
+
+			bool lower_to_scalar = false;
+			bool lower_pack = false;
+			nir_variable_mode robust_modes = (nir_variable_mode)0;
+
+			if (device->robust_buffer_access) {
+				robust_modes = nir_var_mem_ubo |
+					       nir_var_mem_ssbo |
+					       nir_var_mem_global |
+					       nir_var_mem_push_const;
+			}
+
+			if (nir_opt_load_store_vectorize(nir[i],
+							 nir_var_mem_ssbo | nir_var_mem_ubo |
+							 nir_var_mem_push_const | nir_var_mem_shared |
+							 nir_var_mem_global,
+							 mem_vectorize_callback, robust_modes)) {
+				lower_to_scalar = true;
+				lower_pack = true;
+			}
+
+			lower_to_scalar |= nir_opt_shrink_vectors(nir[i]);
+
+			if (lower_to_scalar)
+				nir_lower_alu_to_scalar(nir[i], NULL, NULL);
+			if (lower_pack)
+				nir_lower_pack(nir[i]);
+
+			/* lower ALU operations */
+			/* TODO: Some 64-bit tests crash inside LLVM. */
+			if (!radv_use_llvm_for_stage(device, i))
+				nir_lower_int64(nir[i]);
+
+			if (nir_lower_bit_size(nir[i], lower_bit_size_callback, NULL))
+				nir_copy_prop(nir[i]); /* allow nir_opt_idiv_const() to optimize lowered divisions */
+
+			/* TODO: Implement nir_op_uadd_sat with LLVM. */
+			if (!radv_use_llvm_for_stage(device, i))
+				nir_opt_idiv_const(nir[i], 32);
+			nir_lower_idiv(nir[i], nir_lower_idiv_precise);
+
+			/* optimize the lowered ALU operations */
+			bool more_algebraic = true;
+			while (more_algebraic) {
+				more_algebraic = false;
+				NIR_PASS_V(nir[i], nir_copy_prop);
+				NIR_PASS_V(nir[i], nir_opt_dce);
+				NIR_PASS_V(nir[i], nir_opt_constant_folding);
+				NIR_PASS(more_algebraic, nir[i], nir_opt_algebraic);
+			}
+
+			/* Do late algebraic optimization to turn add(a,
+			 * neg(b)) back into subs, then the mandatory cleanup
+			 * after algebraic.  Note that it may produce fnegs,
+			 * and if so then we need to keep running to squash
+			 * fneg(fneg(a)).
+			 */
+			bool more_late_algebraic = true;
+			while (more_late_algebraic) {
+				more_late_algebraic = false;
+				NIR_PASS(more_late_algebraic, nir[i], nir_opt_algebraic_late);
+				NIR_PASS_V(nir[i], nir_opt_constant_folding);
+				NIR_PASS_V(nir[i], nir_copy_prop);
+				NIR_PASS_V(nir[i], nir_opt_dce);
+				NIR_PASS_V(nir[i], nir_opt_cse);
+			}
+
+			/* cleanup passes */
+			nir_lower_load_const_to_scalar(nir[i]);
+			nir_move_options move_opts = (nir_move_options)(
+				nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
+				nir_move_comparisons | nir_move_copies);
+			nir_opt_sink(nir[i], move_opts);
+			nir_opt_move(nir[i], move_opts);
 		}
 	}
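
A note on lower_bit_size_callback, also moved verbatim above: returning
32 for a sub-32-bit integer ALU instruction asks nir_lower_bit_size to
widen the operation and truncate the result back. A hypothetical
before/after (SSA names made up; the exact output depends on the NIR
version):

/* before */
vec1 16 ssa_2 = iadd ssa_0, ssa_1

/* after, roughly */
vec1 32 ssa_3 = i2i32 ssa_0
vec1 32 ssa_4 = i2i32 ssa_1
vec1 32 ssa_5 = iadd ssa_3, ssa_4
vec1 16 ssa_2 = i2i16 ssa_5

Vector-construction ops, bcsel and conversions are skipped by the
callback, presumably because widening them would only add conversion
noise without simplifying anything.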