| /* |
| * Copyright © 2016 Red Hat. |
| * Copyright © 2016 Bas Nieuwenhuizen |
| * |
| * based in part on anv driver which is: |
| * Copyright © 2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "meta/radv_meta.h" |
| #include "nir/nir.h" |
| #include "nir/nir_builder.h" |
| #include "nir/nir_serialize.h" |
| #include "nir/radv_nir.h" |
| #include "spirv/nir_spirv.h" |
| #include "util/disk_cache.h" |
| #include "util/mesa-sha1.h" |
| #include "util/os_time.h" |
| #include "util/u_atomic.h" |
| #include "radv_cs.h" |
| #include "radv_debug.h" |
| #include "radv_private.h" |
| #include "radv_shader.h" |
| #include "radv_shader_args.h" |
| #include "vk_pipeline.h" |
| #include "vk_render_pass.h" |
| #include "vk_util.h" |
| |
| #include "util/u_debug.h" |
| #include "ac_binary.h" |
| #include "ac_nir.h" |
| #include "ac_shader_util.h" |
| #include "aco_interface.h" |
| #include "sid.h" |
| #include "vk_format.h" |
| #include "vk_nir_convert_ycbcr.h" |
| |
| bool |
| radv_shader_need_indirect_descriptor_sets(const struct radv_shader *shader) |
| { |
| const struct radv_userdata_info *loc = radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS); |
| return loc->sgpr_idx != -1; |
| } |
| |
| bool |
| radv_pipeline_capture_shaders(const struct radv_device *device, VkPipelineCreateFlags flags) |
| { |
| return (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) || |
| (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS) || device->keep_shader_info; |
| } |
| |
| bool |
| radv_pipeline_capture_shader_stats(const struct radv_device *device, VkPipelineCreateFlags flags) |
| { |
| return (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) || |
| (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || |
| device->keep_shader_info; |
| } |
| |
| void |
| radv_pipeline_init(struct radv_device *device, struct radv_pipeline *pipeline, |
| enum radv_pipeline_type type) |
| { |
| vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); |
| |
| pipeline->type = type; |
| } |
| |
| void |
| radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline, |
| const VkAllocationCallbacks *allocator) |
| { |
| switch (pipeline->type) { |
| case RADV_PIPELINE_GRAPHICS: |
| radv_destroy_graphics_pipeline(device, radv_pipeline_to_graphics(pipeline)); |
| break; |
| case RADV_PIPELINE_GRAPHICS_LIB: |
| radv_destroy_graphics_lib_pipeline(device, radv_pipeline_to_graphics_lib(pipeline)); |
| break; |
| case RADV_PIPELINE_COMPUTE: |
| radv_destroy_compute_pipeline(device, radv_pipeline_to_compute(pipeline)); |
| break; |
| case RADV_PIPELINE_RAY_TRACING: |
| radv_destroy_ray_tracing_pipeline(device, radv_pipeline_to_ray_tracing(pipeline)); |
| break; |
| default: |
| unreachable("invalid pipeline type"); |
| } |
| |
| free(pipeline->cs.buf); |
| |
| radv_rmv_log_resource_destroy(device, (uint64_t)radv_pipeline_to_handle(pipeline)); |
| vk_object_base_finish(&pipeline->base); |
| vk_free2(&device->vk.alloc, allocator, pipeline); |
| } |
| |
| VKAPI_ATTR void VKAPI_CALL |
| radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| RADV_FROM_HANDLE(radv_device, device, _device); |
| RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); |
| |
| if (!_pipeline) |
| return; |
| |
| radv_pipeline_destroy(device, pipeline, pAllocator); |
| } |
| |
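| /* Compute the pipeline's scratch requirements: the largest per-wave scratch size of any |
| * stage and the maximum number of scratch waves to allocate for. |
| */ |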
| void |
| radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline) |
| { |
| unsigned scratch_bytes_per_wave = 0; |
| unsigned max_waves = 0; |
| bool is_rt = pipeline->type == RADV_PIPELINE_RAY_TRACING; |
| |
| for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) { |
| if (pipeline->shaders[i] && (pipeline->shaders[i]->config.scratch_bytes_per_wave || is_rt)) { |
| unsigned max_stage_waves = device->scratch_waves; |
| |
| scratch_bytes_per_wave = |
| MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave); |
| |
| max_stage_waves = |
| MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_cu * |
| radv_get_max_waves(device, pipeline->shaders[i], i)); |
| max_waves = MAX2(max_waves, max_stage_waves); |
| } |
| } |
| |
| pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave; |
| pipeline->max_waves = max_waves; |
| } |
| |
| struct radv_pipeline_key |
| radv_generate_pipeline_key(const struct radv_device *device, const struct radv_pipeline *pipeline, |
| VkPipelineCreateFlags flags) |
| { |
| struct radv_pipeline_key key; |
| |
| memset(&key, 0, sizeof(key)); |
| |
| if (flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) |
| key.optimisations_disabled = 1; |
| |
| key.disable_aniso_single_level = device->instance->disable_aniso_single_level && |
| device->physical_device->rad_info.gfx_level < GFX8; |
| |
| key.image_2d_view_of_3d = |
| device->image_2d_view_of_3d && device->physical_device->rad_info.gfx_level == GFX9; |
| |
| key.tex_non_uniform = device->instance->tex_non_uniform; |
| |
| return key; |
| } |
| |
| uint32_t |
| radv_get_hash_flags(const struct radv_device *device, bool stats) |
| { |
| uint32_t hash_flags = 0; |
| |
| if (device->physical_device->use_ngg_culling) |
| hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING; |
| if (device->instance->perftest_flags & RADV_PERFTEST_EMULATE_RT) |
| hash_flags |= RADV_HASH_SHADER_EMULATE_RT; |
| if (device->physical_device->rt_wave_size == 64) |
| hash_flags |= RADV_HASH_SHADER_RT_WAVE64; |
| if (device->physical_device->cs_wave_size == 32) |
| hash_flags |= RADV_HASH_SHADER_CS_WAVE32; |
| if (device->physical_device->ps_wave_size == 32) |
| hash_flags |= RADV_HASH_SHADER_PS_WAVE32; |
| if (device->physical_device->ge_wave_size == 32) |
| hash_flags |= RADV_HASH_SHADER_GE_WAVE32; |
| if (device->physical_device->use_llvm) |
| hash_flags |= RADV_HASH_SHADER_LLVM; |
| if (stats) |
| hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS; |
| if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */ |
| hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS; |
| if (device->robust_buffer_access2) /* affects load/store vectorizer */ |
| hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2; |
| if (device->instance->debug_flags & RADV_DEBUG_SPLIT_FMA) |
| hash_flags |= RADV_HASH_SHADER_SPLIT_FMA; |
| if (device->instance->debug_flags & RADV_DEBUG_NO_FMASK) |
| hash_flags |= RADV_HASH_SHADER_NO_FMASK; |
| if (device->physical_device->use_ngg_streamout) |
| hash_flags |= RADV_HASH_SHADER_NGG_STREAMOUT; |
| return hash_flags; |
| } |
| |
| void |
| radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo, |
| struct radv_pipeline_stage *out_stage, gl_shader_stage stage) |
| { |
| const VkShaderModuleCreateInfo *minfo = |
| vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO); |
| const VkPipelineShaderStageModuleIdentifierCreateInfoEXT *iinfo = |
| vk_find_struct_const(sinfo->pNext, PIPELINE_SHADER_STAGE_MODULE_IDENTIFIER_CREATE_INFO_EXT); |
| |
| if (sinfo->module == VK_NULL_HANDLE && !minfo && !iinfo) |
| return; |
| |
| memset(out_stage, 0, sizeof(*out_stage)); |
| |
| out_stage->stage = stage; |
| out_stage->entrypoint = sinfo->pName; |
| out_stage->spec_info = sinfo->pSpecializationInfo; |
| out_stage->feedback.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; |
| |
| if (sinfo->module != VK_NULL_HANDLE) { |
| struct vk_shader_module *module = vk_shader_module_from_handle(sinfo->module); |
| |
| out_stage->spirv.data = module->data; |
| out_stage->spirv.size = module->size; |
| out_stage->spirv.object = &module->base; |
| |
| if (module->nir) |
| out_stage->internal_nir = module->nir; |
| } else if (minfo) { |
| out_stage->spirv.data = (const char *)minfo->pCode; |
| out_stage->spirv.size = minfo->codeSize; |
| } |
| |
| vk_pipeline_hash_shader_stage(sinfo, NULL, out_stage->shader_sha1); |
| } |
| |
| static const struct vk_ycbcr_conversion_state * |
| ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, uint32_t array_index) |
| { |
| const struct radv_pipeline_layout *layout = data; |
| |
| const struct radv_descriptor_set_layout *set_layout = layout->set[set].layout; |
| const struct vk_ycbcr_conversion_state *ycbcr_samplers = |
| radv_immutable_ycbcr_samplers(set_layout, binding); |
| |
| if (!ycbcr_samplers) |
| return NULL; |
| |
| return ycbcr_samplers + array_index; |
| } |
| |
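| /* Callback for nir_opt_load_store_vectorize: decides whether two adjacent accesses may be |
| * merged into a single load/store of num_components x bit_size, based on the memory mode |
| * and the achievable alignment. |
| */ |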
| bool |
| radv_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, |
| unsigned num_components, nir_intrinsic_instr *low, |
| nir_intrinsic_instr *high, void *data) |
| { |
| if (num_components > 4) |
| return false; |
| |
| /* >128 bit loads are split except with SMEM */ |
| if (bit_size * num_components > 128) |
| return false; |
| |
| uint32_t align; |
| if (align_offset) |
| align = 1 << (ffs(align_offset) - 1); |
| else |
| align = align_mul; |
| |
| switch (low->intrinsic) { |
| case nir_intrinsic_load_global: |
| case nir_intrinsic_store_global: |
| case nir_intrinsic_store_ssbo: |
| case nir_intrinsic_load_ssbo: |
| case nir_intrinsic_load_ubo: |
| case nir_intrinsic_load_push_constant: { |
| unsigned max_components; |
| if (align % 4 == 0) |
| max_components = NIR_MAX_VEC_COMPONENTS; |
| else if (align % 2 == 0) |
| max_components = 16u / bit_size; |
| else |
| max_components = 8u / bit_size; |
| return (align % (bit_size / 8u)) == 0 && num_components <= max_components; |
| } |
| case nir_intrinsic_load_deref: |
| case nir_intrinsic_store_deref: |
| assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); |
| FALLTHROUGH; |
| case nir_intrinsic_load_shared: |
| case nir_intrinsic_store_shared: |
| if (bit_size * num_components == 96) { |
| /* 96-bit loads require 128-bit alignment and are split otherwise. */ |
| return align % 16 == 0; |
| } else if (bit_size == 16 && (align % 4)) { |
| /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU |
| * vectorization, because our vectorizer requires the scalar IR to already contain vectors. |
| */ |
| return (align % 2 == 0) && num_components <= 2; |
| } else { |
| if (num_components == 3) { |
| /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */ |
| return false; |
| } |
| unsigned req = bit_size * num_components; |
| if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ |
| req /= 2u; |
| return align % (req / 8u) == 0; |
| } |
| default: |
| return false; |
| } |
| return false; |
| } |
| |
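| /* Callback for nir_lower_bit_size: returns 32 for scalar 8/16-bit ALU ops that should be |
| * performed at 32 bits (because the hardware lacks a native instruction at that width), |
| * and 0 to keep the instruction as-is. |
| */ |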
| static unsigned |
| lower_bit_size_callback(const nir_instr *instr, void *_) |
| { |
| struct radv_device *device = _; |
| enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level; |
| |
| if (instr->type != nir_instr_type_alu) |
| return 0; |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| |
| /* If an instruction is not scalarized by this point, |
| * it can be emitted as a packed instruction. */ |
| if (alu->dest.dest.ssa.num_components > 1) |
| return 0; |
| |
| if (alu->dest.dest.ssa.bit_size & (8 | 16)) { |
| unsigned bit_size = alu->dest.dest.ssa.bit_size; |
| switch (alu->op) { |
| case nir_op_bitfield_select: |
| case nir_op_imul_high: |
| case nir_op_umul_high: |
| case nir_op_uadd_carry: |
| case nir_op_usub_borrow: |
| return 32; |
| case nir_op_iabs: |
| case nir_op_imax: |
| case nir_op_umax: |
| case nir_op_imin: |
| case nir_op_umin: |
| case nir_op_ishr: |
| case nir_op_ushr: |
| case nir_op_ishl: |
| case nir_op_isign: |
| case nir_op_uadd_sat: |
| case nir_op_usub_sat: |
| return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 |
| : 0; |
| case nir_op_iadd_sat: |
| case nir_op_isub_sat: |
| return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0; |
| |
| default: |
| return 0; |
| } |
| } |
| |
| if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) { |
| unsigned bit_size = nir_src_bit_size(alu->src[0].src); |
| switch (alu->op) { |
| case nir_op_bit_count: |
| case nir_op_find_lsb: |
| case nir_op_ufind_msb: |
| return 32; |
| case nir_op_ilt: |
| case nir_op_ige: |
| case nir_op_ieq: |
| case nir_op_ine: |
| case nir_op_ult: |
| case nir_op_uge: |
| return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 |
| : 0; |
| default: |
| return 0; |
| } |
| } |
| |
| return 0; |
| } |
| |
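| /* Callback for nir_opt_vectorize/nir_lower_alu_width: returns a target width of 2 for |
| * 16-bit ALU ops that can map to packed 16-bit instructions on GFX9+, and 1 (scalar) |
| * otherwise. |
| */ |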
| static uint8_t |
| opt_vectorize_callback(const nir_instr *instr, const void *_) |
| { |
| if (instr->type != nir_instr_type_alu) |
| return 0; |
| |
| const struct radv_device *device = _; |
| enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level; |
| if (chip < GFX9) |
| return 1; |
| |
| const nir_alu_instr *alu = nir_instr_as_alu(instr); |
| const unsigned bit_size = alu->dest.dest.ssa.bit_size; |
| if (bit_size != 16) |
| return 1; |
| |
| switch (alu->op) { |
| case nir_op_fadd: |
| case nir_op_fsub: |
| case nir_op_fmul: |
| case nir_op_ffma: |
| case nir_op_fdiv: |
| case nir_op_flrp: |
| case nir_op_fabs: |
| case nir_op_fneg: |
| case nir_op_fsat: |
| case nir_op_fmin: |
| case nir_op_fmax: |
| case nir_op_iabs: |
| case nir_op_iadd: |
| case nir_op_iadd_sat: |
| case nir_op_uadd_sat: |
| case nir_op_isub: |
| case nir_op_isub_sat: |
| case nir_op_usub_sat: |
| case nir_op_ineg: |
| case nir_op_imul: |
| case nir_op_imin: |
| case nir_op_imax: |
| case nir_op_umin: |
| case nir_op_umax: |
| return 2; |
| case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */ |
| case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */ |
| case nir_op_ushr: |
| default: |
| return 1; |
| } |
| } |
| |
| static nir_component_mask_t |
| non_uniform_access_callback(const nir_src *src, void *_) |
| { |
| if (src->ssa->num_components == 1) |
| return 0x1; |
| return nir_chase_binding(*src).success ? 0x2 : 0x3; |
| } |
| |
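| /* Post-link NIR processing: the lowering and optimization passes that run once the shader |
| * info and arguments have been gathered, right before the NIR is handed to the backend |
| * compiler (ACO or LLVM). |
| */ |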
| void |
| radv_postprocess_nir(struct radv_device *device, const struct radv_pipeline_layout *pipeline_layout, |
| const struct radv_pipeline_key *pipeline_key, unsigned last_vgt_api_stage, |
| struct radv_pipeline_stage *stage) |
| { |
| enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; |
| bool progress; |
| |
| /* Wave and workgroup size should already be filled. */ |
| assert(stage->info.wave_size && stage->info.workgroup_size); |
| |
| if (stage->stage == MESA_SHADER_FRAGMENT) { |
| if (!pipeline_key->optimisations_disabled) { |
| NIR_PASS(_, stage->nir, nir_opt_cse); |
| } |
| NIR_PASS(_, stage->nir, radv_nir_lower_fs_intrinsics, stage, pipeline_key); |
| } |
| |
| enum nir_lower_non_uniform_access_type lower_non_uniform_access_types = |
| nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access | |
| nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access; |
| |
| /* In practice, most shaders do not have non-uniform-qualified |
| * accesses (see |
| * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069), |
| * so a cheaper check, which usually finds nothing, is run first. |
| */ |
| if (nir_has_non_uniform_access(stage->nir, lower_non_uniform_access_types)) { |
| if (!pipeline_key->optimisations_disabled) { |
| NIR_PASS(_, stage->nir, nir_opt_non_uniform_access); |
| } |
| |
| if (!radv_use_llvm_for_stage(device, stage->stage)) { |
| nir_lower_non_uniform_access_options options = { |
| .types = lower_non_uniform_access_types, |
| .callback = &non_uniform_access_callback, |
| .callback_data = NULL, |
| }; |
| NIR_PASS(_, stage->nir, nir_lower_non_uniform_access, &options); |
| } |
| } |
| NIR_PASS(_, stage->nir, nir_lower_memory_model); |
| |
| nir_load_store_vectorize_options vectorize_opts = { |
| .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | |
| nir_var_mem_global, |
| .callback = radv_mem_vectorize_callback, |
| .robust_modes = 0, |
| /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if |
| * the final offset is not. |
| */ |
| .has_shared2_amd = gfx_level >= GFX7, |
| }; |
| |
| if (device->robust_buffer_access2) { |
| vectorize_opts.robust_modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_push_const; |
| } |
| |
| if (!pipeline_key->optimisations_disabled) { |
| progress = false; |
| NIR_PASS(progress, stage->nir, nir_opt_load_store_vectorize, &vectorize_opts); |
| if (progress) { |
| NIR_PASS(_, stage->nir, nir_copy_prop); |
| NIR_PASS(_, stage->nir, nir_opt_shrink_stores, |
| !device->instance->disable_shrink_image_store); |
| |
| /* Gather info again, to update whether 8/16-bit are used. */ |
| nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); |
| } |
| } |
| |
| NIR_PASS(_, stage->nir, ac_nir_lower_subdword_loads, |
| (ac_nir_lower_subdword_options){.modes_1_comp = nir_var_mem_ubo, |
| .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo}); |
| |
| progress = false; |
| NIR_PASS(progress, stage->nir, nir_vk_lower_ycbcr_tex, ycbcr_conversion_lookup, pipeline_layout); |
| /* Gather info again, in case nir_vk_lower_ycbcr_tex has emitted resinfo instructions. */ |
| if (progress) |
| nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); |
| |
| bool fix_derivs_in_divergent_cf = |
| stage->stage == MESA_SHADER_FRAGMENT && !radv_use_llvm_for_stage(device, stage->stage); |
| if (fix_derivs_in_divergent_cf) { |
| NIR_PASS(_, stage->nir, nir_convert_to_lcssa, true, true); |
| nir_divergence_analysis(stage->nir); |
| } |
| NIR_PASS(_, stage->nir, ac_nir_lower_tex, |
| &(ac_nir_lower_tex_options){ |
| .gfx_level = gfx_level, |
| .lower_array_layer_round_even = !device->physical_device->rad_info.conformant_trunc_coord, |
| .fix_derivs_in_divergent_cf = fix_derivs_in_divergent_cf, |
| .max_wqm_vgprs = 64, // TODO: improve spiller and RA support for linear VGPRs |
| }); |
| if (fix_derivs_in_divergent_cf) |
| NIR_PASS(_, stage->nir, nir_opt_remove_phis); /* cleanup LCSSA phis */ |
| |
| if (stage->nir->info.uses_resource_info_query) |
| NIR_PASS(_, stage->nir, ac_nir_lower_resinfo, gfx_level); |
| |
| NIR_PASS_V(stage->nir, radv_nir_apply_pipeline_layout, device, pipeline_layout, &stage->info, |
| &stage->args); |
| |
| if (!pipeline_key->optimisations_disabled) { |
| NIR_PASS(_, stage->nir, nir_opt_shrink_vectors); |
| } |
| |
| NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device); |
| |
| /* lower ALU operations */ |
| NIR_PASS(_, stage->nir, nir_lower_int64); |
| |
| nir_move_options sink_opts = nir_move_const_undef | nir_move_copies; |
| |
| if (!pipeline_key->optimisations_disabled) { |
| if (stage->stage != MESA_SHADER_FRAGMENT || !pipeline_key->disable_sinking_load_input_fs) |
| sink_opts |= nir_move_load_input; |
| |
| NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts); |
| NIR_PASS(_, stage->nir, nir_opt_move, |
| nir_move_load_input | nir_move_const_undef | nir_move_copies); |
| } |
| |
| /* Lower VS inputs. We need to do this after nir_opt_sink, because |
| * load_input can be reordered, but buffer loads can't. |
| */ |
| if (stage->stage == MESA_SHADER_VERTEX) { |
| NIR_PASS(_, stage->nir, radv_nir_lower_vs_inputs, stage, pipeline_key, |
| &device->physical_device->rad_info); |
| } |
| |
| /* Lower I/O intrinsics to memory instructions. */ |
| bool io_to_mem = radv_nir_lower_io_to_mem(device, stage); |
| bool lowered_ngg = stage->info.is_ngg && stage->stage == last_vgt_api_stage; |
| if (lowered_ngg) { |
| radv_lower_ngg(device, stage, pipeline_key); |
| } else if (stage->stage == last_vgt_api_stage) { |
| if (stage->stage != MESA_SHADER_GEOMETRY) { |
| NIR_PASS_V(stage->nir, ac_nir_lower_legacy_vs, gfx_level, |
| stage->info.outinfo.clip_dist_mask | stage->info.outinfo.cull_dist_mask, |
| stage->info.outinfo.vs_output_param_offset, stage->info.outinfo.param_exports, |
| stage->info.outinfo.export_prim_id, false, false, |
| stage->info.force_vrs_per_vertex); |
| |
| } else { |
| ac_nir_gs_output_info gs_out_info = { |
| .streams = stage->info.gs.output_streams, |
| .usage_mask = stage->info.gs.output_usage_mask, |
| }; |
| NIR_PASS_V(stage->nir, ac_nir_lower_legacy_gs, false, false, &gs_out_info); |
| } |
| } else if (stage->stage == MESA_SHADER_FRAGMENT) { |
| ac_nir_lower_ps_options options = { |
| .gfx_level = gfx_level, |
| .family = device->physical_device->rad_info.family, |
| .use_aco = !radv_use_llvm_for_stage(device, stage->stage), |
| .uses_discard = true, |
| /* Unlike radv_pipeline_key.ps.alpha_to_coverage_via_mrtz, |
| * radv_shader_info.ps.writes_mrt0_alpha additionally requires that a |
| * depth/stencil/sample_mask export exists. ac_nir_lower_ps() needs this field to |
| * reflect whether alpha via mrtz is actually present. |
| */ |
| .alpha_to_coverage_via_mrtz = stage->info.ps.writes_mrt0_alpha, |
| .dual_src_blend_swizzle = |
| pipeline_key->ps.epilog.mrt0_is_dual_src && gfx_level >= GFX11, |
| /* Need to filter out unwritten color slots. */ |
| .spi_shader_col_format = |
| pipeline_key->ps.epilog.spi_shader_col_format & stage->info.ps.colors_written, |
| .color_is_int8 = pipeline_key->ps.epilog.color_is_int8, |
| .color_is_int10 = pipeline_key->ps.epilog.color_is_int10, |
| .alpha_func = PIPE_FUNC_ALWAYS, |
| |
| .enable_mrt_output_nan_fixup = pipeline_key->ps.epilog.enable_mrt_output_nan_fixup, |
| .no_color_export = stage->info.ps.has_epilog, |
| |
| .bc_optimize_for_persp = G_0286CC_PERSP_CENTER_ENA(stage->info.ps.spi_ps_input) && |
| G_0286CC_PERSP_CENTROID_ENA(stage->info.ps.spi_ps_input), |
| .bc_optimize_for_linear = G_0286CC_LINEAR_CENTER_ENA(stage->info.ps.spi_ps_input) && |
| G_0286CC_LINEAR_CENTROID_ENA(stage->info.ps.spi_ps_input), |
| }; |
| |
| NIR_PASS_V(stage->nir, ac_nir_lower_ps, &options); |
| } |
| |
| NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8); |
| |
| NIR_PASS(_, stage->nir, nir_lower_idiv, |
| &(nir_lower_idiv_options){ |
| .allow_fp16 = gfx_level >= GFX9, |
| }); |
| |
| if (radv_use_llvm_for_stage(device, stage->stage)) |
| NIR_PASS_V(stage->nir, nir_lower_io_to_scalar, nir_var_mem_global); |
| |
| NIR_PASS(_, stage->nir, ac_nir_lower_global_access); |
| NIR_PASS_V(stage->nir, radv_nir_lower_abi, gfx_level, &stage->info, &stage->args, pipeline_key, |
| device->physical_device->rad_info.address32_hi); |
| radv_optimize_nir_algebraic(stage->nir, io_to_mem || lowered_ngg || |
| stage->stage == MESA_SHADER_COMPUTE || |
| stage->stage == MESA_SHADER_TASK); |
| |
| if (stage->nir->info.bit_sizes_int & (8 | 16)) { |
| if (gfx_level >= GFX8) { |
| NIR_PASS(_, stage->nir, nir_convert_to_lcssa, true, true); |
| nir_divergence_analysis(stage->nir); |
| } |
| |
| if (nir_lower_bit_size(stage->nir, lower_bit_size_callback, device)) { |
| NIR_PASS(_, stage->nir, nir_opt_constant_folding); |
| } |
| |
| if (gfx_level >= GFX8) |
| NIR_PASS(_, stage->nir, nir_opt_remove_phis); /* cleanup LCSSA phis */ |
| } |
| if (((stage->nir->info.bit_sizes_int | stage->nir->info.bit_sizes_float) & 16) && |
| gfx_level >= GFX9) { |
| bool separate_g16 = gfx_level >= GFX10; |
| struct nir_fold_tex_srcs_options fold_srcs_options[] = { |
| { |
| .sampler_dims = |
| ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)), |
| .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) | |
| (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) | |
| (1 << nir_tex_src_ms_index) | |
| (separate_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)), |
| }, |
| { |
| .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE), |
| .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy), |
| }, |
| }; |
| struct nir_fold_16bit_tex_image_options fold_16bit_options = { |
| .rounding_mode = nir_rounding_mode_rtz, |
| .fold_tex_dest_types = nir_type_float, |
| .fold_image_dest_types = nir_type_float, |
| .fold_image_store_data = true, |
| .fold_image_srcs = !radv_use_llvm_for_stage(device, stage->stage), |
| .fold_srcs_options_count = separate_g16 ? 2 : 1, |
| .fold_srcs_options = fold_srcs_options, |
| }; |
| NIR_PASS(_, stage->nir, nir_fold_16bit_tex_image, &fold_16bit_options); |
| |
| if (!pipeline_key->optimisations_disabled) { |
| NIR_PASS(_, stage->nir, nir_opt_vectorize, opt_vectorize_callback, device); |
| } |
| } |
| |
| /* cleanup passes */ |
| NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device); |
| NIR_PASS(_, stage->nir, nir_lower_load_const_to_scalar); |
| NIR_PASS(_, stage->nir, nir_copy_prop); |
| NIR_PASS(_, stage->nir, nir_opt_dce); |
| |
| if (!pipeline_key->optimisations_disabled) { |
| sink_opts |= nir_move_comparisons | nir_move_load_ubo | nir_move_load_ssbo; |
| NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts); |
| |
| nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | |
| nir_move_comparisons | nir_move_copies; |
| NIR_PASS(_, stage->nir, nir_opt_move, move_opts); |
| } |
| } |
| |
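| /* Number of executables exposed via VK_KHR_pipeline_executable_properties: one per shader, |
| * plus one for the GS copy shader of non-NGG geometry pipelines. Ray tracing pipelines |
| * expose a single executable. |
| */ |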
| static uint32_t |
| radv_get_executable_count(struct radv_pipeline *pipeline) |
| { |
| if (pipeline->type == RADV_PIPELINE_RAY_TRACING) |
| return 1; |
| |
| uint32_t ret = 0; |
| for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) { |
| if (!pipeline->shaders[i]) |
| continue; |
| |
| if (i == MESA_SHADER_GEOMETRY && |
| !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) { |
| ret += 2u; |
| } else { |
| ret += 1u; |
| } |
| } |
| return ret; |
| } |
| |
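| /* Map an executable index back to its shader, using the same ordering as |
| * radv_get_executable_count() (the GS copy shader follows the geometry shader). |
| */ |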
| static struct radv_shader * |
| radv_get_shader_from_executable_index(struct radv_pipeline *pipeline, int index, |
| gl_shader_stage *stage) |
| { |
| if (pipeline->type == RADV_PIPELINE_RAY_TRACING) { |
| *stage = MESA_SHADER_INTERSECTION; |
| return pipeline->shaders[*stage]; |
| } |
| |
| for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) { |
| if (!pipeline->shaders[i]) |
| continue; |
| if (!index) { |
| *stage = i; |
| return pipeline->shaders[i]; |
| } |
| |
| --index; |
| |
| if (i == MESA_SHADER_GEOMETRY && |
| !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) { |
| if (!index) { |
| *stage = i; |
| return pipeline->gs_copy_shader; |
| } |
| --index; |
| } |
| } |
| |
| *stage = -1; |
| return NULL; |
| } |
| |
| /* Basically strlcpy (which does not exist on Linux), specialized for |
| * descriptions. */ |
| static void |
| desc_copy(char *desc, const char *src) |
| { |
| int len = strlen(src); |
| assert(len < VK_MAX_DESCRIPTION_SIZE); |
| memcpy(desc, src, len); |
| memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len); |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo, |
| uint32_t *pExecutableCount, |
| VkPipelineExecutablePropertiesKHR *pProperties) |
| { |
| RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline); |
| const uint32_t total_count = radv_get_executable_count(pipeline); |
| |
| if (!pProperties) { |
| *pExecutableCount = total_count; |
| return VK_SUCCESS; |
| } |
| |
| const uint32_t count = MIN2(total_count, *pExecutableCount); |
| for (unsigned i = 0, executable_idx = 0; i < MESA_VULKAN_SHADER_STAGES && executable_idx < count; ++i) { |
| if (!pipeline->shaders[i]) |
| continue; |
| pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i); |
| const char *name = NULL; |
| const char *description = NULL; |
| switch (i) { |
| case MESA_SHADER_VERTEX: |
| name = "Vertex Shader"; |
| description = "Vulkan Vertex Shader"; |
| break; |
| case MESA_SHADER_TESS_CTRL: |
| if (!pipeline->shaders[MESA_SHADER_VERTEX]) { |
| pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT; |
| name = "Vertex + Tessellation Control Shaders"; |
| description = "Combined Vulkan Vertex and Tessellation Control Shaders"; |
| } else { |
| name = "Tessellation Control Shader"; |
| description = "Vulkan Tessellation Control Shader"; |
| } |
| break; |
| case MESA_SHADER_TESS_EVAL: |
| name = "Tessellation Evaluation Shader"; |
| description = "Vulkan Tessellation Evaluation Shader"; |
| break; |
| case MESA_SHADER_GEOMETRY: |
| if (pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) { |
| pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; |
| name = "Tessellation Evaluation + Geometry Shaders"; |
| description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders"; |
| } else if (!pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_VERTEX]) { |
| pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT; |
| name = "Vertex + Geometry Shader"; |
| description = "Combined Vulkan Vertex and Geometry Shaders"; |
| } else { |
| name = "Geometry Shader"; |
| description = "Vulkan Geometry Shader"; |
| } |
| break; |
| case MESA_SHADER_FRAGMENT: |
| name = "Fragment Shader"; |
| description = "Vulkan Fragment Shader"; |
| break; |
| case MESA_SHADER_COMPUTE: |
| name = "Compute Shader"; |
| description = "Vulkan Compute Shader"; |
| break; |
| case MESA_SHADER_MESH: |
| name = "Mesh Shader"; |
| description = "Vulkan Mesh Shader"; |
| break; |
| case MESA_SHADER_TASK: |
| name = "Task Shader"; |
| description = "Vulkan Task Shader"; |
| break; |
| case MESA_SHADER_RAYGEN: |
| name = "Ray Generation Shader"; |
| description = "Vulkan Ray Generation Shader"; |
| break; |
| case MESA_SHADER_ANY_HIT: |
| name = "Any-Hit Shader"; |
| description = "Vulkan Any-Hit Shader"; |
| break; |
| case MESA_SHADER_CLOSEST_HIT: |
| name = "Closest-Hit Shader"; |
| description = "Vulkan Closest-Hit Shader"; |
| break; |
| case MESA_SHADER_MISS: |
| name = "Miss Shader"; |
| description = "Vulkan Miss Shader"; |
| break; |
| case MESA_SHADER_INTERSECTION: |
| name = "Intersection Shader"; |
| description = "Vulkan Intersection Shader"; |
| break; |
| case MESA_SHADER_CALLABLE: |
| name = "Callable Shader"; |
| description = "Vulkan Callable Shader"; |
| break; |
| } |
| |
| pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size; |
| desc_copy(pProperties[executable_idx].name, name); |
| desc_copy(pProperties[executable_idx].description, description); |
| |
| ++executable_idx; |
| if (i == MESA_SHADER_GEOMETRY && |
| !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) { |
| assert(pipeline->gs_copy_shader); |
| if (executable_idx >= count) |
| break; |
| |
| pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT; |
| pProperties[executable_idx].subgroupSize = 64; |
| desc_copy(pProperties[executable_idx].name, "GS Copy Shader"); |
| desc_copy(pProperties[executable_idx].description, |
| "Extra shader stage that loads the GS output ringbuffer into the rasterizer"); |
| |
| ++executable_idx; |
| } |
| } |
| |
| VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS; |
| *pExecutableCount = count; |
| return result; |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, |
| const VkPipelineExecutableInfoKHR *pExecutableInfo, |
| uint32_t *pStatisticCount, |
| VkPipelineExecutableStatisticKHR *pStatistics) |
| { |
| RADV_FROM_HANDLE(radv_device, device, _device); |
| RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline); |
| gl_shader_stage stage; |
| struct radv_shader *shader = |
| radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage); |
| |
| const struct radv_physical_device *pdevice = device->physical_device; |
| |
| unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT |
| ? 1024 : pdevice->rad_info.lds_encode_granularity; |
| unsigned max_waves = radv_get_max_waves(device, shader, stage); |
| |
| VkPipelineExecutableStatisticKHR *s = pStatistics; |
| VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0); |
| VkResult result = VK_SUCCESS; |
| |
| if (s < end) { |
| desc_copy(s->name, "Driver pipeline hash"); |
| desc_copy(s->description, "Driver pipeline hash used by RGP"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = pipeline->pipeline_hash; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "SGPRs"); |
| desc_copy(s->description, "Number of SGPR registers allocated per subgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.num_sgprs; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "VGPRs"); |
| desc_copy(s->description, "Number of VGPR registers allocated per subgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.num_vgprs; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "Spilled SGPRs"); |
| desc_copy(s->description, "Number of SGPR registers spilled per subgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.spilled_sgprs; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "Spilled VGPRs"); |
| desc_copy(s->description, "Number of VGPR registers spilled per subgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.spilled_vgprs; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "Code size"); |
| desc_copy(s->description, "Code size in bytes"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->exec_size; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "LDS size"); |
| desc_copy(s->description, "LDS size in bytes per workgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.lds_size * lds_increment; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "Scratch size"); |
| desc_copy(s->description, "Private memory in bytes per subgroup"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->config.scratch_bytes_per_wave; |
| } |
| ++s; |
| |
| if (s < end) { |
| desc_copy(s->name, "Subgroups per SIMD"); |
| desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit"); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = max_waves; |
| } |
| ++s; |
| |
| if (shader->statistics) { |
| for (unsigned i = 0; i < aco_num_statistics; i++) { |
| const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i]; |
| if (s < end) { |
| desc_copy(s->name, info->name); |
| desc_copy(s->description, info->desc); |
| s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; |
| s->value.u64 = shader->statistics[i]; |
| } |
| ++s; |
| } |
| } |
| |
| if (!pStatistics) |
| *pStatisticCount = s - pStatistics; |
| else if (s > end) { |
| *pStatisticCount = end - pStatistics; |
| result = VK_INCOMPLETE; |
| } else { |
| *pStatisticCount = s - pStatistics; |
| } |
| |
| return result; |
| } |
| |
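| /* Copy a NUL-terminated representation string into the user-provided buffer, following the |
| * usual Vulkan two-call idiom: report the required size when data is NULL and return |
| * VK_INCOMPLETE when the buffer is too small. |
| */ |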
| static VkResult |
| radv_copy_representation(void *data, size_t *data_size, const char *src) |
| { |
| size_t total_size = strlen(src) + 1; |
| |
| if (!data) { |
| *data_size = total_size; |
| return VK_SUCCESS; |
| } |
| |
| size_t size = MIN2(total_size, *data_size); |
| |
| memcpy(data, src, size); |
| if (size) |
| *((char *)data + size - 1) = 0; |
| return size < total_size ? VK_INCOMPLETE : VK_SUCCESS; |
| } |
| |
| VKAPI_ATTR VkResult VKAPI_CALL |
| radv_GetPipelineExecutableInternalRepresentationsKHR( |
| VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo, |
| uint32_t *pInternalRepresentationCount, |
| VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations) |
| { |
| RADV_FROM_HANDLE(radv_device, device, _device); |
| RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline); |
| gl_shader_stage stage; |
| struct radv_shader *shader = |
| radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage); |
| |
| VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations; |
| VkPipelineExecutableInternalRepresentationKHR *end = |
| p + (pInternalRepresentations ? *pInternalRepresentationCount : 0); |
| VkResult result = VK_SUCCESS; |
| /* optimized NIR */ |
| if (p < end) { |
| p->isText = true; |
| desc_copy(p->name, "NIR Shader(s)"); |
| desc_copy(p->description, "The optimized NIR shader(s)"); |
| if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS) |
| result = VK_INCOMPLETE; |
| } |
| ++p; |
| |
| /* backend IR */ |
| if (p < end) { |
| p->isText = true; |
| if (radv_use_llvm_for_stage(device, stage)) { |
| desc_copy(p->name, "LLVM IR"); |
| desc_copy(p->description, "The LLVM IR after some optimizations"); |
| } else { |
| desc_copy(p->name, "ACO IR"); |
| desc_copy(p->description, "The ACO IR after some optimizations"); |
| } |
| if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS) |
| result = VK_INCOMPLETE; |
| } |
| ++p; |
| |
| /* Disassembler */ |
| if (p < end && shader->disasm_string) { |
| p->isText = true; |
| desc_copy(p->name, "Assembly"); |
| desc_copy(p->description, "Final Assembly"); |
| if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS) |
| result = VK_INCOMPLETE; |
| } |
| ++p; |
| |
| if (!pInternalRepresentations) |
| *pInternalRepresentationCount = p - pInternalRepresentations; |
| else if (p > end) { |
| result = VK_INCOMPLETE; |
| *pInternalRepresentationCount = end - pInternalRepresentations; |
| } else { |
| *pInternalRepresentationCount = p - pInternalRepresentations; |
| } |
| |
| return result; |
| } |
| |
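| /* ralloc destructor for the shader module copies created by |
| * radv_copy_shader_stage_create_info() below. |
| */ |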
| static void |
| vk_shader_module_finish(void *_module) |
| { |
| struct vk_shader_module *module = _module; |
| vk_object_base_finish(&module->base); |
| } |
| |
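| /* Deep-copy an array of VkPipelineShaderStageCreateInfo into mem_ctx: shader modules, |
| * specialization info and entry point names are duplicated so that the copies remain |
| * valid after the application's create info is gone. |
| */ |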
| VkPipelineShaderStageCreateInfo * |
| radv_copy_shader_stage_create_info(struct radv_device *device, uint32_t stageCount, |
| const VkPipelineShaderStageCreateInfo *pStages, void *mem_ctx) |
| { |
| VkPipelineShaderStageCreateInfo *new_stages; |
| |
| size_t size = sizeof(VkPipelineShaderStageCreateInfo) * stageCount; |
| new_stages = ralloc_size(mem_ctx, size); |
| if (!new_stages) |
| return NULL; |
| |
| memcpy(new_stages, pStages, size); |
| |
| for (uint32_t i = 0; i < stageCount; i++) { |
| RADV_FROM_HANDLE(vk_shader_module, module, new_stages[i].module); |
| |
| const VkShaderModuleCreateInfo *minfo = |
| vk_find_struct_const(pStages[i].pNext, SHADER_MODULE_CREATE_INFO); |
| |
| if (module) { |
| struct vk_shader_module *new_module = |
| ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + module->size); |
| if (!new_module) |
| return NULL; |
| |
| ralloc_set_destructor(new_module, vk_shader_module_finish); |
| vk_object_base_init(&device->vk, &new_module->base, VK_OBJECT_TYPE_SHADER_MODULE); |
| |
| new_module->nir = NULL; |
| memcpy(new_module->sha1, module->sha1, sizeof(module->sha1)); |
| new_module->size = module->size; |
| memcpy(new_module->data, module->data, module->size); |
| |
| module = new_module; |
| } else if (minfo) { |
| module = ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + minfo->codeSize); |
| if (!module) |
| return NULL; |
| |
| vk_shader_module_init(&device->vk, module, minfo); |
| } |
| |
| if (module) { |
| const VkSpecializationInfo *spec = new_stages[i].pSpecializationInfo; |
| if (spec) { |
| VkSpecializationInfo *new_spec = ralloc(mem_ctx, VkSpecializationInfo); |
| if (!new_spec) |
| return NULL; |
| |
| new_spec->mapEntryCount = spec->mapEntryCount; |
| uint32_t map_entries_size = sizeof(VkSpecializationMapEntry) * spec->mapEntryCount; |
| new_spec->pMapEntries = ralloc_size(mem_ctx, map_entries_size); |
| if (!new_spec->pMapEntries) |
| return NULL; |
| memcpy((void *)new_spec->pMapEntries, spec->pMapEntries, map_entries_size); |
| |
| new_spec->dataSize = spec->dataSize; |
| new_spec->pData = ralloc_size(mem_ctx, spec->dataSize); |
| if (!new_spec->pData) |
| return NULL; |
| memcpy((void *)new_spec->pData, spec->pData, spec->dataSize); |
| |
| new_stages[i].pSpecializationInfo = new_spec; |
| } |
| |
| new_stages[i].module = vk_shader_module_to_handle(module); |
| new_stages[i].pName = ralloc_strdup(mem_ctx, new_stages[i].pName); |
| if (!new_stages[i].pName) |
| return NULL; |
| new_stages[i].pNext = NULL; |
| } |
| } |
| |
| return new_stages; |
| } |