/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "ac_rtld.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir_tcs_info.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "util/u_upload_mgr.h"
#include "pipe/p_shader_tokens.h"
static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
/* Get the total number of interpolated PS inputs. */
unsigned si_get_ps_num_interp(struct si_shader *ps)
{
unsigned num_interp = ps->info.num_ps_inputs;
/* Back colors are added by the PS prolog when needed. */
if (!ps->is_monolithic && ps->key.ps.part.prolog.color_two_side)
num_interp += !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
assert(num_interp <= 32);
return MIN2(num_interp, 32);
}
/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
if (shader->selector->screen->info.gfx_level <= GFX8 ||
shader->selector->stage > MESA_SHADER_GEOMETRY)
return false;
return shader->key.ge.as_ls || shader->key.ge.as_es ||
shader->selector->stage == MESA_SHADER_TESS_CTRL ||
shader->selector->stage == MESA_SHADER_GEOMETRY;
}
/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
return false;
return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
}
/**
* Returns a unique index for a semantic name and index. The index must be
* less than 64, so that a 64-bit bitmask of used inputs or outputs can be
* calculated.
*/
unsigned si_shader_io_get_unique_index(unsigned semantic)
{
switch (semantic) {
case VARYING_SLOT_POS:
return SI_UNIQUE_SLOT_POS;
default:
if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
assert(!"invalid generic index");
return 0;
/* Legacy desktop GL varyings. */
case VARYING_SLOT_FOGC:
return SI_UNIQUE_SLOT_FOGC;
case VARYING_SLOT_COL0:
return SI_UNIQUE_SLOT_COL0;
case VARYING_SLOT_COL1:
return SI_UNIQUE_SLOT_COL1;
case VARYING_SLOT_BFC0:
return SI_UNIQUE_SLOT_BFC0;
case VARYING_SLOT_BFC1:
return SI_UNIQUE_SLOT_BFC1;
case VARYING_SLOT_TEX0:
case VARYING_SLOT_TEX1:
case VARYING_SLOT_TEX2:
case VARYING_SLOT_TEX3:
case VARYING_SLOT_TEX4:
case VARYING_SLOT_TEX5:
case VARYING_SLOT_TEX6:
case VARYING_SLOT_TEX7:
return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
case VARYING_SLOT_CLIP_VERTEX:
return SI_UNIQUE_SLOT_CLIP_VERTEX;
/* Varyings present in both GLES and desktop GL. */
case VARYING_SLOT_CLIP_DIST0:
return SI_UNIQUE_SLOT_CLIP_DIST0;
case VARYING_SLOT_CLIP_DIST1:
return SI_UNIQUE_SLOT_CLIP_DIST1;
case VARYING_SLOT_PSIZ:
return SI_UNIQUE_SLOT_PSIZ;
case VARYING_SLOT_LAYER:
return SI_UNIQUE_SLOT_LAYER;
case VARYING_SLOT_VIEWPORT:
return SI_UNIQUE_SLOT_VIEWPORT;
case VARYING_SLOT_PRIMITIVE_ID:
return SI_UNIQUE_SLOT_PRIMITIVE_ID;
}
}
unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
gl_shader_stage stage = shader->is_gs_copy_shader ?
MESA_SHADER_VERTEX : shader->selector->stage;
assert(shader->wave_size);
switch (stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
/* Use the largest workgroup size for streamout */
if (shader->key.ge.as_ngg)
return shader->info.num_streamout_vec4s ? 256 : 128;
/* As part of merged shader. */
return shader->selector->screen->info.gfx_level >= GFX9 &&
(shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : shader->wave_size;
case MESA_SHADER_TESS_CTRL:
/* Return this so that LLVM doesn't remove s_barrier
* instructions on chips where we use s_barrier. */
return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : shader->wave_size;
case MESA_SHADER_GEOMETRY:
/* GS can always generate up to 256 vertices. */
return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : shader->wave_size;
case MESA_SHADER_COMPUTE:
break; /* see below */
default:
return shader->wave_size;
}
/* Compile a variable block size using the maximum variable size. */
if (shader->selector->info.base.workgroup_size_variable)
return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
uint16_t *local_size = shader->selector->info.base.workgroup_size;
unsigned max_work_group_size = (uint32_t)local_size[0] *
(uint32_t)local_size[1] *
(uint32_t)local_size[2];
assert(max_work_group_size);
return max_work_group_size;
}
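/* Granularity in bytes of one LDS allocation unit, i.e. what a value of 1 in config.lds_size
* represents for the given chip and shader stage.
*/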
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 :
screen->info.gfx_level >= GFX7 ? 512 : 256;
}
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld)
{
const struct si_shader_selector *sel = shader->selector;
const char *part_elfs[5];
size_t part_sizes[5];
unsigned num_parts = 0;
#define add_part(shader_or_part) \
if (shader_or_part) { \
assert(shader_or_part->binary.type == SI_SHADER_BINARY_ELF); \
part_elfs[num_parts] = (shader_or_part)->binary.code_buffer; \
part_sizes[num_parts] = (shader_or_part)->binary.code_size; \
num_parts++; \
}
add_part(shader->prolog);
add_part(shader->previous_stage);
add_part(shader);
add_part(shader->epilog);
#undef add_part
struct ac_rtld_symbol lds_symbols[2];
unsigned num_lds_symbols = 0;
if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
(sel->stage == MESA_SHADER_GEOMETRY ||
(sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
sym->name = "esgs_ring";
sym->size = (shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
: shader->gs_info.esgs_ring_size) * 4;
sym->align = 64 * 1024;
}
if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
sym->name = "ngg_emit";
sym->size = shader->ngg.info.ngg_out_lds_size * 4;
sym->align = 4;
}
bool ok = ac_rtld_open(
rtld, (struct ac_rtld_open_info){.info = &screen->info,
.options =
{
.halt_at_entry = screen->options.halt_shaders,
.waitcnt_wa = num_parts > 1 &&
screen->info.needs_llvm_wait_wa,
},
.shader_type = sel->stage,
.wave_size = shader->wave_size,
.num_parts = num_parts,
.elf_ptrs = part_elfs,
.elf_sizes = part_sizes,
.num_shared_lds_symbols = num_lds_symbols,
.shared_lds_symbols = lds_symbols});
if (rtld->lds_size > 0) {
unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
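/* config.lds_size is stored in allocation-granule units, e.g. (assuming the 1024-byte GFX11 PS
* granule) an rtld->lds_size of 5000 bytes becomes DIV_ROUND_UP(5000, 1024) = 5 units.
*/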
shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
}
return ok;
}
static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
{
unsigned num_bin = 0;
if (shader->prolog)
bin[num_bin++] = &shader->prolog->binary;
if (shader->previous_stage)
bin[num_bin++] = &shader->previous_stage->binary;
bin[num_bin++] = &shader->binary;
if (shader->epilog)
bin[num_bin++] = &shader->epilog->binary;
return num_bin;
}
/* si_get_shader_binary_size should only be called once per shader
* and the result should be stored in shader->complete_shader_binary_size.
*/
unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
if (shader->binary.type == SI_SHADER_BINARY_ELF) {
struct ac_rtld_binary rtld;
si_shader_binary_open(screen, shader, &rtld);
uint64_t size = rtld.exec_size;
ac_rtld_close(&rtld);
return size;
} else {
struct si_shader_binary *bin[4];
unsigned num_bin = get_shader_binaries(shader, bin);
unsigned size = 0;
for (unsigned i = 0; i < num_bin; i++) {
assert(bin[i]->type == SI_SHADER_BINARY_RAW);
size += bin[i]->exec_size;
}
return size;
}
}
unsigned si_get_shader_prefetch_size(struct si_shader *shader)
{
struct si_screen *sscreen = shader->selector->screen;
/* This excludes arrays of constants after instructions. */
unsigned exec_size =
ac_align_shader_binary_for_prefetch(&sscreen->info,
shader->complete_shader_binary_size);
/* INST_PREF_SIZE uses 128B granularity.
* - GFX11: max 128 * 63 = 8064
* - GFX12: max 128 * 255 = 32640
*/
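/* Worked example: a 9000-byte aligned binary needs DIV_ROUND_UP(9000, 128) = 71 units, which is
* clamped to 63 on GFX11, i.e. only the first 8064 bytes are prefetched.
*/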
unsigned max_pref_size = sscreen->info.gfx_level >= GFX12 ? 255 : 63;
unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);
return MIN2(max_pref_size, exec_size_gran128);
}
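/* Resolve the SCRATCH_RSRC_DWORD0/1 relocation symbols: dword0 receives the low 32 bits of the
* scratch VA, dword1 receives the high bits of the VA plus the swizzle-enable bit that turns on
* scratch coalescing.
*/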
static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value)
{
uint64_t *scratch_va = data;
if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
*value = (uint32_t)*scratch_va;
return true;
}
if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
/* Enable scratch coalescing. */
*value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
if (gfx_level >= GFX11)
*value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
else
*value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
return true;
}
return false;
}
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
unsigned binary_size, bool dma_upload,
struct si_context **upload_ctx,
struct pipe_resource **staging,
unsigned *staging_offset,
int64_t bo_offset)
{
unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);
if (bo_offset >= 0) {
/* SQTT needs to upload shaders as a pipeline, where all shaders
* are contiguous in memory.
* In this case, bo_offset is non-negative and we don't have to
* allocate a new BO.
*/
shader->gpu_address = shader->bo->gpu_address + bo_offset;
dma_upload = false;
} else {
si_resource_reference(&shader->bo, NULL);
shader->bo = si_aligned_buffer_create(
&sscreen->b,
SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
(dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
if (!shader->bo)
return NULL;
shader->gpu_address = shader->bo->gpu_address;
bo_offset = 0;
}
if (dma_upload) {
/* First upload into a staging buffer. */
*upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);
void *ret;
u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
staging_offset, staging, &ret);
if (!ret)
si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
return ret;
} else {
void *ptr = sscreen->ws->buffer_map(sscreen->ws,
shader->bo->buf, NULL,
PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
if (!ptr)
return NULL;
return ptr + bo_offset;
}
}
static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
void *code, unsigned code_size,
unsigned binary_size, bool dma_upload,
struct si_context *upload_ctx,
struct pipe_resource *staging,
unsigned staging_offset)
{
if (sscreen->debug_flags & DBG(SQTT)) {
/* Remember the uploaded code */
shader->binary.uploaded_code_size = code_size;
shader->binary.uploaded_code = malloc(code_size);
if (shader->binary.uploaded_code)
memcpy(shader->binary.uploaded_code, code, code_size);
}
if (dma_upload) {
/* Then copy from the staging buffer to VRAM.
*
* We can't use the upload copy in si_buffer_transfer_unmap because that might use
* a compute shader, and we can't use shaders in the code that is responsible for making
* them available.
*/
si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
binary_size);
si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;
#if 0 /* debug: validate whether the copy was successful */
uint32_t *dst_binary = malloc(binary_size);
uint32_t *src_binary = (uint32_t*)code;
pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
puts("dst_binary == src_binary:");
for (unsigned i = 0; i < binary_size / 4; i++) {
printf(" %08x == %08x\n", dst_binary[i], src_binary[i]);
}
free(dst_binary);
exit(0);
#endif
si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
pipe_resource_reference(&staging, NULL);
} else {
sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
}
}
static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
struct ac_rtld_binary binary;
if (!si_shader_binary_open(sscreen, shader, &binary))
return -1;
struct si_context *upload_ctx = NULL;
struct pipe_resource *staging = NULL;
unsigned staging_offset = 0;
void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
&upload_ctx, &staging, &staging_offset,
bo_offset);
if (!rx_ptr)
return -1;
/* Upload. */
struct ac_rtld_upload_info u = {};
u.binary = &binary;
u.get_external_symbol = si_get_external_symbol;
u.cb_data = &scratch_va;
u.rx_va = shader->gpu_address;
u.rx_ptr = rx_ptr;
int size = ac_rtld_upload(&u);
post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
upload_ctx, staging, staging_offset);
ac_rtld_close(&binary);
return size;
}
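/* For SI_SHADER_BINARY_RAW binaries, derive config.lds_size from the ESGS/NGG LDS sizes.
* The ELF path gets the equivalent value from ac_rtld via the shared LDS symbols set up in
* si_shader_binary_open.
*/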
static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
{
gl_shader_stage stage =
shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
(stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
unsigned size_in_dw = shader->key.ge.as_ngg ? shader->ngg.info.esgs_lds_size
: shader->gs_info.esgs_ring_size;
if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
size_in_dw += shader->ngg.info.ngg_out_lds_size;
shader->config.lds_size =
DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
}
}
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
struct si_shader_binary *bin[4];
unsigned num_bin = get_shader_binaries(shader, bin);
unsigned code_size = 0, exec_size = 0;
for (unsigned i = 0; i < num_bin; i++) {
assert(bin[i]->type == SI_SHADER_BINARY_RAW);
code_size += bin[i]->code_size;
exec_size += bin[i]->exec_size;
}
struct si_context *upload_ctx = NULL;
struct pipe_resource *staging = NULL;
unsigned staging_offset = 0;
void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
&upload_ctx, &staging, &staging_offset,
bo_offset);
if (!rx_ptr)
return -1;
unsigned exec_offset = 0, data_offset = exec_size;
for (unsigned i = 0; i < num_bin; i++) {
memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);
if (bin[i]->num_symbols) {
/* Offset that must be added to the const data symbol because other
* shader parts are inserted between the exec code and the const data.
*/
unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;
/* Prolog and epilog have no symbols. */
struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
assert(sh && bin[i] == &sh->binary);
si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
scratch_va, const_offset);
}
exec_offset += bin[i]->exec_size;
unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
if (data_size) {
memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
data_offset += data_size;
}
}
post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
upload_ctx, staging, staging_offset);
calculate_needed_lds_size(sscreen, shader);
return code_size;
}
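/* Upload the shader binary, either into the existing BO at bo_offset (when bo_offset >= 0) or
* into a newly allocated BO. DMA upload through a staging buffer may be used when VRAM is not
* fully CPU-visible.
*/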
int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va, int64_t bo_offset)
{
bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
bo_offset < 0;
if (shader->binary.type == SI_SHADER_BINARY_ELF) {
return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
} else {
assert(shader->binary.type == SI_SHADER_BINARY_RAW);
return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
}
}
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
uint64_t scratch_va)
{
return si_shader_binary_upload_at(sscreen, shader, scratch_va, -1);
}
static void print_disassembly(const char *disasm, size_t nbytes,
const char *name, FILE *file,
struct util_debug_callback *debug)
{
if (debug && debug->debug_message) {
/* Very long debug messages are cut off, so send the
* disassembly one line at a time. This causes more
* overhead, but on the plus side it simplifies
* parsing of resulting logs.
*/
util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
uint64_t line = 0;
while (line < nbytes) {
int count = nbytes - line;
const char *nl = memchr(disasm + line, '\n', nbytes - line);
if (nl)
count = nl - (disasm + line);
if (count) {
util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
}
line += count + 1;
}
util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
}
if (file) {
fprintf(file, "Shader %s disassembly:\n", name);
fprintf(file, "%*s", (int)nbytes, disasm);
}
}
static void si_shader_dump_disassembly(struct si_screen *screen,
const struct si_shader_binary *binary,
gl_shader_stage stage, unsigned wave_size,
struct util_debug_callback *debug, const char *name,
FILE *file)
{
if (binary->type == SI_SHADER_BINARY_RAW) {
print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
return;
}
struct ac_rtld_binary rtld_binary;
if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
.info = &screen->info,
.shader_type = stage,
.wave_size = wave_size,
.num_parts = 1,
.elf_ptrs = &binary->code_buffer,
.elf_sizes = &binary->code_size}))
return;
const char *disasm;
size_t nbytes;
if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
goto out;
if (nbytes > INT_MAX)
goto out;
print_disassembly(disasm, nbytes, name, file, debug);
out:
ac_rtld_close(&rtld_binary);
}
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
struct si_screen *sscreen = shader->selector->screen;
struct ac_shader_config *conf = &shader->config;
unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
unsigned lds_per_wave = 0;
unsigned max_simd_waves;
max_simd_waves = sscreen->info.max_waves_per_simd;
/* Compute LDS usage for PS. */
switch (shader->selector->stage) {
case MESA_SHADER_FRAGMENT:
/* The minimum usage per wave is (num_inputs * 48). The maximum
* usage is (num_inputs * 48 * 16).
* We can get anything in between and it varies between waves.
*
* The 48 bytes per input for a single primitive is equal to
* 4 bytes/component * 4 components/input * 3 points.
*
* Other stages don't know the size at compile time or don't
* allocate LDS per wave, but instead they do it per thread group.
*/
lds_per_wave = conf->lds_size * lds_increment +
align(shader->info.num_ps_inputs * 48, lds_increment);
break;
case MESA_SHADER_COMPUTE: {
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
lds_per_wave = (conf->lds_size * lds_increment) /
DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
}
break;
default:;
}
/* Compute the per-SIMD wave counts. */
if (conf->num_sgprs) {
max_simd_waves =
MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
}
if (conf->num_vgprs) {
/* GFX 10.3 internally:
* - aligns VGPRS to 16 for Wave32 and 8 for Wave64
* - aligns LDS to 1024
*
* For shader-db stats, set num_vgprs that the hw actually uses.
*/
unsigned num_vgprs = conf->num_vgprs;
if (sscreen->info.gfx_level >= GFX10_3) {
unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
} else {
num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
}
/* Always print wave limits as Wave64, so that we can compare
* Wave32 and Wave64 with shader-db fairly. */
unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
}
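/* Illustrative numbers only: with 1024 physical Wave64 VGPRs per SIMD and num_vgprs rounded up
* to 48, the VGPR limit alone allows 1024 / 48 = 21 waves; the result is further clamped by the
* SGPR limit above, the LDS limit below and max_waves_per_simd.
*/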
unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
if (lds_per_wave)
max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
shader->info.max_simd_waves = max_simd_waves;
}
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
struct util_debug_callback *debug)
{
const struct ac_shader_config *conf = &shader->config;
static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
if (screen->options.debug_disassembly)
si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
shader->wave_size, debug, "main", NULL);
unsigned num_ls_outputs = 0;
unsigned num_hs_outputs = 0;
unsigned num_es_outputs = 0;
unsigned num_gs_outputs = 0;
unsigned num_vs_outputs = 0;
unsigned num_ps_outputs = 0;
if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
/* This doesn't include pos exports because only param exports are interesting
* for performance and can be optimized.
*/
if (shader->key.ge.as_ls)
num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
num_hs_outputs = shader->selector->info.tess_io_info.highest_remapped_vram_output;
else if (shader->key.ge.as_es)
num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
else if (shader->gs_copy_shader)
num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
num_gs_outputs = shader->info.nr_param_exports;
else if (shader->selector->stage == MESA_SHADER_VERTEX ||
shader->selector->stage == MESA_SHADER_TESS_EVAL)
num_vs_outputs = shader->info.nr_param_exports;
else
unreachable("invalid shader key");
} else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
(shader->info.writes_z ||
shader->info.writes_stencil ||
shader->info.writes_sample_mask);
}
util_debug_message(debug, SHADER_INFO,
"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
"LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
"Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
"HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
"InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
num_ls_outputs, num_hs_outputs,
shader->selector->info.tess_io_info.highest_remapped_vram_patch_output,
num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
shader->selector->info.base.num_inlinable_uniforms,
shader->selector->info.has_divergent_loop,
stages[shader->selector->stage], shader->wave_size);
}
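/* Both a stage flag and a dump-type flag must be set for a dump to happen, e.g. (assuming the
* usual AMD_DEBUG flag spellings) AMD_DEBUG=vs,asm dumps only the final ISA of vertex shaders.
*/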
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
enum si_shader_dump_type dump_type)
{
static uint64_t filter[] = {
[SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
[SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
[SI_DUMP_NIR] = DBG(NIR),
[SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
[SI_DUMP_LLVM_IR] = DBG(LLVM),
[SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
[SI_DUMP_ACO_IR] = DBG(ACO),
[SI_DUMP_ASM] = DBG(ASM),
[SI_DUMP_STATS] = DBG(STATS),
[SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
};
assert(dump_type < ARRAY_SIZE(filter));
return sscreen->debug_flags & (1 << stage) &&
sscreen->debug_flags & filter[dump_type];
}
static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
bool check_debug_option)
{
const struct ac_shader_config *conf = &shader->config;
if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
fprintf(file,
"*** SHADER CONFIG ***\n"
"SPI_PS_INPUT_ADDR = 0x%04x\n"
"SPI_PS_INPUT_ENA = 0x%04x\n",
conf->spi_ps_input_addr, conf->spi_ps_input_ena);
}
fprintf(file,
"*** SHADER STATS ***\n"
"SGPRS: %d\n"
"VGPRS: %d\n"
"Spilled SGPRs: %d\n"
"Spilled VGPRs: %d\n"
"Private memory VGPRs: %d\n"
"Code Size: %d bytes\n"
"LDS: %d bytes\n"
"Scratch: %d bytes per wave\n"
"Max Waves: %d\n"
"********************\n\n\n",
conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
}
const char *si_get_shader_name(const struct si_shader *shader)
{
switch (shader->selector->stage) {
case MESA_SHADER_VERTEX:
if (shader->key.ge.as_es)
return "Vertex Shader as ES";
else if (shader->key.ge.as_ls)
return "Vertex Shader as LS";
else if (shader->key.ge.as_ngg)
return "Vertex Shader as ESGS";
else
return "Vertex Shader as VS";
case MESA_SHADER_TESS_CTRL:
return "Tessellation Control Shader";
case MESA_SHADER_TESS_EVAL:
if (shader->key.ge.as_es)
return "Tessellation Evaluation Shader as ES";
else if (shader->key.ge.as_ngg)
return "Tessellation Evaluation Shader as ESGS";
else
return "Tessellation Evaluation Shader as VS";
case MESA_SHADER_GEOMETRY:
if (shader->is_gs_copy_shader)
return "GS Copy Shader as VS";
else
return "Geometry Shader";
case MESA_SHADER_FRAGMENT:
return "Pixel Shader";
case MESA_SHADER_COMPUTE:
return "Compute Shader";
default:
return "Unknown Shader";
}
}
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
struct util_debug_callback *debug, FILE *file, bool check_debug_option)
{
gl_shader_stage stage = shader->selector->stage;
if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
si_dump_shader_key(shader, file);
if (!check_debug_option && shader->binary.llvm_ir_string) {
/* This is only used with ddebug. */
if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
}
fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
fprintf(file, "%s\n", shader->binary.llvm_ir_string);
}
if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
fprintf(file, "\n%s:\n", si_get_shader_name(shader));
if (shader->prolog)
si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
"prolog", file);
if (shader->previous_stage)
si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
shader->wave_size, debug, "previous stage", file);
si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
file);
if (shader->epilog)
si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
"epilog", file);
fprintf(file, "\n");
si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
}
static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
{
fprintf(f, " mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
fprintf(f, " mono.instance_divisor_is_fetched = %u\n",
key->ge.mono.instance_divisor_is_fetched);
fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
fprintf(f, " mono.vs.fix_fetch = {");
for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
if (i)
fprintf(f, ", ");
if (!fix.bits)
fprintf(f, "0");
else
fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
fix.u.format);
}
fprintf(f, "}\n");
}
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
const union si_shader_key *key = &shader->key;
gl_shader_stage stage = shader->selector->stage;
fprintf(f, "SHADER KEY\n");
fprintf(f, " source_blake3 = {");
_mesa_blake3_print(f, shader->selector->info.base.source_blake3);
fprintf(f, "}\n");
switch (stage) {
case MESA_SHADER_VERTEX:
si_dump_shader_key_vs(key, f);
fprintf(f, " as_es = %u\n", key->ge.as_es);
fprintf(f, " as_ls = %u\n", key->ge.as_ls);
fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
break;
case MESA_SHADER_TESS_CTRL:
if (shader->selector->screen->info.gfx_level >= GFX9)
si_dump_shader_key_vs(key, f);
fprintf(f, " opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
fprintf(f, " opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
break;
case MESA_SHADER_TESS_EVAL:
fprintf(f, " as_es = %u\n", key->ge.as_es);
fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
break;
case MESA_SHADER_GEOMETRY:
if (shader->is_gs_copy_shader)
break;
if (shader->selector->screen->info.gfx_level >= GFX9 &&
key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
si_dump_shader_key_vs(key, f);
fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
break;
case MESA_SHADER_COMPUTE:
break;
case MESA_SHADER_FRAGMENT:
fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
fprintf(f, " prolog.force_persp_sample_interp = %u\n",
key->ps.part.prolog.force_persp_sample_interp);
fprintf(f, " prolog.force_linear_sample_interp = %u\n",
key->ps.part.prolog.force_linear_sample_interp);
fprintf(f, " prolog.force_persp_center_interp = %u\n",
key->ps.part.prolog.force_persp_center_interp);
fprintf(f, " prolog.force_linear_center_interp = %u\n",
key->ps.part.prolog.force_linear_center_interp);
fprintf(f, " prolog.bc_optimize_for_persp = %u\n",
key->ps.part.prolog.bc_optimize_for_persp);
fprintf(f, " prolog.bc_optimize_for_linear = %u\n",
key->ps.part.prolog.bc_optimize_for_linear);
fprintf(f, " prolog.samplemask_log_ps_iter = %u\n",
key->ps.part.prolog.samplemask_log_ps_iter);
fprintf(f, " prolog.get_frag_coord_from_pixel_coord = %u\n",
key->ps.part.prolog.get_frag_coord_from_pixel_coord);
fprintf(f, " prolog.force_samplemask_to_helper_invocation = %u\n",
key->ps.part.prolog.force_samplemask_to_helper_invocation);
fprintf(f, " epilog.spi_shader_col_format = 0x%x\n",
key->ps.part.epilog.spi_shader_col_format);
fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
fprintf(f, " epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
fprintf(f, " epilog.kill_z = %u\n", key->ps.part.epilog.kill_z);
fprintf(f, " epilog.kill_stencil = %u\n", key->ps.part.epilog.kill_stencil);
fprintf(f, " epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
key->ps.mono.interpolate_at_sample_force_center);
fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
break;
default:
assert(0);
}
if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
stage == MESA_SHADER_VERTEX) &&
!key->ge.as_es && !key->ge.as_ls) {
fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
fprintf(f, " opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize);
fprintf(f, " opt.kill_layer = %u\n", key->ge.opt.kill_layer);
fprintf(f, " opt.remove_streamout = %u\n", key->ge.opt.remove_streamout);
fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
fprintf(f, " opt.ngg_vs_streamout_num_verts_per_prim = %u\n",
key->ge.opt.ngg_vs_streamout_num_verts_per_prim);
}
if (stage <= MESA_SHADER_GEOMETRY)
fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
else
fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);
if (stage <= MESA_SHADER_GEOMETRY) {
if (key->ge.opt.inline_uniforms) {
fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
key->ge.opt.inline_uniforms,
key->ge.opt.inlined_uniform_values[0],
key->ge.opt.inlined_uniform_values[1],
key->ge.opt.inlined_uniform_values[2],
key->ge.opt.inlined_uniform_values[3]);
} else {
fprintf(f, " opt.inline_uniforms = 0\n");
}
} else {
if (key->ps.opt.inline_uniforms) {
fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
key->ps.opt.inline_uniforms,
key->ps.opt.inlined_uniform_values[0],
key->ps.opt.inlined_uniform_values[1],
key->ps.opt.inlined_uniform_values[2],
key->ps.opt.inlined_uniform_values[3]);
} else {
fprintf(f, " opt.inline_uniforms = 0\n");
}
}
}
unsigned si_map_io_driver_location(unsigned semantic)
{
if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
return ac_shader_io_get_unique_index_patch(semantic);
return si_shader_io_get_unique_index(semantic);
}
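/* Lower I/O between geometry pipeline stages to LDS or ring-buffer memory (LS/ES outputs,
* HS inputs/outputs, TES inputs, GS inputs), depending on the stage and shader key.
* Returns true if anything was lowered.
*/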
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir)
{
struct si_shader_selector *sel = shader->selector;
struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
const union si_shader_key *key = &shader->key;
const bool is_gfx9_mono_tcs = shader->is_monolithic &&
next_sel->stage == MESA_SHADER_TESS_CTRL &&
sel->screen->info.gfx_level >= GFX9;
if (nir->info.stage == MESA_SHADER_VERTEX) {
if (key->ge.as_ls) {
NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level,
key->ge.opt.same_patch_vertices,
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
return true;
} else if (key->ge.as_es) {
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
return true;
}
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);
/* Used by hs_emit_write_tess_factors() for monolithic shaders. */
if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;
nir_tcs_info tcs_info;
nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode,
nir->info.tess.spacing);
ac_nir_tess_io_info tess_io_info;
ac_nir_get_tess_io_info(nir, &tcs_info, ~0ull, ~0, si_map_io_driver_location, false,
&tess_io_info);
NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, &tcs_info, &tess_io_info, si_map_io_driver_location,
sel->screen->info.gfx_level, shader->wave_size);
return true;
} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);
if (key->ge.as_es) {
NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
}
return true;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
return true;
}
return false;
}
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir,
struct si_temp_shader_variant_info *temp_info)
{
struct si_shader_selector *sel = shader->selector;
const union si_shader_key *key = &shader->key;
assert(key->ge.as_ngg);
uint8_t clip_cull_dist_mask =
(sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
sel->info.culldist_mask;
ac_nir_lower_ngg_options options = {
.hw_info = &sel->screen->info,
.max_workgroup_size = si_get_max_workgroup_size(shader),
.wave_size = shader->wave_size,
.can_cull = si_shader_culling_enabled(shader),
.disable_streamout = !shader->info.num_streamout_vec4s,
.vs_output_param_offset = temp_info->vs_output_param_offset,
.has_param_exports = shader->info.nr_param_exports,
.export_clipdist_mask = clip_cull_dist_mask,
.kill_pointsize = key->ge.opt.kill_pointsize,
.kill_layer = key->ge.opt.kill_layer,
.force_vrs = sel->screen->options.vrs2x2,
.use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
.skip_viewport_state_culling = sel->info.writes_viewport_index,
.use_point_tri_intersection = sel->screen->info.num_cu / sel->screen->info.num_se >= 12,
};
if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
/* Per-instance inputs, used to remove the instance ID load after culling. */
unsigned instance_rate_inputs = 0;
if (nir->info.stage == MESA_SHADER_VERTEX) {
instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
key->ge.mono.instance_divisor_is_fetched;
/* Manually mark the instance ID used, so the shader can repack it. */
if (instance_rate_inputs)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
} else {
/* Manually mark the primitive ID used, so the shader can repack it. */
if (key->ge.mono.u.vs_export_prim_id)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
}
unsigned clip_plane_enable =
SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
unsigned num_vertices = si_get_num_vertices_per_output_prim(shader);
options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
options.early_prim_export = gfx10_ngg_export_prim_early(shader);
options.passthrough = gfx10_is_ngg_passthrough(shader);
options.use_edgeflags = gfx10_has_variable_edgeflags(shader);
options.has_gen_prim_query = options.has_xfb_prim_query =
sel->screen->info.gfx_level >= GFX11 && !nir->info.vs.blit_sgprs_amd;
options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
options.instance_rate_inputs = instance_rate_inputs;
options.cull_clipdist_mask = clip_plane_enable;
NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size,
&shader->info.ngg_lds_scratch_size);
} else {
assert(nir->info.stage == MESA_SHADER_GEOMETRY);
options.has_gen_prim_query = options.has_xfb_prim_query =
sel->screen->info.gfx_level >= GFX11;
options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
options.has_gs_primitives_query = true;
/* For monolithic ES/GS: add a vscnt wait when the GS exports pos0. */
if (key->ge.part.gs.es)
nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;
NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options, &shader->info.ngg_lds_vertex_size,
&shader->info.ngg_lds_scratch_size);
}
/* The NGG lowering may have generated vector output stores; scalarize them. */
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
{
struct pipe_screen *screen = &sel->screen->b;
const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->stage);
struct blob_reader blob_reader;
blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
return nir_deserialize(NULL, options, &blob_reader);
}
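/* Scan the output stores and assign a param export index to each FS-visible varying that doesn't
* have one yet. Duplicated outputs are then redirected to their source slot via slot_remap, and
* nir->info.outputs_written is refreshed afterwards.
*/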
static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
struct si_temp_shader_variant_info *temp_info)
{
struct si_shader_selector *sel = shader->selector;
struct si_shader_variant_info *info = &shader->info;
uint64_t outputs_written = 0;
uint32_t outputs_written_16bit = 0;
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
assert(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_output)
continue;
/* No indirect indexing allowed. */
ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
assert(intr->num_components == 1); /* only scalar stores expected */
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location >= VARYING_SLOT_VAR0_16BIT)
outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
else
outputs_written |= BITFIELD64_BIT(sem.location);
/* Assign the param index if it's unassigned. */
if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
(sem.gs_streams & 0x3) == 0 &&
temp_info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_UNDEFINED) {
/* The semantic and the base should be the same as in si_shader_info. */
assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
/* It must not be remapped (duplicated). */
assert(slot_remap[sem.location] == -1);
temp_info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
}
}
}
/* Duplicated outputs are redirected here. */
for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
if (slot_remap[i] >= 0)
temp_info->vs_output_param_offset[i] = temp_info->vs_output_param_offset[slot_remap[i]];
}
if (shader->key.ge.mono.u.vs_export_prim_id) {
temp_info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
}
/* Update the outputs-written info because some outputs may have been removed earlier. */
nir->info.outputs_written = outputs_written;
nir->info.outputs_written_16bit = outputs_written_16bit;
}
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
struct si_temp_shader_variant_info *temp_info)
{
/* Initialize this first. */
shader->info.nr_param_exports = 0;
STATIC_ASSERT(sizeof(temp_info->vs_output_param_offset[0]) == 1);
memset(temp_info->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(temp_info->vs_output_param_offset));
/* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
* mapped to multiple fragment shader inputs.
*/
int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
/* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
/* TODO: This doesn't affect GS. */
NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
temp_info->vs_output_param_offset);
/* Assign the non-constant outputs. */
/* TODO: Use this for the GS copy shader too. */
si_nir_assign_param_offsets(nir, shader, slot_remap, temp_info);
/* Any unwritten output will default to (0,0,0,0). */
for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
if (temp_info->vs_output_param_offset[i] == AC_EXP_PARAM_UNDEFINED)
temp_info->vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000;
}
}
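/* Worked example (assuming no outputs are killed by the key): a VS writing gl_Position,
* gl_PointSize and 6 clip distances exports 1 (pos) + 1 (misc vec4) + 2 (clip-distance vec4s)
* = 4 position exports.
*/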
static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
const union si_shader_key *key)
{
const struct si_shader_info *info = &sel->info;
/* Must have a position export. */
unsigned nr_pos_exports = 1;
if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
(info->writes_edgeflag && !key->ge.as_ngg) ||
(info->writes_layer && !key->ge.opt.kill_layer) ||
info->writes_viewport_index || sel->screen->options.vrs2x2) {
nr_pos_exports++;
}
unsigned clipdist_mask =
(info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
for (int i = 0; i < 2; i++) {
if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
nr_pos_exports++;
}
return nr_pos_exports;
}
bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
{
return gl_shader_stage_is_compute(shader->info.stage) &&
shader->info.shared_size > 0 && sscreen->options.clear_lds;
}
static void
si_init_gs_output_info(struct si_shader_info *info, struct si_temp_shader_variant_info *out_info)
{
for (int i = 0; i < info->num_outputs; i++) {
unsigned slot = info->output_semantic[i];
if (slot < VARYING_SLOT_VAR0_16BIT) {
out_info->gs_streams[slot] = info->output_streams[i];
out_info->gs_out_usage_mask[slot] = info->output_usagemask[i];
} else {
unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
/* TODO: 16-bit outputs need separate fields for the lo and hi parts. */
out_info->gs_streams_16bit_lo[index] = info->output_streams[i];
out_info->gs_streams_16bit_hi[index] = info->output_streams[i];
out_info->gs_out_usage_mask_16bit_lo[index] = info->output_usagemask[i];
out_info->gs_out_usage_mask_16bit_hi[index] = info->output_usagemask[i];
}
}
ac_nir_gs_output_info *ac_info = &out_info->gs_out_info;
ac_info->streams = out_info->gs_streams;
ac_info->streams_16bit_lo = out_info->gs_streams_16bit_lo;
ac_info->streams_16bit_hi = out_info->gs_streams_16bit_hi;
ac_info->sysval_mask = out_info->gs_out_usage_mask;
ac_info->varying_mask = out_info->gs_out_usage_mask;
ac_info->varying_mask_16bit_lo = out_info->gs_out_usage_mask_16bit_lo;
ac_info->varying_mask_16bit_hi = out_info->gs_out_usage_mask_16bit_hi;
/* TODO: construct the per-component 16-bit store types for each slot. */
ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
}
/* Run passes that eliminate code and affect shader_info. These should be run before linking
* and shader_info gathering. Lowering passes can be run here too, but only if they lead to
* better code or lower undesirable representations (like derefs). Lowering passes that prevent
* linking optimizations or destroy shader_info shouldn't be run here.
*/
static void run_pre_link_optimization_passes(struct si_nir_shader_ctx *ctx)
{
struct si_shader *shader = ctx->shader;
struct si_shader_selector *sel = shader->selector;
const union si_shader_key *key = &shader->key;
nir_shader *nir = ctx->nir;
bool progress = false;
/* Kill outputs according to the shader key. */
if (nir->info.stage <= MESA_SHADER_GEOMETRY)
NIR_PASS(progress, nir, si_nir_kill_outputs, key);
bool inline_uniforms = false;
uint32_t *inlined_uniform_values;
si_get_inline_uniform_state((union si_shader_key*)key, nir->info.stage,
&inline_uniforms, &inlined_uniform_values);
if (inline_uniforms) {
/* Most places use shader information from the default variant, not
* the optimized variant. These are the things that the driver looks at
* in optimized variants and the list of things that we need to do.
*
* The driver takes into account these things if they suddenly disappear
* from the shader code:
* - Register usage and code size decrease (obvious)
* - Eliminated PS system values are disabled
* - VS/TES/GS param exports are eliminated if they are undef.
* The param space for eliminated outputs is also not allocated.
* - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
* - TCS output stores are eliminated
* - Eliminated PS inputs are removed from PS.NUM_INTERP.
*
* TODO: These are things the driver ignores in the final shader code
* and relies on the default shader info.
* - System values in VS, TCS, TES, GS are not eliminated
* - uses_discard - if it changed to false
* - writes_memory - if it changed to false
* - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
* eliminated
* - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
* GS outputs are eliminated except for the temporary LDS.
* Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
* on current states, so we don't care about the shader code.
*
* TODO: Merged shaders don't inline uniforms for the first stage.
* VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
* (key == NULL for the first stage here)
*
* TODO: Compute shaders don't support inlinable uniforms, because they
* don't have shader variants.
*
* TODO: The driver uses a linear search to find a shader variant. This
* can be really slow if we get too many variants due to uniform inlining.
*/
NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
progress = true;
}
NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
/* This uses the prolog/epilog keys, so only monolithic shaders can call this. */
if (shader->is_monolithic) {
/* This lowers load_color intrinsics to COLn/BFCn input loads and two-side color
* selection.
*/
if (sel->info.colors_read)
NIR_PASS(progress, nir, si_nir_lower_ps_color_inputs, &shader->key, &sel->info);
/* This adds discard and barycentrics. */
if (key->ps.mono.point_smoothing)
NIR_PASS(progress, nir, nir_lower_point_smooth, true);
/* This eliminates system values and unused shader output components. */
ac_nir_lower_ps_early_options early_options = {
.force_center_interp_no_msaa = key->ps.part.prolog.force_persp_center_interp ||
key->ps.part.prolog.force_linear_center_interp ||
key->ps.part.prolog.force_samplemask_to_helper_invocation ||
key->ps.mono.interpolate_at_sample_force_center,
.load_sample_positions_always_loads_current_ones = true,
.force_front_face = key->ps.opt.force_front_face_input,
.optimize_frag_coord = true,
.frag_coord_is_center = true,
/* This does a lot of things. See the description in ac_nir_lower_ps_early_options. */
.ps_iter_samples = key->ps.part.prolog.samplemask_log_ps_iter ?
(1 << key->ps.part.prolog.samplemask_log_ps_iter) :
(key->ps.part.prolog.force_persp_sample_interp ||
key->ps.part.prolog.force_linear_sample_interp ? 2 :
(key->ps.part.prolog.get_frag_coord_from_pixel_coord ? 1 : 0)),
.fbfetch_is_1D = key->ps.mono.fbfetch_is_1D,
.fbfetch_layered = key->ps.mono.fbfetch_layered,
.fbfetch_msaa = key->ps.mono.fbfetch_msaa,
.fbfetch_apply_fmask = sel->screen->info.gfx_level < GFX11 &&
!(sel->screen->debug_flags & DBG(NO_FMASK)),
.clamp_color = key->ps.part.epilog.clamp_color,
.alpha_test_alpha_to_one = key->ps.part.epilog.alpha_to_one,
.alpha_func = key->ps.part.epilog.alpha_func,
.keep_alpha_for_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
.spi_shader_col_format_hint = key->ps.part.epilog.spi_shader_col_format,
.kill_z = key->ps.part.epilog.kill_z,
.kill_stencil = key->ps.part.epilog.kill_stencil,
.kill_samplemask = key->ps.part.epilog.kill_samplemask,
};
NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);
/* This adds gl_SampleMaskIn. It must be after ac_nir_lower_ps_early that lowers
* sample_mask_in to load_helper_invocation because we only want to do that for user
* shaders while keeping the real sample mask for smoothing, which is produced using
* MSAA overrasterization over a single-sample color buffer.
*/
if (key->ps.mono.poly_line_smoothing)
NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
/* This adds discard. */
if (key->ps.part.prolog.poly_stipple)
NIR_PASS(progress, nir, si_nir_lower_polygon_stipple);
} else {
ac_nir_lower_ps_early_options early_options = {
.optimize_frag_coord = true,
.frag_coord_is_center = true,
.alpha_func = COMPARE_FUNC_ALWAYS,
.spi_shader_col_format_hint = ~0,
};
NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);
}
}
if (progress) {
si_nir_opts(sel->screen, nir, true);
progress = false;
}
/* Remove dead temps before we lower indirect indexing. */
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
/* Lower indirect indexing last.
*
* Shader variant optimizations (such as uniform inlining, replacing barycentrics, and IO
* elimination) can help eliminate indirect indexing, so this should be done after that.
*
* Note that the code can still contain tautologies such as "array1[i] == array2[i]" when
* array1 and array2 have provably equal values (NIR doesn't have a pass that can do that),
* which NIR can optimize only after we lower indirect indexing, so it's important that
* we lower it before we gather shader_info.
*/
/* Lower indirect indexing of large constant arrays to the load_constant intrinsic, which
* will be turned into PC-relative loads from a data section next to the shader.
*/
NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
/* Lower all other indirect indexing to if-else ladders or scratch. */
progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
if (progress)
si_nir_opts(shader->selector->screen, nir, false);
}
/* Late optimization passes and lowering passes. The majority of lowering passes are here.
* These passes should have no impact on linking optimizations and shouldn't affect shader_info
* (those should be run before this) because any changes in shader_info won't be reflected
* in hw registers from now on.
*/
static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *ctx)
{
struct si_shader *shader = ctx->shader;
struct si_shader_selector *sel = shader->selector;
const union si_shader_key *key = &shader->key;
nir_shader *nir = ctx->nir;
bool progress = false;
si_init_shader_args(shader, &ctx->args, &nir->info);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
NIR_PASS(progress, nir, nir_lower_fragcoord_wtrans);
NIR_PASS(progress, nir, ac_nir_lower_tex,
&(ac_nir_lower_tex_options){
.gfx_level = sel->screen->info.gfx_level,
.lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
});
if (nir->info.uses_resource_info_query)
NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
/* This must be before si_nir_lower_resource. */
if (!sel->screen->info.has_image_opcodes)
NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
/* LLVM does not handle this lowering well; the LLVM backend handles non-uniform access with a waterfall loop instead. */
if (nir->info.use_aco_amd && ctx->temp_info.has_non_uniform_tex_access) {
nir_lower_non_uniform_access_options options = {
.types = nir_lower_non_uniform_texture_access,
};
NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
}
/* Legacy GS is not the last VGT stage because there is also the GS copy shader. */
bool is_last_vgt_stage =
(nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL ||
(nir->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
!shader->key.ge.as_ls && !shader->key.ge.as_es;
if (nir->info.stage == MESA_SHADER_VERTEX)
NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, &ctx->args);
progress |= si_lower_io_to_mem(shader, nir);
if (is_last_vgt_stage) {
/* Assign param export indices. */
si_assign_param_offsets(nir, shader, &ctx->temp_info);
/* Assign the number of position exports. */
shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);
if (key->ge.as_ngg) {
/* Lower last VGT NGG shader stage. */
si_lower_ngg(shader, nir, &ctx->temp_info);
} else if (nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
/* Lower the last VGT non-NGG VS/TES shader stage. */
unsigned clip_cull_mask =
(sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
sel->info.culldist_mask;
NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
sel->screen->info.gfx_level,
clip_cull_mask,
false, false,
ctx->temp_info.vs_output_param_offset,
shader->info.nr_param_exports,
shader->key.ge.mono.u.vs_export_prim_id,
!shader->info.num_streamout_vec4s,
key->ge.opt.kill_pointsize,
key->ge.opt.kill_layer,
sel->screen->options.vrs2x2);
}
progress = true;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
si_init_gs_output_info(&sel->info, &ctx->temp_info);
NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg,
&ctx->temp_info.gs_out_info);
progress = true;
} else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
ac_nir_lower_ps_late_options late_options = {
.gfx_level = sel->screen->info.gfx_level,
.family = sel->screen->info.family,
.use_aco = nir->info.use_aco_amd,
.bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
.bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
.uses_discard = shader->info.uses_discard,
.alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
.dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
.spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
.color_is_int8 = key->ps.part.epilog.color_is_int8,
.color_is_int10 = key->ps.part.epilog.color_is_int10,
.alpha_to_one = key->ps.part.epilog.alpha_to_one,
};
NIR_PASS(progress, nir, ac_nir_lower_ps_late, &late_options);
}
assert(shader->wave_size == 32 || shader->wave_size == 64);
NIR_PASS(progress, nir, nir_lower_subgroups,
&(struct nir_lower_subgroups_options) {
.subgroup_size = shader->wave_size,
.ballot_bit_size = shader->wave_size,
.ballot_components = 1,
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_relative_shuffle = true,
.lower_rotate_to_shuffle = !nir->info.use_aco_amd,
.lower_shuffle_to_32bit = true,
.lower_vote_feq = true,
.lower_vote_ieq = true,
.lower_vote_bool_eq = true,
.lower_quad_broadcast_dynamic = true,
.lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
.lower_shuffle_to_swizzle_amd = true,
.lower_ballot_bit_count_to_mbcnt_amd = true,
.lower_boolean_reduce = nir->info.use_aco_amd,
.lower_boolean_shuffle = true,
});
NIR_PASS(progress, nir, nir_lower_pack);
NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
NIR_PASS(progress, nir, nir_lower_idiv,
&(nir_lower_idiv_options){
.allow_fp16 = sel->screen->info.gfx_level >= GFX9,
});
if (si_should_clear_lds(sel->screen, nir)) {
const unsigned chunk_size = 16; /* max single store size */
const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
}
nir_divergence_analysis(nir); /* required by ac_nir_flag_smem_for_loads */
/* The SMEM flags set by this pass are required by ac_nir_scalarize_overfetching_loads_callback below. */
NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
!sel->info.base.use_aco_amd, false);
/* Scalarize overfetching loads, so that we don't load more components than necessary.
* Adjacent loads will be re-vectorized with a conservative overfetching limit.
*/
NIR_PASS(progress, nir, nir_lower_io_to_scalar,
nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global,
ac_nir_scalarize_overfetching_loads_callback, &sel->screen->info.gfx_level);
/* Scalarize shared memory ops to get ds_load_2addr/ds_store_2addr more often.
* If we don't do that, we might get pairs of ds_load_2addr + ds_load for vec3 loads, etc.
*/
NIR_PASS(progress, nir, nir_lower_io_to_scalar, nir_var_mem_shared, NULL, NULL);
NIR_PASS(progress, nir, si_nir_lower_resource, shader, &ctx->args);
/* This must be done before load/store vectorization to lower 16-bit SMEM loads to 32 bits,
* so that they can be vectorized as 32-bit loads. 16-bit loads are never vectorized.
*/
NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
sel->screen->info.gfx_level, !nir->info.use_aco_amd);
/* Load/store vectorization requires that offset computations are optimized. */
if (progress) {
si_nir_opts(sel->screen, nir, false);
progress = false;
}
NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
&(nir_load_store_vectorize_options){
.modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
nir_var_shader_temp,
.callback = ac_nir_mem_vectorize_callback,
.cb_data = &(struct ac_nir_config){sel->screen->info.gfx_level, sel->info.base.use_aco_amd},
/* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
* the final offset is not.
*/
.has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
});
/* This must be done again if 8-bit or 16-bit buffer stores were vectorized. */
NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
sel->screen->info.gfx_level, !nir->info.use_aco_amd);
if (nir->info.stage == MESA_SHADER_KERNEL)
NIR_PASS(progress, nir, ac_nir_lower_global_access);
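/* Narrow ALU operations to smaller bit sizes where beneficial. The callback consults
* divergence info on GFX8+ (which has 16-bit VALU instructions), presumably so that
* uniform values, which live in 32-bit SGPRs, keep their original size.
*/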
if (ac_nir_might_lower_bit_size(nir)) {
if (sel->screen->info.gfx_level >= GFX8)
nir_divergence_analysis(nir);
NIR_PASS(progress, nir, nir_lower_bit_size, ac_nir_lower_bit_size_callback,
&sel->screen->info.gfx_level);
}
/* This must be after lowering resources to descriptor loads and before lowering intrinsics
* to args and lowering int64.
*/
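/* When all lanes of a wave update the same address, one atomic per wave plus a subgroup
* reduction/scan is sufficient. This is only done on the ACO path.
*/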
if (nir->info.use_aco_amd)
progress |= ac_nir_optimize_uniform_atomics(nir);
NIR_PASS(progress, nir, nir_lower_int64);
NIR_PASS(progress, nir, si_nir_lower_abi, shader, &ctx->args);
NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
sel->screen->info.has_ls_vgpr_init_bug,
si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
shader->wave_size, si_get_max_workgroup_size(shader), &ctx->args.ac);
/* LLVM keeps non-uniform samplers as indices, so this can't be done in NIR for LLVM.
* It must be done after si_nir_lower_resource().
*/
if (nir->info.use_aco_amd && ctx->temp_info.has_shadow_comparison &&
sel->screen->info.gfx_level >= GFX8 && sel->screen->info.gfx_level <= GFX9) {
NIR_PASS(progress, nir, si_nir_clamp_shadow_comparison_value);
}
if (progress) {
si_nir_opts(sel->screen, nir, false);
progress = false;
}
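/* Fold constant additions into intrinsic offsets: disabled for load_uniform
* (uniform_max = 0), unrestricted for buffer and shared memory accesses.
*/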
static const nir_opt_offsets_options offset_options = {
.uniform_max = 0,
.buffer_max = ~0,
.shared_max = ~0,
};
NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
si_nir_late_opts(nir);
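/* Shorten live ranges: nir_opt_sink moves definitions down toward their uses, and
* nir_opt_move places them right before their first use within a block.
*/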
NIR_PASS(progress, nir, nir_opt_sink,
nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
nir_move_load_ubo | nir_move_load_ssbo);
NIR_PASS(progress, nir, nir_opt_move,
nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
nir_move_load_ubo);
/* Run nir_opt_move again to make sure that comparisons are as close as possible to the first
* use to prevent SCC spilling.
*/
NIR_PASS(progress, nir, nir_opt_move, nir_move_comparisons);
/* This must be done after si_nir_late_opts() because it may generate vector constants. */
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
/* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
* 200 is tuned for Viewperf. It should be done last.
*/
NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
}
static void get_input_nir(struct si_shader *shader, struct si_nir_shader_ctx *ctx)
{
struct si_shader_selector *sel = shader->selector;
ctx->shader = shader;
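/* Use the selector's NIR if it's still around, otherwise deserialize the stored binary.
* Remember whether we own the copy so it can be freed after compilation.
*/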
ctx->free_nir = !sel->nir && sel->nir_binary;
ctx->nir = sel->nir ? sel->nir : (sel->nir_binary ? si_deserialize_shader(sel) : NULL);
assert(ctx->nir);
if (sel->stage <= MESA_SHADER_GEOMETRY)
ctx->nir->info.use_aco_amd = shader->key.ge.use_aco;
assert(ctx->nir->info.use_aco_amd == si_shader_uses_aco(shader));
if (unlikely(should_print_nir(ctx->nir))) {
/* Modify the shader's name so that each variant gets its own name. */
ctx->nir->info.name = ralloc_asprintf(ctx->nir, "%s-%08x", ctx->nir->info.name,
_mesa_hash_data(&shader->key, sizeof(shader->key)));
/* Dummy pass to get the starting point. */
printf("nir_dummy_pass\n");
nir_print_shader(ctx->nir, stdout);
}
}
static void get_prev_stage_input_nir(struct si_shader *shader, struct si_linked_shaders *linked)
{
const union si_shader_key *key = &shader->key;
if (shader->selector->stage == MESA_SHADER_TESS_CTRL) {
linked->producer_shader.selector = key->ge.part.tcs.ls;
linked->producer_shader.key.ge.as_ls = 1;
} else {
linked->producer_shader.selector = key->ge.part.gs.es;
linked->producer_shader.key.ge.as_es = 1;
linked->producer_shader.key.ge.as_ngg = key->ge.as_ngg;
}
linked->producer_shader.key.ge.use_aco = key->ge.use_aco;
linked->producer_shader.next_shader = shader;
linked->producer_shader.key.ge.mono = key->ge.mono;
linked->producer_shader.key.ge.opt = key->ge.opt;
linked->producer_shader.key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
/* kill_outputs was computed based on the second shader's outputs, so we can't use it to
* kill the first shader's outputs.
*/
linked->producer_shader.key.ge.opt.kill_outputs = 0;
linked->producer_shader.is_monolithic = true;
linked->producer_shader.wave_size = shader->wave_size;
get_input_nir(&linked->producer_shader, &linked->producer);
}
static void get_nir_shaders(struct si_shader *shader, struct si_linked_shaders *linked)
{
memset(linked, 0, sizeof(*linked));
get_input_nir(shader, &linked->consumer);
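/* On GFX9+, HS and GS are merged with the previous stage, so a monolithic TCS/GS variant
* also pulls in and compiles its producer (LS or ES) here.
*/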
if (shader->selector->screen->info.gfx_level >= GFX9 && shader->is_monolithic &&
(shader->selector->stage == MESA_SHADER_TESS_CTRL ||
shader->selector->stage == MESA_SHADER_GEOMETRY))
get_prev_stage_input_nir(shader, linked);
for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
if (linked->shader[i].nir)
run_pre_link_optimization_passes(&linked->shader[i]);
}
/* TODO: run linking optimizations here if we have LS+HS or ES+GS */
/* Remove holes after removed PS inputs by renumbering them. Holes can only occur with
* monolithic PS.
*/
if (shader->selector->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic)
NIR_PASS_V(linked->consumer.nir, nir_recompute_io_bases, nir_var_shader_in);
for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
if (linked->shader[i].nir) {
si_get_shader_variant_info(shader, &linked->shader[i].temp_info, linked->shader[i].nir);
run_late_optimization_and_lowering_passes(&linked->shader[i]);
si_get_late_shader_variant_info(shader, &linked->shader[i].args, linked->shader[i].nir);
}
}
}
/* Generate code for the hardware VS shader stage to go with a geometry shader */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
struct ac_llvm_compiler *compiler,
struct si_shader *gs_shader,
struct si_temp_shader_variant_info *temp_info,
nir_shader *gs_nir,
struct util_debug_callback *debug)
{
struct si_shader *shader;
struct si_shader_selector *gs_selector = gs_shader->selector;
struct si_shader_info *gsinfo = &gs_selector->info;
union si_shader_key *gskey = &gs_shader->key;
shader = CALLOC_STRUCT(si_shader);
if (!shader)
return NULL;
/* We can leave the fence as permanently signaled because the GS copy
* shader only becomes visible globally after it has been compiled. */
util_queue_fence_init(&shader->ready);
shader->selector = gs_selector;
shader->is_gs_copy_shader = true;
shader->wave_size = si_determine_wave_size(sscreen, shader);
shader->info.num_streamout_vec4s = gs_shader->info.num_streamout_vec4s;
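/* Start with every output mapped to the constant (0,0,0,0) export; real param export
* slots are assigned in the loop below.
*/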
STATIC_ASSERT(sizeof(temp_info->vs_output_param_offset[0]) == 1);
memset(temp_info->vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
sizeof(temp_info->vs_output_param_offset));
for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
unsigned semantic = gsinfo->output_semantic[i];
/* Skip if no channel writes to stream 0. */
if (!nir_slot_is_varying(semantic, MESA_SHADER_FRAGMENT) ||
(gsinfo->output_streams[i] & 0x03 && /* whether component 0 writes to non-zero stream */
gsinfo->output_streams[i] & 0x0c && /* whether component 1 writes to non-zero stream */
gsinfo->output_streams[i] & 0x30 && /* whether component 2 writes to non-zero stream */
gsinfo->output_streams[i] & 0xc0)) /* whether component 3 writes to non-zero stream */
continue;
temp_info->vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
}
shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);
unsigned clip_cull_mask =
(gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;
nir_shader *nir =
ac_nir_create_gs_copy_shader(gs_nir,
sscreen->info.gfx_level,
clip_cull_mask,
false, false,
temp_info->vs_output_param_offset,
shader->info.nr_param_exports,
!gs_shader->info.num_streamout_vec4s,
gskey->ge.opt.kill_pointsize,
gskey->ge.opt.kill_layer,
sscreen->options.vrs2x2,
&temp_info->gs_out_info);
struct si_linked_shaders linked;
memset(&linked, 0, sizeof(linked));
linked.consumer.nir = nir;
si_init_shader_args(shader, &linked.consumer.args, &gs_nir->info);
NIR_PASS_V(nir, si_nir_lower_abi, shader, &linked.consumer.args);
NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level,
sscreen->info.has_ls_vgpr_init_bug, AC_HW_VERTEX_SHADER, 64, 64,
&linked.consumer.args.ac);
si_nir_opts(gs_selector->screen, nir, false);
NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
fprintf(stderr, "GS Copy Shader:\n");
nir_print_shader(nir, stderr);
}
bool ok =
#if AMD_LLVM_AVAILABLE
!gs_nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
si_aco_compile_shader(shader, &linked, debug);
#if !AMD_LLVM_AVAILABLE
assert(gs_nir->info.use_aco_amd);
#endif
if (ok) {
assert(!shader->config.scratch_bytes_per_wave);
ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
si_shader_dump(sscreen, shader, debug, stderr, true);
}
ralloc_free(nir);
if (!ok) {
FREE(shader);
shader = NULL;
} else {
si_fix_resource_usage(sscreen, shader);
}
return shader;
}
static void
debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
const char *fmt, va_list args)
{
vfprintf(stderr, fmt, args);
fprintf(stderr, "\n");
}
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
bool ret = true;
struct si_shader_selector *sel = shader->selector;
struct si_linked_shaders linked;
get_nir_shaders(shader, &linked);
nir_shader *nir = linked.consumer.nir;
/* Dump NIR before doing NIR->LLVM conversion in case the
* conversion fails. */
if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_NIR)) {
nir_print_shader(nir, stderr);
if (nir->xfb_info)
nir_print_xfb_info(nir->xfb_info, stderr);
}
/* Initialize vs_output_ps_input_cntl to default. */
for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);
/* Set the FP ALU behavior. */
/* By default, we disable denormals for FP32 and enable them for FP16 and FP64
* for performance and correctness reasons. FP32 denormals can't be enabled because
* they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
*
* float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
* can be set simultaneously, which in theory lets us pick either behavior, but in practice
* some choices cause GLCTS failures.
*/
unsigned float_mode = V_00B028_FP_16_64_DENORMS;
if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;
if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;
if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
float_mode &= ~V_00B028_FP_16_64_DENORMS;
assert(nir->info.use_aco_amd == si_shader_uses_aco(shader));
ret =
#if AMD_LLVM_AVAILABLE
!nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
si_aco_compile_shader(shader, &linked, debug);
#if !AMD_LLVM_AVAILABLE
assert(nir->info.use_aco_amd);
#endif
if (!ret)
goto out;
shader->config.float_mode = float_mode;
/* The GS copy shader is compiled next. */
if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
shader->gs_copy_shader =
si_nir_generate_gs_copy_shader(sscreen, compiler, shader, &linked.consumer.temp_info,
nir, debug);
if (!shader->gs_copy_shader) {
fprintf(stderr, "radeonsi: can't create GS copy shader\n");
ret = false;
goto out;
}
}
/* Compute vs_output_ps_input_cntl. */
if ((nir->info.stage == MESA_SHADER_VERTEX ||
nir->info.stage == MESA_SHADER_TESS_EVAL ||
nir->info.stage == MESA_SHADER_GEOMETRY) &&
!shader->key.ge.as_ls && !shader->key.ge.as_es) {
uint8_t *vs_output_param_offset = linked.consumer.temp_info.vs_output_param_offset;
/* We must use the original shader info before the removal of duplicated shader outputs. */
/* VS and TES should also set primitive ID output if it's used. */
unsigned num_outputs_with_prim_id = sel->info.num_outputs +
shader->key.ge.mono.u.vs_export_prim_id;
for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
unsigned semantic = sel->info.output_semantic[i];
unsigned offset = vs_output_param_offset[semantic];
unsigned ps_input_cntl;
if (offset <= AC_EXP_PARAM_OFFSET_31) {
/* The input is loaded from parameter memory. */
ps_input_cntl = S_028644_OFFSET(offset);
} else {
/* The input is a DEFAULT_VAL constant. */
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
/* OFFSET=0x20 means that DEFAULT_VAL is used. */
ps_input_cntl = S_028644_OFFSET(0x20) |
S_028644_DEFAULT_VAL(offset);
}
shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
}
}
/* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
if (gl_shader_stage_is_compute(nir->info.stage)) {
unsigned max_vgprs =
sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
unsigned max_sgprs_per_wave = 128;
unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
unsigned threads_per_tg = si_get_max_workgroup_size(shader);
unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
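/* Split the per-SIMD physical register budget among the waves of one workgroup that
* share a SIMD, and clamp SGPRs to the per-wave addressable limit.
*/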
max_vgprs = max_vgprs / waves_per_simd;
max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
fprintf(stderr,
"LLVM failed to compile a shader correctly: "
"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
/* Just terminate the process, because dependent
* shaders can hang due to bad input data, but use
* the env var to allow shader-db to work.
*/
if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
abort();
}
}
/* Add/remove the scratch offset to/from input SGPRs. */
if (!sel->screen->info.has_scratch_base_registers &&
!si_is_merged_shader(shader)) {
if (nir->info.use_aco_amd) {
/* With ACO, the scratch_offset arg is added explicitly at the beginning.
* If no scratch is used after compilation, reduce the input SGPR count.
*/
if (!shader->config.scratch_bytes_per_wave)
shader->info.num_input_sgprs--;
} else {
/* The scratch_offset arg is added implicitly by LLVM when scratch is used. */
if (shader->config.scratch_bytes_per_wave)
shader->info.num_input_sgprs++;
}
}
/* Calculate the number of fragment input VGPRs. */
if (nir->info.stage == MESA_SHADER_FRAGMENT)
shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config);
si_calculate_max_simd_waves(shader);
if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_STATS)) {
struct util_debug_callback out_stderr = {
.debug_message = debug_message_stderr,
};
si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
} else {
si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
}
out:
for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
if (linked.shader[i].free_nir)
ralloc_free(linked.shader[i].nir);
}
return ret;
}
/**
* Create, compile and return a shader part (prolog or epilog).
*
* \param sscreen screen
* \param list list of shader parts of the same category
* \param stage shader stage
* \param prolog whether the part being requested is a prolog
* \param key shader part key
* \param compiler LLVM compiler
* \param debug debug callback
* \param name shader part name
* \return non-NULL on success
*/
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
const char *name)
{
struct si_shader_part *result;
simple_mtx_lock(&sscreen->shader_parts_mutex);
/* Find existing. */
for (result = *list; result; result = result->next) {
if (memcmp(&result->key, key, sizeof(*key)) == 0) {
simple_mtx_unlock(&sscreen->shader_parts_mutex);
return result;
}
}
/* Compile a new one. */
result = CALLOC_STRUCT(si_shader_part);
result->key = *key;
bool ok =
#if AMD_LLVM_AVAILABLE
!(sscreen->use_aco ||
(stage == MESA_SHADER_FRAGMENT &&
((prolog && key->ps_prolog.use_aco) ||
(!prolog && key->ps_epilog.use_aco)))) ?
si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
#endif
si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
if (ok) {
result->next = *list;
*list = result;
} else {
FREE(result);
result = NULL;
}
simple_mtx_unlock(&sscreen->shader_parts_mutex);
return result;
}
/**
* Select and compile (or reuse) TCS parts (epilog).
*/
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
if (sscreen->info.gfx_level >= GFX9) {
assert(shader->wave_size == 32 || shader->wave_size == 64);
unsigned wave_size_index = shader->wave_size == 64;
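/* The previously compiled LS main part (matching wave size and compiler backend) becomes
* the first half of the merged LS+HS shader.
*/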
shader->previous_stage =
shader->key.ge.part.tcs.ls->main_parts.named.ls[wave_size_index][shader->key.ge.use_aco];
assert(shader->previous_stage->key.ge.use_aco == si_shader_uses_aco(shader));
assert((shader->previous_stage->binary.type == SI_SHADER_BINARY_RAW) == si_shader_uses_aco(shader));
}
return true;
}
/**
* Select and compile (or reuse) GS parts (prolog).
*/
static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
if (sscreen->info.gfx_level >= GFX9) {
if (shader->key.ge.as_ngg) {
assert(shader->wave_size == 32 || shader->wave_size == 64);
unsigned wave_size_index = shader->wave_size == 64;
shader->previous_stage =
shader->key.ge.part.gs.es->main_parts.named.ngg_es[wave_size_index][shader->key.ge.use_aco];
} else {
shader->previous_stage = shader->key.ge.part.gs.es->main_parts.named.es[shader->key.ge.use_aco];
}
assert(shader->previous_stage->key.ge.use_aco == si_shader_uses_aco(shader));
assert((shader->previous_stage->binary.type == SI_SHADER_BINARY_RAW) == si_shader_uses_aco(shader));
}
return true;
}
/**
* Compute the PS prolog key, which contains all the information needed to
* build the PS prolog function, and set related bits in shader->config.
*/
static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
struct si_shader_info *info = &shader->selector->info;
memset(key, 0, sizeof(*key));
key->ps_prolog.states = shader->key.ps.part.prolog;
key->ps_prolog.use_aco = info->base.use_aco_amd;
key->ps_prolog.wave32 = shader->wave_size == 32;
key->ps_prolog.colors_read = shader->info.ps_colors_read;
key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
key->ps_prolog.wqm =
info->base.fs.needs_coarse_quad_helper_invocations &&
(key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
key->ps_prolog.states.force_linear_sample_interp ||
key->ps_prolog.states.force_persp_center_interp ||
key->ps_prolog.states.force_linear_center_interp ||
key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear ||
key->ps_prolog.states.samplemask_log_ps_iter ||
key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
key->ps_prolog.states.force_samplemask_to_helper_invocation);
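/* Pack the POS_{X,Y,Z,W}_FLOAT_ENA bits of SPI_PS_INPUT_ENA into a 4-bit xyzw mask of
* the fragment coordinate components the shader uses.
*/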
key->ps_prolog.fragcoord_usage_mask =
G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_ena) |
(G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_ena) << 1) |
(G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_ena) << 2) |
(G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) << 3);
key->ps_prolog.pixel_center_integer = key->ps_prolog.fragcoord_usage_mask &&
shader->selector->info.base.fs.pixel_center_integer;
if (shader->key.ps.part.prolog.poly_stipple)
shader->info.uses_vmem_load_other = true;
if (shader->info.ps_colors_read) {
uint8_t *color = shader->selector->info.color_attr_index;
if (shader->key.ps.part.prolog.color_two_side) {
/* BCOLORs are stored after the last input. */
key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
}
for (unsigned i = 0; i < 2; i++) {
unsigned interp = info->color_interpolate[i];
unsigned location = info->color_interpolate_loc[i];
if (!(shader->info.ps_colors_read & (0xf << i * 4)))
continue;
key->ps_prolog.color_attr_index[i] = color[i];
if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
interp = INTERP_MODE_FLAT;
switch (interp) {
case INTERP_MODE_FLAT:
key->ps_prolog.color_interp_vgpr_index[i] = -1;
break;
case INTERP_MODE_SMOOTH:
case INTERP_MODE_COLOR:
/* Force the interpolation location for colors here. */
if (shader->key.ps.part.prolog.force_persp_sample_interp)
location = TGSI_INTERPOLATE_LOC_SAMPLE;
if (shader->key.ps.part.prolog.force_persp_center_interp)
location = TGSI_INTERPOLATE_LOC_CENTER;
switch (location) {
case TGSI_INTERPOLATE_LOC_SAMPLE:
key->ps_prolog.color_interp_vgpr_index[i] = 0;
shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
break;
case TGSI_INTERPOLATE_LOC_CENTER:
key->ps_prolog.color_interp_vgpr_index[i] = 2;
shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
break;
case TGSI_INTERPOLATE_LOC_CENTROID:
key->ps_prolog.color_interp_vgpr_index[i] = 4;
shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
break;
default:
assert(0);
}
break;
case INTERP_MODE_NOPERSPECTIVE:
/* Force the interpolation location for colors here. */
if (shader->key.ps.part.prolog.force_linear_sample_interp)
location = TGSI_INTERPOLATE_LOC_SAMPLE;
if (shader->key.ps.part.prolog.force_linear_center_interp)
location = TGSI_INTERPOLATE_LOC_CENTER;
/* The VGPR assignment for non-monolithic shaders
* works because InitialPSInputAddr is set on the
* main shader and PERSP_PULL_MODEL is never used.
*/
switch (location) {
case TGSI_INTERPOLATE_LOC_SAMPLE:
key->ps_prolog.color_interp_vgpr_index[i] = 6;
shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
break;
case TGSI_INTERPOLATE_LOC_CENTER:
key->ps_prolog.color_interp_vgpr_index[i] = 8;
shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
break;
case TGSI_INTERPOLATE_LOC_CENTROID:
key->ps_prolog.color_interp_vgpr_index[i] = 10;
shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
break;
default:
assert(0);
}
break;
default:
assert(0);
}
}
}
}
/**
* Check whether a PS prolog is required based on the key.
*/
static bool si_need_ps_prolog(const union si_shader_part_key *key)
{
return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
key->ps_prolog.states.force_linear_sample_interp ||
key->ps_prolog.states.force_persp_center_interp ||
key->ps_prolog.states.force_linear_center_interp ||
key->ps_prolog.states.bc_optimize_for_persp ||
key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
key->ps_prolog.states.samplemask_log_ps_iter ||
key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
key->ps_prolog.states.force_samplemask_to_helper_invocation;
}
/**
* Compute the PS epilog key, which contains all the information needed to
* build the PS epilog function.
*/
static void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
struct si_shader_info *info = &shader->selector->info;
memset(key, 0, sizeof(*key));
key->ps_epilog.use_aco = info->base.use_aco_amd;
key->ps_epilog.wave32 = shader->wave_size == 32;
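/* Discard is also needed when the prolog does polygon stippling or the epilog performs
* an alpha test, since both can kill fragments.
*/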
key->ps_epilog.uses_discard = shader->info.uses_discard ||
shader->key.ps.part.prolog.poly_stipple ||
shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
key->ps_epilog.colors_written = info->colors_written;
key->ps_epilog.color_types = info->output_color_types;
key->ps_epilog.writes_all_cbufs = info->color0_writes_all_cbufs &&
/* Check whether a non-zero color buffer is bound. */
!!(shader->key.ps.part.epilog.spi_shader_col_format & 0xfffffff0);
key->ps_epilog.writes_z = info->writes_z;
key->ps_epilog.writes_stencil = info->writes_stencil;
key->ps_epilog.writes_samplemask = info->writes_samplemask;
key->ps_epilog.states = shader->key.ps.part.epilog;
}
/**
* Select and compile (or reuse) pixel shader parts (prolog & epilog).
*/
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
union si_shader_part_key prolog_key;
union si_shader_part_key epilog_key;
/* Get the prolog. */
si_get_ps_prolog_key(shader, &prolog_key);
/* The prolog is a no-op if these aren't set. */
if (si_need_ps_prolog(&prolog_key)) {
shader->prolog =
si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
compiler, debug, "Fragment Shader Prolog");
if (!shader->prolog)
return false;
}
/* Get the epilog. */
si_get_ps_epilog_key(shader, &epilog_key);
shader->epilog =
si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
compiler, debug, "Fragment Shader Epilog");
if (!shader->epilog)
return false;
si_set_spi_ps_input_config_for_separate_prolog(shader);
si_fixup_spi_ps_input_config(shader);
/* Make sure the spi_ps_input_addr bits are a superset of spi_ps_input_ena. */
unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
return true;
}
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
/* If tessellation is all offchip and on-chip GS isn't used, this
* workaround is not needed.
*/
return;
/* SPI barrier management bug:
* Make sure we have at least 4k of LDS in use to avoid the bug.
* It applies to workgroup sizes of more than one wavefront.
*/
if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
*lds_size = MAX2(*lds_size, 8);
}
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
{
unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
if (shader->selector->stage == MESA_SHADER_COMPUTE &&
si_get_max_workgroup_size(shader) > shader->wave_size) {
si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
}
}
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug)
{
struct si_shader_selector *sel = shader->selector;
struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
/* LS, ES, VS are compiled on demand if the main part hasn't been
* compiled for that stage.
*
* GS is compiled on demand if its main part hasn't been compiled
* for the chosen NGG-ness.
*
* Vertex shaders are compiled on demand when a vertex fetch
* workaround must be applied.
*/
if (shader->is_monolithic) {
/* Monolithic shader (compiled as a whole, has many variants,
* may take a long time to compile).
*/
if (!si_compile_shader(sscreen, compiler, shader, debug))
return false;
} else {
/* The shader consists of several parts:
*
* - the middle part is the user shader, it has 1 variant only
* and it was compiled during the creation of the shader
* selector
* - the prolog part is inserted at the beginning
* - the epilog part is inserted at the end
*
* The prolog and epilog have many (but simple) variants.
*
* Starting with gfx9, geometry and tessellation control
* shaders also contain the prolog and user shader parts of
* the previous shader stage.
*/
if (!mainp)
return false;
/* Copy the compiled shader data over. */
shader->is_binary_shared = true;
shader->binary = mainp->binary;
shader->config = mainp->config;
shader->info = mainp->info;
/* Select prologs and/or epilogs. */
switch (sel->stage) {
case MESA_SHADER_TESS_CTRL:
if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
return false;
break;
case MESA_SHADER_GEOMETRY:
if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
return false;
/* Clone the GS copy shader for the shader variant.
* We can't just copy the pointer because we change the pm4 state and
* si_shader_selector::gs_copy_shader must be immutable because it's shared
* by multiple contexts.
*/
if (!shader->key.ge.as_ngg) {
assert(mainp->gs_copy_shader);
assert(mainp->gs_copy_shader->bo);
assert(!mainp->gs_copy_shader->previous_stage_sel);
assert(!mainp->gs_copy_shader->scratch_va);
shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
sizeof(*shader->gs_copy_shader));
/* Increase the reference count. */
pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
/* Initialize some fields differently. */
shader->gs_copy_shader->shader_log = NULL;
shader->gs_copy_shader->is_binary_shared = true;
util_queue_fence_init(&shader->gs_copy_shader->ready);
}
break;
case MESA_SHADER_FRAGMENT:
if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
return false;
/* Make sure we have at least as many VGPRs as there
* are allocated inputs.
*/
shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
shader->info.writes_z &= !shader->key.ps.part.epilog.kill_z;
shader->info.writes_stencil &= !shader->key.ps.part.epilog.kill_stencil;
shader->info.writes_sample_mask &= !shader->key.ps.part.epilog.kill_samplemask;
shader->info.uses_discard |= shader->key.ps.part.prolog.poly_stipple ||
shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
break;
default:;
}
assert(shader->wave_size == mainp->wave_size);
assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
/* Update SGPR and VGPR counts. */
if (shader->prolog) {
shader->config.num_sgprs =
MAX2(shader->config.num_sgprs, shader->prolog->num_sgprs);
shader->config.num_vgprs =
MAX2(shader->config.num_vgprs, shader->prolog->num_vgprs);
}
if (shader->previous_stage) {
shader->config.num_sgprs =
MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
shader->config.num_vgprs =
MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
shader->config.spilled_sgprs =
MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
shader->config.spilled_vgprs =
MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
shader->info.private_mem_vgprs =
MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
shader->config.scratch_bytes_per_wave =
MAX2(shader->config.scratch_bytes_per_wave,
shader->previous_stage->config.scratch_bytes_per_wave);
shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
shader->info.uses_instance_id |= shader->previous_stage->info.uses_instance_id;
shader->info.uses_base_instance |= shader->previous_stage->info.uses_base_instance;
shader->info.uses_draw_id |= shader->previous_stage->info.uses_draw_id;
shader->info.uses_vs_state_indexed |= shader->previous_stage->info.uses_vs_state_indexed;
shader->info.uses_gs_state_provoking_vtx_first |= shader->previous_stage->info.uses_gs_state_provoking_vtx_first;
shader->info.uses_gs_state_outprim |= shader->previous_stage->info.uses_gs_state_outprim;
}
if (shader->epilog) {
shader->config.num_sgprs =
MAX2(shader->config.num_sgprs, shader->epilog->num_sgprs);
shader->config.num_vgprs =
MAX2(shader->config.num_vgprs, shader->epilog->num_vgprs);
}
si_calculate_max_simd_waves(shader);
}
if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
if (!gfx10_ngg_calculate_subgroup_info(shader)) {
fprintf(stderr, "Failed to compute subgroup info\n");
return false;
}
} else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
}
si_fix_resource_usage(sscreen, shader);
/* Upload. */
bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);
si_shader_dump(sscreen, shader, debug, stderr, true);
if (!ok)
fprintf(stderr, "LLVM failed to upload shader\n");
return ok;
}
void si_shader_binary_clean(struct si_shader_binary *binary)
{
free((void *)binary->code_buffer);
binary->code_buffer = NULL;
free(binary->llvm_ir_string);
binary->llvm_ir_string = NULL;
free((void *)binary->symbols);
binary->symbols = NULL;
free(binary->uploaded_code);
binary->uploaded_code = NULL;
binary->uploaded_code_size = 0;
}
void si_shader_destroy(struct si_shader *shader)
{
si_resource_reference(&shader->bo, NULL);
if (!shader->is_binary_shared)
si_shader_binary_clean(&shader->binary);
free(shader->shader_log);
}