/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "ac_exp_param.h"
#include "ac_rtld.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_serialize.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_strings.h"
#include "util/u_memory.h"

static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

static void si_dump_shader_key(const struct si_shader *shader, FILE *f);

/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
   if (shader->selector->screen->info.chip_class <= GFX8)
      return false;

   return shader->key.as_ls || shader->key.as_es ||
          shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
          shader->selector->info.stage == MESA_SHADER_GEOMETRY;
}

/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
   return shader->key.as_ngg || si_is_multi_part_shader(shader);
}

/**
 * Returns a unique index for a per-patch semantic name and index. The index
 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 * can be calculated.
 */
unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
{
   switch (semantic) {
   case VARYING_SLOT_TESS_LEVEL_OUTER:
      return 0;
   case VARYING_SLOT_TESS_LEVEL_INNER:
      return 1;
   default:
      if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
         return 2 + (semantic - VARYING_SLOT_PATCH0);

      assert(!"invalid semantic");
      return 0;
   }
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
{
   switch (semantic) {
   case VARYING_SLOT_POS:
      return 0;
   default:
      /* Since some shader stages use the the highest used IO index
       * to determine the size to allocate for inputs/outputs
       * (in LDS, tess and GS rings). GENERIC should be placed right
       * after POSITION to make that size as small as possible.
       */
      if (semantic >= VARYING_SLOT_VAR0 &&
          semantic < VARYING_SLOT_VAR0 + SI_MAX_IO_GENERIC)
         return 1 + (semantic - VARYING_SLOT_VAR0);

      assert(!"invalid generic index");
      return 0;
   case VARYING_SLOT_FOGC:
      return SI_MAX_IO_GENERIC + 1;
   case VARYING_SLOT_COL0:
      return SI_MAX_IO_GENERIC + 2;
   case VARYING_SLOT_COL1:
      return SI_MAX_IO_GENERIC + 3;
   case VARYING_SLOT_BFC0:
      /* If it's a varying, COLOR and BCOLOR alias. */
      if (is_varying)
         return SI_MAX_IO_GENERIC + 2;
      else
         return SI_MAX_IO_GENERIC + 4;
   case VARYING_SLOT_BFC1:
      if (is_varying)
         return SI_MAX_IO_GENERIC + 3;
      else
         return SI_MAX_IO_GENERIC + 5;
   case VARYING_SLOT_TEX0:
   case VARYING_SLOT_TEX1:
   case VARYING_SLOT_TEX2:
   case VARYING_SLOT_TEX3:
   case VARYING_SLOT_TEX4:
   case VARYING_SLOT_TEX5:
   case VARYING_SLOT_TEX6:
   case VARYING_SLOT_TEX7:
      return SI_MAX_IO_GENERIC + 6 + (semantic - VARYING_SLOT_TEX0);

   /* These are rarely used between LS and HS or ES and GS. */
   case VARYING_SLOT_CLIP_DIST0:
      return SI_MAX_IO_GENERIC + 6 + 8;
   case VARYING_SLOT_CLIP_DIST1:
      return SI_MAX_IO_GENERIC + 6 + 8 + 1;
   case VARYING_SLOT_CLIP_VERTEX:
      return SI_MAX_IO_GENERIC + 6 + 8 + 2;
   case VARYING_SLOT_PSIZ:
      return SI_MAX_IO_GENERIC + 6 + 8 + 3;

   /* These can't be written by LS, HS, and ES. */
   case VARYING_SLOT_LAYER:
      return SI_MAX_IO_GENERIC + 6 + 8 + 4;
   case VARYING_SLOT_VIEWPORT:
      return SI_MAX_IO_GENERIC + 6 + 8 + 5;
   case VARYING_SLOT_PRIMITIVE_ID:
      STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
      return SI_MAX_IO_GENERIC + 6 + 8 + 6;
   }
}

static void si_dump_streamout(struct pipe_stream_output_info *so)
{
   unsigned i;

   if (so->num_outputs)
      fprintf(stderr, "STREAMOUT\n");

   for (i = 0; i < so->num_outputs; i++) {
      unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
      fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
              so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
              so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
              mask & 4 ? "z" : "", mask & 8 ? "w" : "");
   }
}

static void declare_streamout_params(struct si_shader_context *ctx,
                                     struct pipe_stream_output_info *so)
{
   if (ctx->screen->use_ngg_streamout) {
      if (ctx->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (so->num_outputs) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
   } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }

   /* A streamout buffer offset is loaded if the stride is non-zero. */
   for (int i = 0; i < 4; i++) {
      if (!so->stride[i])
         continue;

      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
   }
}

unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
   switch (shader->selector->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      return shader->key.as_ngg ? 128 : 0;

   case MESA_SHADER_TESS_CTRL:
      /* Return this so that LLVM doesn't remove s_barrier
       * instructions on chips where we use s_barrier. */
      return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;

   case MESA_SHADER_GEOMETRY:
      return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;

   case MESA_SHADER_COMPUTE:
      break; /* see below */

   default:
      return 0;
   }

   /* Compile a variable block size using the maximum variable size. */
   if (shader->selector->info.base.cs.local_size_variable)
      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;

   uint16_t *local_size = shader->selector->info.base.cs.local_size;
   unsigned max_work_group_size = (uint32_t)local_size[0] *
                                  (uint32_t)local_size[1] *
                                  (uint32_t)local_size[2];
   assert(max_work_group_size);
   return max_work_group_size;
}

static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
{
   enum ac_arg_type const_shader_buf_type;

   if (ctx->shader->selector->info.base.num_ubos == 1 &&
       ctx->shader->selector->info.base.num_ssbos == 0)
      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;

   ac_add_arg(
      &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
      assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
}

static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
}

static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
{
   declare_const_and_shader_buffers(ctx, assign_params);
   declare_samplers_and_images(ctx, assign_params);
}

static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &ctx->bindless_samplers_and_images);
}

static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
   if (!ctx->shader->is_gs_copy_shader) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
   }
}

static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);

   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = ctx->args.num_sgprs_used;

      if (si_is_merged_shader(ctx->shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
   }
}

static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
{
   struct si_shader *shader = ctx->shader;

   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
   if (shader->key.as_ls) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
      if (ctx->screen->info.chip_class >= GFX10) {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
      } else {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (ctx->screen->info.chip_class >= GFX10) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
                 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
   } else {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }

   if (!shader->is_gs_copy_shader) {
      /* Vertex load indices. */
      if (shader->selector->info.num_inputs) {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
         for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      }
      *num_prolog_vgprs += shader->selector->info.num_inputs;
   }
}

static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                 /* i16 x1, y1 */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);               /* depth */

   if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
   } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
   }
}

static void declare_tes_input_vgprs(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
}

enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};

void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}

void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;
   LLVMTypeRef returns[AC_MAX_ARGS];
   unsigned i, num_return_sgprs;
   unsigned num_returns = 0;
   unsigned num_prolog_vgprs = 0;
   unsigned stage = ctx->stage;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* Set MERGED shaders. */
   if (ctx->screen->info.chip_class >= GFX9) {
      if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)
         stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
      else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)
         stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
   }

   switch (stage) {
   case MESA_SHADER_VERTEX:
      declare_global_desc_pointers(ctx);

      if (shader->selector->info.base.vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);

         /* VGPRs */
         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
         break;
      }

      declare_per_stage_desc_pointers(ctx, true);
      declare_vs_specific_input_sgprs(ctx);
      if (!shader->is_gs_copy_shader)
         declare_vb_descriptor_input_sgprs(ctx);

      if (shader->key.as_es) {
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
      } else if (shader->key.as_ls) {
         /* no extra parameters */
      } else {
         /* The locations of the other parameters are assigned dynamically. */
         declare_streamout_params(ctx, &shader->selector->so);
      }

      /* VGPRs */
      declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

      /* Return values */
      if (shader->key.opt.vs_as_prim_discard_cs) {
         for (i = 0; i < 4; i++)
            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
      }
      break;

   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, true);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);

      /* VGPRs */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

      /* param_tcs_offchip_offset and param_tcs_factor_offset are
       * placed after the user SGPRs.
       */
      for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
         returns[num_returns++] = ctx->ac.i32; /* SGPRs */
      for (i = 0; i < 11; i++)
         returns[num_returns++] = ctx->ac.f32; /* VGPRs */
      break;

   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
      declare_vs_specific_input_sgprs(ctx);

      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
      declare_vb_descriptor_input_sgprs(ctx);

      /* VGPRs (first TCS, then VS) */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

      if (ctx->stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

         /* LS return values are inputs to the TCS main shader part. */
         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
         for (i = 0; i < 2; i++)
            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
      } else {
         /* TCS return values are inputs to the TCS epilog.
          *
          * param_tcs_offchip_offset, param_tcs_factor_offset,
          * param_tcs_offchip_layout, and param_rw_buffers
          * should be passed to the epilog.
          */
         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
         for (i = 0; i < 11; i++)
            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
      }
      break;

   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);

      if (ctx->shader->key.as_ngg)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
      else
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);

      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
                 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
                 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

      declare_global_desc_pointers(ctx);
      if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
         declare_per_stage_desc_pointers(
            ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
      }

      if (ctx->stage == MESA_SHADER_VERTEX) {
         if (shader->selector->info.base.vs.blit_sgprs_amd)
            declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
         else
            declare_vs_specific_input_sgprs(ctx);
      } else {
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
         /* Declare as many input SGPRs as the VS has. */
      }

      if (ctx->stage == MESA_SHADER_VERTEX)
         declare_vb_descriptor_input_sgprs(ctx);

      /* VGPRs (first GS, then VS/TES) */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);

      if (ctx->stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         declare_tes_input_vgprs(ctx);
      }

      if ((ctx->shader->key.as_es || ngg_cull_shader) &&
          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
         unsigned num_user_sgprs, num_vgprs;

         if (ctx->stage == MESA_SHADER_VERTEX) {
            /* For the NGG cull shader, add 1 SGPR to hold
             * the vertex buffer pointer.
             */
            num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;

            if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
               assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
               num_user_sgprs =
                  SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
            }
         } else {
            num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
         }

         /* The NGG cull shader has to return all 9 VGPRs.
          *
          * The normal merged ESGS shader only has to return the 5 VGPRs
          * for the GS stage.
          */
         num_vgprs = ngg_cull_shader ? 9 : 5;

         /* ES return values are inputs to GS. */
         for (i = 0; i < 8 + num_user_sgprs; i++)
            returns[num_returns++] = ctx->ac.i32; /* SGPRs */
         for (i = 0; i < num_vgprs; i++)
            returns[num_returns++] = ctx->ac.f32; /* VGPRs */
      }
      break;

   case MESA_SHADER_TESS_EVAL:
      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, true);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);

      if (shader->key.as_es) {
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
      } else {
         declare_streamout_params(ctx, &shader->selector->so);
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
      }

      /* VGPRs */
      declare_tes_input_vgprs(ctx);
      break;

   case MESA_SHADER_GEOMETRY:
      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, true);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);

      /* VGPRs */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
      break;

   case MESA_SHADER_FRAGMENT:
      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, true);
      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
                         SI_PARAM_PRIM_MASK);

      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
                         SI_PARAM_PERSP_SAMPLE);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
                         SI_PARAM_PERSP_CENTER);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
                         SI_PARAM_PERSP_CENTROID);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
                         SI_PARAM_LINEAR_SAMPLE);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
                         SI_PARAM_LINEAR_CENTER);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
                         SI_PARAM_LINEAR_CENTROID);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
                         SI_PARAM_POS_X_FLOAT);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
                         SI_PARAM_POS_Y_FLOAT);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
                         SI_PARAM_POS_Z_FLOAT);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
                         SI_PARAM_POS_W_FLOAT);
      shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
                         SI_PARAM_FRONT_FACE);
      shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
                         SI_PARAM_ANCILLARY);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
                         SI_PARAM_SAMPLE_COVERAGE);
      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
                         SI_PARAM_POS_FIXED_PT);

      /* Color inputs from the prolog. */
      if (shader->selector->info.colors_read) {
         unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);

         for (i = 0; i < num_color_elements; i++)
            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

         num_prolog_vgprs += num_color_elements;
      }

      /* Outputs for the epilog. */
      num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
      num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
                    shader->selector->info.writes_z + shader->selector->info.writes_stencil +
                    shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;

      num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

      for (i = 0; i < num_return_sgprs; i++)
         returns[i] = ctx->ac.i32;
      for (; i < num_returns; i++)
         returns[i] = ctx->ac.f32;
      break;

   case MESA_SHADER_COMPUTE:
      declare_global_desc_pointers(ctx);
      declare_per_stage_desc_pointers(ctx, true);
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);

      unsigned cs_user_data_dwords =
         shader->selector->info.base.cs.user_data_components_amd;
      if (cs_user_data_dwords) {
         ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
      }

      /* Some descriptors can be in user SGPRs. */
      /* Shader buffers in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
         while (ctx->args.num_sgprs_used % 4 != 0)
            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
      }
      /* Images in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
         unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;

         while (ctx->args.num_sgprs_used % num_sgprs != 0)
            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
      }

      /* Hardware SGPRs. */
      for (i = 0; i < 3; i++) {
         if (shader->selector->info.uses_block_id[i]) {
            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
         }
      }
      if (shader->selector->info.uses_subgroup_info)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);

      /* Hardware VGPRs. */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
      break;
   default:
      assert(0 && "unimplemented shader");
      return;
   }

   si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
                       si_get_max_workgroup_size(shader));

   /* Reserve register locations for VGPR inputs the PS prolog may need. */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
      ac_llvm_add_target_dep_function_attr(
         ctx->main_fn, "InitialPSInputAddr",
         S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
            S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
            S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
            S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
   }

   shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
   shader->info.num_input_vgprs = ctx->args.num_vgprs_used;

   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
   shader->info.num_input_vgprs -= num_prolog_vgprs;

   if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* The LSHS size is not known until draw time, so we append it
          * at the end of whatever LDS use there may be in the rest of
          * the shader (currently none, unless LLVM decides to do its
          * own LDS-based lowering).
          */
         ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                                   "__lds_end", AC_ADDR_SPACE_LDS);
         LLVMSetAlignment(ctx->ac.lds, 256);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
      }
   }

   /* Unlike radv, we override these arguments in the prolog, so to the
    * API shader they appear as normal arguments.
    */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
      ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
      ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
   }
}

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS        5

static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

#define add_part(shader_or_part)                                                                   \
   if (shader_or_part) {                                                                           \
      part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
      part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
      num_parts++;                                                                                 \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader->prolog2);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
      /* We add this symbol even on LLVM <= 8 to ensure that
       * shader->config.lds_size is set correctly below.
       */
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                          },
                                       .shader_type = sel->info.stage,
                                       .wave_size = si_get_shader_wave_size(shader),
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
      shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
   }

   return ok;
}

static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
   struct ac_rtld_binary rtld;
   si_shader_binary_open(screen, shader, &rtld);
   return rtld.exec_size;
}

static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
{
   uint64_t *scratch_va = data;

   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
      *value = (uint32_t)*scratch_va;
      return true;
   }
   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
      /* Enable scratch coalescing. */
      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
      return true;
   }

   return false;
}

bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return false;

   si_resource_reference(&shader->bo, NULL);
   shader->bo = si_aligned_buffer_create(
      &sscreen->b,
      (sscreen->info.cpdma_prefetch_writes_memory ?
         0 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
      PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
   if (!shader->bo)
      return false;

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->bo->gpu_address;
   u.rx_ptr = sscreen->ws->buffer_map(
      shader->bo->buf, NULL,
      PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
   if (!u.rx_ptr)
      return false;

   bool ok = ac_rtld_upload(&u);

   sscreen->ws->buffer_unmap(shader->bo->buf);
   ac_rtld_close(&binary);

   return ok;
}

static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct pipe_debug_callback *debug, const char *name,
                                       FILE *file)
{
   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->elf_buffer,
                                      .elf_sizes = &binary->elf_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   if (nbytes > INT_MAX)
      goto out;

   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of resulting logs.
       */
      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      fprintf(file, "%*s", (int)nbytes, disasm);
   }

out:
   ac_rtld_close(&rtld_binary);
}

static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned num_inputs = shader->selector->info.num_inputs;
   unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_wave64_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->info.stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE:
      if (shader->selector) {
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
   }

   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}

void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct pipe_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
                                 si_get_shader_wave_size(shader), debug, "main", NULL);

   pipe_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
}

static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
                                 bool check_debug_option)
{
   const struct ac_shader_config *conf = &shader->config;

   if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
      if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
         fprintf(file,
                 "*** SHADER CONFIG ***\n"
                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
      }

      fprintf(file,
              "*** SHADER STATS ***\n"
              "SGPRS: %d\n"
              "VGPRS: %d\n"
              "Spilled SGPRs: %d\n"
              "Spilled VGPRs: %d\n"
              "Private memory VGPRs: %d\n"
              "Code Size: %d bytes\n"
              "LDS: %d blocks\n"
              "Scratch: %d bytes per wave\n"
              "Max Waves: %d\n"
              "********************\n\n\n",
              conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
              shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
              conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
   }
}

const char *si_get_shader_name(const struct si_shader *shader)
{
   switch (shader->selector->info.stage) {
   case MESA_SHADER_VERTEX:
      if (shader->key.as_es)
         return "Vertex Shader as ES";
      else if (shader->key.as_ls)
         return "Vertex Shader as LS";
      else if (shader->key.opt.vs_as_prim_discard_cs)
         return "Vertex Shader as Primitive Discard CS";
      else if (shader->key.as_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (shader->key.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (shader->key.as_ngg)
         return "Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         return "GS Copy Shader as VS";
      else
         return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown Shader";
   }
}

void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
{
   gl_shader_stage stage = shader->selector->info.stage;

   if (!check_debug_option || si_can_dump_shader(sscreen, stage))
      si_dump_shader_key(shader, file);

   if (!check_debug_option && shader->binary.llvm_ir_string) {
      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
      }

      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
   }

   if (!check_debug_option ||
       (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
      unsigned wave_size = si_get_shader_wave_size(shader);

      fprintf(file, "\n%s:\n", si_get_shader_name(shader));

      if (shader->prolog)
         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
                                    "prolog", file);
      if (shader->previous_stage)
         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
                                    wave_size, debug, "previous stage", file);
      if (shader->prolog2)
         si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
                                    debug, "prolog2", file);

      si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
                                 file);

      if (shader->epilog)
         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
                                    "epilog", file);
      fprintf(file, "\n");
   }

   si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}

static void si_dump_shader_key_vs(const struct si_shader_key *key,
                                  const struct si_vs_prolog_bits *prolog, const char *prefix,
                                  FILE *f)
{
   fprintf(f, "  %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
   fprintf(f, "  %s.instance_divisor_is_fetched = %u\n", prefix,
           prolog->instance_divisor_is_fetched);
   fprintf(f, "  %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
           prolog->unpack_instance_id_from_vertex_id);
   fprintf(f, "  %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);

   fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
   fprintf(f, "  mono.vs.fix_fetch = {");
   for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
      union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
      if (i)
         fprintf(f, ", ");
      if (!fix.bits)
         fprintf(f, "0");
      else
         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
                 fix.u.format);
   }
   fprintf(f, "}\n");
}

static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const struct si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->info.stage;

   fprintf(f, "SHADER KEY\n");

   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
      fprintf(f, "  as_es = %u\n", key->as_es);
      fprintf(f, "  as_ls = %u\n", key->as_ls);
      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
      fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
      fprintf(f, "  opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
      fprintf(f, "  opt.cs_indexed = %u\n", key->opt.cs_indexed);
      fprintf(f, "  opt.cs_instancing = %u\n", key->opt.cs_instancing);
      fprintf(f, "  opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
      fprintf(f, "  opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
      fprintf(f, "  opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
      fprintf(f, "  opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
      fprintf(f, "  opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
      fprintf(f, "  opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
      fprintf(f, "  opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
      break;

   case MESA_SHADER_TESS_CTRL:
      if (shader->selector->screen->info.chip_class >= GFX9) {
         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
      }
      fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
      fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
              key->mono.u.ff_tcs_inputs_to_copy);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->as_es);
      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      if (shader->selector->screen->info.chip_class >= GFX9 &&
          key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
      }
      fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n",
              key->part.gs.prolog.tri_strip_adj_fix);
      fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
      fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
      fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
      fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n",
              key->part.ps.prolog.force_persp_sample_interp);
      fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n",
              key->part.ps.prolog.force_linear_sample_interp);
      fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n",
              key->part.ps.prolog.force_persp_center_interp);
      fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n",
              key->part.ps.prolog.force_linear_center_interp);
      fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n",
              key->part.ps.prolog.bc_optimize_for_persp);
      fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n",
              key->part.ps.prolog.bc_optimize_for_linear);
      fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n",
              key->part.ps.prolog.samplemask_log_ps_iter);
      fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n",
              key->part.ps.epilog.spi_shader_col_format);
      fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
      fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
      fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
      fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
      fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
      fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n",
              key->part.ps.epilog.poly_line_smoothing);
      fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
      fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n",
              key->mono.u.ps.interpolate_at_sample_force_center);
      fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
      fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
      fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->as_es && !key->as_ls) {
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances);
      if (stage != MESA_SHADER_GEOMETRY)
         fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
   }
}

static void si_optimize_vs_outputs(struct si_shader_context *ctx)
{
   struct si_shader *shader = ctx->shader;
   struct si_shader_info *info = &shader->selector->info;
   unsigned skip_vs_optim_mask = 0;

   if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
       shader->key.as_ls || shader->key.as_es)
      return;

   /* Optimizing these outputs is not possible, since they might be overriden
    * at runtime with S_028644_PT_SPRITE_TEX. */
   for (int i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
          (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
           info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
         skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
      }
   }

   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
                          info->num_outputs, skip_vs_optim_mask,
                          &shader->info.nr_param_exports);
}

static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
                               const struct si_vs_prolog_bits *prolog_key,
                               const struct si_shader_key *key, bool ngg_cull_shader)
{
   /* VGPR initialization fixup for Vega10 and Raven is always done in the
    * VS prolog. */
   return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
          prolog_key->unpack_instance_id_from_vertex_id ||
          (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
}

static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
                                   struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
{
   struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;

   ctx->shader = shader;
   ctx->stage = sel->info.stage;

   ctx->num_const_buffers = info->base.num_ubos;
   ctx->num_shader_buffers = info->base.num_ssbos;

   ctx->num_samplers = util_last_bit(info->base.textures_used);
   ctx->num_images = info->base.num_images;

   si_llvm_init_resource_callbacks(ctx);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_TESS_CTRL:
      si_llvm_init_tcs_callbacks(ctx);
      break;
   case MESA_SHADER_TESS_EVAL:
      si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
      break;
   case MESA_SHADER_GEOMETRY:
      si_llvm_init_gs_callbacks(ctx);
      break;
   case MESA_SHADER_FRAGMENT:
      si_llvm_init_ps_callbacks(ctx);
      break;
   case MESA_SHADER_COMPUTE:
      ctx->abi.load_local_group_size = si_llvm_get_block_size;
      break;
   default:
      assert(!"Unsupported shader type");
      return false;
   }

   si_create_function(ctx, ngg_cull_shader);

   if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_esgs_ring(ctx);

   if (ctx->stage == MESA_SHADER_GEOMETRY)
      si_preload_gs_rings(ctx);
   else if (ctx->stage == MESA_SHADER_TESS_EVAL)
      si_llvm_preload_tes_rings(ctx);

   if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
      for (unsigned i = 0; i < 6; i++) {
         ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      }
   }

   if (ctx->stage == MESA_SHADER_GEOMETRY) {
      for (unsigned i = 0; i < 4; i++) {
         ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
      }
      if (shader->key.as_ngg) {
         for (unsigned i = 0; i < 4; ++i) {
            ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
            ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
         }

         assert(!ctx->gs_ngg_scratch);
         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
      }
   }

   if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
      /* Unconditionally declare scratch space base for streamout and
       * vertex compaction. Whether space is actually allocated is
       * determined during linking / PM4 creation.
       *
       * Add an extra dword per vertex to ensure an odd stride, which
       * avoids bank conflicts for SoA accesses.
       */
      if (!gfx10_is_ngg_passthrough(shader))
         si_llvm_declare_esgs_ring(ctx);

      /* This is really only needed when streamout and / or vertex
       * compaction is enabled.
       */
      if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
         LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch =
            LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
      }
   }

   /* For GFX9 merged shaders:
    * - Set EXEC for the first shader. If the prolog is present, set
    *   EXEC there instead.
    * - Add a barrier before the second shader.
    * - In the second shader, reset EXEC to ~0 and wrap the main part in
    *   an if-statement. This is required for correctness in geometry
    *   shaders, to ensure that empty GS waves do not send GS_EMIT and
    *   GS_CUT messages.
    *
    * For monolithic merged shaders, the first shader is wrapped in an
    * if-block together with its prolog in si_build_wrapper_function.
    *
    * NGG vertex and tess eval shaders running as the last
    * vertex/geometry stage handle execution explicitly using
    * if-statements.
    */
   if (ctx->screen->info.chip_class >= GFX9) {
      if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
          (ctx->stage == MESA_SHADER_TESS_EVAL ||
           (ctx->stage == MESA_SHADER_VERTEX &&
            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
         si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
      } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
                 (shader->key.as_ngg && !shader->key.as_es)) {
         LLVMValueRef thread_enabled;
         bool nested_barrier;

         if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
                                        !shader->key.as_es && !shader->key.opt.ngg_culling))
            ac_init_exec_full_mask(&ctx->ac);

         if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
             shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
            gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);

            /* Build the primitive export at the beginning
             * of the shader if possible.
             */
            if (gfx10_ngg_export_prim_early(shader))
               gfx10_ngg_build_export_prim(ctx, NULL, NULL);
         }

         if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY) {
            if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) {
               gfx10_ngg_gs_emit_prologue(ctx);
               nested_barrier = false;
            } else {
               nested_barrier = true;
            }

            thread_enabled = si_is_gs_thread(ctx);
         } else {
            thread_enabled = si_is_es_thread(ctx);
            nested_barrier = false;
         }

         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
         ctx->merged_wrap_if_label = 11500;
         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);

         if (nested_barrier) {
            /* Execute a barrier before the second shader in
             * a merged shader.
             *
             * Execute the barrier inside the conditional block,
             * so that empty waves can jump directly to s_endpgm,
             * which will also signal the barrier.
             *
             * This is possible in gfx9, because an empty wave
             * for the second shader does not participate in
             * the epilogue. With NGG, empty waves may still
             * be required to export data (e.g. GS output vertices),
             * so we cannot let them exit early.
             *
             * If the shader is TCS and the TCS epilog is present
             * and contains a barrier, it will wait there and then
             * reach s_endpgm.
             */
            si_llvm_emit_barrier(ctx);
         }
      }
   }

   bool success = si_nir_build_llvm(ctx, nir);
   if (free_nir)
      ralloc_free(nir);
   if (!success) {
      fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
      return false;
   }

   si_llvm_build_ret(ctx, ctx->return_value);
   return true;
}

/**
 * Compute the VS prolog key, which contains all the information needed to
 * build the VS prolog function, and set shader->info bits where needed.
 *
 * \param info             Shader info of the vertex shader.
 * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
 * \param has_old_  Whether the preceding shader part is the NGG cull shader.
 * \param prolog_key       Key of the VS prolog
 * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
 * \param key              Output shader part key.
 */
static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
                                 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
                                 struct si_shader *shader_out, union si_shader_part_key *key)
{
   memset(key, 0, sizeof(*key));
   key->vs_prolog.states = *prolog_key;
   key->vs_prolog.num_input_sgprs = num_input_sgprs;
   key->vs_prolog.num_inputs = info->num_inputs;
   key->vs_prolog.as_ls = shader_out->key.as_ls;
   key->vs_prolog.as_es = shader_out->key.as_es;
   key->vs_prolog.as_ngg = shader_out->key.as_ngg;
   key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;

   if (ngg_cull_shader) {
      key->vs_prolog.gs_fast_launch_tri_list =
         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
      key->vs_prolog.gs_fast_launch_tri_strip =
         !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
   }

   if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
      key->vs_prolog.as_ls = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 2;
   } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
      key->vs_prolog.as_es = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   } else if (shader_out->key.as_ngg) {
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   }

   /* Only one of these combinations can be set. as_ngg can be set with as_es. */
   assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
             (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
          1);

   /* Enable loading the InstanceID VGPR. */
   uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);

   if ((key->vs_prolog.states.instance_divisor_is_one |
        key->vs_prolog.states.instance_divisor_is_fetched) &
       input_mask)
      shader_out->info.uses_instanceid = true;
}

static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
                                    struct si_shader_selector *sel)
{
   if (!compiler->low_opt_passes)
      return false;

   /* Assume a slow CPU. */
   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);

   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
    * buffer stores. */
   return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
}

static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
{
   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      struct pipe_screen *screen = &sel->screen->b;
      const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
                                                         pipe_shader_type_from_mesa(sel->info.stage));

      struct blob_reader blob_reader;
      blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
      *free_nir = true;
      nir = nir_deserialize(NULL, options, &blob_reader);
   } else {
      return NULL;
   }

   return nir;
}

static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                   struct si_shader *shader, struct pipe_debug_callback *debug,
                                   struct nir_shader *nir, bool free_nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_context ctx;

   si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));

   LLVMValueRef ngg_cull_main_fn = NULL;
   if (shader->key.opt.ngg_culling) {
      if (!si_build_main_function(&ctx, shader, nir, false, true)) {
         si_llvm_dispose(&ctx);
         return false;
      }
      ngg_cull_main_fn = ctx.main_fn;
      ctx.main_fn = NULL;
   }

   if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
      si_llvm_dispose(&ctx);
      return false;
   }

   if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
      LLVMValueRef parts[4];
      unsigned num_parts = 0;
      bool has_prolog = false;
      LLVMValueRef main_fn = ctx.main_fn;

      if (ngg_cull_main_fn) {
         if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
            union si_shader_part_key prolog_key;
            si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
                                 &shader->key.part.vs.prolog, shader, &prolog_key);
            prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &prolog_key);
            parts[num_parts++] = ctx.main_fn;
            has_prolog = true;
         }
         parts[num_parts++] = ngg_cull_main_fn;
      }

      if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
         union si_shader_part_key prolog_key;
         si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
                              &shader->key.part.vs.prolog, shader, &prolog_key);
         prolog_key.vs_prolog.is_monolithic = true;
         si_llvm_build_vs_prolog(&ctx, &prolog_key);
         parts[num_parts++] = ctx.main_fn;
         has_prolog = true;
      }
      parts[num_parts++] = main_fn;

      si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);

      if (ctx.shader->key.opt.vs_as_prim_discard_cs)
         si_build_prim_discard_compute_shader(&ctx);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
      LLVMValueRef parts[2];

      parts[0] = ngg_cull_main_fn;
      parts[1] = ctx.main_fn;

      si_build_wrapper_function(&ctx, parts, 2, 0, 0);
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
      if (sscreen->info.chip_class >= GFX9) {
         struct si_shader_selector *ls = shader->key.part.tcs.ls;
         LLVMValueRef parts[4];
         bool vs_needs_prolog =
            si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);

         /* TCS main part */
         parts[2] = ctx.main_fn;

         /* TCS epilog */
         union si_shader_part_key tcs_epilog_key;
         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
         parts[3] = ctx.main_fn;

         /* VS as LS main part */
         nir = get_nir_shader(ls, &free_nir);
         struct si_shader shader_ls = {};
         shader_ls.selector = ls;
         shader_ls.key.as_ls = 1;
         shader_ls.key.mono = shader->key.mono;
         shader_ls.key.opt = shader->key.opt;
         shader_ls.is_monolithic = true;

         if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= ls->info.uses_instanceid;
         parts[1] = ctx.main_fn;

         /* LS prolog */
         if (vs_needs_prolog) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
                                 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            parts[0] = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_TESS_CTRL;

         si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1);
      } else {
         LLVMValueRef parts[2];
         union si_shader_part_key epilog_key;

         parts[0] = ctx.main_fn;

         memset(&epilog_key, 0, sizeof(epilog_key));
         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
         si_llvm_build_tcs_epilog(&ctx, &epilog_key);
         parts[1] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
      if (ctx.screen->info.chip_class >= GFX9) {
         struct si_shader_selector *es = shader->key.part.gs.es;
         LLVMValueRef es_prolog = NULL;
         LLVMValueRef es_main = NULL;
         LLVMValueRef gs_prolog = NULL;
         LLVMValueRef gs_main = ctx.main_fn;

         /* GS prolog */
         union si_shader_part_key gs_prolog_key;
         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         gs_prolog_key.gs_prolog.is_monolithic = true;
         gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
         si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
         gs_prolog = ctx.main_fn;

         /* ES main part */
         nir = get_nir_shader(es, &free_nir);
         struct si_shader shader_es = {};
         shader_es.selector = es;
         shader_es.key.as_es = 1;
         shader_es.key.as_ngg = shader->key.as_ngg;
         shader_es.key.mono = shader->key.mono;
         shader_es.key.opt = shader->key.opt;
         shader_es.is_monolithic = true;

         if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
            si_llvm_dispose(&ctx);
            return false;
         }
         shader->info.uses_instanceid |= es->info.uses_instanceid;
         es_main = ctx.main_fn;

         /* ES prolog */
         if (es->info.stage == MESA_SHADER_VERTEX &&
             si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
            union si_shader_part_key vs_prolog_key;
            si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
                                 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
            vs_prolog_key.vs_prolog.is_monolithic = true;
            si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
            es_prolog = ctx.main_fn;
         }

         /* Reset the shader context. */
         ctx.shader = shader;
         ctx.stage = MESA_SHADER_GEOMETRY;

         /* Prepare the array of shader parts. */
         LLVMValueRef parts[4];
         unsigned num_parts = 0, main_part, next_first_part;

         if (es_prolog)
            parts[num_parts++] = es_prolog;

         parts[main_part = num_parts++] = es_main;
         parts[next_first_part = num_parts++] = gs_prolog;
         parts[num_parts++] = gs_main;

         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
      } else {
         LLVMValueRef parts[2];
         union si_shader_part_key prolog_key;

         parts[1] = ctx.main_fn;

         memset(&prolog_key, 0, sizeof(prolog_key));
         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
         si_llvm_build_gs_prolog(&ctx, &prolog_key);
         parts[0] = ctx.main_fn;

         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
      }
   } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
      si_llvm_build_monolithic_ps(&ctx, shader);
   }

   si_llvm_optimize_module(&ctx);

   /* Post-optimization transformations and analysis. */
   si_optimize_vs_outputs(&ctx);

   if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
      ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
   }

   /* Make sure the input is a pointer and not integer followed by inttoptr. */
   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);

   /* Compile to bytecode. */
   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
                        ctx.stage, si_get_shader_name(shader),
                        si_should_optimize_less(compiler, shader->selector))) {
      si_llvm_dispose(&ctx);
      fprintf(stderr, "LLVM failed to compile shader\n");
      return false;
   }

   si_llvm_dispose(&ctx);
   return true;
}

bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct pipe_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   bool free_nir;
   struct nir_shader *nir = get_nir_shader(sel, &free_nir);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->info.stage) &&
       !(sscreen->debug_flags & DBG(NO_NIR))) {
      nir_print_shader(nir, stderr);
      si_dump_streamout(&sel->so);
   }

   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
          sizeof(shader->info.vs_output_param_offset));

   shader->info.uses_instanceid = sel->info.uses_instanceid;

   /* TODO: ACO could compile non-monolithic shaders here (starting
    * with PS and NGG VS), but monolithic shaders should be compiled
    * by LLVM due to more complicated compilation.
    */
   if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
      return false;

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
    * LLVM 3.9svn has this bug.
    */
   if (sel->info.stage == MESA_SHADER_COMPUTE) {
      unsigned wave_size = sscreen->compute_wave_size;
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add the scratch offset to input SGPRs. */
   if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
      shader->info.num_input_sgprs += 1; /* scratch byte offset */

   /* Calculate the number of fragment input VGPRs. */
   if (sel->info.stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
   }

   si_calculate_max_simd_waves(shader);
   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   return true;
}

/**
 * Create, compile and return a shader part (prolog or epilog).
 *
 * \param sscreen	screen
 * \param list		list of shader parts of the same category
 * \param type		shader type
 * \param key		shader part key
 * \param prolog	whether the part being requested is a prolog
 * \param tm		LLVM target machine
 * \param debug		debug callback
 * \param build		the callback responsible for building the main function
 * \return		non-NULL on success
 */
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
                   gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
                   struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
                   void (*build)(struct si_shader_context *, union si_shader_part_key *),
                   const char *name)
{
   struct si_shader_part *result;

   simple_mtx_lock(&sscreen->shader_parts_mutex);

   /* Find existing. */
   for (result = *list; result; result = result->next) {
      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
         simple_mtx_unlock(&sscreen->shader_parts_mutex);
         return result;
      }
   }

   /* Compile a new one. */
   result = CALLOC_STRUCT(si_shader_part);
   result->key = *key;

   struct si_shader_selector sel = {};
   sel.screen = sscreen;

   struct si_shader shader = {};
   shader.selector = &sel;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      shader.key.as_ls = key->vs_prolog.as_ls;
      shader.key.as_es = key->vs_prolog.as_es;
      shader.key.as_ngg = key->vs_prolog.as_ngg;
      shader.key.opt.ngg_culling =
         (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
         (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
      shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
      break;
   case MESA_SHADER_TESS_CTRL:
      assert(!prolog);
      shader.key.part.tcs.epilog = key->tcs_epilog.states;
      break;
   case MESA_SHADER_GEOMETRY:
      assert(prolog);
      shader.key.as_ngg = key->gs_prolog.as_ngg;
      break;
   case MESA_SHADER_FRAGMENT:
      if (prolog)
         shader.key.part.ps.prolog = key->ps_prolog.states;
      else
         shader.key.part.ps.epilog = key->ps_epilog.states;
      break;
   default:
      unreachable("bad shader part");
   }

   struct si_shader_context ctx;
   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, stage,
                                         shader.key.as_ngg, shader.key.as_es,
                                         shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
                                         shader.key.opt.vs_as_prim_discard_cs));
   ctx.shader = &shader;
   ctx.stage = stage;

   build(&ctx, key);

   /* Compile. */
   si_llvm_optimize_module(&ctx);

   if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
                        ctx.stage, name, false)) {
      FREE(result);
      result = NULL;
      goto out;
   }

   result->next = *list;
   *list = result;

out:
   si_llvm_dispose(&ctx);
   simple_mtx_unlock(&sscreen->shader_parts_mutex);
   return result;
}

static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                             struct si_shader *shader, struct pipe_debug_callback *debug,
                             struct si_shader *main_part, const struct si_vs_prolog_bits *key)
{
   struct si_shader_selector *vs = main_part->selector;

   if (!si_vs_needs_prolog(vs, key, &shader->key, false))
      return true;

   /* Get the prolog. */
   union si_shader_part_key prolog_key;
   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
                        &prolog_key);

   shader->prolog =
      si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
                         compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
   return shader->prolog != NULL;
}

/**
 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
 */
static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct pipe_debug_callback *debug)
{
   return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
}

/**
 * Select and compile (or reuse) TCS parts (epilog).
 */
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                       struct si_shader *shader, struct pipe_debug_callback *debug)
{
   if (sscreen->info.chip_class >= GFX9) {
      struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;

      if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
                            &shader->key.part.tcs.ls_prolog))
         return false;

      shader->previous_stage = ls_main_part;
   }

   /* Get the epilog. */
   union si_shader_part_key epilog_key;
   memset(&epilog_key, 0, sizeof(epilog_key));
   epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;

   shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
                                       &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
                                       "Tessellation Control Shader Epilog");
   return shader->epilog != NULL;
}

/**
 * Select and compile (or reuse) GS parts (prolog).
 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct pipe_debug_callback *debug)
{
   if (sscreen->info.chip_class >= GFX9) {
      struct si_shader *es_main_part;

      if (shader->key.as_ngg)
         es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
      else
         es_main_part = shader->key.part.gs.es->main_shader_part_es;

      if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
          !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
                            &shader->key.part.gs.vs_prolog))
         return false;

      shader->previous_stage = es_main_part;
   }

   if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
      return true;

   union si_shader_part_key prolog_key;
   memset(&prolog_key, 0, sizeof(prolog_key));
   prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
   prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;

   shader->prolog2 =
      si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
                         compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
   return shader->prolog2 != NULL;
}

/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                          bool separate_prolog)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.part.ps.prolog;
   key->ps_prolog.colors_read = info->colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
   key->ps_prolog.wqm =
      info->base.fs.needs_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;

   if (info->colors_read) {
      ubyte *color = shader->selector->color_attr_index;

      if (shader->key.part.ps.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = info->num_inputs;
         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
         if (separate_prolog)
            shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(info->colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.part.ps.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.part.ps.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.part.ps.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.part.ps.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}

/**
 * Check whether a PS prolog is required based on the key.
 */
bool si_need_ps_prolog(const union si_shader_part_key *key)
{
   return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
          key->ps_prolog.states.force_linear_sample_interp ||
          key->ps_prolog.states.force_persp_center_interp ||
          key->ps_prolog.states.force_linear_center_interp ||
          key->ps_prolog.states.bc_optimize_for_persp ||
          key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
          key->ps_prolog.states.samplemask_log_ps_iter;
}

/**
 * Compute the PS epilog key, which contains all the information needed to
 * build the PS epilog function.
 */
void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;
   memset(key, 0, sizeof(*key));
   key->ps_epilog.colors_written = info->colors_written;
   key->ps_epilog.color_types = info->output_color_types;
   key->ps_epilog.writes_z = info->writes_z;
   key->ps_epilog.writes_stencil = info->writes_stencil;
   key->ps_epilog.writes_samplemask = info->writes_samplemask;
   key->ps_epilog.states = shader->key.part.ps.epilog;
}

/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct pipe_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key, true);

   /* The prolog is a no-op if these aren't set. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
   if (shader->key.part.ps.prolog.poly_stipple) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
      assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
   }

   /* Set up the enable bits for per-sample shading if needed. */
   if (shader->key.part.ps.prolog.force_persp_sample_interp &&
       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }
   if (shader->key.part.ps.prolog.force_linear_sample_interp &&
       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
   }
   if (shader->key.part.ps.prolog.force_persp_center_interp &&
       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }
   if (shader->key.part.ps.prolog.force_linear_center_interp &&
       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
   }

   /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
      assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* At least one pair of interpolation weights must be enabled. */
   if (!(shader->config.spi_ps_input_ena & 0x7f)) {
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
      assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* Samplemask fixup requires the sample ID. */
   if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
   }

   /* The sample mask input is always enabled, because the API shader always
    * passes it through to the epilog. Disable it here if it's unused.
    */
   if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
      shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

   return true;
}

void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    *   Make sure we have at least 4k of LDS in use to avoid the bug.
    *   It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}

void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
{
   unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */

   shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);

   if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
       si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
      si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
   }
}

bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct pipe_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);

   /* LS, ES, VS are compiled on demand if the main part hasn't been
    * compiled for that stage.
    *
    * GS are compiled on demand if the main part hasn't been compiled
    * for the chosen NGG-ness.
    *
    * Vertex shaders are compiled on demand when a vertex fetch
    * workaround must be applied.
    */
   if (shader->is_monolithic) {
      /* Monolithic shader (compiled as a whole, has many variants,
       * may take a long time to compile).
       */
      if (!si_compile_shader(sscreen, compiler, shader, debug))
         return false;
   } else {
      /* The shader consists of several parts:
       *
       * - the middle part is the user shader, it has 1 variant only
       *   and it was compiled during the creation of the shader
       *   selector
       * - the prolog part is inserted at the beginning
       * - the epilog part is inserted at the end
       *
       * The prolog and epilog have many (but simple) variants.
       *
       * Starting with gfx9, geometry and tessellation control
       * shaders also contain the prolog and user shader parts of
       * the previous shader stage.
       */

      if (!mainp)
         return false;

      /* Copy the compiled shader data over. */
      shader->is_binary_shared = true;
      shader->binary = mainp->binary;
      shader->config = mainp->config;
      shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
      shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
      shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
      shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
      memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
             sizeof(mainp->info.vs_output_param_offset));
      shader->info.uses_instanceid = mainp->info.uses_instanceid;
      shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
      shader->info.nr_param_exports = mainp->info.nr_param_exports;

      /* Select prologs and/or epilogs. */
      switch (sel->info.stage) {
      case MESA_SHADER_VERTEX:
         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_CTRL:
         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_EVAL:
         break;
      case MESA_SHADER_GEOMETRY:
         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_FRAGMENT:
         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
            return false;

         /* Make sure we have at least as many VGPRs as there
          * are allocated inputs.
          */
         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
         break;
      default:;
      }

      /* Update SGPR and VGPR counts. */
      if (shader->prolog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
      }
      if (shader->previous_stage) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
         shader->config.spilled_sgprs =
            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
         shader->config.spilled_vgprs =
            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
         shader->info.private_mem_vgprs =
            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
         shader->config.scratch_bytes_per_wave =
            MAX2(shader->config.scratch_bytes_per_wave,
                 shader->previous_stage->config.scratch_bytes_per_wave);
         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
      }
      if (shader->prolog2) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
      }
      if (shader->epilog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
      }
      si_calculate_max_simd_waves(shader);
   }

   if (shader->key.as_ngg) {
      assert(!shader->key.as_es && !shader->key.as_ls);
      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
         fprintf(stderr, "Failed to compute subgroup info\n");
         return false;
      }
   } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
   }

   si_fix_resource_usage(sscreen, shader);
   si_shader_dump(sscreen, shader, debug, stderr, true);

   /* Upload. */
   if (!si_shader_binary_upload(sscreen, shader, 0)) {
      fprintf(stderr, "LLVM failed to upload shader\n");
      return false;
   }

   return true;
}

void si_shader_binary_clean(struct si_shader_binary *binary)
{
   free((void *)binary->elf_buffer);
   binary->elf_buffer = NULL;

   free(binary->llvm_ir_string);
   binary->llvm_ir_string = NULL;
}

void si_shader_destroy(struct si_shader *shader)
{
   if (shader->scratch_bo)
      si_resource_reference(&shader->scratch_bo, NULL);

   si_resource_reference(&shader->bo, NULL);

   if (!shader->is_binary_shared)
      si_shader_binary_clean(&shader->binary);

   free(shader->shader_log);
}
