| /* |
| * Copyright 2024 Valve Corporation |
| * Copyright 2024 Alyssa Rosenzweig |
| * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. |
| * SPDX-License-Identifier: MIT |
| */ |
| #include "hk_shader.h" |
| |
| #include "agx_debug.h" |
| #include "agx_device.h" |
| #include "agx_helpers.h" |
| #include "agx_nir_lower_gs.h" |
| #include "glsl_types.h" |
| #include "libagx.h" |
| #include "nir.h" |
| #include "nir_builder.h" |
| |
| #include "agx_bo.h" |
| #include "hk_cmd_buffer.h" |
| #include "hk_descriptor_set_layout.h" |
| #include "hk_device.h" |
| #include "hk_physical_device.h" |
| #include "hk_sampler.h" |
| |
| #include "nir_builder_opcodes.h" |
| #include "nir_builtin_builder.h" |
| #include "nir_intrinsics.h" |
| #include "nir_intrinsics_indices.h" |
| #include "nir_xfb_info.h" |
| #include "shader_enums.h" |
| #include "vk_nir_convert_ycbcr.h" |
| #include "vk_physical_device_features.h" |
| #include "vk_pipeline.h" |
| #include "vk_pipeline_layout.h" |
| #include "vk_shader.h" |
| #include "vk_shader_module.h" |
| #include "vk_ycbcr_conversion.h" |
| |
| #include "asahi/compiler/agx_compile.h" |
| #include "asahi/compiler/agx_nir.h" |
| #include "asahi/compiler/agx_nir_texture.h" |
| #include "asahi/lib/agx_abi.h" |
| #include "asahi/lib/agx_linker.h" |
| #include "asahi/lib/agx_tilebuffer.h" |
| #include "asahi/lib/agx_uvs.h" |
| #include "compiler/spirv/nir_spirv.h" |
| |
| #include "util/blob.h" |
| #include "util/hash_table.h" |
| #include "util/macros.h" |
| #include "util/mesa-sha1.h" |
| #include "util/simple_mtx.h" |
| #include "util/u_debug.h" |
| #include "vulkan/vulkan_core.h" |
| |
| struct hk_fs_key { |
| bool zs_self_dep; |
| |
| /** True if sample shading is forced on via an API knob such as |
| * VkPipelineMultisampleStateCreateInfo::sampleShadingEnable |
| */ |
| bool force_sample_shading; |
| |
| uint8_t pad[2]; |
| }; |
| static_assert(sizeof(struct hk_fs_key) == 4, "packed"); |
| |
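| /* |
| * Size/alignment callback for lowering shared-memory variables to explicit |
| * types: booleans take 4 bytes and vectors are tightly packed with scalar |
| * alignment. |
| */ |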
| static void |
| shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) |
| { |
| assert(glsl_type_is_vector_or_scalar(type)); |
| |
| uint32_t comp_size = |
| glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; |
| unsigned length = glsl_get_vector_elements(type); |
| *size = comp_size * length; |
| *align = comp_size; |
| } |
| |
| uint64_t |
| hk_physical_device_compiler_flags(const struct hk_physical_device *pdev) |
| { |
| /* This could be optimized but it doesn't matter */ |
| return pdev->dev.debug; |
| } |
| |
| const nir_shader_compiler_options * |
| hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, |
| UNUSED const struct vk_pipeline_robustness_state *rs) |
| { |
| return &agx_nir_options; |
| } |
| |
| static struct spirv_to_nir_options |
| hk_get_spirv_options(struct vk_physical_device *vk_pdev, |
| UNUSED gl_shader_stage stage, |
| const struct vk_pipeline_robustness_state *rs) |
| { |
| return (struct spirv_to_nir_options){ |
| .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), |
| .phys_ssbo_addr_format = nir_address_format_64bit_global, |
| .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), |
| .shared_addr_format = nir_address_format_32bit_offset, |
| .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT, |
| .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT, |
| }; |
| } |
| |
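| /* |
| * Rewrite nir_jump_halt into nir_jump_return in the entrypoint so that the |
| * nir_lower_returns pass run immediately afterwards can turn it into |
| * structured control flow. |
| */ |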
| static bool |
| lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data) |
| { |
| if (instr->type != nir_instr_type_jump) |
| return false; |
| |
| nir_jump_instr *jump = nir_instr_as_jump(instr); |
| if (jump->type != nir_jump_halt) |
| return false; |
| |
| assert(b->impl == nir_shader_get_entrypoint(b->shader)); |
| jump->type = nir_jump_return; |
| return true; |
| } |
| |
| void |
| hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir) |
| { |
| /* Must lower before io to temps */ |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| NIR_PASS(_, nir, nir_lower_terminate_to_demote); |
| NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return, |
| nir_metadata_all, NULL); |
| NIR_PASS(_, nir, nir_lower_returns); |
| } |
| |
| /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */ |
| UNUSED bool progress = false; |
| NIR_PASS(_, nir, nir_lower_global_vars_to_local); |
| |
| do { |
| progress = false; |
| NIR_PASS(progress, nir, nir_lower_vars_to_ssa); |
| NIR_PASS(progress, nir, nir_copy_prop); |
| NIR_PASS(progress, nir, nir_opt_dce); |
| NIR_PASS(progress, nir, nir_opt_constant_folding); |
| NIR_PASS(progress, nir, nir_opt_loop); |
| NIR_PASS(progress, nir, nir_opt_loop_unroll); |
| } while (progress); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| struct nir_lower_sysvals_to_varyings_options sysvals_opts = { |
| .point_coord = true, |
| }; |
| |
| nir_lower_sysvals_to_varyings(nir, &sysvals_opts); |
| } |
| |
| NIR_PASS(_, nir, nir_lower_system_values); |
| |
| /* Gather info before preprocess_nir but after some general lowering, so |
| * inputs_read and system_values_read are accurately set. |
| */ |
| nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); |
| |
| NIR_PASS(_, nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), |
| true, false); |
| |
| NIR_PASS(_, nir, nir_lower_global_vars_to_local); |
| |
| NIR_PASS(_, nir, nir_split_var_copies); |
| NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); |
| |
| /* Optimize but allow copies because we haven't lowered them yet */ |
| agx_preprocess_nir(nir); |
| |
| NIR_PASS(_, nir, nir_lower_load_const_to_scalar); |
| NIR_PASS(_, nir, nir_lower_var_copies); |
| } |
| |
| static void |
| hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir, |
| UNUSED const struct vk_pipeline_robustness_state *rs) |
| { |
| hk_preprocess_nir_internal(vk_pdev, nir); |
| nir_lower_compute_system_values_options csv_options = { |
| .has_base_workgroup_id = true, |
| }; |
| NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); |
| } |
| |
| static void |
| hk_populate_fs_key(struct hk_fs_key *key, |
| const struct vk_graphics_pipeline_state *state) |
| { |
| memset(key, 0, sizeof(*key)); |
| |
| if (state == NULL) |
| return; |
| |
| if (state->pipeline_flags & |
| VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) |
| key->zs_self_dep = true; |
| |
| /* We force per-sample interpolation whenever sampleShadingEnable is set |
| * regardless of minSampleShading or rasterizationSamples. |
| * |
| * When sampleShadingEnable is set, few guarantees are made about the |
| * location of interpolation of the inputs. The only real guarantees are |
| * that the inputs are interpolated within the pixel and that you get at |
| * least `rasterizationSamples * minSampleShading` unique positions. |
| * Importantly, it does not require that when `rasterizationSamples * |
| * minSampleShading <= 1.0` that those positions are at the fragment |
| * center. Therefore, it's valid to just always do per-sample all the time. |
| * |
| * The one caveat here is that we have to be careful about gl_SampleMaskIn. |
| * When `hk_fs_key::force_sample_shading = true` we also turn any reads of |
| * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask |
| * is actually per-fragment, not per-pass. We handle this by smashing |
| * minSampleShading to 1.0 whenever gl_SampleMaskIn is read. |
| */ |
| const struct vk_multisample_state *ms = state->ms; |
| if (ms != NULL && ms->sample_shading_enable) |
| key->force_sample_shading = true; |
| } |
| |
| enum hk_feature_key { |
| HK_FEAT_MIN_LOD = BITFIELD_BIT(0), |
| HK_FEAT_CUSTOM_BORDER = BITFIELD_BIT(1), |
| }; |
| |
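| /* |
| * Collapse the device features that affect shader compilation into a small |
| * key. A NULL feature struct conservatively reports every feature enabled. |
| */ |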
| static enum hk_feature_key |
| hk_make_feature_key(const struct vk_features *features) |
| { |
| if (!features) |
| return ~0U; |
| |
| return (features->minLod ? HK_FEAT_MIN_LOD : 0) | |
| (features->customBorderColors ? HK_FEAT_CUSTOM_BORDER : 0); |
| } |
| |
| static void |
| hk_hash_graphics_state(struct vk_physical_device *device, |
| const struct vk_graphics_pipeline_state *state, |
| const struct vk_features *features, |
| VkShaderStageFlags stages, blake3_hash blake3_out) |
| { |
| struct mesa_blake3 blake3_ctx; |
| _mesa_blake3_init(&blake3_ctx); |
| if (state && (stages & VK_SHADER_STAGE_FRAGMENT_BIT)) { |
| struct hk_fs_key key; |
| hk_populate_fs_key(&key, state); |
| _mesa_blake3_update(&blake3_ctx, &key, sizeof(key)); |
| |
| const bool is_multiview = state->rp->view_mask != 0; |
| _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview)); |
| } |
| |
| enum hk_feature_key feature_key = hk_make_feature_key(features); |
| _mesa_blake3_update(&blake3_ctx, &feature_key, sizeof(feature_key)); |
| |
| _mesa_blake3_final(&blake3_ctx, blake3_out); |
| } |
| |
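| /* |
| * Return data if offs <= bound and zero otherwise, using the dedicated |
| * bounds_agx instruction for the common 32-bit scalar case. |
| */ |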
| static nir_def * |
| bounds_check(nir_builder *b, nir_def *data, nir_def *offs, nir_def *bound) |
| { |
| if (data->bit_size == 32 && data->num_components == 1) { |
| return nir_bounds_agx(b, data, offs, bound); |
| } else { |
| /* TODO: Optimize */ |
| return nir_bcsel(b, nir_uge(b, bound, offs), data, |
| nir_imm_zero(b, data->num_components, data->bit_size)); |
| } |
| } |
| |
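| /* |
| * Lower load_global_constant_offset/_bounded intrinsics to plain global |
| * loads. For bounded loads, with soft fault we load speculatively and zero |
| * the result when out-of-bounds; without soft fault we predicate the load |
| * on the bounds check instead. |
| */ |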
| static bool |
| lower_load_global_constant_offset_instr(nir_builder *b, |
| nir_intrinsic_instr *intrin, void *data) |
| { |
| if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset && |
| intrin->intrinsic != nir_intrinsic_load_global_constant_bounded) |
| return false; |
| |
| b->cursor = nir_before_instr(&intrin->instr); |
| bool *has_soft_fault = data; |
| |
| nir_def *base_addr = intrin->src[0].ssa; |
| nir_def *offset = intrin->src[1].ssa; |
| nir_def *bound = NULL; |
| nir_def *zero = NULL; |
| |
| unsigned bit_size = intrin->def.bit_size; |
| assert(bit_size >= 8 && bit_size % 8 == 0); |
| unsigned byte_size = bit_size / 8; |
| unsigned load_size = byte_size * intrin->num_components; |
| |
| if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { |
| bound = intrin->src[2].ssa; |
| zero = nir_imm_zero(b, intrin->num_components, bit_size); |
| |
| nir_def *sat_offset = |
| nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1))); |
| nir_def *in_bounds = |
| nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound); |
| |
| /* If we do not have soft fault, we predicate the load on the bounds check |
| * with a branch. This is slow; fortunately, release drivers always have |
| * soft fault. |
| * |
| * With soft fault, we speculatively load and smash to zero at the end. |
| */ |
| if (!(*has_soft_fault)) |
| nir_push_if(b, in_bounds); |
| } |
| |
| unsigned align_mul = nir_intrinsic_align_mul(intrin); |
| unsigned align_offset = nir_intrinsic_align_offset(intrin); |
| |
| nir_def *val = nir_build_load_global_constant( |
| b, intrin->def.num_components, intrin->def.bit_size, |
| nir_iadd(b, base_addr, nir_u2u64(b, offset)), .align_mul = align_mul, |
| .align_offset = align_offset, .access = nir_intrinsic_access(intrin)); |
| |
| if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { |
| if (*has_soft_fault) { |
| nir_scalar offs = nir_scalar_resolved(offset, 0); |
| if (nir_scalar_is_const(offs)) { |
| /* Calculate last byte loaded */ |
| unsigned offs_imm = nir_scalar_as_uint(offs) + load_size; |
| |
| /* Simplify the bounds check. Uniform buffers are bounds checked at |
| * 64B granularity, so `bound` is a multiple of K = 64. Then |
| * |
| * offs_imm < bound <==> round_down(offs_imm, K) < bound. Proof: |
| * |
| * "=>" round_down(offs_imm, K) <= offs_imm < bound. |
| * |
| * "<=" Let a, b be integer s.t. offs_imm = K a + b with b < K. |
| * Note round_down(offs_imm, K) = Ka. |
| * |
| * Let c be integer s.t. bound = Kc. |
| * We have Ka < Kc => a < c. |
| * b < K => Ka + b < K(a + 1). |
| * |
| * a < c with integers => a + 1 <= c. |
| * offs_imm < K(a + 1) <= Kc = bound. |
| * Hence offs_imm < bound. |
| */ |
| assert(align_mul == 64); |
| offs_imm &= ~(align_mul - 1); |
| |
| /* Bounds checks are `offset > bound ? 0 : val` so if offset = 0, |
| * the bounds check is useless. |
| */ |
| if (offs_imm) { |
| val = bounds_check(b, val, nir_imm_int(b, offs_imm), bound); |
| } |
| } else { |
| offset = nir_iadd_imm(b, offset, load_size); |
| val = bounds_check(b, val, offset, bound); |
| } |
| |
| } else { |
| nir_pop_if(b, NULL); |
| val = nir_if_phi(b, val, zero); |
| } |
| } |
| |
| nir_def_replace(&intrin->def, val); |
| return true; |
| } |
| |
| struct lower_ycbcr_state { |
| uint32_t set_layout_count; |
| struct vk_descriptor_set_layout *const *set_layouts; |
| }; |
| |
| static const struct vk_ycbcr_conversion_state * |
| lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding, |
| uint32_t array_index) |
| { |
| const struct lower_ycbcr_state *state = _state; |
| assert(set < state->set_layout_count); |
| assert(state->set_layouts[set] != NULL); |
| const struct hk_descriptor_set_layout *set_layout = |
| vk_to_hk_descriptor_set_layout(state->set_layouts[set]); |
| assert(binding < set_layout->binding_count); |
| |
| const struct hk_descriptor_set_binding_layout *bind_layout = |
| &set_layout->binding[binding]; |
| |
| if (bind_layout->immutable_samplers == NULL) |
| return NULL; |
| |
| array_index = MIN2(array_index, bind_layout->array_size - 1); |
| |
| const struct hk_sampler *sampler = |
| bind_layout->immutable_samplers[array_index]; |
| |
| return sampler && sampler->vk.ycbcr_conversion |
| ? &sampler->vk.ycbcr_conversion->state |
| : NULL; |
| } |
| |
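| /* Type size callback for nir_lower_io, counting vec4 slots. */ |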
| static int |
| glsl_type_size(const struct glsl_type *type, bool bindless) |
| { |
| return glsl_count_attribute_slots(type, false); |
| } |
| |
| /* |
| * This is the world's worst multiview implementation. We simply duplicate each |
| * draw on the CPU side, changing a uniform in between, and then plumb the view |
| * index into the layer ID here. Whatever, it works. |
| * |
| * The "proper" implementation on AGX would use vertex amplification, but a |
| * MacBook is not a VR headset. |
| */ |
| static void |
| hk_lower_multiview(nir_shader *nir) |
| { |
| /* If there's an existing layer ID write, ignore it. This avoids validation |
| * splat with vk_meta. |
| */ |
| nir_variable *existing = nir_find_variable_with_location( |
| nir, nir_var_shader_out, VARYING_SLOT_LAYER); |
| |
| if (existing) { |
| existing->data.mode = nir_var_shader_temp; |
| existing->data.location = 0; |
| nir_fixup_deref_modes(nir); |
| } |
| |
| /* Now write the view index as the layer */ |
| nir_builder b = |
| nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); |
| |
| nir_variable *layer = |
| nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL); |
| |
| layer->data.location = VARYING_SLOT_LAYER; |
| |
| nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1)); |
| b.shader->info.outputs_written |= VARYING_BIT_LAYER; |
| } |
| |
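| /* |
| * Custom border colour lowering: sample twice, once with the border clamped |
| * to 1 and once clamped to 0, then reconstruct the result with the queried |
| * custom border colour by lerping (floats) or selecting componentwise |
| * (integers and gathers). |
| */ |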
| static nir_def * |
| query_custom_border(nir_builder *b, nir_tex_instr *tex) |
| { |
| return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4, |
| tex->dest_type, false, false); |
| } |
| |
| static nir_def * |
| has_custom_border(nir_builder *b, nir_tex_instr *tex) |
| { |
| return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx, |
| 1, nir_type_bool1, false, false); |
| } |
| |
| static bool |
| lower(nir_builder *b, nir_tex_instr *tex, UNUSED void *_data) |
| { |
| if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex)) |
| return false; |
| |
| /* XXX: this is a really weird edge case, is this even well-defined? */ |
| if (tex->is_shadow) |
| return false; |
| |
| b->cursor = nir_after_instr(&tex->instr); |
| nir_def *has_custom = has_custom_border(b, tex); |
| |
| nir_instr *orig = nir_instr_clone(b->shader, &tex->instr); |
| nir_builder_instr_insert(b, orig); |
| nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def; |
| |
| nir_push_if(b, has_custom); |
| nir_def *replaced = NULL; |
| { |
| /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */ |
| nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr); |
| nir_builder_instr_insert(b, clone_instr); |
| |
| nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr); |
| nir_def *clamp_to_0 = &tex_0->def; |
| |
| tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0; |
| |
| /* Grab the border colour */ |
| nir_def *border = query_custom_border(b, tex_0); |
| |
| if (tex->op == nir_texop_tg4) { |
| border = nir_replicate(b, nir_channel(b, border, tex->component), 4); |
| } |
| |
| /* Combine together with the border */ |
| if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float && |
| tex->op != nir_texop_tg4) { |
| |
| /* For floats, lerp together: |
| * |
| * For border texels: (1 * border) + (0 * (1 - border)) = border |
| * For regular texels: (x * border) + (x * (1 - border)) = x. |
| * |
| * Linear filtering is linear (duh), so lerping is compatible. |
| */ |
| replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border); |
| } else { |
| /* For integers, just select componentwise since there is no linear |
| * filtering. Gathers also use this path since they are unfiltered in |
| * each component. |
| */ |
| replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0, |
| border); |
| } |
| } |
| nir_pop_if(b, NULL); |
| |
| /* Put it together with a phi */ |
| nir_def *phi = nir_if_phi(b, replaced, clamp_to_1); |
| nir_def_replace(&tex->def, phi); |
| return true; |
| } |
| |
| static bool |
| agx_nir_lower_custom_border(nir_shader *nir) |
| { |
| return nir_shader_tex_pass(nir, lower, nir_metadata_none, NULL); |
| } |
| |
| static nir_def * |
| query_min_lod(nir_builder *b, nir_tex_instr *tex, bool int_coords) |
| { |
| nir_alu_type T = int_coords ? nir_type_uint16 : nir_type_float16; |
| return nir_build_texture_query(b, tex, nir_texop_image_min_lod_agx, 1, T, |
| false, false); |
| } |
| |
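| /* |
| * Apply an image view's minimum LOD (VK_EXT_image_view_min_lod) by querying |
| * it and folding it into the texture instruction's lod/min_lod sources. |
| * Gathers instead return zero when the view's min LOD is nonzero, per the |
| * robustImageAccess2 rule quoted below. |
| */ |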
| static bool |
| lower_min_lod(nir_builder *b, nir_tex_instr *tex, UNUSED void *_data) |
| { |
| if (nir_tex_instr_is_query(tex)) |
| return false; |
| |
| /* Buffer textures don't have levels-of-detail */ |
| if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF) |
| return false; |
| |
| if (tex->backend_flags & AGX_TEXTURE_FLAG_NO_CLAMP) |
| return false; |
| |
| bool int_coords = tex->op == nir_texop_txf || tex->op == nir_texop_txf_ms || |
| tex->op == nir_texop_tg4; |
| |
| b->cursor = nir_before_instr(&tex->instr); |
| nir_def *min_lod = query_min_lod(b, tex, int_coords); |
| nir_def *other_min_lod = nir_steal_tex_src(tex, nir_tex_src_min_lod); |
| |
| if (tex->op == nir_texop_tg4) { |
| b->cursor = nir_after_instr(&tex->instr); |
| |
| /* The Vulkan spec section "Texel Gathering" says: |
| * |
| * If level_base < minLodInteger_imageView, then any values fetched are |
| * zero if the robustImageAccess2 feature is enabled. |
| * |
| * We currently always enable robustImageAccess2, so implement that |
| * semantic here. |
| * |
| * We could probably optimize this with a special descriptor for this case |
| * but tg4 is rare enough I'm not bothered. |
| */ |
| nir_def *old = &tex->def; |
| nir_def *oob = nir_ine_imm(b, min_lod, 0); |
| nir_def *zero = nir_imm_zero(b, old->num_components, old->bit_size); |
| nir_def *new_ = nir_bcsel(b, oob, zero, old); |
| nir_def_rewrite_uses_after(old, new_, new_->parent_instr); |
| } else if (tex->op == nir_texop_txl) { |
| assert(other_min_lod == NULL && "txl doesn't have an API min lod"); |
| |
| nir_def *lod = nir_steal_tex_src(tex, nir_tex_src_lod); |
| if (lod) { |
| min_lod = nir_fmax(b, nir_f2fN(b, lod, min_lod->bit_size), min_lod); |
| } |
| |
| nir_tex_instr_add_src(tex, nir_tex_src_lod, min_lod); |
| } else { |
| if (other_min_lod) { |
| assert(!int_coords && "no API min lod"); |
| min_lod = nir_fmax(b, min_lod, nir_f2f16(b, other_min_lod)); |
| } |
| |
| nir_tex_instr_add_src(tex, nir_tex_src_min_lod, min_lod); |
| } |
| |
| return true; |
| } |
| |
| static bool |
| agx_nir_lower_image_view_min_lod(nir_shader *nir) |
| { |
| return nir_shader_tex_pass(nir, lower_min_lod, nir_metadata_none, NULL); |
| } |
| |
| /* |
| * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not |
| * written by the vertex shader, but in our implementation, the varying would |
| * otherwise be undefined. This small pass predicates VIEWPORT reads based on |
| * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index). |
| */ |
| static bool |
| lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) |
| { |
| if (intr->intrinsic != nir_intrinsic_load_input) |
| return false; |
| |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| if (sem.location != VARYING_SLOT_VIEWPORT) |
| return false; |
| |
| b->cursor = nir_after_instr(&intr->instr); |
| nir_def *orig = &intr->def; |
| |
| nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem); |
| nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0)); |
| |
| nir_def_rewrite_uses_after(orig, def, def->parent_instr); |
| return true; |
| } |
| |
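| /* |
| * Rewrite subpass sampler dims to plain 2D/MS so the rest of the texture |
| * lowering treats input attachments like ordinary textures. |
| */ |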
| static bool |
| lower_subpass_dim(nir_builder *b, nir_tex_instr *tex, UNUSED void *_data) |
| { |
| if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS) |
| tex->sampler_dim = GLSL_SAMPLER_DIM_2D; |
| else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) |
| tex->sampler_dim = GLSL_SAMPLER_DIM_MS; |
| else |
| return false; |
| |
| return true; |
| } |
| |
| static bool |
| should_lower_robust(const nir_intrinsic_instr *intr, const void *_) |
| { |
| /* The hardware is robust, but our software image atomics are not. Unlike |
| * the GL driver, we don't use the common buffer image lowering; we rely on |
| * the agx_nir_lower_texture lowering for robustImageAccess2 semantics |
| * instead. |
| */ |
| return intr->intrinsic == nir_intrinsic_image_deref_atomic || |
| intr->intrinsic == nir_intrinsic_image_deref_atomic_swap; |
| } |
| |
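| /* |
| * Vulkan-specific NIR lowering common to all stages: YCbCr, descriptors, |
| * explicit I/O, robustness/bounds checking, and the AGX texture lowering. |
| * Runs once per shader before compiling the per-stage variants. |
| */ |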
| static void |
| hk_lower_nir(struct hk_device *dev, nir_shader *nir, |
| const struct vk_pipeline_robustness_state *rs, bool is_multiview, |
| uint32_t set_layout_count, |
| struct vk_descriptor_set_layout *const *set_layouts, |
| enum hk_feature_key features) |
| { |
| if (HK_PERF(dev, NOROBUST)) { |
| rs = &vk_robustness_disabled; |
| } |
| |
| const nir_opt_access_options access_options = { |
| .is_vulkan = true, |
| }; |
| NIR_PASS(_, nir, nir_opt_access, &access_options); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| NIR_PASS(_, nir, nir_lower_input_attachments, |
| &(nir_input_attachment_options){ |
| .use_fragcoord_sysval = true, |
| .use_layer_id_sysval = true, |
| .use_view_id_for_layer = is_multiview, |
| }); |
| |
| NIR_PASS(_, nir, nir_shader_tex_pass, lower_subpass_dim, nir_metadata_all, |
| NULL); |
| NIR_PASS(_, nir, nir_lower_wpos_center); |
| } |
| |
| /* XXX: should be last geometry stage, how do I get to that? */ |
| if (nir->info.stage == MESA_SHADER_VERTEX) { |
| if (is_multiview) |
| hk_lower_multiview(nir); |
| } |
| |
| if (nir->info.stage == MESA_SHADER_TESS_EVAL) { |
| NIR_PASS(_, nir, nir_lower_patch_vertices, |
| nir->info.tess.tcs_vertices_out, NULL); |
| } |
| |
| const struct lower_ycbcr_state ycbcr_state = { |
| .set_layout_count = set_layout_count, |
| .set_layouts = set_layouts, |
| }; |
| NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, |
| &ycbcr_state); |
| |
| /* Lower push constants before lower_descriptors */ |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, |
| nir_address_format_32bit_offset); |
| |
| // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32); |
| |
| /* Turn cache flushes into image coherency bits while we still have derefs */ |
| NIR_PASS(_, nir, nir_lower_memory_model); |
| |
| NIR_PASS(_, nir, nir_lower_robust_access, should_lower_robust, NULL); |
| |
| /* We must do early lowering before hk_nir_lower_descriptors, since this will |
| * create lod_bias instructions. |
| */ |
| NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */); |
| |
| if (features & HK_FEAT_MIN_LOD) { |
| NIR_PASS(_, nir, agx_nir_lower_image_view_min_lod); |
| } |
| |
| if ((features & HK_FEAT_CUSTOM_BORDER) && !HK_PERF(dev, NOBORDER)) { |
| NIR_PASS(_, nir, agx_nir_lower_custom_border); |
| } |
| |
| NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count, |
| set_layouts); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, |
| nir_address_format_64bit_global); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, |
| hk_buffer_addr_format(rs->storage_buffers)); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, |
| hk_buffer_addr_format(rs->uniform_buffers)); |
| |
| /* Before inserting bounds checks, we want to do a fair bit of optimization. |
| * lower_load_global_constant_offset_instr has special optimizations for |
| * constant offsets, so we want as many offsets to be constant as possible. |
| */ |
| bool progress; |
| do { |
| progress = false; |
| NIR_PASS(progress, nir, nir_opt_constant_folding); |
| NIR_PASS(progress, nir, nir_opt_algebraic); |
| NIR_PASS(progress, nir, nir_copy_prop); |
| NIR_PASS(progress, nir, nir_opt_dce); |
| } while (progress); |
| |
| bool soft_fault = agx_has_soft_fault(&dev->dev); |
| NIR_PASS(_, nir, nir_shader_intrinsics_pass, |
| lower_load_global_constant_offset_instr, nir_metadata_none, |
| &soft_fault); |
| |
| assert(nir->info.shared_size == 0); |
| |
| NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, |
| shared_var_info); |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, |
| nir_address_format_32bit_offset); |
| |
| if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { |
| /* Align everything up to 16B so we can write whole vec4s. */ |
| nir->info.shared_size = align(nir->info.shared_size, 16); |
| NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size, |
| 16); |
| |
| /* We need to call lower_compute_system_values again because |
| * nir_zero_initialize_shared_memory generates load_invocation_id which |
| * has to be lowered to load_invocation_index. |
| */ |
| NIR_PASS(_, nir, nir_lower_compute_system_values, NULL); |
| } |
| |
| /* TODO: we can do indirect VS output */ |
| nir_variable_mode lower_indirect_modes = 0; |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) |
| lower_indirect_modes |= nir_var_shader_out; |
| else if (nir->info.stage == MESA_SHADER_VERTEX) |
| lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out; |
| |
| NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes, |
| UINT32_MAX); |
| |
| NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, |
| glsl_type_size, |
| nir_lower_io_lower_64bit_to_32 | |
| nir_lower_io_use_interpolated_input_intrinsics); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs, |
| nir_metadata_control_flow, NULL); |
| } |
| |
| NIR_PASS(_, nir, agx_nir_lower_texture); |
| NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store); |
| |
| agx_preprocess_nir(nir); |
| |
| nir_opt_peephole_select_options peephole_select_options = { |
| .limit = 0, |
| .discard_ok = true, |
| }; |
| NIR_PASS(_, nir, nir_opt_peephole_select, &peephole_select_options); |
| NIR_PASS(_, nir, nir_opt_if, |
| nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); |
| } |
| |
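| /* |
| * Finish a shader variant: upload the binary when a preamble or rodata |
| * needs a resident BO, fast-link eagerly when there is only a single linked |
| * variant, and pre-pack the COUNTS/fragment words used at bind time. |
| */ |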
| static void |
| hk_upload_shader(struct hk_device *dev, struct hk_shader *shader) |
| { |
| if (shader->b.info.has_preamble || shader->b.info.rodata.size_16) { |
| /* TODO: Do we want to compact? Revisit when we rework prologs/epilogs. */ |
| size_t size = shader->b.info.binary_size; |
| assert(size > 0); |
| |
| shader->bo = agx_bo_create(&dev->dev, size, 0, |
| AGX_BO_EXEC | AGX_BO_LOW_VA, "Preamble"); |
| memcpy(agx_bo_map(shader->bo), shader->b.binary, size); |
| shader->preamble_addr = |
| shader->bo->va->addr + shader->b.info.preamble_offset; |
| } |
| |
| if (!shader->linked.ht) { |
| /* If we only have a single variant, link now. */ |
| shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0); |
| } |
| |
| if (shader->info.stage == MESA_SHADER_FRAGMENT) { |
| agx_pack_fragment_face_2(&shader->frag_face, 0, &shader->b.info); |
| } |
| |
| agx_pack(&shader->counts, COUNTS, cfg) { |
| cfg.uniform_register_count = shader->b.info.push_count; |
| cfg.preshader_register_count = shader->b.info.nr_preamble_gprs; |
| cfg.sampler_state_register_count = agx_translate_sampler_state_count( |
| shader->b.info.uses_txf ? 1 : 0, false); |
| } |
| } |
| |
| DERIVE_HASH_TABLE(hk_fast_link_key_vs); |
| DERIVE_HASH_TABLE(hk_fast_link_key_fs); |
| |
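| /* |
| * Only vertex and fragment shaders get a hash table of fast-linked variants |
| * (keyed by hk_fast_link_key_vs/fs); other stages have exactly one linked |
| * variant, created at upload time. |
| */ |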
| static VkResult |
| hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage) |
| { |
| simple_mtx_init(&shader->linked.lock, mtx_plain); |
| |
| bool multiple_variants = |
| sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT; |
| |
| if (!multiple_variants) |
| return VK_SUCCESS; |
| |
| if (sw_stage == MESA_SHADER_VERTEX) |
| shader->linked.ht = hk_fast_link_key_vs_table_create(NULL); |
| else |
| shader->linked.ht = hk_fast_link_key_fs_table_create(NULL); |
| |
| return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY |
| : VK_SUCCESS; |
| } |
| |
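| /* |
| * Compile a single hk_shader variant with the AGX backend. Takes ownership |
| * of the NIR, which is freed before returning, and uploads the result. |
| */ |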
| static VkResult |
| hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, |
| nir_shader *nir, VkShaderCreateFlagsEXT shader_flags, |
| const struct vk_pipeline_robustness_state *rs, |
| const struct hk_fs_key *fs_key, struct hk_shader *shader, |
| gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info) |
| { |
| unsigned vs_uniform_base = 0; |
| |
| /* For now, only shader objects are supported */ |
| if (sw_stage == MESA_SHADER_VERTEX) { |
| vs_uniform_base = |
| 6 * DIV_ROUND_UP( |
| BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4); |
| } else if (sw_stage == MESA_SHADER_FRAGMENT) { |
| shader->info.fs.interp = agx_gather_interp_info(nir); |
| shader->info.fs.writes_memory = nir->info.writes_memory; |
| |
| /* Discards must be lowered before lowering MSAA so they are handled correctly */ |
| NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit); |
| NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, |
| &shader->info.fs.epilog_key); |
| NIR_PASS(_, nir, agx_nir_lower_sample_mask); |
| |
| if (nir->info.fs.uses_sample_shading) { |
| /* Ensure the sample mask is preserved in register */ |
| nir_builder b = |
| nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); |
| |
| nir_def *mask = |
| nir_load_exported_agx(&b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK); |
| |
| nir_export_agx(&b, mask, .base = AGX_ABI_FOUT_SAMPLE_MASK); |
| |
| NIR_PASS(_, nir, agx_nir_lower_to_per_sample); |
| } |
| |
| NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register); |
| NIR_PASS(_, nir, agx_nir_lower_interpolation); |
| } else if (sw_stage == MESA_SHADER_TESS_EVAL || |
| sw_stage == MESA_SHADER_TESS_CTRL) { |
| |
| shader->info.tess.info.ccw = nir->info.tess.ccw; |
| shader->info.tess.info.points = nir->info.tess.point_mode; |
| shader->info.tess.info.spacing = nir->info.tess.spacing; |
| shader->info.tess.info.mode = nir->info.tess._primitive_mode; |
| |
| if (sw_stage == MESA_SHADER_TESS_CTRL) { |
| shader->info.tess.tcs_output_patch_size = |
| nir->info.tess.tcs_vertices_out; |
| shader->info.tess.tcs_per_vertex_outputs = |
| agx_tcs_per_vertex_outputs(nir); |
| shader->info.tess.tcs_nr_patch_outputs = |
| util_last_bit(nir->info.patch_outputs_written); |
| shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir); |
| } else { |
| /* This destroys info so it needs to happen after the gather */ |
| NIR_PASS(_, nir, agx_nir_lower_tes, hw); |
| } |
| } |
| |
| uint64_t outputs = nir->info.outputs_written; |
| if (!hw && |
| (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) { |
| nir->info.stage = MESA_SHADER_COMPUTE; |
| memset(&nir->info.cs, 0, sizeof(nir->info.cs)); |
| nir->xfb_info = NULL; |
| } |
| |
| /* XXX: rename */ |
| NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base); |
| |
| #if 0 |
| /* TODO */ |
| nir_variable_mode robust2_modes = 0; |
| if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) |
| robust2_modes |= nir_var_mem_ubo; |
| if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) |
| robust2_modes |= nir_var_mem_ssbo; |
| #endif |
| |
| struct agx_shader_key backend_key = { |
| .dev = agx_gather_device_key(&dev->dev), |
| .reserved_preamble = 128 /* TODO */, |
| .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT, |
| .has_scratch = !nir->info.internal, |
| .promote_constants = true, |
| }; |
| |
| /* For now, sample shading is always dynamic. Indicate that. */ |
| if (nir->info.stage == MESA_SHADER_FRAGMENT && |
| nir->info.fs.uses_sample_shading) |
| backend_key.fs.inside_sample_loop = true; |
| |
| simple_mtx_t *lock = NULL; |
| if (agx_get_compiler_debug()) |
| lock = &hk_device_physical(dev)->debug_compile_lock; |
| |
| if (lock) |
| simple_mtx_lock(lock); |
| |
| agx_compile_shader_nir(nir, &backend_key, &shader->b); |
| |
| if (lock) |
| simple_mtx_unlock(lock); |
| |
| shader->code_ptr = shader->b.binary; |
| shader->code_size = shader->b.info.binary_size; |
| |
| shader->info.stage = sw_stage; |
| shader->info.clip_distance_array_size = nir->info.clip_distance_array_size; |
| shader->info.cull_distance_array_size = nir->info.cull_distance_array_size; |
| shader->b.info.outputs = outputs; |
| |
| if (xfb_info) { |
| assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs)); |
| |
| memcpy(&shader->info.xfb_info, xfb_info, |
| nir_xfb_info_size(xfb_info->output_count)); |
| |
| typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4); |
| } |
| |
| if (nir->constant_data_size > 0) { |
| uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT); |
| |
| void *data = malloc(data_size); |
| if (data == NULL) { |
| ralloc_free(nir); |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| memcpy(data, nir->constant_data, nir->constant_data_size); |
| |
| assert(nir->constant_data_size <= data_size); |
| memset(data + nir->constant_data_size, 0, |
| data_size - nir->constant_data_size); |
| |
| shader->data_ptr = data; |
| shader->data_size = data_size; |
| } |
| |
| ralloc_free(nir); |
| |
| VkResult result = hk_init_link_ht(shader, sw_stage); |
| if (result != VK_SUCCESS) |
| return vk_error(dev, result); |
| |
| hk_upload_shader(dev, shader); |
| return VK_SUCCESS; |
| } |
| |
| static const struct vk_shader_ops hk_shader_ops; |
| |
| static void |
| hk_destroy_linked_shader(struct hk_device *dev, struct hk_linked_shader *linked) |
| { |
| agx_bo_unreference(&dev->dev, linked->b.bo); |
| ralloc_free(linked); |
| } |
| |
| static void |
| hk_shader_destroy(struct hk_device *dev, struct hk_shader *s) |
| { |
| free((void *)s->code_ptr); |
| free((void *)s->data_ptr); |
| agx_bo_unreference(&dev->dev, s->bo); |
| |
| simple_mtx_destroy(&s->linked.lock); |
| |
| if (s->only_linked) |
| hk_destroy_linked_shader(dev, s->only_linked); |
| |
| if (s->linked.ht) { |
| hash_table_foreach(s->linked.ht, entry) { |
| hk_destroy_linked_shader(dev, entry->data); |
| } |
| _mesa_hash_table_destroy(s->linked.ht, NULL); |
| } |
| } |
| |
| void |
| hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); |
| struct hk_api_shader *obj = |
| container_of(vk_shader, struct hk_api_shader, vk); |
| |
| hk_foreach_variant(obj, shader) { |
| hk_shader_destroy(dev, shader); |
| } |
| |
| vk_shader_free(&dev->vk, pAllocator, &obj->vk); |
| } |
| |
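| /* |
| * Lowering for shaders that run as hardware vertex shaders (the HW |
| * vertex/tess-eval variants and the GS rasterization shader), as opposed to |
| * the compute-based variants used when feeding geometry or tessellation. |
| */ |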
| static void |
| hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader) |
| { |
| /* Point size must be clamped; excessively large points don't render |
| * properly on G13. |
| * |
| * Must be synced with pointSizeRange. |
| */ |
| NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f); |
| |
| /* TODO: Optimize out for monolithic? */ |
| NIR_PASS(_, nir, nir_lower_default_point_size); |
| |
| NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); |
| NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); |
| |
| NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs); |
| } |
| |
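| /* |
| * Compile every variant of an API shader object up front: geometry shaders |
| * produce several internal variants (with and without rasterization |
| * discard), vertex and tess-eval shaders get hardware and software |
| * (compute) variants, and all other stages compile a single variant. |
| */ |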
| VkResult |
| hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, |
| const struct vk_graphics_pipeline_state *state, |
| const struct vk_features *vk_features, |
| const VkAllocationCallbacks *pAllocator, |
| struct hk_api_shader **shader_out) |
| { |
| VkResult result; |
| enum hk_feature_key features = hk_make_feature_key(vk_features); |
| |
| /* We consume the NIR, regardless of success or failure */ |
| nir_shader *nir = info->nir; |
| |
| size_t size = sizeof(struct hk_api_shader) + |
| sizeof(struct hk_shader) * hk_num_variants(info->stage); |
| struct hk_api_shader *obj = |
| vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size); |
| |
| if (obj == NULL) { |
| ralloc_free(nir); |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| } |
| |
| /* TODO: Multiview with ESO */ |
| const bool is_multiview = state && state->rp->view_mask != 0; |
| |
| hk_lower_nir(dev, nir, info->robustness, is_multiview, |
| info->set_layout_count, info->set_layouts, features); |
| |
| gl_shader_stage sw_stage = nir->info.stage; |
| |
| struct hk_fs_key fs_key_tmp, *fs_key = NULL; |
| if (sw_stage == MESA_SHADER_FRAGMENT) { |
| hk_populate_fs_key(&fs_key_tmp, state); |
| fs_key = &fs_key_tmp; |
| |
| nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading; |
| |
| /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way |
| * to do this. |
| */ |
| if (fs_key->zs_self_dep) { |
| nir_builder b = |
| nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))); |
| nir_discard_if(&b, nir_imm_false(&b)); |
| nir->info.fs.uses_discard = true; |
| } |
| |
| NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false); |
| } else if (sw_stage == MESA_SHADER_TESS_CTRL) { |
| NIR_PASS(_, nir, agx_nir_lower_tcs); |
| } |
| |
| /* Compile all variants up front */ |
| if (sw_stage == MESA_SHADER_GEOMETRY) { |
| for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) { |
| struct hk_shader *main_variant = hk_main_gs_variant(obj, rast_disc); |
| struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc); |
| bool last = (rast_disc + 1) == 2; |
| |
| /* Each variant gets its own NIR. To save an extra clone, we use the |
| * original NIR for the last iteration. |
| */ |
| nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir); |
| nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; |
| |
| NIR_PASS(_, clone, agx_nir_lower_gs, rast_disc, &count, &rast, &pre_gs, |
| &count_variant->info.gs); |
| |
| if (!rast_disc) { |
| struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; |
| |
| hk_lower_hw_vs(rast, shader); |
| shader->info.gs = count_variant->info.gs; |
| } |
| |
| main_variant->info.gs = count_variant->info.gs; |
| |
| struct { |
| nir_shader *in; |
| struct hk_shader *out; |
| } variants[] = { |
| {clone, hk_main_gs_variant(obj, rast_disc)}, |
| {pre_gs, hk_pre_gs_variant(obj, rast_disc)}, |
| {count, count_variant}, |
| {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]}, |
| }; |
| |
| for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { |
| if (variants[v].in) { |
| result = hk_compile_nir(dev, pAllocator, variants[v].in, |
| info->flags, info->robustness, NULL, |
| variants[v].out, sw_stage, true, NULL); |
| if (result != VK_SUCCESS) { |
| hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); |
| if (clone != nir) { |
| ralloc_free(nir); |
| } |
| |
| ralloc_free(clone); |
| ralloc_free(pre_gs); |
| ralloc_free(count); |
| ralloc_free(rast); |
| return result; |
| } |
| } |
| } |
| |
| /* Nothing consumes this otherwise, so throw it away. |
| * |
| * TODO: We should just not generate it. |
| */ |
| if (rast_disc) { |
| ralloc_free(rast); |
| } |
| } |
| } else if (sw_stage == MESA_SHADER_VERTEX || |
| sw_stage == MESA_SHADER_TESS_EVAL) { |
| |
| VkShaderStageFlags next_stage = info->next_stage_mask; |
| |
| /* Transform feedback is layered on top of geometry shaders. If there is |
| * not a geometry shader in the pipeline, we will compile a geometry |
| * shader for the purpose. Update the next_stage mask accordingly. |
| */ |
| if (nir->xfb_info != NULL) { |
| next_stage |= VK_SHADER_STAGE_GEOMETRY_BIT; |
| } |
| |
| if (sw_stage == MESA_SHADER_VERTEX) { |
| assert( |
| !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) && |
| "Fixed-function attributes not used in Vulkan"); |
| |
| NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in); |
| } |
| |
| /* The shader_out portion of this is load-bearing even for tess eval */ |
| NIR_PASS(_, nir, nir_io_add_const_offset_to_base, |
| nir_var_shader_in | nir_var_shader_out); |
| |
| for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) { |
| /* Only compile the software variant if we might use this shader with |
| * geometry/tessellation. We need to compile the hardware variant |
| * unconditionally to handle the VS -> null FS case, which does not |
| * require setting the FRAGMENT bit. |
| */ |
| if (v == HK_VS_VARIANT_SW && |
| !(next_stage & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | |
| VK_SHADER_STAGE_GEOMETRY_BIT))) |
| continue; |
| |
| struct hk_shader *shader = &obj->variants[v]; |
| bool hw = v == HK_VS_VARIANT_HW; |
| bool last = (v + 1) == HK_VS_VARIANTS; |
| |
| /* Each variant gets its own NIR. To save an extra clone, we use the |
| * original NIR for the last iteration. |
| */ |
| nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir); |
| |
| if (sw_stage == MESA_SHADER_VERTEX) { |
| NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog, |
| shader->info.vs.attrib_components_read); |
| |
| shader->info.vs.attribs_read = |
| nir->info.inputs_read >> VERT_ATTRIB_GENERIC0; |
| } |
| |
| if (hw) { |
| hk_lower_hw_vs(clone, shader); |
| } else { |
| NIR_PASS(_, clone, agx_nir_lower_vs_before_gs); |
| } |
| |
| /* hk_compile_nir takes ownership of the clone */ |
| result = hk_compile_nir(dev, pAllocator, clone, info->flags, |
| info->robustness, fs_key, shader, sw_stage, hw, |
| nir->xfb_info); |
| if (result != VK_SUCCESS) { |
| hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); |
| ralloc_free(nir); |
| return result; |
| } |
| } |
| } else { |
| struct hk_shader *shader = hk_only_variant(obj); |
| |
| /* hk_compile_nir takes ownership of nir */ |
| result = |
| hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness, |
| fs_key, shader, sw_stage, true, NULL); |
| if (result != VK_SUCCESS) { |
| hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| *shader_out = obj; |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count, |
| struct vk_shader_compile_info *infos, |
| const struct vk_graphics_pipeline_state *state, |
| const struct vk_features *features, |
| const VkAllocationCallbacks *pAllocator, |
| struct vk_shader **shaders_out) |
| { |
| struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); |
| |
| for (uint32_t i = 0; i < shader_count; i++) { |
| VkResult result = |
| hk_compile_shader(dev, &infos[i], state, features, pAllocator, |
| (struct hk_api_shader **)&shaders_out[i]); |
| if (result != VK_SUCCESS) { |
| /* Clean up all the shaders before this point */ |
| for (uint32_t j = 0; j < i; j++) |
| hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator); |
| |
| /* Clean up all the NIR after this point */ |
| for (uint32_t j = i + 1; j < shader_count; j++) |
| ralloc_free(infos[j].nir); |
| |
| /* Memset the output array */ |
| memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); |
| |
| return result; |
| } |
| } |
| |
| return VK_SUCCESS; |
| } |
| |
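| /* |
| * Deserialize a single variant. The layout must stay in sync with |
| * hk_shader_serialize: hk_shader_info, agx_shader_info, the code and data |
| * sizes, then the code and data blobs. |
| */ |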
| static VkResult |
| hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob, |
| struct hk_shader *shader) |
| { |
| struct hk_shader_info info; |
| blob_copy_bytes(blob, &info, sizeof(info)); |
| |
| struct agx_shader_info b_info; |
| blob_copy_bytes(blob, &b_info, sizeof(b_info)); |
| |
| const uint32_t code_size = blob_read_uint32(blob); |
| const uint32_t data_size = blob_read_uint32(blob); |
| if (blob->overrun) |
| return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| |
| VkResult result = hk_init_link_ht(shader, info.stage); |
| if (result != VK_SUCCESS) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| shader->b.info = b_info; |
| shader->info = info; |
| shader->code_size = code_size; |
| shader->data_size = data_size; |
| shader->b.info.binary_size = code_size; |
| |
| shader->code_ptr = malloc(code_size); |
| if (shader->code_ptr == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| shader->data_ptr = malloc(data_size); |
| if (shader->data_ptr == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size); |
| blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size); |
| if (blob->overrun) |
| return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| |
| shader->b.binary = (void *)shader->code_ptr; |
| hk_upload_shader(dev, shader); |
| return VK_SUCCESS; |
| } |
| |
| static VkResult |
| hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob, |
| uint32_t binary_version, |
| const VkAllocationCallbacks *pAllocator, |
| struct vk_shader **shader_out) |
| { |
| struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); |
| |
| gl_shader_stage stage = blob_read_uint8(blob); |
| if (blob->overrun) |
| return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); |
| |
| size_t size = sizeof(struct hk_api_shader) + |
| sizeof(struct hk_shader) * hk_num_variants(stage); |
| |
| struct hk_api_shader *obj = |
| vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size); |
| |
| if (obj == NULL) |
| return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| hk_foreach_variant(obj, shader) { |
| VkResult result = hk_deserialize_shader(dev, blob, shader); |
| |
| if (result != VK_SUCCESS) { |
| hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); |
| return result; |
| } |
| } |
| |
| *shader_out = &obj->vk; |
| return VK_SUCCESS; |
| } |
| |
| static void |
| hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader, |
| struct blob *blob) |
| { |
| blob_write_bytes(blob, &shader->info, sizeof(shader->info)); |
| blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info)); |
| |
| blob_write_uint32(blob, shader->code_size); |
| blob_write_uint32(blob, shader->data_size); |
| blob_write_bytes(blob, shader->code_ptr, shader->code_size); |
| blob_write_bytes(blob, shader->data_ptr, shader->data_size); |
| } |
| |
| static bool |
| hk_api_shader_serialize(struct vk_device *vk_dev, |
| const struct vk_shader *vk_shader, struct blob *blob) |
| { |
| struct hk_api_shader *obj = |
| container_of(vk_shader, struct hk_api_shader, vk); |
| |
| blob_write_uint8(blob, vk_shader->stage); |
| |
| hk_foreach_variant(obj, shader) { |
| hk_shader_serialize(vk_dev, shader, blob); |
| } |
| |
| return !blob->out_of_memory; |
| } |
| |
| static VkResult |
| hk_shader_get_executable_properties( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties) |
| { |
| struct hk_api_shader *obj = |
| container_of(vk_shader, struct hk_api_shader, vk); |
| |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties, |
| executable_count); |
| |
| vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) |
| { |
| props->stages = mesa_to_vk_shader_stage(obj->vk.stage); |
| props->subgroupSize = 32; |
| VK_COPY_STR(props->name, _mesa_shader_stage_to_string(obj->vk.stage)); |
| VK_PRINT_STR(props->description, "%s shader", |
| _mesa_shader_stage_to_string(obj->vk.stage)); |
| } |
| |
| return vk_outarray_status(&out); |
| } |
| |
| static VkResult |
| hk_shader_get_executable_statistics( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t executable_index, uint32_t *statistic_count, |
| VkPipelineExecutableStatisticKHR *statistics) |
| { |
| struct hk_api_shader *obj = |
| container_of(vk_shader, struct hk_api_shader, vk); |
| |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, |
| statistic_count); |
| |
| assert(executable_index == 0); |
| |
| /* TODO: find a sane way to report multiple variants and have that play nice |
| * with zink. |
| */ |
| struct hk_shader *shader = hk_any_variant(obj); |
| vk_add_agx2_stats(out, &shader->b.info.stats); |
| return vk_outarray_status(&out); |
| } |
| |
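| /* |
| * Copy a NUL-terminated IR string into a |
| * VkPipelineExecutableInternalRepresentationKHR, following the usual |
| * two-call idiom: report the required size when pData is NULL, otherwise |
| * copy and report truncation via the return value. |
| */ |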
| static bool |
| write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir, |
| const char *data) |
| { |
| ir->isText = VK_TRUE; |
| |
| size_t data_len = strlen(data) + 1; |
| |
| if (ir->pData == NULL) { |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| strncpy(ir->pData, data, ir->dataSize); |
| if (ir->dataSize < data_len) |
| return false; |
| |
| ir->dataSize = data_len; |
| return true; |
| } |
| |
| static VkResult |
| hk_shader_get_executable_internal_representations( |
| UNUSED struct vk_device *device, const struct vk_shader *vk_shader, |
| uint32_t executable_index, uint32_t *internal_representation_count, |
| VkPipelineExecutableInternalRepresentationKHR *internal_representations) |
| { |
| VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, |
| internal_representations, |
| internal_representation_count); |
| bool incomplete_text = false; |
| |
| assert(executable_index == 0); |
| |
| /* TODO */ |
| #if 0 |
| vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { |
| VK_COPY_STR(ir->name, "AGX assembly"); |
| VK_COPY_STR(ir->description, "AGX assembly"); |
| if (!write_ir_text(ir, TODO)) |
| incomplete_text = true; |
| } |
| #endif |
| |
| return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); |
| } |
| |
| static const struct vk_shader_ops hk_shader_ops = { |
| .destroy = hk_api_shader_destroy, |
| .serialize = hk_api_shader_serialize, |
| .get_executable_properties = hk_shader_get_executable_properties, |
| .get_executable_statistics = hk_shader_get_executable_statistics, |
| .get_executable_internal_representations = |
| hk_shader_get_executable_internal_representations, |
| }; |
| |
| const struct vk_device_shader_ops hk_device_shader_ops = { |
| .get_nir_options = hk_get_nir_options, |
| .get_spirv_options = hk_get_spirv_options, |
| .preprocess_nir = hk_preprocess_nir, |
| .hash_state = hk_hash_graphics_state, |
| .compile = hk_compile_shaders, |
| .deserialize = hk_deserialize_api_shader, |
| .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, |
| .cmd_bind_shaders = hk_cmd_bind_shaders, |
| }; |
| |
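| /* |
| * Link a main shader with optional prolog/epilog parts and bake the USC |
| * control words needed to bind the resulting program. |
| */ |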
| struct hk_linked_shader * |
| hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main, |
| struct agx_shader_part *prolog, struct agx_shader_part *epilog, |
| unsigned nr_samples_shaded) |
| { |
| struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader); |
| agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog, |
| nr_samples_shaded); |
| |
| if (fragment) { |
| agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) { |
| cfg.cf_binding_count = s->b.cf.nr_bindings; |
| cfg.uniform_register_count = main->b.info.push_count; |
| cfg.preshader_register_count = main->b.info.nr_preamble_gprs; |
| cfg.sampler_state_register_count = |
| agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false); |
| } |
| } |
| |
| /* Now that we've linked, bake the USC words to bind this program */ |
| struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data)); |
| |
| if (main && main->b.info.rodata.size_16) { |
| agx_usc_immediates(&b, &main->b.info.rodata, main->bo->va->addr); |
| } |
| |
| agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap); |
| |
| if (s->b.uses_txf) |
| agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler); |
| |
| agx_usc_shared_non_fragment(&b, &main->b.info, 0); |
| agx_usc_push_packed(&b, SHADER, s->b.shader); |
| agx_usc_push_packed(&b, REGISTERS, s->b.regs); |
| |
| if (fragment) |
| agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props); |
| |
| if (main && main->b.info.has_preamble) { |
| agx_usc_pack(&b, PRESHADER, cfg) { |
| cfg.code = agx_usc_addr(&dev->dev, main->preamble_addr); |
| } |
| } else { |
| agx_usc_pack(&b, NO_PRESHADER, cfg) |
| ; |
| } |
| |
| s->usc.size = b.head - s->usc.data; |
| return s; |
| } |