| /* |
| * Copyright © 2019 Raspberry Pi |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "vk_util.h" |
| |
| #include "v3dv_debug.h" |
| #include "v3dv_private.h" |
| |
| #include "vk_format_info.h" |
| |
| #include "common/v3d_debug.h" |
| |
| #include "compiler/nir/nir_builder.h" |
| |
| #include "util/u_atomic.h" |
| |
| #include "vulkan/util/vk_format.h" |
| |
| #include "broadcom/cle/v3dx_pack.h" |
| |
| VkResult |
| v3dv_CreateShaderModule(VkDevice _device, |
| const VkShaderModuleCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator, |
| VkShaderModule *pShaderModule) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| struct v3dv_shader_module *module; |
| |
| assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO); |
| assert(pCreateInfo->flags == 0); |
| |
| module = vk_alloc2(&device->alloc, pAllocator, |
| sizeof(*module) + pCreateInfo->codeSize, 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (module == NULL) |
| return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| module->nir = NULL; |
| |
| module->size = pCreateInfo->codeSize; |
| memcpy(module->data, pCreateInfo->pCode, module->size); |
| |
| _mesa_sha1_compute(module->data, module->size, module->sha1); |
| |
| *pShaderModule = v3dv_shader_module_to_handle(module); |
| |
| return VK_SUCCESS; |
| } |
| |
| void |
| v3dv_DestroyShaderModule(VkDevice _device, |
| VkShaderModule _module, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| V3DV_FROM_HANDLE(v3dv_shader_module, module, _module); |
| |
| if (!module) |
| return; |
| |
| /* NIR modules (which are only created internally by the driver) are not |
| * dynamically allocated so we should never call this for them. |
| * Instead the driver is responsible for freeing the NIR code when it is |
| * no longer needed. |
| */ |
| assert(module->nir == NULL); |
| |
| vk_free2(&device->alloc, pAllocator, module); |
| } |
| |
| static void |
| destroy_pipeline_stage(struct v3dv_device *device, |
| struct v3dv_pipeline_stage *p_stage, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| if (!p_stage) |
| return; |
| |
|    hash_table_foreach(p_stage->cache, entry) { |
|       struct v3dv_shader_variant *variant = entry->data; |
| |
|       /* Variants for which compilation failed may not have an assembly bo, |
|        * but we still need to free their prog_data and the variant itself. |
|        */ |
|       if (variant->assembly_bo) |
|          v3dv_bo_free(device, variant->assembly_bo); |
|       ralloc_free(variant->prog_data.base); |
|       vk_free2(&device->alloc, pAllocator, variant); |
|    } |
| |
| ralloc_free(p_stage->nir); |
| |
| _mesa_hash_table_destroy(p_stage->cache, NULL); |
| |
| vk_free2(&device->alloc, pAllocator, p_stage); |
| } |
| |
| static void |
| v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline, |
| struct v3dv_device *device, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| if (!pipeline) |
| return; |
| |
|    /* FIXME: we can't just loop over the mesa stages because of the bin |
|     * (coord) stage; it would be good to find an alternative. |
|     */ |
| destroy_pipeline_stage(device, pipeline->vs, pAllocator); |
| destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator); |
| destroy_pipeline_stage(device, pipeline->fs, pAllocator); |
| destroy_pipeline_stage(device, pipeline->cs, pAllocator); |
| |
| if (pipeline->spill.bo) { |
| assert(pipeline->spill.size_per_thread > 0); |
| v3dv_bo_free(device, pipeline->spill.bo); |
| } |
| |
| if (pipeline->default_attribute_values) { |
| v3dv_bo_free(device, pipeline->default_attribute_values); |
| pipeline->default_attribute_values = NULL; |
| } |
| |
| if (pipeline->combined_index_map) |
| _mesa_hash_table_destroy(pipeline->combined_index_map, NULL); |
| |
| vk_free2(&device->alloc, pAllocator, pipeline); |
| } |
| |
| void |
| v3dv_DestroyPipeline(VkDevice _device, |
| VkPipeline _pipeline, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline); |
| |
| if (!pipeline) |
| return; |
| |
| v3dv_destroy_pipeline(pipeline, device, pAllocator); |
| } |
| |
| static const struct spirv_to_nir_options default_spirv_options = { |
| .caps = { false }, |
| .ubo_addr_format = nir_address_format_32bit_index_offset, |
| .ssbo_addr_format = nir_address_format_32bit_index_offset, |
| .phys_ssbo_addr_format = nir_address_format_64bit_global, |
| .push_const_addr_format = nir_address_format_logical, |
| .shared_addr_format = nir_address_format_32bit_offset, |
| .frag_coord_is_sysval = false, |
| }; |
| |
| const nir_shader_compiler_options v3dv_nir_options = { |
| .lower_all_io_to_temps = true, |
| .lower_extract_byte = true, |
| .lower_extract_word = true, |
| .lower_bitfield_insert_to_shifts = true, |
| .lower_bitfield_extract_to_shifts = true, |
| .lower_bitfield_reverse = true, |
| .lower_bit_count = true, |
| .lower_cs_local_id_from_index = true, |
| .lower_ffract = true, |
| .lower_fmod = true, |
| .lower_pack_unorm_2x16 = true, |
| .lower_pack_snorm_2x16 = true, |
| .lower_unpack_unorm_2x16 = true, |
| .lower_unpack_snorm_2x16 = true, |
| .lower_pack_unorm_4x8 = true, |
| .lower_pack_snorm_4x8 = true, |
| .lower_unpack_unorm_4x8 = true, |
| .lower_unpack_snorm_4x8 = true, |
| .lower_pack_half_2x16 = true, |
| .lower_unpack_half_2x16 = true, |
| /* FIXME: see if we can avoid the uadd_carry and usub_borrow lowering and |
| * get the tests to pass since it might produce slightly better code. |
| */ |
| .lower_uadd_carry = true, |
| .lower_usub_borrow = true, |
| /* FIXME: check if we can use multop + umul24 to implement mul2x32_64 |
| * without lowering. |
| */ |
| .lower_mul_2x32_64 = true, |
| .lower_fdiv = true, |
| .lower_find_lsb = true, |
| .lower_ffma16 = true, |
| .lower_ffma32 = true, |
| .lower_ffma64 = true, |
| .lower_flrp32 = true, |
| .lower_fpow = true, |
| .lower_fsat = true, |
| .lower_fsqrt = true, |
| .lower_ifind_msb = true, |
| .lower_isign = true, |
| .lower_ldexp = true, |
| .lower_mul_high = true, |
| .lower_wpos_pntc = true, |
| .lower_rotate = true, |
| .lower_to_scalar = true, |
| .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic |
| * needs to be supported */ |
| }; |
| |
| const nir_shader_compiler_options * |
| v3dv_pipeline_get_nir_options(void) |
| { |
| return &v3dv_nir_options; |
| } |
| |
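| /* Helper to run a NIR pass: it accumulates into the local 'progress' |
|  * variable and, being a statement expression, also evaluates to whether |
|  * this particular pass made progress, so it can be used in a condition, |
|  * e.g.: |
|  * |
|  *    if (OPT(nir_opt_algebraic)) |
|  *       ... |
|  */ |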
| #define OPT(pass, ...) ({ \ |
| bool this_progress = false; \ |
| NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ |
| if (this_progress) \ |
| progress = true; \ |
| this_progress; \ |
| }) |
| |
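| /* Generic NIR optimization loop: keeps running the passes below until none |
|  * of them reports progress. 'allow_copies' should only be set on the first |
|  * call (see the nir_opt_find_array_copies comment below). |
|  */ |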
| static void |
| nir_optimize(nir_shader *nir, |
| struct v3dv_pipeline_stage *stage, |
| bool allow_copies) |
| { |
| bool progress; |
| |
| do { |
| progress = false; |
| OPT(nir_split_array_vars, nir_var_function_temp); |
| OPT(nir_shrink_vec_array_vars, nir_var_function_temp); |
| OPT(nir_opt_deref); |
| OPT(nir_lower_vars_to_ssa); |
| if (allow_copies) { |
| /* Only run this pass in the first call to nir_optimize. Later calls |
| * assume that we've lowered away any copy_deref instructions and we |
| * don't want to introduce any more. |
| */ |
| OPT(nir_opt_find_array_copies); |
| } |
| OPT(nir_opt_copy_prop_vars); |
| OPT(nir_opt_dead_write_vars); |
| OPT(nir_opt_combine_stores, nir_var_all); |
| |
| OPT(nir_lower_alu_to_scalar, NULL, NULL); |
| |
| OPT(nir_copy_prop); |
| OPT(nir_lower_phis_to_scalar); |
| |
| OPT(nir_copy_prop); |
| OPT(nir_opt_dce); |
| OPT(nir_opt_cse); |
| OPT(nir_opt_combine_stores, nir_var_all); |
| |
| /* Passing 0 to the peephole select pass causes it to convert |
| * if-statements that contain only move instructions in the branches |
| * regardless of the count. |
| * |
| * Passing 1 to the peephole select pass causes it to convert |
| * if-statements that contain at most a single ALU instruction (total) |
| * in both branches. |
| */ |
| OPT(nir_opt_peephole_select, 0, false, false); |
| OPT(nir_opt_peephole_select, 8, false, true); |
| |
| OPT(nir_opt_intrinsics); |
| OPT(nir_opt_idiv_const, 32); |
| OPT(nir_opt_algebraic); |
| OPT(nir_opt_constant_folding); |
| |
| OPT(nir_opt_dead_cf); |
| |
| OPT(nir_opt_if, false); |
| OPT(nir_opt_conditional_discard); |
| |
| OPT(nir_opt_remove_phis); |
| OPT(nir_opt_undef); |
| OPT(nir_lower_pack); |
| } while (progress); |
| |
| OPT(nir_remove_dead_variables, nir_var_function_temp, NULL); |
| } |
| |
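| /* Stage-independent NIR preprocessing: lowers variable initializers, splits |
|  * struct variables and copies, removes dead variables, lowers indirect |
|  * derefs and runs the generic optimization loop. Called once per shader |
|  * right after it is created (from SPIR-V or from a driver-internal NIR |
|  * module). |
|  */ |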
| static void |
| preprocess_nir(nir_shader *nir, |
| struct v3dv_pipeline_stage *stage) |
| { |
| /* Make sure we lower variable initializers on output variables so that |
| * nir_remove_dead_variables below sees the corresponding stores |
| */ |
| NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_shader_out); |
| |
| /* Now that we've deleted all but the main function, we can go ahead and |
| * lower the rest of the variable initializers. |
| */ |
| NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); |
| |
| /* Split member structs. We do this before lower_io_to_temporaries so that |
| * it doesn't lower system values to temporaries by accident. |
| */ |
| NIR_PASS_V(nir, nir_split_var_copies); |
| NIR_PASS_V(nir, nir_split_per_member_structs); |
| |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) |
| NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out); |
| if (nir->info.stage == MESA_SHADER_FRAGMENT) { |
| NIR_PASS_V(nir, nir_lower_input_attachments, |
| &(nir_input_attachment_options) { |
| .use_fragcoord_sysval = false, |
| }); |
| } |
| |
| NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | |
| nir_var_shader_out | nir_var_system_value | nir_var_mem_shared, |
| NULL); |
| |
| NIR_PASS_V(nir, nir_propagate_invariant); |
| NIR_PASS_V(nir, nir_lower_io_to_temporaries, |
| nir_shader_get_entrypoint(nir), true, false); |
| |
| NIR_PASS_V(nir, nir_lower_system_values); |
| NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); |
| |
| NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); |
| |
| NIR_PASS_V(nir, nir_normalize_cubemap_coords); |
| |
| NIR_PASS_V(nir, nir_lower_global_vars_to_local); |
| |
| NIR_PASS_V(nir, nir_split_var_copies); |
| NIR_PASS_V(nir, nir_split_struct_vars, nir_var_function_temp); |
| |
| nir_optimize(nir, stage, true); |
| |
| NIR_PASS_V(nir, nir_lower_load_const_to_scalar); |
| |
| /* Lower a bunch of stuff */ |
| NIR_PASS_V(nir, nir_lower_var_copies); |
| |
| NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_shader_in | |
| nir_var_shader_out | |
| nir_var_function_temp, UINT32_MAX); |
| |
| NIR_PASS_V(nir, nir_lower_array_deref_of_vec, |
| nir_var_mem_ubo | nir_var_mem_ssbo, |
| nir_lower_direct_array_deref_of_vec_load); |
| |
| NIR_PASS_V(nir, nir_lower_frexp); |
| |
| /* Get rid of split copies */ |
| nir_optimize(nir, stage, false); |
| } |
| |
| /* FIXME: This is basically the same code as in anv, tu and radv. Move to a |
|  * common place? |
|  */ |
| static struct nir_spirv_specialization* |
| vk_spec_info_to_nir_spirv(const VkSpecializationInfo *spec_info, |
| uint32_t *out_num_spec_entries) |
| { |
| if (spec_info == NULL || spec_info->mapEntryCount == 0) |
| return NULL; |
| |
| uint32_t num_spec_entries = spec_info->mapEntryCount; |
| struct nir_spirv_specialization *spec_entries = calloc(num_spec_entries, sizeof(*spec_entries)); |
| |
| for (uint32_t i = 0; i < num_spec_entries; i++) { |
| VkSpecializationMapEntry entry = spec_info->pMapEntries[i]; |
| const void *data = spec_info->pData + entry.offset; |
| assert(data + entry.size <= spec_info->pData + spec_info->dataSize); |
| |
| spec_entries[i].id = spec_info->pMapEntries[i].constantID; |
| switch (entry.size) { |
| case 8: |
| spec_entries[i].value.u64 = *(const uint64_t *)data; |
| break; |
| case 4: |
| spec_entries[i].value.u32 = *(const uint32_t *)data; |
| break; |
| case 2: |
| spec_entries[i].value.u16 = *(const uint16_t *)data; |
| break; |
| case 1: |
| spec_entries[i].value.u8 = *(const uint8_t *)data; |
| break; |
| default: |
| assert(!"Invalid spec constant size"); |
| break; |
| } |
| } |
| |
| *out_num_spec_entries = num_spec_entries; |
| return spec_entries; |
| } |
| |
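| /* Produces the preprocessed NIR for a pipeline stage: SPIR-V modules go |
|  * through spirv_to_nir with the stage's specialization constants, while |
|  * driver-internal NIR modules are cloned (the module keeps ownership of the |
|  * original code). In both cases everything is then inlined into a single |
|  * "main" entrypoint and preprocess_nir() is run on the result. |
|  */ |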
| static nir_shader * |
| shader_module_compile_to_nir(struct v3dv_device *device, |
| struct v3dv_pipeline_stage *stage) |
| { |
| nir_shader *nir; |
| const nir_shader_compiler_options *nir_options = &v3dv_nir_options; |
| |
| if (!stage->module->nir) { |
| uint32_t *spirv = (uint32_t *) stage->module->data; |
| assert(stage->module->size % 4 == 0); |
| |
| if (V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) |
| v3dv_print_spirv(stage->module->data, stage->module->size, stderr); |
| |
| uint32_t num_spec_entries = 0; |
| struct nir_spirv_specialization *spec_entries = |
| vk_spec_info_to_nir_spirv(stage->spec_info, &num_spec_entries); |
| const struct spirv_to_nir_options spirv_options = default_spirv_options; |
| nir = spirv_to_nir(spirv, stage->module->size / 4, |
| spec_entries, num_spec_entries, |
| stage->stage, stage->entrypoint, |
| &spirv_options, nir_options); |
| nir_validate_shader(nir, "after spirv_to_nir"); |
| free(spec_entries); |
| } else { |
|       /* For NIR modules created by the driver we can't consume the NIR |
|        * directly, we need to clone it first, since ownership of the NIR code |
|        * (as with SPIR-V code for SPIR-V shaders) belongs to the creator |
|        * of the module, and modules can be destroyed immediately after being |
|        * used to create pipelines. |
|        */ |
| nir = nir_shader_clone(NULL, stage->module->nir); |
| nir_validate_shader(nir, "nir module"); |
| } |
| assert(nir->info.stage == stage->stage); |
| |
| if (V3D_DEBUG & (V3D_DEBUG_NIR | |
| v3d_debug_flag_for_shader_stage(stage->stage))) { |
| fprintf(stderr, "Initial form: %s prog %d NIR:\n", |
| gl_shader_stage_name(stage->stage), |
| stage->program_id); |
| nir_print_shader(nir, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| /* We have to lower away local variable initializers right before we |
| * inline functions. That way they get properly initialized at the top |
| * of the function and not at the top of its caller. |
| */ |
| NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); |
| NIR_PASS_V(nir, nir_lower_returns); |
| NIR_PASS_V(nir, nir_inline_functions); |
| NIR_PASS_V(nir, nir_opt_deref); |
| |
| /* Pick off the single entrypoint that we want */ |
| foreach_list_typed_safe(nir_function, func, node, &nir->functions) { |
| if (func->is_entrypoint) |
| func->name = ralloc_strdup(func, "main"); |
| else |
| exec_node_remove(&func->node); |
| } |
| assert(exec_list_length(&nir->functions) == 1); |
| |
| /* Vulkan uses the separate-shader linking model */ |
| nir->info.separate_shader = true; |
| |
| preprocess_nir(nir, stage); |
| |
| return nir; |
| } |
| |
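| /* I/O size callback for nir_lower_io: returns the number of vec4 slots |
|  * taken by the type. The 'bindless' parameter is ignored. |
|  */ |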
| static int |
| type_size_vec4(const struct glsl_type *type, bool bindless) |
| { |
| return glsl_count_attribute_slots(type, false); |
| } |
| |
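| /* Returns the index in the descriptor map for the given (set, binding, |
|  * array_index) tuple, adding a new entry at the end if it is not already |
|  * there. This is the index that the lowering code below rewrites into the |
|  * NIR (e.g. as texture/sampler index or UBO/SSBO index). |
|  */ |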
| static unsigned |
| descriptor_map_add(struct v3dv_descriptor_map *map, |
| int set, |
| int binding, |
| int array_index, |
| int array_size) |
| { |
| assert(array_index < array_size); |
| |
| unsigned index = 0; |
| for (unsigned i = 0; i < map->num_desc; i++) { |
| if (set == map->set[i] && |
| binding == map->binding[i] && |
| array_index == map->array_index[i]) { |
| assert(array_size == map->array_size[i]); |
| return index; |
| } |
| index++; |
| } |
| |
| assert(index == map->num_desc); |
| |
| map->set[map->num_desc] = set; |
| map->binding[map->num_desc] = binding; |
| map->array_index[map->num_desc] = array_index; |
| map->array_size[map->num_desc] = array_size; |
| map->num_desc++; |
| |
| return index; |
| } |
| |
| |
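| /* Push constant data is uploaded through the uniform stream, so loads from |
|  * push constants become plain uniform loads (note also that UBO index 0 is |
|  * reserved for push constants in lower_vulkan_resource_index below); only |
|  * the intrinsic type needs to change, the offset source is kept as-is. |
|  */ |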
| static void |
| lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr, |
| struct v3dv_pipeline *pipeline) |
| { |
| assert(instr->intrinsic == nir_intrinsic_load_push_constant); |
| instr->intrinsic = nir_intrinsic_load_uniform; |
| } |
| |
| /* Gathers info from the intrinsic (set and binding) and then lowers it so it |
|  * can be consumed by the v3d_compiler */ |
| static void |
| lower_vulkan_resource_index(nir_builder *b, |
| nir_intrinsic_instr *instr, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index); |
| |
| nir_const_value *const_val = nir_src_as_const_value(instr->src[0]); |
| |
| unsigned set = nir_intrinsic_desc_set(instr); |
| unsigned binding = nir_intrinsic_binding(instr); |
| struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; |
| struct v3dv_descriptor_set_binding_layout *binding_layout = |
| &set_layout->binding[binding]; |
| unsigned index = 0; |
| |
| switch (nir_intrinsic_desc_type(instr)) { |
| case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: |
| case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { |
| struct v3dv_descriptor_map *descriptor_map = |
| nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ? |
| &pipeline->ubo_map : &pipeline->ssbo_map; |
| |
| if (!const_val) |
| unreachable("non-constant vulkan_resource_index array index"); |
| |
| index = descriptor_map_add(descriptor_map, set, binding, |
| const_val->u32, |
| binding_layout->array_size); |
| |
| if (nir_intrinsic_desc_type(instr) == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { |
| /* skip index 0 which is used for push constants */ |
| index++; |
| } |
| break; |
| } |
| |
| default: |
| unreachable("unsupported desc_type for vulkan_resource_index"); |
| break; |
| } |
| |
| nir_ssa_def_rewrite_uses(&instr->dest.ssa, |
| nir_src_for_ssa(nir_imm_int(b, index))); |
| nir_instr_remove(&instr->instr); |
| } |
| |
| static struct hash_table * |
| pipeline_ensure_combined_index_map(struct v3dv_pipeline *pipeline) |
| { |
| if (pipeline->combined_index_map == NULL) { |
| pipeline->combined_index_map = |
| _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal); |
| pipeline->next_combined_index = 0; |
| } |
| |
| assert(pipeline->combined_index_map); |
| |
| return pipeline->combined_index_map; |
| } |
| |
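| /* Maps a (texture, sampler) index pair to a single combined index: the pair |
|  * is packed into a hash table key and new combined indices are assigned |
|  * sequentially, with combined_index_to_key_map recording the reverse |
|  * mapping. Callers then use the combined index for both the texture and the |
|  * sampler (see lower_sampler below). |
|  */ |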
| static uint32_t |
| get_combined_index(struct v3dv_pipeline *pipeline, |
| uint32_t texture_index, |
| uint32_t sampler_index) |
| { |
| struct hash_table *ht = pipeline_ensure_combined_index_map(pipeline); |
| uint32_t key = v3dv_pipeline_combined_index_key_create(texture_index, sampler_index); |
| struct hash_entry *entry = _mesa_hash_table_search(ht, &key); |
| |
| if (entry) |
| return (uint32_t)(uintptr_t) (entry->data); |
| |
| uint32_t new_index = pipeline->next_combined_index; |
| |
| _mesa_hash_table_insert(ht, &key, (void *)(uintptr_t) (new_index)); |
| pipeline->combined_index_to_key_map[new_index] = key; |
| pipeline->next_combined_index++; |
| |
| return new_index; |
| } |
| |
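| /* Walks the array derefs feeding a texture/sampler source, folding constant |
|  * indices into a base index and turning any dynamic part into a texture or |
|  * sampler offset source. The final (set, binding, array_index) tuple is |
|  * then registered in the pipeline's texture or sampler descriptor map and |
|  * the resulting index is stored in the instruction. |
|  */ |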
| static void |
| lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| nir_ssa_def *index = NULL; |
| unsigned base_index = 0; |
| unsigned array_elements = 1; |
| nir_tex_src *src = &instr->src[src_idx]; |
| bool is_sampler = src->src_type == nir_tex_src_sampler_deref; |
| |
|    /* First we compute the offsets */ |
| nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); |
| while (deref->deref_type != nir_deref_type_var) { |
| assert(deref->parent.is_ssa); |
| nir_deref_instr *parent = |
| nir_instr_as_deref(deref->parent.ssa->parent_instr); |
| |
| assert(deref->deref_type == nir_deref_type_array); |
| |
| if (nir_src_is_const(deref->arr.index) && index == NULL) { |
| /* We're still building a direct index */ |
| base_index += nir_src_as_uint(deref->arr.index) * array_elements; |
| } else { |
| if (index == NULL) { |
| /* We used to be direct but not anymore */ |
| index = nir_imm_int(b, base_index); |
| base_index = 0; |
| } |
| |
| index = nir_iadd(b, index, |
| nir_imul(b, nir_imm_int(b, array_elements), |
| nir_ssa_for_src(b, deref->arr.index, 1))); |
| } |
| |
| array_elements *= glsl_get_length(parent->type); |
| |
| deref = parent; |
| } |
| |
| if (index) |
| index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); |
| |
|    /* Now that we have the offsets, we apply them, rewriting the source or |
|     * removing the instruction if needed. |
|     */ |
| if (index) { |
| nir_instr_rewrite_src(&instr->instr, &src->src, |
| nir_src_for_ssa(index)); |
| |
| src->src_type = is_sampler ? |
| nir_tex_src_sampler_offset : |
| nir_tex_src_texture_offset; |
| } else { |
| nir_tex_instr_remove_src(instr, src_idx); |
| } |
| |
| uint32_t set = deref->var->data.descriptor_set; |
| uint32_t binding = deref->var->data.binding; |
| struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; |
| struct v3dv_descriptor_set_binding_layout *binding_layout = |
| &set_layout->binding[binding]; |
| |
|    /* For input attachments, the shader includes the attachment_idx. As we |
|     * are treating them as textures, we only want the base_index. |
|     */ |
| uint32_t array_index = binding_layout->type != VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT ? |
| deref->var->data.index + base_index : |
| base_index; |
| |
| int desc_index = |
| descriptor_map_add(is_sampler ? |
| &pipeline->sampler_map : &pipeline->texture_map, |
| deref->var->data.descriptor_set, |
| deref->var->data.binding, |
| array_index, |
| binding_layout->array_size); |
| if (is_sampler) |
| instr->sampler_index = desc_index; |
| else |
| instr->texture_index = desc_index; |
| } |
| |
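| /* Lowers the texture and sampler deref sources of a tex instruction to |
|  * descriptor map indices and replaces both with a single combined |
|  * texture/sampler index. Returns whether the instruction was changed. |
|  */ |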
| static bool |
| lower_sampler(nir_builder *b, nir_tex_instr *instr, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| int texture_idx = |
| nir_tex_instr_src_index(instr, nir_tex_src_texture_deref); |
| |
| if (texture_idx >= 0) |
| lower_tex_src_to_offset(b, instr, texture_idx, pipeline, layout); |
| |
| int sampler_idx = |
| nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); |
| |
| if (sampler_idx >= 0) |
| lower_tex_src_to_offset(b, instr, sampler_idx, pipeline, layout); |
| |
| if (texture_idx < 0 && sampler_idx < 0) |
| return false; |
| |
| int combined_index = |
| get_combined_index(pipeline, |
| instr->texture_index, |
| sampler_idx < 0 ? V3DV_NO_SAMPLER_IDX : instr->sampler_index); |
| |
| instr->texture_index = combined_index; |
| instr->sampler_index = combined_index; |
| |
| return true; |
| } |
| |
| /* FIXME: really similar to lower_tex_src_to_offset, perhaps refactor? */ |
| static void |
| lower_image_deref(nir_builder *b, |
| nir_intrinsic_instr *instr, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); |
| nir_ssa_def *index = NULL; |
| unsigned array_elements = 1; |
| unsigned base_index = 0; |
| |
| while (deref->deref_type != nir_deref_type_var) { |
| assert(deref->parent.is_ssa); |
| nir_deref_instr *parent = |
| nir_instr_as_deref(deref->parent.ssa->parent_instr); |
| |
| assert(deref->deref_type == nir_deref_type_array); |
| |
| if (nir_src_is_const(deref->arr.index) && index == NULL) { |
| /* We're still building a direct index */ |
| base_index += nir_src_as_uint(deref->arr.index) * array_elements; |
| } else { |
| if (index == NULL) { |
| /* We used to be direct but not anymore */ |
| index = nir_imm_int(b, base_index); |
| base_index = 0; |
| } |
| |
| index = nir_iadd(b, index, |
| nir_imul(b, nir_imm_int(b, array_elements), |
| nir_ssa_for_src(b, deref->arr.index, 1))); |
| } |
| |
| array_elements *= glsl_get_length(parent->type); |
| |
| deref = parent; |
| } |
| |
| if (index) |
| index = nir_umin(b, index, nir_imm_int(b, array_elements - 1)); |
| |
| uint32_t set = deref->var->data.descriptor_set; |
| uint32_t binding = deref->var->data.binding; |
| struct v3dv_descriptor_set_layout *set_layout = layout->set[set].layout; |
| struct v3dv_descriptor_set_binding_layout *binding_layout = |
| &set_layout->binding[binding]; |
| |
| uint32_t array_index = deref->var->data.index + base_index; |
| |
| assert(binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); |
| |
| int desc_index = |
| descriptor_map_add(&pipeline->texture_map, |
| deref->var->data.descriptor_set, |
| deref->var->data.binding, |
| array_index, |
| binding_layout->array_size); |
| |
| /* We still need to get a combined_index, as we are integrating images with |
| * the rest of the texture/sampler support |
| */ |
| int combined_index = |
| get_combined_index(pipeline, desc_index, V3DV_NO_SAMPLER_IDX); |
| |
| index = nir_imm_int(b, combined_index); |
| |
| nir_rewrite_image_intrinsic(instr, index, false); |
| } |
| |
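| /* Lowers the intrinsics that need pipeline or pipeline layout info: push |
|  * constants, vulkan_resource_index, image derefs, and load_layer_id (which |
|  * is hardwired to 0 until layered rendering gets supported). Returns |
|  * whether the intrinsic was lowered. |
|  */ |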
| static bool |
| lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| switch (instr->intrinsic) { |
| case nir_intrinsic_load_layer_id: |
| /* FIXME: if layered rendering gets supported, this would need a real |
| * lowering |
| */ |
| nir_ssa_def_rewrite_uses(&instr->dest.ssa, |
| nir_src_for_ssa(nir_imm_int(b, 0))); |
| nir_instr_remove(&instr->instr); |
| return true; |
| |
| case nir_intrinsic_load_push_constant: |
| lower_load_push_constant(b, instr, pipeline); |
| pipeline->use_push_constants = true; |
| return true; |
| |
| case nir_intrinsic_vulkan_resource_index: |
| lower_vulkan_resource_index(b, instr, pipeline, layout); |
| return true; |
| |
| case nir_intrinsic_image_deref_load: |
| case nir_intrinsic_image_deref_store: |
| case nir_intrinsic_image_deref_atomic_add: |
| case nir_intrinsic_image_deref_atomic_imin: |
| case nir_intrinsic_image_deref_atomic_umin: |
| case nir_intrinsic_image_deref_atomic_imax: |
| case nir_intrinsic_image_deref_atomic_umax: |
| case nir_intrinsic_image_deref_atomic_and: |
| case nir_intrinsic_image_deref_atomic_or: |
| case nir_intrinsic_image_deref_atomic_xor: |
| case nir_intrinsic_image_deref_atomic_exchange: |
| case nir_intrinsic_image_deref_atomic_comp_swap: |
| case nir_intrinsic_image_deref_size: |
| case nir_intrinsic_image_deref_samples: |
| lower_image_deref(b, instr, pipeline, layout); |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| static bool |
| lower_impl(nir_function_impl *impl, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| bool progress = false; |
| |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| b.cursor = nir_before_instr(instr); |
| switch (instr->type) { |
| case nir_instr_type_tex: |
| progress |= |
| lower_sampler(&b, nir_instr_as_tex(instr), pipeline, layout); |
| break; |
| case nir_instr_type_intrinsic: |
| progress |= |
| lower_intrinsic(&b, nir_instr_as_intrinsic(instr), pipeline, layout); |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| lower_pipeline_layout_info(nir_shader *shader, |
| struct v3dv_pipeline *pipeline, |
| const struct v3dv_pipeline_layout *layout) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(function, shader) { |
| if (function->impl) |
| progress |= lower_impl(function->impl, pipeline, layout); |
| } |
| |
| return progress; |
| } |
| |
| |
| static void |
| lower_fs_io(nir_shader *nir) |
| { |
| /* Our backend doesn't handle array fragment shader outputs */ |
| NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); |
| NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_out, NULL); |
| |
| nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, |
| MESA_SHADER_FRAGMENT); |
| |
| nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, |
| MESA_SHADER_FRAGMENT); |
| |
| NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, |
| type_size_vec4, 0); |
| } |
| |
| static void |
| lower_vs_io(struct nir_shader *nir) |
| { |
| NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false); |
| |
| nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, |
| MESA_SHADER_VERTEX); |
| |
| nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, |
| MESA_SHADER_VERTEX); |
| |
| /* FIXME: if we call nir_lower_io, we get a crash later. Likely because it |
| * overlaps with v3d_nir_lower_io. Need further research though. |
| */ |
| } |
| |
| static void |
| shader_debug_output(const char *message, void *data) |
| { |
|    /* FIXME: We probably don't want to debug anything extra here, and in fact |
|     * the compiler doesn't use this callback much; it is only an alternative |
|     * way to dump the shaderdb stats, which you can already get using |
|     * V3D_DEBUG=shaderdb. Perhaps it would make sense to revisit the v3d |
|     * compiler and remove that callback. |
|     */ |
| } |
| |
| static void |
| pipeline_populate_v3d_key(struct v3d_key *key, |
| const struct v3dv_pipeline_stage *p_stage, |
| uint32_t ucp_enables) |
| { |
|    /* The following are default values used at pipeline creation time, which |
|     * lack the info about the real sampler/texture formats used, needed to |
|     * decide about lowerings and other things affecting the final assembly. |
|     * Once all that info is in place, we need to check whether a new shader |
|     * variant is required (if we are lucky the default values will match and |
|     * no new compilation will be done). |
|     */ |
| |
|    /* We don't use the nir shader info.num_textures because that doesn't take |
|     * into account input attachments, even after calling |
|     * nir_lower_input_attachments. As a general rule that makes sense, but in |
|     * our case we are handling them mostly as textures. We iterate through |
|     * the combined_index_map that was filled with the textures used in the |
|     * shader. |
|     */ |
| uint32_t tex_idx = 0; |
| if (p_stage->pipeline->combined_index_map) { |
| hash_table_foreach(p_stage->pipeline->combined_index_map, entry) { |
| key->tex[tex_idx].swizzle[0] = PIPE_SWIZZLE_X; |
| key->tex[tex_idx].swizzle[1] = PIPE_SWIZZLE_Y; |
| key->tex[tex_idx].swizzle[2] = PIPE_SWIZZLE_Z; |
| key->tex[tex_idx].swizzle[3] = PIPE_SWIZZLE_W; |
| |
| key->tex[tex_idx].return_size = 16; |
| key->tex[tex_idx].return_channels = 2; |
| |
| tex_idx++; |
| } |
| } |
| key->num_tex_used = tex_idx; |
| assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS); |
| |
|    /* Default value. It will be overridden in the vs/gs populate methods when |
|     * GS gets supported. |
|     */ |
| key->is_last_geometry_stage = true; |
| |
| /* Vulkan doesn't have fixed function state for user clip planes. Instead, |
| * shaders can write to gl_ClipDistance[], in which case the SPIR-V compiler |
| * takes care of adding a single compact array variable at |
| * VARYING_SLOT_CLIP_DIST0, so we don't need any user clip plane lowering. |
| * |
| * The only lowering we are interested is specific to the fragment shader, |
| * where we want to emit discards to honor writes to gl_ClipDistance[] in |
| * previous stages. This is done via nir_lower_clip_fs() so we only set up |
| * the ucp enable mask for that stage. |
| */ |
| key->ucp_enables = ucp_enables; |
| |
| key->environment = V3D_ENVIRONMENT_VULKAN; |
| } |
| |
| /* FIXME: anv maps to the hw primitive type. Perhaps eventually we should do |
|  * the same. For now we use pipe_prim_type, which is what v3d already uses. |
|  */ |
| static const enum pipe_prim_type vk_to_pipe_prim_type[] = { |
| [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS, |
| [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES, |
| [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP, |
| [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES, |
| [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP, |
| [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN, |
| [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY, |
| [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY, |
| [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY, |
| [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY, |
| }; |
| |
| static const enum pipe_logicop vk_to_pipe_logicop[] = { |
| [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR, |
| [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND, |
| [VK_LOGIC_OP_AND_REVERSE] = PIPE_LOGICOP_AND_REVERSE, |
| [VK_LOGIC_OP_COPY] = PIPE_LOGICOP_COPY, |
| [VK_LOGIC_OP_AND_INVERTED] = PIPE_LOGICOP_AND_INVERTED, |
| [VK_LOGIC_OP_NO_OP] = PIPE_LOGICOP_NOOP, |
| [VK_LOGIC_OP_XOR] = PIPE_LOGICOP_XOR, |
| [VK_LOGIC_OP_OR] = PIPE_LOGICOP_OR, |
| [VK_LOGIC_OP_NOR] = PIPE_LOGICOP_NOR, |
| [VK_LOGIC_OP_EQUIVALENT] = PIPE_LOGICOP_EQUIV, |
| [VK_LOGIC_OP_INVERT] = PIPE_LOGICOP_INVERT, |
| [VK_LOGIC_OP_OR_REVERSE] = PIPE_LOGICOP_OR_REVERSE, |
| [VK_LOGIC_OP_COPY_INVERTED] = PIPE_LOGICOP_COPY_INVERTED, |
| [VK_LOGIC_OP_OR_INVERTED] = PIPE_LOGICOP_OR_INVERTED, |
| [VK_LOGIC_OP_NAND] = PIPE_LOGICOP_NAND, |
| [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET, |
| }; |
| |
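| /* Fills the v3d_fs_key from the pipeline create info: topology, logic ops, |
|  * multisampling, depth state and the per-render-target format information |
|  * that decides swizzles and int/float output handling. |
|  */ |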
| static void |
| pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const struct v3dv_pipeline_stage *p_stage, |
| uint32_t ucp_enables) |
| { |
| memset(key, 0, sizeof(*key)); |
| |
| pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables); |
| |
| const VkPipelineInputAssemblyStateCreateInfo *ia_info = |
| pCreateInfo->pInputAssemblyState; |
| uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; |
| |
| key->is_points = (topology == PIPE_PRIM_POINTS); |
| key->is_lines = (topology >= PIPE_PRIM_LINES && |
| topology <= PIPE_PRIM_LINE_STRIP); |
| |
|    /* Vulkan doesn't appear to specify this (anv does the same) */ |
| key->clamp_color = false; |
| |
| const VkPipelineColorBlendStateCreateInfo *cb_info = |
| pCreateInfo->pColorBlendState; |
| |
| key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ? |
| vk_to_pipe_logicop[cb_info->logicOp] : |
| PIPE_LOGICOP_COPY; |
| |
| const VkPipelineMultisampleStateCreateInfo *ms_info = |
| pCreateInfo->pMultisampleState; |
| |
| /* FIXME: msaa not supported yet (although we add some of the code to |
| * translate vk sample info in advance) |
| */ |
| key->msaa = false; |
|    if (key->msaa && ms_info != NULL) { |
| uint32_t sample_mask = 0xffff; |
| |
| if (ms_info->pSampleMask) |
| sample_mask = ms_info->pSampleMask[0] & 0xffff; |
| |
| key->sample_coverage = (sample_mask != (1 << V3D_MAX_SAMPLES) - 1); |
| key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable; |
| key->sample_alpha_to_one = ms_info->alphaToOneEnable; |
| } |
| |
| const VkPipelineDepthStencilStateCreateInfo *ds_info = |
| pCreateInfo->pDepthStencilState; |
| |
| key->depth_enabled = (ds_info == NULL ? false : ds_info->depthTestEnable); |
| |
| /* Vulkan doesn't support alpha test */ |
| key->alpha_test = false; |
| key->alpha_test_func = COMPARE_FUNC_NEVER; |
| |
| /* FIXME: placeholder. Final value for swap_color_rb depends on the format |
| * of the surface to be used. |
| */ |
| key->swap_color_rb = false; |
| |
| const struct v3dv_render_pass *pass = |
| v3dv_render_pass_from_handle(pCreateInfo->renderPass); |
| const struct v3dv_subpass *subpass = p_stage->pipeline->subpass; |
| for (uint32_t i = 0; i < subpass->color_count; i++) { |
| const uint32_t att_idx = subpass->color_attachments[i].attachment; |
| if (att_idx == VK_ATTACHMENT_UNUSED) |
| continue; |
| |
| key->cbufs |= 1 << i; |
| |
| VkFormat fb_format = pass->attachments[att_idx].desc.format; |
| enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format); |
| |
| /* If logic operations are enabled then we might emit color reads and we |
| * need to know the color buffer format and swizzle for that |
| */ |
| if (key->logicop_func != PIPE_LOGICOP_COPY) { |
| key->color_fmt[i].format = fb_pipe_format; |
| key->color_fmt[i].swizzle = v3dv_get_format_swizzle(fb_format); |
| } |
| |
| const struct util_format_description *desc = |
| vk_format_description(fb_format); |
| |
| if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && |
| desc->channel[0].size == 32) { |
| key->f32_color_rb |= 1 << i; |
| } |
| |
| if (p_stage->nir->info.fs.untyped_color_outputs) { |
| if (util_format_is_pure_uint(fb_pipe_format)) |
| key->uint_color_rb |= 1 << i; |
| else if (util_format_is_pure_sint(fb_pipe_format)) |
| key->int_color_rb |= 1 << i; |
| } |
| |
| if (key->is_points) { |
|          /* FIXME: The mask would need to be computed based on the shader |
|           * inputs. On gallium it is done at st_atom_rasterizer |
|           * (sprite_coord_enable). anv seems (need to confirm) to do that in |
|           * genX_pipeline (PointSpriteTextureCoordinateEnable). It would also |
|           * be better to have tests to guide filling the mask. |
|           */ |
| key->point_sprite_mask = 0; |
| |
| /* Vulkan mandates upper left. */ |
| key->point_coord_upper_left = true; |
| } |
| } |
| |
|    /* FIXME: we understand that this is used in GL to configure fixed-function |
|     * two-sided lighting support, which doesn't make sense for Vulkan. Need to |
|     * confirm, though. |
|     */ |
| key->light_twoside = false; |
| |
|    /* FIXME: ditto, although for flat lighting. Again, need to confirm. */ |
| key->shade_model_flat = false; |
| } |
| |
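| /* Fills the v3d_vs_key from the pipeline create info. For the render (non |
|  * coord) shader, the outputs consumed by the current fs variant are copied |
|  * into the key so the compiler can drop the rest. |
|  */ |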
| static void |
| pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const struct v3dv_pipeline_stage *p_stage) |
| { |
| memset(key, 0, sizeof(*key)); |
| |
| pipeline_populate_v3d_key(&key->base, p_stage, 0); |
| |
|    /* Vulkan doesn't appear to specify this (anv does the same) */ |
| key->clamp_color = false; |
| |
|    /* Vulkan specifies the point size per vertex, so this is true if the |
|     * primitives are points, as in ES2. |
|     */ |
| const VkPipelineInputAssemblyStateCreateInfo *ia_info = |
| pCreateInfo->pInputAssemblyState; |
| uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; |
| |
|    /* FIXME: being PIPE_PRIM_POINTS is not enough; on gallium the full check |
|     * is PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ |
| key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); |
| |
| key->is_coord = p_stage->is_coord; |
| if (p_stage->is_coord) { |
| /* The only output varying on coord shaders are for transform |
| * feedback. Set to 0 as VK_EXT_transform_feedback is not supported. |
| */ |
| key->num_used_outputs = 0; |
| } else { |
| struct v3dv_pipeline *pipeline = p_stage->pipeline; |
| struct v3dv_shader_variant *fs_variant = pipeline->fs->current_variant; |
| |
| key->num_used_outputs = fs_variant->prog_data.fs->num_inputs; |
| |
| STATIC_ASSERT(sizeof(key->used_outputs) == |
| sizeof(fs_variant->prog_data.fs->input_slots)); |
| memcpy(key->used_outputs, fs_variant->prog_data.fs->input_slots, |
| sizeof(key->used_outputs)); |
| } |
| |
| const VkPipelineVertexInputStateCreateInfo *vi_info = |
| pCreateInfo->pVertexInputState; |
| for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { |
| const VkVertexInputAttributeDescription *desc = |
| &vi_info->pVertexAttributeDescriptions[i]; |
| assert(desc->location < MAX_VERTEX_ATTRIBS); |
| if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) |
| key->va_swap_rb_mask |= 1 << desc->location; |
| } |
| } |
| |
| /* FIXME: the following hash/compare methods are copied from v3d. Move to a |
|  * common place? |
|  */ |
| static uint32_t |
| fs_cache_hash(const void *key) |
| { |
| return _mesa_hash_data(key, sizeof(struct v3d_fs_key)); |
| } |
| |
| static uint32_t |
| vs_cache_hash(const void *key) |
| { |
| return _mesa_hash_data(key, sizeof(struct v3d_vs_key)); |
| } |
| |
| static uint32_t |
| cs_cache_hash(const void *key) |
| { |
| return _mesa_hash_data(key, sizeof(struct v3d_key)); |
| } |
| |
| static bool |
| fs_cache_compare(const void *key1, const void *key2) |
| { |
| return memcmp(key1, key2, sizeof(struct v3d_fs_key)) == 0; |
| } |
| |
| static bool |
| vs_cache_compare(const void *key1, const void *key2) |
| { |
| return memcmp(key1, key2, sizeof(struct v3d_vs_key)) == 0; |
| } |
| |
| static bool |
| cs_cache_compare(const void *key1, const void *key2) |
| { |
| return memcmp(key1, key2, sizeof(struct v3d_key)) == 0; |
| } |
| |
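| /* Creates the per-stage variant cache, keyed on the stage-specific key type |
|  * (the hash/compare callbacks above only differ in the key size they use). |
|  */ |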
| static struct hash_table* |
| create_variant_cache(gl_shader_stage stage) |
| { |
| switch (stage) { |
| case MESA_SHADER_VERTEX: |
| return _mesa_hash_table_create(NULL, vs_cache_hash, vs_cache_compare); |
| case MESA_SHADER_FRAGMENT: |
| return _mesa_hash_table_create(NULL, fs_cache_hash, fs_cache_compare); |
| case MESA_SHADER_COMPUTE: |
| return _mesa_hash_table_create(NULL, cs_cache_hash, cs_cache_compare); |
| default: |
| unreachable("not supported shader stage"); |
| } |
| } |
| |
| /* |
| * Creates the pipeline_stage for the coordinate shader. Initially a clone of |
|  * the vs pipeline_stage, with is_coord set to true. |
| */ |
| static struct v3dv_pipeline_stage* |
| pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| struct v3dv_device *device = src->pipeline->device; |
| |
| struct v3dv_pipeline_stage *p_stage = |
| vk_zalloc2(&device->alloc, pAllocator, sizeof(*p_stage), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| |
| p_stage->pipeline = src->pipeline; |
| assert(src->stage == MESA_SHADER_VERTEX); |
| p_stage->stage = src->stage; |
| p_stage->entrypoint = src->entrypoint; |
| p_stage->module = src->module; |
| p_stage->nir = nir_shader_clone(NULL, src->nir); |
| p_stage->spec_info = src->spec_info; |
| |
|    /* Technically we could share the hash_table, but giving each stage its |
|     * own makes destroying a p_stage more straightforward. |
|     */ |
| p_stage->cache = create_variant_cache(MESA_SHADER_VERTEX); |
| |
| p_stage->is_coord = true; |
| |
| return p_stage; |
| } |
| |
| /* FIXME: right now this just asks for a bo with the exact size of the qpu |
|  * assembly. It would be good to be slightly smarter and have one "all |
|  * shaders" bo per pipeline, where each p_stage/variant would store its |
|  * offset. That is particularly relevant because bos are always aligned to |
|  * 4096, so it would allow us to use less memory. |
|  * |
|  * For now one bo per assembly will do. |
|  * |
|  * Returns false if it was not able to allocate or map the assembly bo memory. |
|  */ |
| static bool |
| upload_assembly(struct v3dv_pipeline_stage *p_stage, |
| struct v3dv_shader_variant *variant, |
| const void *data, |
| uint32_t size) |
| { |
| const char *name = NULL; |
| /* We are uploading the assembly just once, so at this point we shouldn't |
| * have any bo |
| */ |
| assert(variant->assembly_bo == NULL); |
| struct v3dv_device *device = p_stage->pipeline->device; |
| |
| switch (p_stage->stage) { |
| case MESA_SHADER_VERTEX: |
| name = (p_stage->is_coord == true) ? "coord_shader_assembly" : |
| "vertex_shader_assembly"; |
| break; |
| case MESA_SHADER_FRAGMENT: |
| name = "fragment_shader_assembly"; |
| break; |
| case MESA_SHADER_COMPUTE: |
| name = "compute_shader_assembly"; |
| break; |
| default: |
| unreachable("Stage not supported\n"); |
| break; |
|    } |
| |
| struct v3dv_bo *bo = v3dv_bo_alloc(device, size, name, true); |
| if (!bo) { |
| fprintf(stderr, "failed to allocate memory for shader\n"); |
| return false; |
| } |
| |
|    bool ok = v3dv_bo_map(device, bo, size); |
|    if (!ok) { |
|       fprintf(stderr, "failed to map source shader buffer\n"); |
|       /* Don't leak the bo if mapping failed */ |
|       v3dv_bo_free(device, bo); |
|       return false; |
|    } |
| |
| memcpy(bo->map, data, size); |
| |
| v3dv_bo_unmap(device, bo); |
| |
| variant->assembly_bo = bo; |
| |
| return true; |
| } |
| |
| /* For a given key, returns the compiled version of the shader. If it was |
|  * already compiled, it is fetched from the p_stage cache; if not, it is |
|  * compiled through the v3d compiler. |
| * |
| * If the method returns NULL it means that it was not able to allocate the |
| * resources for the variant. out_vk_result would return which OOM applies. |
| */ |
| struct v3dv_shader_variant* |
| v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, |
| struct v3d_key *key, |
| size_t key_size, |
| const VkAllocationCallbacks *pAllocator, |
| VkResult *out_vk_result) |
| { |
| struct hash_table *ht = p_stage->cache; |
| struct hash_entry *entry = _mesa_hash_table_search(ht, key); |
| |
| if (entry) { |
| *out_vk_result = VK_SUCCESS; |
| return entry->data; |
| } |
| |
| struct v3dv_pipeline *pipeline = p_stage->pipeline; |
| struct v3dv_device *device = pipeline->device; |
| struct v3dv_shader_variant *variant = |
| vk_zalloc(&device->alloc, sizeof(*variant), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| |
| if (variant == NULL) { |
| *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY; |
| return NULL; |
| } |
| |
| struct v3dv_physical_device *physical_device = |
| &pipeline->device->instance->physicalDevice; |
| const struct v3d_compiler *compiler = physical_device->compiler; |
| |
| uint32_t variant_id = p_atomic_inc_return(&p_stage->compiled_variant_count); |
| |
| if (V3D_DEBUG & (V3D_DEBUG_NIR | |
| v3d_debug_flag_for_shader_stage(p_stage->stage))) { |
| fprintf(stderr, "Just before v3d_compile: %s prog %d variant %d NIR:\n", |
| gl_shader_stage_name(p_stage->stage), |
| p_stage->program_id, |
| variant_id); |
| nir_print_shader(p_stage->nir, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| uint64_t *qpu_insts; |
| uint32_t qpu_insts_size; |
| |
| qpu_insts = v3d_compile(compiler, |
| key, &variant->prog_data.base, |
| p_stage->nir, |
| shader_debug_output, NULL, |
| p_stage->program_id, |
| variant_id, |
| &qpu_insts_size); |
| |
| if (!qpu_insts) { |
| fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n", |
| gl_shader_stage_name(p_stage->stage), |
| p_stage->program_id); |
| } else { |
| if (!upload_assembly(p_stage, variant, qpu_insts, qpu_insts_size)) { |
| free(qpu_insts); |
| vk_free2(&device->alloc, pAllocator, variant); |
| |
| *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; |
| return NULL; |
| } |
| } |
| |
| free(qpu_insts); |
| |
| if (ht) { |
| struct v3d_key *dup_key; |
| dup_key = ralloc_size(ht, key_size); |
| memcpy(dup_key, key, key_size); |
| _mesa_hash_table_insert(ht, dup_key, variant); |
| } |
| |
| if (variant->prog_data.base->spill_size > pipeline->spill.size_per_thread) { |
| /* The TIDX register we use for choosing the area to access |
| * for scratch space is: (core << 6) | (qpu << 2) | thread. |
| * Even at minimum threadcount in a particular shader, that |
|        * means we still multiply the number of qpus by 4. |
| */ |
| const uint32_t total_spill_size = |
| 4 * device->devinfo.qpu_count * variant->prog_data.base->spill_size; |
| if (pipeline->spill.bo) { |
| assert(pipeline->spill.size_per_thread > 0); |
| v3dv_bo_free(device, pipeline->spill.bo); |
| } |
| pipeline->spill.bo = |
| v3dv_bo_alloc(device, total_spill_size, "spill", true); |
| pipeline->spill.size_per_thread = variant->prog_data.base->spill_size; |
| } |
| |
| *out_vk_result = VK_SUCCESS; |
| return variant; |
| } |
| |
| /* FIXME: copied from st; move to a common place? */ |
| static void |
| st_nir_opts(nir_shader *nir) |
| { |
| bool progress; |
| |
| do { |
| progress = false; |
| |
| NIR_PASS_V(nir, nir_lower_vars_to_ssa); |
| |
| /* Linking deals with unused inputs/outputs, but here we can remove |
| * things local to the shader in the hopes that we can cleanup other |
| * things. This pass will also remove variables with only stores, so we |
| * might be able to make progress after it. |
| */ |
| NIR_PASS(progress, nir, nir_remove_dead_variables, |
| (nir_variable_mode)(nir_var_function_temp | |
| nir_var_shader_temp | |
| nir_var_mem_shared), |
| NULL); |
| |
| NIR_PASS(progress, nir, nir_opt_copy_prop_vars); |
| NIR_PASS(progress, nir, nir_opt_dead_write_vars); |
| |
| if (nir->options->lower_to_scalar) { |
| NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); |
| NIR_PASS_V(nir, nir_lower_phis_to_scalar); |
| } |
| |
| NIR_PASS_V(nir, nir_lower_alu); |
| NIR_PASS_V(nir, nir_lower_pack); |
| NIR_PASS(progress, nir, nir_copy_prop); |
| NIR_PASS(progress, nir, nir_opt_remove_phis); |
| NIR_PASS(progress, nir, nir_opt_dce); |
| if (nir_opt_trivial_continues(nir)) { |
| progress = true; |
| NIR_PASS(progress, nir, nir_copy_prop); |
| NIR_PASS(progress, nir, nir_opt_dce); |
| } |
| NIR_PASS(progress, nir, nir_opt_if, false); |
| NIR_PASS(progress, nir, nir_opt_dead_cf); |
| NIR_PASS(progress, nir, nir_opt_cse); |
| NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); |
| |
| NIR_PASS(progress, nir, nir_opt_algebraic); |
| NIR_PASS(progress, nir, nir_opt_constant_folding); |
| |
| NIR_PASS(progress, nir, nir_opt_undef); |
| NIR_PASS(progress, nir, nir_opt_conditional_discard); |
| } while (progress); |
| } |
| |
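| /* Cross-stage linking of a producer/consumer pair: scalarizes the interface |
|  * if needed, removes unused varyings on both sides, and re-runs the |
|  * optimization loop whenever linking made progress. |
|  */ |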
| static void |
| link_shaders(nir_shader *producer, nir_shader *consumer) |
| { |
| assert(producer); |
| assert(consumer); |
| |
| if (producer->options->lower_to_scalar) { |
| NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out); |
| NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); |
| } |
| |
| nir_lower_io_arrays_to_elements(producer, consumer); |
| |
| st_nir_opts(producer); |
| st_nir_opts(consumer); |
| |
| if (nir_link_opt_varyings(producer, consumer)) |
| st_nir_opts(consumer); |
| |
| NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); |
| NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); |
| |
| if (nir_remove_unused_varyings(producer, consumer)) { |
| NIR_PASS_V(producer, nir_lower_global_vars_to_local); |
| NIR_PASS_V(consumer, nir_lower_global_vars_to_local); |
| |
| st_nir_opts(producer); |
| st_nir_opts(consumer); |
| |
| /* Optimizations can cause varyings to become unused. |
| * nir_compact_varyings() depends on all dead varyings being removed so |
| * we need to call nir_remove_dead_variables() again here. |
| */ |
| NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL); |
| NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); |
| } |
| } |
| |
| static void |
| pipeline_lower_nir(struct v3dv_pipeline *pipeline, |
| struct v3dv_pipeline_stage *p_stage, |
| struct v3dv_pipeline_layout *layout) |
| { |
| nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir)); |
| |
| /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ |
| NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout); |
| } |
| |
| /** |
| * The SPIR-V compiler will insert a sized compact array for |
| * VARYING_SLOT_CLIP_DIST0 if the vertex shader writes to gl_ClipDistance[], |
| * where the size of the array determines the number of active clip planes. |
| */ |
| static uint32_t |
| get_ucp_enable_mask(struct v3dv_pipeline_stage **stages) |
| { |
| const nir_shader *shader = stages[MESA_SHADER_VERTEX]->nir; |
| assert(shader); |
| |
| nir_foreach_variable_with_modes(var, shader, nir_var_shader_out) { |
| if (var->data.location == VARYING_SLOT_CLIP_DIST0) { |
| assert(var->data.compact); |
| return (1 << glsl_get_length(var->type)) - 1; |
| } |
| } |
| return 0; |
| } |
| |
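| /* Builds all the shader stages of a graphics pipeline in three phases: |
|  * gather the per-stage info and NIR (adding a no-op fs if none was |
|  * provided), link the stages from last to first, and finally lower each |
|  * stage against the pipeline layout and compile a default variant to VIR. |
|  */ |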
| static VkResult |
| pipeline_compile_graphics(struct v3dv_pipeline *pipeline, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| struct v3dv_pipeline_stage *stages[MESA_SHADER_STAGES] = { }; |
| struct v3dv_device *device = pipeline->device; |
| struct v3dv_physical_device *physical_device = |
| &device->instance->physicalDevice; |
| |
|    /* First pass to get the common info from the shader and build its nir |
|     * shader. We don't care about the coord shader for now. |
|     */ |
| for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { |
| const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i]; |
| gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage); |
| |
| struct v3dv_pipeline_stage *p_stage = |
| vk_zalloc2(&device->alloc, pAllocator, sizeof(*p_stage), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| |
|       /* Note that we are assigning program_id slightly differently than |
|        * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin |
|        * will have different program_ids, while v3d would have the same for |
|        * both. For the case of v3dv, it is more natural to have an id this |
|        * way, as right now we are using it for debugging, not for shader-db. |
|        */ |
| p_stage->program_id = |
| p_atomic_inc_return(&physical_device->next_program_id); |
| p_stage->compiled_variant_count = 0; |
| p_stage->cache = create_variant_cache(stage); |
| |
| p_stage->pipeline = pipeline; |
| p_stage->stage = stage; |
| if (stage == MESA_SHADER_VERTEX) |
| p_stage->is_coord = false; |
| p_stage->entrypoint = sinfo->pName; |
| p_stage->module = v3dv_shader_module_from_handle(sinfo->module); |
| p_stage->spec_info = sinfo->pSpecializationInfo; |
| |
| pipeline->active_stages |= sinfo->stage; |
| |
|       /* FIXME: when cache support is in place, first check whether, for the |
|        * given spirv module and options, we already have a nir shader. |
|        */ |
| p_stage->nir = shader_module_compile_to_nir(pipeline->device, p_stage); |
| |
| stages[stage] = p_stage; |
| } |
| |
| /* Add a no-op fragment shader if needed */ |
| if (!stages[MESA_SHADER_FRAGMENT]) { |
| nir_builder b; |
| nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, |
| &v3dv_nir_options); |
| b.shader->info.name = ralloc_strdup(b.shader, "noop_fs"); |
| |
| struct v3dv_pipeline_stage *p_stage = |
| vk_zalloc2(&device->alloc, pAllocator, sizeof(*p_stage), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| |
| p_stage->pipeline = pipeline; |
| p_stage->stage = MESA_SHADER_FRAGMENT; |
| p_stage->entrypoint = "main"; |
| p_stage->module = 0; |
| p_stage->nir = b.shader; |
| |
| p_stage->program_id = |
| p_atomic_inc_return(&physical_device->next_program_id); |
| p_stage->compiled_variant_count = 0; |
| p_stage->cache = create_variant_cache(MESA_SHADER_FRAGMENT); |
| |
| stages[MESA_SHADER_FRAGMENT] = p_stage; |
|       /* active_stages is a VkShaderStageFlags mask */ |
|       pipeline->active_stages |= VK_SHADER_STAGE_FRAGMENT_BIT; |
| } |
| |
| /* Linking */ |
| struct v3dv_pipeline_stage *next_stage = NULL; |
| for (int stage = MESA_SHADER_STAGES - 1; stage >= 0; stage--) { |
| if (stages[stage] == NULL || stages[stage]->entrypoint == NULL) |
| continue; |
| |
| struct v3dv_pipeline_stage *p_stage = stages[stage]; |
| |
| switch(stage) { |
| case MESA_SHADER_VERTEX: |
| link_shaders(p_stage->nir, next_stage->nir); |
| break; |
| case MESA_SHADER_FRAGMENT: |
| /* FIXME: not doing any specific linking stuff here yet */ |
| break; |
| default: |
| unreachable("not supported shader stage"); |
| } |
| |
| next_stage = stages[stage]; |
| } |
| |
| /* Compiling to vir. Note that at this point we are compiling a default |
| * variant. Binding to textures, and other stuff (that would need a |
| * cmd_buffer) would need a recompile |
| */ |
| for (int stage = MESA_SHADER_STAGES - 1; stage >= 0; stage--) { |
| if (stages[stage] == NULL || stages[stage]->entrypoint == NULL) |
| continue; |
| |
| struct v3dv_pipeline_stage *p_stage = stages[stage]; |
| |
| pipeline_lower_nir(pipeline, p_stage, pipeline->layout); |
| |
| switch(stage) { |
| case MESA_SHADER_VERTEX: { |
| /* Right now we only support pipelines with both vertex and fragment |
| * shader. |
| */ |
| assert(pipeline->fs); |
| |
| /* Make sure we do all our common lowering *before* we create the vs |
| * and vs_bin pipeline stages, since from that point forward we need to |
| * run lowerings for both of them separately, since each stage will |
| * own its NIR code. |
| */ |
| lower_vs_io(p_stage->nir); |
| |
| pipeline->vs = p_stage; |
| pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, pAllocator); |
| |
|       /* FIXME: this should likely be moved to a gather-info method that |
|        * fills a full struct inside pipeline_stage. |
|        */ |
| const VkPipelineInputAssemblyStateCreateInfo *ia_info = |
| pCreateInfo->pInputAssemblyState; |
| pipeline->vs->topology = vk_to_pipe_prim_type[ia_info->topology]; |
| |
|       /* Note that at this point we compile twice, once for vs and |
|        * once for vs_bin. For now we are maintaining two pipeline_stages. |
|        * |
|        * FIXME: this leads to two caches, when it shouldn't; revisit. |
|        */ |
| struct v3d_vs_key *key = &pipeline->vs->key.vs; |
| pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs); |
| VkResult vk_result; |
| pipeline->vs->current_variant = |
| v3dv_get_shader_variant(pipeline->vs, &key->base, sizeof(*key), |
| pAllocator, &vk_result); |
| if (vk_result != VK_SUCCESS) |
| return vk_result; |
| |
| key = &pipeline->vs_bin->key.vs; |
| pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs_bin); |
| pipeline->vs_bin->current_variant = |
| v3dv_get_shader_variant(pipeline->vs_bin, &key->base, sizeof(*key), |
| pAllocator, &vk_result); |
| if (vk_result != VK_SUCCESS) |
| return vk_result; |
| |
| break; |
| } |
| case MESA_SHADER_FRAGMENT: { |
| struct v3d_fs_key *key = &p_stage->key.fs; |
| |
| pipeline->fs = p_stage; |
| |
| pipeline_populate_v3d_fs_key(key, pCreateInfo, p_stage, |
| get_ucp_enable_mask(stages)); |
| |
| lower_fs_io(p_stage->nir); |
| |
| VkResult vk_result; |
| p_stage->current_variant = |
| v3dv_get_shader_variant(p_stage, &key->base, sizeof(*key), |
| pAllocator, &vk_result); |
| if (vk_result != VK_SUCCESS) |
| return vk_result; |
| |
| break; |
| } |
| default: |
| unreachable("not supported shader stage"); |
| } |
| } |
| |
/* FIXME: the values below are the defaults when no GS is present. We
* would need to provide real values if GS support is added.
*/
| pipeline->vpm_cfg_bin.As = 1; |
| pipeline->vpm_cfg_bin.Ve = 0; |
| pipeline->vpm_cfg_bin.Vc = |
| pipeline->vs_bin->current_variant->prog_data.vs->vcm_cache_size; |
| |
| pipeline->vpm_cfg.As = 1; |
| pipeline->vpm_cfg.Ve = 0; |
| pipeline->vpm_cfg.Vc = |
| pipeline->vs->current_variant->prog_data.vs->vcm_cache_size; |
| |
| return VK_SUCCESS; |
| } |
| |
| static unsigned |
| v3dv_dynamic_state_mask(VkDynamicState state) |
| { |
switch (state) {
| case VK_DYNAMIC_STATE_VIEWPORT: |
| return V3DV_DYNAMIC_VIEWPORT; |
| case VK_DYNAMIC_STATE_SCISSOR: |
| return V3DV_DYNAMIC_SCISSOR; |
| case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK: |
| return V3DV_DYNAMIC_STENCIL_COMPARE_MASK; |
| case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK: |
| return V3DV_DYNAMIC_STENCIL_WRITE_MASK; |
| case VK_DYNAMIC_STATE_STENCIL_REFERENCE: |
| return V3DV_DYNAMIC_STENCIL_REFERENCE; |
| case VK_DYNAMIC_STATE_BLEND_CONSTANTS: |
| return V3DV_DYNAMIC_BLEND_CONSTANTS; |
| case VK_DYNAMIC_STATE_DEPTH_BIAS: |
| return V3DV_DYNAMIC_DEPTH_BIAS; |
| case VK_DYNAMIC_STATE_LINE_WIDTH: |
| return V3DV_DYNAMIC_LINE_WIDTH; |
| |
/* Depth bounds testing is not available in V3D 4.2 so here we are just
* ignoring this dynamic state. We are already asserting at pipeline creation
* time that depth bounds testing is not enabled.
*/
| case VK_DYNAMIC_STATE_DEPTH_BOUNDS: |
| return 0; |
| |
| default: |
| unreachable("Unhandled dynamic state"); |
| } |
| } |
| |
| static void |
| pipeline_init_dynamic_state( |
| struct v3dv_pipeline *pipeline, |
| const VkPipelineDynamicStateCreateInfo *pDynamicState, |
| const VkPipelineViewportStateCreateInfo *pViewportState, |
| const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState, |
| const VkPipelineColorBlendStateCreateInfo *pColorBlendState, |
| const VkPipelineRasterizationStateCreateInfo *pRasterizationState) |
| { |
| pipeline->dynamic_state = default_dynamic_state; |
| struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; |
| |
| /* Create a mask of enabled dynamic states */ |
| uint32_t dynamic_states = 0; |
| if (pDynamicState) { |
| uint32_t count = pDynamicState->dynamicStateCount; |
| for (uint32_t s = 0; s < count; s++) { |
| dynamic_states |= |
| v3dv_dynamic_state_mask(pDynamicState->pDynamicStates[s]); |
| } |
| } |
| |
| /* For any pipeline states that are not dynamic, set the dynamic state |
| * from the static pipeline state. |
| */ |
| if (pViewportState) { |
| if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) { |
| dynamic->viewport.count = pViewportState->viewportCount; |
| typed_memcpy(dynamic->viewport.viewports, pViewportState->pViewports, |
| pViewportState->viewportCount); |
| |
| for (uint32_t i = 0; i < dynamic->viewport.count; i++) { |
| v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], |
| dynamic->viewport.scale[i], |
| dynamic->viewport.translate[i]); |
| } |
| } |
| |
| if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) { |
| dynamic->scissor.count = pViewportState->scissorCount; |
| typed_memcpy(dynamic->scissor.scissors, pViewportState->pScissors, |
| pViewportState->scissorCount); |
| } |
| } |
| |
| if (pDepthStencilState) { |
| if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) { |
| dynamic->stencil_compare_mask.front = |
| pDepthStencilState->front.compareMask; |
| dynamic->stencil_compare_mask.back = |
| pDepthStencilState->back.compareMask; |
| } |
| |
| if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) { |
| dynamic->stencil_write_mask.front = pDepthStencilState->front.writeMask; |
| dynamic->stencil_write_mask.back = pDepthStencilState->back.writeMask; |
| } |
| |
| if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) { |
| dynamic->stencil_reference.front = pDepthStencilState->front.reference; |
| dynamic->stencil_reference.back = pDepthStencilState->back.reference; |
| } |
| } |
| |
| if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { |
| memcpy(dynamic->blend_constants, pColorBlendState->blendConstants, |
| sizeof(dynamic->blend_constants)); |
| } |
| |
| if (pRasterizationState) { |
| if (pRasterizationState->depthBiasEnable && |
| !(dynamic_states & V3DV_DYNAMIC_DEPTH_BIAS)) { |
| dynamic->depth_bias.constant_factor = |
| pRasterizationState->depthBiasConstantFactor; |
| dynamic->depth_bias.slope_factor = |
| pRasterizationState->depthBiasSlopeFactor; |
| } |
| if (!(dynamic_states & V3DV_DYNAMIC_LINE_WIDTH)) |
| dynamic->line_width = pRasterizationState->lineWidth; |
| } |
| |
| pipeline->dynamic_state.mask = dynamic_states; |
| } |
| |
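/* Translates a VkBlendFactor to the hardware's blend factor enum. When the
* destination format has no alpha channel, reads of destination alpha
* return 1.0, so with dst_alpha_one set the DST_ALPHA factors can be folded
* into ONE/ZERO.
*/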
| static uint8_t |
| blend_factor(VkBlendFactor factor, bool dst_alpha_one, bool *needs_constants) |
| { |
| switch (factor) { |
| case VK_BLEND_FACTOR_ZERO: |
| case VK_BLEND_FACTOR_ONE: |
| case VK_BLEND_FACTOR_SRC_COLOR: |
| case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: |
| case VK_BLEND_FACTOR_DST_COLOR: |
| case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: |
| case VK_BLEND_FACTOR_SRC_ALPHA: |
| case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: |
| case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: |
| return factor; |
| case VK_BLEND_FACTOR_CONSTANT_COLOR: |
| case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: |
| case VK_BLEND_FACTOR_CONSTANT_ALPHA: |
| case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: |
| *needs_constants = true; |
| return factor; |
| case VK_BLEND_FACTOR_DST_ALPHA: |
| return dst_alpha_one ? V3D_BLEND_FACTOR_ONE : |
| V3D_BLEND_FACTOR_DST_ALPHA; |
| case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: |
| return dst_alpha_one ? V3D_BLEND_FACTOR_ZERO : |
| V3D_BLEND_FACTOR_INV_DST_ALPHA; |
| case VK_BLEND_FACTOR_SRC1_COLOR: |
| case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: |
| case VK_BLEND_FACTOR_SRC1_ALPHA: |
| case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: |
| assert(!"Invalid blend factor: dual source blending not supported."); |
| default: |
| assert(!"Unknown blend factor."); |
| } |
| |
/* Should be handled by the switch above; this is here to avoid an
* "end of non-void function" warning in builds where the asserts
* compile out.
*/
unreachable("Unknown blend factor.");
| } |
| |
| static void |
| pack_blend(struct v3dv_pipeline *pipeline, |
| const VkPipelineColorBlendStateCreateInfo *cb_info) |
| { |
| /* By default, we are not enabling blending and all color channel writes are |
| * enabled. Color write enables are independent of whether blending is |
| * enabled or not. |
| * |
| * Vulkan specifies color write masks so that bits set correspond to |
| * enabled channels. Our hardware does it the other way around. |
| */ |
| pipeline->blend.enables = 0; |
| pipeline->blend.color_write_masks = 0; /* All channels enabled */ |
| |
| if (!cb_info) |
| return; |
| |
| assert(pipeline->subpass); |
| if (pipeline->subpass->color_count == 0) |
| return; |
| |
| assert(pipeline->subpass->color_count == cb_info->attachmentCount); |
| |
| pipeline->blend.needs_color_constants = false; |
| uint32_t color_write_masks = 0; |
| for (uint32_t i = 0; i < pipeline->subpass->color_count; i++) { |
| const VkPipelineColorBlendAttachmentState *b_state = |
| &cb_info->pAttachments[i]; |
| |
| uint32_t attachment_idx = |
| pipeline->subpass->color_attachments[i].attachment; |
| if (attachment_idx == VK_ATTACHMENT_UNUSED) |
| continue; |
| |
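/* For example, a colorWriteMask with only RGB enabled (0x7) becomes a
* per-RT hardware disable mask of 0x8.
*/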
| color_write_masks |= (~b_state->colorWriteMask & 0xf) << (4 * i); |
| |
| if (!b_state->blendEnable) |
| continue; |
| |
| VkAttachmentDescription *desc = |
| &pipeline->pass->attachments[attachment_idx].desc; |
| const struct v3dv_format *format = v3dv_get_format(desc->format); |
| bool dst_alpha_one = (format->swizzle[3] == PIPE_SWIZZLE_1); |
| |
| uint8_t rt_mask = 1 << i; |
| pipeline->blend.enables |= rt_mask; |
| |
| v3dv_pack(pipeline->blend.cfg[i], BLEND_CFG, config) { |
| config.render_target_mask = rt_mask; |
| |
| config.color_blend_mode = b_state->colorBlendOp; |
| config.color_blend_dst_factor = |
| blend_factor(b_state->dstColorBlendFactor, dst_alpha_one, |
| &pipeline->blend.needs_color_constants); |
| config.color_blend_src_factor = |
| blend_factor(b_state->srcColorBlendFactor, dst_alpha_one, |
| &pipeline->blend.needs_color_constants); |
| |
| config.alpha_blend_mode = b_state->alphaBlendOp; |
| config.alpha_blend_dst_factor = |
| blend_factor(b_state->dstAlphaBlendFactor, dst_alpha_one, |
| &pipeline->blend.needs_color_constants); |
| config.alpha_blend_src_factor = |
| blend_factor(b_state->srcAlphaBlendFactor, dst_alpha_one, |
| &pipeline->blend.needs_color_constants); |
| } |
| } |
| |
| pipeline->blend.color_write_masks = color_write_masks; |
| } |
| |
/* This requires that pack_blend() has been called before, so we can set
* the overall blend enable bit in the CFG_BITS packet.
*/
| static void |
| pack_cfg_bits(struct v3dv_pipeline *pipeline, |
| const VkPipelineDepthStencilStateCreateInfo *ds_info, |
| const VkPipelineRasterizationStateCreateInfo *rs_info) |
| { |
| assert(sizeof(pipeline->cfg_bits) == cl_packet_length(CFG_BITS)); |
| |
| v3dv_pack(pipeline->cfg_bits, CFG_BITS, config) { |
| config.enable_forward_facing_primitive = |
| rs_info ? !(rs_info->cullMode & VK_CULL_MODE_FRONT_BIT) : false; |
| |
| config.enable_reverse_facing_primitive = |
| rs_info ? !(rs_info->cullMode & VK_CULL_MODE_BACK_BIT) : false; |
| |
| /* Seems like the hardware is backwards regarding this setting... */ |
| config.clockwise_primitives = |
| rs_info ? rs_info->frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE : false; |
| |
config.enable_depth_offset = rs_info ? rs_info->depthBiasEnable : false;
| |
/* This is required to pass line rasterization tests in CTS while
* exposing, at least, a minimum of 4 bits of subpixel precision (the
* minimum requirement).
*/
| config.line_rasterization = 1; /* perp end caps */ |
| |
| if (rs_info && rs_info->polygonMode != VK_POLYGON_MODE_FILL) { |
| config.direct3d_wireframe_triangles_mode = true; |
| config.direct3d_point_fill_mode = |
| rs_info->polygonMode == VK_POLYGON_MODE_POINT; |
| } |
| |
/* FIXME: oversample_mode postponed until MSAA gets supported */
| config.rasterizer_oversample_mode = false; |
| |
| /* From the Vulkan spec: |
| * |
| * "Provoking Vertex: |
| * |
| * The vertex in a primitive from which flat shaded attribute |
| * values are taken. This is generally the “first” vertex in the |
| * primitive, and depends on the primitive topology." |
| * |
| * First vertex is the Direct3D style for provoking vertex. OpenGL uses |
| * the last vertex by default. |
| */ |
| config.direct3d_provoking_vertex = true; |
| |
| config.blend_enable = pipeline->blend.enables != 0; |
| |
| /* Disable depth/stencil if we don't have a D/S attachment */ |
| bool has_ds_attachment = |
| pipeline->subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED; |
| |
| if (ds_info && ds_info->depthTestEnable && has_ds_attachment) { |
| config.z_updates_enable = true; |
| config.depth_test_function = ds_info->depthCompareOp; |
| } else { |
| config.depth_test_function = VK_COMPARE_OP_ALWAYS; |
| } |
| |
| /* EZ state will be updated at draw time based on bound pipeline state */ |
| config.early_z_updates_enable = false; |
| config.early_z_enable = false; |
| |
config.stencil_enable =
ds_info ? ds_info->stencilTestEnable && has_ds_attachment : false;
}
| } |
| |
| static uint32_t |
translate_stencil_op(VkStencilOp op)
| { |
| switch (op) { |
| case VK_STENCIL_OP_KEEP: |
| return V3D_STENCIL_OP_KEEP; |
| case VK_STENCIL_OP_ZERO: |
| return V3D_STENCIL_OP_ZERO; |
| case VK_STENCIL_OP_REPLACE: |
| return V3D_STENCIL_OP_REPLACE; |
| case VK_STENCIL_OP_INCREMENT_AND_CLAMP: |
| return V3D_STENCIL_OP_INCR; |
| case VK_STENCIL_OP_DECREMENT_AND_CLAMP: |
| return V3D_STENCIL_OP_DECR; |
| case VK_STENCIL_OP_INVERT: |
| return V3D_STENCIL_OP_INVERT; |
| case VK_STENCIL_OP_INCREMENT_AND_WRAP: |
| return V3D_STENCIL_OP_INCWRAP; |
| case VK_STENCIL_OP_DECREMENT_AND_WRAP: |
| return V3D_STENCIL_OP_DECWRAP; |
| default: |
| unreachable("bad stencil op"); |
| } |
| } |
| |
| static void |
| pack_single_stencil_cfg(struct v3dv_pipeline *pipeline, |
| uint8_t *stencil_cfg, |
| bool is_front, |
| bool is_back, |
| const VkStencilOpState *stencil_state) |
| { |
| /* From the Vulkan spec: |
| * |
| * "Reference is an integer reference value that is used in the unsigned |
| * stencil comparison. The reference value used by stencil comparison |
| * must be within the range [0,2^s-1] , where s is the number of bits in |
| * the stencil framebuffer attachment, otherwise the reference value is |
| * considered undefined." |
| * |
* In our case, 's' is always 8, so we clamp to that range to prevent our
* packing functions from asserting in debug mode if they see larger values.
| * |
| * If we have dynamic state we need to make sure we set the corresponding |
| * state bits to 0, since cl_emit_with_prepacked ORs the new value with |
| * the old. |
| */ |
| const uint8_t write_mask = |
| pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK ? |
| 0 : stencil_state->writeMask & 0xff; |
| |
| const uint8_t compare_mask = |
| pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK ? |
| 0 : stencil_state->compareMask & 0xff; |
| |
const uint8_t reference =
pipeline->dynamic_state.mask & V3DV_DYNAMIC_STENCIL_REFERENCE ?
0 : stencil_state->reference & 0xff;
| |
| v3dv_pack(stencil_cfg, STENCIL_CFG, config) { |
| config.front_config = is_front; |
| config.back_config = is_back; |
| config.stencil_write_mask = write_mask; |
| config.stencil_test_mask = compare_mask; |
| config.stencil_test_function = stencil_state->compareOp; |
| config.stencil_pass_op = translate_stencil_op(stencil_state->passOp); |
| config.depth_test_fail_op = translate_stencil_op(stencil_state->depthFailOp); |
| config.stencil_test_fail_op = translate_stencil_op(stencil_state->failOp); |
| config.stencil_ref_value = reference; |
| } |
| } |
| |
| static void |
| pack_stencil_cfg(struct v3dv_pipeline *pipeline, |
| const VkPipelineDepthStencilStateCreateInfo *ds_info) |
| { |
| assert(sizeof(pipeline->stencil_cfg) == 2 * cl_packet_length(STENCIL_CFG)); |
| |
| if (!ds_info || !ds_info->stencilTestEnable) |
| return; |
| |
| if (pipeline->subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) |
| return; |
| |
| const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK | |
| V3DV_DYNAMIC_STENCIL_WRITE_MASK | |
| V3DV_DYNAMIC_STENCIL_REFERENCE; |
| |
| /* If front != back or we have dynamic stencil state we can't emit a single |
| * packet for both faces. |
| */ |
| bool needs_front_and_back = false; |
| if ((pipeline->dynamic_state.mask & dynamic_stencil_states) || |
| memcmp(&ds_info->front, &ds_info->back, sizeof(ds_info->front))) |
| needs_front_and_back = true; |
| |
| /* If the front and back configurations are the same we can emit both with |
| * a single packet. |
| */ |
| pipeline->emit_stencil_cfg[0] = true; |
| if (!needs_front_and_back) { |
| pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], |
| true, true, &ds_info->front); |
| } else { |
| pipeline->emit_stencil_cfg[1] = true; |
| pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[0], |
| true, false, &ds_info->front); |
| pack_single_stencil_cfg(pipeline, pipeline->stencil_cfg[1], |
| false, true, &ds_info->back); |
| } |
| } |
| |
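/* A stencil configuration is a no-op for early-Z purposes if the stencil
* test always passes (so discarding a fragment early can't skip a stencil
* fail update) and a depth test failure keeps the stencil contents
* unchanged.
*/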
| static bool |
| stencil_op_is_no_op(const VkStencilOpState *stencil) |
| { |
| return stencil->depthFailOp == VK_STENCIL_OP_KEEP && |
| stencil->compareOp == VK_COMPARE_OP_ALWAYS; |
| } |
| |
| static void |
| enable_depth_bias(struct v3dv_pipeline *pipeline, |
| const VkPipelineRasterizationStateCreateInfo *rs_info) |
| { |
| pipeline->depth_bias.enabled = false; |
| pipeline->depth_bias.is_z16 = false; |
| |
| if (!rs_info || !rs_info->depthBiasEnable) |
| return; |
| |
| /* Check the depth/stencil attachment description for the subpass used with |
| * this pipeline. |
| */ |
| assert(pipeline->pass && pipeline->subpass); |
| struct v3dv_render_pass *pass = pipeline->pass; |
| struct v3dv_subpass *subpass = pipeline->subpass; |
| |
| if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) |
| return; |
| |
| assert(subpass->ds_attachment.attachment < pass->attachment_count); |
| struct v3dv_render_pass_attachment *att = |
| &pass->attachments[subpass->ds_attachment.attachment]; |
| |
| if (att->desc.format == VK_FORMAT_D16_UNORM) |
| pipeline->depth_bias.is_z16 = true; |
| |
| pipeline->depth_bias.enabled = true; |
| } |
| |
| static void |
| pipeline_set_ez_state(struct v3dv_pipeline *pipeline, |
| const VkPipelineDepthStencilStateCreateInfo *ds_info) |
| { |
| if (!ds_info || !ds_info->depthTestEnable) { |
| pipeline->ez_state = VC5_EZ_DISABLED; |
| return; |
| } |
| |
| switch (ds_info->depthCompareOp) { |
| case VK_COMPARE_OP_LESS: |
| case VK_COMPARE_OP_LESS_OR_EQUAL: |
| pipeline->ez_state = VC5_EZ_LT_LE; |
| break; |
| case VK_COMPARE_OP_GREATER: |
| case VK_COMPARE_OP_GREATER_OR_EQUAL: |
| pipeline->ez_state = VC5_EZ_GT_GE; |
| break; |
| case VK_COMPARE_OP_NEVER: |
| case VK_COMPARE_OP_EQUAL: |
| pipeline->ez_state = VC5_EZ_UNDECIDED; |
| break; |
| default: |
| pipeline->ez_state = VC5_EZ_DISABLED; |
| break; |
| } |
| |
| /* If stencil is enabled and is not a no-op, we need to disable EZ */ |
| if (ds_info->stencilTestEnable && |
| (!stencil_op_is_no_op(&ds_info->front) || |
| !stencil_op_is_no_op(&ds_info->back))) { |
| pipeline->ez_state = VC5_EZ_DISABLED; |
| } |
| } |
| |
| static void |
| pack_shader_state_record(struct v3dv_pipeline *pipeline) |
| { |
| assert(sizeof(pipeline->shader_state_record) == |
| cl_packet_length(GL_SHADER_STATE_RECORD)); |
| |
| struct v3d_fs_prog_data *prog_data_fs = |
| pipeline->fs->current_variant->prog_data.fs; |
| |
| struct v3d_vs_prog_data *prog_data_vs = |
| pipeline->vs->current_variant->prog_data.vs; |
| |
| struct v3d_vs_prog_data *prog_data_vs_bin = |
| pipeline->vs_bin->current_variant->prog_data.vs; |
| |
/* Note: we are not packing addresses, as we need the job (see
* cl_pack_emit_reloc). Additionally, uniforms can't be filled in at this
* point, as they depend on dynamic info that can be set after the
* pipeline is created (like the viewport). They will be filled in
* later, so we are doing a partial prepacking here.
*/
| v3dv_pack(pipeline->shader_state_record, GL_SHADER_STATE_RECORD, shader) { |
| shader.enable_clipping = true; |
| |
| shader.point_size_in_shaded_vertex_data = |
| pipeline->vs->topology == PIPE_PRIM_POINTS; |
| |
/* Must be set if the shader modifies Z, discards, or modifies
* the sample mask. For any of these cases, the fragment
* shader needs to write the Z value (even if it only discards).
*/
| shader.fragment_shader_does_z_writes = prog_data_fs->writes_z; |
| /* Set if the EZ test must be disabled (due to shader side |
| * effects and the early_z flag not being present in the |
| * shader). |
| */ |
| shader.turn_off_early_z_test = prog_data_fs->disable_ez; |
| |
| shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = |
| prog_data_fs->uses_center_w; |
| |
| shader.any_shader_reads_hardware_written_primitive_id = false; |
| |
| shader.do_scoreboard_wait_on_first_thread_switch = |
| prog_data_fs->lock_scoreboard_on_first_thrsw; |
| shader.disable_implicit_point_line_varyings = |
| !prog_data_fs->uses_implicit_point_line_varyings; |
| |
| shader.number_of_varyings_in_fragment_shader = |
| prog_data_fs->num_inputs; |
| |
| shader.coordinate_shader_propagate_nans = true; |
| shader.vertex_shader_propagate_nans = true; |
| shader.fragment_shader_propagate_nans = true; |
| |
/* Note: see previous note about addresses */
| /* shader.coordinate_shader_code_address */ |
| /* shader.vertex_shader_code_address */ |
| /* shader.fragment_shader_code_address */ |
| |
| /* FIXME: Use combined input/output size flag in the common case (also |
| * on v3d, see v3dx_draw). |
| */ |
| shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = |
| prog_data_vs_bin->separate_segments; |
| shader.vertex_shader_has_separate_input_and_output_vpm_blocks = |
| prog_data_vs->separate_segments; |
| |
| shader.coordinate_shader_input_vpm_segment_size = |
| prog_data_vs_bin->separate_segments ? |
| prog_data_vs_bin->vpm_input_size : 1; |
| shader.vertex_shader_input_vpm_segment_size = |
| prog_data_vs->separate_segments ? |
| prog_data_vs->vpm_input_size : 1; |
| |
| shader.coordinate_shader_output_vpm_segment_size = |
| prog_data_vs_bin->vpm_output_size; |
| shader.vertex_shader_output_vpm_segment_size = |
| prog_data_vs->vpm_output_size; |
| |
/* Note: see previous note about addresses */
| /* shader.coordinate_shader_uniforms_address */ |
| /* shader.vertex_shader_uniforms_address */ |
| /* shader.fragment_shader_uniforms_address */ |
| |
| shader.min_coord_shader_input_segments_required_in_play = |
| pipeline->vpm_cfg_bin.As; |
| shader.min_vertex_shader_input_segments_required_in_play = |
| pipeline->vpm_cfg.As; |
| |
| shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = |
| pipeline->vpm_cfg_bin.Ve; |
| shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = |
| pipeline->vpm_cfg.Ve; |
| |
| shader.coordinate_shader_4_way_threadable = |
| prog_data_vs_bin->base.threads == 4; |
| shader.vertex_shader_4_way_threadable = |
| prog_data_vs->base.threads == 4; |
| shader.fragment_shader_4_way_threadable = |
| prog_data_fs->base.threads == 4; |
| |
| shader.coordinate_shader_start_in_final_thread_section = |
| prog_data_vs_bin->base.single_seg; |
| shader.vertex_shader_start_in_final_thread_section = |
| prog_data_vs->base.single_seg; |
| shader.fragment_shader_start_in_final_thread_section = |
| prog_data_fs->base.single_seg; |
| |
| shader.vertex_id_read_by_coordinate_shader = |
| prog_data_vs_bin->uses_vid; |
| shader.base_instance_id_read_by_coordinate_shader = |
| prog_data_vs_bin->uses_biid; |
| shader.instance_id_read_by_coordinate_shader = |
| prog_data_vs_bin->uses_iid; |
| shader.vertex_id_read_by_vertex_shader = |
| prog_data_vs->uses_vid; |
| shader.base_instance_id_read_by_vertex_shader = |
| prog_data_vs->uses_biid; |
| shader.instance_id_read_by_vertex_shader = |
| prog_data_vs->uses_iid; |
| |
/* Note: see previous note about addresses */
| /* shader.address_of_default_attribute_values */ |
| } |
| } |
| |
| static void |
| pack_vcm_cache_size(struct v3dv_pipeline *pipeline) |
| { |
| assert(sizeof(pipeline->vcm_cache_size) == |
| cl_packet_length(VCM_CACHE_SIZE)); |
| |
| v3dv_pack(pipeline->vcm_cache_size, VCM_CACHE_SIZE, vcm) { |
| vcm.number_of_16_vertex_batches_for_binning = pipeline->vpm_cfg_bin.Vc; |
| vcm.number_of_16_vertex_batches_for_rendering = pipeline->vpm_cfg.Vc; |
| } |
| } |
| |
/* As defined in the GL_SHADER_STATE_ATTRIBUTE_RECORD packet */
| static uint8_t |
| get_attr_type(const struct util_format_description *desc) |
| { |
| uint32_t r_size = desc->channel[0].size; |
| uint8_t attr_type = ATTRIBUTE_FLOAT; |
| |
| switch (desc->channel[0].type) { |
| case UTIL_FORMAT_TYPE_FLOAT: |
| if (r_size == 32) { |
| attr_type = ATTRIBUTE_FLOAT; |
| } else { |
| assert(r_size == 16); |
| attr_type = ATTRIBUTE_HALF_FLOAT; |
| } |
| break; |
| |
| case UTIL_FORMAT_TYPE_SIGNED: |
| case UTIL_FORMAT_TYPE_UNSIGNED: |
| switch (r_size) { |
| case 32: |
| attr_type = ATTRIBUTE_INT; |
| break; |
| case 16: |
| attr_type = ATTRIBUTE_SHORT; |
| break; |
| case 10: |
| attr_type = ATTRIBUTE_INT2_10_10_10; |
| break; |
| case 8: |
| attr_type = ATTRIBUTE_BYTE; |
| break; |
| default: |
| fprintf(stderr, |
| "format %s unsupported\n", |
| desc->name); |
| attr_type = ATTRIBUTE_BYTE; |
| abort(); |
| } |
| break; |
| |
| default: |
| fprintf(stderr, |
| "format %s unsupported\n", |
| desc->name); |
| abort(); |
| } |
| |
| return attr_type; |
| } |
| |
| static bool |
| create_default_attribute_values(struct v3dv_pipeline *pipeline, |
| const VkPipelineVertexInputStateCreateInfo *vi_info) |
| { |
| uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; |
| |
| if (pipeline->default_attribute_values == NULL) { |
| pipeline->default_attribute_values = v3dv_bo_alloc(pipeline->device, size, |
| "default_vi_attributes", |
| true); |
| |
| if (!pipeline->default_attribute_values) { |
| fprintf(stderr, "failed to allocate memory for the default " |
| "attribute values\n"); |
| return false; |
| } |
| } |
| |
| bool ok = v3dv_bo_map(pipeline->device, |
| pipeline->default_attribute_values, size); |
| if (!ok) { |
| fprintf(stderr, "failed to map default attribute values buffer\n"); |
| return false; |
| } |
| |
| uint32_t *attrs = pipeline->default_attribute_values->map; |
| |
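/* Write a (0, 0, 0, 1) default value for each attribute: for integer
* formats the w component is the integer 1, otherwise it is the bit
* pattern of 1.0f.
*/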
| for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { |
| attrs[i * 4 + 0] = 0; |
| attrs[i * 4 + 1] = 0; |
| attrs[i * 4 + 2] = 0; |
| if (i < pipeline->va_count && vk_format_is_int(pipeline->va[i].vk_format)) { |
| attrs[i * 4 + 3] = 1; |
| } else { |
| attrs[i * 4 + 3] = fui(1.0); |
| } |
| } |
| |
| v3dv_bo_unmap(pipeline->device, pipeline->default_attribute_values); |
| |
| return true; |
| } |
| |
| static void |
| pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline, |
| uint32_t index, |
| const VkVertexInputAttributeDescription *vi_desc) |
| { |
| const uint32_t packet_length = |
| cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD); |
| |
| const struct util_format_description *desc = |
| vk_format_description(vi_desc->format); |
| |
| uint32_t binding = vi_desc->binding; |
| |
| v3dv_pack(&pipeline->vertex_attrs[index * packet_length], |
| GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) { |
| |
| /* vec_size == 0 means 4 */ |
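/* For example, a four-channel format has nr_channels == 4, and
* 4 & 3 == 0.
*/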
| attr.vec_size = desc->nr_channels & 3; |
| attr.signed_int_type = (desc->channel[0].type == |
| UTIL_FORMAT_TYPE_SIGNED); |
| attr.normalized_int_type = desc->channel[0].normalized; |
| attr.read_as_int_uint = desc->channel[0].pure_integer; |
| |
| attr.instance_divisor = MIN2(pipeline->vb[binding].instance_divisor, |
| 0xffff); |
| attr.stride = pipeline->vb[binding].stride; |
| attr.type = get_attr_type(desc); |
| } |
| } |
| |
| static VkResult |
| pipeline_init(struct v3dv_pipeline *pipeline, |
| struct v3dv_device *device, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator) |
| { |
| VkResult result = VK_SUCCESS; |
| |
| pipeline->device = device; |
| |
| V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout); |
| pipeline->layout = layout; |
| |
| V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass); |
| assert(pCreateInfo->subpass < render_pass->subpass_count); |
| pipeline->pass = render_pass; |
| pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass]; |
| |
| /* If rasterization is not enabled, various CreateInfo structs must be |
| * ignored. |
| */ |
| const bool raster_enabled = |
| !pCreateInfo->pRasterizationState->rasterizerDiscardEnable; |
| |
| const VkPipelineViewportStateCreateInfo *vp_info = |
| raster_enabled ? pCreateInfo->pViewportState : NULL; |
| |
| const VkPipelineDepthStencilStateCreateInfo *ds_info = |
| raster_enabled ? pCreateInfo->pDepthStencilState : NULL; |
| |
| const VkPipelineRasterizationStateCreateInfo *rs_info = |
| raster_enabled ? pCreateInfo->pRasterizationState : NULL; |
| |
| const VkPipelineColorBlendStateCreateInfo *cb_info = |
| raster_enabled ? pCreateInfo->pColorBlendState : NULL; |
| |
| pipeline_init_dynamic_state(pipeline, |
| pCreateInfo->pDynamicState, |
| vp_info, ds_info, cb_info, rs_info); |
| |
| /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that |
| * feature and it shouldn't be used by any pipeline. |
| */ |
| assert(!ds_info || !ds_info->depthBoundsTestEnable); |
| |
| pack_blend(pipeline, cb_info); |
| pack_cfg_bits(pipeline, ds_info, rs_info); |
| pack_stencil_cfg(pipeline, ds_info); |
| pipeline_set_ez_state(pipeline, ds_info); |
| enable_depth_bias(pipeline, rs_info); |
| |
| pipeline->primitive_restart = |
| pCreateInfo->pInputAssemblyState->primitiveRestartEnable; |
| |
| result = pipeline_compile_graphics(pipeline, pCreateInfo, pAllocator); |
| |
| if (result != VK_SUCCESS) { |
/* The caller will destroy the pipeline and we didn't allocate any
* extra info here, so there is nothing else to clean up.
*/
| return result; |
| } |
| |
| pack_shader_state_record(pipeline); |
| pack_vcm_cache_size(pipeline); |
| |
| const VkPipelineVertexInputStateCreateInfo *vi_info = |
| pCreateInfo->pVertexInputState; |
| |
| pipeline->vb_count = vi_info->vertexBindingDescriptionCount; |
| for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { |
| const VkVertexInputBindingDescription *desc = |
| &vi_info->pVertexBindingDescriptions[i]; |
| |
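/* VK_VERTEX_INPUT_RATE_VERTEX is 0 and VK_VERTEX_INPUT_RATE_INSTANCE is
* 1, which conveniently match the 0 (per-vertex) and 1 (per-instance)
* instance divisor values the hardware expects.
*/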
| pipeline->vb[desc->binding].stride = desc->stride; |
| pipeline->vb[desc->binding].instance_divisor = desc->inputRate; |
| } |
| |
| pipeline->va_count = 0; |
| nir_shader *shader = pipeline->vs->nir; |
| |
| for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { |
| const VkVertexInputAttributeDescription *desc = |
| &vi_info->pVertexAttributeDescriptions[i]; |
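/* NIR vertex shader inputs are numbered from VERT_ATTRIB_GENERIC0, so
* the Vulkan attribute location is offset by that base.
*/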
| uint32_t location = desc->location + VERT_ATTRIB_GENERIC0; |
| |
nir_variable *var =
nir_find_variable_with_location(shader, nir_var_shader_in, location);
| |
| if (var != NULL) { |
| unsigned driver_location = var->data.driver_location; |
| |
| assert(driver_location < MAX_VERTEX_ATTRIBS); |
| pipeline->va[driver_location].offset = desc->offset; |
| pipeline->va[driver_location].binding = desc->binding; |
| pipeline->va[driver_location].vk_format = desc->format; |
| |
| pack_shader_state_attribute_record(pipeline, driver_location, desc); |
| |
| pipeline->va_count++; |
| } |
| } |
| |
| if (!create_default_attribute_values(pipeline, vi_info)) |
| return VK_ERROR_OUT_OF_DEVICE_MEMORY; |
| |
| return result; |
| } |
| |
| static VkResult |
| graphics_pipeline_create(VkDevice _device, |
| VkPipelineCache _cache, |
| const VkGraphicsPipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipeline) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| |
| struct v3dv_pipeline *pipeline; |
| VkResult result; |
| |
| pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (pipeline == NULL) |
| return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| result = pipeline_init(pipeline, device, |
| pCreateInfo, |
| pAllocator); |
| |
| if (result != VK_SUCCESS) { |
| v3dv_destroy_pipeline(pipeline, device, pAllocator); |
| return result; |
| } |
| |
| *pPipeline = v3dv_pipeline_to_handle(pipeline); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| v3dv_CreateGraphicsPipelines(VkDevice _device, |
| VkPipelineCache pipelineCache, |
| uint32_t count, |
| const VkGraphicsPipelineCreateInfo *pCreateInfos, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipelines) |
| { |
| VkResult result = VK_SUCCESS; |
| |
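/* If creating a pipeline fails we still try to create the remaining
* ones, setting the failed entries to VK_NULL_HANDLE and returning an
* error for the batch, as the spec expects.
*/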
| for (uint32_t i = 0; i < count; i++) { |
| VkResult local_result; |
| |
| local_result = graphics_pipeline_create(_device, |
| pipelineCache, |
| &pCreateInfos[i], |
| pAllocator, |
| &pPipelines[i]); |
| |
| if (local_result != VK_SUCCESS) { |
| result = local_result; |
| pPipelines[i] = VK_NULL_HANDLE; |
| } |
| } |
| |
| return result; |
| } |
| |
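/* Returns the size and base alignment of a shared variable type, with
* booleans stored as 32-bit values and vec3 aligned like vec4.
*/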
| static void |
| shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) |
| { |
| assert(glsl_type_is_vector_or_scalar(type)); |
| |
| uint32_t comp_size = glsl_type_is_boolean(type) |
| ? 4 : glsl_get_bit_size(type) / 8; |
| unsigned length = glsl_get_vector_elements(type); |
*size = comp_size * length;
*align = comp_size * (length == 3 ? 4 : length);
| } |
| |
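/* Lower shared memory accesses to explicit 32-bit offsets from the start
* of the workgroup's shared memory area.
*/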
| static void |
| lower_cs_shared(struct nir_shader *nir) |
| { |
| NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, |
| nir_var_mem_shared, shared_type_info); |
| NIR_PASS_V(nir, nir_lower_explicit_io, |
| nir_var_mem_shared, nir_address_format_32bit_offset); |
| } |
| |
| static VkResult |
| pipeline_compile_compute(struct v3dv_pipeline *pipeline, |
| const VkComputePipelineCreateInfo *info, |
| const VkAllocationCallbacks *alloc) |
| { |
| struct v3dv_device *device = pipeline->device; |
| struct v3dv_physical_device *physical_device = |
| &device->instance->physicalDevice; |
| |
| const VkPipelineShaderStageCreateInfo *sinfo = &info->stage; |
| gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage); |
| |
| struct v3dv_pipeline_stage *p_stage = |
| vk_zalloc2(&device->alloc, alloc, sizeof(*p_stage), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (!p_stage) |
| return VK_ERROR_OUT_OF_HOST_MEMORY; |
| |
| p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id); |
| p_stage->compiled_variant_count = 0; |
| p_stage->cache = create_variant_cache(MESA_SHADER_COMPUTE); |
| p_stage->pipeline = pipeline; |
| p_stage->stage = stage; |
| p_stage->entrypoint = sinfo->pName; |
| p_stage->module = v3dv_shader_module_from_handle(sinfo->module); |
| p_stage->spec_info = sinfo->pSpecializationInfo; |
| p_stage->nir = shader_module_compile_to_nir(pipeline->device, p_stage); |
| |
| pipeline->active_stages |= sinfo->stage; |
| st_nir_opts(p_stage->nir); |
| pipeline_lower_nir(pipeline, p_stage, pipeline->layout); |
| lower_cs_shared(p_stage->nir); |
| |
| pipeline->cs = p_stage; |
| |
| struct v3d_key *key = &p_stage->key.base; |
| memset(key, 0, sizeof(*key)); |
| pipeline_populate_v3d_key(key, p_stage, 0); |
| |
| VkResult result; |
| p_stage->current_variant = |
| v3dv_get_shader_variant(p_stage, key, sizeof(*key), alloc, &result); |
| return result; |
| } |
| |
| static VkResult |
| compute_pipeline_init(struct v3dv_pipeline *pipeline, |
| struct v3dv_device *device, |
| const VkComputePipelineCreateInfo *info, |
| const VkAllocationCallbacks *alloc) |
| { |
| V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, info->layout); |
| |
| pipeline->device = device; |
| pipeline->layout = layout; |
| |
| VkResult result = pipeline_compile_compute(pipeline, info, alloc); |
| |
| return result; |
| } |
| |
| static VkResult |
| compute_pipeline_create(VkDevice _device, |
| VkPipelineCache _cache, |
| const VkComputePipelineCreateInfo *pCreateInfo, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipeline) |
| { |
| V3DV_FROM_HANDLE(v3dv_device, device, _device); |
| |
| struct v3dv_pipeline *pipeline; |
| VkResult result; |
| |
| pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8, |
| VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); |
| if (pipeline == NULL) |
| return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); |
| |
| result = compute_pipeline_init(pipeline, device, pCreateInfo, pAllocator); |
| if (result != VK_SUCCESS) { |
| v3dv_destroy_pipeline(pipeline, device, pAllocator); |
| return result; |
| } |
| |
| *pPipeline = v3dv_pipeline_to_handle(pipeline); |
| |
| return VK_SUCCESS; |
| } |
| |
| VkResult |
| v3dv_CreateComputePipelines(VkDevice _device, |
| VkPipelineCache pipelineCache, |
| uint32_t createInfoCount, |
| const VkComputePipelineCreateInfo *pCreateInfos, |
| const VkAllocationCallbacks *pAllocator, |
| VkPipeline *pPipelines) |
| { |
| VkResult result = VK_SUCCESS; |
| |
| for (uint32_t i = 0; i < createInfoCount; i++) { |
| VkResult local_result; |
| local_result = compute_pipeline_create(_device, |
| pipelineCache, |
| &pCreateInfos[i], |
| pAllocator, |
| &pPipelines[i]); |
| |
| if (local_result != VK_SUCCESS) { |
| result = local_result; |
| pPipelines[i] = VK_NULL_HANDLE; |
| } |
| } |
| |
| return result; |
| } |