| /* |
| * Copyright © 2020 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir.h" |
| #include "nir_builder.h" |
| #include "nir_phi_builder.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_math.h" |
| |
| static bool |
| move_system_values_to_top(nir_shader *shader) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| bool progress = false; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| /* These intrinsics not only can't be re-materialized but aren't |
| * preserved when moving to the continuation shader. We have to move |
| * them to the top to ensure they get spilled as needed. |
| */ |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_shader_record_ptr: |
| case nir_intrinsic_load_btd_local_arg_addr_intel: |
| nir_instr_remove(instr); |
| nir_instr_insert(nir_before_cf_list(&impl->body), instr); |
| progress = true; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| |
| if (progress) { |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } else { |
| nir_metadata_preserve(impl, nir_metadata_all); |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| instr_is_shader_call(nir_instr *instr) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| return intrin->intrinsic == nir_intrinsic_trace_ray || |
| intrin->intrinsic == nir_intrinsic_report_ray_intersection || |
| intrin->intrinsic == nir_intrinsic_execute_callable; |
| } |
| |
| /* Previously named bitset, it had to be renamed as FreeBSD defines a struct |
| * named bitset in sys/_bitset.h, which is required by pthread_np.h, included |
| * from src/util/u_thread.h, itself indirectly included by this file. |
| */ |
| struct sized_bitset { |
| BITSET_WORD *set; |
| unsigned size; |
| }; |
| |
| static struct sized_bitset |
| bitset_create(void *mem_ctx, unsigned size) |
| { |
| return (struct sized_bitset) { |
| .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)), |
| .size = size, |
| }; |
| } |
| |
| static bool |
| src_is_in_bitset(nir_src *src, void *_set) |
| { |
| struct sized_bitset *set = _set; |
| assert(src->is_ssa); |
| |
| /* Any SSA values which were added after we generated liveness information |
| * are things generated by this pass and, while most of it is arithmetic |
| * which we could re-materialize, we don't need to because it's only used |
| * for a single load/store and so shouldn't cross any shader calls. |
| */ |
| if (src->ssa->index >= set->size) |
| return false; |
| |
| return BITSET_TEST(set->set, src->ssa->index); |
| } |
| |
| static void |
| add_ssa_def_to_bitset(nir_ssa_def *def, struct sized_bitset *set) |
| { |
| if (def->index >= set->size) |
| return; |
| |
| BITSET_SET(set->set, def->index); |
| } |
| |
| static bool |
| can_remat_instr(nir_instr *instr, struct sized_bitset *remat) |
| { |
| /* Set of all values which are trivially re-materializable and which we |
| * should never spill. This includes: |
| * |
| * - Undef values |
| * - Constants |
| * - Uniforms (UBO or push constant) |
| * - ALU combinations of any of the above |
| * - Derefs which are either complete or casts of any of the above |
| * |
| * Because this pass rewrites things in-order and phis are always turned |
| * into register writes, we can use "is it SSA?" to answer the question |
| * "can my source be re-materialized?". |
| */ |
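| /* For illustration (SSA names are hypothetical), a chain like |
| * |
| *    %c = load_const 0x4 |
| *    %u = load_ubo ... |
| *    %x = iadd %u, %c |
| * |
| * never needs to be spilled: every instruction in it only depends on |
| * values which are themselves re-materializable, so the whole chain can |
| * simply be re-emitted in the resume shader. |
| */ |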
| switch (instr->type) { |
| case nir_instr_type_alu: |
| if (!nir_instr_as_alu(instr)->dest.dest.is_ssa) |
| return false; |
| |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_instr_type_deref: |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_instr_type_intrinsic: { |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_uniform: |
| case nir_intrinsic_load_ubo: |
| case nir_intrinsic_vulkan_resource_index: |
| case nir_intrinsic_vulkan_resource_reindex: |
| case nir_intrinsic_load_vulkan_descriptor: |
| case nir_intrinsic_load_push_constant: |
| case nir_intrinsic_load_global_constant: |
| case nir_intrinsic_load_global_const_block_intel: |
| case nir_intrinsic_load_desc_set_address_intel: |
| /* These intrinsics don't need to be spilled as long as they don't |
| * depend on any spilled values. |
| */ |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_intrinsic_load_scratch_base_ptr: |
| case nir_intrinsic_load_ray_launch_id: |
| case nir_intrinsic_load_topology_id_intel: |
| case nir_intrinsic_load_btd_global_arg_addr_intel: |
| case nir_intrinsic_load_btd_resume_sbt_addr_intel: |
| case nir_intrinsic_load_ray_base_mem_addr_intel: |
| case nir_intrinsic_load_ray_hw_stack_size_intel: |
| case nir_intrinsic_load_ray_sw_stack_size_intel: |
| case nir_intrinsic_load_ray_num_dss_rt_stacks_intel: |
| case nir_intrinsic_load_ray_hit_sbt_addr_intel: |
| case nir_intrinsic_load_ray_hit_sbt_stride_intel: |
| case nir_intrinsic_load_ray_miss_sbt_addr_intel: |
| case nir_intrinsic_load_ray_miss_sbt_stride_intel: |
| case nir_intrinsic_load_callable_sbt_addr_intel: |
| case nir_intrinsic_load_callable_sbt_stride_intel: |
| case nir_intrinsic_load_reloc_const_intel: |
| case nir_intrinsic_load_ray_query_global_intel: |
| case nir_intrinsic_load_ray_launch_size: |
| /* Notably missing from the above list is btd_local_arg_addr_intel. |
| * This is because the resume shader will have a different local |
| * argument pointer because it has a different BSR. Any access of |
| * the original shader's local arguments needs to be preserved so |
| * that pointer has to be saved on the stack. |
| * |
| * TODO: There may be some system values we want to avoid |
| * re-materializing as well but we have to be very careful |
| * to ensure that it's a system value which cannot change |
| * across a shader call. |
| */ |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| static bool |
| can_remat_ssa_def(nir_ssa_def *def, struct sized_bitset *remat) |
| { |
| return can_remat_instr(def->parent_instr, remat); |
| } |
| |
| struct add_instr_data { |
| struct util_dynarray *buf; |
| struct sized_bitset *remat; |
| }; |
| |
| static bool |
| add_src_instr(nir_src *src, void *state) |
| { |
| if (!src->is_ssa) |
| return false; |
| |
| struct add_instr_data *data = state; |
| if (BITSET_TEST(data->remat->set, src->ssa->index)) |
| return true; |
| |
| util_dynarray_foreach(data->buf, nir_instr *, instr_ptr) { |
| if (*instr_ptr == src->ssa->parent_instr) |
| return true; |
| } |
| |
| util_dynarray_append(data->buf, nir_instr *, src->ssa->parent_instr); |
| return true; |
| } |
| |
| static int |
| compare_instr_indexes(const void *_inst1, const void *_inst2) |
| { |
| const nir_instr * const *inst1 = _inst1; |
| const nir_instr * const *inst2 = _inst2; |
| |
| return (*inst1)->index - (*inst2)->index; |
| } |
| |
| static bool |
| can_remat_chain_ssa_def(nir_ssa_def *def, struct sized_bitset *remat, struct util_dynarray *buf) |
| { |
| assert(util_dynarray_num_elements(buf, nir_instr *) == 0); |
| |
| void *mem_ctx = ralloc_context(NULL); |
| |
| /* Add all the instructions involved in building this ssa_def */ |
| util_dynarray_append(buf, nir_instr *, def->parent_instr); |
| |
| unsigned idx = 0; |
| struct add_instr_data data = { |
| .buf = buf, |
| .remat = remat, |
| }; |
| while (idx < util_dynarray_num_elements(buf, nir_instr *)) { |
| nir_instr *instr = *util_dynarray_element(buf, nir_instr *, idx++); |
| if (!nir_foreach_src(instr, add_src_instr, &data)) |
| goto fail; |
| } |
| |
| /* Sort instructions by index */ |
| qsort(util_dynarray_begin(buf), |
| util_dynarray_num_elements(buf, nir_instr *), |
| sizeof(nir_instr *), |
| compare_instr_indexes); |
| |
| /* Create a temporary bitset with all values already |
| * rematerialized/rematerializable. We'll add to this bit set as we go |
| * through values that might not be in that set but that we can |
| * rematerialize. |
| */ |
| struct sized_bitset potential_remat = bitset_create(mem_ctx, remat->size); |
| memcpy(potential_remat.set, remat->set, BITSET_WORDS(remat->size) * sizeof(BITSET_WORD)); |
| |
| util_dynarray_foreach(buf, nir_instr *, instr_ptr) { |
| nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr); |
| |
| /* If it's already in the potential remat set, there is nothing to do. */ |
| if (BITSET_TEST(potential_remat.set, instr_ssa_def->index)) |
| continue; |
| |
| if (!can_remat_instr(*instr_ptr, &potential_remat)) |
| goto fail; |
| |
| /* All the sources are rematerializable and the instruction is too, so |
| * mark it as rematerializable as well. |
| */ |
| BITSET_SET(potential_remat.set, instr_ssa_def->index); |
| } |
| |
| ralloc_free(mem_ctx); |
| |
| return true; |
| |
| fail: |
| util_dynarray_clear(buf); |
| ralloc_free(mem_ctx); |
| return false; |
| } |
| |
| static nir_ssa_def * |
| remat_ssa_def(nir_builder *b, nir_ssa_def *def, struct hash_table *remap_table) |
| { |
| nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table); |
| nir_builder_instr_insert(b, clone); |
| return nir_instr_ssa_def(clone); |
| } |
| |
| static nir_ssa_def * |
| remat_chain_ssa_def(nir_builder *b, struct util_dynarray *buf, |
| struct sized_bitset *remat, nir_ssa_def ***fill_defs, |
| unsigned call_idx, struct hash_table *remap_table) |
| { |
| nir_ssa_def *last_def = NULL; |
| |
| util_dynarray_foreach(buf, nir_instr *, instr_ptr) { |
| nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr); |
| unsigned ssa_index = instr_ssa_def->index; |
| |
| if (fill_defs[ssa_index] != NULL && |
| fill_defs[ssa_index][call_idx] != NULL) |
| continue; |
| |
| /* Clone the instruction we want to rematerialize */ |
| nir_ssa_def *clone_ssa_def = remat_ssa_def(b, instr_ssa_def, remap_table); |
| |
| if (fill_defs[ssa_index] == NULL) { |
| fill_defs[ssa_index] = |
| rzalloc_array(fill_defs, nir_ssa_def *, remat->size); |
| } |
| |
| /* Add the new ssa_def to the fill_defs list and flag it as |
| * rematerialized |
| */ |
| fill_defs[ssa_index][call_idx] = last_def = clone_ssa_def; |
| BITSET_SET(remat->set, ssa_index); |
| |
| _mesa_hash_table_insert(remap_table, instr_ssa_def, last_def); |
| } |
| |
| return last_def; |
| } |
| |
| struct pbv_array { |
| struct nir_phi_builder_value **arr; |
| unsigned len; |
| }; |
| |
| static struct nir_phi_builder_value * |
| get_phi_builder_value_for_def(nir_ssa_def *def, |
| struct pbv_array *pbv_arr) |
| { |
| if (def->index >= pbv_arr->len) |
| return NULL; |
| |
| return pbv_arr->arr[def->index]; |
| } |
| |
| static nir_ssa_def * |
| get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr, |
| nir_block *block) |
| { |
| assert(src->is_ssa); |
| |
| struct nir_phi_builder_value *pbv = |
| get_phi_builder_value_for_def(src->ssa, pbv_arr); |
| if (pbv == NULL) |
| return NULL; |
| |
| return nir_phi_builder_value_get_block_def(pbv, block); |
| } |
| |
| static bool |
| rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr) |
| { |
| nir_block *block; |
| if (src->parent_instr->type == nir_instr_type_phi) { |
| nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src); |
| block = phi_src->pred; |
| } else { |
| block = src->parent_instr->block; |
| } |
| |
| nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block); |
| if (new_def != NULL) |
| nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def)); |
| return true; |
| } |
| |
| static nir_ssa_def * |
| spill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def, |
| unsigned value_id, unsigned call_idx, |
| unsigned offset, unsigned stack_alignment) |
| { |
| const unsigned comp_size = def->bit_size / 8; |
| |
| nir_store_stack(before, def, |
| .base = offset, |
| .call_idx = call_idx, |
| .align_mul = MIN2(comp_size, stack_alignment), |
| .value_id = value_id, |
| .write_mask = BITFIELD_MASK(def->num_components)); |
| return nir_load_stack(after, def->num_components, def->bit_size, |
| .base = offset, |
| .call_idx = call_idx, |
| .value_id = value_id, |
| .align_mul = MIN2(comp_size, stack_alignment)); |
| } |
| |
| static void |
| spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls, |
| unsigned stack_alignment) |
| { |
| /* TODO: If a SSA def is filled more than once, we probably want to just |
| * spill it at the LCM of the fill sites so we avoid unnecessary |
| * extra spills |
| * |
| * TODO: If a SSA def is defined outside a loop but live through some call |
| * inside the loop, we probably want to spill outside the loop. We |
| * may also want to fill outside the loop if it's not used in the |
| * loop. |
| * |
| * TODO: Right now, we only re-materialize things if their immediate |
| * sources are things which we filled. We probably want to expand |
| * that to re-materialize things whose sources are things we can |
| * re-materialize from things we filled. We may want some DAG depth |
| * heuristic on this. |
| */ |
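| |
| /* Roughly speaking (illustrative, not exact output), a call such as |
| * |
| *    %v = ... |
| *    trace_ray ... |
| *    use %v |
| * |
| * is rewritten by the code below into |
| * |
| *    %v = ... |
| *    store_stack %v              <- spill, unless %v can be re-materialized |
| *    rt_trace_ray ... call_idx=N stack_size=S |
| *    rt_resume call_idx=N stack_size=S |
| *    %v' = load_stack            <- fill, wired up later via the phi builder |
| *    use %v' |
| */ |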
| |
| /* This happens per-shader rather than per-impl because we mess with |
| * nir_shader::scratch_size. |
| */ |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_metadata_require(impl, nir_metadata_live_ssa_defs | |
| nir_metadata_dominance | |
| nir_metadata_block_index | |
| nir_metadata_instr_index); |
| |
| void *mem_ctx = ralloc_context(shader); |
| |
| const unsigned num_ssa_defs = impl->ssa_alloc; |
| const unsigned live_words = BITSET_WORDS(num_ssa_defs); |
| struct sized_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs); |
| |
| /* Array of all live SSA defs which are spill candidates */ |
| nir_ssa_def **spill_defs = |
| rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs); |
| |
| /* For each spill candidate, an array of every time it's defined by a fill, |
| * indexed by call instruction index. |
| */ |
| nir_ssa_def ***fill_defs = |
| rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs); |
| |
| /* For each call instruction, the liveness set at the call */ |
| const BITSET_WORD **call_live = |
| rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls); |
| |
| /* For each call instruction, the block index of the block it lives in */ |
| uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls); |
| |
| /* Remap table when rebuilding instructions out of fill operations */ |
| struct hash_table *trivial_remap_table = |
| _mesa_pointer_hash_table_create(mem_ctx); |
| |
| /* Walk the call instructions and fetch the liveness set and block index |
| * for each one. We need to do this before we start modifying the shader |
| * so that liveness doesn't complain that it's been invalidated. Don't |
| * worry, we'll be very careful with our live sets. :-) |
| */ |
| unsigned call_idx = 0; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (!instr_is_shader_call(instr)) |
| continue; |
| |
| call_block_indices[call_idx] = block->index; |
| |
| /* The objective here is to preserve values around shader call |
| * instructions. Therefore, we use the live set after the |
| * instruction as the set of things we want to preserve. Because |
| * none of our shader call intrinsics return anything, we don't have |
| * to worry about spilling over a return value. |
| * |
| * TODO: This isn't quite true for report_intersection. |
| */ |
| call_live[call_idx] = |
| nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx); |
| |
| call_idx++; |
| } |
| } |
| |
| nir_builder before, after; |
| nir_builder_init(&before, impl); |
| nir_builder_init(&after, impl); |
| |
| call_idx = 0; |
| unsigned max_scratch_size = shader->scratch_size; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| if (def != NULL) { |
| if (can_remat_ssa_def(def, &trivial_remat)) { |
| add_ssa_def_to_bitset(def, &trivial_remat); |
| _mesa_hash_table_insert(trivial_remap_table, def, def); |
| } else { |
| spill_defs[def->index] = def; |
| } |
| } |
| |
| if (!instr_is_shader_call(instr)) |
| continue; |
| |
| const BITSET_WORD *live = call_live[call_idx]; |
| |
| struct hash_table *remap_table = |
| _mesa_hash_table_clone(trivial_remap_table, mem_ctx); |
| |
| /* Make a copy of trivial_remat that we'll update as we crawl through |
| * the live SSA defs and unspill them. |
| */ |
| struct sized_bitset remat = bitset_create(mem_ctx, num_ssa_defs); |
| memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD)); |
| |
| /* Because the two builders are always separated by the call |
| * instruction, it won't break anything to have two of them. |
| */ |
| before.cursor = nir_before_instr(instr); |
| after.cursor = nir_after_instr(instr); |
| |
| /* Array used to hold all the values needed to rematerialize a live |
| * value. |
| */ |
| struct util_dynarray remat_chain; |
| util_dynarray_init(&remat_chain, mem_ctx); |
| |
| unsigned offset = shader->scratch_size; |
| for (unsigned w = 0; w < live_words; w++) { |
| BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w]; |
| while (spill_mask) { |
| int i = u_bit_scan(&spill_mask); |
| assert(i >= 0); |
| unsigned index = w * BITSET_WORDBITS + i; |
| assert(index < num_ssa_defs); |
| |
| def = spill_defs[index]; |
| nir_ssa_def *original_def = def, *new_def; |
| if (can_remat_ssa_def(def, &remat)) { |
| /* If this SSA def is re-materializable or based on other |
| * things we've already spilled, re-materialize it rather |
| * than spilling and filling. Anything which is trivially |
| * re-materializable won't even get here because we take |
| * those into account in spill_mask above. |
| */ |
| new_def = remat_ssa_def(&after, def, remap_table); |
| } else if (can_remat_chain_ssa_def(def, &remat, &remat_chain)) { |
| new_def = remat_chain_ssa_def(&after, &remat_chain, &remat, |
| fill_defs, call_idx, |
| remap_table); |
| util_dynarray_clear(&remat_chain); |
| } else { |
| bool is_bool = def->bit_size == 1; |
| if (is_bool) |
| def = nir_b2b32(&before, def); |
| |
| const unsigned comp_size = def->bit_size / 8; |
| offset = ALIGN(offset, comp_size); |
| |
| new_def = spill_fill(&before, &after, def, |
| index, call_idx, |
| offset, stack_alignment); |
| |
| if (is_bool) |
| new_def = nir_b2b1(&after, new_def); |
| |
| offset += def->num_components * comp_size; |
| } |
| |
| /* Mark this SSA def as available in the remat set so that, if |
| * some other SSA def we need is computed based on it, we can |
| * just re-compute instead of fetching from memory. |
| */ |
| BITSET_SET(remat.set, index); |
| |
| /* For now, we just make a note of this new SSA def. We'll |
| * fix things up with the phi builder as a second pass. |
| */ |
| if (fill_defs[index] == NULL) { |
| fill_defs[index] = |
| rzalloc_array(fill_defs, nir_ssa_def *, num_calls); |
| } |
| fill_defs[index][call_idx] = new_def; |
| _mesa_hash_table_insert(remap_table, original_def, new_def); |
| } |
| } |
| |
| nir_builder *b = &before; |
| |
| offset = ALIGN(offset, stack_alignment); |
| max_scratch_size = MAX2(max_scratch_size, offset); |
| |
| /* First thing on the called shader's stack is the resume address |
| * followed by a pointer to the payload. |
| */ |
| nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr); |
| |
| /* Lower to generic intrinsics with information about the stack & resume shader. */ |
| switch (call->intrinsic) { |
| case nir_intrinsic_trace_ray: { |
| nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa, |
| call->src[2].ssa, call->src[3].ssa, |
| call->src[4].ssa, call->src[5].ssa, |
| call->src[6].ssa, call->src[7].ssa, |
| call->src[8].ssa, call->src[9].ssa, |
| call->src[10].ssa, |
| .call_idx = call_idx, .stack_size = offset); |
| break; |
| } |
| |
| case nir_intrinsic_report_ray_intersection: |
| unreachable("Any-hit shaders must be inlined"); |
| |
| case nir_intrinsic_execute_callable: { |
| nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset); |
| break; |
| } |
| |
| default: |
| unreachable("Invalid shader call instruction"); |
| } |
| |
| nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset); |
| |
| nir_instr_remove(&call->instr); |
| |
| call_idx++; |
| } |
| } |
| assert(call_idx == num_calls); |
| shader->scratch_size = max_scratch_size; |
| |
| struct nir_phi_builder *pb = nir_phi_builder_create(impl); |
| struct pbv_array pbv_arr = { |
| .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *, |
| num_ssa_defs), |
| .len = num_ssa_defs, |
| }; |
| |
| const unsigned block_words = BITSET_WORDS(impl->num_blocks); |
| BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); |
| |
| /* Go through and set up phi builder values for each spillable value which |
| * we ever needed to spill at any point. |
| */ |
| for (unsigned index = 0; index < num_ssa_defs; index++) { |
| if (fill_defs[index] == NULL) |
| continue; |
| |
| nir_ssa_def *def = spill_defs[index]; |
| |
| memset(def_blocks, 0, block_words * sizeof(BITSET_WORD)); |
| BITSET_SET(def_blocks, def->parent_instr->block->index); |
| for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { |
| if (fill_defs[index][call_idx] != NULL) |
| BITSET_SET(def_blocks, call_block_indices[call_idx]); |
| } |
| |
| pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components, |
| def->bit_size, def_blocks); |
| } |
| |
| /* Walk the shader one more time and rewrite SSA defs as needed using the |
| * phi builder. |
| */ |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| if (def != NULL) { |
| struct nir_phi_builder_value *pbv = |
| get_phi_builder_value_for_def(def, &pbv_arr); |
| if (pbv != NULL) |
| nir_phi_builder_value_set_block_def(pbv, block, def); |
| } |
| |
| if (instr->type == nir_instr_type_phi) |
| continue; |
| |
| nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr); |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| call_idx = nir_intrinsic_call_idx(resume); |
| |
| /* Technically, this is the wrong place to add the fill defs to the |
| * phi builder values because we haven't seen any of the load_scratch |
| * instructions for this call yet. However, we know based on how we |
| * emitted them that no value ever gets used until after the load |
| * instruction has been emitted, so this should be safe. If we ever |
| * fail validation due to this, it likely means a bug in our spilling |
| * code and not in the phi re-construction code here. |
| */ |
| for (unsigned index = 0; index < num_ssa_defs; index++) { |
| if (fill_defs[index] && fill_defs[index][call_idx]) { |
| nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block, |
| fill_defs[index][call_idx]); |
| } |
| } |
| } |
| |
| nir_if *following_if = nir_block_get_following_if(block); |
| if (following_if) { |
| nir_ssa_def *new_def = |
| get_phi_builder_def_for_src(&following_if->condition, |
| &pbv_arr, block); |
| if (new_def != NULL) |
| nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def)); |
| } |
| |
| /* Handle phi sources that source from this block. We have to do this |
| * as a separate pass because the phi builder assumes that uses and |
| * defs are processed in an order that respects dominance. When we have |
| * loops, a phi source may be a back-edge so we have to handle it as if |
| * it were one of the last instructions in the predecessor block. |
| */ |
| nir_foreach_phi_src_leaving_block(block, |
| rewrite_instr_src_from_phi_builder, |
| &pbv_arr); |
| } |
| |
| nir_phi_builder_finish(pb); |
| |
| ralloc_free(mem_ctx); |
| |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } |
| |
| static nir_instr * |
| find_resume_instr(nir_function_impl *impl, unsigned call_idx) |
| { |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| if (nir_intrinsic_call_idx(resume) == call_idx) |
| return &resume->instr; |
| } |
| } |
| unreachable("Couldn't find resume instruction"); |
| } |
| |
| /* Walk the CF tree and duplicate the contents of every loop, one half runs on |
| * resume and the other half is for any post-resume loop iterations. We are |
| * careful in our duplication to ensure that resume_instr is in the resume |
| * half of the loop, though a copy of resume_instr will remain in the other |
| * half as well in case the same shader call happens twice. |
| */ |
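| /* As a rough sketch (pseudocode, register name invented): |
| * |
| *    loop { |
| *       body;                    // contains resume_instr |
| *    } |
| * |
| * becomes |
| * |
| *    resume_reg = true; |
| *    loop { |
| *       if (resume_reg) { |
| *          body;                 // original half, contains resume_instr, |
| *                                // after which resume_reg is set to false |
| *       } else { |
| *          body_copy;            // runs for post-resume iterations |
| *       } |
| *    } |
| */ |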
| static bool |
| duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr) |
| { |
| nir_register *resume_reg = NULL; |
| for (nir_cf_node *node = resume_instr->block->cf_node.parent; |
| node->type != nir_cf_node_function; node = node->parent) { |
| if (node->type != nir_cf_node_loop) |
| continue; |
| |
| nir_loop *loop = nir_cf_node_as_loop(node); |
| |
| if (resume_reg == NULL) { |
| /* We only create resume_reg if we encounter a loop. This way we can |
| * avoid re-validating the shader and calling ssa_to_regs in the case |
| * where it's just if-ladders. |
| */ |
| resume_reg = nir_local_reg_create(impl); |
| resume_reg->num_components = 1; |
| resume_reg->bit_size = 1; |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| /* Initialize resume to true */ |
| b.cursor = nir_before_cf_list(&impl->body); |
| nir_store_reg(&b, resume_reg, nir_imm_true(&b), 1); |
| |
| /* Set resume to false right after the resume instruction */ |
| b.cursor = nir_after_instr(resume_instr); |
| nir_store_reg(&b, resume_reg, nir_imm_false(&b), 1); |
| } |
| |
| /* Before we go any further, make sure that everything which exits the |
| * loop or continues around to the top of the loop does so through |
| * registers. We're about to duplicate the loop body and we'll have |
| * serious trouble if we don't do this. |
| */ |
| nir_convert_loop_to_lcssa(loop); |
| nir_lower_phis_to_regs_block(nir_loop_first_block(loop)); |
| nir_lower_phis_to_regs_block( |
| nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); |
| |
| nir_cf_list cf_list; |
| nir_cf_list_extract(&cf_list, &loop->body); |
| |
| nir_if *_if = nir_if_create(impl->function->shader); |
| _if->condition = nir_src_for_reg(resume_reg); |
| nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node); |
| |
| nir_cf_list clone; |
| nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL); |
| |
| /* Insert the clone in the else and the original in the then so that |
| * the resume_instr remains valid even after the duplication. |
| */ |
| nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list)); |
| nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list)); |
| } |
| |
| if (resume_reg != NULL) |
| nir_metadata_preserve(impl, nir_metadata_none); |
| |
| return resume_reg != NULL; |
| } |
| |
| static bool |
| cf_node_contains_block(nir_cf_node *node, nir_block *block) |
| { |
| for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) { |
| if (n == node) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static void |
| rewrite_phis_to_pred(nir_block *block, nir_block *pred) |
| { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_phi) |
| break; |
| |
| nir_phi_instr *phi = nir_instr_as_phi(instr); |
| |
| ASSERTED bool found = false; |
| nir_foreach_phi_src(phi_src, phi) { |
| if (phi_src->pred == pred) { |
| found = true; |
| assert(phi_src->src.is_ssa); |
| nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa); |
| break; |
| } |
| } |
| assert(found); |
| } |
| } |
| |
| static bool |
| cursor_is_after_jump(nir_cursor cursor) |
| { |
| switch (cursor.option) { |
| case nir_cursor_before_instr: |
| case nir_cursor_before_block: |
| return false; |
| case nir_cursor_after_instr: |
| return cursor.instr->type == nir_instr_type_jump; |
| case nir_cursor_after_block: |
| return nir_block_ends_in_jump(cursor.block); |
| } |
| unreachable("Invalid cursor option"); |
| } |
| |
| /** Flattens if ladders leading up to a resume |
| * |
| * Given a resume_instr, this function flattens any if ladders leading to the |
| * resume instruction and deletes any code that cannot be encountered on a |
| * direct path to the resume instruction. This way we get, for the most part, |
| * straight-line control-flow up to the resume instruction. |
| * |
| * While we do this flattening, we also move any code which is in the remat |
| * set up to the top of the function or to the top of the resume portion of |
| * the current loop. We don't worry about control-flow as we do this because |
| * phis will never be in the remat set (see can_remat_instr) and so nothing |
| * control-dependent will ever need to be re-materialized. It is possible |
| * that this algorithm will preserve too many instructions by moving them to |
| * the top but we leave that for DCE to clean up. Any code not in the remat |
| * set is deleted because it's either unused in the continuation or else |
| * unspilled from a previous continuation and the unspill code is after the |
| * resume instruction. |
| * |
| * If, for instance, we have something like this: |
| * |
| * // block 0 |
| * if (cond1) { |
| * // block 1 |
| * } else { |
| * // block 2 |
| * if (cond2) { |
| * // block 3 |
| * resume; |
| * if (cond3) { |
| * // block 4 |
| * } |
| * } else { |
| * // block 5 |
| * } |
| * } |
| * |
| * then we know, because the resume instruction had to be encountered, |
| * that cond1 = false and cond2 = true and we lower as follows: |
| * |
| * // block 0 |
| * // block 2 |
| * // block 3 |
| * resume; |
| * if (cond3) { |
| * // block 4 |
| * } |
| * |
| * As you can see, the code in blocks 1 and 5 was removed because there is no |
| * path from the start of the shader to the resume instruction which executes |
| * blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and |
| * moved to the top. If the resume instruction is inside a loop then we know |
| * a priori that it is of the form |
| * |
| * loop { |
| * if (resume) { |
| * // Contents containing resume_instr |
| * } else { |
| * // Second copy of contents |
| * } |
| * } |
| * |
| * In this case, we only descend into the first half of the loop. The second |
| * half is left alone as that portion is only ever executed after the resume |
| * instruction. |
| */ |
| static bool |
| flatten_resume_if_ladder(nir_builder *b, |
| nir_cf_node *parent_node, |
| struct exec_list *child_list, |
| bool child_list_contains_cursor, |
| nir_instr *resume_instr, |
| struct sized_bitset *remat) |
| { |
| nir_cf_list cf_list; |
| |
| /* If our child list contains the cursor instruction then we start out |
| * before the cursor instruction. We need to know this so that we can skip |
| * moving instructions which are already before the cursor. |
| */ |
| bool before_cursor = child_list_contains_cursor; |
| |
| nir_cf_node *resume_node = NULL; |
| foreach_list_typed_safe(nir_cf_node, child, node, child_list) { |
| switch (child->type) { |
| case nir_cf_node_block: { |
| nir_block *block = nir_cf_node_as_block(child); |
| if (b->cursor.option == nir_cursor_before_block && |
| b->cursor.block == block) { |
| assert(before_cursor); |
| before_cursor = false; |
| } |
| nir_foreach_instr_safe(instr, block) { |
| if ((b->cursor.option == nir_cursor_before_instr || |
| b->cursor.option == nir_cursor_after_instr) && |
| b->cursor.instr == instr) { |
| assert(nir_cf_node_is_first(&block->cf_node)); |
| assert(before_cursor); |
| before_cursor = false; |
| continue; |
| } |
| |
| if (instr == resume_instr) |
| goto found_resume; |
| |
| if (!before_cursor && can_remat_instr(instr, remat)) { |
| nir_instr_remove(instr); |
| nir_instr_insert(b->cursor, instr); |
| b->cursor = nir_after_instr(instr); |
| |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| BITSET_SET(remat->set, def->index); |
| } |
| } |
| if (b->cursor.option == nir_cursor_after_block && |
| b->cursor.block == block) { |
| assert(before_cursor); |
| before_cursor = false; |
| } |
| break; |
| } |
| |
| case nir_cf_node_if: { |
| assert(!before_cursor); |
| nir_if *_if = nir_cf_node_as_if(child); |
| if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list, |
| false, resume_instr, remat)) { |
| resume_node = child; |
| rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), |
| nir_if_last_then_block(_if)); |
| goto found_resume; |
| } |
| |
| if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list, |
| false, resume_instr, remat)) { |
| resume_node = child; |
| rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), |
| nir_if_last_else_block(_if)); |
| goto found_resume; |
| } |
| break; |
| } |
| |
| case nir_cf_node_loop: { |
| assert(!before_cursor); |
| nir_loop *loop = nir_cf_node_as_loop(child); |
| |
| if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) { |
| /* Thanks to our loop body duplication pass, every level of loop |
| * containing the resume instruction contains exactly three nodes: |
| * two blocks and an if. We don't want to lower away this if |
| * because it's the resume selection if. The resume half is |
| * always the then_list so that's what we want to flatten. |
| */ |
| nir_block *header = nir_loop_first_block(loop); |
| nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node)); |
| |
| /* We want to place anything re-materialized from inside the loop |
| * at the top of the resume half of the loop. |
| */ |
| nir_builder bl; |
| nir_builder_init(&bl, b->impl); |
| bl.cursor = nir_before_cf_list(&_if->then_list); |
| |
| ASSERTED bool found = |
| flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list, |
| true, resume_instr, remat); |
| assert(found); |
| resume_node = child; |
| goto found_resume; |
| } else { |
| ASSERTED bool found = |
| flatten_resume_if_ladder(b, &loop->cf_node, &loop->body, |
| false, resume_instr, remat); |
| assert(!found); |
| } |
| break; |
| } |
| |
| case nir_cf_node_function: |
| unreachable("Unsupported CF node type"); |
| } |
| } |
| assert(!before_cursor); |
| |
| /* If we got here, we didn't find the resume node or instruction. */ |
| return false; |
| |
| found_resume: |
| /* If we got here then we found either the resume node or the resume |
| * instruction in this CF list. |
| */ |
| if (resume_node) { |
| /* If the resume instruction is buried inside one of our child CF |
| * nodes, resume_node now points to that child. |
| */ |
| if (resume_node->type == nir_cf_node_if) { |
| /* Thanks to the recursive call, all of the interesting contents of |
| * resume_node have been copied before the cursor. We just need to |
| * copy the stuff after resume_node. |
| */ |
| nir_cf_extract(&cf_list, nir_after_cf_node(resume_node), |
| nir_after_cf_list(child_list)); |
| } else { |
| /* The loop contains its own cursor and still has useful stuff in it. |
| * We want to move everything after and including the loop to before |
| * the cursor. |
| */ |
| assert(resume_node->type == nir_cf_node_loop); |
| nir_cf_extract(&cf_list, nir_before_cf_node(resume_node), |
| nir_after_cf_list(child_list)); |
| } |
| } else { |
| /* If we found the resume instruction in one of our blocks, grab |
| * everything after it in the entire list (not just the one block), and |
| * place it before the cursor instr. |
| */ |
| nir_cf_extract(&cf_list, nir_after_instr(resume_instr), |
| nir_after_cf_list(child_list)); |
| } |
| |
| if (cursor_is_after_jump(b->cursor)) { |
| /* If the resume instruction is in a loop, it's possible cf_list ends |
| * in a break or continue instruction, in which case we don't want to |
| * insert anything. It's also possible we have an early return if |
| * someone hasn't lowered those yet. In either case, nothing after that |
| * point executes in this context so we can delete it. |
| */ |
| nir_cf_delete(&cf_list); |
| } else { |
| b->cursor = nir_cf_reinsert(&cf_list, b->cursor); |
| } |
| |
| if (!resume_node) { |
| /* We want the resume to be the first "interesting" instruction */ |
| nir_instr_remove(resume_instr); |
| nir_instr_insert(nir_before_cf_list(&b->impl->body), resume_instr); |
| } |
| |
| /* We've copied everything interesting out of this CF list to before the |
| * cursor. Delete everything else. |
| */ |
| if (child_list_contains_cursor) { |
| nir_cf_extract(&cf_list, b->cursor, nir_after_cf_list(child_list)); |
| } else { |
| nir_cf_list_extract(&cf_list, child_list); |
| } |
| nir_cf_delete(&cf_list); |
| |
| return true; |
| } |
| |
| static bool |
| wrap_jump_instr(nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_jump) |
| return false; |
| |
| b->cursor = nir_before_instr(instr); |
| |
| nir_if *_if = nir_push_if(b, nir_imm_true(b)); |
| nir_pop_if(b, NULL); |
| |
| nir_cf_list cf_list; |
| nir_cf_extract(&cf_list, nir_before_instr(instr), nir_after_instr(instr)); |
| nir_cf_reinsert(&cf_list, nir_before_block(nir_if_first_then_block(_if))); |
| |
| return true; |
| } |
| |
| /* This pass wraps jump instructions in a dummy if block so that when |
| * flatten_resume_if_ladder() does its job, it doesn't move a jump instruction |
| * directly in front of another instruction, which the NIR control flow helpers |
| * do not allow. |
| */ |
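| /* For example (illustrative), a bare |
| * |
| *    break; |
| * |
| * becomes |
| * |
| *    if (true) { |
| *       break; |
| *    } |
| * |
| * so the jump always terminates a block of its own. |
| */ |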
| static bool |
| wrap_jumps(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, wrap_jump_instr, |
| nir_metadata_none, NULL); |
| } |
| |
| static nir_instr * |
| lower_resume(nir_shader *shader, int call_idx) |
| { |
| wrap_jumps(shader); |
| |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| nir_instr *resume_instr = find_resume_instr(impl, call_idx); |
| |
| if (duplicate_loop_bodies(impl, resume_instr)) { |
| nir_validate_shader(shader, "after duplicate_loop_bodies in " |
| "nir_lower_shader_calls"); |
| /* If we duplicated the bodies of any loops, run regs_to_ssa to get rid |
| * of all those pesky registers we just added. |
| */ |
| NIR_PASS_V(shader, nir_lower_regs_to_ssa); |
| } |
| |
| /* Re-index nir_ssa_def::index. We don't care about actual liveness in |
| * this pass, but we use the same helpers as the spilling pass, so we |
| * need to make sure the SSA indices are sane. They are used constantly |
| * to determine whether an SSA value has been added since the start of |
| * the pass. |
| */ |
| nir_index_ssa_defs(impl); |
| |
| void *mem_ctx = ralloc_context(shader); |
| |
| /* Used to track which things may have been assumed to be re-materialized |
| * by the spilling pass and which we shouldn't delete. |
| */ |
| struct sized_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc); |
| |
| /* Set up a builder with a cursor at the top of the function, which we use |
| * as we extract and re-insert stuff into the CFG. |
| */ |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| b.cursor = nir_before_cf_list(&impl->body); |
| ASSERTED bool found = |
| flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body, |
| true, resume_instr, &remat); |
| assert(found); |
| |
| ralloc_free(mem_ctx); |
| |
| nir_metadata_preserve(impl, nir_metadata_none); |
| |
| nir_validate_shader(shader, "after flatten_resume_if_ladder in " |
| "nir_lower_shader_calls"); |
| |
| return resume_instr; |
| } |
| |
| static void |
| replace_resume_with_halt(nir_shader *shader, nir_instr *keep) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr == keep) |
| continue; |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| /* If this is some other resume, then we've kicked off a ray or |
| * bindless thread and we don't want to go any further in this |
| * shader. Insert a halt so that NIR will delete any instructions |
| * dominated by this call instruction including the scratch_load |
| * instructions we inserted. |
| */ |
| nir_cf_list cf_list; |
| nir_cf_extract(&cf_list, nir_after_instr(&resume->instr), |
| nir_after_block(block)); |
| nir_cf_delete(&cf_list); |
| b.cursor = nir_instr_remove(&resume->instr); |
| nir_jump(&b, nir_jump_halt); |
| break; |
| } |
| } |
| } |
| |
| struct lower_scratch_state { |
| nir_address_format address_format; |
| }; |
| |
| static bool |
| lower_stack_instr_to_scratch(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| struct lower_scratch_state *state = data; |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *stack = nir_instr_as_intrinsic(instr); |
| switch (stack->intrinsic) { |
| case nir_intrinsic_load_stack: { |
| b->cursor = nir_instr_remove(instr); |
| nir_ssa_def *data, *old_data = nir_instr_ssa_def(instr); |
| |
| if (state->address_format == nir_address_format_64bit_global) { |
| nir_ssa_def *addr = nir_iadd_imm(b, |
| nir_load_scratch_base_ptr(b, 1, 64, 1), |
| nir_intrinsic_base(stack)); |
| data = nir_load_global(b, addr, |
| nir_intrinsic_align_mul(stack), |
| stack->dest.ssa.num_components, |
| stack->dest.ssa.bit_size); |
| } else { |
| assert(state->address_format == nir_address_format_32bit_offset); |
| data = nir_load_scratch(b, |
| old_data->num_components, |
| old_data->bit_size, |
| nir_imm_int(b, nir_intrinsic_base(stack)), |
| .align_mul = nir_intrinsic_align_mul(stack)); |
| } |
| nir_ssa_def_rewrite_uses(old_data, data); |
| break; |
| } |
| |
| case nir_intrinsic_store_stack: { |
| b->cursor = nir_instr_remove(instr); |
| nir_ssa_def *data = stack->src[0].ssa; |
| |
| if (state->address_format == nir_address_format_64bit_global) { |
| nir_ssa_def *addr = nir_iadd_imm(b, |
| nir_load_scratch_base_ptr(b, 1, 64, 1), |
| nir_intrinsic_base(stack)); |
| nir_store_global(b, addr, |
| nir_intrinsic_align_mul(stack), |
| data, |
| BITFIELD_MASK(data->num_components)); |
| } else { |
| assert(state->address_format == nir_address_format_32bit_offset); |
| nir_store_scratch(b, data, |
| nir_imm_int(b, nir_intrinsic_base(stack)), |
| .align_mul = nir_intrinsic_align_mul(stack), |
| .write_mask = BITFIELD_MASK(data->num_components)); |
| } |
| break; |
| } |
| |
| default: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static bool |
| nir_lower_stack_to_scratch(nir_shader *shader, |
| nir_address_format address_format) |
| { |
| struct lower_scratch_state state = { |
| .address_format = address_format, |
| }; |
| |
| return nir_shader_instructions_pass(shader, |
| lower_stack_instr_to_scratch, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| &state); |
| } |
| |
| static bool |
| opt_remove_respills_instr(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *store_intrin = nir_instr_as_intrinsic(instr); |
| if (store_intrin->intrinsic != nir_intrinsic_store_stack) |
| return false; |
| |
| nir_instr *value_instr = store_intrin->src[0].ssa->parent_instr; |
| if (value_instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *load_intrin = nir_instr_as_intrinsic(value_instr); |
| if (load_intrin->intrinsic != nir_intrinsic_load_stack) |
| return false; |
| |
| if (nir_intrinsic_base(load_intrin) != nir_intrinsic_base(store_intrin)) |
| return false; |
| |
| nir_instr_remove(&store_intrin->instr); |
| return true; |
| } |
| |
| /* After shader split, look at stack load/store operations. If we're loading |
| * and storing the same value at the same location, we can drop the store |
| * instruction. |
| */ |
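| /* For example (value IDs and offsets are hypothetical): |
| * |
| *    %v = load_stack base=32 ... |
| *    store_stack %v base=32 ...   <- removed, it would only write back the |
| *                                    bytes we just read from the same slot |
| */ |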
| static bool |
| nir_opt_remove_respills(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, |
| opt_remove_respills_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| NULL); |
| } |
| |
| static void |
| add_use_mask(struct hash_table_u64 *offset_to_mask, |
| unsigned offset, unsigned mask) |
| { |
| uintptr_t old_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(offset_to_mask, offset); |
| |
| _mesa_hash_table_u64_insert(offset_to_mask, offset, |
| (void *)(uintptr_t)(old_mask | mask)); |
| } |
| |
| /* When splitting the shaders, we might have inserted stores & loads of vec4s, |
| * because a live value has 4 components. But sometimes only some components |
| * of that vec4 are used after the scratch load. This pass removes the unused |
| * components of the scratch loads/stores. |
| */ |
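| /* For example (illustrative), if only .x and .z of a spilled vec4 are ever |
| * read after the fill: |
| * |
| *    store_stack %v.xyzw ...            store_stack %v.xz ... |
| *    ...                        ->      ... |
| *    %f = load_stack vec4 ...           %f = load_stack vec2 ... |
| *    use %f.x, %f.z                     use %f.x, %f.y   (reswizzled) |
| */ |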
| static bool |
| nir_opt_trim_stack_values(nir_shader *shader) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| struct hash_table_u64 *value_id_to_mask = _mesa_hash_table_u64_create(NULL); |
| bool progress = false; |
| |
| /* Find all the loads and how their value is being used */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| const unsigned mask = |
| nir_ssa_def_components_read(nir_instr_ssa_def(instr)); |
| add_use_mask(value_id_to_mask, value_id, mask); |
| } |
| } |
| |
| /* For each store, if it stores more than is being used, trim it. |
| * Otherwise, remove it from the hash table. |
| */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_store_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| const unsigned write_mask = nir_intrinsic_write_mask(intrin); |
| const unsigned read_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(value_id_to_mask, value_id); |
| |
| /* Already removed from the table, nothing to do */ |
| if (read_mask == 0) |
| continue; |
| |
| /* Matching read/write mask, nothing to do, remove from the table. */ |
| if (write_mask == read_mask) { |
| _mesa_hash_table_u64_remove(value_id_to_mask, value_id); |
| continue; |
| } |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| b.cursor = nir_before_instr(instr); |
| |
| nir_ssa_def *value = nir_channels(&b, intrin->src[0].ssa, read_mask); |
| nir_instr_rewrite_src_ssa(instr, &intrin->src[0], value); |
| |
| intrin->num_components = util_bitcount(read_mask); |
| nir_intrinsic_set_write_mask(intrin, (1u << intrin->num_components) - 1); |
| |
| progress = true; |
| } |
| } |
| |
| /* For each load remaining in the hash table (only the ones we changed the |
| * number of components of), apply trimming/reswizzling. |
| */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| unsigned read_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(value_id_to_mask, value_id); |
| if (read_mask == 0) |
| continue; |
| |
| unsigned swiz_map[NIR_MAX_VEC_COMPONENTS] = { 0, }; |
| unsigned swiz_count = 0; |
| u_foreach_bit(idx, read_mask) |
| swiz_map[idx] = swiz_count++; |
| |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| |
| nir_foreach_use_safe(use_src, def) { |
| if (use_src->parent_instr->type == nir_instr_type_alu) { |
| nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr); |
| nir_alu_src *alu_src = exec_node_data(nir_alu_src, use_src, src); |
| |
| unsigned write_mask = alu->dest.write_mask; |
| u_foreach_bit(idx, write_mask) |
| alu_src->swizzle[idx] = swiz_map[alu_src->swizzle[idx]]; |
| } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *use_intrin = |
| nir_instr_as_intrinsic(use_src->parent_instr); |
| assert(nir_intrinsic_has_write_mask(use_intrin)); |
| unsigned write_mask = nir_intrinsic_write_mask(use_intrin); |
| unsigned new_write_mask = 0; |
| u_foreach_bit(idx, write_mask) |
| new_write_mask |= 1 << swiz_map[idx]; |
| nir_intrinsic_set_write_mask(use_intrin, new_write_mask); |
| } else { |
| unreachable("invalid instruction type"); |
| } |
| } |
| |
| intrin->dest.ssa.num_components = intrin->num_components = swiz_count; |
| |
| progress = true; |
| } |
| } |
| |
| nir_metadata_preserve(impl, |
| progress ? |
| (nir_metadata_dominance | |
| nir_metadata_block_index | |
| nir_metadata_loop_analysis) : |
| nir_metadata_all); |
| |
| _mesa_hash_table_u64_destroy(value_id_to_mask); |
| |
| return progress; |
| } |
| |
| struct scratch_item { |
| unsigned old_offset; |
| unsigned new_offset; |
| unsigned bit_size; |
| unsigned num_components; |
| unsigned value; |
| unsigned call_idx; |
| }; |
| |
| static int |
| sort_scratch_item_by_size_and_value_id(const void *_item1, const void *_item2) |
| { |
| const struct scratch_item *item1 = _item1; |
| const struct scratch_item *item2 = _item2; |
| |
| /* By ascending value_id */ |
| if (item1->bit_size == item2->bit_size) |
| return (int) item1->value - (int) item2->value; |
| |
| /* By descending size */ |
| return (int) item2->bit_size - (int) item1->bit_size; |
| } |
| |
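| /* For each call, sort the spilled values by descending bit size (then by |
| * value_id) and pack them tightly on the stack. For instance (sizes are |
| * hypothetical), a 64-bit, a 32-bit and a 16-bit value packed from offset 0 |
| * would end up as |
| * |
| *    [0..7] 64-bit   [8..11] 32-bit   [12..13] 16-bit |
| * |
| * which avoids the alignment padding a source-order layout might need. |
| */ |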
| static bool |
| nir_opt_sort_and_pack_stack(nir_shader *shader, |
| unsigned start_call_scratch, |
| unsigned stack_alignment, |
| unsigned num_calls) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| void *mem_ctx = ralloc_context(NULL); |
| |
| struct hash_table_u64 *value_id_to_item = |
| _mesa_hash_table_u64_create(mem_ctx); |
| struct util_dynarray ops; |
| util_dynarray_init(&ops, mem_ctx); |
| |
| for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { |
| _mesa_hash_table_u64_clear(value_id_to_item); |
| util_dynarray_clear(&ops); |
| |
| /* Find all the stack loads and their offsets. */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| |
| assert(_mesa_hash_table_u64_search(value_id_to_item, |
| value_id) == NULL); |
| |
| struct scratch_item item = { |
| .old_offset = nir_intrinsic_base(intrin), |
| .bit_size = def->bit_size, |
| .num_components = def->num_components, |
| .value = value_id, |
| }; |
| |
| util_dynarray_append(&ops, struct scratch_item, item); |
| _mesa_hash_table_u64_insert(value_id_to_item, value_id, (void *)(uintptr_t)true); |
| } |
| } |
| |
| /* Sort scratch items by descending bit size, then ascending value_id. */ |
| qsort(util_dynarray_begin(&ops), |
| util_dynarray_num_elements(&ops, struct scratch_item), |
| sizeof(struct scratch_item), |
| sort_scratch_item_by_size_and_value_id); |
| |
| |
| /* Reorder things on the stack */ |
| _mesa_hash_table_u64_clear(value_id_to_item); |
| |
| unsigned scratch_size = start_call_scratch; |
| util_dynarray_foreach(&ops, struct scratch_item, item) { |
| item->new_offset = ALIGN(scratch_size, item->bit_size / 8); |
| scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8; |
| _mesa_hash_table_u64_insert(value_id_to_item, item->value, item); |
| } |
| shader->scratch_size = ALIGN(scratch_size, stack_alignment); |
| |
| /* Update offsets in the instructions */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_stack: |
| case nir_intrinsic_store_stack: { |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| |
| struct scratch_item *item = |
| _mesa_hash_table_u64_search(value_id_to_item, |
| nir_intrinsic_value_id(intrin)); |
| assert(item); |
| |
| nir_intrinsic_set_base(intrin, item->new_offset); |
| break; |
| } |
| |
| case nir_intrinsic_rt_trace_ray: |
| case nir_intrinsic_rt_execute_callable: |
| case nir_intrinsic_rt_resume: |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| nir_intrinsic_set_stack_size(intrin, shader->scratch_size); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| ralloc_free(mem_ctx); |
| |
| nir_shader_preserve_all_metadata(shader); |
| |
| return true; |
| } |
| |
| static unsigned |
| nir_block_loop_depth(nir_block *block) |
| { |
| nir_cf_node *node = &block->cf_node; |
| unsigned loop_depth = 0; |
| |
| while (node != NULL) { |
| if (node->type == nir_cf_node_loop) |
| loop_depth++; |
| node = node->parent; |
| } |
| |
| return loop_depth; |
| } |
| |
| /* Find the last block dominating all the uses of a SSA value. */ |
| static nir_block * |
| find_last_dominant_use_block(nir_function_impl *impl, nir_ssa_def *value) |
| { |
| nir_block *old_block = value->parent_instr->block; |
| unsigned old_block_loop_depth = nir_block_loop_depth(old_block); |
| |
| nir_foreach_block_reverse_safe(block, impl) { |
| bool fits = true; |
| |
| /* We reached the block where the value currently lives, keep it there. */ |
| if (block == old_block) |
| return block; |
| |
| /* Don't move instructions deeper into loops, as this would generate more |
| * memory traffic. |
| */ |
| unsigned block_loop_depth = nir_block_loop_depth(block); |
| if (block_loop_depth > old_block_loop_depth) |
| continue; |
| |
| nir_foreach_if_use(src, value) { |
| nir_block *block_before_if = |
| nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); |
| if (!nir_block_dominates(block, block_before_if)) { |
| fits = false; |
| break; |
| } |
| } |
| if (!fits) |
| continue; |
| |
| nir_foreach_use(src, value) { |
| if (src->parent_instr->type == nir_instr_type_phi && |
| block == src->parent_instr->block) { |
| fits = false; |
| break; |
| } |
| |
| if (!nir_block_dominates(block, src->parent_instr->block)) { |
| fits = false; |
| break; |
| } |
| } |
| if (!fits) |
| continue; |
| |
| return block; |
| } |
| unreachable("Cannot find block"); |
| } |
| |
| /* Put the scratch loads in the branches where they're needed. */ |
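| /* For example (illustrative), a fill whose value is only used in one branch |
| * |
| *    %v = load_stack ...               if (cond) { |
| *    if (cond) {               ->         %v = load_stack ... |
| *       use %v                            use %v |
| *    }                                 } |
| * |
| * is sunk into that branch so the memory access only happens when the value |
| * is actually needed. |
| */ |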
| static bool |
| nir_opt_stack_loads(nir_shader *shader) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(func, shader) { |
| if (!func->impl) |
| continue; |
| |
| nir_metadata_require(func->impl, nir_metadata_dominance | |
| nir_metadata_block_index); |
| |
| bool func_progress = false; |
| nir_foreach_block_safe(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| nir_ssa_def *value = &intrin->dest.ssa; |
| nir_block *new_block = find_last_dominant_use_block(func->impl, value); |
| if (new_block == block) |
| continue; |
| |
| /* Move the scratch load into the new block, after the phis. */ |
| nir_instr_remove(instr); |
| nir_instr_insert(nir_before_block_after_phis(new_block), instr); |
| |
| func_progress = true; |
| } |
| } |
| |
| nir_metadata_preserve(func->impl, |
| func_progress ? (nir_metadata_block_index | |
| nir_metadata_dominance | |
| nir_metadata_loop_analysis) : |
| nir_metadata_all); |
| |
| progress |= func_progress; |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| split_stack_components_instr(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack && |
| intrin->intrinsic != nir_intrinsic_store_stack) |
| return false; |
| |
| if (intrin->intrinsic == nir_intrinsic_load_stack && |
| intrin->dest.ssa.num_components == 1) |
| return false; |
| |
| if (intrin->intrinsic == nir_intrinsic_store_stack && |
| intrin->src[0].ssa->num_components == 1) |
| return false; |
| |
| b->cursor = nir_before_instr(instr); |
| |
| if (intrin->intrinsic == nir_intrinsic_load_stack) { |
| nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = { 0, }; |
| for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) { |
| components[c] = nir_load_stack(b, 1, intrin->dest.ssa.bit_size, |
| .base = nir_intrinsic_base(intrin) + |
| c * intrin->dest.ssa.bit_size / 8, |
| .call_idx = nir_intrinsic_call_idx(intrin), |
| .value_id = nir_intrinsic_value_id(intrin), |
| .align_mul = nir_intrinsic_align_mul(intrin)); |
| } |
| |
| nir_ssa_def_rewrite_uses(&intrin->dest.ssa, |
| nir_vec(b, components, |
| intrin->dest.ssa.num_components)); |
| } else { |
| assert(intrin->intrinsic == nir_intrinsic_store_stack); |
| for (unsigned c = 0; c < intrin->src[0].ssa->num_components; c++) { |
| nir_store_stack(b, nir_channel(b, intrin->src[0].ssa, c), |
| .base = nir_intrinsic_base(intrin) + |
| c * intrin->src[0].ssa->bit_size / 8, |
| .call_idx = nir_intrinsic_call_idx(intrin), |
| .align_mul = nir_intrinsic_align_mul(intrin), |
| .value_id = nir_intrinsic_value_id(intrin), |
| .write_mask = 0x1); |
| } |
| } |
| |
| nir_instr_remove(instr); |
| |
| return true; |
| } |
| |
| /* Break the load_stack/store_stack intrinsics into single components. This |
| * helps the vectorizer pack components: for example, a vec3 32-bit |
| * store_stack at base=16 becomes three scalar store_stacks at bases 16, 20 |
| * and 24, which the vectorizer can later recombine with neighboring slots. |
| */ |
| static bool |
| nir_split_stack_components(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, |
| split_stack_components_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| NULL); |
| } |
| |
| struct stack_op_vectorizer_state { |
| nir_should_vectorize_mem_func driver_callback; |
| void *driver_data; |
| }; |
| |
| static bool |
| should_vectorize(unsigned align_mul, |
| unsigned align_offset, |
| unsigned bit_size, |
| unsigned num_components, |
| nir_intrinsic_instr *low, nir_intrinsic_instr *high, |
| void *data) |
| { |
| /* We only care about these two intrinsics. */ |
| if ((low->intrinsic != nir_intrinsic_load_stack && |
| low->intrinsic != nir_intrinsic_store_stack) || |
| (high->intrinsic != nir_intrinsic_load_stack && |
| high->intrinsic != nir_intrinsic_store_stack)) |
| return false; |
| |
| struct stack_op_vectorizer_state *state = data; |
| |
| return state->driver_callback(align_mul, align_offset, |
| bit_size, num_components, |
| low, high, state->driver_data); |
| } |
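| |
| /* A minimal sketch of a driver-provided callback, for illustration only |
| * (the function name and the heuristic are hypothetical, not part of this |
| * file). It uses the same parameter list as should_vectorize() above and |
| * simply requires natural alignment and at most four components: |
| * |
| *    static bool |
| *    driver_should_vectorize_stack(unsigned align_mul, unsigned align_offset, |
| *                                  unsigned bit_size, unsigned num_components, |
| *                                  nir_intrinsic_instr *low, |
| *                                  nir_intrinsic_instr *high, |
| *                                  void *data) |
| *    { |
| *       if (num_components > 4) |
| *          return false; |
| *       return align_mul >= bit_size / 8; |
| *    } |
| * |
| * A driver would store such a callback in |
| * nir_lower_shader_calls_options::vectorizer_callback (see below). |
| */ |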
| |
| /** Lower shader call instructions to split shaders. |
| * |
| * Shader calls can be split into an initial shader and a series of "resume" |
| * shaders. When the shader is first invoked, it is the initial shader which |
| * is executed. At any point in the initial shader or any one of the resume |
| * shaders, a shader call operation may be performed. The possible shader call |
| * operations are: |
| * |
| * - trace_ray |
| * - report_ray_intersection |
| * - execute_callable |
| * |
| * When a shader call operation is performed, we push all live values to the |
| * stack, call rt_trace_ray/rt_execute_callable and then kill the shader. Once |
| * the operation we invoked is complete, a callee shader will return execution |
| * to the respective resume shader. The resume shader pops the contents off |
| * the stack and picks up where the calling shader left off. |
| * |
| * Stack management is assumed to be done after this pass. Call |
| * instructions and their resumes get annotated with stack information that |
| * should be enough for the backend to implement proper stack management. |
| */ |
| bool |
| nir_lower_shader_calls(nir_shader *shader, |
| const nir_lower_shader_calls_options *options, |
| nir_shader ***resume_shaders_out, |
| uint32_t *num_resume_shaders_out, |
| void *mem_ctx) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| int num_calls = 0; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr_is_shader_call(instr)) |
| num_calls++; |
| } |
| } |
| |
| if (num_calls == 0) { |
| nir_shader_preserve_all_metadata(shader); |
| *num_resume_shaders_out = 0; |
| return false; |
| } |
| |
| /* Some intrinsics not only can't be re-materialized but aren't preserved |
| * when moving to the continuation shader. We have to move them to the top |
| * to ensure they get spilled as needed. |
| */ |
| { |
| bool progress = false; |
| NIR_PASS(progress, shader, move_system_values_to_top); |
| if (progress) |
| NIR_PASS(progress, shader, nir_opt_cse); |
| } |
| |
| /* Save the start point of the call stack in scratch */ |
| unsigned start_call_scratch = shader->scratch_size; |
| |
| NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls, |
| num_calls, options->stack_alignment); |
| |
| NIR_PASS_V(shader, nir_opt_remove_phis); |
| |
| NIR_PASS_V(shader, nir_opt_trim_stack_values); |
| NIR_PASS_V(shader, nir_opt_sort_and_pack_stack, |
| start_call_scratch, options->stack_alignment, num_calls); |
| |
| /* Make N copies of our shader */ |
| nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls); |
| for (unsigned i = 0; i < num_calls; i++) { |
| resume_shaders[i] = nir_shader_clone(mem_ctx, shader); |
| |
| /* Give them a recognizable name */ |
| resume_shaders[i]->info.name = |
| ralloc_asprintf(mem_ctx, "%s%sresume_%u", |
| shader->info.name ? shader->info.name : "", |
| shader->info.name ? "-" : "", |
| i); |
| } |
| |
| replace_resume_with_halt(shader, NULL); |
| nir_opt_dce(shader); |
| nir_opt_dead_cf(shader); |
| for (unsigned i = 0; i < num_calls; i++) { |
| nir_instr *resume_instr = lower_resume(resume_shaders[i], i); |
| replace_resume_with_halt(resume_shaders[i], resume_instr); |
| /* Remove the dummy blocks added by flatten_resume_if_ladder() */ |
| nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false); |
| nir_opt_dce(resume_shaders[i]); |
| nir_opt_dead_cf(resume_shaders[i]); |
| nir_opt_remove_phis(resume_shaders[i]); |
| } |
| |
| for (unsigned i = 0; i < num_calls; i++) |
| NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills); |
| |
| if (options->localized_loads) { |
| /* Once loads have been combined we can try to put them closer to where |
| * they're needed. |
| */ |
| for (unsigned i = 0; i < num_calls; i++) |
| NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads); |
| } |
| |
| struct stack_op_vectorizer_state vectorizer_state = { |
| .driver_callback = options->vectorizer_callback, |
| .driver_data = options->vectorizer_data, |
| }; |
| nir_load_store_vectorize_options vect_opts = { |
| .modes = nir_var_shader_temp, |
| .callback = should_vectorize, |
| .cb_data = &vectorizer_state, |
| }; |
| |
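| /* Scalarize the stack accesses, then let the load/store vectorizer repack |
| * adjacent slots into wider accesses, using the driver callback to decide |
| * what is profitable. |
| */ |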
| if (options->vectorizer_callback != NULL) { |
| NIR_PASS_V(shader, nir_split_stack_components); |
| NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts); |
| } |
| NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format); |
| nir_opt_cse(shader); |
| for (unsigned i = 0; i < num_calls; i++) { |
| if (options->vectorizer_callback != NULL) { |
| NIR_PASS_V(resume_shaders[i], nir_split_stack_components); |
| NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts); |
| } |
| NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch, |
| options->address_format); |
| nir_opt_cse(resume_shaders[i]); |
| } |
| |
| *resume_shaders_out = resume_shaders; |
| *num_resume_shaders_out = num_calls; |
| |
| return true; |
| } |
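| |
| /* A minimal usage sketch, for illustration only. raygen_shader, the option |
| * values and driver_should_vectorize_stack() are hypothetical; a real driver |
| * picks the address format, stack alignment and vectorization policy that |
| * match its hardware: |
| * |
| *    nir_lower_shader_calls_options opts = { |
| *       .address_format = nir_address_format_64bit_global, |
| *       .stack_alignment = 16, |
| *       .localized_loads = true, |
| *       .vectorizer_callback = driver_should_vectorize_stack, |
| *       .vectorizer_data = NULL, |
| *    }; |
| * |
| *    nir_shader **resume_shaders = NULL; |
| *    uint32_t num_resume_shaders = 0; |
| *    void *mem_ctx = ralloc_context(NULL); |
| * |
| *    if (nir_lower_shader_calls(raygen_shader, &opts, |
| *                               &resume_shaders, &num_resume_shaders, |
| *                               mem_ctx)) { |
| *       assert(num_resume_shaders > 0); |
| *    } |
| * |
| * On success, the driver compiles the lowered raygen_shader plus each of the |
| * num_resume_shaders entries of resume_shaders as separate shaders, and |
| * frees mem_ctx once it no longer needs the resume clones. |
| */ |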