| /* |
| * Copyright © 2020 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir.h" |
| #include "nir_builder.h" |
| #include "nir_phi_builder.h" |
| #include "util/u_dynarray.h" |
| #include "util/u_math.h" |
| |
| static bool |
| move_system_values_to_top(nir_shader *shader) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| bool progress = false; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| /* These intrinsics not only can't be re-materialized but aren't |
| * preserved when moving to the continuation shader. We have to move |
| * them to the top to ensure they get spilled as needed. |
| */ |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_shader_record_ptr: |
| case nir_intrinsic_load_btd_local_arg_addr_intel: |
| nir_instr_remove(instr); |
| nir_instr_insert(nir_before_cf_list(&impl->body), instr); |
| progress = true; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| |
| if (progress) { |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } else { |
| nir_metadata_preserve(impl, nir_metadata_all); |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| instr_is_shader_call(nir_instr *instr) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| return intrin->intrinsic == nir_intrinsic_trace_ray || |
| intrin->intrinsic == nir_intrinsic_report_ray_intersection || |
| intrin->intrinsic == nir_intrinsic_execute_callable; |
| } |
| |
| /* Previously named bitset, it had to be renamed as FreeBSD defines a struct |
| * named bitset in sys/_bitset.h, which is required by pthread_np.h, included |
| * from src/util/u_thread.h, itself indirectly included by this file. |
| */ |
| struct sized_bitset { |
| BITSET_WORD *set; |
| unsigned size; |
| }; |
| |
| static struct sized_bitset |
| bitset_create(void *mem_ctx, unsigned size) |
| { |
| return (struct sized_bitset) { |
| .set = rzalloc_array(mem_ctx, BITSET_WORD, BITSET_WORDS(size)), |
| .size = size, |
| }; |
| } |
| |
| static bool |
| src_is_in_bitset(nir_src *src, void *_set) |
| { |
| struct sized_bitset *set = _set; |
| assert(src->is_ssa); |
| |
| /* Any SSA values which were added after we generated liveness information |
| * are things generated by this pass and, while most of it is arithmetic |
| * which we could re-materialize, we don't need to because it's only used |
| * for a single load/store and so shouldn't cross any shader calls. |
| */ |
| if (src->ssa->index >= set->size) |
| return false; |
| |
| return BITSET_TEST(set->set, src->ssa->index); |
| } |
| |
| static void |
| add_ssa_def_to_bitset(nir_ssa_def *def, struct sized_bitset *set) |
| { |
| if (def->index >= set->size) |
| return; |
| |
| BITSET_SET(set->set, def->index); |
| } |
| |
| static bool |
| can_remat_instr(nir_instr *instr, struct sized_bitset *remat) |
| { |
| /* Set of all values which are trivially re-materializable and which we |
| * should never spill. This includes: |
| * |
| * - Undef values |
| * - Constants |
| * - Uniforms (UBO or push constant) |
| * - ALU combinations of any of the above |
| * - Derefs which are either complete or casts of any of the above |
| * |
| * Because this pass rewrites things in-order and phis are always turned |
| * into register writes, we can use "is it SSA?" to answer the question |
| * "can my source be re-materialized?". |
| */ |
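| /* For illustration (SSA names are hypothetical), a chain like |
| * |
| *    %c = load_const 0x4 |
| *    %u = load_ubo ... |
| *    %x = iadd %u, %c |
| * |
| * never needs to be spilled: every instruction in it only depends on |
| * values which are themselves re-materializable, so the whole chain can |
| * simply be re-emitted in the resume shader. |
| */ |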
| switch (instr->type) { |
| case nir_instr_type_alu: |
| if (!nir_instr_as_alu(instr)->dest.dest.is_ssa) |
| return false; |
| |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_instr_type_deref: |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_instr_type_intrinsic: { |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_uniform: |
| case nir_intrinsic_load_ubo: |
| case nir_intrinsic_vulkan_resource_index: |
| case nir_intrinsic_vulkan_resource_reindex: |
| case nir_intrinsic_load_vulkan_descriptor: |
| case nir_intrinsic_load_push_constant: |
| case nir_intrinsic_load_global_constant: |
| case nir_intrinsic_load_global_const_block_intel: |
| case nir_intrinsic_load_desc_set_address_intel: |
| /* These intrinsics don't need to be spilled as long as they don't |
| * depend on any spilled values. |
| */ |
| return nir_foreach_src(instr, src_is_in_bitset, remat); |
| |
| case nir_intrinsic_load_scratch_base_ptr: |
| case nir_intrinsic_load_ray_launch_id: |
| case nir_intrinsic_load_topology_id_intel: |
| case nir_intrinsic_load_btd_global_arg_addr_intel: |
| case nir_intrinsic_load_btd_resume_sbt_addr_intel: |
| case nir_intrinsic_load_ray_base_mem_addr_intel: |
| case nir_intrinsic_load_ray_hw_stack_size_intel: |
| case nir_intrinsic_load_ray_sw_stack_size_intel: |
| case nir_intrinsic_load_ray_num_dss_rt_stacks_intel: |
| case nir_intrinsic_load_ray_hit_sbt_addr_intel: |
| case nir_intrinsic_load_ray_hit_sbt_stride_intel: |
| case nir_intrinsic_load_ray_miss_sbt_addr_intel: |
| case nir_intrinsic_load_ray_miss_sbt_stride_intel: |
| case nir_intrinsic_load_callable_sbt_addr_intel: |
| case nir_intrinsic_load_callable_sbt_stride_intel: |
| case nir_intrinsic_load_reloc_const_intel: |
| case nir_intrinsic_load_ray_query_global_intel: |
| case nir_intrinsic_load_ray_launch_size: |
| /* Notably missing from the above list is btd_local_arg_addr_intel. |
| * This is because the resume shader will have a different local |
| * argument pointer because it has a different BSR. Any access of |
| * the original shader's local arguments needs to be preserved so |
| * that pointer has to be saved on the stack. |
| * |
| * TODO: There may be some system values we want to avoid |
| * re-materializing as well but we have to be very careful |
| * to ensure that it's a system value which cannot change |
| * across a shader call. |
| */ |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| return true; |
| |
| default: |
| return false; |
| } |
| } |
| |
| static bool |
| can_remat_ssa_def(nir_ssa_def *def, struct sized_bitset *remat) |
| { |
| return can_remat_instr(def->parent_instr, remat); |
| } |
| |
| struct add_instr_data { |
| struct util_dynarray *buf; |
| struct sized_bitset *remat; |
| }; |
| |
| static bool |
| add_src_instr(nir_src *src, void *state) |
| { |
| if (!src->is_ssa) |
| return false; |
| |
| struct add_instr_data *data = state; |
| if (BITSET_TEST(data->remat->set, src->ssa->index)) |
| return true; |
| |
| util_dynarray_foreach(data->buf, nir_instr *, instr_ptr) { |
| if (*instr_ptr == src->ssa->parent_instr) |
| return true; |
| } |
| |
| util_dynarray_append(data->buf, nir_instr *, src->ssa->parent_instr); |
| return true; |
| } |
| |
| static int |
| compare_instr_indexes(const void *_inst1, const void *_inst2) |
| { |
| const nir_instr * const *inst1 = _inst1; |
| const nir_instr * const *inst2 = _inst2; |
| |
| return (*inst1)->index - (*inst2)->index; |
| } |
| |
| static bool |
| can_remat_chain_ssa_def(nir_ssa_def *def, struct sized_bitset *remat, struct util_dynarray *buf) |
| { |
| assert(util_dynarray_num_elements(buf, nir_instr *) == 0); |
| |
| void *mem_ctx = ralloc_context(NULL); |
| |
| /* Add all the instructions involved in building this ssa_def */ |
| util_dynarray_append(buf, nir_instr *, def->parent_instr); |
| |
| unsigned idx = 0; |
| struct add_instr_data data = { |
| .buf = buf, |
| .remat = remat, |
| }; |
| while (idx < util_dynarray_num_elements(buf, nir_instr *)) { |
| nir_instr *instr = *util_dynarray_element(buf, nir_instr *, idx++); |
| if (!nir_foreach_src(instr, add_src_instr, &data)) |
| goto fail; |
| } |
| |
| /* Sort instructions by index */ |
| qsort(util_dynarray_begin(buf), |
| util_dynarray_num_elements(buf, nir_instr *), |
| sizeof(nir_instr *), |
| compare_instr_indexes); |
| |
| /* Create a temporary bitset with all values already |
| * rematerialized/rematerializable. We'll add to this bit set as we go |
| * through values that might not be in that set but that we can |
| * rematerialize. |
| */ |
| struct sized_bitset potential_remat = bitset_create(mem_ctx, remat->size); |
| memcpy(potential_remat.set, remat->set, BITSET_WORDS(remat->size) * sizeof(BITSET_WORD)); |
| |
| util_dynarray_foreach(buf, nir_instr *, instr_ptr) { |
| nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr); |
| |
| /* If it's already in the potential remat set, there is nothing to do. */ |
| if (BITSET_TEST(potential_remat.set, instr_ssa_def->index)) |
| continue; |
| |
| if (!can_remat_instr(*instr_ptr, &potential_remat)) |
| goto fail; |
| |
| /* All the sources are rematerializable and the instruction is too, so |
| * mark it as rematerializable as well. |
| */ |
| BITSET_SET(potential_remat.set, instr_ssa_def->index); |
| } |
| |
| ralloc_free(mem_ctx); |
| |
| return true; |
| |
| fail: |
| util_dynarray_clear(buf); |
| ralloc_free(mem_ctx); |
| return false; |
| } |
| |
| static nir_ssa_def * |
| remat_ssa_def(nir_builder *b, nir_ssa_def *def, struct hash_table *remap_table) |
| { |
| nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table); |
| nir_builder_instr_insert(b, clone); |
| return nir_instr_ssa_def(clone); |
| } |
| |
| static nir_ssa_def * |
| remat_chain_ssa_def(nir_builder *b, struct util_dynarray *buf, |
| struct sized_bitset *remat, nir_ssa_def ***fill_defs, |
| unsigned call_idx, struct hash_table *remap_table) |
| { |
| nir_ssa_def *last_def = NULL; |
| |
| util_dynarray_foreach(buf, nir_instr *, instr_ptr) { |
| nir_ssa_def *instr_ssa_def = nir_instr_ssa_def(*instr_ptr); |
| unsigned ssa_index = instr_ssa_def->index; |
| |
| if (fill_defs[ssa_index] != NULL && |
| fill_defs[ssa_index][call_idx] != NULL) |
| continue; |
| |
| /* Clone the instruction we want to rematerialize */ |
| nir_ssa_def *clone_ssa_def = remat_ssa_def(b, instr_ssa_def, remap_table); |
| |
| if (fill_defs[ssa_index] == NULL) { |
| fill_defs[ssa_index] = |
| rzalloc_array(fill_defs, nir_ssa_def *, remat->size); |
| } |
| |
| /* Add the new ssa_def to the fill_defs list and flag it as |
| * rematerialized |
| */ |
| fill_defs[ssa_index][call_idx] = last_def = clone_ssa_def; |
| BITSET_SET(remat->set, ssa_index); |
| |
| _mesa_hash_table_insert(remap_table, instr_ssa_def, last_def); |
| } |
| |
| return last_def; |
| } |
| |
| struct pbv_array { |
| struct nir_phi_builder_value **arr; |
| unsigned len; |
| }; |
| |
| static struct nir_phi_builder_value * |
| get_phi_builder_value_for_def(nir_ssa_def *def, |
| struct pbv_array *pbv_arr) |
| { |
| if (def->index >= pbv_arr->len) |
| return NULL; |
| |
| return pbv_arr->arr[def->index]; |
| } |
| |
| static nir_ssa_def * |
| get_phi_builder_def_for_src(nir_src *src, struct pbv_array *pbv_arr, |
| nir_block *block) |
| { |
| assert(src->is_ssa); |
| |
| struct nir_phi_builder_value *pbv = |
| get_phi_builder_value_for_def(src->ssa, pbv_arr); |
| if (pbv == NULL) |
| return NULL; |
| |
| return nir_phi_builder_value_get_block_def(pbv, block); |
| } |
| |
| static bool |
| rewrite_instr_src_from_phi_builder(nir_src *src, void *_pbv_arr) |
| { |
| nir_block *block; |
| if (src->parent_instr->type == nir_instr_type_phi) { |
| nir_phi_src *phi_src = exec_node_data(nir_phi_src, src, src); |
| block = phi_src->pred; |
| } else { |
| block = src->parent_instr->block; |
| } |
| |
| nir_ssa_def *new_def = get_phi_builder_def_for_src(src, _pbv_arr, block); |
| if (new_def != NULL) |
| nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new_def)); |
| return true; |
| } |
| |
| static nir_ssa_def * |
| spill_fill(nir_builder *before, nir_builder *after, nir_ssa_def *def, |
| unsigned value_id, unsigned call_idx, |
| unsigned offset, unsigned stack_alignment) |
| { |
| const unsigned comp_size = def->bit_size / 8; |
| |
| nir_store_stack(before, def, |
| .base = offset, |
| .call_idx = call_idx, |
| .align_mul = MIN2(comp_size, stack_alignment), |
| .value_id = value_id, |
| .write_mask = BITFIELD_MASK(def->num_components)); |
| return nir_load_stack(after, def->num_components, def->bit_size, |
| .base = offset, |
| .call_idx = call_idx, |
| .value_id = value_id, |
| .align_mul = MIN2(comp_size, stack_alignment)); |
| } |
| |
| static void |
| spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls, |
| unsigned stack_alignment) |
| { |
| /* TODO: If a SSA def is filled more than once, we probably want to just |
| * spill it at the LCM of the fill sites so we avoid unnecessary |
| * extra spills |
| * |
| * TODO: If a SSA def is defined outside a loop but live through some call |
| * inside the loop, we probably want to spill outside the loop. We |
| * may also want to fill outside the loop if it's not used in the |
| * loop. |
| * |
| * TODO: Right now, we only re-materialize things if their immediate |
| * sources are things which we filled. We probably want to expand |
| * that to re-materialize things whose sources are things we can |
| * re-materialize from things we filled. We may want some DAG depth |
| * heuristic on this. |
| */ |
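| |
| /* Roughly speaking (illustrative, not exact output), a call such as |
| * |
| *    %v = ... |
| *    trace_ray ... |
| *    use %v |
| * |
| * is rewritten by the code below into |
| * |
| *    %v = ... |
| *    store_stack %v              <- spill, unless %v can be re-materialized |
| *    rt_trace_ray ... call_idx=N stack_size=S |
| *    rt_resume call_idx=N stack_size=S |
| *    %v' = load_stack            <- fill, wired up later via the phi builder |
| *    use %v' |
| */ |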
| |
| /* This happens per-shader rather than per-impl because we mess with |
| * nir_shader::scratch_size. |
| */ |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_metadata_require(impl, nir_metadata_live_ssa_defs | |
| nir_metadata_dominance | |
| nir_metadata_block_index | |
| nir_metadata_instr_index); |
| |
| void *mem_ctx = ralloc_context(shader); |
| |
| const unsigned num_ssa_defs = impl->ssa_alloc; |
| const unsigned live_words = BITSET_WORDS(num_ssa_defs); |
| struct sized_bitset trivial_remat = bitset_create(mem_ctx, num_ssa_defs); |
| |
| /* Array of all live SSA defs which are spill candidates */ |
| nir_ssa_def **spill_defs = |
| rzalloc_array(mem_ctx, nir_ssa_def *, num_ssa_defs); |
| |
| /* For each spill candidate, an array of every time it's defined by a fill, |
| * indexed by call instruction index. |
| */ |
| nir_ssa_def ***fill_defs = |
| rzalloc_array(mem_ctx, nir_ssa_def **, num_ssa_defs); |
| |
| /* For each call instruction, the liveness set at the call */ |
| const BITSET_WORD **call_live = |
| rzalloc_array(mem_ctx, const BITSET_WORD *, num_calls); |
| |
| /* For each call instruction, the block index of the block it lives in */ |
| uint32_t *call_block_indices = rzalloc_array(mem_ctx, uint32_t, num_calls); |
| |
| /* Remap table when rebuilding instructions out of fill operations */ |
| struct hash_table *trivial_remap_table = |
| _mesa_pointer_hash_table_create(mem_ctx); |
| |
| /* Walk the call instructions and fetch the liveness set and block index |
| * for each one. We need to do this before we start modifying the shader |
| * so that liveness doesn't complain that it's been invalidated. Don't |
| * worry, we'll be very careful with our live sets. :-) |
| */ |
| unsigned call_idx = 0; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (!instr_is_shader_call(instr)) |
| continue; |
| |
| call_block_indices[call_idx] = block->index; |
| |
| /* The objective here is to preserve values around shader call |
| * instructions. Therefore, we use the live set after the |
| * instruction as the set of things we want to preserve. Because |
| * none of our shader call intrinsics return anything, we don't have |
| * to worry about spilling over a return value. |
| * |
| * TODO: This isn't quite true for report_intersection. |
| */ |
| call_live[call_idx] = |
| nir_get_live_ssa_defs(nir_after_instr(instr), mem_ctx); |
| |
| call_idx++; |
| } |
| } |
| |
| nir_builder before, after; |
| nir_builder_init(&before, impl); |
| nir_builder_init(&after, impl); |
| |
| call_idx = 0; |
| unsigned max_scratch_size = shader->scratch_size; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| if (def != NULL) { |
| if (can_remat_ssa_def(def, &trivial_remat)) { |
| add_ssa_def_to_bitset(def, &trivial_remat); |
| _mesa_hash_table_insert(trivial_remap_table, def, def); |
| } else { |
| spill_defs[def->index] = def; |
| } |
| } |
| |
| if (!instr_is_shader_call(instr)) |
| continue; |
| |
| const BITSET_WORD *live = call_live[call_idx]; |
| |
| struct hash_table *remap_table = |
| _mesa_hash_table_clone(trivial_remap_table, mem_ctx); |
| |
| /* Make a copy of trivial_remat that we'll update as we crawl through |
| * the live SSA defs and unspill them. |
| */ |
| struct sized_bitset remat = bitset_create(mem_ctx, num_ssa_defs); |
| memcpy(remat.set, trivial_remat.set, live_words * sizeof(BITSET_WORD)); |
| |
| /* Because the two builders are always separated by the call |
| * instruction, it won't break anything to have two of them. |
| */ |
| before.cursor = nir_before_instr(instr); |
| after.cursor = nir_after_instr(instr); |
| |
| /* Array used to hold all the values needed to rematerialize a live |
| * value. |
| */ |
| struct util_dynarray remat_chain; |
| util_dynarray_init(&remat_chain, mem_ctx); |
| |
| unsigned offset = shader->scratch_size; |
| for (unsigned w = 0; w < live_words; w++) { |
| BITSET_WORD spill_mask = live[w] & ~trivial_remat.set[w]; |
| while (spill_mask) { |
| int i = u_bit_scan(&spill_mask); |
| assert(i >= 0); |
| unsigned index = w * BITSET_WORDBITS + i; |
| assert(index < num_ssa_defs); |
| |
| def = spill_defs[index]; |
| nir_ssa_def *original_def = def, *new_def; |
| if (can_remat_ssa_def(def, &remat)) { |
| /* If this SSA def is re-materializable or based on other |
| * things we've already spilled, re-materialize it rather |
| * than spilling and filling. Anything which is trivially |
| * re-materializable won't even get here because we take |
| * those into account in spill_mask above. |
| */ |
| new_def = remat_ssa_def(&after, def, remap_table); |
| } else if (can_remat_chain_ssa_def(def, &remat, &remat_chain)) { |
| new_def = remat_chain_ssa_def(&after, &remat_chain, &remat, |
| fill_defs, call_idx, |
| remap_table); |
| util_dynarray_clear(&remat_chain); |
| } else { |
| bool is_bool = def->bit_size == 1; |
| if (is_bool) |
| def = nir_b2b32(&before, def); |
| |
| const unsigned comp_size = def->bit_size / 8; |
| offset = ALIGN(offset, comp_size); |
| |
| new_def = spill_fill(&before, &after, def, |
| index, call_idx, |
| offset, stack_alignment); |
| |
| if (is_bool) |
| new_def = nir_b2b1(&after, new_def); |
| |
| offset += def->num_components * comp_size; |
| } |
| |
| /* Mark this SSA def as available in the remat set so that, if |
| * some other SSA def we need is computed based on it, we can |
| * just re-compute instead of fetching from memory. |
| */ |
| BITSET_SET(remat.set, index); |
| |
| /* For now, we just make a note of this new SSA def. We'll |
| * fix things up with the phi builder as a second pass. |
| */ |
| if (fill_defs[index] == NULL) { |
| fill_defs[index] = |
| rzalloc_array(fill_defs, nir_ssa_def *, num_calls); |
| } |
| fill_defs[index][call_idx] = new_def; |
| _mesa_hash_table_insert(remap_table, original_def, new_def); |
| } |
| } |
| |
| nir_builder *b = &before; |
| |
| offset = ALIGN(offset, stack_alignment); |
| max_scratch_size = MAX2(max_scratch_size, offset); |
| |
| /* First thing on the called shader's stack is the resume address |
| * followed by a pointer to the payload. |
| */ |
| nir_intrinsic_instr *call = nir_instr_as_intrinsic(instr); |
| |
| /* Lower to generic intrinsics with information about the stack & resume shader. */ |
| switch (call->intrinsic) { |
| case nir_intrinsic_trace_ray: { |
| nir_rt_trace_ray(b, call->src[0].ssa, call->src[1].ssa, |
| call->src[2].ssa, call->src[3].ssa, |
| call->src[4].ssa, call->src[5].ssa, |
| call->src[6].ssa, call->src[7].ssa, |
| call->src[8].ssa, call->src[9].ssa, |
| call->src[10].ssa, |
| .call_idx = call_idx, .stack_size = offset); |
| break; |
| } |
| |
| case nir_intrinsic_report_ray_intersection: |
| unreachable("Any-hit shaders must be inlined"); |
| |
| case nir_intrinsic_execute_callable: { |
| nir_rt_execute_callable(b, call->src[0].ssa, call->src[1].ssa, .call_idx = call_idx, .stack_size = offset); |
| break; |
| } |
| |
| default: |
| unreachable("Invalid shader call instruction"); |
| } |
| |
| nir_rt_resume(b, .call_idx = call_idx, .stack_size = offset); |
| |
| nir_instr_remove(&call->instr); |
| |
| call_idx++; |
| } |
| } |
| assert(call_idx == num_calls); |
| shader->scratch_size = max_scratch_size; |
| |
| struct nir_phi_builder *pb = nir_phi_builder_create(impl); |
| struct pbv_array pbv_arr = { |
| .arr = rzalloc_array(mem_ctx, struct nir_phi_builder_value *, |
| num_ssa_defs), |
| .len = num_ssa_defs, |
| }; |
| |
| const unsigned block_words = BITSET_WORDS(impl->num_blocks); |
| BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); |
| |
| /* Go through and set up phi builder values for each spillable value which |
| * we ever needed to spill at any point. |
| */ |
| for (unsigned index = 0; index < num_ssa_defs; index++) { |
| if (fill_defs[index] == NULL) |
| continue; |
| |
| nir_ssa_def *def = spill_defs[index]; |
| |
| memset(def_blocks, 0, block_words * sizeof(BITSET_WORD)); |
| BITSET_SET(def_blocks, def->parent_instr->block->index); |
| for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { |
| if (fill_defs[index][call_idx] != NULL) |
| BITSET_SET(def_blocks, call_block_indices[call_idx]); |
| } |
| |
| pbv_arr.arr[index] = nir_phi_builder_add_value(pb, def->num_components, |
| def->bit_size, def_blocks); |
| } |
| |
| /* Walk the shader one more time and rewrite SSA defs as needed using the |
| * phi builder. |
| */ |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| if (def != NULL) { |
| struct nir_phi_builder_value *pbv = |
| get_phi_builder_value_for_def(def, &pbv_arr); |
| if (pbv != NULL) |
| nir_phi_builder_value_set_block_def(pbv, block, def); |
| } |
| |
| if (instr->type == nir_instr_type_phi) |
| continue; |
| |
| nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &pbv_arr); |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| call_idx = nir_intrinsic_call_idx(resume); |
| |
| /* Technically, this is the wrong place to add the fill defs to the |
| * phi builder values because we haven't seen any of the load_scratch |
| * instructions for this call yet. However, we know based on how we |
| * emitted them that no value ever gets used until after the load |
| * instruction has been emitted, so this should be safe. If we ever |
| * fail validation due to this, it likely means a bug in our spilling |
| * code and not in the phi re-construction code here. |
| */ |
| for (unsigned index = 0; index < num_ssa_defs; index++) { |
| if (fill_defs[index] && fill_defs[index][call_idx]) { |
| nir_phi_builder_value_set_block_def(pbv_arr.arr[index], block, |
| fill_defs[index][call_idx]); |
| } |
| } |
| } |
| |
| nir_if *following_if = nir_block_get_following_if(block); |
| if (following_if) { |
| nir_ssa_def *new_def = |
| get_phi_builder_def_for_src(&following_if->condition, |
| &pbv_arr, block); |
| if (new_def != NULL) |
| nir_if_rewrite_condition(following_if, nir_src_for_ssa(new_def)); |
| } |
| |
| /* Handle phi sources that source from this block. We have to do this |
| * as a separate pass because the phi builder assumes that uses and |
| * defs are processed in an order that respects dominance. When we have |
| * loops, a phi source may be a back-edge so we have to handle it as if |
| * it were one of the last instructions in the predecessor block. |
| */ |
| nir_foreach_phi_src_leaving_block(block, |
| rewrite_instr_src_from_phi_builder, |
| &pbv_arr); |
| } |
| |
| nir_phi_builder_finish(pb); |
| |
| ralloc_free(mem_ctx); |
| |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } |
| |
| static nir_instr * |
| find_resume_instr(nir_function_impl *impl, unsigned call_idx) |
| { |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| if (nir_intrinsic_call_idx(resume) == call_idx) |
| return &resume->instr; |
| } |
| } |
| unreachable("Couldn't find resume instruction"); |
| } |
| |
| /* Walk the CF tree and duplicate the contents of every loop, one half runs on |
| * resume and the other half is for any post-resume loop iterations. We are |
| * careful in our duplication to ensure that resume_instr is in the resume |
| * half of the loop, though a copy of resume_instr will remain in the other |
| * half as well in case the same shader call happens twice. |
| */ |
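| /* As a rough sketch (pseudocode, register name invented): |
| * |
| *    loop { |
| *       body;                    // contains resume_instr |
| *    } |
| * |
| * becomes |
| * |
| *    resume_reg = true; |
| *    loop { |
| *       if (resume_reg) { |
| *          body;                 // original half, contains resume_instr, |
| *                                // after which resume_reg is set to false |
| *       } else { |
| *          body_copy;            // runs for post-resume iterations |
| *       } |
| *    } |
| */ |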
| static bool |
| duplicate_loop_bodies(nir_function_impl *impl, nir_instr *resume_instr) |
| { |
| nir_register *resume_reg = NULL; |
| for (nir_cf_node *node = resume_instr->block->cf_node.parent; |
| node->type != nir_cf_node_function; node = node->parent) { |
| if (node->type != nir_cf_node_loop) |
| continue; |
| |
| nir_loop *loop = nir_cf_node_as_loop(node); |
| |
| if (resume_reg == NULL) { |
| /* We only create resume_reg if we encounter a loop. This way we can |
| * avoid re-validating the shader and calling ssa_to_regs in the case |
| * where it's just if-ladders. |
| */ |
| resume_reg = nir_local_reg_create(impl); |
| resume_reg->num_components = 1; |
| resume_reg->bit_size = 1; |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| /* Initialize resume to true */ |
| b.cursor = nir_before_cf_list(&impl->body); |
| nir_store_reg(&b, resume_reg, nir_imm_true(&b), 1); |
| |
| /* Set resume to false right after the resume instruction */ |
| b.cursor = nir_after_instr(resume_instr); |
| nir_store_reg(&b, resume_reg, nir_imm_false(&b), 1); |
| } |
| |
| /* Before we go any further, make sure that everything which exits the |
| * loop or continues around to the top of the loop does so through |
| * registers. We're about to duplicate the loop body and we'll have |
| * serious trouble if we don't do this. |
| */ |
| nir_convert_loop_to_lcssa(loop); |
| nir_lower_phis_to_regs_block(nir_loop_first_block(loop)); |
| nir_lower_phis_to_regs_block( |
| nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node))); |
| |
| nir_cf_list cf_list; |
| nir_cf_list_extract(&cf_list, &loop->body); |
| |
| nir_if *_if = nir_if_create(impl->function->shader); |
| _if->condition = nir_src_for_reg(resume_reg); |
| nir_cf_node_insert(nir_after_cf_list(&loop->body), &_if->cf_node); |
| |
| nir_cf_list clone; |
| nir_cf_list_clone(&clone, &cf_list, &loop->cf_node, NULL); |
| |
| /* Insert the clone in the else and the original in the then so that |
| * the resume_instr remains valid even after the duplication. |
| */ |
| nir_cf_reinsert(&cf_list, nir_before_cf_list(&_if->then_list)); |
| nir_cf_reinsert(&clone, nir_before_cf_list(&_if->else_list)); |
| } |
| |
| if (resume_reg != NULL) |
| nir_metadata_preserve(impl, nir_metadata_none); |
| |
| return resume_reg != NULL; |
| } |
| |
| static bool |
| cf_node_contains_block(nir_cf_node *node, nir_block *block) |
| { |
| for (nir_cf_node *n = &block->cf_node; n != NULL; n = n->parent) { |
| if (n == node) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static void |
| rewrite_phis_to_pred(nir_block *block, nir_block *pred) |
| { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_phi) |
| break; |
| |
| nir_phi_instr *phi = nir_instr_as_phi(instr); |
| |
| ASSERTED bool found = false; |
| nir_foreach_phi_src(phi_src, phi) { |
| if (phi_src->pred == pred) { |
| found = true; |
| assert(phi_src->src.is_ssa); |
| nir_ssa_def_rewrite_uses(&phi->dest.ssa, phi_src->src.ssa); |
| break; |
| } |
| } |
| assert(found); |
| } |
| } |
| |
| static bool |
| cursor_is_after_jump(nir_cursor cursor) |
| { |
| switch (cursor.option) { |
| case nir_cursor_before_instr: |
| case nir_cursor_before_block: |
| return false; |
| case nir_cursor_after_instr: |
| return cursor.instr->type == nir_instr_type_jump; |
| case nir_cursor_after_block: |
| return nir_block_ends_in_jump(cursor.block); |
| } |
| unreachable("Invalid cursor option"); |
| } |
| |
| /** Flattens if ladders leading up to a resume |
| * |
| * Given a resume_instr, this function flattens any if ladders leading to the |
| * resume instruction and deletes any code that cannot be encountered on a |
| * direct path to the resume instruction. This way we get, for the most part, |
| * straight-line control-flow up to the resume instruction. |
| * |
| * While we do this flattening, we also move any code which is in the remat |
| * set up to the top of the function or to the top of the resume portion of |
| * the current loop. We don't worry about control-flow as we do this because |
| * phis will never be in the remat set (see can_remat_instr) and so nothing |
| * control-dependent will ever need to be re-materialized. It is possible |
| * that this algorithm will preserve too many instructions by moving them to |
| * the top but we leave that for DCE to clean up. Any code not in the remat |
| * set is deleted because it's either unused in the continuation or else |
| * unspilled from a previous continuation and the unspill code is after the |
| * resume instruction. |
| * |
| * If, for instance, we have something like this: |
| * |
| * // block 0 |
| * if (cond1) { |
| * // block 1 |
| * } else { |
| * // block 2 |
| * if (cond2) { |
| * // block 3 |
| * resume; |
| * if (cond3) { |
| * // block 4 |
| * } |
| * } else { |
| * // block 5 |
| * } |
| * } |
| * |
| * then we know, because the resume instruction had to be encountered, |
| * that cond1 = false and cond2 = true and we lower as follows: |
| * |
| * // block 0 |
| * // block 2 |
| * // block 3 |
| * resume; |
| * if (cond3) { |
| * // block 4 |
| * } |
| * |
| * As you can see, the code in blocks 1 and 5 was removed because there is no |
| * path from the start of the shader to the resume instruction which executes |
| * blocks 1 or 5. Any remat code from blocks 0, 2, and 3 is preserved and |
| * moved to the top. If the resume instruction is inside a loop then we know |
| * a priori that it is of the form |
| * |
| * loop { |
| * if (resume) { |
| * // Contents containing resume_instr |
| * } else { |
| * // Second copy of contents |
| * } |
| * } |
| * |
| * In this case, we only descend into the first half of the loop. The second |
| * half is left alone as that portion is only ever executed after the resume |
| * instruction. |
| */ |
| static bool |
| flatten_resume_if_ladder(nir_builder *b, |
| nir_cf_node *parent_node, |
| struct exec_list *child_list, |
| bool child_list_contains_cursor, |
| nir_instr *resume_instr, |
| struct sized_bitset *remat) |
| { |
| nir_cf_list cf_list; |
| |
| /* If our child list contains the cursor instruction then we start out |
| * before the cursor instruction. We need to know this so that we can skip |
| * moving instructions which are already before the cursor. |
| */ |
| bool before_cursor = child_list_contains_cursor; |
| |
| nir_cf_node *resume_node = NULL; |
| foreach_list_typed_safe(nir_cf_node, child, node, child_list) { |
| switch (child->type) { |
| case nir_cf_node_block: { |
| nir_block *block = nir_cf_node_as_block(child); |
| if (b->cursor.option == nir_cursor_before_block && |
| b->cursor.block == block) { |
| assert(before_cursor); |
| before_cursor = false; |
| } |
| nir_foreach_instr_safe(instr, block) { |
| if ((b->cursor.option == nir_cursor_before_instr || |
| b->cursor.option == nir_cursor_after_instr) && |
| b->cursor.instr == instr) { |
| assert(nir_cf_node_is_first(&block->cf_node)); |
| assert(before_cursor); |
| before_cursor = false; |
| continue; |
| } |
| |
| if (instr == resume_instr) |
| goto found_resume; |
| |
| if (!before_cursor && can_remat_instr(instr, remat)) { |
| nir_instr_remove(instr); |
| nir_instr_insert(b->cursor, instr); |
| b->cursor = nir_after_instr(instr); |
| |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| BITSET_SET(remat->set, def->index); |
| } |
| } |
| if (b->cursor.option == nir_cursor_after_block && |
| b->cursor.block == block) { |
| assert(before_cursor); |
| before_cursor = false; |
| } |
| break; |
| } |
| |
| case nir_cf_node_if: { |
| assert(!before_cursor); |
| nir_if *_if = nir_cf_node_as_if(child); |
| if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->then_list, |
| false, resume_instr, remat)) { |
| resume_node = child; |
| rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), |
| nir_if_last_then_block(_if)); |
| goto found_resume; |
| } |
| |
| if (flatten_resume_if_ladder(b, &_if->cf_node, &_if->else_list, |
| false, resume_instr, remat)) { |
| resume_node = child; |
| rewrite_phis_to_pred(nir_cf_node_as_block(nir_cf_node_next(child)), |
| nir_if_last_else_block(_if)); |
| goto found_resume; |
| } |
| break; |
| } |
| |
| case nir_cf_node_loop: { |
| assert(!before_cursor); |
| nir_loop *loop = nir_cf_node_as_loop(child); |
| |
| if (cf_node_contains_block(&loop->cf_node, resume_instr->block)) { |
| /* Thanks to our loop body duplication pass, every level of loop |
| * containing the resume instruction contains exactly three nodes: |
| * two blocks and an if. We don't want to lower away this if |
| * because it's the resume selection if. The resume half is |
| * always the then_list so that's what we want to flatten. |
| */ |
| nir_block *header = nir_loop_first_block(loop); |
| nir_if *_if = nir_cf_node_as_if(nir_cf_node_next(&header->cf_node)); |
| |
| /* We want to place anything re-materialized from inside the loop |
| * at the top of the resume half of the loop. |
| */ |
| nir_builder bl; |
| nir_builder_init(&bl, b->impl); |
| bl.cursor = nir_before_cf_list(&_if->then_list); |
| |
| ASSERTED bool found = |
| flatten_resume_if_ladder(&bl, &_if->cf_node, &_if->then_list, |
| true, resume_instr, remat); |
| assert(found); |
| resume_node = child; |
| goto found_resume; |
| } else { |
| ASSERTED bool found = |
| flatten_resume_if_ladder(b, &loop->cf_node, &loop->body, |
| false, resume_instr, remat); |
| assert(!found); |
| } |
| break; |
| } |
| |
| case nir_cf_node_function: |
| unreachable("Unsupported CF node type"); |
| } |
| } |
| assert(!before_cursor); |
| |
| /* If we got here, we didn't find the resume node or instruction. */ |
| return false; |
| |
| found_resume: |
| /* If we got here then we found either the resume node or the resume |
| * instruction in this CF list. |
| */ |
| if (resume_node) { |
| /* If the resume instruction is buried inside one of our child CF |
| * nodes, resume_node now points to that child. |
| */ |
| if (resume_node->type == nir_cf_node_if) { |
| /* Thanks to the recursive call, all of the interesting contents of |
| * resume_node have been copied before the cursor. We just need to |
| * copy the stuff after resume_node. |
| */ |
| nir_cf_extract(&cf_list, nir_after_cf_node(resume_node), |
| nir_after_cf_list(child_list)); |
| } else { |
| /* The loop contains its own cursor and still has useful stuff in it. |
| * We want to move everything after and including the loop to before |
| * the cursor. |
| */ |
| assert(resume_node->type == nir_cf_node_loop); |
| nir_cf_extract(&cf_list, nir_before_cf_node(resume_node), |
| nir_after_cf_list(child_list)); |
| } |
| } else { |
| /* If we found the resume instruction in one of our blocks, grab |
| * everything after it in the entire list (not just the one block), and |
| * place it before the cursor instr. |
| */ |
| nir_cf_extract(&cf_list, nir_after_instr(resume_instr), |
| nir_after_cf_list(child_list)); |
| } |
| |
| if (cursor_is_after_jump(b->cursor)) { |
| /* If the resume instruction is in a loop, it's possible cf_list ends |
| * in a break or continue instruction, in which case we don't want to |
| * insert anything. It's also possible we have an early return if |
| * someone hasn't lowered those yet. In either case, nothing after that |
| * point executes in this context so we can delete it. |
| */ |
| nir_cf_delete(&cf_list); |
| } else { |
| b->cursor = nir_cf_reinsert(&cf_list, b->cursor); |
| } |
| |
| if (!resume_node) { |
| /* We want the resume to be the first "interesting" instruction */ |
| nir_instr_remove(resume_instr); |
| nir_instr_insert(nir_before_cf_list(&b->impl->body), resume_instr); |
| } |
| |
| /* We've copied everything interesting out of this CF list to before the |
| * cursor. Delete everything else. |
| */ |
| if (child_list_contains_cursor) { |
| nir_cf_extract(&cf_list, b->cursor, nir_after_cf_list(child_list)); |
| } else { |
| nir_cf_list_extract(&cf_list, child_list); |
| } |
| nir_cf_delete(&cf_list); |
| |
| return true; |
| } |
| |
| static bool |
| wrap_jump_instr(nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_jump) |
| return false; |
| |
| b->cursor = nir_before_instr(instr); |
| |
| nir_if *_if = nir_push_if(b, nir_imm_true(b)); |
| nir_pop_if(b, NULL); |
| |
| nir_cf_list cf_list; |
| nir_cf_extract(&cf_list, nir_before_instr(instr), nir_after_instr(instr)); |
| nir_cf_reinsert(&cf_list, nir_before_block(nir_if_first_then_block(_if))); |
| |
| return true; |
| } |
| |
| /* This pass wraps jump instructions in a dummy if block so that when |
| * flatten_resume_if_ladder() does its job, it doesn't move a jump instruction |
| * directly in front of another instruction, which the NIR control flow helpers |
| * do not allow. |
| */ |
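| /* For example (illustrative), a bare |
| * |
| *    break; |
| * |
| * becomes |
| * |
| *    if (true) { |
| *       break; |
| *    } |
| * |
| * so the jump always terminates a block of its own. |
| */ |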
| static bool |
| wrap_jumps(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, wrap_jump_instr, |
| nir_metadata_none, NULL); |
| } |
| |
| static nir_instr * |
| lower_resume(nir_shader *shader, int call_idx) |
| { |
| wrap_jumps(shader); |
| |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| nir_instr *resume_instr = find_resume_instr(impl, call_idx); |
| |
| if (duplicate_loop_bodies(impl, resume_instr)) { |
| nir_validate_shader(shader, "after duplicate_loop_bodies in " |
| "nir_lower_shader_calls"); |
| /* If we duplicated the bodies of any loops, run regs_to_ssa to get rid |
| * of all those pesky registers we just added. |
| */ |
| NIR_PASS_V(shader, nir_lower_regs_to_ssa); |
| } |
| |
| /* Re-index nir_ssa_def::index. We don't care about actual liveness in |
| * this pass, but we use the same helpers as the spilling pass, so we |
| * need to make sure the SSA indices are sane. They are used constantly |
| * to determine whether an SSA value has been added since the start of |
| * the pass. |
| */ |
| nir_index_ssa_defs(impl); |
| |
| void *mem_ctx = ralloc_context(shader); |
| |
| /* Used to track which things may have been assumed to be re-materialized |
| * by the spilling pass and which we shouldn't delete. |
| */ |
| struct sized_bitset remat = bitset_create(mem_ctx, impl->ssa_alloc); |
| |
| /* Set up a builder with a cursor at the top of the function, which we use |
| * as we extract and re-insert stuff into the CFG. |
| */ |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| b.cursor = nir_before_cf_list(&impl->body); |
| ASSERTED bool found = |
| flatten_resume_if_ladder(&b, &impl->cf_node, &impl->body, |
| true, resume_instr, &remat); |
| assert(found); |
| |
| ralloc_free(mem_ctx); |
| |
| nir_metadata_preserve(impl, nir_metadata_none); |
| |
| nir_validate_shader(shader, "after flatten_resume_if_ladder in " |
| "nir_lower_shader_calls"); |
| |
| return resume_instr; |
| } |
| |
| static void |
| replace_resume_with_halt(nir_shader *shader, nir_instr *keep) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr == keep) |
| continue; |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *resume = nir_instr_as_intrinsic(instr); |
| if (resume->intrinsic != nir_intrinsic_rt_resume) |
| continue; |
| |
| /* If this is some other resume, then we've kicked off a ray or |
| * bindless thread and we don't want to go any further in this |
| * shader. Insert a halt so that NIR will delete any instructions |
| * dominated by this call instruction including the scratch_load |
| * instructions we inserted. |
| */ |
| nir_cf_list cf_list; |
| nir_cf_extract(&cf_list, nir_after_instr(&resume->instr), |
| nir_after_block(block)); |
| nir_cf_delete(&cf_list); |
| b.cursor = nir_instr_remove(&resume->instr); |
| nir_jump(&b, nir_jump_halt); |
| break; |
| } |
| } |
| } |
| |
| struct lower_scratch_state { |
| nir_address_format address_format; |
| }; |
| |
| static bool |
| lower_stack_instr_to_scratch(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| struct lower_scratch_state *state = data; |
| |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *stack = nir_instr_as_intrinsic(instr); |
| switch (stack->intrinsic) { |
| case nir_intrinsic_load_stack: { |
| b->cursor = nir_instr_remove(instr); |
| nir_ssa_def *data, *old_data = nir_instr_ssa_def(instr); |
| |
| if (state->address_format == nir_address_format_64bit_global) { |
| nir_ssa_def *addr = nir_iadd_imm(b, |
| nir_load_scratch_base_ptr(b, 1, 64, 1), |
| nir_intrinsic_base(stack)); |
| data = nir_load_global(b, addr, |
| nir_intrinsic_align_mul(stack), |
| stack->dest.ssa.num_components, |
| stack->dest.ssa.bit_size); |
| } else { |
| assert(state->address_format == nir_address_format_32bit_offset); |
| data = nir_load_scratch(b, |
| old_data->num_components, |
| old_data->bit_size, |
| nir_imm_int(b, nir_intrinsic_base(stack)), |
| .align_mul = nir_intrinsic_align_mul(stack)); |
| } |
| nir_ssa_def_rewrite_uses(old_data, data); |
| break; |
| } |
| |
| case nir_intrinsic_store_stack: { |
| b->cursor = nir_instr_remove(instr); |
| nir_ssa_def *data = stack->src[0].ssa; |
| |
| if (state->address_format == nir_address_format_64bit_global) { |
| nir_ssa_def *addr = nir_iadd_imm(b, |
| nir_load_scratch_base_ptr(b, 1, 64, 1), |
| nir_intrinsic_base(stack)); |
| nir_store_global(b, addr, |
| nir_intrinsic_align_mul(stack), |
| data, |
| BITFIELD_MASK(data->num_components)); |
| } else { |
| assert(state->address_format == nir_address_format_32bit_offset); |
| nir_store_scratch(b, data, |
| nir_imm_int(b, nir_intrinsic_base(stack)), |
| .align_mul = nir_intrinsic_align_mul(stack), |
| .write_mask = BITFIELD_MASK(data->num_components)); |
| } |
| break; |
| } |
| |
| default: |
| return false; |
| } |
| |
| return true; |
| } |
| |
| static bool |
| nir_lower_stack_to_scratch(nir_shader *shader, |
| nir_address_format address_format) |
| { |
| struct lower_scratch_state state = { |
| .address_format = address_format, |
| }; |
| |
| return nir_shader_instructions_pass(shader, |
| lower_stack_instr_to_scratch, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| &state); |
| } |
| |
| static bool |
| opt_remove_respills_instr(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *store_intrin = nir_instr_as_intrinsic(instr); |
| if (store_intrin->intrinsic != nir_intrinsic_store_stack) |
| return false; |
| |
| nir_instr *value_instr = store_intrin->src[0].ssa->parent_instr; |
| if (value_instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *load_intrin = nir_instr_as_intrinsic(value_instr); |
| if (load_intrin->intrinsic != nir_intrinsic_load_stack) |
| return false; |
| |
| if (nir_intrinsic_base(load_intrin) != nir_intrinsic_base(store_intrin)) |
| return false; |
| |
| nir_instr_remove(&store_intrin->instr); |
| return true; |
| } |
| |
| /* After shader split, look at stack load/store operations. If we're loading |
| * and storing the same value at the same location, we can drop the store |
| * instruction. |
| */ |
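| /* For example (value IDs and offsets are hypothetical): |
| * |
| *    %v = load_stack base=32 ... |
| *    store_stack %v base=32 ...   <- removed, it would only write back the |
| *                                    bytes we just read from the same slot |
| */ |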
| static bool |
| nir_opt_remove_respills(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, |
| opt_remove_respills_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| NULL); |
| } |
| |
| static void |
| add_use_mask(struct hash_table_u64 *offset_to_mask, |
| unsigned offset, unsigned mask) |
| { |
| uintptr_t old_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(offset_to_mask, offset); |
| |
| _mesa_hash_table_u64_insert(offset_to_mask, offset, |
| (void *)(uintptr_t)(old_mask | mask)); |
| } |
| |
| /* When splitting the shaders, we might have inserted stores & loads of vec4s, |
| * because a live value has 4 components. But sometimes only some components |
| * of that vec4 are used after the scratch load. This pass removes the unused |
| * components of the scratch loads/stores. |
| */ |
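| /* For example (illustrative), if only .x and .z of a spilled vec4 are ever |
| * read after the fill: |
| * |
| *    store_stack %v.xyzw ...            store_stack %v.xz ... |
| *    ...                        ->      ... |
| *    %f = load_stack vec4 ...           %f = load_stack vec2 ... |
| *    use %f.x, %f.z                     use %f.x, %f.y   (reswizzled) |
| */ |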
| static bool |
| nir_opt_trim_stack_values(nir_shader *shader) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| struct hash_table_u64 *value_id_to_mask = _mesa_hash_table_u64_create(NULL); |
| bool progress = false; |
| |
| /* Find all the loads and how their value is being used */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| const unsigned mask = |
| nir_ssa_def_components_read(nir_instr_ssa_def(instr)); |
| add_use_mask(value_id_to_mask, value_id, mask); |
| } |
| } |
| |
| /* For each store, if it stores more than is being used, trim it. |
| * Otherwise, remove it from the hash table. |
| */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_store_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| const unsigned write_mask = nir_intrinsic_write_mask(intrin); |
| const unsigned read_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(value_id_to_mask, value_id); |
| |
| /* Already removed from the table, nothing to do */ |
| if (read_mask == 0) |
| continue; |
| |
| /* Matching read/write mask, nothing to do, remove from the table. */ |
| if (write_mask == read_mask) { |
| _mesa_hash_table_u64_remove(value_id_to_mask, value_id); |
| continue; |
| } |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| b.cursor = nir_before_instr(instr); |
| |
| nir_ssa_def *value = nir_channels(&b, intrin->src[0].ssa, read_mask); |
| nir_instr_rewrite_src_ssa(instr, &intrin->src[0], value); |
| |
| intrin->num_components = util_bitcount(read_mask); |
| nir_intrinsic_set_write_mask(intrin, (1u << intrin->num_components) - 1); |
| |
| progress = true; |
| } |
| } |
| |
| /* For each load remaining in the hash table (only the ones we changed the |
| * number of components of), apply trimming/reswizzling. |
| */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| |
| unsigned read_mask = (uintptr_t) |
| _mesa_hash_table_u64_search(value_id_to_mask, value_id); |
| if (read_mask == 0) |
| continue; |
| |
| unsigned swiz_map[NIR_MAX_VEC_COMPONENTS] = { 0, }; |
| unsigned swiz_count = 0; |
| u_foreach_bit(idx, read_mask) |
| swiz_map[idx] = swiz_count++; |
| |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| |
| nir_foreach_use_safe(use_src, def) { |
| if (use_src->parent_instr->type == nir_instr_type_alu) { |
| nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr); |
| nir_alu_src *alu_src = exec_node_data(nir_alu_src, use_src, src); |
| |
| unsigned write_mask = alu->dest.write_mask; |
| u_foreach_bit(idx, write_mask) |
| alu_src->swizzle[idx] = swiz_map[alu_src->swizzle[idx]]; |
| } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *use_intrin = |
| nir_instr_as_intrinsic(use_src->parent_instr); |
| assert(nir_intrinsic_has_write_mask(use_intrin)); |
| unsigned write_mask = nir_intrinsic_write_mask(use_intrin); |
| unsigned new_write_mask = 0; |
| u_foreach_bit(idx, write_mask) |
| new_write_mask |= 1 << swiz_map[idx]; |
| nir_intrinsic_set_write_mask(use_intrin, new_write_mask); |
| } else { |
| unreachable("invalid instruction type"); |
| } |
| } |
| |
| intrin->dest.ssa.num_components = intrin->num_components = swiz_count; |
| |
| progress = true; |
| } |
| } |
| |
| nir_metadata_preserve(impl, |
| progress ? |
| (nir_metadata_dominance | |
| nir_metadata_block_index | |
| nir_metadata_loop_analysis) : |
| nir_metadata_all); |
| |
| _mesa_hash_table_u64_destroy(value_id_to_mask); |
| |
| return progress; |
| } |
| |
| struct scratch_item { |
| unsigned old_offset; |
| unsigned new_offset; |
| unsigned bit_size; |
| unsigned num_components; |
| unsigned value; |
| unsigned call_idx; |
| }; |
| |
| static int |
| sort_scratch_item_by_size_and_value_id(const void *_item1, const void *_item2) |
| { |
| const struct scratch_item *item1 = _item1; |
| const struct scratch_item *item2 = _item2; |
| |
| /* By ascending value_id */ |
| if (item1->bit_size == item2->bit_size) |
| return (int) item1->value - (int) item2->value; |
| |
| /* By descending size */ |
| return (int) item2->bit_size - (int) item1->bit_size; |
| } |
| |
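| /* For each call, sort the spilled values by descending bit size (then by |
| * value_id) and pack them tightly on the stack. For instance (sizes are |
| * hypothetical), a 64-bit, a 32-bit and a 16-bit value packed from offset 0 |
| * would end up as |
| * |
| *    [0..7] 64-bit   [8..11] 32-bit   [12..13] 16-bit |
| * |
| * which avoids the alignment padding a source-order layout might need. |
| */ |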
| static bool |
| nir_opt_sort_and_pack_stack(nir_shader *shader, |
| unsigned start_call_scratch, |
| unsigned stack_alignment, |
| unsigned num_calls) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| void *mem_ctx = ralloc_context(NULL); |
| |
| struct hash_table_u64 *value_id_to_item = |
| _mesa_hash_table_u64_create(mem_ctx); |
| struct util_dynarray ops; |
| util_dynarray_init(&ops, mem_ctx); |
| |
| for (unsigned call_idx = 0; call_idx < num_calls; call_idx++) { |
| _mesa_hash_table_u64_clear(value_id_to_item); |
| util_dynarray_clear(&ops); |
| |
| /* Find all the stack loads and their offsets. */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| |
| const unsigned value_id = nir_intrinsic_value_id(intrin); |
| nir_ssa_def *def = nir_instr_ssa_def(instr); |
| |
| assert(_mesa_hash_table_u64_search(value_id_to_item, |
| value_id) == NULL); |
| |
| struct scratch_item item = { |
| .old_offset = nir_intrinsic_base(intrin), |
| .bit_size = def->bit_size, |
| .num_components = def->num_components, |
| .value = value_id, |
| }; |
| |
| util_dynarray_append(&ops, struct scratch_item, item); |
| _mesa_hash_table_u64_insert(value_id_to_item, value_id, (void *)(uintptr_t)true); |
| } |
| } |
| |
| /* Sort scratch items by descending bit size, then ascending value_id. */ |
| qsort(util_dynarray_begin(&ops), |
| util_dynarray_num_elements(&ops, struct scratch_item), |
| sizeof(struct scratch_item), |
| sort_scratch_item_by_size_and_value_id); |
| |
| |
| /* Reorder things on the stack */ |
| _mesa_hash_table_u64_clear(value_id_to_item); |
| |
| unsigned scratch_size = start_call_scratch; |
| util_dynarray_foreach(&ops, struct scratch_item, item) { |
| item->new_offset = ALIGN(scratch_size, item->bit_size / 8); |
| scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8; |
| _mesa_hash_table_u64_insert(value_id_to_item, item->value, item); |
| } |
| shader->scratch_size = ALIGN(scratch_size, stack_alignment); |
| |
| /* Update offsets in the instructions */ |
| nir_foreach_block_safe(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_stack: |
| case nir_intrinsic_store_stack: { |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| |
| struct scratch_item *item = |
| _mesa_hash_table_u64_search(value_id_to_item, |
| nir_intrinsic_value_id(intrin)); |
| assert(item); |
| |
| nir_intrinsic_set_base(intrin, item->new_offset); |
| break; |
| } |
| |
| case nir_intrinsic_rt_trace_ray: |
| case nir_intrinsic_rt_execute_callable: |
| case nir_intrinsic_rt_resume: |
| if (nir_intrinsic_call_idx(intrin) != call_idx) |
| continue; |
| nir_intrinsic_set_stack_size(intrin, shader->scratch_size); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| ralloc_free(mem_ctx); |
| |
| nir_shader_preserve_all_metadata(shader); |
| |
| return true; |
| } |
| |
| static unsigned |
| nir_block_loop_depth(nir_block *block) |
| { |
| nir_cf_node *node = &block->cf_node; |
| unsigned loop_depth = 0; |
| |
| while (node != NULL) { |
| if (node->type == nir_cf_node_loop) |
| loop_depth++; |
| node = node->parent; |
| } |
| |
| return loop_depth; |
| } |
| |
| /* Find the last block dominating all the uses of a SSA value. */ |
| static nir_block * |
| find_last_dominant_use_block(nir_function_impl *impl, nir_ssa_def *value) |
| { |
| nir_block *old_block = value->parent_instr->block; |
| unsigned old_block_loop_depth = nir_block_loop_depth(old_block); |
| |
| nir_foreach_block_reverse_safe(block, impl) { |
| bool fits = true; |
| |
| /* We reached the block where the value currently lives, keep it there. */ |
| if (block == old_block) |
| return block; |
| |
| /* Don't move instructions deeper into loops, as this would generate more |
| * memory traffic. |
| */ |
| unsigned block_loop_depth = nir_block_loop_depth(block); |
| if (block_loop_depth > old_block_loop_depth) |
| continue; |
| |
| nir_foreach_if_use(src, value) { |
| nir_block *block_before_if = |
| nir_cf_node_as_block(nir_cf_node_prev(&src->parent_if->cf_node)); |
| if (!nir_block_dominates(block, block_before_if)) { |
| fits = false; |
| break; |
| } |
| } |
| if (!fits) |
| continue; |
| |
| nir_foreach_use(src, value) { |
| if (src->parent_instr->type == nir_instr_type_phi && |
| block == src->parent_instr->block) { |
| fits = false; |
| break; |
| } |
| |
| if (!nir_block_dominates(block, src->parent_instr->block)) { |
| fits = false; |
| break; |
| } |
| } |
| if (!fits) |
| continue; |
| |
| return block; |
| } |
| unreachable("Cannot find block"); |
| } |
| |
| /* Put the scratch loads in the branches where they're needed. */ |
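| /* For example (illustrative), a fill whose value is only used in one branch |
| * |
| *    %v = load_stack ...               if (cond) { |
| *    if (cond) {               ->         %v = load_stack ... |
| *       use %v                            use %v |
| *    }                                 } |
| * |
| * is sunk into that branch so the memory access only happens when the value |
| * is actually needed. |
| */ |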
| static bool |
| nir_opt_stack_loads(nir_shader *shader) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(func, shader) { |
| if (!func->impl) |
| continue; |
| |
| nir_metadata_require(func->impl, nir_metadata_dominance | |
| nir_metadata_block_index); |
| |
| bool func_progress = false; |
| nir_foreach_block_safe(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack) |
| continue; |
| |
| nir_ssa_def *value = &intrin->dest.ssa; |
| nir_block *new_block = find_last_dominant_use_block(func->impl, value); |
| if (new_block == block) |
| continue; |
| |
| /* Move the scratch load into the new block, after the phis. */ |
| nir_instr_remove(instr); |
| nir_instr_insert(nir_before_block_after_phis(new_block), instr); |
| |
| func_progress = true; |
| } |
| } |
| |
| nir_metadata_preserve(func->impl, |
| func_progress ? (nir_metadata_block_index | |
| nir_metadata_dominance | |
| nir_metadata_loop_analysis) : |
| nir_metadata_all); |
| |
| progress |= func_progress; |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| split_stack_components_instr(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| if (intrin->intrinsic != nir_intrinsic_load_stack && |
| intrin->intrinsic != nir_intrinsic_store_stack) |
| return false; |
| |
| if (intrin->intrinsic == nir_intrinsic_load_stack && |
| intrin->dest.ssa.num_components == 1) |
| return false; |
| |
| if (intrin->intrinsic == nir_intrinsic_store_stack && |
| intrin->src[0].ssa->num_components == 1) |
| return false; |
| |
| b->cursor = nir_before_instr(instr); |
| |
| if (intrin->intrinsic == nir_intrinsic_load_stack) { |
| nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = { 0, }; |
| for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) { |
| components[c] = nir_load_stack(b, 1, intrin->dest.ssa.bit_size, |
| .base = nir_intrinsic_base(intrin) + |
| c * intrin->dest.ssa.bit_size / 8, |
| .call_idx = nir_intrinsic_call_idx(intrin), |
| .value_id = nir_intrinsic_value_id(intrin), |
| .align_mul = nir_intrinsic_align_mul(intrin)); |
| } |
| |
| nir_ssa_def_rewrite_uses(&intrin->dest.ssa, |
| nir_vec(b, components, |
| intrin->dest.ssa.num_components)); |
| } else { |
| assert(intrin->intrinsic == nir_intrinsic_store_stack); |
| for (unsigned c = 0; c < intrin->src[0].ssa->num_components; c++) { |
| nir_store_stack(b, nir_channel(b, intrin->src[0].ssa, c), |
| .base = nir_intrinsic_base(intrin) + |
| c * intrin->src[0].ssa->bit_size / 8, |
| .call_idx = nir_intrinsic_call_idx(intrin), |
| .align_mul = nir_intrinsic_align_mul(intrin), |
| .value_id = nir_intrinsic_value_id(intrin), |
| .write_mask = 0x1); |
| } |
| } |
| |
| nir_instr_remove(instr); |
| |
| return true; |
| } |
| |
| /* Break the load_stack/store_stack intrinsics into single components. This |
| * helps the vectorizer pack components: for example, a vec3 32-bit |
| * store_stack at base=16 becomes three scalar store_stacks at bases 16, 20 |
| * and 24, which the vectorizer can later recombine with neighboring slots. |
| */ |
| static bool |
| nir_split_stack_components(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, |
| split_stack_components_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| NULL); |
| } |
| |
| struct stack_op_vectorizer_state { |
| nir_should_vectorize_mem_func driver_callback; |
| void *driver_data; |
| }; |
| |
| static bool |
| should_vectorize(unsigned align_mul, |
| unsigned align_offset, |
| unsigned bit_size, |
| unsigned num_components, |
| nir_intrinsic_instr *low, nir_intrinsic_instr *high, |
| void *data) |
| { |
| /* We only care about these two intrinsics. */ |
| if ((low->intrinsic != nir_intrinsic_load_stack && |
| low->intrinsic != nir_intrinsic_store_stack) || |
| (high->intrinsic != nir_intrinsic_load_stack && |
| high->intrinsic != nir_intrinsic_store_stack)) |
| return false; |
| |
| struct stack_op_vectorizer_state *state = data; |
| |
| return state->driver_callback(align_mul, align_offset, |
| bit_size, num_components, |
| low, high, state->driver_data); |
| } |
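| |
| /* A minimal sketch of a driver-provided callback, for illustration only |
| * (the function name and the heuristic are hypothetical, not part of this |
| * file). It uses the same parameter list as should_vectorize() above and |
| * simply requires natural alignment and at most four components: |
| * |
| *    static bool |
| *    driver_should_vectorize_stack(unsigned align_mul, unsigned align_offset, |
| *                                  unsigned bit_size, unsigned num_components, |
| *                                  nir_intrinsic_instr *low, |
| *                                  nir_intrinsic_instr *high, |
| *                                  void *data) |
| *    { |
| *       if (num_components > 4) |
| *          return false; |
| *       return align_mul >= bit_size / 8; |
| *    } |
| * |
| * A driver would store such a callback in |
| * nir_lower_shader_calls_options::vectorizer_callback (see below). |
| */ |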
| |
| /** Lower shader call instructions to split shaders. |
| * |
| * Shader calls can be split into an initial shader and a series of "resume" |
| * shaders. When the shader is first invoked, it is the initial shader which |
| * is executed. At any point in the initial shader or any one of the resume |
| * shaders, a shader call operation may be performed. The possible shader call |
| * operations are: |
| * |
| * - trace_ray |
| * - report_ray_intersection |
| * - execute_callable |
| * |
| * When a shader call operation is performed, we push all live values to the |
| * stack, call rt_trace_ray/rt_execute_callable and then kill the shader. Once |
| * the operation we invoked is complete, a callee shader will return execution |
| * to the respective resume shader. The resume shader pops the contents off |
| * the stack and picks up where the calling shader left off. |
| * |
| * Stack management is assumed to be done after this pass. Call |
| * instructions and their resumes get annotated with stack information that |
| * should be enough for the backend to implement proper stack management. |
| */ |
| bool |
| nir_lower_shader_calls(nir_shader *shader, |
| const nir_lower_shader_calls_options *options, |
| nir_shader ***resume_shaders_out, |
| uint32_t *num_resume_shaders_out, |
| void *mem_ctx) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(shader); |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| int num_calls = 0; |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr_is_shader_call(instr)) |
| num_calls++; |
| } |
| } |
| |
| if (num_calls == 0) { |
| nir_shader_preserve_all_metadata(shader); |
| *num_resume_shaders_out = 0; |
| return false; |
| } |
| |
| /* Some intrinsics not only can't be re-materialized but aren't preserved |
| * when moving to the continuation shader. We have to move them to the top |
| * to ensure they get spilled as needed. |
| */ |
| { |
| bool progress = false; |
| NIR_PASS(progress, shader, move_system_values_to_top); |
| if (progress) |
| NIR_PASS(progress, shader, nir_opt_cse); |
| } |
| |
| /* Save the start point of the call stack in scratch */ |
| unsigned start_call_scratch = shader->scratch_size; |
| |
| NIR_PASS_V(shader, spill_ssa_defs_and_lower_shader_calls, |
| num_calls, options->stack_alignment); |
| |
| NIR_PASS_V(shader, nir_opt_remove_phis); |
| |
| NIR_PASS_V(shader, nir_opt_trim_stack_values); |
| NIR_PASS_V(shader, nir_opt_sort_and_pack_stack, |
| start_call_scratch, options->stack_alignment, num_calls); |
| |
| /* Make N copies of our shader */ |
| nir_shader **resume_shaders = ralloc_array(mem_ctx, nir_shader *, num_calls); |
| for (unsigned i = 0; i < num_calls; i++) { |
| resume_shaders[i] = nir_shader_clone(mem_ctx, shader); |
| |
| /* Give them a recognizable name */ |
| resume_shaders[i]->info.name = |
| ralloc_asprintf(mem_ctx, "%s%sresume_%u", |
| shader->info.name ? shader->info.name : "", |
| shader->info.name ? "-" : "", |
| i); |
| } |
| |
| replace_resume_with_halt(shader, NULL); |
| nir_opt_dce(shader); |
| nir_opt_dead_cf(shader); |
| for (unsigned i = 0; i < num_calls; i++) { |
| nir_instr *resume_instr = lower_resume(resume_shaders[i], i); |
| replace_resume_with_halt(resume_shaders[i], resume_instr); |
| /* Remove the dummy blocks added by flatten_resume_if_ladder() */ |
| nir_opt_if(resume_shaders[i], nir_opt_if_optimize_phi_true_false); |
| nir_opt_dce(resume_shaders[i]); |
| nir_opt_dead_cf(resume_shaders[i]); |
| nir_opt_remove_phis(resume_shaders[i]); |
| } |
| |
| for (unsigned i = 0; i < num_calls; i++) |
| NIR_PASS_V(resume_shaders[i], nir_opt_remove_respills); |
| |
| if (options->localized_loads) { |
| /* Once loads have been combined we can try to put them closer to where |
| * they're needed. |
| */ |
| for (unsigned i = 0; i < num_calls; i++) |
| NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads); |
| } |
| |
| struct stack_op_vectorizer_state vectorizer_state = { |
| .driver_callback = options->vectorizer_callback, |
| .driver_data = options->vectorizer_data, |
| }; |
| nir_load_store_vectorize_options vect_opts = { |
| .modes = nir_var_shader_temp, |
| .callback = should_vectorize, |
| .cb_data = &vectorizer_state, |
| }; |
| |
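| /* Scalarize the stack accesses, then let the load/store vectorizer repack |
| * adjacent slots into wider accesses, using the driver callback to decide |
| * what is profitable. |
| */ |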
| if (options->vectorizer_callback != NULL) { |
| NIR_PASS_V(shader, nir_split_stack_components); |
| NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts); |
| } |
| NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format); |
| nir_opt_cse(shader); |
| for (unsigned i = 0; i < num_calls; i++) { |
| if (options->vectorizer_callback != NULL) { |
| NIR_PASS_V(resume_shaders[i], nir_split_stack_components); |
| NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts); |
| } |
| NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch, |
| options->address_format); |
| nir_opt_cse(resume_shaders[i]); |
| } |
| |
| *resume_shaders_out = resume_shaders; |
| *num_resume_shaders_out = num_calls; |
| |
| return true; |
| } |
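| |
| /* A minimal usage sketch, for illustration only. raygen_shader, the option |
| * values and driver_should_vectorize_stack() are hypothetical; a real driver |
| * picks the address format, stack alignment and vectorization policy that |
| * match its hardware: |
| * |
| *    nir_lower_shader_calls_options opts = { |
| *       .address_format = nir_address_format_64bit_global, |
| *       .stack_alignment = 16, |
| *       .localized_loads = true, |
| *       .vectorizer_callback = driver_should_vectorize_stack, |
| *       .vectorizer_data = NULL, |
| *    }; |
| * |
| *    nir_shader **resume_shaders = NULL; |
| *    uint32_t num_resume_shaders = 0; |
| *    void *mem_ctx = ralloc_context(NULL); |
| * |
| *    if (nir_lower_shader_calls(raygen_shader, &opts, |
| *                               &resume_shaders, &num_resume_shaders, |
| *                               mem_ctx)) { |
| *       assert(num_resume_shaders > 0); |
| *    } |
| * |
| * On success, the driver compiles the lowered raygen_shader plus each of the |
| * num_resume_shaders entries of resume_shaders as separate shaders, and |
| * frees mem_ctx once it no longer needs the resume clones. |
| */ |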