| /* |
| * Copyright © 2019 Broadcom |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir_schedule.h" |
| #include "util/dag.h" |
| #include "util/u_dynarray.h" |
| |
| /** @file |
| * |
| * Implements basic-block-level prepass instruction scheduling in NIR to |
| * manage register pressure. |
| * |
| * This is based on the Goodman/Hsu paper (1988, cached copy at |
| * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We |
* build the data dependency graph (DDG) for NIR (mostly from the NIR def/use
* chains of SSA instructions, plus some edges ordering register writes
* vs. reads, and some more ordering intrinsics). Then we repeatedly pick
* heads off of the DDG using their heuristic and emit the NIR instructions
* back into the block in their new order.
| * |
* The hard case for prepass scheduling on GPUs seems to always be consuming
* texture/UBO results. The register pressure heuristic doesn't want to pick
* an instr that starts consuming texture results, because that instr usually
* won't be the result's only user, so scheduling it increases pressure.
| * |
* If you always force consumption of tex results, then in a case where a
* single sample is used for many outputs, you'll end up picking every other
* user and increasing register pressure. The partially_evaluated_path flag
| * helps tremendously, in that if you happen for whatever reason to pick a |
| * texture sample's output, then you'll try to finish off that sample. Future |
| * work may include doing some local search before locking in a choice, to try |
| * to more reliably find the case where just a few choices going against the |
| * heuristic can manage to free the whole vector. |
| */ |
| |
| static bool debug; |
| |
| /** |
| * Represents a node in the DDG for a NIR instruction. |
| */ |
| typedef struct { |
| struct dag_node dag; /* must be first for our u_dynarray_foreach */ |
| nir_instr *instr; |
| bool partially_evaluated_path; |
| |
/* Rough estimate of the delay between starting this instruction and
* its results being available.
*
* Accuracy is not too important, given that we're prepass scheduling here
* and just trying to stretch out the live intervals of expensive
* instructions, reducing the excess dependencies a register allocator
* would otherwise introduce.
| */ |
| uint32_t delay; |
| |
| /* Cost of the maximum-delay path from this node to the leaves. */ |
| uint32_t max_delay; |
| |
| /* scoreboard->time value when this instruction can be scheduled without |
| * any stalls expected. |
| */ |
| uint32_t ready_time; |
| } nir_schedule_node; |
| |
| typedef struct { |
| struct dag *dag; |
| |
| nir_shader *shader; |
| |
/* Mapping from nir_register * or nir_ssa_def * to a struct set of the
* not-yet-scheduled instructions that still use that value.
| */ |
| struct hash_table *remaining_uses; |
| |
| /* Map from nir_instr to nir_schedule_node * */ |
| struct hash_table *instr_map; |
| |
| /* Set of nir_register * or nir_ssa_def * that have had any instruction |
| * scheduled on them. |
| */ |
| struct set *live_values; |
| |
/* An abstract approximation of the number of nir_schedule_node->delay
| * units since the start of the shader. |
| */ |
| uint32_t time; |
| |
| /* Number of channels currently used by the NIR instructions that have been |
| * scheduled. |
| */ |
| int pressure; |
| |
| /* Options specified by the backend */ |
| const nir_schedule_options *options; |
| } nir_schedule_scoreboard; |
| |
| /* When walking the instructions in reverse, we use this flag to swap |
| * before/after in add_dep(). |
| */ |
| enum direction { F, R }; |
| |
| struct nir_schedule_class_dep { |
| int klass; |
| nir_schedule_node *node; |
| struct nir_schedule_class_dep *next; |
| }; |
| |
| typedef struct { |
| nir_schedule_scoreboard *scoreboard; |
| |
| /* Map from nir_register to nir_schedule_node * */ |
| struct hash_table *reg_map; |
| |
/* Scheduler nodes for the last instruction involved in each class of
* dependency.
| */ |
| nir_schedule_node *load_input; |
| nir_schedule_node *store_shared; |
| nir_schedule_node *unknown_intrinsic; |
| nir_schedule_node *discard; |
| nir_schedule_node *jump; |
| |
| struct nir_schedule_class_dep *class_deps; |
| |
| enum direction dir; |
| } nir_deps_state; |
| |
| static void * |
| _mesa_hash_table_search_data(struct hash_table *ht, void *key) |
| { |
| struct hash_entry *entry = _mesa_hash_table_search(ht, key); |
| if (!entry) |
| return NULL; |
| return entry->data; |
| } |
| |
| static nir_schedule_node * |
| nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr) |
| { |
| return _mesa_hash_table_search_data(instr_map, instr); |
| } |
| |
| static struct set * |
| nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src) |
| { |
| if (src->is_ssa) { |
| return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa); |
| } else { |
| return _mesa_hash_table_search_data(scoreboard->remaining_uses, |
| src->reg.reg); |
| } |
| } |
| |
| static int |
| nir_schedule_def_pressure(nir_ssa_def *def) |
| { |
| return def->num_components; |
| } |
| |
| static int |
| nir_schedule_src_pressure(nir_src *src) |
| { |
| if (src->is_ssa) |
| return nir_schedule_def_pressure(src->ssa); |
| else |
| return src->reg.reg->num_components; |
| } |
| |
| static int |
| nir_schedule_dest_pressure(nir_dest *dest) |
| { |
| if (dest->is_ssa) |
| return nir_schedule_def_pressure(&dest->ssa); |
| else |
| return dest->reg.reg->num_components; |
| } |
| |
| /** |
| * Adds a dependency such that @after must appear in the final program after |
| * @before. |
| * |
* In the forward direction, we add @before as the parent of @after, so that
* DAG heads are the instructions whose dependencies have all been scheduled
* and we make our scheduling decisions top to bottom.
| */ |
| static void |
| add_dep(nir_deps_state *state, |
| nir_schedule_node *before, |
| nir_schedule_node *after) |
| { |
| if (!before || !after) |
| return; |
| |
| assert(before != after); |
| |
| if (state->dir == F) |
| dag_add_edge(&before->dag, &after->dag, NULL); |
| else |
| dag_add_edge(&after->dag, &before->dag, NULL); |
| } |
| |
| static void |
| add_read_dep(nir_deps_state *state, |
| nir_schedule_node *before, |
| nir_schedule_node *after) |
| { |
| add_dep(state, before, after); |
| } |
| |
| static void |
| add_write_dep(nir_deps_state *state, |
| nir_schedule_node **before, |
| nir_schedule_node *after) |
| { |
| add_dep(state, *before, after); |
| *before = after; |
| } |
| |
| static bool |
| nir_schedule_reg_src_deps(nir_src *src, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| |
| if (src->is_ssa) |
| return true; |
| |
| struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, |
| src->reg.reg); |
| if (!entry) |
| return true; |
| nir_schedule_node *dst_n = entry->data; |
| |
| nir_schedule_node *src_n = |
| nir_schedule_get_node(state->scoreboard->instr_map, |
| src->parent_instr); |
| |
| add_dep(state, dst_n, src_n); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| |
| if (dest->is_ssa) |
| return true; |
| |
| nir_schedule_node *dest_n = |
| nir_schedule_get_node(state->scoreboard->instr_map, |
| dest->reg.parent_instr); |
| |
| struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, |
| dest->reg.reg); |
| if (!entry) { |
| _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n); |
| return true; |
| } |
| nir_schedule_node **before = (nir_schedule_node **)&entry->data; |
| |
| add_write_dep(state, before, dest_n); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| struct hash_table *instr_map = state->scoreboard->instr_map; |
| nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr); |
| |
| nir_foreach_use(src, def) { |
| nir_schedule_node *use_n = nir_schedule_get_node(instr_map, |
| src->parent_instr); |
| |
| add_read_dep(state, def_n, use_n); |
| } |
| |
| return true; |
| } |
| |
| static struct nir_schedule_class_dep * |
| nir_schedule_get_class_dep(nir_deps_state *state, |
| int klass) |
| { |
| for (struct nir_schedule_class_dep *class_dep = state->class_deps; |
| class_dep != NULL; |
| class_dep = class_dep->next) { |
| if (class_dep->klass == klass) |
| return class_dep; |
| } |
| |
| struct nir_schedule_class_dep *class_dep = |
| ralloc(state->reg_map, struct nir_schedule_class_dep); |
| |
| class_dep->klass = klass; |
| class_dep->node = NULL; |
| class_dep->next = state->class_deps; |
| |
| state->class_deps = class_dep; |
| |
| return class_dep; |
| } |
| |
| static void |
| nir_schedule_intrinsic_deps(nir_deps_state *state, |
| nir_intrinsic_instr *instr) |
| { |
| nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map, |
| &instr->instr); |
| const nir_schedule_options *options = state->scoreboard->options; |
| nir_schedule_dependency dep; |
| |
| if (options->intrinsic_cb && |
| options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) { |
| struct nir_schedule_class_dep *class_dep = |
| nir_schedule_get_class_dep(state, dep.klass); |
| |
| switch (dep.type) { |
| case NIR_SCHEDULE_READ_DEPENDENCY: |
| add_read_dep(state, class_dep->node, n); |
| break; |
| case NIR_SCHEDULE_WRITE_DEPENDENCY: |
| add_write_dep(state, &class_dep->node, n); |
| break; |
| } |
| } |
| |
| switch (instr->intrinsic) { |
| case nir_intrinsic_load_uniform: |
| case nir_intrinsic_load_ubo: |
| case nir_intrinsic_load_front_face: |
| break; |
| |
| case nir_intrinsic_discard: |
| case nir_intrinsic_discard_if: |
/* We are adding two dependencies:
*
* * An individual one that we could use to add a read_dep while handling
*   nir_instr_type_tex
*
* * One on the unknown_intrinsic set, as we want discards to be
*   serialized in the same order relative to intervening stores or
*   atomic accesses to SSBOs and images
*/
| add_write_dep(state, &state->discard, n); |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| |
| case nir_intrinsic_store_output: |
| /* For some hardware and stages, output stores affect the same shared |
| * memory as input loads. |
| */ |
| if ((state->scoreboard->options->stages_with_shared_io_memory & |
| (1 << state->scoreboard->shader->info.stage))) |
| add_write_dep(state, &state->load_input, n); |
| |
| /* Make sure that preceding discards stay before the store_output */ |
| add_read_dep(state, state->discard, n); |
| |
| break; |
| |
| case nir_intrinsic_load_input: |
| case nir_intrinsic_load_per_vertex_input: |
| add_read_dep(state, state->load_input, n); |
| break; |
| |
| case nir_intrinsic_load_shared: |
/* Don't move load_shared beyond a following store_shared, as that could
* change the value being loaded
*/
| add_read_dep(state, state->store_shared, n); |
| break; |
| |
| case nir_intrinsic_store_shared: |
| add_write_dep(state, &state->store_shared, n); |
| break; |
| |
| case nir_intrinsic_control_barrier: |
| case nir_intrinsic_memory_barrier_shared: |
| add_write_dep(state, &state->store_shared, n); |
| |
| /* Serialize against ssbos/atomics/etc. */ |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| |
| default: |
| /* Attempt to handle other intrinsics that we haven't individually |
| * categorized by serializing them in the same order relative to each |
| * other. |
| */ |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| } |
| } |
| |
| /** |
| * Common code for dependencies that need to be tracked both forward and |
| * backward. |
| * |
| * This is for things like "all reads of r4 have to happen between the r4 |
| * writes that surround them". |
| */ |
| static void |
| nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n) |
| { |
| nir_instr *instr = n->instr; |
| |
| /* For NIR SSA defs, we only need to do a single pass of making the uses |
| * depend on the def. |
| */ |
| if (state->dir == F) |
| nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state); |
| |
| /* For NIR regs, track the last writer in the scheduler state so that we |
| * can keep the writes in order and let reads get reordered only between |
| * each write. |
| */ |
| nir_foreach_src(instr, nir_schedule_reg_src_deps, state); |
| |
| nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state); |
| |
| /* Make sure any other instructions keep their positions relative to |
| * jumps. |
| */ |
| if (instr->type != nir_instr_type_jump) |
| add_read_dep(state, state->jump, n); |
| |
| switch (instr->type) { |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| case nir_instr_type_alu: |
| case nir_instr_type_deref: |
| break; |
| |
| case nir_instr_type_tex: |
| /* Don't move texture ops before a discard, as that could increase |
| * memory bandwidth for reading the discarded samples. |
| */ |
| add_read_dep(state, state->discard, n); |
| break; |
| |
| case nir_instr_type_jump: |
| add_write_dep(state, &state->jump, n); |
| break; |
| |
| case nir_instr_type_call: |
| unreachable("Calls should have been lowered"); |
| break; |
| |
| case nir_instr_type_parallel_copy: |
| unreachable("Parallel copies should have been lowered"); |
| break; |
| |
| case nir_instr_type_phi: |
| unreachable("nir_schedule() should be called after lowering from SSA"); |
| break; |
| |
| case nir_instr_type_intrinsic: |
| nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr)); |
| break; |
| } |
| } |
| |
| static void |
| calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| nir_deps_state state = { |
| .scoreboard = scoreboard, |
| .dir = F, |
| .reg_map = _mesa_pointer_hash_table_create(NULL), |
| }; |
| |
| nir_foreach_instr(instr, block) { |
| nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, |
| instr); |
| nir_schedule_calculate_deps(&state, node); |
| } |
| |
| ralloc_free(state.reg_map); |
| } |
| |
| static void |
| calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| nir_deps_state state = { |
| .scoreboard = scoreboard, |
| .dir = R, |
| .reg_map = _mesa_pointer_hash_table_create(NULL), |
| }; |
| |
| nir_foreach_instr_reverse(instr, block) { |
| nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, |
| instr); |
| nir_schedule_calculate_deps(&state, node); |
| } |
| |
| ralloc_free(state.reg_map); |
| } |
| |
| typedef struct { |
| nir_schedule_scoreboard *scoreboard; |
| int regs_freed; |
| } nir_schedule_regs_freed_state; |
| |
| static bool |
| nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| nir_schedule_scoreboard *scoreboard = state->scoreboard; |
| struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); |
| |
| if (remaining_uses->entries == 1 && |
| _mesa_set_search(remaining_uses, src->parent_instr)) { |
| state->regs_freed += nir_schedule_src_pressure(src); |
| } |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| |
| state->regs_freed -= nir_schedule_def_pressure(def); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| nir_schedule_scoreboard *scoreboard = state->scoreboard; |
| |
| if (dest->is_ssa) |
| return true; |
| |
| nir_register *reg = dest->reg.reg; |
| |
| /* Only the first def of a reg counts against register pressure. */ |
| if (!_mesa_set_search(scoreboard->live_values, reg)) |
| state->regs_freed -= nir_schedule_dest_pressure(dest); |
| |
| return true; |
| } |
| |
| static int |
| nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n) |
| { |
| nir_schedule_regs_freed_state state = { |
| .scoreboard = scoreboard, |
| }; |
| |
| nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state); |
| |
| nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state); |
| |
| nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state); |
| |
| return state.regs_freed; |
| } |
| |
| /** |
* Chooses the instruction that minimizes register pressure. This should only
* be used as a fallback when regular scheduling produces a shader whose
* register allocation fails.
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
/* Find the leader in the ready (shouldn't-stall) set with the minimum
| * cost. |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (scoreboard->time < n->ready_time) |
| continue; |
| |
| if (!chosen || chosen->max_delay > n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready fallback): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Otherwise, choose the leader with the minimum cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay > n->max_delay) |
| chosen = n; |
| } |
| if (debug) { |
| fprintf(stderr, "chose (leader fallback): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /** |
| * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code |
| * Scheduling for Parallelism) heuristic. |
| * |
* Picks an instruction on the critical path that's ready to execute without
* stalls, if possible; otherwise picks the instruction on the critical path.
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
| /* Find the leader in the ready (shouldn't-stall) set with the maximum |
| * cost. |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (scoreboard->time < n->ready_time) |
| continue; |
| |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Otherwise, choose the leader with the maximum cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (debug) { |
| fprintf(stderr, "chose (leader): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /** |
| * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code |
| * Scheduling for Register pressure) heuristic. |
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
| /* Find a ready inst with regs freed and pick the one with max cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->ready_time > scoreboard->time) |
| continue; |
| |
| int regs_freed = nir_schedule_regs_freed(scoreboard, n); |
| |
| if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (freed+ready): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Find a leader with regs freed and pick the one with max cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| int regs_freed = nir_schedule_regs_freed(scoreboard, n); |
| |
| if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (regs freed): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Find a partially evaluated path and try to finish it off */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->partially_evaluated_path && |
| (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (partial path): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
/* Contra the paper, pick a leader with no effect on used regs. This may
* open up new opportunities: otherwise, a single-operand instr that
* consumes a value without freeing it would block us from ever reaching
* the instruction that finally frees that value. This had a massive
* effect on reducing spilling on V3D.
| * |
| * XXX: Should this prioritize ready? |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (nir_schedule_regs_freed(scoreboard, n) != 0) |
| continue; |
| |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (regs no-op): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Pick the max delay of the remaining ready set. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->ready_time > scoreboard->time) |
| continue; |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready max delay): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| return chosen; |
| } |
| |
| /* Pick the max delay of the remaining leaders. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| |
| if (debug) { |
| fprintf(stderr, "chose (max delay): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| static void |
| dump_state(nir_schedule_scoreboard *scoreboard) |
| { |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| fprintf(stderr, "maxdel %5d ", n->max_delay); |
| nir_print_instr(n->instr, stderr); |
| fprintf(stderr, "\n"); |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| |
| fprintf(stderr, " -> (%d parents) ", child->dag.parent_count); |
| nir_print_instr(child->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| } |
| |
| static void |
| nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard, |
| void *reg_or_def, |
| nir_instr *reg_or_def_parent, |
| int pressure) |
| { |
| /* Make the value live if it's the first time it's been used. */ |
| if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) { |
| _mesa_set_add(scoreboard->live_values, reg_or_def); |
| scoreboard->pressure += pressure; |
| } |
| |
| /* Make the value dead if it's the last remaining use. Be careful when one |
| * instruction uses a value twice to not decrement pressure twice. |
| */ |
| struct set *remaining_uses = |
| _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def); |
| struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent); |
| if (entry) { |
| _mesa_set_remove(remaining_uses, entry); |
| |
| if (remaining_uses->entries == 0) |
| scoreboard->pressure -= pressure; |
| } |
| } |
| |
| static bool |
| nir_schedule_mark_src_scheduled(nir_src *src, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); |
| |
| struct set_entry *entry = _mesa_set_search(remaining_uses, |
| src->parent_instr); |
| if (entry) { |
| /* Once we've used an SSA value in one instruction, bump the priority of |
| * the other uses so the SSA value can get fully consumed. |
| * |
* We don't do this for registers, as it would be a hassle and it's
* unclear whether it would help. Also, skip it for constants, as they're
* often folded as immediates into backend instructions, with many
* unrelated instructions all referencing the same value (0).
| */ |
| if (src->is_ssa && |
| src->ssa->parent_instr->type != nir_instr_type_load_const) { |
| nir_foreach_use(other_src, src->ssa) { |
| if (other_src->parent_instr == src->parent_instr) |
| continue; |
| |
| nir_schedule_node *n = |
| nir_schedule_get_node(scoreboard->instr_map, |
| other_src->parent_instr); |
| |
| if (n && !n->partially_evaluated_path) { |
| if (debug) { |
| fprintf(stderr, " New partially evaluated path: "); |
| nir_print_instr(n->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| n->partially_evaluated_path = true; |
| } |
| } |
| } |
| } |
| |
| nir_schedule_mark_use(scoreboard, |
| src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg, |
| src->parent_instr, |
| nir_schedule_src_pressure(src)); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| |
| nir_schedule_mark_use(scoreboard, def, def->parent_instr, |
| nir_schedule_def_pressure(def)); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| |
/* SSA defs were handled in nir_schedule_mark_def_scheduled() */
| if (dest->is_ssa) |
| return true; |
| |
| /* XXX: This is not actually accurate for regs -- the last use of a reg may |
| * have a live interval that extends across control flow. We should |
| * calculate the live ranges of regs, and have scheduler nodes for the CF |
| * nodes that also "use" the reg. |
| */ |
| nir_schedule_mark_use(scoreboard, dest->reg.reg, |
| dest->reg.parent_instr, |
| nir_schedule_dest_pressure(dest)); |
| |
| return true; |
| } |
| |
| static void |
| nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard, |
| nir_schedule_node *n) |
| { |
| nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard); |
| nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard); |
| nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard); |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| |
| child->ready_time = MAX2(child->ready_time, |
| scoreboard->time + n->delay); |
| |
| if (child->dag.parent_count == 1) { |
| if (debug) { |
| fprintf(stderr, " New DAG head: "); |
| nir_print_instr(child->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| } |
| |
| dag_prune_head(scoreboard->dag, &n->dag); |
| |
| scoreboard->time = MAX2(n->ready_time, scoreboard->time); |
| scoreboard->time++; |
| } |
| |
| static void |
| nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| while (!list_is_empty(&scoreboard->dag->heads)) { |
| if (debug) { |
| fprintf(stderr, "current list:\n"); |
| dump_state(scoreboard); |
| } |
| |
| nir_schedule_node *chosen; |
| if (scoreboard->options->fallback) |
| chosen = nir_schedule_choose_instruction_fallback(scoreboard); |
| else if (scoreboard->pressure < scoreboard->options->threshold) |
| chosen = nir_schedule_choose_instruction_csp(scoreboard); |
| else |
| chosen = nir_schedule_choose_instruction_csr(scoreboard); |
| |
| /* Now that we've scheduled a new instruction, some of its children may |
| * be promoted to the list of instructions ready to be scheduled. |
| */ |
| nir_schedule_mark_node_scheduled(scoreboard, chosen); |
| |
| /* Move the instruction to the end (so our first chosen instructions are |
| * the start of the program). |
| */ |
| exec_node_remove(&chosen->instr->node); |
| exec_list_push_tail(&block->instr_list, &chosen->instr->node); |
| |
| if (debug) |
| fprintf(stderr, "\n"); |
| } |
| } |
| |
| static uint32_t |
| nir_schedule_get_delay(nir_instr *instr) |
| { |
| switch (instr->type) { |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| case nir_instr_type_alu: |
| case nir_instr_type_deref: |
| case nir_instr_type_jump: |
| case nir_instr_type_parallel_copy: |
| case nir_instr_type_call: |
| case nir_instr_type_phi: |
| return 1; |
| |
| case nir_instr_type_intrinsic: |
| /* XXX: Pick a large number for UBO/SSBO/image/shared loads */ |
| return 1; |
| |
| case nir_instr_type_tex: |
| /* Pick some large number to try to fetch textures early and sample them |
| * late. |
| */ |
| return 100; |
| } |
| |
| return 0; |
| } |
| |
| static void |
| nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state) |
| { |
| nir_schedule_node *n = (nir_schedule_node *)node; |
| uint32_t max_delay = 0; |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| max_delay = MAX2(child->max_delay, max_delay); |
| } |
| |
| n->max_delay = MAX2(n->max_delay, max_delay + n->delay); |
| } |
| |
| static void |
| nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| void *mem_ctx = ralloc_context(NULL); |
| scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx); |
| |
| scoreboard->dag = dag_create(mem_ctx); |
| |
| nir_foreach_instr(instr, block) { |
| nir_schedule_node *n = |
| rzalloc(mem_ctx, nir_schedule_node); |
| |
| n->instr = instr; |
| n->delay = nir_schedule_get_delay(instr); |
| dag_init_node(scoreboard->dag, &n->dag); |
| |
| _mesa_hash_table_insert(scoreboard->instr_map, instr, n); |
| } |
| |
| calculate_forward_deps(scoreboard, block); |
| calculate_reverse_deps(scoreboard, block); |
| |
| dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL); |
| |
| nir_schedule_instructions(scoreboard, block); |
| |
| ralloc_free(mem_ctx); |
| scoreboard->instr_map = NULL; |
| } |
| |
| static bool |
| nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| struct set *def_uses = _mesa_pointer_set_create(scoreboard); |
| |
| _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses); |
| |
| _mesa_set_add(def_uses, def->parent_instr); |
| |
| nir_foreach_use(src, def) { |
| _mesa_set_add(def_uses, src->parent_instr); |
| } |
| |
| /* XXX: Handle if uses */ |
| |
| return true; |
| } |
| |
| static nir_schedule_scoreboard * |
| nir_schedule_get_scoreboard(nir_shader *shader, |
| const nir_schedule_options *options) |
| { |
| nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard); |
| |
| scoreboard->shader = shader; |
| scoreboard->live_values = _mesa_pointer_set_create(scoreboard); |
| scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard); |
| scoreboard->options = options; |
| scoreboard->pressure = 0; |
| |
| nir_foreach_function(function, shader) { |
| nir_foreach_register(reg, &function->impl->registers) { |
| struct set *register_uses = |
| _mesa_pointer_set_create(scoreboard); |
| |
| _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses); |
| |
| nir_foreach_use(src, reg) { |
| _mesa_set_add(register_uses, src->parent_instr); |
| } |
| |
| /* XXX: Handle if uses */ |
| |
| nir_foreach_def(dest, reg) { |
| _mesa_set_add(register_uses, dest->reg.parent_instr); |
| } |
| } |
| |
| nir_foreach_block(block, function->impl) { |
| nir_foreach_instr(instr, block) { |
| nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard, |
| scoreboard); |
| } |
| |
| /* XXX: We're ignoring if uses, which may prioritize scheduling other |
| * uses of the if src even when it doesn't help. That's not many |
| * values, though, so meh. |
| */ |
| } |
| } |
| |
| return scoreboard; |
| } |
| |
| static void |
| nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard) |
| { |
| #ifdef NDEBUG |
| return; |
| #endif |
| |
| bool any_uses = false; |
| |
| hash_table_foreach(scoreboard->remaining_uses, entry) { |
| struct set *remaining_uses = entry->data; |
| |
| set_foreach(remaining_uses, instr_entry) { |
| if (!any_uses) { |
| fprintf(stderr, "Tracked uses remain after scheduling. " |
| "Affected instructions: \n"); |
| any_uses = true; |
| } |
| nir_print_instr(instr_entry->key, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| |
| assert(!any_uses); |
| } |
| |
| /** |
| * Schedules the NIR instructions to try to decrease stalls (for example, |
| * delaying texture reads) while managing register pressure. |
| * |
| * The threshold represents "number of NIR register/SSA def channels live |
| * before switching the scheduling heuristic to reduce register pressure", |
| * since most of our GPU architectures are scalar (extending to vector with a |
| * flag wouldn't be hard). This number should be a bit below the number of |
| * registers available (counting any that may be occupied by system value |
| * payload values, for example), since the heuristic may not always be able to |
| * free a register immediately. The amount below the limit is up to you to |
| * tune. |
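*
* A hypothetical invocation (the threshold value is illustrative, not tuned
* for any real hardware):
*
*    nir_schedule_options options = {
*       .threshold = 48,
*    };
*    nir_schedule(shader, &options);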
| */ |
| void |
| nir_schedule(nir_shader *shader, |
| const nir_schedule_options *options) |
| { |
| nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader, |
| options); |
| |
| if (debug) { |
| fprintf(stderr, "NIR shader before scheduling:\n"); |
| nir_print_shader(shader, stderr); |
| } |
| |
| nir_foreach_function(function, shader) { |
| if (!function->impl) |
| continue; |
| |
| nir_foreach_block(block, function->impl) { |
| nir_schedule_block(scoreboard, block); |
| } |
| } |
| |
| nir_schedule_validate_uses(scoreboard); |
| |
| ralloc_free(scoreboard); |
| } |