/*
* Copyright © 2019 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"
/** @file
*
* Implements basic-block-level prepass instruction scheduling in NIR to
* manage register pressure.
*
* This is based on the Goodman/Hsu paper (1988, cached copy at
* https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
* make up the DDG for NIR (which can be mostly done using the NIR def/use
* chains for SSA instructions, plus some edges for ordering register writes
* vs reads, and some more for ordering intrinsics). Then we pick heads off
* of the DDG using their heuristic to emit the NIR instructions back into the
* block in their new order.
*
* The hard case for prepass scheduling on GPUs seems to always be consuming
* texture/ubo results. The register pressure heuristic doesn't want to pick
* an instr that starts consuming texture results because it usually won't be
* the only usage, so that instruction will increase pressure.
*
* If you try to force consumption of tex results always, then in a case where
* a single sample is used for many outputs, you'll end up picking every other
* user and expanding register pressure. The partially_evaluated_path flag
* helps tremendously, in that if you happen for whatever reason to pick a
* texture sample's output, then you'll try to finish off that sample. Future
* work may include doing some local search before locking in a choice, to try
* to more reliably find the case where just a few choices going against the
* heuristic can manage to free the whole vector.
*/
static bool debug;
/**
* Represents a node in the DDG for a NIR instruction.
*/
typedef struct {
struct dag_node dag; /* must be first for our u_dynarray_foreach */
nir_instr *instr;
bool partially_evaluated_path;
/* Approximate estimate of the delay between starting this instruction and
* its results being available.
*
* Accuracy is not too important, given that we're prepass scheduling here
* and just trying to reduce excess dependencies introduced by a register
* allocator by stretching out the live intervals of expensive
* instructions.
*/
uint32_t delay;
/* Cost of the maximum-delay path from this node to the leaves. */
uint32_t max_delay;
/* scoreboard->time value when this instruction can be scheduled without
* any stalls expected.
*/
uint32_t ready_time;
} nir_schedule_node;
typedef struct {
struct dag *dag;
nir_shader *shader;
/* Mapping from nir_register * or nir_ssa_def * to a struct set of
* instructions remaining to be scheduled using the register.
*/
struct hash_table *remaining_uses;
/* Map from nir_instr to nir_schedule_node * */
struct hash_table *instr_map;
/* Set of nir_register * or nir_ssa_def * that have had any instruction
* scheduled on them.
*/
struct set *live_values;
/* An abstract approximation of the number of nir_schedule_node->delay
* units since the start of the shader.
*/
uint32_t time;
/* Number of channels currently used by the NIR instructions that have been
* scheduled.
*/
int pressure;
/* Options specified by the backend */
const nir_schedule_options *options;
} nir_schedule_scoreboard;
/* When walking the instructions in reverse, we use this flag to swap
* before/after in add_dep().
*/
enum direction { F, R };
struct nir_schedule_class_dep {
int klass;
nir_schedule_node *node;
struct nir_schedule_class_dep *next;
};
typedef struct {
nir_schedule_scoreboard *scoreboard;
/* Map from nir_register to nir_schedule_node * */
struct hash_table *reg_map;
/* Scheduler nodes for the last instruction involved in each class of
* dependency.
*/
nir_schedule_node *load_input;
nir_schedule_node *store_shared;
nir_schedule_node *unknown_intrinsic;
nir_schedule_node *discard;
nir_schedule_node *jump;
struct nir_schedule_class_dep *class_deps;
enum direction dir;
} nir_deps_state;
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
struct hash_entry *entry = _mesa_hash_table_search(ht, key);
if (!entry)
return NULL;
return entry->data;
}
static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
return _mesa_hash_table_search_data(instr_map, instr);
}
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
if (src->is_ssa) {
return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
} else {
return _mesa_hash_table_search_data(scoreboard->remaining_uses,
src->reg.reg);
}
}
static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
return def->num_components;
}
static int
nir_schedule_src_pressure(nir_src *src)
{
if (src->is_ssa)
return nir_schedule_def_pressure(src->ssa);
else
return src->reg.reg->num_components;
}
static int
nir_schedule_dest_pressure(nir_dest *dest)
{
if (dest->is_ssa)
return nir_schedule_def_pressure(&dest->ssa);
else
return dest->reg.reg->num_components;
}
/**
* Adds a dependency such that @after must appear in the final program after
* @before.
*
* We add @after as a child of @before, so that the DAG heads are the first
* schedulable instructions of the program and we make our scheduling
* decisions top to bottom.
*/
static void
add_dep(nir_deps_state *state,
nir_schedule_node *before,
nir_schedule_node *after)
{
if (!before || !after)
return;
assert(before != after);
if (state->dir == F)
dag_add_edge(&before->dag, &after->dag, NULL);
else
dag_add_edge(&after->dag, &before->dag, NULL);
}
static void
add_read_dep(nir_deps_state *state,
nir_schedule_node *before,
nir_schedule_node *after)
{
add_dep(state, before, after);
}
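/* Orders @after relative to the previously tracked writer (if any) and
* makes @after the new tracked writer for subsequent dependencies.
*/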
static void
add_write_dep(nir_deps_state *state,
nir_schedule_node **before,
nir_schedule_node *after)
{
add_dep(state, *before, after);
*before = after;
}
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
nir_deps_state *state = in_state;
if (src->is_ssa)
return true;
struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
src->reg.reg);
if (!entry)
return true;
nir_schedule_node *dst_n = entry->data;
nir_schedule_node *src_n =
nir_schedule_get_node(state->scoreboard->instr_map,
src->parent_instr);
add_dep(state, dst_n, src_n);
return true;
}
static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
nir_deps_state *state = in_state;
if (dest->is_ssa)
return true;
nir_schedule_node *dest_n =
nir_schedule_get_node(state->scoreboard->instr_map,
dest->reg.parent_instr);
struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
dest->reg.reg);
if (!entry) {
_mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
return true;
}
nir_schedule_node **before = (nir_schedule_node **)&entry->data;
add_write_dep(state, before, dest_n);
return true;
}
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
nir_deps_state *state = in_state;
struct hash_table *instr_map = state->scoreboard->instr_map;
nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
nir_foreach_use(src, def) {
nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
src->parent_instr);
add_read_dep(state, def_n, use_n);
}
return true;
}
static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
int klass)
{
for (struct nir_schedule_class_dep *class_dep = state->class_deps;
class_dep != NULL;
class_dep = class_dep->next) {
if (class_dep->klass == klass)
return class_dep;
}
struct nir_schedule_class_dep *class_dep =
ralloc(state->reg_map, struct nir_schedule_class_dep);
class_dep->klass = klass;
class_dep->node = NULL;
class_dep->next = state->class_deps;
state->class_deps = class_dep;
return class_dep;
}
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
nir_intrinsic_instr *instr)
{
nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
&instr->instr);
const nir_schedule_options *options = state->scoreboard->options;
nir_schedule_dependency dep;
if (options->intrinsic_cb &&
options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
struct nir_schedule_class_dep *class_dep =
nir_schedule_get_class_dep(state, dep.klass);
switch (dep.type) {
case NIR_SCHEDULE_READ_DEPENDENCY:
add_read_dep(state, class_dep->node, n);
break;
case NIR_SCHEDULE_WRITE_DEPENDENCY:
add_write_dep(state, &class_dep->node, n);
break;
}
}
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_front_face:
break;
case nir_intrinsic_discard:
case nir_intrinsic_discard_if:
case nir_intrinsic_demote:
case nir_intrinsic_demote_if:
/* We are adding two dependencies:
*
* * An individual one that we use to add a read_dep while handling
*   nir_instr_type_tex
*
* * One on the unknown-intrinsic set, as we want discards to be serialized
*   in the same order relative to intervening stores or atomic accesses to
*   SSBOs and images
*/
add_write_dep(state, &state->discard, n);
add_write_dep(state, &state->unknown_intrinsic, n);
break;
case nir_intrinsic_store_output:
/* For some hardware and stages, output stores affect the same shared
* memory as input loads.
*/
if ((state->scoreboard->options->stages_with_shared_io_memory &
(1 << state->scoreboard->shader->info.stage)))
add_write_dep(state, &state->load_input, n);
/* Make sure that preceding discards stay before the store_output */
add_read_dep(state, state->discard, n);
break;
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input:
add_read_dep(state, state->load_input, n);
break;
case nir_intrinsic_load_shared:
/* Don't move load_shared past a following store_shared, as the store
* could change the loaded value.
*/
add_read_dep(state, state->store_shared, n);
break;
case nir_intrinsic_store_shared:
add_write_dep(state, &state->store_shared, n);
break;
case nir_intrinsic_control_barrier:
case nir_intrinsic_memory_barrier_shared:
add_write_dep(state, &state->store_shared, n);
/* Serialize against ssbos/atomics/etc. */
add_write_dep(state, &state->unknown_intrinsic, n);
break;
default:
/* Attempt to handle other intrinsics that we haven't individually
* categorized by serializing them in the same order relative to each
* other.
*/
add_write_dep(state, &state->unknown_intrinsic, n);
break;
}
}
/**
* Common code for dependencies that need to be tracked both forward and
* backward.
*
* This is for things like "all reads of r4 have to happen between the r4
* writes that surround them".
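*
* For example, given this program order (illustrative pseudo-NIR):
*
*    r4 = fadd ...         <- write A
*    ... = fmul r4, ...    <- read 1
*    ... = fneg r4         <- read 2
*    r4 = ffma ...         <- write B
*
* the forward pass adds edges ordering write A before both reads and before
* write B, and the reverse pass adds edges ordering both reads before write
* B. The reads may reorder with each other but never cross either write.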
*/
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
nir_instr *instr = n->instr;
/* For NIR SSA defs, we only need to do a single pass of making the uses
* depend on the def.
*/
if (state->dir == F)
nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
/* For NIR regs, track the last writer in the scheduler state so that we
* can keep the writes in order and let reads get reordered only between
* each write.
*/
nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
/* Make sure any other instructions keep their positions relative to
* jumps.
*/
if (instr->type != nir_instr_type_jump)
add_read_dep(state, state->jump, n);
switch (instr->type) {
case nir_instr_type_ssa_undef:
case nir_instr_type_load_const:
case nir_instr_type_alu:
case nir_instr_type_deref:
break;
case nir_instr_type_tex:
/* Don't move texture ops before a discard, as that could increase
* memory bandwidth for reading the discarded samples.
*/
add_read_dep(state, state->discard, n);
break;
case nir_instr_type_jump:
add_write_dep(state, &state->jump, n);
break;
case nir_instr_type_call:
unreachable("Calls should have been lowered");
break;
case nir_instr_type_parallel_copy:
unreachable("Parallel copies should have been lowered");
break;
case nir_instr_type_phi:
unreachable("nir_schedule() should be called after lowering from SSA");
break;
case nir_instr_type_intrinsic:
nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
break;
}
}
static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
nir_deps_state state = {
.scoreboard = scoreboard,
.dir = F,
.reg_map = _mesa_pointer_hash_table_create(NULL),
};
nir_foreach_instr(instr, block) {
nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
instr);
nir_schedule_calculate_deps(&state, node);
}
ralloc_free(state.reg_map);
}
static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
nir_deps_state state = {
.scoreboard = scoreboard,
.dir = R,
.reg_map = _mesa_pointer_hash_table_create(NULL),
};
nir_foreach_instr_reverse(instr, block) {
nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
instr);
nir_schedule_calculate_deps(&state, node);
}
ralloc_free(state.reg_map);
}
typedef struct {
nir_schedule_scoreboard *scoreboard;
int regs_freed;
} nir_schedule_regs_freed_state;
static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
nir_schedule_scoreboard *scoreboard = state->scoreboard;
struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
if (remaining_uses->entries == 1 &&
_mesa_set_search(remaining_uses, src->parent_instr)) {
state->regs_freed += nir_schedule_src_pressure(src);
}
return true;
}
static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
state->regs_freed -= nir_schedule_def_pressure(def);
return true;
}
static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
nir_schedule_regs_freed_state *state = in_state;
nir_schedule_scoreboard *scoreboard = state->scoreboard;
if (dest->is_ssa)
return true;
nir_register *reg = dest->reg.reg;
/* Only the first def of a reg counts against register pressure. */
if (!_mesa_set_search(scoreboard->live_values, reg))
state->regs_freed -= nir_schedule_dest_pressure(dest);
return true;
}
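/**
* Returns the net number of channels that scheduling this instruction would
* free: sources whose last remaining use this is free their channels, while
* newly-live SSA defs and first writes to a register consume them.
*/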
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
nir_schedule_regs_freed_state state = {
.scoreboard = scoreboard,
};
nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
return state.regs_freed;
}
/**
* Chooses an instruction that will minimise register pressure as much as
* possible. This should only be used as a fallback when regular scheduling
* has produced a shader on which register allocation fails.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find the leader in the ready (shouldn't-stall) set with the minimum
* cost.
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (scoreboard->time < n->ready_time)
continue;
if (!chosen || chosen->max_delay > n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready fallback): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Otherwise, choose the leader with the minimum cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay > n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (leader fallback): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/**
* Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
* Scheduling for Parallelism) heuristic.
*
* Picks an instruction on the critical path that's ready to execute without
* stalls if possible, and otherwise picks the instruction on the critical
* path.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find the leader in the ready (shouldn't-stall) set with the maximum
* cost.
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (scoreboard->time < n->ready_time)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Otherwise, choose the leader with the maximum cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (leader): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/**
* Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
* Scheduling for Register pressure) heuristic.
*/
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
nir_schedule_node *chosen = NULL;
/* Find a ready inst with regs freed and pick the one with max cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->ready_time > scoreboard->time)
continue;
int regs_freed = nir_schedule_regs_freed(scoreboard, n);
if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (freed+ready): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Find a leader with regs freed and pick the one with max cost. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
int regs_freed = nir_schedule_regs_freed(scoreboard, n);
if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (regs freed): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Find a partially evaluated path and try to finish it off */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->partially_evaluated_path &&
(!chosen || chosen->max_delay < n->max_delay)) {
chosen = n;
}
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (partial path): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Contra the paper, pick a leader with no effect on used regs. This may
* open up new opportunities, as otherwise a single-operand instr consuming
* a value will tend to block that value from ever being freed. This had a
* massive effect on reducing spilling on V3D.
*
* XXX: Should this prioritize ready?
*/
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (nir_schedule_regs_freed(scoreboard, n) != 0)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (regs no-op): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Pick the max delay of the remaining ready set. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (n->ready_time > scoreboard->time)
continue;
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (chosen) {
if (debug) {
fprintf(stderr, "chose (ready max delay): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
/* Pick the max delay of the remaining leaders. */
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
if (!chosen || chosen->max_delay < n->max_delay)
chosen = n;
}
if (debug) {
fprintf(stderr, "chose (max delay): ");
nir_print_instr(chosen->instr, stderr);
fprintf(stderr, "\n");
}
return chosen;
}
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
fprintf(stderr, "maxdel %5d ", n->max_delay);
nir_print_instr(n->instr, stderr);
fprintf(stderr, "\n");
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
nir_print_instr(child->instr, stderr);
fprintf(stderr, "\n");
}
}
}
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
void *reg_or_def,
nir_instr *reg_or_def_parent,
int pressure)
{
/* Make the value live if it's the first time it's been used. */
if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
_mesa_set_add(scoreboard->live_values, reg_or_def);
scoreboard->pressure += pressure;
}
/* Make the value dead if this was its last remaining use. Be careful not
* to decrement pressure twice when one instruction uses a value twice.
*/
struct set *remaining_uses =
_mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
if (entry) {
_mesa_set_remove(remaining_uses, entry);
if (remaining_uses->entries == 0)
scoreboard->pressure -= pressure;
}
}
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
struct set_entry *entry = _mesa_set_search(remaining_uses,
src->parent_instr);
if (entry) {
/* Once we've used an SSA value in one instruction, bump the priority of
* the other uses so the SSA value can get fully consumed.
*
* We don't do this for registers, as it would be a hassle and it's
* unclear if that would help or not. Also, skip it for constants, as
* they're often folded as immediates into backend instructions and have
* many unrelated instructions all referencing the same value (0).
*/
if (src->is_ssa &&
src->ssa->parent_instr->type != nir_instr_type_load_const) {
nir_foreach_use(other_src, src->ssa) {
if (other_src->parent_instr == src->parent_instr)
continue;
nir_schedule_node *n =
nir_schedule_get_node(scoreboard->instr_map,
other_src->parent_instr);
if (n && !n->partially_evaluated_path) {
if (debug) {
fprintf(stderr, " New partially evaluated path: ");
nir_print_instr(n->instr, stderr);
fprintf(stderr, "\n");
}
n->partially_evaluated_path = true;
}
}
}
}
nir_schedule_mark_use(scoreboard,
src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
src->parent_instr,
nir_schedule_src_pressure(src));
return true;
}
static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
nir_schedule_mark_use(scoreboard, def, def->parent_instr,
nir_schedule_def_pressure(def));
return true;
}
static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
/* SSA defs were handled in nir_schedule_mark_def_scheduled()
*/
if (dest->is_ssa)
return true;
/* XXX: This is not actually accurate for regs -- the last use of a reg may
* have a live interval that extends across control flow. We should
* calculate the live ranges of regs, and have scheduler nodes for the CF
* nodes that also "use" the reg.
*/
nir_schedule_mark_use(scoreboard, dest->reg.reg,
dest->reg.parent_instr,
nir_schedule_dest_pressure(dest));
return true;
}
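/**
* Updates the scoreboard for a newly scheduled instruction: marks its
* sources, defs and register dests as used, propagates ready times to its
* children, prunes it from the DAG (possibly promoting children to heads),
* and advances the abstract clock.
*/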
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
nir_schedule_node *n)
{
nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
child->ready_time = MAX2(child->ready_time,
scoreboard->time + n->delay);
if (child->dag.parent_count == 1) {
if (debug) {
fprintf(stderr, " New DAG head: ");
nir_print_instr(child->instr, stderr);
fprintf(stderr, "\n");
}
}
}
dag_prune_head(scoreboard->dag, &n->dag);
scoreboard->time = MAX2(n->ready_time, scoreboard->time);
scoreboard->time++;
}
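/**
* Main scheduling loop for a block: repeatedly picks a DAG head using the
* active heuristic and re-emits the chosen instruction at the block's tail,
* so the first instructions chosen become the start of the program.
*/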
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
while (!list_is_empty(&scoreboard->dag->heads)) {
if (debug) {
fprintf(stderr, "current list:\n");
dump_state(scoreboard);
}
nir_schedule_node *chosen;
if (scoreboard->options->fallback)
chosen = nir_schedule_choose_instruction_fallback(scoreboard);
else if (scoreboard->pressure < scoreboard->options->threshold)
chosen = nir_schedule_choose_instruction_csp(scoreboard);
else
chosen = nir_schedule_choose_instruction_csr(scoreboard);
/* Now that we've scheduled a new instruction, some of its children may
* be promoted to the list of instructions ready to be scheduled.
*/
nir_schedule_mark_node_scheduled(scoreboard, chosen);
/* Move the instruction to the end (so our first chosen instructions are
* the start of the program).
*/
exec_node_remove(&chosen->instr->node);
exec_list_push_tail(&block->instr_list, &chosen->instr->node);
if (debug)
fprintf(stderr, "\n");
}
}
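/**
* Rough per-instruction latency estimate, used to compute ready times and
* the max_delay (critical-path) cost.
*/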
static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_ssa_undef:
case nir_instr_type_load_const:
case nir_instr_type_alu:
case nir_instr_type_deref:
case nir_instr_type_jump:
case nir_instr_type_parallel_copy:
case nir_instr_type_call:
case nir_instr_type_phi:
return 1;
case nir_instr_type_intrinsic:
/* XXX: Pick a large number for UBO/SSBO/image/shared loads */
return 1;
case nir_instr_type_tex:
/* Pick some large number to try to fetch textures early and sample them
* late.
*/
return 100;
}
return 0;
}
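/**
* Bottom-up DAG traversal callback computing each node's max_delay: the
* cost of the maximum-delay path from that node to the leaves.
*/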
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
nir_schedule_node *n = (nir_schedule_node *)node;
uint32_t max_delay = 0;
util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
nir_schedule_node *child = (nir_schedule_node *)edge->child;
max_delay = MAX2(child->max_delay, max_delay);
}
n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
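/**
* Schedules a single basic block: creates a DDG node per instruction,
* computes forward and reverse dependencies and critical-path costs, then
* emits the instructions in their new order.
*/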
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
void *mem_ctx = ralloc_context(NULL);
scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
scoreboard->dag = dag_create(mem_ctx);
nir_foreach_instr(instr, block) {
nir_schedule_node *n =
rzalloc(mem_ctx, nir_schedule_node);
n->instr = instr;
n->delay = nir_schedule_get_delay(instr);
dag_init_node(scoreboard->dag, &n->dag);
_mesa_hash_table_insert(scoreboard->instr_map, instr, n);
}
calculate_forward_deps(scoreboard, block);
calculate_reverse_deps(scoreboard, block);
dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
nir_schedule_instructions(scoreboard, block);
ralloc_free(mem_ctx);
scoreboard->instr_map = NULL;
}
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
nir_schedule_scoreboard *scoreboard = state;
struct set *def_uses = _mesa_pointer_set_create(scoreboard);
_mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
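/* Include the def's own instruction in its remaining_uses set, so that
* scheduling the def takes the same nir_schedule_mark_use() path: the value
* goes live when the def is scheduled, and a def with no other uses is
* killed immediately.
*/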
_mesa_set_add(def_uses, def->parent_instr);
nir_foreach_use(src, def) {
_mesa_set_add(def_uses, src->parent_instr);
}
/* XXX: Handle if uses */
return true;
}
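/**
* Creates the scoreboard and initializes remaining_uses: for each register
* or SSA def, the set of not-yet-scheduled instructions that access it.
*/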
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
const nir_schedule_options *options)
{
nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
scoreboard->shader = shader;
scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
scoreboard->options = options;
scoreboard->pressure = 0;
nir_foreach_function(function, shader) {
nir_foreach_register(reg, &function->impl->registers) {
struct set *register_uses =
_mesa_pointer_set_create(scoreboard);
_mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
nir_foreach_use(src, reg) {
_mesa_set_add(register_uses, src->parent_instr);
}
/* XXX: Handle if uses */
nir_foreach_def(dest, reg) {
_mesa_set_add(register_uses, dest->reg.parent_instr);
}
}
nir_foreach_block(block, function->impl) {
nir_foreach_instr(instr, block) {
nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
scoreboard);
}
/* XXX: We're ignoring if uses, which may prioritize scheduling other
* uses of the if src even when it doesn't help. That's not many
* values, though, so meh.
*/
}
}
return scoreboard;
}
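/**
* Debug-only check that scheduling consumed every tracked use; compiled out
* of NDEBUG builds.
*/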
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
return;
#endif
bool any_uses = false;
hash_table_foreach(scoreboard->remaining_uses, entry) {
struct set *remaining_uses = entry->data;
set_foreach(remaining_uses, instr_entry) {
if (!any_uses) {
fprintf(stderr, "Tracked uses remain after scheduling. "
"Affected instructions: \n");
any_uses = true;
}
nir_print_instr(instr_entry->key, stderr);
fprintf(stderr, "\n");
}
}
assert(!any_uses);
}
/**
* Schedules the NIR instructions to try to decrease stalls (for example,
* delaying the consumption of texture results) while managing register
* pressure.
*
* The threshold represents "number of NIR register/SSA def channels live
* before switching the scheduling heuristic to reduce register pressure",
* since most of our GPU architectures are scalar (extending to vector with a
* flag wouldn't be hard). This number should be a bit below the number of
* registers available (counting any that may be occupied by system value
* payload values, for example), since the heuristic may not always be able to
* free a register immediately. The amount below the limit is up to you to
* tune.
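*
* A minimal sketch of how a backend might invoke the pass (the threshold
* value here is purely illustrative, not a tuned recommendation):
*
*    nir_schedule_options options = {
*       .threshold = 24,
*    };
*    nir_schedule(shader, &options);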
*/
void
nir_schedule(nir_shader *shader,
const nir_schedule_options *options)
{
nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
options);
if (debug) {
fprintf(stderr, "NIR shader before scheduling:\n");
nir_print_shader(shader, stderr);
}
nir_foreach_function(function, shader) {
if (!function->impl)
continue;
nir_foreach_block(block, function->impl) {
nir_schedule_block(scoreboard, block);
}
}
nir_schedule_validate_uses(scoreboard);
ralloc_free(scoreboard);
}