blob: 217e903454c8e5000598e7238408b5ef69c42649 [file] [log] [blame]
/*
* Copyright 2014 Intel Corporation
* Copyright 2023 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
/* This implements dominance and post-dominance of the SSA use graph where
* instructions are vertices and SSA uses are edges (i.e. edges go from
* each instruction to all its uses). CF nodes are ignored and irrelevant.
* It's different from nir_dominance.c, but the algorithm is the same, which
* is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
*
* Definitions:
* - Instruction A is post-dominated by instruction B if the result of
* instruction A and following intermediate results using the result of
* instruction A only affect the result of instruction B. Consequently,
* if instruction B was removed, instruction A would become dead including
* all instructions computing the intermediate results.
* Example: A(load) -> ... -> B(ALU)
* Note: This is the foundation of inter-shader code motion from later
* shaders to earlier shaders.
* - Instruction B is dominated by instruction A if all use paths from
* all loads to instruction B must go through instruction A.
* Note: Unlike post-dominance, dominance is unusable as-is because
* the immediate dominator typically doesn't exist if there are non-unary
* opcodes (i.e. branches of an expression tree following source operands
* don't usually converge to a single instruction unless all instructions
* are unary). The solution is to ignore loads like load_const to allow
* non-unary opcodes, which would be the foundation of inter-shader code
* motion from earlier shaders to later shaders, such as 2 output stores
* having only 1 ALU instruction as their only source at the beginning,
* ignoring constant and uniform operands along the way.
*
* Interesting cases implied by this (post-)dominator tree:
* - load_const, loads without src operands, and undef are not dominated by
* anything because they don't have any src operands.
* - No instruction post-dominates store intrinsics (and all other intrinsics
* without a destination) and nir_if nodes (they use a value but don't
* produce any).
*
* Typical application:
* - The immediate post-dominator query returns the solution to the problem of
* how much code we can move into the previous shader or preamble without
* increasing the number of inputs. Example of an SSA-use graph and
* the possible result that a user of this utility can produce:
*
* input0 input1 input0 input1
* \ / \ | \
* constant alu ... ------> | ...
* \ /
* alu
* (immediate post-dominator of input0)
*
* Examples of possible applications:
* - Moving load_input+ALU to the previous shader: An immediate post-dominator
* of load_input and all instructions between load_input and the immediate
* post-dominator are a candidate for being moved into the previous shader
* and we only need to check if the post-dominator is movable. Repeat
* the immediate post-dominator query on the accepted post-dominator and see
* if that is also movable. Repeat that until you find the farthest post-
* dominator that is movable.
* - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
* post-dominator of load_uniform is a candidate for being moved into
* the preamble shader or the CPU. Repeat the immediate post-dominator query
* until you find the farthest post-dominator that is movable.
* - Replacing a value used to compute 2 shader outputs by only 1 output, and
* moving the computation into the next shader:
* The Lowest Common Ancestor of 2 output stores within the dominator tree
* is a candidate for the new replacement output. Any loads that are
* trivially movable such as load_const should be ignored by this utility,
* otherwise the Lowest Common Ancestor wouldn't exist.
*
* Queries:
* - get the immediate dominator of an instruction
* - get the Lowest Common Ancestor of 2 instructions
* - whether one instruction dominates another
*
* Implemenation details:
* - Since some instructions are not dominated by anything, a dummy root is
* added into the graph that dominates such instructions, which is required
* by the algorithm.
*
* TODO: only post-dominance implemented, not dominance
*/
#include "nir.h"
struct nir_use_dom_node {
nir_instr *instr;
uint32_t index;
/* The index of this node's immediate dominator in the dominator tree.
* The dummy root points it to itself. -1 == unset.
*/
int32_t imm_dom;
};
struct nir_use_dominance_state {
nir_function_impl *impl;
struct nir_use_dom_node *dom_nodes;
unsigned num_dom_nodes;
};
static struct nir_use_dom_node *
get_node(struct nir_use_dominance_state *state, nir_instr *instr)
{
return &state->dom_nodes[instr->index];
}
static struct nir_use_dom_node *
get_imm_dom(struct nir_use_dominance_state *state,
struct nir_use_dom_node *node)
{
assert(node->imm_dom != -1);
return &state->dom_nodes[node->imm_dom];
}
static bool
init_instr(struct nir_use_dominance_state *state, nir_instr *instr,
unsigned *index)
{
assert(*index < state->num_dom_nodes);
struct nir_use_dom_node *node = &state->dom_nodes[*index];
if (*index == 0) {
/* dummy root */
node->imm_dom = 0;
} else {
node->imm_dom = -1;
node->instr = instr;
instr->index = node->index = *index;
}
(*index)++;
return true;
}
static struct nir_use_dom_node *
intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1,
struct nir_use_dom_node *i2)
{
while (i1 != i2) {
/* Note, the comparisons here are the opposite of what the paper says
* because we index instrs from beginning -> end (i.e. reverse
* post-order) instead of post-order like they assume.
*/
while (i1->index > i2->index)
i1 = get_imm_dom(state, i1);
while (i2->index > i1->index)
i2 = get_imm_dom(state, i2);
}
return i1;
}
static void
update_imm_dom(struct nir_use_dominance_state *state,
struct nir_use_dom_node *pred,
struct nir_use_dom_node **new_idom)
{
if (pred->imm_dom != -1) {
if (*new_idom)
*new_idom = intersect(state, pred, *new_idom);
else
*new_idom = pred;
}
}
static bool
calc_dominance(struct nir_use_dominance_state *state,
struct nir_use_dom_node *node, bool post_dominance)
{
struct nir_use_dom_node *new_idom = NULL;
if (post_dominance) {
nir_def *def = nir_instr_def(node->instr);
bool has_use = false;
if (def) {
nir_foreach_use_including_if(src, def) {
has_use = true;
/* Ifs are treated like stores because they don't produce
* a value. dom_nodes[0] is the dummy root.
*/
if (nir_src_is_if(src)) {
update_imm_dom(state, &state->dom_nodes[0], &new_idom);
/* Short-cut because we can't come back from the root node. */
break;
} else {
update_imm_dom(state,
get_node(state, nir_src_parent_instr(src)),
&new_idom);
}
}
}
/* No destination (e.g. stores, atomics with an unused result, discard,
* dead instructions). dom_nodes[0] is the dummy root.
*/
if (!has_use)
update_imm_dom(state, &state->dom_nodes[0], &new_idom);
} else {
unreachable("TODO: only post-dominance implemented, not dominance");
}
if (new_idom && node->imm_dom != new_idom->index) {
node->imm_dom = new_idom->index;
return true;
}
return false;
}
/**
* Calculate dominance or post-dominance of the SSA use graph.
* The returned state must not be freed while dominance queries are being used.
* ralloc_free() frees the state.
*
* It clobbers nir_instr::index, which can't be changed while dominance queries
* are being used.
*
* \param impl NIR function
* \param post_dominance Whether to compute post-dominance or dominance.
*/
struct nir_use_dominance_state *
nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
{
struct nir_use_dominance_state *state =
rzalloc(NULL, struct nir_use_dominance_state);
if (!state)
return NULL;
unsigned num_dom_nodes = 1; /* including the dummy root */
nir_foreach_block(block, impl) {
num_dom_nodes += exec_list_length(&block->instr_list);
}
state->impl = impl;
state->num_dom_nodes = num_dom_nodes;
state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
num_dom_nodes);
if (!state->dom_nodes) {
ralloc_free(state);
return NULL;
}
unsigned index = 0;
/* We need a dummy root node because there are instructions such as
* load_const that aren't dominated by anything. If we are calculating
* post-dominance, intrinsics without a destination aren't post-dominated
* by anything. However, the algorithm requires a common (post-)dominator.
*/
init_instr(state, NULL, &index);
/* Post-dominance is identical to dominance, but instructions are added
* in the opposite order.
*/
if (post_dominance) {
nir_foreach_block_reverse(block, impl) {
nir_foreach_instr_reverse(instr, block) {
init_instr(state, instr, &index);
}
}
} else {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
init_instr(state, instr, &index);
}
}
}
bool progress = true;
while (progress) {
progress = false;
/* Skip the dummy root (iterate from 1). */
for (unsigned i = 1; i < num_dom_nodes; i++) {
progress |= calc_dominance(state, &state->dom_nodes[i],
post_dominance);
}
}
return state;
}
nir_instr *
nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
nir_instr *instr)
{
struct nir_use_dom_node *node = get_node(state, instr);
return get_imm_dom(state, node)->instr;
}
/**
* Computes the least common ancestor of two instructions.
*/
nir_instr *
nir_use_dominance_lca(struct nir_use_dominance_state *state,
nir_instr *i1, nir_instr *i2)
{
assert(i1 && i2);
struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
get_node(state, i2));
assert(lca);
/* Might be NULL in case of the dummy root. */
return lca->instr;
}
/**
* Returns true if the parent dominates the child in the SSA use graph
* described at the beginning.
*/
bool
nir_instr_dominates_use(struct nir_use_dominance_state *state,
nir_instr *parent_instr, nir_instr *child_instr)
{
struct nir_use_dom_node *parent = get_node(state, parent_instr);
struct nir_use_dom_node *child = get_node(state, child_instr);
while (parent->index < child->index)
child = get_imm_dom(state, child);
return parent == child;
}
static void
print_instr(struct nir_use_dom_node *node)
{
if (!node)
printf("NULL - bug");
else if (node->index == 0)
printf("dummy_root");
else
nir_print_instr(node->instr, stdout);
}
void
nir_print_use_dominators(struct nir_use_dominance_state *state,
nir_instr **instructions, unsigned num_instructions)
{
for (unsigned i = 0; i < num_instructions; i++) {
printf("Input idom(\"");
nir_print_instr(instructions[i], stdout);
printf("\") = \"");
print_instr(get_imm_dom(state, get_node(state, instructions[i])));
printf("\"\n");
}
puts("");
nir_foreach_block(block, state->impl) {
nir_foreach_instr(instr, block) {
printf("idom(\"");
nir_print_instr(instr, stdout);
printf("\") = \"");
print_instr(get_imm_dom(state, get_node(state, instr)));
printf("\"\n");
}
}
puts("");
for (unsigned i = 0; i < num_instructions; i++) {
for (unsigned j = i + 1; j < num_instructions; j++) {
printf("LCA input 1: ");
nir_print_instr(instructions[i], stdout);
printf("\nLCA input 2: ");
nir_print_instr(instructions[j], stdout);
puts("");
nir_instr *lca =
nir_use_dominance_lca(state, instructions[i], instructions[j]);
if (lca) {
printf("2 inputs have a common post-dominator: ");
nir_print_instr(lca, stdout);
printf("\n");
}
puts("");
}
}
}