src/compiler/nir/nir_use_dominance.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright 2014 Intel Corporation
  * Copyright 2023 Advanced Micro Devices, Inc.
  *
  * SPDX-License-Identifier: MIT
  */

 /* This implements dominance and post-dominance of the SSA use graph where
  * instructions are vertices and SSA uses are edges (i.e. edges go from
  * each instruction to all its uses). CF nodes are ignored and irrelevant.
  * It's different from nir_dominance.c, but the algorithm is the same, which
  * is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
  *
  * Definitions:
  * - Instruction A is post-dominated by instruction B if the result of
  *   instruction A and following intermediate results using the result of
  *   instruction A only affect the result of instruction B. Consequently,
  *   if instruction B was removed, instruction A would become dead including
  *   all instructions computing the intermediate results.
  *   Example: A(load) -> ... -> B(ALU)
  *   Note: This is the foundation of inter-shader code motion from later
  *   shaders to earlier shaders.
  * - Instruction B is dominated by instruction A if all use paths from
  *   all loads to instruction B must go through instruction A.
  *   Note: Unlike post-dominance, dominance is unusable as-is because
  *   the immediate dominator typically doesn't exist if there are non-unary
  *   opcodes (i.e. branches of an expression tree following source operands
  *   don't usually converge to a single instruction unless all instructions
  *   are unary). The solution is to ignore loads like load_const to allow
  *   non-unary opcodes, which would be the foundation of inter-shader code
  *   motion from earlier shaders to later shaders, such as 2 output stores
  *   having only 1 ALU instruction as their only source at the beginning,
  *   ignoring constant and uniform operands along the way.
  *
  * Interesting cases implied by this (post-)dominator tree:
  * - load_const, loads without src operands, and undef are not dominated by
  *   anything because they don't have any src operands.
  * - No instruction post-dominates store intrinsics (and all other intrinsics
  *   without a destination) and nir_if nodes (they use a value but don't
  *   produce any).
  *
  * Typical application:
  * - The immediate post-dominator query returns the solution to the problem of
  *   how much code we can move into the previous shader or preamble without
  *   increasing the number of inputs. Example of an SSA-use graph and
  *   the possible result that a user of this utility can produce:
  *
  *          input0 input1             input0 input1
  *              \   / \                  |      \
  *    constant   alu  ...    ------>     |     ...
  *           \   /
  *            alu
  * (immediate post-dominator of input0)
  *
  * Examples of possible applications:
  * - Moving load_input+ALU to the previous shader: An immediate post-dominator
  *   of load_input and all instructions between load_input and the immediate
  *   post-dominator are a candidate for being moved into the previous shader
  *   and we only need to check if the post-dominator is movable. Repeat
  *   the immediate post-dominator query on the accepted post-dominator and see
  *   if that is also movable. Repeat that until you find the farthest post-
  *   dominator that is movable.
  * - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
  *   post-dominator of load_uniform is a candidate for being moved into
  *   the preamble shader or the CPU. Repeat the immediate post-dominator query
  *   until you find the farthest post-dominator that is movable.
  * - Replacing a value used to compute 2 shader outputs by only 1 output, and
  *   moving the computation into the next shader:
  *   The Lowest Common Ancestor of 2 output stores within the dominator tree
  *   is a candidate for the new replacement output. Any loads that are
  *   trivially movable such as load_const should be ignored by this utility,
  *   otherwise the Lowest Common Ancestor wouldn't exist.
  *
  * Queries:
  * - get the immediate dominator of an instruction
  * - get the Lowest Common Ancestor of 2 instructions
  * - whether one instruction dominates another
  *
  * Implemenation details:
  * - Since some instructions are not dominated by anything, a dummy root is
  *   added into the graph that dominates such instructions, which is required
  *   by the algorithm.
  *
  * TODO: only post-dominance implemented, not dominance
  */

 #include "nir.h"

 struct nir_use_dom_node {
    nir_instr *instr;
    uint32_t index;

    /* The index of this node's immediate dominator in the dominator tree.
     * The dummy root points it to itself. -1 == unset.
     */
    int32_t imm_dom;
 };

 struct nir_use_dominance_state {
    nir_function_impl *impl;
    struct nir_use_dom_node *dom_nodes;
    unsigned num_dom_nodes;
 };

 static struct nir_use_dom_node *
 get_node(struct nir_use_dominance_state *state, nir_instr *instr)
 {
    return &state->dom_nodes[instr->index];
 }

 static struct nir_use_dom_node *
 get_imm_dom(struct nir_use_dominance_state *state,
             struct nir_use_dom_node *node)
 {
    assert(node->imm_dom != -1);
    return &state->dom_nodes[node->imm_dom];
 }

 static bool
 init_instr(struct nir_use_dominance_state *state, nir_instr *instr,
            unsigned *index)
 {
    assert(*index < state->num_dom_nodes);
    struct nir_use_dom_node *node = &state->dom_nodes[*index];

    if (*index == 0) {
       /* dummy root */
       node->imm_dom = 0;
    } else {
       node->imm_dom = -1;
       node->instr = instr;
       instr->index = node->index = *index;
    }
    (*index)++;

    return true;
 }

 static struct nir_use_dom_node *
 intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1,
           struct nir_use_dom_node *i2)
 {
    while (i1 != i2) {
       /* Note, the comparisons here are the opposite of what the paper says
        * because we index instrs from beginning -> end (i.e. reverse
        * post-order) instead of post-order like they assume.
        */
       while (i1->index > i2->index)
          i1 = get_imm_dom(state, i1);
       while (i2->index > i1->index)
          i2 = get_imm_dom(state, i2);
    }

    return i1;
 }

 static void
 update_imm_dom(struct nir_use_dominance_state *state,
                struct nir_use_dom_node *pred,
                struct nir_use_dom_node **new_idom)
 {
    if (pred->imm_dom != -1) {
       if (*new_idom)
          *new_idom = intersect(state, pred, *new_idom);
       else
          *new_idom = pred;
    }
 }

 static bool
 calc_dominance(struct nir_use_dominance_state *state,
                struct nir_use_dom_node *node, bool post_dominance)
 {
    struct nir_use_dom_node *new_idom = NULL;

    if (post_dominance) {
       nir_def *def = nir_instr_def(node->instr);
       bool has_use = false;

       if (def) {
          nir_foreach_use_including_if(src, def) {
             has_use = true;

             /* Ifs are treated like stores because they don't produce
              * a value. dom_nodes[0] is the dummy root.
              */
             if (nir_src_is_if(src)) {
                update_imm_dom(state, &state->dom_nodes[0], &new_idom);
                /* Short-cut because we can't come back from the root node. */
                break;
             } else {
                update_imm_dom(state,
                               get_node(state, nir_src_parent_instr(src)),
                               &new_idom);
             }
          }
       }

       /* No destination (e.g. stores, atomics with an unused result, discard,
        * dead instructions). dom_nodes[0] is the dummy root.
        */
       if (!has_use)
          update_imm_dom(state, &state->dom_nodes[0], &new_idom);
    } else {
       unreachable("TODO: only post-dominance implemented, not dominance");
    }

    if (new_idom && node->imm_dom != new_idom->index) {
       node->imm_dom = new_idom->index;
       return true;
    }

    return false;
 }

 /**
  * Calculate dominance or post-dominance of the SSA use graph.
  * The returned state must not be freed while dominance queries are being used.
  * ralloc_free() frees the state.
  *
  * It clobbers nir_instr::index, which can't be changed while dominance queries
  * are being used.
  *
  * \param impl             NIR function
  * \param post_dominance   Whether to compute post-dominance or dominance.
  */
 struct nir_use_dominance_state *
 nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
 {
    struct nir_use_dominance_state *state =
       rzalloc(NULL, struct nir_use_dominance_state);
    if (!state)
       return NULL;

    unsigned num_dom_nodes = 1; /* including the dummy root */
    nir_foreach_block(block, impl) {
       num_dom_nodes += exec_list_length(&block->instr_list);
    }

    state->impl = impl;
    state->num_dom_nodes = num_dom_nodes;
    state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
                                     num_dom_nodes);
    if (!state->dom_nodes) {
       ralloc_free(state);
       return NULL;
    }

    unsigned index = 0;

    /* We need a dummy root node because there are instructions such as
     * load_const that aren't dominated by anything. If we are calculating
     * post-dominance, intrinsics without a destination aren't post-dominated
     * by anything. However, the algorithm requires a common (post-)dominator.
     */
    init_instr(state, NULL, &index);

    /* Post-dominance is identical to dominance, but instructions are added
     * in the opposite order.
     */
    if (post_dominance) {
       nir_foreach_block_reverse(block, impl) {
          nir_foreach_instr_reverse(instr, block) {
             init_instr(state, instr, &index);
          }
       }
    } else {
       nir_foreach_block(block, impl) {
          nir_foreach_instr(instr, block) {
             init_instr(state, instr, &index);
          }
       }
    }

    bool progress = true;
    while (progress) {
       progress = false;

       /* Skip the dummy root (iterate from 1). */
       for (unsigned i = 1; i < num_dom_nodes; i++) {
          progress |= calc_dominance(state, &state->dom_nodes[i],
                                     post_dominance);
       }
    }

    return state;
 }

 nir_instr *
 nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
                                 nir_instr *instr)
 {
    struct nir_use_dom_node *node = get_node(state, instr);

    return get_imm_dom(state, node)->instr;
 }

 /**
  * Computes the least common ancestor of two instructions.
  */
 nir_instr *
 nir_use_dominance_lca(struct nir_use_dominance_state *state,
                       nir_instr *i1, nir_instr *i2)
 {
    assert(i1 && i2);
    struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
                                             get_node(state, i2));
    assert(lca);
    /* Might be NULL in case of the dummy root. */
    return lca->instr;
 }

 /**
  * Returns true if the parent dominates the child in the SSA use graph
  * described at the beginning.
  */
 bool
 nir_instr_dominates_use(struct nir_use_dominance_state *state,
                         nir_instr *parent_instr, nir_instr *child_instr)
 {
    struct nir_use_dom_node *parent = get_node(state, parent_instr);
    struct nir_use_dom_node *child = get_node(state, child_instr);

    while (parent->index < child->index)
       child = get_imm_dom(state, child);

    return parent == child;
 }

 static void
 print_instr(struct nir_use_dom_node *node)
 {
    if (!node)
       printf("NULL - bug");
    else if (node->index == 0)
       printf("dummy_root");
    else
       nir_print_instr(node->instr, stdout);
 }

 void
 nir_print_use_dominators(struct nir_use_dominance_state *state,
                          nir_instr **instructions, unsigned num_instructions)
 {
    for (unsigned i = 0; i < num_instructions; i++) {
       printf("Input idom(\"");
       nir_print_instr(instructions[i], stdout);
       printf("\") = \"");
       print_instr(get_imm_dom(state, get_node(state, instructions[i])));
       printf("\"\n");
    }
    puts("");

    nir_foreach_block(block, state->impl) {
       nir_foreach_instr(instr, block) {
          printf("idom(\"");
          nir_print_instr(instr, stdout);
          printf("\") = \"");
          print_instr(get_imm_dom(state, get_node(state, instr)));
          printf("\"\n");
       }
    }
    puts("");

    for (unsigned i = 0; i < num_instructions; i++) {
       for (unsigned j = i + 1; j < num_instructions; j++) {
          printf("LCA input 1: ");
          nir_print_instr(instructions[i], stdout);
          printf("\nLCA input 2: ");
          nir_print_instr(instructions[j], stdout);
          puts("");
          nir_instr *lca =
             nir_use_dominance_lca(state, instructions[i], instructions[j]);

          if (lca) {
             printf("2 inputs have a common post-dominator: ");
             nir_print_instr(lca, stdout);
             printf("\n");
          }
          puts("");
       }
    }
 }
	/*
	* Copyright 2014 Intel Corporation
	* Copyright 2023 Advanced Micro Devices, Inc.
	*
	* SPDX-License-Identifier: MIT
	*/

	/* This implements dominance and post-dominance of the SSA use graph where
	* instructions are vertices and SSA uses are edges (i.e. edges go from
	* each instruction to all its uses). CF nodes are ignored and irrelevant.
	* It's different from nir_dominance.c, but the algorithm is the same, which
	* is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
	*
	* Definitions:
	* - Instruction A is post-dominated by instruction B if the result of
	* instruction A and following intermediate results using the result of
	* instruction A only affect the result of instruction B. Consequently,
	* if instruction B was removed, instruction A would become dead including
	* all instructions computing the intermediate results.
	* Example: A(load) -> ... -> B(ALU)
	* Note: This is the foundation of inter-shader code motion from later
	* shaders to earlier shaders.
	* - Instruction B is dominated by instruction A if all use paths from
	* all loads to instruction B must go through instruction A.
	* Note: Unlike post-dominance, dominance is unusable as-is because
	* the immediate dominator typically doesn't exist if there are non-unary
	* opcodes (i.e. branches of an expression tree following source operands
	* don't usually converge to a single instruction unless all instructions
	* are unary). The solution is to ignore loads like load_const to allow
	* non-unary opcodes, which would be the foundation of inter-shader code
	* motion from earlier shaders to later shaders, such as 2 output stores
	* having only 1 ALU instruction as their only source at the beginning,
	* ignoring constant and uniform operands along the way.
	*
	* Interesting cases implied by this (post-)dominator tree:
	* - load_const, loads without src operands, and undef are not dominated by
	* anything because they don't have any src operands.
	* - No instruction post-dominates store intrinsics (and all other intrinsics
	* without a destination) and nir_if nodes (they use a value but don't
	* produce any).
	*
	* Typical application:
	* - The immediate post-dominator query returns the solution to the problem of
	* how much code we can move into the previous shader or preamble without
	* increasing the number of inputs. Example of an SSA-use graph and
	* the possible result that a user of this utility can produce:
	*
	* input0 input1 input0 input1
	* \ / \ \| \
	* constant alu ... ------> \| ...
	* \ /
	* alu
	* (immediate post-dominator of input0)
	*
	* Examples of possible applications:
	* - Moving load_input+ALU to the previous shader: An immediate post-dominator
	* of load_input and all instructions between load_input and the immediate
	* post-dominator are a candidate for being moved into the previous shader
	* and we only need to check if the post-dominator is movable. Repeat
	* the immediate post-dominator query on the accepted post-dominator and see
	* if that is also movable. Repeat that until you find the farthest post-
	* dominator that is movable.
	* - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
	* post-dominator of load_uniform is a candidate for being moved into
	* the preamble shader or the CPU. Repeat the immediate post-dominator query
	* until you find the farthest post-dominator that is movable.
	* - Replacing a value used to compute 2 shader outputs by only 1 output, and
	* moving the computation into the next shader:
	* The Lowest Common Ancestor of 2 output stores within the dominator tree
	* is a candidate for the new replacement output. Any loads that are
	* trivially movable such as load_const should be ignored by this utility,
	* otherwise the Lowest Common Ancestor wouldn't exist.
	*
	* Queries:
	* - get the immediate dominator of an instruction
	* - get the Lowest Common Ancestor of 2 instructions
	* - whether one instruction dominates another
	*
	* Implemenation details:
	* - Since some instructions are not dominated by anything, a dummy root is
	* added into the graph that dominates such instructions, which is required
	* by the algorithm.
	*
	* TODO: only post-dominance implemented, not dominance
	*/

	#include "nir.h"

	struct nir_use_dom_node {
	nir_instr *instr;
	uint32_t index;

	/* The index of this node's immediate dominator in the dominator tree.
	* The dummy root points it to itself. -1 == unset.
	*/
	int32_t imm_dom;
	};

	struct nir_use_dominance_state {
	nir_function_impl *impl;
	struct nir_use_dom_node *dom_nodes;
	unsigned num_dom_nodes;
	};

	static struct nir_use_dom_node *
	get_node(struct nir_use_dominance_state state, nir_instr instr)
	{
	return &state->dom_nodes[instr->index];
	}

	static struct nir_use_dom_node *
	get_imm_dom(struct nir_use_dominance_state *state,
	struct nir_use_dom_node *node)
	{
	assert(node->imm_dom != -1);
	return &state->dom_nodes[node->imm_dom];
	}

	static bool
	init_instr(struct nir_use_dominance_state state, nir_instr instr,
	unsigned *index)
	{
	assert(*index < state->num_dom_nodes);
	struct nir_use_dom_node node = &state->dom_nodes[index];

	if (*index == 0) {
	/* dummy root */
	node->imm_dom = 0;
	} else {
	node->imm_dom = -1;
	node->instr = instr;
	instr->index = node->index = *index;
	}
	(*index)++;

	return true;
	}

	static struct nir_use_dom_node *
	intersect(struct nir_use_dominance_state state, struct nir_use_dom_node i1,
	struct nir_use_dom_node *i2)
	{
	while (i1 != i2) {
	/* Note, the comparisons here are the opposite of what the paper says
	* because we index instrs from beginning -> end (i.e. reverse
	* post-order) instead of post-order like they assume.
	*/
	while (i1->index > i2->index)
	i1 = get_imm_dom(state, i1);
	while (i2->index > i1->index)
	i2 = get_imm_dom(state, i2);
	}

	return i1;
	}

	static void
	update_imm_dom(struct nir_use_dominance_state *state,
	struct nir_use_dom_node *pred,
	struct nir_use_dom_node **new_idom)
	{
	if (pred->imm_dom != -1) {
	if (*new_idom)
	new_idom = intersect(state, pred, new_idom);
	else
	*new_idom = pred;
	}
	}

	static bool
	calc_dominance(struct nir_use_dominance_state *state,
	struct nir_use_dom_node *node, bool post_dominance)
	{
	struct nir_use_dom_node *new_idom = NULL;

	if (post_dominance) {
	nir_def *def = nir_instr_def(node->instr);
	bool has_use = false;

	if (def) {
	nir_foreach_use_including_if(src, def) {
	has_use = true;

	/* Ifs are treated like stores because they don't produce
	* a value. dom_nodes[0] is the dummy root.
	*/
	if (nir_src_is_if(src)) {
	update_imm_dom(state, &state->dom_nodes[0], &new_idom);
	/* Short-cut because we can't come back from the root node. */
	break;
	} else {
	update_imm_dom(state,
	get_node(state, nir_src_parent_instr(src)),
	&new_idom);
	}
	}
	}

	/* No destination (e.g. stores, atomics with an unused result, discard,
	* dead instructions). dom_nodes[0] is the dummy root.
	*/
	if (!has_use)
	update_imm_dom(state, &state->dom_nodes[0], &new_idom);
	} else {
	unreachable("TODO: only post-dominance implemented, not dominance");
	}

	if (new_idom && node->imm_dom != new_idom->index) {
	node->imm_dom = new_idom->index;
	return true;
	}

	return false;
	}

	/**
	* Calculate dominance or post-dominance of the SSA use graph.
	* The returned state must not be freed while dominance queries are being used.
	* ralloc_free() frees the state.
	*
	* It clobbers nir_instr::index, which can't be changed while dominance queries
	* are being used.
	*
	* \param impl NIR function
	* \param post_dominance Whether to compute post-dominance or dominance.
	*/
	struct nir_use_dominance_state *
	nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
	{
	struct nir_use_dominance_state *state =
	rzalloc(NULL, struct nir_use_dominance_state);
	if (!state)
	return NULL;

	unsigned num_dom_nodes = 1; /* including the dummy root */
	nir_foreach_block(block, impl) {
	num_dom_nodes += exec_list_length(&block->instr_list);
	}

	state->impl = impl;
	state->num_dom_nodes = num_dom_nodes;
	state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
	num_dom_nodes);
	if (!state->dom_nodes) {
	ralloc_free(state);
	return NULL;
	}

	unsigned index = 0;

	/* We need a dummy root node because there are instructions such as
	* load_const that aren't dominated by anything. If we are calculating
	* post-dominance, intrinsics without a destination aren't post-dominated
	* by anything. However, the algorithm requires a common (post-)dominator.
	*/
	init_instr(state, NULL, &index);

	/* Post-dominance is identical to dominance, but instructions are added
	* in the opposite order.
	*/
	if (post_dominance) {
	nir_foreach_block_reverse(block, impl) {
	nir_foreach_instr_reverse(instr, block) {
	init_instr(state, instr, &index);
	}
	}
	} else {
	nir_foreach_block(block, impl) {
	nir_foreach_instr(instr, block) {
	init_instr(state, instr, &index);
	}
	}
	}

	bool progress = true;
	while (progress) {
	progress = false;

	/* Skip the dummy root (iterate from 1). */
	for (unsigned i = 1; i < num_dom_nodes; i++) {
	progress \|= calc_dominance(state, &state->dom_nodes[i],
	post_dominance);
	}
	}

	return state;
	}

	nir_instr *
	nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
	nir_instr *instr)
	{
	struct nir_use_dom_node *node = get_node(state, instr);

	return get_imm_dom(state, node)->instr;
	}

	/**
	* Computes the least common ancestor of two instructions.
	*/
	nir_instr *
	nir_use_dominance_lca(struct nir_use_dominance_state *state,
	nir_instr i1, nir_instr i2)
	{
	assert(i1 && i2);
	struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
	get_node(state, i2));
	assert(lca);
	/* Might be NULL in case of the dummy root. */
	return lca->instr;
	}

	/**
	* Returns true if the parent dominates the child in the SSA use graph
	* described at the beginning.
	*/
	bool
	nir_instr_dominates_use(struct nir_use_dominance_state *state,
	nir_instr parent_instr, nir_instr child_instr)
	{
	struct nir_use_dom_node *parent = get_node(state, parent_instr);
	struct nir_use_dom_node *child = get_node(state, child_instr);

	while (parent->index < child->index)
	child = get_imm_dom(state, child);

	return parent == child;
	}

	static void
	print_instr(struct nir_use_dom_node *node)
	{
	if (!node)
	printf("NULL - bug");
	else if (node->index == 0)
	printf("dummy_root");
	else
	nir_print_instr(node->instr, stdout);
	}

	void
	nir_print_use_dominators(struct nir_use_dominance_state *state,
	nir_instr **instructions, unsigned num_instructions)
	{
	for (unsigned i = 0; i < num_instructions; i++) {
	printf("Input idom(\"");
	nir_print_instr(instructions[i], stdout);
	printf("\") = \"");
	print_instr(get_imm_dom(state, get_node(state, instructions[i])));
	printf("\"\n");
	}
	puts("");

	nir_foreach_block(block, state->impl) {
	nir_foreach_instr(instr, block) {
	printf("idom(\"");
	nir_print_instr(instr, stdout);
	printf("\") = \"");
	print_instr(get_imm_dom(state, get_node(state, instr)));
	printf("\"\n");
	}
	}
	puts("");

	for (unsigned i = 0; i < num_instructions; i++) {
	for (unsigned j = i + 1; j < num_instructions; j++) {
	printf("LCA input 1: ");
	nir_print_instr(instructions[i], stdout);
	printf("\nLCA input 2: ");
	nir_print_instr(instructions[j], stdout);
	puts("");
	nir_instr *lca =
	nir_use_dominance_lca(state, instructions[i], instructions[j]);

	if (lca) {
	printf("2 inputs have a common post-dominator: ");
	nir_print_instr(lca, stdout);
	printf("\n");
	}
	puts("");
	}
	}
	}