/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'. The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero. For example,
 *
 *    vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *    if ssa_111 {
 *       block block_1:
 *       vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *       ...
 *
 * becomes
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt 0.0, ssa_111
 *    if ssa_112 {
 *       block block_1:
 *       ...
 */

struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};
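
/** Initialize the block queue with empty block and reuse lists. */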
static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}
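
/** Free every block_instructions structure still held by the queue. */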
static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}
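
/**
 * Push a new block_instructions structure onto the stack, reusing a
 * previously freed structure when one is available. Returns NULL on
 * allocation failure.
 */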
static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init(&bi->instructions,
                      sizeof(nir_alu_instr *),
                      8 * sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}
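
/** Remove a block from the stack and save its structure for later reuse. */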
static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}
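
/** Append a candidate comparison instruction to the block's set. */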
static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data =
      u_vector_add(&bi->instructions);

   *data = alu;
}
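
/**
 * Replace a dominating comparison with a comparison of the add result and
 * zero. zero_on_left selects whether the zero becomes the first or the
 * second source of the new comparison.
 */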
static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   void *const mem_ctx = ralloc_parent(orig_cmp);

   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky. The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b). This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add. This is done using
    * information provided by zero_on_left. If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)). If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a. If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions. Later optimization passes
    * will clean these up. This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            nir_src_for_ssa(&mov_cmp->dest.dest.ssa));
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            nir_src_for_ssa(&mov_add->dest.dest.ssa));

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}
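
/**
 * Process a single block, then recurse into the blocks it dominates.
 *
 * Returns true if any comparison was rewritten.
 */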
static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction. If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
    * to the block_instructions::instructions set. If the instruction is an
    * add instruction, walk up the block queue looking at the stored
    * instructions. If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition. All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated by
    * the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[4] = { 0, 0, 0, 0 };

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative. Check all four permutations. The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);
                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);
                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream. Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here. I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten. It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fne:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an if-statement
          * and neither operand is immediate value 0, add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}
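
/**
 * Run the comparison pre-optimization on a single function implementation.
 * Dominance information is required; only block indices and dominance are
 * preserved when the pass makes progress.
 */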
bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
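
/** Run the comparison pre-optimization on every function in the shader. */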
bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}