| /* |
| * Copyright © 2018 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir_instr_set.h" |
| #include "nir_search_helpers.h" |
| #include "nir_builder.h" |
| #include "util/u_vector.h" |
| |
| /* Partial redundancy elimination of compares |
| * |
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
| * instructions like 'b - a'. The comparison is replaced by the arithmetic |
| * instruction, and the result is compared with zero. For example, |
| * |
| * vec1 32 ssa_111 = flt 0.37, ssa_110.w |
| * if ssa_111 { |
| * block block_1: |
| * vec1 32 ssa_112 = fadd ssa_110.w, -0.37 |
| * ... |
| * |
| * becomes |
| * |
| * vec1 32 ssa_111 = fadd ssa_110.w, -0.37 |
| * vec1 32 ssa_112 = flt 0.0, ssa_111 |
| * if ssa_112 { |
| * block block_1: |
| * ... |
| */ |
| |
struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree: comparison_pre_block
    * pushes a node on entry to each block and pops it after recursing into
    * the block's dominated children, so every node on the stack dominates
    * the block currently being processed.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};
| |
struct block_instructions {
   /* Link in either block_queue::blocks or block_queue::reusable_blocks. */
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    *
    * Stores nir_alu_instr* elements; a slot is set to NULL once its
    * comparison has been rewritten so it is skipped on later scans.
    */
   struct u_vector instructions;
};
| |
| static void |
| block_queue_init(struct block_queue *bq) |
| { |
| exec_list_make_empty(&bq->blocks); |
| exec_list_make_empty(&bq->reusable_blocks); |
| } |
| |
| static void |
| block_queue_finish(struct block_queue *bq) |
| { |
| struct block_instructions *n; |
| |
| while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) { |
| u_vector_finish(&n->instructions); |
| free(n); |
| } |
| |
| while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) { |
| free(n); |
| } |
| } |
| |
| static struct block_instructions * |
| push_block(struct block_queue *bq) |
| { |
| struct block_instructions *bi = |
| (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks); |
| |
| if (bi == NULL) { |
| bi = calloc(1, sizeof(struct block_instructions)); |
| |
| if (bi == NULL) |
| return NULL; |
| } |
| |
| if (!u_vector_init(&bi->instructions, |
| sizeof(nir_alu_instr *), |
| 8 * sizeof(nir_alu_instr *))) { |
| free(bi); |
| return NULL; |
| } |
| |
| exec_list_push_tail(&bq->blocks, &bi->node); |
| |
| return bi; |
| } |
| |
| static void |
| pop_block(struct block_queue *bq, struct block_instructions *bi) |
| { |
| u_vector_finish(&bi->instructions); |
| exec_node_remove(&bi->node); |
| exec_list_push_head(&bq->reusable_blocks, &bi->node); |
| } |
| |
| static void |
| add_instruction_for_block(struct block_instructions *bi, |
| nir_alu_instr *alu) |
| { |
| nir_alu_instr **data = |
| u_vector_add(&bi->instructions); |
| |
| *data = alu; |
| } |
| |
/**
 * Replace a dominating comparison and a dominated add with a single add
 * followed by a comparison with zero.
 *
 * \param orig_cmp      The dominating comparison (e.g., flt a, b).
 * \param orig_add      The dominated fadd that matches the comparison's
 *                      operands (possibly with negations).
 * \param zero_on_left  True when the rewritten comparison should be
 *                      (cmp 0, add); false for (cmp add, 0).
 *
 * Both original instructions are removed after all of their uses are
 * rewritten to the new instructions.
 */
static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   void *const mem_ctx = ralloc_parent(orig_cmp);

   /* All new instructions are inserted at the comparison's location, which
    * dominates the original add, so the add's uses remain valid.
    */
   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            nir_src_for_ssa(&mov_cmp->dest.dest.ssa));
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            nir_src_for_ssa(&mov_add->dest.dest.ssa));

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}
| |
/**
 * Process one block and, recursively, all blocks it dominates.
 *
 * \return true if any compare/add pair was rewritten in this block or any
 *         dominated block.
 */
static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   /* Push a node for this block; it records this block's candidate
    * comparisons while the dominated children are processed.
    */
   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
    * to the block_instructions::instructions set.  If the instruction is an
    * add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated by
    * the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      /* Only scalar, unsaturated results can be matched against the
       * comparison pattern.
       */
      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      /* Identity swizzle used for the const-zero source checks below. */
      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         /* Walk from the outermost dominator toward the current block; the
          * loop ends when the list sentinel (next == NULL) is reached.
          */
         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               /* NULL marks a comparison that was already rewritten. */
               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten.  It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an if-statement
          * and neither operand is immediate value 0, add it to the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   /* Recurse into the dominator-tree children; this block's candidates stay
    * on the stack and are visible to them.
    */
   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}
| |
| bool |
| nir_opt_comparison_pre_impl(nir_function_impl *impl) |
| { |
| struct block_queue bq; |
| nir_builder bld; |
| |
| block_queue_init(&bq); |
| nir_builder_init(&bld, impl); |
| |
| nir_metadata_require(impl, nir_metadata_dominance); |
| |
| const bool progress = |
| comparison_pre_block(nir_start_block(impl), &bq, &bld); |
| |
| block_queue_finish(&bq); |
| |
| if (progress) { |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } else { |
| nir_metadata_preserve(impl, nir_metadata_all); |
| } |
| |
| return progress; |
| } |
| |
| bool |
| nir_opt_comparison_pre(nir_shader *shader) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(function, shader) { |
| if (function->impl) |
| progress |= nir_opt_comparison_pre_impl(function->impl); |
| } |
| |
| return progress; |
| } |