| /* |
| * Copyright (C) 2019 Google, Inc. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| * Authors: |
| * Rob Clark <robclark@freedesktop.org> |
| */ |
| |
| #include "ir3.h" |
| |
| /* |
| * Helpers to figure out the necessary delay slots between instructions. Used |
| * both in scheduling pass(es) and the final pass to insert any required nop's |
| * so that the shader program is valid. |
| * |
| * Note that this needs to work both pre and post RA, so we can't assume ssa |
| * src iterators work. |
| */ |
| |
| /* generally don't count false dependencies, since this can just be |
| * something like a barrier, or SSBO store. The exception is array |
| * dependencies if the assigner is an array write and the consumer |
| * reads the same array. |
| */ |
| static bool |
| ignore_dep(struct ir3_instruction *assigner, |
| struct ir3_instruction *consumer, unsigned n) |
| { |
| if (!__is_false_dep(consumer, n)) |
| return false; |
| |
| if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) { |
| struct ir3_register *dst = assigner->regs[0]; |
| |
| debug_assert(dst->flags & IR3_REG_ARRAY); |
| |
| foreach_src (src, consumer) { |
| if ((src->flags & IR3_REG_ARRAY) && |
| (dst->array.id == src->array.id)) { |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| /* calculate required # of delay slots between the instruction that |
| * assigns a value and the one that consumes |
| */ |
| int |
| ir3_delayslots(struct ir3_instruction *assigner, |
| struct ir3_instruction *consumer, unsigned n, bool soft) |
| { |
| if (ignore_dep(assigner, consumer, n)) |
| return 0; |
| |
| /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal |
| * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch |
| * handled with sync bits |
| */ |
| |
| if (is_meta(assigner) || is_meta(consumer)) |
| return 0; |
| |
| if (writes_addr0(assigner) || writes_addr1(assigner)) |
| return 6; |
| |
| /* On a6xx, it takes the number of delay slots to get a SFU result |
| * back (ie. using nop's instead of (ss) is: |
| * |
| * 8 - single warp |
| * 9 - two warps |
| * 10 - four warps |
| * |
| * and so on. Not quite sure where it tapers out (ie. how many |
| * warps share an SFU unit). But 10 seems like a reasonable # |
| * to choose: |
| */ |
| if (soft && is_sfu(assigner)) |
| return 10; |
| |
| /* handled via sync flags: */ |
| if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) |
| return 0; |
| |
| /* assigner must be alu: */ |
| if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) || |
| is_mem(consumer)) { |
| return 6; |
| } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && |
| (n == 3)) { |
| /* special case, 3rd src to cat3 not required on first cycle */ |
| return 1; |
| } else { |
| return 3; |
| } |
| } |
| |
| static bool |
| count_instruction(struct ir3_instruction *n) |
| { |
| /* NOTE: don't count branch/jump since we don't know yet if they will |
| * be eliminated later in resolve_jumps().. really should do that |
| * earlier so we don't have this constraint. |
| */ |
| return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B)); |
| } |
| |
| /** |
| * @block: the block to search in, starting from end; in first pass, |
| * this will be the block the instruction would be inserted into |
| * (but has not yet, ie. it only contains already scheduled |
| * instructions). For intra-block scheduling (second pass), this |
| * would be one of the predecessor blocks. |
| * @instr: the instruction to search for |
| * @maxd: max distance, bail after searching this # of instruction |
| * slots, since it means the instruction we are looking for is |
| * far enough away |
| * @pred: if true, recursively search into predecessor blocks to |
| * find the worst case (shortest) distance (only possible after |
| * individual blocks are all scheduled) |
| */ |
| static unsigned |
| distance(struct ir3_block *block, struct ir3_instruction *instr, |
| unsigned maxd, bool pred) |
| { |
| unsigned d = 0; |
| |
| /* Note that this relies on incrementally building up the block's |
| * instruction list.. but this is how scheduling and nopsched |
| * work. |
| */ |
| foreach_instr_rev (n, &block->instr_list) { |
| if ((n == instr) || (d >= maxd)) |
| return MIN2(maxd, d + n->nop); |
| if (count_instruction(n)) |
| d = MIN2(maxd, d + 1 + n->repeat + n->nop); |
| } |
| |
| /* if coming from a predecessor block, assume it is assigned far |
| * enough away.. we'll fix up later. |
| */ |
| if (!pred) |
| return maxd; |
| |
| if (pred && (block->data != block)) { |
| /* Search into predecessor blocks, finding the one with the |
| * shortest distance, since that will be the worst case |
| */ |
| unsigned min = maxd - d; |
| |
| /* (ab)use block->data to prevent recursion: */ |
| block->data = block; |
| |
| set_foreach (block->predecessors, entry) { |
| struct ir3_block *pred = (struct ir3_block *)entry->key; |
| unsigned n; |
| |
| n = distance(pred, instr, min, pred); |
| |
| min = MIN2(min, n); |
| } |
| |
| block->data = NULL; |
| d += min; |
| } |
| |
| return d; |
| } |
| |
| /* calculate delay for specified src: */ |
| static unsigned |
| delay_calc_srcn(struct ir3_block *block, |
| struct ir3_instruction *assigner, |
| struct ir3_instruction *consumer, |
| unsigned srcn, bool soft, bool pred) |
| { |
| unsigned delay = 0; |
| |
| if (is_meta(assigner)) { |
| foreach_src_n (src, n, assigner) { |
| unsigned d; |
| |
| if (!src->instr) |
| continue; |
| |
| d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred); |
| |
| /* A (rptN) instruction executes in consecutive cycles so |
| * it's outputs are written in successive cycles. And |
| * likewise for it's (r)'d (incremented) inputs, they are |
| * read on successive cycles. |
| * |
| * So we need to adjust the delay for (rptN)'s assigners |
| * and consumers accordingly. |
| * |
| * Note that the dst of a (rptN) instruction is implicitly |
| * (r) (the assigner case), although that is not the case |
| * for src registers. There is exactly one case, bary.f, |
| * which has a vecN (collect) src that is not (r)'d. |
| */ |
| if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) { |
| /* (rptN) assigner case: */ |
| d -= MIN2(d, src->instr->repeat - assigner->split.off); |
| } else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat && |
| (consumer->regs[srcn]->flags & IR3_REG_R)) { |
| d -= MIN2(d, n); |
| } |
| |
| delay = MAX2(delay, d); |
| } |
| } else { |
| delay = ir3_delayslots(assigner, consumer, srcn, soft); |
| delay -= distance(block, assigner, delay, pred); |
| } |
| |
| return delay; |
| } |
| |
| static struct ir3_instruction * |
| find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd) |
| { |
| unsigned d = 0; |
| |
| /* Note that this relies on incrementally building up the block's |
| * instruction list.. but this is how scheduling and nopsched |
| * work. |
| */ |
| foreach_instr_rev (n, &block->instr_list) { |
| if (d >= maxd) |
| return NULL; |
| if (count_instruction(n)) |
| d++; |
| if (dest_regs(n) == 0) |
| continue; |
| |
| /* note that a dest reg will never be an immediate */ |
| if (n->regs[0]->array.id == array_id) |
| return n; |
| } |
| |
| return NULL; |
| } |
| |
| /* like list_length() but only counts instructions which count in the |
| * delay determination: |
| */ |
| static unsigned |
| count_block_delay(struct ir3_block *block) |
| { |
| unsigned delay = 0; |
| foreach_instr (n, &block->instr_list) { |
| if (!count_instruction(n)) |
| continue; |
| delay++; |
| } |
| return delay; |
| } |
| |
| static unsigned |
| delay_calc_array(struct ir3_block *block, unsigned array_id, |
| struct ir3_instruction *consumer, unsigned srcn, |
| bool soft, bool pred, unsigned maxd) |
| { |
| struct ir3_instruction *assigner; |
| |
| assigner = find_array_write(block, array_id, maxd); |
| if (assigner) |
| return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred); |
| |
| if (!pred) |
| return 0; |
| |
| unsigned len = count_block_delay(block); |
| if (maxd <= len) |
| return 0; |
| |
| maxd -= len; |
| |
| if (block->data == block) { |
| /* we have a loop, return worst case: */ |
| return maxd; |
| } |
| |
| /* If we need to search into predecessors, find the one with the |
| * max delay.. the resulting delay is that minus the number of |
| * counted instructions in this block: |
| */ |
| unsigned max = 0; |
| |
| /* (ab)use block->data to prevent recursion: */ |
| block->data = block; |
| |
| set_foreach (block->predecessors, entry) { |
| struct ir3_block *pred = (struct ir3_block *)entry->key; |
| unsigned delay = |
| delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd); |
| |
| max = MAX2(max, delay); |
| } |
| |
| block->data = NULL; |
| |
| if (max < len) |
| return 0; |
| |
| return max - len; |
| } |
| |
| /** |
| * Calculate delay for instruction (maximum of delay for all srcs): |
| * |
| * @soft: If true, add additional delay for situations where they |
| * would not be strictly required because a sync flag would be |
| * used (but scheduler would prefer to schedule some other |
| * instructions first to avoid stalling on sync flag) |
| * @pred: If true, recurse into predecessor blocks |
| */ |
| unsigned |
| ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr, |
| bool soft, bool pred) |
| { |
| unsigned delay = 0; |
| |
| foreach_src_n (src, i, instr) { |
| unsigned d = 0; |
| |
| if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) { |
| d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6); |
| } else if (src->instr) { |
| d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred); |
| } |
| |
| delay = MAX2(delay, d); |
| } |
| |
| if (instr->address) { |
| unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred); |
| delay = MAX2(delay, d); |
| } |
| |
| return delay; |
| } |
| |
| /** |
| * Remove nop instructions. The scheduler can insert placeholder nop's |
| * so that ir3_delay_calc() can account for nop's that won't be needed |
| * due to nop's triggered by a previous instruction. However, before |
| * legalize, we want to remove these. The legalize pass can insert |
| * some nop's if needed to hold (for example) sync flags. This final |
| * remaining nops are inserted by legalize after this. |
| */ |
| void |
| ir3_remove_nops(struct ir3 *ir) |
| { |
| foreach_block (block, &ir->block_list) { |
| foreach_instr_safe (instr, &block->instr_list) { |
| if (instr->opc == OPC_NOP) { |
| list_del(&instr->node); |
| } |
| } |
| } |
| |
| } |