blob: 247ff6ee25064a639cfb5847235686b5fda8e3b7 [file] [log] [blame]
/*
* Copyright (C) 2019 Google, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Rob Clark <robclark@freedesktop.org>
*/
#include "ir3.h"
/*
* Helpers to figure out the necessary delay slots between instructions. Used
* both in scheduling pass(es) and the final pass to insert any required nop's
* so that the shader program is valid.
*
* Note that this needs to work both pre and post RA, so we can't assume ssa
* src iterators work.
*/
/* As a rule, false dependencies do not contribute delay slots, since
 * they only express ordering (e.g. a barrier or SSBO store).  The one
 * exception is an array write whose consumer reads the same array --
 * that must be spaced like a true dependency.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n)
{
	/* true dependencies always count: */
	if (!__is_false_dep(consumer, n))
		return false;

	/* false dep, and assigner doesn't write an array -> ignorable: */
	if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
		return true;

	struct ir3_register *dst = assigner->regs[0];

	debug_assert(dst->flags & IR3_REG_ARRAY);

	/* does the consumer read the array that assigner writes? */
	foreach_src (src, consumer) {
		if (!(src->flags & IR3_REG_ARRAY))
			continue;
		if (dst->array.id == src->array.id)
			return false;
	}

	return true;
}
/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes it
 *
 * @assigner: instruction producing the value
 * @consumer: instruction reading it via src slot @n
 * @n: which src index of @consumer depends on @assigner
 * @soft: if true, also count latency that would otherwise be hidden
 *    behind a sync flag (the scheduler uses this to prefer filling
 *    the gap with other work instead of stalling)
 */
int
ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n, bool soft)
{
	/* false deps (other than matching array accesses) need no delay: */
	if (ignore_dep(assigner, consumer, n))
		return 0;

	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
	 * handled with sync bits
	 */

	/* meta instructions don't become real machine instructions: */
	if (is_meta(assigner) || is_meta(consumer))
		return 0;

	/* address register writes need extra settling time before any
	 * dependent instruction:
	 */
	if (writes_addr0(assigner) || writes_addr1(assigner))
		return 6;

	/* On a6xx, the number of delay slots it takes to get a SFU result
	 * back (ie. using nop's instead of (ss)) is:
	 *
	 *     8 - single warp
	 *     9 - two warps
	 *    10 - four warps
	 *
	 * and so on. Not quite sure where it tapers out (ie. how many
	 * warps share an SFU unit). But 10 seems like a reasonable #
	 * to choose:
	 *
	 * NOTE: this must come before the sync-flag early-out below, so
	 * that soft mode actually reports the SFU latency.
	 */
	if (soft && is_sfu(assigner))
		return 10;

	/* handled via sync flags: */
	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
		return 0;

	/* assigner must be alu: */
	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
			is_mem(consumer)) {
		return 6;
	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
			(n == 3)) {
		/* special case, 3rd src to cat3 not required on first cycle */
		return 1;
	} else {
		return 3;
	}
}
/* Does this instruction occupy a slot for purposes of delay counting? */
static bool
count_instruction(struct ir3_instruction *n)
{
	/* NOTE: don't count branch/jump since we don't know yet if they will
	 * be eliminated later in resolve_jumps().. really should do that
	 * earlier so we don't have this constraint.
	 */
	if (is_alu(n))
		return true;

	if (!is_flow(n))
		return false;

	return (n->opc != OPC_JUMP) && (n->opc != OPC_B);
}
/**
 * Find the distance (in counted instruction slots) from the end of
 * @block back to @instr.
 *
 * @block: the block to search in, starting from end; in first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet, ie. it only contains already scheduled
 *    instructions). For intra-block scheduling (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd: max distance, bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred: if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if ((n == instr) || (d >= maxd))
			return MIN2(maxd, d + n->nop);
		if (count_instruction(n))
			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	/* block->data doubles as a visited flag, so a loop in the CFG
	 * doesn't recurse forever:
	 */
	if (block->data != block) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach (block->predecessors, entry) {
			/* NOTE: renamed from 'pred' -- the old name shadowed the
			 * bool parameter and was then passed as the bool argument,
			 * relying on the non-NULL pointer converting to 'true':
			 */
			struct ir3_block *pred_block = (struct ir3_block *)entry->key;
			unsigned n = distance(pred_block, instr, min, true);
			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
/* calculate delay for specified src, recursing through meta
 * instructions (split/collect) to find the real assigner(s):
 *
 * @block: block to measure distance within (see distance())
 * @assigner: instruction (possibly meta) producing the value
 * @consumer: instruction reading it via src slot @srcn
 * @srcn: which src index of @consumer is being considered
 * @soft/@pred: passed through to ir3_delayslots()/distance()
 */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		/* meta instr: worst case over all of its (real) sources: */
		foreach_src_n (src, n, assigner) {
			unsigned d;

			if (!src->instr)
				continue;

			d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);

			/* A (rptN) instruction executes in consecutive cycles so
			 * its outputs are written in successive cycles. And
			 * likewise for its (r)'d (incremented) inputs, they are
			 * read on successive cycles.
			 *
			 * So we need to adjust the delay for (rptN)'s assigners
			 * and consumers accordingly.
			 *
			 * Note that the dst of a (rptN) instruction is implicitly
			 * (r) (the assigner case), although that is not the case
			 * for src registers. There is exactly one case, bary.f,
			 * which has a vecN (collect) src that is not (r)'d.
			 */
			if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
				/* (rptN) assigner case: */
				d -= MIN2(d, src->instr->repeat - assigner->split.off);
			} else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
					(consumer->regs[srcn]->flags & IR3_REG_R)) {
				/* (rptN) consumer with (r)'d src case: */
				d -= MIN2(d, n);
			}

			delay = MAX2(delay, d);
		}
	} else {
		/* real instr: raw delay minus however many slots already
		 * separate assigner from the end of the block:
		 */
		delay = ir3_delayslots(assigner, consumer, srcn, soft);
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}
/* Search backwards from the end of @block for the most recent write to
 * array @array_id, giving up after @maxd counted instruction slots
 * (returns NULL if none found within range).
 */
static struct ir3_instruction *
find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if (d >= maxd)
			return NULL;
		if (count_instruction(n))
			d++;
		if (dest_regs(n) == 0)
			continue;

		/* note that a dest reg will never be an immediate */
		/* NOTE(review): array.id is compared without first checking
		 * that the dst has IR3_REG_ARRAY set -- presumably a non-array
		 * dst can't alias an array id here, but confirm against the
		 * ir3_register layout; a false match would only overestimate
		 * delay (conservative), not corrupt the program.
		 */
		if (n->regs[0]->array.id == array_id)
			return n;
	}

	return NULL;
}
/* like list_length() but only counts instructions which count in the
 * delay determination:
 */
static unsigned
count_block_delay(struct ir3_block *block)
{
	unsigned cnt = 0;

	foreach_instr (n, &block->instr_list) {
		if (count_instruction(n))
			cnt++;
	}

	return cnt;
}
/* Calculate delay for a relative (array) src: find the most recent
 * write to @array_id and compute the delay from it, recursing into
 * predecessor blocks (bounded by @maxd) if no write is found locally.
 *
 * @srcn: which src index of @consumer is the array access
 * @soft/@pred: passed through to delay_calc_srcn()
 * @maxd: max remaining distance worth searching
 */
static unsigned
delay_calc_array(struct ir3_block *block, unsigned array_id,
		struct ir3_instruction *consumer, unsigned srcn,
		bool soft, bool pred, unsigned maxd)
{
	struct ir3_instruction *assigner;

	assigner = find_array_write(block, array_id, maxd);
	if (assigner)
		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

	if (!pred)
		return 0;

	unsigned len = count_block_delay(block);
	if (maxd <= len)
		return 0;

	maxd -= len;

	if (block->data == block) {
		/* we have a loop, return worst case: */
		return maxd;
	}

	/* If we need to search into predecessors, find the one with the
	 * max delay.. the resulting delay is that minus the number of
	 * counted instructions in this block:
	 */
	unsigned max = 0;

	/* (ab)use block->data to prevent recursion: */
	block->data = block;

	set_foreach (block->predecessors, entry) {
		/* NOTE: renamed from 'pred' -- the old name shadowed the bool
		 * parameter and was then passed as the bool argument, relying
		 * on the non-NULL pointer converting to 'true':
		 */
		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
		unsigned delay =
			delay_calc_array(pred_block, array_id, consumer, srcn, soft,
					true, maxd);
		max = MAX2(max, delay);
	}

	block->data = NULL;

	if (max < len)
		return 0;

	return max - len;
}
/**
 * Calculate delay for instruction (maximum of delay for all srcs):
 *
 * @soft: If true, add additional delay for situations where they
 *    would not be strictly required because a sync flag would be
 *    used (but scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on sync flag)
 * @pred: If true, recurse into predecessor blocks
 */
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;

	foreach_src_n (src, i, instr) {
		unsigned d;

		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
			/* relative (array) access, gpr file only: */
			d = delay_calc_array(block, src->array.id, instr, i + 1,
					soft, pred, 6);
		} else if (src->instr) {
			d = delay_calc_srcn(block, src->instr, instr, i + 1, soft, pred);
		} else {
			d = 0;
		}

		if (d > delay)
			delay = d;
	}

	/* the address register src is tracked separately from regs[]: */
	if (instr->address) {
		unsigned d = delay_calc_srcn(block, instr->address, instr, 0,
				soft, pred);
		if (d > delay)
			delay = d;
	}

	return delay;
}
/**
* Remove nop instructions. The scheduler can insert placeholder nop's
* so that ir3_delay_calc() can account for nop's that won't be needed
* due to nop's triggered by a previous instruction. However, before
* legalize, we want to remove these. The legalize pass can insert
* some nop's if needed to hold (for example) sync flags. This final
* remaining nops are inserted by legalize after this.
*/
void
ir3_remove_nops(struct ir3 *ir)
{
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_NOP) {
list_del(&instr->node);
}
}
}
}