/* src/freedreno/ir3/ir3_delay.c - platform/external/mesa3d - Git at Google */

 /*
  * Copyright (C) 2019 Google, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  * Authors:
  *    Rob Clark <robclark@freedesktop.org>
  */

 #include "ir3.h"

 /*
  * Helpers to figure out the necessary delay slots between instructions.  Used
  * both in scheduling pass(es) and the final pass to insert any required nop's
  * so that the shader program is valid.
  *
  * Note that this needs to work both pre and post RA, so we can't assume ssa
  * src iterators work.
  */

 /* Decide whether the dependency on src slot 'n' of 'consumer' can be
  * skipped when working out delay slots.
  *
  * False dependencies are generally not counted, since they can just be
  * something like a barrier or an SSBO store.  The one exception is an
  * array dependency: when the assigner is an array write and the
  * consumer reads that same array, the dependency is kept.
  *
  * Returns true if the dependency should be ignored.
  */
 static bool
 ignore_dep(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n)
 {
 	/* a real dependency is never ignored: */
 	if (!__is_false_dep(consumer, n))
 		return false;

 	/* a false dependency on anything but an array write is ignored: */
 	if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
 		return true;

 	struct ir3_register *written = assigner->regs[0];

 	debug_assert(written->flags & IR3_REG_ARRAY);

 	/* keep the dependency if any src of the consumer is the same array: */
 	foreach_src (src, consumer) {
 		if (!(src->flags & IR3_REG_ARRAY))
 			continue;
 		if (written->array.id == src->array.id)
 			return false;
 	}

 	return true;
 }

 /* Calculate the required # of delay slots between the instruction that
  * assigns a value ('assigner') and the one that consumes it ('consumer')
  * through src slot 'n'.
  *
  * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
  * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch are handled
  * with sync bits.
  *
  * If 'soft' is true, an SFU assigner reports 10 slots instead of 0.
  */
 int
 ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n, bool soft)
 {
 	if (ignore_dep(assigner, consumer, n))
 		return 0;

 	if (is_meta(assigner) || is_meta(consumer))
 		return 0;

 	if (writes_addr0(assigner) || writes_addr1(assigner))
 		return 6;

 	/* On a6xx, the number of delay slots it takes to get a SFU result
 	 * back (ie. using nop's instead of (ss)) is:
 	 *
 	 *     8 - single warp
 	 *     9 - two warps
 	 *    10 - four warps
 	 *
 	 * and so on.  Not quite sure where it tapers out (ie. how many
 	 * warps share an SFU unit).  But 10 seems like a reasonable #
 	 * to choose for the soft case; otherwise SFU results are handled
 	 * via sync flags:
 	 */
 	if (is_sfu(assigner))
 		return soft ? 10 : 0;

 	/* also handled via sync flags: */
 	if (is_tex(assigner) || is_mem(assigner))
 		return 0;

 	/* from here on the assigner must be alu: */
 	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
 			is_mem(consumer))
 		return 6;

 	/* special case, 3rd src to cat3 not required on first cycle */
 	if ((n == 3) && (is_mad(consumer->opc) || is_madsh(consumer->opc)))
 		return 1;

 	return 3;
 }

 /* Returns true if instruction 'n' counts as a slot when measuring the
  * distance between an assigner and its consumer.
  */
 static bool
 count_instruction(struct ir3_instruction *n)
 {
 	if (is_alu(n))
 		return true;

 	if (!is_flow(n))
 		return false;

 	/* NOTE: don't count branch/jump since we don't know yet if they will
 	 * be eliminated later in resolve_jumps().. really should do that
 	 * earlier so we don't have this constraint.
 	 */
 	return !((n->opc == OPC_JUMP) || (n->opc == OPC_B));
 }

 /**
  * Count the instruction slots between the end of @block and @instr,
  * clamped to @maxd.
  *
  * @block: the block to search in, starting from end; in first pass,
  *    this will be the block the instruction would be inserted into
  *    (but has not yet, ie. it only contains already scheduled
  *    instructions).  In the second pass, this would be one of the
  *    predecessor blocks.
  * @instr: the instruction to search for
  * @maxd:  max distance, bail after searching this # of instruction
  *    slots, since it means the instruction we are looking for is
  *    far enough away
  * @pred:  if true, recursively search into predecessor blocks to
  *    find the worst case (shortest) distance (only possible after
  *    individual blocks are all scheduled)
  *
  * Returns a value no larger than @maxd.
  */
 static unsigned
 distance(struct ir3_block *block, struct ir3_instruction *instr,
 		unsigned maxd, bool pred)
 {
 	unsigned d = 0;

 	/* Note that this relies on incrementally building up the block's
 	 * instruction list.. but this is how scheduling and nopsched
 	 * work.
 	 */
 	foreach_instr_rev (n, &block->instr_list) {
 		if ((n == instr) || (d >= maxd))
 			return MIN2(maxd, d + n->nop);
 		if (count_instruction(n))
 			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
 	}

 	/* if coming from a predecessor block, assume it is assigned far
 	 * enough away.. we'll fix up later.
 	 */
 	if (!pred)
 		return maxd;

 	/* block->data == block marks a block that is already being visited
 	 * further up this recursion, so don't descend into it again:
 	 */
 	if (block->data == block)
 		return d;

 	/* Search into predecessor blocks, finding the one with the
 	 * shortest distance, since that will be the worst case
 	 */
 	unsigned min = maxd - d;

 	/* (ab)use block->data to prevent recursion: */
 	block->data = block;

 	set_foreach (block->predecessors, entry) {
 		/* named so that it does not shadow the 'pred' flag */
 		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
 		unsigned pd = distance(pred_block, instr, min, pred);

 		min = MIN2(min, pd);
 	}

 	block->data = NULL;

 	return d + min;
 }

 /* Calculate the delay still required before 'consumer' can read src
  * slot 'srcn', given that the value is produced by 'assigner'.  Meta
  * assigners are looked through: the result is the max over the
  * instructions feeding the meta instruction.
  */
 static unsigned
 delay_calc_srcn(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
 		unsigned srcn, bool soft, bool pred)
 {
 	if (!is_meta(assigner)) {
 		/* required slots, minus the slots already between the two: */
 		unsigned needed = ir3_delayslots(assigner, consumer, srcn, soft);

 		return needed - distance(block, assigner, needed, pred);
 	}

 	unsigned worst = 0;

 	foreach_src_n (src, n, assigner) {
 		if (!src->instr)
 			continue;

 		unsigned d = delay_calc_srcn(block, src->instr, consumer,
 				srcn, soft, pred);

 		/* A (rptN) instruction executes in consecutive cycles so
 		 * its outputs are written in successive cycles.  And
 		 * likewise for its (r)'d (incremented) inputs, they are
 		 * read on successive cycles.
 		 *
 		 * So we need to adjust the delay for (rptN)'s assigners
 		 * and consumers accordingly.
 		 *
 		 * Note that the dst of a (rptN) instruction is implicitly
 		 * (r) (the assigner case), although that is not the case
 		 * for src registers.  There is exactly one case, bary.f,
 		 * which has a vecN (collect) src that is not (r)'d.
 		 */
 		if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
 			/* (rptN) assigner case: */
 			d -= MIN2(d, src->instr->repeat - assigner->split.off);
 		} else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
 				(consumer->regs[srcn]->flags & IR3_REG_R)) {
 			/* (rptN) consumer with an (r)'d src: */
 			d -= MIN2(d, n);
 		}

 		worst = MAX2(worst, d);
 	}

 	return worst;
 }

 /* Walk 'block' backwards from its end looking for an instruction whose
  * dst register has array id 'array_id'.  Gives up once 'maxd' counted
  * instructions have been passed.  Returns the instruction, or NULL if
  * none was found within range.
  */
 static struct ir3_instruction *
 find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
 {
 	unsigned seen = 0;

 	/* Note that this relies on incrementally building up the block's
 	 * instruction list.. but this is how scheduling and nopsched
 	 * work.
 	 */
 	foreach_instr_rev (n, &block->instr_list) {
 		if (seen >= maxd)
 			return NULL;

 		if (count_instruction(n))
 			seen++;

 		/* note that a dest reg will never be an immediate */
 		if ((dest_regs(n) != 0) && (n->regs[0]->array.id == array_id))
 			return n;
 	}

 	return NULL;
 }

 /* Like list_length(), but only counts the instructions in 'block' for
  * which count_instruction() is true, ie. the ones that count in the
  * delay determination.
  */
 static unsigned
 count_block_delay(struct ir3_block *block)
 {
 	unsigned counted = 0;

 	foreach_instr (n, &block->instr_list) {
 		if (count_instruction(n))
 			counted++;
 	}

 	return counted;
 }

 /* Calculate the delay required before 'consumer' can read array
  * 'array_id' through src slot 'srcn'.
  *
  * The most recent write to the array is searched for in 'block', looking
  * back over at most 'maxd' counted instructions.  If none is found and
  * 'pred' is true, the search continues into predecessor blocks and the
  * largest delay found there is used, reduced by the number of counted
  * instructions in this block.
  */
 static unsigned
 delay_calc_array(struct ir3_block *block, unsigned array_id,
 		struct ir3_instruction *consumer, unsigned srcn,
 		bool soft, bool pred, unsigned maxd)
 {
 	struct ir3_instruction *assigner;

 	assigner = find_array_write(block, array_id, maxd);
 	if (assigner)
 		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

 	if (!pred)
 		return 0;

 	/* this block alone already covers the whole search window: */
 	unsigned len = count_block_delay(block);
 	if (maxd <= len)
 		return 0;

 	maxd -= len;

 	if (block->data == block) {
 		/* we have a loop, return worst case: */
 		return maxd;
 	}

 	/* If we need to search into predecessors, find the one with the
 	 * max delay.. the resulting delay is that minus the number of
 	 * counted instructions in this block:
 	 */
 	unsigned max = 0;

 	/* (ab)use block->data to prevent recursion: */
 	block->data = block;

 	set_foreach (block->predecessors, entry) {
 		/* named so that it does not shadow the 'pred' flag */
 		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
 		unsigned delay = delay_calc_array(pred_block, array_id, consumer,
 				srcn, soft, pred, maxd);

 		max = MAX2(max, delay);
 	}

 	block->data = NULL;

 	return (max < len) ? 0 : (max - len);
 }

 /**
  * Calculate delay for instruction (maximum of delay for all srcs, and
  * for the address dependency if the instruction has one):
  *
  * @block: the block passed on to the per-src delay helpers
  * @instr: the instruction whose srcs are examined
  * @soft:  If true, add additional delay for situations where they
  *    would not be strictly required because a sync flag would be
  *    used (but scheduler would prefer to schedule some other
  *    instructions first to avoid stalling on sync flag)
  * @pred:  If true, recurse into predecessor blocks
  */
 unsigned
 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 		bool soft, bool pred)
 {
 	unsigned worst = 0;

 	foreach_src_n (src, i, instr) {
 		unsigned d;

 		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
 			/* relative, non-const src: look for a write to its array,
 			 * searching back at most 6 counted instructions
 			 */
 			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
 		} else if (src->instr) {
 			d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
 		} else {
 			d = 0;
 		}

 		worst = MAX2(worst, d);
 	}

 	if (instr->address) {
 		unsigned addr_delay =
 			delay_calc_srcn(block, instr->address, instr, 0, soft, pred);

 		worst = MAX2(worst, addr_delay);
 	}

 	return worst;
 }

 /**
  * Remove nop instructions.  The scheduler can insert placeholder nop's
  * so that ir3_delay_calc() can account for nop's that won't be needed
  * due to nop's triggered by a previous instruction.  However, before
  * legalize, we want to remove these.  The legalize pass can insert
  * some nop's if needed to hold (for example) sync flags.  The final
  * remaining nops are inserted by legalize after this.
  */
 void
 ir3_remove_nops(struct ir3 *ir)
 {
 	foreach_block (block, &ir->block_list) {
 		/* _safe iteration, since entries are unlinked while walking: */
 		foreach_instr_safe (instr, &block->instr_list) {
 			if (instr->opc != OPC_NOP)
 				continue;

 			list_del(&instr->node);
 		}
 	}
 }
	/*
	* Copyright (C) 2019 Google, Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*
	* Authors:
	* Rob Clark <robclark@freedesktop.org>
	*/

	#include "ir3.h"

	/*
	* Helpers to figure out the necessary delay slots between instructions. Used
	* both in scheduling pass(es) and the final pass to insert any required nop's
	* so that the shader program is valid.
	*
	* Note that this needs to work both pre and post RA, so we can't assume ssa
	* src iterators work.
	*/

	/* Decide whether the dependency on src slot 'n' of 'consumer' can be
	 * skipped when working out delay slots.
	 *
	 * False dependencies are generally not counted, since they can just be
	 * something like a barrier or an SSBO store.  The one exception is an
	 * array dependency: when the assigner is an array write and the
	 * consumer reads that same array, the dependency is kept.
	 *
	 * Returns true if the dependency should be ignored.
	 */
	static bool
	ignore_dep(struct ir3_instruction *assigner,
			struct ir3_instruction *consumer, unsigned n)
	{
		/* a real dependency is never ignored: */
		if (!__is_false_dep(consumer, n))
			return false;

		/* a false dependency on anything but an array write is ignored: */
		if (!(assigner->barrier_class & IR3_BARRIER_ARRAY_W))
			return true;

		struct ir3_register *written = assigner->regs[0];

		debug_assert(written->flags & IR3_REG_ARRAY);

		/* keep the dependency if any src of the consumer is the same array: */
		foreach_src (src, consumer) {
			if (!(src->flags & IR3_REG_ARRAY))
				continue;
			if (written->array.id == src->array.id)
				return false;
		}

		return true;
	}

	/* Calculate the required # of delay slots between the instruction that
	 * assigns a value ('assigner') and the one that consumes it ('consumer')
	 * through src slot 'n'.
	 *
	 * Worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
	 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch are handled
	 * with sync bits.
	 *
	 * If 'soft' is true, an SFU assigner reports 10 slots instead of 0.
	 */
	int
	ir3_delayslots(struct ir3_instruction *assigner,
			struct ir3_instruction *consumer, unsigned n, bool soft)
	{
		if (ignore_dep(assigner, consumer, n))
			return 0;

		if (is_meta(assigner) || is_meta(consumer))
			return 0;

		if (writes_addr0(assigner) || writes_addr1(assigner))
			return 6;

		/* On a6xx, the number of delay slots it takes to get a SFU result
		 * back (ie. using nop's instead of (ss)) is:
		 *
		 *     8 - single warp
		 *     9 - two warps
		 *    10 - four warps
		 *
		 * and so on.  Not quite sure where it tapers out (ie. how many
		 * warps share an SFU unit).  But 10 seems like a reasonable #
		 * to choose for the soft case; otherwise SFU results are handled
		 * via sync flags:
		 */
		if (is_sfu(assigner))
			return soft ? 10 : 0;

		/* also handled via sync flags: */
		if (is_tex(assigner) || is_mem(assigner))
			return 0;

		/* from here on the assigner must be alu: */
		if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
				is_mem(consumer))
			return 6;

		/* special case, 3rd src to cat3 not required on first cycle */
		if ((n == 3) && (is_mad(consumer->opc) || is_madsh(consumer->opc)))
			return 1;

		return 3;
	}

	static bool
	count_instruction(struct ir3_instruction *n)
	{
	/* NOTE: don't count branch/jump since we don't know yet if they will
	* be eliminated later in resolve_jumps().. really should do that
	* earlier so we don't have this constraint.
	*/
	return is_alu(n) \|\| (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
	}

	/**
	* @block: the block to search in, starting from end; in first pass,
	* this will be the block the instruction would be inserted into
	* (but has not yet, ie. it only contains already scheduled
	* instructions). For intra-block scheduling (second pass), this
	* would be one of the predecessor blocks.
	* @instr: the instruction to search for
	* @maxd: max distance, bail after searching this # of instruction
	* slots, since it means the instruction we are looking for is
	* far enough away
	* @pred: if true, recursively search into predecessor blocks to
	* find the worst case (shortest) distance (only possible after
	* individual blocks are all scheduled)
	*/
	static unsigned
	distance(struct ir3_block block, struct ir3_instruction instr,
	unsigned maxd, bool pred)
	{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	* instruction list.. but this is how scheduling and nopsched
	* work.
	*/
	foreach_instr_rev (n, &block->instr_list) {
	if ((n == instr) \|\| (d >= maxd))
	return MIN2(maxd, d + n->nop);
	if (count_instruction(n))
	d = MIN2(maxd, d + 1 + n->repeat + n->nop);
	}

	/* if coming from a predecessor block, assume it is assigned far
	* enough away.. we'll fix up later.
	*/
	if (!pred)
	return maxd;

	if (pred && (block->data != block)) {
	/* Search into predecessor blocks, finding the one with the
	* shortest distance, since that will be the worst case
	*/
	unsigned min = maxd - d;

	/* (ab)use block->data to prevent recursion: */
	block->data = block;

	set_foreach (block->predecessors, entry) {
	struct ir3_block pred = (struct ir3_block )entry->key;
	unsigned n;

	n = distance(pred, instr, min, pred);

	min = MIN2(min, n);
	}

	block->data = NULL;
	d += min;
	}

	return d;
	}

	/* Calculate the delay still required before 'consumer' can read src
	 * slot 'srcn', given that the value is produced by 'assigner'.  Meta
	 * assigners are looked through: the result is the max over the
	 * instructions feeding the meta instruction.
	 */
	static unsigned
	delay_calc_srcn(struct ir3_block *block,
			struct ir3_instruction *assigner,
			struct ir3_instruction *consumer,
			unsigned srcn, bool soft, bool pred)
	{
		if (!is_meta(assigner)) {
			/* required slots, minus the slots already between the two: */
			unsigned needed = ir3_delayslots(assigner, consumer, srcn, soft);

			return needed - distance(block, assigner, needed, pred);
		}

		unsigned worst = 0;

		foreach_src_n (src, n, assigner) {
			if (!src->instr)
				continue;

			unsigned d = delay_calc_srcn(block, src->instr, consumer,
					srcn, soft, pred);

			/* A (rptN) instruction executes in consecutive cycles so
			 * its outputs are written in successive cycles.  And
			 * likewise for its (r)'d (incremented) inputs, they are
			 * read on successive cycles.
			 *
			 * So we need to adjust the delay for (rptN)'s assigners
			 * and consumers accordingly.
			 *
			 * Note that the dst of a (rptN) instruction is implicitly
			 * (r) (the assigner case), although that is not the case
			 * for src registers.  There is exactly one case, bary.f,
			 * which has a vecN (collect) src that is not (r)'d.
			 */
			if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
				/* (rptN) assigner case: */
				d -= MIN2(d, src->instr->repeat - assigner->split.off);
			} else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
					(consumer->regs[srcn]->flags & IR3_REG_R)) {
				/* (rptN) consumer with an (r)'d src: */
				d -= MIN2(d, n);
			}

			worst = MAX2(worst, d);
		}

		return worst;
	}

	/* Walk 'block' backwards from its end looking for an instruction whose
	 * dst register has array id 'array_id'.  Gives up once 'maxd' counted
	 * instructions have been passed.  Returns the instruction, or NULL if
	 * none was found within range.
	 */
	static struct ir3_instruction *
	find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
	{
		unsigned seen = 0;

		/* Note that this relies on incrementally building up the block's
		 * instruction list.. but this is how scheduling and nopsched
		 * work.
		 */
		foreach_instr_rev (n, &block->instr_list) {
			if (seen >= maxd)
				return NULL;

			if (count_instruction(n))
				seen++;

			/* note that a dest reg will never be an immediate */
			if ((dest_regs(n) != 0) && (n->regs[0]->array.id == array_id))
				return n;
		}

		return NULL;
	}

	/* Like list_length(), but only counts the instructions in 'block' for
	 * which count_instruction() is true, ie. the ones that count in the
	 * delay determination.
	 */
	static unsigned
	count_block_delay(struct ir3_block *block)
	{
		unsigned counted = 0;

		foreach_instr (n, &block->instr_list) {
			if (count_instruction(n))
				counted++;
		}

		return counted;
	}

	/* Calculate the delay required before 'consumer' can read array
	 * 'array_id' through src slot 'srcn'.
	 *
	 * The most recent write to the array is searched for in 'block', looking
	 * back over at most 'maxd' counted instructions.  If none is found and
	 * 'pred' is true, the search continues into predecessor blocks and the
	 * largest delay found there is used, reduced by the number of counted
	 * instructions in this block.
	 */
	static unsigned
	delay_calc_array(struct ir3_block *block, unsigned array_id,
			struct ir3_instruction *consumer, unsigned srcn,
			bool soft, bool pred, unsigned maxd)
	{
		struct ir3_instruction *assigner;

		assigner = find_array_write(block, array_id, maxd);
		if (assigner)
			return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

		if (!pred)
			return 0;

		/* this block alone already covers the whole search window: */
		unsigned len = count_block_delay(block);
		if (maxd <= len)
			return 0;

		maxd -= len;

		if (block->data == block) {
			/* we have a loop, return worst case: */
			return maxd;
		}

		/* If we need to search into predecessors, find the one with the
		 * max delay.. the resulting delay is that minus the number of
		 * counted instructions in this block:
		 */
		unsigned max = 0;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach (block->predecessors, entry) {
			/* named so that it does not shadow the 'pred' flag */
			struct ir3_block *pred_block = (struct ir3_block *)entry->key;
			unsigned delay = delay_calc_array(pred_block, array_id, consumer,
					srcn, soft, pred, maxd);

			max = MAX2(max, delay);
		}

		block->data = NULL;

		return (max < len) ? 0 : (max - len);
	}

	/**
	 * Calculate delay for instruction (maximum of delay for all srcs, and
	 * for the address dependency if the instruction has one):
	 *
	 * @block: the block passed on to the per-src delay helpers
	 * @instr: the instruction whose srcs are examined
	 * @soft:  If true, add additional delay for situations where they
	 *    would not be strictly required because a sync flag would be
	 *    used (but scheduler would prefer to schedule some other
	 *    instructions first to avoid stalling on sync flag)
	 * @pred:  If true, recurse into predecessor blocks
	 */
	unsigned
	ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
			bool soft, bool pred)
	{
		unsigned worst = 0;

		foreach_src_n (src, i, instr) {
			unsigned d;

			if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
				/* relative, non-const src: look for a write to its array,
				 * searching back at most 6 counted instructions
				 */
				d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
			} else if (src->instr) {
				d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
			} else {
				d = 0;
			}

			worst = MAX2(worst, d);
		}

		if (instr->address) {
			unsigned addr_delay =
				delay_calc_srcn(block, instr->address, instr, 0, soft, pred);

			worst = MAX2(worst, addr_delay);
		}

		return worst;
	}

	/**
	 * Remove nop instructions.  The scheduler can insert placeholder nop's
	 * so that ir3_delay_calc() can account for nop's that won't be needed
	 * due to nop's triggered by a previous instruction.  However, before
	 * legalize, we want to remove these.  The legalize pass can insert
	 * some nop's if needed to hold (for example) sync flags.  The final
	 * remaining nops are inserted by legalize after this.
	 */
	void
	ir3_remove_nops(struct ir3 *ir)
	{
		foreach_block (block, &ir->block_list) {
			/* _safe iteration, since entries are unlinked while walking: */
			foreach_instr_safe (instr, &block->instr_list) {
				if (instr->opc != OPC_NOP)
					continue;

				list_del(&instr->node);
			}
		}
	}