/*
* Copyright (C) 2020 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#include "compiler.h"
#include "bi_print.h"
#include "panfrost/util/lcra.h"
#include "util/u_memory.h"
static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l)
{
bi_compute_liveness(ctx);
bi_foreach_block(ctx, _blk) {
bi_block *blk = (bi_block *) _blk;
uint16_t *live = mem_dup(_blk->live_out, l->node_count * sizeof(uint16_t));
bi_foreach_instr_in_block_rev(blk, ins) {
/* Mark all registers live after the instruction as
* interfering with the destination */
if (ins->dest && (ins->dest < l->node_count)) {
for (unsigned i = 1; i < l->node_count; ++i) {
if (live[i])
lcra_add_node_interference(l, ins->dest, bi_writemask(ins), i, live[i]);
}
}
/* Update live_in */
bi_liveness_ins_update(live, ins, l->node_count);
}
free(live);
}
}
enum bi_reg_class {
        BI_REG_CLASS_WORK = 0,
};
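
/* Set up and solve register allocation with LCRA. Allocation is in units of
 * bytes within the 64 x 32-bit work register file (R0-R63); blend shaders
 * additionally keep R0-R3 reserved for the blend input. Each SSA destination
 * is given 32-bit (4-byte) alignment and restricted to a 4-byte range, i.e.
 * a single register. */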
static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success)
{
unsigned node_count = bi_max_temp(ctx);
struct lcra_state *l =
lcra_alloc_equations(node_count, 1);
if (ctx->is_blend) {
/* R0-R3 are reserved for the blend input */
l->class_start[BI_REG_CLASS_WORK] = 4 * 4;
l->class_size[BI_REG_CLASS_WORK] = 64 * 4;
} else {
/* R0 - R63, all 32-bit */
l->class_start[BI_REG_CLASS_WORK] = 0;
l->class_size[BI_REG_CLASS_WORK] = 64 * 4;
}
bi_foreach_instr_global(ctx, ins) {
unsigned dest = ins->dest;
if (!dest || (dest >= node_count))
continue;
l->class[dest] = BI_REG_CLASS_WORK;
lcra_set_alignment(l, dest, 2, 16); /* 2^2 = 4 */
lcra_restrict_range(l, dest, 4);
}
bi_compute_interference(ctx, l);
*success = lcra_solve(l);
return l;
}
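
/* Map an index back through register allocation. LCRA solutions are byte
 * offsets into the register file, so dividing by 4 yields the 32-bit
 * register (e.g. a solution of 20 is byte 20, i.e. R5); the offset parameter
 * selects a following word within the allocation for vector accesses, and
 * the result is tagged with BIR_INDEX_REGISTER. Indices that never went
 * through RA, or that LCRA left unsolved, are returned unchanged. */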
static unsigned
bi_reg_from_index(struct lcra_state *l, unsigned index, unsigned offset)
{
/* Did we run RA for this index at all? */
if (index >= l->node_count)
return index;
/* LCRA didn't bother solving this index (how lazy!) */
signed solution = l->solutions[index];
if (solution < 0)
return index;
assert((solution & 0x3) == 0);
unsigned reg = solution / 4;
reg += offset;
return BIR_INDEX_REGISTER | reg;
}
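
/* Rewrite a source to its allocated register. For non-vector sources the
 * swizzle doubles as a component select: components are grouped into 32-bit
 * words, the word index becomes a register offset, and the swizzle is
 * reduced to a position within that word. Illustrative example: a 16-bit
 * source whose swizzle selects components 2 and 3 lies entirely in the next
 * 32-bit word, so the offset becomes 1 and the swizzle becomes 0, 1. A
 * swizzle may not straddle a register boundary (asserted below). */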
static void
bi_adjust_src_ra(bi_instruction *ins, struct lcra_state *l, unsigned src)
{
if (ins->src[src] >= l->node_count)
return;
bool vector = (bi_class_props[ins->type] & BI_VECTOR) && src == 0;
unsigned offset = 0;
if (vector) {
/* TODO: Do we do anything here? */
} else {
/* Use the swizzle as component select */
unsigned components = bi_get_component_count(ins, src);
nir_alu_type T = ins->src_types[src];
unsigned size = nir_alu_type_get_type_size(T);
/* TODO: 64-bit? */
unsigned components_per_word = MAX2(32 / size, 1);
for (unsigned i = 0; i < components; ++i) {
unsigned off = ins->swizzle[src][i] / components_per_word;
/* We can't cross register boundaries in a swizzle */
if (i == 0)
offset = off;
else
assert(off == offset);
ins->swizzle[src][i] %= components_per_word;
}
}
ins->src[src] = bi_reg_from_index(l, ins->src[src], offset);
}
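
/* Rewrite a destination to its allocated register, folding the pending
 * dest_offset into the register number and clearing it. */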
static void
bi_adjust_dest_ra(bi_instruction *ins, struct lcra_state *l)
{
if (ins->dest >= l->node_count)
return;
ins->dest = bi_reg_from_index(l, ins->dest, ins->dest_offset);
ins->dest_offset = 0;
}
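
/* Rewrite every destination and source in the program from RA indices to
 * physical registers. */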
static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
bi_foreach_instr_global(ctx, ins) {
bi_adjust_dest_ra(ins, l);
bi_foreach_src(ins, s)
bi_adjust_src_ra(ins, l, s);
}
}
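
/* Replace any source of a single instruction reading `old` so that it reads
 * `new` instead (used below to redirect uses to a fill). */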
static void
bi_rewrite_index_src_single(bi_instruction *ins, unsigned old, unsigned new)
{
bi_foreach_src(ins, i) {
if (ins->src[i] == old)
ins->src[i] = new;
}
}
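
/* Construct the spill/fill instructions: a store to (respectively load from)
 * thread-local storage of `channels` 32-bit words at the given byte offset.
 * The 64-bit offset is passed as an inline constant, with the two address
 * sources taken from its low and high halves (BIR_INDEX_CONSTANT and
 * BIR_INDEX_CONSTANT | 32). */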
static bi_instruction
bi_spill(unsigned node, uint64_t offset, unsigned channels)
{
bi_instruction store = {
.type = BI_STORE,
.segment = BI_SEGMENT_TLS,
.vector_channels = channels,
.src = {
node,
BIR_INDEX_CONSTANT,
BIR_INDEX_CONSTANT | 32,
},
.src_types = {
nir_type_uint32,
nir_type_uint32,
nir_type_uint32
},
.constant = { .u64 = offset },
};
return store;
}
static bi_instruction
bi_fill(unsigned node, uint64_t offset, unsigned channels)
{
bi_instruction load = {
.type = BI_LOAD,
.segment = BI_SEGMENT_TLS,
.vector_channels = channels,
.dest = node,
.dest_type = nir_type_uint32,
.src = {
BIR_INDEX_CONSTANT,
BIR_INDEX_CONSTANT | 32,
},
.src_types = {
nir_type_uint32,
nir_type_uint32
},
.constant = { .u64 = offset },
};
return load;
}
/* Get the single instruction in a singleton clause. Precondition: clause
* contains exactly 1 instruction.
*
* More complex scheduling implies tougher constraints on spilling. We'll cross
* that bridge when we get to it. For now, just grab the one and only
* instruction in the clause */
static bi_instruction *
bi_unwrap_singleton(bi_clause *clause)
{
assert(clause->bundle_count == 1);
assert((clause->bundles[0].fma != NULL) ^ (clause->bundles[0].add != NULL));
return clause->bundles[0].fma ? clause->bundles[0].fma
: clause->bundles[0].add;
}
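
/* Wrap a new instruction in a singleton clause of its own and splice it in
 * immediately before or after the cursor clause, keeping the block's clause
 * list and flat instruction list in sync. */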
static inline void
bi_insert_singleton(void *memctx, bi_clause *cursor, bi_block *block,
bi_instruction ins, bool before)
{
bi_instruction *uins = rzalloc(memctx, bi_instruction);
memcpy(uins, &ins, sizeof(ins));
/* Get the instruction to pivot around. This should be the first/last
 * instruction of the clause depending on the before setting; the two
 * coincide for singletons */
bi_instruction *cursor_ins = bi_unwrap_singleton(cursor);
bi_clause *clause = bi_make_singleton(memctx, uins,
block, 0, (1 << 0), true);
if (before) {
list_addtail(&clause->link, &cursor->link);
list_addtail(&uins->link, &cursor_ins->link);
} else {
list_add(&clause->link, &cursor->link);
list_add(&uins->link, &cursor_ins->link);
}
}
/* If register allocation fails, find the best spill node */
static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
/* Pick a node satisfying bi_spill_register's preconditions */
bi_foreach_instr_global(ctx, ins) {
if (ins->no_spill)
lcra_set_node_spill_cost(l, ins->dest, -1);
}
for (unsigned i = PAN_IS_REG; i < l->node_count; i += 2)
lcra_set_node_spill_cost(l, i, -1);
return lcra_get_best_spill_node(l);
}
/* Once we've chosen a spill node, spill it. Precondition: node is a valid
* SSA node in the non-optimized scheduled IR that was not already
* spilled (enforced by bi_choose_spill_node). Returns bytes spilled */
static unsigned
bi_spill_register(bi_context *ctx, unsigned node, unsigned offset)
{
assert(!(node & PAN_IS_REG));
unsigned channels = 1;
/* Spill after every write of the node: retarget the definition to a fresh
 * no-spill temporary and store that temporary out to TLS */
bi_foreach_block(ctx, _block) {
bi_block *block = (bi_block *) _block;
bi_foreach_clause_in_block_safe(block, clause) {
bi_instruction *ins = bi_unwrap_singleton(clause);
if (ins->dest != node) continue;
ins->dest = bi_make_temp(ctx);
ins->no_spill = true;
channels = MAX2(channels, ins->vector_channels);
bi_instruction st = bi_spill(ins->dest, offset, channels);
bi_insert_singleton(ctx, clause, block, st, false);
ctx->spills++;
}
}
/* Fill before every use */
bi_foreach_block(ctx, _block) {
bi_block *block = (bi_block *) _block;
bi_foreach_clause_in_block_safe(block, clause) {
bi_instruction *ins = bi_unwrap_singleton(clause);
if (!bi_has_arg(ins, node)) continue;
/* Don't rewrite spills themselves */
if (ins->segment == BI_SEGMENT_TLS) continue;
unsigned index = bi_make_temp(ctx);
bi_instruction ld = bi_fill(index, offset, channels);
ld.no_spill = true;
bi_insert_singleton(ctx, clause, block, ld, true);
/* Rewrite to use */
bi_rewrite_index_src_single(ins, node, index);
ctx->fills++;
}
}
return (channels * 4);
}
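
/* Top-level register allocation loop: try an LCRA solve; if it fails, pick
 * the best spill candidate, spill it to TLS, recompute liveness, and retry
 * up to a fixed iteration cap. The solution is then written back onto the
 * IR as physical registers. */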
void
bi_register_allocate(bi_context *ctx)
{
struct lcra_state *l = NULL;
bool success = false;
unsigned iter_count = 100; /* max iterations */
/* Number of bytes of memory we've spilled into */
unsigned spill_count = 0;
/* For instructions that both read from and write to a data register, it
 * must be the *same* data register, so we enforce that constraint up front
 * with a quick rewrite. TODO: are there cases where this leaves RA with no
 * solution due to copyprop? */
bi_foreach_instr_global(ctx, ins) {
unsigned props = bi_class_props[ins->type];
unsigned both = BI_DATA_REG_SRC | BI_DATA_REG_DEST;
if ((props & both) != both) continue;
bi_rewrite_uses(ctx, ins->dest, 0, ins->src[0], 0);
ins->dest = ins->src[0];
}
do {
if (l) {
signed spill_node = bi_choose_spill_node(ctx, l);
lcra_free(l);
l = NULL;
if (spill_node == -1)
unreachable("Failed to choose spill node\n");
spill_count += bi_spill_register(ctx, spill_node, spill_count);
}
bi_invalidate_liveness(ctx);
l = bi_allocate_registers(ctx, &success);
} while(!success && ((iter_count--) > 0));
assert(success);
bi_install_registers(ctx, l);
lcra_free(l);
}