/*
* Copyright (C) 2020 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "compiler.h"

/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First, suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original component, since the value is scalar). If it is a vector,
 * there are two cases: if component c is `x`, so we are accessing v.x, and
 * each succeeding component y, z... up to the last component of the vector
 * is accessed sequentially, then we may perform the same rewrite. Otherwise,
 * rewriting would require more complex vector features, so we fall back on a
 * move.
 *
 * If the source is not SSA, we also fall back on a move. We could probably
 * do better.
 */
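
/* For instance, the fallback path below lowers a 32-bit
 *
 *      v = combine x, y, z
 *
 * to one move per component into a staging register R,
 *
 *      R.x = mov x
 *      R.y = mov y
 *      R.z = mov z
 *
 * after which uses of v are rewritten to read R (see bi_rewrite_uses). For
 * 16-bit sources, pairs are instead packed with one SELECT per 32-bit
 * destination word.
 *
 * bi_combine_mov32 emits a single such move, copying 32-bit source component
 * comp into word comp of the staging register R. */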
static void
bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp,
                .src = { parent->src[comp] },
                .src_types = { nir_type_uint32 },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}
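
/* Packs two 16-bit sources into 32-bit word comp >> 1 of the staging
 * register R using a SELECT */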
static void
bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction sel = {
                .type = BI_SELECT,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp >> 1,
                .src = { parent->src[comp], parent->src[comp + 1] },
                .src_types = { nir_type_uint16, nir_type_uint16 },
                .swizzle = {
                        { parent->swizzle[comp][0] },
                        { parent->swizzle[comp + 1][0] },
                }
        };

        /* A combine from an odd-sized vector (e.g. vec3) leaves the last
         * SELECT without a second source, so pad with zero */
        if (!sel.src[1])
                sel.src[1] = BIR_INDEX_ZERO;

        bi_emit_before(ctx, parent, sel);
}

/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of O(1).
 * If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */
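
/* A sketch of such bookkeeping (hypothetical, not implemented here): an O(n)
 * pre-pass could record every SSA definition in a defs table indexed by
 * destination, assuming the table is sized to cover the SSA index space:
 *
 *      bi_foreach_instr_global(ctx, ins)
 *              if (ins->dest && !(ins->dest & (BIR_SPECIAL | PAN_IS_REG)))
 *                      defs[ins->dest] = ins;
 *
 * after which each lookup is a single array access. */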
#if 0
static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest == idx)
                        return ins;
        }

        return NULL;
}
#endif

/* Rewrites uses of an index. Again, this could be O(n) to the program but is
 * currently O(nc) to the program and number of combines, so the pass becomes
 * effectively O(n^2). Better bookkeeping would bring it down to linear if
 * that's an issue. */
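/* For example, rewriting uses of (old, oldc = 0) as (new, newc = 2) adds 2 to
 * every swizzle lane, so a use reading old.xy becomes one reading new.zw. */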
static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}

/* Checks if we have a nicely aligned vector prefix */
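/* E.g. a combine whose sources are (a.x, a.y, a.z), with a being the 32-bit,
 * 3-channel destination of io: each source matches io->dest in order, with
 * identity swizzles. */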
#if 0
static bool
bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
                unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        if (!(bi_class_props[io->type] & BI_VECTOR))
                return false;

        if (nir_alu_type_get_type_size(combine->dest_type) != 32)
                return false;

        if (nir_alu_type_get_type_size(io->dest_type) != 32)
                return false;

        unsigned components = io->vector_channels;

        /* Are we contiguous like that? */
        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != io->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}

/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */
        if (!src) return false;
        if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;

        /* We are SSA. Look up the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src);

        if (!parent) return false;

        /* We have a parent instruction; sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes) return false;

        bool scalar = parent->vector_channels == 0;
        if (!(scalar || bi_is_aligned_vec32(ins, s, parent, vec_count))) return false;

        if (!bi_shift_mask(parent, bytes * s)) return false;

        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}
#endif
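
/* Lowers every BI_COMBINE in the block into moves (or 16-bit SELECTs) that
 * assemble the vector in a staging register, rewriting uses of SSA combine
 * destinations to read that register instead */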
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE) continue;

                bool needs_rewrite = !(ins->dest & PAN_IS_REG);
                unsigned R = needs_rewrite ? bi_make_temp_reg(ctx) : ins->dest;
                unsigned sz = nir_alu_type_get_type_size(ins->dest_type);

                bi_foreach_src(ins, s) {
                        /* We're done early for vec2/3 */
                        if (!ins->src[s])
                                continue;

#if 0
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
#endif
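                        /* Fallback: lower one component at a time. A 32-bit
                         * component gets its own MOV; 16-bit components are
                         * packed in pairs, one SELECT per 32-bit destination
                         * word, so two sources are consumed at once */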
                        if (sz == 32)
                                bi_combine_mov32(ctx, ins, s, R);
                        else if (sz == 16) {
                                bi_combine_sel16(ctx, ins, s, R);
                                s++;
                        } else {
                                unreachable("Unknown COMBINE size");
                        }
                }

                if (needs_rewrite)
                        bi_rewrite_uses(ctx, ins->dest, 0, R, 0);

                bi_remove_instruction(ins);
        }
}