/*
* Copyright (C) 2020 Collabora, Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "compiler.h"

/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First, suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original component, since the value is scalar). If it is a vector,
 * there are two cases: if component c is `x`, so we are accessing v.x, and
 * each succeeding component y, z... up to the last component of the vector
 * is accessed sequentially, then we may perform the same rewrite. Otherwise,
 * rewriting would require more complex vector features, so we fall back on a
 * move.
 *
 * If the source is not SSA, we also fall back on a move. We could probably
 * do better.
 */
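
/* For instance, the fallback path below lowers a 32-bit
 *
 *      v = combine x, y, z
 *
 * to one move per component into a staging register R,
 *
 *      R.x = mov x
 *      R.y = mov y
 *      R.z = mov z
 *
 * after which uses of v are rewritten to read R (see bi_rewrite_uses). For
 * 16-bit sources, pairs are instead packed with one SELECT per 32-bit
 * destination word.
 *
 * bi_combine_mov32 emits a single such move, copying 32-bit source component
 * comp into word comp of the staging register R. */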
static void
bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp,
                .src = { parent->src[comp] },
                .src_types = { nir_type_uint32 },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}
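
/* Packs two 16-bit sources into 32-bit word comp >> 1 of the staging
 * register R using a SELECT */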
static void
bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction sel = {
                .type = BI_SELECT,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp >> 1,
                .src = { parent->src[comp], parent->src[comp + 1] },
                .src_types = { nir_type_uint16, nir_type_uint16 },
                .swizzle = {
                        { parent->swizzle[comp][0] },
                        { parent->swizzle[comp + 1][0] },
                }
        };

        /* A combine from an odd-sized vector (e.g. vec3) leaves the last
         * SELECT without a second source, so pad with zero */
        if (!sel.src[1])
                sel.src[1] = BIR_INDEX_ZERO;

        bi_emit_before(ctx, parent, sel);
}

/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of O(1).
 * If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */
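
/* A sketch of such bookkeeping (hypothetical, not implemented here): an O(n)
 * pre-pass could record every SSA definition in a defs table indexed by
 * destination, assuming the table is sized to cover the SSA index space:
 *
 *      bi_foreach_instr_global(ctx, ins)
 *              if (ins->dest && !(ins->dest & (BIR_SPECIAL | PAN_IS_REG)))
 *                      defs[ins->dest] = ins;
 *
 * after which each lookup is a single array access. */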
#if 0
static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest == idx)
                        return ins;
        }

        return NULL;
}
#endif

/* Rewrites uses of an index. Again, this could be O(n) to the program but is
 * currently O(nc) to the program and number of combines, so the pass becomes
 * effectively O(n^2). Better bookkeeping would bring it down to linear if
 * that's an issue. */
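/* For example, rewriting uses of (old, oldc = 0) as (new, newc = 2) adds 2 to
 * every swizzle lane, so a use reading old.xy becomes one reading new.zw. */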
static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}

/* Checks if we have a nicely aligned vector prefix */
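/* E.g. a combine whose sources are (a.x, a.y, a.z), with a being the 32-bit,
 * 3-channel destination of io: each source matches io->dest in order, with
 * identity swizzles. */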
#if 0
static bool
bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
                unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        if (!(bi_class_props[io->type] & BI_VECTOR))
                return false;

        if (nir_alu_type_get_type_size(combine->dest_type) != 32)
                return false;

        if (nir_alu_type_get_type_size(io->dest_type) != 32)
                return false;

        unsigned components = io->vector_channels;

        /* Are we contiguous like that? */
        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != io->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}

/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */
        if (!src) return false;
        if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;

        /* We are SSA. Look up the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src);

        if (!parent) return false;

        /* We have a parent instruction; sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes) return false;

        bool scalar = parent->vector_channels == 0;
        if (!(scalar || bi_is_aligned_vec32(ins, s, parent, vec_count))) return false;

        if (!bi_shift_mask(parent, bytes * s)) return false;

        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}
#endif
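
/* Lowers every BI_COMBINE in the block into moves (or 16-bit SELECTs) that
 * assemble the vector in a staging register, rewriting uses of SSA combine
 * destinations to read that register instead */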
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE) continue;

                bool needs_rewrite = !(ins->dest & PAN_IS_REG);
                unsigned R = needs_rewrite ? bi_make_temp_reg(ctx) : ins->dest;
                unsigned sz = nir_alu_type_get_type_size(ins->dest_type);

                bi_foreach_src(ins, s) {
                        /* We're done early for vec2/3 */
                        if (!ins->src[s])
                                continue;

#if 0
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
#endif
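                        /* Fallback: lower one component at a time. A 32-bit
                         * component gets its own MOV; 16-bit components are
                         * packed in pairs, one SELECT per 32-bit destination
                         * word, so two sources are consumed at once */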
                        if (sz == 32)
                                bi_combine_mov32(ctx, ins, s, R);
                        else if (sz == 16) {
                                bi_combine_sel16(ctx, ins, s, R);
                                s++;
                        } else {
                                unreachable("Unknown COMBINE size");
                        }
                }

                if (needs_rewrite)
                        bi_rewrite_uses(ctx, ins->dest, 0, R, 0);

                bi_remove_instruction(ins);
        }
}