| /* |
| * Copyright © 2010 Intel Corporation |
| * SPDX-License-Identifier: MIT |
| */ |
| |
#include "brw_builder.h"

#include <vector>
| |
| /* |
| * This helper takes a source register and un/shuffles it into the destination |
| * register. |
| * |
| * If source type size is smaller than destination type size the operation |
| * needed is a component shuffle. The opposite case would be an unshuffle. If |
| * source/destination type size is equal a shuffle is done that would be |
| * equivalent to a simple MOV. |
| * |
 * For example, if the source is a 16-bit type and the destination is 32-bit,
 * a 3-component .xyz 16-bit vector on SIMD8 would be:
| * |
| * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| |
| * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | |
| * |
| * This helper will return the following 2 32-bit components with the 16-bit |
| * values shuffled: |
| * |
| * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| |
| * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | |
| * |
| * For unshuffle, the example would be the opposite, a 64-bit type source |
| * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8 |
| * would be: |
| * |
| * | x1l x1h | x2l x2h | x3l x3h | x4l x4h | |
| * | x5l x5h | x6l x6h | x7l x7h | x8l x8h | |
| * | y1l y1h | y2l y2h | y3l y3h | y4l y4h | |
| * | y5l y5h | y6l y6h | y7l y7h | y8l y8h | |
| * |
| * The returned result would be the following 4 32-bit components unshuffled: |
| * |
| * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | |
| * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | |
| * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | |
| * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | |
| * |
 * - Source and destination registers must not overlap.
 * - Component units are measured in terms of the smaller type between
 *   source and destination, because we are un/shuffling the smaller
 *   components from/into the bigger ones.
| * - first_component parameter allows skipping source components. |
| */ |
| static void |
| shuffle_src_to_dst(const brw_builder &bld, |
| const brw_reg &dst, |
| const brw_reg &src, |
| uint32_t first_component, |
| uint32_t components) |
| { |
| if (brw_type_size_bytes(src.type) == brw_type_size_bytes(dst.type)) { |
| assert(!regions_overlap(dst, |
| brw_type_size_bytes(dst.type) * bld.dispatch_width() * components, |
| offset(src, bld, first_component), |
| brw_type_size_bytes(src.type) * bld.dispatch_width() * components)); |
| for (unsigned i = 0; i < components; i++) { |
| bld.MOV(retype(offset(dst, bld, i), src.type), |
| offset(src, bld, i + first_component)); |
| } |
| } else if (brw_type_size_bytes(src.type) < brw_type_size_bytes(dst.type)) { |
| /* Source is shuffled into destination */ |
| unsigned size_ratio = brw_type_size_bytes(dst.type) / brw_type_size_bytes(src.type); |
| assert(!regions_overlap(dst, |
| brw_type_size_bytes(dst.type) * bld.dispatch_width() * |
| DIV_ROUND_UP(components, size_ratio), |
| offset(src, bld, first_component), |
| brw_type_size_bytes(src.type) * bld.dispatch_width() * components)); |
| |
| brw_reg_type shuffle_type = |
| brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(src.type)); |
| for (unsigned i = 0; i < components; i++) { |
| brw_reg shuffle_component_i = |
| subscript(offset(dst, bld, i / size_ratio), |
| shuffle_type, i % size_ratio); |
| bld.MOV(shuffle_component_i, |
| retype(offset(src, bld, i + first_component), shuffle_type)); |
| } |
| } else { |
| /* Source is unshuffled into destination */ |
| unsigned size_ratio = brw_type_size_bytes(src.type) / brw_type_size_bytes(dst.type); |
| assert(!regions_overlap(dst, |
| brw_type_size_bytes(dst.type) * bld.dispatch_width() * components, |
| offset(src, bld, first_component / size_ratio), |
| brw_type_size_bytes(src.type) * bld.dispatch_width() * |
| DIV_ROUND_UP(components + (first_component % size_ratio), |
| size_ratio))); |
| |
| brw_reg_type shuffle_type = |
| brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(dst.type)); |
| for (unsigned i = 0; i < components; i++) { |
| brw_reg shuffle_component_i = |
| subscript(offset(src, bld, (first_component + i) / size_ratio), |
| shuffle_type, (first_component + i) % size_ratio); |
| bld.MOV(retype(offset(dst, bld, i), shuffle_type), |
| shuffle_component_i); |
| } |
| } |
| } |
| |
| void |
| brw_builder::shuffle_from_32bit_read(const brw_reg &dst, |
| const brw_reg &src, |
| uint32_t first_component, |
| uint32_t components) const |
| { |
| assert(brw_type_size_bytes(src.type) == 4); |
| |
| /* This function takes components in units of the destination type while |
| * shuffle_src_to_dst takes components in units of the smallest type |
| */ |
| if (brw_type_size_bytes(dst.type) > 4) { |
| assert(brw_type_size_bytes(dst.type) == 8); |
| first_component *= 2; |
| components *= 2; |
| } |
| |
| shuffle_src_to_dst(*this, dst, src, first_component, components); |
| } |
| |
| /** |
| * Get the mask of SIMD channels enabled during dispatch and not yet disabled |
| * by discard. Due to the layout of the sample mask in the fragment shader |
| * thread payload, \p bld is required to have a dispatch_width() not greater |
| * than 16 for fragment shaders. |
| */ |
| brw_reg |
| brw_sample_mask_reg(const brw_builder &bld) |
| { |
| const fs_visitor &s = *bld.shader; |
| |
| if (s.stage != MESA_SHADER_FRAGMENT) { |
| return brw_imm_ud(0xffffffff); |
| } else if (s.devinfo->ver >= 20 || |
| brw_wm_prog_data(s.prog_data)->uses_kill) { |
| return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16); |
| } else { |
| assert(bld.dispatch_width() <= 16); |
| assert(s.devinfo->ver < 20); |
| return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7), |
| BRW_TYPE_UW); |
| } |
| } |
| |
/**
 * Predicate the specified instruction on the sample mask.
 *
 * \p bld must match the instruction's channel group and execution size, and
 * \p inst must either be unpredicated or already use BRW_PREDICATE_NORMAL on
 * flag subregister 0.
 */
void
brw_emit_predicate_on_sample_mask(const brw_builder &bld, brw_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_visitor &s = *bld.shader;
   const brw_reg sample_mask = brw_sample_mask_reg(bld);
   const unsigned subreg = sample_mask_flag_subreg(s);

   if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
      /* brw_sample_mask_reg() already returned the flag subregister that
       * holds the live-channel mask — just sanity-check that it is the one
       * we are about to predicate on.
       */
      assert(sample_mask.file == ARF &&
             sample_mask.nr == brw_flag_subreg(subreg).nr &&
             sample_mask.subnr == brw_flag_subreg(
                subreg + inst->group / 16).subnr);
   } else {
      /* The mask lives in the thread payload; copy it into the flag
       * subregister with a scalar MOV so it can drive predication.
       */
      bld.group(1, 0).exec_all()
         .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
   }

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the sample mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      /* No pre-existing predicate: predicate directly on the flag
       * subregister holding the sample mask.
       */
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}
| |
| |
| brw_reg |
| brw_fetch_payload_reg(const brw_builder &bld, uint8_t regs[2], |
| brw_reg_type type, unsigned n) |
| { |
| if (!regs[0]) |
| return brw_reg(); |
| |
| if (bld.dispatch_width() > 16) { |
| const brw_reg tmp = bld.vgrf(type, n); |
| const brw_builder hbld = bld.exec_all().group(16, 0); |
| const unsigned m = bld.dispatch_width() / hbld.dispatch_width(); |
| brw_reg *const components = new brw_reg[m * n]; |
| |
| for (unsigned c = 0; c < n; c++) { |
| for (unsigned g = 0; g < m; g++) |
| components[c * m + g] = |
| offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c); |
| } |
| |
| hbld.LOAD_PAYLOAD(tmp, components, m * n, 0); |
| |
| delete[] components; |
| return tmp; |
| |
| } else { |
| return brw_reg(retype(brw_vec8_grf(regs[0], 0), type)); |
| } |
| } |
| |
| brw_reg |
| brw_fetch_barycentric_reg(const brw_builder &bld, uint8_t regs[2]) |
| { |
| if (!regs[0]) |
| return brw_reg(); |
| else if (bld.shader->devinfo->ver >= 20) |
| return brw_fetch_payload_reg(bld, regs, BRW_TYPE_F, 2); |
| |
| const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2); |
| const brw_builder hbld = bld.exec_all().group(8, 0); |
| const unsigned m = bld.dispatch_width() / hbld.dispatch_width(); |
| brw_reg *const components = new brw_reg[2 * m]; |
| |
| for (unsigned c = 0; c < 2; c++) { |
| for (unsigned g = 0; g < m; g++) |
| components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0), |
| hbld, c + 2 * (g % 2)); |
| } |
| |
| hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0); |
| |
| delete[] components; |
| return tmp; |
| } |
| |
| void |
| brw_check_dynamic_msaa_flag(const brw_builder &bld, |
| const struct brw_wm_prog_data *wm_prog_data, |
| enum intel_msaa_flags flag) |
| { |
| brw_inst *inst = bld.AND(bld.null_reg_ud(), |
| brw_dynamic_msaa_flags(wm_prog_data), |
| brw_imm_ud(flag)); |
| inst->conditional_mod = BRW_CONDITIONAL_NZ; |
| } |
| |