src/intel/compiler/brw_fs_lower_dpas.cpp - platform/external/mesa3d - Git at Google

 /*
  * Copyright 2023 Intel Corporation
  * SPDX-License-Identifier: MIT
  */

 #include "brw_fs.h"
 #include "brw_fs_builder.h"

 using namespace brw;

 static void
 f16_using_mac(const fs_builder &bld, fs_inst *inst)
 {
    /* We only intend to support configurations where the destination and
     * accumulator have the same type.
     */
    if (!inst->src[0].is_null())
       assert(inst->dst.type == inst->src[0].type);

    assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
    assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);

    const brw_reg_type src0_type = inst->dst.type;
    const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
    const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;

    const fs_reg dest = inst->dst;
    fs_reg src0 = inst->src[0];
    const fs_reg src1 = retype(inst->src[1], src1_type);
    const fs_reg src2 = retype(inst->src[2], src2_type);

    const unsigned dest_stride =
       dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;

    for (unsigned r = 0; r < inst->rcount; r++) {
       fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);

       for (unsigned subword = 0; subword < 2; subword++) {
          for (unsigned s = 0; s < inst->sdepth; s++) {
             /* The first multiply of the dot-product operation has to
              * explicitly write the accumulator register. The successive MAC
              * instructions will implicitly read *and* write the
              * accumulator. Those MAC instructions can also optionally
              * explicitly write some other register.
              *
              * FINISHME: The accumulator can actually hold 16 HF values. On
              * Gfx12 there are two accumulators. It should be possible to do
              * this in SIMD16 or even SIMD32. I was unable to get this to work
              * properly.
              */
             if (s == 0 && subword == 0) {
                const unsigned acc_width = 8;
                fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
                                       inst->group % acc_width);

                if (bld.shader->devinfo->verx10 >= 125) {
                   acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
                } else {
                   acc = retype(acc, BRW_REGISTER_TYPE_HF);
                }

                bld.MUL(acc,
                        subscript(retype(byte_offset(src1, s * REG_SIZE),
                                         BRW_REGISTER_TYPE_UD),
                                  BRW_REGISTER_TYPE_HF, subword),
                        component(retype(byte_offset(src2, r * REG_SIZE),
                                         BRW_REGISTER_TYPE_HF),
                                  s * 2 + subword))
                   ->writes_accumulator = true;

             } else {
                fs_reg result;

                /* As mentioned above, the MAC had an optional, explicit
                 * destination register. Various optimization passes are not
                 * clever enough to understand the intricacies of this
                 * instruction, so only write the result register on the final
                 * MAC in the sequence.
                 */
                if ((s + 1) == inst->sdepth && subword == 1)
                   result = temp;
                else
                   result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);

                bld.MAC(result,
                        subscript(retype(byte_offset(src1, s * REG_SIZE),
                                         BRW_REGISTER_TYPE_UD),
                                  BRW_REGISTER_TYPE_HF, subword),
                        component(retype(byte_offset(src2, r * REG_SIZE),
                                         BRW_REGISTER_TYPE_HF),
                                  s * 2 + subword))
                   ->writes_accumulator = true;
             }
          }
       }

       if (!src0.is_null()) {
          if (src0_type != BRW_REGISTER_TYPE_HF) {
             fs_reg temp2 = bld.vgrf(src0_type, 1);

             bld.MOV(temp2, temp);

             bld.ADD(byte_offset(dest, r * dest_stride),
                     temp2,
                     byte_offset(src0, r * dest_stride));
          } else {
             bld.ADD(byte_offset(dest, r * dest_stride),
                     temp,
                     byte_offset(src0, r * dest_stride));
          }
       } else {
          bld.MOV(byte_offset(dest, r * dest_stride), temp);
       }
    }
 }

 static void
 int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
 {
    /* We only intend to support configurations where the destination and
     * accumulator have the same type.
     */
    if (!inst->src[0].is_null())
       assert(inst->dst.type == inst->src[0].type);

    assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
           inst->src[1].type == BRW_REGISTER_TYPE_UB);
    assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
           inst->src[2].type == BRW_REGISTER_TYPE_UB);

    const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
       ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

    const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
       ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

    fs_reg dest = inst->dst;
    fs_reg src0 = inst->src[0];
    const fs_reg src1 = retype(inst->src[1], src1_type);
    const fs_reg src2 = retype(inst->src[2], src2_type);

    const unsigned dest_stride = REG_SIZE;

    for (unsigned r = 0; r < inst->rcount; r++) {
       if (!src0.is_null()) {
          bld.MOV(dest, src0);
          src0 = byte_offset(src0, dest_stride);
       } else {
          bld.MOV(dest, retype(brw_imm_d(0), dest.type));
       }

       for (unsigned s = 0; s < inst->sdepth; s++) {
          bld.DP4A(dest,
                   dest,
                   byte_offset(src1, s * REG_SIZE),
                   component(byte_offset(src2, r * REG_SIZE), s))
             ->saturate = inst->saturate;
       }

       dest = byte_offset(dest, dest_stride);
    }
 }

 static void
 int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
 {
    /* We only intend to support configurations where the destination and
     * accumulator have the same type.
     */
    if (!inst->src[0].is_null())
       assert(inst->dst.type == inst->src[0].type);

    assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
           inst->src[1].type == BRW_REGISTER_TYPE_UB);
    assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
           inst->src[2].type == BRW_REGISTER_TYPE_UB);

    const brw_reg_type src0_type = inst->dst.type;

    const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
       ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

    const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
       ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

    fs_reg dest = inst->dst;
    fs_reg src0 = inst->src[0];
    const fs_reg src1 = retype(inst->src[1], src1_type);
    const fs_reg src2 = retype(inst->src[2], src2_type);

    const unsigned dest_stride = REG_SIZE;

    for (unsigned r = 0; r < inst->rcount; r++) {
       if (!src0.is_null()) {
          bld.MOV(dest, src0);
          src0 = byte_offset(src0, dest_stride);
       } else {
          bld.MOV(dest, retype(brw_imm_d(0), dest.type));
       }

       for (unsigned s = 0; s < inst->sdepth; s++) {
          fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
          const brw_reg_type temp_type =
             (inst->src[1].type == BRW_REGISTER_TYPE_B ||
              inst->src[2].type == BRW_REGISTER_TYPE_B)
             ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;

          /* Expand 8 dwords of packed bytes into 16 dwords of packed
           * words.
           *
           * FINISHME: Gfx9 should not need this work around. Gfx11
           * may be able to use integer MAD. Both platforms may be
           * able to use MAC.
           */
          bld.group(32, 0).MOV(retype(temp3, temp_type),
                               retype(byte_offset(src2, r * REG_SIZE),
                                      inst->src[2].type));

          bld.MUL(subscript(temp1, temp_type, 0),
                  subscript(retype(byte_offset(src1, s * REG_SIZE),
                                   BRW_REGISTER_TYPE_UD),
                            inst->src[1].type, 0),
                  subscript(component(retype(temp3,
                                             BRW_REGISTER_TYPE_UD),
                                      s * 2),
                            temp_type, 0));

          bld.MUL(subscript(temp1, temp_type, 1),
                  subscript(retype(byte_offset(src1, s * REG_SIZE),
                                   BRW_REGISTER_TYPE_UD),
                            inst->src[1].type, 1),
                  subscript(component(retype(temp3,
                                             BRW_REGISTER_TYPE_UD),
                                      s * 2),
                            temp_type, 1));

          bld.MUL(subscript(temp2, temp_type, 0),
                  subscript(retype(byte_offset(src1, s * REG_SIZE),
                                   BRW_REGISTER_TYPE_UD),
                            inst->src[1].type, 2),
                  subscript(component(retype(temp3,
                                             BRW_REGISTER_TYPE_UD),
                                      s * 2 + 1),
                            temp_type, 0));

          bld.MUL(subscript(temp2, temp_type, 1),
                  subscript(retype(byte_offset(src1, s * REG_SIZE),
                                   BRW_REGISTER_TYPE_UD),
                            inst->src[1].type, 3),
                  subscript(component(retype(temp3,
                                             BRW_REGISTER_TYPE_UD),
                                      s * 2 + 1),
                            temp_type, 1));

          bld.ADD(subscript(temp1, src0_type, 0),
                  subscript(temp1, temp_type, 0),
                  subscript(temp1, temp_type, 1));

          bld.ADD(subscript(temp2, src0_type, 0),
                  subscript(temp2, temp_type, 0),
                  subscript(temp2, temp_type, 1));

          bld.ADD(retype(temp1, src0_type),
                  retype(temp1, src0_type),
                  retype(temp2, src0_type));

          bld.ADD(dest, dest, retype(temp1, src0_type))
             ->saturate = inst->saturate;
       }

       dest = byte_offset(dest, dest_stride);
    }
 }

 /* Replace every BRW_OPCODE_DPAS instruction found in v.cfg.
  *
  * A DPAS with a floating-point destination is handed to f16_using_mac().
  * Any other DPAS goes to int8_using_dp4a() when v.devinfo->ver >= 12 and to
  * int8_using_mul_add() otherwise.  The replacement is emitted with a
  * builder positioned at the DPAS and configured with group(8, 0) and
  * exec_all(); the DPAS itself is then removed from its block.
  *
  * Returns true when at least one instruction was replaced, in which case
  * the DEPENDENCY_INSTRUCTIONS analyses of v have been invalidated.
  */
 bool
 brw_lower_dpas(fs_visitor &v)
 {
    bool lowered_any = false;

    foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
       if (inst->opcode == BRW_OPCODE_DPAS) {
          const fs_builder bld =
             fs_builder(&v, block, inst).group(8, 0).exec_all();

          if (brw_reg_type_is_floating_point(inst->dst.type))
             f16_using_mac(bld, inst);
          else if (v.devinfo->ver >= 12)
             int8_using_dp4a(bld, inst);
          else
             int8_using_mul_add(bld, inst);

          inst->remove(block);
          lowered_any = true;
       }
    }

    if (lowered_any)
       v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

    return lowered_any;
 }
	/*
	* Copyright 2023 Intel Corporation
	* SPDX-License-Identifier: MIT
	*/

	#include "brw_fs.h"
	#include "brw_fs_builder.h"

	using namespace brw;

	static void
	f16_using_mac(const fs_builder &bld, fs_inst *inst)
	{
	/* We only intend to support configurations where the destination and
	* accumulator have the same type.
	*/
	if (!inst->src[0].is_null())
	assert(inst->dst.type == inst->src[0].type);

	assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
	assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);

	const brw_reg_type src0_type = inst->dst.type;
	const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
	const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;

	const fs_reg dest = inst->dst;
	fs_reg src0 = inst->src[0];
	const fs_reg src1 = retype(inst->src[1], src1_type);
	const fs_reg src2 = retype(inst->src[2], src2_type);

	const unsigned dest_stride =
	dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;

	for (unsigned r = 0; r < inst->rcount; r++) {
	fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);

	for (unsigned subword = 0; subword < 2; subword++) {
	for (unsigned s = 0; s < inst->sdepth; s++) {
	/* The first multiply of the dot-product operation has to
	* explicitly write the accumulator register. The successive MAC
	* instructions will implicitly read and write the
	* accumulator. Those MAC instructions can also optionally
	* explicitly write some other register.
	*
	* FINISHME: The accumulator can actually hold 16 HF values. On
	* Gfx12 there are two accumulators. It should be possible to do
	* this in SIMD16 or even SIMD32. I was unable to get this to work
	* properly.
	*/
	if (s == 0 && subword == 0) {
	const unsigned acc_width = 8;
	fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
	inst->group % acc_width);

	if (bld.shader->devinfo->verx10 >= 125) {
	acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
	} else {
	acc = retype(acc, BRW_REGISTER_TYPE_HF);
	}

	bld.MUL(acc,
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	BRW_REGISTER_TYPE_HF, subword),
	component(retype(byte_offset(src2, r * REG_SIZE),
	BRW_REGISTER_TYPE_HF),
	s * 2 + subword))
	->writes_accumulator = true;

	} else {
	fs_reg result;

	/* As mentioned above, the MAC had an optional, explicit
	* destination register. Various optimization passes are not
	* clever enough to understand the intricacies of this
	* instruction, so only write the result register on the final
	* MAC in the sequence.
	*/
	if ((s + 1) == inst->sdepth && subword == 1)
	result = temp;
	else
	result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);

	bld.MAC(result,
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	BRW_REGISTER_TYPE_HF, subword),
	component(retype(byte_offset(src2, r * REG_SIZE),
	BRW_REGISTER_TYPE_HF),
	s * 2 + subword))
	->writes_accumulator = true;
	}
	}
	}

	if (!src0.is_null()) {
	if (src0_type != BRW_REGISTER_TYPE_HF) {
	fs_reg temp2 = bld.vgrf(src0_type, 1);

	bld.MOV(temp2, temp);

	bld.ADD(byte_offset(dest, r * dest_stride),
	temp2,
	byte_offset(src0, r * dest_stride));
	} else {
	bld.ADD(byte_offset(dest, r * dest_stride),
	temp,
	byte_offset(src0, r * dest_stride));
	}
	} else {
	bld.MOV(byte_offset(dest, r * dest_stride), temp);
	}
	}
	}

	static void
	int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
	{
	/* We only intend to support configurations where the destination and
	* accumulator have the same type.
	*/
	if (!inst->src[0].is_null())
	assert(inst->dst.type == inst->src[0].type);

	assert(inst->src[1].type == BRW_REGISTER_TYPE_B \|\|
	inst->src[1].type == BRW_REGISTER_TYPE_UB);
	assert(inst->src[2].type == BRW_REGISTER_TYPE_B \|\|
	inst->src[2].type == BRW_REGISTER_TYPE_UB);

	const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
	? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

	const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
	? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

	fs_reg dest = inst->dst;
	fs_reg src0 = inst->src[0];
	const fs_reg src1 = retype(inst->src[1], src1_type);
	const fs_reg src2 = retype(inst->src[2], src2_type);

	const unsigned dest_stride = REG_SIZE;

	for (unsigned r = 0; r < inst->rcount; r++) {
	if (!src0.is_null()) {
	bld.MOV(dest, src0);
	src0 = byte_offset(src0, dest_stride);
	} else {
	bld.MOV(dest, retype(brw_imm_d(0), dest.type));
	}

	for (unsigned s = 0; s < inst->sdepth; s++) {
	bld.DP4A(dest,
	dest,
	byte_offset(src1, s * REG_SIZE),
	component(byte_offset(src2, r * REG_SIZE), s))
	->saturate = inst->saturate;
	}

	dest = byte_offset(dest, dest_stride);
	}
	}

	static void
	int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
	{
	/* We only intend to support configurations where the destination and
	* accumulator have the same type.
	*/
	if (!inst->src[0].is_null())
	assert(inst->dst.type == inst->src[0].type);

	assert(inst->src[1].type == BRW_REGISTER_TYPE_B \|\|
	inst->src[1].type == BRW_REGISTER_TYPE_UB);
	assert(inst->src[2].type == BRW_REGISTER_TYPE_B \|\|
	inst->src[2].type == BRW_REGISTER_TYPE_UB);

	const brw_reg_type src0_type = inst->dst.type;

	const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
	? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

	const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
	? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

	fs_reg dest = inst->dst;
	fs_reg src0 = inst->src[0];
	const fs_reg src1 = retype(inst->src[1], src1_type);
	const fs_reg src2 = retype(inst->src[2], src2_type);

	const unsigned dest_stride = REG_SIZE;

	for (unsigned r = 0; r < inst->rcount; r++) {
	if (!src0.is_null()) {
	bld.MOV(dest, src0);
	src0 = byte_offset(src0, dest_stride);
	} else {
	bld.MOV(dest, retype(brw_imm_d(0), dest.type));
	}

	for (unsigned s = 0; s < inst->sdepth; s++) {
	fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
	fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
	fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
	const brw_reg_type temp_type =
	(inst->src[1].type == BRW_REGISTER_TYPE_B \|\|
	inst->src[2].type == BRW_REGISTER_TYPE_B)
	? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;

	/* Expand 8 dwords of packed bytes into 16 dwords of packed
	* words.
	*
	* FINISHME: Gfx9 should not need this work around. Gfx11
	* may be able to use integer MAD. Both platforms may be
	* able to use MAC.
	*/
	bld.group(32, 0).MOV(retype(temp3, temp_type),
	retype(byte_offset(src2, r * REG_SIZE),
	inst->src[2].type));

	bld.MUL(subscript(temp1, temp_type, 0),
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	inst->src[1].type, 0),
	subscript(component(retype(temp3,
	BRW_REGISTER_TYPE_UD),
	s * 2),
	temp_type, 0));

	bld.MUL(subscript(temp1, temp_type, 1),
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	inst->src[1].type, 1),
	subscript(component(retype(temp3,
	BRW_REGISTER_TYPE_UD),
	s * 2),
	temp_type, 1));

	bld.MUL(subscript(temp2, temp_type, 0),
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	inst->src[1].type, 2),
	subscript(component(retype(temp3,
	BRW_REGISTER_TYPE_UD),
	s * 2 + 1),
	temp_type, 0));

	bld.MUL(subscript(temp2, temp_type, 1),
	subscript(retype(byte_offset(src1, s * REG_SIZE),
	BRW_REGISTER_TYPE_UD),
	inst->src[1].type, 3),
	subscript(component(retype(temp3,
	BRW_REGISTER_TYPE_UD),
	s * 2 + 1),
	temp_type, 1));

	bld.ADD(subscript(temp1, src0_type, 0),
	subscript(temp1, temp_type, 0),
	subscript(temp1, temp_type, 1));

	bld.ADD(subscript(temp2, src0_type, 0),
	subscript(temp2, temp_type, 0),
	subscript(temp2, temp_type, 1));

	bld.ADD(retype(temp1, src0_type),
	retype(temp1, src0_type),
	retype(temp2, src0_type));

	bld.ADD(dest, dest, retype(temp1, src0_type))
	->saturate = inst->saturate;
	}

	dest = byte_offset(dest, dest_stride);
	}
	}

	/* Replace every BRW_OPCODE_DPAS instruction found in v.cfg.
	 *
	 * A DPAS with a floating-point destination is handed to f16_using_mac().
	 * Any other DPAS goes to int8_using_dp4a() when v.devinfo->ver >= 12 and to
	 * int8_using_mul_add() otherwise.  The replacement is emitted with a
	 * builder positioned at the DPAS and configured with group(8, 0) and
	 * exec_all(); the DPAS itself is then removed from its block.
	 *
	 * Returns true when at least one instruction was replaced, in which case
	 * the DEPENDENCY_INSTRUCTIONS analyses of v have been invalidated.
	 */
	bool
	brw_lower_dpas(fs_visitor &v)
	{
	   bool lowered_any = false;

	   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
	      if (inst->opcode == BRW_OPCODE_DPAS) {
	         const fs_builder bld =
	            fs_builder(&v, block, inst).group(8, 0).exec_all();

	         if (brw_reg_type_is_floating_point(inst->dst.type))
	            f16_using_mac(bld, inst);
	         else if (v.devinfo->ver >= 12)
	            int8_using_dp4a(bld, inst);
	         else
	            int8_using_mul_add(bld, inst);

	         inst->remove(block);
	         lowered_any = true;
	      }
	   }

	   if (lowered_any)
	      v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

	   return lowered_any;
	}