| /* |
| * Copyright © 2010, 2022 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file |
| */ |
| |
| #include "brw_eu.h" |
| #include "brw_fs.h" |
| #include "brw_fs_builder.h" |
| |
| using namespace brw; |
| |
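/**
 * Lower a URB read logical instruction into a SEND using the pre-Xe2 SIMD8
 * URB read message, packing the URB handle and the optional per-slot
 * offsets into the message header.
 */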
| static void |
| lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const bool per_slot_present = |
| inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE; |
| |
| assert(inst->size_written % REG_SIZE == 0); |
| assert(inst->header_size == 0); |
| |
| brw_reg payload_sources[2]; |
| unsigned header_size = 0; |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| if (per_slot_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| |
| brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(header_size), |
| BRW_TYPE_F); |
| bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size); |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->header_size = header_size; |
| |
| inst->sfid = BRW_SFID_URB; |
| inst->desc = brw_urb_desc(devinfo, |
| GFX8_URB_OPCODE_SIMD8_READ, |
| per_slot_present, |
| false, |
| inst->offset); |
| |
| inst->mlen = header_size; |
| inst->ex_desc = 0; |
| inst->ex_mlen = 0; |
| inst->send_is_volatile = true; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = brw_null_reg(); |
| } |
| |
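/**
 * Lower a URB read logical instruction on Xe2+ into an LSC SEND: the URB
 * handle plus any offsets is turned into a flat A32 address and read with
 * an LSC D32 load.
 */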
| static void |
| lower_urb_read_logical_send_xe2(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->has_lsc); |
| |
| assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0); |
| assert(inst->header_size == 0); |
| |
| /* Get the logical send arguments. */ |
| const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| |
| /* Calculate the total number of components of the payload. */ |
| const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo)); |
| |
| brw_reg payload = bld.vgrf(BRW_TYPE_UD); |
| |
| bld.MOV(payload, handle); |
| |
   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
| if (inst->offset) { |
| bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16)); |
| inst->offset = 0; |
| } |
| |
| brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| if (offsets.file != BAD_FILE) { |
| bld.ADD(payload, payload, offsets); |
| } |
| |
| inst->sfid = BRW_SFID_URB; |
| |
| assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8); |
| |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32, |
| LSC_DATA_SIZE_D32, dst_comps /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE(devinfo, LOAD, L1UC_L3UC)); |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size); |
| inst->ex_mlen = 0; |
| inst->header_size = 0; |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| |
| inst->src[2] = payload; |
| inst->src[3] = brw_null_reg(); |
| } |
| |
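/**
 * Lower a URB write logical instruction into a SEND using the pre-Xe2 SIMD8
 * URB write message, with the handle, optional per-slot offsets and
 * optional channel mask packed into the header ahead of the data.
 */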
| static void |
| lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const bool per_slot_present = |
| inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE; |
| const bool channel_mask_present = |
| inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE; |
| |
| assert(inst->header_size == 0); |
| |
| const unsigned length = 1 + per_slot_present + channel_mask_present + |
| inst->components_read(URB_LOGICAL_SRC_DATA); |
| |
| brw_reg *payload_sources = new brw_reg[length]; |
| brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(length), |
| BRW_TYPE_F); |
| |
| unsigned header_size = 0; |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| if (per_slot_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| |
| if (channel_mask_present) |
| payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK]; |
| |
| for (unsigned i = header_size, j = 0; i < length; i++, j++) |
| payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j); |
| |
| bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size); |
| |
| delete [] payload_sources; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->header_size = header_size; |
| inst->dst = brw_null_reg(); |
| |
| inst->sfid = BRW_SFID_URB; |
| inst->desc = brw_urb_desc(devinfo, |
| GFX8_URB_OPCODE_SIMD8_WRITE, |
| per_slot_present, |
| channel_mask_present, |
| inst->offset); |
| |
| inst->mlen = length; |
| inst->ex_desc = 0; |
| inst->ex_mlen = 0; |
| inst->send_has_side_effects = true; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| inst->src[3] = brw_null_reg(); |
| } |
| |
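/**
 * Lower a URB write logical instruction on Xe2+ into an LSC SEND: the URB
 * handle plus any offsets is turned into a flat A32 address and written
 * with an LSC D32 store (a channel-mask store when a mask is present).
 */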
| static void |
| lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->has_lsc); |
| |
| /* Get the logical send arguments. */ |
| const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE]; |
| const brw_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ? |
| inst->src[URB_LOGICAL_SRC_DATA] : brw_reg(brw_imm_ud(0)); |
| assert(brw_type_size_bytes(src.type) == 4); |
| |
| /* Calculate the total number of components of the payload. */ |
| const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA)); |
| const unsigned src_sz = brw_type_size_bytes(src.type); |
| |
| brw_reg payload = bld.vgrf(BRW_TYPE_UD); |
| |
| bld.MOV(payload, handle); |
| |
   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
| if (inst->offset) { |
| bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16)); |
| inst->offset = 0; |
| } |
| |
| brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS]; |
| if (offsets.file != BAD_FILE) { |
| bld.ADD(payload, payload, offsets); |
| } |
| |
| const brw_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK]; |
| unsigned mask = 0; |
| |
| if (cmask.file != BAD_FILE) { |
| assert(cmask.file == IMM); |
| assert(cmask.type == BRW_TYPE_UD); |
| mask = cmask.ud >> 16; |
| } |
| |
| brw_reg payload2 = bld.move_to_vgrf(src, src_comps); |
| const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE; |
| |
| inst->sfid = BRW_SFID_URB; |
| |
| enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE; |
| inst->desc = lsc_msg_desc(devinfo, op, |
| LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32, |
| LSC_DATA_SIZE_D32, |
| mask ? mask : src_comps /* num_channels */, |
| false /* transpose */, |
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC));

   /* Update the original instruction. */
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size); |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| inst->resize_sources(4); |
| |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
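/**
 * Copy the components of a color into consecutive payload sources,
 * saturating them first when the key requires fragment color clamping.
 */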
| static void |
| setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, |
| brw_reg *dst, brw_reg color, unsigned components) |
| { |
| if (key->clamp_fragment_color) { |
| brw_reg tmp = bld.vgrf(BRW_TYPE_F, 4); |
| assert(color.type == BRW_TYPE_F); |
| |
| for (unsigned i = 0; i < components; i++) |
| set_saturate(true, |
| bld.MOV(offset(tmp, bld, i), offset(color, bld, i))); |
| |
| color = tmp; |
| } |
| |
| for (unsigned i = 0; i < components; i++) |
| dst[i] = offset(color, bld, i); |
| } |
| |
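/**
 * Lower a framebuffer write logical instruction into a render target write
 * SEND, assembling the optional message header, AA/stencil, src0 alpha,
 * oMask, color, depth and stencil payload sources as needed.
 */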
| static void |
| lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, |
| const struct brw_wm_prog_data *prog_data, |
| const brw_wm_prog_key *key, |
| const fs_thread_payload &fs_payload) |
| { |
| assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); |
| assert(inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].file == IMM); |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0]; |
| const brw_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1]; |
| const brw_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; |
| const brw_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; |
| const brw_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; |
| const brw_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; |
| brw_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; |
| const unsigned components = |
| inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; |
| const bool null_rt = inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].ud != 0; |
| |
| assert(inst->target != 0 || src0_alpha.file == BAD_FILE); |
| |
| brw_reg sources[15]; |
| int header_size = 2, payload_header_size; |
| unsigned length = 0; |
| |
| if (devinfo->ver < 11 && |
| (color1.file != BAD_FILE || key->nr_color_regions > 1)) { |
| |
| /* From the Sandy Bridge PRM, volume 4, page 198: |
| * |
| * "Dispatched Pixel Enables. One bit per pixel indicating |
| * which pixels were originally enabled when the thread was |
| * dispatched. This field is only required for the end-of- |
| * thread message and on all dual-source messages." |
| */ |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| |
| brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2); |
| if (bld.group() < 16) { |
| /* The header starts off as g0 and g1 for the first half */ |
| ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), |
| BRW_TYPE_UD)); |
| } else { |
| /* The header starts off as g0 and g2 for the second half */ |
| assert(bld.group() < 32); |
| const brw_reg header_sources[2] = { |
| retype(brw_vec8_grf(0, 0), BRW_TYPE_UD), |
| retype(brw_vec8_grf(2, 0), BRW_TYPE_UD), |
| }; |
| ubld.LOAD_PAYLOAD(header, header_sources, 2, 0); |
| |
| /* Gfx12 will require additional fix-ups if we ever hit this path. */ |
| assert(devinfo->ver < 12); |
| } |
| |
| uint32_t g00_bits = 0; |
| |
| /* Set "Source0 Alpha Present to RenderTarget" bit in message |
| * header. |
| */ |
| if (src0_alpha.file != BAD_FILE) |
| g00_bits |= 1 << 11; |
| |
      /* Set "Stencil Present to Render Target" bit when the shader computes
       * stencil.
       */
| if (prog_data->computed_stencil) |
| g00_bits |= 1 << 14; |
| |
| if (g00_bits) { |
| /* OR extra bits into g0.0 */ |
| ubld.group(1, 0).OR(component(header, 0), |
| retype(brw_vec1_grf(0, 0), BRW_TYPE_UD), |
| brw_imm_ud(g00_bits)); |
| } |
| |
| /* Set the render target index for choosing BLEND_STATE. */ |
| if (inst->target > 0) { |
| ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target)); |
| } |
| |
| if (prog_data->uses_kill) { |
| ubld.group(1, 0).MOV(retype(component(header, 15), BRW_TYPE_UW), |
| brw_sample_mask_reg(bld)); |
| } |
| |
| assert(length == 0); |
| sources[0] = header; |
| sources[1] = horiz_offset(header, 8); |
| length = 2; |
| } |
| assert(length == 0 || length == 2); |
| header_size = length; |
| |
| if (fs_payload.aa_dest_stencil_reg[0]) { |
| assert(inst->group < 16); |
| sources[length] = brw_vgrf(bld.shader->alloc.allocate(1), BRW_TYPE_F); |
| bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") |
| .MOV(sources[length], |
| brw_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0))); |
| length++; |
| } |
| |
| if (src0_alpha.file != BAD_FILE) { |
| for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { |
| const fs_builder &ubld = bld.exec_all().group(8, i) |
| .annotate("FB write src0 alpha"); |
| const brw_reg tmp = ubld.vgrf(BRW_TYPE_F); |
| ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8)); |
| setup_color_payload(ubld, key, &sources[length], tmp, 1); |
| length++; |
| } |
| } |
| |
| if (sample_mask.file != BAD_FILE) { |
| const brw_reg tmp = brw_vgrf(bld.shader->alloc.allocate(reg_unit(devinfo)), |
| BRW_TYPE_UD); |
| |
      /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
       * relevant. Since the data is unsigned single words, one vgrf is
       * always 16-wide, but only the lower or higher 8 channels will be used
       * by the hardware when doing a SIMD8 write, depending on whether we
       * have selected the subspans for the first or second half respectively.
       */
| assert(sample_mask.file != BAD_FILE && |
| brw_type_size_bytes(sample_mask.type) == 4); |
| sample_mask.type = BRW_TYPE_UW; |
| sample_mask.stride *= 2; |
| |
| bld.exec_all().annotate("FB write oMask") |
| .MOV(horiz_offset(retype(tmp, BRW_TYPE_UW), |
| inst->group % (16 * reg_unit(devinfo))), |
| sample_mask); |
| |
| for (unsigned i = 0; i < reg_unit(devinfo); i++) |
| sources[length++] = byte_offset(tmp, REG_SIZE * i); |
| } |
| |
| payload_header_size = length; |
| |
| setup_color_payload(bld, key, &sources[length], color0, components); |
| length += 4; |
| |
| if (color1.file != BAD_FILE) { |
| setup_color_payload(bld, key, &sources[length], color1, components); |
| length += 4; |
| } |
| |
| if (src_depth.file != BAD_FILE) { |
| sources[length] = src_depth; |
| length++; |
| } |
| |
| if (dst_depth.file != BAD_FILE) { |
| sources[length] = dst_depth; |
| length++; |
| } |
| |
| if (src_stencil.file != BAD_FILE) { |
| assert(bld.dispatch_width() == 8 * reg_unit(devinfo)); |
| |
| /* XXX: src_stencil is only available on gfx9+. dst_depth is never |
| * available on gfx9+. As such it's impossible to have both enabled at the |
| * same time and therefore length cannot overrun the array. |
| */ |
| assert(length < 15 * reg_unit(devinfo)); |
| |
| sources[length] = bld.vgrf(BRW_TYPE_UD); |
| bld.exec_all().annotate("FB write OS") |
| .MOV(retype(sources[length], BRW_TYPE_UB), |
| subscript(src_stencil, BRW_TYPE_UB, 0)); |
| length++; |
| } |
| |
| /* Send from the GRF */ |
| brw_reg payload = brw_vgrf(-1, BRW_TYPE_F); |
| fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); |
| payload.nr = bld.shader->alloc.allocate(regs_written(load)); |
| load->dst = payload; |
| |
| uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data); |
| |
| /* XXX - Bit 13 Per-sample PS enable */ |
| inst->desc = |
| (inst->group / 16) << 11 | /* rt slot group */ |
| brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt, |
| 0 /* coarse_rt_write */); |
| |
| brw_reg desc = brw_imm_ud(0); |
| if (prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) { |
| inst->desc |= (1 << 18); |
| } else if (prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) { |
| STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18)); |
| const fs_builder &ubld = bld.exec_all().group(8, 0); |
| desc = ubld.vgrf(BRW_TYPE_UD); |
| ubld.AND(desc, dynamic_msaa_flags(prog_data), |
| brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES)); |
| desc = component(desc, 0); |
| } |
| |
| uint32_t ex_desc = 0; |
| if (devinfo->ver >= 20) { |
| ex_desc = inst->target << 21 | |
| null_rt << 20 | |
| (src0_alpha.file != BAD_FILE) << 15 | |
| (src_stencil.file != BAD_FILE) << 14 | |
| (src_depth.file != BAD_FILE) << 13 | |
| (sample_mask.file != BAD_FILE) << 12; |
| } else if (devinfo->ver >= 11) { |
| /* Set the "Render Target Index" and "Src0 Alpha Present" fields |
| * in the extended message descriptor, in lieu of using a header. |
| */ |
| ex_desc = inst->target << 12 | |
| null_rt << 20 | |
| (src0_alpha.file != BAD_FILE) << 15; |
| } |
| inst->ex_desc = ex_desc; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->resize_sources(3); |
| inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; |
| inst->src[0] = desc; |
| inst->src[1] = brw_imm_ud(0); |
| inst->src[2] = payload; |
| inst->mlen = regs_written(load); |
| inst->ex_mlen = 0; |
| inst->header_size = header_size; |
| inst->check_tdr = true; |
| inst->send_has_side_effects = true; |
| } |
| |
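/**
 * Lower a framebuffer read logical instruction into a render target read
 * SEND with an explicit two-register header (Gfx9+, pre-Xe2).
 */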
| static void |
| lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst, |
| const struct brw_wm_prog_data *wm_prog_data) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const fs_builder &ubld = bld.exec_all().group(8, 0); |
| const unsigned length = 2; |
| const brw_reg header = ubld.vgrf(BRW_TYPE_UD, length); |
| |
| assert(devinfo->ver >= 9 && devinfo->ver < 20); |
| |
| if (bld.group() < 16) { |
| ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), |
| BRW_TYPE_UD)); |
| } else { |
| assert(bld.group() < 32); |
| const brw_reg header_sources[] = { |
| retype(brw_vec8_grf(0, 0), BRW_TYPE_UD), |
| retype(brw_vec8_grf(2, 0), BRW_TYPE_UD) |
| }; |
| ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0); |
| |
| if (devinfo->ver >= 12) { |
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly. However,
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
| ubld.group(1, 0).MOV(component(header, 9), |
| retype(brw_vec1_grf(1, 1), BRW_TYPE_UD)); |
| } |
| } |
| |
   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+):
| * |
| * "Must be zero for Render Target Read message." |
| * |
| * For bits : |
| * - 14 : Stencil Present to Render Target |
| * - 13 : Source Depth Present to Render Target |
| * - 12 : oMask to Render Target |
| * - 11 : Source0 Alpha Present to Render Target |
| */ |
| ubld.group(1, 0).AND(component(header, 0), |
| component(header, 0), |
| brw_imm_ud(~INTEL_MASK(14, 11))); |
| |
| inst->resize_sources(4); |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| inst->src[2] = header; |
| inst->src[3] = brw_reg(); |
| inst->mlen = length; |
| inst->header_size = length; |
| inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE; |
| inst->check_tdr = true; |
| inst->desc = |
| (inst->group / 16) << 11 | /* rt slot group */ |
| brw_fb_read_desc(devinfo, inst->target, |
| 0 /* msg_control */, inst->exec_size, |
| wm_prog_data->persample_dispatch); |
| } |
| |
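/**
 * Check whether the sampler index may be 16 or greater, in which case the
 * Sampler State Pointer has to be adjusted through a message header, since
 * the descriptor's sampler index field is only 4 bits wide.
 */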
| static bool |
| is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler) |
| { |
| return sampler.file != IMM || sampler.ud >= 16; |
| } |
| |
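/**
 * Translate a logical sampler opcode (plus its shadow-compare, zero-LOD and
 * min-LOD variants) into the hardware sampler message type for the given
 * device.
 */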
| static unsigned |
| sampler_msg_type(const intel_device_info *devinfo, |
| opcode opcode, bool shadow_compare, |
| bool lod_is_zero, bool has_min_lod) |
| { |
| switch (opcode) { |
| case SHADER_OPCODE_TEX_LOGICAL: |
| if (devinfo->ver >= 20 && has_min_lod) { |
| return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD : |
| XE2_SAMPLER_MESSAGE_SAMPLE_MLOD; |
| } else { |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE; |
| } |
| case FS_OPCODE_TXB_LOGICAL: |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS; |
| case SHADER_OPCODE_TXL_LOGICAL: |
| assert(!has_min_lod); |
| if (lod_is_zero) { |
| return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ : |
| GFX9_SAMPLER_MESSAGE_SAMPLE_LZ; |
| } |
| return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; |
| case SHADER_OPCODE_TXS_LOGICAL: |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| assert(!has_min_lod); |
| return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; |
| case SHADER_OPCODE_TXD_LOGICAL: |
| return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; |
| case SHADER_OPCODE_TXF_LOGICAL: |
| assert(!has_min_lod); |
| return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ : |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LD; |
| case SHADER_OPCODE_TXF_CMS_W_LOGICAL: |
| case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: |
| assert(!has_min_lod); |
| return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; |
| case SHADER_OPCODE_TXF_MCS_LOGICAL: |
| assert(!has_min_lod); |
| return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; |
| case SHADER_OPCODE_LOD_LOGICAL: |
| assert(!has_min_lod); |
| return GFX5_SAMPLER_MESSAGE_LOD; |
| case SHADER_OPCODE_TG4_LOGICAL: |
| assert(!has_min_lod); |
| return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C : |
| GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; |
| case SHADER_OPCODE_TG4_OFFSET_LOGICAL: |
| assert(!has_min_lod); |
| return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C : |
| GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; |
| case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: |
| assert(!has_min_lod); |
| assert(devinfo->ver >= 20); |
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
| case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: |
| assert(!has_min_lod); |
| assert(devinfo->ver >= 20); |
| return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B; |
| case SHADER_OPCODE_TG4_BIAS_LOGICAL: |
| assert(!has_min_lod); |
| assert(devinfo->ver >= 20); |
| return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B; |
| case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: |
| assert(!has_min_lod); |
| assert(devinfo->ver >= 20); |
| return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C : |
| XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L; |
| case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: |
| assert(!has_min_lod); |
| assert(devinfo->ver >= 20); |
| return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C : |
| XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I; |
| case SHADER_OPCODE_SAMPLEINFO_LOGICAL: |
| assert(!has_min_lod); |
| return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; |
| default: |
| unreachable("not reached"); |
| } |
| } |
| |
| /** |
| * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to |
| * the given requested_alignment_sz. |
| */ |
| static fs_inst * |
| emit_load_payload_with_padding(const fs_builder &bld, const brw_reg &dst, |
| const brw_reg *src, unsigned sources, |
| unsigned header_size, |
| unsigned requested_alignment_sz) |
| { |
| unsigned length = 0; |
| unsigned num_srcs = |
| sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width()); |
| brw_reg *src_comps = new brw_reg[num_srcs]; |
| |
| for (unsigned i = 0; i < header_size; i++) |
| src_comps[length++] = src[i]; |
| |
| for (unsigned i = header_size; i < sources; i++) { |
| unsigned src_sz = |
| retype(dst, src[i].type).component_size(bld.dispatch_width()); |
| const enum brw_reg_type padding_payload_type = |
| brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src[i].type)); |
| |
| src_comps[length++] = src[i]; |
| |
| /* Expand the real sources if component of requested payload type is |
| * larger than real source component. |
| */ |
| if (src_sz < requested_alignment_sz) { |
| for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) { |
| src_comps[length++] = retype(brw_reg(), padding_payload_type); |
| } |
| } |
| } |
| |
| fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size); |
| delete[] src_comps; |
| |
| return inst; |
| } |
| |
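/**
 * Check whether a sampler opcode always needs a message header (the gather4
 * variants and sampleinfo).
 */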
| static bool |
| shader_opcode_needs_header(opcode op) |
| { |
| switch (op) { |
| case SHADER_OPCODE_TG4_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_BIAS_LOGICAL: |
| case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: |
| case SHADER_OPCODE_SAMPLEINFO_LOGICAL: |
| return true; |
| default: |
| break; |
| } |
| |
| return false; |
| } |
| |
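/**
 * Lower a sampler logical instruction into a sampler SEND, building the
 * optional message header and the parameter payload in the layout expected
 * by the selected message type, then filling out the descriptors.
 */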
| static void |
| lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, |
| const brw_reg &coordinate, |
| const brw_reg &shadow_c, |
| brw_reg lod, const brw_reg &lod2, |
| const brw_reg &min_lod, |
| const brw_reg &sample_index, |
| const brw_reg &mcs, |
| const brw_reg &surface, |
| const brw_reg &sampler, |
| const brw_reg &surface_handle, |
| const brw_reg &sampler_handle, |
| const brw_reg &tg4_offset, |
| unsigned payload_type_bit_size, |
| unsigned coord_components, |
| unsigned grad_components, |
| bool residency) |
| { |
| /* We never generate EOT sampler messages */ |
| assert(!inst->eot); |
| |
| const brw_compiler *compiler = bld.shader->compiler; |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const enum brw_reg_type payload_type = |
| brw_type_with_size(BRW_TYPE_F, payload_type_bit_size); |
| const enum brw_reg_type payload_unsigned_type = |
| brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size); |
| const enum brw_reg_type payload_signed_type = |
| brw_type_with_size(BRW_TYPE_D, payload_type_bit_size); |
| unsigned header_size = 0, length = 0; |
| opcode op = inst->opcode; |
| brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE]; |
| for (unsigned i = 0; i < ARRAY_SIZE(sources); i++) |
| sources[i] = bld.vgrf(payload_type); |
| |
| /* We must have exactly one of surface/sampler and surface/sampler_handle */ |
| assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); |
| assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE)); |
| |
| if (shader_opcode_needs_header(op) || inst->offset != 0 || |
| sampler_handle.file != BAD_FILE || |
| is_high_sampler(devinfo, sampler) || |
| residency) { |
| /* For general texture offsets (no txf workaround), we need a header to |
| * put them in. |
| * |
| * TG4 needs to place its channel select in the header, for interaction |
| * with ARB_texture_swizzle. The sampler index is only 4-bits, so for |
| * larger sampler numbers we need to offset the Sampler State Pointer in |
| * the header. |
| */ |
| brw_reg header = retype(sources[0], BRW_TYPE_UD); |
| for (header_size = 0; header_size < reg_unit(devinfo); header_size++) |
| sources[length++] = byte_offset(header, REG_SIZE * header_size); |
| |
| /* If we're requesting fewer than four channels worth of response, |
| * and we have an explicit header, we need to set up the sampler |
| * writemask. It's reversed from normal: 1 means "don't write". |
| */ |
| unsigned comps_regs = |
| DIV_ROUND_UP(regs_written(inst) - reg_unit(devinfo) * residency, |
| reg_unit(devinfo)); |
| unsigned comp_regs = |
| DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), |
| reg_unit(devinfo) * REG_SIZE); |
| if (comps_regs < 4 * comp_regs) { |
| assert(comps_regs % comp_regs == 0); |
| unsigned mask = ~((1 << (comps_regs / comp_regs)) - 1) & 0xf; |
| inst->offset |= mask << 12; |
| } |
| |
| if (residency) |
| inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */ |
| |
| /* Build the actual header */ |
| const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0); |
| const fs_builder ubld1 = ubld.group(1, 0); |
| if (devinfo->ver >= 11) |
| ubld.MOV(header, brw_imm_ud(0)); |
| else |
| ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD)); |
| if (inst->offset) { |
| ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset)); |
| } else if (devinfo->ver < 11 && |
| bld.shader->stage != MESA_SHADER_VERTEX && |
| bld.shader->stage != MESA_SHADER_FRAGMENT) { |
| /* The vertex and fragment stages have g0.2 set to 0, so |
| * header0.2 is 0 when g0 is copied. Other stages may not, so we |
| * must set it to 0 to avoid setting undesirable bits in the |
| * message. |
| */ |
| ubld1.MOV(component(header, 2), brw_imm_ud(0)); |
| } |
| |
| if (sampler_handle.file != BAD_FILE) { |
| /* Bindless sampler handles aren't relative to the sampler state |
| * pointer passed into the shader through SAMPLER_STATE_POINTERS_*. |
| * Instead, it's an absolute pointer relative to dynamic state base |
| * address. |
| * |
| * Sampler states are 16 bytes each and the pointer we give here has |
| * to be 32-byte aligned. In order to avoid more indirect messages |
| * than required, we assume that all bindless sampler states are |
| * 32-byte aligned. This sacrifices a bit of general state base |
| * address space but means we can do something more efficient in the |
| * shader. |
| */ |
| if (compiler->use_bindless_sampler_offset) { |
| assert(devinfo->ver >= 11); |
| ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1)); |
| } else { |
| ubld1.MOV(component(header, 3), sampler_handle); |
| } |
| } else if (is_high_sampler(devinfo, sampler)) { |
| brw_reg sampler_state_ptr = |
| retype(brw_vec1_grf(0, 3), BRW_TYPE_UD); |
| |
| /* Gfx11+ sampler message headers include bits in 4:0 which conflict |
| * with the ones included in g0.3 bits 4:0. Mask them out. |
| */ |
| if (devinfo->ver >= 11) { |
| sampler_state_ptr = ubld1.vgrf(BRW_TYPE_UD); |
| ubld1.AND(sampler_state_ptr, |
| retype(brw_vec1_grf(0, 3), BRW_TYPE_UD), |
| brw_imm_ud(INTEL_MASK(31, 5))); |
| } |
| |
| if (sampler.file == IMM) { |
| assert(sampler.ud >= 16); |
| const int sampler_state_size = 16; /* 16 bytes */ |
| |
| ubld1.ADD(component(header, 3), sampler_state_ptr, |
| brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size)); |
| } else { |
| brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD); |
| ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0)); |
| ubld1.SHL(tmp, tmp, brw_imm_ud(4)); |
| ubld1.ADD(component(header, 3), sampler_state_ptr, tmp); |
| } |
| } else if (devinfo->ver >= 11) { |
| /* Gfx11+ sampler message headers include bits in 4:0 which conflict |
| * with the ones included in g0.3 bits 4:0. Mask them out. |
| */ |
| ubld1.AND(component(header, 3), |
| retype(brw_vec1_grf(0, 3), BRW_TYPE_UD), |
| brw_imm_ud(INTEL_MASK(31, 5))); |
| } |
| } |
| |
| const bool lod_is_zero = lod.is_zero(); |
| |
| /* On Xe2 and newer platforms, min_lod is the first parameter specifically |
| * so that a bunch of other, possibly unused, parameters don't need to also |
| * be included. |
| */ |
| const unsigned msg_type = |
| sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero, |
| min_lod.file != BAD_FILE); |
| |
| const bool min_lod_is_first = devinfo->ver >= 20 && |
| (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD || |
| msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD); |
| |
| if (min_lod_is_first) { |
| assert(min_lod.file != BAD_FILE); |
| bld.MOV(sources[length++], min_lod); |
| } |
| |
| if (shadow_c.file != BAD_FILE) { |
| bld.MOV(sources[length], shadow_c); |
| length++; |
| } |
| |
| bool coordinate_done = false; |
| |
| /* Set up the LOD info */ |
| switch (op) { |
| case SHADER_OPCODE_TXL_LOGICAL: |
| if (lod_is_zero) |
| break; |
| FALLTHROUGH; |
| case FS_OPCODE_TXB_LOGICAL: |
| case SHADER_OPCODE_TG4_BIAS_LOGICAL: |
| case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: |
| bld.MOV(sources[length], lod); |
| length++; |
| break; |
| case SHADER_OPCODE_TXD_LOGICAL: |
| /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in |
| * Xe2+). |
| */ |
| assert(bld.dispatch_width() == (8 * reg_unit(devinfo))); |
| |
| /* Load dPdx and the coordinate together: |
| * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z |
| */ |
| for (unsigned i = 0; i < coord_components; i++) { |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| |
| /* For cube map array, the coordinate is (u,v,r,ai) but there are |
| * only derivatives for (u, v, r). |
| */ |
| if (i < grad_components) { |
| bld.MOV(sources[length++], offset(lod, bld, i)); |
| bld.MOV(sources[length++], offset(lod2, bld, i)); |
| } |
| } |
| |
| coordinate_done = true; |
| break; |
| case SHADER_OPCODE_TXS_LOGICAL: |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], lod); |
| break; |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| /* We need an LOD; just use 0 */ |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], brw_imm_ud(0)); |
| break; |
| case SHADER_OPCODE_TXF_LOGICAL: |
      /* On Gfx9 the parameters are intermixed; they are u, v, lod, r. */
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length++], coordinate); |
| |
| if (coord_components >= 2) { |
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length], offset(coordinate, bld, 1)); |
| } else { |
| sources[length] = brw_imm_d(0); |
| } |
| length++; |
| |
| if (!lod_is_zero) { |
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length++], lod); |
| } |
| |
| for (unsigned i = 2; i < coord_components; i++) { |
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| } |
| |
| coordinate_done = true; |
| break; |
| |
| case SHADER_OPCODE_TXF_CMS_W_LOGICAL: |
| case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], sample_index); |
| |
| /* Data from the multisample control surface. */ |
| for (unsigned i = 0; i < 2; ++i) { |
         /* The sampler always writes 4/8 registers worth of data but for
          * ld_mcs only the first two registers contain valid data. So with a
          * 16-bit payload, we need to split the two 32-bit registers into
          * four 16-bit payload components.
| * |
| * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs - |
| * Shared Functions - 3D Sampler - Messages - Message Format: |
| * |
| * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r |
| */ |
| if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { |
| brw_reg tmp = offset(mcs, bld, i); |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], |
| mcs.file == IMM ? mcs : |
| brw_reg(subscript(tmp, payload_unsigned_type, 0))); |
| |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], |
| mcs.file == IMM ? mcs : |
| brw_reg(subscript(tmp, payload_unsigned_type, 1))); |
| } else { |
| sources[length] = retype(sources[length], payload_unsigned_type); |
| bld.MOV(sources[length++], |
| mcs.file == IMM ? mcs : offset(mcs, bld, i)); |
| } |
| } |
| FALLTHROUGH; |
| |
| case SHADER_OPCODE_TXF_MCS_LOGICAL: |
| /* There is no offsetting for this message; just copy in the integer |
| * texture coordinates. |
| */ |
| for (unsigned i = 0; i < coord_components; i++) { |
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| } |
| |
| coordinate_done = true; |
| break; |
| case SHADER_OPCODE_TG4_OFFSET_LOGICAL: |
| /* More crazy intermixing */ |
| for (unsigned i = 0; i < 2; i++) /* u, v */ |
| bld.MOV(sources[length++], offset(coordinate, bld, i)); |
| |
| for (unsigned i = 0; i < 2; i++) { /* offu, offv */ |
| sources[length] = retype(sources[length], payload_signed_type); |
| bld.MOV(sources[length++], offset(tg4_offset, bld, i)); |
| } |
| |
| if (coord_components == 3) /* r if present */ |
| bld.MOV(sources[length++], offset(coordinate, bld, 2)); |
| |
| coordinate_done = true; |
| break; |
| default: |
| break; |
| } |
| |
| /* Set up the coordinate (except for cases where it was done above) */ |
| if (!coordinate_done) { |
| for (unsigned i = 0; i < coord_components; i++) |
| bld.MOV(retype(sources[length++], payload_type), |
| offset(coordinate, bld, i)); |
| } |
| |
| if (min_lod.file != BAD_FILE && !min_lod_is_first) { |
| /* Account for all of the missing coordinate sources */ |
| if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) { |
| /* Bspec 64985: |
| * |
| * For sample_b sampler message format: |
| * |
| * SIMD16H/SIMD32H |
| * Param Number 0 1 2 3 4 5 |
| * Param BIAS U V R Ai MLOD |
| * |
| * SIMD16/SIMD32 |
| * Param Number 0 1 2 3 4 |
| * Param BIAS_AI U V R MLOD |
| */ |
| length += 3 - coord_components; |
| } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) { |
| /* On DG2 and newer platforms, sample_d can only be used with 1D and |
| * 2D surfaces, so the maximum number of gradient components is 2. |
| * In spite of this limitation, the Bspec lists a mysterious R |
| * component before the min_lod, so the maximum coordinate components |
| * is 3. |
| * |
| * See bspec 45942, "Enable new message layout for cube array" |
| */ |
| length += 3 - coord_components; |
| length += (2 - grad_components) * 2; |
| } else { |
| length += 4 - coord_components; |
| if (op == SHADER_OPCODE_TXD_LOGICAL) |
| length += (3 - grad_components) * 2; |
| } |
| |
| bld.MOV(sources[length++], min_lod); |
| |
| /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */ |
| if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL && |
| !inst->shadow_compare) |
| bld.MOV(sources[length++], min_lod); |
| } |
| |
| const brw_reg src_payload = |
| brw_vgrf(bld.shader->alloc.allocate(length * bld.dispatch_width() / 8), |
| BRW_TYPE_F); |
   /* In the case of a 16-bit payload, each component takes one full register
    * in both SIMD8H and SIMD16H modes. In both cases one register can hold
    * 16 elements. In the SIMD8H case the hardware simply expects the
    * components to be padded (i.e., aligned on a register boundary).
    */
| fs_inst *load_payload_inst = |
| emit_load_payload_with_padding(bld, src_payload, sources, length, |
| header_size, REG_SIZE * reg_unit(devinfo)); |
| unsigned mlen = load_payload_inst->size_written / REG_SIZE; |
| unsigned simd_mode = 0; |
| if (devinfo->ver < 20) { |
| if (payload_type_bit_size == 16) { |
| assert(devinfo->ver >= 11); |
| simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : |
| GFX10_SAMPLER_SIMD_MODE_SIMD16H; |
| } else { |
| simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : |
| BRW_SAMPLER_SIMD_MODE_SIMD16; |
| } |
| } else { |
| if (payload_type_bit_size == 16) { |
| simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H : |
| XE2_SAMPLER_SIMD_MODE_SIMD32H; |
| } else { |
| simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 : |
| XE2_SAMPLER_SIMD_MODE_SIMD32; |
| } |
| } |
| |
| /* Generate the SEND. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->header_size = header_size; |
| inst->sfid = BRW_SFID_SAMPLER; |
   unsigned sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
                               ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
                               : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
| if (surface.file == IMM && |
| (sampler.file == IMM || sampler_handle.file != BAD_FILE)) { |
| inst->desc = brw_sampler_desc(devinfo, surface.ud, |
| sampler.file == IMM ? sampler.ud % 16 : 0, |
| msg_type, |
| simd_mode, |
| sampler_ret_type); |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| } else if (surface_handle.file != BAD_FILE) { |
| /* Bindless surface */ |
| inst->desc = brw_sampler_desc(devinfo, |
| GFX9_BTI_BINDLESS, |
| sampler.file == IMM ? sampler.ud % 16 : 0, |
| msg_type, |
| simd_mode, |
| sampler_ret_type); |
| |
      /* For bindless samplers, the entire address is included in the message
       * header, so we can leave the sampler portion of the message
       * descriptor zero.
       */
| if (sampler_handle.file != BAD_FILE || sampler.file == IMM) { |
| inst->src[0] = brw_imm_ud(0); |
| } else { |
| const fs_builder ubld = bld.group(1, 0).exec_all(); |
| brw_reg desc = ubld.vgrf(BRW_TYPE_UD); |
| ubld.SHL(desc, sampler, brw_imm_ud(8)); |
| inst->src[0] = component(desc, 0); |
| } |
| |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface_handle, BRW_TYPE_UD); |
| inst->send_ex_bso = compiler->extended_bindless_surface_offset; |
| } else { |
| /* Immediate portion of the descriptor */ |
| inst->desc = brw_sampler_desc(devinfo, |
| 0, /* surface */ |
| 0, /* sampler */ |
| msg_type, |
| simd_mode, |
| sampler_ret_type); |
| const fs_builder ubld = bld.group(1, 0).exec_all(); |
| brw_reg desc = ubld.vgrf(BRW_TYPE_UD); |
| if (surface.equals(sampler)) { |
| /* This case is common in GL */ |
| ubld.MUL(desc, surface, brw_imm_ud(0x101)); |
| } else { |
| if (sampler_handle.file != BAD_FILE) { |
| ubld.MOV(desc, surface); |
| } else if (sampler.file == IMM) { |
| ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8)); |
| } else { |
| ubld.SHL(desc, sampler, brw_imm_ud(8)); |
| ubld.OR(desc, desc, surface); |
| } |
| } |
| ubld.AND(desc, desc, brw_imm_ud(0xfff)); |
| |
| inst->src[0] = component(desc, 0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } |
| |
| inst->ex_desc = 0; |
| |
| inst->src[2] = src_payload; |
| inst->resize_sources(3); |
| |
| /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ |
| assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo)); |
| } |
| |
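/**
 * Determine the bit size of the sampler message payload, normally taken
 * from the source types but forced to 16 bits on XeHP+ for the messages
 * the Bspec restricts to 16-bit (H) payloads.
 */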
| static unsigned |
| get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo, |
| const fs_inst *inst) |
| { |
| assert(inst); |
| const brw_reg *src = inst->src; |
| unsigned src_type_size = 0; |
| |
   /* All sources need to have the same size, so find the first valid source
    * and take the size from it.
    */
| for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { |
| if (src[i].file != BAD_FILE) { |
| src_type_size = brw_type_size_bytes(src[i].type); |
| break; |
| } |
| } |
| |
| assert(src_type_size == 2 || src_type_size == 4); |
| |
| #ifndef NDEBUG |
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces: there the payload contains MCS data,
    * which is already 16-bit, unlike the other parameters that need forced
    * conversion.
    */
| if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) { |
| for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) { |
| assert(src[i].file == BAD_FILE || |
| brw_type_size_bytes(src[i].type) == src_type_size); |
| } |
| } |
| #endif |
| |
| if (devinfo->verx10 < 125) |
| return src_type_size * 8; |
| |
   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP
    * Bspec: 3D and GPGPU Programs - Shared Functions - 3D Sampler -
    * Messages - Message Format [GFX12:HAS:1209977870]:
| * |
| * ld2dms_w SIMD8H and SIMD16H Only |
| * ld_mcs SIMD8H and SIMD16H Only |
| * ld2dms REMOVEDBY(GEN:HAS:1406788836) |
| */ |
| if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL || |
| inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL) |
| src_type_size = 2; |
| |
| return src_type_size * 8; |
| } |
| |
| static void |
| lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE]; |
| const brw_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C]; |
| const brw_reg lod = inst->src[TEX_LOGICAL_SRC_LOD]; |
| const brw_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2]; |
| const brw_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD]; |
| const brw_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX]; |
| const brw_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS]; |
| const brw_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE]; |
| const brw_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER]; |
| const brw_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]; |
| const brw_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE]; |
| const brw_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET]; |
| assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM); |
| const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; |
| assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); |
| const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; |
| assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM); |
| const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0; |
| |
| const unsigned msg_payload_type_bit_size = |
| get_sampler_msg_payload_type_bit_size(devinfo, inst); |
| |
| /* 16-bit payloads are available only on gfx11+ */ |
| assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11); |
| |
| lower_sampler_logical_send(bld, inst, coordinate, |
| shadow_c, lod, lod2, min_lod, |
| sample_index, |
| mcs, surface, sampler, |
| surface_handle, sampler_handle, |
| tg4_offset, |
| msg_payload_type_bit_size, |
| coord_components, grad_components, |
| residency); |
| } |
| |
| /** |
| * Predicate the specified instruction on the vector mask. |
| */ |
| static void |
| emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst) |
| { |
| assert(bld.shader->stage == MESA_SHADER_FRAGMENT && |
| bld.group() == inst->group && |
| bld.dispatch_width() == inst->exec_size); |
| |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| |
| const fs_visitor &s = *bld.shader; |
| const brw_reg vector_mask = ubld.vgrf(BRW_TYPE_UW); |
| ubld.UNDEF(vector_mask); |
| ubld.emit(SHADER_OPCODE_READ_ARCH_REG, vector_mask, retype(brw_sr0_reg(3), |
| BRW_TYPE_UD)); |
| const unsigned subreg = sample_mask_flag_subreg(s); |
| |
| ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask); |
| |
| if (inst->predicate) { |
| assert(inst->predicate == BRW_PREDICATE_NORMAL); |
| assert(!inst->predicate_inverse); |
| assert(inst->flag_subreg == 0); |
| assert(s.devinfo->ver < 20); |
| /* Combine the vector mask with the existing predicate by using a |
| * vertical predication mode. |
| */ |
| inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; |
| } else { |
| inst->flag_subreg = subreg; |
| inst->predicate = BRW_PREDICATE_NORMAL; |
| inst->predicate_inverse = false; |
| } |
| } |
| |
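/**
 * Fill out the SEND descriptor and descriptor sources for a legacy HDC
 * surface access, given either a binding table index (possibly dynamic) or
 * a bindless surface handle.
 */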
| static void |
| setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc, |
| const brw_reg &surface, const brw_reg &surface_handle) |
| { |
| const brw_compiler *compiler = bld.shader->compiler; |
| |
| /* We must have exactly one of surface and surface_handle */ |
| assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); |
| |
| if (surface.file == IMM) { |
| inst->desc = desc | (surface.ud & 0xff); |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } else if (surface_handle.file != BAD_FILE) { |
| /* Bindless surface */ |
| inst->desc = desc | GFX9_BTI_BINDLESS; |
| inst->src[0] = brw_imm_ud(0); |
| |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface_handle, BRW_TYPE_UD); |
| inst->send_ex_bso = compiler->extended_bindless_surface_offset; |
| } else { |
| inst->desc = desc; |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| brw_reg tmp = ubld.vgrf(BRW_TYPE_UD); |
| ubld.AND(tmp, surface, brw_imm_ud(0xff)); |
| inst->src[0] = component(tmp, 0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| } |
| } |
| |
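/**
 * Fill out the SEND descriptor sources for an LSC surface access according
 * to the address surface type already encoded in the message descriptor.
 */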
| static void |
| setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst, |
| uint32_t desc, const brw_reg &surface) |
| { |
| const ASSERTED intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_compiler *compiler = bld.shader->compiler; |
| |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| |
| enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc); |
| switch (surf_type) { |
| case LSC_ADDR_SURFTYPE_BSS: |
| inst->send_ex_bso = compiler->extended_bindless_surface_offset; |
      FALLTHROUGH;
| case LSC_ADDR_SURFTYPE_SS: |
| assert(surface.file != BAD_FILE); |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[1] = retype(surface, BRW_TYPE_UD); |
| break; |
| |
| case LSC_ADDR_SURFTYPE_BTI: |
| assert(surface.file != BAD_FILE); |
| if (surface.file == IMM) { |
| inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); |
| } else { |
| const fs_builder ubld = bld.exec_all().group(1, 0); |
| brw_reg tmp = ubld.vgrf(BRW_TYPE_UD); |
| ubld.SHL(tmp, surface, brw_imm_ud(24)); |
| inst->src[1] = component(tmp, 0); |
| } |
| break; |
| |
| case LSC_ADDR_SURFTYPE_FLAT: |
| inst->src[1] = brw_imm_ud(0); |
| break; |
| |
| default: |
| unreachable("Invalid LSC surface address type"); |
| } |
| } |
| |
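/**
 * Map the register type of an address source to the corresponding LSC
 * address size.
 */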
| static enum lsc_addr_size |
| lsc_addr_size_for_type(enum brw_reg_type type) |
| { |
| switch (brw_type_size_bytes(type)) { |
| case 2: return LSC_ADDR_SIZE_A16; |
| case 4: return LSC_ADDR_SIZE_A32; |
| case 8: return LSC_ADDR_SIZE_A64; |
| default: unreachable("invalid type size"); |
| } |
| } |
| |
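/**
 * Lower a memory logical instruction into an LSC SEND on platforms with an
 * LSC data port, selecting the UGM/TGM/SLM shared function from the memory
 * mode.
 */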
| static void |
| lower_lsc_memory_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| assert(devinfo->has_lsc); |
| |
| assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM); |
| |
| /* Get the logical send arguments. */ |
| const enum lsc_opcode op = (lsc_opcode) inst->src[MEMORY_LOGICAL_OPCODE].ud; |
| const enum memory_logical_mode mode = |
| (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud; |
| const enum lsc_addr_surface_type binding_type = |
| (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud; |
| const brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING]; |
| const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS]; |
| const unsigned coord_components = |
| inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud; |
| enum lsc_data_size data_size = |
| (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud; |
| const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud; |
| const enum memory_flags flags = |
| (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud; |
| const bool transpose = flags & MEMORY_FLAG_TRANSPOSE; |
| const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS; |
| const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0]; |
| const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1]; |
| const bool has_side_effects = inst->has_side_effects(); |
| |
| const uint32_t data_size_B = lsc_data_size_bytes(data_size); |
| const enum brw_reg_type data_type = |
| brw_type_with_size(data0.type, data_size_B * 8); |
| |
| const enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type); |
| |
| brw_reg payload = addr; |
| |
| if (addr.file != VGRF || !addr.is_contiguous()) { |
| if (inst->force_writemask_all) { |
| const fs_builder dbld = bld.group(bld.shader->dispatch_width, 0); |
| payload = dbld.move_to_vgrf(addr, coord_components); |
| } else { |
| payload = bld.move_to_vgrf(addr, coord_components); |
| } |
| } |
| |
| unsigned ex_mlen = 0; |
| brw_reg payload2; |
| if (data0.file != BAD_FILE) { |
| if (transpose) { |
| assert(data1.file == BAD_FILE); |
| |
| payload2 = data0; |
| ex_mlen = DIV_ROUND_UP(components, 8); |
| } else { |
| brw_reg data[8]; |
| unsigned size = 0; |
| |
| assert(components < 8); |
| |
| for (unsigned i = 0; i < components; i++) |
| data[size++] = offset(data0, inst->exec_size, i); |
| |
| if (data1.file != BAD_FILE) { |
| for (unsigned i = 0; i < components; i++) |
| data[size++] = offset(data1, inst->exec_size, i); |
| } |
| |
| payload2 = bld.vgrf(data0.type, size); |
| bld.LOAD_PAYLOAD(payload2, data, size, 0); |
| ex_mlen = (size * brw_type_size_bytes(data_type) * inst->exec_size) / REG_SIZE; |
| } |
| } |
| |
| /* Bspec: Atomic instruction -> Cache section: |
| * |
| * Atomic messages are always forced to "un-cacheable" in the L1 |
| * cache. |
| */ |
| unsigned cache_mode = |
| lsc_opcode_is_atomic(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB) : |
| lsc_opcode_is_store(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) : |
| (unsigned) LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS); |
| |
| /* If we're a fragment shader, we have to predicate with the sample mask to |
| * avoid helper invocations in instructions with side effects, unless they |
| * are explicitly required. One exception is for scratch writes - even |
| * though those have side effects, they represent operations that didn't |
| * originally have any. We want to avoid accessing undefined values from |
| * scratch, so we disable helper invocations entirely there. |
| * |
| * There are also special cases when we actually want to run on helpers |
| * (ray queries). |
| */ |
| if (bld.shader->stage == MESA_SHADER_FRAGMENT && !transpose) { |
| if (include_helpers) |
| emit_predicate_on_vector_mask(bld, inst); |
| else if (has_side_effects && mode != MEMORY_MODE_SCRATCH) |
| brw_emit_predicate_on_sample_mask(bld, inst); |
| } |
| |
| switch (mode) { |
| case MEMORY_MODE_UNTYPED: |
| case MEMORY_MODE_SCRATCH: |
| inst->sfid = GFX12_SFID_UGM; |
| break; |
| case MEMORY_MODE_TYPED: |
| inst->sfid = GFX12_SFID_TGM; |
| break; |
| case MEMORY_MODE_SHARED_LOCAL: |
| inst->sfid = GFX12_SFID_SLM; |
| break; |
| } |
| assert(inst->sfid); |
| |
| /* Disable LSC data port L1 cache scheme for the TGM load/store for RT |
| * shaders. (see HSD 18038444588) |
| */ |
| if (devinfo->ver >= 20 && gl_shader_stage_is_rt(bld.shader->stage) && |
| inst->sfid == GFX12_SFID_TGM && |
| !lsc_opcode_is_atomic(op)) { |
| if (lsc_opcode_is_store(op)) { |
| cache_mode = (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB); |
| } else { |
| cache_mode = (unsigned) LSC_CACHE(devinfo, LOAD, L1UC_L3C); |
| } |
| } |
| |
| inst->desc = lsc_msg_desc(devinfo, op, binding_type, addr_size, data_size, |
| lsc_opcode_has_cmask(op) ? |
| (1 << components) - 1 : components, |
| transpose, cache_mode); |
| |
| /* Set up extended descriptors, fills src[0] and src[1]. */ |
| setup_lsc_surface_descriptors(bld, inst, inst->desc, binding); |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_addr_len(devinfo, addr_size, |
| inst->exec_size * coord_components); |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| inst->resize_sources(4); |
| |
| /* Finally, the payload */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
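/**
 * Build the message header for an A64 OWord block message, placing the
 * 64-bit address in the first two dwords of the header.
 */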
| static brw_reg |
| emit_a64_oword_block_header(const fs_builder &bld, const brw_reg &addr) |
| { |
| const fs_builder ubld = bld.exec_all().group(8, 0); |
| |
| assert(brw_type_size_bytes(addr.type) == 8 && addr.stride == 0); |
| |
| brw_reg expanded_addr = addr; |
| if (addr.file == UNIFORM) { |
      /* We can't do stride 1 with the UNIFORM file; it requires stride 0. */
| fs_builder ubld1 = ubld.group(1, 0); |
| |
| brw_reg tmp = ubld1.vgrf(BRW_TYPE_UQ); |
| ubld1.UNDEF(tmp); |
| |
| expanded_addr = component(tmp, 0); |
| ubld1.MOV(expanded_addr, retype(addr, BRW_TYPE_UQ)); |
| } |
| |
| brw_reg header = ubld.vgrf(BRW_TYPE_UD); |
| ubld.MOV(header, brw_imm_ud(0)); |
| |
| /* Use a 2-wide MOV to fill out the address */ |
| brw_reg addr_vec2 = expanded_addr; |
| addr_vec2.type = BRW_TYPE_UD; |
| addr_vec2.stride = 1; |
| ubld.group(2, 0).MOV(header, addr_vec2); |
| |
| return header; |
| } |
| |
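/**
 * Lower a memory logical instruction into a legacy HDC data port SEND on
 * platforms without an LSC, choosing between the byte scattered, dword
 * scattered, OWord block and surface message variants.
 */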
| static void |
| lower_hdc_memory_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_compiler *compiler = bld.shader->compiler; |
| |
| assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM); |
| assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM); |
| |
| /* Get the logical send arguments. */ |
| const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud; |
| const enum memory_logical_mode mode = |
| (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud; |
| enum lsc_addr_surface_type binding_type = |
| (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud; |
| brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING]; |
| const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS]; |
| const unsigned coord_components = |
| inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud; |
| const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud; |
| const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud; |
| const enum memory_flags flags = |
| (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud; |
| const bool block = flags & MEMORY_FLAG_TRANSPOSE; |
| const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS; |
| const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0]; |
| const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1]; |
| const bool has_side_effects = inst->has_side_effects(); |
| const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null(); |
| |
| /* Don't predicate scratch writes on the sample mask. Otherwise, |
| * FS helper invocations would load undefined values from scratch memory. |
| * Besides, scratch load/store messages are produced from operations |
| * without side effects, so they should not behave differently in helper |
| * invocations. |
| */ |
| bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH; |
| |
| const enum lsc_data_size data_size = |
| (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud; |
| |
| /* unpadded data size */ |
| const uint32_t data_bit_size = |
| data_size == LSC_DATA_SIZE_D8U32 ? 8 : |
| data_size == LSC_DATA_SIZE_D16U32 ? 16 : |
| 8 * lsc_data_size_bytes(data_size); |
| |
| const bool byte_scattered = |
| data_bit_size < 32 || (alignment != 0 && alignment < 4); |
| const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH; |
| const bool surface_access = !byte_scattered && !dword_scattered && !block; |
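| /* E.g. a 16-bit access, or a 32-bit access with only 2-byte alignment, |
| * takes the byte-scattered path, while an aligned scratch access uses the |
| * dword-scattered messages instead. |
| */ |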
| |
| /* SLM block reads must use the 16B-aligned OWord Block Read messages, |
| * as the unaligned message doesn't exist for SLM. |
| */ |
| const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL; |
| assert(!oword_aligned || (alignment % 16) == 0); |
| |
| enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type); |
| unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size); |
| |
| brw_reg header; |
| fs_builder ubld8 = bld.exec_all().group(8, 0); |
| fs_builder ubld1 = ubld8.group(1, 0); |
| if (mode == MEMORY_MODE_SCRATCH) { |
| header = ubld8.vgrf(BRW_TYPE_UD); |
| ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0)); |
| } else if (block) { |
| if (addr_size == LSC_ADDR_SIZE_A64) { |
| header = emit_a64_oword_block_header(bld, addr); |
| } else { |
| header = ubld8.vgrf(BRW_TYPE_UD); |
| ubld8.MOV(header, brw_imm_ud(0)); |
| if (oword_aligned) |
| ubld1.SHR(component(header, 2), addr, brw_imm_ud(4)); |
| else |
| ubld1.MOV(component(header, 2), addr); |
| } |
| } |
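| /* In the aligned SLM case the header's third DWord holds an OWord index: |
| * e.g. a byte offset of 0x40 is programmed as 0x40 >> 4 == 0x4. |
| */ |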
| |
| /* If we're a fragment shader, we have to predicate with the sample mask |
| * to avoid helper invocations in instructions with side effects, unless |
| * they are explicitly required. |
| * |
| * There are also special cases when we actually want to run on helpers |
| * (ray queries). |
| */ |
| if (bld.shader->stage == MESA_SHADER_FRAGMENT) { |
| if (include_helpers) |
| emit_predicate_on_vector_mask(bld, inst); |
| else if (allow_sample_mask && |
| (header.file == BAD_FILE || !surface_access)) |
| brw_emit_predicate_on_sample_mask(bld, inst); |
| } |
| |
| brw_reg payload, payload2; |
| unsigned mlen, ex_mlen = 0; |
| |
| if (!block) { |
| brw_reg data[11]; |
| unsigned num_sources = 0; |
| if (header.file != BAD_FILE) |
| data[num_sources++] = header; |
| |
| for (unsigned i = 0; i < coord_components; i++) |
| data[num_sources++] = offset(addr, inst->exec_size, i); |
| |
| if (data0.file != BAD_FILE) { |
| for (unsigned i = 0; i < components; i++) |
| data[num_sources++] = offset(data0, inst->exec_size, i); |
| if (data1.file != BAD_FILE) { |
| for (unsigned i = 0; i < components; i++) |
| data[num_sources++] = offset(data1, inst->exec_size, i); |
| } |
| } |
| |
| assert(num_sources <= ARRAY_SIZE(data)); |
| |
| unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) + |
| (addr_size_B / 4) + |
| (lsc_op_num_data_values(op) * components * |
| lsc_data_size_bytes(data_size) / 4); |
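| /* E.g. an A32 cmpxchg (two data values) of one 32-bit component with no |
| * header needs 1 (address) + 2 * 1 * 4 / 4 == 3 UDs per channel. |
| */ |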
| |
| payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs); |
| fs_inst *load_payload = |
| emit_load_payload_with_padding(bld, payload, data, num_sources, |
| header.file != BAD_FILE ? 1 : 0, |
| REG_SIZE); |
| mlen = load_payload->size_written / REG_SIZE; |
| } else { |
| assert(data1.file == BAD_FILE); |
| |
| payload = header; |
| mlen = 1; |
| |
| if (data0.file != BAD_FILE) { |
| payload2 = bld.move_to_vgrf(data0, components); |
| ex_mlen = components * sizeof(uint32_t) / REG_SIZE; |
| } |
| } |
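| /* E.g. a block write of 8 DWords fits its data in a single GRF: |
| * ex_mlen == 8 * sizeof(uint32_t) / REG_SIZE == 1 on 32-byte-GRF parts. |
| */ |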
| |
| if (mode == MEMORY_MODE_SHARED_LOCAL) { |
| binding_type = LSC_ADDR_SURFTYPE_BTI; |
| binding = brw_imm_ud(GFX7_BTI_SLM); |
| } else if (mode == MEMORY_MODE_SCRATCH) { |
| binding_type = LSC_ADDR_SURFTYPE_BTI; |
| binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); |
| } |
| |
| uint32_t sfid, desc; |
| if (mode == MEMORY_MODE_TYPED) { |
| assert(addr_size == LSC_ADDR_SIZE_A32); |
| assert(!block); |
| |
| sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; |
| |
| if (lsc_opcode_is_atomic(op)) { |
| desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group, |
| lsc_op_to_legacy_atomic(op), |
| has_dest); |
| } else { |
| desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, |
| inst->group, components, !has_dest); |
| } |
| } else if (addr_size == LSC_ADDR_SIZE_A64) { |
| assert(binding_type == LSC_ADDR_SURFTYPE_FLAT); |
| assert(!dword_scattered); |
| |
| sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; |
| |
| if (lsc_opcode_is_atomic(op)) { |
| unsigned aop = lsc_op_to_legacy_atomic(op); |
| if (lsc_opcode_is_atomic_float(op)) { |
| desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, |
| data_bit_size, aop, |
| has_dest); |
| } else { |
| desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, |
| data_bit_size, aop, |
| has_dest); |
| } |
| } else if (block) { |
| desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned, |
| components, !has_dest); |
| } else if (byte_scattered) { |
| desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| data_bit_size, !has_dest); |
| } else { |
| desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| components, !has_dest); |
| } |
| } else { |
| assert(binding_type != LSC_ADDR_SURFTYPE_FLAT); |
| |
| sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1 |
| : GFX7_SFID_DATAPORT_DATA_CACHE; |
| |
| if (lsc_opcode_is_atomic(op)) { |
| unsigned aop = lsc_op_to_legacy_atomic(op); |
| if (lsc_opcode_is_atomic_float(op)) { |
| desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size, |
| aop, has_dest); |
| } else { |
| desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size, |
| aop, has_dest); |
| } |
| } else if (block) { |
| desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned, |
| components, !has_dest); |
| } else if (byte_scattered) { |
| desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| data_bit_size, !has_dest); |
| } else if (dword_scattered) { |
| desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size, |
| !has_dest); |
| } else { |
| desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| components, !has_dest); |
| } |
| } |
| |
| assert(sfid); |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->sfid = sfid; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = header.file != BAD_FILE ? 1 : 0; |
| inst->send_has_side_effects = has_side_effects; |
| inst->send_is_volatile = !has_side_effects; |
| |
| if (block) { |
| assert(inst->force_writemask_all); |
| inst->exec_size = components > 8 ? 16 : 8; |
| } |
| |
| inst->resize_sources(4); |
| |
| /* Set up descriptors */ |
| switch (binding_type) { |
| case LSC_ADDR_SURFTYPE_FLAT: |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| break; |
| case LSC_ADDR_SURFTYPE_BSS: |
| inst->send_ex_bso = compiler->extended_bindless_surface_offset; |
| /* fall-through */ |
| case LSC_ADDR_SURFTYPE_SS: |
| desc |= GFX9_BTI_BINDLESS; |
| |
| /* We assume that the driver provided the handle in the top 20 bits so |
| * we can use the surface handle directly as the extended descriptor. |
| */ |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = binding; |
| break; |
| case LSC_ADDR_SURFTYPE_BTI: |
| if (binding.file == IMM) { |
| desc |= binding.ud & 0xff; |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[1] = brw_imm_ud(0); |
| } else { |
| brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD); |
| ubld1.AND(tmp, binding, brw_imm_ud(0xff)); |
| inst->src[0] = component(tmp, 0); |
| inst->src[1] = brw_imm_ud(0); |
| } |
| break; |
| default: |
| unreachable("Unknown surface type"); |
| } |
| |
| inst->desc = desc; |
| |
| /* Finally, the payloads */ |
| inst->src[2] = payload; |
| inst->src[3] = payload2; |
| } |
| |
| static void |
| lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld, |
| fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| ASSERTED const brw_compiler *compiler = bld.shader->compiler; |
| |
| brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE]; |
| brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE]; |
| brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET]; |
| brw_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]; |
| |
| /* We are switching the instruction from an ALU-like instruction to a |
| * send-from-grf instruction. Since sends can't handle strides or |
| * source modifiers, we have to make a copy of the offset source. |
| */ |
| brw_reg ubo_offset = bld.move_to_vgrf(offset_B, 1); |
| |
| enum lsc_addr_surface_type surf_type = |
| surface_handle.file == BAD_FILE ? |
| LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS; |
| |
| assert(alignment_B.file == IMM); |
| unsigned alignment = alignment_B.ud; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->sfid = GFX12_SFID_UGM; |
| inst->resize_sources(3); |
| inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS && |
| compiler->extended_bindless_surface_offset; |
| |
| assert(!compiler->indirect_ubos_use_sampler); |
| |
| inst->src[0] = brw_imm_ud(0); |
| inst->src[2] = ubo_offset; /* payload */ |
| |
| if (alignment >= 4) { |
| inst->desc = |
| lsc_msg_desc(devinfo, LSC_OP_LOAD, |
| surf_type, LSC_ADDR_SIZE_A32, |
| LSC_DATA_SIZE_D32, |
| 4 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); |
| inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size); |
| |
| setup_lsc_surface_descriptors(bld, inst, inst->desc, |
| surface.file != BAD_FILE ? |
| surface : surface_handle); |
| } else { |
| inst->desc = |
| lsc_msg_desc(devinfo, LSC_OP_LOAD, |
| surf_type, LSC_ADDR_SIZE_A32, |
| LSC_DATA_SIZE_D32, |
| 1 /* num_channels */, |
| false /* transpose */, |
| LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); |
| inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size); |
| |
| setup_lsc_surface_descriptors(bld, inst, inst->desc, |
| surface.file != BAD_FILE ? |
| surface : surface_handle); |
| |
| /* The byte scattered messages can only read one dword at a time, so |
| * we have to duplicate the message 4 times to read the full vec4. |
| * Hopefully, dead code will clean up the mess if some of them aren't |
| * needed. |
| */ |
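| /* Concretely, iterations c == 1..3 each emit a copy still carrying the |
| * previous component's offset, then retarget this instruction to |
| * ubo_offset + c * 4 and the next destination register, so the surviving |
| * instruction reads the last component at ubo_offset + 12. |
| */ |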
| assert(inst->size_written == 16 * inst->exec_size); |
| inst->size_written /= 4; |
| for (unsigned c = 1; c < 4; c++) { |
| /* Emit a copy of the instruction because we're about to modify |
| * it. Because this loop starts at 1, we will emit copies for the |
| * first 3 and the final one will be the modified instruction. |
| */ |
| bld.emit(*inst); |
| |
| /* Offset the source */ |
| inst->src[2] = bld.vgrf(BRW_TYPE_UD); |
| bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); |
| |
| /* Offset the destination */ |
| inst->dst = offset(inst->dst, bld, 1); |
| } |
| } |
| } |
| |
| static void |
| lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| const brw_compiler *compiler = bld.shader->compiler; |
| |
| brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE]; |
| brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE]; |
| brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET]; |
| |
| /* We are switching the instruction from an ALU-like instruction to a |
| * send-from-grf instruction. Since sends can't handle strides or |
| * source modifiers, we have to make a copy of the offset source. |
| */ |
| brw_reg ubo_offset = bld.vgrf(BRW_TYPE_UD); |
| bld.MOV(ubo_offset, offset_B); |
| |
| assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == IMM); |
| unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = inst->exec_size / 8; |
| inst->resize_sources(3); |
| |
| /* src[0] & src[1] are filled by setup_surface_descriptors() */ |
| inst->src[2] = ubo_offset; /* payload */ |
| |
| if (compiler->indirect_ubos_use_sampler) { |
| const unsigned simd_mode = |
| inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : |
| BRW_SAMPLER_SIMD_MODE_SIMD16; |
| const uint32_t desc = brw_sampler_desc(devinfo, 0, 0, |
| GFX5_SAMPLER_MESSAGE_SAMPLE_LD, |
| simd_mode, 0); |
| |
| inst->sfid = BRW_SFID_SAMPLER; |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| } else if (alignment >= 4) { |
| const uint32_t desc = |
| brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, |
| 4, /* num_channels */ |
| false /* write */); |
| |
| inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1; |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| } else { |
| const uint32_t desc = |
| brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, |
| 32, /* bit_size */ |
| false /* write */); |
| |
| inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| |
| /* The byte scattered messages can only read one dword at a time, so |
| * we have to duplicate the message 4 times to read the full vec4. |
| * Hopefully, dead code will clean up the mess if some of them aren't |
| * needed. |
| */ |
| assert(inst->size_written == 16 * inst->exec_size); |
| inst->size_written /= 4; |
| for (unsigned c = 1; c < 4; c++) { |
| /* Emit a copy of the instruction because we're about to modify |
| * it. Because this loop starts at 1, we will emit copies for the |
| * first 3 and the final one will be the modified instruction. |
| */ |
| bld.emit(*inst); |
| |
| /* Offset the source */ |
| inst->src[2] = bld.vgrf(BRW_TYPE_UD); |
| bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4)); |
| |
| /* Offset the destination */ |
| inst->dst = offset(inst->dst, bld, 1); |
| } |
| } |
| } |
| |
| static void |
| lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst, |
| const struct brw_wm_prog_key *wm_prog_key, |
| const struct brw_wm_prog_data *wm_prog_data) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| |
| /* We have to send something */ |
| brw_reg payload = brw_vec8_grf(0, 0); |
| unsigned mlen = 1; |
| |
| unsigned mode; |
| switch (inst->opcode) { |
| case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
| assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE); |
| mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE; |
| break; |
| |
| case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
| assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE); |
| mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET; |
| break; |
| |
| case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
| payload = inst->src[INTERP_SRC_OFFSET]; |
| mlen = 2 * inst->exec_size / 8; |
| mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET; |
| break; |
| |
| default: |
| unreachable("Invalid interpolator instruction"); |
| } |
| |
| const bool dynamic_mode = |
| inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE; |
| |
| brw_reg desc = inst->src[INTERP_SRC_MSG_DESC]; |
| uint32_t desc_imm = |
| brw_pixel_interp_desc(devinfo, |
| /* Leave the mode at 0 if persample_dispatch is |
| * dynamic; it will be ORed in below. |
| */ |
| dynamic_mode ? 0 : mode, |
| inst->pi_noperspective, |
| false /* coarse_pixel_rate */, |
| inst->exec_size, inst->group); |
| |
| if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) { |
| desc_imm |= (1 << 15); |
| } else if (wm_prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) { |
| STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15)); |
| brw_reg orig_desc = desc; |
| const fs_builder &ubld = bld.exec_all().group(8, 0); |
| desc = ubld.vgrf(BRW_TYPE_UD); |
| ubld.AND(desc, dynamic_msaa_flags(wm_prog_data), |
| brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG)); |
| |
| /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */ |
| if (orig_desc.file == IMM) { |
| desc_imm |= orig_desc.ud; |
| } else { |
| ubld.OR(desc, desc, orig_desc); |
| } |
| } |
| |
| /* If persample_dispatch is dynamic, select the interpolation mode |
| * dynamically and OR into the descriptor to complete the static part |
| * generated by brw_pixel_interp_desc(). |
| * |
| * Why does this work? If you look at the SKL PRMs, Volume 7: |
| * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that |
| * |
| * - "Per Message Offset” Message Descriptor |
| * - “Sample Position Offset” Message Descriptor |
| * |
| * have different formats. Fortunately, a fragment shader dispatched at |
| * pixel rate, will have gl_SampleID = 0 & gl_NumSamples = 1. So the value |
| * we pack in “Sample Position Offset” will be a 0 and will cover the X/Y |
| * components of "Per Message Offset”, which will give us the pixel offset 0x0. |
| */ |
| if (dynamic_mode) { |
| brw_reg orig_desc = desc; |
| const fs_builder &ubld = bld.exec_all().group(8, 0); |
| desc = ubld.vgrf(BRW_TYPE_UD); |
| |
| /* The predicate should have been built in brw_fs_nir.cpp when emitting |
| * NIR code. This guarantees that we do not have incorrect interactions |
| * with the flag register holding the predication result. |
| */ |
| if (orig_desc.file == IMM) { |
| /* Not using SEL here because we would generate an instruction with 2 |
| * immediate sources, which is not supported by HW. |
| */ |
| set_predicate_inv(BRW_PREDICATE_NORMAL, false, |
| ubld.MOV(desc, brw_imm_ud(orig_desc.ud | |
| GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12))); |
| set_predicate_inv(BRW_PREDICATE_NORMAL, true, |
| ubld.MOV(desc, brw_imm_ud(orig_desc.ud | |
| GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12))); |
| } else { |
| set_predicate_inv(BRW_PREDICATE_NORMAL, false, |
| ubld.OR(desc, orig_desc, |
| brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12))); |
| set_predicate_inv(BRW_PREDICATE_NORMAL, true, |
| ubld.OR(desc, orig_desc, |
| brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12))); |
| } |
| } |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR; |
| inst->desc = desc_imm; |
| inst->ex_desc = 0; |
| inst->mlen = mlen; |
| inst->ex_mlen = 0; |
| inst->send_has_side_effects = false; |
| inst->send_is_volatile = false; |
| |
| inst->resize_sources(3); |
| inst->src[0] = component(desc, 0); |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = payload; |
| } |
| |
| static void |
| lower_btd_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| brw_reg global_addr = inst->src[0]; |
| const brw_reg btd_record = inst->src[1]; |
| |
| const unsigned unit = reg_unit(devinfo); |
| const unsigned mlen = 2 * unit; |
| const fs_builder ubld = bld.exec_all(); |
| brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2 * unit); |
| |
| ubld.MOV(header, brw_imm_ud(0)); |
| switch (inst->opcode) { |
| case SHADER_OPCODE_BTD_SPAWN_LOGICAL: |
| assert(brw_type_size_bytes(global_addr.type) == 8 && |
| global_addr.stride == 0); |
| global_addr.type = BRW_TYPE_UD; |
| global_addr.stride = 1; |
| ubld.group(2, 0).MOV(header, global_addr); |
| break; |
| |
| case SHADER_OPCODE_BTD_RETIRE_LOGICAL: |
| /* The bottom bit is the Stack ID release bit */ |
| ubld.group(1, 0).MOV(header, brw_imm_ud(1)); |
| break; |
| |
| default: |
| unreachable("Invalid BTD message"); |
| } |
| |
| /* Stack IDs are always in R1 regardless of whether we're coming from a |
| * bindless shader or a regular compute shader. |
| */ |
| brw_reg stack_ids = retype(offset(header, bld, 1), BRW_TYPE_UW); |
| bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0), |
| BRW_TYPE_UW)); |
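| /* For SPAWN, the header now holds the 64-bit BTD global address in its |
| * first two DWords, with the per-lane 16-bit stack IDs packed right |
| * after it. |
| */ |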
| |
| unsigned ex_mlen = 0; |
| brw_reg payload; |
| if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) { |
| ex_mlen = 2 * (inst->exec_size / 8); |
| payload = bld.move_to_vgrf(btd_record, 1); |
| } else { |
| assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL); |
| /* All these messages take a BTD, and things complain if we don't provide |
| * one for RETIRE. However, it should never actually be used, so fill it |
| * with zero. |
| */ |
| ex_mlen = 2 * (inst->exec_size / 8); |
| payload = bld.move_to_vgrf(brw_imm_uq(0), 1); |
| } |
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; /* HW docs require has_header = false */ |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH; |
| inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size, |
| GEN_RT_BTD_MESSAGE_SPAWN); |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = header; |
| inst->src[3] = payload; |
| } |
| |
| static void |
| lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal |
| * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use |
| * UQ/Q types on Gfx12.5, we need to tweak the stride with a value of |
| * 1 dword so that the MOV operates on 2 components rather than twice on |
| * the same component. |
| */ |
| brw_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_TYPE_UD); |
| globals_addr.stride = 1; |
| const brw_reg bvh_level = |
| inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ? |
| inst->src[RT_LOGICAL_SRC_BVH_LEVEL] : |
| bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL], |
| inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL)); |
| const brw_reg trace_ray_control = |
| inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ? |
| inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] : |
| bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL], |
| inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL)); |
| const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS]; |
| assert(synchronous_src.file == IMM); |
| const bool synchronous = synchronous_src.ud; |
| |
| const unsigned unit = reg_unit(devinfo); |
| const unsigned mlen = unit; |
| const fs_builder ubld = bld.exec_all(); |
| brw_reg header = ubld.vgrf(BRW_TYPE_UD); |
| ubld.MOV(header, brw_imm_ud(0)); |
| ubld.group(2, 0).MOV(header, globals_addr); |
| if (synchronous) |
| ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous)); |
| |
| const unsigned ex_mlen = inst->exec_size / 8; |
| brw_reg payload = bld.vgrf(BRW_TYPE_UD); |
| if (bvh_level.file == IMM && |
| trace_ray_control.file == IMM) { |
| uint32_t high = devinfo->ver >= 20 ? 10 : 9; |
| bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) | |
| (bvh_level.ud & 0x7))); |
| } else { |
| bld.SHL(payload, trace_ray_control, brw_imm_ud(8)); |
| bld.OR(payload, payload, bvh_level); |
| } |
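| /* E.g. bvh_level == 0 with trace_ray_control == 1 packs to 0x100: the |
| * control value lands in bits 9:8 (10:8 on Xe2+) and the level in bits |
| * 2:0. |
| */ |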
| |
| /* When doing synchronous traversal, the HW implicitly computes the |
| * stack_id using the following formula: |
| * |
| * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0] |
| * |
| * Only in the asynchronous case do we need to set the stack_id from the |
| * payload register. |
| */ |
| if (!synchronous) { |
| bld.AND(subscript(payload, BRW_TYPE_UW, 1), |
| retype(brw_vec8_grf(1 * unit, 0), BRW_TYPE_UW), |
| brw_imm_uw(0x7ff)); |
| } |
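| /* The 0x7ff mask above covers exactly those 4 + 3 + 4 == 11 bits. */ |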
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = mlen; |
| inst->ex_mlen = ex_mlen; |
| inst->header_size = 0; /* HW docs require has_header = false */ |
| inst->send_has_side_effects = true; |
| inst->send_is_volatile = false; |
| |
| /* Set up SFID and descriptors */ |
| inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR; |
| inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size); |
| inst->resize_sources(4); |
| inst->src[0] = brw_imm_ud(0); /* desc */ |
| inst->src[1] = brw_imm_ud(0); /* ex_desc */ |
| inst->src[2] = header; |
| inst->src[3] = payload; |
| } |
| |
| static void |
| lower_get_buffer_size(const fs_builder &bld, fs_inst *inst) |
| { |
| const intel_device_info *devinfo = bld.shader->devinfo; |
| /* Since we can only execute this instruction on uniform bti/surface |
| * handles, brw_fs_nir.cpp should already have limited this to SIMD8 |
| * (SIMD16 on Xe2+). |
| */ |
| assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16)); |
| |
| brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE]; |
| brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE]; |
| brw_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD]; |
| |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = inst->exec_size / 8; |
| inst->resize_sources(3); |
| inst->ex_mlen = 0; |
| inst->ex_desc = 0; |
| |
| /* src[0] & src[1] are filled by setup_surface_descriptors() */ |
| inst->src[2] = lod; |
| |
| const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS; |
| |
| const uint32_t desc = brw_sampler_desc(devinfo, 0, 0, |
| GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO, |
| BRW_SAMPLER_SIMD_MODE_SIMD8, |
| return_format); |
| |
| inst->dst = retype(inst->dst, BRW_TYPE_UW); |
| inst->sfid = BRW_SFID_SAMPLER; |
| setup_surface_descriptors(bld, inst, desc, surface, surface_handle); |
| } |
| |
| bool |
| brw_fs_lower_logical_sends(fs_visitor &s) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| bool progress = false; |
| |
| foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { |
| const fs_builder ibld(&s, block, inst); |
| |
| switch (inst->opcode) { |
| case FS_OPCODE_FB_WRITE_LOGICAL: |
| assert(s.stage == MESA_SHADER_FRAGMENT); |
| lower_fb_write_logical_send(ibld, inst, |
| brw_wm_prog_data(s.prog_data), |
| (const brw_wm_prog_key *)s.key, |
| s.fs_payload()); |
| break; |
| |
| case FS_OPCODE_FB_READ_LOGICAL: |
| lower_fb_read_logical_send(ibld, inst, brw_wm_prog_data(s.prog_data)); |
| break; |
| |
| case SHADER_OPCODE_TEX_LOGICAL: |
| case SHADER_OPCODE_TXD_LOGICAL: |
| case SHADER_OPCODE_TXF_LOGICAL: |
| case SHADER_OPCODE_TXL_LOGICAL: |
| case SHADER_OPCODE_TXS_LOGICAL: |
| case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: |
| case FS_OPCODE_TXB_LOGICAL: |
| case SHADER_OPCODE_TXF_CMS_W_LOGICAL: |
| case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: |
| case SHADER_OPCODE_TXF_MCS_LOGICAL: |
| case SHADER_OPCODE_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_LOGICAL: |
| case SHADER_OPCODE_TG4_BIAS_LOGICAL: |
| case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL: |
| case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: |
| case SHADER_OPCODE_SAMPLEINFO_LOGICAL: |
| lower_sampler_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_GET_BUFFER_SIZE: |
| lower_get_buffer_size(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_MEMORY_LOAD_LOGICAL: |
| case SHADER_OPCODE_MEMORY_STORE_LOGICAL: |
| case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL: |
| if (devinfo->ver >= 20 || |
| (devinfo->has_lsc && |
| inst->src[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_TYPED)) |
| lower_lsc_memory_logical_send(ibld, inst); |
| else |
| lower_hdc_memory_logical_send(ibld, inst); |
| break; |
| |
| case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: |
| if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler) |
| lower_lsc_varying_pull_constant_logical_send(ibld, inst); |
| else |
| lower_varying_pull_constant_logical_send(ibld, inst); |
| break; |
| |
| case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
| case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
| case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
| lower_interpolator_logical_send(ibld, inst, |
| (const brw_wm_prog_key *)s.key, |
| brw_wm_prog_data(s.prog_data)); |
| break; |
| |
| case SHADER_OPCODE_BTD_SPAWN_LOGICAL: |
| case SHADER_OPCODE_BTD_RETIRE_LOGICAL: |
| lower_btd_logical_send(ibld, inst); |
| break; |
| |
| case RT_OPCODE_TRACE_RAY_LOGICAL: |
| lower_trace_ray_logical_send(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_URB_READ_LOGICAL: |
| if (devinfo->ver < 20) |
| lower_urb_read_logical_send(ibld, inst); |
| else |
| lower_urb_read_logical_send_xe2(ibld, inst); |
| break; |
| |
| case SHADER_OPCODE_URB_WRITE_LOGICAL: |
| if (devinfo->ver < 20) |
| lower_urb_write_logical_send(ibld, inst); |
| else |
| lower_urb_write_logical_send_xe2(ibld, inst); |
| break; |
| |
| default: |
| continue; |
| } |
| |
| progress = true; |
| } |
| |
| if (progress) |
| s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); |
| |
| return progress; |
| } |
| |
| /** |
| * Turns the generic expression-style uniform pull constant load instruction |
| * into a hardware-specific series of instructions for loading a pull |
| * constant. |
| * |
| * The expression style allows the CSE pass before this to optimize out |
| * repeated loads from the same offset, and gives the pre-register-allocation |
| * scheduling full flexibility, while the conversion to native instructions |
| * allows the post-register-allocation scheduler the best information |
| * possible. |
| * |
| * Note that execution masking for setting up pull constant loads is special: |
| * the channels that need to be written are unrelated to the current execution |
| * mask, since a later instruction will use one of the result channels as a |
| * source operand for all 8 or 16 of its channels. |
| */ |
| bool |
| brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s) |
| { |
| const intel_device_info *devinfo = s.devinfo; |
| bool progress = false; |
| |
| foreach_block_and_inst (block, fs_inst, inst, s.cfg) { |
| if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) |
| continue; |
| |
| const brw_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE]; |
| const brw_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE]; |
| const brw_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET]; |
| const brw_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE]; |
| assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE); |
| assert(offset_B.file == IMM); |
| assert(size_B.file == IMM); |
| |
| if (devinfo->has_lsc) { |
| const fs_builder ubld = |
| fs_builder(&s, block, inst).group(8, 0).exec_all(); |
| |
| const brw_reg payload = ubld.vgrf(BRW_TYPE_UD); |
| ubld.MOV(payload, offset_B); |
| |
| inst->sfid = GFX12_SFID_UGM; |
| inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, |
| surface_handle.file == BAD_FILE ? |
| LSC_ADDR_SURFTYPE_BTI : |
| LSC_ADDR_SURFTYPE_BSS, |
| LSC_ADDR_SIZE_A32, |
| LSC_DATA_SIZE_D32, |
| inst->size_written / 4, |
| true /* transpose */, |
| LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS)); |
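| /* E.g. a 64-byte uniform range lowers to a single transposed D32 load |
| * of inst->size_written / 4 == 16 channels from one A32 address. |
| */ |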
| |
| /* Update the original instruction. */ |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1); |
| inst->send_ex_bso = surface_handle.file != BAD_FILE && |
| s.compiler->extended_bindless_surface_offset; |
| inst->ex_mlen = 0; |
| inst->header_size = 0; |
| inst->send_has_side_effects = false; |
| inst->send_is_volatile = true; |
| inst->exec_size = 1; |
| |
| /* Finally, the payload */ |
| |
| inst->resize_sources(3); |
| setup_lsc_surface_descriptors(ubld, inst, inst->desc, |
| surface.file != BAD_FILE ? |
| surface : surface_handle); |
| inst->src[2] = payload; |
| |
| s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); |
| } else { |
| const fs_builder ubld = fs_builder(&s, block, inst).exec_all(); |
| brw_reg header = fs_builder(&s, 8).exec_all().vgrf(BRW_TYPE_UD); |
| |
| ubld.group(8, 0).MOV(header, |
| retype(brw_vec8_grf(0, 0), BRW_TYPE_UD)); |
| ubld.group(1, 0).MOV(component(header, 2), |
| brw_imm_ud(offset_B.ud / 16)); |
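| /* E.g. a byte offset of 256 is programmed as OWord offset |
| * 256 / 16 == 16 in the header's third DWord. |
| */ |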
| |
| inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE; |
| inst->opcode = SHADER_OPCODE_SEND; |
| inst->header_size = 1; |
| inst->mlen = 1; |
| |
| uint32_t desc = |
| brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */, |
| size_B.ud / 4, false /* write */); |
| |
| inst->resize_sources(4); |
| |
| setup_surface_descriptors(ubld, inst, desc, surface, surface_handle); |
| |
| inst->src[2] = header; |
| inst->src[3] = brw_reg(); /* unused for reads */ |
| |
| s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); |
| } |
| |
| progress = true; |
| } |
| |
| return progress; |
| } |