/* src/nouveau/compiler/nak_nir_lower_tex.c - platform/external/mesa3d - Git at Google */

 /*
  * Copyright © 2023 Collabora, Ltd.
  * SPDX-License-Identifier: MIT
  */

 #include "nak_private.h"
 #include "nir_builder.h"
 #include "nir_format_convert.h"

 #include "util/u_math.h"

 /* Map the subpass-input sampler dimensions onto the ordinary 2D and
  * multisampled dimensions.  Any other dimension is returned unchanged.
  */
 static enum glsl_sampler_dim
 remap_sampler_dim(enum glsl_sampler_dim dim)
 {
    if (dim == GLSL_SAMPLER_DIM_SUBPASS)
       return GLSL_SAMPLER_DIM_2D;

    if (dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
       return GLSL_SAMPLER_DIM_MS;

    return dim;
 }

 /* Lower a texture sampling, fetch, gather or LOD-query instruction into the
  * form consumed by the NAK backend.
  *
  * Every source of the instruction is collected and repacked into two vectors
  * which replace the original sources as nir_tex_src_backend1 and
  * nir_tex_src_backend2.  The selected LOD mode and offset mode, together
  * with tex->is_shadow and tex->is_sparse, are stored in tex->backend_flags
  * as a struct nak_nir_tex_flags.  For nir_texop_lod the result is also
  * converted to floats after the instruction.
  *
  * Returns true unconditionally since the instruction is always rewritten.
  */
 static bool
 lower_tex(nir_builder *b, nir_tex_instr *tex, const struct nak_compiler *nak)
 {
    b->cursor = nir_before_instr(&tex->instr);

    /* Collect every source by type; anything unexpected is a caller bug. */
    nir_def *tex_h = NULL, *samp_h = NULL, *coord = NULL, *ms_idx = NULL;
    nir_def *offset = NULL, *lod = NULL, *bias = NULL, *min_lod = NULL;
    nir_def *ddx = NULL, *ddy = NULL, *z_cmpr = NULL;
    for (unsigned i = 0; i < tex->num_srcs; i++) {
       switch (tex->src[i].src_type) {
       case nir_tex_src_texture_handle: tex_h =     tex->src[i].src.ssa; break;
       case nir_tex_src_sampler_handle: samp_h =    tex->src[i].src.ssa; break;
       case nir_tex_src_coord:          coord =     tex->src[i].src.ssa; break;
       case nir_tex_src_ms_index:       ms_idx =    tex->src[i].src.ssa; break;
       case nir_tex_src_comparator:     z_cmpr =    tex->src[i].src.ssa; break;
       case nir_tex_src_offset:         offset =    tex->src[i].src.ssa; break;
       case nir_tex_src_lod:            lod =       tex->src[i].src.ssa; break;
       case nir_tex_src_bias:           bias =      tex->src[i].src.ssa; break;
       case nir_tex_src_min_lod:        min_lod =   tex->src[i].src.ssa; break;
       case nir_tex_src_ddx:            ddx =       tex->src[i].src.ssa; break;
       case nir_tex_src_ddy:            ddy =       tex->src[i].src.ssa; break;
       default:
          unreachable("Unsupported texture source");
       }
    }

    /* The texture handle is used unconditionally below and the first two
     * source slots are rewritten in place at the end, so both must exist.
     */
    assert(tex_h != NULL);
    assert(tex->num_srcs >= 2);

    /* Combine sampler and texture into one if needed: the low 20 bits are
     * taken from the texture handle and the high 12 bits from the sampler
     * handle.
     */
    if (samp_h != NULL && samp_h != tex_h) {
       tex_h = nir_ior(b, nir_iand_imm(b, tex_h,  0x000fffff),
                          nir_iand_imm(b, samp_h, 0xfff00000));
    }
    tex_h = nir_u2u32(b, tex_h);

    /* Array index is treated separately, so pull it off if we have one. */
    nir_def *arr_idx = NULL;
    unsigned coord_components = tex->coord_components;
    if (coord && tex->is_array) {
       if (tex->op == nir_texop_lod) {
          /* The HW wants an array index. Use zero. */
          arr_idx = nir_imm_int(b, 0);
       } else {
          /* The array index is the last coordinate component. */
          arr_idx = nir_channel(b, coord, --coord_components);

          /* Everything but texelFetch takes a float index
           *
           * TODO: Use F2I.U32.RNE
           */
          if (tex->op != nir_texop_txf && tex->op != nir_texop_txf_ms) {
             arr_idx = nir_fadd_imm(b, arr_idx, 0.5);

             // TODO: Hardware seems to clamp negative values to zero for us
             // in f2u, but we still need this fmax for constant folding.
             arr_idx = nir_fmax(b, arr_idx, nir_imm_float(b, 0.0));

             arr_idx = nir_f2u32(b, arr_idx);
          }

          /* Clamp so the index fits in 16 bits. */
          arr_idx = nir_umin(b, arr_idx, nir_imm_int(b, UINT16_MAX));
       }
    }

    /* Pick the LOD mode from whichever LOD-related source is present.  When
     * a bias is used it travels in the same slot as an explicit LOD.
     */
    enum nak_nir_lod_mode lod_mode = NAK_NIR_LOD_MODE_AUTO;
    if (tex->op == nir_texop_txf_ms) {
       /* Multisampled textures do not have miplevels */
       lod_mode = NAK_NIR_LOD_MODE_ZERO;
       lod = NULL; /* We don't need this */
    } else if (lod != NULL) {
       nir_scalar lod_s = { .def = lod, .comp = 0 };
       if (nir_scalar_is_const(lod_s) &&
           nir_scalar_as_uint(lod_s) == 0) {
          lod_mode = NAK_NIR_LOD_MODE_ZERO;
          lod = NULL; /* We don't need this */
       } else {
          lod_mode = NAK_NIR_LOD_MODE_LOD;
       }
    } else if (bias != NULL) {
       lod_mode = NAK_NIR_LOD_MODE_BIAS;
       lod = bias;
    }

    /* A minimum LOD is only valid on top of the automatic or bias modes. */
    if (min_lod != NULL) {
       switch (lod_mode) {
       case NAK_NIR_LOD_MODE_AUTO:
          lod_mode = NAK_NIR_LOD_MODE_CLAMP;
          break;
       case NAK_NIR_LOD_MODE_BIAS:
          lod_mode = NAK_NIR_LOD_MODE_BIAS_CLAMP;
          break;
       default:
          unreachable("Invalid min_lod");
       }
       /* Scale by 256, clamp to at least 16 and convert to unsigned.
        * NOTE(review): presumably an unsigned fixed-point value with 8
        * fractional bits -- confirm against the hardware documentation.
        */
       min_lod = nir_f2u32(b, nir_fmax(b, nir_fmul_imm(b, min_lod, 256),
                                          nir_imm_float(b, 16)));
    }

    enum nak_nir_offset_mode offset_mode = NAK_NIR_OFFSET_MODE_NONE;
    if (offset != NULL) {
       /* For TG4, offsets are packed into a single 32-bit value with 8 bits
        * per component.  For all other texture instructions, offsets are
        * packed into a single at most 16-bit value with 4 bits per component.
        */
       static const unsigned bits4[] = { 4, 4, 4, 4 };
       static const unsigned bits8[] = { 8, 8, 8, 8 };
       const unsigned *bits = tex->op == nir_texop_tg4 ? bits8 : bits4;

       offset = nir_pad_vector_imm_int(b, offset, 0, 4);
       offset = nir_format_clamp_sint(b, offset, bits);
       offset = nir_format_pack_uint(b, offset, bits, 4);
       offset_mode = NAK_NIR_OFFSET_MODE_AOFFI;
    } else if (nir_tex_instr_has_explicit_tg4_offsets(tex)) {
       /* Pack the eight constant offsets, one byte each, into 64 bits and
        * pass them as two 32-bit immediates.
        */
       uint64_t off_u64 = 0;
       for (uint8_t i = 0; i < 8; ++i) {
          uint64_t off = (uint8_t)tex->tg4_offsets[i / 2][i % 2];
          off_u64 |= off << (i * 8);
       }
       offset = nir_imm_ivec2(b, off_u64, off_u64 >> 32);
       offset_mode = NAK_NIR_OFFSET_MODE_PER_PX;
    }

    /* Components of the two backend source vectors, in push order. */
    nir_def *src0[4] = { NULL, };
    nir_def *src1[4] = { NULL, };
    unsigned src0_comps = 0, src1_comps = 0;

 #define PUSH(a, x) do { \
    nir_def *val = (x); \
    assert(a##_comps < ARRAY_SIZE(a)); \
    a[a##_comps++] = val; \
 } while(0)

    if (nak->sm >= 50) {
       if (tex->op == nir_texop_txd) {
          /* Explicit derivatives: handle, coordinates and the optional
           * array index / offset go in src0, the derivatives in src1.
           */
          PUSH(src0, tex_h);

          for (uint32_t i = 0; i < coord_components; i++)
             PUSH(src0, nir_channel(b, coord, i));

          if (offset != NULL) {
             /* NOTE(review): selector 0x1054 appears to combine the low 16
              * bits of the array index with the low 16 bits of the offset
              * in one dword -- confirm against nir_prmt_nv semantics.
              */
             nir_def *arr_idx_or_zero = arr_idx ? arr_idx : nir_imm_int(b, 0);
             nir_def *arr_off = nir_prmt_nv(b, nir_imm_int(b, 0x1054),
                                            offset, arr_idx_or_zero);
             PUSH(src0, arr_off);
          } else if (arr_idx != NULL) {
             PUSH(src0, arr_idx);
          }

          assert(ddx != NULL && ddy != NULL);
          assert(ddx->num_components == coord_components);
          for (uint32_t i = 0; i < coord_components; i++) {
             PUSH(src1, nir_channel(b, ddx, i));
             PUSH(src1, nir_channel(b, ddy, i));
          }
       } else {
          /* Everything else: array index / min LOD and coordinates go in
           * src0; the handle and all remaining operands go in src1.
           */
          if (min_lod != NULL) {
             nir_def *arr_idx_or_zero = arr_idx ? arr_idx : nir_imm_int(b, 0);
             nir_def *arr_ml = nir_prmt_nv(b, nir_imm_int(b, 0x1054),
                                           min_lod, arr_idx_or_zero);
             PUSH(src0, arr_ml);
          } else if (arr_idx != NULL) {
             PUSH(src0, arr_idx);
          }

          for (uint32_t i = 0; i < coord_components; i++)
             PUSH(src0, nir_channel(b, coord, i));

          PUSH(src1, tex_h);
          if (ms_idx != NULL)
             PUSH(src1, ms_idx);
          if (lod != NULL)
             PUSH(src1, lod);
          if (offset_mode == NAK_NIR_OFFSET_MODE_AOFFI) {
             PUSH(src1, offset);
          } else if (offset_mode == NAK_NIR_OFFSET_MODE_PER_PX) {
             PUSH(src1, nir_channel(b, offset, 0));
             PUSH(src1, nir_channel(b, offset, 1));
          }
          if (z_cmpr != NULL)
             PUSH(src1, z_cmpr);
       }
    } else {
       unreachable("Unsupported shader model");
    }

    /* PUSH is local to this function; do not leak it into the rest of the
     * translation unit.
     */
 #undef PUSH

    nir_def *vec_srcs[2] = {
       nir_vec(b, src0, src0_comps),
       nir_vec(b, src1, src1_comps),
    };

    /* Reuse the first two source slots for the backend vectors. */
    tex->src[0].src_type = nir_tex_src_backend1;
    nir_src_rewrite(&tex->src[0].src, vec_srcs[0]);

    tex->src[1].src_type = nir_tex_src_backend2;
    nir_src_rewrite(&tex->src[1].src, vec_srcs[1]);

    /* Remove any extras */
    while (tex->num_srcs > 2)
       nir_tex_instr_remove_src(tex, tex->num_srcs - 1);

    tex->sampler_dim = remap_sampler_dim(tex->sampler_dim);

    struct nak_nir_tex_flags flags = {
       .lod_mode = lod_mode,
       .offset_mode = offset_mode,
       .has_z_cmpr = tex->is_shadow,
       .is_sparse = tex->is_sparse,
    };
    STATIC_ASSERT(sizeof(flags) == sizeof(tex->backend_flags));
    memcpy(&tex->backend_flags, &flags, sizeof(flags));

    if (tex->op == nir_texop_lod) {
       b->cursor = nir_after_instr(&tex->instr);

       /* The outputs are flipped compared to what NIR expects */
       nir_def *abs = nir_channel(b, &tex->def, 1);
       nir_def *rel = nir_channel(b, &tex->def, 0);

       /* The returned values are not quite what we want:
        * (a) convert from s16/u16 to f32
        * (b) multiply by 1/256
        *
        * TODO: We can make this cheaper once we have 16-bit in NAK
        */
       abs = nir_u2f32(b, nir_iand_imm(b, abs, 0xffff));
       /* Shift left then arithmetic-right by 16 to sign-extend the low
        * half of rel.
        */
       nir_def *shift = nir_imm_int(b, 16);
       rel = nir_i2f32(b, nir_ishr(b, nir_ishl(b, rel, shift), shift));

       abs = nir_fmul_imm(b, abs, 1.0 / 256.0);
       rel = nir_fmul_imm(b, rel, 1.0 / 256.0);

       /* Only uses after the fix-up code see the converted value; the
        * fix-up itself keeps reading the raw result.
        */
       nir_def *res = nir_vec2(b, abs, rel);
       nir_def_rewrite_uses_after(&tex->def, res, res->parent_instr);
    }

    return true;
 }

 /* Lower a texture query (size, level count or sample count) to one of the
  * backend query ops.
  *
  * nir_texop_txs and nir_texop_query_levels become nir_texop_hdr_dim_nv with
  * a (handle, lod) source; nir_texop_texture_samples becomes
  * nir_texop_tex_type_nv with just the handle.  The instruction's result is
  * widened to four components and only the components selected by the
  * per-op mask are forwarded to the original users.
  *
  * Returns true unconditionally since the instruction is always rewritten.
  */
 static bool
 lower_txq(nir_builder *b, nir_tex_instr *tex, const struct nak_compiler *nak)
 {
    b->cursor = nir_before_instr(&tex->instr);

    assert(!tex->is_sparse);

    nir_def *tex_h = NULL, *lod = NULL;
    for (unsigned i = 0; i < tex->num_srcs; i++) {
       switch (tex->src[i].src_type) {
       case nir_tex_src_texture_handle: tex_h = tex->src[i].src.ssa; break;
       case nir_tex_src_sampler_handle: break; /* Ignored */
       case nir_tex_src_lod:            lod = tex->src[i].src.ssa; break;
       default:
          unreachable("Unsupported texture source");
       }
    }

    /* The handle is consumed unconditionally below; without this check a
     * query lacking a texture handle source would dereference NULL.
     */
    assert(tex_h != NULL);

    /* TODO: We should only support 32-bit handles */
    tex_h = nir_u2u32(b, tex_h);

    /* NOTE(review): the component masks below are built with BITSET_*
     * macros while shrink_image_load() in this file uses BITFIELD_*;
     * consider unifying once both families are confirmed to agree for
     * these small values.
     */
    nir_def *txq_src;
    nir_component_mask_t mask;
    switch (tex->op) {
    case nir_texop_txs:
       /* Size query: keep as many components as the original result had. */
       tex->op = nir_texop_hdr_dim_nv;
       if (lod == NULL)
          lod = nir_imm_int(b, 0);
       txq_src = nir_vec2(b, tex_h, lod);
       mask = BITSET_MASK(tex->def.num_components);
       break;
    case nir_texop_query_levels:
       /* Level count: read from component 3 of the same query at LOD 0. */
       tex->op = nir_texop_hdr_dim_nv;
       txq_src = nir_vec2(b, tex_h, nir_imm_int(b, 0));
       mask = BITSET_BIT(3);
       break;
    case nir_texop_texture_samples:
       /* Sample count: read from component 2 of the type query. */
       tex->op = nir_texop_tex_type_nv;
       txq_src = tex_h;
       mask = BITSET_BIT(2);
       break;
    default:
       unreachable("Invalid texture query op");
    }

    tex->src[0].src_type = nir_tex_src_backend1;
    nir_src_rewrite(&tex->src[0].src, txq_src);

    /* Remove any extras */
    while (tex->num_srcs > 1)
       nir_tex_instr_remove_src(tex, tex->num_srcs - 1);

    tex->sampler_dim = remap_sampler_dim(tex->sampler_dim);

    b->cursor = nir_after_instr(&tex->instr);

    /* Only pick off selected components */
    tex->def.num_components = 4;
    nir_def *res = nir_channels(b, &tex->def, mask);
    nir_def_rewrite_uses_after(&tex->def, res, res->parent_instr);

    return true;
 }

 /* Shrink a bindless image load to the components that are actually needed.
  *
  *  - A sparse load whose residency component is never read is turned into
  *    a plain bindless_image_load with one component fewer.
  *  - A 64-bit load is rewritten in place as a 32-bit load and its result is
  *    repacked into a 64-bit vector that replaces the later uses.
  *  - Otherwise, when the format is known and the alpha channel is not read,
  *    the load is narrowed and the dropped colour channels are refilled with
  *    constants (zero for channels 0-2, one for channel 3).
  *
  * Returns true if the intrinsic was modified in any way.
  */
 static bool
 shrink_image_load(nir_builder *b, nir_intrinsic_instr *intrin,
                   const struct nak_compiler *nak)
 {
    enum pipe_format format = nir_intrinsic_format(intrin);
    nir_component_mask_t color_comps_read =
       nir_def_components_read(&intrin->def);

    assert(intrin->intrinsic == nir_intrinsic_bindless_image_load ||
           intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load);

    /* Set as soon as the intrinsic has been touched so that the early exits
     * below still report the change to the pass driver.
     */
    bool progress = false;

    /* Pick off the sparse resident component (if any) before we do anything
     * else.  This makes later logic easier.
     */
    bool is_sparse = false;
    if (intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load) {
       /* The residency information lives in the last component. */
       unsigned resident_comp = intrin->def.num_components - 1;
       if (color_comps_read & BITFIELD_BIT(resident_comp)) {
          is_sparse = true;
          color_comps_read &= ~BITFIELD_BIT(resident_comp);
       } else {
          /* If the sparse bit is never used, get rid of it */
          intrin->intrinsic = nir_intrinsic_bindless_image_load;
          intrin->num_components--;
          intrin->def.num_components--;
          progress = true;
       }
    }

    if (intrin->def.bit_size == 64) {
       assert(format == PIPE_FORMAT_NONE ||
              format == PIPE_FORMAT_R64_UINT ||
              format == PIPE_FORMAT_R64_SINT);

       b->cursor = nir_after_instr(&intrin->instr);

       nir_def *data_xy, *data_w, *resident = NULL;
       if (color_comps_read & BITFIELD_BIT(3)) {
          /* Thanks to descriptor indexing, we need to ensure that null
           * descriptor behavior works properly.  In particular, normal zero
           * reads will return (0, 0, 0, 1) whereas null descriptor reads need
           * to return (0, 0, 0, 0).  This means we can't blindly extend with
           * an alpha component of 1.  Instead, we need to trust the hardware
           * to extend the original RG32 with z = 0 and w = 1 and copy the w
           * value all the way out to 64-bit w value.
           */
          assert(intrin->num_components == 4 + is_sparse);
          assert(intrin->def.num_components == 4 + is_sparse);
          intrin->def.bit_size = 32;

          data_xy = nir_channels(b, &intrin->def, 0x3);
          data_w = nir_channels(b, &intrin->def, 0x8);
          if (is_sparse)
             resident = nir_channel(b, &intrin->def, 4);
       } else {
          /* Alpha is not read: load just the two 32-bit halves (plus the
           * residency component when sparse) and use zero for w.
           */
          intrin->num_components = 2 + is_sparse;
          intrin->def.num_components = 2 + is_sparse;
          intrin->def.bit_size = 32;

          data_xy = nir_channels(b, &intrin->def, 0x3);
          data_w = nir_imm_int(b, 0);
          if (is_sparse)
             resident = nir_channel(b, &intrin->def, 2);
       }

       /* Rebuild the 64-bit result: x is the packed 32-bit pair, y and z
        * are zero, w (and the residency value, if any) are widened.
        */
       nir_def *data;
       if (is_sparse) {
          data = nir_vec5(b, nir_pack_64_2x32(b, data_xy),
                          nir_imm_zero(b, 1, 64),
                          nir_imm_zero(b, 1, 64),
                          nir_u2u64(b, data_w),
                          nir_u2u64(b, resident));
       } else {
          data = nir_vec4(b, nir_pack_64_2x32(b, data_xy),
                          nir_imm_zero(b, 1, 64),
                          nir_imm_zero(b, 1, 64),
                          nir_u2u64(b, data_w));
       }

       nir_def_rewrite_uses_after(&intrin->def, data, data->parent_instr);
       return true;
    }

    /* Without a known format there is nothing to base the shrink on. */
    if (format == PIPE_FORMAT_NONE)
       return progress;

    /* In order for null descriptors to work properly, we don't want to shrink
     * loads when the alpha channel is read even if we know the format has
     * fewer channels.
     */
    if (color_comps_read & BITFIELD_BIT(3))
       return progress;

    const unsigned old_comps = intrin->def.num_components;

    /* Start from the format's component count rounded up to a power of two,
     * then override with 2 or 1 when only the low components are read.
     */
    unsigned new_comps = util_format_get_nr_components(format);
    new_comps = util_next_power_of_two(new_comps);
    if (color_comps_read <= BITFIELD_MASK(2))
       new_comps = 2;
    if (color_comps_read <= BITFIELD_MASK(1))
       new_comps = 1;

    if (new_comps + is_sparse >= intrin->num_components)
       return progress;

    b->cursor = nir_after_instr(&intrin->instr);

    intrin->num_components = new_comps + is_sparse;
    intrin->def.num_components = new_comps + is_sparse;

    /* Rebuild a vector of the original width: loaded channels first, zero
     * for the remaining channels below index 3 and one for channel 3.
     */
    assert(new_comps <= 4);
    nir_def *comps[5];
    for (unsigned c = 0; c < new_comps; c++)
       comps[c] = nir_channel(b, &intrin->def, c);
    for (unsigned c = new_comps; c < 3; c++)
       comps[c] = nir_imm_intN_t(b, 0, intrin->def.bit_size);
    if (new_comps < 4)
       comps[3] = nir_imm_intN_t(b, 1, intrin->def.bit_size);

    /* The resident bit always goes in the last channel */
    if (is_sparse)
       comps[old_comps - 1] = nir_channel(b, &intrin->def, new_comps);

    nir_def *data = nir_vec(b, comps, old_comps);
    nir_def_rewrite_uses_after(&intrin->def, data, data->parent_instr);
    return true;
 }

 /* Narrow the data operand of a bindless image store.
  *
  * 64-bit data is replaced by the 32-bit pair unpacked from its first
  * component.  Other data is trimmed to the format's component count rounded
  * up to a power of two, when the format is known and that is fewer
  * components than the store currently carries.
  *
  * Returns true if the intrinsic was modified.
  */
 static bool
 shrink_image_store(nir_builder *b, nir_intrinsic_instr *intrin,
                    const struct nak_compiler *nak)
 {
    const enum pipe_format fmt = nir_intrinsic_format(intrin);
    nir_def *value = intrin->src[3].ssa;

    if (value->bit_size != 64) {
       /* Nothing to base a trim on without a format. */
       if (fmt == PIPE_FORMAT_NONE)
          return false;

       const unsigned fmt_comps =
          util_next_power_of_two(util_format_get_nr_components(fmt));
       if (fmt_comps >= intrin->num_components)
          return false;

       b->cursor = nir_before_instr(&intrin->instr);

       nir_def *narrowed = nir_trim_vector(b, value, fmt_comps);
       nir_src_rewrite(&intrin->src[3], narrowed);
       intrin->num_components = fmt_comps;
       return true;
    }

    assert(fmt == PIPE_FORMAT_NONE ||
           fmt == PIPE_FORMAT_R64_UINT ||
           fmt == PIPE_FORMAT_R64_SINT);

    b->cursor = nir_before_instr(&intrin->instr);

    /* For 64-bit image ops, we actually want a vec2 */
    nir_def *first = nir_channel(b, value, 0);
    nir_def *halves = nir_unpack_64_2x32(b, first);
    nir_src_rewrite(&intrin->src[3], halves);
    intrin->num_components = 2;
    return true;
 }

 /* Replace a bindless image size or sample-count query intrinsic with a
  * backend texture query instruction.
  *
  * The intrinsic is removed, a four-component 32-bit query is inserted in its
  * place, and the intrinsic's uses are redirected to the selected components
  * of the query result.  Always returns true.
  */
 static bool
 lower_image_txq(nir_builder *b, nir_intrinsic_instr *intrin,
                 const struct nak_compiler *nak)
 {
    /* Build the replacement where the intrinsic used to be. */
    b->cursor = nir_instr_remove(&intrin->instr);

    /* TODO: We should only support 32-bit handles */
    nir_def *handle = nir_u2u32(b, intrin->src[0].ssa);

    nir_tex_instr *query = nir_tex_instr_create(b->shader, 1);
    query->sampler_dim = remap_sampler_dim(nir_intrinsic_image_dim(intrin));
    query->is_array = nir_intrinsic_image_array(intrin);
    query->dest_type = nir_type_int32;

    nir_def *query_src;
    nir_component_mask_t wanted;
    if (intrin->intrinsic == nir_intrinsic_bindless_image_size) {
       /* Size query takes (handle, lod) and keeps the leading components. */
       query->op = nir_texop_hdr_dim_nv;
       query_src = nir_vec2(b, handle, intrin->src[1].ssa);
       wanted = BITSET_MASK(intrin->def.num_components);
    } else if (intrin->intrinsic == nir_intrinsic_bindless_image_samples) {
       /* Sample count is component 2 of the type query. */
       query->op = nir_texop_tex_type_nv;
       query_src = handle;
       wanted = BITSET_BIT(2);
    } else {
       unreachable("Invalid image query op");
    }

    query->src[0] = (nir_tex_src) {
       .src_type = nir_tex_src_backend1,
       .src = nir_src_for_ssa(query_src),
    };

    nir_def_init(&query->instr, &query->def, 4, 32);
    nir_builder_instr_insert(b, &query->instr);

    /* Forward only the selected components to the old users. */
    nir_def *picked = nir_channels(b, &query->def, wanted);
    nir_def_rewrite_uses(&intrin->def, picked);

    return true;
 }

 /* Per-instruction callback of the pass: hands texture instructions and
  * bindless image intrinsics to the matching lowering helper.  _data is the
  * nak_compiler given to the pass.  Returns whatever the helper returns, or
  * false for instructions this pass does not handle.
  */
 static bool
 lower_tex_instr(nir_builder *b, nir_instr *instr, void *_data)
 {
    const struct nak_compiler *nak = _data;

    if (instr->type == nir_instr_type_tex) {
       nir_tex_instr *tex = nir_instr_as_tex(instr);

       switch (tex->op) {
       case nir_texop_txs:
       case nir_texop_query_levels:
       case nir_texop_texture_samples:
          return lower_txq(b, tex, nak);

       case nir_texop_tex:
       case nir_texop_txb:
       case nir_texop_txl:
       case nir_texop_txd:
       case nir_texop_txf:
       case nir_texop_txf_ms:
       case nir_texop_tg4:
       case nir_texop_lod:
          return lower_tex(b, tex, nak);

       default:
          unreachable("Unsupported texture instruction");
       }
    }

    if (instr->type != nir_instr_type_intrinsic)
       return false;

    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
    switch (intrin->intrinsic) {
    case nir_intrinsic_bindless_image_size:
    case nir_intrinsic_bindless_image_samples:
       return lower_image_txq(b, intrin, nak);

    case nir_intrinsic_bindless_image_store:
       return shrink_image_store(b, intrin, nak);

    case nir_intrinsic_bindless_image_load:
    case nir_intrinsic_bindless_image_sparse_load:
       return shrink_image_load(b, intrin, nak);

    default:
       return false;
    }
 }

 /* Entry point of the pass: runs lower_tex_instr() over every instruction of
  * the shader, with the compiler passed through as callback data, and
  * returns the progress reported by nir_shader_instructions_pass().
  */
 bool
 nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak)
 {
    void *cb_data = (void *)nak;
    const bool progress =
       nir_shader_instructions_pass(nir, lower_tex_instr,
                                    nir_metadata_control_flow, cb_data);
    return progress;
 }
	/*
	* Copyright © 2023 Collabora, Ltd.
	* SPDX-License-Identifier: MIT
	*/

	#include "nak_private.h"
	#include "nir_builder.h"
	#include "nir_format_convert.h"

	#include "util/u_math.h"

	/* Map the subpass-input sampler dimensions onto the ordinary 2D and
	 * multisampled dimensions.  Any other dimension is returned unchanged.
	 */
	static enum glsl_sampler_dim
	remap_sampler_dim(enum glsl_sampler_dim dim)
	{
	   if (dim == GLSL_SAMPLER_DIM_SUBPASS)
	      return GLSL_SAMPLER_DIM_2D;

	   if (dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
	      return GLSL_SAMPLER_DIM_MS;

	   return dim;
	}

	/* Lower a texture sampling, fetch, gather or LOD-query instruction into the
	 * form consumed by the NAK backend.
	 *
	 * Every source of the instruction is collected and repacked into two vectors
	 * which replace the original sources as nir_tex_src_backend1 and
	 * nir_tex_src_backend2.  The selected LOD mode and offset mode, together
	 * with tex->is_shadow and tex->is_sparse, are stored in tex->backend_flags
	 * as a struct nak_nir_tex_flags.  For nir_texop_lod the result is also
	 * converted to floats after the instruction.
	 *
	 * Returns true unconditionally since the instruction is always rewritten.
	 */
	static bool
	lower_tex(nir_builder *b, nir_tex_instr *tex, const struct nak_compiler *nak)
	{
	   b->cursor = nir_before_instr(&tex->instr);

	   /* Collect every source by type; anything unexpected is a caller bug. */
	   nir_def *tex_h = NULL, *samp_h = NULL, *coord = NULL, *ms_idx = NULL;
	   nir_def *offset = NULL, *lod = NULL, *bias = NULL, *min_lod = NULL;
	   nir_def *ddx = NULL, *ddy = NULL, *z_cmpr = NULL;
	   for (unsigned i = 0; i < tex->num_srcs; i++) {
	      switch (tex->src[i].src_type) {
	      case nir_tex_src_texture_handle: tex_h =     tex->src[i].src.ssa; break;
	      case nir_tex_src_sampler_handle: samp_h =    tex->src[i].src.ssa; break;
	      case nir_tex_src_coord:          coord =     tex->src[i].src.ssa; break;
	      case nir_tex_src_ms_index:       ms_idx =    tex->src[i].src.ssa; break;
	      case nir_tex_src_comparator:     z_cmpr =    tex->src[i].src.ssa; break;
	      case nir_tex_src_offset:         offset =    tex->src[i].src.ssa; break;
	      case nir_tex_src_lod:            lod =       tex->src[i].src.ssa; break;
	      case nir_tex_src_bias:           bias =      tex->src[i].src.ssa; break;
	      case nir_tex_src_min_lod:        min_lod =   tex->src[i].src.ssa; break;
	      case nir_tex_src_ddx:            ddx =       tex->src[i].src.ssa; break;
	      case nir_tex_src_ddy:            ddy =       tex->src[i].src.ssa; break;
	      default:
	         unreachable("Unsupported texture source");
	      }
	   }

	   /* The texture handle is used unconditionally below and the first two
	    * source slots are rewritten in place at the end, so both must exist.
	    */
	   assert(tex_h != NULL);
	   assert(tex->num_srcs >= 2);

	   /* Combine sampler and texture into one if needed: the low 20 bits are
	    * taken from the texture handle and the high 12 bits from the sampler
	    * handle.
	    */
	   if (samp_h != NULL && samp_h != tex_h) {
	      tex_h = nir_ior(b, nir_iand_imm(b, tex_h,  0x000fffff),
	                         nir_iand_imm(b, samp_h, 0xfff00000));
	   }
	   tex_h = nir_u2u32(b, tex_h);

	   /* Array index is treated separately, so pull it off if we have one. */
	   nir_def *arr_idx = NULL;
	   unsigned coord_components = tex->coord_components;
	   if (coord && tex->is_array) {
	      if (tex->op == nir_texop_lod) {
	         /* The HW wants an array index. Use zero. */
	         arr_idx = nir_imm_int(b, 0);
	      } else {
	         /* The array index is the last coordinate component. */
	         arr_idx = nir_channel(b, coord, --coord_components);

	         /* Everything but texelFetch takes a float index
	          *
	          * TODO: Use F2I.U32.RNE
	          */
	         if (tex->op != nir_texop_txf && tex->op != nir_texop_txf_ms) {
	            arr_idx = nir_fadd_imm(b, arr_idx, 0.5);

	            // TODO: Hardware seems to clamp negative values to zero for us
	            // in f2u, but we still need this fmax for constant folding.
	            arr_idx = nir_fmax(b, arr_idx, nir_imm_float(b, 0.0));

	            arr_idx = nir_f2u32(b, arr_idx);
	         }

	         /* Clamp so the index fits in 16 bits. */
	         arr_idx = nir_umin(b, arr_idx, nir_imm_int(b, UINT16_MAX));
	      }
	   }

	   /* Pick the LOD mode from whichever LOD-related source is present.  When
	    * a bias is used it travels in the same slot as an explicit LOD.
	    */
	   enum nak_nir_lod_mode lod_mode = NAK_NIR_LOD_MODE_AUTO;
	   if (tex->op == nir_texop_txf_ms) {
	      /* Multisampled textures do not have miplevels */
	      lod_mode = NAK_NIR_LOD_MODE_ZERO;
	      lod = NULL; /* We don't need this */
	   } else if (lod != NULL) {
	      nir_scalar lod_s = { .def = lod, .comp = 0 };
	      if (nir_scalar_is_const(lod_s) &&
	          nir_scalar_as_uint(lod_s) == 0) {
	         lod_mode = NAK_NIR_LOD_MODE_ZERO;
	         lod = NULL; /* We don't need this */
	      } else {
	         lod_mode = NAK_NIR_LOD_MODE_LOD;
	      }
	   } else if (bias != NULL) {
	      lod_mode = NAK_NIR_LOD_MODE_BIAS;
	      lod = bias;
	   }

	   /* A minimum LOD is only valid on top of the automatic or bias modes. */
	   if (min_lod != NULL) {
	      switch (lod_mode) {
	      case NAK_NIR_LOD_MODE_AUTO:
	         lod_mode = NAK_NIR_LOD_MODE_CLAMP;
	         break;
	      case NAK_NIR_LOD_MODE_BIAS:
	         lod_mode = NAK_NIR_LOD_MODE_BIAS_CLAMP;
	         break;
	      default:
	         unreachable("Invalid min_lod");
	      }
	      /* Scale by 256, clamp to at least 16 and convert to unsigned.
	       * NOTE(review): presumably an unsigned fixed-point value with 8
	       * fractional bits -- confirm against the hardware documentation.
	       */
	      min_lod = nir_f2u32(b, nir_fmax(b, nir_fmul_imm(b, min_lod, 256),
	                                         nir_imm_float(b, 16)));
	   }

	   enum nak_nir_offset_mode offset_mode = NAK_NIR_OFFSET_MODE_NONE;
	   if (offset != NULL) {
	      /* For TG4, offsets are packed into a single 32-bit value with 8 bits
	       * per component.  For all other texture instructions, offsets are
	       * packed into a single at most 16-bit value with 4 bits per component.
	       */
	      static const unsigned bits4[] = { 4, 4, 4, 4 };
	      static const unsigned bits8[] = { 8, 8, 8, 8 };
	      const unsigned *bits = tex->op == nir_texop_tg4 ? bits8 : bits4;

	      offset = nir_pad_vector_imm_int(b, offset, 0, 4);
	      offset = nir_format_clamp_sint(b, offset, bits);
	      offset = nir_format_pack_uint(b, offset, bits, 4);
	      offset_mode = NAK_NIR_OFFSET_MODE_AOFFI;
	   } else if (nir_tex_instr_has_explicit_tg4_offsets(tex)) {
	      /* Pack the eight constant offsets, one byte each, into 64 bits and
	       * pass them as two 32-bit immediates.
	       */
	      uint64_t off_u64 = 0;
	      for (uint8_t i = 0; i < 8; ++i) {
	         uint64_t off = (uint8_t)tex->tg4_offsets[i / 2][i % 2];
	         off_u64 |= off << (i * 8);
	      }
	      offset = nir_imm_ivec2(b, off_u64, off_u64 >> 32);
	      offset_mode = NAK_NIR_OFFSET_MODE_PER_PX;
	   }

	   /* Components of the two backend source vectors, in push order. */
	   nir_def *src0[4] = { NULL, };
	   nir_def *src1[4] = { NULL, };
	   unsigned src0_comps = 0, src1_comps = 0;

	#define PUSH(a, x) do { \
	   nir_def *val = (x); \
	   assert(a##_comps < ARRAY_SIZE(a)); \
	   a[a##_comps++] = val; \
	} while(0)

	   if (nak->sm >= 50) {
	      if (tex->op == nir_texop_txd) {
	         /* Explicit derivatives: handle, coordinates and the optional
	          * array index / offset go in src0, the derivatives in src1.
	          */
	         PUSH(src0, tex_h);

	         for (uint32_t i = 0; i < coord_components; i++)
	            PUSH(src0, nir_channel(b, coord, i));

	         if (offset != NULL) {
	            /* NOTE(review): selector 0x1054 appears to combine the low 16
	             * bits of the array index with the low 16 bits of the offset
	             * in one dword -- confirm against nir_prmt_nv semantics.
	             */
	            nir_def *arr_idx_or_zero = arr_idx ? arr_idx : nir_imm_int(b, 0);
	            nir_def *arr_off = nir_prmt_nv(b, nir_imm_int(b, 0x1054),
	                                           offset, arr_idx_or_zero);
	            PUSH(src0, arr_off);
	         } else if (arr_idx != NULL) {
	            PUSH(src0, arr_idx);
	         }

	         assert(ddx != NULL && ddy != NULL);
	         assert(ddx->num_components == coord_components);
	         for (uint32_t i = 0; i < coord_components; i++) {
	            PUSH(src1, nir_channel(b, ddx, i));
	            PUSH(src1, nir_channel(b, ddy, i));
	         }
	      } else {
	         /* Everything else: array index / min LOD and coordinates go in
	          * src0; the handle and all remaining operands go in src1.
	          */
	         if (min_lod != NULL) {
	            nir_def *arr_idx_or_zero = arr_idx ? arr_idx : nir_imm_int(b, 0);
	            nir_def *arr_ml = nir_prmt_nv(b, nir_imm_int(b, 0x1054),
	                                          min_lod, arr_idx_or_zero);
	            PUSH(src0, arr_ml);
	         } else if (arr_idx != NULL) {
	            PUSH(src0, arr_idx);
	         }

	         for (uint32_t i = 0; i < coord_components; i++)
	            PUSH(src0, nir_channel(b, coord, i));

	         PUSH(src1, tex_h);
	         if (ms_idx != NULL)
	            PUSH(src1, ms_idx);
	         if (lod != NULL)
	            PUSH(src1, lod);
	         if (offset_mode == NAK_NIR_OFFSET_MODE_AOFFI) {
	            PUSH(src1, offset);
	         } else if (offset_mode == NAK_NIR_OFFSET_MODE_PER_PX) {
	            PUSH(src1, nir_channel(b, offset, 0));
	            PUSH(src1, nir_channel(b, offset, 1));
	         }
	         if (z_cmpr != NULL)
	            PUSH(src1, z_cmpr);
	      }
	   } else {
	      unreachable("Unsupported shader model");
	   }

	   /* PUSH is local to this function; do not leak it into the rest of the
	    * translation unit.
	    */
	#undef PUSH

	   nir_def *vec_srcs[2] = {
	      nir_vec(b, src0, src0_comps),
	      nir_vec(b, src1, src1_comps),
	   };

	   /* Reuse the first two source slots for the backend vectors. */
	   tex->src[0].src_type = nir_tex_src_backend1;
	   nir_src_rewrite(&tex->src[0].src, vec_srcs[0]);

	   tex->src[1].src_type = nir_tex_src_backend2;
	   nir_src_rewrite(&tex->src[1].src, vec_srcs[1]);

	   /* Remove any extras */
	   while (tex->num_srcs > 2)
	      nir_tex_instr_remove_src(tex, tex->num_srcs - 1);

	   tex->sampler_dim = remap_sampler_dim(tex->sampler_dim);

	   struct nak_nir_tex_flags flags = {
	      .lod_mode = lod_mode,
	      .offset_mode = offset_mode,
	      .has_z_cmpr = tex->is_shadow,
	      .is_sparse = tex->is_sparse,
	   };
	   STATIC_ASSERT(sizeof(flags) == sizeof(tex->backend_flags));
	   memcpy(&tex->backend_flags, &flags, sizeof(flags));

	   if (tex->op == nir_texop_lod) {
	      b->cursor = nir_after_instr(&tex->instr);

	      /* The outputs are flipped compared to what NIR expects */
	      nir_def *abs = nir_channel(b, &tex->def, 1);
	      nir_def *rel = nir_channel(b, &tex->def, 0);

	      /* The returned values are not quite what we want:
	       * (a) convert from s16/u16 to f32
	       * (b) multiply by 1/256
	       *
	       * TODO: We can make this cheaper once we have 16-bit in NAK
	       */
	      abs = nir_u2f32(b, nir_iand_imm(b, abs, 0xffff));
	      /* Shift left then arithmetic-right by 16 to sign-extend the low
	       * half of rel.
	       */
	      nir_def *shift = nir_imm_int(b, 16);
	      rel = nir_i2f32(b, nir_ishr(b, nir_ishl(b, rel, shift), shift));

	      abs = nir_fmul_imm(b, abs, 1.0 / 256.0);
	      rel = nir_fmul_imm(b, rel, 1.0 / 256.0);

	      /* Only uses after the fix-up code see the converted value; the
	       * fix-up itself keeps reading the raw result.
	       */
	      nir_def *res = nir_vec2(b, abs, rel);
	      nir_def_rewrite_uses_after(&tex->def, res, res->parent_instr);
	   }

	   return true;
	}

	/* Lower a texture query (size, level count or sample count) to one of the
	 * backend query ops.
	 *
	 * nir_texop_txs and nir_texop_query_levels become nir_texop_hdr_dim_nv with
	 * a (handle, lod) source; nir_texop_texture_samples becomes
	 * nir_texop_tex_type_nv with just the handle.  The instruction's result is
	 * widened to four components and only the components selected by the
	 * per-op mask are forwarded to the original users.
	 *
	 * Returns true unconditionally since the instruction is always rewritten.
	 */
	static bool
	lower_txq(nir_builder *b, nir_tex_instr *tex, const struct nak_compiler *nak)
	{
	   b->cursor = nir_before_instr(&tex->instr);

	   assert(!tex->is_sparse);

	   nir_def *tex_h = NULL, *lod = NULL;
	   for (unsigned i = 0; i < tex->num_srcs; i++) {
	      switch (tex->src[i].src_type) {
	      case nir_tex_src_texture_handle: tex_h = tex->src[i].src.ssa; break;
	      case nir_tex_src_sampler_handle: break; /* Ignored */
	      case nir_tex_src_lod:            lod = tex->src[i].src.ssa; break;
	      default:
	         unreachable("Unsupported texture source");
	      }
	   }

	   /* The handle is consumed unconditionally below; without this check a
	    * query lacking a texture handle source would dereference NULL.
	    */
	   assert(tex_h != NULL);

	   /* TODO: We should only support 32-bit handles */
	   tex_h = nir_u2u32(b, tex_h);

	   nir_def *txq_src;
	   nir_component_mask_t mask;
	   switch (tex->op) {
	   case nir_texop_txs:
	      /* Size query: keep as many components as the original result had. */
	      tex->op = nir_texop_hdr_dim_nv;
	      if (lod == NULL)
	         lod = nir_imm_int(b, 0);
	      txq_src = nir_vec2(b, tex_h, lod);
	      mask = BITSET_MASK(tex->def.num_components);
	      break;
	   case nir_texop_query_levels:
	      /* Level count: read from component 3 of the same query at LOD 0. */
	      tex->op = nir_texop_hdr_dim_nv;
	      txq_src = nir_vec2(b, tex_h, nir_imm_int(b, 0));
	      mask = BITSET_BIT(3);
	      break;
	   case nir_texop_texture_samples:
	      /* Sample count: read from component 2 of the type query. */
	      tex->op = nir_texop_tex_type_nv;
	      txq_src = tex_h;
	      mask = BITSET_BIT(2);
	      break;
	   default:
	      unreachable("Invalid texture query op");
	   }

	   tex->src[0].src_type = nir_tex_src_backend1;
	   nir_src_rewrite(&tex->src[0].src, txq_src);

	   /* Remove any extras */
	   while (tex->num_srcs > 1)
	      nir_tex_instr_remove_src(tex, tex->num_srcs - 1);

	   tex->sampler_dim = remap_sampler_dim(tex->sampler_dim);

	   b->cursor = nir_after_instr(&tex->instr);

	   /* Only pick off selected components */
	   tex->def.num_components = 4;
	   nir_def *res = nir_channels(b, &tex->def, mask);
	   nir_def_rewrite_uses_after(&tex->def, res, res->parent_instr);

	   return true;
	}

	/* Shrink a bindless image load to the components that are actually needed.
	 *
	 *  - A sparse load whose residency component is never read is turned into
	 *    a plain bindless_image_load with one component fewer.
	 *  - A 64-bit load is rewritten in place as a 32-bit load and its result is
	 *    repacked into a 64-bit vector that replaces the later uses.
	 *  - Otherwise, when the format is known and the alpha channel is not read,
	 *    the load is narrowed and the dropped colour channels are refilled with
	 *    constants (zero for channels 0-2, one for channel 3).
	 *
	 * Returns true if the intrinsic was modified in any way.
	 */
	static bool
	shrink_image_load(nir_builder *b, nir_intrinsic_instr *intrin,
	                  const struct nak_compiler *nak)
	{
	   enum pipe_format format = nir_intrinsic_format(intrin);
	   nir_component_mask_t color_comps_read =
	      nir_def_components_read(&intrin->def);

	   assert(intrin->intrinsic == nir_intrinsic_bindless_image_load ||
	          intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load);

	   /* Set as soon as the intrinsic has been touched so that the early exits
	    * below still report the change to the pass driver.
	    */
	   bool progress = false;

	   /* Pick off the sparse resident component (if any) before we do anything
	    * else.  This makes later logic easier.
	    */
	   bool is_sparse = false;
	   if (intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load) {
	      /* The residency information lives in the last component. */
	      unsigned resident_comp = intrin->def.num_components - 1;
	      if (color_comps_read & BITFIELD_BIT(resident_comp)) {
	         is_sparse = true;
	         color_comps_read &= ~BITFIELD_BIT(resident_comp);
	      } else {
	         /* If the sparse bit is never used, get rid of it */
	         intrin->intrinsic = nir_intrinsic_bindless_image_load;
	         intrin->num_components--;
	         intrin->def.num_components--;
	         progress = true;
	      }
	   }

	   if (intrin->def.bit_size == 64) {
	      assert(format == PIPE_FORMAT_NONE ||
	             format == PIPE_FORMAT_R64_UINT ||
	             format == PIPE_FORMAT_R64_SINT);

	      b->cursor = nir_after_instr(&intrin->instr);

	      nir_def *data_xy, *data_w, *resident = NULL;
	      if (color_comps_read & BITFIELD_BIT(3)) {
	         /* Thanks to descriptor indexing, we need to ensure that null
	          * descriptor behavior works properly.  In particular, normal zero
	          * reads will return (0, 0, 0, 1) whereas null descriptor reads need
	          * to return (0, 0, 0, 0).  This means we can't blindly extend with
	          * an alpha component of 1.  Instead, we need to trust the hardware
	          * to extend the original RG32 with z = 0 and w = 1 and copy the w
	          * value all the way out to 64-bit w value.
	          */
	         assert(intrin->num_components == 4 + is_sparse);
	         assert(intrin->def.num_components == 4 + is_sparse);
	         intrin->def.bit_size = 32;

	         data_xy = nir_channels(b, &intrin->def, 0x3);
	         data_w = nir_channels(b, &intrin->def, 0x8);
	         if (is_sparse)
	            resident = nir_channel(b, &intrin->def, 4);
	      } else {
	         /* Alpha is not read: load just the two 32-bit halves (plus the
	          * residency component when sparse) and use zero for w.
	          */
	         intrin->num_components = 2 + is_sparse;
	         intrin->def.num_components = 2 + is_sparse;
	         intrin->def.bit_size = 32;

	         data_xy = nir_channels(b, &intrin->def, 0x3);
	         data_w = nir_imm_int(b, 0);
	         if (is_sparse)
	            resident = nir_channel(b, &intrin->def, 2);
	      }

	      /* Rebuild the 64-bit result: x is the packed 32-bit pair, y and z
	       * are zero, w (and the residency value, if any) are widened.
	       */
	      nir_def *data;
	      if (is_sparse) {
	         data = nir_vec5(b, nir_pack_64_2x32(b, data_xy),
	                         nir_imm_zero(b, 1, 64),
	                         nir_imm_zero(b, 1, 64),
	                         nir_u2u64(b, data_w),
	                         nir_u2u64(b, resident));
	      } else {
	         data = nir_vec4(b, nir_pack_64_2x32(b, data_xy),
	                         nir_imm_zero(b, 1, 64),
	                         nir_imm_zero(b, 1, 64),
	                         nir_u2u64(b, data_w));
	      }

	      nir_def_rewrite_uses_after(&intrin->def, data, data->parent_instr);
	      return true;
	   }

	   /* Without a known format there is nothing to base the shrink on. */
	   if (format == PIPE_FORMAT_NONE)
	      return progress;

	   /* In order for null descriptors to work properly, we don't want to shrink
	    * loads when the alpha channel is read even if we know the format has
	    * fewer channels.
	    */
	   if (color_comps_read & BITFIELD_BIT(3))
	      return progress;

	   const unsigned old_comps = intrin->def.num_components;

	   /* Start from the format's component count rounded up to a power of two,
	    * then override with 2 or 1 when only the low components are read.
	    */
	   unsigned new_comps = util_format_get_nr_components(format);
	   new_comps = util_next_power_of_two(new_comps);
	   if (color_comps_read <= BITFIELD_MASK(2))
	      new_comps = 2;
	   if (color_comps_read <= BITFIELD_MASK(1))
	      new_comps = 1;

	   if (new_comps + is_sparse >= intrin->num_components)
	      return progress;

	   b->cursor = nir_after_instr(&intrin->instr);

	   intrin->num_components = new_comps + is_sparse;
	   intrin->def.num_components = new_comps + is_sparse;

	   /* Rebuild a vector of the original width: loaded channels first, zero
	    * for the remaining channels below index 3 and one for channel 3.
	    */
	   assert(new_comps <= 4);
	   nir_def *comps[5];
	   for (unsigned c = 0; c < new_comps; c++)
	      comps[c] = nir_channel(b, &intrin->def, c);
	   for (unsigned c = new_comps; c < 3; c++)
	      comps[c] = nir_imm_intN_t(b, 0, intrin->def.bit_size);
	   if (new_comps < 4)
	      comps[3] = nir_imm_intN_t(b, 1, intrin->def.bit_size);

	   /* The resident bit always goes in the last channel */
	   if (is_sparse)
	      comps[old_comps - 1] = nir_channel(b, &intrin->def, new_comps);

	   nir_def *data = nir_vec(b, comps, old_comps);
	   nir_def_rewrite_uses_after(&intrin->def, data, data->parent_instr);
	   return true;
	}

	/*
	 * Narrow the data operand (source 3) of a bindless image store.
	 *
	 * 64-bit data is replaced by the vec2 produced from unpacking channel 0
	 * into two 32-bit values.  Otherwise, when the intrinsic carries a known
	 * format, the data vector is trimmed to the format's component count
	 * rounded up to a power of two.
	 *
	 * Returns true when the instruction was modified, false when it was left
	 * untouched (unknown format, or the store is already narrow enough).
	 */
	static bool
	shrink_image_store(nir_builder *b, nir_intrinsic_instr *intrin,
	                   const struct nak_compiler *nak)
	{
	   enum pipe_format format = nir_intrinsic_format(intrin);
	   nir_def *data = intrin->src[3].ssa;

	   if (data->bit_size == 64) {
	      assert(format == PIPE_FORMAT_NONE ||
	             format == PIPE_FORMAT_R64_UINT ||
	             format == PIPE_FORMAT_R64_SINT);

	      /* New instructions must precede the store that consumes them */
	      b->cursor = nir_before_instr(&intrin->instr);

	      /* For 64-bit image ops, we actually want a vec2 */
	      nir_def *data_vec2 = nir_unpack_64_2x32(b, nir_channel(b, data, 0));
	      nir_src_rewrite(&intrin->src[3], data_vec2);
	      intrin->num_components = 2;
	      return true;
	   }

	   /* Without a format we cannot tell how many components are needed */
	   if (format == PIPE_FORMAT_NONE)
	      return false;

	   unsigned new_comps = util_format_get_nr_components(format);
	   new_comps = util_next_power_of_two(new_comps);
	   if (new_comps >= intrin->num_components)
	      return false;

	   b->cursor = nir_before_instr(&intrin->instr);

	   nir_def *trimmed = nir_trim_vector(b, data, new_comps);
	   nir_src_rewrite(&intrin->src[3], trimmed);
	   intrin->num_components = new_comps;
	   return true;
	}

	/*
	 * Replace a bindless image size/samples query with a texture query
	 * instruction.
	 *
	 * The intrinsic is removed and a single-source nir_tex_instr is inserted
	 * in its place.  The query always produces a 4-component 32-bit result;
	 * only the channels selected by `mask` are forwarded to the users of the
	 * original intrinsic:
	 *   - image_size:    op hdr_dim_nv, source is vec2(handle, lod), the first
	 *                    intrin->def.num_components channels are kept.
	 *   - image_samples: op tex_type_nv, source is the handle, channel 2 is
	 *                    kept.
	 *
	 * Always returns true since the instruction is always rewritten.
	 */
	static bool
	lower_image_txq(nir_builder *b, nir_intrinsic_instr *intrin,
	                const struct nak_compiler *nak)
	{
	   /* Removing the intrinsic leaves the cursor where it used to be */
	   b->cursor = nir_instr_remove(&intrin->instr);

	   /* TODO: We should only support 32-bit handles */
	   nir_def *img_h = nir_u2u32(b, intrin->src[0].ssa);

	   nir_tex_instr *txq = nir_tex_instr_create(b->shader, 1);
	   txq->sampler_dim = remap_sampler_dim(nir_intrinsic_image_dim(intrin));
	   txq->is_array = nir_intrinsic_image_array(intrin);
	   txq->dest_type = nir_type_int32;

	   nir_component_mask_t mask;
	   switch (intrin->intrinsic) {
	   case nir_intrinsic_bindless_image_size: {
	      nir_def *lod = intrin->src[1].ssa;

	      txq->op = nir_texop_hdr_dim_nv;
	      txq->src[0] = (nir_tex_src) {
	         .src_type = nir_tex_src_backend1,
	         .src = nir_src_for_ssa(nir_vec2(b, img_h, lod)),
	      };
	      mask = BITSET_MASK(intrin->def.num_components);
	      break;
	   }

	   case nir_intrinsic_bindless_image_samples:
	      txq->op = nir_texop_tex_type_nv;
	      txq->src[0] = (nir_tex_src) {
	         .src_type = nir_tex_src_backend1,
	         .src = nir_src_for_ssa(img_h),
	      };
	      mask = BITSET_BIT(2);
	      break;

	   default:
	      unreachable("Invalid image query op");
	   }

	   nir_def_init(&txq->instr, &txq->def, 4, 32);
	   nir_builder_instr_insert(b, &txq->instr);

	   /* Only pick off selected components */
	   nir_def *res = nir_channels(b, &txq->def, mask);

	   nir_def_rewrite_uses(&intrin->def, res);

	   return true;
	}

	/*
	 * Per-instruction callback: route texture instructions and bindless image
	 * intrinsics to the matching lowering helper.
	 *
	 * `_data` is the const struct nak_compiler pointer supplied by the pass
	 * driver.  The return value of the selected helper is passed through;
	 * instructions that are not handled here yield false.  Texture ops outside
	 * the listed set are treated as unreachable.
	 */
	static bool
	lower_tex_instr(nir_builder *b, nir_instr *instr, void *_data)
	{
	   const struct nak_compiler *nak = _data;

	   switch (instr->type) {
	   case nir_instr_type_tex: {
	      nir_tex_instr *tex = nir_instr_as_tex(instr);
	      switch (tex->op) {
	      case nir_texop_tex:
	      case nir_texop_txb:
	      case nir_texop_txl:
	      case nir_texop_txd:
	      case nir_texop_txf:
	      case nir_texop_txf_ms:
	      case nir_texop_tg4:
	      case nir_texop_lod:
	         return lower_tex(b, tex, nak);
	      case nir_texop_txs:
	      case nir_texop_query_levels:
	      case nir_texop_texture_samples:
	         return lower_txq(b, tex, nak);
	      default:
	         unreachable("Unsupported texture instruction");
	      }
	   }
	   case nir_instr_type_intrinsic: {
	      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
	      switch (intrin->intrinsic) {
	      case nir_intrinsic_bindless_image_load:
	      case nir_intrinsic_bindless_image_sparse_load:
	         return shrink_image_load(b, intrin, nak);
	      case nir_intrinsic_bindless_image_store:
	         return shrink_image_store(b, intrin, nak);
	      case nir_intrinsic_bindless_image_size:
	      case nir_intrinsic_bindless_image_samples:
	         return lower_image_txq(b, intrin, nak);
	      default:
	         /* Any other intrinsic is left alone */
	         return false;
	      }
	   }
	   default:
	      return false;
	   }
	}

	/*
	 * Entry point of the pass: run lower_tex_instr over the instructions of
	 * `nir` via nir_shader_instructions_pass, passing `nak` as the callback
	 * data and nir_metadata_control_flow as the metadata argument.
	 *
	 * Returns whatever nir_shader_instructions_pass returns.
	 */
	bool
	nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak)
	{
	   /* The callback receives its data as a plain void *, hence the cast
	    * that drops const; lower_tex_instr re-adds it.
	    */
	   return nir_shader_instructions_pass(nir, lower_tex_instr,
	                                       nir_metadata_control_flow,
	                                       (void *)nak);
	}