/*
* Copyright 2023 Valve Corporation
* Copyright 2021 Alyssa Rosenzweig
* Copyright 2020 Collabora Ltd.
* Copyright 2016 Broadcom
* SPDX-License-Identifier: MIT
*/
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "agx_nir.h"
#include "agx_nir_texture.h"
#include "glsl_types.h"
#include "libagx.h"
#include "nir_builder_opcodes.h"
#include "nir_builtin_builder.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "shader_enums.h"
/* Residency flags are inverted from NIR */
#define AGX_RESIDENT (0)
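
/* Insert the appropriate fence after image writes so that later texture reads
 * can observe the written texels: a PBE-to-texture fence after image stores
 * and a memory-to-texture fence after image atomics (which are subsequently
 * lowered to global memory access).
 */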
static bool
fence_image(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
b->cursor = nir_after_instr(&intr->instr);
/* If the image is write-only, there is no fencing needed */
if (nir_intrinsic_has_access(intr) &&
(nir_intrinsic_access(intr) & ACCESS_NON_READABLE)) {
return false;
}
switch (intr->intrinsic) {
case nir_intrinsic_image_store:
case nir_intrinsic_bindless_image_store:
nir_fence_pbe_to_tex_agx(b);
return true;
case nir_intrinsic_image_atomic:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_bindless_image_atomic_swap:
nir_fence_mem_to_tex_agx(b);
return true;
default:
return false;
}
}
static nir_def *
texture_descriptor_ptr(nir_builder *b, nir_tex_instr *tex)
{
int handle_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
assert(handle_idx >= 0 && "must be bindless");
return nir_load_from_texture_handle_agx(b, tex->src[handle_idx].src.ssa);
}
static bool
has_nonzero_lod(nir_tex_instr *tex)
{
int idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
if (idx < 0)
return false;
nir_src src = tex->src[idx].src;
return !(nir_src_is_const(src) && nir_src_as_uint(src) == 0);
}
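
/* Texture queries (txs, query_levels, texture_samples) are lowered to crawls
 * of the texture descriptor in memory via libagx helpers, using the pointer
 * derived from the bindless texture handle, rather than hardware queries.
 */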
static bool
lower_tex_crawl(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
b->cursor = nir_before_instr(instr);
if (tex->op != nir_texop_txs && tex->op != nir_texop_texture_samples &&
tex->op != nir_texop_query_levels)
return false;
nir_def *ptr = texture_descriptor_ptr(b, tex);
unsigned nr_comps = tex->def.num_components;
assert(nr_comps <= 3);
int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
nir_def *lod = lod_idx >= 0 ? nir_u2u16(b, tex->src[lod_idx].src.ssa)
: nir_imm_intN_t(b, 0, 16);
nir_def *res;
if (tex->op == nir_texop_txs) {
res =
libagx_txs(b, ptr, lod, nir_imm_int(b, nr_comps),
nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_BUF),
nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_1D),
nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_2D),
nir_imm_bool(b, tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE),
nir_imm_bool(b, tex->is_array));
} else if (tex->op == nir_texop_query_levels) {
res = libagx_texture_levels(b, ptr);
} else {
res = libagx_texture_samples(b, ptr);
}
nir_def_rewrite_uses(&tex->def, nir_trim_vector(b, res, nr_comps));
nir_instr_remove(instr);
return true;
}
/*
* Given a 1D buffer texture coordinate, calculate the 2D coordinate vector that
* will be used to access the linear 2D texture bound to the buffer.
*/
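/* For example, with AGX_TEXTURE_BUFFER_WIDTH = 1024 (the 1024xN layout used
 * for buffer textures), linear texel index 3000 maps to the 2D coordinate
 * (3000 % 1024, 3000 / 1024) = (952, 2).
 */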
static nir_def *
coords_for_buffer_texture(nir_builder *b, nir_def *coord)
{
return nir_vec2(b, nir_umod_imm(b, coord, AGX_TEXTURE_BUFFER_WIDTH),
nir_udiv_imm(b, coord, AGX_TEXTURE_BUFFER_WIDTH));
}
/*
* Buffer textures are lowered to 2D (1024xN) textures in the driver to access
* more storage. When lowering, we need to fix up the coordinate accordingly.
*
* Furthermore, RGB32 formats are emulated by lowering to global memory access,
* so to read a buffer texture we generate code that looks like:
*
* if (descriptor->format == RGB32)
* return ((uint32_t *) descriptor->address)[x];
* else
* return txf(texture_as_2d, vec2(x % 1024, x / 1024));
*/
static bool
lower_buffer_texture(nir_builder *b, nir_tex_instr *tex)
{
nir_def *coord = nir_steal_tex_src(tex, nir_tex_src_coord);
nir_def *size = nir_get_texture_size(b, tex);
nir_def *oob = nir_uge(b, coord, size);
/* Apply the buffer offset after calculating oob but before remapping */
nir_def *desc = texture_descriptor_ptr(b, tex);
coord = libagx_buffer_texture_offset(b, desc, coord);
/* Map out-of-bounds indices to out-of-bounds coordinates for robustness2
* semantics from the hardware.
*/
coord = nir_bcsel(b, oob, nir_imm_int(b, -1), coord);
bool is_float = nir_alu_type_get_base_type(tex->dest_type) == nir_type_float;
/* Lower RGB32 reads if the format requires. If we are out-of-bounds, we use
* the hardware path so we get a zero texel.
*/
nir_if *nif = nir_push_if(
b, nir_iand(b, libagx_texture_is_rgb32(b, desc), nir_inot(b, oob)));
nir_def *rgb32 = nir_trim_vector(
b, libagx_texture_load_rgb32(b, desc, coord, nir_imm_bool(b, is_float)),
nir_tex_instr_result_size(tex));
/* Raw loads do not return residency information, but residency queries are
* supported on buffer textures. Fortunately, we do not need to support
* sparse RGB32 buffers, so we simply claim all RGB32 loads were resident.
* Nothing should hit this in practice, but if we don't do *something* here
* we'll get vector size mismatches which blow up in vkd3d-proton.
*/
if (tex->is_sparse) {
rgb32 = nir_pad_vector_imm_int(b, rgb32, AGX_RESIDENT,
rgb32->num_components + 1);
}
nir_push_else(b, nif);
/* Otherwise, lower the texture instruction to read from 2D */
assert(coord->num_components == 1 && "buffer textures are 1D");
tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
nir_def *coord2d = coords_for_buffer_texture(b, coord);
nir_instr_remove(&tex->instr);
nir_builder_instr_insert(b, &tex->instr);
nir_tex_instr_add_src(tex, nir_tex_src_backend1, coord2d);
nir_steal_tex_src(tex, nir_tex_src_sampler_handle);
nir_steal_tex_src(tex, nir_tex_src_sampler_offset);
nir_block *else_block = nir_cursor_current_block(b->cursor);
nir_pop_if(b, nif);
/* Put it together with a phi */
nir_def *phi = nir_if_phi(b, rgb32, &tex->def);
nir_def_rewrite_uses(&tex->def, phi);
nir_phi_instr *phi_instr = nir_def_as_phi(phi);
nir_phi_src *else_src = nir_phi_get_src_from_block(phi_instr, else_block);
nir_src_rewrite(&else_src->src, &tex->def);
return true;
}
/*
* NIR indexes into array textures with unclamped floats (integer for txf). AGX
* requires the index to be a clamped integer. Lower tex_src_coord into
* tex_src_backend1 for array textures by type-converting and clamping.
*/
static bool
lower_regular_texture(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
b->cursor = nir_before_instr(instr);
if (nir_tex_instr_is_query(tex) && tex->op != nir_texop_lod)
return false;
if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF)
return lower_buffer_texture(b, tex);
/* Don't lower twice */
if (nir_tex_instr_src_index(tex, nir_tex_src_backend1) >= 0)
return false;
/* Get the coordinates */
nir_def *coord = nir_steal_tex_src(tex, nir_tex_src_coord);
nir_def *ms_idx = nir_steal_tex_src(tex, nir_tex_src_ms_index);
/* Apply txf workaround, see libagx_lower_txf_robustness */
bool is_txf = ((tex->op == nir_texop_txf) || (tex->op == nir_texop_txf_ms));
if (is_txf &&
(has_nonzero_lod(tex) || tex->is_array ||
nir_tex_instr_src_index(tex, nir_tex_src_min_lod) >= 0) &&
!(tex->backend_flags & AGX_TEXTURE_FLAG_NO_CLAMP)) {
int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
nir_def *lod =
lod_idx >= 0 ? tex->src[lod_idx].src.ssa : nir_undef(b, 1, 16);
nir_def *min_lod = nir_steal_tex_src(tex, nir_tex_src_min_lod);
unsigned lidx = coord->num_components - 1;
nir_def *layer = nir_channel(b, coord, lidx);
nir_def *replaced = libagx_lower_txf_robustness(
b, texture_descriptor_ptr(b, tex),
nir_imm_bool(b, has_nonzero_lod(tex)), lod,
nir_imm_bool(b, min_lod != NULL), min_lod ?: nir_undef(b, 1, 16),
nir_imm_bool(b, tex->is_array), layer, nir_channel(b, coord, 0));
coord = nir_vector_insert_imm(b, coord, replaced, 0);
}
   /* The layer is always the last component of the NIR coordinate; split it
    * off because we'll need to swizzle.
    */
nir_def *layer = NULL;
if (tex->is_array && tex->op != nir_texop_lod) {
unsigned lidx = coord->num_components - 1;
nir_def *unclamped_layer = nir_channel(b, coord, lidx);
coord = nir_trim_vector(b, coord, lidx);
/* Round layer to nearest even */
if (!is_txf) {
unclamped_layer = nir_fround_even(b, unclamped_layer);
/* Explicitly round negative to avoid undefined behaviour when constant
* folding. This is load bearing on x86 builds.
*/
unclamped_layer =
nir_f2u32(b, nir_fmax(b, unclamped_layer, nir_imm_float(b, 0.0f)));
}
/* For a cube array, the layer is zero-indexed component 3 of the
* coordinate but the number of layers is component 2 of the txs result.
*/
if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
assert(lidx == 3 && "4 components");
lidx = 2;
}
/* Clamp to max layer = (# of layers - 1) for out-of-bounds handling.
* Layer must be 16-bits for the hardware, drop top bits after clamping.
*
* For txf, we drop out-of-bounds components rather than clamp, see the
* above txf robustness workaround.
*/
if (!(tex->backend_flags & AGX_TEXTURE_FLAG_NO_CLAMP) && !is_txf) {
nir_def *txs = nir_get_texture_size(b, tex);
nir_def *nr_layers = nir_channel(b, txs, lidx);
nir_def *max_layer = nir_iadd_imm(b, nr_layers, -1);
layer = nir_umin(b, unclamped_layer, max_layer);
} else {
layer = unclamped_layer;
}
layer = nir_u2u16(b, layer);
}
/* Combine layer and multisample index into 32-bit so we don't need a vec5 or
* vec6 16-bit coordinate tuple, which would be inconvenient in NIR for
* little benefit (a minor optimization, I guess).
*/
nir_def *sample_array = (ms_idx && layer)
? nir_pack_32_2x16_split(b, ms_idx, layer)
: ms_idx ? nir_u2u32(b, ms_idx)
: layer ? nir_u2u32(b, layer)
: NULL;
/* Combine into the final 32-bit tuple */
if (sample_array != NULL) {
unsigned end = coord->num_components;
coord = nir_pad_vector(b, coord, end + 1);
coord = nir_vector_insert_imm(b, coord, sample_array, end);
}
nir_tex_instr_add_src(tex, nir_tex_src_backend1, coord);
/* Furthermore, if there is an offset vector, it must be packed */
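   /* Each signed offset component is masked to a 4-bit nibble and packed into
    * one word, 4 bits per component. For example, offsets (-1, 2) pack as
    * ((-1 & 0xF) << 0) + ((2 & 0xF) << 4) = 0xF + 0x20 = 0x2F.
    */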
nir_def *offset = nir_steal_tex_src(tex, nir_tex_src_offset);
if (offset != NULL) {
nir_def *packed = NULL;
for (unsigned c = 0; c < offset->num_components; ++c) {
nir_def *nibble = nir_iand_imm(b, nir_channel(b, offset, c), 0xF);
nir_def *shifted = nir_ishl_imm(b, nibble, 4 * c);
/* We pack with iadd instead of ior to let us fuse in the shift with an
* iadd-lsl instruction.
*/
if (packed != NULL)
packed = nir_iadd(b, packed, shifted);
else
packed = shifted;
}
nir_tex_instr_add_src(tex, nir_tex_src_backend2, packed);
}
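
   /* If both a LOD bias and a minimum LOD are present, they are combined into
    * the single nir_tex_src_lod_bias_min_agx source, packed as two 16-bit
    * halves (bias in the low half, min LOD in the high half).
    */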
if (nir_tex_instr_src_index(tex, nir_tex_src_bias) >= 0 &&
nir_tex_instr_src_index(tex, nir_tex_src_min_lod) >= 0) {
nir_def *bias = nir_steal_tex_src(tex, nir_tex_src_bias);
nir_def *min_lod = nir_steal_tex_src(tex, nir_tex_src_min_lod);
nir_def *packed = nir_pack_32_2x16_split(b, bias, min_lod);
nir_tex_instr_add_src(tex, nir_tex_src_lod_bias_min_agx, packed);
}
   /* We reserve bound sampler #0, so bound samplers are offset by 1 but
    * otherwise mapped as-is.
    */
nir_def *sampler = nir_steal_tex_src(tex, nir_tex_src_sampler_offset);
if (!sampler)
sampler = nir_imm_intN_t(b, tex->sampler_index, 16);
if (!is_txf &&
nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle) < 0) {
nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle,
nir_iadd_imm(b, nir_u2u16(b, sampler), 1));
}
return true;
}
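
/* Image LODs are handled as 16-bit in this backend, consistent with the
 * 16-bit texture LOD sources legalized below; narrow 32-bit LOD sources with
 * a signed conversion.
 */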
static bool
legalize_image_lod(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
nir_src *src;
#define CASE(op, idx) \
case nir_intrinsic_##op: \
case nir_intrinsic_bindless_##op: \
src = &intr->src[idx]; \
break;
switch (intr->intrinsic) {
CASE(image_load, 3)
CASE(image_sparse_load, 3)
CASE(image_store, 4)
CASE(image_size, 1)
default:
return false;
}
#undef CASE
if (src->ssa->bit_size == 16)
return false;
b->cursor = nir_before_instr(&intr->instr);
nir_src_rewrite(src, nir_i2i16(b, src->ssa));
return true;
}
static nir_def *
txs_for_image(nir_builder *b, nir_intrinsic_instr *intr,
unsigned num_components, unsigned bit_size, bool query_samples)
{
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
nir_def *lod = query_samples ? NULL : intr->src[1].ssa;
nir_texop op = query_samples ? nir_texop_texture_samples : nir_texop_txs;
nir_def *res =
nir_build_tex(b, op, .texture_handle = intr->src[0].ssa, .lod = lod,
.dim = dim, .is_array = nir_intrinsic_image_array(intr),
.can_speculate = nir_instr_can_speculate(&intr->instr));
   /* Cube images are implemented as 2D arrays, so the layer count must be
    * divided by 6 here.
    */
if (dim == GLSL_SAMPLER_DIM_CUBE && res->num_components > 2 &&
!query_samples) {
nir_def *divided = nir_udiv_imm(b, nir_channel(b, res, 2), 6);
res = nir_vector_insert_imm(b, res, divided, 2);
}
return res;
}
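
/* Calculate the global memory address of an image texel (or, if return_index
 * is set, its linear texel index) by crawling the texture/PBE descriptor.
 * This backs texel_address intrinsics (used when lowering image atomics to
 * global memory) and the multisampled image store lowering.
 */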
static nir_def *
image_texel_address(nir_builder *b, nir_intrinsic_instr *intr,
bool return_index)
{
/* First, calculate the address of the PBE descriptor */
nir_def *desc_address =
nir_load_from_texture_handle_agx(b, intr->src[0].ssa);
nir_def *coord = intr->src[1].ssa;
/* For atomics, we always infer the format. We only go down this path with
* formatless intrinsics when lowering multisampled image stores, but that
* uses the return_index path that ignores the block size.
*/
enum pipe_format format = nir_intrinsic_format(intr);
assert(return_index || format != PIPE_FORMAT_NONE);
nir_def *blocksize_B = nir_imm_int(b, util_format_get_blocksize(format));
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
bool layered = nir_intrinsic_image_array(intr) ||
(dim == GLSL_SAMPLER_DIM_CUBE) ||
(dim == GLSL_SAMPLER_DIM_3D);
if (dim == GLSL_SAMPLER_DIM_BUF && return_index) {
return nir_channel(b, coord, 0);
} else if (dim == GLSL_SAMPLER_DIM_BUF) {
return libagx_buffer_texel_address(b, desc_address, coord, blocksize_B);
} else {
return libagx_image_texel_address(
b, desc_address, coord, nir_u2u32(b, intr->src[2].ssa), blocksize_B,
nir_imm_bool(b, dim == GLSL_SAMPLER_DIM_1D),
nir_imm_bool(b, dim == GLSL_SAMPLER_DIM_MS), nir_imm_bool(b, layered),
nir_imm_bool(b, return_index));
}
}
static void
lower_buffer_image(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_def *coord_vector = intr->src[1].ssa;
nir_def *coord = nir_channel(b, coord_vector, 0);
/* If we're not bindless, assume we don't need an offset (GL driver) */
if (intr->intrinsic == nir_intrinsic_bindless_image_load ||
intr->intrinsic == nir_intrinsic_bindless_image_sparse_load) {
nir_def *desc = nir_load_from_texture_handle_agx(b, intr->src[0].ssa);
coord = libagx_buffer_texture_offset(b, desc, coord);
} else if (intr->intrinsic == nir_intrinsic_bindless_image_store) {
nir_def *desc = nir_load_from_texture_handle_agx(b, intr->src[0].ssa);
coord = libagx_buffer_image_offset(b, desc, coord);
}
/* Lower the buffer load/store to a 2D image load/store, matching the 2D
* texture/PBE descriptor the driver supplies for buffer images.
*/
nir_def *coord2d = coords_for_buffer_texture(b, coord);
nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord2d, 4));
nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
}
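
/* 1D images are lowered to 2D by inserting a zero Y coordinate: (x) becomes
 * (x, 0) and, for arrays, (x, layer) becomes (x, 0, layer).
 */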
static void
lower_1d_image(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_def *coord = intr->src[1].ssa;
bool is_array = nir_intrinsic_image_array(intr);
nir_def *zero = nir_imm_intN_t(b, 0, coord->bit_size);
if (is_array) {
assert(coord->num_components >= 2);
coord =
nir_vec3(b, nir_channel(b, coord, 0), zero, nir_channel(b, coord, 1));
} else {
assert(coord->num_components >= 1);
coord = nir_vec2(b, coord, zero);
}
nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord, 4));
nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
}
/*
* Just like for txf, we need special handling around layers (and LODs, but we
* don't support mipmapped images yet) for robust image_loads. See
* libagx_lower_txf_robustness for more info.
*/
static bool
lower_image_load_robustness(nir_builder *b, nir_intrinsic_instr *intr)
{
if (nir_intrinsic_access(intr) & ACCESS_IN_BOUNDS)
return false;
/* We only need to worry about array-like loads */
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
if (!nir_intrinsic_image_array(intr) && dim != GLSL_SAMPLER_DIM_CUBE)
return false;
/* Determine the coordinate component of the layer. Cubes and cube arrays
* keep their array in their last non-array coordinate component, other
* arrays are immediately after.
*/
unsigned lidx = glsl_get_sampler_dim_coordinate_components(dim);
if (dim == GLSL_SAMPLER_DIM_CUBE)
lidx--;
nir_def *coord = intr->src[1].ssa;
nir_def *lod = nir_undef(b, 1, 16);
nir_def *layer = nir_channel(b, coord, lidx);
/* image_load is effectively the same as txf, reuse the txf lower */
nir_def *replaced = libagx_lower_txf_robustness(
b, nir_load_from_texture_handle_agx(b, intr->src[0].ssa),
nir_imm_bool(b, false /* lower LOD */), lod,
nir_imm_bool(b, false /* lower min LOD */), lod,
nir_imm_bool(b, true /* lower layer */), layer, nir_channel(b, coord, 0));
nir_src_rewrite(&intr->src[1], nir_vector_insert_imm(b, coord, replaced, 0));
return true;
}
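
/* Lower image intrinsics for the hardware: legalize sample indices to 16-bit,
 * remap 1D and buffer images to 2D, implement size/samples queries via txs,
 * lower texel address queries, and translate sparse residency codes (which
 * are inverted relative to NIR).
 */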
static bool
lower_images(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
b->cursor = nir_before_instr(&intr->instr);
switch (intr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_sparse_load:
case nir_intrinsic_bindless_image_store: {
/* Legalize MSAA index */
nir_src_rewrite(&intr->src[2], nir_u2u16(b, intr->src[2].ssa));
if (intr->intrinsic == nir_intrinsic_image_load ||
intr->intrinsic == nir_intrinsic_bindless_image_load ||
intr->intrinsic == nir_intrinsic_bindless_image_sparse_load) {
lower_image_load_robustness(b, intr);
}
switch (nir_intrinsic_image_dim(intr)) {
case GLSL_SAMPLER_DIM_1D:
lower_1d_image(b, intr);
return true;
case GLSL_SAMPLER_DIM_BUF:
lower_buffer_image(b, intr);
return true;
default:
return true;
}
}
case nir_intrinsic_bindless_image_size:
case nir_intrinsic_bindless_image_samples:
nir_def_rewrite_uses(
&intr->def,
txs_for_image(
b, intr, intr->def.num_components, intr->def.bit_size,
intr->intrinsic == nir_intrinsic_bindless_image_samples));
return true;
case nir_intrinsic_bindless_image_texel_address:
nir_def_rewrite_uses(&intr->def, image_texel_address(b, intr, false));
return true;
case nir_intrinsic_is_sparse_texels_resident:
/* Residency information is in bit 0, so we need to mask. Unclear what's
* in the upper bits. For now, let's match the blob.
*/
nir_def_replace(
&intr->def,
nir_ieq_imm(b, nir_iand_imm(b, intr->src[0].ssa, 1), AGX_RESIDENT));
return true;
case nir_intrinsic_sparse_residency_code_and:
/* ior because residency codes are inverted from NIR */
nir_def_replace(&intr->def,
nir_ior(b, intr->src[0].ssa, intr->src[1].ssa));
return true;
case nir_intrinsic_image_size:
case nir_intrinsic_image_texel_address:
UNREACHABLE("should've been lowered");
default:
return false;
}
}
/*
* Map out-of-bounds storage texel buffer accesses and multisampled image stores
* to -1 indices, which will become an out-of-bounds hardware access. This gives
* cheap robustness2.
*/
static bool
lower_robustness(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
b->cursor = nir_before_instr(&intr->instr);
switch (intr->intrinsic) {
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_sparse_load:
case nir_intrinsic_image_deref_store:
break;
default:
return false;
}
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
bool array = nir_intrinsic_image_array(intr);
unsigned size_components = nir_image_intrinsic_coord_components(intr);
nir_def *deref = intr->src[0].ssa;
nir_def *coord = intr->src[1].ssa;
if (dim != GLSL_SAMPLER_DIM_BUF &&
!(dim == GLSL_SAMPLER_DIM_MS &&
intr->intrinsic == nir_intrinsic_image_deref_store))
return false;
/* Bounds check the coordinate */
nir_def *size =
nir_image_deref_size(b, size_components, 32, deref, nir_imm_int(b, 0),
.image_dim = dim, .image_array = array);
nir_def *oob = nir_bany(b, nir_uge(b, coord, size));
/* Bounds check the sample */
if (dim == GLSL_SAMPLER_DIM_MS) {
nir_def *samples = nir_image_deref_samples(b, 32, deref, .image_dim = dim,
.image_array = array);
oob = nir_ior(b, oob, nir_uge(b, intr->src[2].ssa, samples));
}
   /* Replace the last coordinate component with a large coordinate for
    * out-of-bounds accesses. We pick 0xFFF0 because it fits in 16 bits yet
    * remains a large positive value when widened to 32 bits, so two's
    * complement wraparound cannot produce an in-bounds array coordinate. It
    * still meets this requirement after adding 0xF, the maximum tail offset.
*
* This ensures the resulting hardware coordinate is definitely
* out-of-bounds, giving hardware-level robustness2 behaviour.
*/
unsigned c = size_components - 1;
nir_def *r =
nir_bcsel(b, oob, nir_imm_int(b, 0xFFF0), nir_channel(b, coord, c));
nir_src_rewrite(&intr->src[1], nir_vector_insert_imm(b, coord, r, c));
return true;
}
/*
* Early texture lowering passes, called by the driver before lowering
* descriptor bindings. That means these passes operate on texture derefs. The
* purpose is to make descriptor crawls explicit in the NIR, so that the driver
* can accurately lower descriptors after this pass but before calling
* the full agx_nir_lower_texture.
*/
bool
agx_nir_lower_texture_early(nir_shader *s, bool support_lod_bias)
{
bool progress = false;
NIR_PASS(progress, s, nir_shader_intrinsics_pass, lower_robustness,
nir_metadata_control_flow, NULL);
nir_lower_tex_options lower_tex_options = {
.lower_txp = ~0,
.lower_invalid_implicit_lod = true,
.lower_tg4_offsets = true,
.lower_index_to_offset = true,
.lower_sampler_lod_bias = support_lod_bias,
/* Unclear if/how mipmapped 1D textures work in the hardware. */
.lower_1d = true,
      /* XXX: Metal seems to handle cube txd just like 3D txd, so why doesn't
       * it work here?
       * TODO: Stop using this lowering
       */
.lower_txd_cube_map = true,
};
NIR_PASS(progress, s, nir_lower_tex, &lower_tex_options);
return progress;
}
bool
agx_nir_lower_texture(nir_shader *s)
{
bool progress = false;
nir_tex_src_type_constraints tex_constraints = {
[nir_tex_src_lod] = {true, 16},
[nir_tex_src_bias] = {true, 16},
[nir_tex_src_ms_index] = {true, 16},
[nir_tex_src_min_lod] = {true, 16},
[nir_tex_src_texture_offset] = {true, 16},
[nir_tex_src_sampler_offset] = {true, 16},
};
/* Insert fences before lowering image atomics, since image atomics need
* different fencing than other image operations.
*/
NIR_PASS(progress, s, nir_shader_intrinsics_pass, fence_image,
nir_metadata_control_flow, NULL);
NIR_PASS(progress, s, nir_lower_image_atomics_to_global, NULL, NULL);
NIR_PASS(progress, s, nir_shader_intrinsics_pass, legalize_image_lod,
nir_metadata_control_flow, NULL);
NIR_PASS(progress, s, nir_shader_intrinsics_pass, lower_images,
nir_metadata_control_flow, NULL);
NIR_PASS(progress, s, nir_legalize_16bit_sampler_srcs, tex_constraints);
/* Fold constants after nir_legalize_16bit_sampler_srcs so we can detect 0 in
* lower_regular_texture. This is required for correctness.
*/
NIR_PASS(progress, s, nir_opt_constant_folding);
/* Lower texture sources after legalizing types (as the lowering depends on
* 16-bit multisample indices) but before lowering queries (as the lowering
* generates txs for array textures).
*/
NIR_PASS(progress, s, nir_shader_instructions_pass, lower_regular_texture,
nir_metadata_none, NULL);
NIR_PASS(progress, s, nir_shader_instructions_pass, lower_tex_crawl,
nir_metadata_control_flow, NULL);
return progress;
}
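
/* Multisampled image stores are lowered to single-sampled 2D stores: the
 * sample-aware linear texel index from image_texel_address is remapped with
 * the same 1024-wide 2D layout used for buffer images, and the sample index
 * is replaced with zero.
 */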
static bool
lower_multisampled_store(nir_builder *b, nir_intrinsic_instr *intr,
UNUSED void *data)
{
b->cursor = nir_before_instr(&intr->instr);
if (intr->intrinsic != nir_intrinsic_bindless_image_store)
return false;
if (nir_intrinsic_image_dim(intr) != GLSL_SAMPLER_DIM_MS)
return false;
nir_def *index_px = nir_u2u32(b, image_texel_address(b, intr, true));
nir_def *coord2d = coords_for_buffer_texture(b, index_px);
nir_src_rewrite(&intr->src[1], nir_pad_vector(b, coord2d, 4));
nir_src_rewrite(&intr->src[2], nir_imm_int(b, 0));
nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
nir_intrinsic_set_image_array(intr, false);
return true;
}
bool
agx_nir_lower_multisampled_image_store(nir_shader *s)
{
return nir_shader_intrinsics_pass(s, lower_multisampled_store,
nir_metadata_control_flow, NULL);
}
/*
* Given a non-bindless instruction, return whether agx_nir_lower_texture will
* lower it to something involving a descriptor crawl. This requires the driver
* to lower the instruction to bindless before calling agx_nir_lower_texture.
* The implementation just enumerates the cases handled in this file.
*/
bool
agx_nir_needs_texture_crawl(nir_instr *instr)
{
if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
      /* Queries and atomics always become a crawl */
case nir_intrinsic_image_size:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_samples:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_deref_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_image_deref_atomic_swap:
return true;
/* Multisampled stores need a crawl, others do not */
case nir_intrinsic_image_store:
case nir_intrinsic_image_deref_store:
return nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_MS;
      /* Array loads need a crawl, other loads do not */
case nir_intrinsic_image_load:
return nir_intrinsic_image_array(intr) ||
nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_CUBE;
default:
return false;
}
} else if (instr->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(instr);
/* Array textures get clamped to their size via txs */
if (tex->is_array && !(tex->backend_flags & AGX_TEXTURE_FLAG_NO_CLAMP))
return true;
switch (tex->op) {
/* Queries always become a crawl */
case nir_texop_txs:
case nir_texop_texture_samples:
case nir_texop_query_levels:
return true;
/* Buffer textures need their format read and txf needs its LOD/layer
* clamped. Buffer textures are only read through txf.
*/
case nir_texop_txf:
case nir_texop_txf_ms:
return has_nonzero_lod(tex) || tex->is_array ||
tex->sampler_dim == GLSL_SAMPLER_DIM_BUF;
default:
return false;
}
}
return false;
}