aco: implement 64-bit images

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7234>
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index d680ab8..ab4aba0 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -409,7 +409,7 @@
             src = bld.as_uniform(src);
          vec->operands[i] = Operand(src);
       } else {
-         vec->operands[i] = Operand(0u);
+         vec->operands[i] = Operand(0u, component_size == 2);
       }
       elems[i] = vec->operands[i].getTemp();
    }
@@ -5762,14 +5762,32 @@
    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
    unsigned access = var->data.access | nir_intrinsic_access(instr);
 
+   unsigned expand_mask = nir_ssa_def_components_read(&instr->dest.ssa);
+   if (dim == GLSL_SAMPLER_DIM_BUF)
+      expand_mask = (1u << util_last_bit(expand_mask)) - 1;
+   unsigned dmask = expand_mask;
+   if (instr->dest.ssa.bit_size == 64) {
+      expand_mask &= 0x9;
+      /* only R64_UINT/R64_SINT are supported: 64-bit x is channels xy of the load result, w is zw */
+      dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
+   }
+   unsigned num_components = util_bitcount(dmask);
+
+   Temp tmp;
+   if (num_components == dst.size() && dst.type() == RegType::vgpr)
+      tmp = dst;
+   else
+      tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
+
+   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
+                                    dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
+                                    nullptr, true, true);
+
    if (dim == GLSL_SAMPLER_DIM_BUF) {
-      unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
-      unsigned num_channels = util_last_bit(mask);
-      Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
 
       aco_opcode opcode;
-      switch (num_channels) {
+      switch (num_components) {
       case 1:
          opcode = aco_opcode::buffer_load_format_x;
          break;
@@ -5786,55 +5804,37 @@
          unreachable(">4 channel buffer image load");
       }
       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
-      load->operands[0] = Operand(rsrc);
+      load->operands[0] = Operand(resource);
       load->operands[1] = Operand(vindex);
       load->operands[2] = Operand((uint32_t) 0);
-      Temp tmp;
-      if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
-         tmp = dst;
-      else
-         tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_channels));
       load->definitions[0] = Definition(tmp);
       load->idxen = true;
       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
       load->sync = sync;
       ctx->block->instructions.emplace_back(std::move(load));
+   } else {
+      Temp coords = get_image_coords(ctx, instr, type);
 
-      expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
-      return;
+      bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
+      aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
+
+      aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
+      load->operands[0] = Operand(resource);
+      load->operands[1] = Operand(s4); /* no sampler */
+      load->operands[2] = Operand(coords);
+      load->definitions[0] = Definition(tmp);
+      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+      load->dlc = load->glc && ctx->options->chip_class >= GFX10;
+      load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
+      load->dmask = dmask;
+      load->unrm = true;
+      load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
+      load->sync = sync;
+      ctx->block->instructions.emplace_back(std::move(load));
    }
 
-   Temp coords = get_image_coords(ctx, instr, type);
-   Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
-
-   unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
-   unsigned num_components = util_bitcount(dmask);
-   Temp tmp;
-   if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
-      tmp = dst;
-   else
-      tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
-
-   bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
-   aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
-
-   aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
-   load->operands[0] = Operand(resource);
-   load->operands[1] = Operand(s4); /* no sampler */
-   load->operands[2] = Operand(coords);
-   load->definitions[0] = Definition(tmp);
-   load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
-   load->dlc = load->glc && ctx->options->chip_class >= GFX10;
-   load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
-   load->dmask = dmask;
-   load->unrm = true;
-   load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
-   load->sync = sync;
-   ctx->block->instructions.emplace_back(std::move(load));
-
-   expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
-   return;
+   expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
 }
 
 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -5843,7 +5843,12 @@
    const struct glsl_type *type = glsl_without_array(var->type);
    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
    bool is_array = glsl_sampler_type_is_array(type);
-   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+   Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
+
+   /* only R64_UINT and R64_SINT are supported, so keep just the first 64-bit component */
+   if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
+      data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
+   data = as_vgpr(ctx, data);
 
    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
    unsigned access = var->data.access | nir_intrinsic_access(instr);
@@ -5928,51 +5933,62 @@
    Builder bld(ctx->program, ctx->block);
 
    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
-   assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
+   bool is_64bit = data.bytes() == 8;
+   assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
 
    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
-      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
+      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
 
-   aco_opcode buf_op, image_op;
+   aco_opcode buf_op, buf_op64, image_op;
    switch (instr->intrinsic) {
       case nir_intrinsic_image_deref_atomic_add:
          buf_op = aco_opcode::buffer_atomic_add;
+         buf_op64 = aco_opcode::buffer_atomic_add_x2;
          image_op = aco_opcode::image_atomic_add;
          break;
       case nir_intrinsic_image_deref_atomic_umin:
          buf_op = aco_opcode::buffer_atomic_umin;
+         buf_op64 = aco_opcode::buffer_atomic_umin_x2;
          image_op = aco_opcode::image_atomic_umin;
          break;
       case nir_intrinsic_image_deref_atomic_imin:
          buf_op = aco_opcode::buffer_atomic_smin;
+         buf_op64 = aco_opcode::buffer_atomic_smin_x2;
          image_op = aco_opcode::image_atomic_smin;
          break;
       case nir_intrinsic_image_deref_atomic_umax:
          buf_op = aco_opcode::buffer_atomic_umax;
+         buf_op64 = aco_opcode::buffer_atomic_umax_x2;
          image_op = aco_opcode::image_atomic_umax;
          break;
       case nir_intrinsic_image_deref_atomic_imax:
          buf_op = aco_opcode::buffer_atomic_smax;
+         buf_op64 = aco_opcode::buffer_atomic_smax_x2;
          image_op = aco_opcode::image_atomic_smax;
          break;
       case nir_intrinsic_image_deref_atomic_and:
          buf_op = aco_opcode::buffer_atomic_and;
+         buf_op64 = aco_opcode::buffer_atomic_and_x2;
          image_op = aco_opcode::image_atomic_and;
          break;
       case nir_intrinsic_image_deref_atomic_or:
          buf_op = aco_opcode::buffer_atomic_or;
+         buf_op64 = aco_opcode::buffer_atomic_or_x2;
          image_op = aco_opcode::image_atomic_or;
          break;
       case nir_intrinsic_image_deref_atomic_xor:
          buf_op = aco_opcode::buffer_atomic_xor;
+         buf_op64 = aco_opcode::buffer_atomic_xor_x2;
          image_op = aco_opcode::image_atomic_xor;
          break;
       case nir_intrinsic_image_deref_atomic_exchange:
          buf_op = aco_opcode::buffer_atomic_swap;
+         buf_op64 = aco_opcode::buffer_atomic_swap_x2;
          image_op = aco_opcode::image_atomic_swap;
          break;
       case nir_intrinsic_image_deref_atomic_comp_swap:
          buf_op = aco_opcode::buffer_atomic_cmpswap;
+         buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
          image_op = aco_opcode::image_atomic_cmpswap;
          break;
       default:
@@ -5986,7 +6002,8 @@
       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
-      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
+      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
+         is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
       mubuf->operands[0] = Operand(resource);
       mubuf->operands[1] = Operand(vindex);
       mubuf->operands[2] = Operand((uint32_t)0);