v3dv: handle multisample resolves for formats that don't support TLB resolves

The TLB multisample resolve feature is limited to specific format types.
For everything else, including sfloat and integer formats, we need to
fall back to a blit resolve. This needs to be handled both for in-pass
resolves as well as for vkCmdResolveImage.

Because these blits would happen after the tile store operations, we need
to make sure we store the multisampled buffers so we can then read them for
the blit resolve.

Fixes the remaining test failures in:
dEQP-VK.renderpass.suballocation.multisample_resolve.*

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index d8f8483..9d01c1a 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -430,26 +430,13 @@
        prev_subpass->ds_attachment.attachment)
       return false;
 
-   if (prev_subpass->resolve_attachments) {
-      if (!subpass->resolve_attachments)
-         return false;
-
-      compatible =
-         attachment_list_is_subset(prev_subpass->resolve_attachments,
-                                   prev_subpass->color_count,
-                                   subpass->resolve_attachments,
-                                   subpass->color_count);
-      if (!compatible)
-         return false;
-
-      compatible =
-         attachment_list_is_subset(subpass->resolve_attachments,
-                                   subpass->color_count,
-                                   prev_subpass->resolve_attachments,
-                                   prev_subpass->color_count);
-      if (!compatible)
-         return false;
-   }
+   /* FIXME: Since some attachment formats can't be resolved using the TLB we
+    * need to emit separate resolve jobs for them and that would not be
+    * compatible with subpass merges. We could fix that by testing if any of
+    * the attachments to resolve doesn't support TLB resolves.
+    */
+   if (prev_subpass->resolve_attachments || subpass->resolve_attachments)
+      return false;
 
    return true;
 }
@@ -976,6 +963,91 @@
    vk_free2(&device->alloc, pAllocator, pool);
 }
 
+static void
+cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
+   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+   const struct v3dv_subpass *subpass =
+      &pass->subpasses[cmd_buffer->state.subpass_idx];
+
+   if (!subpass->resolve_attachments)
+      return; /* this subpass declares no resolve attachments */
+
+   struct v3dv_framebuffer *fb = cmd_buffer->state.framebuffer;
+
+   /* At this point we have already ended the current subpass and now we are
+    * about to emit vkCmdResolveImage calls to get the resolves we can't
+    * handle in the subpass RCL.
+    *
+    * vkCmdResolveImage is not supposed to be called inside a render pass so
+    * before we call that we need to make sure our command buffer state reflects
+    * that we are no longer in a subpass by finishing the current job and
+    * resetting the framebuffer and render pass state temporarily and then
+    * restoring it after we are done with the resolves.
+    */
+   if (cmd_buffer->state.job)
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
+   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
+   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
+   cmd_buffer->state.framebuffer = NULL;
+   cmd_buffer->state.pass = NULL;
+   cmd_buffer->state.subpass_idx = -1; /* restored from the saved copy below */
+
+   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
+   for (uint32_t i = 0; i < subpass->color_count; i++) {
+      const uint32_t src_attachment_idx =
+         subpass->color_attachments[i].attachment;
+      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      if (pass->attachments[src_attachment_idx].use_tlb_resolve)
+         continue; /* flagged for TLB resolve; no separate resolve here */
+
+      const uint32_t dst_attachment_idx =
+         subpass->resolve_attachments[i].attachment;
+      if (dst_attachment_idx == VK_ATTACHMENT_UNUSED)
+         continue; /* no resolve target for this color attachment */
+
+      struct v3dv_image_view *src_iview = fb->attachments[src_attachment_idx];
+      struct v3dv_image_view *dst_iview = fb->attachments[dst_attachment_idx];
+
+      VkImageResolve region = {
+         .srcSubresource = {
+            VK_IMAGE_ASPECT_COLOR_BIT,
+            src_iview->base_level,
+            src_iview->first_layer,
+            src_iview->last_layer - src_iview->first_layer + 1,
+         },
+         .srcOffset = { 0, 0, 0 },
+         .dstSubresource =  {
+            VK_IMAGE_ASPECT_COLOR_BIT,
+            dst_iview->base_level,
+            dst_iview->first_layer,
+            dst_iview->last_layer - dst_iview->first_layer + 1,
+         },
+         .dstOffset = { 0, 0, 0 },
+         .extent = src_iview->image->extent,
+      };
+
+      VkImage src_image_handle =
+         v3dv_image_to_handle((struct v3dv_image *) src_iview->image);
+      VkImage dst_image_handle =
+         v3dv_image_to_handle((struct v3dv_image *) dst_iview->image);
+      v3dv_CmdResolveImage(cmd_buffer_handle,
+                           src_image_handle,
+                           VK_IMAGE_LAYOUT_GENERAL,
+                           dst_image_handle,
+                           VK_IMAGE_LAYOUT_GENERAL,
+                           1, &region);
+   }
+
+   cmd_buffer->state.framebuffer = restore_fb;
+   cmd_buffer->state.pass = restore_pass;
+   cmd_buffer->state.subpass_idx = restore_subpass_idx;
+}
+
 static VkResult
 cmd_buffer_begin_render_pass_secondary(
    struct v3dv_cmd_buffer *cmd_buffer,
@@ -1356,6 +1428,7 @@
 
    /* Finish the previous subpass */
    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
 
    /* Start the next subpass */
    v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
@@ -1686,9 +1759,16 @@
        * that would clear the tile buffer before we get to emit the actual
        * color attachment store below, since the clear happens after the
        * store is completed.
+       *
+       * If the attachment doesn't support TLB resolves then we will have to
+       * fall back to doing the resolve in a shader separately after this
+       * job, so we will need to store the multisampled attachment even if that
+       * wasn't requested by the client.
        */
-      if (subpass->resolve_attachments &&
-          subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
+      const bool needs_resolve =
+         subpass->resolve_attachments &&
+         subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;
+      if (needs_resolve && attachment->use_tlb_resolve) {
          const uint32_t resolve_attachment_idx =
             subpass->resolve_attachments[i].attachment;
          cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
@@ -1696,6 +1776,8 @@
                                            RENDER_TARGET_0 + i,
                                            false, true);
          has_stores = true;
+      } else if (needs_resolve) {
+         needs_store = true;
       }
 
       /* Emit the color attachment store if needed */
@@ -2263,6 +2345,8 @@
    v3dv_cmd_buffer_subpass_finish(cmd_buffer);
    v3dv_cmd_buffer_finish_job(cmd_buffer);
 
+   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
+
    /* We are no longer inside a render pass */
    state->framebuffer = NULL;
    state->pass = NULL;
diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c
index eaa9879..167d0d2 100644
--- a/src/broadcom/vulkan/v3dv_formats.c
+++ b/src/broadcom/vulkan/v3dv_formats.c
@@ -297,6 +297,14 @@
    }
 }
 
+bool
+v3dv_format_supports_tlb_resolve(const struct v3dv_format *format)
+{
+   uint32_t type, bpp; /* only 'type' is inspected; 'bpp' is not read here */
+   v3dv_get_internal_type_bpp_for_output_format(format->rt_type, &type, &bpp);
+   return type == V3D_INTERNAL_TYPE_8 || type == V3D_INTERNAL_TYPE_16F;
+}
+
 const uint8_t *
 v3dv_get_format_swizzle(VkFormat f)
 {
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index 1e55431..a790602 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -4198,6 +4198,9 @@
       return false;
    }
 
+   if (!v3dv_format_supports_tlb_resolve(src->format))
+      return false;
+
    const VkFormat fb_format = src->vk_format;
 
    uint32_t num_layers;
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 08f5f4f..d7ec95e 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -34,6 +34,13 @@
 }
 
 static void
+set_use_tlb_resolve(struct v3dv_render_pass_attachment *att)
+{
+   const struct v3dv_format *format = v3dv_get_format(att->desc.format);
+   att->use_tlb_resolve = v3dv_format_supports_tlb_resolve(format);
+}
+
+static void
 pass_find_subpass_range_for_attachments(struct v3dv_render_pass *pass)
 {
    for (uint32_t i = 0; i < pass->attachment_count; i++) {
@@ -53,6 +60,11 @@
             pass->attachments[attachment_idx].first_subpass = i;
          if (i > pass->attachments[attachment_idx].last_subpass)
             pass->attachments[attachment_idx].last_subpass = i;
+
+         if (subpass->resolve_attachments &&
+             subpass->resolve_attachments[j].attachment != VK_ATTACHMENT_UNUSED) {
+            set_use_tlb_resolve(&pass->attachments[attachment_idx]);
+         }
       }
 
       uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 9ec38df..1066b92 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -514,6 +514,11 @@
    VkAttachmentDescription desc;
    uint32_t first_subpass;
    uint32_t last_subpass;
+
+   /* If this is a multisampled attachment that is going to be resolved,
+    * whether we can use the TLB resolve on store.
+    */
+   bool use_tlb_resolve;
 };
 
 struct v3dv_render_pass {
@@ -1702,8 +1707,7 @@
 uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable);
 bool v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo,
                                   uint32_t tex_format);
-
-
+bool v3dv_format_supports_tlb_resolve(const struct v3dv_format *format);
 
 uint32_t v3d_utile_width(int cpp);
 uint32_t v3d_utile_height(int cpp);