v3dv: skip unnecessary tile loads when blitting

If we are blitting to tile boundaries we don't need to emit
tile loads. The exception is when we are blitting only a
subset of the pixel components in the image (which we do for
single-aspect blits of D24S8), since in that case we need to
preserve the components we are not writing.

There is a corner case where we sometimes create framebuffers
that alias subregions of a larger image. In that case the edge
tiles are not padded and we can't skip the loads.
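
For reference, this is roughly the condition the new
v3dv_subpass_area_is_tile_aligned() helper checks, where
'granularity' is the tile granularity for the subpass:

   area->offset.x % granularity.width == 0 &&
   area->offset.y % granularity.height == 0 &&
   (area->extent.width % granularity.width == 0 ||
    (fb->has_edge_padding &&
     area->offset.x + area->extent.width >= fb->width)) &&
   (area->extent.height % granularity.height == 0 ||
    (fb->has_edge_padding &&
     area->offset.y + area->extent.height >= fb->height))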

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7247>
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 229740d..14a97cc 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -1181,11 +1181,6 @@
    }
 }
 
-/* Checks whether the render area rectangle covers a region that is aligned to
- * tile boundaries, which means that for all tiles covered by the render area
- * region, there are no uncovered pixels (unless they are also outside the
- * framebuffer).
- */
 static void
 cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
 {
@@ -1200,24 +1195,11 @@
     * always have framebuffer information available.
     */
    assert(cmd_buffer->state.framebuffer);
-
-   const VkExtent2D fb_extent = {
-      .width = cmd_buffer->state.framebuffer->width,
-      .height = cmd_buffer->state.framebuffer->height
-   };
-
-   VkExtent2D granularity;
-   v3dv_subpass_get_granularity(cmd_buffer->state.pass,
-                                cmd_buffer->state.subpass_idx,
-                                &granularity);
-
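+   /* Whether the render area is tile-aligned determines if we can skip tile
+    * loads and use TLB clears (see v3dv_subpass_area_is_tile_aligned()).
+    */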
    cmd_buffer->state.tile_aligned_render_area =
-      rect->offset.x % granularity.width == 0 &&
-      rect->offset.y % granularity.height == 0 &&
-      (rect->extent.width % granularity.width == 0 ||
-       rect->offset.x + rect->extent.width >= fb_extent.width) &&
-      (rect->extent.height % granularity.height == 0 ||
-       rect->offset.y + rect->extent.height >= fb_extent.height);
+      v3dv_subpass_area_is_tile_aligned(rect,
+                                        cmd_buffer->state.framebuffer,
+                                        cmd_buffer->state.pass,
+                                        cmd_buffer->state.subpass_idx);
 
    if (!cmd_buffer->state.tile_aligned_render_area) {
       perf_debug("Render area for subpass %d of render pass %p doesn't "
@@ -2023,7 +2005,6 @@
    assert(state->subpass_idx < state->pass->subpass_count);
    const struct v3dv_render_pass *pass = state->pass;
    const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
-
    struct v3dv_cl *rcl = &job->rcl;
 
    /* Common config must be the first TILE_RENDERING_MODE_CFG and
@@ -2031,7 +2012,6 @@
     * updates to the previous HW state.
     */
    const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
-
    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
       config.image_width_pixels = framebuffer->width;
       config.image_height_pixels = framebuffer->height;
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 36d74b8..6e0eefb 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1977,6 +1977,8 @@
    framebuffer->width = pCreateInfo->width;
    framebuffer->height = pCreateInfo->height;
    framebuffer->layers = pCreateInfo->layers;
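+   /* Assume the images backing this framebuffer have padded edge tiles;
+    * internal meta operations that alias a subregion of a larger image
+    * override this flag (see the blit path in v3dv_meta_copy.c).
+    */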
+   framebuffer->has_edge_padding = true;
+
    framebuffer->attachment_count = pCreateInfo->attachmentCount;
    framebuffer->color_attachment_count = 0;
    for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index fe9e6a0..0ba96b8 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -61,6 +61,7 @@
          struct v3dv_meta_blit_pipeline *item = entry->data;
          v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
          v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
+         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
          vk_free(&device->alloc, item);
       }
       _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
@@ -771,7 +772,8 @@
             VkColorComponentFlags cmask,
             VkComponentMapping *cswizzle,
             const VkImageBlit *region,
-            VkFilter filter);
+            VkFilter filter,
+            bool dst_is_padded_image);
 
 /**
  * Returns true if the implementation supports the requested operation (even if
@@ -998,7 +1000,7 @@
                             v3dv_image_from_handle(buffer_image), dst_format,
                             image, src_format,
                             cmask, &cswizzle,
-                            &blit_region, VK_FILTER_NEAREST);
+                            &blit_region, VK_FILTER_NEAREST, false);
       if (!handled) {
          /* This is unexpected, we should have a supported blit spec */
          unreachable("Unable to blit buffer to destination image");
@@ -1454,7 +1456,7 @@
                               dst, format,
                               src, format,
                               0, NULL,
-                              &blit_region, VK_FILTER_NEAREST);
+                              &blit_region, VK_FILTER_NEAREST, true);
 
    /* We should have selected formats that we can blit */
    assert(handled);
@@ -2693,7 +2695,7 @@
                             image, dst_format,
                             v3dv_image_from_handle(buffer_image), src_format,
                             cmask, NULL,
-                            &blit_region, VK_FILTER_NEAREST);
+                            &blit_region, VK_FILTER_NEAREST, true);
       if (!handled) {
          /* This is unexpected, we should have a supported blit spec */
          unreachable("Unable to blit buffer to destination image");
@@ -3101,20 +3103,15 @@
 create_blit_render_pass(struct v3dv_device *device,
                         VkFormat dst_format,
                         VkFormat src_format,
-                        VkRenderPass *pass)
+                        VkRenderPass *pass_load,
+                        VkRenderPass *pass_no_load)
 {
    const bool is_color_blit = vk_format_is_color(dst_format);
 
-   /* FIXME: if blitting to tile boundaries or to the whole image, we could
-    * use LOAD_DONT_CARE, but then we would have to include that in the
-    * pipeline hash key. Or maybe we should just create both render passes and
-    * use one or the other at draw time since they would both be compatible
-    * with the pipeline anyway
-    */
+   /* Attachment load operation is specified below */
    VkAttachmentDescription att = {
       .format = dst_format,
       .samples = VK_SAMPLE_COUNT_1_BIT,
-      .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
@@ -3146,8 +3143,16 @@
       .pDependencies = NULL,
    };
 
-   VkResult result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
-                                           &info, &device->alloc, pass);
+   VkResult result;
+   att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
+   result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
+                                  &info, &device->alloc, pass_load);
+   if (result != VK_SUCCESS)
+      return false;
+
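+   /* The no-load variant is only used when the blit is known to overwrite
+    * every pixel of each tile it touches, so skipping the tile load is safe.
+    */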
+   att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+   result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
+                                  &info, &device->alloc, pass_no_load);
    return result == VK_SUCCESS;
 }
 
@@ -3763,10 +3768,14 @@
       goto fail;
 
    ok = create_blit_render_pass(device, dst_format, src_format,
-                                &(*pipeline)->pass);
+                                &(*pipeline)->pass,
+                                &(*pipeline)->pass_no_load);
    if (!ok)
       goto fail;
 
+   /* Create the pipeline using one of the render passes; they are both
+    * compatible, so we don't care which one we use here.
+    */
    ok = create_blit_pipeline(device,
                              dst_format,
                              src_format,
@@ -3794,6 +3803,8 @@
    if (*pipeline) {
       if ((*pipeline)->pass)
          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
+      if ((*pipeline)->pass_no_load)
+         v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->alloc);
       if ((*pipeline)->pipeline)
          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
       vk_free(&device->alloc, *pipeline);
@@ -3896,7 +3907,8 @@
             VkColorComponentFlags cmask,
             VkComponentMapping *cswizzle,
             const VkImageBlit *_region,
-            VkFilter filter)
+            VkFilter filter,
+            bool dst_is_padded_image)
 {
    bool handled = true;
 
@@ -3907,7 +3919,6 @@
           !vk_format_is_depth_or_stencil(dst_format));
 
    VkImageBlit region = *_region;
-
    /* Rewrite combined D/S blits to compatible color blits */
    if (vk_format_is_depth_or_stencil(dst_format)) {
       assert(src_format == dst_format);
@@ -3940,12 +3951,12 @@
       region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    }
 
-   if (cmask == 0) {
-      cmask = VK_COLOR_COMPONENT_R_BIT |
-              VK_COLOR_COMPONENT_G_BIT |
-              VK_COLOR_COMPONENT_B_BIT |
-              VK_COLOR_COMPONENT_A_BIT;
-   }
+   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
+                                            VK_COLOR_COMPONENT_G_BIT |
+                                            VK_COLOR_COMPONENT_B_BIT |
+                                            VK_COLOR_COMPONENT_A_BIT;
+   if (cmask == 0)
+      cmask = full_cmask;
 
    VkComponentMapping ident_swizzle = {
       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
@@ -4072,7 +4083,8 @@
                                &pipeline);
    if (!ok)
       return handled;
-   assert(pipeline && pipeline->pipeline && pipeline->pass);
+   assert(pipeline && pipeline->pipeline &&
+          pipeline->pass && pipeline->pass_no_load);
 
    struct v3dv_device *device = cmd_buffer->device;
    assert(cmd_buffer->meta.blit.dspool);
@@ -4128,6 +4140,11 @@
       if (result != VK_SUCCESS)
          goto fail;
 
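+      /* The blit framebuffer only has padded edge tiles if it covers the full
+       * destination mip level of an image that itself has tile padding;
+       * otherwise its edge tiles alias valid pixels of a larger image and we
+       * can't skip tile loads for them.
+       */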
+      struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
+      framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
+                                      fb_info.height == dst_level_h &&
+                                      dst_is_padded_image;
+
       v3dv_cmd_buffer_add_private_obj(
          cmd_buffer, (uintptr_t)fb,
          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
@@ -4208,15 +4225,30 @@
       };
       v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
 
+      /* If the region we are about to blit is tile-aligned, then we can
+       * use the render pass version that won't pre-load the tile buffer
+       * with the dst image contents before the blit. The exception is when we
+       * don't have a full color mask, since in that case we need to preserve
+       * the original value of some of the color components.
+       */
+      const VkRect2D render_area = {
+         .offset = { dst_x, dst_y },
+         .extent = { dst_w, dst_h },
+      };
+      struct v3dv_render_pass *pipeline_pass =
+         v3dv_render_pass_from_handle(pipeline->pass);
+      bool can_skip_tlb_load =
+         cmask == full_cmask &&
+         v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer,
+                                           pipeline_pass, 0);
+
       /* Record blit */
       VkRenderPassBeginInfo rp_info = {
          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-         .renderPass = pipeline->pass,
+         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
+                                           pipeline->pass,
          .framebuffer = fb,
-         .renderArea = {
-            .offset = { dst_x, dst_y },
-            .extent = { dst_w, dst_h }
-         },
+         .renderArea = render_area,
          .clearValueCount = 0,
       };
 
@@ -4308,7 +4340,7 @@
                       dst, dst->vk_format,
                       src, src->vk_format,
                       0, NULL,
-                      &pRegions[i], filter)) {
+                      &pRegions[i], filter, true)) {
          continue;
       }
       unreachable("Unsupported blit operation");
@@ -4469,7 +4501,7 @@
                       dst, dst->vk_format,
                       src, src->vk_format,
                       0, NULL,
-                      &blit_region, VK_FILTER_NEAREST);
+                      &blit_region, VK_FILTER_NEAREST, true);
 }
 
 void
diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
index 35f9c61..a030b1c 100644
--- a/src/broadcom/vulkan/v3dv_pass.c
+++ b/src/broadcom/vulkan/v3dv_pass.c
@@ -255,10 +255,10 @@
    vk_free2(&device->alloc, pAllocator, pass);
 }
 
-void
-v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
-                             uint32_t subpass_idx,
-                             VkExtent2D *granularity)
+static void
+subpass_get_granularity(struct v3dv_render_pass *pass,
+                        uint32_t subpass_idx,
+                        VkExtent2D *granularity)
 {
    static const uint8_t tile_sizes[] = {
       64, 64,
@@ -321,8 +321,50 @@
 
    for (uint32_t i = 0; i < pass->subpass_count; i++) {
       VkExtent2D sg;
-      v3dv_subpass_get_granularity(pass, i, &sg);
+      subpass_get_granularity(pass, i, &sg);
       pGranularity->width = MIN2(pGranularity->width, sg.width);
       pGranularity->height = MIN2(pGranularity->height, sg.height);
    }
 }
+
+/* Checks whether the render area rectangle covers a region that is aligned to
+ * tile boundaries. This means that we are writing to all pixels covered by
+ * all tiles in that area (except for pixels on edge tiles that are outside
+ * the framebuffer dimensions).
+ *
+ * When the render area is aligned to tile boundaries we know we are writing
+ * valid data to all pixels in each tile and we can apply certain
+ * optimizations, like avoiding tile loads, since we know that none of the
+ * original pixel values in each tile for that area need to be preserved.
+ * We also use this to decide if we can use TLB clears, as these clear whole
+ * tiles so we can't use them if the render area is not aligned.
+ *
+ * Note that when an image is created it will possibly include padding blocks
+ * depending on its tiling layout. When the framebuffer dimensions are not
+ * aligned to tile boundaries then edge tiles are only partially covered by the
+ * framebuffer pixels, but tile stores still seem to write out full tiles,
+ * including the padded sections. This is important when the framebuffer
+ * is aliasing a smaller section of a larger image, as in that case the edge
+ * tiles of the framebuffer would overwrite valid pixels in the larger image.
+ * In that case, we can't flag the area as being aligned.
+ */
+bool
+v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
+                                  struct v3dv_framebuffer *fb,
+                                  struct v3dv_render_pass *pass,
+                                  uint32_t subpass_idx)
+{
+   assert(subpass_idx < pass->subpass_count);
+
+   VkExtent2D granularity;
+   subpass_get_granularity(pass, subpass_idx, &granularity);
+
+   return area->offset.x % granularity.width == 0 &&
+          area->offset.y % granularity.height == 0 &&
+         (area->extent.width % granularity.width == 0 ||
+          (fb->has_edge_padding &&
+           area->offset.x + area->extent.width >= fb->width)) &&
+         (area->extent.height % granularity.height == 0 ||
+          (fb->has_edge_padding &&
+           area->offset.y + area->extent.height >= fb->height));
+}
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 0528111..caa7ea6 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -257,6 +257,7 @@
 struct v3dv_meta_blit_pipeline {
    VkPipeline pipeline;
    VkRenderPass pass;
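+   /* Same as 'pass', but with a DONT_CARE load op, used when the blit region
+    * is tile-aligned and the tile load can be skipped.
+    */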
+   VkRenderPass pass_no_load;
    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
 };
 
@@ -555,15 +556,22 @@
    struct v3dv_subpass_attachment *subpass_attachments;
 };
 
-void v3dv_subpass_get_granularity(struct v3dv_render_pass *pass,
-                                  uint32_t subpass_idx,
-                                  VkExtent2D *granularity);
-
 struct v3dv_framebuffer {
    uint32_t width;
    uint32_t height;
    uint32_t layers;
 
+   /* Typically, edge tiles in the framebuffer have padding depending on the
+    * underlying tiling layout. One consequence of this is that when the
+    * framebuffer dimensions are not aligned to tile boundaries, tile stores
+    * would still write full tiles on the edges and write to the padded area.
+    * If the framebuffer is aliasing a smaller region of a larger image,
+    * however, we need to be careful, as we won't have padding on the edge
+    * tiles (which typically means that we need to load the tile buffer
+    * before we store).
+    */
+   bool has_edge_padding;
+
    uint32_t attachment_count;
    uint32_t color_attachment_count;
    struct v3dv_image_view *attachments[0];
@@ -590,6 +598,10 @@
                                                 const struct v3dv_subpass *subpass,
                                                 uint8_t *max_bpp, bool *msaa);
 
+bool v3dv_subpass_area_is_tile_aligned(const VkRect2D *area,
+                                       struct v3dv_framebuffer *fb,
+                                       struct v3dv_render_pass *pass,
+                                       uint32_t subpass_idx);
 struct v3dv_cmd_pool {
    VkAllocationCallbacks alloc;
    struct list_head cmd_buffers;