v3dv: add a CPU path for buffer to image copies

The blit shader path for buffer to image copies is pretty bad,
since it needs to produce a tiled image from the linear buffer
prior to emitting the blit copy.

This patch adds a new preferential path where we implement the
copy using the CPU, similar to what the GL driver does for
texture uploads. This makes vkQuake2 at least 4x faster when
dynamic lights are enabled (which triggers dynamic texture
updates).

We also tested a GPU path where we use a shader that takes the
linear buffer as a UBO and copies directly from it. This also
shows a clear performance gain, but it is still slower than the
CPU implementation.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index 56c4fdb..8664afa 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -2537,6 +2537,72 @@
    return true;
 }
 
+/**
+ * Records a CPU job that copies a buffer region into a tiled color image.
+ * Returns true if the implementation supports the requested operation (even if
+ * it failed to process it, for example, due to an out-of-memory error).
+ */
+static bool
+copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
+                         struct v3dv_image *image,
+                         struct v3dv_buffer *buffer,
+                         const VkBufferImageCopy *region)
+{
+   /* FIXME: depth/stencil and compressed formats are not handled here */
+   if (vk_format_is_depth_or_stencil(image->vk_format) ||
+       vk_format_is_compressed(image->vk_format))
+      return false;
+
+   if (image->tiling == VK_IMAGE_TILING_LINEAR)
+      return false;
+
+   /* A zero row length / image height means the data is tightly packed */
+   const uint32_t buffer_width = region->bufferRowLength ?
+      region->bufferRowLength : region->imageExtent.width;
+   const uint32_t buffer_height = region->bufferImageHeight ?
+      region->bufferImageHeight : region->imageExtent.height;
+
+   const uint32_t buffer_stride = buffer_width * image->cpp;
+   const uint32_t buffer_layer_stride = buffer_stride * buffer_height;
+
+   /* 3D images are addressed by depth slice, so the range comes from
+    * the region's Z offset and depth rather than from its array layers.
+    */
+   uint32_t base_layer, num_layers;
+   if (image->type != VK_IMAGE_TYPE_3D) {
+      base_layer = region->imageSubresource.baseArrayLayer;
+      num_layers = region->imageSubresource.layerCount;
+   } else {
+      base_layer = region->imageOffset.z;
+      num_layers = region->imageExtent.depth;
+   }
+   assert(num_layers > 0);
+
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                     V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
+                                     cmd_buffer, -1);
+   if (!job)
+      return true;
+
+   struct v3dv_copy_buffer_to_image_cpu_job_info *info =
+      &job->cpu.copy_buffer_to_image;
+   info->image = image;
+   info->buffer = buffer;
+   info->buffer_stride = buffer_stride;
+   info->buffer_layer_stride = buffer_layer_stride;
+   info->buffer_offset = region->bufferOffset;
+   info->image_extent = region->imageExtent;
+   info->image_offset = region->imageOffset;
+   info->mip_level = region->imageSubresource.mipLevel;
+   info->base_layer = base_layer;
+   info->layer_count = num_layers;
+
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+
+   return true;
+}
+
 void
 v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
                           VkBuffer srcBuffer,
@@ -2554,6 +2620,8 @@
          continue;
       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[i]))
          continue;
+      if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[i]))
+         continue;
       if (copy_buffer_to_image_blit(cmd_buffer, image, buffer, &pRegions[i]))
          continue;
       unreachable("Unsupported buffer to image copy.");
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 3d0aee8..c94c599 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -656,6 +656,7 @@
    V3DV_JOB_TYPE_CPU_SET_EVENT,
    V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
    V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
+   V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
 };
 
 struct v3dv_reset_query_cpu_job_info {
@@ -700,6 +701,19 @@
    VkClearRect *rects;
 };
 
+struct v3dv_copy_buffer_to_image_cpu_job_info {
+   struct v3dv_image *image;        /* Destination image */
+   struct v3dv_buffer *buffer;      /* Source buffer */
+   uint32_t buffer_offset;          /* Region's bufferOffset, in bytes */
+   uint32_t buffer_stride;          /* Bytes per buffer row (width * cpp) */
+   uint32_t buffer_layer_stride;    /* Bytes per buffer layer (stride * rows) */
+   VkOffset3D image_offset;         /* Region's imageOffset */
+   VkExtent3D image_extent;         /* Region's imageExtent */
+   uint32_t mip_level;              /* Destination mip level */
+   uint32_t base_layer;             /* First destination layer */
+   uint32_t layer_count;            /* Layers to copy (depth for 3D images) */
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -757,12 +771,13 @@
 
    /* Job specs for CPU jobs */
    union {
-      struct v3dv_reset_query_cpu_job_info        query_reset;
-      struct v3dv_end_query_cpu_job_info          query_end;
-      struct v3dv_copy_query_results_cpu_job_info query_copy_results;
-      struct v3dv_event_set_cpu_job_info          event_set;
-      struct v3dv_event_wait_cpu_job_info         event_wait;
-      struct v3dv_clear_attachments_cpu_job_info  clear_attachments;
+      struct v3dv_reset_query_cpu_job_info          query_reset;
+      struct v3dv_end_query_cpu_job_info            query_end;
+      struct v3dv_copy_query_results_cpu_job_info   query_copy_results;
+      struct v3dv_event_set_cpu_job_info            event_set;
+      struct v3dv_event_wait_cpu_job_info           event_wait;
+      struct v3dv_clear_attachments_cpu_job_info    clear_attachments;
+      struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
    } cpu;
 
    /* Job spects for TFU jobs */
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 1d42132..267ce46 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -390,6 +390,53 @@
 }
 
 static VkResult
+handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
+   struct v3dv_device *device = job->device;
+   struct v3dv_copy_buffer_to_image_cpu_job_info *copy =
+      &job->cpu.copy_buffer_to_image;
+
+   /* Drain pending GPU work: it may still be using the BOs we access here */
+   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&device->queue));
+
+   /* Make sure both BOs are mapped: destination first, then source */
+   struct v3dv_bo *image_bo = copy->image->mem->bo;
+   if (!image_bo->map && !v3dv_bo_map(device, image_bo, image_bo->size))
+      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+   void *image_map = image_bo->map;
+
+   struct v3dv_bo *buffer_bo = copy->buffer->mem->bo;
+   if (!buffer_bo->map && !v3dv_bo_map(device, buffer_bo, buffer_bo->size))
+      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+   void *buffer_map = buffer_bo->map;
+
+   const struct v3d_resource_slice *slice =
+      &copy->image->slices[copy->mip_level];
+
+   const struct pipe_box box = {
+      copy->image_offset.x, copy->image_offset.y, copy->base_layer,
+      copy->image_extent.width, copy->image_extent.height, copy->layer_count,
+   };
+
+   /* Store the layers one by one, advancing a layer stride in the buffer */
+   for (uint32_t layer = 0; layer < copy->layer_count; layer++) {
+      const uint32_t dst_off =
+         v3dv_layer_offset(copy->image, copy->mip_level,
+                           copy->base_layer + layer);
+      const uint32_t src_off =
+         copy->buffer->mem_offset + copy->buffer_offset +
+         copy->buffer_layer_stride * layer;
+      v3d_store_tiled_image(image_map + dst_off, slice->stride,
+                            buffer_map + src_off, copy->buffer_stride,
+                            slice->tiling, copy->image->cpp,
+                            slice->padded_height, &box);
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
 process_semaphores_to_signal(struct v3dv_device *device,
                              uint32_t count, const VkSemaphore *sems)
 {
@@ -569,6 +616,8 @@
       return handle_set_event_cpu_job(job, wait_thread != NULL);
    case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
       return handle_wait_events_cpu_job(job, do_wait, wait_thread);
+   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
+      return handle_copy_buffer_to_image_cpu_job(job);
    default:
       unreachable("Unhandled job type");
    }