v3dv: don't cache subpass color clear pipelines

Subpass color clear pipelines are those used to emit partial attachment
clears as draw calls inside the render pass currently bound by the
application in the command buffer, leading to a huge performance improvement
compared to the case where we emit them in their own render pass.

Unfortunately, because the pipeline references the render pass
object in which it is used and the render pass object is owned by the
application (and can be destroyed at any point), we can't cache these
pipelines (unless we implement a refcounting mechanism or other
similar strategy).

Performance impact looks negligible based on experiments with vkQuake3,
probably because the underlying pipeline cache is preventing the
redundant shader recompiles.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 00a1747..fdad296 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1249,6 +1249,17 @@
    init_meta_blit_resources(device);
 }
 
+void
+v3dv_meta_color_clear_pipeline_destroy(VkDevice _device,
+                                       struct v3dv_meta_color_clear_pipeline *p,
+                                       VkAllocationCallbacks *alloc)
+{
+   v3dv_DestroyPipeline(_device, p->pipeline, alloc);
+   if (p->cached)
+      v3dv_DestroyRenderPass(_device, p->pass, alloc);
+   vk_free(alloc, p);
+}
+
 static void
 destroy_device_meta(struct v3dv_device *device)
 {
@@ -1258,10 +1269,8 @@
 
    hash_table_foreach(device->meta.color_clear.cache, entry) {
       struct v3dv_meta_color_clear_pipeline *item = entry->data;
-      v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
-      if (item->free_render_pass)
-         v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
-      vk_free(&device->alloc, item);
+      v3dv_meta_color_clear_pipeline_destroy(v3dv_device_to_handle(device),
+                                             item, &device->alloc);
    }
    _mesa_hash_table_destroy(device->meta.color_clear.cache, NULL);
 
diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
index 26e7e8f..bb01b6d 100644
--- a/src/broadcom/vulkan/v3dv_meta_clear.c
+++ b/src/broadcom/vulkan/v3dv_meta_clear.c
@@ -529,15 +529,33 @@
    if (result != VK_SUCCESS)
       return result;
 
-   const uint64_t key =
-      get_color_clear_pipeline_cache_key(rt_idx, format, samples, components);
-   mtx_lock(&device->meta.mtx);
-   struct hash_entry *entry =
-      _mesa_hash_table_search(device->meta.color_clear.cache, &key);
-   if (entry) {
-      mtx_unlock(&device->meta.mtx);
-      *pipeline = entry->data;
-      return VK_SUCCESS;
+   /* If pass != NULL it means that we are emitting the clear as a draw call
+    * in the current pass bound by the application. In that case, we can't
+    * cache the pipeline, since it will be referencing that pass and the
+    * application could be destroying it at any point. Hopefully, the perf
+    * impact is not too big since we still have the device pipeline cache
+    * around and we won't end up re-compiling the clear shader.
+    *
+    * FIXME: alternatively, we could refcount (or maybe clone) the render pass
+    * provided by the application and include it in the pipeline key setup
+    * to make caching safe in this scenario, however, based on tests with
+    * vkQuake3, the fact that we are not caching here doesn't seem to have
+    * any significant impact in performance, so it might not be worth it.
+    */
+   const bool can_cache_pipeline = (pass == NULL);
+
+   uint64_t key;
+   if (can_cache_pipeline) {
+      key =
+         get_color_clear_pipeline_cache_key(rt_idx, format, samples, components);
+      mtx_lock(&device->meta.mtx);
+      struct hash_entry *entry =
+         _mesa_hash_table_search(device->meta.color_clear.cache, &key);
+      if (entry) {
+         mtx_unlock(&device->meta.mtx);
+         *pipeline = entry->data;
+         return VK_SUCCESS;
+      }
    }
 
    *pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(**pipeline), 8,
@@ -557,7 +575,6 @@
       if (result != VK_SUCCESS)
          goto fail;
 
-      (*pipeline)->free_render_pass = true;
       pass = v3dv_render_pass_from_handle((*pipeline)->pass);
    } else {
       (*pipeline)->pass = v3dv_render_pass_to_handle(pass);
@@ -575,19 +592,24 @@
    if (result != VK_SUCCESS)
       goto fail;
 
-   (*pipeline)->key = key;
-   _mesa_hash_table_insert(device->meta.color_clear.cache,
-                           &(*pipeline)->key, *pipeline);
+   if (can_cache_pipeline) {
+      (*pipeline)->key = key;
+      (*pipeline)->cached = true;
+      _mesa_hash_table_insert(device->meta.color_clear.cache,
+                              &(*pipeline)->key, *pipeline);
 
-   mtx_unlock(&device->meta.mtx);
+      mtx_unlock(&device->meta.mtx);
+   }
+
    return VK_SUCCESS;
 
 fail:
-   mtx_unlock(&device->meta.mtx);
+   if (can_cache_pipeline)
+      mtx_unlock(&device->meta.mtx);
 
    VkDevice _device = v3dv_device_to_handle(device);
    if (*pipeline) {
-      if ((*pipeline)->free_render_pass)
+      if ((*pipeline)->cached)
          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
       if ((*pipeline)->pipeline)
          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
@@ -740,6 +762,12 @@
    }
    assert(pipeline && pipeline->pipeline && pipeline->pass);
 
+   /* Since we are not emitting the draw call in the current subpass we should
+    * be caching the clear pipeline and we don't have to take care of destorying
+    * it below.
+    */
+   assert(pipeline->cached);
+
    /* Store command buffer state for the current subpass before we interrupt
     * it to emit the color clear pass and then finish the job for the
     * interrupted subpass.
@@ -1002,6 +1030,15 @@
       v3dv_CmdDraw(cmd_buffer_handle, 4, 1, 0, 0);
    }
 
+   /* Subpass pipelines can't be cached because they include a reference to the
+    * render pass currently bound by the application, which means that we need
+    * to destroy them manually here.
+    */
+   assert(!pipeline->cached);
+   v3dv_cmd_buffer_add_private_obj(
+      cmd_buffer, (uintptr_t)pipeline,
+      (v3dv_cmd_buffer_private_obj_destroy_cb) v3dv_meta_color_clear_pipeline_destroy);
+
    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dynamic_states, false);
 }
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index aea3789..218c1ad 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -233,7 +233,7 @@
 struct v3dv_meta_color_clear_pipeline {
    VkPipeline pipeline;
    VkRenderPass pass;
-   bool free_render_pass;
+   bool cached;
    uint64_t key;
 };
 
@@ -1859,6 +1859,11 @@
 void v3dv_shader_module_internal_init(struct v3dv_shader_module *module,
                                       nir_shader *nir);
 
+void
+v3dv_meta_color_clear_pipeline_destroy(VkDevice _device,
+                                       struct v3dv_meta_color_clear_pipeline *p,
+                                       VkAllocationCallbacks *alloc);
+
 #define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType)   \
                                                         \
    static inline struct __v3dv_type *                    \