v3dv: implement compute dispatch

For now this only implements regular dispatches, not indirect dispatches.
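
This adds a new GPU job type for compute (CSD): dispatch records the
workgroup counts and the CSD submit config on the job, compiles the
compute shader variant against the current descriptor state, allocates
a BO for the shader's shared variables when needed, and submits to the
kernel through DRM_IOCTL_V3D_SUBMIT_CSD.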

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 2255ab3..4a2d23a 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -196,6 +196,20 @@
 }
 
 static void
+job_destroy_gpu_csd_resources(struct v3dv_job *job)
+{
+   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
+   assert(job->cmd_buffer);
+
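+   /* CSD jobs only use the indirect CL (for uniforms); they also own the
+    * BO set and, if the shader uses shared variables, a shared memory BO.
+    */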
+   v3dv_cl_destroy(&job->indirect);
+
+   _mesa_set_destroy(job->bos, NULL);
+
+   if (job->csd.shared_memory)
+      v3dv_bo_free(job->device, job->csd.shared_memory);
+}
+
+static void
 job_destroy_cpu_wait_events_resources(struct v3dv_job *job)
 {
    assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
@@ -220,6 +234,9 @@
       case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
          job_destroy_gpu_cl_resources(job);
          break;
+      case V3DV_JOB_TYPE_GPU_CSD:
+         job_destroy_gpu_csd_resources(job);
+         break;
       case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
          job_destroy_cpu_wait_events_resources(job);
          break;
@@ -716,19 +733,24 @@
    list_inithead(&job->list_link);
 
    if (type == V3DV_JOB_TYPE_GPU_CL ||
-       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
+       type == V3DV_JOB_TYPE_GPU_CSD) {
       job->bos =
          _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
       job->bo_count = 0;
 
-      v3dv_cl_init(job, &job->bcl);
-      v3dv_cl_init(job, &job->rcl);
       v3dv_cl_init(job, &job->indirect);
 
       if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
          job->always_flush = true;
    }
 
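+   /* Only CL jobs have binning and render command lists; CSD jobs just
+    * need the indirect CL initialized above for their uniform stream.
+    */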
+   if (type == V3DV_JOB_TYPE_GPU_CL ||
+       type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
+      v3dv_cl_init(job, &job->bcl);
+      v3dv_cl_init(job, &job->rcl);
+   }
+
    if (cmd_buffer) {
       /* Flag all state as dirty. Generally, we need to re-emit state for each
        * new job.
@@ -784,6 +806,28 @@
    return job;
 }
 
+static struct v3dv_job *
+cmd_buffer_start_compute_job(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* Compute jobs can only happen outside a render pass */
+   assert(!cmd_buffer->state.job);
+   assert(!cmd_buffer->state.pass);
+
+   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->alloc,
+                                    sizeof(struct v3dv_job), 8,
+                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   cmd_buffer->state.job = job;
+
+   if (!job) {
+      v3dv_flag_oom(cmd_buffer, NULL);
+      return NULL;
+   }
+
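+   /* Compute jobs are not part of a render pass, so pass -1 as the
+    * subpass index.
+    */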
+   v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
+
+   return job;
+}
+
 static VkResult
 cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer,
                  VkCommandBufferResetFlags flags)
@@ -2677,6 +2721,33 @@
    p_stage->current_variant = variant;
 }
 
+static void
+update_cs_variant(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   struct v3dv_shader_variant *variant;
+   struct v3dv_pipeline_stage *p_stage = cmd_buffer->state.pipeline->cs;
+   struct v3d_key local_key;
+
+   /* We start with a copy of the original pipeline key */
+   memcpy(&local_key, &p_stage->key.base, sizeof(struct v3d_key));
+
+   cmd_buffer_populate_v3d_key(&local_key, cmd_buffer,
+                               VK_PIPELINE_BIND_POINT_COMPUTE);
+
+   VkResult result;
+   variant = v3dv_get_shader_variant(p_stage, &local_key,
+                                     sizeof(struct v3d_key),
+                                     &cmd_buffer->device->alloc,
+                                     &result);
+   /* At this point we are not creating a vulkan object to return to the
+    * API user, so we can't really return an OOM error back.
+    */
+   assert(variant);
+   assert(result == VK_SUCCESS);
+
+   p_stage->current_variant = variant;
+}
+
 /*
  * Some updates to the cmd buffer also require updates to the shader being
  * compiled at the pipeline. The poster child here is textures, as the compiler
@@ -2690,8 +2761,13 @@
 {
    assert(cmd_buffer->state.pipeline);
 
-   update_fs_variant(cmd_buffer);
-   update_vs_variant(cmd_buffer);
+   if (v3dv_pipeline_get_binding_point(cmd_buffer->state.pipeline) ==
+       VK_PIPELINE_BIND_POINT_GRAPHICS) {
+      update_fs_variant(cmd_buffer);
+      update_vs_variant(cmd_buffer);
+   } else {
+      update_cs_variant(cmd_buffer);
+   }
 }
 
 static void
@@ -4471,13 +4547,120 @@
    unreachable("Timestamp queries are not supported.");
 }
 
+static void
+cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   assert(cmd_buffer->state.pipeline);
+   assert(cmd_buffer->state.pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
+
+   /* We may need to compile shader variants based on bound textures */
+   uint32_t *dirty = &cmd_buffer->state.dirty;
+   if (*dirty & (V3DV_CMD_DIRTY_PIPELINE |
+                 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS)) {
+      update_pipeline_variants(cmd_buffer);
+   }
+
+   *dirty &= ~(V3DV_CMD_DIRTY_PIPELINE |
+               V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
+}
+
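+/* CSD (Compute Shader Dispatch) CFG word layout for the submit interface,
+ * matching the definitions used by the v3d gallium driver.
+ */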
+#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
+#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
+/* Allow this dispatch to start while the last one is still running. */
+#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
+/* Maximum supergroup ID.  6 bits. */
+#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
+/* Batches per supergroup minus 1.  8 bits. */
+#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
+/* Workgroups per supergroup, 0 means 16 */
+#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
+#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0
+
+#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
+#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
+#define V3D_CSD_CFG5_THREADING (1 << 0)
+
+static void
+cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
+                    uint32_t group_count_x,
+                    uint32_t group_count_y,
+                    uint32_t group_count_z)
+{
+   if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
+      return;
+
+   struct v3dv_job *job = cmd_buffer_start_compute_job(cmd_buffer);
+   if (!job)
+      return;
+
+   struct drm_v3d_submit_csd *submit = &job->csd.submit;
+
+   job->csd.workgroup_count[0] = group_count_x;
+   job->csd.workgroup_count[1] = group_count_y;
+   job->csd.workgroup_count[2] = group_count_z;
+
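+   /* cfg[0..2]: number of workgroups in each dimension. The workgroup
+    * offsets stay 0, since vkCmdDispatch has no base group.
+    */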
+   submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+   submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+   submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   assert(pipeline->cs && pipeline->cs->nir);
+
+   const struct nir_shader *cs = pipeline->cs->nir;
+
+   const uint32_t wgs_per_sg = 1; /* FIXME */
+   const uint32_t wg_size = cs->info.cs.local_size[0] *
+                            cs->info.cs.local_size[1] *
+                            cs->info.cs.local_size[2];
+   submit->cfg[3] |= wgs_per_sg << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
+   submit->cfg[3] |= ((DIV_ROUND_UP(wgs_per_sg * wg_size, 16) - 1) <<
+                     V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT);
+   submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
+
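+   /* cfg[4]: total number of batches to run, minus 1. A batch is 16
+    * shader invocations, so e.g. a 64-invocation workgroup dispatched
+    * 8 times gives DIV_ROUND_UP(64, 16) * 8 - 1 = 31.
+    */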
+   uint32_t batches_per_wg = DIV_ROUND_UP(wg_size, 16);
+   submit->cfg[4] = batches_per_wg *
+                    (group_count_x * group_count_y * group_count_z) - 1;
+   assert(submit->cfg[4] != ~0);
+
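+   /* cfg[5]: shader code address, plus threading/segment/NaN flags */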
+   assert(pipeline->cs->current_variant &&
+          pipeline->cs->current_variant->assembly_bo);
+   const struct v3dv_shader_variant *variant = pipeline->cs->current_variant;
+   submit->cfg[5] = variant->assembly_bo->offset;
+   submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+   if (variant->prog_data.base->single_seg)
+      submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
+   if (variant->prog_data.base->threads == 4)
+      submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+
+   if (variant->prog_data.cs->shared_size > 0) {
+      job->csd.shared_memory =
+         v3dv_bo_alloc(cmd_buffer->device,
+                       variant->prog_data.cs->shared_size * wgs_per_sg,
+                       "shared_vars", true);
+      if (!job->csd.shared_memory) {
+         v3dv_flag_oom(cmd_buffer, NULL);
+         return;
+      }
+   }
+
+   v3dv_job_add_bo(job, variant->assembly_bo);
+
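+   /* cfg[6]: address of the uniform stream for this dispatch */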
+   struct v3dv_cl_reloc uniforms =
+      v3dv_write_uniforms(cmd_buffer, pipeline->cs);
+   submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
+
+   v3dv_job_add_bo(job, uniforms.bo);
+
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+   cmd_buffer->state.job = NULL;
+}
+
 void
 v3dv_CmdDispatch(VkCommandBuffer commandBuffer,
                  uint32_t groupCountX,
                  uint32_t groupCountY,
                  uint32_t groupCountZ)
 {
-   unreachable("vkCmdDispatch not implemented.");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_emit_pre_dispatch(cmd_buffer);
+   cmd_buffer_dispatch(cmd_buffer, groupCountX, groupCountY, groupCountZ);
 }
 
 void
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 8164b7d..daa8a1d 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -798,8 +798,15 @@
       struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
    } cpu;
 
-   /* Job spects for TFU jobs */
+   /* Job specs for TFU jobs */
    struct drm_v3d_submit_tfu tfu;
+
+   /* Job specs for CSD jobs */
+   struct {
+      struct v3dv_bo *shared_memory;
+      uint32_t workgroup_count[3];
+      struct drm_v3d_submit_csd submit;
+   } csd;
 };
 
 void v3dv_job_init(struct v3dv_job *job,
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 1200c5f..cadb906 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -579,6 +579,47 @@
 }
 
 static VkResult
+handle_csd_job(struct v3dv_queue *queue,
+               struct v3dv_job *job,
+               bool do_wait)
+{
+   struct v3dv_device *device = queue->device;
+
+   struct drm_v3d_submit_csd *submit = &job->csd.submit;
+
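+   /* Gather the handles of all BOs referenced by the job so the kernel
+    * can look them up for this dispatch.
+    */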
+   submit->bo_handle_count = job->bo_count;
+   uint32_t *bo_handles =
+      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
+   if (!bo_handles)
+      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+   uint32_t bo_idx = 0;
+   set_foreach(job->bos, entry) {
+      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
+      bo_handles[bo_idx++] = bo->handle;
+   }
+   assert(bo_idx == submit->bo_handle_count);
+   submit->bo_handles = (uintptr_t)(void *)bo_handles;
+
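+   /* Chain this job after the last submitted one through the device's
+    * shared syncobj when a wait is required.
+    */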
+   mtx_lock(&queue->device->mutex);
+   submit->in_sync = do_wait ? device->last_job_sync : 0;
+   submit->out_sync = device->last_job_sync;
+   int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CSD, submit);
+   mtx_unlock(&queue->device->mutex);
+
+   static bool warned = false;
+   if (ret && !warned) {
+      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
+              strerror(errno));
+      warned = true;
+   }
+
+   free(bo_handles);
+
+   if (ret)
+      return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
 queue_submit_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
                  bool do_wait,
@@ -591,6 +632,8 @@
       return handle_cl_job(queue, job, do_wait);
    case V3DV_JOB_TYPE_GPU_TFU:
       return handle_tfu_job(queue, job, do_wait);
+   case V3DV_JOB_TYPE_GPU_CSD:
+      return handle_csd_job(queue, job, do_wait);
    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
       return handle_reset_query_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_END_QUERY:
diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
index 475281d..70472e6 100644
--- a/src/broadcom/vulkan/v3dv_uniforms.c
+++ b/src/broadcom/vulkan/v3dv_uniforms.c
@@ -327,6 +327,18 @@
                                          data));
          break;
 
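+      /* 'data' selects the x/y/z component of the dispatch size */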
+      case QUNIFORM_NUM_WORK_GROUPS:
+         assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
+         assert(job->csd.workgroup_count[data] > 0);
+         cl_aligned_u32(&uniforms, job->csd.workgroup_count[data]);
+         break;
+
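+      /* Emit a relocation to the BO backing the shader's shared variables */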
+      case QUNIFORM_SHARED_OFFSET:
+         assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
+         assert(job->csd.shared_memory);
+         cl_aligned_reloc(&job->indirect, &uniforms, job->csd.shared_memory, 0);
+         break;
+
       default:
          unreachable("unsupported quniform_contents uniform type\n");
       }