v3dv: add support for timestamp queries

V3D doesn't provide any means to acquire timestamps from the GPU, so we
have to implement these on the CPU: a CPU job waits for all previously
queued work to complete and then reads CLOCK_MONOTONIC.

v2: enable timestampComputeAndGraphics and set timestampPeriod (Piñeiro)

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7373>
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 61a3b5c..2081207 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -5021,7 +5021,30 @@
                        VkQueryPool queryPool,
                        uint32_t query)
 {
-   unreachable("Timestamp queries are not supported.");
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+   V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
+
+   /* If this is called inside a render pass we need to finish the current
+    * job here...
+    */
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
+
+   struct v3dv_job *job =
+      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
+                                     V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
+                                     cmd_buffer, -1);
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   job->cpu.query_timestamp.pool = query_pool;
+   job->cpu.query_timestamp.query = query;
+
+   list_addtail(&job->list_link, &cmd_buffer->jobs);
+   cmd_buffer->state.job = NULL;
+
+   /* ...and resume the subpass after the timestamp */
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
 }
 
 static void
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 55755f5..a5f002b 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -814,6 +814,11 @@
    const VkSampleCountFlags supported_sample_counts =
       VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
 
+   struct timespec clock_res;
+   clock_getres(CLOCK_MONOTONIC, &clock_res);
+   const float timestamp_period =
+      clock_res.tv_sec * 1000000000.0f + clock_res.tv_nsec;
+
    /* FIXME: this will probably require an in-depth review */
    VkPhysicalDeviceLimits limits = {
       .maxImageDimension1D                      = 4096,
@@ -923,8 +928,8 @@
       .sampledImageStencilSampleCounts          = supported_sample_counts,
       .storageImageSampleCounts                 = VK_SAMPLE_COUNT_1_BIT,
       .maxSampleMaskWords                       = 1,
-      .timestampComputeAndGraphics              = false,
-      .timestampPeriod                          = 0.0f,
+      .timestampComputeAndGraphics              = true,
+      .timestampPeriod                          = timestamp_period,
       .maxClipDistances                         = 8,
       .maxCullDistances                         = 0,
       .maxCombinedClipAndCullDistances          = 8,
@@ -990,7 +995,7 @@
                  VK_QUEUE_COMPUTE_BIT |
                  VK_QUEUE_TRANSFER_BIT,
    .queueCount = 1,
-   .timestampValidBits = 0, /* FIXME */
+   .timestampValidBits = 64,
    .minImageTransferGranularity = { 1, 1, 1 },
 };
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 0a916cb..2017941 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -744,6 +744,7 @@
    V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
    V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
    V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
+   V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
 };
 
 struct v3dv_reset_query_cpu_job_info {
@@ -810,6 +811,11 @@
    bool needs_wg_uniform_rewrite;
 };
 
+struct v3dv_timestamp_query_cpu_job_info {
+   struct v3dv_query_pool *pool; /* Pool that contains the query to write */
+   uint32_t query;               /* Index of that query in pool->queries */
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -881,6 +887,7 @@
       struct v3dv_clear_attachments_cpu_job_info    clear_attachments;
       struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
       struct v3dv_csd_indirect_cpu_job_info         csd_indirect;
+      struct v3dv_timestamp_query_cpu_job_info      query_timestamp;
    } cpu;
 
    /* Job specs for TFU jobs */
@@ -1084,10 +1091,14 @@
 
 struct v3dv_query {
    bool maybe_available;
-   struct v3dv_bo *bo;
+   union { /* Valid member is selected by the owning pool's query type */
+      struct v3dv_bo *bo; /* Used by GPU queries (occlusion) */
+      uint64_t value; /* Used by CPU queries (timestamp) */
+   };
 };
 
 struct v3dv_query_pool {
+   VkQueryType query_type; /* Set from pCreateInfo->queryType at creation */
    uint32_t query_count;
    struct v3dv_query *queries;
 };
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 7c2ce10..7224de4 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -31,12 +31,12 @@
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
 
-   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION);
+   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
+          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
    assert(pCreateInfo->queryCount > 0);
 
-
    /* FIXME: the hw allows us to allocate up to 16 queries in a single block
-    *        so we should try to use that.
+    *        for occlusion queries so we should try to use that.
     */
    struct v3dv_query_pool *pool =
       vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
@@ -44,6 +44,7 @@
    if (pool == NULL)
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   pool->query_type = pCreateInfo->queryType;
    pool->query_count = pCreateInfo->queryCount;
 
    VkResult result;
@@ -59,16 +60,24 @@
    uint32_t i;
    for (i = 0; i < pool->query_count; i++) {
       pool->queries[i].maybe_available = false;
-      pool->queries[i].bo = v3dv_bo_alloc(device, 4096, "query", true);
-      if (!pool->queries[i].bo) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-         goto fail_alloc_bo;
-      }
-
-      /* For occlusion queries we only need a 4-byte counter */
-      if (!v3dv_bo_map(device, pool->queries[i].bo, 4)) {
-         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-         goto fail_alloc_bo;
+      switch (pool->query_type) {
+      case VK_QUERY_TYPE_OCCLUSION:
+         pool->queries[i].bo = v3dv_bo_alloc(device, 4096, "query", true);
+         if (!pool->queries[i].bo) {
+            result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+            goto fail_alloc_bo;
+         }
+         /* For occlusion queries we only need a 4-byte counter */
+         if (!v3dv_bo_map(device, pool->queries[i].bo, 4)) {
+            result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+            goto fail_alloc_bo;
+         }
+         break;
+      case VK_QUERY_TYPE_TIMESTAMP:
+         pool->queries[i].value = 0;
+         break;
+      default:
+         unreachable("Unsupported query type");
       }
    }
 
@@ -98,21 +107,105 @@
    if (!pool)
       return;
 
-   for (uint32_t i = 0; i < pool->query_count; i++)
-      v3dv_bo_free(device, pool->queries[i].bo);
+   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+      for (uint32_t i = 0; i < pool->query_count; i++)
+         v3dv_bo_free(device, pool->queries[i].bo);
+   }
+
    vk_free2(&device->alloc, pAllocator, pool->queries);
    vk_free2(&device->alloc, pAllocator, pool);
 }
 
 static void
-write_query_result(void *dst, uint32_t idx, bool do_64bit, uint32_t value)
+write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
 {
    if (do_64bit) {
       uint64_t *dst64 = (uint64_t *) dst;
       dst64[idx] = value;
    } else {
       uint32_t *dst32 = (uint32_t *) dst;
-      dst32[idx] = value;
+      dst32[idx] = (uint32_t) value; /* Keeps only the low 32 bits */
+   }
+}
+
+/* Reads occlusion query 'query' from 'pool'. Always sets *available. The
+ * uint64_t return cannot carry a VkResult, so a failed wait is reported to
+ * vk_error() and surfaces to the caller as "unavailable" with a value of 0.
+ */
+static uint64_t
+get_occlusion_query_result(struct v3dv_device *device,
+                           struct v3dv_query_pool *pool,
+                           uint32_t query,
+                           bool do_wait,
+                           bool *available)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+   struct v3dv_query *q = &pool->queries[query];
+   assert(q->bo && q->bo->map);
+
+   *available = false;
+   if (do_wait) {
+      /* Vulkan 1.0: with VK_QUERY_RESULT_WAIT_BIT, a query that never becomes
+       * available (e.g. it was not issued since the last reset) may produce
+       * VK_ERROR_DEVICE_LOST.
+       */
+      if (!q->maybe_available ||
+          !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull)) {
+         (void) vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+         return 0;
+      }
+      *available = true;
+   } else {
+      *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0);
+   }
+
+   return (uint64_t) *((uint32_t *) q->bo->map);
+}
+
+/* Reads CPU timestamp query 'query' from 'pool'. Always sets *available. The
+ * uint64_t return cannot carry a VkResult, so waiting on a query that was
+ * never written is reported to vk_error() and surfaces to the caller as
+ * "unavailable" with a value of 0.
+ */
+static uint64_t
+get_timestamp_query_result(struct v3dv_device *device,
+                           struct v3dv_query_pool *pool,
+                           uint32_t query,
+                           bool do_wait,
+                           bool *available)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+   struct v3dv_query *q = &pool->queries[query];
+
+   /* The value is stored in the query itself, so there is no BO to wait on */
+   *available = q->maybe_available;
+
+   /* Vulkan 1.0: waiting (VK_QUERY_RESULT_WAIT_BIT) on a query that never
+    * becomes available may produce VK_ERROR_DEVICE_LOST.
+    */
+   if (do_wait && !q->maybe_available) {
+      (void) vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+      return 0;
+   }
+
+   return q->value;
+}
+
+static uint64_t /* Value produced by the query-type-specific helper */
+get_query_result(struct v3dv_device *device,
+                 struct v3dv_query_pool *pool,
+                 uint32_t query,   /* Index into pool->queries */
+                 bool do_wait,     /* Forwarded unchanged to the helper */
+                 bool *available)  /* Out: forwarded unchanged to the helper */
+{
+   switch (pool->query_type) { /* Only occlusion and timestamp are handled */
+   case VK_QUERY_TYPE_OCCLUSION:
+      return get_occlusion_query_result(device, pool, query, do_wait, available);
+   case VK_QUERY_TYPE_TIMESTAMP:
+      return get_timestamp_query_result(device, pool, query, do_wait, available);
+   default:
+      unreachable("Unsupported query type");
    }
 }
 
@@ -135,30 +228,8 @@
 
    VkResult result = VK_SUCCESS;
    for (uint32_t i = first; i < first + count; i++) {
-      assert(pool->queries[i].bo && pool->queries[i].bo->map);
-      struct v3dv_bo *bo = pool->queries[i].bo;
-      const uint32_t *counter = (const uint32_t *) bo->map;
-
       bool available;
-      if (do_wait) {
-         /* From the Vulkan 1.0 spec:
-          *
-          *    "If VK_QUERY_RESULT_WAIT_BIT is set, (...) If the query does not
-          *     become available in a finite amount of time (e.g. due to not
-          *     issuing a query since the last reset), a VK_ERROR_DEVICE_LOST
-          *     error may occur."
-          */
-         if (!pool->queries[i].maybe_available)
-            return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
-
-         if (!v3dv_bo_wait(device, bo, 0xffffffffffffffffull))
-            return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
-
-         available = true;
-      } else {
-         available = pool->queries[i].maybe_available &&
-                     v3dv_bo_wait(device, bo, 0);
-      }
+      uint64_t value = get_query_result(device, pool, i, do_wait, &available);
 
       /**
        * From the Vulkan 1.0 spec:
@@ -174,7 +245,7 @@
 
       const bool write_result = available || do_partial;
       if (write_result)
-         write_query_result(data, slot, do_64bit, *counter);
+         write_query_result(data, slot, do_64bit, value);
       slot++;
 
       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 0899e74..722e6b4 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -154,22 +154,37 @@
 handle_reset_query_cpu_job(struct v3dv_job *job)
 {
    /* We are about to reset query counters so we need to make sure that
-    * The GPU is not using them.
+    * the GPU is not using them. The exception is timestamp queries, since
+    * we handle those on the CPU.
     *
     * FIXME: we could avoid blocking the main thread for this if we use
     *        submission thread.
     */
-   VkResult result = gpu_queue_wait_idle(&job->device->queue);
-   if (result != VK_SUCCESS)
-      return result;
-
    struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
+   assert(info->pool);
+
+   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+      VkResult result = gpu_queue_wait_idle(&job->device->queue);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
    for (uint32_t i = info->first; i < info->first + info->count; i++) {
       assert(i < info->pool->query_count);
       struct v3dv_query *query = &info->pool->queries[i];
       query->maybe_available = false;
-      uint32_t *counter = (uint32_t *) query->bo->map;
-      *counter = 0;
+      switch (info->pool->query_type) {
+      case VK_QUERY_TYPE_OCCLUSION: {
+         uint32_t *counter = (uint32_t *) query->bo->map;
+         *counter = 0;
+         break;
+      }
+      case VK_QUERY_TYPE_TIMESTAMP:
+         query->value = 0;
+         break;
+      default:
+         unreachable("Unsupported query type");
+      }
    }
 
    return VK_SUCCESS;
@@ -420,6 +435,26 @@
 }
 
 static VkResult
+handle_timestamp_query_cpu_job(struct v3dv_job *job)
+{
+   /* Waits for previously queued work, then stamps the query with CPU time */
+   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
+   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
+   assert(info->query < info->pool->query_count);
+   VkResult result =
+      v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
+   if (result != VK_SUCCESS)
+      return result; /* Wait failed: leave the query untouched */
+
+   struct timespec t;
+   clock_gettime(CLOCK_MONOTONIC, &t);
+   struct v3dv_query *query = &info->pool->queries[info->query];
+   query->maybe_available = true;
+   query->value = t.tv_sec * 1000000000ull + t.tv_nsec; /* nanoseconds */
+   return VK_SUCCESS;
+}
+
+static VkResult
 handle_csd_job(struct v3dv_queue *queue,
                struct v3dv_job *job,
                bool do_sem_wait);
@@ -705,6 +740,8 @@
       return handle_copy_buffer_to_image_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
       return handle_csd_indirect_cpu_job(queue, job, do_sem_wait);
+   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
+      return handle_timestamp_query_cpu_job(job);
    default:
       unreachable("Unhandled job type");
    }