radv/winsys: Add binary syncobj ABI changes for timeline semaphores.

To facilitate cross-process timeline semaphores we have to deal with
the fact that the syncobj signal operation might be submitted a
small finite time after the wait operation.

For that we start using DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT during
the wait operation so we properly wait instead of returning an error.

Furthermore, to make this effective for syncobjs that get reused we
actually have to reset them after the wait. Otherwise the wait before
submit would get the previous fence instead of waiting for the new
thing to submit.

The obvious choice — resetting the syncobj after the CS submission —
has two issues though:

1) If the same semaphore is used for wait and signal we can't reset it.
   This is solvable by only resetting the semaphores that are not in the
   signal list.
2) The submitted work might be complete before we get to resetting the
   syncobj. If there is a cycle of submissions that signals it again and
   finishes before we get to the reset we are screwed.

Solution:
Copy the fence into a new syncobj and reset the old syncobj before
submission. Yes I know it is more syscalls :( At least I reduced the
alloc/free overhead by keeping a cache of temporary syncobjs.

This also introduces a syncobj_reset_count as we don't want to reset
syncobjs that are part of an emulated timeline semaphore. (yes, if
the kernel supports timeline syncobjs we should use those instead,
but those still need to be implemented and if we depend on them in
this patch ordering dependencies get hard ...)

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5600>
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index dd8b2e4..b8ddb89 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3762,7 +3762,7 @@
 				      VkFence _fence,
 				      bool is_signal)
 {
-	int syncobj_idx = 0, sem_idx = 0;
+	int syncobj_idx = 0, non_reset_idx = 0, sem_idx = 0;
 
 	if (num_sems == 0 && _fence == VK_NULL_HANDLE)
 		return VK_SUCCESS;
@@ -3771,6 +3771,7 @@
 		switch(sems[i]->kind) {
 		case RADV_SEMAPHORE_SYNCOBJ:
 			counts->syncobj_count++;
+			counts->syncobj_reset_count++;
 			break;
 		case RADV_SEMAPHORE_WINSYS:
 			counts->sem_count++;
@@ -3807,6 +3808,8 @@
 		}
 	}
 
+	non_reset_idx = counts->syncobj_reset_count;
+
 	for (uint32_t i = 0; i < num_sems; i++) {
 		switch(sems[i]->kind) {
 		case RADV_SEMAPHORE_NONE:
@@ -3830,7 +3833,7 @@
 			pthread_mutex_unlock(&sems[i]->timeline.mutex);
 
 			if (point) {
-				counts->syncobj[syncobj_idx++] = point->syncobj;
+				counts->syncobj[non_reset_idx++] = point->syncobj;
 			} else {
 				/* Explicitly remove the semaphore so we might not find
 				 * a point later post-submit. */
@@ -3848,11 +3851,11 @@
 			fence->temporary.kind != RADV_FENCE_NONE ?
 			&fence->temporary : &fence->permanent;
 		if (part->kind == RADV_FENCE_SYNCOBJ)
-			counts->syncobj[syncobj_idx++] = part->syncobj;
+			counts->syncobj[non_reset_idx++] = part->syncobj;
 	}
 
-	assert(syncobj_idx <= counts->syncobj_count);
-	counts->syncobj_count = syncobj_idx;
+	assert(MAX2(syncobj_idx, non_reset_idx) <= counts->syncobj_count);
+	counts->syncobj_count = MAX2(syncobj_idx, non_reset_idx);
 
 	return VK_SUCCESS;
 }
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index ac5555e..3757632 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -168,6 +168,7 @@
 };
 struct radv_winsys_sem_counts {
 	uint32_t syncobj_count;
+	uint32_t syncobj_reset_count; /* for wait only, whether to reset the syncobj */
 	uint32_t sem_count;
 	uint32_t *syncobj;
 	struct radeon_winsys_sem **sem;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index b024ef9..2cce43c 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -1451,15 +1451,17 @@
 }
 
 static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
+									  const uint32_t *syncobj_override,
 									  struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
 {
+	const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
 	struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
 	if (!syncobj)
 		return NULL;
 
 	for (unsigned i = 0; i < counts->syncobj_count; i++) {
 		struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
-		sem->handle = counts->syncobj[i];
+		sem->handle = src[i];
 	}
 
 	chunk->chunk_id = chunk_id;
@@ -1468,6 +1470,114 @@
 	return syncobj;
 }
 
+/* Allocate "count" syncobj handles for the caller, preferring recycled
+ * handles from the per-winsys cache. Returns 0 on success, -ENOMEM on
+ * failure; on failure no handles are written to dst. */
+static int radv_amdgpu_cache_alloc_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *dst)
+{
+	pthread_mutex_lock(&ws->syncobj_lock);
+	if (count > ws->syncobj_capacity) {
+		if (ws->syncobj_capacity > UINT32_MAX / 2)
+			goto fail;
+
+		unsigned new_capacity = MAX2(count, ws->syncobj_capacity * 2);
+		uint32_t *n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+		if (!n)
+			goto fail;
+		ws->syncobj_capacity = new_capacity;
+		ws->syncobj = n;
+	}
+
+	while(ws->syncobj_count < count) {
+		int r = amdgpu_cs_create_syncobj(ws->dev, ws->syncobj + ws->syncobj_count);
+		if (r)
+			goto fail;
+		++ws->syncobj_count;
+	}
+
+	for (unsigned i = 0; i < count; ++i)
+		dst[i] = ws->syncobj[--ws->syncobj_count];
+
+	pthread_mutex_unlock(&ws->syncobj_lock);
+	return 0;
+
+fail:
+	pthread_mutex_unlock(&ws->syncobj_lock);
+	return -ENOMEM;
+}
+
+/* Return "count" syncobj handles to the cache; handles that don't fit
+ * (capacity growth failed or the count would overflow) are destroyed. */
+static void radv_amdgpu_cache_free_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *src)
+{
+	pthread_mutex_lock(&ws->syncobj_lock);
+
+	uint32_t cache_count = MIN2(count, UINT32_MAX - ws->syncobj_count);
+	if (cache_count + ws->syncobj_count > ws->syncobj_capacity) {
+		unsigned new_capacity = MAX2(ws->syncobj_count + cache_count, ws->syncobj_capacity * 2);
+		uint32_t* n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+		if (n) {
+			ws->syncobj_capacity = new_capacity;
+			ws->syncobj = n;
+		}
+	}
+
+	for (unsigned i = 0; i < count; ++i) {
+		if (ws->syncobj_count < ws->syncobj_capacity)
+			ws->syncobj[ws->syncobj_count++] = src[i];
+		else
+			amdgpu_cs_destroy_syncobj(ws->dev, src[i]);
+	}
+
+	pthread_mutex_unlock(&ws->syncobj_lock);
+}
+
+/* Copy each wait syncobj fence into a freshly cached syncobj (with
+ * WAIT_FOR_SUBMIT semantics) and reset the first syncobj_reset_count
+ * originals, so reused binary semaphores don't carry a stale fence into
+ * the next wait-before-signal. On success *out_syncobjs holds the handles
+ * to wait on (owned by the caller), or NULL when nothing needs replacing. */
+static int radv_amdgpu_cs_prepare_syncobjs(struct radv_amdgpu_winsys *ws,
+                                           struct radv_winsys_sem_counts *counts,
+                                           uint32_t **out_syncobjs)
+{
+	int r = 0;
+
+	if (!ws->info.has_timeline_syncobj || !counts->syncobj_count) {
+		*out_syncobjs = NULL;
+		return 0;
+	}
+
+	*out_syncobjs = malloc(counts->syncobj_count * sizeof(**out_syncobjs));
+	if (!*out_syncobjs)
+		return -ENOMEM;
+
+	r = radv_amdgpu_cache_alloc_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+	if (r) {
+		/* Don't take the fail path: *out_syncobjs holds no valid handles
+		 * yet, so the caller must not hand them back to the cache. */
+		free(*out_syncobjs);
+		*out_syncobjs = NULL;
+		return r;
+	}
+
+	for (unsigned i = 0; i < counts->syncobj_count; ++i) {
+		r = amdgpu_cs_syncobj_transfer(ws->dev, (*out_syncobjs)[i], 0, counts->syncobj[i], 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT);
+		if (r)
+			goto fail;
+	}
+
+	r = amdgpu_cs_syncobj_reset(ws->dev, counts->syncobj, counts->syncobj_reset_count);
+	if (r)
+		goto fail;
+
+	return 0;
+fail:
+	radv_amdgpu_cache_free_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+	free(*out_syncobjs);
+	*out_syncobjs = NULL;
+	return r;
+}
+
 static VkResult
 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 		      struct radv_amdgpu_cs_request *request,
@@ -1483,6 +1580,7 @@
 	struct drm_amdgpu_cs_chunk_sem *wait_syncobj = NULL, *signal_syncobj = NULL;
 	bool use_bo_list_create = ctx->ws->info.drm_minor < 27;
 	struct drm_amdgpu_bo_list_in bo_list_in;
+	uint32_t *in_syncobjs = NULL;
 	int i;
 	struct amdgpu_cs_fence *sem;
 	uint32_t bo_list = 0;
@@ -1533,7 +1631,12 @@
 	}
 
 	if (sem_info->wait.syncobj_count && sem_info->cs_emit_wait) {
+		r = radv_amdgpu_cs_prepare_syncobjs(ctx->ws, &sem_info->wait, &in_syncobjs);
+		if (r)
+			goto error_out;
+
 		wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait,
+								  in_syncobjs,
 								  &chunks[num_chunks],
 								  AMDGPU_CHUNK_ID_SYNCOBJ_IN);
 		if (!wait_syncobj) {
@@ -1578,6 +1681,7 @@
 
 	if (sem_info->signal.syncobj_count && sem_info->cs_emit_signal) {
 		signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal,
+								    NULL,
 								    &chunks[num_chunks],
 								    AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
 		if (!signal_syncobj) {
@@ -1642,6 +1746,10 @@
 	}
 
 error_out:
+	if (in_syncobjs) {
+		radv_amdgpu_cache_free_syncobjs(ctx->ws, sem_info->wait.syncobj_count, in_syncobjs);
+		free(in_syncobjs);
+	}
 	free(chunks);
 	free(chunk_data);
 	free(sem_dependencies);
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
index 29e7b20..c6deedc 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -162,6 +162,10 @@
 {
 	struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys*)rws;
 
+	for (unsigned i = 0; i < ws->syncobj_count; ++i)
+		amdgpu_cs_destroy_syncobj(ws->dev, ws->syncobj[i]);
+	free(ws->syncobj);
+
 	ac_addrlib_destroy(ws->addrlib);
 	amdgpu_device_deinitialize(ws->dev);
 	FREE(rws);
@@ -197,6 +201,7 @@
 	ws->use_llvm = debug_flags & RADV_DEBUG_LLVM;
 	list_inithead(&ws->global_bo_list);
 	pthread_mutex_init(&ws->global_bo_list_lock, NULL);
+	pthread_mutex_init(&ws->syncobj_lock, NULL);
 	ws->base.query_info = radv_amdgpu_winsys_query_info;
 	ws->base.query_value = radv_amdgpu_winsys_query_value;
 	ws->base.read_registers = radv_amdgpu_winsys_read_registers;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
index 0601481..bb8ce88 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
@@ -55,6 +55,11 @@
 	uint64_t allocated_vram;
 	uint64_t allocated_vram_vis;
 	uint64_t allocated_gtt;
+
+	/* syncobj cache */
+	pthread_mutex_t syncobj_lock;
+	uint32_t *syncobj;
+	uint32_t syncobj_count, syncobj_capacity;
 };
 
 static inline struct radv_amdgpu_winsys *