winsys/amdgpu: fix a race condition between fence updates and IB submissions

The CS thread is needed to ensure proper ordering of operations and can't
be disabled (without complicating the code).

Discovered by Nine CSMT, which ended up in a deadlock.

Acked-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 95402bf..87246f7 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1067,11 +1067,9 @@
 void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
-   struct amdgpu_winsys *ws = cs->ctx->ws;
 
    /* Wait for any pending ioctl of this CS to complete. */
-   if (util_queue_is_initialized(&ws->cs_queue))
-      util_queue_job_wait(&cs->flush_completed);
+   util_queue_job_wait(&cs->flush_completed);
 }
 
 static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
@@ -1157,7 +1155,14 @@
       if (fence)
          amdgpu_fence_reference(fence, cur->fence);
 
-      /* Prepare buffers. */
+      amdgpu_cs_sync_flush(rcs);
+
+      /* Prepare buffers.
+       *
+       * This fence must be held until the submission is queued to ensure
+       * that the order of fence dependency updates matches the order of
+       * submissions.
+       */
       pipe_mutex_lock(ws->bo_fence_lock);
       amdgpu_add_fence_dependencies(cs);
 
@@ -1174,22 +1179,20 @@
          p_atomic_inc(&bo->num_active_ioctls);
          amdgpu_add_fence(bo, cur->fence);
       }
-      pipe_mutex_unlock(ws->bo_fence_lock);
-
-      amdgpu_cs_sync_flush(rcs);
 
       /* Swap command streams. "cst" is going to be submitted. */
       cs->csc = cs->cst;
       cs->cst = cur;
 
       /* Submit. */
-      if ((flags & RADEON_FLUSH_ASYNC) &&
-          util_queue_is_initialized(&ws->cs_queue)) {
-         util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
-                            amdgpu_cs_submit_ib, NULL);
-      } else {
-         amdgpu_cs_submit_ib(cs, 0);
-         error_code = cs->cst->error_code;
+      util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
+                         amdgpu_cs_submit_ib, NULL);
+      /* The submission has been queued, unlock the fence now. */
+      pipe_mutex_unlock(ws->bo_fence_lock);
+
+      if (!(flags & RADEON_FLUSH_ASYNC)) {
+         amdgpu_cs_sync_flush(rcs);
+         error_code = cur->error_code;
       }
    } else {
       amdgpu_cs_context_cleanup(cs->csc);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index b950d37..e944e62 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -478,8 +478,6 @@
    return key1 != key2;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true)
-
 static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
@@ -584,8 +582,11 @@
    pipe_mutex_init(ws->global_bo_list_lock);
    pipe_mutex_init(ws->bo_fence_lock);
 
-   if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread())
-      util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1);
+   if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1)) {
+      amdgpu_winsys_destroy(&ws->base);
+      pipe_mutex_unlock(dev_tab_mutex);
+      return NULL;
+   }
 
    /* Create the screen at the end. The winsys must be initialized
     * completely.