radv: Include flushes in the barrier.

Since the flushes really happen on the next draw, delay the end of the
barrier so that it includes the flushes.

This fixes the barrier duration in RGP.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6550>
diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 2b3f13a..5518026 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -509,31 +509,17 @@
 }
 
 void
-radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
-			   enum rgp_barrier_reason reason)
-{
-	struct rgp_sqtt_marker_barrier_start marker = {};
-	struct radeon_cmdbuf *cs = cmd_buffer->cs;
-
-	if (likely(!cmd_buffer->device->thread_trace_bo))
-		return;
-
-	marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
-	marker.cb_id = 0;
-	marker.dword02 = reason;
-
-	radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
-}
-
-void
-radv_describe_barrier_end(struct radv_cmd_buffer *cmd_buffer)
+radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
 {
 	struct rgp_sqtt_marker_barrier_end marker = {};
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
-	if (likely(!cmd_buffer->device->thread_trace_bo))
+	if (likely(!cmd_buffer->device->thread_trace_bo) ||
+	    !cmd_buffer->state.pending_sqtt_barrier_end)
 		return;
 
+	cmd_buffer->state.pending_sqtt_barrier_end = false;
+
 	marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
 	marker.cb_id = 0;
 
@@ -547,6 +533,31 @@
 }
 
 void
+radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
+			   enum rgp_barrier_reason reason)
+{
+	struct rgp_sqtt_marker_barrier_start marker = {};
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+	if (likely(!cmd_buffer->device->thread_trace_bo))
+		return;
+
+	radv_describe_barrier_end_delayed(cmd_buffer);
+
+	marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
+	marker.cb_id = 0;
+	marker.dword02 = reason;
+
+	radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
+}
+
+void
+radv_describe_barrier_end(struct radv_cmd_buffer *cmd_buffer)
+{
+	cmd_buffer->state.pending_sqtt_barrier_end = true;
+}
+
+void
 radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
 				const struct radv_barrier_data *barrier)
 {
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 8023956..243810f 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -5056,6 +5056,8 @@
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
+	radv_describe_draw(cmd_buffer);
+
 	if (info->indirect) {
 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
 		uint64_t count_va = 0;
@@ -5286,8 +5288,6 @@
 			return;
 	}
 
-	radv_describe_draw(cmd_buffer);
-
 	/* Use optimal packet order based on whether we need to sync the
 	 * pipeline.
 	 */
@@ -5523,6 +5523,8 @@
 	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	struct radv_userdata_info *loc;
 
+	radv_describe_dispatch(cmd_buffer, 8, 8, 8);
+
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
 				    AC_UD_CS_GRID_SIZE);
 
@@ -5663,8 +5665,6 @@
 	bool pipeline_is_dirty = pipeline &&
 				 pipeline != cmd_buffer->state.emitted_compute_pipeline;
 
-	radv_describe_dispatch(cmd_buffer, 8, 8, 8);
-
 	if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
 					    RADV_CMD_FLAG_FLUSH_AND_INV_DB |
 					    RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4f954d5..4de20b0 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1369,6 +1369,7 @@
 	uint32_t current_event_type;
 	uint32_t num_events;
 	uint32_t num_layout_transitions;
+	bool pending_sqtt_barrier_end;
 };
 
 struct radv_cmd_pool {
@@ -2551,6 +2552,7 @@
 void radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
 				 enum rgp_barrier_reason reason);
 void radv_describe_barrier_end(struct radv_cmd_buffer *cmd_buffer);
+void radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer);
 void radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
 				     const struct radv_barrier_data *barrier);
 
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index cd6cf23..d840457 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1424,8 +1424,10 @@
 						  RADV_CMD_FLAG_START_PIPELINE_STATS |
 						  RADV_CMD_FLAG_STOP_PIPELINE_STATS);
 
-	if (!cmd_buffer->state.flush_bits)
+	if (!cmd_buffer->state.flush_bits) {
+		radv_describe_barrier_end_delayed(cmd_buffer);
 		return;
+	}
 
 	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
@@ -1452,6 +1454,8 @@
 	 * should be finished at this point.
 	 */
 	cmd_buffer->pending_reset_query = false;
+
+	radv_describe_barrier_end_delayed(cmd_buffer);
 }
 
 /* sets the CP predication state using a boolean stored at va */