radv: Record cache flushes for RGP.

Not recording the EOP TS cacheflush event because that breaks wave
counting in RGP for some reason, but the rest looks to be all there.
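
As a sketch of the flow (not the driver code itself):
si_cs_emit_cache_flush() now reports which flushes it emitted through an
enum rgp_flush_bits out-parameter, the bits accumulate in
cmd_buffer->state.sqtt_flush_bits, and the SQTT layer translates them into
the barrier-end marker's bitfields. A minimal standalone model of that
last step, with both types trimmed to a few fields:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Trimmed copy of the new enum; the values match the patch. */
    enum rgp_flush_bits {
        RGP_FLUSH_INVAL_ICACHE  = 0x80,
        RGP_FLUSH_INVAL_SMEM_L0 = 0x100,
        RGP_FLUSH_FLUSH_L2      = 0x200,
        RGP_FLUSH_INVAL_L2      = 0x400,
    };

    /* Trimmed stand-in for the RGP barrier-end marker bitfields. */
    struct marker {
        uint32_t inval_sqI : 1;
        uint32_t inval_sqK : 1;
        uint32_t flush_tcc : 1;
        uint32_t inval_tcc : 1;
    };

    /* Same pattern as the new code in radv_describe_barrier_end_delayed(). */
    static void fill_marker(struct marker *m, enum rgp_flush_bits bits)
    {
        if (bits & RGP_FLUSH_INVAL_ICACHE)
            m->inval_sqI = true;
        if (bits & RGP_FLUSH_INVAL_SMEM_L0)
            m->inval_sqK = true;
        if (bits & RGP_FLUSH_FLUSH_L2)
            m->flush_tcc = true;
        if (bits & RGP_FLUSH_INVAL_L2)
            m->inval_tcc = true;
    }

    int main(void)
    {
        struct marker m = {0};
        /* si_cs_emit_cache_flush() would have OR'ed these into the state. */
        fill_marker(&m, RGP_FLUSH_INVAL_ICACHE | RGP_FLUSH_INVAL_L2);
        printf("inval_sqI=%u inval_tcc=%u\n",
               (unsigned)m.inval_sqI, (unsigned)m.inval_tcc);
        return 0;
    }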

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6550>
diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 5518026..5f3de07 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -300,7 +300,7 @@
 	union {
 		struct {
 			uint32_t sync_cp_dma : 1;
-			uint32_t inval_ccp : 1;
+			uint32_t inval_tcp : 1;
 			uint32_t inval_sqI : 1;
 			uint32_t inval_sqK : 1;
 			uint32_t flush_tcc : 1;
@@ -526,6 +526,38 @@
 	marker.num_layout_transitions = cmd_buffer->state.num_layout_transitions;
 
-	/* TODO: fill pipeline stalls, cache flushes, etc */
+	/* TODO: fill pipeline stalls, etc */
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_WAIT_ON_EOP_TS)
+		marker.wait_on_eop_ts = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_VS_PARTIAL_FLUSH)
+		marker.vs_partial_flush = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PS_PARTIAL_FLUSH)
+		marker.ps_partial_flush = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_CS_PARTIAL_FLUSH)
+		marker.cs_partial_flush = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PFP_SYNC_ME)
+		marker.pfp_sync_me = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_SYNC_CP_DMA)
+		marker.sync_cp_dma = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_VMEM_L0)
+		marker.inval_tcp = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_ICACHE)
+		marker.inval_sqI = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_SMEM_L0)
+		marker.inval_sqK = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_L2)
+		marker.flush_tcc = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L2)
+		marker.inval_tcc = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_CB)
+		marker.flush_cb = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_CB)
+		marker.inval_cb = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_DB)
+		marker.flush_db = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_DB)
+		marker.inval_db = true;
+	if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
+		marker.inval_gl1 = true;
 
 	radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
 
@@ -543,6 +575,8 @@
 		return;
 
 	radv_describe_barrier_end_delayed(cmd_buffer);
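+	/* Start each barrier with a clean set of recorded flushes. */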
+	cmd_buffer->state.sqtt_flush_bits = 0;
 
 	marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
 	marker.cb_id = 0;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 243810f..70f9bea 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -606,6 +606,7 @@
 	}
 
 	if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
+		enum rgp_flush_bits sqtt_flush_bits = 0;
 		assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
 				RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
 
@@ -617,7 +618,7 @@
 				       &cmd_buffer->gfx9_fence_idx,
 				       cmd_buffer->gfx9_fence_va,
 				       radv_cmd_buffer_uses_mec(cmd_buffer),
-				       flags, cmd_buffer->gfx9_eop_bug_va);
+				       flags, &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
 	}
 
 	if (unlikely(cmd_buffer->device->trace_bo))
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 421cac4..133a375 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3767,6 +3767,7 @@
 	}
 
 	for(int i = 0; i < 3; ++i) {
+		enum rgp_flush_bits sqtt_flush_bits = 0;
 		struct radeon_cmdbuf *cs = NULL;
 		cs = queue->device->ws->cs_create(queue->device->ws,
 						  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -3832,7 +3833,7 @@
 			                       RADV_CMD_FLAG_INV_SCACHE |
 			                       RADV_CMD_FLAG_INV_VCACHE |
 			                       RADV_CMD_FLAG_INV_L2 |
-					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
 		} else if (i == 1) {
 			si_cs_emit_cache_flush(cs,
 			                       queue->device->physical_device->rad_info.chip_class,
@@ -3843,7 +3844,7 @@
 			                       RADV_CMD_FLAG_INV_SCACHE |
 			                       RADV_CMD_FLAG_INV_VCACHE |
 			                       RADV_CMD_FLAG_INV_L2 |
-					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
 		}
 
 		if (queue->device->ws->cs_finalize(cs) != VK_SUCCESS)
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4de20b0..1643650 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1301,6 +1301,26 @@
 	struct radv_sample_locations_state sample_location;
 };
 
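+/* Cache flushes/invalidations as reported to RGP via SQTT barrier markers. */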
+enum rgp_flush_bits {
+	RGP_FLUSH_WAIT_ON_EOP_TS   = 0x1,
+	RGP_FLUSH_VS_PARTIAL_FLUSH = 0x2,
+	RGP_FLUSH_PS_PARTIAL_FLUSH = 0x4,
+	RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
+	RGP_FLUSH_PFP_SYNC_ME      = 0x10,
+	RGP_FLUSH_SYNC_CP_DMA      = 0x20,
+	RGP_FLUSH_INVAL_VMEM_L0    = 0x40,
+	RGP_FLUSH_INVAL_ICACHE     = 0x80,
+	RGP_FLUSH_INVAL_SMEM_L0    = 0x100,
+	RGP_FLUSH_FLUSH_L2         = 0x200,
+	RGP_FLUSH_INVAL_L2         = 0x400,
+	RGP_FLUSH_FLUSH_CB         = 0x800,
+	RGP_FLUSH_INVAL_CB         = 0x1000,
+	RGP_FLUSH_FLUSH_DB         = 0x2000,
+	RGP_FLUSH_INVAL_DB         = 0x4000,
+	RGP_FLUSH_INVAL_L1         = 0x8000,
+};
+
 struct radv_cmd_state {
 	/* Vertex descriptors */
 	uint64_t                                      vb_va;
@@ -1370,6 +1390,8 @@
 	uint32_t num_events;
 	uint32_t num_layout_transitions;
 	bool pending_sqtt_barrier_end;
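+	/* Accumulated by si_cs_emit_cache_flush() for the barrier-end marker. */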
+	enum rgp_flush_bits sqtt_flush_bits;
 };
 
 struct radv_cmd_pool {
@@ -1487,6 +1509,7 @@
 			    uint32_t *fence_ptr, uint64_t va,
 			    bool is_mec,
 			    enum radv_cmd_flush_bits flush_bits,
+			    enum rgp_flush_bits *sqtt_flush_bits,
 			    uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c
index a1de66e..345637c 100644
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@@ -390,6 +390,7 @@
 radv_emit_wait_for_idle(struct radv_device *device,
 			struct radeon_cmdbuf *cs, int family)
 {
+	enum rgp_flush_bits sqtt_flush_bits = 0;
 	si_cs_emit_cache_flush(cs, device->physical_device->rad_info.chip_class,
 			       NULL, 0,
 			       family == RING_COMPUTE &&
@@ -400,7 +401,7 @@
 			       RADV_CMD_FLAG_INV_ICACHE |
 			       RADV_CMD_FLAG_INV_SCACHE |
 			       RADV_CMD_FLAG_INV_VCACHE |
-			       RADV_CMD_FLAG_INV_L2, 0);
+			       RADV_CMD_FLAG_INV_L2, &sqtt_flush_bits, 0);
 }
 
 static void
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index d840457..5f77211 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1040,6 +1040,7 @@
 			  uint64_t flush_va,
 			  bool is_mec,
 			  enum radv_cmd_flush_bits flush_bits,
+			  enum rgp_flush_bits *sqtt_flush_bits,
 			  uint64_t gfx9_eop_bug_va)
 {
 	uint32_t gcr_cntl = 0;
@@ -1048,26 +1049,38 @@
 	/* We don't need these. */
 	assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));
 
-	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
 		gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+	}
 	if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
 		/* TODO: When writing to the SMEM L1 cache, we need to set SEQ
 		 * to FORWARD when both L1 and L2 are written out (WB or INV).
 		 */
 		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
 	}
-	if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+	if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
 		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;
+	}
 	if (flush_bits & RADV_CMD_FLAG_INV_L2) {
 		/* Writeback and invalidate everything in L2. */
 		gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
 		            S_586_GLM_INV(1) | S_586_GLM_WB(1);
+
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
 	} else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
 		/* Writeback but do not invalidate.
 		 * GLM doesn't support WB alone. If WB is set, INV must be set too.
 		 */
 		gcr_cntl |= S_586_GL2_WB(1) |
 		            S_586_GLM_WB(1) | S_586_GLM_INV(1);
+
+		*sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2;
 	}
 
 	/* TODO: Implement this new flag for GFX9+.
@@ -1082,6 +1095,8 @@
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
 					EVENT_INDEX(0));
+
+			*sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
 		}
 
 		/* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
@@ -1090,6 +1105,8 @@
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
 					EVENT_INDEX(0));
+
+			*sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
 		}
 
 		/* First flush CB/DB, then L1/L2. */
@@ -1110,15 +1127,21 @@
 		if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+			*sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
 		} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 			radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+			*sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
 		}
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
+
+		*sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
 	}
 
 	if (cb_db_event) {
@@ -1197,6 +1220,8 @@
 		/* We need to ensure that PFP waits as well. */
 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
 		radeon_emit(cs, 0);
+
+		*sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
@@ -1217,6 +1242,7 @@
 		       uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits,
+		       enum rgp_flush_bits *sqtt_flush_bits,
 		       uint64_t gfx9_eop_bug_va)
 {
 	unsigned cp_coher_cntl = 0;
@@ -1226,14 +1252,19 @@
 	if (chip_class >= GFX10) {
 		/* GFX10 cache flush handling is quite different. */
 		gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
-					  is_mec, flush_bits, gfx9_eop_bug_va);
+					  is_mec, flush_bits, sqtt_flush_bits,
+					  gfx9_eop_bug_va);
 		return;
 	}
 
-	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+	}
+	if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
+	}
 
 	if (chip_class <= GFX8) {
 		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
@@ -1259,34 +1290,48 @@
 							   0, 0,
 							   gfx9_eop_bug_va);
 			}
+
+			*sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
 		}
 		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
 			cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
 				S_0085F0_DB_DEST_BASE_ENA(1);
+
+			*sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
 		}
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+
+		*sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+
+		*sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+		*sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
 	} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+		*sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
 	}
 
 	if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+		*sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
 	}
 
 	if (chip_class == GFX9 && flush_cb_db) {
@@ -1310,6 +1355,9 @@
 		tc_flags = EVENT_TC_ACTION_ENA |
 		           EVENT_TC_MD_ACTION_ENA;
 
+		*sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB |
+		                    RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
+
 		/* Ideally flush TC together with CB/DB. */
 		if (flush_bits & RADV_CMD_FLAG_INV_L2) {
 			/* Writeback and invalidate everything in L2 & L1. */
@@ -1321,6 +1369,8 @@
 		        flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
 					 RADV_CMD_FLAG_WB_L2 |
 					 RADV_CMD_FLAG_INV_VCACHE);
+
+			*sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
 		}
 		assert(flush_cnt);
 		(*flush_cnt)++;
@@ -1357,6 +1407,8 @@
 	    !is_mec) {
 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
 		radeon_emit(cs, 0);
+
+		*sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
 	}
 
 	if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
@@ -1367,6 +1419,8 @@
 				    S_0085F0_TCL1_ACTION_ENA(1) |
 				    S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
 		cp_coher_cntl = 0;
+
+		*sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
 	} else {
 		if(flush_bits & RADV_CMD_FLAG_WB_L2) {
 			/* WB = write-back
@@ -1381,6 +1435,8 @@
 					    S_0301F0_TC_WB_ACTION_ENA(1) |
 					    S_0301F0_TC_NC_ACTION_ENA(1));
 			cp_coher_cntl = 0;
+
+			*sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
 		}
 		if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
 			si_emit_acquire_mem(cs, is_mec,
@@ -1388,6 +1444,8 @@
 					    cp_coher_cntl |
 					    S_0085F0_TCL1_ACTION_ENA(1));
 			cp_coher_cntl = 0;
+
+			*sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
 		}
 	}
 
@@ -1437,6 +1495,7 @@
 			       cmd_buffer->gfx9_fence_va,
 	                       radv_cmd_buffer_uses_mec(cmd_buffer),
 	                       cmd_buffer->state.flush_bits,
+			       &cmd_buffer->state.sqtt_flush_bits,
 			       cmd_buffer->gfx9_eop_bug_va);