radeonsi: add HUD queries for cache flush stats

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 2e06556..74f86dc 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -574,6 +574,9 @@
 	unsigned			num_vs_flushes;
 	unsigned			num_ps_flushes;
 	unsigned			num_cs_flushes;
+	unsigned			num_fb_cache_flushes;
+	unsigned			num_L2_invalidates;
+	unsigned			num_L2_writebacks;
 	uint64_t			num_alloc_tex_transfer_bytes;
 	unsigned			last_tex_ps_draw_ratio; /* for query */
 
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 0e4270a..6b93329 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -113,6 +113,15 @@
 	case R600_QUERY_NUM_CS_FLUSHES:
 		query->begin_result = rctx->num_cs_flushes;
 		break;
+	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
+		query->begin_result = rctx->num_fb_cache_flushes;
+		break;
+	case R600_QUERY_NUM_L2_INVALIDATES:
+		query->begin_result = rctx->num_L2_invalidates;
+		break;
+	case R600_QUERY_NUM_L2_WRITEBACKS:
+		query->begin_result = rctx->num_L2_writebacks;
+		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
 	case R600_QUERY_MAPPED_VRAM:
@@ -197,6 +206,15 @@
 	case R600_QUERY_NUM_CS_FLUSHES:
 		query->end_result = rctx->num_cs_flushes;
 		break;
+	case R600_QUERY_NUM_FB_CACHE_FLUSHES:
+		query->end_result = rctx->num_fb_cache_flushes;
+		break;
+	case R600_QUERY_NUM_L2_INVALIDATES:
+		query->end_result = rctx->num_L2_invalidates;
+		break;
+	case R600_QUERY_NUM_L2_WRITEBACKS:
+		query->end_result = rctx->num_L2_writebacks;
+		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
 	case R600_QUERY_MAPPED_VRAM:
@@ -1665,6 +1683,9 @@
 	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
 	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
 	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
+	X("num-fb-cache-flushes",	NUM_FB_CACHE_FLUSHES,	UINT64, AVERAGE),
+	X("num-L2-invalidates",		NUM_L2_INVALIDATES,	UINT64, AVERAGE),
+	X("num-L2-writebacks",		NUM_L2_WRITEBACKS,	UINT64, AVERAGE),
 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
 	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
index 2ff586a..af434fa 100644
--- a/src/gallium/drivers/radeon/r600_query.h
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -52,6 +52,9 @@
 	R600_QUERY_NUM_VS_FLUSHES,
 	R600_QUERY_NUM_PS_FLUSHES,
 	R600_QUERY_NUM_CS_FLUSHES,
+	R600_QUERY_NUM_FB_CACHE_FLUSHES,
+	R600_QUERY_NUM_L2_INVALIDATES,
+	R600_QUERY_NUM_L2_WRITEBACKS,
 	R600_QUERY_REQUESTED_VRAM,
 	R600_QUERY_REQUESTED_GTT,
 	R600_QUERY_MAPPED_VRAM,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cae19dc..b3f664e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -735,6 +735,9 @@
 	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	uint32_t cp_coher_cntl = 0;
 
+	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER)
+		sctx->b.num_fb_cache_flushes++;
+
 	/* SI has a bug that it always flushes ICACHE and KCACHE if either
 	 * bit is set. An alternative way is to write SQC_CACHES, but that
 	 * doesn't seem to work reliably. Since the bug doesn't affect
@@ -852,6 +855,7 @@
 				     S_0085F0_TC_ACTION_ENA(1) |
 				     S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
 		cp_coher_cntl = 0;
+		sctx->b.num_L2_invalidates++;
 	} else {
 		/* L1 invalidation and L2 writeback must be done separately,
 		 * because both operations can't be done together.
@@ -867,6 +871,7 @@
 					     S_0301F0_TC_WB_ACTION_ENA(1) |
 					     S_0301F0_TC_NC_ACTION_ENA(1));
 			cp_coher_cntl = 0;
+			sctx->b.num_L2_writebacks++;
 		}
 		if (rctx->flags & SI_CONTEXT_INV_VMEM_L1) {
 			/* Invalidate per-CU VMEM L1. */