radv: Only use PKT3_OCCLUSION_QUERY when it doesn't hang.

PKT3_OCCLUSION_QUERY hangs when used in a nested IB. This only
calls it when in a primary command buffer and we change
GetQueryPoolResults to not need it. CmdCopyQueryPoolResults
still needs it so we break that behavior for secondary command buffers.
However, that would hang already and using an unitialized value is
better than a hang.

Signed-off-by: Bas Nieuwenhuizen <basni@google.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: 13.0 17.0 <mesa-stable@lists.freedesktop.org>
(cherry picked from commit 8ea34a98c06af4e3ce00ea665ee07480bcbd199f)
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index a29a05d..a2d0889 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -131,6 +131,7 @@
 	VkDeviceSize                                stride,
 	VkQueryResultFlags                          flags)
 {
+	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	char *data = pData;
 	VkResult result = VK_SUCCESS;
@@ -141,23 +142,20 @@
 		char *src = pool->ptr + query * pool->stride;
 		uint32_t available;
 
-		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
-			while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
-				;
-		}
-
-		if (!*(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query) &&
-		    !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
-			if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
-				*(uint32_t*)dest = 0;
-			result = VK_NOT_READY;
-			continue;
-
-		}
-
-		available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
 		switch (pool->type) {
-		case VK_QUERY_TYPE_TIMESTAMP:
+		case VK_QUERY_TYPE_TIMESTAMP: {
+			if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+				while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
+					;
+			}
+
+			available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
+			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
+				result = VK_NOT_READY;
+				break;
+
+			}
+
 			if (flags & VK_QUERY_RESULT_64_BIT) {
 				*(uint64_t*)dest = *(uint64_t*)src;
 				dest += 8;
@@ -166,8 +164,32 @@
 				dest += 4;
 			}
 			break;
+		}
 		case VK_QUERY_TYPE_OCCLUSION: {
-			uint64_t result = *(uint64_t*)(src + pool->stride - 16);
+			volatile uint64_t const *src64 = (volatile uint64_t const *)src;
+			uint64_t result = 0;
+			int db_count = get_max_db(device);
+			available = 1;
+
+			for (int i = 0; i < db_count; ++i) {
+				uint64_t start, end;
+				do {
+					start = src64[2 * i];
+					end = src64[2 * i + 1];
+				} while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
+
+				if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
+					available = 0;
+				else {
+					result += end - start;
+				}
+			}
+
+			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
+				result = VK_NOT_READY;
+				break;
+
+			}
 
 			if (flags & VK_QUERY_RESULT_64_BIT) {
 				*(uint64_t*)dest = result;
@@ -357,11 +379,14 @@
 		radeon_emit(cs, va + 8);
 		radeon_emit(cs, (va + 8) >> 32);
 
-		radeon_emit(cs, PKT3(PKT3_OCCLUSION_QUERY, 3, 0));
-		radeon_emit(cs, va);
-		radeon_emit(cs, va >> 32);
-		radeon_emit(cs, va + pool->stride - 16);
-		radeon_emit(cs, (va + pool->stride - 16) >> 32);
+		/* hangs for VK_COMMAND_BUFFER_LEVEL_SECONDARY. */
+		if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+			radeon_emit(cs, PKT3(PKT3_OCCLUSION_QUERY, 3, 0));
+			radeon_emit(cs, va);
+			radeon_emit(cs, va >> 32);
+			radeon_emit(cs, va + pool->stride - 16);
+			radeon_emit(cs, (va + pool->stride - 16) >> 32);
+		}
 
 		break;
 	default: