tu: Implement VK_EXT_conditional_rendering

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6009>
diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c
index fe045c5..4f27035 100644
--- a/src/freedreno/vulkan/tu_clear_blit.c
+++ b/src/freedreno/vulkan/tu_clear_blit.c
@@ -328,6 +328,13 @@
 }
 
 static void
+r2d_teardown(struct tu_cmd_buffer *cmd,
+             struct tu_cs *cs)
+{
+   /* No-op: exists so callers can invoke ops->teardown on either blit path. */
+}
+
+static void
 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
    tu_cs_emit_pkt7(cs, CP_BLIT, 1);
@@ -803,6 +810,11 @@
       .component_enable = aspect_write_mask(vk_format, aspect_mask)));
    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));
+
+   if (cmd->state.predication_active) {
+      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
+      tu_cs_emit(cs, 0);
+   }
 }
 
 static void
@@ -816,6 +828,15 @@
    tu_cs_emit(cs, 2); /* vertex count */
 }
 
+static void
+r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   if (cmd->state.predication_active) { /* only when conditional rendering is in effect */
+      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
+      tu_cs_emit(cs, 1); /* counterpart of the 0 written during 3D setup (presumably r3d_setup) */
+   }
+}
+
 /* blit ops - common interface for 2d/shader paths */
 
 struct blit_ops {
@@ -844,6 +865,8 @@
                  bool clear,
                  bool ubwc);
    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
+   void (*teardown)(struct tu_cmd_buffer *cmd,
+                    struct tu_cs *cs);
 };
 
 static const struct blit_ops r2d_ops = {
@@ -855,6 +878,7 @@
    .dst_buffer = r2d_dst_buffer,
    .setup = r2d_setup,
    .run = r2d_run,
+   .teardown = r2d_teardown,
 };
 
 static const struct blit_ops r3d_ops = {
@@ -866,6 +890,7 @@
    .dst_buffer = r3d_dst_buffer,
    .setup = r3d_setup,
    .run = r3d_run,
+   .teardown = r3d_teardown,
 };
 
 /* passthrough set coords from 3D extents */
@@ -1061,6 +1086,8 @@
       ops->src(cmd, cs, &src, i, filter);
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1170,6 +1197,8 @@
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1243,6 +1272,8 @@
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1464,6 +1495,8 @@
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1514,6 +1547,8 @@
       dst_va += width * block_size;
       blocks -= width;
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1595,6 +1630,8 @@
       dst_va += width * 4;
       blocks -= width;
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1637,6 +1674,8 @@
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -1663,6 +1702,8 @@
       ops->dst(cs, dst, i);
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 static void
@@ -1714,6 +1755,8 @@
          ops->run(cmd, cs);
       }
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
@@ -2050,6 +2093,22 @@
     */
    tu_emit_cache_flush_renderpass(cmd, cs);
 
+   /* vkCmdClearAttachments is supposed to respect the predicate if active.
+    * The easiest way to do this is to always use the 3d path, which always
+    * works even with GMEM because it's just a simple draw using the existing
+    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
+    * skipped in the binning pass, since otherwise they produce binning data
+    * which isn't consumed and leads to the wrong binning data being read, so
+    * condition on GMEM | SYSMEM.
+    */
+   if (cmd->state.predication_active) {
+      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
+                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
+      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
+      tu_cond_exec_end(cs);
+      return;
+   }
+
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
    tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
    tu_cond_exec_end(cs);
@@ -2089,6 +2148,8 @@
       }
       ops->run(cmd, cs);
    }
+
+   ops->teardown(cmd, cs);
 }
 
 void
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 24b0de0..6d76f00 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -31,6 +31,7 @@
 #include "adreno_common.xml.h"
 
 #include "vk_format.h"
+#include "vk_util.h"
 
 #include "tu_cs.h"
 
@@ -568,6 +569,29 @@
    if (cmd->state.xfb_used)
       return true;
 
+   /* Some devices have a newer a630_sqe.fw in which, only in CP_DRAW_INDX and
+    * CP_DRAW_INDX_OFFSET, visibility-based skipping happens *before*
+    * predication-based skipping. It seems this breaks predication, because
+    * draws skipped by predication will not be executed in the binning phase,
+    * and therefore won't have an entry in the draw stream, but the
+    * visibility-based skipping will expect it to have an entry. The result is
+    * a GPU hang when actually executing the first non-predicated draw.
+    * However, it seems that things still work if the whole renderpass is
+    * predicated. Affected tests are
+    * dEQP-VK.conditional_rendering.draw_clear.draw.case_2 as well as a few
+    * other case_N.
+    *
+    * Broken FW version: 016ee181
+    * linux-firmware (working) FW version: 016ee176
+    *
+    * All known a650_sqe.fw versions don't have this bug.
+    *
+    * TODO: we should do version detection of the FW so that devices using the
+    * linux-firmware version of a630_sqe.fw don't need this workaround.
+    */
+   if (cmd->state.has_subpass_predication && cmd->device->physical_device->gpu_id != 650)
+      return false;
+
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
       return false;
 
@@ -583,6 +607,13 @@
    if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
       return true;
 
+   /* If hw binning is required because of XFB but doesn't work because of the
+    * conditional rendering bug, fall back to sysmem.
+    */
+   if (cmd->state.xfb_used && cmd->state.has_subpass_predication &&
+       cmd->device->physical_device->gpu_id != 650)
+      return true;
+
    /* can't fit attachments into gmem */
    if (!cmd->state.pass->gmem_pixels)
       return true;
@@ -1591,8 +1622,21 @@
          break;
       }
    } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      assert(pBeginInfo->pInheritanceInfo);
+
+      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
+         switch (ext->sType) {
+         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
+            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
+            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
+            break;
+         default:
+            break;
+         }
+         }
+      }
+
       if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
-         assert(pBeginInfo->pInheritanceInfo);
          cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
          cmd_buffer->state.subpass =
             &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
@@ -2356,10 +2400,19 @@
     *
     * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
     * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
+    *
+    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
+    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
+    * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
+    * complete since it's written for DX11 where you can only predicate on the
+    * result of a query object. So if we implement 64-bit comparisons in the
+    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
+    * comparisons, then this will have to be dealt with.
     */
    if (flags &
        (VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
         VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
+        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT |
         VK_ACCESS_MEMORY_READ_BIT)) {
       mask |= TU_ACCESS_WFI_READ;
    }
@@ -2531,6 +2584,8 @@
 
          if (secondary->state.has_tess)
             cmd->state.has_tess = true;
+         if (secondary->state.has_subpass_predication)
+            cmd->state.has_subpass_predication = true;
       } else {
          assert(tu_cs_is_empty(&secondary->draw_cs));
          assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
@@ -3671,6 +3726,7 @@
    cmd_buffer->state.subpass = NULL;
    cmd_buffer->state.framebuffer = NULL;
    cmd_buffer->state.has_tess = false;
+   cmd_buffer->state.has_subpass_predication = false;
 }
 
 void
@@ -3870,3 +3926,64 @@
 {
    /* No-op */
 }
+
+/* vkCmdBeginConditionalRenderingEXT: latch the predicate, enable predication. */
+void
+tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
+                                   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+
+   cmd->state.predication_active = true; /* checked by the clear/blit paths in this patch */
+   if (cmd->state.pass)
+      cmd->state.has_subpass_predication = true; /* feeds the hw-binning and sysmem-fallback checks */
+
+   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
+   tu_cs_emit(cs, 1);
+
+   /* Wait for any writes to the predicate to land */
+   if (cmd->state.pass)
+      tu_emit_cache_flush_renderpass(cmd, cs);
+   else
+      tu_emit_cache_flush(cmd, cs);
+
+   TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
+   uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;
+
+   /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
+    * mandates 32-bit comparisons. Our workaround is to copy the reference
+    * value to the low 32-bits of a location where the high 32 bits are known
+    * to be 0 and then compare that.
+    */
+   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+   tu_cs_emit(cs, 0);
+   tu_cs_emit_qw(cs, global_iova(cmd, predicate)); /* destination: 64-bit predicate slot, zeroed at device creation */
+   tu_cs_emit_qw(cs, iova); /* source: application's predicate buffer + offset */
+
+   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+
+   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
+   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
+                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS)); /* inverted: pass when the value is zero */
+   tu_cs_emit_qw(cs, global_iova(cmd, predicate)); /* test the widened copy, not the app buffer */
+
+   tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
+}
+
+void
+tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+
+   cmd->state.predication_active = false; /* has_subpass_predication is left set; it is reset elsewhere in this patch */
+
+   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; /* same stream selection as the Begin entry point */
+
+   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
+   tu_cs_emit(cs, 0); /* Begin writes 1 to this packet; 0 presumably switches global predication off */
+}
+
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 14e2407..8540f8d 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -793,8 +793,8 @@
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
          VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
             (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext;
-         features->conditionalRendering = false;
-         features->inheritedConditionalRendering = false;
+         features->conditionalRendering = true;
+         features->inheritedConditionalRendering = true;
          break;
       }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
@@ -1354,8 +1354,10 @@
    if (result != VK_SUCCESS)
       goto fail_global_bo_map;
 
-   memcpy(device->global_bo.map + gb_offset(border_color), border_color, sizeof(border_color));
-   tu_init_clear_blit_shaders(device->global_bo.map);
+   struct tu6_global *global = device->global_bo.map;
+   memcpy(global->border_color, border_color, sizeof(border_color));
+   global->predicate = 0;
+   tu_init_clear_blit_shaders(global);
 
    VkPipelineCacheCreateInfo ci;
    ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
diff --git a/src/freedreno/vulkan/tu_extensions.py b/src/freedreno/vulkan/tu_extensions.py
index 9b84e14..50afba1 100644
--- a/src/freedreno/vulkan/tu_extensions.py
+++ b/src/freedreno/vulkan/tu_extensions.py
@@ -90,6 +90,7 @@
     Extension('VK_EXT_depth_clip_enable',                 1, True),
     Extension('VK_KHR_draw_indirect_count',               1, True),
     Extension('VK_EXT_4444_formats',                      1, True),
+    Extension('VK_EXT_conditional_rendering',             1, True),
 ]
 
 MAX_API_VERSION = VkVersion(MAX_API_VERSION)
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index b5f76e7..9cd15f7 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -368,7 +368,8 @@
    volatile uint32_t vsc_draw_overflow;
    uint32_t _pad1;
    volatile uint32_t vsc_prim_overflow;
-   uint32_t _pad2[3];
+   uint32_t _pad2;
+   uint64_t predicate;
 
    /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
    struct {
@@ -923,6 +924,8 @@
 
    bool xfb_used;
    bool has_tess;
+   bool has_subpass_predication;
+   bool predication_active;
 };
 
 struct tu_cmd_pool