nvk: Emit NVK_MME_SCRATCH_DRAW_BEGIN as part of flush_ia_state()

We're always storing it in a scratch register for register pressure
reasons anyway.  We may as well just stash it there as a state reg and
we can avoid emitting it all over the place.  This reduces each draw
call to nvk_flush_gfx_state() followed by the actual draw, which is now
independent of any dynamic state.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30703>
diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c
index d2e68b3..ca3f506 100644
--- a/src/nouveau/vulkan/nvk_cmd_draw.c
+++ b/src/nouveau/vulkan/nvk_cmd_draw.c
@@ -1448,13 +1448,60 @@
    }
 }
 
+static uint32_t
+vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
+{
+   switch (prim) {
+   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+      return NV9097_BEGIN_OP_POINTS;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+      return NV9097_BEGIN_OP_LINES;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+      return NV9097_BEGIN_OP_LINE_STRIP;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch"
+   case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
+#pragma GCC diagnostic pop
+      return NV9097_BEGIN_OP_TRIANGLES;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+      return NV9097_BEGIN_OP_TRIANGLE_STRIP;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+      return NV9097_BEGIN_OP_TRIANGLE_FAN;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+      return NV9097_BEGIN_OP_LINELIST_ADJCY;
+   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+      return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+      return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
+   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+      return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
+   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
+      return NV9097_BEGIN_OP_PATCH;
+   default:
+      unreachable("Invalid primitive topology");
+   }
+}
+
 static void
 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
 {
    const struct vk_dynamic_graphics_state *dyn =
       &cmd->vk.dynamic_graphics_state;
 
-   /** Nothing to do for MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY */
+   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
+      uint32_t begin;
+      V_NV9097_BEGIN(begin, {
+         .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
+         .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
+         .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
+         .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
+      });
+
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
+      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
+      P_INLINE_DATA(p, begin);
+   }
 
    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
@@ -2734,41 +2781,6 @@
    }
 }
 
-static uint32_t
-vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
-{
-   switch (prim) {
-   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
-      return NV9097_BEGIN_OP_POINTS;
-   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
-      return NV9097_BEGIN_OP_LINES;
-   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
-      return NV9097_BEGIN_OP_LINE_STRIP;
-   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
-   case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
-#pragma GCC diagnostic pop
-      return NV9097_BEGIN_OP_TRIANGLES;
-   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
-      return NV9097_BEGIN_OP_TRIANGLE_STRIP;
-   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
-      return NV9097_BEGIN_OP_TRIANGLE_FAN;
-   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
-      return NV9097_BEGIN_OP_LINELIST_ADJCY;
-   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
-      return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
-   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
-      return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
-   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
-      return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
-   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
-      return NV9097_BEGIN_OP_PATCH;
-   default:
-      unreachable("Invalid primitive topology");
-   }
-}
-
 static void
 nvk_mme_set_cb0_mthd(struct mme_builder *b,
                      uint16_t cb0_offset,
@@ -2943,9 +2955,7 @@
 void
 nvk_mme_draw(struct mme_builder *b)
 {
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
    struct mme_value draw_index = mme_load(b);
-
    nvk_mme_build_draw(b, draw_index);
 }
 
@@ -2957,22 +2967,11 @@
             uint32_t firstInstance)
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
-   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
-   P_INLINE_DATA(p, begin);
    P_INLINE_DATA(p, 0 /* draw_index */);
    P_INLINE_DATA(p, vertexCount);
    P_INLINE_DATA(p, instanceCount);
@@ -2989,23 +2988,12 @@
                     uint32_t stride)
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
-      P_INLINE_DATA(p, begin);
       P_INLINE_DATA(p, draw_index);
       P_INLINE_DATA(p, pVertexInfo->vertexCount);
       P_INLINE_DATA(p, instanceCount);
@@ -3106,9 +3094,7 @@
 void
 nvk_mme_draw_indexed(struct mme_builder *b)
 {
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
    struct mme_value draw_index = mme_load(b);
-
    nvk_mme_build_draw_indexed(b, draw_index);
 }
 
@@ -3121,22 +3107,11 @@
                    uint32_t firstInstance)
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
-   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
-   P_INLINE_DATA(p, begin);
    P_INLINE_DATA(p, 0 /* draw_index */);
    P_INLINE_DATA(p, indexCount);
    P_INLINE_DATA(p, instanceCount);
@@ -3155,26 +3130,15 @@
                            const int32_t *pVertexOffset)
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
       const uint32_t vertex_offset =
          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
 
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
-      P_INLINE_DATA(p, begin);
       P_INLINE_DATA(p, draw_index);
       P_INLINE_DATA(p, pIndexInfo->indexCount);
       P_INLINE_DATA(p, instanceCount);
@@ -3189,8 +3153,6 @@
 void
 nvk_mme_draw_indirect(struct mme_builder *b)
 {
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
-
    if (b->devinfo->cls_eng3d >= TURING_A) {
       struct mme_value64 draw_addr = mme_load_addr64(b);
       struct mme_value draw_count = mme_load(b);
@@ -3236,8 +3198,6 @@
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    /* From the Vulkan 1.3.238 spec:
     *
@@ -3259,18 +3219,9 @@
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
-      P_INLINE_DATA(p, begin);
       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
       P_INLINE_DATA(p, draw_addr >> 32);
       P_INLINE_DATA(p, draw_addr);
@@ -3284,9 +3235,8 @@
       while (drawCount) {
          const uint32_t count = MIN2(drawCount, max_draws_per_push);
 
-         struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
+         struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
-         P_INLINE_DATA(p, begin);
          P_INLINE_DATA(p, count);
          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
 
@@ -3303,8 +3253,6 @@
 void
 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
 {
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
-
    if (b->devinfo->cls_eng3d >= TURING_A) {
       struct mme_value64 draw_addr = mme_load_addr64(b);
       struct mme_value draw_count = mme_load(b);
@@ -3350,8 +3298,6 @@
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    /* From the Vulkan 1.3.238 spec:
     *
@@ -3373,18 +3319,9 @@
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
-      P_INLINE_DATA(p, begin);
       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
       P_INLINE_DATA(p, draw_addr >> 32);
       P_INLINE_DATA(p, draw_addr);
@@ -3398,9 +3335,8 @@
       while (drawCount) {
          const uint32_t count = MIN2(drawCount, max_draws_per_push);
 
-         struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
+         struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
-         P_INLINE_DATA(p, begin);
          P_INLINE_DATA(p, count);
          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
 
@@ -3420,8 +3356,6 @@
    if (b->devinfo->cls_eng3d < TURING_A)
       return;
 
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
-
    struct mme_value64 draw_addr = mme_load_addr64(b);
    struct mme_value64 draw_count_addr = mme_load_addr64(b);
    struct mme_value draw_max = mme_load(b);
@@ -3460,25 +3394,13 @@
    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
 
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
-
    /* TODO: Indirect count draw pre-Turing */
    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
-   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
-   P_INLINE_DATA(p, begin);
    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
    P_INLINE_DATA(p, draw_addr >> 32);
    P_INLINE_DATA(p, draw_addr);
@@ -3496,8 +3418,6 @@
    if (b->devinfo->cls_eng3d < TURING_A)
       return;
 
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
-
    struct mme_value64 draw_addr = mme_load_addr64(b);
    struct mme_value64 draw_count_addr = mme_load_addr64(b);
    struct mme_value draw_max = mme_load(b);
@@ -3536,25 +3456,13 @@
    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
 
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
-
    /* TODO: Indexed indirect count draw pre-Turing */
    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
-   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
-   P_INLINE_DATA(p, begin);
    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
    P_INLINE_DATA(p, draw_addr >> 32);
    P_INLINE_DATA(p, draw_addr);
@@ -3592,8 +3500,6 @@
 void
 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
 {
-   nvk_mme_load_to_scratch(b, DRAW_BEGIN);
-
    struct mme_value instance_count = mme_load(b);
    struct mme_value first_instance = mme_load(b);
 
@@ -3653,40 +3559,28 @@
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
-   const struct vk_dynamic_graphics_state *dyn =
-      &cmd->vk.dynamic_graphics_state;
 
    nvk_flush_gfx_state(cmd);
 
-   uint32_t begin;
-   V_NV9097_BEGIN(begin, {
-      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
-      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
-      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
-      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
-   });
-
    uint64_t counter_addr = nvk_buffer_address(counter_buffer,
                                               counterBufferOffset);
 
    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
-      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
-      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
-
-      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
-      P_INLINE_DATA(p, begin);
-      P_INLINE_DATA(p, instanceCount);
-      P_INLINE_DATA(p, firstInstance);
-      P_INLINE_DATA(p, counter_addr >> 32);
-      P_INLINE_DATA(p, counter_addr);
-   } else {
       struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
 
       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
-      P_INLINE_DATA(p, begin);
+      P_INLINE_DATA(p, instanceCount);
+      P_INLINE_DATA(p, firstInstance);
+      P_INLINE_DATA(p, counter_addr >> 32);
+      P_INLINE_DATA(p, counter_addr);
+   } else {
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
+      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
+      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
+
+      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
       P_INLINE_DATA(p, instanceCount);
       P_INLINE_DATA(p, firstInstance);
       nv_push_update_count(p, 1);