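tu: Implement clip/cull distances

Advertise shaderClipDistance and shaderCullDistance, run
nir_lower_clip_cull_distance_arrays when building the NIR, and program
the clip/cull masks and clip distance locations in the VPC, GRAS and PC
registers for the last geometry stage. If the fragment shader does not
read the clip/cull distances, add linkage entries for them so they still
get a VPC location. Also set ucp_enables in the shader key from the
shader's clip_distance_array_size.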

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6959>
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index a94e391..c1eb9e8 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -423,8 +423,8 @@
       .shaderStorageImageArrayDynamicIndexing = true,
       .shaderStorageImageReadWithoutFormat = true,
       .shaderStorageImageWriteWithoutFormat = true,
-      .shaderClipDistance = false,
-      .shaderCullDistance = false,
+      .shaderClipDistance = true,
+      .shaderCullDistance = true,
       .shaderFloat64 = false,
       .shaderInt64 = false,
       .shaderInt16 = false,
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index c1e6000..a855627 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -803,7 +803,11 @@
 
    const struct reg_config *cfg = &reg_config[last_shader->type];
 
-   struct ir3_shader_linkage linkage = { .primid_loc = 0xff };
+   struct ir3_shader_linkage linkage = {
+      .primid_loc = 0xff,
+      .clip0_loc = 0xff,
+      .clip1_loc = 0xff,
+   };
    if (fs)
       ir3_link_shaders(&linkage, last_shader, fs, true);
 
@@ -829,6 +833,10 @@
       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
    const uint32_t view_regid =
       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
+   const uint32_t clip0_regid =
+      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
+   const uint32_t clip1_regid =
+      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
    uint32_t primitive_regid = gs ?
       ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : regid(63, 0);
    uint32_t flags_regid = gs ?
@@ -865,6 +873,19 @@
       ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
    }
 
+   uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
+
+   /* Handle the case where clip/cull distances aren't read by the FS */
+   uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
+   if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
+      clip0_loc = linkage.max_loc;
+      ir3_link_add(&linkage, clip0_regid, clip_cull_mask & 0xf, linkage.max_loc);
+   }
+   if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
+      clip1_loc = linkage.max_loc;
+      ir3_link_add(&linkage, clip1_regid, clip_cull_mask >> 4, linkage.max_loc);
+   }
+
    tu6_setup_streamout(cs, last_shader, &linkage);
 
    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
@@ -902,17 +923,21 @@
                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
 
    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
-   tu_cs_emit(cs, 0xffff00);
+   tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
 
    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
-   tu_cs_emit(cs, 0);
+   tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
+                  A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
 
    tu_cs_emit_pkt4(cs, cfg->reg_pc_xs_out_cntl, 1);
    tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
                   CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
                   CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
                   CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
-                  CONDREG(primitive_regid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
+                  CONDREG(primitive_regid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
+                  A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
 
    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
@@ -2052,6 +2077,7 @@
 
    key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
    key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
+   key.ucp_enables = MASK(last_shader->ir3_shader->nir->info.clip_distance_array_size);
 
    pipeline->tess.patch_type = key.tessellation;
 
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index ad5c7c3..9b4b9f8 100644
--- a/src/freedreno/vulkan/tu_shader.c
+++ b/src/freedreno/vulkan/tu_shader.c
@@ -172,6 +172,8 @@
    NIR_PASS_V(nir, nir_lower_system_values);
    NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);
 
+   NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
+
    NIR_PASS_V(nir, nir_lower_frexp);
 
    ir3_optimize_loop(nir);