intel/nir: Lower load_num_work_groups to 32-bit if needed

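For OpenCL-style kernels, this builtin is 64-bit.  The backend only
handles a 32-bit destination (now asserted in brw_fs_nir.cpp), so
convert it to 32-bit in brw_nir_lower_cs_intrinsics, as is already done
for load_local_group_size and load_work_group_id.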

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6570>
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 10fffa1..1e7418d 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3759,6 +3759,7 @@
    }
 
    case nir_intrinsic_load_num_work_groups: {
+      assert(nir_dest_bit_size(instr->dest) == 32);
       const unsigned surface =
          cs_prog_data->binding_table.work_groups_start;
 
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 401fbdd..2fc160a 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -55,6 +55,7 @@
       switch (intrinsic->intrinsic) {
       case nir_intrinsic_load_local_group_size:
       case nir_intrinsic_load_work_group_id:
+      case nir_intrinsic_load_num_work_groups:
          /* Convert this to 32-bit if it's not */
          if (intrinsic->dest.ssa.bit_size == 64) {
             intrinsic->dest.ssa.bit_size = 32;