freedreno/ir3: Enable the i/o vectorizer on UBOs.

This will merge loads of UBO components together into vec4 loads.  At the
same time, it improves the alignment information on our loads, recovering
most of the regression from the vec3 loads fix.

shader-db results:
total instructions in shared programs: 12829370 -> 8755851 (-31.75%)
total cat6 in shared programs: 145840 -> 97027 (-33.47%)

Overall results compared to before the vec3 fix:
total instructions in shared programs: 8019997 -> 8755851 (9.18%)
total cat6 in shared programs: 87683 -> 97027 (10.66%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6612>
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 64daa68..2628746 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -145,6 +145,36 @@
 	return &options;
 }
 
+static bool
+ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
+		unsigned bit_size,
+		unsigned num_components,
+		nir_intrinsic_instr *low,
+		nir_intrinsic_instr *high)
+{
+	assert(bit_size >= 8);
+	if (bit_size != 32) /* only 32-bit accesses are accepted */
+		return false;
+	unsigned byte_size = bit_size / 8;
+
+	int size = num_components * byte_size; /* total bytes accessed */
+
+	/* Don't care about alignment past vec4 (16 bytes), so clamp to that. */
+	assert(util_is_power_of_two_nonzero(align_mul));
+	align_mul = MIN2(align_mul, 16);
+	align_offset &= 15;
+
+	/* Our offset alignment should always be at least 4 bytes */
+	if (align_mul < 4)
+		return false;
+
+	unsigned worst_start_offset = 16 - align_mul + align_offset;
+	if (worst_start_offset + size > 16) /* could run past 16 bytes */
+		return false;
+
+	return true;
+}
+
 #define OPT(nir, pass, ...) ({                             \
    bool this_progress = false;                             \
    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
@@ -188,6 +218,9 @@
 		progress |= OPT(s, nir_lower_pack);
 		progress |= OPT(s, nir_opt_constant_folding);
 
+		progress |= OPT(s, nir_opt_load_store_vectorize, nir_var_mem_ubo,
+				ir3_nir_should_vectorize_mem, 0);
+
 		if (lower_flrp != 0) {
 			if (OPT(s, nir_lower_flrp,
 					lower_flrp,