pan/bi: Add load_output support

This is mapped to the LD_TILE instruction. Note that multi-sample RTs
are not supported yet.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7151>
diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c
index f6e18fc..86b7efc 100644
--- a/src/panfrost/bifrost/bi_pack.c
+++ b/src/panfrost/bifrost/bi_pack.c
@@ -831,6 +831,8 @@
                 }
         case BI_LOAD_VAR_ADDRESS:
                 return pan_pack_add_lea_attr_imm(clause, bundle.add, regs);
+        case BI_LOAD_TILE:
+                return pan_pack_add_ld_tile(clause, bundle.add, regs);
         case BI_MINMAX:
                 if (bundle.add->op.minmax == BI_MINMAX_MIN) {
                         if (bundle.add->dest_type == nir_type_float32)
diff --git a/src/panfrost/bifrost/bi_print.c b/src/panfrost/bifrost/bi_print.c
index f95295c..a677c3b 100644
--- a/src/panfrost/bifrost/bi_print.c
+++ b/src/panfrost/bifrost/bi_print.c
@@ -62,6 +62,7 @@
         case BI_LOAD_ATTR: return "load_attr";
         case BI_LOAD_VAR: return "load_var";
         case BI_LOAD_VAR_ADDRESS: return "load_var_address";
+        case BI_LOAD_TILE: return "load_tile";
         case BI_MINMAX: return "minmax";
         case BI_MOV: return "mov";
         case BI_SELECT: return "select";
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index 3fd1dfb..4a05527 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -77,6 +77,9 @@
         case BI_BLEND:
                 return BIFROST_MESSAGE_BLEND;
 
+        case BI_LOAD_TILE:
+                return BIFROST_MESSAGE_TILE;
+
         case BI_ATEST:
                 return BIFROST_MESSAGE_ATEST;
 
diff --git a/src/panfrost/bifrost/bi_tables.c b/src/panfrost/bifrost/bi_tables.c
index 90862c8e..19352e5 100644
--- a/src/panfrost/bifrost/bi_tables.c
+++ b/src/panfrost/bifrost/bi_tables.c
@@ -45,6 +45,7 @@
         [BI_LOAD_ATTR] 		= BI_SCHED_HI_LATENCY | BI_SCHED_ADD | BI_VECTOR | BI_DATA_REG_DEST,
         [BI_LOAD_VAR] 		= BI_SCHED_HI_LATENCY | BI_SCHED_ADD | BI_VECTOR | BI_DATA_REG_DEST,
         [BI_LOAD_VAR_ADDRESS] 	= BI_SCHED_HI_LATENCY | BI_SCHED_ADD | BI_VECTOR | BI_DATA_REG_DEST,
+        [BI_LOAD_TILE]		= BI_SCHED_HI_LATENCY | BI_SCHED_ADD | BI_VECTOR | BI_DATA_REG_DEST,
         [BI_MINMAX] 		= BI_SCHED_ADD | BI_NO_ABS_ABS_FP16_FMA | BI_MODS,
         [BI_MOV] 		= BI_SCHED_ALL,
         [BI_FMOV]               = BI_MODS | BI_SCHED_ALL,
diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h
index 55148c8..6d03a58 100644
--- a/src/panfrost/bifrost/bifrost.h
+++ b/src/panfrost/bifrost/bifrost.h
@@ -540,4 +540,15 @@
         unsigned mask : 4;
 } __attribute__((packed));
 
+#define BIFROST_MEGA_SAMPLE 128 /* NOTE(review): presumably a special .sample value -- confirm */
+#define BIFROST_ALL_SAMPLES 255 /* NOTE(review): presumably a special .sample value -- confirm */
+#define BIFROST_CURRENT_PIXEL 255 /* used as the .y index when loading the current pixel */
+
+struct bifrost_pixel_indices { /* four 8-bit indices, packed */
+        unsigned sample : 8;
+        unsigned rt : 8;
+        unsigned x : 8;
+        unsigned y : 8;
+} __attribute__((packed));
+
 #endif
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 3e3df21..b8191d9 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -105,6 +105,46 @@
 }
 
 static void
+bi_emit_ld_output(bi_context *ctx, nir_intrinsic_instr *instr)
+{
+        assert(ctx->is_blend); /* Emits a BI_LOAD_TILE for load_output; blend shaders only */
+
+        bi_instruction ins = {
+                .type = BI_LOAD_TILE,
+                .vector_channels = instr->num_components,
+                .dest = pan_dest_index(&instr->dest),
+                .dest_type = nir_type_float16,
+                .src = {
+                        /* PixelIndices: constant source, filled in from `pix` below */
+                        BIR_INDEX_CONSTANT,
+                        /* PixelCoverage: we simply pass r60, which contains the
+                         * cumulative coverage bitmap.
+                         */
+                        BIR_INDEX_REGISTER | 60,
+                        /* InternalConversionDescriptor (see src/panfrost/lib/midgard.xml).
+                         * Presumably the upper 32 bits of the constant, set from blend_desc below.
+                         */
+                        BIR_INDEX_CONSTANT | 32
+                },
+                .src_types = { nir_type_uint32, nir_type_uint32, nir_type_uint32 },
+        };
+
+        /* Load the current pixel: only .y is set (to BIFROST_CURRENT_PIXEL), the
+         * sample, rt and x indices stay zero. FIXME: the sample to load is thus
+         * hardcoded to 0. This should be addressed for multi-sample FBs.
+         */
+        struct bifrost_pixel_indices pix = {
+                .y = BIFROST_CURRENT_PIXEL,
+        };
+        memcpy(&ins.constant.u64, &pix, sizeof(pix));
+
+        /* Only keep the conversion part (upper 32 bits) of the blend descriptor. */
+        ins.constant.u64 |= ctx->blend_desc & 0xffffffff00000000ULL;
+
+        bi_emit(ctx, ins);
+}
+
+static void
 bi_emit_ld_vary(bi_context *ctx, nir_intrinsic_instr *instr)
 {
         bi_instruction ins = bi_load(BI_LOAD_VAR, instr);
@@ -488,6 +528,10 @@
                 bi_emit_sysval(ctx, &instr->instr, 1, 8);
                 break;
 
+        case nir_intrinsic_load_output:
+                bi_emit_ld_output(ctx, instr);
+                break;
+
         case nir_intrinsic_load_viewport_scale:
         case nir_intrinsic_load_viewport_offset:
         case nir_intrinsic_load_num_work_groups:
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 350a0d4..7a2a1fd 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -66,6 +66,7 @@
         BI_LOAD_ATTR,
         BI_LOAD_VAR,
         BI_LOAD_VAR_ADDRESS,
+        BI_LOAD_TILE,
         BI_MINMAX,
         BI_MOV,
         BI_REDUCE_FMA,