panfrost: XML-ify the local storage descriptor

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6797>
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 5a872ae..ddeb4f5 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -57,21 +57,25 @@
 {
         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
 
-        struct mali_shared_memory shared = {
-                .shared_workgroup_count = ~0,
-        };
+        struct panfrost_transfer t =
+                panfrost_pool_alloc_aligned(&batch->pool,
+                                            MALI_LOCAL_STORAGE_LENGTH,
+                                            64);
 
-        if (batch->stack_size) {
-                struct panfrost_bo *stack =
-                        panfrost_batch_get_scratchpad(batch, batch->stack_size,
-                                        dev->thread_tls_alloc,
-                                        dev->core_count);
+        pan_pack(t.cpu, LOCAL_STORAGE, ls) {
+                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
+                if (batch->stack_size) {
+                        struct panfrost_bo *stack =
+                                panfrost_batch_get_scratchpad(batch, batch->stack_size,
+                                                              dev->thread_tls_alloc,
+                                                              dev->core_count);
 
-                shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
-                shared.scratchpad = stack->gpu;
+                        ls.tls_size = panfrost_get_stack_shift(batch->stack_size);
+                        ls.tls_base_pointer = stack->gpu;
+                }
         }
 
-        return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
+        return t.gpu;
 }
 
 void
@@ -950,15 +954,18 @@
         struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
                                                                   shared_size,
                                                                   1);
+        struct panfrost_transfer t =
+                panfrost_pool_alloc_aligned(&batch->pool,
+                                            MALI_LOCAL_STORAGE_LENGTH,
+                                            64);
 
-        struct mali_shared_memory shared = {
-                .shared_memory = bo->gpu,
-                .shared_workgroup_count = log2_instances,
-                .shared_shift = util_logbase2(single_size) + 1
+        pan_pack(t.cpu, LOCAL_STORAGE, ls) { /* compute path: describe the workgroup-shared memory BO */
+                ls.wls_base_pointer = bo->gpu;
+                ls.wls_instances = 1u << log2_instances; /* field has modifier="log2": pass the count, not its log2 */
+                ls.wls_size_scale = util_logbase2(single_size) + 1;
-        };
+        }
 
-        return panfrost_pool_upload_aligned(&batch->pool, &shared,
-                        sizeof(shared), 64);
+        return t.gpu;
 }
 
 static mali_ptr
diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c
index deccd3a..fad3f02 100644
--- a/src/gallium/drivers/panfrost/pan_mfbd.c
+++ b/src/gallium/drivers/panfrost/pan_mfbd.c
@@ -498,18 +498,24 @@
                 mfbd.msaa.sample_locations = panfrost_emit_sample_locations(batch);
                 mfbd.tiler_meta = panfrost_batch_get_tiler_meta(batch, vertex_count);
         } else {
-                if (batch->stack_size) {
-                        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
-                        struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
-                                                                               batch->stack_size,
-                                                                               dev->thread_tls_alloc,
-                                                                               dev->core_count);
-                        mfbd.shared_memory.stack_shift = shift;
-                        mfbd.shared_memory.scratchpad = bo->gpu;
+                struct mali_local_storage_packed lsp;
+
+                pan_pack(&lsp, LOCAL_STORAGE, ls) {
+                        if (batch->stack_size) {
+                                unsigned shift =
+                                        panfrost_get_stack_shift(batch->stack_size);
+                                struct panfrost_bo *bo =
+                                        panfrost_batch_get_scratchpad(batch,
+                                                                      batch->stack_size,
+                                                                      dev->thread_tls_alloc,
+                                                                      dev->core_count);
+                                ls.tls_size = shift;
+                                ls.tls_base_pointer = bo->gpu;
+                        }
+
+                        ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
                 }
-
-                mfbd.shared_memory.shared_workgroup_count = ~0;
-
+                mfbd.shared_memory = lsp;
                 mfbd.tiler = panfrost_emit_midg_tiler(batch, vertex_count);
         }
 
diff --git a/src/gallium/drivers/panfrost/pan_sfbd.c b/src/gallium/drivers/panfrost/pan_sfbd.c
index d9173ba..ce68740 100644
--- a/src/gallium/drivers/panfrost/pan_sfbd.c
+++ b/src/gallium/drivers/panfrost/pan_sfbd.c
@@ -207,11 +207,6 @@
         struct mali_single_framebuffer framebuffer = {
                 .width = MALI_POSITIVE(width),
                 .height = MALI_POSITIVE(height),
-                .shared_memory = {
-                        .stack_shift = shift,
-                        .shared_workgroup_count = ~0,
-                        .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
-                },
                 .format = {
                         .unk3 = 0x3,
                 },
@@ -219,6 +214,18 @@
                 .tiler = panfrost_emit_midg_tiler(batch, vertex_count),
         };
 
+        struct mali_local_storage_packed lsp;
+        pan_pack(&lsp, LOCAL_STORAGE, ls) {
+                ls.tls_size = shift;
+                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
+                ls.tls_base_pointer =
+                        panfrost_batch_get_scratchpad(batch,
+                                                      shift,
+                                                      dev->thread_tls_alloc,
+                                                      dev->core_count)->gpu;
+        }
+        framebuffer.shared_memory = lsp;
+
         return framebuffer;
 }
 
diff --git a/src/panfrost/bifrost/test/bi_submit.c b/src/panfrost/bifrost/test/bi_submit.c
index c953e87..408171f 100644
--- a/src/panfrost/bifrost/test/bi_submit.c
+++ b/src/panfrost/bifrost/test/bi_submit.c
@@ -169,11 +169,10 @@
                 memcpy(attr->cpu + 1024, iattr, sz_attr);
 
         struct panfrost_bo *shmem = bit_bo_create(dev, 4096);
-        struct mali_shared_memory shmemp = {
-                .shared_workgroup_count = 0x1f,
-        };
 
-        memcpy(shmem->cpu, &shmemp, sizeof(shmemp));
+        pan_pack(shmem->cpu, LOCAL_STORAGE, cfg) {
+                cfg.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
+        }
 
         pan_pack(shader_desc->cpu, STATE, cfg) {
                 cfg.shader.shader = shader->gpu;
diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h
index 634f62b..5be6253 100644
--- a/src/panfrost/include/panfrost-job.h
+++ b/src/panfrost/include/panfrost-job.h
@@ -630,37 +630,6 @@
         unsigned unk3 : 4;
 };
 
-/* Shared structure at the start of framebuffer descriptors, or used bare for
- * compute jobs, configuring stack and shared memory */
-
-struct mali_shared_memory {
-        u32 stack_shift : 4;
-        u32 unk0 : 28;
-
-        /* Configuration for shared memory for compute shaders.
-         * shared_workgroup_count is logarithmic and may be computed for a
-         * compute shader using shared memory as:
-         *
-         *  shared_workgroup_count = MAX2(ceil(log2(count_x)) + ... + ceil(log2(count_z), 10)
-         *
-         * For compute shaders that don't use shared memory, or non-compute
-         * shaders, this is set to ~0
-         */
-
-        u32 shared_workgroup_count : 5;
-        u32 shared_unk1 : 3;
-        u32 shared_shift : 4;
-        u32 shared_zero : 20;
-
-        mali_ptr scratchpad;
-
-        /* For compute shaders, the RAM backing of workgroup-shared memory. For
-         * fragment shaders on Bifrost, apparently multisampling locations */
-
-        mali_ptr shared_memory;
-        mali_ptr unknown1;
-} __attribute__((packed));
-
 /* Configures multisampling on Bifrost fragment jobs */
 
 struct bifrost_multisampling {
@@ -671,7 +640,7 @@
 } __attribute__((packed));
 
 struct mali_single_framebuffer {
-        struct mali_shared_memory shared_memory;
+        struct mali_local_storage_packed shared_memory;
         struct mali_sfbd_format format;
 
         u32 clear_flags;
@@ -876,7 +845,7 @@
 
 struct mali_framebuffer {
         union {
-                struct mali_shared_memory shared_memory;
+                struct mali_local_storage_packed shared_memory;
                 struct bifrost_multisampling msaa;
         };
 
diff --git a/src/panfrost/lib/decode.c b/src/panfrost/lib/decode.c
index 12f9f8e..463a779 100644
--- a/src/panfrost/lib/decode.c
+++ b/src/panfrost/lib/decode.c
@@ -446,41 +446,6 @@
         pandecode_log("},\n");
 }
 
-static void
-pandecode_shared_memory(const struct mali_shared_memory *desc, bool is_compute)
-{
-        pandecode_prop("stack_shift = 0x%x", desc->stack_shift);
-
-        if (desc->unk0)
-                pandecode_prop("unk0 = 0x%x", desc->unk0);
-
-        if (desc->shared_workgroup_count != 0x1F) {
-                pandecode_prop("shared_workgroup_count = %d", desc->shared_workgroup_count);
-                if (!is_compute)
-                        pandecode_msg("XXX: wrong workgroup count for noncompute\n");
-        }
-
-        if (desc->shared_unk1 || desc->shared_shift) {
-                pandecode_prop("shared_unk1 = %X", desc->shared_unk1);
-                pandecode_prop("shared_shift = %X", desc->shared_shift);
-
-                if (!is_compute)
-                        pandecode_msg("XXX: shared memory configured in noncompute shader");
-        }
-
-        if (desc->shared_zero) {
-                pandecode_msg("XXX: shared memory zero tripped\n");
-                pandecode_prop("shared_zero = 0x%" PRIx32, desc->shared_zero);
-        }
-
-        if (desc->shared_memory && !is_compute)
-                pandecode_msg("XXX: shared memory used in noncompute shader\n");
-
-        MEMORY_PROP(desc, scratchpad);
-        MEMORY_PROP(desc, shared_memory);
-        MEMORY_PROP(desc, unknown1);
-}
-
 static struct pandecode_fbd
 pandecode_sfbd(uint64_t gpu_va, int job_no, bool is_fragment, unsigned gpu_id)
 {
@@ -494,13 +459,7 @@
 
         pandecode_log("struct mali_single_framebuffer framebuffer_%"PRIx64"_%d = {\n", gpu_va, job_no);
         pandecode_indent++;
-
-        pandecode_log(".shared_memory = {\n");
-        pandecode_indent++;
-        pandecode_shared_memory(&s->shared_memory, false);
-        pandecode_indent--;
-        pandecode_log("},\n");
-
+        DUMP_CL(LOCAL_STORAGE, &s->shared_memory, "Local Storage:\n");
         pandecode_sfbd_format(s->format);
 
         info.width = s->width + 1;
@@ -599,13 +558,8 @@
 pandecode_compute_fbd(uint64_t gpu_va, int job_no)
 {
         struct pandecode_mapped_memory *mem = pandecode_find_mapped_gpu_mem_containing(gpu_va);
-        const struct mali_shared_memory *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
-
-        pandecode_log("struct mali_shared_memory shared_%"PRIx64"_%d = {\n", gpu_va, job_no);
-        pandecode_indent++;
-        pandecode_shared_memory(s, true);
-        pandecode_indent--;
-        pandecode_log("},\n");
+        const struct mali_local_storage_packed *PANDECODE_PTR_VAR(s, mem, (mali_ptr) gpu_va);
+        DUMP_CL(LOCAL_STORAGE, s, "Local Storage:\n");
 }
 
 /* Extracts the number of components associated with a Mali format */
@@ -872,11 +826,8 @@
                 pandecode_indent--;
                 pandecode_log("},\n");
         } else {
-                pandecode_log(".shared_memory = {\n");
-                pandecode_indent++;
-                pandecode_shared_memory(&fb->shared_memory, is_compute);
-                pandecode_indent--;
-                pandecode_log("},\n");
+                struct mali_local_storage_packed ls = fb->shared_memory;
+                DUMP_CL(LOCAL_STORAGE, &ls, "Local Storage:\n");
         }
 
         info.width = fb->width1 + 1;
diff --git a/src/panfrost/lib/midgard.xml b/src/panfrost/lib/midgard.xml
index 6fb896d..e27f880 100644
--- a/src/panfrost/lib/midgard.xml
+++ b/src/panfrost/lib/midgard.xml
@@ -561,4 +561,15 @@
     <field name="Scissor Maximum Y" size="16" start="7:16" type="uint"/>
   </struct>
 
+  <struct name="Local Storage" size="8"> <!-- Stack (TLS) and workgroup/shared (WLS) memory config; first member of the SFBD/MFBD structs, also emitted standalone by pan_cmdstream.c -->
+    <field name="TLS Size" size="5" start="0:0" type="uint"/> <!-- cmdstream/MFBD paths write panfrost_get_stack_shift(stack_size) here -->
+    <field name="TLS Initial Stack Pointer Offset" size="27" start="0:5" type="uint"/>
+    <field name="WLS Instances" size="5" start="1:0" type="uint" modifier="log2" prefix="MALI_LOCAL_STORAGE"> <!-- NOTE(review): presumably stored as log2 of the supplied value; confirm in gen_pack.py -->
+      <value name="No Workgroup Mem" value="0x80000000"/> <!-- log2(0x80000000) = 31 = 0x1f, i.e. all five bits set; used by every non-compute path -->
+    </field>
+    <field name="WLS Size Base" size="2" start="1:5" type="uint"/>
+    <field name="WLS Size Scale" size="5" start="1:8" type="uint"/> <!-- compute path writes util_logbase2(single_size) + 1 -->
+    <field name="TLS Base Pointer" size="64" start="2:0" type="address"/> <!-- GPU address of the scratchpad BO when a stack is needed -->
+    <field name="WLS Base Pointer" size="64" start="4:0" type="address"/> <!-- GPU address of the shared-memory BO (compute path) -->
+  </struct>
 </panxml>