panfrost: Avoid minimum stack allocations

If stack isn't used, don't allocate it - simple as that.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6373>
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 02e50cb..bd01dac 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -58,12 +58,20 @@
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
 
-        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
         struct mali_shared_memory shared = {
-                .stack_shift = shift,
-                .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
                 .shared_workgroup_count = ~0,
         };
+
+        if (batch->stack_size) {
+                struct panfrost_bo *stack =
+                        panfrost_batch_get_scratchpad(batch, batch->stack_size,
+                                        dev->thread_tls_alloc,
+                                        dev->core_count);
+
+                shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
+                shared.scratchpad = stack->gpu;
+        }
+
         postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index a016029..b1c6805 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -639,11 +639,11 @@
 
 struct panfrost_bo *
 panfrost_batch_get_scratchpad(struct panfrost_batch *batch,
-                unsigned shift,
+                unsigned size_per_thread,
                 unsigned thread_tls_alloc,
                 unsigned core_count)
 {
-        unsigned size = panfrost_get_total_stack_size(shift,
+        unsigned size = panfrost_get_total_stack_size(size_per_thread,
                         thread_tls_alloc,
                         core_count);
 
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index 00edd957..664d5da 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -174,7 +174,7 @@
 panfrost_batch_adjust_stack_size(struct panfrost_batch *batch);
 
 struct panfrost_bo *
-panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned shift, unsigned thread_tls_alloc, unsigned core_count);
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size, unsigned thread_tls_alloc, unsigned core_count);
 
 struct panfrost_bo *
 panfrost_batch_get_shared_memory(struct panfrost_batch *batch, unsigned size, unsigned workgroup_count);
diff --git a/src/gallium/drivers/panfrost/pan_mfbd.c b/src/gallium/drivers/panfrost/pan_mfbd.c
index 6cbf2f2..4ae7320 100644
--- a/src/gallium/drivers/panfrost/pan_mfbd.c
+++ b/src/gallium/drivers/panfrost/pan_mfbd.c
@@ -498,13 +498,16 @@
                 mfbd.msaa.sample_locations = panfrost_emit_sample_locations(batch);
                 mfbd.tiler_meta = panfrost_batch_get_tiler_meta(batch, vertex_count);
         } else {
-                unsigned shift = panfrost_get_stack_shift(batch->stack_size);
-                struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
-                                                                       shift,
-                                                                       dev->thread_tls_alloc,
-                                                                       dev->core_count);
-                mfbd.shared_memory.stack_shift = shift;
-                mfbd.shared_memory.scratchpad = bo->gpu;
+                if (batch->stack_size) {
+                        unsigned shift = panfrost_get_stack_shift(batch->stack_size);
+                        struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch,
+                                                                               batch->stack_size,
+                                                                               dev->thread_tls_alloc,
+                                                                               dev->core_count);
+                        mfbd.shared_memory.stack_shift = shift;
+                        mfbd.shared_memory.scratchpad = bo->gpu;
+                }
+
                 mfbd.shared_memory.shared_workgroup_count = ~0;
 
                 mfbd.tiler = panfrost_emit_midg_tiler(batch, vertex_count);
diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h
index a992a4c..0471701 100644
--- a/src/panfrost/lib/pan_encoder.h
+++ b/src/panfrost/lib/pan_encoder.h
@@ -74,7 +74,7 @@
 
 unsigned
 panfrost_get_total_stack_size(
-                unsigned stack_shift,
+                unsigned thread_size,
                 unsigned threads_per_core,
                 unsigned core_count);
 
diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c
index 478a788..47c98f3 100644
--- a/src/panfrost/lib/pan_scratch.c
+++ b/src/panfrost/lib/pan_scratch.c
@@ -25,6 +25,7 @@
  */
 
 #include "util/u_math.h"
+#include "util/macros.h"
 #include "pan_encoder.h"
 
 /* Midgard has a small register file, so shaders with high register pressure
@@ -93,17 +94,16 @@
                 return 0;
 }
 
-/* Computes the aligned stack size given the shift and thread count. The blob
- * reserves an extra page, and since this is hardware-internal, we do too. */
+/* Computes the aligned stack size given the per-thread size and thread count. */
 
 unsigned
 panfrost_get_total_stack_size(
-                unsigned stack_shift,
+                unsigned thread_size,
                 unsigned threads_per_core,
                 unsigned core_count)
 {
-        unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32);
-        unsigned size = size_per_thread * threads_per_core * core_count;
+        unsigned size_per_thread = (thread_size == 0) ? 0 :
+                util_next_power_of_two(ALIGN_POT(thread_size, 16));
 
-        return size + 4096;
+        return size_per_thread * threads_per_core * core_count;
 }