radeonsi: remove compute-based DCC decompression because it's broken
The new blit test discovered that the compute-based DCC decompression doesn't always work.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17864>
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 26511dc..59c0cda 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -464,7 +464,6 @@
first_level, last_level, level_mask);
if (need_dcc_decompress) {
- assert(sctx->gfx_level == GFX8 || tex->buffer.b.b.nr_storage_samples >= 2);
custom_blend = sctx->custom_blend_dcc_decompress;
assert(vi_dcc_enabled(tex, first_level));
@@ -971,7 +970,7 @@
si_can_use_compute_blit(sctx, src->format, src->nr_samples, false,
vi_dcc_enabled(ssrc, src_level)))) {
si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz,
- src_box, false, SI_OP_SYNC_BEFORE_AFTER);
+ src_box, SI_OP_SYNC_BEFORE_AFTER);
return;
}
@@ -1247,7 +1246,7 @@
if (sscreen->async_compute_context) {
si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context,
info->dst.resource, 0, info->src.resource, 0, 0, 0, 0,
- &info->src.box, false, 0);
+ &info->src.box, 0);
si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL);
simple_mtx_unlock(&sscreen->async_compute_context_lock);
return;
@@ -1354,53 +1353,11 @@
/* If graphics is disabled, we can't decompress DCC, but it shouldn't
* be compressed either. The caller should simply discard it.
*/
- if (!tex->surface.meta_offset || !sctx->has_graphics || sctx->in_dcc_decompress)
+ if (!tex->surface.meta_offset || !sctx->has_graphics)
return;
- sctx->in_dcc_decompress = true;
-
- if (sctx->gfx_level == GFX8 || tex->buffer.b.b.nr_storage_samples >= 2) {
- si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0,
- util_max_layer(&tex->buffer.b.b, 0), true, false);
- } else {
- struct pipe_resource *ptex = &tex->buffer.b.b;
- assert(ptex->nr_storage_samples <= 1);
-
- /* DCC decompression using a compute shader. */
- for (unsigned level = 0; level < tex->surface.num_meta_levels; level++) {
- struct pipe_box box;
-
- u_box_3d(0, 0, 0, u_minify(ptex->width0, level),
- u_minify(ptex->height0, level),
- util_num_layers(ptex, level), &box);
- si_compute_copy_image(sctx, ptex, level, ptex, level, 0, 0, 0, &box, true,
- /* Sync before the first copy and after the last copy */
- (level == 0 ? SI_OP_SYNC_BEFORE : 0) |
- (level == tex->surface.num_meta_levels - 1 ? SI_OP_SYNC_AFTER : 0));
- }
-
- /* Now clear DCC metadata to uncompressed.
- *
- * This uses SI_COMPUTE_CLEAR_METHOD to avoid a failure when running this
- * deqp caselist on gfx10:
- * dEQP-GLES31.functional.image_load_store.2d.format_reinterpret.rgba32f_rgba32ui
- * dEQP-GLES31.functional.image_load_store.2d.format_reinterpret.rgba32f_rgba32i
- */
- uint32_t clear_value = DCC_UNCOMPRESSED;
- si_clear_buffer(sctx, ptex, tex->surface.meta_offset,
- tex->surface.meta_size, &clear_value, 4, SI_OP_SYNC_AFTER,
- SI_COHERENCY_CB_META, SI_COMPUTE_CLEAR_METHOD);
- si_mark_display_dcc_dirty(sctx, tex);
-
- /* Clearing DCC metadata requires flushing L2 and invalidating L2 metadata to make
- * the metadata visible to L2 caches. This is because clear_buffer uses plain stores
- * that can go to different L2 channels than where L2 metadata caches expect them.
- * This is not done for fast clears because plain stores are visible to CB/DB. Only
- * L2 metadata caches have the problem.
- */
- sctx->flags |= SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA;
- }
- sctx->in_dcc_decompress = false;
+ si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level, 0,
+ util_max_layer(&tex->buffer.b.b, 0), true, false);
}
void si_init_blit_functions(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 95fe540d..ecc83e9 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -543,9 +543,8 @@
void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
struct pipe_resource *src, unsigned src_level, unsigned dstx,
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
- bool is_dcc_decompress, unsigned flags)
+ unsigned flags)
{
- struct pipe_context *ctx = &sctx->b;
struct si_texture *ssrc = (struct si_texture*)src;
struct si_texture *sdst = (struct si_texture*)dst;
enum pipe_format src_format = util_format_linear(src->format);
@@ -652,75 +651,44 @@
image[1].u.tex.first_layer = 0;
image[1].u.tex.last_layer = util_max_layer(dst, dst_level);
- if (is_dcc_decompress)
- image[1].access |= SI_IMAGE_ACCESS_DCC_OFF;
-
struct pipe_grid_info info = {0};
- if (is_dcc_decompress) {
- /* The DCC decompression is a normal blit where the load is compressed
- * and the store is uncompressed. The workgroup size is either equal to
- * the DCC block size or a multiple thereof. The shader uses a barrier
- * between loads and stores to safely overwrite each DCC block of pixels.
- */
- assert(src == dst);
- assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);
+ bool dst_is_1d = dst->target == PIPE_TEXTURE_1D ||
+ dst->target == PIPE_TEXTURE_1D_ARRAY;
+ bool src_is_1d = src->target == PIPE_TEXTURE_1D ||
+ src->target == PIPE_TEXTURE_1D_ARRAY;
+ int block_x, block_y;
+ int block_z = 1;
- if (!sctx->cs_dcc_decompress)
- sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
-
- unsigned block_x = ssrc->surface.u.gfx9.color.dcc_block_width;
- unsigned block_y = ssrc->surface.u.gfx9.color.dcc_block_height;
- unsigned block_z = ssrc->surface.u.gfx9.color.dcc_block_depth;
-
- unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);;
-
- /* Make sure the block size is at least the same as wave size. */
- while (block_x * block_y * block_z < default_wave_size) {
- block_x *= 2;
- }
-
- set_work_size(&info, block_x, block_y, block_z, src_box->width, src_box->height, src_box->depth);
-
- si_launch_grid_internal_images(sctx, image, 2, &info, sctx->cs_dcc_decompress, flags);
+ /* Choose the block dimensions based on the copy area size. */
+ if (src_box->height <= 4) {
+ block_y = util_next_power_of_two(src_box->height);
+ block_x = 64 / block_y;
+ } else if (src_box->width <= 4) {
+ block_x = util_next_power_of_two(src_box->width);
+ block_y = 64 / block_x;
+ } else if (is_linear) {
+ block_x = 64;
+ block_y = 1;
} else {
- bool dst_is_1d = dst->target == PIPE_TEXTURE_1D ||
- dst->target == PIPE_TEXTURE_1D_ARRAY;
- bool src_is_1d = src->target == PIPE_TEXTURE_1D ||
- src->target == PIPE_TEXTURE_1D_ARRAY;
- int block_x, block_y;
- int block_z = 1;
-
- /* Choose the block dimensions based on the copy area size. */
- if (src_box->height <= 4) {
- block_y = util_next_power_of_two(src_box->height);
- block_x = 64 / block_y;
- } else if (src_box->width <= 4) {
- block_x = util_next_power_of_two(src_box->width);
- block_y = 64 / block_x;
- } else if (is_linear) {
- block_x = 64;
- block_y = 1;
- } else {
- block_x = 8;
- block_y = 8;
- }
-
- sctx->cs_user_data[0] = src_box->x | (dstx << 16);
- sctx->cs_user_data[1] = src_box->y | (dsty << 16);
- sctx->cs_user_data[2] = src_box->z | (dstz << 16);
-
- set_work_size(&info, block_x, block_y, block_z,
- src_box->width, src_box->height, src_box->depth);
-
- void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d];
- if (!*copy_image_cs_ptr)
- *copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d);
-
- assert(*copy_image_cs_ptr);
-
- si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags);
+ block_x = 8;
+ block_y = 8;
}
+
+ sctx->cs_user_data[0] = src_box->x | (dstx << 16);
+ sctx->cs_user_data[1] = src_box->y | (dsty << 16);
+ sctx->cs_user_data[2] = src_box->z | (dstz << 16);
+
+ set_work_size(&info, block_x, block_y, block_z,
+ src_box->width, src_box->height, src_box->depth);
+
+ void **copy_image_cs_ptr = &sctx->cs_copy_image[src_is_1d][dst_is_1d];
+ if (!*copy_image_cs_ptr)
+ *copy_image_cs_ptr = si_create_copy_image_cs(sctx, src_is_1d, dst_is_1d);
+
+ assert(*copy_image_cs_ptr);
+
+ si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags);
}
void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f82f8d9..2cbaa9b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -273,8 +273,6 @@
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target_1d_array);
if (sctx->cs_clear_12bytes_buffer)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
- if (sctx->cs_dcc_decompress)
- sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress);
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) {
if (sctx->cs_dcc_retile[i])
sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 5ddc271..f833c0b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -984,7 +984,6 @@
void *cs_clear_render_target;
void *cs_clear_render_target_1d_array;
void *cs_clear_12bytes_buffer;
- void *cs_dcc_decompress;
void *cs_dcc_retile[32];
void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
struct si_screen *screen;
@@ -998,7 +997,6 @@
bool blitter_running;
bool in_update_ps_colorbuf0_slot;
- bool in_dcc_decompress;
bool is_noop:1;
bool has_graphics:1;
bool gfx_flush_in_progress : 1;
@@ -1428,7 +1426,7 @@
void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
struct pipe_resource *src, unsigned src_level, unsigned dstx,
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
- bool is_dcc_decompress, unsigned flags);
+ unsigned flags);
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
const union pipe_color_union *color, unsigned dstx,
unsigned dsty, unsigned width, unsigned height,
@@ -1554,7 +1552,6 @@
void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
bool dst_stream_cache_policy, bool is_copy);
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
-void *si_create_dcc_decompress_cs(struct pipe_context *ctx);
void *si_clear_render_target_shader(struct pipe_context *ctx);
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 0cc5348..821def5 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -402,45 +402,6 @@
return sctx->b.create_compute_state(&sctx->b, &state);
}
-/* Create a compute shader implementing DCC decompression via a blit.
- * This is a trivial copy_image shader except that it has a variable block
- * size and a barrier.
- */
-void *si_create_dcc_decompress_cs(struct pipe_context *ctx)
-{
- static const char text[] =
- "COMP\n"
- "DCL SV[0], THREAD_ID\n"
- "DCL SV[1], BLOCK_ID\n"
- "DCL SV[2], BLOCK_SIZE\n"
- "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
- "DCL TEMP[0..1]\n"
-
- "UMAD TEMP[0].xyz, SV[1].xyzz, SV[2].xyzz, SV[0].xyzz\n"
- "LOAD TEMP[1], IMAGE[0], TEMP[0].xyzz, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- /* Wait for the whole threadgroup (= DCC block) to load texels before
- * overwriting them, because overwriting any pixel within a DCC block
- * can break compression for the whole block.
- */
- "BARRIER\n"
- "STORE IMAGE[1], TEMP[0].xyzz, TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
- "END\n";
-
- struct tgsi_token tokens[1024];
- struct pipe_compute_state state = {0};
-
- if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
- assert(false);
- return NULL;
- }
-
- state.ir_type = PIPE_SHADER_IR_TGSI;
- state.prog = tokens;
-
- return ctx->create_compute_state(ctx, &state);
-}
-
void *si_clear_render_target_shader(struct pipe_context *ctx)
{
static const char text[] =