src/gallium/drivers/radeonsi/si_test_dma_perf.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright 2018 Advanced Micro Devices, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  */

 /* This file implements a benchmark of buffer clears and copies done with
  * CP DMA, SDMA and compute shaders. */

 #include "si_pipe.h"
 #include "si_query.h"

 /* Smallest and largest buffer size that is benchmarked, in bytes. */
 #define MIN_SIZE   512
 #define MAX_SIZE   (128 * 1024 * 1024)
 /* The benchmarked size is shifted left by this amount after every step. */
 #define SIZE_SHIFT 1
 /* Number of timed runs (one query each) per method, placement and size. */
 #define NUM_RUNS   128

 /* Convert a transfer of num_bytes bytes that took ns nanoseconds into a rate
  * in MB/s, where 1 MB is 1024 * 1024 bytes.
  */
 static double get_MBps_rate(unsigned num_bytes, unsigned ns)
 {
    const double megabytes = num_bytes / (1024.0 * 1024.0);
    const double seconds = ns / 1000000000.0;

    return megabytes / seconds;
 }

 /**
  * Benchmark buffer clears and copies and print the results to stdout.
  *
  * Every method (CP DMA with each cache policy, SDMA, and compute shaders with
  * each dwords-per-thread / cache policy / waves-per-SH combination) is timed
  * for every entry of placement_str and for every size from MIN_SIZE to
  * MAX_SIZE. The output consists of:
  *  - a CSV table with the measured rate in MB/s, and
  *  - C source text for get_best_clear_for_<name>() and
  *    get_best_copy_for_<name>() functions that return the fastest measured
  *    method for each placement, mode (async, cached, uncached) and size range.
  *
  * \param sscreen  screen on which the benchmark context is created
  *
  * This function never returns; it terminates the process with exit().
  */
 void si_test_dma_perf(struct si_screen *sscreen)
 {
    struct pipe_screen *screen = &sscreen->b;
    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);

    /* Everything below dereferences the context, so bail out if it could not
     * be created.
     */
    if (!ctx) {
       fprintf(stderr, "si_test_dma_perf: failed to create a context\n");
       exit(1);
    }

    struct si_context *sctx = (struct si_context *)ctx;
    const uint32_t clear_value = 0x12345678;
    static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
    static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};

 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
 /* 3 CP DMA cache policies + SDMA + 3 cache policies per compute variant. */
 #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

    static const char *method_str[] = {
       "CP MC   ",
       "CP L2   ",
       "CP L2   ",
       "SDMA    ",
    };
    static const char *placement_str[] = {
       /* Clear */
       "fill->VRAM",
       "fill->GTT ",
       /* Copy */
       "VRAM->VRAM",
       "VRAM->GTT ",
       "GTT ->VRAM",
    };

    /* Print the CSV header: one column per benchmarked size. */
    printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
    printf("Heap       ,Method  ,L2p,Wa,");
    for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
       if (size >= 1024)
          printf("%6uKB,", size / 1024);
       else
          printf(" %6uB,", size);
    }
    printf("\n");

    /* results[log2(size)][placement][method][] */
    struct si_result {
       bool is_valid; /* set only for combinations that were measured */
       bool is_cp;
       bool is_sdma;
       bool is_cs;
       unsigned cache_policy;
       unsigned dwords_per_thread;
       unsigned waves_per_sh;
       unsigned score; /* measured rate in MB/s, truncated to an integer */
       unsigned index; /* index in results[x][y][index] */
    } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

    /* Run benchmarks. */
    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
       /* The first two placements are clears, the remaining ones are copies. */
       bool is_copy = placement >= 2;

       printf("-----------,--------,---,--,");
       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
          printf("--------,");
       printf("\n");

       for (unsigned method = 0; method < NUM_METHODS; method++) {
          /* Decode the method index:
           *   0..2 = CP DMA, the index doubles as the cache policy
           *   3    = SDMA
           *   4+   = compute shaders, ordered by waves_per_sh, then cache
           *          policy, then dwords per thread
           * cs_method wraps around for method < 4, but it is only used when
           * test_cs is set.
           */
          bool test_cp = method <= 2;
          bool test_sdma = method == 3;
          bool test_cs = method >= 4;
          unsigned cs_method = method - 4;
          unsigned cs_waves_per_sh =
             test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
          cs_method %= 3 * NUM_SHADERS;
          unsigned cache_policy =
             test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
          unsigned cs_dwords_per_thread =
             test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

          if (test_sdma && !sctx->sdma_cs)
             continue;

          if (sctx->chip_class == GFX6) {
             /* GFX6 doesn't support CP DMA operations through L2. */
             if (test_cp && cache_policy != L2_BYPASS)
                continue;
             /* WAVES_PER_SH is in multiples of 16 on GFX6. */
             if (test_cs && cs_waves_per_sh % 16 != 0)
                continue;
          }

          /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
           * chips before gfx9.
           */
          if (test_cs && cache_policy && sctx->chip_class < GFX9)
             continue;

          /* Print the row label: placement, method, L2 policy, waves per SH. */
          printf("%s ,", placement_str[placement]);
          if (test_cs) {
             printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                    cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
          } else {
             printf("%s,%3s,", method_str[method],
                    method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
          }
          if (test_cs && cs_waves_per_sh)
             printf("%2u,", cs_waves_per_sh);
          else
             printf("  ,");

          /* Rate of the most recently measured size; used to skip slow cases. */
          double score = 0;
          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
             /* Don't test bigger sizes if it's too slow. Print 0. */
             if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
                printf("%7.0f ,", 0.0);
                continue;
             }

             enum pipe_resource_usage dst_usage, src_usage;
             struct pipe_resource *dst, *src;
             struct pipe_query *q[NUM_RUNS];
             unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
             unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;

             if (test_sdma) {
                if (sctx->chip_class == GFX6)
                   query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
                else
                   query_type = SI_QUERY_TIME_ELAPSED_SDMA;
             }

             /* Pick the buffer usage from the placement. Presumably
              * PIPE_USAGE_DEFAULT ends up in VRAM and PIPE_USAGE_STREAM in GTT,
              * matching placement_str.
              */
             if (placement == 0 || placement == 2 || placement == 4)
                dst_usage = PIPE_USAGE_DEFAULT;
             else
                dst_usage = PIPE_USAGE_STREAM;

             if (placement == 2 || placement == 3)
                src_usage = PIPE_USAGE_DEFAULT;
             else
                src_usage = PIPE_USAGE_STREAM;

             dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
             src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;

             /* Run tests. */
             for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
                q[iter] = ctx->create_query(ctx, query_type, 0);
                ctx->begin_query(ctx, q[iter]);

                if (test_cp) {
                   /* CP DMA */
                   if (is_copy) {
                      si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
                                            cache_policy);
                   } else {
                      si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
                                             SI_COHERENCY_NONE, cache_policy);
                   }
                } else if (test_sdma) {
                   /* SDMA */
                   if (is_copy) {
                      si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
                   } else {
                      si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
                   }
                } else {
                   /* Compute */
                   /* The memory accesses are coalesced, meaning that the 1st instruction writes
                    * the 1st contiguous block of data for the whole wave, the 2nd instruction
                    * writes the 2nd contiguous block of data, etc.
                    */
                   unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                   unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                   unsigned dwords_per_wave = cs_dwords_per_thread * 64;

                   unsigned num_dwords = size / 4;
                   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                   void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                           cache_policy == L2_STREAM, is_copy);

                   struct pipe_grid_info info = {};
                   info.block[0] = MIN2(64, num_instructions);
                   info.block[1] = 1;
                   info.block[2] = 1;
                   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                   info.grid[1] = 1;
                   info.grid[2] = 1;

                   struct pipe_shader_buffer sb[2] = {};
                   sb[0].buffer = dst;
                   sb[0].buffer_size = size;

                   if (is_copy) {
                      sb[1].buffer = src;
                      sb[1].buffer_size = size;
                   } else {
                      /* Clears pass the clear value via the user data. */
                      for (unsigned i = 0; i < 4; i++)
                         sctx->cs_user_data[i] = clear_value;
                   }

                   sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;

                   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                   ctx->bind_compute_state(ctx, cs);
                   sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                   ctx->launch_grid(ctx, &info);

                   ctx->bind_compute_state(ctx, NULL);
                   ctx->delete_compute_state(ctx, cs);
                   sctx->cs_max_waves_per_sh = 0; /* disable the limit */

                   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
                }

                /* Flush L2, so that we don't just test L2 cache performance. */
                if (!test_sdma) {
                   sctx->flags |= SI_CONTEXT_WB_L2;
                   sctx->emit_cache_flush(sctx);
                }

                ctx->end_query(ctx, q[iter]);
                ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
             }
             pipe_resource_reference(&dst, NULL);
             pipe_resource_reference(&src, NULL);

             /* Get results: sum the elapsed time of all runs. */
             uint64_t total = 0;

             for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
                union pipe_query_result result;

                ctx->get_query_result(ctx, q[iter], true, &result);
                ctx->destroy_query(ctx, q[iter]);

                total += result.u64;
             }

             /* The rate is computed from the average time of one run. */
             score = get_MBps_rate(size, total / (double)NUM_RUNS);
             printf("%7.0f ,", score);
             fflush(stdout);

             struct si_result *r = &results[util_logbase2(size)][placement][method];
             r->is_valid = true;
             r->is_cp = test_cp;
             r->is_sdma = test_sdma;
             r->is_cs = test_cs;
             r->cache_policy = cache_policy;
             r->dwords_per_thread = cs_dwords_per_thread;
             r->waves_per_sh = cs_waves_per_sh;
             r->score = score;
             r->index = method;
          }
          puts("");
       }
    }

    /* Emit the generated selection functions, starting with the clear one. */
    puts("");
    puts("static struct si_method");
    printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
           "cached)\n",
           sctx->screen->info.name);
    puts("{");
    puts("   unsigned size = MIN2(size64, UINT_MAX);\n");

    /* Analyze results and find the best methods. */
    for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
       if (placement == 0)
          puts("   if (dst == RADEON_DOMAIN_VRAM) {");
       else if (placement == 1)
          puts("   } else { /* GTT */");
       else if (placement == 2) {
          /* The first copy placement closes the clear function and opens
           * the copy function.
           */
          puts("}");
          puts("");
          puts("static struct si_method");
          printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                 sctx->screen->info.name);
          printf("                     uint64_t size64, bool async, bool cached)\n");
          puts("{");
          puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
          puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
       } else if (placement == 3)
          puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
       else
          puts("   } else { /* GTT -> VRAM */");

       /* mode 0 = async, mode 1 = gfx ring cached, mode 2 = gfx ring uncached */
       for (unsigned mode = 0; mode < 3; mode++) {
          bool async = mode == 0;
          bool cached = mode == 1;

          if (async)
             puts("      if (async) { /* SDMA or async compute */");
          else if (cached)
             puts("      if (cached) { /* gfx ring */");
          else
             puts("      } else { /* gfx ring - uncached */");

          /* The list of best chosen methods. */
          struct si_result *methods[32];
          unsigned method_max_size[32];
          unsigned num_methods = 0;

          for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
             /* Find the best method. */
             struct si_result *best = NULL;

             for (unsigned i = 0; i < NUM_METHODS; i++) {
                struct si_result *r = &results[util_logbase2(size)][placement][i];

                if (!r->is_valid)
                   continue;

                /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                 * on GTT, which we can get due to BO evictions.
                 */
                if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
                    r->cache_policy == L2_BYPASS)
                   continue;

                if (async) {
                   /* The following constraints for compute IBs try to limit
                    * resource usage so as not to decrease the performance
                    * of gfx IBs too much.
                    */

                   /* Don't use CP DMA on asynchronous rings, because
                    * the engine is shared with gfx IBs.
                    */
                   if (r->is_cp)
                      continue;

                   /* Don't use L2 caching on asynchronous rings to minimize
                    * L2 usage.
                    */
                   if (r->cache_policy == L2_LRU)
                      continue;

                   /* Asynchronous compute recommends waves_per_sh != 0
                    * to limit CU usage. */
                   if (r->is_cs && r->waves_per_sh == 0)
                      continue;
                } else {
                   /* SDMA is always asynchronous */
                   if (r->is_sdma)
                      continue;

                   if (cached && r->cache_policy == L2_BYPASS)
                      continue;
                   if (!cached && r->cache_policy == L2_LRU)
                      continue;
                }

                if (!best) {
                   best = r;
                   continue;
                }

                /* Assume some measurement error. Earlier methods occupy fewer
                 * resources, so the next method is always more greedy, and we
                 * don't want to select it due to a measurement error.
                 */
                double min_improvement = 1.03;

                if (best->score * min_improvement < r->score)
                   best = r;
             }

             if (num_methods > 0) {
                unsigned prev_index = num_methods - 1;
                struct si_result *prev = methods[prev_index];
                struct si_result *prev_this_size =
                   &results[util_logbase2(size)][placement][prev->index];

                /* If the best one is also the best for the previous size,
                 * just bump the size for the previous one.
                 *
                 * If there is no best, it means all methods were too slow
                 * for this size and were not tested. Use the best one for
                 * the previous size.
                 */
                if (!best ||
                    /* If it's the same method as for the previous size: */
                    (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
                     prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                     prev->dwords_per_thread == best->dwords_per_thread &&
                     prev->waves_per_sh == best->waves_per_sh) ||
                    /* If the method for the previous size is also the best
                     * for this size: */
                    (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                   method_max_size[prev_index] = size;
                   continue;
                }
             }

             /* No method qualifies for this size and there is no previous
              * entry to extend. Don't store NULL in the list, because both
              * the next iteration and the printing loop dereference entries.
              */
             if (!best)
                continue;

             /* Add it to the list. */
             assert(num_methods < ARRAY_SIZE(methods));
             methods[num_methods] = best;
             method_max_size[num_methods] = size;
             num_methods++;
          }

          /* Print one "return" statement per chosen method. */
          for (unsigned i = 0; i < num_methods; i++) {
             struct si_result *best = methods[i];
             unsigned size = method_max_size[i];

             /* The size threshold is between the current benchmarked
              * size and the next benchmarked size. */
             if (i < num_methods - 1)
                printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
             else if (i > 0)
                printf("         else                   ");
             else
                printf("         ");
             printf("return ");

             assert(best);
             const char *cache_policy_str =
                best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
                best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";

             if (best->is_cp) {
                printf("CP_DMA(%s);\n", cache_policy_str);
             }
             if (best->is_sdma)
                printf("SDMA;\n");
             if (best->is_cs) {
                printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
                       best->dwords_per_thread, best->waves_per_sh);
             }
          }
       }
       puts("      }");
    }
    puts("   }");
    puts("}");

    ctx->destroy(ctx);
    exit(0);
 }
	/*
	* Copyright 2018 Advanced Micro Devices, Inc.
	* All Rights Reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*
	*/

	/* This file implements a benchmark of buffer clears and copies done with
	 * CP DMA, SDMA and compute shaders. */

	#include "si_pipe.h"
	#include "si_query.h"

	/* Smallest and largest buffer size that is benchmarked, in bytes. */
	#define MIN_SIZE 512
	#define MAX_SIZE (128 * 1024 * 1024)
	/* The benchmarked size is shifted left by this amount after every step. */
	#define SIZE_SHIFT 1
	/* Number of timed runs (one query each) per method, placement and size. */
	#define NUM_RUNS 128

	/* Convert a transfer of num_bytes bytes that took ns nanoseconds into a rate
	 * in MB/s, where 1 MB is 1024 * 1024 bytes.
	 */
	static double get_MBps_rate(unsigned num_bytes, unsigned ns)
	{
	   const double megabytes = num_bytes / (1024.0 * 1024.0);
	   const double seconds = ns / 1000000000.0;

	   return megabytes / seconds;
	}

	void si_test_dma_perf(struct si_screen *sscreen)
	{
	struct pipe_screen *screen = &sscreen->b;
	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
	struct si_context sctx = (struct si_context )ctx;
	const uint32_t clear_value = 0x12345678;
	static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
	static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};

	#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
	#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

	static const char *method_str[] = {
	"CP MC ",
	"CP L2 ",
	"CP L2 ",
	"SDMA ",
	};
	static const char *placement_str[] = {
	/* Clear */
	"fill->VRAM",
	"fill->GTT ",
	/* Copy */
	"VRAM->VRAM",
	"VRAM->GTT ",
	"GTT ->VRAM",
	};

	printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
	printf("Heap ,Method ,L2p,Wa,");
	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
	if (size >= 1024)
	printf("%6uKB,", size / 1024);
	else
	printf(" %6uB,", size);
	}
	printf("\n");

	/* results[log2(size)][placement][method][] */
	struct si_result {
	bool is_valid;
	bool is_cp;
	bool is_sdma;
	bool is_cs;
	unsigned cache_policy;
	unsigned dwords_per_thread;
	unsigned waves_per_sh;
	unsigned score;
	unsigned index; /* index in results[x][y][index] */
	} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

	/* Run benchmarks. */
	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
	bool is_copy = placement >= 2;

	printf("-----------,--------,---,--,");
	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
	printf("--------,");
	printf("\n");

	for (unsigned method = 0; method < NUM_METHODS; method++) {
	bool test_cp = method <= 2;
	bool test_sdma = method == 3;
	bool test_cs = method >= 4;
	unsigned cs_method = method - 4;
	unsigned cs_waves_per_sh =
	test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
	cs_method %= 3 * NUM_SHADERS;
	unsigned cache_policy =
	test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
	unsigned cs_dwords_per_thread =
	test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

	if (test_sdma && !sctx->sdma_cs)
	continue;

	if (sctx->chip_class == GFX6) {
	/* GFX6 doesn't support CP DMA operations through L2. */
	if (test_cp && cache_policy != L2_BYPASS)
	continue;
	/* WAVES_PER_SH is in multiples of 16 on GFX6. */
	if (test_cs && cs_waves_per_sh % 16 != 0)
	continue;
	}

	/* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
	* chips before gfx9.
	*/
	if (test_cs && cache_policy && sctx->chip_class < GFX9)
	continue;

	printf("%s ,", placement_str[placement]);
	if (test_cs) {
	printf("CS x%-4u,%3s,", cs_dwords_per_thread,
	cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
	} else {
	printf("%s,%3s,", method_str[method],
	method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
	}
	if (test_cs && cs_waves_per_sh)
	printf("%2u,", cs_waves_per_sh);
	else
	printf(" ,");

	double score = 0;
	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
	/* Don't test bigger sizes if it's too slow. Print 0. */
	if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
	printf("%7.0f ,", 0.0);
	continue;
	}

	enum pipe_resource_usage dst_usage, src_usage;
	struct pipe_resource dst, src;
	struct pipe_query *q[NUM_RUNS];
	unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
	unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;

	if (test_sdma) {
	if (sctx->chip_class == GFX6)
	query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
	else
	query_type = SI_QUERY_TIME_ELAPSED_SDMA;
	}

	if (placement == 0 \|\| placement == 2 \|\| placement == 4)
	dst_usage = PIPE_USAGE_DEFAULT;
	else
	dst_usage = PIPE_USAGE_STREAM;

	if (placement == 2 \|\| placement == 3)
	src_usage = PIPE_USAGE_DEFAULT;
	else
	src_usage = PIPE_USAGE_STREAM;

	dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
	src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;

	/* Run tests. */
	for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
	q[iter] = ctx->create_query(ctx, query_type, 0);
	ctx->begin_query(ctx, q[iter]);

	if (test_cp) {
	/* CP DMA */
	if (is_copy) {
	si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
	cache_policy);
	} else {
	si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
	SI_COHERENCY_NONE, cache_policy);
	}
	} else if (test_sdma) {
	/* SDMA */
	if (is_copy) {
	si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
	} else {
	si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
	}
	} else {
	/* Compute */
	/* The memory accesses are coalesced, meaning that the 1st instruction writes
	* the 1st contiguous block of data for the whole wave, the 2nd instruction
	* writes the 2nd contiguous block of data, etc.
	*/
	unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
	unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
	unsigned dwords_per_wave = cs_dwords_per_thread * 64;

	unsigned num_dwords = size / 4;
	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

	void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
	cache_policy == L2_STREAM, is_copy);

	struct pipe_grid_info info = {};
	info.block[0] = MIN2(64, num_instructions);
	info.block[1] = 1;
	info.block[2] = 1;
	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
	info.grid[1] = 1;
	info.grid[2] = 1;

	struct pipe_shader_buffer sb[2] = {};
	sb[0].buffer = dst;
	sb[0].buffer_size = size;

	if (is_copy) {
	sb[1].buffer = src;
	sb[1].buffer_size = size;
	} else {
	for (unsigned i = 0; i < 4; i++)
	sctx->cs_user_data[i] = clear_value;
	}

	sctx->flags \|= SI_CONTEXT_INV_VCACHE \| SI_CONTEXT_INV_SCACHE;

	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
	ctx->bind_compute_state(ctx, cs);
	sctx->cs_max_waves_per_sh = cs_waves_per_sh;

	ctx->launch_grid(ctx, &info);

	ctx->bind_compute_state(ctx, NULL);
	ctx->delete_compute_state(ctx, cs);
	sctx->cs_max_waves_per_sh = 0; /* disable the limit */

	sctx->flags \|= SI_CONTEXT_CS_PARTIAL_FLUSH;
	}

	/* Flush L2, so that we don't just test L2 cache performance. */
	if (!test_sdma) {
	sctx->flags \|= SI_CONTEXT_WB_L2;
	sctx->emit_cache_flush(sctx);
	}

	ctx->end_query(ctx, q[iter]);
	ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
	}
	pipe_resource_reference(&dst, NULL);
	pipe_resource_reference(&src, NULL);

	/* Get results. */
	uint64_t min = ~0ull, max = 0, total = 0;

	for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
	union pipe_query_result result;

	ctx->get_query_result(ctx, q[iter], true, &result);
	ctx->destroy_query(ctx, q[iter]);

	min = MIN2(min, result.u64);
	max = MAX2(max, result.u64);
	total += result.u64;
	}

	score = get_MBps_rate(size, total / (double)NUM_RUNS);
	printf("%7.0f ,", score);
	fflush(stdout);

	struct si_result *r = &results[util_logbase2(size)][placement][method];
	r->is_valid = true;
	r->is_cp = test_cp;
	r->is_sdma = test_sdma;
	r->is_cs = test_cs;
	r->cache_policy = cache_policy;
	r->dwords_per_thread = cs_dwords_per_thread;
	r->waves_per_sh = cs_waves_per_sh;
	r->score = score;
	r->index = method;
	}
	puts("");
	}
	}

	puts("");
	puts("static struct si_method");
	printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
	"cached)\n",
	sctx->screen->info.name);
	puts("{");
	puts(" unsigned size = MIN2(size64, UINT_MAX);\n");

	/* Analyze results and find the best methods. */
	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
	if (placement == 0)
	puts(" if (dst == RADEON_DOMAIN_VRAM) {");
	else if (placement == 1)
	puts(" } else { /* GTT */");
	else if (placement == 2) {
	puts("}");
	puts("");
	puts("static struct si_method");
	printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
	sctx->screen->info.name);
	printf(" uint64_t size64, bool async, bool cached)\n");
	puts("{");
	puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
	puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
	} else if (placement == 3)
	puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
	else
	puts(" } else { /* GTT -> VRAM */");

	for (unsigned mode = 0; mode < 3; mode++) {
	bool async = mode == 0;
	bool cached = mode == 1;

	if (async)
	puts(" if (async) { /* SDMA or async compute */");
	else if (cached)
	puts(" if (cached) { /* gfx ring */");
	else
	puts(" } else { /* gfx ring - uncached */");

	/* The list of best chosen methods. */
	struct si_result *methods[32];
	unsigned method_max_size[32];
	unsigned num_methods = 0;

	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
	/* Find the best method. */
	struct si_result *best = NULL;

	for (unsigned i = 0; i < NUM_METHODS; i++) {
	struct si_result *r = &results[util_logbase2(size)][placement][i];

	if (!r->is_valid)
	continue;

	/* Ban CP DMA clears via MC on <= GFX8. They are super slow
	* on GTT, which we can get due to BO evictions.
	*/
	if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
	r->cache_policy == L2_BYPASS)
	continue;

	if (async) {
	/* The following constraints for compute IBs try to limit
	* resource usage so as not to decrease the performance
	* of gfx IBs too much.
	*/

	/* Don't use CP DMA on asynchronous rings, because
	* the engine is shared with gfx IBs.
	*/
	if (r->is_cp)
	continue;

	/* Don't use L2 caching on asynchronous rings to minimize
	* L2 usage.
	*/
	if (r->cache_policy == L2_LRU)
	continue;

	/* Asynchronous compute recommends waves_per_sh != 0
	* to limit CU usage. */
	if (r->is_cs && r->waves_per_sh == 0)
	continue;
	} else {
	/* SDMA is always asynchronous */
	if (r->is_sdma)
	continue;

	if (cached && r->cache_policy == L2_BYPASS)
	continue;
	if (!cached && r->cache_policy == L2_LRU)
	continue;
	}

	if (!best) {
	best = r;
	continue;
	}

	/* Assume some measurement error. Earlier methods occupy fewer
	* resources, so the next method is always more greedy, and we
	* don't want to select it due to a measurement error.
	*/
	double min_improvement = 1.03;

	if (best->score * min_improvement < r->score)
	best = r;
	}

	if (num_methods > 0) {
	unsigned prev_index = num_methods - 1;
	struct si_result *prev = methods[prev_index];
	struct si_result *prev_this_size =
	&results[util_logbase2(size)][placement][prev->index];

	/* If the best one is also the best for the previous size,
	* just bump the size for the previous one.
	*
	* If there is no best, it means all methods were too slow
	* for this size and were not tested. Use the best one for
	* the previous size.
	*/
	if (!best \|\|
	/* If it's the same method as for the previous size: */
	(prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
	prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
	prev->dwords_per_thread == best->dwords_per_thread &&
	prev->waves_per_sh == best->waves_per_sh) \|\|
	/* If the method for the previous size is also the best
	* for this size: */
	(prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
	method_max_size[prev_index] = size;
	continue;
	}
	}

	/* Add it to the list. */
	assert(num_methods < ARRAY_SIZE(methods));
	methods[num_methods] = best;
	method_max_size[num_methods] = size;
	num_methods++;
	}

	for (unsigned i = 0; i < num_methods; i++) {
	struct si_result *best = methods[i];
	unsigned size = method_max_size[i];

	/* The size threshold is between the current benchmarked
	* size and the next benchmarked size. */
	if (i < num_methods - 1)
	printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
	else if (i > 0)
	printf(" else ");
	else
	printf(" ");
	printf("return ");

	assert(best);
	const char *cache_policy_str =
	best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
	best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM";

	if (best->is_cp) {
	printf("CP_DMA(%s);\n", cache_policy_str);
	}
	if (best->is_sdma)
	printf("SDMA;\n");
	if (best->is_cs) {
	printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
	best->dwords_per_thread, best->waves_per_sh);
	}
	}
	}
	puts(" }");
	}
	puts(" }");
	puts("}");

	ctx->destroy(ctx);
	exit(0);
	}