/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include "app.h"
#include "stats.h"
#include "utils.h"

namespace gpuinfo {

// Textures are drastically different from buffers in terms of data layout.
// While buffers are a contiguous range of memory, textures are opaque objects
// defined by the vendor, and nearby data points are not necessarily
// neighboring in memory. Likewise, data points are accessed in
// multi-dimensional patches instead of simple lines. This makes the stride
// method for deducing the cache line size inapplicable. To work around this,
// this experiment runs an increasing number of threads accessing different
// data points in the texture and measures the latency. If the cache line is
// big enough to contain all the data requested by that many threads, latency
// stays low. Once there are more threads, and hence more data, than a single
// cache line can serve, a second line must be fetched, increasing latency in
// a measurable way.
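//
// As an illustration (hypothetical numbers, not measured values): if the
// cache line held 64 B and each thread fetched one 16 B RGBA-float texel, up
// to 4 threads could be served from a single line, and the latency jump would
// be expected when going from 4 to 5 threads. The experiment then reports the
// data size corresponding to the last thread count before the detected jump.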
void tex_cacheline_concurr(const App& app) {
  if (!app.enabled("tex_cacheline_concurr")) {
    std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl;
    return;
  }

  const uint32_t TEXEL_WIDTH = 4;
  const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH;

  const double COMPENSATE =
      app.get_config("tex_cacheline_concurr", "compensate");
  const double THRESHOLD =
      app.get_config("tex_cacheline_concurr", "threshold");

  for (int dim = 0; dim < 3; ++dim) {
    std::cout << std::endl;
    std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim
              << ") ------" << std::endl;

    uint32_t NITER;

    const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width
                                  : dim == 1 ? app.max_tex_height
                                             : app.max_tex_depth;

    const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE);

    auto bench = [&](uint32_t nthread) {
      std::vector<int64_t> sizes_whd = {
          app.max_tex_width, app.max_tex_height, app.max_tex_depth};

      auto sizes_nchw = whd_to_nchw(sizes_whd);

      vTensor in_tensor =
          api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

      StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH);

      vkapi::PipelineBarrier pipeline_barrier{};

      auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim);

      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {nthread, 1, 1},
            {nthread, 1, 1},
            {SV(NITER)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });
      return time;
    };

    // Calibrate NITER so that a single-thread run takes long enough to be
    // measured reliably.
    ensure_min_niter(1000, NITER, [&]() { return bench(1); });

    // Look for the first significant jump in latency as the thread count
    // grows; the jump indicates that the data requested by the threads no
    // longer fits in a single cache line.
    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
    uint32_t nthread = 1;
    for (; nthread <= MAX_NTHREAD; ++nthread) {
      double time = bench(nthread);
      std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time
                << std::endl;

      if (dj.push(time)) {
        auto max_concurrency = nthread - 1;
        std::cout << "TextureCachelineConcurrencyDim" << dim << " (B),"
                  << max_concurrency * TEXEL_SIZE << std::endl;
        break;
      }
    }
    if (nthread >= MAX_NTHREAD) {
      std::cout
          << "Unable to conclude an optimal texture cacheline concurrency for dim "
          << dim << std::endl;
    }
  }

  // TODO: Use concurrency information to obtain the cache line size for
  // textures as done in https://fburl.com/98xiou3g
}

void tex_bandwidth(const App& app) {
  if (!app.enabled("tex_bandwidth")) {
    std::cout << "Skipped Texture Bandwidth" << std::endl;
    return;
  }

  for (int dim = 0; dim < 3; dim++) {
    std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
              << std::endl;
    const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width
                            : dim == 1 ? app.max_tex_height
                                       : app.max_tex_depth;

    // rgba, float
    const uint32_t VEC_WIDTH = 4;
    const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
    const uint32_t NVEC = MAX_SIZE;

    const uint32_t RANGE = NVEC * VEC_SIZE;

    // Cache lines flushed
    const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush");
    // Number of loop unrolls. Changing this value requires an equal change in
    // tex_bandwidth.yaml.
    const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll");
    // Number of iterations. Increasing this value reduces noise in exchange
    // for a longer-running benchmark.
    const uint32_t NITER = app.get_config("tex_bandwidth", "niter");
    // Number of memory reads per thread
    const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
    // Number of threads needed to read all texels
    const uint32_t NTHREAD = NVEC;
    // Occupy all threads
    const uint32_t local_x = app.nthread_logic;
    // Ensure that global is a multiple of local, and distribute across all
    // SMs
    const uint32_t global_x =
        (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH;
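    // For example (hypothetical numbers): with NTHREAD = 1000 and
    // local_x = 256, the expression above rounds the thread count down to
    // 768, a multiple of the workgroup size, before scaling by sm_count and
    // NFLUSH as described in the comments above.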

    auto shader_name = "tex_bandwidth_" + std::to_string(dim);

    std::vector<int64_t> sizes_whd = {MAX_SIZE, 1, 1};
    if (dim == 1) {
      sizes_whd = {1, MAX_SIZE, 1};
    } else if (dim == 2) {
      sizes_whd = {1, 1, MAX_SIZE};
    }
    auto sizes_nchw = whd_to_nchw(sizes_whd);

    vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat);

    auto bench = [&](uint32_t access_size, uint32_t dim) {
      // Number of texels that fit in this iteration
      const uint32_t ntexel_access = access_size / VEC_SIZE;

      // The address mask works as a modulo because x % 2^n == x & (2^n - 1).
      // It limits the accesses to a specific set of unique addresses,
      // depending on the access size we want to measure.
      const uint32_t addr_mask = ntexel_access - 1;
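      // For instance, ntexel_access = 8 gives addr_mask = 7 (0b111), and
      // (x & 7) == (x % 8) for any x, so masking an index wraps accesses
      // within the 8 unique texels under test.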

      // This is to distribute the accesses to unique addresses across the
      // workgroups, once the access size exceeds the workgroup width.
      const uint32_t workgroup_width = local_x * NITER * NUNROLL;

      StagingBuffer out_buf(
          context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic);
      vkapi::PipelineBarrier pipeline_barrier{};

      auto time = benchmark_on_gpu(shader_name, 10, [&]() {
        context()->submit_compute_job(
            VK_KERNEL_FROM_STR(shader_name),
            pipeline_barrier,
            {global_x, 1, 1},
            {local_x, 1, 1},
            {SV(NITER),
             SV(ntexel_access),
             SV(local_x),
             SV(addr_mask),
             SV(workgroup_width)},
            VK_NULL_HANDLE,
            0,
            in_tensor.image(),
            out_buf.buffer());
      });

      const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE;
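      // benchmark_on_gpu reports time in microseconds (it is printed as "us"
      // below), so bytes / us scaled by 1e-3 gives GB/s.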
      double gbps = SIZE_TRANS * 1e-3 / time;
      std::cout << "Texture bandwidth accessing \t" << access_size
                << "\tB unique data is \t" << gbps << " \tgbps (\t" << time
                << "\tus)" << std::endl;
      return gbps;
    };

    double max_bandwidth = 0;
    double min_bandwidth = DBL_MAX;
    // Sweep the unique working-set size from a single texel up to the full
    // range, doubling each step, and record the best and worst bandwidth
    // observed.
    for (uint32_t access_size = VEC_SIZE; access_size < RANGE;
         access_size *= 2) {
      double gbps = bench(access_size, dim);
      max_bandwidth = std::max(gbps, max_bandwidth);
      min_bandwidth = std::min(gbps, min_bandwidth);
    }

    std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth
              << std::endl;
    std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth
              << std::endl;
  }
}
} // namespace gpuinfo