Add a parallel mean shader for Vulkan 1.1.
PiperOrigin-RevId: 356727275
Change-Id: I527a36b2982e400857343f91ad0ad26476ed151c
diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.h b/tensorflow/lite/delegates/gpu/common/gpu_info.h
index 5413021..77162e3 100644
--- a/tensorflow/lite/delegates/gpu/common/gpu_info.h
+++ b/tensorflow/lite/delegates/gpu/common/gpu_info.h
@@ -235,6 +235,9 @@
uint32_t max_image_dimension_2d;
uint32_t max_image_array_layers;
+ uint32_t subgroup_size = 0;
+ bool supports_subgroup_arithmetic = false;
+
std::vector<std::string> extensions;
int max_compute_work_group_size_x;
int max_compute_work_group_size_y;
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
index 197887d..7bbb0ac 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@@ -323,8 +323,10 @@
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:status",
"//tensorflow/lite/delegates/gpu/common:types",
+ "//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/gl:node_shader",
"@com_google_absl//absl/memory",
+ "@com_google_absl//absl/status",
],
)
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/mean.cc b/tensorflow/lite/delegates/gpu/gl/kernels/mean.cc
index c66ff55..cf88fc3 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/mean.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/mean.cc
@@ -22,14 +22,155 @@
#include <vector>
#include "absl/memory/memory.h"
+#include "absl/status/status.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
+#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace gl {
namespace {
+bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
+ return gpu_info.IsApiVulkan() &&
+ (gpu_info.vulkan_info.api_version_major > 1 ||
+ gpu_info.vulkan_info.api_version_minor >= 1) &&
+ gpu_info.vulkan_info.subgroup_size >= 32 &&
+ gpu_info.vulkan_info.supports_subgroup_arithmetic;
+}
+
+// An implementation of Mean for desktop GPUs and some phones with recent
+// Vulkan drivers. It is more parallel than the trivial Mean operation, but
+// still limited to using a single work group.
+void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
+ GeneratedCode* generated_code) {
+ int height = ctx.input_shapes[0][1];
+ int width = ctx.input_shapes[0][2];
+ int depth = ctx.input_shapes[0][3];
+ std::vector<Variable> parameters = {
+ {"input_data_0_h", height},
+ {"input_data_0_w", width},
+ {"output_data_0_h", 1},
+ {"output_data_0_w", 1},
+ };
+
+ std::string source = R"(
+ // Round columns and rows per invocation up, to ensure that we read the
+ // entire input.
+ const uint columns_per_invocation =
+ ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
+ const uint rows_per_invocation =
+ ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
+ const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
+ const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
+ const uint last_row_exclusive =
+ min(first_row+rows_per_invocation, $input_data_0_h$);
+ const uint last_column_exclusive =
+ min(first_col+columns_per_invocation, $input_data_0_w$);
+ vec4 value = vec4(0);
+ for (uint h = first_row; h < last_row_exclusive; ++h) {
+ for (uint w = first_col; w < last_column_exclusive; ++w) {
+ value += $input_data_0[w, h, gid.z]$;
+ }
+ }
+ highp vec4 subgroup_sum = subgroupAdd(value);
+ if(subgroupElect()) {
+ subgroup_sums[gl_SubgroupID] = subgroup_sum;
+ }
+
+ memoryBarrierShared();
+ barrier();
+ // Do the final reduction in the first subgroup.
+ if(gl_SubgroupID == 0) {
+ highp vec4 subtotal = vec4(0);
+ if (gl_SubgroupInvocationID < gl_NumSubgroups) {
+ subtotal = subgroup_sums[gl_SubgroupInvocationID];
+ }
+ highp vec4 grand_total = subgroupAdd(subtotal);
+ if(subgroupElect()) {
+ highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
+ $output_data_0[0, 0, gid.z] = result$;
+ }
+ }
+ )";
+
+ const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
+ const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
+ const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
+ // Due to the design of the shader, at most subgroup_size subgroups can be
+ // launched. This may limit the maximum workgroup size.
+ const uint32_t max_wg_size =
+ std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
+ subgroup_size * subgroup_size);
+ const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
+ uint32_t wg_size_x = 0;
+ uint32_t wg_size_y = 0;
+ if (width * height <= max_wg_size && width <= max_wg_size_x &&
+ height <= max_wg_size_y) {
+ wg_size_x = width;
+ wg_size_y = height;
+ } else {
+ // Approximately square workgroup. Also make sure to limit by driver limit
+ // and input size.
+ wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
+ max_wg_size_x, static_cast<uint32_t>(width)});
+ wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
+ static_cast<uint32_t>(height)});
+ }
+
+ std::vector<Variable> shared_variables = {
+ {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
+ };
+
+ *generated_code = {
+ /*parameters=*/std::move(parameters),
+ /*objects=*/{},
+ /*shared_variables=*/{std::move(shared_variables)},
+ // Make sure we get one dispatch of size wg_size_x*wg_size_y*1 per layer.
+ /*workload=*/
+ uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
+ /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
+ /*source_code=*/std::move(source),
+ /*input=*/IOStructure::ONLY_DEFINITIONS,
+ /*output=*/IOStructure::ONLY_DEFINITIONS,
+ };
+}
+
+void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
+ GeneratedCode* generated_code) {
+ std::vector<Variable> parameters = {
+ {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
+ {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};
+
+ std::string source = R"(
+ // Shaders may be compiled with a precision hint mediump, which means that
+ // the GLSL compiler may drop the size of the float data type from 32 to
+ // 16 bits. If the "sum" and "size" variables are 16-bit floats, their
+ // value ranges are not large enough to provide good accuracy. That is
+ // why their precision is forced to 32 bits with the highp qualifier.
+
+ highp vec4 sum = vec4(0.0);
+ highp float size = float($input_data_0_w$ * $input_data_0_h$);
+ for (int w = 0; w < $input_data_0_w$; w++) {
+ for (int h = 0; h < $input_data_0_h$; h++) {
+ sum += $input_data_0[w, h, gid.z]$;
+ }
+ }
+ value_0 = sum / size;
+ )";
+ *generated_code = {
+ /*parameters=*/std::move(parameters),
+ /*objects=*/{},
+ /*shared_variables=*/{},
+ /*workload=*/uint3(),
+ /*workgroup=*/uint3(1, 1, 4),
+ /*source_code=*/std::move(source),
+ /*input=*/IOStructure::ONLY_DEFINITIONS,
+ /*output=*/IOStructure::AUTO,
+ };
+}
+
class Mean : public NodeShader {
public:
absl::Status GenerateCode(const GenerationContext& ctx,
@@ -40,36 +181,19 @@
"Mean calculation is supported only for height and width.");
}
- std::vector<Variable> parameters = {
- {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
- {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};
+ if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
+ ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
+ ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
+ return absl::InvalidArgumentError(
+ "Mean calculation is supported for one input and one 1x1 output with "
+ "the same channel count.");
+ }
- std::string source = R"(
- // Shaders may be compiled with a precision hint mediump, which means that
- // GLSL compiler may drop the size of float data type from 32 to 16 bits.
- // If "sum" and "size" variables are 16bit floats, their values range
- // become not enough for providing a good results accuracy. That is why
- // their precision is forced to be 32bit by using highp qualifier.
-
- highp vec4 sum = vec4(0.0);
- highp float size = float($input_data_0_w$ * $input_data_0_h$);
- for (int w = 0; w < $input_data_0_w$; w++) {
- for (int h = 0; h < $input_data_0_h$; h++) {
- sum += $input_data_0[w, h, gid.z]$;
- }
- }
- value_0 = sum / size;
- )";
- *generated_code = {
- /*parameters=*/std::move(parameters),
- /*objects=*/{},
- /*shared_variables=*/{},
- /*workload=*/uint3(),
- /*workgroup=*/uint3(1, 1, 4),
- /*source_code=*/std::move(source),
- /*input=*/IOStructure::ONLY_DEFINITIONS,
- /*output=*/IOStructure::AUTO,
- };
+ if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
+ GenerateSubgroupBasedMean(ctx, generated_code);
+ } else {
+ GenerateTrivialMean(ctx, generated_code);
+ }
return absl::OkStatus();
}
};