/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/mean.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

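// The subgroup-based path relies on GLSL subgroup operations (subgroupAdd,
// subgroupElect), which require Vulkan 1.1 or newer; it additionally requires
// a subgroup size of at least 32 and driver support for subgroup arithmetic.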
bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
  return gpu_info.IsApiVulkan() &&
         (gpu_info.vulkan_info.api_version_major > 1 ||
          gpu_info.vulkan_info.api_version_minor >= 1) &&
         gpu_info.vulkan_info.subgroup_size >= 32 &&
         gpu_info.vulkan_info.supports_subgroup_arithmetic;
}

// An implementation of Mean for desktop GPUs and some phones with recent
// Vulkan drivers. It is more parallel than the trivial Mean operation, but
// still limited to using a single work group.
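// The reduction happens in two stages: each invocation accumulates a tile of
// the input, subgroupAdd() combines these partial sums within every subgroup,
// and the first subgroup then reduces the per-subgroup totals and writes the
// mean to the output.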
void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
                               GeneratedCode* generated_code) {
  int height = static_cast<int>(ctx.input_shapes[0][1]);
  int width = static_cast<int>(ctx.input_shapes[0][2]);
  int depth = static_cast<int>(ctx.input_shapes[0][3]);
  std::vector<Variable> parameters = {
      {"input_data_0_h", height},
      {"input_data_0_w", width},
      {"output_data_0_h", 1},
      {"output_data_0_w", 1},
  };

  std::string source = R"(
  // Round columns and rows per invocation up, to ensure that we read the
  // entire input.
  const uint columns_per_invocation =
      ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
  const uint rows_per_invocation =
      ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
  const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
  const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
  const uint last_row_exclusive =
      min(first_row+rows_per_invocation, $input_data_0_h$);
  const uint last_column_exclusive =
      min(first_col+columns_per_invocation, $input_data_0_w$);
  vec4 value = vec4(0);
  for (uint h = first_row; h < last_row_exclusive; ++h) {
    for (uint w = first_col; w < last_column_exclusive; ++w) {
      value += $input_data_0[w, h, gid.z]$;
    }
  }
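  // Stage 1: combine the per-invocation partial sums within each subgroup.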
  highp vec4 subgroup_sum = subgroupAdd(value);
  if (subgroupElect()) {
    subgroup_sums[gl_SubgroupID] = subgroup_sum;
  }

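  // Make the per-subgroup partial sums visible to the whole work group before
  // the final reduction.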
  memoryBarrierShared();
  barrier();
  // Do the final reduction in the first subgroup.
  if (gl_SubgroupID == 0) {
    highp vec4 subtotal = vec4(0);
    if (gl_SubgroupInvocationID < gl_NumSubgroups) {
      subtotal = subgroup_sums[gl_SubgroupInvocationID];
    }
    highp vec4 grand_total = subgroupAdd(subtotal);
    if (subgroupElect()) {
      highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
      $output_data_0[0, 0, gid.z] = result$;
    }
  }
)";

  const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
  const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
  const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
  // The shader reduces the per-subgroup sums within a single subgroup, so at
  // most subgroup_size subgroups can be launched. This may limit the maximum
  // workgroup size.
  const uint32_t max_wg_size =
      std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
               subgroup_size * subgroup_size);
  const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
  uint32_t wg_size_x = 0;
  uint32_t wg_size_y = 0;
  if (width * height <= max_wg_size && width <= max_wg_size_x &&
      height <= max_wg_size_y) {
    wg_size_x = width;
    wg_size_y = height;
  } else {
    // Pick an approximately square workgroup, capped by the driver limits and
    // the input size.
    wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
                          max_wg_size_x, static_cast<uint32_t>(width)});
    wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
                          static_cast<uint32_t>(height)});
  }

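  // One shared slot per subgroup; the first subgroup reads these partial sums
  // back to compute the grand total.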
  std::vector<Variable> shared_variables = {
      {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
  };

  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/std::move(shared_variables),
      // Launch exactly one workgroup of size wg_size_x*wg_size_y*1 per group
      // of four channels.
      /*workload=*/
      uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
      /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

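// A serial fallback implementation of Mean: each invocation accumulates the
// whole HxW plane of its channel slice (gid.z) and divides by the plane size.
// It runs on any GPU but does not parallelize across the spatial dimensions.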
void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
                         GeneratedCode* generated_code) {
  std::vector<Variable> parameters = {
      {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
      {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};

  std::string source = R"(
  // Shaders may be compiled with the mediump precision hint, which allows the
  // GLSL compiler to shrink the float type from 32 to 16 bits. If the "sum"
  // and "size" variables were 16-bit floats, their range would not be enough
  // to produce accurate results. That is why their precision is forced to
  // 32 bits with the highp qualifier.

  highp vec4 sum = vec4(0.0);
  highp float size = float($input_data_0_w$ * $input_data_0_h$);
  for (int w = 0; w < $input_data_0_w$; w++) {
    for (int h = 0; h < $input_data_0_h$; h++) {
      sum += $input_data_0[w, h, gid.z]$;
    }
  }
  value_0 = sum / size;
)";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{},
      /*workload=*/uint3(),
      /*workgroup=*/uint3(1, 1, 4),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::AUTO,
  };
}

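// Validates that the reduction runs over HEIGHT and WIDTH with a matching
// 1x1 output, then selects the subgroup-based or the trivial implementation
// depending on the GPU capabilities.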
class Mean : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    const auto& attr = absl::any_cast<const MeanAttributes&>(ctx.op_attr);
    if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported only for height and width.");
    }

    if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
          ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
          ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported for one input and one 1x1 output "
          "with the same channel count.");
    }

    if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
      GenerateSubgroupBasedMean(ctx, generated_code);
    } else {
      GenerateTrivialMean(ctx, generated_code);
    }
    return absl::OkStatus();
  }
};


}  // namespace

std::unique_ptr<NodeShader> NewMeanNodeShader() {
  return absl::make_unique<Mean>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite