/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.h"

#include <map>
#include <memory>
#include <utility>
#include <vector>

| #include "absl/strings/substitute.h" |
| #include "absl/types/span.h" |
| #include "tensorflow/lite/delegates/gpu/common/convert.h" |
| #include "tensorflow/lite/delegates/gpu/common/model.h" |
| #include "tensorflow/lite/delegates/gpu/common/shape.h" |
| #include "tensorflow/lite/delegates/gpu/common/types.h" |
| #include "tensorflow/lite/delegates/gpu/common/util.h" |
| #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" |
| |
namespace tflite {
namespace gpu {
namespace metal {
namespace {

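// Generates a Metal kernel for a 3x3 depthwise convolution with stride 1x1
// and dilation 1x1. Each thread computes a 2x2 block of output pixels for one
// depth slice, so the four outputs share most of their loaded source rows and
// columns.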
std::string GetKernelDepthWiseConv3x3Stride1x1() {
  std::string code = R"(
kernel void ComputeFunction($0
                            uint3 ugid[[thread_position_in_grid]])
{
  int gid_x = ugid.x * 2;
  int gid_y = ugid.y * 2;
  int gid_z = ugid.z;

  if (gid_x >= args.dst_tensor.Width() || gid_y >= args.dst_tensor.Height()) {
    return;
  }

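  // Accumulators for the 2x2 output block:
  //   r0 -> (gid_x, gid_y),      l0 -> (gid_x, gid_y + 1),
  //   t0 -> (gid_x + 1, gid_y),  b0 -> (gid_x + 1, gid_y + 1).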
  ACCUM_FLT4 r0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);
  ACCUM_FLT4 l0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);
  ACCUM_FLT4 t0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);
  ACCUM_FLT4 b0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);

  int x0 = gid_x + args.padding_x;
  int x1 = gid_x + args.padding_x + 1;
  int x2 = gid_x + args.padding_x + 2;
  int x3 = gid_x + args.padding_x + 3;
  int y0 = gid_y + args.padding_y;
  int y1 = gid_y + args.padding_y + 1;
  int y2 = gid_y + args.padding_y + 2;
  int y3 = gid_y + args.padding_y + 3;

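  // Mark taps that fall outside the source tensor; the coordinates are then
  // clamped so the loads below stay in bounds, and the masks zero those taps.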
  bool x0_out = x0 < 0 || x0 >= args.src_tensor.Width();
  bool x1_out = x1 < 0 || x1 >= args.src_tensor.Width();
  bool x2_out = x2 < 0 || x2 >= args.src_tensor.Width();
  bool x3_out = x3 < 0 || x3 >= args.src_tensor.Width();
  bool y0_out = y0 < 0 || y0 >= args.src_tensor.Height();
  bool y1_out = y1 < 0 || y1 >= args.src_tensor.Height();
  bool y2_out = y2 < 0 || y2 >= args.src_tensor.Height();
  bool y3_out = y3 < 0 || y3 >= args.src_tensor.Height();

  x0 = clamp(x0, 0, args.src_tensor.Width() - 1);
  x1 = clamp(x1, 0, args.src_tensor.Width() - 1);
  x2 = clamp(x2, 0, args.src_tensor.Width() - 1);
  x3 = clamp(x3, 0, args.src_tensor.Width() - 1);
  y0 = clamp(y0, 0, args.src_tensor.Height() - 1);
  y1 = clamp(y1, 0, args.src_tensor.Height() - 1);
  y2 = clamp(y2, 0, args.src_tensor.Height() - 1);
  y3 = clamp(y3, 0, args.src_tensor.Height() - 1);

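  // Each depth slice owns 10 FLT4 values in the reordered weights buffer:
  // 9 filter taps in x-major order (index = x * 3 + y) followed by the bias.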
  device FLT4* src_loc = args.src_tensor.GetPtrWithSliceOffset(gid_z);
  device FLT4* filters_loc = args.weights.GetPtr() + gid_z * 10;

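  // Walk the four source columns x0..x3. Each column loads four rows once and
  // is shared by the horizontally and vertically adjacent outputs.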
  FLT4 s0 = src_loc[args.src_tensor.GetWHOffset(x0, y0)] * FLT(!(x0_out || y0_out));
  FLT4 s1 = src_loc[args.src_tensor.GetWHOffset(x0, y1)] * FLT(!(x0_out || y1_out));
  FLT4 s2 = src_loc[args.src_tensor.GetWHOffset(x0, y2)] * FLT(!(x0_out || y2_out));
  FLT4 s3 = src_loc[args.src_tensor.GetWHOffset(x0, y3)] * FLT(!(x0_out || y3_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[0]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[1]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[2]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[0]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[1]);
  l0 += TO_ACCUM_TYPE(s3 * filters_loc[2]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x1, y0)] * FLT(!(x1_out || y0_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x1, y1)] * FLT(!(x1_out || y1_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x1, y2)] * FLT(!(x1_out || y2_out));
  s3 = src_loc[args.src_tensor.GetWHOffset(x1, y3)] * FLT(!(x1_out || y3_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[3]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[4]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[5]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[3]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[4]);
  l0 += TO_ACCUM_TYPE(s3 * filters_loc[5]);
  t0 += TO_ACCUM_TYPE(s0 * filters_loc[0]);
  t0 += TO_ACCUM_TYPE(s1 * filters_loc[1]);
  t0 += TO_ACCUM_TYPE(s2 * filters_loc[2]);
  b0 += TO_ACCUM_TYPE(s1 * filters_loc[0]);
  b0 += TO_ACCUM_TYPE(s2 * filters_loc[1]);
  b0 += TO_ACCUM_TYPE(s3 * filters_loc[2]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x2, y0)] * FLT(!(x2_out || y0_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x2, y1)] * FLT(!(x2_out || y1_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x2, y2)] * FLT(!(x2_out || y2_out));
  s3 = src_loc[args.src_tensor.GetWHOffset(x2, y3)] * FLT(!(x2_out || y3_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[6]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[7]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[8]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[6]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[7]);
  l0 += TO_ACCUM_TYPE(s3 * filters_loc[8]);
  t0 += TO_ACCUM_TYPE(s0 * filters_loc[3]);
  t0 += TO_ACCUM_TYPE(s1 * filters_loc[4]);
  t0 += TO_ACCUM_TYPE(s2 * filters_loc[5]);
  b0 += TO_ACCUM_TYPE(s1 * filters_loc[3]);
  b0 += TO_ACCUM_TYPE(s2 * filters_loc[4]);
  b0 += TO_ACCUM_TYPE(s3 * filters_loc[5]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x3, y0)] * FLT(!(x3_out || y0_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x3, y1)] * FLT(!(x3_out || y1_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x3, y2)] * FLT(!(x3_out || y2_out));
  s3 = src_loc[args.src_tensor.GetWHOffset(x3, y3)] * FLT(!(x3_out || y3_out));

  t0 += TO_ACCUM_TYPE(s0 * filters_loc[6]);
  t0 += TO_ACCUM_TYPE(s1 * filters_loc[7]);
  t0 += TO_ACCUM_TYPE(s2 * filters_loc[8]);
  b0 += TO_ACCUM_TYPE(s1 * filters_loc[6]);
  b0 += TO_ACCUM_TYPE(s2 * filters_loc[7]);
  b0 += TO_ACCUM_TYPE(s3 * filters_loc[8]);

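  // Tap 9 holds the bias; add it to all four outputs.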
  r0 += TO_ACCUM_TYPE(filters_loc[9]);
  l0 += TO_ACCUM_TYPE(filters_loc[9]);
  t0 += TO_ACCUM_TYPE(filters_loc[9]);
  b0 += TO_ACCUM_TYPE(filters_loc[9]);

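  // Write the 2x2 block, guarding the right and bottom borders of the
  // destination.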
  bool x0_in = gid_x < args.dst_tensor.Width();
  bool x1_in = gid_x + 1 < args.dst_tensor.Width();
  bool y0_in = gid_y < args.dst_tensor.Height();
  bool y1_in = gid_y + 1 < args.dst_tensor.Height();

  if (y0_in && x0_in) {
    FLT4 value = FLT4(r0);
    args.dst_tensor.Write(value, gid_x, gid_y, gid_z);
  }
  if (y1_in && x0_in) {
    FLT4 value = FLT4(l0);
    args.dst_tensor.Write(value, gid_x, gid_y + 1, gid_z);
  }
  if (y0_in && x1_in) {
    FLT4 value = FLT4(t0);
    args.dst_tensor.Write(value, gid_x + 1, gid_y, gid_z);
  }
  if (y1_in && x1_in) {
    FLT4 value = FLT4(b0);
    args.dst_tensor.Write(value, gid_x + 1, gid_y + 1, gid_z);
  }
}
)";

  return code;
}

// Reorders the 3x3 filter into the tap order consumed by
// GetKernelDepthWiseConv3x3Stride1x1 (x-major within each window) and appends
// the per-slice bias, so each depth slice reads one contiguous, cache-friendly
// block of 10 FLT4 values.
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride1x1(
    const DepthwiseConvolution2DAttributes& attr) {
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const int kernel_x = 3;
  const int kernel_y = 3;
  std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
                                       4);

  int counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    for (int x = 0; x < kernel_x; ++x) {
      for (int y = 0; y < kernel_y; ++y) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + i;
          if (s_ch < attr.weights.shape.i) {
            const int f_index = attr.weights.shape.LinearIndex({0, y, x, s_ch});
            weights_reordered[counter++] = attr.weights.data[f_index];
          } else {
            weights_reordered[counter++] = 0.0f;
          }
        }
      }
    }

    for (int i = 0; i < 4; ++i) {
      const int dst_ch = s * 4 + i;
      if (dst_ch < attr.bias.shape.v) {
        weights_reordered[counter++] = attr.bias.data[dst_ch];
      } else {
        weights_reordered[counter++] = 0.0f;
      }
    }
  }

  return weights_reordered;
}

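// Generates a Metal kernel for a 3x3 depthwise convolution with stride 2 and
// dilation 1 hardcoded in y. Each thread computes two vertically adjacent
// output pixels; stride and dilation in x are runtime arguments.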
std::string GetKernelDepthWiseConv3x3Stride2() {
  std::string code = R"(
kernel void ComputeFunction($0
                            uint3 ugid[[thread_position_in_grid]])
{
  int gid_x = ugid.x;
  int gid_y = ugid.y * 2;
  int gid_z = ugid.z;

  if (gid_x >= args.dst_tensor.Width() || gid_y >= args.dst_tensor.Height()) {
    return;
  }

  device FLT4* src_loc = args.src_tensor.GetPtrWithSliceOffset(gid_z);
  device FLT4* filters_loc = args.weights.GetPtr() + gid_z * 10;

  ACCUM_FLT4 r0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);
  ACCUM_FLT4 l0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);

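  // With stride 2 in y, the two outputs need five source rows: rows y0..y2
  // feed the output at gid_y and rows y2..y4 the output at gid_y + 1.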
  int x0 = gid_x * args.stride_x + args.padding_x;
  int x1 = gid_x * args.stride_x + args.padding_x + args.dilation_x;
  int x2 = gid_x * args.stride_x + args.padding_x + 2 * args.dilation_x;
  int y0 = gid_y * 2 + args.padding_y;
  int y1 = gid_y * 2 + args.padding_y + 1;
  int y2 = gid_y * 2 + args.padding_y + 2;
  int y3 = gid_y * 2 + args.padding_y + 3;
  int y4 = gid_y * 2 + args.padding_y + 4;

  bool x0_out = x0 < 0 || x0 >= args.src_tensor.Width();
  bool x1_out = x1 < 0 || x1 >= args.src_tensor.Width();
  bool x2_out = x2 < 0 || x2 >= args.src_tensor.Width();
  bool y0_out = y0 < 0 || y0 >= args.src_tensor.Height();
  bool y1_out = y1 < 0 || y1 >= args.src_tensor.Height();
  bool y2_out = y2 < 0 || y2 >= args.src_tensor.Height();
  bool y3_out = y3 < 0 || y3 >= args.src_tensor.Height();
  bool y4_out = y4 < 0 || y4 >= args.src_tensor.Height();

  x0 = clamp(x0, 0, args.src_tensor.Width() - 1);
  x1 = clamp(x1, 0, args.src_tensor.Width() - 1);
  x2 = clamp(x2, 0, args.src_tensor.Width() - 1);
  y0 = clamp(y0, 0, args.src_tensor.Height() - 1);
  y1 = clamp(y1, 0, args.src_tensor.Height() - 1);
  y2 = clamp(y2, 0, args.src_tensor.Height() - 1);
  y3 = clamp(y3, 0, args.src_tensor.Height() - 1);
  y4 = clamp(y4, 0, args.src_tensor.Height() - 1);

  FLT4 s0 = src_loc[args.src_tensor.GetWHOffset(x0, y0)] * FLT(!(x0_out || y0_out));
  FLT4 s1 = src_loc[args.src_tensor.GetWHOffset(x1, y0)] * FLT(!(x1_out || y0_out));
  FLT4 s2 = src_loc[args.src_tensor.GetWHOffset(x2, y0)] * FLT(!(x2_out || y0_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[0]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[1]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[2]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x0, y1)] * FLT(!(x0_out || y1_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x1, y1)] * FLT(!(x1_out || y1_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x2, y1)] * FLT(!(x2_out || y1_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[3]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[4]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[5]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x0, y2)] * FLT(!(x0_out || y2_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x1, y2)] * FLT(!(x1_out || y2_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x2, y2)] * FLT(!(x2_out || y2_out));

  r0 += TO_ACCUM_TYPE(s0 * filters_loc[6]);
  r0 += TO_ACCUM_TYPE(s1 * filters_loc[7]);
  r0 += TO_ACCUM_TYPE(s2 * filters_loc[8]);
  l0 += TO_ACCUM_TYPE(s0 * filters_loc[0]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[1]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[2]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x0, y3)] * FLT(!(x0_out || y3_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x1, y3)] * FLT(!(x1_out || y3_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x2, y3)] * FLT(!(x2_out || y3_out));

  l0 += TO_ACCUM_TYPE(s0 * filters_loc[3]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[4]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[5]);

  s0 = src_loc[args.src_tensor.GetWHOffset(x0, y4)] * FLT(!(x0_out || y4_out));
  s1 = src_loc[args.src_tensor.GetWHOffset(x1, y4)] * FLT(!(x1_out || y4_out));
  s2 = src_loc[args.src_tensor.GetWHOffset(x2, y4)] * FLT(!(x2_out || y4_out));

  l0 += TO_ACCUM_TYPE(s0 * filters_loc[6]);
  l0 += TO_ACCUM_TYPE(s1 * filters_loc[7]);
  l0 += TO_ACCUM_TYPE(s2 * filters_loc[8]);

  r0 += TO_ACCUM_TYPE(filters_loc[9]);
  l0 += TO_ACCUM_TYPE(filters_loc[9]);

  bool y0_in = gid_y < args.dst_tensor.Height();
  bool y1_in = gid_y + 1 < args.dst_tensor.Height();

  if (y0_in) {
    FLT4 value = FLT4(r0);
    args.dst_tensor.Write(value, gid_x, gid_y, gid_z);
  }
  if (y1_in) {
    FLT4 value = FLT4(l0);
    args.dst_tensor.Write(value, gid_x, gid_y + 1, gid_z);
  }
}
)";

  return code;
}

// Reorders the 3x3 filter into the tap order consumed by
// GetKernelDepthWiseConv3x3Stride2 (y-major, row by row) and appends the
// per-slice bias, mirroring the cache-friendly packing of the stride-1
// variant.
std::vector<float> ReorderWeightsDepthWiseConv3x3Stride2(
    const DepthwiseConvolution2DAttributes& attr) {
  const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
  const int kernel_x = 3;
  const int kernel_y = 3;
  std::vector<float> weights_reordered((kernel_x * kernel_y + 1) * src_depth *
                                       4);

  int counter = 0;
  for (int s = 0; s < src_depth; ++s) {
    for (int y = 0; y < kernel_y; ++y) {
      for (int x = 0; x < kernel_x; ++x) {
        for (int i = 0; i < 4; ++i) {
          const int s_ch = s * 4 + i;
          if (s_ch < attr.weights.shape.i) {
            const int f_index = attr.weights.shape.LinearIndex({0, y, x, s_ch});
            weights_reordered[counter++] = attr.weights.data[f_index];
          } else {
            weights_reordered[counter++] = 0.0f;
          }
        }
      }
    }

    for (int i = 0; i < 4; ++i) {
      const int dst_ch = s * 4 + i;
      if (dst_ch < attr.bias.shape.v) {
        weights_reordered[counter++] = attr.bias.data[dst_ch];
      } else {
        weights_reordered[counter++] = 0.0f;
      }
    }
  }

  return weights_reordered;
}

}  // namespace

int3 DepthWiseConvolution::GetGridSize() const {
  return int3(dst_[0]->Width(), dst_[0]->Height(), dst_[0]->Slices());
}

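// Creates the generic depthwise-convolution task: one thread per output pixel,
// with kernel size, strides, dilations, padding, and the channel multiplier
// all passed as runtime arguments.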
DepthWiseConvolution CreateDepthWiseConvolution(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  int channels_multiplier = attr.weights.shape.o;
  std::string shader_source = R"(
kernel void ComputeFunction($0
                            uint tid[[thread_index_in_threadgroup]],
                            uint3 gid[[thread_position_in_grid]]) {
  int dst_x = static_cast<int>(gid.x);
  int dst_y = static_cast<int>(gid.y);
  int dst_z = static_cast<int>(gid.z);

  if (dst_x >= args.dst_tensor.Width() || dst_y >= args.dst_tensor.Height()) return;

  device FLT4* temp = args.weights.GetPtr() + dst_z * args.kernel_size_x * args.kernel_size_y;
  ACCUM_FLT4 sum0 = ACCUM_FLT4(0.0f, 0.0f, 0.0f, 0.0f);

  int src_x = dst_x * args.stride_x + args.padding_x;
  int src_y = dst_y * args.stride_y + args.padding_y;

  for (int ky = 0; ky < args.kernel_size_y; ++ky) {
    int yc = ky * args.dilation_y + src_y;
    if (yc < 0 || yc >= args.src_tensor.Height()) continue;
    for (int kx = 0; kx < args.kernel_size_x; ++kx) {
      int xc = kx * args.dilation_x + src_x;
      if (xc < 0 || xc >= args.src_tensor.Width()) continue;
)";
  if (channels_multiplier == 1) {
    shader_source += R"(
      int src_layer = dst_z;
      FLT4 src_modified = args.src_tensor.Read(xc, yc, src_layer);
)";
  } else if (channels_multiplier == 2) {
    shader_source += R"(
      int src_layer = dst_z / 2;
      FLT4 src = args.src_tensor.Read(xc, yc, src_layer);
      FLT2 t0 = dst_z % 2 == 0 ? src.xy : src.zw;
      FLT4 src_modified = FLT4(t0.x, t0.x, t0.y, t0.y);
)";
  } else if (channels_multiplier == 4) {
    shader_source += R"(
      int src_layer = dst_z / 4;
      FLT4 src = args.src_tensor.Read(xc, yc, src_layer);
      FLT t0 = src[dst_z % 4];
      FLT4 src_modified = FLT4(t0, t0, t0, t0);
)";
  } else {
    shader_source += R"(
      int src_layer = dst_z / args.channel_multiplier;
      FLT4 src = args.src_tensor.Read(xc, yc, src_layer);
      FLT4 src_modified;
      const int src_layer_offset = (dst_z % args.channel_multiplier) * 4;
      src_modified.x = src[(src_layer_offset + 0) / args.channel_multiplier];
      src_modified.y = src[(src_layer_offset + 1) / args.channel_multiplier];
      src_modified.z = src[(src_layer_offset + 2) / args.channel_multiplier];
      src_modified.w = src[(src_layer_offset + 3) / args.channel_multiplier];
)";
  }
  shader_source += R"(
      sum0 += TO_ACCUM_TYPE(src_modified * temp[ky * args.kernel_size_x + kx]);
    }
  }
  FLT4 res = FLT4(sum0) + args.biases.Read(dst_z);
  args.dst_tensor.Write(res, dst_x, dst_y, dst_z);
}
)";
  DepthWiseConvolution desc(definition);
  desc.code_ = shader_source;
  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -attr.padding.prepended.w);
  desc.args_.AddInt("padding_y", -attr.padding.prepended.h);
  desc.args_.AddInt("dilation_x", attr.dilations.w);
  desc.args_.AddInt("dilation_y", attr.dilations.h);
  desc.args_.AddInt("stride_x", attr.strides.w);
  desc.args_.AddInt("stride_y", attr.strides.h);
  desc.args_.AddInt("kernel_size_x", attr.weights.shape.w);
  desc.args_.AddInt("kernel_size_y", attr.weights.shape.h);
  desc.args_.AddInt("channel_multiplier", attr.weights.shape.o);

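  // Weights are uploaded in PIOHW4 layout to match the per-slice reads above;
  // biases are zero-padded so the buffer covers a multiple of four channels.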
  auto data_type = DeduceDataTypeFromPrecision(definition.precision);
  const int output_channels_count = attr.weights.shape.i * attr.weights.shape.o;
  const int dst_ch_aligned = AlignByN(output_channels_count, 4);
  BufferDescriptor weights_desc;
  weights_desc.element_type = data_type;
  weights_desc.element_size = 4;
  weights_desc.data =
      GetByteBufferConverted(ConvertToPIOHW4(attr.weights), data_type);
  weights_desc.size = weights_desc.data.size();
  desc.args_.AddObject(
      "weights", absl::make_unique<BufferDescriptor>(std::move(weights_desc)));

  BufferDescriptor bias_desc;
  bias_desc.element_type = data_type;
  bias_desc.element_size = 4;
  bias_desc.data =
      GetByteBufferConvertedResized(attr.bias.data, data_type, dst_ch_aligned);
  bias_desc.size = bias_desc.data.size();
  desc.args_.AddObject(
      "biases", absl::make_unique<BufferDescriptor>(std::move(bias_desc)));

  desc.work_group_size_ = int3(8, 4, 1);

  return desc;
}

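// Prefers an 8x4x1 workgroup; for very narrow grids (at most 4 groups of two
// pixels in x), parallelism is shifted from x to z when the slice count is
// even.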
void DepthWiseConv3x3Stride1x1::GetPossibleKernelWorkGroups(
    TuningType tuning_type, const GpuInfo& gpu_info,
    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
  const int grid_z = dst_[0]->Slices();
  int3 group_size{8, 4, 1};
  if (grid_x <= 4) {
    group_size.x = 4;
    group_size.z = grid_z % 2 == 0 ? 2 : 1;
  }
  work_groups->push_back(group_size);
}

int3 DepthWiseConv3x3Stride1x1::GetGridSize() const {
  const int grid_x = DivideRoundUp(dst_[0]->Width(), 2);
  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

DepthWiseConv3x3Stride1x1 CreateDepthWiseConv3x3Stride1x1(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  DepthWiseConv3x3Stride1x1 desc(definition);
  desc.code_ = GetKernelDepthWiseConv3x3Stride1x1();
  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -attr.padding.prepended.w);
  desc.args_.AddInt("padding_y", -attr.padding.prepended.h);

  // For this operation we keep weights and biases in one buffer.
  auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride1x1(attr);
  auto data_type = DeduceDataTypeFromPrecision(definition.precision);
  BufferDescriptor weights_desc;
  weights_desc.element_type = data_type;
  weights_desc.element_size = 4;
  weights_desc.data = GetByteBufferConverted(weights_reordered, data_type);
  weights_desc.size = weights_desc.data.size();
  desc.args_.AddObject(
      "weights", absl::make_unique<BufferDescriptor>(std::move(weights_desc)));

  return desc;
}

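// The stride-1 specialization applies only to 3x3 depthwise filters with a
// channel multiplier of 1, unit strides, and unit dilations.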
bool CheckDepthWiseConv3x3Stride1x1Support(
    const DepthwiseConvolution2DAttributes& attr) {
  return attr.weights.shape.o == 1 && attr.weights.shape.h == 3 &&
         attr.weights.shape.w == 3 && attr.strides.h == 1 &&
         attr.strides.w == 1 && attr.dilations.h == 1 && attr.dilations.w == 1;
}
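
// A typical selection sequence (a sketch only; the actual dispatch lives in
// the Metal operation selector, not in this file). The generic
// CreateDepthWiseConvolution handles every attribute combination, so the
// specialized variants are tried first:
//   if (CheckDepthWiseConv3x3Stride1x1Support(attr)) {
//     op = CreateDepthWiseConv3x3Stride1x1(definition, attr);
//   } else if (CheckDepthWiseConv3x3Stride2Support(attr)) {
//     op = CreateDepthWiseConv3x3Stride2(definition, attr);
//   } else {
//     op = CreateDepthWiseConvolution(definition, attr);
//   }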

int3 DepthWiseConv3x3Stride2::GetGridSize() const {
  const int grid_x = dst_[0]->Width();
  const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
  const int grid_z = dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

DepthWiseConv3x3Stride2 CreateDepthWiseConv3x3Stride2(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  DepthWiseConv3x3Stride2 desc(definition);
  desc.code_ = GetKernelDepthWiseConv3x3Stride2();
  desc.AddSrcTensor("src_tensor", definition.src_tensors[0]);
  desc.AddDstTensor("dst_tensor", definition.dst_tensors[0]);

  desc.args_.AddInt("padding_x", -attr.padding.prepended.w);
  desc.args_.AddInt("padding_y", -attr.padding.prepended.h);
  desc.args_.AddInt("stride_x", attr.strides.w);
  desc.args_.AddInt("dilation_x", attr.dilations.w);

  // For this operation we keep weights and biases in one buffer.
  auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride2(attr);
  auto data_type = DeduceDataTypeFromPrecision(definition.precision);
  BufferDescriptor weights_desc;
  weights_desc.element_type = data_type;
  weights_desc.element_size = 4;
  weights_desc.data = GetByteBufferConverted(weights_reordered, data_type);
  weights_desc.size = weights_desc.data.size();
  desc.args_.AddObject(
      "weights", absl::make_unique<BufferDescriptor>(std::move(weights_desc)));

  desc.work_group_size_ = int3(8, 4, 1);
  return desc;
}

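// The stride-2 specialization fixes stride 2 and dilation 1 in y only; stride
// and dilation in x stay runtime parameters of the generated kernel, so they
// are not checked here.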
bool CheckDepthWiseConv3x3Stride2Support(
    const DepthwiseConvolution2DAttributes& attr) {
  return attr.weights.shape.o == 1 && attr.weights.shape.h == 3 &&
         attr.weights.shape.w == 3 && attr.strides.h == 2 &&
         attr.dilations.h == 1;
}

}  // namespace metal
}  // namespace gpu
}  // namespace tflite