| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3X3_H_ |
| #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3X3_H_ |
| |
| #include <memory> |
| #include <vector> |
| |
| #include "tensorflow/lite/delegates/gpu/cl/buffer.h" |
| #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" |
| #include "tensorflow/lite/delegates/gpu/cl/tensor.h" |
| #include "tensorflow/lite/delegates/gpu/cl/texture2d.h" |
| #include "tensorflow/lite/delegates/gpu/cl/util.h" |
| #include "tensorflow/lite/delegates/gpu/common/data_type.h" |
| #include "tensorflow/lite/delegates/gpu/common/operations.h" |
| #include "tensorflow/lite/delegates/gpu/common/shape.h" |
| #include "tensorflow/lite/delegates/gpu/common/status.h" |
| #include "tensorflow/lite/delegates/gpu/common/tensor.h" |
| #include "tensorflow/lite/delegates/gpu/common/types.h" |
| |
| namespace tflite { |
| namespace gpu { |
| namespace cl { |
| |
| class DepthwiseConv3x3 : public GPUOperation { |
| public: |
| DepthwiseConv3x3() = default; |
| absl::Status AddToQueue(CLCommandQueue* queue) override; |
| absl::Status Tune(const TuningParameters& params) override; |
| |
| absl::Status Compile(const CreationContext& creation_context) override; |
| |
| // Move only |
| DepthwiseConv3x3(DepthwiseConv3x3&& operation); |
| DepthwiseConv3x3& operator=(DepthwiseConv3x3&& operation); |
| DepthwiseConv3x3(const DepthwiseConv3x3&) = delete; |
| DepthwiseConv3x3& operator=(const DepthwiseConv3x3&) = delete; |
| |
| private: |
| explicit DepthwiseConv3x3(const OperationDef& definition, |
| bool weights_are_buffer, bool local_mem_uploads); |
| template <DataType T> |
| absl::Status UploadWeightsAndBiases( |
| const tflite::gpu::Tensor<OHWI, T>& weights, |
| const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context); |
| |
| friend absl::Status CreateDepthwiseConv3x3( |
| const CreationContext& creation_context, const OperationDef& definition, |
| const DepthwiseConvolution2DAttributes& attr, DepthwiseConv3x3* result); |
| |
| template <DataType S, typename T> |
| void RearrangeWeightsAndBiasesData( |
| const tflite::gpu::Tensor<OHWI, S>& weights, |
| const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst); |
| |
| absl::Status BindArguments(); |
| int3 GetGridSize() const; |
| |
| bool weights_are_buffer_; |
| bool local_mem_uploads_; |
| Texture2D weights_tex2d_; |
| Buffer weights_buf_; |
| cl_mem weights_; |
| |
| CLKernel kernel_; |
| int3 work_group_size_ = int3(8, 4, 1); |
| }; |
| |
| template <DataType T> |
| absl::Status DepthwiseConv3x3::UploadWeightsAndBiases( |
| const tflite::gpu::Tensor<OHWI, T>& weights, |
| const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) { |
| const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); |
| int texture_width = 10; // 3x3 kernel + 1 bias |
| int texture_height = src_depth; |
| const int elements_count = texture_width * texture_height; |
| const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; |
| const int float4_size = fp32_weights ? 16 : 8; |
| |
| if (fp32_weights) { |
| std::vector<float4> gpu_data(elements_count); |
| RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data)); |
| if (weights_are_buffer_) { |
| RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, |
| gpu_data.data(), context, |
| &weights_buf_)); |
| } else { |
| RETURN_IF_ERROR(CreateTexture2DRGBA( |
| definition_.GetDataType(), texture_width, texture_height, |
| gpu_data.data(), context, &weights_tex2d_)); |
| } |
| } else { |
| std::vector<half4> gpu_data(elements_count); |
| RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data)); |
| if (weights_are_buffer_) { |
| RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, |
| gpu_data.data(), context, |
| &weights_buf_)); |
| } else { |
| RETURN_IF_ERROR(CreateTexture2DRGBA( |
| definition_.GetDataType(), texture_width, texture_height, |
| gpu_data.data(), context, &weights_tex2d_)); |
| } |
| } |
| |
| if (weights_are_buffer_) { |
| weights_ = weights_buf_.GetMemoryPtr(); |
| } else { |
| weights_ = weights_tex2d_.GetMemoryPtr(); |
| } |
| |
| return absl::OkStatus(); |
| } |
| |
| template <DataType S, typename T> |
| void DepthwiseConv3x3::RearrangeWeightsAndBiasesData( |
| const tflite::gpu::Tensor<OHWI, S>& weights, |
| const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) { |
| const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); |
| |
| int counter = 0; |
| for (int s = 0; s < src_depth; ++s) { |
| for (int y = 0; y < 3; ++y) { |
| for (int x = 0; x < 3; ++x) { |
| T filter_val; |
| for (int i = 0; i < 4; ++i) { |
| const int s_ch = s * 4 + i; |
| if (s_ch < weights.shape.i) { |
| const int f_index = weights.shape.LinearIndex({0, y, x, s_ch}); |
| filter_val[i] = weights.data[f_index]; |
| } else { |
| filter_val[i] = 0.0f; |
| } |
| } |
| dst[counter++] = filter_val; |
| } |
| } |
| |
| T bias_val; |
| for (int i = 0; i < 4; ++i) { |
| const int dst_ch = s * 4 + i; |
| bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch]; |
| } |
| dst[counter++] = bias_val; |
| } |
| } |
| |
// Predicate over convolution attributes; defined outside this header.
// NOTE(review): presumably reports whether DepthwiseConv3x3 can handle `attr`
// -- confirm against the implementation before relying on it.
| bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr); |
| |
// Factory for DepthwiseConv3x3; defined outside this header. It is declared a
// friend of the class, so it can use the private constructor and the private
// weight-upload helper. The operation is handed back through `result` and the
// outcome is reported via the returned status.
// NOTE(review): presumably callers should check IsDepthwiseConv3x3Supported()
// first -- confirm whether this function validates `attr` itself.
| absl::Status CreateDepthwiseConv3x3( |
| const CreationContext& creation_context, const OperationDef& definition, |
| const DepthwiseConvolution2DAttributes& attr, DepthwiseConv3x3* result); |
| |
| } // namespace cl |
| } // namespace gpu |
| } // namespace tflite |
| |
| #endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_DEPTHWISE_CONV_3X3_H_ |