/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_

#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_common.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
namespace tflite {
namespace gpu {
namespace cl {
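
// ConvBuffer1x1 computes a 1x1 convolution on tensors stored in buffer
// memory. The same operation also backs fully connected layers and the
// Winograd 4x4-to-6x6 variant; see the Create* factory functions below.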
class ConvBuffer1x1 : public GPUOperation {
public:
  ConvBuffer1x1() = default;

// Move only
ConvBuffer1x1(ConvBuffer1x1&& operation);
ConvBuffer1x1& operator=(ConvBuffer1x1&& operation);
ConvBuffer1x1(const ConvBuffer1x1&) = delete;
  ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;

absl::Status Tune(const TuningParameters& params) override;
absl::Status Compile(const CreationContext& creation_context) override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
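
  // Describes the weights layout this operation consumes (OHWI rearranged
  // into I4O4 groups); output slices are grouped by block_size.z.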
ConvWeightsDescription GetConvWeightsDescription() const {
ConvWeightsDescription desc;
desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4;
desc.output_group_size = conv_params_.block_size.z;
return desc;
}
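
  // Tuning parameters for the generated OpenCL kernel.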
struct ConvParams {
int3 block_size = int3(1, 1, 1);
int element_size = 4; // can be 4, 8 or 16

    // By default a 2D convolution uses the same weights for the W and H
    // dimensions, but in some cases separate weights are needed for the H
    // dimension; the kernel requires only minor modifications to support this.
bool different_weights_for_height = false;
int3 work_group_size = int3(2, 4, 1);
  };

 private:
ConvBuffer1x1(const OperationDef& definition, const ConvParams& conv_params);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1(
const CreationContext& creation_context, const OperationDef& definition,
const FullyConnectedAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1Wino4x4To6x6(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, ConvBuffer1x1* result,
const BHWC* shape);
friend absl::Status CreateConvBuffer1x1DynamicWeights(
const CreationContext& creation_context, const OperationDef& definition,
const Convolution2DAttributes& attr, const BHWC& weights_shape,
ConvBuffer1x1* result, const BHWC* dst_shape);
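
  // Uploads convolution weights and biases to device memory.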
template <DataType T>
absl::Status UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);
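
  // Rearranges the weights for the Winograd 4x4-to-6x6 transform and uploads
  // them together with zero biases.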
template <DataType T>
absl::Status UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context);
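
  // Uploads the weights rearranged into the OHWIOGroupI4O4 layout, with the
  // output slices padded to a multiple of block_size.z.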
template <DataType T>
absl::Status UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
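
  // Uploads the biases zero-padded to a multiple of 4 * block_size.z channels.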
template <DataType T>
absl::Status UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases,
CLContext* context);

  ConvParams conv_params_;
};

template <DataType T>
absl::Status ConvBuffer1x1::UploadData(
const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
RETURN_IF_ERROR(UploadWeights(weights, context));
RETURN_IF_ERROR(UploadBiases(biases, context));
return absl::OkStatus();
}

template <DataType T>
absl::Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
CLContext* context) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
RETURN_IF_ERROR(UploadWeights(wino_weights, context));
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(weights.shape.o);
bias.data.resize(weights.shape.o, 0.0f);
RETURN_IF_ERROR(UploadBiases(bias, context));
return absl::OkStatus();
}

template <DataType T>
absl::Status ConvBuffer1x1::UploadWeights(
const tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
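  // Each (dst_slice, src_slice, h, w) group stores a 4x4 block of weights as
  // four float4/half4 values; output slices are padded to the Z block size.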
const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
Buffer weights_buffer;
if (f32_weights) {
std::vector<float4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
} else {
std::vector<half4> gpu_data(elements_count);
RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
absl::MakeSpan(gpu_data));
RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count,
gpu_data.data(), context,
&weights_buffer));
}
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 16;
desc.memory_type = MemoryType::GLOBAL;
args_.AddObject("weights", AccessType::READ,
absl::make_unique<Buffer>(std::move(weights_buffer)),
absl::make_unique<BufferDescriptor>(desc));
return absl::OkStatus();
}

template <DataType T>
absl::Status ConvBuffer1x1::UploadBiases(
const tflite::gpu::Tensor<Linear, T>& biases, CLContext* context) {
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::BUFFER;
desc.element_type = definition_.GetDataType();
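  // Zero-pad the biases so their length is a multiple of 4 * block_size.z,
  // matching the padded output slices of the weights buffer.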
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias = biases;
int channels = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z);
bias.shape = Linear(channels);
bias.data.resize(channels, 0.0f);
LinearStorage lt;
RETURN_IF_ERROR(CreateLinearStorage(desc, bias, context, &lt));
args_.AddObject("biases", AccessType::READ,
absl::make_unique<LinearStorage>(std::move(lt)),
absl::make_unique<TensorLinearDescriptor>(desc));
return absl::OkStatus();
}

bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const Convolution2DAttributes& attr);

bool IsConvBuffer1x1Supported(const OperationDef& definition,
                              const BHWC& weights_shape,
                              const Convolution2DAttributes& attr);

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                 const OperationDef& definition,
                                 const Convolution2DAttributes& attr,
                                 ConvBuffer1x1* result,
                                 const BHWC* shape = nullptr);

absl::Status CreateConvBuffer1x1(const CreationContext& creation_context,
                                 const OperationDef& definition,
                                 const FullyConnectedAttributes& attr,
                                 ConvBuffer1x1* result,
                                 const BHWC* shape = nullptr);

absl::Status CreateConvBuffer1x1DynamicWeights(
    const CreationContext& creation_context, const OperationDef& definition,
    const Convolution2DAttributes& attr, const BHWC& weights_shape,
    ConvBuffer1x1* result, const BHWC* dst_shape = nullptr);

absl::Status CreateConvBuffer1x1Wino4x4To6x6(
    const CreationContext& creation_context, const OperationDef& definition,
    const Convolution2DAttributes& attr, ConvBuffer1x1* result,
    const BHWC* shape = nullptr);
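
// Usage sketch, assuming a valid CreationContext `creation_context`, an
// OperationDef `definition`, and Convolution2DAttributes `attr` for a 1x1
// convolution (names are illustrative; the actual setup depends on the
// surrounding inference context):
//
//   if (IsConvBuffer1x1Supported(definition, attr)) {
//     ConvBuffer1x1 conv;
//     RETURN_IF_ERROR(
//         CreateConvBuffer1x1(creation_context, definition, attr, &conv));
//     RETURN_IF_ERROR(conv.Compile(creation_context));
//   }
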
}  // namespace cl
}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_1X1_H_