// blob: e5ed20f62b938ae11964cf092ca29da94263b16e [file] [log] [blame]
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_UTIL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_UTIL_H_
#include <string>
#include "absl/types/span.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
#include "tensorflow/lite/delegates/gpu/common/access_type.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
namespace cl {
// Returns a string selected by the requested calculation precision.
// NOTE(review): judging by the name, this is the block of preprocessor defines
// shared by the generated kernels; the implementation is not visible in this
// header -- confirm against the definition.
std::string GetCommonDefines(CalculationsPrecision precision);
// Addressing mode requested when reading from a texture. Each value is
// translated to the OpenCL sampler addressing mode named in its comment.
enum class TextureAddressMode {
// No particular out-of-range behaviour is requested.
DONT_CARE, // translated to CLK_ADDRESS_NONE
// Out-of-range reads are expected to yield zero.
ZERO, // translated to CLK_ADDRESS_CLAMP
};
// Builds strings that declare, address, read and write a single tensor. The
// generator is configured with the tensor's name, the name of its uniform size
// variable and its TensorDescriptor. Coordinates are passed in as strings.
// NOTE(review): the method bodies are not part of this header; the returned
// strings are presumably OpenCL kernel source fragments -- confirm against the
// implementation file.
class TensorCodeGenerator {
public:
TensorCodeGenerator() = default;
// name: identifier of the tensor.
// uniform_size_name: identifier of the variable holding the tensor size.
// descriptor: description of the tensor.
// NOTE(review): presumably stored in tensor_name_, uniform_size_name_ and
// descriptor_ respectively; the constructor is defined elsewhere -- confirm.
TensorCodeGenerator(const std::string& name,
const std::string& uniform_size_name,
const TensorDescriptor& descriptor);
// Returns the declaration string of the tensor for the given access type.
std::string GetDeclaration(AccessType access) const;
// Returns the address string for coordinates (x, y, z), associated with the
// variable named var_name.
// NOTE(review): presumably emits a declaration of var_name initialised with
// the computed address (see DeclareAddress) -- confirm.
std::string GetAddress(const std::string& var_name, const std::string& x,
const std::string& y, const std::string& z) const;
// Same as above, with an additional coordinate b.
// NOTE(review): b is presumably the batch index -- confirm.
std::string GetAddress(const std::string& var_name, const std::string& x,
const std::string& y, const std::string& z,
const std::string& b) const;
// Returns the string that reads the tensor at coordinates (x, y, z).
// This function (and the functions below) accepts a TextureAddressMode, but
// the argument applies only to texture types. Buffer types ignore this
// parameter.
std::string Read3D(
const std::string& x, const std::string& y, const std::string& z,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Returns the string that reads the tensor at coordinates (x, y, z, b).
// Read4D supports BUFFER and IMAGE_BUFFER storage types.
std::string Read4D(
const std::string& x, const std::string& y, const std::string& z,
const std::string& b,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Variant of Read3D that reads the value as float.
// Optimization for textures: in OpenCL, read_imagef can be used for any
// texture type.
std::string ReadAsFloat3D(
const std::string& x, const std::string& y, const std::string& z,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Variant of Read4D that reads the value as float.
std::string ReadAsFloat4D(
const std::string& x, const std::string& y, const std::string& z,
const std::string& b,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Returns the string that writes var_name to the tensor at coordinates
// (x, y, z).
std::string Write3D(const std::string& var_name, const std::string& x,
const std::string& y, const std::string& z) const;
// Returns the string that writes var_name to the tensor at coordinates
// (x, y, z, b).
// Write4D supports BUFFER and IMAGE_BUFFER storage types.
std::string Write4D(const std::string& var_name, const std::string& x,
const std::string& y, const std::string& z,
const std::string& b) const;
// Returns the string that reads the tensor at global_address.
// NOTE(review): global_address is presumably an address produced by
// GetAddress -- confirm.
std::string Read(
const std::string& global_address,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Variant of Read that reads the value as float.
// Optimization for textures: in OpenCL, read_imagef can be used for any
// texture type.
std::string ReadAsFloat(
const std::string& global_address,
TextureAddressMode address_mode = TextureAddressMode::ZERO) const;
// Returns the string that writes var_name to the tensor at global_address.
std::string Write(const std::string& var_name,
const std::string& global_address) const;
private:
// Returns the address expression for (x, y, z) without declaring a variable
// for it.
std::string GetGlobalAddressNoDeclaration(const std::string& x,
const std::string& y,
const std::string& z) const;
// Same as above, with an additional coordinate b.
std::string GetGlobalAddressNoDeclaration(const std::string& x,
const std::string& y,
const std::string& z,
const std::string& b) const;
// Returns the string that declares var_name for the given address.
std::string DeclareAddress(const std::string& var_name,
const std::string& address) const;
// Identifier of the tensor.
std::string tensor_name_;
// Identifier of the variable holding the tensor size.
std::string uniform_size_name_;
// Description of the tensor.
TensorDescriptor descriptor_;
};
// Repacks OHWI weights into `dst` as consecutive groups of four T values.
//
// Output and input channels are walked in slices of 4; the slice counts are
// the channel counts rounded up via IntegralDivideRoundUp. Groups are emitted
// in the order: output slice, kernel row, kernel column, input slice. Within a
// group, element j corresponds to input channel (input slice * 4 + j) and its
// component i holds the weight for output channel (output slice * 4 + i).
// Components whose input or output channel lies beyond the weights' shape are
// set to zero.
//
// `dst` is filled sequentially starting at index 0. This function does not
// check the size of `dst`; it writes
// (output slices * kernel height * kernel width * input slices * 4) elements.
template <DataType S, typename T>
void RearrangeWeightsToOHWI4I4O(const ::tflite::gpu::Tensor<OHWI, S>& weights,
                                absl::Span<T> dst) {
  const int out_slices = IntegralDivideRoundUp(weights.shape.o, 4);
  const int in_slices = IntegralDivideRoundUp(weights.shape.i, 4);
  const int kernel_w = weights.shape.w;
  const int kernel_h = weights.shape.h;

  int write_pos = 0;
  for (int out_slice = 0; out_slice < out_slices; ++out_slice) {
    for (int ky = 0; ky < kernel_h; ++ky) {
      for (int kx = 0; kx < kernel_w; ++kx) {
        for (int in_slice = 0; in_slice < in_slices; ++in_slice) {
          T packed[4];
          for (int i = 0; i < 4; ++i) {
            // The output channel depends only on i, so compute it once per row.
            const int out_ch = out_slice * 4 + i;
            for (int j = 0; j < 4; ++j) {
              const int in_ch = in_slice * 4 + j;
              // Padding lanes past the real channel counts are zero-filled.
              if (in_ch >= weights.shape.i || out_ch >= weights.shape.o) {
                packed[j][i] = 0.0f;
                continue;
              }
              const int src_index =
                  weights.shape.LinearIndex({out_ch, ky, kx, in_ch});
              packed[j][i] = weights.data[src_index];
            }
          }
          // Emit the four packed values in order.
          for (int k = 0; k < 4; ++k) {
            dst[write_pos++] = packed[k];
          }
        }
      }
    }
  }
}
// Returns the fastest TextureAddressMode that returns ZERO for out-of-range
// image coordinates on the given device.
//
// Unfortunately, CLK_ADDRESS_CLAMP is very slow on Adreno 3xx, and we can
// observe huge register overhead compared to other modes.
// Although using CLK_ADDRESS_NONE with out-of-range image coordinates is
// undefined in the OpenCL specification, we have observed that
// CLK_ADDRESS_NONE behaves like CLK_ADDRESS_CLAMP for out-of-range image
// coordinates for RGBA F16/F32 textures on Adreno 3xx devices. Using
// CLK_ADDRESS_NONE is significantly faster than CLK_ADDRESS_CLAMP on
// Adreno 3xx.
TextureAddressMode GetFastestZeroMode(const CLDevice& device);
// Returns a float4 mask for the last plane (batch of 4 channels).
// Assumes that the plane size is 4.
// For example, with 7 channels our data structures align the count to 8, but
// the 8th channel is empty, so the last plane (batch of 4 channels) has the
// mask (1, 1, 1, 0).
float4 GetMaskForLastPlane(int channels);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_UTIL_H_