ConvPowerVR modified to support kernel languages without pointers.
PiperOrigin-RevId: 383894510
Change-Id: Ib6ff926e5caef7237dbea4e3a274fc480705b5c2
diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.cc b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
index e7cf0dd..860043c 100644
--- a/tensorflow/lite/delegates/gpu/common/gpu_info.cc
+++ b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
@@ -484,6 +484,10 @@
return true;
}
+bool GpuInfo::SupportsPointersInKernels() const {
+ return IsApiOpenCl() || IsApiMetal();  // true only when the API is OpenCL or Metal
+}
+
bool GpuInfo::IsWaveSizeEqualTo32() const {
return supported_subgroup_sizes.size() == 1 &&
supported_subgroup_sizes[0] == 32;
diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.h b/tensorflow/lite/delegates/gpu/common/gpu_info.h
index 1387614..cedd5a2 100644
--- a/tensorflow/lite/delegates/gpu/common/gpu_info.h
+++ b/tensorflow/lite/delegates/gpu/common/gpu_info.h
@@ -361,6 +361,8 @@
bool SupportsImageBuffer() const;
bool SupportsImage3D() const;
+ bool SupportsPointersInKernels() const;  // true for the OpenCL and Metal APIs (see gpu_info.cc)
+
// returns true if device have fixed wave size equal to 32
bool IsWaveSizeEqualTo32() const;
bool SupportsSubGroupWithSize(int sub_group_size) const;
diff --git a/tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.cc b/tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.cc
index 7aef649..3454b8f 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/conv_powervr.cc
@@ -613,17 +613,18 @@
if (need_local_mem) {
c += " __local " + weights_data_type + " weights_cache[" +
std::to_string(local_mem_size) + "];\n";
- } else if (conv_params.AreWeightsBuffer()) {
+ } else if (conv_params.AreWeightsBuffer() &&
+ gpu_info.SupportsPointersInKernels()) {
c += " " + weights_global_ptr + " weights_cache;\n";
} else if (!trivial_kernel_size) {
c += " int filter_offset = 0;\n";
}
if (conv_params.AreWeightsBuffer()) {
+ std::string offset;
if (conv_params.different_weights_for_height) {
- c += " " + weights_global_ptr +
- " filters_loc = args.weights.GetPtr() + (DST_S * "
- "args.src_tensor.Height() + DST_Y * " +
- std::to_string(block_size.w) + ") * 4 * args.src_tensor.Slices();\n";
+ offset = "(DST_S * args.src_tensor.Height() + DST_Y * " +
+ std::to_string(block_size.w) +
+ ") * 4 * args.src_tensor.Slices()";
} else {
std::string kernel_spatial_offset = "";
if (!conv_params_.x_kernel_is_1) {
@@ -635,10 +636,13 @@
if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
kernel_spatial_offset += " * args.kernel_size_z";
}
+ offset = "DST_S * 4 * args.src_tensor.Slices()" + kernel_spatial_offset;
+ }
+ if (gpu_info.SupportsPointersInKernels()) {
c += " " + weights_global_ptr +
- " filters_loc = args.weights.GetPtr() + DST_S * 4 * "
- "args.src_tensor.Slices()" +
- kernel_spatial_offset + ";\n";
+ " filters_loc = args.weights.GetPtr() + " + offset + ";\n";
+ } else {
+ c += " int filters_offset = " + offset + ";\n";
}
}
if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) {
@@ -848,7 +852,12 @@
std::to_string(s * 4 + ch + shared_offset);
std::string w_val;
if (conv_params.AreWeightsBuffer()) {
- w_val = "weights_cache[" + weight_id + "]";
+ if (gpu_info.SupportsPointersInKernels()) {
+ w_val = "weights_cache[" + weight_id + "]";
+ } else {
+ w_val = "args.weights.Read(filters_offset + " +
+ weight_id + ")";
+ }
} else {
w_val = "f" + weight_id;
}
@@ -878,7 +887,12 @@
std::string weight_id =
std::to_string(s * 4 + i + shared_offset);
if (conv_params.AreWeightsBuffer()) {
- F[i] = "weights_cache[" + weight_id + "]";
+ if (gpu_info.SupportsPointersInKernels()) {
+ F[i] = "weights_cache[" + weight_id + "]";
+ } else {
+ F[i] =
+ "args.weights.Read(filters_offset + " + weight_id + ")";
+ }
} else {
F[i] = "f" + weight_id;
}
@@ -931,7 +945,9 @@
c += " }\n";
}
} else if (conv_params.AreWeightsBuffer()) { // GLOBAL_MEM/CONSTANT_MEM
- c += " weights_cache = filters_loc;\n";
+ if (gpu_info.SupportsPointersInKernels()) {
+ c += " weights_cache = filters_loc;\n";
+ }
} else { // TEXTURES_MEM
for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
@@ -964,7 +980,11 @@
c += " s += 1;\n";
}
if (conv_params.AreWeightsBuffer()) {
- c += " filters_loc += " + std::to_string(local_mem_size) + ";\n";
+ if (gpu_info.SupportsPointersInKernels()) {
+ c += " filters_loc += " + std::to_string(local_mem_size) + ";\n";
+ } else {
+ c += " filters_offset += " + std::to_string(local_mem_size) + ";\n";
+ }
}
c += " } while (s < args.src_tensor.Slices());\n";
if (!conv_params.x_kernel_is_1) {
@@ -988,7 +1008,7 @@
"DST_S", "lid", total_work_items,
block_size.w);
c += " " + barrier + ";\n";
- } else {
+ } else if (gpu_info.SupportsPointersInKernels()) {
c += " weights_cache = args.biases.GetPtr() + DST_S;\n";
}
}
@@ -1023,7 +1043,8 @@
const std::string sind = std::to_string(s);
c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
c += " {\n";
- if (conv_params.AreWeightsBuffer()) {
+ if (conv_params.AreWeightsBuffer() &&
+ gpu_info.SupportsPointersInKernels()) {
c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
} else {
c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";