TFLite iOS GPU: Use GetByteBufferConverted across all operations, replacing the per-call FP32/FP16 ternary and the now-removed VectorFloatToHalf helper.

PiperOrigin-RevId: 273598884
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
index 398635a..e27e697 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.cc
@@ -29,16 +29,6 @@
 namespace gpu {
 namespace metal {
 
-/// Helper function to convert buffer's content into stream of bytes
-std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector) {
-  std::vector<HalfBits> result;
-  result.reserve(input_vector.size());
-  for (const float v : input_vector) {
-    result.push_back(fp16_ieee_from_fp32_value(v));
-  }
-  return GetByteBuffer(result);
-}
-
 /// Converts float to destination type (if needed) and stores as bytes array.
 std::vector<uint8_t> GetByteBufferConverted(
     const std::vector<float>& input_vector,
diff --git a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h
index f1a22a4..be45cdf 100644
--- a/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h
+++ b/tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h
@@ -135,9 +135,6 @@
     const std::vector<float>& input_vector,
     RuntimeOptions::Precision destination_type, size_t elements_count);
 
-/// Helper function to convert FP32 to FP16 and into stream of bytes.
-std::vector<uint8_t> VectorFloatToHalf(const std::vector<float>& input_vector);
-
 }  // namespace metal
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/add.cc b/tensorflow/lite/delegates/gpu/metal/kernels/add.cc
index 9edb617..6e9bfce 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/add.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/add.cc
@@ -95,11 +95,9 @@
       device FLT4* const broadcast) { return value + broadcast[gid.z]; })";
     desc->input_buffers = {{input_ids[0]}};
     desc->output_buffer = {output_id};
-    auto values = options.storage_precision == RuntimeOptions::Precision::FP32
-                      ? GetByteBuffer(broadcast->data)
-                      : VectorFloatToHalf(broadcast->data);
     desc->immutable_buffers = {
-        {"device FLT4* const", values},
+        {"device FLT4* const",
+         GetByteBufferConverted(broadcast->data, options.storage_precision)},
     };
     return {desc};
   }
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
index 11564be..60ac73a 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv.cc
@@ -931,11 +931,9 @@
       }};
 
   auto weights_reordered = ReorderWeightsForConvShared(params);
-  auto weights = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(weights_reordered)
-                     : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const weights", weights},
+      {"device FLT4* const weights",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -990,12 +988,9 @@
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1051,12 +1046,9 @@
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1108,12 +1100,9 @@
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
@@ -1169,12 +1158,9 @@
       }};
 
   auto weights_reordered = ReorderWeightsForConv(params, z_out);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(params.bias.data,
                                      options.storage_precision,
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
index d667006..9fa627b 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv.cc
@@ -566,19 +566,14 @@
         return out_shape;
       }};
 
-  std::vector<float> filters_reordered = ConvertToPIOHW4(attr.weights);
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
-  auto biases = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(attr.bias.data)
-                    : VectorFloatToHalf(attr.bias.data);
+  const int output_channels_count = attr.weights.shape.i * attr.weights.shape.o;
   desc->immutable_buffers = {
-      {"device FLT4* const filters", filters},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(ConvertToPIOHW4(attr.weights),
+                              options.storage_precision)},
       {"device FLT4* const biases",
-       GetByteBufferConvertedResized(
-           attr.bias.data, options.storage_precision,
-           attr.weights.shape.i * attr.weights.shape.o)},
+       GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
+                                     output_channels_count)},
   };
 
   desc->uniform_buffers = {
@@ -647,12 +642,9 @@
 
   // For this operation we keep weights and biases in one buffer
   auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride1x1(attr);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
   };
 
   desc->uniform_buffers = {
@@ -714,12 +706,9 @@
 
   // For this operation we keep weights and biases in one buffer
   auto weights_reordered = ReorderWeightsDepthWiseConv3x3Stride2(attr);
-  auto weights =
-      options.storage_precision == metal::RuntimeOptions::Precision::FP32
-          ? GetByteBuffer(weights_reordered)
-          : VectorFloatToHalf(weights_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const filters", weights},
+      {"device FLT4* const filters",
+       GetByteBufferConverted(weights_reordered, options.storage_precision)},
   };
 
   desc->uniform_buffers = {
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
index 25c7440..67bd989 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected.cc
@@ -159,11 +159,9 @@
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
   desc->immutable_buffers = {
-      {"device FLT4* const matrix", filters},
+      {"device FLT4* const matrix",
+       GetByteBufferConverted(filters_reordered, options.storage_precision)},
       {"device FLT4* const biases",
        GetByteBufferConvertedResized(attr.bias.data, options.storage_precision,
                                      attr.weights.shape.o)},
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/mul.cc b/tensorflow/lite/delegates/gpu/metal/kernels/mul.cc
index 59c7c83..4d59622 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/mul.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/mul.cc
@@ -69,11 +69,9 @@
          }},
     };
   } else {
-    auto coeffs = options.storage_precision == RuntimeOptions::Precision::FP32
-                      ? GetByteBuffer(mul_buffer->data)
-                      : VectorFloatToHalf(mul_buffer->data);
     desc->immutable_buffers = {
-        {"device FLT4* const", coeffs},
+        {"device FLT4* const",
+         GetByteBufferConverted(mul_buffer->data, options.storage_precision)},
     };
   }
   return {desc};
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/prelu.cc b/tensorflow/lite/delegates/gpu/metal/kernels/prelu.cc
index 3382cbb..baa8312 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/prelu.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/prelu.cc
@@ -61,11 +61,9 @@
   }
   desc->input_buffers = {{input_id}};
   desc->output_buffer = {output_id};
-  auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(alpha_buffer->data)
-                    : VectorFloatToHalf(alpha_buffer->data);
   desc->immutable_buffers = {
-      {"device FLT4* const", alphas},
+      {"device FLT4* const",
+       GetByteBufferConverted(alpha_buffer->data, options.storage_precision)},
   };
   if (attr.clip != 0) {
     desc->uniform_buffers = {
@@ -106,11 +104,9 @@
   }
   desc->input_buffers = {{input_id}};
   desc->output_buffer = {output_id};
-  auto alphas = options.storage_precision == RuntimeOptions::Precision::FP32
-                    ? GetByteBuffer(ConvertToPHWC4(*alpha))
-                    : VectorFloatToHalf(ConvertToPHWC4(*alpha));
   desc->immutable_buffers = {
-      {"device FLT4* const", alphas},
+      {"device FLT4* const", GetByteBufferConverted(ConvertToPHWC4(*alpha),
+                                                    options.storage_precision)},
   };
   if (attr.clip != 0) {
     desc->uniform_buffers = {
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
index 73f907e..70e7c52 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.cc
@@ -950,9 +950,8 @@
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
+  auto filters =
+      GetByteBufferConverted(filters_reordered, options.storage_precision);
   desc->immutable_buffers = {
       {"device FilterStripe* const filters", filters},
       {"device FLT4* const biases",
@@ -1044,9 +1043,8 @@
     }
   }
 
-  auto filters = options.storage_precision == RuntimeOptions::Precision::FP32
-                     ? GetByteBuffer(filters_reordered)
-                     : VectorFloatToHalf(filters_reordered);
+  auto filters =
+      GetByteBufferConverted(filters_reordered, options.storage_precision);
   auto biases = GetByteBufferConvertedResized(
       params.bias.data, options.storage_precision, params.weights.shape.o);
   border_desc->immutable_buffers = {