Changed GPUOperation interface.
AddToQueue changed to a non-virtual method.
New virtual methods BindArguments and GetGridSize.
Using the default Tuning method for the majority of ops.

PiperOrigin-RevId: 321427369
Change-Id: I7186945a1f9e744c9ea6ec0c8d29612622845c77
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index 7ff10f1..9e57dd1 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -197,6 +197,7 @@
   RETURN_IF_ERROR(AllocateMemory(env->device(), creation_context.context));
   BindMemoryToOperations();
   RETURN_IF_ERROR(Compile(creation_context));
+  RETURN_IF_ERROR(UpdateParams());
 
   TuningParameters tuning_parameters;
   tuning_parameters.queue = env->profiling_queue();
@@ -554,6 +555,13 @@
   return absl::OkStatus();
 }
 
+absl::Status InferenceContext::UpdateParams() {
+  for (auto& node : nodes_) {
+    RETURN_IF_ERROR(node.operations[0]->UpdateParams());
+  }
+  return absl::OkStatus();
+}
+
 absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
   if (need_manual_release_) {
     if (prev_enqueue_start_point_.is_valid()) {
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.h b/tensorflow/lite/delegates/gpu/cl/inference_context.h
index 7536525..3f05026 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.h
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.h
@@ -114,6 +114,7 @@
   void BindMemoryToOperations();
   absl::Status Compile(const CreationContext& creation_context);
   absl::Status Tune(const TuningParameters& tuning_parameters);
+  absl::Status UpdateParams();
 
   // performance hacks
   bool need_flush_ = false;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc
index deb0ebf..f864a73 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc
@@ -56,6 +56,7 @@
   }
 
   RETURN_IF_ERROR(operation->Compile(creation_context));
+  RETURN_IF_ERROR(operation->UpdateParams());
   RETURN_IF_ERROR(operation->AddToQueue(creation_context.queue));
   RETURN_IF_ERROR(creation_context.queue->WaitForCompletion());
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc
index 0a84d8a..9feb3ac 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.cc
@@ -154,9 +154,7 @@
     RETURN_IF_ERROR(
         args_.SetObjectRef("src_tensor_" + std::to_string(i), src_[i]));
   }
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 ConcatXY::GetGridSize() const {
@@ -166,16 +164,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConcatXY::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConcatXY::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 ConcatXY CreateConcatXY(const OperationDef& definition,
                         const ConcatAttributes& attr, int tensors_count) {
   return ConcatXY(definition, attr, tensors_count);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h
index a82ffb2..011d8fb 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h
@@ -31,10 +31,9 @@
   ConcatXY(const OperationDef& definition, const ConcatAttributes& attr,
            int tensors_count)
       : GPUOperation(definition), attr_(attr), tensors_count_(tensors_count) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConcatXY(ConcatXY&& operation);
@@ -43,9 +42,6 @@
   ConcatXY& operator=(const ConcatXY&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   ConcatAttributes attr_;
   int tensors_count_;
 };
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.cc b/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.cc
index 93bc7b4..7878919 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.cc
@@ -176,9 +176,7 @@
     RETURN_IF_ERROR(
         args_.SetObjectRef("src_tensor_" + std::to_string(i), src_[i]));
   }
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 ConcatZ::GetGridSize() const {
@@ -188,16 +186,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConcatZ::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConcatZ::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 ConcatZ CreateConcatZ(const OperationDef& definition,
                       const std::vector<int>& channels) {
   return ConcatZ(definition, channels);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h b/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h
index 6595432..496b943 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h
@@ -32,10 +32,9 @@
  public:
   ConcatZ(const OperationDef& definition, const std::vector<int>& channels)
       : GPUOperation(definition), channels_(channels) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConcatZ(ConcatZ&& kernel);
@@ -44,9 +43,6 @@
   ConcatZ& operator=(const ConcatZ&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   std::vector<int> channels_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc
index 1d9eaef..788b56c 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc
@@ -67,6 +67,7 @@
       definition_.IsBatchSupported() && stride_.x != 1;
   std::string code =
       GenerateConv3D(definition_, stride_correction, conv_params_, &args_);
+  work_group_size_ = conv_params_.work_group_size;
   std::string element_wise_code;
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
@@ -105,11 +106,8 @@
     RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
     RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z));
   }
-  RETURN_IF_ERROR(args_.SetInt(
-      "grid_size_s",
-      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w)));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("grid_size_s", DivideRoundUp(dst_[0]->Slices(),
+                                                   conv_params_.block_size.w));
 }
 
 int3 Conv3D::GetGridSize() const {
@@ -142,19 +140,14 @@
   if (conv_params_.work_group_launch_order[0] == 0 &&
       conv_params_.work_group_launch_order[1] == 1 &&
       conv_params_.work_group_launch_order[2] == 2) {
-    RETURN_IF_ERROR(BindArguments());
-    return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                                &conv_params_.work_group_size);
+    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+    RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_,
+                                         &conv_params_.work_group_size));
+    work_group_size_ = conv_params_.work_group_size;
   }
   return absl::OkStatus();
 }
 
-absl::Status Conv3D::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(),
-                                 conv_params_.work_group_size);
-}
-
 namespace {
 std::string GenerateUploadByThreads(const std::string& local_ptr_name,
                                     const std::string& global_ptr_name,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h
index 501aa05..720f1ed 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h
@@ -39,9 +39,10 @@
 class Conv3D : public GPUOperation {
  public:
   Conv3D() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   Conv3D(Conv3D&& operation);
@@ -105,9 +106,6 @@
                              int dst_slices, bool x_kernel_is_1,
                              bool y_kernel_is_1, bool z_kernel_is_1) const;
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int3 stride_;
   int3 padding_;
   int3 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc
index 6fab26a..9007155 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc
@@ -293,6 +293,7 @@
 
 absl::Status ConvBuffer1x1::Compile(const CreationContext& creation_context) {
   std::string code = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
+  work_group_size_ = conv_params_.work_group_size;
   std::string element_wise_code;
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
@@ -310,9 +311,7 @@
     RETURN_IF_ERROR(args_.SetObjectRef("weights", src_[1]));
   }
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 ConvBuffer1x1::GetGridSize() const {
@@ -328,15 +327,11 @@
 }
 
 absl::Status ConvBuffer1x1::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                              &conv_params_.work_group_size);
-}
-
-absl::Status ConvBuffer1x1::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(),
-                                 conv_params_.work_group_size);
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+  RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_,
+                                       &conv_params_.work_group_size));
+  work_group_size_ = conv_params_.work_group_size;
+  return absl::OkStatus();
 }
 
 bool IsConvBuffer1x1Supported(const OperationDef& definition,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h
index 1be023f..9f549d3 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h
@@ -47,9 +47,10 @@
   ConvBuffer1x1(const ConvBuffer1x1&) = delete;
   ConvBuffer1x1& operator=(const ConvBuffer1x1&) = delete;
 
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   ConvWeightsDescription GetConvWeightsDescription() const {
     ConvWeightsDescription desc;
@@ -106,9 +107,6 @@
   absl::Status UploadBiases(const tflite::gpu::Tensor<Linear, T>& biases,
                             CLContext* context);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   ConvParams conv_params_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc
index e2d0e82..83c4300 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc
@@ -251,9 +251,7 @@
   RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
   RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
   RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
-  RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("dilation_y", dilation_.y);
 }
 
 int3 ConvConstants::GetGridSize() const {
@@ -262,16 +260,6 @@
   return int3(grid_x, grid_y, 1);
 }
 
-absl::Status ConvConstants::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConvConstants::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 bool IsConvConstantsSupported(const CLDevice& device,
                               const OperationDef& definition,
                               const Convolution2DAttributes& attr) {
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h
index f3f0025..d434af0 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h
@@ -35,10 +35,10 @@
 class ConvConstants : public GPUOperation {
  public:
   ConvConstants() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvConstants(ConvConstants&& kernel);
@@ -68,9 +68,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int2 kernel_size_;
   int2 stride_;
   int2 padding_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
index 551f5f3..76ae58a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
@@ -184,6 +184,7 @@
       definition_.IsBatchSupported() && stride_padding_.x != 1;
   std::string code = GenerateConv(*creation_context.device, definition_,
                                   stride_correction, conv_params_, &args_);
+  work_group_size_ = conv_params_.work_group_size;
   std::string element_wise_code;
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
@@ -226,8 +227,6 @@
                                      conv_params_.block_size.x);
     RETURN_IF_ERROR(args_.SetInt("task_size_x", grid_x));
   }
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
   return absl::OkStatus();
 }
 
@@ -272,19 +271,14 @@
   if (conv_params_.work_group_launch_order[0] == 0 &&
       conv_params_.work_group_launch_order[1] == 1 &&
       conv_params_.work_group_launch_order[2] == 2) {
-    RETURN_IF_ERROR(BindArguments());
-    return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                                &conv_params_.work_group_size);
+    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+    RETURN_IF_ERROR(GetBestWorkGroupConv(params, kernel_, grid_size_,
+                                         &conv_params_.work_group_size));
+    work_group_size_ = conv_params_.work_group_size;
   }
   return absl::OkStatus();
 }
 
-absl::Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(),
-                                 conv_params_.work_group_size);
-}
-
 std::string GenerateConv(const CLDevice& device, const OperationDef& op_def,
                          bool stride_correction,
                          const ConvPowerVR::ConvParams& conv_params,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
index 07bcf2c..8ef8bc6 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
@@ -41,9 +41,10 @@
 class ConvPowerVR : public GPUOperation {
  public:
   ConvPowerVR() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   ConvWeightsDescription GetConvWeightsDescription() const {
     ConvWeightsDescription desc;
@@ -205,9 +206,6 @@
                              bool different_weights_for_height,
                              const BHWC* dst_shape = nullptr) const;
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int4 stride_padding_;
   int4 kernel_dilation_;
   ConvParams conv_params_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc
index d81c7e8..a31674d 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc
@@ -420,8 +420,6 @@
   RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
   RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
   RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
   return absl::OkStatus();
 }
 
@@ -434,14 +432,8 @@
 }
 
 absl::Status ConvTexture::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                              &work_group_size_);
-}
-
-absl::Status ConvTexture::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+  return GetBestWorkGroupConv(params, kernel_, grid_size_, &work_group_size_);
 }
 
 absl::Status CreateConvTexture(const CreationContext& creation_context,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h
index c21d5b1..80a328e 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h
@@ -42,10 +42,10 @@
 class ConvTexture : public GPUOperation {
  public:
   ConvTexture() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvTexture(ConvTexture&& operation);
@@ -89,9 +89,6 @@
                             absl::Span<T> dst_0, absl::Span<T> dst_1,
                             absl::Span<T> dst_2, absl::Span<T> dst_3);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int2 kernel_size_;
   int2 stride_;
   int2 padding_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc
index 063b20e..ce97311 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc
@@ -127,9 +127,7 @@
   RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x));
   RETURN_IF_ERROR(args_.SetFloat("mask_y", mask.y));
   RETURN_IF_ERROR(args_.SetFloat("mask_z", mask.z));
-  RETURN_IF_ERROR(args_.SetFloat("mask_w", mask.w));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetFloat("mask_w", mask.w);
 }
 
 int3 ConverterToConvWeights::GetGridSize() const {
@@ -140,16 +138,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConverterToConvWeights::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConverterToConvWeights::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 ConverterToConvWeights CreateConverterToConvWeights(
     const OperationDef& definition,
     const ConvWeightsDescription& conv_weights_desc) {
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h
index 3bf17fa..d8d84b8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h
@@ -32,10 +32,9 @@
   ConverterToConvWeights(const OperationDef& definition,
                          const ConvWeightsDescription& conv_weights_desc)
       : GPUOperation(definition), conv_weights_desc_(conv_weights_desc) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConverterToConvWeights(ConverterToConvWeights&& operation);
@@ -44,9 +43,6 @@
   ConverterToConvWeights& operator=(const ConverterToConvWeights&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   ConvWeightsDescription conv_weights_desc_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
index 85456fc..dc146c4 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc
@@ -362,9 +362,7 @@
   RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x));
   RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
-  RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("kernel_size_y", kernel_size_.y);
 }
 
 int3 ConvolutionTransposed::GetGridSize() const {
@@ -377,14 +375,8 @@
 }
 
 absl::Status ConvolutionTransposed::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                              &work_group_size_);
-}
-
-absl::Status ConvolutionTransposed::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+  return GetBestWorkGroupConv(params, kernel_, grid_size_, &work_group_size_);
 }
 
 absl::Status CreateConvolutionTransposed(
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h
index cf70799..fc53884 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h
@@ -38,10 +38,10 @@
 class ConvolutionTransposed : public GPUOperation {
  public:
   ConvolutionTransposed() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposed(ConvolutionTransposed&& operation);
@@ -65,9 +65,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   bool weights_are_buffer_;
 
   int2 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc
index 53f24cb..409f7e3 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc
@@ -399,10 +399,8 @@
   RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
   RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
-  RETURN_IF_ERROR(args_.SetInt(
-      "grid_size_s", DivideRoundUp(dst_[0]->Slices(), block_size_.w)));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("grid_size_s",
+                      DivideRoundUp(dst_[0]->Slices(), block_size_.w));
 }
 
 int3 ConvolutionTransposed3D::GetGridSize() const {
@@ -417,14 +415,8 @@
 }
 
 absl::Status ConvolutionTransposed3D::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroupConv(params, kernel_, GetGridSize(),
-                              &work_group_size_);
-}
-
-absl::Status ConvolutionTransposed3D::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+  return GetBestWorkGroupConv(params, kernel_, grid_size_, &work_group_size_);
 }
 
 absl::Status CreateConvolutionTransposed3D(
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h
index 4b76e61..09f7e70 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h
@@ -38,10 +38,10 @@
 class ConvolutionTransposed3D : public GPUOperation {
  public:
   ConvolutionTransposed3D() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposed3D(ConvolutionTransposed3D&& operation);
@@ -65,9 +65,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   bool weights_are_buffer_;
 
   int3 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc
index 0da4ca6..9446f0f 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc
@@ -333,9 +333,7 @@
   const int padding_y =
       padding_.y >= 1 ? (padding_.y - 1) / 2 : (padding_.y - 2) / 2;
   RETURN_IF_ERROR(args_.SetInt("padding_x", padding_x * src_[0]->Batch()));
-  RETURN_IF_ERROR(args_.SetInt("padding_y", padding_y));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("padding_y", padding_y);
 }
 
 int3 ConvolutionTransposed3x3::GetGridSize() const {
@@ -349,12 +347,6 @@
   return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
               wg[work_group_launch_order_[1]] * work_group_size_.y,
               wg[work_group_launch_order_[2]] * work_group_size_.z);
-  return int3(grid_x, grid_y, grid_z);
-}
-
-absl::Status ConvolutionTransposed3x3::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
 }
 
 bool IsConvolutionTransposed3x3Supported(
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h
index 3792acd..0dc42a7 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h
@@ -37,8 +37,12 @@
 class ConvolutionTransposed3x3 : public GPUOperation {
  public:
   ConvolutionTransposed3x3() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  absl::Status Tune(const TuningParameters& params) override {
+    return absl::OkStatus();
+  }
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposed3x3(ConvolutionTransposed3x3&& operation);
@@ -68,9 +72,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int2 padding_;
   int3 work_group_launch_order_;
   WeightsUploadType weights_upload_type_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc
index 934c719..56a21cb 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc
@@ -207,9 +207,7 @@
 
 absl::Status ConvolutionTransposed3x3Thin::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
@@ -219,17 +217,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConvolutionTransposed3x3Thin::Tune(
-    const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConvolutionTransposed3x3Thin::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 bool IsConvolutionTransposed3x3ThinSupported(
     const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
   return attr.weights.shape.o <= 8 && attr.weights.shape.w == 3 &&
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h
index 2e27283..282f1b3 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h
@@ -37,10 +37,9 @@
 class ConvolutionTransposed3x3Thin : public GPUOperation {
  public:
   ConvolutionTransposed3x3Thin() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposed3x3Thin(ConvolutionTransposed3x3Thin&& operation);
@@ -67,9 +66,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int src_channels_;
   int dst_channels_;
 };
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc
index 6c81457..d7660fc 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc
@@ -318,9 +318,7 @@
 absl::Status ConvolutionTransposed4x4::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(args_.SetInt("filter_offset", 4 * 16 * src_[0]->Slices()));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetInt("filter_offset", 4 * 16 * src_[0]->Slices());
 }
 
 int3 ConvolutionTransposed4x4::GetGridSize() const {
@@ -330,11 +328,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConvolutionTransposed4x4::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 bool IsConvolutionTransposed4x4Supported(
     const CLDevice& device, const OperationDef& definition,
     const ConvolutionTransposedAttributes& attr) {
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h
index 1cf3b83..9829374 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h
@@ -37,8 +37,12 @@
 class ConvolutionTransposed4x4 : public GPUOperation {
  public:
   ConvolutionTransposed4x4() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  absl::Status Tune(const TuningParameters& params) override {
+    return absl::OkStatus();
+  }
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposed4x4(ConvolutionTransposed4x4&& operation);
@@ -68,9 +72,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   WeightsUploadType weights_upload_type_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.cc
index 90b1a4c..5b31c98 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.cc
@@ -183,9 +183,7 @@
 
 absl::Status ConvolutionTransposedThin::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 ConvolutionTransposedThin::GetGridSize() const {
@@ -195,16 +193,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status ConvolutionTransposedThin::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status ConvolutionTransposedThin::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 bool IsConvolutionTransposedThinSupported(
     const CLDevice& device, const ConvolutionTransposedAttributes& attr) {
   return attr.weights.shape.o <= 4 && attr.weights.shape.w == attr.stride.w &&
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h
index bb06202..90a1b02 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_thin.h
@@ -37,10 +37,9 @@
 class ConvolutionTransposedThin : public GPUOperation {
  public:
   ConvolutionTransposedThin() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ConvolutionTransposedThin(ConvolutionTransposedThin&& operation);
@@ -65,9 +64,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int2 kernel_size_;
   int src_channels_;
   int dst_channels_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc
index 82658d6..7d6bee68 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc
@@ -306,8 +306,7 @@
   if (!IsSpecializedCase(channel_multiplier_)) {
     RETURN_IF_ERROR(args_.SetInt("ch_multiplier", channel_multiplier_));
   }
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 DepthwiseConvolution::GetGridSize() const {
@@ -317,16 +316,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status DepthwiseConvolution::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status DepthwiseConvolution::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 absl::Status CreateDepthwiseConvolution(
     const CreationContext& creation_context, const OperationDef& definition,
     const DepthwiseConvolution2DAttributes& attr,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h
index 6433e8d..51cf68a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h
@@ -38,10 +38,9 @@
 class DepthwiseConvolution : public GPUOperation {
  public:
   DepthwiseConvolution() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   DepthwiseConvolution(DepthwiseConvolution&& operation);
@@ -81,9 +80,6 @@
   void RearrangeWeightsData(const tflite::gpu::Tensor<OHWDI, S>& weights,
                             absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   bool weights_are_buffer_;
 
   int4 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc
index 0494038..97afea4 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc
@@ -303,9 +303,7 @@
 
 absl::Status DepthwiseConv3x3::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
 }
 
 int3 DepthwiseConv3x3::GetGridSize() const {
@@ -319,15 +317,10 @@
   if (local_mem_uploads_) {
     return absl::OkStatus();
   }
-  RETURN_IF_ERROR(BindArguments());
+  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
   return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
 }
 
-absl::Status DepthwiseConv3x3::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes& attr) {
   return attr.weights.shape.o == 1 && attr.dilations.w == 1 &&
          attr.dilations.h == 1 && attr.weights.shape.w == 3 &&
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h
index fd1dca4..ce5b2d8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h
@@ -38,10 +38,10 @@
 class DepthwiseConv3x3 : public GPUOperation {
  public:
   DepthwiseConv3x3() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
   absl::Status Tune(const TuningParameters& params) override;
-
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   DepthwiseConv3x3(DepthwiseConv3x3&& operation);
@@ -66,9 +66,6 @@
       const tflite::gpu::Tensor<OHWI, S>& weights,
       const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   bool weights_are_buffer_;
   bool local_mem_uploads_;
 };
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc
index 1685d4f..944af0a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc
@@ -129,13 +129,13 @@
   return absl::OkStatus();
 }
 
-absl::Status FullyConnected::AddToQueue(CLCommandQueue* queue) {
+absl::Status FullyConnected::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
-  RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-  return queue->DispatchImplicit(kernel_, {dst_[0]->Slices(), 1, 1},
-                                 work_group_size_);
+  return args_.SetObjectRef("dst_tensor", dst_[0]);
+}
+
+int3 FullyConnected::GetGridSize() const {
+  return int3(dst_[0]->Slices(), 1, 1);
 }
 
 absl::Status CreateFullyConnected(const CreationContext& creation_context,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h
index 2adff4f..138db00 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h
@@ -37,8 +37,11 @@
 class FullyConnected : public GPUOperation {
  public:
   FullyConnected() = default;
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-
+  absl::Status Tune(const TuningParameters& params) override {
+    return absl::OkStatus();
+  }
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
index 2310ee5..d0d1f88 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc
@@ -125,6 +125,7 @@
       args_(std::move(operation.args_)),
       kernel_(std::move(operation.kernel_)),
       work_group_size_(operation.work_group_size_),
+      grid_size_(operation.grid_size_),
       linked_operations_(std::move(operation.linked_operations_)) {}
 
 GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
@@ -135,6 +136,7 @@
     args_ = std::move(operation.args_);
     kernel_ = std::move(operation.kernel_);
     std::swap(work_group_size_, operation.work_group_size_);
+    std::swap(grid_size_, operation.grid_size_);
     linked_operations_ = std::move(operation.linked_operations_);
   }
   return *this;
@@ -162,10 +164,7 @@
 absl::Status ElementwiseOperation::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArgs("", &args_));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-  return absl::OkStatus();
+  return SetArgs("", &args_);
 }
 
 int3 ElementwiseOperation::GetGridSize() const {
@@ -192,16 +191,6 @@
       *creation_context.device, &kernel_);
 }
 
-absl::Status ElementwiseOperation::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
-absl::Status ElementwiseOperation::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
 absl::Status MergeOperations(
     const std::vector<ElementwiseOperation*>& linked_ops,
     Arguments* merged_args, std::string* merged_code) {
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
index 34d6d8c..88d0ff0 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h
@@ -24,6 +24,7 @@
 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/cl/program_cache.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor.h"
@@ -59,6 +60,9 @@
 
 class ElementwiseOperation;
 
+absl::Status SetArguments(const std::vector<ElementwiseOperation*>& linked_ops,
+                          Arguments* args);
+
 // GPUOperation represents some implementation of neural network operation on
 // GPU. GPUOperation can contain ElementwiseOperation operations, in this case,
 // ElementwiseOperation still hold necessary data and should be alive.
@@ -86,11 +90,22 @@
   void SetSrc(Tensor* ptr, int index = 0);
   void SetDst(Tensor* ptr, int index = 0);
 
-  virtual absl::Status AddToQueue(CLCommandQueue* queue) {
+  // should be called after changes of inputs/outputs.
+  absl::Status UpdateParams() {
+    RETURN_IF_ERROR(BindArguments());
+    RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
+    grid_size_ = GetGridSize();
     return absl::OkStatus();
   }
+
+  absl::Status AddToQueue(CLCommandQueue* queue) {
+    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+    return queue->DispatchImplicit(kernel_, grid_size_, work_group_size_);
+  }
+
   virtual absl::Status Tune(const TuningParameters& params) {
-    return absl::OkStatus();
+    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+    return GetBestWorkGroup(params, kernel_, grid_size_, &work_group_size_);
   }
 
   virtual absl::Status Compile(const CreationContext& creation_context) {
@@ -100,6 +115,9 @@
   const OperationDef& GetDefinition() const { return definition_; }
 
  protected:
+  virtual absl::Status BindArguments() = 0;
+  virtual int3 GetGridSize() const = 0;
+
   // Defines operation calculation precision and format of src/dst tensors.
   OperationDef definition_;
   std::vector<Tensor*> src_;
@@ -107,6 +125,7 @@
   Arguments args_;
   CLKernel kernel_;
   int3 work_group_size_ = int3(8, 4, 1);
+  int3 grid_size_ = int3(0, 0, 0);
   std::vector<ElementwiseOperation*> linked_operations_;
 };
 
@@ -124,10 +143,10 @@
       : GPUOperation(definition) {}
 
   virtual ~ElementwiseOperation() {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
   absl::Status Compile(const CreationContext& creation_context) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
 
   // Move only
   ElementwiseOperation(ElementwiseOperation&& operation);
@@ -149,17 +168,12 @@
  protected:
   bool check_src_channels_size_ = false;
   std::string code_;
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 absl::Status MergeOperations(
     const std::vector<ElementwiseOperation*>& linked_ops,
     Arguments* merged_args, std::string* merged_code);
 
-absl::Status SetArguments(const std::vector<ElementwiseOperation*>& linked_ops,
-                          Arguments* args);
-
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/lstm.cc b/tensorflow/lite/delegates/gpu/cl/kernels/lstm.cc
index 66d6b3d..ab61fcb 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/lstm.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/lstm.cc
@@ -125,7 +125,7 @@
   RETURN_IF_ERROR(args_.SetObjectRef("prev_state", src_[1]));
   RETURN_IF_ERROR(args_.SetObjectRef("new_state", dst_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("activation", dst_[1]));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 LSTM::GetGridSize() const {
@@ -135,16 +135,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status LSTM::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status LSTM::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 LSTM CreateLSTM(const OperationDef& definition) { return LSTM(definition); }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/lstm.h b/tensorflow/lite/delegates/gpu/cl/kernels/lstm.h
index 5310e19..6490f39 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/lstm.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/lstm.h
@@ -28,8 +28,8 @@
 class LSTM : public GPUOperation {
  public:
   explicit LSTM(const OperationDef& definition);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -37,10 +37,6 @@
   LSTM& operator=(LSTM&& kernel);
   LSTM(const LSTM&) = delete;
   LSTM& operator=(const LSTM&) = delete;
-
- private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 LSTM CreateLSTM(const OperationDef& definition);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
index 58ace72..bef4c26 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.cc
@@ -205,8 +205,7 @@
     RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z));
     RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z));
   }
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 MaxUnpooling::GetGridSize() const {
@@ -216,16 +215,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status MaxUnpooling::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status MaxUnpooling::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 MaxUnpooling CreateMaxUnpooling(const OperationDef& definition,
                                 const MaxUnpooling2DAttributes& attr) {
   return MaxUnpooling(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
index dae35e9..38f47df 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h
@@ -31,9 +31,9 @@
                const MaxUnpooling2DAttributes& attr);
   MaxUnpooling(const OperationDef& definition,
                const MaxUnpooling3DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -43,9 +43,6 @@
   MaxUnpooling& operator=(const MaxUnpooling&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int4 stride_;
   int4 padding_;
   int4 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc
index 334181b..e3fa023 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc
@@ -129,8 +129,7 @@
   const double size_1 = total_size / size_0;
   RETURN_IF_ERROR(args_.SetFloat("inv_multiplier_1", 1.0 / size_1));
   RETURN_IF_ERROR(args_.SetFloat("inv_multiplier_2", 1.0 / size_0));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Mean::GetGridSize() const {
@@ -140,11 +139,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Mean::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Mean CreateMean(const OperationDef& definition) { return Mean(definition); }
 
 }  // namespace cl
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean.h b/tensorflow/lite/delegates/gpu/cl/kernels/mean.h
index 028e001..0552f16 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/mean.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean.h
@@ -30,8 +30,12 @@
  public:
   Mean() = default;
   explicit Mean(const OperationDef& definition) : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
 
+  absl::Status Tune(const TuningParameters& params) override {
+    return absl::OkStatus();
+  }
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -39,10 +43,6 @@
   Mean& operator=(Mean&& operation);
   Mean(const Mean&) = delete;
   Mean& operator=(const Mean&) = delete;
-
- private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 Mean CreateMean(const OperationDef& definition);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/padding.cc b/tensorflow/lite/delegates/gpu/cl/kernels/padding.cc
index 8576475..ebd2809 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/padding.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/padding.cc
@@ -175,8 +175,7 @@
   RETURN_IF_ERROR(args_.SetInt("prepended_y", attributes_.prepended.h));
   RETURN_IF_ERROR(args_.SetInt("prepended_z", attributes_.prepended.c));
   RETURN_IF_ERROR(args_.SetInt("prepended_w", attributes_.prepended.b));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Padding::GetGridSize() const {
@@ -186,16 +185,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Padding::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Padding::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Padding CreatePadding(const OperationDef& definition,
                       const PadAttributes& attr) {
   return Padding(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/padding.h b/tensorflow/lite/delegates/gpu/cl/kernels/padding.h
index d87a3a8..12a83a4 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/padding.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/padding.h
@@ -28,9 +28,9 @@
 class Padding : public GPUOperation {
  public:
   Padding(const OperationDef& definition, const PadAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -40,9 +40,6 @@
   Padding& operator=(const Padding&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   PadAttributes attributes_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc
index 966c655..6ba49e3 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.cc
@@ -399,8 +399,7 @@
   if (output_indices_) {
     RETURN_IF_ERROR(args_.SetObjectRef("dst_indices", dst_[1]));
   }
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Pooling::GetGridSize() const {
@@ -410,16 +409,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Pooling::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Pooling::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Pooling CreatePooling(const OperationDef& definition,
                       const Pooling2DAttributes& attr) {
   return Pooling(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h
index 67d290e..c0199d6 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/pooling.h
@@ -31,9 +31,9 @@
  public:
   Pooling(const OperationDef& definition, const Pooling2DAttributes& attr);
   Pooling(const OperationDef& definition, const Pooling3DAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -43,9 +43,6 @@
   Pooling& operator=(const Pooling&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   int4 stride_;
   int4 padding_;
   int4 kernel_size_;
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc
index 4cc5b12..a2e1092 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.cc
@@ -114,8 +114,7 @@
 absl::Status Reshape::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Reshape::GetGridSize() const {
@@ -125,16 +124,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Reshape::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Reshape::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Reshape CreateReshape(const OperationDef& definition) {
   return Reshape(definition);
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.h b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.h
index 8d95bbc..571a225 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/reshape.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshape.h
@@ -28,9 +28,9 @@
 class Reshape : public GPUOperation {
  public:
   explicit Reshape(const OperationDef& definition) : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -38,10 +38,6 @@
   Reshape& operator=(Reshape&& operation);
   Reshape(const Reshape&) = delete;
   Reshape& operator=(const Reshape&) = delete;
-
- private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 Reshape CreateReshape(const OperationDef& definition);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc
index e4c47b7..1036dd8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.cc
@@ -99,8 +99,7 @@
 absl::Status Reshapex4::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Reshapex4::GetGridSize() const {
@@ -110,16 +109,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Reshapex4::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Reshapex4::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Reshapex4 CreateReshapex4(const OperationDef& definition) {
   return Reshapex4(definition);
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.h b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.h
index f7c98ab..040b5b8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/reshapex4.h
@@ -30,9 +30,9 @@
  public:
   explicit Reshapex4(const OperationDef& definition)
       : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -40,10 +40,6 @@
   Reshapex4& operator=(Reshapex4&& operation);
   Reshapex4(const Reshapex4&) = delete;
   Reshapex4& operator=(const Reshapex4&) = delete;
-
- private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 // More optimized, but require src_channels % 4 == 0 and dst_channels % 4 == 0
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc
index a47fff9..33bb3b8 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc
@@ -227,8 +227,7 @@
   RETURN_IF_ERROR(args_.SetFloat(
       "scale_factor_y",
       CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_)));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Resize::GetGridSize() const {
@@ -238,16 +237,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Resize::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
-absl::Status Resize::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
 Resize CreateResize(const OperationDef& definition,
                     const Resize2DAttributes& attr) {
   return Resize(definition, attr);
@@ -292,8 +281,7 @@
   RETURN_IF_ERROR(args_.SetFloat(
       "scale_factor_z",
       CalculateResizeScale(src_[0]->Depth(), dst_[0]->Depth(), attr_)));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Resize3D::GetGridSize() const {
@@ -303,16 +291,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Resize3D::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
-absl::Status Resize3D::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
 Resize3D CreateResize3D(const OperationDef& definition,
                         const Resize3DAttributes& attr) {
   return Resize3D(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/resize.h b/tensorflow/lite/delegates/gpu/cl/kernels/resize.h
index 10fb414..899c85b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/resize.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/resize.h
@@ -27,9 +27,8 @@
 
 class Resize : public GPUOperation {
  public:
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -45,9 +44,6 @@
   Resize(const OperationDef& definition, const Resize2DAttributes& attr)
       : GPUOperation(definition), attr_(attr) {}
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   Resize2DAttributes attr_;
 };
 
@@ -56,9 +52,8 @@
 
 class Resize3D : public GPUOperation {
  public:
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -74,9 +69,6 @@
   Resize3D(const OperationDef& definition, const Resize3DAttributes& attr)
       : GPUOperation(definition), attr_(attr) {}
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   Resize3DAttributes attr_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc
index ea8671b..edc720d 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.cc
@@ -91,8 +91,7 @@
 absl::Status Softmax::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Softmax::GetGridSize() const {
@@ -102,16 +101,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Softmax::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Softmax::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Softmax CreateSoftmax(const OperationDef& definition) {
   return Softmax(definition);
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.h b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.h
index 5f974ef..eac06ca 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax.h
@@ -30,9 +30,9 @@
  public:
   Softmax() = default;
   explicit Softmax(const OperationDef& definition) : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
 
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -42,10 +42,6 @@
   Softmax& operator=(const Softmax&) = delete;
 
   friend Softmax CreateSoftmax();
-
- private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 Softmax CreateSoftmax(const OperationDef& definition);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc
index 28ebd8a..33dd285 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc
@@ -112,6 +112,7 @@
 absl::Status Softmax1x1::Compile(const CreationContext& creation_context) {
   std::string code = GetSoftmaxKernelCode(definition_, &args_);
   std::string element_wise_code;
+  work_group_size_ = int3(32, 1, 1);
   RETURN_IF_ERROR(
       MergeOperations(linked_operations_, &args_, &element_wise_code));
   RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(),
@@ -122,7 +123,7 @@
       *creation_context.device, &kernel_);
 }
 
-absl::Status Softmax1x1::AddToQueue(CLCommandQueue* queue) {
+absl::Status Softmax1x1::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
   float4 mask = GetMaskForLastPlane(src_[0]->Channels());
@@ -132,12 +133,11 @@
   RETURN_IF_ERROR(args_.SetFloat("mask_w", mask.w));
   RETURN_IF_ERROR(
       args_.SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32)));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-  return queue->DispatchImplicit(kernel_, {32, dst_[0]->Batch(), 1},
-                                 {32, 1, 1});
+  return absl::OkStatus();
 }
 
+int3 Softmax1x1::GetGridSize() const { return int3(32, dst_[0]->Batch(), 1); }
+
 Softmax1x1 CreateSoftmax1x1(const OperationDef& definition) {
   return Softmax1x1(definition);
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h
index d5ae037..f749a7b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h
@@ -30,8 +30,11 @@
   Softmax1x1() = default;
   explicit Softmax1x1(const OperationDef& definition)
       : GPUOperation(definition) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-
+  absl::Status Tune(const TuningParameters& params) override {
+    return absl::OkStatus();
+  }
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc
index 6b5cc9f..37c3e09 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc
@@ -106,8 +106,7 @@
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
   RETURN_IF_ERROR(args_.SetInt("block_size", attr_.block_size));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 SpaceToDepth::GetGridSize() const {
@@ -117,16 +116,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status SpaceToDepth::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status SpaceToDepth::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 SpaceToDepth CreateSpaceToDepth(const OperationDef& op_def,
                                 const SpaceToDepthAttributes& attr) {
   return SpaceToDepth(op_def, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h
index 6268920..99a0ca0 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h
@@ -30,8 +30,8 @@
  public:
   SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr)
       : GPUOperation(op_def), attr_(attr) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   SpaceToDepth(SpaceToDepth&& operation);
@@ -40,9 +40,6 @@
   SpaceToDepth& operator=(const SpaceToDepth&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   SpaceToDepthAttributes attr_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc
index 904e7fc..443c4a4 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc
@@ -185,8 +185,7 @@
   RETURN_IF_ERROR(args_.SetInt("stride_y", attributes_.strides.h));
   RETURN_IF_ERROR(args_.SetInt("stride_z", attributes_.strides.c));
   RETURN_IF_ERROR(args_.SetInt("stride_b", attributes_.strides.b));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 StridedSlice::GetGridSize() const {
@@ -196,16 +195,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status StridedSlice::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status StridedSlice::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 StridedSlice CreateStridedSlice(const OperationDef& definition,
                                 const SliceAttributes& attr) {
   return StridedSlice(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h
index 3d88bd9..40005db 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h
@@ -27,9 +27,8 @@
 class StridedSlice : public GPUOperation {
  public:
   StridedSlice(const OperationDef& definition, const SliceAttributes& attr);
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
-
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -39,9 +38,6 @@
   StridedSlice& operator=(const StridedSlice&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   SliceAttributes attributes_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
index bd5df56..eb62e1e 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc
@@ -130,8 +130,7 @@
 absl::Status Transpose::BindArguments() {
   RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0]));
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Transpose::GetGridSize() const {
@@ -141,16 +140,6 @@
   return int3(grid_x, grid_y, grid_z);
 }
 
-absl::Status Transpose::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
-absl::Status Transpose::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 Transpose CreateTranspose(const OperationDef& definition,
                           const TransposeAttributes& attr) {
   return Transpose(definition, attr);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
index 2c32fc4..36976d5 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h
@@ -28,8 +28,8 @@
  public:
   Transpose(const OperationDef& definition, const TransposeAttributes& attr)
       : GPUOperation(definition), attr_(attr) {}
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
-  absl::Status Tune(const TuningParameters& params) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
   // Move only
@@ -39,9 +39,6 @@
   Transpose& operator=(const Transpose&) = delete;
 
  private:
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   TransposeAttributes attr_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
index a0f9238..d64b61a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
@@ -403,8 +403,7 @@
   RETURN_IF_ERROR(args_.SetInt("padding_y", -padding_.prepended.h));
   RETURN_IF_ERROR(args_.SetInt("tiles_total", tiles_total));
   RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Winograd4x4To36::GetGridSize() const {
@@ -417,9 +416,8 @@
 absl::Status Winograd4x4To36::Tune(const TuningParameters& params) {
   switch (params.tuning_type) {
     case TuningType::EXHAUSTIVE:
-      RETURN_IF_ERROR(BindArguments());
-      return GetBestWorkGroup(params, kernel_, GetGridSize(),
-                              &work_group_size_);
+      RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+      return GetBestWorkGroup(params, kernel_, grid_size_, &work_group_size_);
     case TuningType::FAST:
     default:
       work_group_size_ = SelectBestWorkGroup();
@@ -427,11 +425,6 @@
   }
 }
 
-absl::Status Winograd4x4To36::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
-
 absl::Status CreateWinograd4x4To36(const CreationContext& creation_context,
                                    const OperationDef& definition,
                                    const Padding2D& padding,
@@ -506,8 +499,7 @@
   RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0]));
   const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4);
   RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x));
-  RETURN_IF_ERROR(SetArguments(linked_operations_, &args_));
-  return args_.Bind(kernel_.kernel());
+  return absl::OkStatus();
 }
 
 int3 Winograd36To4x4::GetGridSize() const {
@@ -522,9 +514,8 @@
 absl::Status Winograd36To4x4::Tune(const TuningParameters& params) {
   switch (params.tuning_type) {
     case TuningType::EXHAUSTIVE:
-      RETURN_IF_ERROR(BindArguments());
-      return GetBestWorkGroup(params, kernel_, GetGridSize(),
-                              &work_group_size_);
+      RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+      return GetBestWorkGroup(params, kernel_, grid_size_, &work_group_size_);
     case TuningType::FAST:
     default:
       work_group_size_ = SelectBestWorkGroup();
@@ -532,10 +523,6 @@
   }
 }
 
-absl::Status Winograd36To4x4::AddToQueue(CLCommandQueue* queue) {
-  RETURN_IF_ERROR(BindArguments());
-  return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
-}
 
 absl::Status CreateWinograd36To4x4(
     const CreationContext& creation_context, const OperationDef& definition,
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
index 3f57342..7fe0fc0 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
@@ -38,7 +38,8 @@
       : GPUOperation(definition), padding_(padding) {
     work_group_size_ = int3(128, 1, 1);
   }
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Tune(const TuningParameters& params) override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
@@ -58,9 +59,6 @@
   // Must be called after kernel compilation
   int3 SelectBestWorkGroup();
 
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
-
   Padding2D padding_;
 };
 
@@ -76,7 +74,8 @@
       : GPUOperation(definition) {
     work_group_size_ = int3(128, 1, 1);
   }
-  absl::Status AddToQueue(CLCommandQueue* queue) override;
+  absl::Status BindArguments() override;
+  int3 GetGridSize() const override;
   absl::Status Tune(const TuningParameters& params) override;
   absl::Status Compile(const CreationContext& creation_context) override;
 
@@ -96,9 +95,6 @@
 
   // Must be called after kernel compilation
   int3 SelectBestWorkGroup();
-
-  absl::Status BindArguments();
-  int3 GetGridSize() const;
 };
 
 absl::Status CreateWinograd36To4x4(