Added missing bias for Winograd36To4x4Tile4x1; removed leftover debug output from conv_test.mm.

PiperOrigin-RevId: 304685460
Change-Id: Iaa5c55ea0d0512c120f80b6cf925f0e55534e9a1
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm
index bb81212..0291cd7 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm
@@ -272,8 +272,6 @@
                         attr.padding.appended.w - 2;
   int new_height = src_shape.h + attr.padding.prepended.h +
                          attr.padding.appended.h - 2;
-  std::cout << dst_shape.w << " vs " << new_width << std::endl;
-  std::cout << dst_shape.h << " vs " << new_height << std::endl;
   BHWC conv_shape;
   conv_shape.b = dst_shape.b;
   conv_shape.h = 36;
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
index cdfbf90..6d68e9e 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/winograd.cc
@@ -425,9 +425,10 @@
   c += R"(
   FLT4 t0 = I1 + I2;
   FLT4 t1 = I3 + I4;
+  FLT4 bias_val = biases[DST_Z];
   int dst_adress = (DST_Z * U.dst_size.y + tile_y) * U.dst_size.x + tile_x;
   if (tile_x < U.dst_size.x) {
-    FLT4 value = I0 + t0 + t1;
+    FLT4 value = I0 + t0 + t1 + bias_val;
     uint3 gid = uint3(tile_x, tile_y, global_ids.z);
     int linear_index = dst_adress;
     $2;
@@ -436,21 +437,21 @@
   FLT4 t2 = I1 - I2;
   FLT4 t3 = I3 - I4;
   if (tile_x + 1 < U.dst_size.x) {
-    FLT4 value = t2 * At[7] + t3 * At[9];
+    FLT4 value = t2 * At[7] + t3 * At[9] + bias_val;
     uint3 gid = uint3(tile_x + 1, tile_y, global_ids.z);
     int linear_index = dst_adress + 1;
     $2;
     dst_buffer[linear_index] = value;
   }
   if (tile_x + 2 < U.dst_size.x) {
-    FLT4 value = t0 * At[13] + t1 * At[15];
+    FLT4 value = t0 * At[13] + t1 * At[15] + bias_val;
     uint3 gid = uint3(tile_x + 2, tile_y, global_ids.z);
     int linear_index = dst_adress + 2;
     $2;
     dst_buffer[linear_index] = value;
   }
   if (tile_x + 3 < U.dst_size.x) {
-    FLT4 value = t2 * At[19] + t3 * At[21] + I5;
+    FLT4 value = t2 * At[19] + t3 * At[21] + I5 + bias_val;
     uint3 gid = uint3(tile_x + 3, tile_y, global_ids.z);
     int linear_index = dst_adress + 3;
     $2;