Add FC tests:
- Large input and output to test multiple slices
- Extra large to test multiple workgroups

PiperOrigin-RevId: 331843358
Change-Id: I502bae224d4b9486438cb67799fc2fe8c2d96c01
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc
index f58487c..85c663b 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc
@@ -20,6 +20,7 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
+#include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
 
@@ -39,7 +40,8 @@
 
   FullyConnectedAttributes attr;
   attr.weights.shape = OHWI(2, 1, 1, 4);
-  attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+  attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f,  //
+                       4.0f, 5.0f, 6.0f, 7.0f};
   attr.bias.shape = Linear(2);
   attr.bias.data = {0.5f, -0.5f};
 
@@ -61,6 +63,97 @@
   }
 }
 
+// 8 inputs -> 12 outputs. Weights hold 0..95 in order, so expected output o
+// is 224 * o + 140 + bias[o].
+TEST_F(OpenCLOperationTest, FullyConnectedLarge) {
+  constexpr int kInputs = 8;
+  constexpr int kOutputs = 12;
+
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 1, 1, kInputs);
+  src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+
+  FullyConnectedAttributes attr;
+  attr.weights.shape = OHWI(kOutputs, 1, 1, kInputs);
+  // Consecutive values 0, 1, ..., 95 (8 per output, 12 outputs).
+  attr.weights.data.resize(kOutputs * kInputs);
+  for (int i = 0; i < kOutputs * kInputs; ++i) {
+    attr.weights.data[i] = static_cast<float>(i);
+  }
+  attr.bias.shape = Linear(kOutputs);
+  attr.bias.data = {-0.6f, -0.5f, -0.4f, -0.3f, -0.2f, -0.1f,
+                    0.1f,  0.2f,  0.3f,  0.4f,  0.5f,  0.6f};
+
+  const std::vector<float> expected = {
+      139.4f,  363.5f,  587.6f,  811.7f,  1035.8f, 1259.9f,
+      1484.1f, 1708.2f, 1932.3f, 2156.4f, 2380.5f, 2604.6f};
+
+  for (const auto storage : env_.GetSupportedStorages()) {
+    for (const auto precision : env_.GetSupportedPrecisions()) {
+      // F32 must match exactly; every other precision gets 0.601 tolerance.
+      float eps = 0.601f;
+      if (precision == CalculationsPrecision::F32) {
+        eps = 0.0f;
+      }
+      const auto data_type = DeduceDataTypeFromPrecision(precision);
+      OperationDef op_def;
+      op_def.precision = precision;
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      FullyConnected operation =
+          CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    BHWC(1, 1, 1, kOutputs), &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected));
+    }
+  }
+}
+
+// Extra-large 1024 -> 1024 case; each output is 1024 * 1.1 * 2.2 + 3.3.
+TEST_F(OpenCLOperationTest, FullyConnectedExtraLarge) {
+  constexpr int kInputSize = 1024;
+  constexpr int kOutputSize = 1024;
+  TensorFloat32 src_tensor;
+  src_tensor.shape = BHWC(1, 1, 1, kInputSize);
+  src_tensor.data.assign(kInputSize, 1.1f);
+
+  FullyConnectedAttributes attr;
+  attr.weights.shape = OHWI(kOutputSize, 1, 1, kInputSize);
+  attr.weights.data.assign(kOutputSize * kInputSize, 2.2f);
+  attr.bias.shape = Linear(kOutputSize);
+  attr.bias.data.assign(kOutputSize, 3.3f);
+  const std::vector<float> expected(kOutputSize, 2481.38f);
+
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      float eps = 0.0f;  // Exact match unless a case below loosens it.
+      switch (precision) {
+        case CalculationsPrecision::F32:
+          eps = 1.23e-3f;
+          break;
+        case CalculationsPrecision::F32_F16:
+          eps = 1.38f;
+          break;
+        case CalculationsPrecision::F16:
+          eps = 3.38f;
+          break;
+      }
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      FullyConnected operation =
+          CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr);
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    BHWC(1, 1, 1, kOutputSize), &dst_tensor));
+      EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected));
+    }
+  }
+}
+
 TEST_F(OpenCLOperationTest, RearrageWeights) {
   tflite::gpu::Tensor<OHWI, DataType::FLOAT32> weights;
   weights.shape = OHWI(8, 1, 1, 8);