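Update and enable a test: replace the disabled ReductionVectorizationTest.TileFit gtest with FileCheck-based .hlo tests for sm_50 and sm_60.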

commit: ab9ab1fa7e0e71afb4e83a9dbb3be1b398dfa501 [log] [tgz]
author: Frederic Bastien <fbastien@nvidia.com> Tue Feb 16 11:56:23 2021 -0800
committer: Frederic Bastien <fbastien@nvidia.com> Wed Feb 24 17:06:44 2021 -0800
tree: 6e703275ccca2e2a273fdaa05ffaf52ee231f8c0
parent: 23b555ea58b32742f007925751b85c05ceeb3dba [diff]
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
index 5305440..ee07357 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc

@@ -34,53 +34,6 @@
 
 class ReductionVectorizationTest : public GpuCodegenTest {};
 
-TEST_F(ReductionVectorizationTest, DISABLED_TileFit) {
-  const char* hlo_text = R"(
-HloModule ReduceTileFit
-
-%max_ {
-  %x = f32[] parameter(0)
-  %y = f32[] parameter(1)
-  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
-}
-
-ENTRY %main {
-  %param_0 = f32[5,122880] parameter(0)
-  %constant.3 = f32[] constant(0)
-  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
-}
-)";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> optimized_module,
-                          ParseAndReturnVerifiedModule(hlo_text));
-  se::StreamExecutor* executor = backend().default_stream_executor();
-  int cc_major = 0, cc_minor = 0;
-  executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
-                                                           &cc_minor);
-  string expected_ptx;
-  if (cc_major >= 6) {
-    expected_ptx = R"(
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-)";
-  } else {
-    expected_ptx = R"(
-CHECK-NOT: ld.global.nc.v2.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-)";
-  }
-  CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx);
-
-  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
-}
 
 TEST_F(ReductionVectorizationTest, DISABLED_EvenColumns) {
   const char* hlo_text = R"(

diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo
new file mode 100644
index 0000000..14cfe91
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo

@@ -0,0 +1,25 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=50 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK-NOT: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+
+%max_ {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+  %param_0 = f32[5,122880] parameter(0)
+  %constant.3 = f32[] constant(0)
+  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}

diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo
new file mode 100644
index 0000000..142a682
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo

@@ -0,0 +1,20 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=60 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+
+%max_ {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+  %param_0 = f32[5,122880] parameter(0)
+  %constant.3 = f32[] constant(0)
+  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}
commit	ab9ab1fa7e0e71afb4e83a9dbb3be1b398dfa501	[log] [tgz]
author	Frederic Bastien <fbastien@nvidia.com>	Tue Feb 16 11:56:23 2021 -0800
committer	Frederic Bastien <fbastien@nvidia.com>	Wed Feb 24 17:06:44 2021 -0800
tree	6e703275ccca2e2a273fdaa05ffaf52ee231f8c0
parent	23b555ea58b32742f007925751b85c05ceeb3dba [diff]