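Update and enable the TileFit reduction vectorization test by converting the disabled C++ test into FileCheck-based HLO tests for sm_50 and sm_60.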
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
index 5305440..ee07357 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
@@ -34,53 +34,6 @@
class ReductionVectorizationTest : public GpuCodegenTest {};
-TEST_F(ReductionVectorizationTest, DISABLED_TileFit) {
- const char* hlo_text = R"(
-HloModule ReduceTileFit
-
-%max_ {
- %x = f32[] parameter(0)
- %y = f32[] parameter(1)
- ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
-}
-
-ENTRY %main {
- %param_0 = f32[5,122880] parameter(0)
- %constant.3 = f32[] constant(0)
- ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
-}
-)";
- TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> optimized_module,
- ParseAndReturnVerifiedModule(hlo_text));
- se::StreamExecutor* executor = backend().default_stream_executor();
- int cc_major = 0, cc_minor = 0;
- executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
- &cc_minor);
- string expected_ptx;
- if (cc_major >= 6) {
- expected_ptx = R"(
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-)";
- } else {
- expected_ptx = R"(
-CHECK-NOT: ld.global.nc.v2.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-)";
- }
- CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx);
-
- EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
-}
TEST_F(ReductionVectorizationTest, DISABLED_EvenColumns) {
const char* hlo_text = R"(
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo
new file mode 100644
index 0000000..14cfe91
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo
@@ -0,0 +1,25 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=50 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK-NOT: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+
+%max_ {
+ %x = f32[] parameter(0)
+ %y = f32[] parameter(1)
+ ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+ %param_0 = f32[5,122880] parameter(0)
+ %constant.3 = f32[] constant(0)
+ ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo
new file mode 100644
index 0000000..142a682
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo
@@ -0,0 +1,20 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=60 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+
+%max_ {
+ %x = f32[] parameter(0)
+ %y = f32[] parameter(1)
+ ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+ %param_0 = f32[5,122880] parameter(0)
+ %constant.3 = f32[] constant(0)
+ ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}