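Update and enable the TileFit reduction vectorization test.

Replace the disabled C++ test ReductionVectorizationTest.DISABLED_TileFit
with two FileCheck-based HLO tests that run hlo_to_llvm_ir for a fixed
target (--sm=50 and --sm=60) instead of branching on the compute capability
of the device the test runs on. The new tests check only the generated PTX;
the RunAndCompare numerical check from the old test is not carried over.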
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
index 5305440..ee07357 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc
@@ -34,53 +34,6 @@
 
 class ReductionVectorizationTest : public GpuCodegenTest {};
 
-TEST_F(ReductionVectorizationTest, DISABLED_TileFit) {
-  const char* hlo_text = R"(
-HloModule ReduceTileFit
-
-%max_ {
-  %x = f32[] parameter(0)
-  %y = f32[] parameter(1)
-  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
-}
-
-ENTRY %main {
-  %param_0 = f32[5,122880] parameter(0)
-  %constant.3 = f32[] constant(0)
-  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
-}
-)";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> optimized_module,
-                          ParseAndReturnVerifiedModule(hlo_text));
-  se::StreamExecutor* executor = backend().default_stream_executor();
-  int cc_major = 0, cc_minor = 0;
-  executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
-                                                           &cc_minor);
-  string expected_ptx;
-  if (cc_major >= 6) {
-    expected_ptx = R"(
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-CHECK: ld.global.nc.v2.f32
-)";
-  } else {
-    expected_ptx = R"(
-CHECK-NOT: ld.global.nc.v2.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-CHECK: ld.global.nc.f32
-)";
-  }
-  CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected_ptx);
-
-  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
-}
 
 TEST_F(ReductionVectorizationTest, DISABLED_EvenColumns) {
   const char* hlo_text = R"(
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo
new file mode 100644
index 0000000..14cfe91
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm50.hlo
@@ -0,0 +1,25 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=50 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK-NOT: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+// CHECK: ld.global.nc.f32
+
+%max_ {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+  %param_0 = f32[5,122880] parameter(0)
+  %constant.3 = f32[] constant(0)
+  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}
diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo
new file mode 100644
index 0000000..142a682
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_tile_fit_sm60.hlo
@@ -0,0 +1,20 @@
+// RUN: hlo_to_llvm_ir --ptx --sm=60 %s | FileCheck %s
+
+HloModule ReduceTileFit
+
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+// CHECK: ld.global.nc.v2.f32
+
+%max_ {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %maximum.7 = f32[] maximum(f32[] %x, f32[] %y)
+}
+
+ENTRY %main {
+  %param_0 = f32[5,122880] parameter(0)
+  %constant.3 = f32[] constant(0)
+  ROOT %reduce.8 = f32[5] reduce(f32[5,122880] %param_0, f32[] %constant.3), dimensions={1}, to_apply=%max_
+}