[XLA:GPU] Adjust launch_dimensions.hlo test.

- Change op from `tanh` to `round-nearest-even` in preparation for enabling the `tanh` fusion rewrite (the rewrite does not cover `round-nearest-even` because it's unsupported, see cl/464576491).
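- Make lit check statements less brittle: capture the `!range` metadata node IDs with FileCheck variables and `CHECK-DAG` instead of hard-coding `!2`/`!3`.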

PiperOrigin-RevId: 464967659
diff --git a/tensorflow/compiler/xla/service/gpu/tests/launch_dimensions.hlo b/tensorflow/compiler/xla/service/gpu/tests/launch_dimensions.hlo
index 85d358a..b417994 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/launch_dimensions.hlo
+++ b/tensorflow/compiler/xla/service/gpu/tests/launch_dimensions.hlo
@@ -1,41 +1,47 @@
 // RUN: hlo_to_llvm_ir %s | FileCheck %s
-// This test that we do not increase the grid launch size when
+// This tests that we do not increase the grid launch size when
 // few_waves is enabled.
 
 // CHECK-LABEL: entry:
-// CHECK:       !2 = !{i32 0, i32 2}
-// CHECK:       !3 = !{i32 0, i32 256}
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
+// CHECK-DAG:   ![[ctaid_range]] = !{i32 0, i32 2}
+// CHECK-DAG:   ![[tid_range]] = !{i32 0, i32 256}
 
 HloModule Test
 
 ENTRY main {
   a = f32[100, 20]{1,0} parameter(0)
-  ROOT b = f32[100, 20]{1,0} tanh(a)
+  ROOT b = f32[100, 20]{1,0} round-nearest-even(a)
 }
 
 // -----
 
-// This test that we cap grid launch code when few_waves is enabled.
+// This tests that we cap grid launch code when few_waves is enabled.
 
 // CHECK-LABEL: entry:
-// CHECK:       !2 = !{i32 0, i32 1280}
-// CHECK:       !3 = !{i32 0, i32 128}
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
+// CHECK-DAG:   ![[ctaid_range]] = !{i32 0, i32 1280}
+// CHECK-DAG:   ![[tid_range]] = !{i32 0, i32 128}
 
 HloModule Test
 
 ENTRY main {
   a = f32[10000, 10000]{1,0} parameter(0)
-  ROOT b = f32[10000, 10000]{1,0} tanh(a)
+  ROOT b = f32[10000, 10000]{1,0} round-nearest-even(a)
 }
 
 // -----
 
-// This test that we cap grid launch code when few_waves is enabled
+// This tests that we cap grid launch code when few_waves is enabled
 // and scalar broadcast are present.
 
 // CHECK-LABEL: entry:
-// CHECK:       !2 = !{i32 0, i32 1280}
-// CHECK:       !3 = !{i32 0, i32 128}
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
+// CHECK-DAG:   ![[ctaid_range]] = !{i32 0, i32 1280}
+// CHECK-DAG:   ![[tid_range]] = !{i32 0, i32 128}
 
 HloModule ScalarBroadcast
 
@@ -55,12 +61,14 @@
 
 
 // -----
-// This test the GELU kernel. The original kernel that
+// This tests the GELU kernel. The original kernel that
 // motivated few_waves implementation.
 
 // CHECK-LABEL: entry:
-// CHECK:       !2 = !{i32 0, i32 1280}
-// CHECK:       !3 = !{i32 0, i32 128}
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[ctaid_range:[0-9]+]]
+// CHECK-DAG:   call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[tid_range:[0-9]+]]
+// CHECK-DAG:   ![[ctaid_range]] = !{i32 0, i32 1280}
+// CHECK-DAG:   ![[tid_range]] = !{i32 0, i32 128}
 
 HloModule Test