[XLA/GPU] Migrate all unnested elementwise emitters.

PiperOrigin-RevId: 346559170
Change-Id: I990590eb45fa5d9f866d05d66d27efcb5211fe42
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h
index 74ea9c9..5513dc6 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h
@@ -83,6 +83,11 @@
 // Requires `ty` to be either FloatType or IntegerType.
 DenseElementsAttr GetScalarLimitOfType(Type ty, ScalarLimit limit);
 
+// Maps an LMHLO `op_name` (e.g. "lmhlo.abs") to its MHLO name ("mhlo.abs").
+// Returns an empty string if that MHLO op is not registered in `context`.
+std::string LmhloToMhloOpName(llvm::StringRef op_name,
+                              mlir::MLIRContext* context);
+
 }  // namespace hlo
 }  // namespace mlir
 
diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc
index 0bbd91e..8ff1ce3 100644
--- a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc
@@ -132,5 +132,13 @@
   llvm_unreachable("unsupported type");
 }
 
+std::string LmhloToMhloOpName(llvm::StringRef op_name,
+                              mlir::MLIRContext *context) {
+  assert(op_name.startswith("lmhlo.") && "Expected an LMHLO op");
+  std::string mhlo_op_name(op_name.drop_front(1));  // "lmhlo.x" -> "mhlo.x"
+  if (context->isOperationRegistered(mhlo_op_name)) return mhlo_op_name;
+  return "";  // No op with the derived name is registered in `context`.
+}
+
 }  // namespace hlo
 }  // namespace mlir
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 25d2ba7..7259c4a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -31,6 +31,7 @@
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -40,11 +41,14 @@
 #include "llvm/IR/Module.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BlockAndValueMapping.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/StandardTypes.h"  // from @llvm-project
+#include "mlir/IR/Verifier.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h"
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h"
+#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h"
 #include "tensorflow/compiler/mlir/utils/name_utils.h"
 #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h"
 #include "tensorflow/compiler/mlir/xla/hlo_utils.h"
@@ -330,12 +334,22 @@
   return true;
 }
 
+std::vector<mlir::Operation*> GetOutputOps(mlir::lmhlo::FusionOp fusion) {
+  llvm::SetVector<mlir::Operation*> ops;  // Unique ops, in insertion order.
+  for (mlir::Value output_value : fusion.getFusionResults()) {
+    ops.insert(output_value.getDefiningOp());  // Repeats are dropped.
+  }
+  return std::vector<mlir::Operation*>(ops.begin(), ops.end());
+}
+
 // Computes the maximum valid unroll factor for a given instruction.
 int ComputeMaxUnrollFactor(const Shape& shape,
                            const HloModuleConfig& hlo_module_config) {
   int max_unroll_factor =
       hlo_module_config.debug_options().xla_gpu_max_kernel_unroll_factor();
 
+  // Find the largest possible power of two to unroll by.
+  // TODO(kramerb): Make this smarter.
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   for (int i = max_unroll_factor; i > 1; i /= 2) {
     if (num_elements % i == 0) {
@@ -349,14 +363,39 @@
 
 // Computes the maximum valid unroll factor for a given instruction.
 int ComputeMaxUnrollFactor(const HloInstruction* hlo) {
-  // Find the largest possible power of two to unroll by.
-  // TODO(kramerb): Make this smarter.
   const Shape& element_shape = hlo->IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo->shape(), {0})
                                    : hlo->shape();
   return ComputeMaxUnrollFactor(element_shape, hlo->GetModule()->config());
 }
 
+// Computes the maximum valid unroll factor for a given MLIR operation.
+int ComputeMaxUnrollFactor(mlir::Operation* op,
+                           const HloModuleConfig& hlo_module_config) {
+  Shape element_shape = [&] {
+    std::vector<Shape> shapes;
+    // Detect multi-output fusion. Note that a reduce inside the fusion that
+    // returns a tuple must not be treated as multi-output fusion: that tuple
+    // is passed as-is into ComputeMaxUnrollFactor below. For an actual MOF,
+    // only the results of the first output op are used.
+    if (auto fusion = mlir::dyn_cast<mlir::lmhlo::FusionOp>(op)) {
+      std::vector<mlir::Operation*> fusion_outputs = GetOutputOps(fusion);
+      for (mlir::Value result : fusion_outputs[0]->getResults()) {
+        shapes.push_back(TypeToShape(result.getType()));
+      }
+    } else {
+      for (mlir::Value result : op->getResults()) {
+        shapes.push_back(TypeToShape(result.getType()));
+      }
+    }
+    if (shapes.size() > 1) {
+      return ShapeUtil::MakeTupleShape(shapes);
+    }
+    return shapes[0];  // NOTE(review): assumes >= 1 result shape; confirm.
+  }();
+  return ComputeMaxUnrollFactor(element_shape, hlo_module_config);
+}
+
 // Returns the llvm type for the indices used in the kernel that contains the
 // hlo instruction. Such indices include the index for the parallel loop and
 // the indices for the tensors accessed by the kernel. The return type is i32
@@ -613,10 +652,14 @@
 }
 
 Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) {
+  if (hlo->IsElementwise()) {  // Elementwise ops use the MLIR-based path.
+    TF_ASSIGN_OR_RETURN(auto input, GetMlirEmitterInput(hlo));
+    return EmitUsingElementalIrEmitter(input);
+  }
   return IrEmitter::DefaultAction(hlo);
 }
 
-Status IrEmitterUnnested::DefaultActionForMlir(MlirEmitterInput input) {
+Status IrEmitterUnnested::EmitUsingElementalIrEmitter(MlirEmitterInput input) {
   // Replace unnested op with a fused nested op.
   //
   // TODO(timshen): Ultimately this should be a pass. It's currently not a pass,
@@ -670,19 +713,54 @@
       output_shape = ShapeUtil::MakeTupleShape(output_shapes);
     }
   } else {
-    LOG(FATAL) << "Unimplemented default action for mlir op: "
-               << MlirToString(input.op);
+    // Try to generically convert any LMHLO ops to LMHLO fusion + the
+    // corresponding MHLO op. Currently we've only looked at elementwise ops and
+    // they seem to be well covered.
+    //
+    // TODO(timshen): Moving forward, we should make it cover all ops if
+    // possible, and only special-case the ones it can't.
+    std::vector<mlir::Value> outputs;
+    mlir::Operation* new_op;
+    {
+      std::vector<mlir::Value> operands;
+      for (auto buffer : input.op->getOperands()) {
+        if (WritesMlirBuffer(input.op, buffer)) {
+          outputs.push_back(buffer);
+        } else {
+          operands.push_back(buffer);
+        }
+      }
+      TF_RET_CHECK(outputs.size() == 1);
+
+      std::vector<mlir::Value> loads = load_memrefs(operands);
+      std::string mhlo_op_name = mlir::hlo::LmhloToMhloOpName(
+          input.op->getName().getStringRef(), input.op->getContext());
+      TF_RET_CHECK(!mhlo_op_name.empty())
+          << "No corresponding MHLO op for given LMHLO op: "
+          << MlirToString(input.op);
+      mlir::OperationState op_state(loc, mhlo_op_name);
+
+      mlir::BlockAndValueMapping mapper;
+      for (mlir::Region& region : input.op->getRegions()) {
+        mlir::Region* new_region = op_state.addRegion();
+        region.cloneInto(new_region, mapper);
+      }
+
+      op_state.addOperands(loads);
+      op_state.addAttributes(input.op->getAttrs());
+      op_state.addTypes({mlir::RankedTensorType::get(
+          outputs[0].getType().cast<mlir::MemRefType>().getShape(),
+          outputs[0].getType().cast<mlir::MemRefType>().getElementType())});
+      new_op = b.createOperation(op_state);
+    }
+    TF_RET_CHECK(mlir::succeeded(mlir::verify(new_op)));
+    output_shape = TypeToShape(outputs[0].getType());
+    HloFunctionImporter::SetLayoutForMlir(new_op, output_shape);
+    b.create<mlir::TensorStoreOp>(loc, new_op->getResult(0), outputs[0]);
   }
   input.op->erase();
   input.op = fusion;
-  int unroll_factor = 1;
-  // TODO(timshen): Port MayPreventVectorization as we add more ops into this
-  // function.
-  if (output_shape.IsArray()) {
-    unroll_factor = ComputeMaxUnrollFactor(output_shape, hlo_module_config_);
-  }
-  auto ret = EmitLoopFusionFromMlir(input, output_shape, unroll_factor);
-  return ret;
+  return EmitLoopFusionFromMlir(input, output_shape);
 }
 
 Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
@@ -1210,8 +1288,7 @@
 // This is migrated from IrEmitter::HandleFusion() with IrEmitterUnnested as the
 // subclass. The logic is de-virtualized and less scattered.
 Status IrEmitterUnnested::EmitLoopFusionFromMlir(MlirEmitterInput input,
-                                                 const Shape& output_shape,
-                                                 int unroll_factor) {
+                                                 const Shape& output_shape) {
   auto fusion = mlir::cast<mlir::lmhlo::FusionOp>(input.op);
   MlirEmitterContext context;
   context.SetOperation(fusion);
@@ -1258,6 +1335,11 @@
       auto element_generator,
       fused_emitter.GetGenerator(fused_computation->root_instruction()));
 
+  int unroll_factor = 1;
+  if (!MayPreventVectorization(fusion)) {
+    unroll_factor = ComputeMaxUnrollFactor(fusion, hlo_module_config_);
+  }
+
   Shape element_shape = context.output_shapes[0];
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       element_shape, ir_emitter_context_->gpu_device_info(), unroll_factor);
@@ -1436,12 +1518,7 @@
     return Status::OK();
   }
 
-  int unroll_factor = 1;
-  if (!MayPreventVectorization(*fusion)) {
-    unroll_factor = ComputeMaxUnrollFactor(fusion);
-  }
-
-  return EmitLoopFusionFromMlir(mlir_input, fusion->shape(), unroll_factor);
+  return EmitLoopFusionFromMlir(mlir_input, fusion->shape());
 }
 
 Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
@@ -1476,7 +1553,7 @@
     return Status::OK();
   }
 
-  return DefaultActionForMlir(input);
+  return EmitUsingElementalIrEmitter(input);
 }
 
 Status IrEmitterUnnested::EmitExtraOutputsForReduce(
@@ -1507,7 +1584,7 @@
     return EmitReductionFromOrToContiguousDimensions(mlir_input);
   }
 
-  return DefaultActionForMlir(mlir_input);
+  return EmitUsingElementalIrEmitter(mlir_input);
 }
 
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b5fc20d..f69545b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -157,7 +157,7 @@
   }
 
   Status DefaultAction(HloInstruction* hlo) override;
-  Status DefaultActionForMlir(MlirEmitterInput input);
+  Status EmitUsingElementalIrEmitter(MlirEmitterInput input);
 
   // IrEmitterUnnested handles the following instructions differently from
   // IrEmitter. It also mixes in some special handling for custom kernels
@@ -175,7 +175,7 @@
   Status HandleFft(HloInstruction* fft) override;
   Status HandleFusion(HloInstruction* fusion) override;
   Status EmitLoopFusionFromMlir(MlirEmitterInput input,
-                                const Shape& output_shape, int unroll_factor);
+                                const Shape& output_shape);
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleSelectAndScatter(HloInstruction* instruction) override;
diff --git a/tensorflow/compiler/xla/service/gpu/tests/elementwise.hlo b/tensorflow/compiler/xla/service/gpu/tests/elementwise.hlo
index d4ed447..c54affb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/elementwise.hlo
+++ b/tensorflow/compiler/xla/service/gpu/tests/elementwise.hlo
@@ -7,9 +7,9 @@
 // CHECK:         %[[VAL_5:.*]] = bitcast i8* %[[VAL_3]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_6:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_7:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_8:.*]] = mul nuw nsw i32 %[[VAL_6]], 128
+// CHECK:         %[[VAL_8:.*]] = mul nuw nsw i32 %[[VAL_6]], 256
 // CHECK:         %[[VAL_9:.*]] = add nuw nsw i32 %[[VAL_8]], %[[VAL_7]]
-// CHECK:         %[[VAL_10:.*]] = icmp ult i32 %[[VAL_9]], 163840
+// CHECK:         %[[VAL_10:.*]] = icmp ult i32 %[[VAL_9]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_10]])
 // CHECK:         %[[VAL_11:.*]] = mul nuw nsw i32 %[[VAL_9]], 4
 // CHECK:         %[[VAL_12:.*]] = udiv i32 %[[VAL_11]], 1
@@ -32,32 +32,32 @@
 // CHECK:       r0.in_bounds-after:                               ; preds = %[[VAL_28]], %[[VAL_30:.*]]
 // CHECK:         ret void
 // CHECK:       r0.in_bounds-true:                                ; preds = %[[VAL_30]]
-// CHECK:         %[[VAL_31:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
+// CHECK:         %[[VAL_31:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
 // CHECK:         %[[VAL_32:.*]] = getelementptr inbounds float, float* %[[VAL_31]], i32 %[[VAL_11]]
 // CHECK:         %[[VAL_33:.*]] = load float, float* %[[VAL_32]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_34:.*]] = call float @llvm.fabs.f32(float %[[VAL_33]])
-// CHECK:         %[[VAL_35:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
+// CHECK:         %[[VAL_35:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
 // CHECK:         %[[VAL_36:.*]] = getelementptr inbounds float, float* %[[VAL_35]], i32 %[[VAL_11]]
 // CHECK:         store float %[[VAL_34]], float* %[[VAL_36]], align 4
-// CHECK:         %[[VAL_37:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
+// CHECK:         %[[VAL_37:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
 // CHECK:         %[[VAL_38:.*]] = getelementptr inbounds float, float* %[[VAL_37]], i32 %[[VAL_15]]
 // CHECK:         %[[VAL_39:.*]] = load float, float* %[[VAL_38]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_40:.*]] = call float @llvm.fabs.f32(float %[[VAL_39]])
-// CHECK:         %[[VAL_41:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
+// CHECK:         %[[VAL_41:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
 // CHECK:         %[[VAL_42:.*]] = getelementptr inbounds float, float* %[[VAL_41]], i32 %[[VAL_15]]
 // CHECK:         store float %[[VAL_40]], float* %[[VAL_42]], align 4
-// CHECK:         %[[VAL_43:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
+// CHECK:         %[[VAL_43:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
 // CHECK:         %[[VAL_44:.*]] = getelementptr inbounds float, float* %[[VAL_43]], i32 %[[VAL_19]]
 // CHECK:         %[[VAL_45:.*]] = load float, float* %[[VAL_44]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_46:.*]] = call float @llvm.fabs.f32(float %[[VAL_45]])
-// CHECK:         %[[VAL_47:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
+// CHECK:         %[[VAL_47:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
 // CHECK:         %[[VAL_48:.*]] = getelementptr inbounds float, float* %[[VAL_47]], i32 %[[VAL_19]]
 // CHECK:         store float %[[VAL_46]], float* %[[VAL_48]], align 4
-// CHECK:         %[[VAL_49:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
+// CHECK:         %[[VAL_49:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
 // CHECK:         %[[VAL_50:.*]] = getelementptr inbounds float, float* %[[VAL_49]], i32 %[[VAL_23]]
 // CHECK:         %[[VAL_51:.*]] = load float, float* %[[VAL_50]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_52:.*]] = call float @llvm.fabs.f32(float %[[VAL_51]])
-// CHECK:         %[[VAL_53:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2]] to float*
+// CHECK:         %[[VAL_53:.*]] = bitcast [100 x [200 x float]]* %[[VAL_5]] to float*
 // CHECK:         %[[VAL_54:.*]] = getelementptr inbounds float, float* %[[VAL_53]], i32 %[[VAL_23]]
 // CHECK:         store float %[[VAL_52]], float* %[[VAL_54]], align 4
 // CHECK:         br label %[[VAL_29]]
@@ -68,9 +68,9 @@
 // CHECK:         %[[VAL_60:.*]] = bitcast i8* %[[VAL_58]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_61:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_62:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_63:.*]] = mul nuw nsw i32 %[[VAL_61]], 128
+// CHECK:         %[[VAL_63:.*]] = mul nuw nsw i32 %[[VAL_61]], 256
 // CHECK:         %[[VAL_64:.*]] = add nuw nsw i32 %[[VAL_63]], %[[VAL_62]]
-// CHECK:         %[[VAL_65:.*]] = icmp ult i32 %[[VAL_64]], 163840
+// CHECK:         %[[VAL_65:.*]] = icmp ult i32 %[[VAL_64]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_65]])
 // CHECK:         %[[VAL_66:.*]] = mul nuw nsw i32 %[[VAL_64]], 4
 // CHECK:         %[[VAL_67:.*]] = udiv i32 %[[VAL_66]], 1
@@ -93,32 +93,32 @@
 // CHECK:       r1.in_bounds-after:                               ; preds = %[[VAL_83]], %[[VAL_85:.*]]
 // CHECK:         ret void
 // CHECK:       r1.in_bounds-true:                                ; preds = %[[VAL_85]]
-// CHECK:         %[[VAL_86:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
+// CHECK:         %[[VAL_86:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
 // CHECK:         %[[VAL_87:.*]] = getelementptr inbounds float, float* %[[VAL_86]], i32 %[[VAL_66]]
 // CHECK:         %[[VAL_88:.*]] = load float, float* %[[VAL_87]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_89:.*]] = call float @llvm.round.f32(float %[[VAL_88]])
-// CHECK:         %[[VAL_90:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
+// CHECK:         %[[VAL_90:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
 // CHECK:         %[[VAL_91:.*]] = getelementptr inbounds float, float* %[[VAL_90]], i32 %[[VAL_66]]
 // CHECK:         store float %[[VAL_89]], float* %[[VAL_91]], align 4
-// CHECK:         %[[VAL_92:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
+// CHECK:         %[[VAL_92:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
 // CHECK:         %[[VAL_93:.*]] = getelementptr inbounds float, float* %[[VAL_92]], i32 %[[VAL_70]]
 // CHECK:         %[[VAL_94:.*]] = load float, float* %[[VAL_93]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_95:.*]] = call float @llvm.round.f32(float %[[VAL_94]])
-// CHECK:         %[[VAL_96:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
+// CHECK:         %[[VAL_96:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
 // CHECK:         %[[VAL_97:.*]] = getelementptr inbounds float, float* %[[VAL_96]], i32 %[[VAL_70]]
 // CHECK:         store float %[[VAL_95]], float* %[[VAL_97]], align 4
-// CHECK:         %[[VAL_98:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
+// CHECK:         %[[VAL_98:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
 // CHECK:         %[[VAL_99:.*]] = getelementptr inbounds float, float* %[[VAL_98]], i32 %[[VAL_74]]
 // CHECK:         %[[VAL_100:.*]] = load float, float* %[[VAL_99]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_101:.*]] = call float @llvm.round.f32(float %[[VAL_100]])
-// CHECK:         %[[VAL_102:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
+// CHECK:         %[[VAL_102:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
 // CHECK:         %[[VAL_103:.*]] = getelementptr inbounds float, float* %[[VAL_102]], i32 %[[VAL_74]]
 // CHECK:         store float %[[VAL_101]], float* %[[VAL_103]], align 4
-// CHECK:         %[[VAL_104:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
+// CHECK:         %[[VAL_104:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
 // CHECK:         %[[VAL_105:.*]] = getelementptr inbounds float, float* %[[VAL_104]], i32 %[[VAL_78]]
 // CHECK:         %[[VAL_106:.*]] = load float, float* %[[VAL_105]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_107:.*]] = call float @llvm.round.f32(float %[[VAL_106]])
-// CHECK:         %[[VAL_108:.*]] = bitcast [100 x [200 x float]]* %[[VAL_57]] to float*
+// CHECK:         %[[VAL_108:.*]] = bitcast [100 x [200 x float]]* %[[VAL_60]] to float*
 // CHECK:         %[[VAL_109:.*]] = getelementptr inbounds float, float* %[[VAL_108]], i32 %[[VAL_78]]
 // CHECK:         store float %[[VAL_107]], float* %[[VAL_109]], align 4
 // CHECK:         br label %[[VAL_84]]
@@ -129,9 +129,9 @@
 // CHECK:         %[[VAL_115:.*]] = bitcast i8* %[[VAL_113]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_116:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_117:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_118:.*]] = mul nuw nsw i32 %[[VAL_116]], 128
+// CHECK:         %[[VAL_118:.*]] = mul nuw nsw i32 %[[VAL_116]], 256
 // CHECK:         %[[VAL_119:.*]] = add nuw nsw i32 %[[VAL_118]], %[[VAL_117]]
-// CHECK:         %[[VAL_120:.*]] = icmp ult i32 %[[VAL_119]], 163840
+// CHECK:         %[[VAL_120:.*]] = icmp ult i32 %[[VAL_119]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_120]])
 // CHECK:         %[[VAL_121:.*]] = mul nuw nsw i32 %[[VAL_119]], 4
 // CHECK:         %[[VAL_122:.*]] = udiv i32 %[[VAL_121]], 1
@@ -154,32 +154,32 @@
 // CHECK:       r2.in_bounds-after:                               ; preds = %[[VAL_138]], %[[VAL_140:.*]]
 // CHECK:         ret void
 // CHECK:       r2.in_bounds-true:                                ; preds = %[[VAL_140]]
-// CHECK:         %[[VAL_141:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
+// CHECK:         %[[VAL_141:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
 // CHECK:         %[[VAL_142:.*]] = getelementptr inbounds float, float* %[[VAL_141]], i32 %[[VAL_121]]
 // CHECK:         %[[VAL_143:.*]] = load float, float* %[[VAL_142]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_144:.*]] = call float @llvm.ceil.f32(float %[[VAL_143]])
-// CHECK:         %[[VAL_145:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
+// CHECK:         %[[VAL_145:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
 // CHECK:         %[[VAL_146:.*]] = getelementptr inbounds float, float* %[[VAL_145]], i32 %[[VAL_121]]
 // CHECK:         store float %[[VAL_144]], float* %[[VAL_146]], align 4
-// CHECK:         %[[VAL_147:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
+// CHECK:         %[[VAL_147:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
 // CHECK:         %[[VAL_148:.*]] = getelementptr inbounds float, float* %[[VAL_147]], i32 %[[VAL_125]]
 // CHECK:         %[[VAL_149:.*]] = load float, float* %[[VAL_148]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_150:.*]] = call float @llvm.ceil.f32(float %[[VAL_149]])
-// CHECK:         %[[VAL_151:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
+// CHECK:         %[[VAL_151:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
 // CHECK:         %[[VAL_152:.*]] = getelementptr inbounds float, float* %[[VAL_151]], i32 %[[VAL_125]]
 // CHECK:         store float %[[VAL_150]], float* %[[VAL_152]], align 4
-// CHECK:         %[[VAL_153:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
+// CHECK:         %[[VAL_153:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
 // CHECK:         %[[VAL_154:.*]] = getelementptr inbounds float, float* %[[VAL_153]], i32 %[[VAL_129]]
 // CHECK:         %[[VAL_155:.*]] = load float, float* %[[VAL_154]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_156:.*]] = call float @llvm.ceil.f32(float %[[VAL_155]])
-// CHECK:         %[[VAL_157:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
+// CHECK:         %[[VAL_157:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
 // CHECK:         %[[VAL_158:.*]] = getelementptr inbounds float, float* %[[VAL_157]], i32 %[[VAL_129]]
 // CHECK:         store float %[[VAL_156]], float* %[[VAL_158]], align 4
-// CHECK:         %[[VAL_159:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
+// CHECK:         %[[VAL_159:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
 // CHECK:         %[[VAL_160:.*]] = getelementptr inbounds float, float* %[[VAL_159]], i32 %[[VAL_133]]
 // CHECK:         %[[VAL_161:.*]] = load float, float* %[[VAL_160]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_162:.*]] = call float @llvm.ceil.f32(float %[[VAL_161]])
-// CHECK:         %[[VAL_163:.*]] = bitcast [100 x [200 x float]]* %[[VAL_112]] to float*
+// CHECK:         %[[VAL_163:.*]] = bitcast [100 x [200 x float]]* %[[VAL_115]] to float*
 // CHECK:         %[[VAL_164:.*]] = getelementptr inbounds float, float* %[[VAL_163]], i32 %[[VAL_133]]
 // CHECK:         store float %[[VAL_162]], float* %[[VAL_164]], align 4
 // CHECK:         br label %[[VAL_139]]
@@ -190,9 +190,9 @@
 // CHECK:         %[[VAL_170:.*]] = bitcast i8* %[[VAL_168]] to [100 x [200 x i32]]*
 // CHECK:         %[[VAL_171:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_172:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_173:.*]] = mul nuw nsw i32 %[[VAL_171]], 128
+// CHECK:         %[[VAL_173:.*]] = mul nuw nsw i32 %[[VAL_171]], 256
 // CHECK:         %[[VAL_174:.*]] = add nuw nsw i32 %[[VAL_173]], %[[VAL_172]]
-// CHECK:         %[[VAL_175:.*]] = icmp ult i32 %[[VAL_174]], 163840
+// CHECK:         %[[VAL_175:.*]] = icmp ult i32 %[[VAL_174]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_175]])
 // CHECK:         %[[VAL_176:.*]] = mul nuw nsw i32 %[[VAL_174]], 4
 // CHECK:         %[[VAL_177:.*]] = udiv i32 %[[VAL_176]], 1
@@ -215,32 +215,32 @@
 // CHECK:       r3.in_bounds-after:                               ; preds = %[[VAL_193]], %[[VAL_195:.*]]
 // CHECK:         ret void
 // CHECK:       r3.in_bounds-true:                                ; preds = %[[VAL_195]]
-// CHECK:         %[[VAL_196:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
+// CHECK:         %[[VAL_196:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
 // CHECK:         %[[VAL_197:.*]] = getelementptr inbounds i32, i32* %[[VAL_196]], i32 %[[VAL_176]]
 // CHECK:         %[[VAL_198:.*]] = load i32, i32* %[[VAL_197]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_199:.*]] = call i32 @llvm.ctlz.i32(i32 %[[VAL_198]], i1 false)
-// CHECK:         %[[VAL_200:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
+// CHECK:         %[[VAL_200:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
 // CHECK:         %[[VAL_201:.*]] = getelementptr inbounds i32, i32* %[[VAL_200]], i32 %[[VAL_176]]
 // CHECK:         store i32 %[[VAL_199]], i32* %[[VAL_201]], align 4
-// CHECK:         %[[VAL_202:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
+// CHECK:         %[[VAL_202:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
 // CHECK:         %[[VAL_203:.*]] = getelementptr inbounds i32, i32* %[[VAL_202]], i32 %[[VAL_180]]
 // CHECK:         %[[VAL_204:.*]] = load i32, i32* %[[VAL_203]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_205:.*]] = call i32 @llvm.ctlz.i32(i32 %[[VAL_204]], i1 false)
-// CHECK:         %[[VAL_206:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
+// CHECK:         %[[VAL_206:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
 // CHECK:         %[[VAL_207:.*]] = getelementptr inbounds i32, i32* %[[VAL_206]], i32 %[[VAL_180]]
 // CHECK:         store i32 %[[VAL_205]], i32* %[[VAL_207]], align 4
-// CHECK:         %[[VAL_208:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
+// CHECK:         %[[VAL_208:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
 // CHECK:         %[[VAL_209:.*]] = getelementptr inbounds i32, i32* %[[VAL_208]], i32 %[[VAL_184]]
 // CHECK:         %[[VAL_210:.*]] = load i32, i32* %[[VAL_209]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_211:.*]] = call i32 @llvm.ctlz.i32(i32 %[[VAL_210]], i1 false)
-// CHECK:         %[[VAL_212:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
+// CHECK:         %[[VAL_212:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
 // CHECK:         %[[VAL_213:.*]] = getelementptr inbounds i32, i32* %[[VAL_212]], i32 %[[VAL_184]]
 // CHECK:         store i32 %[[VAL_211]], i32* %[[VAL_213]], align 4
-// CHECK:         %[[VAL_214:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
+// CHECK:         %[[VAL_214:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
 // CHECK:         %[[VAL_215:.*]] = getelementptr inbounds i32, i32* %[[VAL_214]], i32 %[[VAL_188]]
 // CHECK:         %[[VAL_216:.*]] = load i32, i32* %[[VAL_215]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_217:.*]] = call i32 @llvm.ctlz.i32(i32 %[[VAL_216]], i1 false)
-// CHECK:         %[[VAL_218:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_167]] to i32*
+// CHECK:         %[[VAL_218:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_170]] to i32*
 // CHECK:         %[[VAL_219:.*]] = getelementptr inbounds i32, i32* %[[VAL_218]], i32 %[[VAL_188]]
 // CHECK:         store i32 %[[VAL_217]], i32* %[[VAL_219]], align 4
 // CHECK:         br label %[[VAL_194]]
@@ -251,9 +251,9 @@
 // CHECK:         %[[VAL_225:.*]] = bitcast i8* %[[VAL_223]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_226:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_227:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_228:.*]] = mul nuw nsw i32 %[[VAL_226]], 128
+// CHECK:         %[[VAL_228:.*]] = mul nuw nsw i32 %[[VAL_226]], 256
 // CHECK:         %[[VAL_229:.*]] = add nuw nsw i32 %[[VAL_228]], %[[VAL_227]]
-// CHECK:         %[[VAL_230:.*]] = icmp ult i32 %[[VAL_229]], 163840
+// CHECK:         %[[VAL_230:.*]] = icmp ult i32 %[[VAL_229]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_230]])
 // CHECK:         %[[VAL_231:.*]] = mul nuw nsw i32 %[[VAL_229]], 4
 // CHECK:         %[[VAL_232:.*]] = udiv i32 %[[VAL_231]], 1
@@ -276,28 +276,28 @@
 // CHECK:       r4.in_bounds-after:                               ; preds = %[[VAL_248]], %[[VAL_250:.*]]
 // CHECK:         ret void
 // CHECK:       r4.in_bounds-true:                                ; preds = %[[VAL_250]]
-// CHECK:         %[[VAL_251:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
+// CHECK:         %[[VAL_251:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
 // CHECK:         %[[VAL_252:.*]] = getelementptr inbounds float, float* %[[VAL_251]], i32 %[[VAL_231]]
 // CHECK:         %[[VAL_253:.*]] = load float, float* %[[VAL_252]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_254:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
+// CHECK:         %[[VAL_254:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
 // CHECK:         %[[VAL_255:.*]] = getelementptr inbounds float, float* %[[VAL_254]], i32 %[[VAL_231]]
 // CHECK:         store float %[[VAL_253]], float* %[[VAL_255]], align 4
-// CHECK:         %[[VAL_256:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
+// CHECK:         %[[VAL_256:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
 // CHECK:         %[[VAL_257:.*]] = getelementptr inbounds float, float* %[[VAL_256]], i32 %[[VAL_235]]
 // CHECK:         %[[VAL_258:.*]] = load float, float* %[[VAL_257]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_259:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
+// CHECK:         %[[VAL_259:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
 // CHECK:         %[[VAL_260:.*]] = getelementptr inbounds float, float* %[[VAL_259]], i32 %[[VAL_235]]
 // CHECK:         store float %[[VAL_258]], float* %[[VAL_260]], align 4
-// CHECK:         %[[VAL_261:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
+// CHECK:         %[[VAL_261:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
 // CHECK:         %[[VAL_262:.*]] = getelementptr inbounds float, float* %[[VAL_261]], i32 %[[VAL_239]]
 // CHECK:         %[[VAL_263:.*]] = load float, float* %[[VAL_262]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_264:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
+// CHECK:         %[[VAL_264:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
 // CHECK:         %[[VAL_265:.*]] = getelementptr inbounds float, float* %[[VAL_264]], i32 %[[VAL_239]]
 // CHECK:         store float %[[VAL_263]], float* %[[VAL_265]], align 4
-// CHECK:         %[[VAL_266:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
+// CHECK:         %[[VAL_266:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
 // CHECK:         %[[VAL_267:.*]] = getelementptr inbounds float, float* %[[VAL_266]], i32 %[[VAL_243]]
 // CHECK:         %[[VAL_268:.*]] = load float, float* %[[VAL_267]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_269:.*]] = bitcast [100 x [200 x float]]* %[[VAL_222]] to float*
+// CHECK:         %[[VAL_269:.*]] = bitcast [100 x [200 x float]]* %[[VAL_225]] to float*
 // CHECK:         %[[VAL_270:.*]] = getelementptr inbounds float, float* %[[VAL_269]], i32 %[[VAL_243]]
 // CHECK:         store float %[[VAL_268]], float* %[[VAL_270]], align 4
 // CHECK:         br label %[[VAL_249]]
@@ -308,9 +308,9 @@
 // CHECK:         %[[VAL_276:.*]] = bitcast i8* %[[VAL_274]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_277:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_278:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_279:.*]] = mul nuw nsw i32 %[[VAL_277]], 128
+// CHECK:         %[[VAL_279:.*]] = mul nuw nsw i32 %[[VAL_277]], 256
 // CHECK:         %[[VAL_280:.*]] = add nuw nsw i32 %[[VAL_279]], %[[VAL_278]]
-// CHECK:         %[[VAL_281:.*]] = icmp ult i32 %[[VAL_280]], 163840
+// CHECK:         %[[VAL_281:.*]] = icmp ult i32 %[[VAL_280]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_281]])
 // CHECK:         %[[VAL_282:.*]] = mul nuw nsw i32 %[[VAL_280]], 4
 // CHECK:         %[[VAL_283:.*]] = udiv i32 %[[VAL_282]], 1
@@ -333,28 +333,28 @@
 // CHECK:       r5.in_bounds-after:                               ; preds = %[[VAL_299]], %[[VAL_301:.*]]
 // CHECK:         ret void
 // CHECK:       r5.in_bounds-true:                                ; preds = %[[VAL_301]]
-// CHECK:         %[[VAL_302:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
+// CHECK:         %[[VAL_302:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
 // CHECK:         %[[VAL_303:.*]] = getelementptr inbounds float, float* %[[VAL_302]], i32 %[[VAL_282]]
 // CHECK:         %[[VAL_304:.*]] = load float, float* %[[VAL_303]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_305:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
+// CHECK:         %[[VAL_305:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
 // CHECK:         %[[VAL_306:.*]] = getelementptr inbounds float, float* %[[VAL_305]], i32 %[[VAL_282]]
 // CHECK:         store float %[[VAL_304]], float* %[[VAL_306]], align 4
-// CHECK:         %[[VAL_307:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
+// CHECK:         %[[VAL_307:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
 // CHECK:         %[[VAL_308:.*]] = getelementptr inbounds float, float* %[[VAL_307]], i32 %[[VAL_286]]
 // CHECK:         %[[VAL_309:.*]] = load float, float* %[[VAL_308]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_310:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
+// CHECK:         %[[VAL_310:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
 // CHECK:         %[[VAL_311:.*]] = getelementptr inbounds float, float* %[[VAL_310]], i32 %[[VAL_286]]
 // CHECK:         store float %[[VAL_309]], float* %[[VAL_311]], align 4
-// CHECK:         %[[VAL_312:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
+// CHECK:         %[[VAL_312:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
 // CHECK:         %[[VAL_313:.*]] = getelementptr inbounds float, float* %[[VAL_312]], i32 %[[VAL_290]]
 // CHECK:         %[[VAL_314:.*]] = load float, float* %[[VAL_313]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_315:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
+// CHECK:         %[[VAL_315:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
 // CHECK:         %[[VAL_316:.*]] = getelementptr inbounds float, float* %[[VAL_315]], i32 %[[VAL_290]]
 // CHECK:         store float %[[VAL_314]], float* %[[VAL_316]], align 4
-// CHECK:         %[[VAL_317:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
+// CHECK:         %[[VAL_317:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
 // CHECK:         %[[VAL_318:.*]] = getelementptr inbounds float, float* %[[VAL_317]], i32 %[[VAL_294]]
 // CHECK:         %[[VAL_319:.*]] = load float, float* %[[VAL_318]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_320:.*]] = bitcast [100 x [200 x float]]* %[[VAL_273]] to float*
+// CHECK:         %[[VAL_320:.*]] = bitcast [100 x [200 x float]]* %[[VAL_276]] to float*
 // CHECK:         %[[VAL_321:.*]] = getelementptr inbounds float, float* %[[VAL_320]], i32 %[[VAL_294]]
 // CHECK:         store float %[[VAL_319]], float* %[[VAL_321]], align 4
 // CHECK:         br label %[[VAL_300]]
@@ -364,10 +364,10 @@
 // CHECK:         %[[VAL_325:.*]] = getelementptr inbounds i8, i8* %[[VAL_326:.*]], i64 0
 // CHECK:         %[[VAL_327:.*]] = bitcast i8* %[[VAL_325]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_328:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
-// CHECK:         %[[VAL_329:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_330:.*]] = mul nuw nsw i32 %[[VAL_328]], 128
+// CHECK:         %[[VAL_329:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !93
+// CHECK:         %[[VAL_330:.*]] = mul nuw nsw i32 %[[VAL_328]], 1024
 // CHECK:         %[[VAL_331:.*]] = add nuw nsw i32 %[[VAL_330]], %[[VAL_329]]
-// CHECK:         %[[VAL_332:.*]] = icmp ult i32 %[[VAL_331]], 163840
+// CHECK:         %[[VAL_332:.*]] = icmp ult i32 %[[VAL_331]], 20480
 // CHECK:         call void @llvm.assume(i1 %[[VAL_332]])
 // CHECK:         %[[VAL_333:.*]] = udiv i32 %[[VAL_331]], 1
 // CHECK:         %[[VAL_334:.*]] = urem i32 %[[VAL_333]], 200
@@ -377,11 +377,11 @@
 // CHECK:       r7.in_bounds-after:                               ; preds = %[[VAL_337]], %[[VAL_339:.*]]
 // CHECK:         ret void
 // CHECK:       r7.in_bounds-true:                                ; preds = %[[VAL_339]]
-// CHECK:         %[[VAL_340:.*]] = bitcast [100 x [200 x float]]* %[[VAL_327]] to float*
+// CHECK:         %[[VAL_340:.*]] = bitcast [100 x [200 x float]]* %[[VAL_324]] to float*
 // CHECK:         %[[VAL_341:.*]] = getelementptr inbounds float, float* %[[VAL_340]], i32 %[[VAL_331]]
 // CHECK:         %[[VAL_342:.*]] = load float, float* %[[VAL_341]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_343:.*]] = call float @__nv_cosf(float %[[VAL_342]])
-// CHECK:         %[[VAL_344:.*]] = bitcast [100 x [200 x float]]* %[[VAL_324]] to float*
+// CHECK:         %[[VAL_344:.*]] = bitcast [100 x [200 x float]]* %[[VAL_327]] to float*
 // CHECK:         %[[VAL_345:.*]] = getelementptr inbounds float, float* %[[VAL_344]], i32 %[[VAL_331]]
 // CHECK:         store float %[[VAL_343]], float* %[[VAL_345]], align 4
 // CHECK:         br label %[[VAL_338]]
@@ -392,9 +392,9 @@
 // CHECK:         %[[VAL_351:.*]] = bitcast i8* %[[VAL_349]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_352:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_353:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_354:.*]] = mul nuw nsw i32 %[[VAL_352]], 128
+// CHECK:         %[[VAL_354:.*]] = mul nuw nsw i32 %[[VAL_352]], 256
 // CHECK:         %[[VAL_355:.*]] = add nuw nsw i32 %[[VAL_354]], %[[VAL_353]]
-// CHECK:         %[[VAL_356:.*]] = icmp ult i32 %[[VAL_355]], 163840
+// CHECK:         %[[VAL_356:.*]] = icmp ult i32 %[[VAL_355]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_356]])
 // CHECK:         %[[VAL_357:.*]] = mul nuw nsw i32 %[[VAL_355]], 4
 // CHECK:         %[[VAL_358:.*]] = udiv i32 %[[VAL_357]], 1
@@ -417,32 +417,32 @@
 // CHECK:       r8.in_bounds-after:                               ; preds = %[[VAL_374]], %[[VAL_376:.*]]
 // CHECK:         ret void
 // CHECK:       r8.in_bounds-true:                                ; preds = %[[VAL_376]]
-// CHECK:         %[[VAL_377:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
+// CHECK:         %[[VAL_377:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
 // CHECK:         %[[VAL_378:.*]] = getelementptr inbounds float, float* %[[VAL_377]], i32 %[[VAL_357]]
 // CHECK:         %[[VAL_379:.*]] = load float, float* %[[VAL_378]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_380:.*]] = call float @__nv_expf(float %[[VAL_379]])
-// CHECK:         %[[VAL_381:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
+// CHECK:         %[[VAL_381:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
 // CHECK:         %[[VAL_382:.*]] = getelementptr inbounds float, float* %[[VAL_381]], i32 %[[VAL_357]]
 // CHECK:         store float %[[VAL_380]], float* %[[VAL_382]], align 4
-// CHECK:         %[[VAL_383:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
+// CHECK:         %[[VAL_383:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
 // CHECK:         %[[VAL_384:.*]] = getelementptr inbounds float, float* %[[VAL_383]], i32 %[[VAL_361]]
 // CHECK:         %[[VAL_385:.*]] = load float, float* %[[VAL_384]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_386:.*]] = call float @__nv_expf(float %[[VAL_385]])
-// CHECK:         %[[VAL_387:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
+// CHECK:         %[[VAL_387:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
 // CHECK:         %[[VAL_388:.*]] = getelementptr inbounds float, float* %[[VAL_387]], i32 %[[VAL_361]]
 // CHECK:         store float %[[VAL_386]], float* %[[VAL_388]], align 4
-// CHECK:         %[[VAL_389:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
+// CHECK:         %[[VAL_389:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
 // CHECK:         %[[VAL_390:.*]] = getelementptr inbounds float, float* %[[VAL_389]], i32 %[[VAL_365]]
 // CHECK:         %[[VAL_391:.*]] = load float, float* %[[VAL_390]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_392:.*]] = call float @__nv_expf(float %[[VAL_391]])
-// CHECK:         %[[VAL_393:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
+// CHECK:         %[[VAL_393:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
 // CHECK:         %[[VAL_394:.*]] = getelementptr inbounds float, float* %[[VAL_393]], i32 %[[VAL_365]]
 // CHECK:         store float %[[VAL_392]], float* %[[VAL_394]], align 4
-// CHECK:         %[[VAL_395:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
+// CHECK:         %[[VAL_395:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
 // CHECK:         %[[VAL_396:.*]] = getelementptr inbounds float, float* %[[VAL_395]], i32 %[[VAL_369]]
 // CHECK:         %[[VAL_397:.*]] = load float, float* %[[VAL_396]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_398:.*]] = call float @__nv_expf(float %[[VAL_397]])
-// CHECK:         %[[VAL_399:.*]] = bitcast [100 x [200 x float]]* %[[VAL_348]] to float*
+// CHECK:         %[[VAL_399:.*]] = bitcast [100 x [200 x float]]* %[[VAL_351]] to float*
 // CHECK:         %[[VAL_400:.*]] = getelementptr inbounds float, float* %[[VAL_399]], i32 %[[VAL_369]]
 // CHECK:         store float %[[VAL_398]], float* %[[VAL_400]], align 4
 // CHECK:         br label %[[VAL_375]]
@@ -453,9 +453,9 @@
 // CHECK:         %[[VAL_406:.*]] = bitcast i8* %[[VAL_404]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_407:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_408:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_409:.*]] = mul nuw nsw i32 %[[VAL_407]], 128
+// CHECK:         %[[VAL_409:.*]] = mul nuw nsw i32 %[[VAL_407]], 256
 // CHECK:         %[[VAL_410:.*]] = add nuw nsw i32 %[[VAL_409]], %[[VAL_408]]
-// CHECK:         %[[VAL_411:.*]] = icmp ult i32 %[[VAL_410]], 163840
+// CHECK:         %[[VAL_411:.*]] = icmp ult i32 %[[VAL_410]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_411]])
 // CHECK:         %[[VAL_412:.*]] = mul nuw nsw i32 %[[VAL_410]], 4
 // CHECK:         %[[VAL_413:.*]] = udiv i32 %[[VAL_412]], 1
@@ -478,32 +478,32 @@
 // CHECK:       r9.in_bounds-after:                               ; preds = %[[VAL_429]], %[[VAL_431:.*]]
 // CHECK:         ret void
 // CHECK:       r9.in_bounds-true:                                ; preds = %[[VAL_431]]
-// CHECK:         %[[VAL_432:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
+// CHECK:         %[[VAL_432:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
 // CHECK:         %[[VAL_433:.*]] = getelementptr inbounds float, float* %[[VAL_432]], i32 %[[VAL_412]]
 // CHECK:         %[[VAL_434:.*]] = load float, float* %[[VAL_433]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_435:.*]] = call float @__nv_expm1f(float %[[VAL_434]])
-// CHECK:         %[[VAL_436:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
+// CHECK:         %[[VAL_436:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
 // CHECK:         %[[VAL_437:.*]] = getelementptr inbounds float, float* %[[VAL_436]], i32 %[[VAL_412]]
 // CHECK:         store float %[[VAL_435]], float* %[[VAL_437]], align 4
-// CHECK:         %[[VAL_438:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
+// CHECK:         %[[VAL_438:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
 // CHECK:         %[[VAL_439:.*]] = getelementptr inbounds float, float* %[[VAL_438]], i32 %[[VAL_416]]
 // CHECK:         %[[VAL_440:.*]] = load float, float* %[[VAL_439]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_441:.*]] = call float @__nv_expm1f(float %[[VAL_440]])
-// CHECK:         %[[VAL_442:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
+// CHECK:         %[[VAL_442:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
 // CHECK:         %[[VAL_443:.*]] = getelementptr inbounds float, float* %[[VAL_442]], i32 %[[VAL_416]]
 // CHECK:         store float %[[VAL_441]], float* %[[VAL_443]], align 4
-// CHECK:         %[[VAL_444:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
+// CHECK:         %[[VAL_444:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
 // CHECK:         %[[VAL_445:.*]] = getelementptr inbounds float, float* %[[VAL_444]], i32 %[[VAL_420]]
 // CHECK:         %[[VAL_446:.*]] = load float, float* %[[VAL_445]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_447:.*]] = call float @__nv_expm1f(float %[[VAL_446]])
-// CHECK:         %[[VAL_448:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
+// CHECK:         %[[VAL_448:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
 // CHECK:         %[[VAL_449:.*]] = getelementptr inbounds float, float* %[[VAL_448]], i32 %[[VAL_420]]
 // CHECK:         store float %[[VAL_447]], float* %[[VAL_449]], align 4
-// CHECK:         %[[VAL_450:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
+// CHECK:         %[[VAL_450:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
 // CHECK:         %[[VAL_451:.*]] = getelementptr inbounds float, float* %[[VAL_450]], i32 %[[VAL_424]]
 // CHECK:         %[[VAL_452:.*]] = load float, float* %[[VAL_451]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_453:.*]] = call float @__nv_expm1f(float %[[VAL_452]])
-// CHECK:         %[[VAL_454:.*]] = bitcast [100 x [200 x float]]* %[[VAL_403]] to float*
+// CHECK:         %[[VAL_454:.*]] = bitcast [100 x [200 x float]]* %[[VAL_406]] to float*
 // CHECK:         %[[VAL_455:.*]] = getelementptr inbounds float, float* %[[VAL_454]], i32 %[[VAL_424]]
 // CHECK:         store float %[[VAL_453]], float* %[[VAL_455]], align 4
 // CHECK:         br label %[[VAL_430]]
@@ -514,9 +514,9 @@
 // CHECK:         %[[VAL_461:.*]] = bitcast i8* %[[VAL_459]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_462:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_463:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_464:.*]] = mul nuw nsw i32 %[[VAL_462]], 128
+// CHECK:         %[[VAL_464:.*]] = mul nuw nsw i32 %[[VAL_462]], 256
 // CHECK:         %[[VAL_465:.*]] = add nuw nsw i32 %[[VAL_464]], %[[VAL_463]]
-// CHECK:         %[[VAL_466:.*]] = icmp ult i32 %[[VAL_465]], 163840
+// CHECK:         %[[VAL_466:.*]] = icmp ult i32 %[[VAL_465]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_466]])
 // CHECK:         %[[VAL_467:.*]] = mul nuw nsw i32 %[[VAL_465]], 4
 // CHECK:         %[[VAL_468:.*]] = udiv i32 %[[VAL_467]], 1
@@ -539,45 +539,45 @@
 // CHECK:       r10.in_bounds-after:                              ; preds = %[[VAL_484]], %[[VAL_486:.*]]
 // CHECK:         ret void
 // CHECK:       r10.in_bounds-true:                               ; preds = %[[VAL_486]]
-// CHECK:         %[[VAL_487:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
+// CHECK:         %[[VAL_487:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
 // CHECK:         %[[VAL_488:.*]] = getelementptr inbounds float, float* %[[VAL_487]], i32 %[[VAL_467]]
 // CHECK:         %[[VAL_489:.*]] = load float, float* %[[VAL_488]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_490:.*]] = call float @llvm.floor.f32(float %[[VAL_489]])
-// CHECK:         %[[VAL_491:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
+// CHECK:         %[[VAL_491:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
 // CHECK:         %[[VAL_492:.*]] = getelementptr inbounds float, float* %[[VAL_491]], i32 %[[VAL_467]]
 // CHECK:         store float %[[VAL_490]], float* %[[VAL_492]], align 4
-// CHECK:         %[[VAL_493:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
+// CHECK:         %[[VAL_493:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
 // CHECK:         %[[VAL_494:.*]] = getelementptr inbounds float, float* %[[VAL_493]], i32 %[[VAL_471]]
 // CHECK:         %[[VAL_495:.*]] = load float, float* %[[VAL_494]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_496:.*]] = call float @llvm.floor.f32(float %[[VAL_495]])
-// CHECK:         %[[VAL_497:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
+// CHECK:         %[[VAL_497:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
 // CHECK:         %[[VAL_498:.*]] = getelementptr inbounds float, float* %[[VAL_497]], i32 %[[VAL_471]]
 // CHECK:         store float %[[VAL_496]], float* %[[VAL_498]], align 4
-// CHECK:         %[[VAL_499:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
+// CHECK:         %[[VAL_499:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
 // CHECK:         %[[VAL_500:.*]] = getelementptr inbounds float, float* %[[VAL_499]], i32 %[[VAL_475]]
 // CHECK:         %[[VAL_501:.*]] = load float, float* %[[VAL_500]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_502:.*]] = call float @llvm.floor.f32(float %[[VAL_501]])
-// CHECK:         %[[VAL_503:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
+// CHECK:         %[[VAL_503:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
 // CHECK:         %[[VAL_504:.*]] = getelementptr inbounds float, float* %[[VAL_503]], i32 %[[VAL_475]]
 // CHECK:         store float %[[VAL_502]], float* %[[VAL_504]], align 4
-// CHECK:         %[[VAL_505:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
+// CHECK:         %[[VAL_505:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
 // CHECK:         %[[VAL_506:.*]] = getelementptr inbounds float, float* %[[VAL_505]], i32 %[[VAL_479]]
 // CHECK:         %[[VAL_507:.*]] = load float, float* %[[VAL_506]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_508:.*]] = call float @llvm.floor.f32(float %[[VAL_507]])
-// CHECK:         %[[VAL_509:.*]] = bitcast [100 x [200 x float]]* %[[VAL_458]] to float*
+// CHECK:         %[[VAL_509:.*]] = bitcast [100 x [200 x float]]* %[[VAL_461]] to float*
 // CHECK:         %[[VAL_510:.*]] = getelementptr inbounds float, float* %[[VAL_509]], i32 %[[VAL_479]]
 // CHECK:         store float %[[VAL_508]], float* %[[VAL_510]], align 4
 // CHECK:         br label %[[VAL_485]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_511:.*]] = getelementptr inbounds i8, i8* %[[VAL_512:.*]], i64 0
-// CHECK:         %[[VAL_513:.*]] = bitcast i8* %[[VAL_511]] to [100 x [200 x float]]*
-// CHECK:         %[[VAL_514:.*]] = getelementptr inbounds i8, i8* %[[VAL_515:.*]], i64 0
-// CHECK:         %[[VAL_516:.*]] = bitcast i8* %[[VAL_514]] to [100 x [200 x %[[VAL_517:.*]]]]*
+// CHECK:         %[[VAL_513:.*]] = bitcast i8* %[[VAL_511]] to [100 x [200 x %[[VAL_514:.*]]]]*
+// CHECK:         %[[VAL_515:.*]] = getelementptr inbounds i8, i8* %[[VAL_516:.*]], i64 0
+// CHECK:         %[[VAL_517:.*]] = bitcast i8* %[[VAL_515]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_518:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_519:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_520:.*]] = mul nuw nsw i32 %[[VAL_518]], 128
+// CHECK:         %[[VAL_520:.*]] = mul nuw nsw i32 %[[VAL_518]], 256
 // CHECK:         %[[VAL_521:.*]] = add nuw nsw i32 %[[VAL_520]], %[[VAL_519]]
-// CHECK:         %[[VAL_522:.*]] = icmp ult i32 %[[VAL_521]], 163840
+// CHECK:         %[[VAL_522:.*]] = icmp ult i32 %[[VAL_521]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_522]])
 // CHECK:         %[[VAL_523:.*]] = mul nuw nsw i32 %[[VAL_521]], 4
 // CHECK:         %[[VAL_524:.*]] = udiv i32 %[[VAL_523]], 1
@@ -600,45 +600,45 @@
 // CHECK:       r11.in_bounds-after:                              ; preds = %[[VAL_540]], %[[VAL_542:.*]]
 // CHECK:         ret void
 // CHECK:       r11.in_bounds-true:                               ; preds = %[[VAL_542]]
-// CHECK:         %[[VAL_543:.*]] = bitcast [100 x [200 x %[[VAL_517]]]]* %[[VAL_516]] to %[[VAL_517]]*
-// CHECK:         %[[VAL_544:.*]] = getelementptr inbounds %[[VAL_517]], %[[VAL_517]]* %[[VAL_543]], i32 %[[VAL_523]]
-// CHECK:         %[[VAL_545:.*]] = load %[[VAL_517]], %[[VAL_517]]* %[[VAL_544]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_546:.*]] = extractvalue %[[VAL_517]] %[[VAL_545]], 1
-// CHECK:         %[[VAL_547:.*]] = bitcast [100 x [200 x float]]* %[[VAL_513]] to float*
+// CHECK:         %[[VAL_543:.*]] = bitcast [100 x [200 x %[[VAL_514]]]]* %[[VAL_513]] to %[[VAL_514]]*
+// CHECK:         %[[VAL_544:.*]] = getelementptr inbounds %[[VAL_514]], %[[VAL_514]]* %[[VAL_543]], i32 %[[VAL_523]]
+// CHECK:         %[[VAL_545:.*]] = load %[[VAL_514]], %[[VAL_514]]* %[[VAL_544]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_546:.*]] = extractvalue %[[VAL_514]] %[[VAL_545]], 1
+// CHECK:         %[[VAL_547:.*]] = bitcast [100 x [200 x float]]* %[[VAL_517]] to float*
 // CHECK:         %[[VAL_548:.*]] = getelementptr inbounds float, float* %[[VAL_547]], i32 %[[VAL_523]]
 // CHECK:         store float %[[VAL_546]], float* %[[VAL_548]], align 4
-// CHECK:         %[[VAL_549:.*]] = bitcast [100 x [200 x %[[VAL_517]]]]* %[[VAL_516]] to %[[VAL_517]]*
-// CHECK:         %[[VAL_550:.*]] = getelementptr inbounds %[[VAL_517]], %[[VAL_517]]* %[[VAL_549]], i32 %[[VAL_527]]
-// CHECK:         %[[VAL_551:.*]] = load %[[VAL_517]], %[[VAL_517]]* %[[VAL_550]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_552:.*]] = extractvalue %[[VAL_517]] %[[VAL_551]], 1
-// CHECK:         %[[VAL_553:.*]] = bitcast [100 x [200 x float]]* %[[VAL_513]] to float*
+// CHECK:         %[[VAL_549:.*]] = bitcast [100 x [200 x %[[VAL_514]]]]* %[[VAL_513]] to %[[VAL_514]]*
+// CHECK:         %[[VAL_550:.*]] = getelementptr inbounds %[[VAL_514]], %[[VAL_514]]* %[[VAL_549]], i32 %[[VAL_527]]
+// CHECK:         %[[VAL_551:.*]] = load %[[VAL_514]], %[[VAL_514]]* %[[VAL_550]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_552:.*]] = extractvalue %[[VAL_514]] %[[VAL_551]], 1
+// CHECK:         %[[VAL_553:.*]] = bitcast [100 x [200 x float]]* %[[VAL_517]] to float*
 // CHECK:         %[[VAL_554:.*]] = getelementptr inbounds float, float* %[[VAL_553]], i32 %[[VAL_527]]
 // CHECK:         store float %[[VAL_552]], float* %[[VAL_554]], align 4
-// CHECK:         %[[VAL_555:.*]] = bitcast [100 x [200 x %[[VAL_517]]]]* %[[VAL_516]] to %[[VAL_517]]*
-// CHECK:         %[[VAL_556:.*]] = getelementptr inbounds %[[VAL_517]], %[[VAL_517]]* %[[VAL_555]], i32 %[[VAL_531]]
-// CHECK:         %[[VAL_557:.*]] = load %[[VAL_517]], %[[VAL_517]]* %[[VAL_556]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_558:.*]] = extractvalue %[[VAL_517]] %[[VAL_557]], 1
-// CHECK:         %[[VAL_559:.*]] = bitcast [100 x [200 x float]]* %[[VAL_513]] to float*
+// CHECK:         %[[VAL_555:.*]] = bitcast [100 x [200 x %[[VAL_514]]]]* %[[VAL_513]] to %[[VAL_514]]*
+// CHECK:         %[[VAL_556:.*]] = getelementptr inbounds %[[VAL_514]], %[[VAL_514]]* %[[VAL_555]], i32 %[[VAL_531]]
+// CHECK:         %[[VAL_557:.*]] = load %[[VAL_514]], %[[VAL_514]]* %[[VAL_556]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_558:.*]] = extractvalue %[[VAL_514]] %[[VAL_557]], 1
+// CHECK:         %[[VAL_559:.*]] = bitcast [100 x [200 x float]]* %[[VAL_517]] to float*
 // CHECK:         %[[VAL_560:.*]] = getelementptr inbounds float, float* %[[VAL_559]], i32 %[[VAL_531]]
 // CHECK:         store float %[[VAL_558]], float* %[[VAL_560]], align 4
-// CHECK:         %[[VAL_561:.*]] = bitcast [100 x [200 x %[[VAL_517]]]]* %[[VAL_516]] to %[[VAL_517]]*
-// CHECK:         %[[VAL_562:.*]] = getelementptr inbounds %[[VAL_517]], %[[VAL_517]]* %[[VAL_561]], i32 %[[VAL_535]]
-// CHECK:         %[[VAL_563:.*]] = load %[[VAL_517]], %[[VAL_517]]* %[[VAL_562]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_564:.*]] = extractvalue %[[VAL_517]] %[[VAL_563]], 1
-// CHECK:         %[[VAL_565:.*]] = bitcast [100 x [200 x float]]* %[[VAL_513]] to float*
+// CHECK:         %[[VAL_561:.*]] = bitcast [100 x [200 x %[[VAL_514]]]]* %[[VAL_513]] to %[[VAL_514]]*
+// CHECK:         %[[VAL_562:.*]] = getelementptr inbounds %[[VAL_514]], %[[VAL_514]]* %[[VAL_561]], i32 %[[VAL_535]]
+// CHECK:         %[[VAL_563:.*]] = load %[[VAL_514]], %[[VAL_514]]* %[[VAL_562]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_564:.*]] = extractvalue %[[VAL_514]] %[[VAL_563]], 1
+// CHECK:         %[[VAL_565:.*]] = bitcast [100 x [200 x float]]* %[[VAL_517]] to float*
 // CHECK:         %[[VAL_566:.*]] = getelementptr inbounds float, float* %[[VAL_565]], i32 %[[VAL_535]]
 // CHECK:         store float %[[VAL_564]], float* %[[VAL_566]], align 4
 // CHECK:         br label %[[VAL_541]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_567:.*]] = getelementptr inbounds i8, i8* %[[VAL_568:.*]], i64 0
-// CHECK:         %[[VAL_569:.*]] = bitcast i8* %[[VAL_567]] to [100 x [200 x i8]]*
+// CHECK:         %[[VAL_569:.*]] = bitcast i8* %[[VAL_567]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_570:.*]] = getelementptr inbounds i8, i8* %[[VAL_571:.*]], i64 0
-// CHECK:         %[[VAL_572:.*]] = bitcast i8* %[[VAL_570]] to [100 x [200 x float]]*
+// CHECK:         %[[VAL_572:.*]] = bitcast i8* %[[VAL_570]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_573:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_574:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_575:.*]] = mul nuw nsw i32 %[[VAL_573]], 128
+// CHECK:         %[[VAL_575:.*]] = mul nuw nsw i32 %[[VAL_573]], 256
 // CHECK:         %[[VAL_576:.*]] = add nuw nsw i32 %[[VAL_575]], %[[VAL_574]]
-// CHECK:         %[[VAL_577:.*]] = icmp ult i32 %[[VAL_576]], 163840
+// CHECK:         %[[VAL_577:.*]] = icmp ult i32 %[[VAL_576]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_577]])
 // CHECK:         %[[VAL_578:.*]] = mul nuw nsw i32 %[[VAL_576]], 4
 // CHECK:         %[[VAL_579:.*]] = udiv i32 %[[VAL_578]], 1
@@ -661,40 +661,40 @@
 // CHECK:       r12.in_bounds-after:                              ; preds = %[[VAL_595]], %[[VAL_597:.*]]
 // CHECK:         ret void
 // CHECK:       r12.in_bounds-true:                               ; preds = %[[VAL_597]]
-// CHECK:         %[[VAL_598:.*]] = bitcast [100 x [200 x float]]* %[[VAL_572]] to float*
+// CHECK:         %[[VAL_598:.*]] = bitcast [100 x [200 x float]]* %[[VAL_569]] to float*
 // CHECK:         %[[VAL_599:.*]] = getelementptr inbounds float, float* %[[VAL_598]], i32 %[[VAL_578]]
 // CHECK:         %[[VAL_600:.*]] = load float, float* %[[VAL_599]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_601:.*]] = call float @llvm.fabs.f32(float %[[VAL_600]])
 // CHECK:         %[[VAL_602:.*]] = fcmp one float %[[VAL_601]], 0x7FF0000000000000
 // CHECK:         %[[VAL_603:.*]] = zext i1 %[[VAL_602]] to i8
-// CHECK:         %[[VAL_604:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_569]] to i8*
+// CHECK:         %[[VAL_604:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_572]] to i8*
 // CHECK:         %[[VAL_605:.*]] = getelementptr inbounds i8, i8* %[[VAL_604]], i32 %[[VAL_578]]
 // CHECK:         store i8 %[[VAL_603]], i8* %[[VAL_605]], align 1
-// CHECK:         %[[VAL_606:.*]] = bitcast [100 x [200 x float]]* %[[VAL_572]] to float*
+// CHECK:         %[[VAL_606:.*]] = bitcast [100 x [200 x float]]* %[[VAL_569]] to float*
 // CHECK:         %[[VAL_607:.*]] = getelementptr inbounds float, float* %[[VAL_606]], i32 %[[VAL_582]]
 // CHECK:         %[[VAL_608:.*]] = load float, float* %[[VAL_607]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_609:.*]] = call float @llvm.fabs.f32(float %[[VAL_608]])
 // CHECK:         %[[VAL_610:.*]] = fcmp one float %[[VAL_609]], 0x7FF0000000000000
 // CHECK:         %[[VAL_611:.*]] = zext i1 %[[VAL_610]] to i8
-// CHECK:         %[[VAL_612:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_569]] to i8*
+// CHECK:         %[[VAL_612:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_572]] to i8*
 // CHECK:         %[[VAL_613:.*]] = getelementptr inbounds i8, i8* %[[VAL_612]], i32 %[[VAL_582]]
 // CHECK:         store i8 %[[VAL_611]], i8* %[[VAL_613]], align 1
-// CHECK:         %[[VAL_614:.*]] = bitcast [100 x [200 x float]]* %[[VAL_572]] to float*
+// CHECK:         %[[VAL_614:.*]] = bitcast [100 x [200 x float]]* %[[VAL_569]] to float*
 // CHECK:         %[[VAL_615:.*]] = getelementptr inbounds float, float* %[[VAL_614]], i32 %[[VAL_586]]
 // CHECK:         %[[VAL_616:.*]] = load float, float* %[[VAL_615]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_617:.*]] = call float @llvm.fabs.f32(float %[[VAL_616]])
 // CHECK:         %[[VAL_618:.*]] = fcmp one float %[[VAL_617]], 0x7FF0000000000000
 // CHECK:         %[[VAL_619:.*]] = zext i1 %[[VAL_618]] to i8
-// CHECK:         %[[VAL_620:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_569]] to i8*
+// CHECK:         %[[VAL_620:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_572]] to i8*
 // CHECK:         %[[VAL_621:.*]] = getelementptr inbounds i8, i8* %[[VAL_620]], i32 %[[VAL_586]]
 // CHECK:         store i8 %[[VAL_619]], i8* %[[VAL_621]], align 1
-// CHECK:         %[[VAL_622:.*]] = bitcast [100 x [200 x float]]* %[[VAL_572]] to float*
+// CHECK:         %[[VAL_622:.*]] = bitcast [100 x [200 x float]]* %[[VAL_569]] to float*
 // CHECK:         %[[VAL_623:.*]] = getelementptr inbounds float, float* %[[VAL_622]], i32 %[[VAL_590]]
 // CHECK:         %[[VAL_624:.*]] = load float, float* %[[VAL_623]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_625:.*]] = call float @llvm.fabs.f32(float %[[VAL_624]])
 // CHECK:         %[[VAL_626:.*]] = fcmp one float %[[VAL_625]], 0x7FF0000000000000
 // CHECK:         %[[VAL_627:.*]] = zext i1 %[[VAL_626]] to i8
-// CHECK:         %[[VAL_628:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_569]] to i8*
+// CHECK:         %[[VAL_628:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_572]] to i8*
 // CHECK:         %[[VAL_629:.*]] = getelementptr inbounds i8, i8* %[[VAL_628]], i32 %[[VAL_590]]
 // CHECK:         store i8 %[[VAL_627]], i8* %[[VAL_629]], align 1
 // CHECK:         br label %[[VAL_596]]
@@ -705,9 +705,9 @@
 // CHECK:         %[[VAL_635:.*]] = bitcast i8* %[[VAL_633]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_636:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_637:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_638:.*]] = mul nuw nsw i32 %[[VAL_636]], 128
+// CHECK:         %[[VAL_638:.*]] = mul nuw nsw i32 %[[VAL_636]], 256
 // CHECK:         %[[VAL_639:.*]] = add nuw nsw i32 %[[VAL_638]], %[[VAL_637]]
-// CHECK:         %[[VAL_640:.*]] = icmp ult i32 %[[VAL_639]], 163840
+// CHECK:         %[[VAL_640:.*]] = icmp ult i32 %[[VAL_639]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_640]])
 // CHECK:         %[[VAL_641:.*]] = mul nuw nsw i32 %[[VAL_639]], 4
 // CHECK:         %[[VAL_642:.*]] = udiv i32 %[[VAL_641]], 1
@@ -730,32 +730,32 @@
 // CHECK:       r13.in_bounds-after:                              ; preds = %[[VAL_658]], %[[VAL_660:.*]]
 // CHECK:         ret void
 // CHECK:       r13.in_bounds-true:                               ; preds = %[[VAL_660]]
-// CHECK:         %[[VAL_661:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
+// CHECK:         %[[VAL_661:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
 // CHECK:         %[[VAL_662:.*]] = getelementptr inbounds float, float* %[[VAL_661]], i32 %[[VAL_641]]
 // CHECK:         %[[VAL_663:.*]] = load float, float* %[[VAL_662]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_664:.*]] = call float @__nv_logf(float %[[VAL_663]])
-// CHECK:         %[[VAL_665:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
+// CHECK:         %[[VAL_665:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
 // CHECK:         %[[VAL_666:.*]] = getelementptr inbounds float, float* %[[VAL_665]], i32 %[[VAL_641]]
 // CHECK:         store float %[[VAL_664]], float* %[[VAL_666]], align 4
-// CHECK:         %[[VAL_667:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
+// CHECK:         %[[VAL_667:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
 // CHECK:         %[[VAL_668:.*]] = getelementptr inbounds float, float* %[[VAL_667]], i32 %[[VAL_645]]
 // CHECK:         %[[VAL_669:.*]] = load float, float* %[[VAL_668]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_670:.*]] = call float @__nv_logf(float %[[VAL_669]])
-// CHECK:         %[[VAL_671:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
+// CHECK:         %[[VAL_671:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
 // CHECK:         %[[VAL_672:.*]] = getelementptr inbounds float, float* %[[VAL_671]], i32 %[[VAL_645]]
 // CHECK:         store float %[[VAL_670]], float* %[[VAL_672]], align 4
-// CHECK:         %[[VAL_673:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
+// CHECK:         %[[VAL_673:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
 // CHECK:         %[[VAL_674:.*]] = getelementptr inbounds float, float* %[[VAL_673]], i32 %[[VAL_649]]
 // CHECK:         %[[VAL_675:.*]] = load float, float* %[[VAL_674]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_676:.*]] = call float @__nv_logf(float %[[VAL_675]])
-// CHECK:         %[[VAL_677:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
+// CHECK:         %[[VAL_677:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
 // CHECK:         %[[VAL_678:.*]] = getelementptr inbounds float, float* %[[VAL_677]], i32 %[[VAL_649]]
 // CHECK:         store float %[[VAL_676]], float* %[[VAL_678]], align 4
-// CHECK:         %[[VAL_679:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
+// CHECK:         %[[VAL_679:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
 // CHECK:         %[[VAL_680:.*]] = getelementptr inbounds float, float* %[[VAL_679]], i32 %[[VAL_653]]
 // CHECK:         %[[VAL_681:.*]] = load float, float* %[[VAL_680]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_682:.*]] = call float @__nv_logf(float %[[VAL_681]])
-// CHECK:         %[[VAL_683:.*]] = bitcast [100 x [200 x float]]* %[[VAL_632]] to float*
+// CHECK:         %[[VAL_683:.*]] = bitcast [100 x [200 x float]]* %[[VAL_635]] to float*
 // CHECK:         %[[VAL_684:.*]] = getelementptr inbounds float, float* %[[VAL_683]], i32 %[[VAL_653]]
 // CHECK:         store float %[[VAL_682]], float* %[[VAL_684]], align 4
 // CHECK:         br label %[[VAL_659]]
@@ -766,9 +766,9 @@
 // CHECK:         %[[VAL_690:.*]] = bitcast i8* %[[VAL_688]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_691:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_692:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_693:.*]] = mul nuw nsw i32 %[[VAL_691]], 128
+// CHECK:         %[[VAL_693:.*]] = mul nuw nsw i32 %[[VAL_691]], 256
 // CHECK:         %[[VAL_694:.*]] = add nuw nsw i32 %[[VAL_693]], %[[VAL_692]]
-// CHECK:         %[[VAL_695:.*]] = icmp ult i32 %[[VAL_694]], 163840
+// CHECK:         %[[VAL_695:.*]] = icmp ult i32 %[[VAL_694]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_695]])
 // CHECK:         %[[VAL_696:.*]] = mul nuw nsw i32 %[[VAL_694]], 4
 // CHECK:         %[[VAL_697:.*]] = udiv i32 %[[VAL_696]], 1
@@ -791,32 +791,32 @@
 // CHECK:       r14.in_bounds-after:                              ; preds = %[[VAL_713]], %[[VAL_715:.*]]
 // CHECK:         ret void
 // CHECK:       r14.in_bounds-true:                               ; preds = %[[VAL_715]]
-// CHECK:         %[[VAL_716:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
+// CHECK:         %[[VAL_716:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
 // CHECK:         %[[VAL_717:.*]] = getelementptr inbounds float, float* %[[VAL_716]], i32 %[[VAL_696]]
 // CHECK:         %[[VAL_718:.*]] = load float, float* %[[VAL_717]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_719:.*]] = call float @__nv_log1pf(float %[[VAL_718]])
-// CHECK:         %[[VAL_720:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
+// CHECK:         %[[VAL_720:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
 // CHECK:         %[[VAL_721:.*]] = getelementptr inbounds float, float* %[[VAL_720]], i32 %[[VAL_696]]
 // CHECK:         store float %[[VAL_719]], float* %[[VAL_721]], align 4
-// CHECK:         %[[VAL_722:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
+// CHECK:         %[[VAL_722:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
 // CHECK:         %[[VAL_723:.*]] = getelementptr inbounds float, float* %[[VAL_722]], i32 %[[VAL_700]]
 // CHECK:         %[[VAL_724:.*]] = load float, float* %[[VAL_723]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_725:.*]] = call float @__nv_log1pf(float %[[VAL_724]])
-// CHECK:         %[[VAL_726:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
+// CHECK:         %[[VAL_726:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
 // CHECK:         %[[VAL_727:.*]] = getelementptr inbounds float, float* %[[VAL_726]], i32 %[[VAL_700]]
 // CHECK:         store float %[[VAL_725]], float* %[[VAL_727]], align 4
-// CHECK:         %[[VAL_728:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
+// CHECK:         %[[VAL_728:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
 // CHECK:         %[[VAL_729:.*]] = getelementptr inbounds float, float* %[[VAL_728]], i32 %[[VAL_704]]
 // CHECK:         %[[VAL_730:.*]] = load float, float* %[[VAL_729]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_731:.*]] = call float @__nv_log1pf(float %[[VAL_730]])
-// CHECK:         %[[VAL_732:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
+// CHECK:         %[[VAL_732:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
 // CHECK:         %[[VAL_733:.*]] = getelementptr inbounds float, float* %[[VAL_732]], i32 %[[VAL_704]]
 // CHECK:         store float %[[VAL_731]], float* %[[VAL_733]], align 4
-// CHECK:         %[[VAL_734:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
+// CHECK:         %[[VAL_734:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
 // CHECK:         %[[VAL_735:.*]] = getelementptr inbounds float, float* %[[VAL_734]], i32 %[[VAL_708]]
 // CHECK:         %[[VAL_736:.*]] = load float, float* %[[VAL_735]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_737:.*]] = call float @__nv_log1pf(float %[[VAL_736]])
-// CHECK:         %[[VAL_738:.*]] = bitcast [100 x [200 x float]]* %[[VAL_687]] to float*
+// CHECK:         %[[VAL_738:.*]] = bitcast [100 x [200 x float]]* %[[VAL_690]] to float*
 // CHECK:         %[[VAL_739:.*]] = getelementptr inbounds float, float* %[[VAL_738]], i32 %[[VAL_708]]
 // CHECK:         store float %[[VAL_737]], float* %[[VAL_739]], align 4
 // CHECK:         br label %[[VAL_714]]
@@ -827,9 +827,9 @@
 // CHECK:         %[[VAL_745:.*]] = bitcast i8* %[[VAL_743]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_746:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_747:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_748:.*]] = mul nuw nsw i32 %[[VAL_746]], 128
+// CHECK:         %[[VAL_748:.*]] = mul nuw nsw i32 %[[VAL_746]], 256
 // CHECK:         %[[VAL_749:.*]] = add nuw nsw i32 %[[VAL_748]], %[[VAL_747]]
-// CHECK:         %[[VAL_750:.*]] = icmp ult i32 %[[VAL_749]], 163840
+// CHECK:         %[[VAL_750:.*]] = icmp ult i32 %[[VAL_749]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_750]])
 // CHECK:         %[[VAL_751:.*]] = mul nuw nsw i32 %[[VAL_749]], 4
 // CHECK:         %[[VAL_752:.*]] = udiv i32 %[[VAL_751]], 1
@@ -852,40 +852,40 @@
 // CHECK:       r15.in_bounds-after:                              ; preds = %[[VAL_768]], %[[VAL_770:.*]]
 // CHECK:         ret void
 // CHECK:       r15.in_bounds-true:                               ; preds = %[[VAL_770]]
-// CHECK:         %[[VAL_771:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
+// CHECK:         %[[VAL_771:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
 // CHECK:         %[[VAL_772:.*]] = getelementptr inbounds i8, i8* %[[VAL_771]], i32 %[[VAL_751]]
 // CHECK:         %[[VAL_773:.*]] = load i8, i8* %[[VAL_772]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_774:.*]] = trunc i8 %[[VAL_773]] to i1
 // CHECK:         %[[VAL_775:.*]] = xor i1 %[[VAL_774]], true
 // CHECK:         %[[VAL_776:.*]] = zext i1 %[[VAL_775]] to i8
-// CHECK:         %[[VAL_777:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
+// CHECK:         %[[VAL_777:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
 // CHECK:         %[[VAL_778:.*]] = getelementptr inbounds i8, i8* %[[VAL_777]], i32 %[[VAL_751]]
 // CHECK:         store i8 %[[VAL_776]], i8* %[[VAL_778]], align 1
-// CHECK:         %[[VAL_779:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
+// CHECK:         %[[VAL_779:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
 // CHECK:         %[[VAL_780:.*]] = getelementptr inbounds i8, i8* %[[VAL_779]], i32 %[[VAL_755]]
 // CHECK:         %[[VAL_781:.*]] = load i8, i8* %[[VAL_780]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_782:.*]] = trunc i8 %[[VAL_781]] to i1
 // CHECK:         %[[VAL_783:.*]] = xor i1 %[[VAL_782]], true
 // CHECK:         %[[VAL_784:.*]] = zext i1 %[[VAL_783]] to i8
-// CHECK:         %[[VAL_785:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
+// CHECK:         %[[VAL_785:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
 // CHECK:         %[[VAL_786:.*]] = getelementptr inbounds i8, i8* %[[VAL_785]], i32 %[[VAL_755]]
 // CHECK:         store i8 %[[VAL_784]], i8* %[[VAL_786]], align 1
-// CHECK:         %[[VAL_787:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
+// CHECK:         %[[VAL_787:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
 // CHECK:         %[[VAL_788:.*]] = getelementptr inbounds i8, i8* %[[VAL_787]], i32 %[[VAL_759]]
 // CHECK:         %[[VAL_789:.*]] = load i8, i8* %[[VAL_788]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_790:.*]] = trunc i8 %[[VAL_789]] to i1
 // CHECK:         %[[VAL_791:.*]] = xor i1 %[[VAL_790]], true
 // CHECK:         %[[VAL_792:.*]] = zext i1 %[[VAL_791]] to i8
-// CHECK:         %[[VAL_793:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
+// CHECK:         %[[VAL_793:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
 // CHECK:         %[[VAL_794:.*]] = getelementptr inbounds i8, i8* %[[VAL_793]], i32 %[[VAL_759]]
 // CHECK:         store i8 %[[VAL_792]], i8* %[[VAL_794]], align 1
-// CHECK:         %[[VAL_795:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
+// CHECK:         %[[VAL_795:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
 // CHECK:         %[[VAL_796:.*]] = getelementptr inbounds i8, i8* %[[VAL_795]], i32 %[[VAL_763]]
 // CHECK:         %[[VAL_797:.*]] = load i8, i8* %[[VAL_796]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_798:.*]] = trunc i8 %[[VAL_797]] to i1
 // CHECK:         %[[VAL_799:.*]] = xor i1 %[[VAL_798]], true
 // CHECK:         %[[VAL_800:.*]] = zext i1 %[[VAL_799]] to i8
-// CHECK:         %[[VAL_801:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_742]] to i8*
+// CHECK:         %[[VAL_801:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_745]] to i8*
 // CHECK:         %[[VAL_802:.*]] = getelementptr inbounds i8, i8* %[[VAL_801]], i32 %[[VAL_763]]
 // CHECK:         store i8 %[[VAL_800]], i8* %[[VAL_802]], align 1
 // CHECK:         br label %[[VAL_769]]
@@ -896,9 +896,9 @@
 // CHECK:         %[[VAL_808:.*]] = bitcast i8* %[[VAL_806]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_809:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_810:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_811:.*]] = mul nuw nsw i32 %[[VAL_809]], 128
+// CHECK:         %[[VAL_811:.*]] = mul nuw nsw i32 %[[VAL_809]], 256
 // CHECK:         %[[VAL_812:.*]] = add nuw nsw i32 %[[VAL_811]], %[[VAL_810]]
-// CHECK:         %[[VAL_813:.*]] = icmp ult i32 %[[VAL_812]], 163840
+// CHECK:         %[[VAL_813:.*]] = icmp ult i32 %[[VAL_812]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_813]])
 // CHECK:         %[[VAL_814:.*]] = mul nuw nsw i32 %[[VAL_812]], 4
 // CHECK:         %[[VAL_815:.*]] = udiv i32 %[[VAL_814]], 1
@@ -921,32 +921,32 @@
 // CHECK:       r16.in_bounds-after:                              ; preds = %[[VAL_831]], %[[VAL_833:.*]]
 // CHECK:         ret void
 // CHECK:       r16.in_bounds-true:                               ; preds = %[[VAL_833]]
-// CHECK:         %[[VAL_834:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
+// CHECK:         %[[VAL_834:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
 // CHECK:         %[[VAL_835:.*]] = getelementptr inbounds float, float* %[[VAL_834]], i32 %[[VAL_814]]
 // CHECK:         %[[VAL_836:.*]] = load float, float* %[[VAL_835]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_837:.*]] = fneg float %[[VAL_836]]
-// CHECK:         %[[VAL_838:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
+// CHECK:         %[[VAL_838:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
 // CHECK:         %[[VAL_839:.*]] = getelementptr inbounds float, float* %[[VAL_838]], i32 %[[VAL_814]]
 // CHECK:         store float %[[VAL_837]], float* %[[VAL_839]], align 4
-// CHECK:         %[[VAL_840:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
+// CHECK:         %[[VAL_840:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
 // CHECK:         %[[VAL_841:.*]] = getelementptr inbounds float, float* %[[VAL_840]], i32 %[[VAL_818]]
 // CHECK:         %[[VAL_842:.*]] = load float, float* %[[VAL_841]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_843:.*]] = fneg float %[[VAL_842]]
-// CHECK:         %[[VAL_844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
+// CHECK:         %[[VAL_844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
 // CHECK:         %[[VAL_845:.*]] = getelementptr inbounds float, float* %[[VAL_844]], i32 %[[VAL_818]]
 // CHECK:         store float %[[VAL_843]], float* %[[VAL_845]], align 4
-// CHECK:         %[[VAL_846:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
+// CHECK:         %[[VAL_846:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
 // CHECK:         %[[VAL_847:.*]] = getelementptr inbounds float, float* %[[VAL_846]], i32 %[[VAL_822]]
 // CHECK:         %[[VAL_848:.*]] = load float, float* %[[VAL_847]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_849:.*]] = fneg float %[[VAL_848]]
-// CHECK:         %[[VAL_850:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
+// CHECK:         %[[VAL_850:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
 // CHECK:         %[[VAL_851:.*]] = getelementptr inbounds float, float* %[[VAL_850]], i32 %[[VAL_822]]
 // CHECK:         store float %[[VAL_849]], float* %[[VAL_851]], align 4
-// CHECK:         %[[VAL_852:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
+// CHECK:         %[[VAL_852:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
 // CHECK:         %[[VAL_853:.*]] = getelementptr inbounds float, float* %[[VAL_852]], i32 %[[VAL_826]]
 // CHECK:         %[[VAL_854:.*]] = load float, float* %[[VAL_853]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_855:.*]] = fneg float %[[VAL_854]]
-// CHECK:         %[[VAL_856:.*]] = bitcast [100 x [200 x float]]* %[[VAL_805]] to float*
+// CHECK:         %[[VAL_856:.*]] = bitcast [100 x [200 x float]]* %[[VAL_808]] to float*
 // CHECK:         %[[VAL_857:.*]] = getelementptr inbounds float, float* %[[VAL_856]], i32 %[[VAL_826]]
 // CHECK:         store float %[[VAL_855]], float* %[[VAL_857]], align 4
 // CHECK:         br label %[[VAL_832]]
@@ -957,9 +957,9 @@
 // CHECK:         %[[VAL_863:.*]] = bitcast i8* %[[VAL_861]] to [100 x [200 x i32]]*
 // CHECK:         %[[VAL_864:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_865:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_866:.*]] = mul nuw nsw i32 %[[VAL_864]], 128
+// CHECK:         %[[VAL_866:.*]] = mul nuw nsw i32 %[[VAL_864]], 256
 // CHECK:         %[[VAL_867:.*]] = add nuw nsw i32 %[[VAL_866]], %[[VAL_865]]
-// CHECK:         %[[VAL_868:.*]] = icmp ult i32 %[[VAL_867]], 163840
+// CHECK:         %[[VAL_868:.*]] = icmp ult i32 %[[VAL_867]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_868]])
 // CHECK:         %[[VAL_869:.*]] = mul nuw nsw i32 %[[VAL_867]], 4
 // CHECK:         %[[VAL_870:.*]] = udiv i32 %[[VAL_869]], 1
@@ -982,45 +982,45 @@
 // CHECK:       r17.in_bounds-after:                              ; preds = %[[VAL_886]], %[[VAL_888:.*]]
 // CHECK:         ret void
 // CHECK:       r17.in_bounds-true:                               ; preds = %[[VAL_888]]
-// CHECK:         %[[VAL_889:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
+// CHECK:         %[[VAL_889:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
 // CHECK:         %[[VAL_890:.*]] = getelementptr inbounds i32, i32* %[[VAL_889]], i32 %[[VAL_869]]
 // CHECK:         %[[VAL_891:.*]] = load i32, i32* %[[VAL_890]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_892:.*]] = call i32 @llvm.ctpop.i32(i32 %[[VAL_891]])
-// CHECK:         %[[VAL_893:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
+// CHECK:         %[[VAL_893:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
 // CHECK:         %[[VAL_894:.*]] = getelementptr inbounds i32, i32* %[[VAL_893]], i32 %[[VAL_869]]
 // CHECK:         store i32 %[[VAL_892]], i32* %[[VAL_894]], align 4
-// CHECK:         %[[VAL_895:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
+// CHECK:         %[[VAL_895:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
 // CHECK:         %[[VAL_896:.*]] = getelementptr inbounds i32, i32* %[[VAL_895]], i32 %[[VAL_873]]
 // CHECK:         %[[VAL_897:.*]] = load i32, i32* %[[VAL_896]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_898:.*]] = call i32 @llvm.ctpop.i32(i32 %[[VAL_897]])
-// CHECK:         %[[VAL_899:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
+// CHECK:         %[[VAL_899:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
 // CHECK:         %[[VAL_900:.*]] = getelementptr inbounds i32, i32* %[[VAL_899]], i32 %[[VAL_873]]
 // CHECK:         store i32 %[[VAL_898]], i32* %[[VAL_900]], align 4
-// CHECK:         %[[VAL_901:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
+// CHECK:         %[[VAL_901:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
 // CHECK:         %[[VAL_902:.*]] = getelementptr inbounds i32, i32* %[[VAL_901]], i32 %[[VAL_877]]
 // CHECK:         %[[VAL_903:.*]] = load i32, i32* %[[VAL_902]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_904:.*]] = call i32 @llvm.ctpop.i32(i32 %[[VAL_903]])
-// CHECK:         %[[VAL_905:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
+// CHECK:         %[[VAL_905:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
 // CHECK:         %[[VAL_906:.*]] = getelementptr inbounds i32, i32* %[[VAL_905]], i32 %[[VAL_877]]
 // CHECK:         store i32 %[[VAL_904]], i32* %[[VAL_906]], align 4
-// CHECK:         %[[VAL_907:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
+// CHECK:         %[[VAL_907:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
 // CHECK:         %[[VAL_908:.*]] = getelementptr inbounds i32, i32* %[[VAL_907]], i32 %[[VAL_881]]
 // CHECK:         %[[VAL_909:.*]] = load i32, i32* %[[VAL_908]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_910:.*]] = call i32 @llvm.ctpop.i32(i32 %[[VAL_909]])
-// CHECK:         %[[VAL_911:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_860]] to i32*
+// CHECK:         %[[VAL_911:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_863]] to i32*
 // CHECK:         %[[VAL_912:.*]] = getelementptr inbounds i32, i32* %[[VAL_911]], i32 %[[VAL_881]]
 // CHECK:         store i32 %[[VAL_910]], i32* %[[VAL_912]], align 4
 // CHECK:         br label %[[VAL_887]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_913:.*]] = getelementptr inbounds i8, i8* %[[VAL_914:.*]], i64 0
-// CHECK:         %[[VAL_915:.*]] = bitcast i8* %[[VAL_913]] to [100 x [200 x float]]*
-// CHECK:         %[[VAL_916:.*]] = getelementptr inbounds i8, i8* %[[VAL_917:.*]], i64 0
-// CHECK:         %[[VAL_918:.*]] = bitcast i8* %[[VAL_916]] to [100 x [200 x %[[VAL_919:.*]]]]*
+// CHECK:         %[[VAL_915:.*]] = bitcast i8* %[[VAL_913]] to [100 x [200 x %[[VAL_916:.*]]]]*
+// CHECK:         %[[VAL_917:.*]] = getelementptr inbounds i8, i8* %[[VAL_918:.*]], i64 0
+// CHECK:         %[[VAL_919:.*]] = bitcast i8* %[[VAL_917]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_920:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_921:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_922:.*]] = mul nuw nsw i32 %[[VAL_920]], 128
+// CHECK:         %[[VAL_922:.*]] = mul nuw nsw i32 %[[VAL_920]], 256
 // CHECK:         %[[VAL_923:.*]] = add nuw nsw i32 %[[VAL_922]], %[[VAL_921]]
-// CHECK:         %[[VAL_924:.*]] = icmp ult i32 %[[VAL_923]], 163840
+// CHECK:         %[[VAL_924:.*]] = icmp ult i32 %[[VAL_923]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_924]])
 // CHECK:         %[[VAL_925:.*]] = mul nuw nsw i32 %[[VAL_923]], 4
 // CHECK:         %[[VAL_926:.*]] = udiv i32 %[[VAL_925]], 1
@@ -1043,32 +1043,32 @@
 // CHECK:       r18.in_bounds-after:                              ; preds = %[[VAL_942]], %[[VAL_944:.*]]
 // CHECK:         ret void
 // CHECK:       r18.in_bounds-true:                               ; preds = %[[VAL_944]]
-// CHECK:         %[[VAL_945:.*]] = bitcast [100 x [200 x %[[VAL_919]]]]* %[[VAL_918]] to %[[VAL_919]]*
-// CHECK:         %[[VAL_946:.*]] = getelementptr inbounds %[[VAL_919]], %[[VAL_919]]* %[[VAL_945]], i32 %[[VAL_925]]
-// CHECK:         %[[VAL_947:.*]] = load %[[VAL_919]], %[[VAL_919]]* %[[VAL_946]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_948:.*]] = extractvalue %[[VAL_919]] %[[VAL_947]], 0
-// CHECK:         %[[VAL_949:.*]] = bitcast [100 x [200 x float]]* %[[VAL_915]] to float*
+// CHECK:         %[[VAL_945:.*]] = bitcast [100 x [200 x %[[VAL_916]]]]* %[[VAL_915]] to %[[VAL_916]]*
+// CHECK:         %[[VAL_946:.*]] = getelementptr inbounds %[[VAL_916]], %[[VAL_916]]* %[[VAL_945]], i32 %[[VAL_925]]
+// CHECK:         %[[VAL_947:.*]] = load %[[VAL_916]], %[[VAL_916]]* %[[VAL_946]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_948:.*]] = extractvalue %[[VAL_916]] %[[VAL_947]], 0
+// CHECK:         %[[VAL_949:.*]] = bitcast [100 x [200 x float]]* %[[VAL_919]] to float*
 // CHECK:         %[[VAL_950:.*]] = getelementptr inbounds float, float* %[[VAL_949]], i32 %[[VAL_925]]
 // CHECK:         store float %[[VAL_948]], float* %[[VAL_950]], align 4
-// CHECK:         %[[VAL_951:.*]] = bitcast [100 x [200 x %[[VAL_919]]]]* %[[VAL_918]] to %[[VAL_919]]*
-// CHECK:         %[[VAL_952:.*]] = getelementptr inbounds %[[VAL_919]], %[[VAL_919]]* %[[VAL_951]], i32 %[[VAL_929]]
-// CHECK:         %[[VAL_953:.*]] = load %[[VAL_919]], %[[VAL_919]]* %[[VAL_952]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_954:.*]] = extractvalue %[[VAL_919]] %[[VAL_953]], 0
-// CHECK:         %[[VAL_955:.*]] = bitcast [100 x [200 x float]]* %[[VAL_915]] to float*
+// CHECK:         %[[VAL_951:.*]] = bitcast [100 x [200 x %[[VAL_916]]]]* %[[VAL_915]] to %[[VAL_916]]*
+// CHECK:         %[[VAL_952:.*]] = getelementptr inbounds %[[VAL_916]], %[[VAL_916]]* %[[VAL_951]], i32 %[[VAL_929]]
+// CHECK:         %[[VAL_953:.*]] = load %[[VAL_916]], %[[VAL_916]]* %[[VAL_952]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_954:.*]] = extractvalue %[[VAL_916]] %[[VAL_953]], 0
+// CHECK:         %[[VAL_955:.*]] = bitcast [100 x [200 x float]]* %[[VAL_919]] to float*
 // CHECK:         %[[VAL_956:.*]] = getelementptr inbounds float, float* %[[VAL_955]], i32 %[[VAL_929]]
 // CHECK:         store float %[[VAL_954]], float* %[[VAL_956]], align 4
-// CHECK:         %[[VAL_957:.*]] = bitcast [100 x [200 x %[[VAL_919]]]]* %[[VAL_918]] to %[[VAL_919]]*
-// CHECK:         %[[VAL_958:.*]] = getelementptr inbounds %[[VAL_919]], %[[VAL_919]]* %[[VAL_957]], i32 %[[VAL_933]]
-// CHECK:         %[[VAL_959:.*]] = load %[[VAL_919]], %[[VAL_919]]* %[[VAL_958]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_960:.*]] = extractvalue %[[VAL_919]] %[[VAL_959]], 0
-// CHECK:         %[[VAL_961:.*]] = bitcast [100 x [200 x float]]* %[[VAL_915]] to float*
+// CHECK:         %[[VAL_957:.*]] = bitcast [100 x [200 x %[[VAL_916]]]]* %[[VAL_915]] to %[[VAL_916]]*
+// CHECK:         %[[VAL_958:.*]] = getelementptr inbounds %[[VAL_916]], %[[VAL_916]]* %[[VAL_957]], i32 %[[VAL_933]]
+// CHECK:         %[[VAL_959:.*]] = load %[[VAL_916]], %[[VAL_916]]* %[[VAL_958]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_960:.*]] = extractvalue %[[VAL_916]] %[[VAL_959]], 0
+// CHECK:         %[[VAL_961:.*]] = bitcast [100 x [200 x float]]* %[[VAL_919]] to float*
 // CHECK:         %[[VAL_962:.*]] = getelementptr inbounds float, float* %[[VAL_961]], i32 %[[VAL_933]]
 // CHECK:         store float %[[VAL_960]], float* %[[VAL_962]], align 4
-// CHECK:         %[[VAL_963:.*]] = bitcast [100 x [200 x %[[VAL_919]]]]* %[[VAL_918]] to %[[VAL_919]]*
-// CHECK:         %[[VAL_964:.*]] = getelementptr inbounds %[[VAL_919]], %[[VAL_919]]* %[[VAL_963]], i32 %[[VAL_937]]
-// CHECK:         %[[VAL_965:.*]] = load %[[VAL_919]], %[[VAL_919]]* %[[VAL_964]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_966:.*]] = extractvalue %[[VAL_919]] %[[VAL_965]], 0
-// CHECK:         %[[VAL_967:.*]] = bitcast [100 x [200 x float]]* %[[VAL_915]] to float*
+// CHECK:         %[[VAL_963:.*]] = bitcast [100 x [200 x %[[VAL_916]]]]* %[[VAL_915]] to %[[VAL_916]]*
+// CHECK:         %[[VAL_964:.*]] = getelementptr inbounds %[[VAL_916]], %[[VAL_916]]* %[[VAL_963]], i32 %[[VAL_937]]
+// CHECK:         %[[VAL_965:.*]] = load %[[VAL_916]], %[[VAL_916]]* %[[VAL_964]], align 1, !invariant.load !92
+// CHECK:         %[[VAL_966:.*]] = extractvalue %[[VAL_916]] %[[VAL_965]], 0
+// CHECK:         %[[VAL_967:.*]] = bitcast [100 x [200 x float]]* %[[VAL_919]] to float*
 // CHECK:         %[[VAL_968:.*]] = getelementptr inbounds float, float* %[[VAL_967]], i32 %[[VAL_937]]
 // CHECK:         store float %[[VAL_966]], float* %[[VAL_968]], align 4
 // CHECK:         br label %[[VAL_943]]
@@ -1079,9 +1079,9 @@
 // CHECK:         %[[VAL_974:.*]] = bitcast i8* %[[VAL_972]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_975:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_976:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_977:.*]] = mul nuw nsw i32 %[[VAL_975]], 128
+// CHECK:         %[[VAL_977:.*]] = mul nuw nsw i32 %[[VAL_975]], 256
 // CHECK:         %[[VAL_978:.*]] = add nuw nsw i32 %[[VAL_977]], %[[VAL_976]]
-// CHECK:         %[[VAL_979:.*]] = icmp ult i32 %[[VAL_978]], 163840
+// CHECK:         %[[VAL_979:.*]] = icmp ult i32 %[[VAL_978]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_979]])
 // CHECK:         %[[VAL_980:.*]] = mul nuw nsw i32 %[[VAL_978]], 4
 // CHECK:         %[[VAL_981:.*]] = udiv i32 %[[VAL_980]], 1
@@ -1104,7 +1104,7 @@
 // CHECK:       r19.in_bounds-after:                              ; preds = %[[VAL_997]], %[[VAL_999:.*]]
 // CHECK:         ret void
 // CHECK:       r19.in_bounds-true:                               ; preds = %[[VAL_999]]
-// CHECK:         %[[VAL_1000:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
+// CHECK:         %[[VAL_1000:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
 // CHECK:         %[[VAL_1001:.*]] = getelementptr inbounds float, float* %[[VAL_1000]], i32 %[[VAL_980]]
 // CHECK:         %[[VAL_1002:.*]] = load float, float* %[[VAL_1001]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1003:.*]] = bitcast float %[[VAL_1002]] to i32
@@ -1123,10 +1123,10 @@
 // CHECK:         %[[VAL_1016:.*]] = bitcast i32 %[[VAL_1015]] to float
 // CHECK:         %[[VAL_1017:.*]] = fcmp uno float %[[VAL_1002]], %[[VAL_1002]]
 // CHECK:         %[[VAL_1018:.*]] = select i1 %[[VAL_1017]], float %[[VAL_1002]], float %[[VAL_1016]]
-// CHECK:         %[[VAL_1019:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
+// CHECK:         %[[VAL_1019:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
 // CHECK:         %[[VAL_1020:.*]] = getelementptr inbounds float, float* %[[VAL_1019]], i32 %[[VAL_980]]
 // CHECK:         store float %[[VAL_1018]], float* %[[VAL_1020]], align 4
-// CHECK:         %[[VAL_1021:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
+// CHECK:         %[[VAL_1021:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
 // CHECK:         %[[VAL_1022:.*]] = getelementptr inbounds float, float* %[[VAL_1021]], i32 %[[VAL_984]]
 // CHECK:         %[[VAL_1023:.*]] = load float, float* %[[VAL_1022]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1024:.*]] = bitcast float %[[VAL_1023]] to i32
@@ -1145,10 +1145,10 @@
 // CHECK:         %[[VAL_1037:.*]] = bitcast i32 %[[VAL_1036]] to float
 // CHECK:         %[[VAL_1038:.*]] = fcmp uno float %[[VAL_1023]], %[[VAL_1023]]
 // CHECK:         %[[VAL_1039:.*]] = select i1 %[[VAL_1038]], float %[[VAL_1023]], float %[[VAL_1037]]
-// CHECK:         %[[VAL_1040:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
+// CHECK:         %[[VAL_1040:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
 // CHECK:         %[[VAL_1041:.*]] = getelementptr inbounds float, float* %[[VAL_1040]], i32 %[[VAL_984]]
 // CHECK:         store float %[[VAL_1039]], float* %[[VAL_1041]], align 4
-// CHECK:         %[[VAL_1042:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
+// CHECK:         %[[VAL_1042:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
 // CHECK:         %[[VAL_1043:.*]] = getelementptr inbounds float, float* %[[VAL_1042]], i32 %[[VAL_988]]
 // CHECK:         %[[VAL_1044:.*]] = load float, float* %[[VAL_1043]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1045:.*]] = bitcast float %[[VAL_1044]] to i32
@@ -1167,10 +1167,10 @@
 // CHECK:         %[[VAL_1058:.*]] = bitcast i32 %[[VAL_1057]] to float
 // CHECK:         %[[VAL_1059:.*]] = fcmp uno float %[[VAL_1044]], %[[VAL_1044]]
 // CHECK:         %[[VAL_1060:.*]] = select i1 %[[VAL_1059]], float %[[VAL_1044]], float %[[VAL_1058]]
-// CHECK:         %[[VAL_1061:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
+// CHECK:         %[[VAL_1061:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
 // CHECK:         %[[VAL_1062:.*]] = getelementptr inbounds float, float* %[[VAL_1061]], i32 %[[VAL_988]]
 // CHECK:         store float %[[VAL_1060]], float* %[[VAL_1062]], align 4
-// CHECK:         %[[VAL_1063:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
+// CHECK:         %[[VAL_1063:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
 // CHECK:         %[[VAL_1064:.*]] = getelementptr inbounds float, float* %[[VAL_1063]], i32 %[[VAL_992]]
 // CHECK:         %[[VAL_1065:.*]] = load float, float* %[[VAL_1064]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1066:.*]] = bitcast float %[[VAL_1065]] to i32
@@ -1189,7 +1189,7 @@
 // CHECK:         %[[VAL_1079:.*]] = bitcast i32 %[[VAL_1078]] to float
 // CHECK:         %[[VAL_1080:.*]] = fcmp uno float %[[VAL_1065]], %[[VAL_1065]]
 // CHECK:         %[[VAL_1081:.*]] = select i1 %[[VAL_1080]], float %[[VAL_1065]], float %[[VAL_1079]]
-// CHECK:         %[[VAL_1082:.*]] = bitcast [100 x [200 x float]]* %[[VAL_971]] to float*
+// CHECK:         %[[VAL_1082:.*]] = bitcast [100 x [200 x float]]* %[[VAL_974]] to float*
 // CHECK:         %[[VAL_1083:.*]] = getelementptr inbounds float, float* %[[VAL_1082]], i32 %[[VAL_992]]
 // CHECK:         store float %[[VAL_1081]], float* %[[VAL_1083]], align 4
 // CHECK:         br label %[[VAL_998]]
@@ -1200,9 +1200,9 @@
 // CHECK:         %[[VAL_1089:.*]] = bitcast i8* %[[VAL_1087]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1090:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1091:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1092:.*]] = mul nuw nsw i32 %[[VAL_1090]], 128
+// CHECK:         %[[VAL_1092:.*]] = mul nuw nsw i32 %[[VAL_1090]], 256
 // CHECK:         %[[VAL_1093:.*]] = add nuw nsw i32 %[[VAL_1092]], %[[VAL_1091]]
-// CHECK:         %[[VAL_1094:.*]] = icmp ult i32 %[[VAL_1093]], 163840
+// CHECK:         %[[VAL_1094:.*]] = icmp ult i32 %[[VAL_1093]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1094]])
 // CHECK:         %[[VAL_1095:.*]] = mul nuw nsw i32 %[[VAL_1093]], 4
 // CHECK:         %[[VAL_1096:.*]] = udiv i32 %[[VAL_1095]], 1
@@ -1225,32 +1225,32 @@
 // CHECK:       r20.in_bounds-after:                              ; preds = %[[VAL_1112]], %[[VAL_1114:.*]]
 // CHECK:         ret void
 // CHECK:       r20.in_bounds-true:                               ; preds = %[[VAL_1114]]
-// CHECK:         %[[VAL_1115:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
+// CHECK:         %[[VAL_1115:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
 // CHECK:         %[[VAL_1116:.*]] = getelementptr inbounds float, float* %[[VAL_1115]], i32 %[[VAL_1095]]
 // CHECK:         %[[VAL_1117:.*]] = load float, float* %[[VAL_1116]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1118:.*]] = call float @__nv_rsqrtf(float %[[VAL_1117]])
-// CHECK:         %[[VAL_1119:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
+// CHECK:         %[[VAL_1119:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
 // CHECK:         %[[VAL_1120:.*]] = getelementptr inbounds float, float* %[[VAL_1119]], i32 %[[VAL_1095]]
 // CHECK:         store float %[[VAL_1118]], float* %[[VAL_1120]], align 4
-// CHECK:         %[[VAL_1121:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
+// CHECK:         %[[VAL_1121:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
 // CHECK:         %[[VAL_1122:.*]] = getelementptr inbounds float, float* %[[VAL_1121]], i32 %[[VAL_1099]]
 // CHECK:         %[[VAL_1123:.*]] = load float, float* %[[VAL_1122]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1124:.*]] = call float @__nv_rsqrtf(float %[[VAL_1123]])
-// CHECK:         %[[VAL_1125:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
+// CHECK:         %[[VAL_1125:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
 // CHECK:         %[[VAL_1126:.*]] = getelementptr inbounds float, float* %[[VAL_1125]], i32 %[[VAL_1099]]
 // CHECK:         store float %[[VAL_1124]], float* %[[VAL_1126]], align 4
-// CHECK:         %[[VAL_1127:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
+// CHECK:         %[[VAL_1127:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
 // CHECK:         %[[VAL_1128:.*]] = getelementptr inbounds float, float* %[[VAL_1127]], i32 %[[VAL_1103]]
 // CHECK:         %[[VAL_1129:.*]] = load float, float* %[[VAL_1128]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1130:.*]] = call float @__nv_rsqrtf(float %[[VAL_1129]])
-// CHECK:         %[[VAL_1131:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
+// CHECK:         %[[VAL_1131:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
 // CHECK:         %[[VAL_1132:.*]] = getelementptr inbounds float, float* %[[VAL_1131]], i32 %[[VAL_1103]]
 // CHECK:         store float %[[VAL_1130]], float* %[[VAL_1132]], align 4
-// CHECK:         %[[VAL_1133:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
+// CHECK:         %[[VAL_1133:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
 // CHECK:         %[[VAL_1134:.*]] = getelementptr inbounds float, float* %[[VAL_1133]], i32 %[[VAL_1107]]
 // CHECK:         %[[VAL_1135:.*]] = load float, float* %[[VAL_1134]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1136:.*]] = call float @__nv_rsqrtf(float %[[VAL_1135]])
-// CHECK:         %[[VAL_1137:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1086]] to float*
+// CHECK:         %[[VAL_1137:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1089]] to float*
 // CHECK:         %[[VAL_1138:.*]] = getelementptr inbounds float, float* %[[VAL_1137]], i32 %[[VAL_1107]]
 // CHECK:         store float %[[VAL_1136]], float* %[[VAL_1138]], align 4
 // CHECK:         br label %[[VAL_1113]]
@@ -1261,9 +1261,9 @@
 // CHECK:         %[[VAL_1144:.*]] = bitcast i8* %[[VAL_1142]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1145:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1146:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1147:.*]] = mul nuw nsw i32 %[[VAL_1145]], 128
+// CHECK:         %[[VAL_1147:.*]] = mul nuw nsw i32 %[[VAL_1145]], 256
 // CHECK:         %[[VAL_1148:.*]] = add nuw nsw i32 %[[VAL_1147]], %[[VAL_1146]]
-// CHECK:         %[[VAL_1149:.*]] = icmp ult i32 %[[VAL_1148]], 163840
+// CHECK:         %[[VAL_1149:.*]] = icmp ult i32 %[[VAL_1148]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1149]])
 // CHECK:         %[[VAL_1150:.*]] = mul nuw nsw i32 %[[VAL_1148]], 4
 // CHECK:         %[[VAL_1151:.*]] = udiv i32 %[[VAL_1150]], 1
@@ -1286,7 +1286,7 @@
 // CHECK:       r22.in_bounds-after:                              ; preds = %[[VAL_1167]], %[[VAL_1169:.*]]
 // CHECK:         ret void
 // CHECK:       r22.in_bounds-true:                               ; preds = %[[VAL_1169]]
-// CHECK:         %[[VAL_1170:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
+// CHECK:         %[[VAL_1170:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
 // CHECK:         %[[VAL_1171:.*]] = getelementptr inbounds float, float* %[[VAL_1170]], i32 %[[VAL_1150]]
 // CHECK:         %[[VAL_1172:.*]] = load float, float* %[[VAL_1171]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1173:.*]] = fcmp one float %[[VAL_1172]], 0.000000e+00
@@ -1294,10 +1294,10 @@
 // CHECK:         %[[VAL_1175:.*]] = call float @llvm.copysign.f32(float %[[VAL_1174]], float %[[VAL_1172]])
 // CHECK:         %[[VAL_1176:.*]] = fcmp uno float %[[VAL_1172]], %[[VAL_1172]]
 // CHECK:         %[[VAL_1177:.*]] = select i1 %[[VAL_1176]], float %[[VAL_1172]], float %[[VAL_1175]]
-// CHECK:         %[[VAL_1178:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
+// CHECK:         %[[VAL_1178:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
 // CHECK:         %[[VAL_1179:.*]] = getelementptr inbounds float, float* %[[VAL_1178]], i32 %[[VAL_1150]]
 // CHECK:         store float %[[VAL_1177]], float* %[[VAL_1179]], align 4
-// CHECK:         %[[VAL_1180:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
+// CHECK:         %[[VAL_1180:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
 // CHECK:         %[[VAL_1181:.*]] = getelementptr inbounds float, float* %[[VAL_1180]], i32 %[[VAL_1154]]
 // CHECK:         %[[VAL_1182:.*]] = load float, float* %[[VAL_1181]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1183:.*]] = fcmp one float %[[VAL_1182]], 0.000000e+00
@@ -1305,10 +1305,10 @@
 // CHECK:         %[[VAL_1185:.*]] = call float @llvm.copysign.f32(float %[[VAL_1184]], float %[[VAL_1182]])
 // CHECK:         %[[VAL_1186:.*]] = fcmp uno float %[[VAL_1182]], %[[VAL_1182]]
 // CHECK:         %[[VAL_1187:.*]] = select i1 %[[VAL_1186]], float %[[VAL_1182]], float %[[VAL_1185]]
-// CHECK:         %[[VAL_1188:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
+// CHECK:         %[[VAL_1188:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
 // CHECK:         %[[VAL_1189:.*]] = getelementptr inbounds float, float* %[[VAL_1188]], i32 %[[VAL_1154]]
 // CHECK:         store float %[[VAL_1187]], float* %[[VAL_1189]], align 4
-// CHECK:         %[[VAL_1190:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
+// CHECK:         %[[VAL_1190:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
 // CHECK:         %[[VAL_1191:.*]] = getelementptr inbounds float, float* %[[VAL_1190]], i32 %[[VAL_1158]]
 // CHECK:         %[[VAL_1192:.*]] = load float, float* %[[VAL_1191]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1193:.*]] = fcmp one float %[[VAL_1192]], 0.000000e+00
@@ -1316,10 +1316,10 @@
 // CHECK:         %[[VAL_1195:.*]] = call float @llvm.copysign.f32(float %[[VAL_1194]], float %[[VAL_1192]])
 // CHECK:         %[[VAL_1196:.*]] = fcmp uno float %[[VAL_1192]], %[[VAL_1192]]
 // CHECK:         %[[VAL_1197:.*]] = select i1 %[[VAL_1196]], float %[[VAL_1192]], float %[[VAL_1195]]
-// CHECK:         %[[VAL_1198:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
+// CHECK:         %[[VAL_1198:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
 // CHECK:         %[[VAL_1199:.*]] = getelementptr inbounds float, float* %[[VAL_1198]], i32 %[[VAL_1158]]
 // CHECK:         store float %[[VAL_1197]], float* %[[VAL_1199]], align 4
-// CHECK:         %[[VAL_1200:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
+// CHECK:         %[[VAL_1200:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
 // CHECK:         %[[VAL_1201:.*]] = getelementptr inbounds float, float* %[[VAL_1200]], i32 %[[VAL_1162]]
 // CHECK:         %[[VAL_1202:.*]] = load float, float* %[[VAL_1201]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1203:.*]] = fcmp one float %[[VAL_1202]], 0.000000e+00
@@ -1327,7 +1327,7 @@
 // CHECK:         %[[VAL_1205:.*]] = call float @llvm.copysign.f32(float %[[VAL_1204]], float %[[VAL_1202]])
 // CHECK:         %[[VAL_1206:.*]] = fcmp uno float %[[VAL_1202]], %[[VAL_1202]]
 // CHECK:         %[[VAL_1207:.*]] = select i1 %[[VAL_1206]], float %[[VAL_1202]], float %[[VAL_1205]]
-// CHECK:         %[[VAL_1208:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1141]] to float*
+// CHECK:         %[[VAL_1208:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1144]] to float*
 // CHECK:         %[[VAL_1209:.*]] = getelementptr inbounds float, float* %[[VAL_1208]], i32 %[[VAL_1162]]
 // CHECK:         store float %[[VAL_1207]], float* %[[VAL_1209]], align 4
 // CHECK:         br label %[[VAL_1168]]
@@ -1337,10 +1337,10 @@
 // CHECK:         %[[VAL_1213:.*]] = getelementptr inbounds i8, i8* %[[VAL_1214:.*]], i64 0
 // CHECK:         %[[VAL_1215:.*]] = bitcast i8* %[[VAL_1213]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1216:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
-// CHECK:         %[[VAL_1217:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1218:.*]] = mul nuw nsw i32 %[[VAL_1216]], 128
+// CHECK:         %[[VAL_1217:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !93
+// CHECK:         %[[VAL_1218:.*]] = mul nuw nsw i32 %[[VAL_1216]], 1024
 // CHECK:         %[[VAL_1219:.*]] = add nuw nsw i32 %[[VAL_1218]], %[[VAL_1217]]
-// CHECK:         %[[VAL_1220:.*]] = icmp ult i32 %[[VAL_1219]], 163840
+// CHECK:         %[[VAL_1220:.*]] = icmp ult i32 %[[VAL_1219]], 20480
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1220]])
 // CHECK:         %[[VAL_1221:.*]] = udiv i32 %[[VAL_1219]], 1
 // CHECK:         %[[VAL_1222:.*]] = urem i32 %[[VAL_1221]], 200
@@ -1350,11 +1350,11 @@
 // CHECK:       r23.in_bounds-after:                              ; preds = %[[VAL_1225]], %[[VAL_1227:.*]]
 // CHECK:         ret void
 // CHECK:       r23.in_bounds-true:                               ; preds = %[[VAL_1227]]
-// CHECK:         %[[VAL_1228:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1215]] to float*
+// CHECK:         %[[VAL_1228:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1212]] to float*
 // CHECK:         %[[VAL_1229:.*]] = getelementptr inbounds float, float* %[[VAL_1228]], i32 %[[VAL_1219]]
 // CHECK:         %[[VAL_1230:.*]] = load float, float* %[[VAL_1229]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1231:.*]] = call float @__nv_sinf(float %[[VAL_1230]])
-// CHECK:         %[[VAL_1232:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1212]] to float*
+// CHECK:         %[[VAL_1232:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1215]] to float*
 // CHECK:         %[[VAL_1233:.*]] = getelementptr inbounds float, float* %[[VAL_1232]], i32 %[[VAL_1219]]
 // CHECK:         store float %[[VAL_1231]], float* %[[VAL_1233]], align 4
 // CHECK:         br label %[[VAL_1226]]
@@ -1365,9 +1365,9 @@
 // CHECK:         %[[VAL_1239:.*]] = bitcast i8* %[[VAL_1237]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1240:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1241:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1242:.*]] = mul nuw nsw i32 %[[VAL_1240]], 128
+// CHECK:         %[[VAL_1242:.*]] = mul nuw nsw i32 %[[VAL_1240]], 256
 // CHECK:         %[[VAL_1243:.*]] = add nuw nsw i32 %[[VAL_1242]], %[[VAL_1241]]
-// CHECK:         %[[VAL_1244:.*]] = icmp ult i32 %[[VAL_1243]], 163840
+// CHECK:         %[[VAL_1244:.*]] = icmp ult i32 %[[VAL_1243]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1244]])
 // CHECK:         %[[VAL_1245:.*]] = mul nuw nsw i32 %[[VAL_1243]], 4
 // CHECK:         %[[VAL_1246:.*]] = udiv i32 %[[VAL_1245]], 1
@@ -1390,32 +1390,32 @@
 // CHECK:       r24.in_bounds-after:                              ; preds = %[[VAL_1262]], %[[VAL_1264:.*]]
 // CHECK:         ret void
 // CHECK:       r24.in_bounds-true:                               ; preds = %[[VAL_1264]]
-// CHECK:         %[[VAL_1265:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
+// CHECK:         %[[VAL_1265:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
 // CHECK:         %[[VAL_1266:.*]] = getelementptr inbounds float, float* %[[VAL_1265]], i32 %[[VAL_1245]]
 // CHECK:         %[[VAL_1267:.*]] = load float, float* %[[VAL_1266]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1268:.*]] = call float @__nv_sqrtf(float %[[VAL_1267]])
-// CHECK:         %[[VAL_1269:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
+// CHECK:         %[[VAL_1269:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
 // CHECK:         %[[VAL_1270:.*]] = getelementptr inbounds float, float* %[[VAL_1269]], i32 %[[VAL_1245]]
 // CHECK:         store float %[[VAL_1268]], float* %[[VAL_1270]], align 4
-// CHECK:         %[[VAL_1271:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
+// CHECK:         %[[VAL_1271:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
 // CHECK:         %[[VAL_1272:.*]] = getelementptr inbounds float, float* %[[VAL_1271]], i32 %[[VAL_1249]]
 // CHECK:         %[[VAL_1273:.*]] = load float, float* %[[VAL_1272]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1274:.*]] = call float @__nv_sqrtf(float %[[VAL_1273]])
-// CHECK:         %[[VAL_1275:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
+// CHECK:         %[[VAL_1275:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
 // CHECK:         %[[VAL_1276:.*]] = getelementptr inbounds float, float* %[[VAL_1275]], i32 %[[VAL_1249]]
 // CHECK:         store float %[[VAL_1274]], float* %[[VAL_1276]], align 4
-// CHECK:         %[[VAL_1277:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
+// CHECK:         %[[VAL_1277:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
 // CHECK:         %[[VAL_1278:.*]] = getelementptr inbounds float, float* %[[VAL_1277]], i32 %[[VAL_1253]]
 // CHECK:         %[[VAL_1279:.*]] = load float, float* %[[VAL_1278]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1280:.*]] = call float @__nv_sqrtf(float %[[VAL_1279]])
-// CHECK:         %[[VAL_1281:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
+// CHECK:         %[[VAL_1281:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
 // CHECK:         %[[VAL_1282:.*]] = getelementptr inbounds float, float* %[[VAL_1281]], i32 %[[VAL_1253]]
 // CHECK:         store float %[[VAL_1280]], float* %[[VAL_1282]], align 4
-// CHECK:         %[[VAL_1283:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
+// CHECK:         %[[VAL_1283:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
 // CHECK:         %[[VAL_1284:.*]] = getelementptr inbounds float, float* %[[VAL_1283]], i32 %[[VAL_1257]]
 // CHECK:         %[[VAL_1285:.*]] = load float, float* %[[VAL_1284]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1286:.*]] = call float @__nv_sqrtf(float %[[VAL_1285]])
-// CHECK:         %[[VAL_1287:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1236]] to float*
+// CHECK:         %[[VAL_1287:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1239]] to float*
 // CHECK:         %[[VAL_1288:.*]] = getelementptr inbounds float, float* %[[VAL_1287]], i32 %[[VAL_1257]]
 // CHECK:         store float %[[VAL_1286]], float* %[[VAL_1288]], align 4
 // CHECK:         br label %[[VAL_1263]]
@@ -1426,9 +1426,9 @@
 // CHECK:         %[[VAL_1294:.*]] = bitcast i8* %[[VAL_1292]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1295:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1296:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1297:.*]] = mul nuw nsw i32 %[[VAL_1295]], 128
+// CHECK:         %[[VAL_1297:.*]] = mul nuw nsw i32 %[[VAL_1295]], 256
 // CHECK:         %[[VAL_1298:.*]] = add nuw nsw i32 %[[VAL_1297]], %[[VAL_1296]]
-// CHECK:         %[[VAL_1299:.*]] = icmp ult i32 %[[VAL_1298]], 163840
+// CHECK:         %[[VAL_1299:.*]] = icmp ult i32 %[[VAL_1298]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1299]])
 // CHECK:         %[[VAL_1300:.*]] = mul nuw nsw i32 %[[VAL_1298]], 4
 // CHECK:         %[[VAL_1301:.*]] = udiv i32 %[[VAL_1300]], 1
@@ -1451,40 +1451,40 @@
 // CHECK:       r25.in_bounds-after:                              ; preds = %[[VAL_1317]], %[[VAL_1319:.*]]
 // CHECK:         ret void
 // CHECK:       r25.in_bounds-true:                               ; preds = %[[VAL_1319]]
-// CHECK:         %[[VAL_1320:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
+// CHECK:         %[[VAL_1320:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
 // CHECK:         %[[VAL_1321:.*]] = getelementptr inbounds float, float* %[[VAL_1320]], i32 %[[VAL_1300]]
 // CHECK:         %[[VAL_1322:.*]] = load float, float* %[[VAL_1321]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1323:.*]] = call float @llvm.fabs.f32(float %[[VAL_1322]])
 // CHECK:         %[[VAL_1324:.*]] = call float @__nv_powf(float %[[VAL_1323]], float 0x3FD5555560000000)
 // CHECK:         %[[VAL_1325:.*]] = call float @llvm.copysign.f32(float %[[VAL_1324]], float %[[VAL_1322]])
-// CHECK:         %[[VAL_1326:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
+// CHECK:         %[[VAL_1326:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
 // CHECK:         %[[VAL_1327:.*]] = getelementptr inbounds float, float* %[[VAL_1326]], i32 %[[VAL_1300]]
 // CHECK:         store float %[[VAL_1325]], float* %[[VAL_1327]], align 4
-// CHECK:         %[[VAL_1328:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
+// CHECK:         %[[VAL_1328:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
 // CHECK:         %[[VAL_1329:.*]] = getelementptr inbounds float, float* %[[VAL_1328]], i32 %[[VAL_1304]]
 // CHECK:         %[[VAL_1330:.*]] = load float, float* %[[VAL_1329]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1331:.*]] = call float @llvm.fabs.f32(float %[[VAL_1330]])
 // CHECK:         %[[VAL_1332:.*]] = call float @__nv_powf(float %[[VAL_1331]], float 0x3FD5555560000000)
 // CHECK:         %[[VAL_1333:.*]] = call float @llvm.copysign.f32(float %[[VAL_1332]], float %[[VAL_1330]])
-// CHECK:         %[[VAL_1334:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
+// CHECK:         %[[VAL_1334:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
 // CHECK:         %[[VAL_1335:.*]] = getelementptr inbounds float, float* %[[VAL_1334]], i32 %[[VAL_1304]]
 // CHECK:         store float %[[VAL_1333]], float* %[[VAL_1335]], align 4
-// CHECK:         %[[VAL_1336:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
+// CHECK:         %[[VAL_1336:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
 // CHECK:         %[[VAL_1337:.*]] = getelementptr inbounds float, float* %[[VAL_1336]], i32 %[[VAL_1308]]
 // CHECK:         %[[VAL_1338:.*]] = load float, float* %[[VAL_1337]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1339:.*]] = call float @llvm.fabs.f32(float %[[VAL_1338]])
 // CHECK:         %[[VAL_1340:.*]] = call float @__nv_powf(float %[[VAL_1339]], float 0x3FD5555560000000)
 // CHECK:         %[[VAL_1341:.*]] = call float @llvm.copysign.f32(float %[[VAL_1340]], float %[[VAL_1338]])
-// CHECK:         %[[VAL_1342:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
+// CHECK:         %[[VAL_1342:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
 // CHECK:         %[[VAL_1343:.*]] = getelementptr inbounds float, float* %[[VAL_1342]], i32 %[[VAL_1308]]
 // CHECK:         store float %[[VAL_1341]], float* %[[VAL_1343]], align 4
-// CHECK:         %[[VAL_1344:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
+// CHECK:         %[[VAL_1344:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
 // CHECK:         %[[VAL_1345:.*]] = getelementptr inbounds float, float* %[[VAL_1344]], i32 %[[VAL_1312]]
 // CHECK:         %[[VAL_1346:.*]] = load float, float* %[[VAL_1345]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1347:.*]] = call float @llvm.fabs.f32(float %[[VAL_1346]])
 // CHECK:         %[[VAL_1348:.*]] = call float @__nv_powf(float %[[VAL_1347]], float 0x3FD5555560000000)
 // CHECK:         %[[VAL_1349:.*]] = call float @llvm.copysign.f32(float %[[VAL_1348]], float %[[VAL_1346]])
-// CHECK:         %[[VAL_1350:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1291]] to float*
+// CHECK:         %[[VAL_1350:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1294]] to float*
 // CHECK:         %[[VAL_1351:.*]] = getelementptr inbounds float, float* %[[VAL_1350]], i32 %[[VAL_1312]]
 // CHECK:         store float %[[VAL_1349]], float* %[[VAL_1351]], align 4
 // CHECK:         br label %[[VAL_1318]]
@@ -1495,9 +1495,9 @@
 // CHECK:         %[[VAL_1357:.*]] = bitcast i8* %[[VAL_1355]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1358:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1359:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1360:.*]] = mul nuw nsw i32 %[[VAL_1358]], 128
+// CHECK:         %[[VAL_1360:.*]] = mul nuw nsw i32 %[[VAL_1358]], 256
 // CHECK:         %[[VAL_1361:.*]] = add nuw nsw i32 %[[VAL_1360]], %[[VAL_1359]]
-// CHECK:         %[[VAL_1362:.*]] = icmp ult i32 %[[VAL_1361]], 163840
+// CHECK:         %[[VAL_1362:.*]] = icmp ult i32 %[[VAL_1361]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1362]])
 // CHECK:         %[[VAL_1363:.*]] = mul nuw nsw i32 %[[VAL_1361]], 4
 // CHECK:         %[[VAL_1364:.*]] = udiv i32 %[[VAL_1363]], 1
@@ -1520,7 +1520,7 @@
 // CHECK:       r26.in_bounds-after:                              ; preds = %[[VAL_1380]], %[[VAL_1382:.*]]
 // CHECK:         ret void
 // CHECK:       r26.in_bounds-true:                               ; preds = %[[VAL_1382]]
-// CHECK:         %[[VAL_1383:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
+// CHECK:         %[[VAL_1383:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
 // CHECK:         %[[VAL_1384:.*]] = getelementptr inbounds float, float* %[[VAL_1383]], i32 %[[VAL_1363]]
 // CHECK:         %[[VAL_1385:.*]] = load float, float* %[[VAL_1384]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1386:.*]] = call float @llvm.fabs.f32(float %[[VAL_1385]])
@@ -1555,10 +1555,10 @@
 // CHECK:         %[[VAL_1415:.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float %[[VAL_1385]])
 // CHECK:         %[[VAL_1416:.*]] = fcmp ult float %[[VAL_1386]], 2.000000e+01
 // CHECK:         %[[VAL_1417:.*]] = select i1 %[[VAL_1416]], float %[[VAL_1414]], float %[[VAL_1415]]
-// CHECK:         %[[VAL_1418:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
+// CHECK:         %[[VAL_1418:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
 // CHECK:         %[[VAL_1419:.*]] = getelementptr inbounds float, float* %[[VAL_1418]], i32 %[[VAL_1363]]
 // CHECK:         store float %[[VAL_1417]], float* %[[VAL_1419]], align 4
-// CHECK:         %[[VAL_1420:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
+// CHECK:         %[[VAL_1420:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
 // CHECK:         %[[VAL_1421:.*]] = getelementptr inbounds float, float* %[[VAL_1420]], i32 %[[VAL_1367]]
 // CHECK:         %[[VAL_1422:.*]] = load float, float* %[[VAL_1421]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1423:.*]] = call float @llvm.fabs.f32(float %[[VAL_1422]])
@@ -1593,10 +1593,10 @@
 // CHECK:         %[[VAL_1452:.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float %[[VAL_1422]])
 // CHECK:         %[[VAL_1453:.*]] = fcmp ult float %[[VAL_1423]], 2.000000e+01
 // CHECK:         %[[VAL_1454:.*]] = select i1 %[[VAL_1453]], float %[[VAL_1451]], float %[[VAL_1452]]
-// CHECK:         %[[VAL_1455:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
+// CHECK:         %[[VAL_1455:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
 // CHECK:         %[[VAL_1456:.*]] = getelementptr inbounds float, float* %[[VAL_1455]], i32 %[[VAL_1367]]
 // CHECK:         store float %[[VAL_1454]], float* %[[VAL_1456]], align 4
-// CHECK:         %[[VAL_1457:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
+// CHECK:         %[[VAL_1457:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
 // CHECK:         %[[VAL_1458:.*]] = getelementptr inbounds float, float* %[[VAL_1457]], i32 %[[VAL_1371]]
 // CHECK:         %[[VAL_1459:.*]] = load float, float* %[[VAL_1458]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1460:.*]] = call float @llvm.fabs.f32(float %[[VAL_1459]])
@@ -1631,10 +1631,10 @@
 // CHECK:         %[[VAL_1489:.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float %[[VAL_1459]])
 // CHECK:         %[[VAL_1490:.*]] = fcmp ult float %[[VAL_1460]], 2.000000e+01
 // CHECK:         %[[VAL_1491:.*]] = select i1 %[[VAL_1490]], float %[[VAL_1488]], float %[[VAL_1489]]
-// CHECK:         %[[VAL_1492:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
+// CHECK:         %[[VAL_1492:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
 // CHECK:         %[[VAL_1493:.*]] = getelementptr inbounds float, float* %[[VAL_1492]], i32 %[[VAL_1371]]
 // CHECK:         store float %[[VAL_1491]], float* %[[VAL_1493]], align 4
-// CHECK:         %[[VAL_1494:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
+// CHECK:         %[[VAL_1494:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
 // CHECK:         %[[VAL_1495:.*]] = getelementptr inbounds float, float* %[[VAL_1494]], i32 %[[VAL_1375]]
 // CHECK:         %[[VAL_1496:.*]] = load float, float* %[[VAL_1495]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1497:.*]] = call float @llvm.fabs.f32(float %[[VAL_1496]])
@@ -1669,7 +1669,7 @@
 // CHECK:         %[[VAL_1526:.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float %[[VAL_1496]])
 // CHECK:         %[[VAL_1527:.*]] = fcmp ult float %[[VAL_1497]], 2.000000e+01
 // CHECK:         %[[VAL_1528:.*]] = select i1 %[[VAL_1527]], float %[[VAL_1525]], float %[[VAL_1526]]
-// CHECK:         %[[VAL_1529:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1354]] to float*
+// CHECK:         %[[VAL_1529:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1357]] to float*
 // CHECK:         %[[VAL_1530:.*]] = getelementptr inbounds float, float* %[[VAL_1529]], i32 %[[VAL_1375]]
 // CHECK:         store float %[[VAL_1528]], float* %[[VAL_1530]], align 4
 // CHECK:         br label %[[VAL_1381]]
@@ -1682,9 +1682,9 @@
 // CHECK:         %[[VAL_1539:.*]] = bitcast i8* %[[VAL_1537]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1540:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1541:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1542:.*]] = mul nuw nsw i32 %[[VAL_1540]], 128
+// CHECK:         %[[VAL_1542:.*]] = mul nuw nsw i32 %[[VAL_1540]], 256
 // CHECK:         %[[VAL_1543:.*]] = add nuw nsw i32 %[[VAL_1542]], %[[VAL_1541]]
-// CHECK:         %[[VAL_1544:.*]] = icmp ult i32 %[[VAL_1543]], 163840
+// CHECK:         %[[VAL_1544:.*]] = icmp ult i32 %[[VAL_1543]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1544]])
 // CHECK:         %[[VAL_1545:.*]] = mul nuw nsw i32 %[[VAL_1543]], 4
 // CHECK:         %[[VAL_1546:.*]] = udiv i32 %[[VAL_1545]], 1
@@ -1707,44 +1707,44 @@
 // CHECK:       r27.in_bounds-after:                              ; preds = %[[VAL_1562]], %[[VAL_1564:.*]]
 // CHECK:         ret void
 // CHECK:       r27.in_bounds-true:                               ; preds = %[[VAL_1564]]
-// CHECK:         %[[VAL_1565:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
+// CHECK:         %[[VAL_1565:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
 // CHECK:         %[[VAL_1566:.*]] = getelementptr inbounds float, float* %[[VAL_1565]], i32 %[[VAL_1545]]
 // CHECK:         %[[VAL_1567:.*]] = load float, float* %[[VAL_1566]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1568:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
+// CHECK:         %[[VAL_1568:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
 // CHECK:         %[[VAL_1569:.*]] = getelementptr inbounds float, float* %[[VAL_1568]], i32 %[[VAL_1545]]
 // CHECK:         %[[VAL_1570:.*]] = load float, float* %[[VAL_1569]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1571:.*]] = fadd float %[[VAL_1567]], %[[VAL_1570]]
-// CHECK:         %[[VAL_1572:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
+// CHECK:         %[[VAL_1572:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
 // CHECK:         %[[VAL_1573:.*]] = getelementptr inbounds float, float* %[[VAL_1572]], i32 %[[VAL_1545]]
 // CHECK:         store float %[[VAL_1571]], float* %[[VAL_1573]], align 4
-// CHECK:         %[[VAL_1574:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
+// CHECK:         %[[VAL_1574:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
 // CHECK:         %[[VAL_1575:.*]] = getelementptr inbounds float, float* %[[VAL_1574]], i32 %[[VAL_1549]]
 // CHECK:         %[[VAL_1576:.*]] = load float, float* %[[VAL_1575]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1577:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
+// CHECK:         %[[VAL_1577:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
 // CHECK:         %[[VAL_1578:.*]] = getelementptr inbounds float, float* %[[VAL_1577]], i32 %[[VAL_1549]]
 // CHECK:         %[[VAL_1579:.*]] = load float, float* %[[VAL_1578]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1580:.*]] = fadd float %[[VAL_1576]], %[[VAL_1579]]
-// CHECK:         %[[VAL_1581:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
+// CHECK:         %[[VAL_1581:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
 // CHECK:         %[[VAL_1582:.*]] = getelementptr inbounds float, float* %[[VAL_1581]], i32 %[[VAL_1549]]
 // CHECK:         store float %[[VAL_1580]], float* %[[VAL_1582]], align 4
-// CHECK:         %[[VAL_1583:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
+// CHECK:         %[[VAL_1583:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
 // CHECK:         %[[VAL_1584:.*]] = getelementptr inbounds float, float* %[[VAL_1583]], i32 %[[VAL_1553]]
 // CHECK:         %[[VAL_1585:.*]] = load float, float* %[[VAL_1584]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1586:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
+// CHECK:         %[[VAL_1586:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
 // CHECK:         %[[VAL_1587:.*]] = getelementptr inbounds float, float* %[[VAL_1586]], i32 %[[VAL_1553]]
 // CHECK:         %[[VAL_1588:.*]] = load float, float* %[[VAL_1587]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1589:.*]] = fadd float %[[VAL_1585]], %[[VAL_1588]]
-// CHECK:         %[[VAL_1590:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
+// CHECK:         %[[VAL_1590:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
 // CHECK:         %[[VAL_1591:.*]] = getelementptr inbounds float, float* %[[VAL_1590]], i32 %[[VAL_1553]]
 // CHECK:         store float %[[VAL_1589]], float* %[[VAL_1591]], align 4
-// CHECK:         %[[VAL_1592:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
+// CHECK:         %[[VAL_1592:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
 // CHECK:         %[[VAL_1593:.*]] = getelementptr inbounds float, float* %[[VAL_1592]], i32 %[[VAL_1557]]
 // CHECK:         %[[VAL_1594:.*]] = load float, float* %[[VAL_1593]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1595:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
+// CHECK:         %[[VAL_1595:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1536]] to float*
 // CHECK:         %[[VAL_1596:.*]] = getelementptr inbounds float, float* %[[VAL_1595]], i32 %[[VAL_1557]]
 // CHECK:         %[[VAL_1597:.*]] = load float, float* %[[VAL_1596]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1598:.*]] = fadd float %[[VAL_1594]], %[[VAL_1597]]
-// CHECK:         %[[VAL_1599:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1533]] to float*
+// CHECK:         %[[VAL_1599:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1539]] to float*
 // CHECK:         %[[VAL_1600:.*]] = getelementptr inbounds float, float* %[[VAL_1599]], i32 %[[VAL_1557]]
 // CHECK:         store float %[[VAL_1598]], float* %[[VAL_1600]], align 4
 // CHECK:         br label %[[VAL_1563]]
@@ -1756,10 +1756,10 @@
 // CHECK:         %[[VAL_1607:.*]] = getelementptr inbounds i8, i8* %[[VAL_1608:.*]], i64 0
 // CHECK:         %[[VAL_1609:.*]] = bitcast i8* %[[VAL_1607]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1610:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
-// CHECK:         %[[VAL_1611:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1612:.*]] = mul nuw nsw i32 %[[VAL_1610]], 128
+// CHECK:         %[[VAL_1611:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !93
+// CHECK:         %[[VAL_1612:.*]] = mul nuw nsw i32 %[[VAL_1610]], 1024
 // CHECK:         %[[VAL_1613:.*]] = add nuw nsw i32 %[[VAL_1612]], %[[VAL_1611]]
-// CHECK:         %[[VAL_1614:.*]] = icmp ult i32 %[[VAL_1613]], 163840
+// CHECK:         %[[VAL_1614:.*]] = icmp ult i32 %[[VAL_1613]], 20480
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1614]])
 // CHECK:         %[[VAL_1615:.*]] = udiv i32 %[[VAL_1613]], 1
 // CHECK:         %[[VAL_1616:.*]] = urem i32 %[[VAL_1615]], 200
@@ -1769,29 +1769,29 @@
 // CHECK:       r28.in_bounds-after:                              ; preds = %[[VAL_1619]], %[[VAL_1621:.*]]
 // CHECK:         ret void
 // CHECK:       r28.in_bounds-true:                               ; preds = %[[VAL_1621]]
-// CHECK:         %[[VAL_1622:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1606]] to float*
+// CHECK:         %[[VAL_1622:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1603]] to float*
 // CHECK:         %[[VAL_1623:.*]] = getelementptr inbounds float, float* %[[VAL_1622]], i32 %[[VAL_1613]]
 // CHECK:         %[[VAL_1624:.*]] = load float, float* %[[VAL_1623]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1625:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1609]] to float*
+// CHECK:         %[[VAL_1625:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1606]] to float*
 // CHECK:         %[[VAL_1626:.*]] = getelementptr inbounds float, float* %[[VAL_1625]], i32 %[[VAL_1613]]
 // CHECK:         %[[VAL_1627:.*]] = load float, float* %[[VAL_1626]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1628:.*]] = call float @__nv_atan2f(float %[[VAL_1624]], float %[[VAL_1627]])
-// CHECK:         %[[VAL_1629:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1603]] to float*
+// CHECK:         %[[VAL_1629:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1609]] to float*
 // CHECK:         %[[VAL_1630:.*]] = getelementptr inbounds float, float* %[[VAL_1629]], i32 %[[VAL_1613]]
 // CHECK:         store float %[[VAL_1628]], float* %[[VAL_1630]], align 4
 // CHECK:         br label %[[VAL_1620]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_1631:.*]] = getelementptr inbounds i8, i8* %[[VAL_1632:.*]], i64 0
-// CHECK:         %[[VAL_1633:.*]] = bitcast i8* %[[VAL_1631]] to [100 x [200 x i8]]*
+// CHECK:         %[[VAL_1633:.*]] = bitcast i8* %[[VAL_1631]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1634:.*]] = getelementptr inbounds i8, i8* %[[VAL_1635:.*]], i64 0
 // CHECK:         %[[VAL_1636:.*]] = bitcast i8* %[[VAL_1634]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1637:.*]] = getelementptr inbounds i8, i8* %[[VAL_1638:.*]], i64 0
-// CHECK:         %[[VAL_1639:.*]] = bitcast i8* %[[VAL_1637]] to [100 x [200 x float]]*
+// CHECK:         %[[VAL_1639:.*]] = bitcast i8* %[[VAL_1637]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_1640:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1641:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1642:.*]] = mul nuw nsw i32 %[[VAL_1640]], 128
+// CHECK:         %[[VAL_1642:.*]] = mul nuw nsw i32 %[[VAL_1640]], 256
 // CHECK:         %[[VAL_1643:.*]] = add nuw nsw i32 %[[VAL_1642]], %[[VAL_1641]]
-// CHECK:         %[[VAL_1644:.*]] = icmp ult i32 %[[VAL_1643]], 163840
+// CHECK:         %[[VAL_1644:.*]] = icmp ult i32 %[[VAL_1643]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1644]])
 // CHECK:         %[[VAL_1645:.*]] = mul nuw nsw i32 %[[VAL_1643]], 4
 // CHECK:         %[[VAL_1646:.*]] = udiv i32 %[[VAL_1645]], 1
@@ -1814,63 +1814,63 @@
 // CHECK:       r29.in_bounds-after:                              ; preds = %[[VAL_1662]], %[[VAL_1664:.*]]
 // CHECK:         ret void
 // CHECK:       r29.in_bounds-true:                               ; preds = %[[VAL_1664]]
-// CHECK:         %[[VAL_1665:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
+// CHECK:         %[[VAL_1665:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1633]] to float*
 // CHECK:         %[[VAL_1666:.*]] = getelementptr inbounds float, float* %[[VAL_1665]], i32 %[[VAL_1645]]
 // CHECK:         %[[VAL_1667:.*]] = load float, float* %[[VAL_1666]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1668:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1639]] to float*
+// CHECK:         %[[VAL_1668:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
 // CHECK:         %[[VAL_1669:.*]] = getelementptr inbounds float, float* %[[VAL_1668]], i32 %[[VAL_1645]]
 // CHECK:         %[[VAL_1670:.*]] = load float, float* %[[VAL_1669]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1671:.*]] = fcmp oeq float %[[VAL_1667]], %[[VAL_1670]]
 // CHECK:         %[[VAL_1672:.*]] = zext i1 %[[VAL_1671]] to i8
-// CHECK:         %[[VAL_1673:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1633]] to i8*
+// CHECK:         %[[VAL_1673:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1639]] to i8*
 // CHECK:         %[[VAL_1674:.*]] = getelementptr inbounds i8, i8* %[[VAL_1673]], i32 %[[VAL_1645]]
 // CHECK:         store i8 %[[VAL_1672]], i8* %[[VAL_1674]], align 1
-// CHECK:         %[[VAL_1675:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
+// CHECK:         %[[VAL_1675:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1633]] to float*
 // CHECK:         %[[VAL_1676:.*]] = getelementptr inbounds float, float* %[[VAL_1675]], i32 %[[VAL_1649]]
 // CHECK:         %[[VAL_1677:.*]] = load float, float* %[[VAL_1676]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1678:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1639]] to float*
+// CHECK:         %[[VAL_1678:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
 // CHECK:         %[[VAL_1679:.*]] = getelementptr inbounds float, float* %[[VAL_1678]], i32 %[[VAL_1649]]
 // CHECK:         %[[VAL_1680:.*]] = load float, float* %[[VAL_1679]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1681:.*]] = fcmp oeq float %[[VAL_1677]], %[[VAL_1680]]
 // CHECK:         %[[VAL_1682:.*]] = zext i1 %[[VAL_1681]] to i8
-// CHECK:         %[[VAL_1683:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1633]] to i8*
+// CHECK:         %[[VAL_1683:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1639]] to i8*
 // CHECK:         %[[VAL_1684:.*]] = getelementptr inbounds i8, i8* %[[VAL_1683]], i32 %[[VAL_1649]]
 // CHECK:         store i8 %[[VAL_1682]], i8* %[[VAL_1684]], align 1
-// CHECK:         %[[VAL_1685:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
+// CHECK:         %[[VAL_1685:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1633]] to float*
 // CHECK:         %[[VAL_1686:.*]] = getelementptr inbounds float, float* %[[VAL_1685]], i32 %[[VAL_1653]]
 // CHECK:         %[[VAL_1687:.*]] = load float, float* %[[VAL_1686]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1688:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1639]] to float*
+// CHECK:         %[[VAL_1688:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
 // CHECK:         %[[VAL_1689:.*]] = getelementptr inbounds float, float* %[[VAL_1688]], i32 %[[VAL_1653]]
 // CHECK:         %[[VAL_1690:.*]] = load float, float* %[[VAL_1689]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1691:.*]] = fcmp oeq float %[[VAL_1687]], %[[VAL_1690]]
 // CHECK:         %[[VAL_1692:.*]] = zext i1 %[[VAL_1691]] to i8
-// CHECK:         %[[VAL_1693:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1633]] to i8*
+// CHECK:         %[[VAL_1693:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1639]] to i8*
 // CHECK:         %[[VAL_1694:.*]] = getelementptr inbounds i8, i8* %[[VAL_1693]], i32 %[[VAL_1653]]
 // CHECK:         store i8 %[[VAL_1692]], i8* %[[VAL_1694]], align 1
-// CHECK:         %[[VAL_1695:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
+// CHECK:         %[[VAL_1695:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1633]] to float*
 // CHECK:         %[[VAL_1696:.*]] = getelementptr inbounds float, float* %[[VAL_1695]], i32 %[[VAL_1657]]
 // CHECK:         %[[VAL_1697:.*]] = load float, float* %[[VAL_1696]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1698:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1639]] to float*
+// CHECK:         %[[VAL_1698:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1636]] to float*
 // CHECK:         %[[VAL_1699:.*]] = getelementptr inbounds float, float* %[[VAL_1698]], i32 %[[VAL_1657]]
 // CHECK:         %[[VAL_1700:.*]] = load float, float* %[[VAL_1699]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1701:.*]] = fcmp oeq float %[[VAL_1697]], %[[VAL_1700]]
 // CHECK:         %[[VAL_1702:.*]] = zext i1 %[[VAL_1701]] to i8
-// CHECK:         %[[VAL_1703:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1633]] to i8*
+// CHECK:         %[[VAL_1703:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_1639]] to i8*
 // CHECK:         %[[VAL_1704:.*]] = getelementptr inbounds i8, i8* %[[VAL_1703]], i32 %[[VAL_1657]]
 // CHECK:         store i8 %[[VAL_1702]], i8* %[[VAL_1704]], align 1
 // CHECK:         br label %[[VAL_1663]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_1705:.*]] = getelementptr inbounds i8, i8* %[[VAL_1706:.*]], i64 0
-// CHECK:         %[[VAL_1707:.*]] = bitcast i8* %[[VAL_1705]] to [100 x [200 x %[[VAL_1708:.*]]]]*
-// CHECK:         %[[VAL_1709:.*]] = getelementptr inbounds i8, i8* %[[VAL_1710:.*]], i64 0
-// CHECK:         %[[VAL_1711:.*]] = bitcast i8* %[[VAL_1709]] to [100 x [200 x float]]*
-// CHECK:         %[[VAL_1712:.*]] = getelementptr inbounds i8, i8* %[[VAL_1713:.*]], i64 0
-// CHECK:         %[[VAL_1714:.*]] = bitcast i8* %[[VAL_1712]] to [100 x [200 x float]]*
+// CHECK:         %[[VAL_1707:.*]] = bitcast i8* %[[VAL_1705]] to [100 x [200 x float]]*
+// CHECK:         %[[VAL_1708:.*]] = getelementptr inbounds i8, i8* %[[VAL_1709:.*]], i64 0
+// CHECK:         %[[VAL_1710:.*]] = bitcast i8* %[[VAL_1708]] to [100 x [200 x float]]*
+// CHECK:         %[[VAL_1711:.*]] = getelementptr inbounds i8, i8* %[[VAL_1712:.*]], i64 0
+// CHECK:         %[[VAL_1713:.*]] = bitcast i8* %[[VAL_1711]] to [100 x [200 x %[[VAL_1714:.*]]]]*
 // CHECK:         %[[VAL_1715:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1716:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1717:.*]] = mul nuw nsw i32 %[[VAL_1715]], 128
+// CHECK:         %[[VAL_1717:.*]] = mul nuw nsw i32 %[[VAL_1715]], 256
 // CHECK:         %[[VAL_1718:.*]] = add nuw nsw i32 %[[VAL_1717]], %[[VAL_1716]]
-// CHECK:         %[[VAL_1719:.*]] = icmp ult i32 %[[VAL_1718]], 163840
+// CHECK:         %[[VAL_1719:.*]] = icmp ult i32 %[[VAL_1718]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1719]])
 // CHECK:         %[[VAL_1720:.*]] = mul nuw nsw i32 %[[VAL_1718]], 4
 // CHECK:         %[[VAL_1721:.*]] = udiv i32 %[[VAL_1720]], 1
@@ -1893,50 +1893,50 @@
 // CHECK:       r30.in_bounds-after:                              ; preds = %[[VAL_1737]], %[[VAL_1739:.*]]
 // CHECK:         ret void
 // CHECK:       r30.in_bounds-true:                               ; preds = %[[VAL_1739]]
-// CHECK:         %[[VAL_1740:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1711]] to float*
+// CHECK:         %[[VAL_1740:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1707]] to float*
 // CHECK:         %[[VAL_1741:.*]] = getelementptr inbounds float, float* %[[VAL_1740]], i32 %[[VAL_1720]]
 // CHECK:         %[[VAL_1742:.*]] = load float, float* %[[VAL_1741]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1743:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1714]] to float*
+// CHECK:         %[[VAL_1743:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1710]] to float*
 // CHECK:         %[[VAL_1744:.*]] = getelementptr inbounds float, float* %[[VAL_1743]], i32 %[[VAL_1720]]
 // CHECK:         %[[VAL_1745:.*]] = load float, float* %[[VAL_1744]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1746:.*]] = insertvalue %[[VAL_1708]] zeroinitializer, float %[[VAL_1742]], 0
-// CHECK:         %[[VAL_1747:.*]] = insertvalue %[[VAL_1708]] %[[VAL_1746]], float %[[VAL_1745]], 1
-// CHECK:         %[[VAL_1748:.*]] = bitcast [100 x [200 x %[[VAL_1708]]]]* %[[VAL_1707]] to %[[VAL_1708]]*
-// CHECK:         %[[VAL_1749:.*]] = getelementptr inbounds %[[VAL_1708]], %[[VAL_1708]]* %[[VAL_1748]], i32 %[[VAL_1720]]
-// CHECK:         store %[[VAL_1708]] %[[VAL_1747]], %[[VAL_1708]]* %[[VAL_1749]], align 1
-// CHECK:         %[[VAL_1750:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1711]] to float*
+// CHECK:         %[[VAL_1746:.*]] = insertvalue %[[VAL_1714]] zeroinitializer, float %[[VAL_1742]], 0
+// CHECK:         %[[VAL_1747:.*]] = insertvalue %[[VAL_1714]] %[[VAL_1746]], float %[[VAL_1745]], 1
+// CHECK:         %[[VAL_1748:.*]] = bitcast [100 x [200 x %[[VAL_1714]]]]* %[[VAL_1713]] to %[[VAL_1714]]*
+// CHECK:         %[[VAL_1749:.*]] = getelementptr inbounds %[[VAL_1714]], %[[VAL_1714]]* %[[VAL_1748]], i32 %[[VAL_1720]]
+// CHECK:         store %[[VAL_1714]] %[[VAL_1747]], %[[VAL_1714]]* %[[VAL_1749]], align 1
+// CHECK:         %[[VAL_1750:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1707]] to float*
 // CHECK:         %[[VAL_1751:.*]] = getelementptr inbounds float, float* %[[VAL_1750]], i32 %[[VAL_1724]]
 // CHECK:         %[[VAL_1752:.*]] = load float, float* %[[VAL_1751]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1753:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1714]] to float*
+// CHECK:         %[[VAL_1753:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1710]] to float*
 // CHECK:         %[[VAL_1754:.*]] = getelementptr inbounds float, float* %[[VAL_1753]], i32 %[[VAL_1724]]
 // CHECK:         %[[VAL_1755:.*]] = load float, float* %[[VAL_1754]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1756:.*]] = insertvalue %[[VAL_1708]] zeroinitializer, float %[[VAL_1752]], 0
-// CHECK:         %[[VAL_1757:.*]] = insertvalue %[[VAL_1708]] %[[VAL_1756]], float %[[VAL_1755]], 1
-// CHECK:         %[[VAL_1758:.*]] = bitcast [100 x [200 x %[[VAL_1708]]]]* %[[VAL_1707]] to %[[VAL_1708]]*
-// CHECK:         %[[VAL_1759:.*]] = getelementptr inbounds %[[VAL_1708]], %[[VAL_1708]]* %[[VAL_1758]], i32 %[[VAL_1724]]
-// CHECK:         store %[[VAL_1708]] %[[VAL_1757]], %[[VAL_1708]]* %[[VAL_1759]], align 1
-// CHECK:         %[[VAL_1760:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1711]] to float*
+// CHECK:         %[[VAL_1756:.*]] = insertvalue %[[VAL_1714]] zeroinitializer, float %[[VAL_1752]], 0
+// CHECK:         %[[VAL_1757:.*]] = insertvalue %[[VAL_1714]] %[[VAL_1756]], float %[[VAL_1755]], 1
+// CHECK:         %[[VAL_1758:.*]] = bitcast [100 x [200 x %[[VAL_1714]]]]* %[[VAL_1713]] to %[[VAL_1714]]*
+// CHECK:         %[[VAL_1759:.*]] = getelementptr inbounds %[[VAL_1714]], %[[VAL_1714]]* %[[VAL_1758]], i32 %[[VAL_1724]]
+// CHECK:         store %[[VAL_1714]] %[[VAL_1757]], %[[VAL_1714]]* %[[VAL_1759]], align 1
+// CHECK:         %[[VAL_1760:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1707]] to float*
 // CHECK:         %[[VAL_1761:.*]] = getelementptr inbounds float, float* %[[VAL_1760]], i32 %[[VAL_1728]]
 // CHECK:         %[[VAL_1762:.*]] = load float, float* %[[VAL_1761]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1763:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1714]] to float*
+// CHECK:         %[[VAL_1763:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1710]] to float*
 // CHECK:         %[[VAL_1764:.*]] = getelementptr inbounds float, float* %[[VAL_1763]], i32 %[[VAL_1728]]
 // CHECK:         %[[VAL_1765:.*]] = load float, float* %[[VAL_1764]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1766:.*]] = insertvalue %[[VAL_1708]] zeroinitializer, float %[[VAL_1762]], 0
-// CHECK:         %[[VAL_1767:.*]] = insertvalue %[[VAL_1708]] %[[VAL_1766]], float %[[VAL_1765]], 1
-// CHECK:         %[[VAL_1768:.*]] = bitcast [100 x [200 x %[[VAL_1708]]]]* %[[VAL_1707]] to %[[VAL_1708]]*
-// CHECK:         %[[VAL_1769:.*]] = getelementptr inbounds %[[VAL_1708]], %[[VAL_1708]]* %[[VAL_1768]], i32 %[[VAL_1728]]
-// CHECK:         store %[[VAL_1708]] %[[VAL_1767]], %[[VAL_1708]]* %[[VAL_1769]], align 1
-// CHECK:         %[[VAL_1770:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1711]] to float*
+// CHECK:         %[[VAL_1766:.*]] = insertvalue %[[VAL_1714]] zeroinitializer, float %[[VAL_1762]], 0
+// CHECK:         %[[VAL_1767:.*]] = insertvalue %[[VAL_1714]] %[[VAL_1766]], float %[[VAL_1765]], 1
+// CHECK:         %[[VAL_1768:.*]] = bitcast [100 x [200 x %[[VAL_1714]]]]* %[[VAL_1713]] to %[[VAL_1714]]*
+// CHECK:         %[[VAL_1769:.*]] = getelementptr inbounds %[[VAL_1714]], %[[VAL_1714]]* %[[VAL_1768]], i32 %[[VAL_1728]]
+// CHECK:         store %[[VAL_1714]] %[[VAL_1767]], %[[VAL_1714]]* %[[VAL_1769]], align 1
+// CHECK:         %[[VAL_1770:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1707]] to float*
 // CHECK:         %[[VAL_1771:.*]] = getelementptr inbounds float, float* %[[VAL_1770]], i32 %[[VAL_1732]]
 // CHECK:         %[[VAL_1772:.*]] = load float, float* %[[VAL_1771]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1773:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1714]] to float*
+// CHECK:         %[[VAL_1773:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1710]] to float*
 // CHECK:         %[[VAL_1774:.*]] = getelementptr inbounds float, float* %[[VAL_1773]], i32 %[[VAL_1732]]
 // CHECK:         %[[VAL_1775:.*]] = load float, float* %[[VAL_1774]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1776:.*]] = insertvalue %[[VAL_1708]] zeroinitializer, float %[[VAL_1772]], 0
-// CHECK:         %[[VAL_1777:.*]] = insertvalue %[[VAL_1708]] %[[VAL_1776]], float %[[VAL_1775]], 1
-// CHECK:         %[[VAL_1778:.*]] = bitcast [100 x [200 x %[[VAL_1708]]]]* %[[VAL_1707]] to %[[VAL_1708]]*
-// CHECK:         %[[VAL_1779:.*]] = getelementptr inbounds %[[VAL_1708]], %[[VAL_1708]]* %[[VAL_1778]], i32 %[[VAL_1732]]
-// CHECK:         store %[[VAL_1708]] %[[VAL_1777]], %[[VAL_1708]]* %[[VAL_1779]], align 1
+// CHECK:         %[[VAL_1776:.*]] = insertvalue %[[VAL_1714]] zeroinitializer, float %[[VAL_1772]], 0
+// CHECK:         %[[VAL_1777:.*]] = insertvalue %[[VAL_1714]] %[[VAL_1776]], float %[[VAL_1775]], 1
+// CHECK:         %[[VAL_1778:.*]] = bitcast [100 x [200 x %[[VAL_1714]]]]* %[[VAL_1713]] to %[[VAL_1714]]*
+// CHECK:         %[[VAL_1779:.*]] = getelementptr inbounds %[[VAL_1714]], %[[VAL_1714]]* %[[VAL_1778]], i32 %[[VAL_1732]]
+// CHECK:         store %[[VAL_1714]] %[[VAL_1777]], %[[VAL_1714]]* %[[VAL_1779]], align 1
 // CHECK:         br label %[[VAL_1738]]
 // CHECK:       entry:
 // CHECK:         %[[VAL_1780:.*]] = getelementptr inbounds i8, i8* %[[VAL_1781:.*]], i64 0
@@ -1947,9 +1947,9 @@
 // CHECK:         %[[VAL_1788:.*]] = bitcast i8* %[[VAL_1786]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1789:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1790:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1791:.*]] = mul nuw nsw i32 %[[VAL_1789]], 128
+// CHECK:         %[[VAL_1791:.*]] = mul nuw nsw i32 %[[VAL_1789]], 256
 // CHECK:         %[[VAL_1792:.*]] = add nuw nsw i32 %[[VAL_1791]], %[[VAL_1790]]
-// CHECK:         %[[VAL_1793:.*]] = icmp ult i32 %[[VAL_1792]], 163840
+// CHECK:         %[[VAL_1793:.*]] = icmp ult i32 %[[VAL_1792]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1793]])
 // CHECK:         %[[VAL_1794:.*]] = mul nuw nsw i32 %[[VAL_1792]], 4
 // CHECK:         %[[VAL_1795:.*]] = udiv i32 %[[VAL_1794]], 1
@@ -1972,44 +1972,44 @@
 // CHECK:       r31.in_bounds-after:                              ; preds = %[[VAL_1811]], %[[VAL_1813:.*]]
 // CHECK:         ret void
 // CHECK:       r31.in_bounds-true:                               ; preds = %[[VAL_1813]]
-// CHECK:         %[[VAL_1814:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
+// CHECK:         %[[VAL_1814:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
 // CHECK:         %[[VAL_1815:.*]] = getelementptr inbounds float, float* %[[VAL_1814]], i32 %[[VAL_1794]]
 // CHECK:         %[[VAL_1816:.*]] = load float, float* %[[VAL_1815]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1817:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
+// CHECK:         %[[VAL_1817:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
 // CHECK:         %[[VAL_1818:.*]] = getelementptr inbounds float, float* %[[VAL_1817]], i32 %[[VAL_1794]]
 // CHECK:         %[[VAL_1819:.*]] = load float, float* %[[VAL_1818]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1820:.*]] = fdiv float %[[VAL_1816]], %[[VAL_1819]]
-// CHECK:         %[[VAL_1821:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
+// CHECK:         %[[VAL_1821:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
 // CHECK:         %[[VAL_1822:.*]] = getelementptr inbounds float, float* %[[VAL_1821]], i32 %[[VAL_1794]]
 // CHECK:         store float %[[VAL_1820]], float* %[[VAL_1822]], align 4
-// CHECK:         %[[VAL_1823:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
+// CHECK:         %[[VAL_1823:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
 // CHECK:         %[[VAL_1824:.*]] = getelementptr inbounds float, float* %[[VAL_1823]], i32 %[[VAL_1798]]
 // CHECK:         %[[VAL_1825:.*]] = load float, float* %[[VAL_1824]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1826:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
+// CHECK:         %[[VAL_1826:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
 // CHECK:         %[[VAL_1827:.*]] = getelementptr inbounds float, float* %[[VAL_1826]], i32 %[[VAL_1798]]
 // CHECK:         %[[VAL_1828:.*]] = load float, float* %[[VAL_1827]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1829:.*]] = fdiv float %[[VAL_1825]], %[[VAL_1828]]
-// CHECK:         %[[VAL_1830:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
+// CHECK:         %[[VAL_1830:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
 // CHECK:         %[[VAL_1831:.*]] = getelementptr inbounds float, float* %[[VAL_1830]], i32 %[[VAL_1798]]
 // CHECK:         store float %[[VAL_1829]], float* %[[VAL_1831]], align 4
-// CHECK:         %[[VAL_1832:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
+// CHECK:         %[[VAL_1832:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
 // CHECK:         %[[VAL_1833:.*]] = getelementptr inbounds float, float* %[[VAL_1832]], i32 %[[VAL_1802]]
 // CHECK:         %[[VAL_1834:.*]] = load float, float* %[[VAL_1833]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1835:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
+// CHECK:         %[[VAL_1835:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
 // CHECK:         %[[VAL_1836:.*]] = getelementptr inbounds float, float* %[[VAL_1835]], i32 %[[VAL_1802]]
 // CHECK:         %[[VAL_1837:.*]] = load float, float* %[[VAL_1836]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1838:.*]] = fdiv float %[[VAL_1834]], %[[VAL_1837]]
-// CHECK:         %[[VAL_1839:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
+// CHECK:         %[[VAL_1839:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
 // CHECK:         %[[VAL_1840:.*]] = getelementptr inbounds float, float* %[[VAL_1839]], i32 %[[VAL_1802]]
 // CHECK:         store float %[[VAL_1838]], float* %[[VAL_1840]], align 4
-// CHECK:         %[[VAL_1841:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
+// CHECK:         %[[VAL_1841:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
 // CHECK:         %[[VAL_1842:.*]] = getelementptr inbounds float, float* %[[VAL_1841]], i32 %[[VAL_1806]]
 // CHECK:         %[[VAL_1843:.*]] = load float, float* %[[VAL_1842]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
+// CHECK:         %[[VAL_1844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1785]] to float*
 // CHECK:         %[[VAL_1845:.*]] = getelementptr inbounds float, float* %[[VAL_1844]], i32 %[[VAL_1806]]
 // CHECK:         %[[VAL_1846:.*]] = load float, float* %[[VAL_1845]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1847:.*]] = fdiv float %[[VAL_1843]], %[[VAL_1846]]
-// CHECK:         %[[VAL_1848:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1782]] to float*
+// CHECK:         %[[VAL_1848:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1788]] to float*
 // CHECK:         %[[VAL_1849:.*]] = getelementptr inbounds float, float* %[[VAL_1848]], i32 %[[VAL_1806]]
 // CHECK:         store float %[[VAL_1847]], float* %[[VAL_1849]], align 4
 // CHECK:         br label %[[VAL_1812]]
@@ -2022,9 +2022,9 @@
 // CHECK:         %[[VAL_1858:.*]] = bitcast i8* %[[VAL_1856]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1859:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1860:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1861:.*]] = mul nuw nsw i32 %[[VAL_1859]], 128
+// CHECK:         %[[VAL_1861:.*]] = mul nuw nsw i32 %[[VAL_1859]], 256
 // CHECK:         %[[VAL_1862:.*]] = add nuw nsw i32 %[[VAL_1861]], %[[VAL_1860]]
-// CHECK:         %[[VAL_1863:.*]] = icmp ult i32 %[[VAL_1862]], 163840
+// CHECK:         %[[VAL_1863:.*]] = icmp ult i32 %[[VAL_1862]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1863]])
 // CHECK:         %[[VAL_1864:.*]] = mul nuw nsw i32 %[[VAL_1862]], 4
 // CHECK:         %[[VAL_1865:.*]] = udiv i32 %[[VAL_1864]], 1
@@ -2047,44 +2047,44 @@
 // CHECK:       r32.in_bounds-after:                              ; preds = %[[VAL_1881]], %[[VAL_1883:.*]]
 // CHECK:         ret void
 // CHECK:       r32.in_bounds-true:                               ; preds = %[[VAL_1883]]
-// CHECK:         %[[VAL_1884:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
+// CHECK:         %[[VAL_1884:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
 // CHECK:         %[[VAL_1885:.*]] = getelementptr inbounds float, float* %[[VAL_1884]], i32 %[[VAL_1864]]
 // CHECK:         %[[VAL_1886:.*]] = load float, float* %[[VAL_1885]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1887:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
+// CHECK:         %[[VAL_1887:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
 // CHECK:         %[[VAL_1888:.*]] = getelementptr inbounds float, float* %[[VAL_1887]], i32 %[[VAL_1864]]
 // CHECK:         %[[VAL_1889:.*]] = load float, float* %[[VAL_1888]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1890:.*]] = call float @llvm.maxnum.f32(float %[[VAL_1886]], float %[[VAL_1889]])
-// CHECK:         %[[VAL_1891:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
+// CHECK:         %[[VAL_1891:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
 // CHECK:         %[[VAL_1892:.*]] = getelementptr inbounds float, float* %[[VAL_1891]], i32 %[[VAL_1864]]
 // CHECK:         store float %[[VAL_1890]], float* %[[VAL_1892]], align 4
-// CHECK:         %[[VAL_1893:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
+// CHECK:         %[[VAL_1893:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
 // CHECK:         %[[VAL_1894:.*]] = getelementptr inbounds float, float* %[[VAL_1893]], i32 %[[VAL_1868]]
 // CHECK:         %[[VAL_1895:.*]] = load float, float* %[[VAL_1894]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1896:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
+// CHECK:         %[[VAL_1896:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
 // CHECK:         %[[VAL_1897:.*]] = getelementptr inbounds float, float* %[[VAL_1896]], i32 %[[VAL_1868]]
 // CHECK:         %[[VAL_1898:.*]] = load float, float* %[[VAL_1897]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1899:.*]] = call float @llvm.maxnum.f32(float %[[VAL_1895]], float %[[VAL_1898]])
-// CHECK:         %[[VAL_1900:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
+// CHECK:         %[[VAL_1900:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
 // CHECK:         %[[VAL_1901:.*]] = getelementptr inbounds float, float* %[[VAL_1900]], i32 %[[VAL_1868]]
 // CHECK:         store float %[[VAL_1899]], float* %[[VAL_1901]], align 4
-// CHECK:         %[[VAL_1902:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
+// CHECK:         %[[VAL_1902:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
 // CHECK:         %[[VAL_1903:.*]] = getelementptr inbounds float, float* %[[VAL_1902]], i32 %[[VAL_1872]]
 // CHECK:         %[[VAL_1904:.*]] = load float, float* %[[VAL_1903]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1905:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
+// CHECK:         %[[VAL_1905:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
 // CHECK:         %[[VAL_1906:.*]] = getelementptr inbounds float, float* %[[VAL_1905]], i32 %[[VAL_1872]]
 // CHECK:         %[[VAL_1907:.*]] = load float, float* %[[VAL_1906]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1908:.*]] = call float @llvm.maxnum.f32(float %[[VAL_1904]], float %[[VAL_1907]])
-// CHECK:         %[[VAL_1909:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
+// CHECK:         %[[VAL_1909:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
 // CHECK:         %[[VAL_1910:.*]] = getelementptr inbounds float, float* %[[VAL_1909]], i32 %[[VAL_1872]]
 // CHECK:         store float %[[VAL_1908]], float* %[[VAL_1910]], align 4
-// CHECK:         %[[VAL_1911:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
+// CHECK:         %[[VAL_1911:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
 // CHECK:         %[[VAL_1912:.*]] = getelementptr inbounds float, float* %[[VAL_1911]], i32 %[[VAL_1876]]
 // CHECK:         %[[VAL_1913:.*]] = load float, float* %[[VAL_1912]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1914:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
+// CHECK:         %[[VAL_1914:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1855]] to float*
 // CHECK:         %[[VAL_1915:.*]] = getelementptr inbounds float, float* %[[VAL_1914]], i32 %[[VAL_1876]]
 // CHECK:         %[[VAL_1916:.*]] = load float, float* %[[VAL_1915]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1917:.*]] = call float @llvm.maxnum.f32(float %[[VAL_1913]], float %[[VAL_1916]])
-// CHECK:         %[[VAL_1918:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1852]] to float*
+// CHECK:         %[[VAL_1918:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1858]] to float*
 // CHECK:         %[[VAL_1919:.*]] = getelementptr inbounds float, float* %[[VAL_1918]], i32 %[[VAL_1876]]
 // CHECK:         store float %[[VAL_1917]], float* %[[VAL_1919]], align 4
 // CHECK:         br label %[[VAL_1882]]
@@ -2097,9 +2097,9 @@
 // CHECK:         %[[VAL_1928:.*]] = bitcast i8* %[[VAL_1926]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1929:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_1930:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_1931:.*]] = mul nuw nsw i32 %[[VAL_1929]], 128
+// CHECK:         %[[VAL_1931:.*]] = mul nuw nsw i32 %[[VAL_1929]], 256
 // CHECK:         %[[VAL_1932:.*]] = add nuw nsw i32 %[[VAL_1931]], %[[VAL_1930]]
-// CHECK:         %[[VAL_1933:.*]] = icmp ult i32 %[[VAL_1932]], 163840
+// CHECK:         %[[VAL_1933:.*]] = icmp ult i32 %[[VAL_1932]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_1933]])
 // CHECK:         %[[VAL_1934:.*]] = mul nuw nsw i32 %[[VAL_1932]], 4
 // CHECK:         %[[VAL_1935:.*]] = udiv i32 %[[VAL_1934]], 1
@@ -2122,44 +2122,44 @@
 // CHECK:       r33.in_bounds-after:                              ; preds = %[[VAL_1951]], %[[VAL_1953:.*]]
 // CHECK:         ret void
 // CHECK:       r33.in_bounds-true:                               ; preds = %[[VAL_1953]]
-// CHECK:         %[[VAL_1954:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
+// CHECK:         %[[VAL_1954:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
 // CHECK:         %[[VAL_1955:.*]] = getelementptr inbounds float, float* %[[VAL_1954]], i32 %[[VAL_1934]]
 // CHECK:         %[[VAL_1956:.*]] = load float, float* %[[VAL_1955]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1957:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
+// CHECK:         %[[VAL_1957:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
 // CHECK:         %[[VAL_1958:.*]] = getelementptr inbounds float, float* %[[VAL_1957]], i32 %[[VAL_1934]]
 // CHECK:         %[[VAL_1959:.*]] = load float, float* %[[VAL_1958]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1960:.*]] = call float @llvm.minnum.f32(float %[[VAL_1956]], float %[[VAL_1959]])
-// CHECK:         %[[VAL_1961:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
+// CHECK:         %[[VAL_1961:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
 // CHECK:         %[[VAL_1962:.*]] = getelementptr inbounds float, float* %[[VAL_1961]], i32 %[[VAL_1934]]
 // CHECK:         store float %[[VAL_1960]], float* %[[VAL_1962]], align 4
-// CHECK:         %[[VAL_1963:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
+// CHECK:         %[[VAL_1963:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
 // CHECK:         %[[VAL_1964:.*]] = getelementptr inbounds float, float* %[[VAL_1963]], i32 %[[VAL_1938]]
 // CHECK:         %[[VAL_1965:.*]] = load float, float* %[[VAL_1964]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1966:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
+// CHECK:         %[[VAL_1966:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
 // CHECK:         %[[VAL_1967:.*]] = getelementptr inbounds float, float* %[[VAL_1966]], i32 %[[VAL_1938]]
 // CHECK:         %[[VAL_1968:.*]] = load float, float* %[[VAL_1967]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1969:.*]] = call float @llvm.minnum.f32(float %[[VAL_1965]], float %[[VAL_1968]])
-// CHECK:         %[[VAL_1970:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
+// CHECK:         %[[VAL_1970:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
 // CHECK:         %[[VAL_1971:.*]] = getelementptr inbounds float, float* %[[VAL_1970]], i32 %[[VAL_1938]]
 // CHECK:         store float %[[VAL_1969]], float* %[[VAL_1971]], align 4
-// CHECK:         %[[VAL_1972:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
+// CHECK:         %[[VAL_1972:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
 // CHECK:         %[[VAL_1973:.*]] = getelementptr inbounds float, float* %[[VAL_1972]], i32 %[[VAL_1942]]
 // CHECK:         %[[VAL_1974:.*]] = load float, float* %[[VAL_1973]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1975:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
+// CHECK:         %[[VAL_1975:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
 // CHECK:         %[[VAL_1976:.*]] = getelementptr inbounds float, float* %[[VAL_1975]], i32 %[[VAL_1942]]
 // CHECK:         %[[VAL_1977:.*]] = load float, float* %[[VAL_1976]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1978:.*]] = call float @llvm.minnum.f32(float %[[VAL_1974]], float %[[VAL_1977]])
-// CHECK:         %[[VAL_1979:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
+// CHECK:         %[[VAL_1979:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
 // CHECK:         %[[VAL_1980:.*]] = getelementptr inbounds float, float* %[[VAL_1979]], i32 %[[VAL_1942]]
 // CHECK:         store float %[[VAL_1978]], float* %[[VAL_1980]], align 4
-// CHECK:         %[[VAL_1981:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
+// CHECK:         %[[VAL_1981:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
 // CHECK:         %[[VAL_1982:.*]] = getelementptr inbounds float, float* %[[VAL_1981]], i32 %[[VAL_1946]]
 // CHECK:         %[[VAL_1983:.*]] = load float, float* %[[VAL_1982]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_1984:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
+// CHECK:         %[[VAL_1984:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1925]] to float*
 // CHECK:         %[[VAL_1985:.*]] = getelementptr inbounds float, float* %[[VAL_1984]], i32 %[[VAL_1946]]
 // CHECK:         %[[VAL_1986:.*]] = load float, float* %[[VAL_1985]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_1987:.*]] = call float @llvm.minnum.f32(float %[[VAL_1983]], float %[[VAL_1986]])
-// CHECK:         %[[VAL_1988:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1922]] to float*
+// CHECK:         %[[VAL_1988:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1928]] to float*
 // CHECK:         %[[VAL_1989:.*]] = getelementptr inbounds float, float* %[[VAL_1988]], i32 %[[VAL_1946]]
 // CHECK:         store float %[[VAL_1987]], float* %[[VAL_1989]], align 4
 // CHECK:         br label %[[VAL_1952]]
@@ -2172,9 +2172,9 @@
 // CHECK:         %[[VAL_1998:.*]] = bitcast i8* %[[VAL_1996]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_1999:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2000:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2001:.*]] = mul nuw nsw i32 %[[VAL_1999]], 128
+// CHECK:         %[[VAL_2001:.*]] = mul nuw nsw i32 %[[VAL_1999]], 256
 // CHECK:         %[[VAL_2002:.*]] = add nuw nsw i32 %[[VAL_2001]], %[[VAL_2000]]
-// CHECK:         %[[VAL_2003:.*]] = icmp ult i32 %[[VAL_2002]], 163840
+// CHECK:         %[[VAL_2003:.*]] = icmp ult i32 %[[VAL_2002]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2003]])
 // CHECK:         %[[VAL_2004:.*]] = mul nuw nsw i32 %[[VAL_2002]], 4
 // CHECK:         %[[VAL_2005:.*]] = udiv i32 %[[VAL_2004]], 1
@@ -2197,44 +2197,44 @@
 // CHECK:       r34.in_bounds-after:                              ; preds = %[[VAL_2021]], %[[VAL_2023:.*]]
 // CHECK:         ret void
 // CHECK:       r34.in_bounds-true:                               ; preds = %[[VAL_2023]]
-// CHECK:         %[[VAL_2024:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
+// CHECK:         %[[VAL_2024:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
 // CHECK:         %[[VAL_2025:.*]] = getelementptr inbounds float, float* %[[VAL_2024]], i32 %[[VAL_2004]]
 // CHECK:         %[[VAL_2026:.*]] = load float, float* %[[VAL_2025]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2027:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
+// CHECK:         %[[VAL_2027:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
 // CHECK:         %[[VAL_2028:.*]] = getelementptr inbounds float, float* %[[VAL_2027]], i32 %[[VAL_2004]]
 // CHECK:         %[[VAL_2029:.*]] = load float, float* %[[VAL_2028]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2030:.*]] = fmul float %[[VAL_2026]], %[[VAL_2029]]
-// CHECK:         %[[VAL_2031:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
+// CHECK:         %[[VAL_2031:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
 // CHECK:         %[[VAL_2032:.*]] = getelementptr inbounds float, float* %[[VAL_2031]], i32 %[[VAL_2004]]
 // CHECK:         store float %[[VAL_2030]], float* %[[VAL_2032]], align 4
-// CHECK:         %[[VAL_2033:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
+// CHECK:         %[[VAL_2033:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
 // CHECK:         %[[VAL_2034:.*]] = getelementptr inbounds float, float* %[[VAL_2033]], i32 %[[VAL_2008]]
 // CHECK:         %[[VAL_2035:.*]] = load float, float* %[[VAL_2034]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2036:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
+// CHECK:         %[[VAL_2036:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
 // CHECK:         %[[VAL_2037:.*]] = getelementptr inbounds float, float* %[[VAL_2036]], i32 %[[VAL_2008]]
 // CHECK:         %[[VAL_2038:.*]] = load float, float* %[[VAL_2037]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2039:.*]] = fmul float %[[VAL_2035]], %[[VAL_2038]]
-// CHECK:         %[[VAL_2040:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
+// CHECK:         %[[VAL_2040:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
 // CHECK:         %[[VAL_2041:.*]] = getelementptr inbounds float, float* %[[VAL_2040]], i32 %[[VAL_2008]]
 // CHECK:         store float %[[VAL_2039]], float* %[[VAL_2041]], align 4
-// CHECK:         %[[VAL_2042:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
+// CHECK:         %[[VAL_2042:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
 // CHECK:         %[[VAL_2043:.*]] = getelementptr inbounds float, float* %[[VAL_2042]], i32 %[[VAL_2012]]
 // CHECK:         %[[VAL_2044:.*]] = load float, float* %[[VAL_2043]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2045:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
+// CHECK:         %[[VAL_2045:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
 // CHECK:         %[[VAL_2046:.*]] = getelementptr inbounds float, float* %[[VAL_2045]], i32 %[[VAL_2012]]
 // CHECK:         %[[VAL_2047:.*]] = load float, float* %[[VAL_2046]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2048:.*]] = fmul float %[[VAL_2044]], %[[VAL_2047]]
-// CHECK:         %[[VAL_2049:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
+// CHECK:         %[[VAL_2049:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
 // CHECK:         %[[VAL_2050:.*]] = getelementptr inbounds float, float* %[[VAL_2049]], i32 %[[VAL_2012]]
 // CHECK:         store float %[[VAL_2048]], float* %[[VAL_2050]], align 4
-// CHECK:         %[[VAL_2051:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
+// CHECK:         %[[VAL_2051:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
 // CHECK:         %[[VAL_2052:.*]] = getelementptr inbounds float, float* %[[VAL_2051]], i32 %[[VAL_2016]]
 // CHECK:         %[[VAL_2053:.*]] = load float, float* %[[VAL_2052]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2054:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
+// CHECK:         %[[VAL_2054:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1995]] to float*
 // CHECK:         %[[VAL_2055:.*]] = getelementptr inbounds float, float* %[[VAL_2054]], i32 %[[VAL_2016]]
 // CHECK:         %[[VAL_2056:.*]] = load float, float* %[[VAL_2055]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2057:.*]] = fmul float %[[VAL_2053]], %[[VAL_2056]]
-// CHECK:         %[[VAL_2058:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1992]] to float*
+// CHECK:         %[[VAL_2058:.*]] = bitcast [100 x [200 x float]]* %[[VAL_1998]] to float*
 // CHECK:         %[[VAL_2059:.*]] = getelementptr inbounds float, float* %[[VAL_2058]], i32 %[[VAL_2016]]
 // CHECK:         store float %[[VAL_2057]], float* %[[VAL_2059]], align 4
 // CHECK:         br label %[[VAL_2022]]
@@ -2246,10 +2246,10 @@
 // CHECK:         %[[VAL_2066:.*]] = getelementptr inbounds i8, i8* %[[VAL_2067:.*]], i64 0
 // CHECK:         %[[VAL_2068:.*]] = bitcast i8* %[[VAL_2066]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2069:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
-// CHECK:         %[[VAL_2070:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2071:.*]] = mul nuw nsw i32 %[[VAL_2069]], 128
+// CHECK:         %[[VAL_2070:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !93
+// CHECK:         %[[VAL_2071:.*]] = mul nuw nsw i32 %[[VAL_2069]], 1024
 // CHECK:         %[[VAL_2072:.*]] = add nuw nsw i32 %[[VAL_2071]], %[[VAL_2070]]
-// CHECK:         %[[VAL_2073:.*]] = icmp ult i32 %[[VAL_2072]], 163840
+// CHECK:         %[[VAL_2073:.*]] = icmp ult i32 %[[VAL_2072]], 20480
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2073]])
 // CHECK:         %[[VAL_2074:.*]] = udiv i32 %[[VAL_2072]], 1
 // CHECK:         %[[VAL_2075:.*]] = urem i32 %[[VAL_2074]], 200
@@ -2259,14 +2259,14 @@
 // CHECK:       r35.in_bounds-after:                              ; preds = %[[VAL_2078]], %[[VAL_2080:.*]]
 // CHECK:         ret void
 // CHECK:       r35.in_bounds-true:                               ; preds = %[[VAL_2080]]
-// CHECK:         %[[VAL_2081:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2065]] to float*
+// CHECK:         %[[VAL_2081:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2062]] to float*
 // CHECK:         %[[VAL_2082:.*]] = getelementptr inbounds float, float* %[[VAL_2081]], i32 %[[VAL_2072]]
 // CHECK:         %[[VAL_2083:.*]] = load float, float* %[[VAL_2082]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2084:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2068]] to float*
+// CHECK:         %[[VAL_2084:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2065]] to float*
 // CHECK:         %[[VAL_2085:.*]] = getelementptr inbounds float, float* %[[VAL_2084]], i32 %[[VAL_2072]]
 // CHECK:         %[[VAL_2086:.*]] = load float, float* %[[VAL_2085]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2087:.*]] = call float @__nv_powf(float %[[VAL_2083]], float %[[VAL_2086]])
-// CHECK:         %[[VAL_2088:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2062]] to float*
+// CHECK:         %[[VAL_2088:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2068]] to float*
 // CHECK:         %[[VAL_2089:.*]] = getelementptr inbounds float, float* %[[VAL_2088]], i32 %[[VAL_2072]]
 // CHECK:         store float %[[VAL_2087]], float* %[[VAL_2089]], align 4
 // CHECK:         br label %[[VAL_2079]]
@@ -2279,9 +2279,9 @@
 // CHECK:         %[[VAL_2098:.*]] = bitcast i8* %[[VAL_2096]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2099:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2100:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2101:.*]] = mul nuw nsw i32 %[[VAL_2099]], 128
+// CHECK:         %[[VAL_2101:.*]] = mul nuw nsw i32 %[[VAL_2099]], 256
 // CHECK:         %[[VAL_2102:.*]] = add nuw nsw i32 %[[VAL_2101]], %[[VAL_2100]]
-// CHECK:         %[[VAL_2103:.*]] = icmp ult i32 %[[VAL_2102]], 163840
+// CHECK:         %[[VAL_2103:.*]] = icmp ult i32 %[[VAL_2102]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2103]])
 // CHECK:         %[[VAL_2104:.*]] = mul nuw nsw i32 %[[VAL_2102]], 4
 // CHECK:         %[[VAL_2105:.*]] = udiv i32 %[[VAL_2104]], 1
@@ -2304,44 +2304,44 @@
 // CHECK:       r36.in_bounds-after:                              ; preds = %[[VAL_2121]], %[[VAL_2123:.*]]
 // CHECK:         ret void
 // CHECK:       r36.in_bounds-true:                               ; preds = %[[VAL_2123]]
-// CHECK:         %[[VAL_2124:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
+// CHECK:         %[[VAL_2124:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
 // CHECK:         %[[VAL_2125:.*]] = getelementptr inbounds float, float* %[[VAL_2124]], i32 %[[VAL_2104]]
 // CHECK:         %[[VAL_2126:.*]] = load float, float* %[[VAL_2125]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2127:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
+// CHECK:         %[[VAL_2127:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
 // CHECK:         %[[VAL_2128:.*]] = getelementptr inbounds float, float* %[[VAL_2127]], i32 %[[VAL_2104]]
 // CHECK:         %[[VAL_2129:.*]] = load float, float* %[[VAL_2128]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2130:.*]] = call float @__nv_fmodf(float %[[VAL_2126]], float %[[VAL_2129]])
-// CHECK:         %[[VAL_2131:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
+// CHECK:         %[[VAL_2131:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
 // CHECK:         %[[VAL_2132:.*]] = getelementptr inbounds float, float* %[[VAL_2131]], i32 %[[VAL_2104]]
 // CHECK:         store float %[[VAL_2130]], float* %[[VAL_2132]], align 4
-// CHECK:         %[[VAL_2133:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
+// CHECK:         %[[VAL_2133:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
 // CHECK:         %[[VAL_2134:.*]] = getelementptr inbounds float, float* %[[VAL_2133]], i32 %[[VAL_2108]]
 // CHECK:         %[[VAL_2135:.*]] = load float, float* %[[VAL_2134]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2136:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
+// CHECK:         %[[VAL_2136:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
 // CHECK:         %[[VAL_2137:.*]] = getelementptr inbounds float, float* %[[VAL_2136]], i32 %[[VAL_2108]]
 // CHECK:         %[[VAL_2138:.*]] = load float, float* %[[VAL_2137]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2139:.*]] = call float @__nv_fmodf(float %[[VAL_2135]], float %[[VAL_2138]])
-// CHECK:         %[[VAL_2140:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
+// CHECK:         %[[VAL_2140:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
 // CHECK:         %[[VAL_2141:.*]] = getelementptr inbounds float, float* %[[VAL_2140]], i32 %[[VAL_2108]]
 // CHECK:         store float %[[VAL_2139]], float* %[[VAL_2141]], align 4
-// CHECK:         %[[VAL_2142:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
+// CHECK:         %[[VAL_2142:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
 // CHECK:         %[[VAL_2143:.*]] = getelementptr inbounds float, float* %[[VAL_2142]], i32 %[[VAL_2112]]
 // CHECK:         %[[VAL_2144:.*]] = load float, float* %[[VAL_2143]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2145:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
+// CHECK:         %[[VAL_2145:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
 // CHECK:         %[[VAL_2146:.*]] = getelementptr inbounds float, float* %[[VAL_2145]], i32 %[[VAL_2112]]
 // CHECK:         %[[VAL_2147:.*]] = load float, float* %[[VAL_2146]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2148:.*]] = call float @__nv_fmodf(float %[[VAL_2144]], float %[[VAL_2147]])
-// CHECK:         %[[VAL_2149:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
+// CHECK:         %[[VAL_2149:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
 // CHECK:         %[[VAL_2150:.*]] = getelementptr inbounds float, float* %[[VAL_2149]], i32 %[[VAL_2112]]
 // CHECK:         store float %[[VAL_2148]], float* %[[VAL_2150]], align 4
-// CHECK:         %[[VAL_2151:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
+// CHECK:         %[[VAL_2151:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
 // CHECK:         %[[VAL_2152:.*]] = getelementptr inbounds float, float* %[[VAL_2151]], i32 %[[VAL_2116]]
 // CHECK:         %[[VAL_2153:.*]] = load float, float* %[[VAL_2152]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2154:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
+// CHECK:         %[[VAL_2154:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2095]] to float*
 // CHECK:         %[[VAL_2155:.*]] = getelementptr inbounds float, float* %[[VAL_2154]], i32 %[[VAL_2116]]
 // CHECK:         %[[VAL_2156:.*]] = load float, float* %[[VAL_2155]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2157:.*]] = call float @__nv_fmodf(float %[[VAL_2153]], float %[[VAL_2156]])
-// CHECK:         %[[VAL_2158:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2092]] to float*
+// CHECK:         %[[VAL_2158:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2098]] to float*
 // CHECK:         %[[VAL_2159:.*]] = getelementptr inbounds float, float* %[[VAL_2158]], i32 %[[VAL_2116]]
 // CHECK:         store float %[[VAL_2157]], float* %[[VAL_2159]], align 4
 // CHECK:         br label %[[VAL_2122]]
@@ -2354,9 +2354,9 @@
 // CHECK:         %[[VAL_2168:.*]] = bitcast i8* %[[VAL_2166]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2169:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2170:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2171:.*]] = mul nuw nsw i32 %[[VAL_2169]], 128
+// CHECK:         %[[VAL_2171:.*]] = mul nuw nsw i32 %[[VAL_2169]], 256
 // CHECK:         %[[VAL_2172:.*]] = add nuw nsw i32 %[[VAL_2171]], %[[VAL_2170]]
-// CHECK:         %[[VAL_2173:.*]] = icmp ult i32 %[[VAL_2172]], 163840
+// CHECK:         %[[VAL_2173:.*]] = icmp ult i32 %[[VAL_2172]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2173]])
 // CHECK:         %[[VAL_2174:.*]] = mul nuw nsw i32 %[[VAL_2172]], 4
 // CHECK:         %[[VAL_2175:.*]] = udiv i32 %[[VAL_2174]], 1
@@ -2379,44 +2379,44 @@
 // CHECK:       r37.in_bounds-after:                              ; preds = %[[VAL_2191]], %[[VAL_2193:.*]]
 // CHECK:         ret void
 // CHECK:       r37.in_bounds-true:                               ; preds = %[[VAL_2193]]
-// CHECK:         %[[VAL_2194:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
+// CHECK:         %[[VAL_2194:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
 // CHECK:         %[[VAL_2195:.*]] = getelementptr inbounds float, float* %[[VAL_2194]], i32 %[[VAL_2174]]
 // CHECK:         %[[VAL_2196:.*]] = load float, float* %[[VAL_2195]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2197:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
+// CHECK:         %[[VAL_2197:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
 // CHECK:         %[[VAL_2198:.*]] = getelementptr inbounds float, float* %[[VAL_2197]], i32 %[[VAL_2174]]
 // CHECK:         %[[VAL_2199:.*]] = load float, float* %[[VAL_2198]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2200:.*]] = fsub float %[[VAL_2196]], %[[VAL_2199]]
-// CHECK:         %[[VAL_2201:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
+// CHECK:         %[[VAL_2201:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
 // CHECK:         %[[VAL_2202:.*]] = getelementptr inbounds float, float* %[[VAL_2201]], i32 %[[VAL_2174]]
 // CHECK:         store float %[[VAL_2200]], float* %[[VAL_2202]], align 4
-// CHECK:         %[[VAL_2203:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
+// CHECK:         %[[VAL_2203:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
 // CHECK:         %[[VAL_2204:.*]] = getelementptr inbounds float, float* %[[VAL_2203]], i32 %[[VAL_2178]]
 // CHECK:         %[[VAL_2205:.*]] = load float, float* %[[VAL_2204]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2206:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
+// CHECK:         %[[VAL_2206:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
 // CHECK:         %[[VAL_2207:.*]] = getelementptr inbounds float, float* %[[VAL_2206]], i32 %[[VAL_2178]]
 // CHECK:         %[[VAL_2208:.*]] = load float, float* %[[VAL_2207]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2209:.*]] = fsub float %[[VAL_2205]], %[[VAL_2208]]
-// CHECK:         %[[VAL_2210:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
+// CHECK:         %[[VAL_2210:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
 // CHECK:         %[[VAL_2211:.*]] = getelementptr inbounds float, float* %[[VAL_2210]], i32 %[[VAL_2178]]
 // CHECK:         store float %[[VAL_2209]], float* %[[VAL_2211]], align 4
-// CHECK:         %[[VAL_2212:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
+// CHECK:         %[[VAL_2212:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
 // CHECK:         %[[VAL_2213:.*]] = getelementptr inbounds float, float* %[[VAL_2212]], i32 %[[VAL_2182]]
 // CHECK:         %[[VAL_2214:.*]] = load float, float* %[[VAL_2213]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2215:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
+// CHECK:         %[[VAL_2215:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
 // CHECK:         %[[VAL_2216:.*]] = getelementptr inbounds float, float* %[[VAL_2215]], i32 %[[VAL_2182]]
 // CHECK:         %[[VAL_2217:.*]] = load float, float* %[[VAL_2216]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2218:.*]] = fsub float %[[VAL_2214]], %[[VAL_2217]]
-// CHECK:         %[[VAL_2219:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
+// CHECK:         %[[VAL_2219:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
 // CHECK:         %[[VAL_2220:.*]] = getelementptr inbounds float, float* %[[VAL_2219]], i32 %[[VAL_2182]]
 // CHECK:         store float %[[VAL_2218]], float* %[[VAL_2220]], align 4
-// CHECK:         %[[VAL_2221:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
+// CHECK:         %[[VAL_2221:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
 // CHECK:         %[[VAL_2222:.*]] = getelementptr inbounds float, float* %[[VAL_2221]], i32 %[[VAL_2186]]
 // CHECK:         %[[VAL_2223:.*]] = load float, float* %[[VAL_2222]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2224:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
+// CHECK:         %[[VAL_2224:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2165]] to float*
 // CHECK:         %[[VAL_2225:.*]] = getelementptr inbounds float, float* %[[VAL_2224]], i32 %[[VAL_2186]]
 // CHECK:         %[[VAL_2226:.*]] = load float, float* %[[VAL_2225]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2227:.*]] = fsub float %[[VAL_2223]], %[[VAL_2226]]
-// CHECK:         %[[VAL_2228:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2162]] to float*
+// CHECK:         %[[VAL_2228:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2168]] to float*
 // CHECK:         %[[VAL_2229:.*]] = getelementptr inbounds float, float* %[[VAL_2228]], i32 %[[VAL_2186]]
 // CHECK:         store float %[[VAL_2227]], float* %[[VAL_2229]], align 4
 // CHECK:         br label %[[VAL_2192]]
@@ -2429,9 +2429,9 @@
 // CHECK:         %[[VAL_2238:.*]] = bitcast i8* %[[VAL_2236]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_2239:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2240:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2241:.*]] = mul nuw nsw i32 %[[VAL_2239]], 128
+// CHECK:         %[[VAL_2241:.*]] = mul nuw nsw i32 %[[VAL_2239]], 256
 // CHECK:         %[[VAL_2242:.*]] = add nuw nsw i32 %[[VAL_2241]], %[[VAL_2240]]
-// CHECK:         %[[VAL_2243:.*]] = icmp ult i32 %[[VAL_2242]], 163840
+// CHECK:         %[[VAL_2243:.*]] = icmp ult i32 %[[VAL_2242]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2243]])
 // CHECK:         %[[VAL_2244:.*]] = mul nuw nsw i32 %[[VAL_2242]], 4
 // CHECK:         %[[VAL_2245:.*]] = udiv i32 %[[VAL_2244]], 1
@@ -2454,44 +2454,44 @@
 // CHECK:       r38.in_bounds-after:                              ; preds = %[[VAL_2261]], %[[VAL_2263:.*]]
 // CHECK:         ret void
 // CHECK:       r38.in_bounds-true:                               ; preds = %[[VAL_2263]]
-// CHECK:         %[[VAL_2264:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
+// CHECK:         %[[VAL_2264:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
 // CHECK:         %[[VAL_2265:.*]] = getelementptr inbounds i8, i8* %[[VAL_2264]], i32 %[[VAL_2244]]
 // CHECK:         %[[VAL_2266:.*]] = load i8, i8* %[[VAL_2265]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2267:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
+// CHECK:         %[[VAL_2267:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
 // CHECK:         %[[VAL_2268:.*]] = getelementptr inbounds i8, i8* %[[VAL_2267]], i32 %[[VAL_2244]]
 // CHECK:         %[[VAL_2269:.*]] = load i8, i8* %[[VAL_2268]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2270:.*]] = and i8 %[[VAL_2266]], %[[VAL_2269]]
-// CHECK:         %[[VAL_2271:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
+// CHECK:         %[[VAL_2271:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
 // CHECK:         %[[VAL_2272:.*]] = getelementptr inbounds i8, i8* %[[VAL_2271]], i32 %[[VAL_2244]]
 // CHECK:         store i8 %[[VAL_2270]], i8* %[[VAL_2272]], align 1
-// CHECK:         %[[VAL_2273:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
+// CHECK:         %[[VAL_2273:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
 // CHECK:         %[[VAL_2274:.*]] = getelementptr inbounds i8, i8* %[[VAL_2273]], i32 %[[VAL_2248]]
 // CHECK:         %[[VAL_2275:.*]] = load i8, i8* %[[VAL_2274]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2276:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
+// CHECK:         %[[VAL_2276:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
 // CHECK:         %[[VAL_2277:.*]] = getelementptr inbounds i8, i8* %[[VAL_2276]], i32 %[[VAL_2248]]
 // CHECK:         %[[VAL_2278:.*]] = load i8, i8* %[[VAL_2277]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2279:.*]] = and i8 %[[VAL_2275]], %[[VAL_2278]]
-// CHECK:         %[[VAL_2280:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
+// CHECK:         %[[VAL_2280:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
 // CHECK:         %[[VAL_2281:.*]] = getelementptr inbounds i8, i8* %[[VAL_2280]], i32 %[[VAL_2248]]
 // CHECK:         store i8 %[[VAL_2279]], i8* %[[VAL_2281]], align 1
-// CHECK:         %[[VAL_2282:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
+// CHECK:         %[[VAL_2282:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
 // CHECK:         %[[VAL_2283:.*]] = getelementptr inbounds i8, i8* %[[VAL_2282]], i32 %[[VAL_2252]]
 // CHECK:         %[[VAL_2284:.*]] = load i8, i8* %[[VAL_2283]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2285:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
+// CHECK:         %[[VAL_2285:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
 // CHECK:         %[[VAL_2286:.*]] = getelementptr inbounds i8, i8* %[[VAL_2285]], i32 %[[VAL_2252]]
 // CHECK:         %[[VAL_2287:.*]] = load i8, i8* %[[VAL_2286]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2288:.*]] = and i8 %[[VAL_2284]], %[[VAL_2287]]
-// CHECK:         %[[VAL_2289:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
+// CHECK:         %[[VAL_2289:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
 // CHECK:         %[[VAL_2290:.*]] = getelementptr inbounds i8, i8* %[[VAL_2289]], i32 %[[VAL_2252]]
 // CHECK:         store i8 %[[VAL_2288]], i8* %[[VAL_2290]], align 1
-// CHECK:         %[[VAL_2291:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
+// CHECK:         %[[VAL_2291:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
 // CHECK:         %[[VAL_2292:.*]] = getelementptr inbounds i8, i8* %[[VAL_2291]], i32 %[[VAL_2256]]
 // CHECK:         %[[VAL_2293:.*]] = load i8, i8* %[[VAL_2292]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2294:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
+// CHECK:         %[[VAL_2294:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2235]] to i8*
 // CHECK:         %[[VAL_2295:.*]] = getelementptr inbounds i8, i8* %[[VAL_2294]], i32 %[[VAL_2256]]
 // CHECK:         %[[VAL_2296:.*]] = load i8, i8* %[[VAL_2295]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2297:.*]] = and i8 %[[VAL_2293]], %[[VAL_2296]]
-// CHECK:         %[[VAL_2298:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2232]] to i8*
+// CHECK:         %[[VAL_2298:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2238]] to i8*
 // CHECK:         %[[VAL_2299:.*]] = getelementptr inbounds i8, i8* %[[VAL_2298]], i32 %[[VAL_2256]]
 // CHECK:         store i8 %[[VAL_2297]], i8* %[[VAL_2299]], align 1
 // CHECK:         br label %[[VAL_2262]]
@@ -2504,9 +2504,9 @@
 // CHECK:         %[[VAL_2308:.*]] = bitcast i8* %[[VAL_2306]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_2309:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2310:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2311:.*]] = mul nuw nsw i32 %[[VAL_2309]], 128
+// CHECK:         %[[VAL_2311:.*]] = mul nuw nsw i32 %[[VAL_2309]], 256
 // CHECK:         %[[VAL_2312:.*]] = add nuw nsw i32 %[[VAL_2311]], %[[VAL_2310]]
-// CHECK:         %[[VAL_2313:.*]] = icmp ult i32 %[[VAL_2312]], 163840
+// CHECK:         %[[VAL_2313:.*]] = icmp ult i32 %[[VAL_2312]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2313]])
 // CHECK:         %[[VAL_2314:.*]] = mul nuw nsw i32 %[[VAL_2312]], 4
 // CHECK:         %[[VAL_2315:.*]] = udiv i32 %[[VAL_2314]], 1
@@ -2529,44 +2529,44 @@
 // CHECK:       r39.in_bounds-after:                              ; preds = %[[VAL_2331]], %[[VAL_2333:.*]]
 // CHECK:         ret void
 // CHECK:       r39.in_bounds-true:                               ; preds = %[[VAL_2333]]
-// CHECK:         %[[VAL_2334:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
+// CHECK:         %[[VAL_2334:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
 // CHECK:         %[[VAL_2335:.*]] = getelementptr inbounds i8, i8* %[[VAL_2334]], i32 %[[VAL_2314]]
 // CHECK:         %[[VAL_2336:.*]] = load i8, i8* %[[VAL_2335]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2337:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
+// CHECK:         %[[VAL_2337:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
 // CHECK:         %[[VAL_2338:.*]] = getelementptr inbounds i8, i8* %[[VAL_2337]], i32 %[[VAL_2314]]
 // CHECK:         %[[VAL_2339:.*]] = load i8, i8* %[[VAL_2338]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2340:.*]] = or i8 %[[VAL_2336]], %[[VAL_2339]]
-// CHECK:         %[[VAL_2341:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
+// CHECK:         %[[VAL_2341:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
 // CHECK:         %[[VAL_2342:.*]] = getelementptr inbounds i8, i8* %[[VAL_2341]], i32 %[[VAL_2314]]
 // CHECK:         store i8 %[[VAL_2340]], i8* %[[VAL_2342]], align 1
-// CHECK:         %[[VAL_2343:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
+// CHECK:         %[[VAL_2343:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
 // CHECK:         %[[VAL_2344:.*]] = getelementptr inbounds i8, i8* %[[VAL_2343]], i32 %[[VAL_2318]]
 // CHECK:         %[[VAL_2345:.*]] = load i8, i8* %[[VAL_2344]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2346:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
+// CHECK:         %[[VAL_2346:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
 // CHECK:         %[[VAL_2347:.*]] = getelementptr inbounds i8, i8* %[[VAL_2346]], i32 %[[VAL_2318]]
 // CHECK:         %[[VAL_2348:.*]] = load i8, i8* %[[VAL_2347]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2349:.*]] = or i8 %[[VAL_2345]], %[[VAL_2348]]
-// CHECK:         %[[VAL_2350:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
+// CHECK:         %[[VAL_2350:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
 // CHECK:         %[[VAL_2351:.*]] = getelementptr inbounds i8, i8* %[[VAL_2350]], i32 %[[VAL_2318]]
 // CHECK:         store i8 %[[VAL_2349]], i8* %[[VAL_2351]], align 1
-// CHECK:         %[[VAL_2352:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
+// CHECK:         %[[VAL_2352:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
 // CHECK:         %[[VAL_2353:.*]] = getelementptr inbounds i8, i8* %[[VAL_2352]], i32 %[[VAL_2322]]
 // CHECK:         %[[VAL_2354:.*]] = load i8, i8* %[[VAL_2353]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2355:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
+// CHECK:         %[[VAL_2355:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
 // CHECK:         %[[VAL_2356:.*]] = getelementptr inbounds i8, i8* %[[VAL_2355]], i32 %[[VAL_2322]]
 // CHECK:         %[[VAL_2357:.*]] = load i8, i8* %[[VAL_2356]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2358:.*]] = or i8 %[[VAL_2354]], %[[VAL_2357]]
-// CHECK:         %[[VAL_2359:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
+// CHECK:         %[[VAL_2359:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
 // CHECK:         %[[VAL_2360:.*]] = getelementptr inbounds i8, i8* %[[VAL_2359]], i32 %[[VAL_2322]]
 // CHECK:         store i8 %[[VAL_2358]], i8* %[[VAL_2360]], align 1
-// CHECK:         %[[VAL_2361:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
+// CHECK:         %[[VAL_2361:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
 // CHECK:         %[[VAL_2362:.*]] = getelementptr inbounds i8, i8* %[[VAL_2361]], i32 %[[VAL_2326]]
 // CHECK:         %[[VAL_2363:.*]] = load i8, i8* %[[VAL_2362]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2364:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
+// CHECK:         %[[VAL_2364:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2305]] to i8*
 // CHECK:         %[[VAL_2365:.*]] = getelementptr inbounds i8, i8* %[[VAL_2364]], i32 %[[VAL_2326]]
 // CHECK:         %[[VAL_2366:.*]] = load i8, i8* %[[VAL_2365]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2367:.*]] = or i8 %[[VAL_2363]], %[[VAL_2366]]
-// CHECK:         %[[VAL_2368:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2302]] to i8*
+// CHECK:         %[[VAL_2368:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2308]] to i8*
 // CHECK:         %[[VAL_2369:.*]] = getelementptr inbounds i8, i8* %[[VAL_2368]], i32 %[[VAL_2326]]
 // CHECK:         store i8 %[[VAL_2367]], i8* %[[VAL_2369]], align 1
 // CHECK:         br label %[[VAL_2332]]
@@ -2579,9 +2579,9 @@
 // CHECK:         %[[VAL_2378:.*]] = bitcast i8* %[[VAL_2376]] to [100 x [200 x i8]]*
 // CHECK:         %[[VAL_2379:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2380:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2381:.*]] = mul nuw nsw i32 %[[VAL_2379]], 128
+// CHECK:         %[[VAL_2381:.*]] = mul nuw nsw i32 %[[VAL_2379]], 256
 // CHECK:         %[[VAL_2382:.*]] = add nuw nsw i32 %[[VAL_2381]], %[[VAL_2380]]
-// CHECK:         %[[VAL_2383:.*]] = icmp ult i32 %[[VAL_2382]], 163840
+// CHECK:         %[[VAL_2383:.*]] = icmp ult i32 %[[VAL_2382]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2383]])
 // CHECK:         %[[VAL_2384:.*]] = mul nuw nsw i32 %[[VAL_2382]], 4
 // CHECK:         %[[VAL_2385:.*]] = udiv i32 %[[VAL_2384]], 1
@@ -2604,44 +2604,44 @@
 // CHECK:       r40.in_bounds-after:                              ; preds = %[[VAL_2401]], %[[VAL_2403:.*]]
 // CHECK:         ret void
 // CHECK:       r40.in_bounds-true:                               ; preds = %[[VAL_2403]]
-// CHECK:         %[[VAL_2404:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
+// CHECK:         %[[VAL_2404:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
 // CHECK:         %[[VAL_2405:.*]] = getelementptr inbounds i8, i8* %[[VAL_2404]], i32 %[[VAL_2384]]
 // CHECK:         %[[VAL_2406:.*]] = load i8, i8* %[[VAL_2405]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2407:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
+// CHECK:         %[[VAL_2407:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
 // CHECK:         %[[VAL_2408:.*]] = getelementptr inbounds i8, i8* %[[VAL_2407]], i32 %[[VAL_2384]]
 // CHECK:         %[[VAL_2409:.*]] = load i8, i8* %[[VAL_2408]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2410:.*]] = xor i8 %[[VAL_2406]], %[[VAL_2409]]
-// CHECK:         %[[VAL_2411:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
+// CHECK:         %[[VAL_2411:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
 // CHECK:         %[[VAL_2412:.*]] = getelementptr inbounds i8, i8* %[[VAL_2411]], i32 %[[VAL_2384]]
 // CHECK:         store i8 %[[VAL_2410]], i8* %[[VAL_2412]], align 1
-// CHECK:         %[[VAL_2413:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
+// CHECK:         %[[VAL_2413:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
 // CHECK:         %[[VAL_2414:.*]] = getelementptr inbounds i8, i8* %[[VAL_2413]], i32 %[[VAL_2388]]
 // CHECK:         %[[VAL_2415:.*]] = load i8, i8* %[[VAL_2414]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2416:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
+// CHECK:         %[[VAL_2416:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
 // CHECK:         %[[VAL_2417:.*]] = getelementptr inbounds i8, i8* %[[VAL_2416]], i32 %[[VAL_2388]]
 // CHECK:         %[[VAL_2418:.*]] = load i8, i8* %[[VAL_2417]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2419:.*]] = xor i8 %[[VAL_2415]], %[[VAL_2418]]
-// CHECK:         %[[VAL_2420:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
+// CHECK:         %[[VAL_2420:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
 // CHECK:         %[[VAL_2421:.*]] = getelementptr inbounds i8, i8* %[[VAL_2420]], i32 %[[VAL_2388]]
 // CHECK:         store i8 %[[VAL_2419]], i8* %[[VAL_2421]], align 1
-// CHECK:         %[[VAL_2422:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
+// CHECK:         %[[VAL_2422:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
 // CHECK:         %[[VAL_2423:.*]] = getelementptr inbounds i8, i8* %[[VAL_2422]], i32 %[[VAL_2392]]
 // CHECK:         %[[VAL_2424:.*]] = load i8, i8* %[[VAL_2423]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2425:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
+// CHECK:         %[[VAL_2425:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
 // CHECK:         %[[VAL_2426:.*]] = getelementptr inbounds i8, i8* %[[VAL_2425]], i32 %[[VAL_2392]]
 // CHECK:         %[[VAL_2427:.*]] = load i8, i8* %[[VAL_2426]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2428:.*]] = xor i8 %[[VAL_2424]], %[[VAL_2427]]
-// CHECK:         %[[VAL_2429:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
+// CHECK:         %[[VAL_2429:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
 // CHECK:         %[[VAL_2430:.*]] = getelementptr inbounds i8, i8* %[[VAL_2429]], i32 %[[VAL_2392]]
 // CHECK:         store i8 %[[VAL_2428]], i8* %[[VAL_2430]], align 1
-// CHECK:         %[[VAL_2431:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
+// CHECK:         %[[VAL_2431:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
 // CHECK:         %[[VAL_2432:.*]] = getelementptr inbounds i8, i8* %[[VAL_2431]], i32 %[[VAL_2396]]
 // CHECK:         %[[VAL_2433:.*]] = load i8, i8* %[[VAL_2432]], align 1, !invariant.load !92
-// CHECK:         %[[VAL_2434:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
+// CHECK:         %[[VAL_2434:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2375]] to i8*
 // CHECK:         %[[VAL_2435:.*]] = getelementptr inbounds i8, i8* %[[VAL_2434]], i32 %[[VAL_2396]]
 // CHECK:         %[[VAL_2436:.*]] = load i8, i8* %[[VAL_2435]], align 1, !invariant.load !92
 // CHECK:         %[[VAL_2437:.*]] = xor i8 %[[VAL_2433]], %[[VAL_2436]]
-// CHECK:         %[[VAL_2438:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2372]] to i8*
+// CHECK:         %[[VAL_2438:.*]] = bitcast [100 x [200 x i8]]* %[[VAL_2378]] to i8*
 // CHECK:         %[[VAL_2439:.*]] = getelementptr inbounds i8, i8* %[[VAL_2438]], i32 %[[VAL_2396]]
 // CHECK:         store i8 %[[VAL_2437]], i8* %[[VAL_2439]], align 1
 // CHECK:         br label %[[VAL_2402]]
@@ -2654,9 +2654,9 @@
 // CHECK:         %[[VAL_2448:.*]] = bitcast i8* %[[VAL_2446]] to [100 x [200 x i32]]*
 // CHECK:         %[[VAL_2449:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2450:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2451:.*]] = mul nuw nsw i32 %[[VAL_2449]], 128
+// CHECK:         %[[VAL_2451:.*]] = mul nuw nsw i32 %[[VAL_2449]], 256
 // CHECK:         %[[VAL_2452:.*]] = add nuw nsw i32 %[[VAL_2451]], %[[VAL_2450]]
-// CHECK:         %[[VAL_2453:.*]] = icmp ult i32 %[[VAL_2452]], 163840
+// CHECK:         %[[VAL_2453:.*]] = icmp ult i32 %[[VAL_2452]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2453]])
 // CHECK:         %[[VAL_2454:.*]] = mul nuw nsw i32 %[[VAL_2452]], 4
 // CHECK:         %[[VAL_2455:.*]] = udiv i32 %[[VAL_2454]], 1
@@ -2679,52 +2679,52 @@
 // CHECK:       r41.in_bounds-after:                              ; preds = %[[VAL_2471]], %[[VAL_2473:.*]]
 // CHECK:         ret void
 // CHECK:       r41.in_bounds-true:                               ; preds = %[[VAL_2473]]
-// CHECK:         %[[VAL_2474:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
+// CHECK:         %[[VAL_2474:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
 // CHECK:         %[[VAL_2475:.*]] = getelementptr inbounds i32, i32* %[[VAL_2474]], i32 %[[VAL_2454]]
 // CHECK:         %[[VAL_2476:.*]] = load i32, i32* %[[VAL_2475]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2477:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
+// CHECK:         %[[VAL_2477:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
 // CHECK:         %[[VAL_2478:.*]] = getelementptr inbounds i32, i32* %[[VAL_2477]], i32 %[[VAL_2454]]
 // CHECK:         %[[VAL_2479:.*]] = load i32, i32* %[[VAL_2478]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2480:.*]] = shl i32 %[[VAL_2476]], %[[VAL_2479]]
 // CHECK:         %[[VAL_2481:.*]] = icmp ult i32 %[[VAL_2479]], 32
 // CHECK:         %[[VAL_2482:.*]] = select i1 %[[VAL_2481]], i32 %[[VAL_2480]], i32 0
-// CHECK:         %[[VAL_2483:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
+// CHECK:         %[[VAL_2483:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
 // CHECK:         %[[VAL_2484:.*]] = getelementptr inbounds i32, i32* %[[VAL_2483]], i32 %[[VAL_2454]]
 // CHECK:         store i32 %[[VAL_2482]], i32* %[[VAL_2484]], align 4
-// CHECK:         %[[VAL_2485:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
+// CHECK:         %[[VAL_2485:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
 // CHECK:         %[[VAL_2486:.*]] = getelementptr inbounds i32, i32* %[[VAL_2485]], i32 %[[VAL_2458]]
 // CHECK:         %[[VAL_2487:.*]] = load i32, i32* %[[VAL_2486]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2488:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
+// CHECK:         %[[VAL_2488:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
 // CHECK:         %[[VAL_2489:.*]] = getelementptr inbounds i32, i32* %[[VAL_2488]], i32 %[[VAL_2458]]
 // CHECK:         %[[VAL_2490:.*]] = load i32, i32* %[[VAL_2489]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2491:.*]] = shl i32 %[[VAL_2487]], %[[VAL_2490]]
 // CHECK:         %[[VAL_2492:.*]] = icmp ult i32 %[[VAL_2490]], 32
 // CHECK:         %[[VAL_2493:.*]] = select i1 %[[VAL_2492]], i32 %[[VAL_2491]], i32 0
-// CHECK:         %[[VAL_2494:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
+// CHECK:         %[[VAL_2494:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
 // CHECK:         %[[VAL_2495:.*]] = getelementptr inbounds i32, i32* %[[VAL_2494]], i32 %[[VAL_2458]]
 // CHECK:         store i32 %[[VAL_2493]], i32* %[[VAL_2495]], align 4
-// CHECK:         %[[VAL_2496:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
+// CHECK:         %[[VAL_2496:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
 // CHECK:         %[[VAL_2497:.*]] = getelementptr inbounds i32, i32* %[[VAL_2496]], i32 %[[VAL_2462]]
 // CHECK:         %[[VAL_2498:.*]] = load i32, i32* %[[VAL_2497]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2499:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
+// CHECK:         %[[VAL_2499:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
 // CHECK:         %[[VAL_2500:.*]] = getelementptr inbounds i32, i32* %[[VAL_2499]], i32 %[[VAL_2462]]
 // CHECK:         %[[VAL_2501:.*]] = load i32, i32* %[[VAL_2500]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2502:.*]] = shl i32 %[[VAL_2498]], %[[VAL_2501]]
 // CHECK:         %[[VAL_2503:.*]] = icmp ult i32 %[[VAL_2501]], 32
 // CHECK:         %[[VAL_2504:.*]] = select i1 %[[VAL_2503]], i32 %[[VAL_2502]], i32 0
-// CHECK:         %[[VAL_2505:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
+// CHECK:         %[[VAL_2505:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
 // CHECK:         %[[VAL_2506:.*]] = getelementptr inbounds i32, i32* %[[VAL_2505]], i32 %[[VAL_2462]]
 // CHECK:         store i32 %[[VAL_2504]], i32* %[[VAL_2506]], align 4
-// CHECK:         %[[VAL_2507:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
+// CHECK:         %[[VAL_2507:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
 // CHECK:         %[[VAL_2508:.*]] = getelementptr inbounds i32, i32* %[[VAL_2507]], i32 %[[VAL_2466]]
 // CHECK:         %[[VAL_2509:.*]] = load i32, i32* %[[VAL_2508]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2510:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
+// CHECK:         %[[VAL_2510:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2445]] to i32*
 // CHECK:         %[[VAL_2511:.*]] = getelementptr inbounds i32, i32* %[[VAL_2510]], i32 %[[VAL_2466]]
 // CHECK:         %[[VAL_2512:.*]] = load i32, i32* %[[VAL_2511]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2513:.*]] = shl i32 %[[VAL_2509]], %[[VAL_2512]]
 // CHECK:         %[[VAL_2514:.*]] = icmp ult i32 %[[VAL_2512]], 32
 // CHECK:         %[[VAL_2515:.*]] = select i1 %[[VAL_2514]], i32 %[[VAL_2513]], i32 0
-// CHECK:         %[[VAL_2516:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2442]] to i32*
+// CHECK:         %[[VAL_2516:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2448]] to i32*
 // CHECK:         %[[VAL_2517:.*]] = getelementptr inbounds i32, i32* %[[VAL_2516]], i32 %[[VAL_2466]]
 // CHECK:         store i32 %[[VAL_2515]], i32* %[[VAL_2517]], align 4
 // CHECK:         br label %[[VAL_2472]]
@@ -2737,9 +2737,9 @@
 // CHECK:         %[[VAL_2526:.*]] = bitcast i8* %[[VAL_2524]] to [100 x [200 x i32]]*
 // CHECK:         %[[VAL_2527:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2528:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2529:.*]] = mul nuw nsw i32 %[[VAL_2527]], 128
+// CHECK:         %[[VAL_2529:.*]] = mul nuw nsw i32 %[[VAL_2527]], 256
 // CHECK:         %[[VAL_2530:.*]] = add nuw nsw i32 %[[VAL_2529]], %[[VAL_2528]]
-// CHECK:         %[[VAL_2531:.*]] = icmp ult i32 %[[VAL_2530]], 163840
+// CHECK:         %[[VAL_2531:.*]] = icmp ult i32 %[[VAL_2530]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2531]])
 // CHECK:         %[[VAL_2532:.*]] = mul nuw nsw i32 %[[VAL_2530]], 4
 // CHECK:         %[[VAL_2533:.*]] = udiv i32 %[[VAL_2532]], 1
@@ -2762,10 +2762,10 @@
 // CHECK:       r42.in_bounds-after:                              ; preds = %[[VAL_2549]], %[[VAL_2551:.*]]
 // CHECK:         ret void
 // CHECK:       r42.in_bounds-true:                               ; preds = %[[VAL_2551]]
-// CHECK:         %[[VAL_2552:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
+// CHECK:         %[[VAL_2552:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
 // CHECK:         %[[VAL_2553:.*]] = getelementptr inbounds i32, i32* %[[VAL_2552]], i32 %[[VAL_2532]]
 // CHECK:         %[[VAL_2554:.*]] = load i32, i32* %[[VAL_2553]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2555:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
+// CHECK:         %[[VAL_2555:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
 // CHECK:         %[[VAL_2556:.*]] = getelementptr inbounds i32, i32* %[[VAL_2555]], i32 %[[VAL_2532]]
 // CHECK:         %[[VAL_2557:.*]] = load i32, i32* %[[VAL_2556]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2558:.*]] = ashr i32 %[[VAL_2554]], %[[VAL_2557]]
@@ -2773,13 +2773,13 @@
 // CHECK:         %[[VAL_2560:.*]] = select i1 %[[VAL_2559]], i32 -1, i32 0
 // CHECK:         %[[VAL_2561:.*]] = icmp ult i32 %[[VAL_2557]], 32
 // CHECK:         %[[VAL_2562:.*]] = select i1 %[[VAL_2561]], i32 %[[VAL_2558]], i32 %[[VAL_2560]]
-// CHECK:         %[[VAL_2563:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
+// CHECK:         %[[VAL_2563:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
 // CHECK:         %[[VAL_2564:.*]] = getelementptr inbounds i32, i32* %[[VAL_2563]], i32 %[[VAL_2532]]
 // CHECK:         store i32 %[[VAL_2562]], i32* %[[VAL_2564]], align 4
-// CHECK:         %[[VAL_2565:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
+// CHECK:         %[[VAL_2565:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
 // CHECK:         %[[VAL_2566:.*]] = getelementptr inbounds i32, i32* %[[VAL_2565]], i32 %[[VAL_2536]]
 // CHECK:         %[[VAL_2567:.*]] = load i32, i32* %[[VAL_2566]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2568:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
+// CHECK:         %[[VAL_2568:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
 // CHECK:         %[[VAL_2569:.*]] = getelementptr inbounds i32, i32* %[[VAL_2568]], i32 %[[VAL_2536]]
 // CHECK:         %[[VAL_2570:.*]] = load i32, i32* %[[VAL_2569]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2571:.*]] = ashr i32 %[[VAL_2567]], %[[VAL_2570]]
@@ -2787,13 +2787,13 @@
 // CHECK:         %[[VAL_2573:.*]] = select i1 %[[VAL_2572]], i32 -1, i32 0
 // CHECK:         %[[VAL_2574:.*]] = icmp ult i32 %[[VAL_2570]], 32
 // CHECK:         %[[VAL_2575:.*]] = select i1 %[[VAL_2574]], i32 %[[VAL_2571]], i32 %[[VAL_2573]]
-// CHECK:         %[[VAL_2576:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
+// CHECK:         %[[VAL_2576:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
 // CHECK:         %[[VAL_2577:.*]] = getelementptr inbounds i32, i32* %[[VAL_2576]], i32 %[[VAL_2536]]
 // CHECK:         store i32 %[[VAL_2575]], i32* %[[VAL_2577]], align 4
-// CHECK:         %[[VAL_2578:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
+// CHECK:         %[[VAL_2578:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
 // CHECK:         %[[VAL_2579:.*]] = getelementptr inbounds i32, i32* %[[VAL_2578]], i32 %[[VAL_2540]]
 // CHECK:         %[[VAL_2580:.*]] = load i32, i32* %[[VAL_2579]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2581:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
+// CHECK:         %[[VAL_2581:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
 // CHECK:         %[[VAL_2582:.*]] = getelementptr inbounds i32, i32* %[[VAL_2581]], i32 %[[VAL_2540]]
 // CHECK:         %[[VAL_2583:.*]] = load i32, i32* %[[VAL_2582]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2584:.*]] = ashr i32 %[[VAL_2580]], %[[VAL_2583]]
@@ -2801,13 +2801,13 @@
 // CHECK:         %[[VAL_2586:.*]] = select i1 %[[VAL_2585]], i32 -1, i32 0
 // CHECK:         %[[VAL_2587:.*]] = icmp ult i32 %[[VAL_2583]], 32
 // CHECK:         %[[VAL_2588:.*]] = select i1 %[[VAL_2587]], i32 %[[VAL_2584]], i32 %[[VAL_2586]]
-// CHECK:         %[[VAL_2589:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
+// CHECK:         %[[VAL_2589:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
 // CHECK:         %[[VAL_2590:.*]] = getelementptr inbounds i32, i32* %[[VAL_2589]], i32 %[[VAL_2540]]
 // CHECK:         store i32 %[[VAL_2588]], i32* %[[VAL_2590]], align 4
-// CHECK:         %[[VAL_2591:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
+// CHECK:         %[[VAL_2591:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
 // CHECK:         %[[VAL_2592:.*]] = getelementptr inbounds i32, i32* %[[VAL_2591]], i32 %[[VAL_2544]]
 // CHECK:         %[[VAL_2593:.*]] = load i32, i32* %[[VAL_2592]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2594:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
+// CHECK:         %[[VAL_2594:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2523]] to i32*
 // CHECK:         %[[VAL_2595:.*]] = getelementptr inbounds i32, i32* %[[VAL_2594]], i32 %[[VAL_2544]]
 // CHECK:         %[[VAL_2596:.*]] = load i32, i32* %[[VAL_2595]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2597:.*]] = ashr i32 %[[VAL_2593]], %[[VAL_2596]]
@@ -2815,7 +2815,7 @@
 // CHECK:         %[[VAL_2599:.*]] = select i1 %[[VAL_2598]], i32 -1, i32 0
 // CHECK:         %[[VAL_2600:.*]] = icmp ult i32 %[[VAL_2596]], 32
 // CHECK:         %[[VAL_2601:.*]] = select i1 %[[VAL_2600]], i32 %[[VAL_2597]], i32 %[[VAL_2599]]
-// CHECK:         %[[VAL_2602:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2520]] to i32*
+// CHECK:         %[[VAL_2602:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2526]] to i32*
 // CHECK:         %[[VAL_2603:.*]] = getelementptr inbounds i32, i32* %[[VAL_2602]], i32 %[[VAL_2544]]
 // CHECK:         store i32 %[[VAL_2601]], i32* %[[VAL_2603]], align 4
 // CHECK:         br label %[[VAL_2550]]
@@ -2828,9 +2828,9 @@
 // CHECK:         %[[VAL_2612:.*]] = bitcast i8* %[[VAL_2610]] to [100 x [200 x i32]]*
 // CHECK:         %[[VAL_2613:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2614:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2615:.*]] = mul nuw nsw i32 %[[VAL_2613]], 128
+// CHECK:         %[[VAL_2615:.*]] = mul nuw nsw i32 %[[VAL_2613]], 256
 // CHECK:         %[[VAL_2616:.*]] = add nuw nsw i32 %[[VAL_2615]], %[[VAL_2614]]
-// CHECK:         %[[VAL_2617:.*]] = icmp ult i32 %[[VAL_2616]], 163840
+// CHECK:         %[[VAL_2617:.*]] = icmp ult i32 %[[VAL_2616]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2617]])
 // CHECK:         %[[VAL_2618:.*]] = mul nuw nsw i32 %[[VAL_2616]], 4
 // CHECK:         %[[VAL_2619:.*]] = udiv i32 %[[VAL_2618]], 1
@@ -2853,52 +2853,52 @@
 // CHECK:       r43.in_bounds-after:                              ; preds = %[[VAL_2635]], %[[VAL_2637:.*]]
 // CHECK:         ret void
 // CHECK:       r43.in_bounds-true:                               ; preds = %[[VAL_2637]]
-// CHECK:         %[[VAL_2638:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
+// CHECK:         %[[VAL_2638:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
 // CHECK:         %[[VAL_2639:.*]] = getelementptr inbounds i32, i32* %[[VAL_2638]], i32 %[[VAL_2618]]
 // CHECK:         %[[VAL_2640:.*]] = load i32, i32* %[[VAL_2639]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2641:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
+// CHECK:         %[[VAL_2641:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
 // CHECK:         %[[VAL_2642:.*]] = getelementptr inbounds i32, i32* %[[VAL_2641]], i32 %[[VAL_2618]]
 // CHECK:         %[[VAL_2643:.*]] = load i32, i32* %[[VAL_2642]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2644:.*]] = lshr i32 %[[VAL_2640]], %[[VAL_2643]]
 // CHECK:         %[[VAL_2645:.*]] = icmp ult i32 %[[VAL_2643]], 32
 // CHECK:         %[[VAL_2646:.*]] = select i1 %[[VAL_2645]], i32 %[[VAL_2644]], i32 0
-// CHECK:         %[[VAL_2647:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
+// CHECK:         %[[VAL_2647:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
 // CHECK:         %[[VAL_2648:.*]] = getelementptr inbounds i32, i32* %[[VAL_2647]], i32 %[[VAL_2618]]
 // CHECK:         store i32 %[[VAL_2646]], i32* %[[VAL_2648]], align 4
-// CHECK:         %[[VAL_2649:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
+// CHECK:         %[[VAL_2649:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
 // CHECK:         %[[VAL_2650:.*]] = getelementptr inbounds i32, i32* %[[VAL_2649]], i32 %[[VAL_2622]]
 // CHECK:         %[[VAL_2651:.*]] = load i32, i32* %[[VAL_2650]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2652:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
+// CHECK:         %[[VAL_2652:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
 // CHECK:         %[[VAL_2653:.*]] = getelementptr inbounds i32, i32* %[[VAL_2652]], i32 %[[VAL_2622]]
 // CHECK:         %[[VAL_2654:.*]] = load i32, i32* %[[VAL_2653]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2655:.*]] = lshr i32 %[[VAL_2651]], %[[VAL_2654]]
 // CHECK:         %[[VAL_2656:.*]] = icmp ult i32 %[[VAL_2654]], 32
 // CHECK:         %[[VAL_2657:.*]] = select i1 %[[VAL_2656]], i32 %[[VAL_2655]], i32 0
-// CHECK:         %[[VAL_2658:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
+// CHECK:         %[[VAL_2658:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
 // CHECK:         %[[VAL_2659:.*]] = getelementptr inbounds i32, i32* %[[VAL_2658]], i32 %[[VAL_2622]]
 // CHECK:         store i32 %[[VAL_2657]], i32* %[[VAL_2659]], align 4
-// CHECK:         %[[VAL_2660:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
+// CHECK:         %[[VAL_2660:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
 // CHECK:         %[[VAL_2661:.*]] = getelementptr inbounds i32, i32* %[[VAL_2660]], i32 %[[VAL_2626]]
 // CHECK:         %[[VAL_2662:.*]] = load i32, i32* %[[VAL_2661]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2663:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
+// CHECK:         %[[VAL_2663:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
 // CHECK:         %[[VAL_2664:.*]] = getelementptr inbounds i32, i32* %[[VAL_2663]], i32 %[[VAL_2626]]
 // CHECK:         %[[VAL_2665:.*]] = load i32, i32* %[[VAL_2664]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2666:.*]] = lshr i32 %[[VAL_2662]], %[[VAL_2665]]
 // CHECK:         %[[VAL_2667:.*]] = icmp ult i32 %[[VAL_2665]], 32
 // CHECK:         %[[VAL_2668:.*]] = select i1 %[[VAL_2667]], i32 %[[VAL_2666]], i32 0
-// CHECK:         %[[VAL_2669:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
+// CHECK:         %[[VAL_2669:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
 // CHECK:         %[[VAL_2670:.*]] = getelementptr inbounds i32, i32* %[[VAL_2669]], i32 %[[VAL_2626]]
 // CHECK:         store i32 %[[VAL_2668]], i32* %[[VAL_2670]], align 4
-// CHECK:         %[[VAL_2671:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
+// CHECK:         %[[VAL_2671:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
 // CHECK:         %[[VAL_2672:.*]] = getelementptr inbounds i32, i32* %[[VAL_2671]], i32 %[[VAL_2630]]
 // CHECK:         %[[VAL_2673:.*]] = load i32, i32* %[[VAL_2672]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2674:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
+// CHECK:         %[[VAL_2674:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2609]] to i32*
 // CHECK:         %[[VAL_2675:.*]] = getelementptr inbounds i32, i32* %[[VAL_2674]], i32 %[[VAL_2630]]
 // CHECK:         %[[VAL_2676:.*]] = load i32, i32* %[[VAL_2675]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2677:.*]] = lshr i32 %[[VAL_2673]], %[[VAL_2676]]
 // CHECK:         %[[VAL_2678:.*]] = icmp ult i32 %[[VAL_2676]], 32
 // CHECK:         %[[VAL_2679:.*]] = select i1 %[[VAL_2678]], i32 %[[VAL_2677]], i32 0
-// CHECK:         %[[VAL_2680:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2606]] to i32*
+// CHECK:         %[[VAL_2680:.*]] = bitcast [100 x [200 x i32]]* %[[VAL_2612]] to i32*
 // CHECK:         %[[VAL_2681:.*]] = getelementptr inbounds i32, i32* %[[VAL_2680]], i32 %[[VAL_2630]]
 // CHECK:         store i32 %[[VAL_2679]], i32* %[[VAL_2681]], align 4
 // CHECK:         br label %[[VAL_2636]]
@@ -2911,8 +2911,8 @@
 // CHECK:         %[[VAL_2690:.*]] = bitcast i8* %[[VAL_2688]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2691:.*]] = getelementptr inbounds i8, i8* %[[VAL_2692:.*]], i64 0
 // CHECK:         %[[VAL_2693:.*]] = bitcast i8* %[[VAL_2691]] to [100 x [200 x float]]*
-// CHECK:         %[[VAL_2694:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
-// CHECK:         %[[VAL_2695:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
+// CHECK:         %[[VAL_2694:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !94
+// CHECK:         %[[VAL_2695:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !95
 // CHECK:         %[[VAL_2696:.*]] = mul nuw nsw i32 %[[VAL_2694]], 128
 // CHECK:         %[[VAL_2697:.*]] = add nuw nsw i32 %[[VAL_2696]], %[[VAL_2695]]
 // CHECK:         %[[VAL_2698:.*]] = icmp ult i32 %[[VAL_2697]], 163840
@@ -3006,9 +3006,9 @@
 // CHECK:         %[[VAL_2782:.*]] = bitcast i8* %[[VAL_2780]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2783:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2784:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2785:.*]] = mul nuw nsw i32 %[[VAL_2783]], 128
+// CHECK:         %[[VAL_2785:.*]] = mul nuw nsw i32 %[[VAL_2783]], 256
 // CHECK:         %[[VAL_2786:.*]] = add nuw nsw i32 %[[VAL_2785]], %[[VAL_2784]]
-// CHECK:         %[[VAL_2787:.*]] = icmp ult i32 %[[VAL_2786]], 163840
+// CHECK:         %[[VAL_2787:.*]] = icmp ult i32 %[[VAL_2786]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2787]])
 // CHECK:         %[[VAL_2788:.*]] = mul nuw nsw i32 %[[VAL_2786]], 4
 // CHECK:         %[[VAL_2789:.*]] = udiv i32 %[[VAL_2788]], 1
@@ -3031,68 +3031,68 @@
 // CHECK:       r45.in_bounds-after:                              ; preds = %[[VAL_2805]], %[[VAL_2807:.*]]
 // CHECK:         ret void
 // CHECK:       r45.in_bounds-true:                               ; preds = %[[VAL_2807]]
-// CHECK:         %[[VAL_2808:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
+// CHECK:         %[[VAL_2808:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
 // CHECK:         %[[VAL_2809:.*]] = getelementptr inbounds float, float* %[[VAL_2808]], i32 %[[VAL_2788]]
 // CHECK:         %[[VAL_2810:.*]] = load float, float* %[[VAL_2809]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2811:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
+// CHECK:         %[[VAL_2811:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
 // CHECK:         %[[VAL_2812:.*]] = getelementptr inbounds float, float* %[[VAL_2811]], i32 %[[VAL_2788]]
 // CHECK:         %[[VAL_2813:.*]] = load float, float* %[[VAL_2812]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2814:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
+// CHECK:         %[[VAL_2814:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
 // CHECK:         %[[VAL_2815:.*]] = getelementptr inbounds float, float* %[[VAL_2814]], i32 %[[VAL_2788]]
 // CHECK:         %[[VAL_2816:.*]] = load float, float* %[[VAL_2815]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2817:.*]] = fcmp uge float %[[VAL_2810]], %[[VAL_2813]]
 // CHECK:         %[[VAL_2818:.*]] = select i1 %[[VAL_2817]], float %[[VAL_2810]], float %[[VAL_2813]]
 // CHECK:         %[[VAL_2819:.*]] = fcmp ule float %[[VAL_2816]], %[[VAL_2818]]
 // CHECK:         %[[VAL_2820:.*]] = select i1 %[[VAL_2819]], float %[[VAL_2816]], float %[[VAL_2818]]
-// CHECK:         %[[VAL_2821:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
+// CHECK:         %[[VAL_2821:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
 // CHECK:         %[[VAL_2822:.*]] = getelementptr inbounds float, float* %[[VAL_2821]], i32 %[[VAL_2788]]
 // CHECK:         store float %[[VAL_2820]], float* %[[VAL_2822]], align 4
-// CHECK:         %[[VAL_2823:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
+// CHECK:         %[[VAL_2823:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
 // CHECK:         %[[VAL_2824:.*]] = getelementptr inbounds float, float* %[[VAL_2823]], i32 %[[VAL_2792]]
 // CHECK:         %[[VAL_2825:.*]] = load float, float* %[[VAL_2824]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2826:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
+// CHECK:         %[[VAL_2826:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
 // CHECK:         %[[VAL_2827:.*]] = getelementptr inbounds float, float* %[[VAL_2826]], i32 %[[VAL_2792]]
 // CHECK:         %[[VAL_2828:.*]] = load float, float* %[[VAL_2827]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2829:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
+// CHECK:         %[[VAL_2829:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
 // CHECK:         %[[VAL_2830:.*]] = getelementptr inbounds float, float* %[[VAL_2829]], i32 %[[VAL_2792]]
 // CHECK:         %[[VAL_2831:.*]] = load float, float* %[[VAL_2830]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2832:.*]] = fcmp uge float %[[VAL_2825]], %[[VAL_2828]]
 // CHECK:         %[[VAL_2833:.*]] = select i1 %[[VAL_2832]], float %[[VAL_2825]], float %[[VAL_2828]]
 // CHECK:         %[[VAL_2834:.*]] = fcmp ule float %[[VAL_2831]], %[[VAL_2833]]
 // CHECK:         %[[VAL_2835:.*]] = select i1 %[[VAL_2834]], float %[[VAL_2831]], float %[[VAL_2833]]
-// CHECK:         %[[VAL_2836:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
+// CHECK:         %[[VAL_2836:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
 // CHECK:         %[[VAL_2837:.*]] = getelementptr inbounds float, float* %[[VAL_2836]], i32 %[[VAL_2792]]
 // CHECK:         store float %[[VAL_2835]], float* %[[VAL_2837]], align 4
-// CHECK:         %[[VAL_2838:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
+// CHECK:         %[[VAL_2838:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
 // CHECK:         %[[VAL_2839:.*]] = getelementptr inbounds float, float* %[[VAL_2838]], i32 %[[VAL_2796]]
 // CHECK:         %[[VAL_2840:.*]] = load float, float* %[[VAL_2839]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2841:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
+// CHECK:         %[[VAL_2841:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
 // CHECK:         %[[VAL_2842:.*]] = getelementptr inbounds float, float* %[[VAL_2841]], i32 %[[VAL_2796]]
 // CHECK:         %[[VAL_2843:.*]] = load float, float* %[[VAL_2842]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
+// CHECK:         %[[VAL_2844:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
 // CHECK:         %[[VAL_2845:.*]] = getelementptr inbounds float, float* %[[VAL_2844]], i32 %[[VAL_2796]]
 // CHECK:         %[[VAL_2846:.*]] = load float, float* %[[VAL_2845]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2847:.*]] = fcmp uge float %[[VAL_2840]], %[[VAL_2843]]
 // CHECK:         %[[VAL_2848:.*]] = select i1 %[[VAL_2847]], float %[[VAL_2840]], float %[[VAL_2843]]
 // CHECK:         %[[VAL_2849:.*]] = fcmp ule float %[[VAL_2846]], %[[VAL_2848]]
 // CHECK:         %[[VAL_2850:.*]] = select i1 %[[VAL_2849]], float %[[VAL_2846]], float %[[VAL_2848]]
-// CHECK:         %[[VAL_2851:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
+// CHECK:         %[[VAL_2851:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
 // CHECK:         %[[VAL_2852:.*]] = getelementptr inbounds float, float* %[[VAL_2851]], i32 %[[VAL_2796]]
 // CHECK:         store float %[[VAL_2850]], float* %[[VAL_2852]], align 4
-// CHECK:         %[[VAL_2853:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
+// CHECK:         %[[VAL_2853:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
 // CHECK:         %[[VAL_2854:.*]] = getelementptr inbounds float, float* %[[VAL_2853]], i32 %[[VAL_2800]]
 // CHECK:         %[[VAL_2855:.*]] = load float, float* %[[VAL_2854]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2856:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
+// CHECK:         %[[VAL_2856:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2776]] to float*
 // CHECK:         %[[VAL_2857:.*]] = getelementptr inbounds float, float* %[[VAL_2856]], i32 %[[VAL_2800]]
 // CHECK:         %[[VAL_2858:.*]] = load float, float* %[[VAL_2857]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2859:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
+// CHECK:         %[[VAL_2859:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2779]] to float*
 // CHECK:         %[[VAL_2860:.*]] = getelementptr inbounds float, float* %[[VAL_2859]], i32 %[[VAL_2800]]
 // CHECK:         %[[VAL_2861:.*]] = load float, float* %[[VAL_2860]], align 4, !invariant.load !92
 // CHECK:         %[[VAL_2862:.*]] = fcmp uge float %[[VAL_2855]], %[[VAL_2858]]
 // CHECK:         %[[VAL_2863:.*]] = select i1 %[[VAL_2862]], float %[[VAL_2855]], float %[[VAL_2858]]
 // CHECK:         %[[VAL_2864:.*]] = fcmp ule float %[[VAL_2861]], %[[VAL_2863]]
 // CHECK:         %[[VAL_2865:.*]] = select i1 %[[VAL_2864]], float %[[VAL_2861]], float %[[VAL_2863]]
-// CHECK:         %[[VAL_2866:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2773]] to float*
+// CHECK:         %[[VAL_2866:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2782]] to float*
 // CHECK:         %[[VAL_2867:.*]] = getelementptr inbounds float, float* %[[VAL_2866]], i32 %[[VAL_2800]]
 // CHECK:         store float %[[VAL_2865]], float* %[[VAL_2867]], align 4
 // CHECK:         br label %[[VAL_2806]]
@@ -3117,9 +3117,9 @@
 // CHECK:         %[[VAL_2888:.*]] = bitcast i8* %[[VAL_2886]] to [100 x [200 x float]]*
 // CHECK:         %[[VAL_2889:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !90
 // CHECK:         %[[VAL_2890:.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !91
-// CHECK:         %[[VAL_2891:.*]] = mul nuw nsw i32 %[[VAL_2889]], 128
+// CHECK:         %[[VAL_2891:.*]] = mul nuw nsw i32 %[[VAL_2889]], 256
 // CHECK:         %[[VAL_2892:.*]] = add nuw nsw i32 %[[VAL_2891]], %[[VAL_2890]]
-// CHECK:         %[[VAL_2893:.*]] = icmp ult i32 %[[VAL_2892]], 163840
+// CHECK:         %[[VAL_2893:.*]] = icmp ult i32 %[[VAL_2892]], 5120
 // CHECK:         call void @llvm.assume(i1 %[[VAL_2893]])
 // CHECK:         %[[VAL_2894:.*]] = mul nuw nsw i32 %[[VAL_2892]], 4
 // CHECK:         %[[VAL_2895:.*]] = udiv i32 %[[VAL_2894]], 1
@@ -3142,56 +3142,56 @@
 // CHECK:       r46.in_bounds-after:                              ; preds = %[[VAL_2911]], %[[VAL_2913:.*]]
 // CHECK:         ret void
 // CHECK:       r46.in_bounds-true:                               ; preds = %[[VAL_2913]]
-// CHECK:         %[[VAL_2914:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
+// CHECK:         %[[VAL_2914:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
 // CHECK:         %[[VAL_2915:.*]] = getelementptr inbounds float, float* %[[VAL_2914]], i32 %[[VAL_2894]]
 // CHECK:         %[[VAL_2916:.*]] = load float, float* %[[VAL_2915]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2917:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
+// CHECK:         %[[VAL_2917:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
 // CHECK:         %[[VAL_2918:.*]] = getelementptr inbounds float, float* %[[VAL_2917]], i32 %[[VAL_2894]]
 // CHECK:         %[[VAL_2919:.*]] = load float, float* %[[VAL_2918]], align 4, !invariant.load !92
 // CHECK:         store float %[[VAL_2916]], float* %[[VAL_2878]], align 4
 // CHECK:         store float %[[VAL_2919]], float* %[[VAL_2877]], align 4
-// CHECK:         call void @add_F32(float* %[[VAL_2878]], float* %[[VAL_2877]], float* %[[VAL_2879]])
+// CHECK:         call void @region_1_3(float* %[[VAL_2878]], float* %[[VAL_2877]], float* %[[VAL_2879]])
 // CHECK:         %[[VAL_2920:.*]] = load float, float* %[[VAL_2879]], align 4
-// CHECK:         %[[VAL_2921:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
+// CHECK:         %[[VAL_2921:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
 // CHECK:         %[[VAL_2922:.*]] = getelementptr inbounds float, float* %[[VAL_2921]], i32 %[[VAL_2894]]
 // CHECK:         store float %[[VAL_2920]], float* %[[VAL_2922]], align 4
-// CHECK:         %[[VAL_2923:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
+// CHECK:         %[[VAL_2923:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
 // CHECK:         %[[VAL_2924:.*]] = getelementptr inbounds float, float* %[[VAL_2923]], i32 %[[VAL_2898]]
 // CHECK:         %[[VAL_2925:.*]] = load float, float* %[[VAL_2924]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2926:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
+// CHECK:         %[[VAL_2926:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
 // CHECK:         %[[VAL_2927:.*]] = getelementptr inbounds float, float* %[[VAL_2926]], i32 %[[VAL_2898]]
 // CHECK:         %[[VAL_2928:.*]] = load float, float* %[[VAL_2927]], align 4, !invariant.load !92
 // CHECK:         store float %[[VAL_2925]], float* %[[VAL_2875]], align 4
 // CHECK:         store float %[[VAL_2928]], float* %[[VAL_2874]], align 4
-// CHECK:         call void @add_F32(float* %[[VAL_2875]], float* %[[VAL_2874]], float* %[[VAL_2876]])
+// CHECK:         call void @region_1_3(float* %[[VAL_2875]], float* %[[VAL_2874]], float* %[[VAL_2876]])
 // CHECK:         %[[VAL_2929:.*]] = load float, float* %[[VAL_2876]], align 4
-// CHECK:         %[[VAL_2930:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
+// CHECK:         %[[VAL_2930:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
 // CHECK:         %[[VAL_2931:.*]] = getelementptr inbounds float, float* %[[VAL_2930]], i32 %[[VAL_2898]]
 // CHECK:         store float %[[VAL_2929]], float* %[[VAL_2931]], align 4
-// CHECK:         %[[VAL_2932:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
+// CHECK:         %[[VAL_2932:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
 // CHECK:         %[[VAL_2933:.*]] = getelementptr inbounds float, float* %[[VAL_2932]], i32 %[[VAL_2902]]
 // CHECK:         %[[VAL_2934:.*]] = load float, float* %[[VAL_2933]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2935:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
+// CHECK:         %[[VAL_2935:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
 // CHECK:         %[[VAL_2936:.*]] = getelementptr inbounds float, float* %[[VAL_2935]], i32 %[[VAL_2902]]
 // CHECK:         %[[VAL_2937:.*]] = load float, float* %[[VAL_2936]], align 4, !invariant.load !92
 // CHECK:         store float %[[VAL_2934]], float* %[[VAL_2872]], align 4
 // CHECK:         store float %[[VAL_2937]], float* %[[VAL_2871]], align 4
-// CHECK:         call void @add_F32(float* %[[VAL_2872]], float* %[[VAL_2871]], float* %[[VAL_2873]])
+// CHECK:         call void @region_1_3(float* %[[VAL_2872]], float* %[[VAL_2871]], float* %[[VAL_2873]])
 // CHECK:         %[[VAL_2938:.*]] = load float, float* %[[VAL_2873]], align 4
-// CHECK:         %[[VAL_2939:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
+// CHECK:         %[[VAL_2939:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
 // CHECK:         %[[VAL_2940:.*]] = getelementptr inbounds float, float* %[[VAL_2939]], i32 %[[VAL_2902]]
 // CHECK:         store float %[[VAL_2938]], float* %[[VAL_2940]], align 4
-// CHECK:         %[[VAL_2941:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
+// CHECK:         %[[VAL_2941:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
 // CHECK:         %[[VAL_2942:.*]] = getelementptr inbounds float, float* %[[VAL_2941]], i32 %[[VAL_2906]]
 // CHECK:         %[[VAL_2943:.*]] = load float, float* %[[VAL_2942]], align 4, !invariant.load !92
-// CHECK:         %[[VAL_2944:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
+// CHECK:         %[[VAL_2944:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2885]] to float*
 // CHECK:         %[[VAL_2945:.*]] = getelementptr inbounds float, float* %[[VAL_2944]], i32 %[[VAL_2906]]
 // CHECK:         %[[VAL_2946:.*]] = load float, float* %[[VAL_2945]], align 4, !invariant.load !92
 // CHECK:         store float %[[VAL_2943]], float* %[[VAL_2869]], align 4
 // CHECK:         store float %[[VAL_2946]], float* %[[VAL_2868]], align 4
-// CHECK:         call void @add_F32(float* %[[VAL_2869]], float* %[[VAL_2868]], float* %[[VAL_2870]])
+// CHECK:         call void @region_1_3(float* %[[VAL_2869]], float* %[[VAL_2868]], float* %[[VAL_2870]])
 // CHECK:         %[[VAL_2947:.*]] = load float, float* %[[VAL_2870]], align 4
-// CHECK:         %[[VAL_2948:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2882]] to float*
+// CHECK:         %[[VAL_2948:.*]] = bitcast [100 x [200 x float]]* %[[VAL_2888]] to float*
 // CHECK:         %[[VAL_2949:.*]] = getelementptr inbounds float, float* %[[VAL_2948]], i32 %[[VAL_2906]]
 // CHECK:         store float %[[VAL_2947]], float* %[[VAL_2949]], align 4
 // CHECK:         br label %[[VAL_2912]]
diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
index c2dc912..d10d54d 100644
--- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
@@ -56,7 +56,7 @@
 
   StatusOr<bool> filecheck_result = RunFileCheck(ir_, pattern);
   TF_ASSERT_OK(filecheck_result.status());
-  EXPECT_TRUE(filecheck_result.ValueOrDie());
+  EXPECT_TRUE(filecheck_result.ValueOrDie()) << "Full IR: " << ir_;
 }
 
 void LlvmIrGenTestBase::CompileAndVerifyIr(const string& hlo_text,
@@ -80,7 +80,7 @@
 
   StatusOr<bool> filecheck_result = RunFileCheck(ir_, pattern);
   ASSERT_TRUE(filecheck_result.ok());
-  EXPECT_TRUE(filecheck_result.ValueOrDie());
+  EXPECT_TRUE(filecheck_result.ValueOrDie()) << "Full IR: " << ir_;
 }
 
 void LlvmIrGenTestBase::MatchOptimizedHlo(absl::string_view hlo,