// tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir - platform/external/tensorflow - Git at Google

 // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure

 // Tests extraction of outside compiled ops at the head of a TPU computation.

 module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} {
   // A single outside-compiled op ("tf.A") is the first op of the cluster body.
   // Expects "tf.A" inside a tf_device.launch placed on the worker CPU device,
   // with "tf.C" still being the last op of the cluster.
   // CHECK-LABEL: func @single_head_outside_compilation
   func @single_head_outside_compilation(%arg0: tensor<i32>) {
     // CHECK:      tf_device.launch
     // CHECK:        "tf.A"
     // CHECK-NEXT:   tf_device.return
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK:        "tf.C"
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> ()
       "tf.B"() : () -> ()
       "tf.C"() : () -> ()
       tf_device.return
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> ()
     return
   }

   // The outside-compiled head op "tf.A" has no operands. Expects it inside a
   // CPU launch whose result becomes the operand of "tf.B"; "tf.B" and "tf.C"
   // are the only ops left in the cluster.
   // CHECK-LABEL: func @ops_no_operands
   func @ops_no_operands() {
     // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:        %[[A:.*]] = "tf.A"
     // CHECK-NEXT:   tf_device.return %[[A]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.B"(%[[HOST]])
     // CHECK-NEXT:   "tf.C"
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor<i32>
       %b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
       "tf.C"(%b) : (tensor<i32>) -> ()
       tf_device.return
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> ()
     return
   }

   // The outside-compiled head op "tf.B" consumes a value produced by "tf.A",
   // which is defined before the cluster. Expects the CPU launch on the line
   // right after "tf.A", and the launch result feeding "tf.C" in the cluster.
   // CHECK-LABEL: func @op_operand_outside_cluster
   func @op_operand_outside_cluster() {
     // CHECK:      %[[A:.*]] = "tf.A"
     %a = "tf.A"() : () -> tensor<i32>
     // CHECK-NEXT: %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:        %[[B:.*]] = "tf.B"
     // CHECK-NEXT:   tf_device.return %[[B]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.C"(%[[HOST]])
     // CHECK-NEXT:   "tf.D"
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %c = "tf.C"(%b) : (tensor<i32>) -> tensor<i32>
       "tf.D"(%c) : (tensor<i32>) -> ()
       tf_device.return
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> ()
     return
   }

   // The cluster returns three values: the result of the outside-compiled head
   // op "tf.A", then "tf.C", then "tf.B". Expects a two-result cluster that
   // returns "tf.C" and "tf.B", with the function returning the launch result
   // in the first position followed by the two cluster results.
   // CHECK-LABEL: func @aliased_output
   func @aliased_output() -> (tensor<i32>, tensor<i32>, tensor<i32>) {
     // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:        %[[A:.*]] = "tf.A"
     // CHECK-NEXT:   tf_device.return %[[A]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      %[[CLUSTER:.*]]:2 = "tf_device.cluster"
     // CHECK-NEXT:   %[[B:.*]] = "tf.B"(%[[HOST]])
     // CHECK-NEXT:   %[[C:.*]] = "tf.C"
     // CHECK-NEXT:   tf_device.return %[[C]], %[[B]]
     // CHECK-NEXT: {
     // CHECK-DAG:  num_cores_per_replica = 1
     // CHECK-DAG:  step_marker_location = ""
     // CHECK-DAG:  padding_map = []
     // CHECK-DAG:  topology = ""
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      return %[[HOST]], %[[CLUSTER]]#0, %[[CLUSTER]]#1
     %cluster:3 = "tf_device.cluster"() ( {
       %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor<i32>
       %b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
       %c = "tf.C"(%b) : (tensor<i32>) -> tensor<i32>
       tf_device.return %a, %c, %b : tensor<i32>, tensor<i32>, tensor<i32>
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> (tensor<i32>, tensor<i32>, tensor<i32>)
     return %cluster#0, %cluster#1, %cluster#2 : tensor<i32>, tensor<i32>, tensor<i32>
   }

   // Every op in the cluster ("tf.A", "tf.B", "tf.C") is outside compiled.
   // Expects all three inside the CPU launch, a cluster whose body is only
   // tf_device.return, and the function returning the launch result.
   // CHECK-LABEL: func @all_head_computation_ops
   func @all_head_computation_ops(%arg0: tensor<i32>) -> tensor<i32> {
     // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:        %[[A:.*]] = "tf.A"
     // CHECK:        %[[B:.*]] = "tf.B"(%[[A]])
     // CHECK:        %[[C:.*]] = "tf.C"(%[[B]], %arg0)
     // CHECK-NEXT:   tf_device.return %[[C]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   tf_device.return
     //
     // CHECK:      return %[[HOST]]
     %cluster = "tf_device.cluster"() ( {
       %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %c = "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
       tf_device.return %c : tensor<i32>
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> tensor<i32>
     return %cluster : tensor<i32>
   }

   // Three consecutive outside-compiled ops ("tf.A", "tf.B", "tf.C") precede the
   // non-outside-compiled "tf.D". Expects all three in one CPU launch that
   // returns the "tf.B" result, which "tf.D" then consumes inside the cluster.
   // CHECK-LABEL: func @multiple_head_outside_compilation
   func @multiple_head_outside_compilation(%arg0: tensor<i32>) {
     // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:        %[[A:.*]] = "tf.A"
     // CHECK:        %[[B:.*]] = "tf.B"(%[[A]])
     // CHECK:        "tf.C"
     // CHECK-NEXT:   tf_device.return %[[B]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK:        "tf.D"(%[[HOST]])
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> ()
       "tf.D"(%b) : (tensor<i32>) -> ()
       tf_device.return
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> ()
     return
   }

   // The only outside-compiled op ("tf.B") sits between two ops that are not
   // outside compiled and reads the result of "tf.A". Expects no launch ahead
   // of the cluster and "tf.A", "tf.B", "tf.C" on consecutive lines inside it.
   // CHECK-LABEL: func @no_extraction_middle_outside_compiled_ops
   func @no_extraction_middle_outside_compiled_ops(%arg0: tensor<i32>) {
     // CHECK-NOT:  tf_device.launch
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.A"
     // CHECK-NEXT:   "tf.B"
     // CHECK-NEXT:   "tf.C"
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       %a = "tf.A"(%arg0) : (tensor<i32>) -> tensor<i32>
       %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       "tf.C"(%b) : (tensor<i32>) -> ()
       tf_device.return
     }) {
       num_cores_per_replica = 1,
       step_marker_location = "",
       padding_map = [],
       topology = "",
       device_assignment = []
     } : () -> ()
     return
   }

   // "tf.A", "tf.C" and "tf.D" are all marked outside compiled, but "tf.C" reads
   // the result of "tf.B", which is not. Expects only "tf.A" and "tf.D" inside
   // the CPU launch, while "tf.B", "tf.C" and "tf.E" stay in the cluster.
   // CHECK-LABEL: func @no_extraction_tpu_op_operands
   func @no_extraction_tpu_op_operands(%arg0: tensor<i32>) {
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"()
     // CHECK:        %[[A_OUT:.*]] = "tf.A"
     // CHECK:        %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]])
     // CHECK-NEXT:   tf_device.return %[[D_OUT]]
     // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // Data flow of the ops left behind is verified as well: "tf.C" must still
     // read "tf.B", and "tf.E" must read the launch result that stands in for
     // "tf.D". This mirrors the operand checks of the replicated variant.
     // CHECK:      "tf_device.cluster"
     // CHECK:        %[[B_OUT:.*]] = "tf.B"
     // CHECK:        "tf.C"(%arg0, %[[B_OUT]])
     // CHECK:        "tf.E"(%[[LAUNCH_OUT]])
     // CHECK-NEXT:   tf_device.return
     "tf_device.cluster"() ( {
       %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %1 = "tf.B"() {} : () -> tensor<i32>
       %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
       %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor<i32>) -> tensor<i32>
       %4 = "tf.E"(%3) {} : (tensor<i32>) -> tensor<i32>
       tf_device.return
     }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> ()
     return
   }

   // Same op layout as the non-replicated operand-dependency case, but the
   // cluster is nested in a tf_device.replicate and the head op reads the
   // replicated input. Expects the launch inside the replicate with device
   // "TPU_REPLICATED_HOST", holding "tf.A" and "tf.D"; "tf.B", "tf.C" and "tf.E"
   // stay in the cluster, with "tf.E" reading the launch result.
   // CHECK-LABEL: func @replicated_head_outside_compilation
   func @replicated_head_outside_compilation(%arg0: tensor<i32>, %arg1: tensor<i32>) {
     // CHECK:      tf_device.replicate([%arg0, %arg1] as %[[REPLICA_IN:.*]]: tensor<i32>)
     //
     // CHECK-NEXT:   %[[HOST:.*]] = "tf_device.launch"()
     // CHECK:          %[[A:.*]] = "tf.A"(%[[REPLICA_IN]])
     // CHECK:          %[[D:.*]] = "tf.D"(%[[A]])
     // CHECK-NEXT:     tf_device.return %[[D]]
     // CHECK:        device = "TPU_REPLICATED_HOST"
     //
     // CHECK:        "tf_device.cluster"
     // CHECK:          %[[B:.*]] = "tf.B"
     // CHECK:          "tf.C"(%[[REPLICA_IN]], %[[B]])
     // CHECK:          "tf.E"(%[[HOST]])
     // CHECK-NEXT:     tf_device.return
     tf_device.replicate([%arg0, %arg1] as %replica_in : tensor<i32>) {n = 2 : i32} {
       "tf_device.cluster"() ( {
         %a = "tf.A"(%replica_in) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
         %b = "tf.B"() : () -> tensor<i32>
         %c = "tf.C"(%replica_in, %b) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
         %d = "tf.D"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
         %e = "tf.E"(%d) : (tensor<i32>) -> tensor<i32>
         tf_device.return
       }) {
         num_cores_per_replica = 1,
         step_marker_location = "",
         padding_map = [],
         topology = "",
         device_assignment = []
       } : () -> ()
       tf_device.return
     }
     return
   }
 }
	// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure

	// Tests extraction of outside compiled ops at the head of a TPU computation.

	module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} {
	// A single outside-compiled op ("tf.A") is the first op of the cluster body.
	// Expects "tf.A" inside a tf_device.launch placed on the worker CPU device,
	// with "tf.C" still being the last op of the cluster.
	// CHECK-LABEL: func @single_head_outside_compilation
	func @single_head_outside_compilation(%arg0: tensor<i32>) {
	  // CHECK:      tf_device.launch
	  // CHECK:        "tf.A"
	  // CHECK-NEXT:   tf_device.return
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      "tf_device.cluster"
	  // CHECK:        "tf.C"
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> ()
	    "tf.B"() : () -> ()
	    "tf.C"() : () -> ()
	    tf_device.return
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> ()
	  return
	}

	// The outside-compiled head op "tf.A" has no operands. Expects it inside a
	// CPU launch whose result becomes the operand of "tf.B"; "tf.B" and "tf.C"
	// are the only ops left in the cluster.
	// CHECK-LABEL: func @ops_no_operands
	func @ops_no_operands() {
	  // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:        %[[A:.*]] = "tf.A"
	  // CHECK-NEXT:   tf_device.return %[[A]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      "tf_device.cluster"
	  // CHECK-NEXT:   "tf.B"(%[[HOST]])
	  // CHECK-NEXT:   "tf.C"
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor<i32>
	    %b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
	    "tf.C"(%b) : (tensor<i32>) -> ()
	    tf_device.return
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> ()
	  return
	}

	// The outside-compiled head op "tf.B" consumes a value produced by "tf.A",
	// which is defined before the cluster. Expects the CPU launch on the line
	// right after "tf.A", and the launch result feeding "tf.C" in the cluster.
	// CHECK-LABEL: func @op_operand_outside_cluster
	func @op_operand_outside_cluster() {
	  // CHECK:      %[[A:.*]] = "tf.A"
	  %a = "tf.A"() : () -> tensor<i32>
	  // CHECK-NEXT: %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:        %[[B:.*]] = "tf.B"
	  // CHECK-NEXT:   tf_device.return %[[B]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      "tf_device.cluster"
	  // CHECK-NEXT:   "tf.C"(%[[HOST]])
	  // CHECK-NEXT:   "tf.D"
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    %c = "tf.C"(%b) : (tensor<i32>) -> tensor<i32>
	    "tf.D"(%c) : (tensor<i32>) -> ()
	    tf_device.return
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> ()
	  return
	}

	// The cluster returns three values: the result of the outside-compiled head
	// op "tf.A", then "tf.C", then "tf.B". Expects a two-result cluster that
	// returns "tf.C" and "tf.B", with the function returning the launch result
	// in the first position followed by the two cluster results.
	// CHECK-LABEL: func @aliased_output
	func @aliased_output() -> (tensor<i32>, tensor<i32>, tensor<i32>) {
	  // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:        %[[A:.*]] = "tf.A"
	  // CHECK-NEXT:   tf_device.return %[[A]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      %[[CLUSTER:.*]]:2 = "tf_device.cluster"
	  // CHECK-NEXT:   %[[B:.*]] = "tf.B"(%[[HOST]])
	  // CHECK-NEXT:   %[[C:.*]] = "tf.C"
	  // CHECK-NEXT:   tf_device.return %[[C]], %[[B]]
	  // CHECK-NEXT: {
	  // CHECK-DAG:  num_cores_per_replica = 1
	  // CHECK-DAG:  step_marker_location = ""
	  // CHECK-DAG:  padding_map = []
	  // CHECK-DAG:  topology = ""
	  // CHECK-DAG:  device_assignment = []
	  //
	  // CHECK:      return %[[HOST]], %[[CLUSTER]]#0, %[[CLUSTER]]#1
	  %cluster:3 = "tf_device.cluster"() ( {
	    %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor<i32>
	    %b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
	    %c = "tf.C"(%b) : (tensor<i32>) -> tensor<i32>
	    tf_device.return %a, %c, %b : tensor<i32>, tensor<i32>, tensor<i32>
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> (tensor<i32>, tensor<i32>, tensor<i32>)
	  return %cluster#0, %cluster#1, %cluster#2 : tensor<i32>, tensor<i32>, tensor<i32>
	}

	// Every op in the cluster ("tf.A", "tf.B", "tf.C") is outside compiled.
	// Expects all three inside the CPU launch, a cluster whose body is only
	// tf_device.return, and the function returning the launch result.
	// CHECK-LABEL: func @all_head_computation_ops
	func @all_head_computation_ops(%arg0: tensor<i32>) -> tensor<i32> {
	  // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:        %[[A:.*]] = "tf.A"
	  // CHECK:        %[[B:.*]] = "tf.B"(%[[A]])
	  // CHECK:        %[[C:.*]] = "tf.C"(%[[B]], %arg0)
	  // CHECK-NEXT:   tf_device.return %[[C]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      "tf_device.cluster"
	  // CHECK-NEXT:   tf_device.return
	  //
	  // CHECK:      return %[[HOST]]
	  %cluster = "tf_device.cluster"() ( {
	    %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    %c = "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
	    tf_device.return %c : tensor<i32>
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> tensor<i32>
	  return %cluster : tensor<i32>
	}

	// Three consecutive outside-compiled ops ("tf.A", "tf.B", "tf.C") precede the
	// non-outside-compiled "tf.D". Expects all three in one CPU launch that
	// returns the "tf.B" result, which "tf.D" then consumes inside the cluster.
	// CHECK-LABEL: func @multiple_head_outside_compilation
	func @multiple_head_outside_compilation(%arg0: tensor<i32>) {
	  // CHECK:      %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:        %[[A:.*]] = "tf.A"
	  // CHECK:        %[[B:.*]] = "tf.B"(%[[A]])
	  // CHECK:        "tf.C"
	  // CHECK-NEXT:   tf_device.return %[[B]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // CHECK:      "tf_device.cluster"
	  // CHECK:        "tf.D"(%[[HOST]])
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> ()
	    "tf.D"(%b) : (tensor<i32>) -> ()
	    tf_device.return
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> ()
	  return
	}

	// The only outside-compiled op ("tf.B") sits between two ops that are not
	// outside compiled and reads the result of "tf.A". Expects no launch ahead
	// of the cluster and "tf.A", "tf.B", "tf.C" on consecutive lines inside it.
	// CHECK-LABEL: func @no_extraction_middle_outside_compiled_ops
	func @no_extraction_middle_outside_compiled_ops(%arg0: tensor<i32>) {
	  // CHECK-NOT:  tf_device.launch
	  // CHECK:      "tf_device.cluster"
	  // CHECK-NEXT:   "tf.A"
	  // CHECK-NEXT:   "tf.B"
	  // CHECK-NEXT:   "tf.C"
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    %a = "tf.A"(%arg0) : (tensor<i32>) -> tensor<i32>
	    %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    "tf.C"(%b) : (tensor<i32>) -> ()
	    tf_device.return
	  }) {
	    num_cores_per_replica = 1,
	    step_marker_location = "",
	    padding_map = [],
	    topology = "",
	    device_assignment = []
	  } : () -> ()
	  return
	}

	// "tf.A", "tf.C" and "tf.D" are all marked outside compiled, but "tf.C" reads
	// the result of "tf.B", which is not. Expects only "tf.A" and "tf.D" inside
	// the CPU launch, while "tf.B", "tf.C" and "tf.E" stay in the cluster.
	// CHECK-LABEL: func @no_extraction_tpu_op_operands
	func @no_extraction_tpu_op_operands(%arg0: tensor<i32>) {
	  // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"()
	  // CHECK:        %[[A_OUT:.*]] = "tf.A"
	  // CHECK:        %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]])
	  // CHECK-NEXT:   tf_device.return %[[D_OUT]]
	  // CHECK:      device = "/job:worker/replica:0/task:0/device:CPU:0"
	  //
	  // Data flow of the ops left behind is verified as well: "tf.C" must still
	  // read "tf.B", and "tf.E" must read the launch result that stands in for
	  // "tf.D". This mirrors the operand checks of the replicated variant.
	  // CHECK:      "tf_device.cluster"
	  // CHECK:        %[[B_OUT:.*]] = "tf.B"
	  // CHECK:        "tf.C"(%arg0, %[[B_OUT]])
	  // CHECK:        "tf.E"(%[[LAUNCH_OUT]])
	  // CHECK-NEXT:   tf_device.return
	  "tf_device.cluster"() ( {
	    %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	    %1 = "tf.B"() {} : () -> tensor<i32>
	    %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
	    %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor<i32>) -> tensor<i32>
	    %4 = "tf.E"(%3) {} : (tensor<i32>) -> tensor<i32>
	    tf_device.return
	  }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> ()
	  return
	}

	// Same op layout as the non-replicated operand-dependency case, but the
	// cluster is nested in a tf_device.replicate and the head op reads the
	// replicated input. Expects the launch inside the replicate with device
	// "TPU_REPLICATED_HOST", holding "tf.A" and "tf.D"; "tf.B", "tf.C" and "tf.E"
	// stay in the cluster, with "tf.E" reading the launch result.
	// CHECK-LABEL: func @replicated_head_outside_compilation
	func @replicated_head_outside_compilation(%arg0: tensor<i32>, %arg1: tensor<i32>) {
	  // CHECK:      tf_device.replicate([%arg0, %arg1] as %[[REPLICA_IN:.*]]: tensor<i32>)
	  //
	  // CHECK-NEXT:   %[[HOST:.*]] = "tf_device.launch"()
	  // CHECK:          %[[A:.*]] = "tf.A"(%[[REPLICA_IN]])
	  // CHECK:          %[[D:.*]] = "tf.D"(%[[A]])
	  // CHECK-NEXT:     tf_device.return %[[D]]
	  // CHECK:        device = "TPU_REPLICATED_HOST"
	  //
	  // CHECK:        "tf_device.cluster"
	  // CHECK:          %[[B:.*]] = "tf.B"
	  // CHECK:          "tf.C"(%[[REPLICA_IN]], %[[B]])
	  // CHECK:          "tf.E"(%[[HOST]])
	  // CHECK-NEXT:     tf_device.return
	  tf_device.replicate([%arg0, %arg1] as %replica_in : tensor<i32>) {n = 2 : i32} {
	    "tf_device.cluster"() ( {
	      %a = "tf.A"(%replica_in) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	      %b = "tf.B"() : () -> tensor<i32>
	      %c = "tf.C"(%replica_in, %b) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
	      %d = "tf.D"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
	      %e = "tf.E"(%d) : (tensor<i32>) -> tensor<i32>
	      tf_device.return
	    }) {
	      num_cores_per_replica = 1,
	      step_marker_location = "",
	      padding_map = [],
	      topology = "",
	      device_assignment = []
	    } : () -> ()
	    tf_device.return
	  }
	  return
	}
	}