// tensorflow/compiler/mlir/tfrt/tests/jit/tf_jitrt_peel_tiled_loops.mlir - platform/external/tensorflow - Git at Google

 // RUN: tf-tfrt-opt %s -allow-unregistered-dialect -split-input-file \
 // RUN: -tf-jitrt-peel-tiled-loops -cse -canonicalize | FileCheck %s

 #map0 = affine_map<(d0) -> (8, -d0 + 102401)>
 #map1 = affine_map<(d0)[s0] -> (d0 + s0)>

 // Element-wise tanh over a 102401-element buffer, walked in steps of 8.
 // 102401 is not a multiple of 8, so the last iteration covers a shorter
 // tile; #map0 clamps the tile size to min(8, 102401 - offset).
 func @tanh_1d(%arg0: memref<102401xf32>) -> memref<102401xf32> {
   %len = arith.constant 102401 : index
   %tile = arith.constant 8 : index
   %pad = arith.constant 0.000000e+00 : f32
   %zero = arith.constant 0 : index
   %result = memref.alloc() : memref<102401xf32>
   linalg.tiled_loop (%iv) = (%zero) to (%len) step (%tile)
       ins (%src = %arg0: memref<102401xf32>)
       outs (%dst = %result: memref<102401xf32>) {
     // Size of the tile that starts at %iv.
     %tile_size = affine.min #map0(%iv)
     %src_tile = memref.subview %src[%iv] [%tile_size] [1]
         : memref<102401xf32> to memref<?xf32, #map1>
     %dst_tile = memref.subview %dst[%iv] [%tile_size] [1]
         : memref<102401xf32> to memref<?xf32, #map1>
     // Read an 8-wide vector from the input tile; %pad is the padding operand.
     %vec = vector.transfer_read %src_tile[%zero], %pad
         : memref<?xf32, #map1>, vector<8xf32>
     %tanh = math.tanh %vec : vector<8xf32>
     vector.transfer_write %tanh, %dst_tile[%zero]
         : vector<8xf32>, memref<?xf32, #map1>
     // The output tile is copied onto itself, exactly as in the original test.
     memref.copy %dst_tile, %dst_tile
         : memref<?xf32, #map1> to memref<?xf32, #map1>
     linalg.yield
   }
   return %result : memref<102401xf32>
 }

 // CHECK-DAG:  #[[$MAP:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

 // CHECK-LABEL: func @tanh_1d

 // CHECK:       linalg.tiled_loop
 // CHECK:           memref.subview
 // CHECK-SAME:        memref<102401xf32> to memref<8xf32, #[[$MAP]]>
 // CHECK:           memref.subview
 // CHECK-SAME:        memref<102401xf32> to memref<8xf32, #[[$MAP]]>

 // CHECK:       linalg.tiled_loop
 // CHECK:           memref.subview
 // CHECK-SAME:        memref<102401xf32> to memref<?xf32, #[[$MAP]]>
 // CHECK:           memref.subview
 // CHECK-SAME:        memref<102401xf32> to memref<?xf32, #[[$MAP]]>

 // -----

 // 3-D tiled loop over dynamic upper bounds with steps (8, 1, 8). The body
 // holds only an unregistered "prevent.dce" op, presumably so the loop is
 // not removed as dead code.
 func @tanh_3d(%d0: index, %d1: index, %d2: index) {
   %zero = arith.constant 0 : index
   %one = arith.constant 1 : index
   %eight = arith.constant 8 : index
   linalg.tiled_loop (%i, %j, %k) = (%zero, %zero, %zero)
       to (%d0, %d1, %d2)
       step (%eight, %one, %eight)
       ins () outs () {
     "prevent.dce"() : () -> ()
     linalg.yield
   }
   return
 }

 // CHECK-LABEL: func @tanh_3d(
 // CHECK-SAME:    %[[D0:[a-z0-9]+]]: index, %[[D1:[a-z0-9]+]]: index,
 // CHECK-SAME:    %[[D2:[a-z0-9]+]]: index) {
 // CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:     %[[C8:.*]] = arith.constant 8 : index

 // CHECK-DAG:     %[[SPLIT0:.*]] = affine.apply{{.*}}%[[D0]]
 // CHECK-DAG:     %[[SPLIT2:.*]] = affine.apply{{.*}}%[[D2]]

 // CHECK:     linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[C0]])
 // CHECK-SAME:  to (%[[SPLIT0]], %[[D1]], %[[SPLIT2]])
 // CHECK-SAME:  step  (%[[C8]], %[[C1]], %[[C8]])

 // CHECK:     linalg.tiled_loop{{.*}}(%[[SPLIT0]], %[[C0]], %[[C0]])
 // CHECK-SAME:  to (%[[D0]], %[[D1]], %[[SPLIT2]])
 // CHECK-SAME:  step  (%[[C8]], %[[C1]], %[[C8]])

 // CHECK:     linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[SPLIT2]])
 // CHECK-SAME:  to (%[[D0]], %[[D1]], %[[D2]])
 // CHECK-SAME:  step  (%[[C8]], %[[C1]], %[[C8]])

 // -----

 // Sums a dynamically shaped 2-D tensor along dimension 0, producing one
 // value per index of dimension 1. The loop is tiled 4x4: the first
 // induction variable walks dimension 1 ("parallel"), the second walks
 // dimension 0 ("reduction").
 func @reduce_column_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %idx1 = arith.constant 1 : index
   %tile = arith.constant 4 : index
   %idx0 = arith.constant 0 : index

   %dim0 = tensor.dim %in, %idx0 : tensor<?x?xf32>
   %dim1 = tensor.dim %in, %idx1 : tensor<?x?xf32>

   %init = linalg.init_tensor [%dim1] : tensor<?xf32>
   %filled = linalg.fill(%zero, %init) : f32, tensor<?xf32> -> tensor<?xf32>
   %result = linalg.tiled_loop (%par, %red) = (%idx0, %idx0) to (%dim1, %dim0)
       step (%tile, %tile)
       ins (%in_arg = %in: tensor<?x?xf32>, %zero_arg = %zero: f32)
       outs (%acc = %filled: tensor<?xf32>)
       iterators["parallel", "reduction"] {
     // Tile extents, clamped to what is left of each dimension.
     %size_red = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%red)[%dim0]
     %size_par = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%par)[%dim1]

     %in_tile = tensor.extract_slice %in_arg[%red, %par] [%size_red, %size_par] [1, 1]
         : tensor<?x?xf32> to tensor<?x?xf32>
     %acc_tile = tensor.extract_slice %acc[%par] [%size_par] [1]
         : tensor<?xf32> to tensor<?xf32>

     // Reduce the input tile into a zero-filled partial result.
     %partial_init = linalg.fill(%zero_arg, %acc_tile)
         : f32, tensor<?xf32> -> tensor<?xf32>
     %partial = linalg.generic {
         indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
                          affine_map<(d0, d1) -> (d0)>],
         iterator_types = ["parallel", "reduction"]}
         ins(%in_tile : tensor<?x?xf32>)
         outs(%partial_init : tensor<?xf32>) {
       ^bb0(%elem: f32, %running: f32):
         %sum = arith.addf %elem, %running : f32
         linalg.yield %sum : f32
       } -> tensor<?xf32>
     // Add the partial result onto the current output tile.
     %combined = linalg.generic {
         indexing_maps = [affine_map<(d0) -> (d0)>,
                          affine_map<(d0) -> (d0)>],
         iterator_types = ["parallel"]}
         ins(%partial : tensor<?xf32>)
         outs(%acc_tile : tensor<?xf32>) {
       ^bb0(%elem: f32, %running: f32):
         %sum = arith.addf %elem, %running : f32
         linalg.yield %sum : f32
       } -> tensor<?xf32>
     %updated = tensor.insert_slice %combined into %acc[%par] [%size_par] [1]
         : tensor<?xf32> into tensor<?xf32>
     linalg.yield %updated : tensor<?xf32>
   }
   return %result : tensor<?xf32>
 }

 // CHECK-LABEL: func @reduce_column_sum_2d_dynamic

 // CHECK:       linalg.fill
 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<4x4xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<4xf32>

 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<4x?xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?xf32> to tensor<?xf32>

 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<?x?xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?xf32> to tensor<?xf32>

 // -----


 // Sums a dynamically shaped 2-D tensor along dimension 1, producing one
 // value per index of dimension 0. The loop is tiled 4x4: the first
 // induction variable walks dimension 0 ("parallel"), the second walks
 // dimension 1 ("reduction").
 func @reduce_row_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
   %zero = arith.constant 0.000000e+00 : f32
   %idx0 = arith.constant 0 : index
   %tile = arith.constant 4 : index
   %idx1 = arith.constant 1 : index

   %dim0 = tensor.dim %in, %idx0 : tensor<?x?xf32>
   %dim1 = tensor.dim %in, %idx1 : tensor<?x?xf32>

   %init = linalg.init_tensor [%dim0] : tensor<?xf32>
   %filled = linalg.fill(%zero, %init) : f32, tensor<?xf32> -> tensor<?xf32>
   %result = linalg.tiled_loop (%par, %red) = (%idx0, %idx0) to (%dim0, %dim1)
       step (%tile, %tile)
       ins (%in_arg = %in: tensor<?x?xf32>, %zero_arg = %zero: f32)
       outs (%acc = %filled: tensor<?xf32>)
       iterators["parallel", "reduction"] {
     // Tile extents, clamped to what is left of each dimension.
     %size_par = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%par)[%dim0]
     %size_red = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%red)[%dim1]

     %in_tile = tensor.extract_slice %in_arg[%par, %red] [%size_par, %size_red] [1, 1]
         : tensor<?x?xf32> to tensor<?x?xf32>
     %acc_tile = tensor.extract_slice %acc[%par] [%size_par] [1]
         : tensor<?xf32> to tensor<?xf32>
     // Reduce the input tile into a zero-filled partial result.
     %partial_init = linalg.fill(%zero_arg, %acc_tile)
         : f32, tensor<?xf32> -> tensor<?xf32>
     %partial = linalg.generic {
         indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                          affine_map<(d0, d1) -> (d0)>],
         iterator_types = ["parallel", "reduction"]}
         ins(%in_tile : tensor<?x?xf32>)
         outs(%partial_init : tensor<?xf32>) {
       ^bb0(%elem: f32, %running: f32):
         %sum = arith.addf %elem, %running : f32
         linalg.yield %sum : f32
       } -> tensor<?xf32>
     // Add the partial result onto the current output tile.
     %combined = linalg.generic {
         indexing_maps = [affine_map<(d0) -> (d0)>,
                          affine_map<(d0) -> (d0)>],
         iterator_types = ["parallel"]}
         ins(%partial : tensor<?xf32>)
         outs(%acc_tile : tensor<?xf32>) {
       ^bb0(%elem: f32, %running: f32):
         %sum = arith.addf %elem, %running : f32
         linalg.yield %sum : f32
       } -> tensor<?xf32>
     %updated = tensor.insert_slice %combined into %acc[%par] [%size_par] [1]
         : tensor<?xf32> into tensor<?xf32>
     linalg.yield %updated : tensor<?xf32>
   }
   return %result : tensor<?xf32>
 }

 // CHECK-LABEL: func @reduce_row_sum_2d_dynamic

 // CHECK:       linalg.fill
 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<4x4xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<4xf32>

 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<?x4xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?xf32> to tensor<?xf32>

 // CHECK:       linalg.tiled_loop
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?x?xf32> to tensor<?x?xf32>
 // CHECK:           tensor.extract_slice
 // CHECK-SAME:        tensor<?xf32> to tensor<?xf32>
	// RUN: tf-tfrt-opt %s -allow-unregistered-dialect -split-input-file \
	// RUN: -tf-jitrt-peel-tiled-loops -cse -canonicalize | FileCheck %s

	#map0 = affine_map<(d0) -> (8, -d0 + 102401)>
	#map1 = affine_map<(d0)[s0] -> (d0 + s0)>

	// Element-wise tanh over a 102401-element buffer, walked in steps of 8.
	// 102401 is not a multiple of 8, so the last iteration covers a shorter
	// tile; #map0 clamps the tile size to min(8, 102401 - offset).
	func @tanh_1d(%arg0: memref<102401xf32>) -> memref<102401xf32> {
	  %len = arith.constant 102401 : index
	  %tile = arith.constant 8 : index
	  %pad = arith.constant 0.000000e+00 : f32
	  %zero = arith.constant 0 : index
	  %result = memref.alloc() : memref<102401xf32>
	  linalg.tiled_loop (%iv) = (%zero) to (%len) step (%tile)
	      ins (%src = %arg0: memref<102401xf32>)
	      outs (%dst = %result: memref<102401xf32>) {
	    // Size of the tile that starts at %iv.
	    %tile_size = affine.min #map0(%iv)
	    %src_tile = memref.subview %src[%iv] [%tile_size] [1]
	        : memref<102401xf32> to memref<?xf32, #map1>
	    %dst_tile = memref.subview %dst[%iv] [%tile_size] [1]
	        : memref<102401xf32> to memref<?xf32, #map1>
	    // Read an 8-wide vector from the input tile; %pad is the padding operand.
	    %vec = vector.transfer_read %src_tile[%zero], %pad
	        : memref<?xf32, #map1>, vector<8xf32>
	    %tanh = math.tanh %vec : vector<8xf32>
	    vector.transfer_write %tanh, %dst_tile[%zero]
	        : vector<8xf32>, memref<?xf32, #map1>
	    // The output tile is copied onto itself, exactly as in the original test.
	    memref.copy %dst_tile, %dst_tile
	        : memref<?xf32, #map1> to memref<?xf32, #map1>
	    linalg.yield
	  }
	  return %result : memref<102401xf32>
	}

	// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

	// CHECK-LABEL: func @tanh_1d

	// CHECK: linalg.tiled_loop
	// CHECK: memref.subview
	// CHECK-SAME: memref<102401xf32> to memref<8xf32, #[[$MAP]]>
	// CHECK: memref.subview
	// CHECK-SAME: memref<102401xf32> to memref<8xf32, #[[$MAP]]>

	// CHECK: linalg.tiled_loop
	// CHECK: memref.subview
	// CHECK-SAME: memref<102401xf32> to memref<?xf32, #[[$MAP]]>
	// CHECK: memref.subview
	// CHECK-SAME: memref<102401xf32> to memref<?xf32, #[[$MAP]]>

	// -----

	// 3-D tiled loop over dynamic upper bounds with steps (8, 1, 8). The body
	// holds only an unregistered "prevent.dce" op, presumably so the loop is
	// not removed as dead code.
	func @tanh_3d(%d0: index, %d1: index, %d2: index) {
	  %zero = arith.constant 0 : index
	  %one = arith.constant 1 : index
	  %eight = arith.constant 8 : index
	  linalg.tiled_loop (%i, %j, %k) = (%zero, %zero, %zero)
	      to (%d0, %d1, %d2)
	      step (%eight, %one, %eight)
	      ins () outs () {
	    "prevent.dce"() : () -> ()
	    linalg.yield
	  }
	  return
	}

	// CHECK-LABEL: func @tanh_3d(
	// CHECK-SAME: %[[D0:[a-z0-9]+]]: index, %[[D1:[a-z0-9]+]]: index,
	// CHECK-SAME: %[[D2:[a-z0-9]+]]: index) {
	// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
	// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
	// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index

	// CHECK-DAG: %[[SPLIT0:.*]] = affine.apply{{.*}}%[[D0]]
	// CHECK-DAG: %[[SPLIT2:.*]] = affine.apply{{.*}}%[[D2]]

	// CHECK: linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[C0]])
	// CHECK-SAME: to (%[[SPLIT0]], %[[D1]], %[[SPLIT2]])
	// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])

	// CHECK: linalg.tiled_loop{{.*}}(%[[SPLIT0]], %[[C0]], %[[C0]])
	// CHECK-SAME: to (%[[D0]], %[[D1]], %[[SPLIT2]])
	// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])

	// CHECK: linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[SPLIT2]])
	// CHECK-SAME: to (%[[D0]], %[[D1]], %[[D2]])
	// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])

	// -----

	// Sums a dynamically shaped 2-D tensor along dimension 0, producing one
	// value per index of dimension 1. The loop is tiled 4x4: the first
	// induction variable walks dimension 1 ("parallel"), the second walks
	// dimension 0 ("reduction").
	func @reduce_column_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
	  %zero = arith.constant 0.000000e+00 : f32
	  %idx1 = arith.constant 1 : index
	  %tile = arith.constant 4 : index
	  %idx0 = arith.constant 0 : index

	  %dim0 = tensor.dim %in, %idx0 : tensor<?x?xf32>
	  %dim1 = tensor.dim %in, %idx1 : tensor<?x?xf32>

	  %init = linalg.init_tensor [%dim1] : tensor<?xf32>
	  %filled = linalg.fill(%zero, %init) : f32, tensor<?xf32> -> tensor<?xf32>
	  %result = linalg.tiled_loop (%par, %red) = (%idx0, %idx0) to (%dim1, %dim0)
	      step (%tile, %tile)
	      ins (%in_arg = %in: tensor<?x?xf32>, %zero_arg = %zero: f32)
	      outs (%acc = %filled: tensor<?xf32>)
	      iterators["parallel", "reduction"] {
	    // Tile extents, clamped to what is left of each dimension.
	    %size_red = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%red)[%dim0]
	    %size_par = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%par)[%dim1]

	    %in_tile = tensor.extract_slice %in_arg[%red, %par] [%size_red, %size_par] [1, 1]
	        : tensor<?x?xf32> to tensor<?x?xf32>
	    %acc_tile = tensor.extract_slice %acc[%par] [%size_par] [1]
	        : tensor<?xf32> to tensor<?xf32>

	    // Reduce the input tile into a zero-filled partial result.
	    %partial_init = linalg.fill(%zero_arg, %acc_tile)
	        : f32, tensor<?xf32> -> tensor<?xf32>
	    %partial = linalg.generic {
	        indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
	                         affine_map<(d0, d1) -> (d0)>],
	        iterator_types = ["parallel", "reduction"]}
	        ins(%in_tile : tensor<?x?xf32>)
	        outs(%partial_init : tensor<?xf32>) {
	      ^bb0(%elem: f32, %running: f32):
	        %sum = arith.addf %elem, %running : f32
	        linalg.yield %sum : f32
	      } -> tensor<?xf32>
	    // Add the partial result onto the current output tile.
	    %combined = linalg.generic {
	        indexing_maps = [affine_map<(d0) -> (d0)>,
	                         affine_map<(d0) -> (d0)>],
	        iterator_types = ["parallel"]}
	        ins(%partial : tensor<?xf32>)
	        outs(%acc_tile : tensor<?xf32>) {
	      ^bb0(%elem: f32, %running: f32):
	        %sum = arith.addf %elem, %running : f32
	        linalg.yield %sum : f32
	      } -> tensor<?xf32>
	    %updated = tensor.insert_slice %combined into %acc[%par] [%size_par] [1]
	        : tensor<?xf32> into tensor<?xf32>
	    linalg.yield %updated : tensor<?xf32>
	  }
	  return %result : tensor<?xf32>
	}

	// CHECK-LABEL: func @reduce_column_sum_2d_dynamic

	// CHECK: linalg.fill
	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<4x4xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<4xf32>

	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<4x?xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?xf32> to tensor<?xf32>

	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<?x?xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?xf32> to tensor<?xf32>

	// -----


	// Sums a dynamically shaped 2-D tensor along dimension 1, producing one
	// value per index of dimension 0. The loop is tiled 4x4: the first
	// induction variable walks dimension 0 ("parallel"), the second walks
	// dimension 1 ("reduction").
	func @reduce_row_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
	  %zero = arith.constant 0.000000e+00 : f32
	  %idx0 = arith.constant 0 : index
	  %tile = arith.constant 4 : index
	  %idx1 = arith.constant 1 : index

	  %dim0 = tensor.dim %in, %idx0 : tensor<?x?xf32>
	  %dim1 = tensor.dim %in, %idx1 : tensor<?x?xf32>

	  %init = linalg.init_tensor [%dim0] : tensor<?xf32>
	  %filled = linalg.fill(%zero, %init) : f32, tensor<?xf32> -> tensor<?xf32>
	  %result = linalg.tiled_loop (%par, %red) = (%idx0, %idx0) to (%dim0, %dim1)
	      step (%tile, %tile)
	      ins (%in_arg = %in: tensor<?x?xf32>, %zero_arg = %zero: f32)
	      outs (%acc = %filled: tensor<?xf32>)
	      iterators["parallel", "reduction"] {
	    // Tile extents, clamped to what is left of each dimension.
	    %size_par = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%par)[%dim0]
	    %size_red = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%red)[%dim1]

	    %in_tile = tensor.extract_slice %in_arg[%par, %red] [%size_par, %size_red] [1, 1]
	        : tensor<?x?xf32> to tensor<?x?xf32>
	    %acc_tile = tensor.extract_slice %acc[%par] [%size_par] [1]
	        : tensor<?xf32> to tensor<?xf32>
	    // Reduce the input tile into a zero-filled partial result.
	    %partial_init = linalg.fill(%zero_arg, %acc_tile)
	        : f32, tensor<?xf32> -> tensor<?xf32>
	    %partial = linalg.generic {
	        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                         affine_map<(d0, d1) -> (d0)>],
	        iterator_types = ["parallel", "reduction"]}
	        ins(%in_tile : tensor<?x?xf32>)
	        outs(%partial_init : tensor<?xf32>) {
	      ^bb0(%elem: f32, %running: f32):
	        %sum = arith.addf %elem, %running : f32
	        linalg.yield %sum : f32
	      } -> tensor<?xf32>
	    // Add the partial result onto the current output tile.
	    %combined = linalg.generic {
	        indexing_maps = [affine_map<(d0) -> (d0)>,
	                         affine_map<(d0) -> (d0)>],
	        iterator_types = ["parallel"]}
	        ins(%partial : tensor<?xf32>)
	        outs(%acc_tile : tensor<?xf32>) {
	      ^bb0(%elem: f32, %running: f32):
	        %sum = arith.addf %elem, %running : f32
	        linalg.yield %sum : f32
	      } -> tensor<?xf32>
	    %updated = tensor.insert_slice %combined into %acc[%par] [%size_par] [1]
	        : tensor<?xf32> into tensor<?xf32>
	    linalg.yield %updated : tensor<?xf32>
	  }
	  return %result : tensor<?xf32>
	}

	// CHECK-LABEL: func @reduce_row_sum_2d_dynamic

	// CHECK: linalg.fill
	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<4x4xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<4xf32>

	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<?x4xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?xf32> to tensor<?xf32>

	// CHECK: linalg.tiled_loop
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?x?xf32> to tensor<?x?xf32>
	// CHECK: tensor.extract_slice
	// CHECK-SAME: tensor<?xf32> to tensor<?xf32>