// blob: 4aaacb13634cf471259791cf83f098e3a6222630 [file] [log] [blame]
// RUN: tf-tfrt-opt %s -allow-unregistered-dialect -split-input-file \
// RUN: -tf-jitrt-peel-tiled-loops -cse -canonicalize | FileCheck %s
// Candidate tile sizes for the 1-D test: the constant 8 and the distance
// from d0 to 102401. The test feeds this map to affine.min.
#map0 = affine_map<(d0) -> (8, -d0 + 102401)>
// Layout map attached to the 1-D subview result types in the test below:
// index d0 shifted by a symbolic offset s0.
#map1 = affine_map<(d0)[s0] -> (d0 + s0)>
// Test input: a 1-D tanh over a statically shaped memref, tiled with step 8.
// The upper bound 102401 is not a multiple of 8, so the subview sizes in the
// loop body are computed with affine.min and have a dynamic extent.
func @tanh_1d(%arg0: memref<102401xf32>) -> memref<102401xf32> {
  %c102401 = arith.constant 102401 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<102401xf32>
  linalg.tiled_loop (%arg1) = (%c0) to (%c102401) step (%c8)
      ins (%arg2 = %arg0: memref<102401xf32>)
      outs (%arg3 = %0: memref<102401xf32>) {
    // Tile size: min(8, 102401 - %arg1).
    %1 = affine.min #map0(%arg1)
    %2 = memref.subview %arg2[%arg1] [%1] [1]
      : memref<102401xf32> to memref<?xf32, #map1>
    %3 = memref.subview %arg3[%arg1] [%1] [1]
      : memref<102401xf32> to memref<?xf32, #map1>
    // %cst is the padding value of the vector read.
    %4 = vector.transfer_read %2[%c0], %cst
      : memref<?xf32, #map1>, vector<8xf32>
    %5 = math.tanh %4 : vector<8xf32>
    vector.transfer_write %5, %3[%c0] : vector<8xf32>, memref<?xf32, #map1>
    memref.copy %3, %3 : memref<?xf32, #map1> to memref<?xf32, #map1>
    // The loop has no tensor outputs, so the terminator carries no operands.
    linalg.yield
  }
  return %0 : memref<102401xf32>
}
// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>
// CHECK-LABEL: func @tanh_1d
// CHECK: linalg.tiled_loop
// CHECK: memref.subview
// CHECK-SAME: memref<102401xf32> to memref<8xf32, #[[$MAP]]>
// CHECK: memref.subview
// CHECK-SAME: memref<102401xf32> to memref<8xf32, #[[$MAP]]>
// CHECK: linalg.tiled_loop
// CHECK: memref.subview
// CHECK-SAME: memref<102401xf32> to memref<?xf32, #[[$MAP]]>
// CHECK: memref.subview
// CHECK-SAME: memref<102401xf32> to memref<?xf32, #[[$MAP]]>
// -----
// Test input: a 3-D tiled loop with dynamic upper bounds and steps (8, 1, 8).
// The body holds only an unregistered op so that the loop is not removed as
// dead code by the cleanup passes that run after peeling.
func @tanh_3d(%d0: index, %d1: index, %d2: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  linalg.tiled_loop (%arg1, %arg2, %arg3) = (%c0, %c0, %c0)
      to (%d0, %d1, %d2) step (%c8, %c1, %c8)
      ins () outs () {
    "prevent.dce"() : () -> ()
    // Explicit terminator: the preceding op is unregistered, so the parser
    // cannot rely on inserting the implicit one.
    linalg.yield
  }
  return
}

// Expected output: three loops. The first runs up to the split points of
// dimensions 0 and 2; the other two start at the split point of dimension 0
// and of dimension 2 respectively. The function arguments are matched through
// the D0/D1/D2 captures rather than through their printed SSA names.
// CHECK-LABEL: func @tanh_3d(
// CHECK-SAME: %[[D0:[a-z0-9]+]]: index, %[[D1:[a-z0-9]+]]: index,
// CHECK-SAME: %[[D2:[a-z0-9]+]]: index) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
// CHECK-DAG: %[[SPLIT0:.*]] = affine.apply{{.*}}%[[D0]]
// CHECK-DAG: %[[SPLIT2:.*]] = affine.apply{{.*}}%[[D2]]
// CHECK: linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[C0]])
// CHECK-SAME: to (%[[SPLIT0]], %[[D1]], %[[SPLIT2]])
// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])
// CHECK: linalg.tiled_loop{{.*}}(%[[SPLIT0]], %[[C0]], %[[C0]])
// CHECK-SAME: to (%[[D0]], %[[D1]], %[[SPLIT2]])
// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])
// CHECK: linalg.tiled_loop{{.*}}(%[[C0]], %[[C0]], %[[SPLIT2]])
// CHECK-SAME: to (%[[D0]], %[[D1]], %[[D2]])
// CHECK-SAME: step (%[[C8]], %[[C1]], %[[C8]])
// -----
// Test input: column-wise sum of a dynamically shaped 2-D tensor, tiled 4x4.
// The loop iterates (%i over %dim_Y, %j over %dim_X); %i is the parallel
// dimension and %j the reduction dimension. The input slice is taken at
// [%j, %i] and read through a transposing indexing map.
func @reduce_column_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %dim_X = tensor.dim %in, %c0 : tensor<?x?xf32>
  %dim_Y = tensor.dim %in, %c1 : tensor<?x?xf32>
  // Zero-filled accumulator with one element per column.
  %1 = linalg.init_tensor [%dim_Y] : tensor<?xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<?xf32> -> tensor<?xf32>
  %5 = linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%dim_Y, %dim_X)
      step (%c4, %c4)
      ins (%in_ = %in: tensor<?x?xf32>, %cst_ = %cst: f32)
      outs (%out_ = %2: tensor<?xf32>)
      iterators["parallel", "reduction"] {
    // Tile extents: min(4, bound - induction variable) per dimension.
    %6 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%j)[%dim_X]
    %9 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%i)[%dim_Y]
    %8 = tensor.extract_slice %in_[%j, %i] [%6, %9] [1, 1]
      : tensor<?x?xf32> to tensor<?x?xf32>
    %11 = tensor.extract_slice %out_[%i] [%9] [1]
      : tensor<?xf32> to tensor<?xf32>
    // Partial sum of this tile, accumulated into a freshly filled slice.
    %12 = linalg.fill(%cst_, %11) : f32, tensor<?xf32> -> tensor<?xf32>
    %13 = linalg.generic {
        indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
                         affine_map<(d0, d1) -> (d0)>],
        iterator_types = ["parallel", "reduction"]}
        ins(%8 : tensor<?x?xf32>)
        outs(%12 : tensor<?xf32>) {
      ^bb0(%arg6: f32, %arg7: f32):
        %16 = arith.addf %arg6, %arg7 : f32
        linalg.yield %16 : f32
    } -> tensor<?xf32>
    // Combine the partial sum with the current contents of the output slice.
    %14 = linalg.generic {
        indexing_maps = [affine_map<(d0) -> (d0)>,
                         affine_map<(d0) -> (d0)>],
        iterator_types = ["parallel"]}
        ins(%13 : tensor<?xf32>)
        outs(%11 : tensor<?xf32>) {
      ^bb0(%arg6: f32, %arg7: f32):
        %16 = arith.addf %arg6, %arg7 : f32
        linalg.yield %16 : f32
    } -> tensor<?xf32>
    %15 = tensor.insert_slice %14 into %out_[%i] [%9] [1]
      : tensor<?xf32> into tensor<?xf32>
    linalg.yield %15 : tensor<?xf32>
  }
  return %5 : tensor<?xf32>
}
// CHECK-LABEL: func @reduce_column_sum_2d_dynamic
// CHECK: linalg.fill
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<4x4xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<4xf32>
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<4x?xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?xf32> to tensor<?xf32>
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?xf32> to tensor<?xf32>
// -----
// Test input: row-wise sum of a dynamically shaped 2-D tensor, tiled 4x4.
// The loop iterates (%i over %dim_X, %j over %dim_Y); %i is the parallel
// dimension and %j the reduction dimension. The input slice is taken at
// [%i, %j] and read through an identity indexing map.
func @reduce_row_sum_2d_dynamic(%in: tensor<?x?xf32>) -> tensor<?xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %dim_X = tensor.dim %in, %c0 : tensor<?x?xf32>
  %dim_Y = tensor.dim %in, %c1 : tensor<?x?xf32>
  // Zero-filled accumulator with one element per row.
  %1 = linalg.init_tensor [%dim_X] : tensor<?xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<?xf32> -> tensor<?xf32>
  %5 = linalg.tiled_loop (%i, %j) = (%c0, %c0) to (%dim_X, %dim_Y)
      step (%c4, %c4)
      ins (%in_ = %in: tensor<?x?xf32>, %cst_ = %cst: f32)
      outs (%out_ = %2: tensor<?xf32>)
      iterators["parallel", "reduction"] {
    // Tile extents: min(4, bound - induction variable) per dimension.
    %6 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%i)[%dim_X]
    %7 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%j)[%dim_Y]
    %8 = tensor.extract_slice %in_[%i, %j] [%6, %7] [1, 1]
      : tensor<?x?xf32> to tensor<?x?xf32>
    %11 = tensor.extract_slice %out_[%i] [%6] [1]
      : tensor<?xf32> to tensor<?xf32>
    // Partial sum of this tile, accumulated into a freshly filled slice.
    %12 = linalg.fill(%cst_, %11) : f32, tensor<?xf32> -> tensor<?xf32>
    %13 = linalg.generic {
        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                         affine_map<(d0, d1) -> (d0)>],
        iterator_types = ["parallel", "reduction"]}
        ins(%8 : tensor<?x?xf32>)
        outs(%12 : tensor<?xf32>) {
      ^bb0(%arg6: f32, %arg7: f32):
        %16 = arith.addf %arg6, %arg7 : f32
        linalg.yield %16 : f32
    } -> tensor<?xf32>
    // Combine the partial sum with the current contents of the output slice.
    %14 = linalg.generic {
        indexing_maps = [affine_map<(d0) -> (d0)>,
                         affine_map<(d0) -> (d0)>],
        iterator_types = ["parallel"]}
        ins(%13 : tensor<?xf32>)
        outs(%11 : tensor<?xf32>) {
      ^bb0(%arg6: f32, %arg7: f32):
        %16 = arith.addf %arg6, %arg7 : f32
        linalg.yield %16 : f32
    } -> tensor<?xf32>
    %15 = tensor.insert_slice %14 into %out_[%i] [%6] [1]
      : tensor<?xf32> into tensor<?xf32>
    linalg.yield %15 : tensor<?xf32>
  }
  return %5 : tensor<?xf32>
}
// CHECK-LABEL: func @reduce_row_sum_2d_dynamic
// CHECK: linalg.fill
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<4x4xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<4xf32>
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<?x4xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?xf32> to tensor<?xf32>
// CHECK: linalg.tiled_loop
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: tensor.extract_slice
// CHECK-SAME: tensor<?xf32> to tensor<?xf32>