// RUN: mlir-opt %s -dma-generate -canonicalize | FileCheck %s
// Index of the buffer for the second DMA is remapped.
// CHECK-DAG: [[MAP:#map[0-9]+]] = (d0) -> (d0 - 256)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0 * 16 + d1)
// CHECK-DAG: #map{{[0-9]+}} = (d0, d1) -> (d0, d1)
// CHECK-DAG: [[MAP_INDEX_DIFF:#map[0-9]+]] = (d0, d1, d2, d3) -> (d2 - d0, d3 - d1)
// CHECK-DAG: [[MAP_MINUS_ONE:#map[0-9]+]] = (d0, d1) -> (d0 - 1, d1)
// CHECK-DAG: [[MAP_ORIG_ACCESS:#map[0-9]+]] = (d0, d1)[s0, s1] -> (d0, d1 + s0 + s1)
// CHECK-DAG: [[MAP_SUB_OFFSET:#map[0-9]+]] = (d0, d1, d2) -> (d1, d2 - (d0 + 9))
// CHECK-LABEL: func @loop_nest_1d() {
func @loop_nest_1d() {
%A = alloc() : memref<256 x f32>
%B = alloc() : memref<512 x f32>
%F = alloc() : memref<256 x f32, 1>
// First DMA buffer.
// CHECK: %3 = alloc() : memref<256xf32, 1>
// Tag for first DMA.
// CHECK: %4 = alloc() : memref<1xi32>
// First DMA transfer.
// CHECK: dma_start %0[%c0], %3[%c0], %c256, %4[%c0] : memref<256xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK: dma_wait %4[%c0], %c256 : memref<1xi32>
// Second DMA buffer.
// CHECK: %5 = alloc() : memref<256xf32, 1>
// Tag for second DMA.
// CHECK: %6 = alloc() : memref<1xi32>
// Second DMA transfer.
// CHECK: dma_start %1[%c256], %5[%c0], %c256, %6[%c0] : memref<512xf32>, memref<256xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c256 : memref<1xi32>
// CHECK: for %i0 = 0 to 256 {
// CHECK: %7 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %8 = load %3[%7] : memref<256xf32, 1>
// CHECK: %9 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK: %10 = affine_apply [[MAP]](%9)
// CHECK-NEXT: %11 = load %5[%10] : memref<256xf32, 1>
// Already in faster memory space.
// CHECK: %12 = load %2[%i0] : memref<256xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: return
for %i = 0 to 256 {
load %A[%i] : memref<256 x f32>
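// %B is only accessed at offsets [256, 511]; the generated DMA copies just that
// 256-element region (note the %c256 source index in the dma_start above) and
// buffer accesses are remapped by [[MAP]], i.e. (d0 - 256).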
%idx = affine_apply (d0) -> (d0 + 256)(%i)
load %B[%idx] : memref<512 x f32>
load %F[%i] : memref<256 x f32, 1>
}
return
}
// CHECK-LABEL: func @loop_nest_high_d
// CHECK: %c16384 = constant 16384 : index
// CHECK-NEXT: %0 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32>
// INCOMING DMA for B.
// CHECK-NEXT: dma_start %arg1[%c0, %c0], %0[%c0, %c0], %c16384, %1[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %2 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32>
// INCOMING DMA for A.
// CHECK-NEXT: dma_start %arg0[%c0, %c0], %2[%c0, %c0], %c16384, %3[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %4 = alloc() : memref<512x32xf32, 1>
// CHECK-NEXT: %5 = alloc() : memref<1xi32>
// INCOMING DMA for C.
// CHECK-NEXT: dma_start %arg2[%c0, %c0], %4[%c0, %c0], %c16384, %5[%c0] : memref<512x32xf32>, memref<512x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %5[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: %6 = alloc() : memref<1xi32>
// CHECK-NEXT: for %i0 = 0 to 32 {
// CHECK-NEXT: for %i1 = 0 to 32 {
// CHECK-NEXT: for %i2 = 0 to 32 {
// CHECK-NEXT: for %i3 = 0 to 16 {
// CHECK-NEXT: %7 = affine_apply #map{{[0-9]+}}(%i1, %i3)
// CHECK-NEXT: %8 = affine_apply #map{{[0-9]+}}(%7, %i0)
// CHECK-NEXT: %9 = load %0[%8#0, %8#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "foo"(%9) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i4 = 0 to 16 {
// CHECK-NEXT: %10 = affine_apply #map{{[0-9]+}}(%i2, %i4)
// CHECK-NEXT: %11 = affine_apply #map{{[0-9]+}}(%10, %i1)
// CHECK-NEXT: %12 = load %2[%11#0, %11#1] : memref<512x32xf32, 1>
// CHECK-NEXT: "bar"(%12) {mxu_id: 0} : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: for %i5 = 0 to 16 {
// CHECK-NEXT: %13 = "abc_compute"() : () -> f32
// CHECK-NEXT: %14 = affine_apply #map{{[0-9]+}}(%i2, %i5)
// CHECK-NEXT: %15 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: %16 = load %4[%15#0, %15#1] : memref<512x32xf32, 1>
// CHECK-NEXT: %17 = "addf32"(%13, %16) : (f32, f32) -> f32
// CHECK-NEXT: %18 = affine_apply #map{{[0-9]+}}(%14, %i0)
// CHECK-NEXT: store %17, %4[%18#0, %18#1] : memref<512x32xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: "foobar"() : () -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// OUTGOING DMA for C.
// CHECK-NEXT: dma_start %4[%c0, %c0], %arg2[%c0, %c0], %c16384, %6[%c0] : memref<512x32xf32, 1>, memref<512x32xf32>, memref<1xi32>
// CHECK-NEXT: dma_wait %6[%c0], %c16384 : memref<1xi32>
// CHECK-NEXT: return
// CHECK-NEXT:}
func @loop_nest_high_d(%A: memref<512 x 32 x f32>,
%B: memref<512 x 32 x f32>, %C: memref<512 x 32 x f32>) {
// DMAs will be performed at this level (%jT is the first loop without a stride).
// A and B are read, while C is both read and written. A total of three new buffers
// are allocated and the existing loads/stores are replaced by accesses to those buffers.
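// Since C is written inside the innermost loop, an outgoing DMA (checked above)
// copies its buffer back to %arg2 after the nest completes.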
for %jT = 0 to 32 {
for %kT = 0 to 32 {
for %iT = 0 to 32 {
for %kk = 0 to 16 { // k intratile
%k = affine_apply (d0, d1) -> (16*d0 + d1) (%kT, %kk)
%v0 = load %B[%k, %jT] : memref<512 x 32 x f32>
"foo"(%v0) : (f32) -> ()
}
for %ii = 0 to 16 { // i intratile.
%i = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii)
%v1 = load %A[%i, %kT] : memref<512 x 32 x f32>
"bar"(%v1) {mxu_id: 0} : (f32) -> ()
}
for %ii_ = 0 to 16 { // i intratile.
%v2 = "abc_compute"() : () -> f32
%i_ = affine_apply (d0, d1) -> (16*d0 + d1)(%iT, %ii_)
%v3 = load %C[%i_, %jT] : memref<512 x 32 x f32>
%v4 = "addf32"(%v2, %v3) : (f32, f32) -> (f32)
store %v4, %C[%i_, %jT] : memref<512 x 32 x f32>
}
"foobar"() : () -> ()
}
}
}
return
}
// A loop nest with a modulo 2 access. A strided DMA is not needed here since
// only a contiguous 1x2 region within a 256 x 8 memref is transferred.
//
// CHECK-LABEL: func @loop_nest_modulo() {
// CHECK: %0 = alloc() : memref<256x8xf32>
// CHECK-NEXT: for %i0 = 0 to 32 step 4 {
// CHECK-NEXT: %1 = affine_apply #map{{[0-9]+}}(%i0)
// CHECK-NEXT: %2 = alloc() : memref<1x2xf32, 1>
// CHECK-NEXT: %3 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %0[%1, %c0], %2[%c0, %c0], %c2, %3[%c0] : memref<256x8xf32>, memref<1x2xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %3[%c0], %c2 : memref<1xi32>
// CHECK-NEXT: for %i1 = 0 to 8 {
// ...
// ...
// CHECK: }
// CHECK-NEXT: }
// CHECK-NEXT: return
func @loop_nest_modulo() {
%A = alloc() : memref<256 x 8 x f32>
for %i = 0 to 32 step 4 {
// DMAs will be performed at this level (%j is the first unit stride loop)
for %j = 0 to 8 {
%idx = affine_apply (d0) -> (d0 mod 2) (%j)
// For a fixed %i, %idx only takes the values 0 and 1, so a 1 x 2 buffer is
// allocated (original memref was 256 x 8).
%v = load %A[%i, %idx] : memref<256 x 8 x f32>
}
}
return
}
// DMA on tiled loop nest. This also tests the case where the bounds are
// dependent on outer loop IVs.
// CHECK-LABEL: func @loop_nest_tiled() -> memref<256x1024xf32> {
func @loop_nest_tiled() -> memref<256x1024xf32> {
%0 = alloc() : memref<256x1024xf32>
for %i0 = 0 to 256 step 32 {
for %i1 = 0 to 1024 step 32 {
// CHECK: %3 = alloc() : memref<32x32xf32, 1>
// CHECK-NEXT: %4 = alloc() : memref<1xi32>
// Strided DMA here: 32 x 32 tile in a 256 x 1024 memref.
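// The trailing %c1024, %c32 operands are the stride and the number of elements
// to transfer per stride: 32 contiguous elements are copied for every
// 1024-element stride (one row of the 256 x 1024 memref).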
// CHECK-NEXT: dma_start %0[%1, %2], %3[%c0, %c0], %c1024, %4[%c0], %c1024, %c32 : memref<256x1024xf32>, memref<32x32xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait
// CHECK-NEXT: for %i2 = #map
// CHECK-NEXT: for %i3 = #map
for %i2 = (d0) -> (d0)(%i0) to (d0) -> (d0 + 32)(%i0) {
for %i3 = (d0) -> (d0)(%i1) to (d0) -> (d0 + 32)(%i1) {
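// Indices into the 32 x 32 buffer are tile-local: [[MAP_INDEX_DIFF]] subtracts
// the tile origin (%i0, %i1) from the original indices (%i2, %i3).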
// CHECK: %5 = affine_apply [[MAP_INDEX_DIFF]](%i0, %i1, %i2, %i3)
// CHECK-NEXT: %6 = load %3[%5#0, %5#1] : memref<32x32xf32, 1>
%1 = load %0[%i2, %i3] : memref<256x1024xf32>
} // CHECK-NEXT: }
}
}
}
// CHECK: return %0 : memref<256x1024xf32>
return %0 : memref<256x1024xf32>
}
// CHECK-LABEL: func @dma_constant_dim_access
func @dma_constant_dim_access(%A : memref<100x100xf32>) {
%one = constant 1 : index
%N = constant 100 : index
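// The row index below is the constant 1, so only row 1 is accessed: a contiguous
// 1 x 100 region, which is why a 1 x 100 buffer and an unstrided transfer suffice.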
// CHECK: %0 = alloc() : memref<1x100xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32>
// No strided DMA needed here.
// CHECK: dma_start %arg0[%c1, %c0], %0[%c0, %c0], %c100, %1[%c0] : memref<100x100xf32>, memref<1x100xf32, 1>,
// CHECK-NEXT: dma_wait %1[%c0], %c100 : memref<1xi32>
for %i = 0 to 100 {
for %j = 0 to ()[s0] -> (s0) ()[%N] {
// CHECK: %2 = affine_apply [[MAP_MINUS_ONE]](%c1, %i1)
// CHECK-NEXT: %3 = load %0[%2#0, %2#1] : memref<1x100xf32, 1>
load %A[%one, %j] : memref<100 x 100 x f32>
}
}
return
}
// CHECK-LABEL: func @dma_with_symbolic_accesses
func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
%N = constant 9 : index
for %i = 0 to 100 {
for %j = 0 to 100 {
%idx = affine_apply (d0, d1) [s0, s1] -> (d0, d1 + s0 + s1)(%i, %j)[%M, %N]
load %A[%idx#0, %idx#1] : memref<100 x 100 x f32>
}
}
return
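// Since the column offset %M + %N (i.e. %arg1 + 9) is symbolic, the DMA source
// below starts at a column offset (%0) derived from it, and buffer accesses
// subtract the same offset via [[MAP_SUB_OFFSET]].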
// CHECK: %1 = alloc() : memref<100x100xf32, 1>
// CHECK-NEXT: %2 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %arg0[%c0, %0], %1[%c0, %c0], %c10000, %2[%c0]
// CHECK-NEXT: dma_wait %2[%c0], %c10000
// CHECK-NEXT: for %i0 = 0 to 100 {
// CHECK-NEXT: for %i1 = 0 to 100 {
// CHECK-NEXT: %3 = affine_apply [[MAP_ORIG_ACCESS]](%i0, %i1)[%arg1, %c9]
// CHECK-NEXT: %4 = affine_apply [[MAP_SUB_OFFSET]](%arg1, %3#0, %3#1)
// CHECK-NEXT: %5 = load %1[%4#0, %4#1] : memref<100x100xf32, 1>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
}
// CHECK-LABEL: func @dma_with_symbolic_loop_bounds
func @dma_with_symbolic_loop_bounds(%A : memref<100x100xf32>, %M : index, %N: index) {
%K = constant 9 : index
// The buffer size can't be bounded by a constant smaller than the original
// memref size; so the DMA buffer is the entire 100x100.
// CHECK: %0 = alloc() : memref<100x100xf32, 1>
// CHECK-NEXT: %1 = alloc() : memref<1xi32>
// CHECK-NEXT: dma_start %arg0[%c0, %c0], %0[%c0, %c0], %c10000, %1[%c0] : memref<100x100xf32>, memref<100x100xf32, 1>, memref<1xi32>
// CHECK-NEXT: dma_wait %1[%c0], %c10000 : memref<1xi32>
for %i = 0 to 100 {
for %j = %M to %N {
%idx = affine_apply (d0, d1) [s0] -> (d0, d1 + s0)(%i, %j)[%K]
load %A[%idx#0, %idx#1] : memref<100 x 100 x f32>
}
}
return
}
// CHECK-LABEL: func @dma_unknown_size
func @dma_unknown_size(%arg0: memref<?x?xf32>) {
%M = dim %arg0, 0 : memref<? x ? x f32>
%N = dim %arg0, 1 : memref<? x ? x f32>
for %i = 0 to %M {
for %j = 0 to %N {
// Since this loop nest isn't tiled, the accessed region has a non-constant
// size; a non-constant size DMA is not yet implemented, so the load is left
// untouched.
// CHECK: %2 = load %arg0[%i0, %i1] : memref<?x?xf32>
load %arg0[%i, %j] : memref<? x ? x f32>
}
}
return
}
// CHECK-LABEL: func @dma_memref_3d
func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) {
for %i = 0 to 1024 {
for %j = 0 to 1024 {
for %k = 0 to 1024 {
%idx = affine_apply (d0, d1, d2) -> (d0 mod 128, d1 mod 128, d2 mod 128)(%i, %j, %k)
// DMA with nested striding (or emulating it with a loop around a strided DMA)
// is not yet implemented.
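// The mod-128 access touches a 128 x 128 x 128 sub-block; copying it would need
// striding in two dimensions, so no buffer is generated and the load below still
// reads from %arg0.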
// CHECK: %3 = load %arg0[%2#0, %2#1, %2#2] : memref<1024x1024x1024xf32>
%v = load %arg0[%idx#0, %idx#1, %idx#2] : memref<1024 x 1024 x 1024 x f32>
}
}
}
return
}