[XLA:GPU] Add layout attributes to LHLO_GPU Convolution operations. - MLIR MemRefs do not preserve layout information correctly when unit dimensions are involved. Operations like convolution that use cuDNN however need the correct layout to be preserved so that we do not end up creating an incompatible combination of input/filter/output layout that is not supported by cuDNN. - Add these layouts to convolution attributes in the form of I64ArrayAttr for representing the layout in "minor_to_major" form similar to XLA. PiperOrigin-RevId: 348034757 Change-Id: I4bbccfc713d136335ac3b436a8b657bd34b98fae

commit: fdcfc2359157aacf6a67f26c1c39ce99d5fdde51 [log] [tgz]
author: Rahul Joshi <jurahul@google.com> Thu Dec 17 09:25:03 2020 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> Thu Dec 17 09:28:50 2020 -0800
tree: a0e82bd3182a320d898532814948912882d6efec
parent: 165b3e83a7b19cecdb08c1a4d81887c524d857e0 [diff]
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td
index 2bf93f7..da7d179 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td

@@ -21,7 +21,17 @@
 def ConvolutionBackendConfigAttr : StructAttr<"ConvolutionBackendConfig",
                                           LHLO_GPU_Dialect, [
    StructFieldAttr<"algorithm", I64Attr>,
-   StructFieldAttr<"tensor_ops_enabled", BoolAttr>]> {
+   StructFieldAttr<"tensor_ops_enabled", BoolAttr>,
+   // The following 3 attributes describe the layout as an array of integers
+   // that list the dimensions in minor-to-major order similar to XLA's layout
+   // representation. operand_0_layout and operand_1_layout describe the layout
+   // of the first 2 operands of the convolution, and result_layout describes
+   // the layout of the primary output operand of the convolution.
+   // Note: Not using names like input_layout or filter_layout as `input` may be
+   // an input operand (for ConvForward) but output for ConvBackward.
+   StructFieldAttr<"operand_0_layout", I64ArrayAttr>,
+   StructFieldAttr<"operand_1_layout", I64ArrayAttr>,
+   StructFieldAttr<"result_layout", I64ArrayAttr>]> {
    let description = "GPU Convolution backend configuration";
 }
 

diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir
index 83327d4..82c455c 100644
--- a/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir

@@ -50,8 +50,11 @@
       feature_group_count = 1,
       batch_group_count = 1,
       result_scale = 1.0,
-      backend_config = {algorithm=0, tensor_ops_enabled = true }
-    }
+      backend_config = {algorithm=0,
+                        operand_0_layout = [3,2,1,0],
+                        operand_1_layout = [3,2,1,0],
+                        result_layout = [3,2,1,0],
+                        tensor_ops_enabled = true}}
     : (memref<1x1x8x8xf16>, memref<1x1x2x2xf16>, memref<1x1x7x7xf16>, memref<32xi8>) -> ()
   return
 }
@@ -60,7 +63,11 @@
 func @conv_backfilter(%input : memref<3x56x56x16xf64>, %filter: memref<3x3x3x64xf64>, %output: memref<54x54x16x64xf64>) {
   %scratch = alloc() : memref<23328xui8>
   "lmhlo_gpu.conv_backwardfilter"(%input, %filter, %output, %scratch)
-    { backend_config = {algorithm = 1 : i64, tensor_ops_enabled = false},
+    { backend_config = {algorithm = 1 : i64,
+                        operand_0_layout = [3,2,1,0],
+                        operand_1_layout = [3,2,1,0],
+                        result_layout = [3,2,1,0],
+                        tensor_ops_enabled = false},
       batch_group_count = 1 : i64,
       dimension_numbers = {input_batch_dimension = 0 : i64,
                            input_feature_dimension = 3 : i64,
@@ -86,7 +93,11 @@
 func @conv_backinput(%input : memref<4x5x16x16xf64>, %filter : memref<5x3x7x7xf64>, %output : memref<4x3x16x16xf64>) {
   %scratch = alloc() : memref<32xui8>
   "lmhlo_gpu.conv_backwardinput"(%input, %filter, %output, %scratch)
-  { backend_config = {algorithm = 1 : i64, tensor_ops_enabled = false},
+    { backend_config = {algorithm = 1 : i64,
+                        operand_0_layout = [3,2,1,0],
+                        operand_1_layout = [3,2,1,0],
+                        result_layout = [3,2,1,0],
+                        tensor_ops_enabled = false},
     batch_group_count = 1 : i64,
     dimension_numbers = {input_batch_dimension = 0 : i64,
                          input_feature_dimension = 1 : i64,
@@ -114,7 +125,11 @@
   %scratch = alloc() : memref<32xui8>
   "lmhlo_gpu.conv_forward_fused"(%input, %filter, %bias, %output, %scratch)
     {activation_mode = "Relu",
-     backend_config = {algorithm = 0 : i64, tensor_ops_enabled = false},
+     backend_config = {algorithm = 1 : i64,
+                       operand_0_layout = [3,2,1,0],
+                       operand_1_layout = [3,2,1,0],
+                       result_layout = [3,2,1,0],
+                       tensor_ops_enabled = false},
      batch_group_count = 1 : i64,
      dimension_numbers = {input_batch_dimension = 0 : i64,
        input_feature_dimension = 1 : i64,
@@ -141,7 +156,11 @@
   %scratch = alloc() : memref<0xui8>
   "lmhlo_gpu.conv_forward_fused_with_side_input"(%input, %filter, %bias, %side_input, %output, %scratch)
     {activation_mode = "Relu",
-     backend_config = {algorithm = 0 : i64, tensor_ops_enabled = false},
+     backend_config = {algorithm = 1 : i64,
+                       operand_0_layout = [3,2,1,0],
+                       operand_1_layout = [3,2,1,0],
+                       result_layout = [3,2,1,0],
+                       tensor_ops_enabled = false},
      batch_group_count = 1 : i64,
      dimension_numbers = {input_batch_dimension = 0 : i64,
        input_feature_dimension = 1 : i64,

diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD
index fb611ab..436ae0d 100644
--- a/tensorflow/compiler/mlir/xla/BUILD
+++ b/tensorflow/compiler/mlir/xla/BUILD

@@ -154,6 +154,7 @@
         "//tensorflow/compiler/xla/service/gpu:backend_configs_cc",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
+        "@com_google_absl//absl/algorithm:container",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",

diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/hlo_text_to_lhlo_no_opt.hlotxt b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/hlo_text_to_lhlo_no_opt.hlotxt
index a212a7c..34db86a 100644
--- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/hlo_text_to_lhlo_no_opt.hlotxt
+++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/hlo_text_to_lhlo_no_opt.hlotxt

@@ -226,6 +226,9 @@
 // CHECK-LABEL: func @main
 // CHECK: "lmhlo_gpu.conv_forward"
 // CHECK-SAME: algorithm = 2 : i64
+// CHECK-SAME: operand_0_layout = [3, 2, 1, 0]
+// CHECK-SAME: operand_1_layout = [3, 2, 1, 0]
+// CHECK-SAME: result_layout = [3, 2, 1, 0]
 // CHECK-SAME: tensor_ops_enabled = false
 // CHECK-SAME: batch_group_count = 1 : i64
 // CHECK-SAME: input_batch_dimension = 0 : i64
@@ -248,7 +251,7 @@
 ENTRY main {
   %input = f32[4,256,3,3]{3,2,1,0} parameter(0)
   %filter = f32[256,256,2,2]{3,2,1,0} parameter(1)
-  ROOT %custom-call.1 = (f32[4,256,2,2]{3,2,1,0}, u8[65536]{0}) custom-call(f32[4,256,3,3]{3,2,1,0} %input, f32[256,256,2,2]{3,2,1,0} %filter),
+  ROOT %custom-call.1 = (f32[4,256,2,2]{3,2, 1,0}, u8[65536]{0}) custom-call(f32[4,256,3,3]{3,2,1,0} %input, f32[256,256,2,2]{3,2,1,0} %filter),
                         window={size=2x2 rhs_reversal=1x1}, dim_labels=bf01_oi01->bf01,
                         custom_call_target="__cudnn$convForward",
                         backend_config="{\"algorithm\":\"2\",\"tensor_ops_enabled\":false,\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_scale\":0}"
@@ -260,6 +263,9 @@
 // CHECK: "lmhlo_gpu.conv_forward_fused"
 // CHECK-SAME: activation_mode = "Relu"
 // CHECK-SAME: algorithm = 0 : i64
+// CHECK-SAME: operand_0_layout = [1, 3, 2, 0]
+// CHECK-SAME: operand_1_layout = [2, 1, 0, 3]
+// CHECK-SAME: result_layout = [1, 3, 2, 0]
 // CHECK-SAME: tensor_ops_enabled = false
 // CHECK-SAME: batch_group_count = 1 : i64
 // CHECK-SAME: input_batch_dimension = 0 : i64
@@ -296,6 +302,9 @@
 // CHECK: "lmhlo_gpu.conv_forward_fused_with_side_input"
 // CHECK-SAME: activation_mode = "Relu"
 // CHECK-SAME: algorithm = 0 : i64
+// CHECK-SAME: operand_0_layout = [1, 3, 2, 0]
+// CHECK-SAME: operand_1_layout = [2, 1, 0, 3]
+// CHECK-SAME: result_layout = [1, 3, 2, 0]
 // CHECK-SAME: tensor_ops_enabled = false
 // CHECK-SAME: batch_group_count = 1 : i64
 // CHECK-SAME: input_batch_dimension = 0 : i64

diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc
index 83e47d5..f4b2165 100644
--- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc

@@ -19,6 +19,7 @@
 #include <memory>
 #include <tuple>
 
+#include "absl/algorithm/container.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
@@ -659,6 +660,13 @@
   TF_ASSIGN_OR_RETURN(const xla::gpu::CudnnConvKind kind,
                       xla::gpu::GetCudnnConvKind(custom_call));
 
+  auto get_layout_attribute = [&](const xla::Layout& layout) {
+    std::vector<int64_t> minor_to_major(layout.minor_to_major_size());
+    absl::c_transform(layout.minor_to_major(), minor_to_major.begin(),
+                      [](xla::int64 x) { return static_cast<int64_t>(x); });
+    return builder_.getI64ArrayAttr(minor_to_major);
+  };
+
   auto set_common_conv_attributes = [&, this](auto op) -> Operation* {
     const xla::Window& window = custom_call->window();
     // Window size for Cudnn Conv is same as the kernel size.
@@ -703,6 +711,9 @@
     auto config = mlir::lmhlo_gpu::ConvolutionBackendConfig::get(
         builder_.getI64IntegerAttr(backend_config.algorithm()),
         builder_.getBoolAttr(backend_config.tensor_ops_enabled()),
+        get_layout_attribute(custom_call->operand(0)->shape().layout()),
+        get_layout_attribute(custom_call->operand(1)->shape().layout()),
+        get_layout_attribute(custom_call->shape().tuple_shapes(0).layout()),
         builder_.getContext());
     op.backend_configAttr(config);
commit	fdcfc2359157aacf6a67f26c1c39ce99d5fdde51	[log] [tgz]
author	Rahul Joshi <jurahul@google.com>	Thu Dec 17 09:25:03 2020 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	Thu Dec 17 09:28:50 2020 -0800
tree	a0e82bd3182a320d898532814948912882d6efec
parent	165b3e83a7b19cecdb08c1a4d81887c524d857e0 [diff]