| // Copyright 2021 The TensorFlow Runtime Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Pattern to lower lmhlo convolution ops to tfrt_gpu dialect. |
| #include <sys/types.h> |
| |
| #include <algorithm> |
| #include <functional> |
| |
| #include "mlir-hlo/Dialect/lhlo_gpu/IR/lhlo_gpu_ops.h" |
| #include "tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pattern_utils.h" |
| #include "tensorflow/compiler/mlir/xla/attribute_exporter.h" |
| #include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h" |
| #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" |
| #include "tensorflow/compiler/xla/service/llvm_ir/llvm_type_conversion_util.h" |
| #include "tensorflow/compiler/xla/shape.h" |
| #include "tensorflow/compiler/xla/shape_util.h" |
| #include "tfrt/gpu/kernels/gpu_ops.h" // from @tf_runtime |
| #include "tfrt/gpu/passes/passes.h" // from @tf_runtime |
| #include "tfrt/gpu/wrapper/cudnn_wrapper.h" // from @tf_runtime |
| #include "tfrt/gpu/wrapper/dnn_wrapper.h" // from @tf_runtime |
| #include "tfrt/basic_kernels/opdefs/types.h" // from @tf_runtime |
| |
| namespace tensorflow { |
| namespace { |
| |
| template <class ConvolutionOpType> |
| void FillConvDescriptor(ConvolutionOpType op, Value result, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| auto apply_layout = [](const xla::Shape& shape, |
| mlir::ArrayRef<int64_t> minor_to_major) { |
| return xla::ShapeUtil::MakeShapeWithLayout( |
| shape.element_type(), shape.dimensions(), minor_to_major); |
| }; |
| |
| descriptor.operand0_shape = |
| apply_layout(xla::gpu::GetShape(op->getOperand(0)), |
| op.getBackendConfig().getOperand_0Layout()); |
| descriptor.operand1_shape = |
| apply_layout(xla::gpu::GetShape(op->getOperand(1)), |
| op.getBackendConfig().getOperand_1Layout()); |
| descriptor.result_shape = apply_layout( |
| xla::gpu::GetShape(result), op.getBackendConfig().getResultLayout()); |
| descriptor.dnums = xla::ConvertConvDimensionNumbers(op.getDimensionNumbers()); |
| descriptor.scratch_size = 0; // Not used for op lowering. |
| mlir::DenseIntElementsAttr window_strides = op.getWindowStrides().getValue(); |
| mlir::DenseIntElementsAttr lhs_dilation = op.getLhsDilation().getValue(); |
| mlir::DenseIntElementsAttr rhs_dilation = op.getRhsDilation().getValue(); |
| mlir::DenseElementsAttr window_reversal = op.getWindowReversal().getValue(); |
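  // Example: a 3x3 kernel with stride 2, dilation 1, and padding 1 produces,
  // for each spatial dimension, a WindowDimension with size=3, stride=2,
  // base_dilation=1, window_dilation=1, padding_low=1, and padding_high=1.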
| for (auto index : llvm::seq<int>(0, window_strides.getNumElements())) { |
| xla::WindowDimension* dim = descriptor.window.add_dimensions(); |
    // The window size of a convolution equals the kernel size, and the
    // kernel is operand1_shape. Look up the kernel spatial dimensions in the
    // convolution dimension numbers to read the window size off that shape.
    int kernel_dim = descriptor.dnums.kernel_spatial_dimensions(index);
    dim->set_size(descriptor.operand1_shape.dimensions(kernel_dim));
| dim->set_stride(window_strides.getValues<int64_t>()[index]); |
| dim->set_base_dilation(lhs_dilation.getValues<int64_t>()[index]); |
| dim->set_window_dilation(rhs_dilation.getValues<int64_t>()[index]); |
| dim->set_window_reversal(window_reversal.getValues<bool>()[index]); |
| if (op.getPadding().hasValue()) { |
| mlir::DenseIntElementsAttr padding = op.getPadding().getValue(); |
| dim->set_padding_low(padding.getValues<int64_t>()[index]); |
| dim->set_padding_high(padding.getValues<int64_t>()[index]); |
| } |
| } |
| descriptor.feature_group_count = op.getFeatureGroupCount(); |
| { |
| auto* algorithm = descriptor.backend_config.mutable_algorithm(); |
| algorithm->set_algo_id(op.getBackendConfig().getAlgorithm()); |
| algorithm->set_math_type(op.getBackendConfig().getTensorOpsEnabled() |
| ? se::dnn::AlgorithmProto::TENSOR_OP_MATH |
| : se::dnn::AlgorithmProto::DEFAULT_MATH); |
| for (int i = 0; i < op.getBackendConfig().getKnobIds().size(); ++i) { |
| // N.B. tuning_knobs is a map rather than a repeated field, so this |
| // doesn't require reserving space up front. |
| auto knob_id = op.getBackendConfig().getKnobIds()[i]; |
| auto knob_value = op.getBackendConfig().getKnobValues()[i]; |
| (*algorithm->mutable_tuning_knobs())[knob_id] = knob_value; |
| } |
| algorithm->set_is_cudnn_frontend( |
| op.getBackendConfig().getIsCudnnFrontend()); |
| auto workspace_size = op.getBackendConfig().getWorkspaceSize(); |
| if (workspace_size >= 0) { |
| algorithm->mutable_workspace_size()->set_value(workspace_size); |
| } |
| } |
| descriptor.backend_config.set_conv_result_scale( |
| op.getResultScale().convertToDouble()); |
| } |
| |
| mlir::Type GetMemRefElementType(Value value) { |
| return value.getType().cast<mlir::MemRefType>().getElementType(); |
| } |
| |
// Converts (via narrowing) a Span<const int64_t> to a vector<int>, asserting
// in debug builds that no element changed value in the conversion.
| std::vector<int> CheckedNarrowing(absl::Span<const int64_t> wide_span) { |
| std::vector<int> narrow_vector(wide_span.size()); |
| std::transform( |
| wide_span.cbegin(), wide_span.cend(), narrow_vector.begin(), |
| [](int64_t wide) { |
        int narrow = static_cast<int>(wide);
| assert(narrow == wide && |
| "checked narrowing failed; values not equal post-conversion"); |
| return narrow; |
| }); |
| return narrow_vector; |
| } |
| |
| // Create ops to describe tensors (e.g., input, output, or bias) when using |
| // legacy cudnn. |
| template <class ConvolutionOpType> |
| FailureOr<Value> CreateLegacyTensorDescriptor( |
| ConvolutionOpType op, const se::dnn::BatchDescriptor& batch_descriptor, |
| tfrt::gpu::wrapper::DnnDataType elem_type, Value chain, |
| ConversionPatternRewriter& rewriter) { |
| std::vector<int64_t> dims64, strides64; |
| switch (batch_descriptor.layout()) { |
| case se::dnn::DataLayout::kBatchYXDepth: |
| case se::dnn::DataLayout::kBatchDepthYX: { |
| // cuDNN requires the strides and dims to be ordered as BDYX. |
| dims64 = batch_descriptor.full_dims(se::dnn::DataLayout::kBatchDepthYX); |
| strides64 = |
| batch_descriptor.full_strides(se::dnn::DataLayout::kBatchDepthYX); |
| break; |
| } |
| case se::dnn::DataLayout::kBatchDepthYX4: |
| case se::dnn::DataLayout::kBatchDepthYX32: { |
| const int64_t n = batch_descriptor.count(); |
| const int64_t c = batch_descriptor.feature_map_count(); |
| const int64_t h = batch_descriptor.height(); |
| const int64_t w = batch_descriptor.width(); |
| const int64_t v = |
| batch_descriptor.layout() == se::dnn::DataLayout::kBatchDepthYX4 ? 4 |
| : 32; |
| assert(c / v > 0 && "Vectorized feature map count is non-positive."); |
| dims64 = {n, c / v, h, w}; |
| strides64 = {c / v * h * w, h * w, w, 1}; |
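      // e.g. NCHW_VECT_C with n=1, c=64, h=w=56, v=32 yields
      // dims64 = {1, 2, 56, 56} and strides64 = {6272, 3136, 56, 1}.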
| break; |
| } |
| default: |
| return rewriter.notifyMatchFailure(op, "Unsupported tensor format."); |
| } |
| |
| // cuDNN requires arrays of ints. |
| std::vector<int> dims = CheckedNarrowing(dims64); |
| std::vector<int> strides = CheckedNarrowing(strides64); |
| return rewriter |
| .create<tfrt::gpu::DnnCreateTensorDescriptorOp>( |
| op.getLoc(), elem_type, rewriter.getI32ArrayAttr(dims), |
| rewriter.getI32ArrayAttr(strides)) |
| .getResult(); |
| } |
| |
| template <class FusedConvOpType, class FusedConvOpAdaptorType> |
| FailureOr<Value> CreateLegacyFusedConvOp( |
| FusedConvOpType op, FusedConvOpAdaptorType adaptor, Type mlir_scale_type, |
| Value handle, Value stream, Value input_desc, Value output_desc, |
| Value filter_desc, Value conv_desc, Value algorithm, Value side_input, |
| Value chain, const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| // Create bias descriptor. |
| se::dnn::BatchDescriptor bias_descriptor = GetBiasDescriptor(config); |
| cudnnDataType_t bias_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(op.getBias()), bias_descriptor.layout()); |
| FailureOr<Value> bias_desc_or = CreateLegacyTensorDescriptor( |
| op, bias_descriptor, bias_type, chain, rewriter); |
| if (failed(bias_desc_or)) { |
| return bias_desc_or; |
| } |
| |
| // Create activation descriptor. |
| auto loc = op.getLoc(); |
| auto coefficient = |
| rewriter.create<tfrt::compiler::ConstantF64Op>(loc, llvm::APFloat(0.0)); |
  cudnnActivationMode_t activation_mode = config.fusion->mode == se::dnn::kRelu
                                              ? CUDNN_ACTIVATION_RELU
                                              : CUDNN_ACTIVATION_IDENTITY;
  auto activation_desc =
      rewriter.create<tfrt::gpu::DnnCreateActivationDescriptorOp>(
          loc, coefficient, activation_mode, CUDNN_NOT_PROPAGATE_NAN);
| |
| tfrt::gpu::wrapper::DnnDataType scale_type = |
| MlirTypeToDnnDataType(mlir_scale_type); |
| auto alpha1 = MakeScalingFactorConstant( |
| rewriter, loc, mlir_scale_type, llvm::APFloat(config.conv_result_scale), |
| llvm::APFloat(0.0)); |
| auto alpha2 = MakeScalingFactorConstant( |
| rewriter, loc, mlir_scale_type, |
| llvm::APFloat(config.fusion->side_input_scale), llvm::APFloat(0.0)); |
| return rewriter |
| .create<tfrt::gpu::DnnConvolutionBiasActivationForwardOp>( |
| loc, handle, stream, scale_type, alpha1, input_desc, |
| adaptor.getInput(), filter_desc, adaptor.getFilter(), conv_desc, |
| algorithm, adaptor.getScratch(), alpha2, output_desc, side_input, |
| *bias_desc_or, adaptor.getBias(), activation_desc, output_desc, |
| adaptor.getOutput(), chain) |
| .getResult(); |
| } |
| |
| // Create op to build a convolution plan, which can be used to run the |
| // convolution. This is the unfused variant (not fused with activation). |
| Value CreateBuildUnfusedConvPlanOp(Value input, Value output, Value handle, |
| mlir::Location loc, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| tfrt::gpu::wrapper::DnnDataType input_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(input), config.input_descriptor.layout()); |
| tfrt::gpu::wrapper::DnnDataType output_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(output), config.output_descriptor.layout()); |
| |
| int vector_size, vector_dim; |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(input_type); |
| std::vector<int64_t> input_dims = config.input_descriptor.vectorized_dims( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| std::vector<int64_t> input_strides = |
| config.input_descriptor.vectorized_strides( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(output_type); |
| std::vector<int64_t> output_dims = config.output_descriptor.vectorized_dims( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| std::vector<int64_t> output_strides = |
| config.output_descriptor.vectorized_strides( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| |
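  // The filter is assumed to share the input's element type, so its
  // vectorization is derived from input_type.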
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(input_type); |
| std::vector<int64_t> filter_dims = config.filter_descriptor.vectorized_dims( |
| se::dnn::FilterLayout::kOutputInputYX, vector_size, vector_dim); |
| std::vector<int64_t> filter_strides = |
| config.filter_descriptor.vectorized_strides( |
| se::dnn::FilterLayout::kOutputInputYX, vector_size, vector_dim); |
| |
| const auto* conv_desc = &config.conv_desc; |
| cudnnConvolutionMode_t conv_mode = |
| config.conv_desc.convolution_not_crosscorr() ? CUDNN_CONVOLUTION |
| : CUDNN_CROSS_CORRELATION; |
| int conv_dim = config.conv_desc.ndims(); |
  auto conv_dilations = xla::llvm_ir::AsArrayRef(conv_desc->dilations());
| auto conv_padding = xla::llvm_ir::AsArrayRef(conv_desc->padding()); |
| auto conv_strides = xla::llvm_ir::AsArrayRef(conv_desc->strides()); |
| |
| std::vector<int64_t> tuning_knob_ids, tuning_knob_values; |
| tuning_knob_ids.reserve(config.algorithm.TuningKnobs().size()); |
| tuning_knob_values.reserve(config.algorithm.TuningKnobs().size()); |
  for (const auto& iter : config.algorithm.TuningKnobs()) {
| tuning_knob_ids.push_back(iter.first); |
| tuning_knob_values.push_back(iter.second); |
| } |
| |
| return rewriter.create<tfrt::gpu::DnnBuildConvolutionOp>( |
| loc, handle, input_type, output_type, |
| rewriter.getI64ArrayAttr(input_dims), |
| rewriter.getI64ArrayAttr(input_strides), |
| rewriter.getI64ArrayAttr(output_dims), |
| rewriter.getI64ArrayAttr(output_strides), |
| rewriter.getI64ArrayAttr(filter_dims), |
| rewriter.getI64ArrayAttr(filter_strides), conv_mode, conv_dim, |
      rewriter.getI64ArrayAttr(conv_dilations),
| rewriter.getI64ArrayAttr(conv_padding), |
| rewriter.getI64ArrayAttr(conv_strides), backend_type, |
| config.algorithm.algo_id(), rewriter.getI64ArrayAttr(tuning_knob_ids), |
| rewriter.getI64ArrayAttr(tuning_knob_values)); |
| } |
| |
| // Create op to build a convolution plan, which can be used to run the |
| // convolution. This is the variant with fused activation. |
| Value CreateBuildFusedConvPlanOp(Value input, Value output, Value bias, |
| Value handle, mlir::Location loc, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| se::dnn::BatchDescriptor bias_descriptor = GetBiasDescriptor(config); |
  // For the purposes of the cudnn graph, say that the bias tensor has the
  // same layout as the output tensor. The layout itself doesn't matter,
  // because bias is a 1D array, but it determines the vectorization, and the
  // cudnn graph API rejects the tensor if that is wrong, even though
  // vectorized float tensors aren't actually a thing in cuDNN.
| bias_descriptor.set_layout(config.output_descriptor.layout()); |
| |
| tfrt::gpu::wrapper::DnnDataType input_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(input), config.input_descriptor.layout()); |
| tfrt::gpu::wrapper::DnnDataType output_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(output), config.output_descriptor.layout()); |
| tfrt::gpu::wrapper::DnnDataType bias_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(bias), bias_descriptor.layout()); |
| |
| int vector_size, vector_dim; |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(input_type); |
| std::vector<int64_t> input_dims = config.input_descriptor.vectorized_dims( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| std::vector<int64_t> input_strides = |
| config.input_descriptor.vectorized_strides( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(output_type); |
| std::vector<int64_t> output_dims = config.output_descriptor.vectorized_dims( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| std::vector<int64_t> output_strides = |
| config.output_descriptor.vectorized_strides( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(input_type); |
| std::vector<int64_t> filter_dims = config.filter_descriptor.vectorized_dims( |
| se::dnn::FilterLayout::kOutputInputYX, vector_size, vector_dim); |
| std::vector<int64_t> filter_strides = |
| config.filter_descriptor.vectorized_strides( |
| se::dnn::FilterLayout::kOutputInputYX, vector_size, vector_dim); |
| |
| std::tie(vector_size, vector_dim) = |
| tfrt::gpu::wrapper::GetTensorVectorizedSizeAndDim(input_type); |
| std::vector<int64_t> bias_dims = bias_descriptor.vectorized_dims( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| std::vector<int64_t> bias_strides = bias_descriptor.vectorized_strides( |
| se::dnn::DataLayout::kBatchDepthYX, vector_size, vector_dim); |
| |
| const auto* conv_desc = &config.conv_desc; |
| cudnnConvolutionMode_t conv_mode = |
| config.conv_desc.convolution_not_crosscorr() ? CUDNN_CONVOLUTION |
| : CUDNN_CROSS_CORRELATION; |
| int conv_dim = config.conv_desc.ndims(); |
  auto conv_dilations = xla::llvm_ir::AsArrayRef(conv_desc->dilations());
| auto conv_padding = xla::llvm_ir::AsArrayRef(conv_desc->padding()); |
| auto conv_strides = xla::llvm_ir::AsArrayRef(conv_desc->strides()); |
| |
| std::vector<int64_t> tuning_knob_ids, tuning_knob_values; |
| tuning_knob_ids.reserve(config.algorithm.TuningKnobs().size()); |
| tuning_knob_values.reserve(config.algorithm.TuningKnobs().size()); |
  for (const auto& iter : config.algorithm.TuningKnobs()) {
| tuning_knob_ids.push_back(iter.first); |
| tuning_knob_values.push_back(iter.second); |
| } |
| |
| auto alpha = rewriter.create<tfrt::compiler::ConstantF64Op>( |
| loc, llvm::APFloat(config.conv_result_scale)); |
| auto alpha2 = rewriter.create<tfrt::compiler::ConstantF64Op>( |
| loc, llvm::APFloat(config.fusion->side_input_scale)); |
  cudnnActivationMode_t activation_mode = config.fusion->mode == se::dnn::kRelu
                                              ? CUDNN_ACTIVATION_RELU
                                              : CUDNN_ACTIVATION_IDENTITY;
| |
| return rewriter.create<tfrt::gpu::DnnBuildFusedConvolutionOp>( |
| loc, handle, input_type, output_type, bias_type, |
| rewriter.getI64ArrayAttr(input_dims), |
| rewriter.getI64ArrayAttr(input_strides), |
| rewriter.getI64ArrayAttr(output_dims), |
| rewriter.getI64ArrayAttr(output_strides), |
| rewriter.getI64ArrayAttr(filter_dims), |
| rewriter.getI64ArrayAttr(filter_strides), |
| rewriter.getI64ArrayAttr(bias_dims), |
| rewriter.getI64ArrayAttr(bias_strides), conv_mode, conv_dim, |
      rewriter.getI64ArrayAttr(conv_dilations),
| rewriter.getI64ArrayAttr(conv_padding), |
| rewriter.getI64ArrayAttr(conv_strides), backend_type, alpha, alpha2, |
      activation_mode, config.algorithm.algo_id(),
| rewriter.getI64ArrayAttr(tuning_knob_ids), |
| rewriter.getI64ArrayAttr(tuning_knob_values)); |
| } |
| |
// Overloads for convolution forward
| Status SetConvKind(lmhlo_gpu::ConvForwardOp op, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| descriptor.kind = xla::gpu::CudnnConvKind::kForward; |
| return Status::OK(); |
| } |
| Value GetInput(lmhlo_gpu::ConvForwardOp op) { return op.getInput(); } |
| Value GetOutput(lmhlo_gpu::ConvForwardOp op) { return op.getOutput(); } |
| Value GetResult(lmhlo_gpu::ConvForwardOp op) { return op.getOutput(); } |
| Value CreateBuildConvPlanOp(lmhlo_gpu::ConvForwardOp op, Value handle, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| return CreateBuildUnfusedConvPlanOp(op.getInput(), op.getOutput(), handle, |
| op.getLoc(), config, backend_type, |
| rewriter); |
| } |
| Value CreateRunConvolutionOp(lmhlo_gpu::ConvForwardOpAdaptor adaptor, |
| mlir::Location loc, Value handle, Value conv_plan, |
| Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) { |
| return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>( |
| loc, handle, stream, conv_plan, adaptor.getInput(), adaptor.getOutput(), |
| adaptor.getFilter(), adaptor.getScratch(), chain); |
| } |
| FailureOr<Value> CreateLegacyConvOp( |
| lmhlo_gpu::ConvForwardOp op, lmhlo_gpu::ConvForwardOpAdaptor adaptor, |
| Type mlir_scale_type, Value handle, Value stream, Value input_desc, |
| Value output_desc, Value filter_desc, Value conv_desc, int64_t algorithm, |
| Value chain, const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| tfrt::gpu::wrapper::DnnDataType scale_type = |
| MlirTypeToDnnDataType(mlir_scale_type); |
| Value algo = rewriter.create<tfrt::gpu::DnnConvolutionForwardAlgorithmOp>( |
| op.getLoc(), tfrt::gpu::wrapper::DnnConvFwdAlgo( |
| algorithm, tfrt::gpu::wrapper::Platform::CUDA)); |
| return rewriter |
| .create<tfrt::gpu::DnnConvolutionForwardOp>( |
| op.getLoc(), handle, stream, scale_type, input_desc, |
| adaptor.getInput(), filter_desc, adaptor.getFilter(), conv_desc, algo, |
| adaptor.getScratch(), output_desc, adaptor.getOutput(), chain) |
| .getResult(); |
| } |
| |
// Overloads for convolution backward input
| Status SetConvKind(lmhlo_gpu::ConvBackwardInputOp op, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| descriptor.kind = xla::gpu::CudnnConvKind::kBackwardInput; |
| return Status::OK(); |
| } |
| Value GetInput(lmhlo_gpu::ConvBackwardInputOp op) { return op.getDInput(); } |
| Value GetOutput(lmhlo_gpu::ConvBackwardInputOp op) { return op.getDOutput(); } |
| Value GetResult(lmhlo_gpu::ConvBackwardInputOp op) { return op.getDInput(); } |
| Value CreateBuildConvPlanOp(lmhlo_gpu::ConvBackwardInputOp op, Value handle, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| return CreateBuildUnfusedConvPlanOp(op.getDInput(), op.getDOutput(), handle, |
| op.getLoc(), config, backend_type, |
| rewriter); |
| } |
| Value CreateRunConvolutionOp(lmhlo_gpu::ConvBackwardInputOpAdaptor adaptor, |
| mlir::Location loc, Value handle, Value conv_plan, |
| Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) { |
| return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>( |
| loc, handle, stream, conv_plan, adaptor.getDInput(), adaptor.getDOutput(), |
| adaptor.getFilter(), adaptor.getScratch(), chain); |
| } |
| FailureOr<Value> CreateLegacyConvOp( |
| lmhlo_gpu::ConvBackwardInputOp op, |
| lmhlo_gpu::ConvBackwardInputOpAdaptor adaptor, Type mlir_scale_type, |
| Value handle, Value stream, Value input_desc, Value output_desc, |
| Value filter_desc, Value conv_desc, int64_t algorithm, Value chain, |
| const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| tfrt::gpu::wrapper::DnnDataType scale_type = |
| MlirTypeToDnnDataType(mlir_scale_type); |
| Value algo = |
| rewriter.create<tfrt::gpu::DnnConvolutionBackwardDataAlgorithmOp>( |
| op.getLoc(), tfrt::gpu::wrapper::DnnConvBwdDataAlgo( |
| algorithm, tfrt::gpu::wrapper::Platform::CUDA)); |
| return rewriter |
| .create<tfrt::gpu::DnnConvolutionBackwardDataOp>( |
| op.getLoc(), handle, stream, scale_type, filter_desc, |
| adaptor.getFilter(), output_desc, adaptor.getDOutput(), conv_desc, |
| algo, adaptor.getScratch(), input_desc, adaptor.getDInput(), chain) |
| .getResult(); |
| } |
| |
// Overloads for convolution backward filter
| Status SetConvKind(lmhlo_gpu::ConvBackwardFilterOp op, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| descriptor.kind = xla::gpu::CudnnConvKind::kBackwardFilter; |
| return Status::OK(); |
| } |
| Value GetInput(lmhlo_gpu::ConvBackwardFilterOp op) { return op.getInput(); } |
| Value GetOutput(lmhlo_gpu::ConvBackwardFilterOp op) { return op.getDOutput(); } |
| Value GetResult(lmhlo_gpu::ConvBackwardFilterOp op) { return op.getDFilter(); } |
| Value CreateBuildConvPlanOp(lmhlo_gpu::ConvBackwardFilterOp op, Value handle, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| return CreateBuildUnfusedConvPlanOp(op.getInput(), op.getDOutput(), handle, |
| op.getLoc(), config, backend_type, |
| rewriter); |
| } |
| Value CreateRunConvolutionOp(lmhlo_gpu::ConvBackwardFilterOpAdaptor adaptor, |
| mlir::Location loc, Value handle, Value conv_plan, |
| Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) { |
| return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>( |
| loc, handle, stream, conv_plan, adaptor.getInput(), adaptor.getDOutput(), |
| adaptor.getDFilter(), adaptor.getScratch(), chain); |
| } |
| FailureOr<Value> CreateLegacyConvOp( |
| lmhlo_gpu::ConvBackwardFilterOp op, |
| lmhlo_gpu::ConvBackwardFilterOpAdaptor adaptor, Type mlir_scale_type, |
| Value handle, Value stream, Value input_desc, Value output_desc, |
| Value filter_desc, Value conv_desc, int64_t algorithm, Value chain, |
| const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| tfrt::gpu::wrapper::DnnDataType scale_type = |
| MlirTypeToDnnDataType(mlir_scale_type); |
| Value algo = |
| rewriter.create<tfrt::gpu::DnnConvolutionBackwardFilterAlgorithmOp>( |
| op.getLoc(), tfrt::gpu::wrapper::DnnConvBwdFilterAlgo( |
| algorithm, tfrt::gpu::wrapper::Platform::CUDA)); |
| return rewriter |
| .create<tfrt::gpu::DnnConvolutionBackwardFilterOp>( |
| op.getLoc(), handle, stream, scale_type, input_desc, |
| adaptor.getInput(), output_desc, adaptor.getDOutput(), conv_desc, |
| algo, adaptor.getScratch(), filter_desc, adaptor.getDFilter(), chain) |
| .getResult(); |
| } |
| |
// Overloads for convolution forward fused
| Status SetConvKind(lmhlo_gpu::ConvForwardFusedOp op, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| descriptor.kind = xla::gpu::CudnnConvKind::kForwardActivation; |
| auto activation_mode_or = |
| xla::ConvertConvActivationMode(op.getActivationMode()); |
| if (!activation_mode_or.ok()) { |
| return activation_mode_or.status(); |
| } |
| auto activation_mode = activation_mode_or.ValueOrDie(); |
| descriptor.backend_config.set_activation_mode( |
| static_cast<int64_t>(activation_mode)); |
| return Status::OK(); |
| } |
| Value GetInput(lmhlo_gpu::ConvForwardFusedOp op) { return op.getInput(); } |
| Value GetOutput(lmhlo_gpu::ConvForwardFusedOp op) { return op.getOutput(); } |
| Value GetResult(lmhlo_gpu::ConvForwardFusedOp op) { return op.getOutput(); } |
| Value CreateBuildConvPlanOp(lmhlo_gpu::ConvForwardFusedOp op, Value handle, |
| const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| return CreateBuildFusedConvPlanOp(op.getInput(), op.getOutput(), op.getBias(), |
| handle, op.getLoc(), config, backend_type, |
| rewriter); |
| } |
| Value CreateRunConvolutionOp(lmhlo_gpu::ConvForwardFusedOpAdaptor adaptor, |
| mlir::Location loc, Value handle, Value conv_plan, |
| Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) { |
| return rewriter.create<tfrt::gpu::DnnRunFusedConvolutionOp>( |
| loc, handle, stream, conv_plan, adaptor.getInput(), adaptor.getOutput(), |
| adaptor.getFilter(), adaptor.getOutput(), adaptor.getBias(), |
| adaptor.getScratch(), chain); |
| } |
| FailureOr<Value> CreateLegacyConvOp( |
| lmhlo_gpu::ConvForwardFusedOp op, |
| lmhlo_gpu::ConvForwardFusedOpAdaptor adaptor, Type mlir_scale_type, |
| Value handle, Value stream, Value input_desc, Value output_desc, |
| Value filter_desc, Value conv_desc, int64_t algorithm, Value chain, |
| const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| Value algo = rewriter.create<tfrt::gpu::DnnConvolutionForwardAlgorithmOp>( |
| op.getLoc(), tfrt::gpu::wrapper::DnnConvFwdAlgo( |
| algorithm, tfrt::gpu::wrapper::Platform::CUDA)); |
| return CreateLegacyFusedConvOp(op, adaptor, mlir_scale_type, handle, stream, |
| input_desc, output_desc, filter_desc, |
| conv_desc, algo, adaptor.getOutput(), chain, |
| config, rewriter); |
| } |
| |
// Overloads for convolution forward fused side input
| Status SetConvKind(lmhlo_gpu::ConvForwardFusedSideInputOp op, |
| xla::gpu::GpuConvDescriptor& descriptor) { |
| descriptor.kind = xla::gpu::CudnnConvKind::kForwardActivation; |
| auto activation_mode_or = |
| xla::ConvertConvActivationMode(op.getActivationMode()); |
| if (!activation_mode_or.ok()) { |
| return activation_mode_or.status(); |
| } |
| auto activation_mode = activation_mode_or.ValueOrDie(); |
| descriptor.backend_config.set_activation_mode( |
| static_cast<int64_t>(activation_mode)); |
| descriptor.backend_config.set_side_input_scale( |
| op.getSideInputScale().convertToDouble()); |
| return Status::OK(); |
| } |
| Value GetInput(lmhlo_gpu::ConvForwardFusedSideInputOp op) { |
| return op.getInput(); |
| } |
| Value GetOutput(lmhlo_gpu::ConvForwardFusedSideInputOp op) { |
| return op.getOutput(); |
| } |
| Value GetResult(lmhlo_gpu::ConvForwardFusedSideInputOp op) { |
| return op.getOutput(); |
| } |
| Value CreateBuildConvPlanOp(lmhlo_gpu::ConvForwardFusedSideInputOp op, |
| Value handle, const xla::gpu::GpuConvConfig& config, |
| cudnnBackendDescriptorType_t backend_type, |
| ConversionPatternRewriter& rewriter) { |
| return CreateBuildFusedConvPlanOp(op.getInput(), op.getOutput(), op.getBias(), |
| handle, op.getLoc(), config, backend_type, |
| rewriter); |
| } |
| Value CreateRunConvolutionOp( |
| lmhlo_gpu::ConvForwardFusedSideInputOpAdaptor adaptor, mlir::Location loc, |
| Value handle, Value conv_plan, Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) { |
| return rewriter.create<tfrt::gpu::DnnRunFusedConvolutionOp>( |
| loc, handle, stream, conv_plan, adaptor.getInput(), adaptor.getOutput(), |
| adaptor.getFilter(), adaptor.getSideInput(), adaptor.getBias(), |
| adaptor.getScratch(), chain); |
| } |
| FailureOr<Value> CreateLegacyConvOp( |
| lmhlo_gpu::ConvForwardFusedSideInputOp op, |
| lmhlo_gpu::ConvForwardFusedSideInputOpAdaptor adaptor, Type mlir_scale_type, |
| Value handle, Value stream, Value input_desc, Value output_desc, |
| Value filter_desc, Value conv_desc, int64_t algorithm, Value chain, |
| const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| Value side_input = config.fusion->side_input_scale == 0 |
| ? adaptor.getOutput() |
| : adaptor.getSideInput(); |
| Value algo = rewriter.create<tfrt::gpu::DnnConvolutionForwardAlgorithmOp>( |
| op.getLoc(), tfrt::gpu::wrapper::DnnConvFwdAlgo( |
| algorithm, tfrt::gpu::wrapper::Platform::CUDA)); |
| return CreateLegacyFusedConvOp( |
| op, adaptor, mlir_scale_type, handle, stream, input_desc, output_desc, |
| filter_desc, conv_desc, algo, side_input, chain, config, rewriter); |
| } |
| |
| template <class ConvolutionOpType, class OpAdaptor> |
| FailureOr<Value> LegacyConvolutionRewritePattern( |
| ConvolutionOpType op, OpAdaptor adaptor, Value chain, Value stream, |
| const xla::gpu::GpuConvConfig& config, |
| ConversionPatternRewriter& rewriter) { |
| tfrt::gpu::wrapper::DnnDataType input_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(GetInput(op)), config.input_descriptor.layout()); |
| tfrt::gpu::wrapper::DnnDataType output_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(GetOutput(op)), config.output_descriptor.layout()); |
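  // The filter buffer is assumed to share the input's element type, so the
  // input's memref element type is used to derive filter_type.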
| tfrt::gpu::wrapper::DnnDataType filter_type = MlirTypeToDnnDataType( |
| GetMemRefElementType(GetInput(op)), config.filter_descriptor.layout()); |
| |
| // Create input descriptor. |
| FailureOr<Value> input_desc_or = CreateLegacyTensorDescriptor( |
| op, config.input_descriptor, input_type, chain, rewriter); |
| if (failed(input_desc_or)) { |
| return input_desc_or; |
| } |
| |
| // Create output descriptor. |
| FailureOr<Value> output_desc_or = CreateLegacyTensorDescriptor( |
| op, config.output_descriptor, output_type, chain, rewriter); |
| if (failed(output_desc_or)) { |
| return output_desc_or; |
| } |
| |
| // Create filter descriptor. |
| cudnnTensorFormat_t tensor_format; |
| switch (config.filter_descriptor.layout()) { |
| case se::dnn::FilterLayout::kOutputInputYX: |
| tensor_format = CUDNN_TENSOR_NCHW; |
| break; |
| case se::dnn::FilterLayout::kOutputYXInput: |
| tensor_format = CUDNN_TENSOR_NHWC; |
| break; |
| case se::dnn::FilterLayout::kOutputInputYX4: |
| case se::dnn::FilterLayout::kOutputInputYX32: { |
| tensor_format = CUDNN_TENSOR_NCHW_VECT_C; |
| break; |
| } |
| default: |
| return rewriter.notifyMatchFailure(op, "Unexpected filter layout."); |
| } |
| std::vector<int> dims(2 + config.filter_descriptor.ndims()); |
| dims[0] = config.filter_descriptor.output_feature_map_count(); |
| dims[1] = config.filter_descriptor.input_feature_map_count(); |
| absl::Span<const int64_t> spatial_dims = |
| config.filter_descriptor.input_filter_dims(); |
| std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2); |
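  // e.g. a 3x3 filter with 64 output and 32 input feature maps yields
  // dims = {64, 32, 3, 3} (OIHW order, matching CUDNN_TENSOR_NCHW).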
| auto loc = op.getLoc(); |
| Value filter_desc = rewriter.create<tfrt::gpu::DnnCreateFilterDescriptorOp>( |
| loc, filter_type, tensor_format, rewriter.getI32ArrayAttr(dims)); |
| |
| // Create convolution descriptor. |
| mlir::Type mlir_compute_type = GetMemRefElementType(GetInput(op)); |
| tfrt::gpu::wrapper::DnnDataType compute_type = |
| MlirTypeToDnnDataType(mlir_compute_type); |
| cudnnConvolutionMode_t conv_mode = |
| config.conv_desc.convolution_not_crosscorr() ? CUDNN_CONVOLUTION |
| : CUDNN_CROSS_CORRELATION; |
| const auto& convolution_descriptor = config.conv_desc; |
| // cuDNN requires arrays of ints. |
| std::vector<int> strides = CheckedNarrowing(convolution_descriptor.strides()); |
| std::vector<int> padding = CheckedNarrowing(convolution_descriptor.padding()); |
| std::vector<int> dilations = |
| CheckedNarrowing(convolution_descriptor.dilations()); |
| cudnnMathType_t math_type = config.algorithm.tensor_ops_enabled() |
| ? CUDNN_TENSOR_OP_MATH |
| : CUDNN_FMA_MATH; |
| Value conv_desc = |
| rewriter.create<tfrt::gpu::DnnCreateConvolutionDescriptorOp>( |
| loc, compute_type, conv_mode, math_type, |
| rewriter.getI32ArrayAttr(padding), rewriter.getI32ArrayAttr(strides), |
| rewriter.getI32ArrayAttr(dilations)); |
| |
| // Create convolution op. |
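  // cuDNN expects the alpha/beta scaling factors to be doubles for f64
  // tensors and floats for all other element types.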
| mlir::Type mlir_scale_type = |
| mlir_compute_type.isF64() ? rewriter.getF64Type() : rewriter.getF32Type(); |
| Value context = rewriter.create<tfrt::gpu::StreamGetContextOp>(loc, stream); |
| Value handle = rewriter.create<tfrt::gpu::DnnCreateOp>(loc, context); |
| auto out_chain_or = CreateLegacyConvOp( |
| op, adaptor, mlir_scale_type, handle, stream, *input_desc_or, |
| *output_desc_or, filter_desc, conv_desc, config.algorithm.algo_id(), |
| chain, config, rewriter); |
| if (succeeded(out_chain_or)) { |
| rewriter.eraseOp(op); |
| } |
| return out_chain_or; |
| } |
| |
| template <class ConvolutionOpType> |
| struct ConvolutionRewritePattern |
| : tfrt::gpu::StreamifyOpConversionPattern<ConvolutionOpType> { |
| using typename tfrt::gpu::StreamifyOpConversionPattern< |
| ConvolutionOpType>::OpAdaptor; |
| using tfrt::gpu::StreamifyOpConversionPattern< |
| ConvolutionOpType>::StreamifyOpConversionPattern; |
| FailureOr<Value> matchAndRewriteOp( |
| ConvolutionOpType op, OpAdaptor adaptor, Value chain, Value stream, |
| ConversionPatternRewriter& rewriter) const override { |
| xla::gpu::GpuConvDescriptor descriptor; |
| auto status = SetConvKind(op, descriptor); |
| if (!status.ok()) { |
| return rewriter.notifyMatchFailure(op, status.error_message()); |
| } |
| FillConvDescriptor(op, GetResult(op), descriptor); |
| auto config_or = xla::gpu::GetGpuConvConfig(descriptor, ""); |
| if (!config_or.ok()) { |
| return rewriter.notifyMatchFailure( |
| op, "Failed to get GPU convolution config."); |
| } |
| xla::gpu::GpuConvConfig config = config_or.ValueOrDie(); |
| |
| if (config.fusion.has_value()) { |
| if (config.fusion->mode != se::dnn::kNone && |
| config.fusion->mode != se::dnn::kRelu) { |
| return rewriter.notifyMatchFailure(op, |
| "Unimplemented activation mode."); |
| } |
| } |
| if (config.conv_desc.pad_alignment() == |
| se::dnn::PadAlignment::kTensorFlowPadding) { |
| return rewriter.notifyMatchFailure( |
| op, "TensorFlow padding alignment is not supported."); |
| } |
| |
| bool use_legacy_conv = [&] { |
| if (!config.algorithm.is_cudnn_frontend()) return true; |
| |
| auto print_reason = [&](const char* reason) { |
| LOG(ERROR) |
| << "Disabling cuDNN frontend for the following convolution:\n" |
| << " input: " << config.input_descriptor.ToString() << "\n" |
| << " filter: " << config.filter_descriptor.ToString() << "\n" |
| << " conv: " << config.conv_desc.ToString() << "\n... because " |
| << reason; |
| }; |
| |
| if (config.input_descriptor.layout() == |
| se::dnn::DataLayout::kBatchDepthYX32) |
| // Current versions of the frontend API lack support for Tx32. |
| return print_reason("Tx32 convolutions are unsupported."), true; |
| |
| if (CUDNN_VERSION < 8100) |
| // cuDNN frontend support became sufficiently stable to use in 8.1. |
| return print_reason("the cuDNN version does not support it."), true; |
| |
| return false; |
| }(); |
| |
| if (use_legacy_conv) { |
| return LegacyConvolutionRewritePattern(op, adaptor, chain, stream, config, |
| rewriter); |
| } |
| |
| cudnnBackendDescriptorType_t backend_type; |
| switch (descriptor.kind) { |
| case xla::gpu::CudnnConvKind::kForward: |
| case xla::gpu::CudnnConvKind::kForwardActivation: |
| backend_type = CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR; |
| break; |
| case xla::gpu::CudnnConvKind::kBackwardInput: |
| backend_type = |
| CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR; |
| break; |
| case xla::gpu::CudnnConvKind::kBackwardFilter: |
| backend_type = |
| CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR; |
| break; |
| default: |
| return rewriter.notifyMatchFailure(op, "Unexpected convolution kind."); |
| } |
| |
| auto saved_point = rewriter.saveInsertionPoint(); |
| Location loc = op->getLoc(); |
| |
| // Create a function that returns the convolution plan. |
| mlir::SymbolTable symbol_table( |
| op->template getParentOfType<mlir::ModuleOp>()); |
| rewriter.setInsertionPoint( |
| op->template getParentOfType<mlir::func::FuncOp>()); |
| mlir::Type handle_type = rewriter.getType<tfrt::gpu::DnnHandleType>(); |
| mlir::Type conv_plan_type = |
| rewriter.getType<tfrt::gpu::DnnConvolutionPlanType>(); |
| std::string function_name = |
| absl::StrCat("get_", op->getName().stripDialect().str(), "_plan"); |
| mlir::func::FuncOp conv_plan_func = rewriter.create<mlir::func::FuncOp>( |
| loc, function_name, |
| rewriter.getFunctionType(handle_type, conv_plan_type)); |
| symbol_table.insert(conv_plan_func); |
| rewriter.setInsertionPointToEnd(conv_plan_func.addEntryBlock()); |
| Value conv_plan = CreateBuildConvPlanOp(op, conv_plan_func.getArgument(0), |
| config, backend_type, rewriter); |
| rewriter.create<tfrt::compiler::ReturnOp>(loc, conv_plan); |
| |
| // Once-initialize the convolution plan. |
| rewriter.restoreInsertionPoint(saved_point); |
| Value context = rewriter.create<tfrt::gpu::StreamGetContextOp>(loc, stream); |
| Value handle = rewriter.create<tfrt::gpu::DnnCreateOp>(loc, context); |
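    // tfrt.once caches its result, so the plan-building function runs only on
    // the first execution; later executions reuse the cached plan.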
| auto once_op = rewriter.create<tfrt::compiler::OnceOp>( |
| loc, conv_plan_func.getFunctionType().getResults(), handle, |
| conv_plan_func.getName()); |
| |
| chain = CreateRunConvolutionOp(adaptor, loc, handle, once_op.getResult(0), |
| chain, stream, rewriter); |
| rewriter.eraseOp(op); |
| return chain; |
| } |
| }; |
| |
| } // namespace |
| |
| void populateConvolutionConversionPattern(RewritePatternSet& patterns, |
| TypeConverter& converter) { |
| patterns |
| .add<ConvolutionRewritePattern<lmhlo_gpu::ConvForwardOp>, |
| ConvolutionRewritePattern<lmhlo_gpu::ConvBackwardInputOp>, |
| ConvolutionRewritePattern<lmhlo_gpu::ConvBackwardFilterOp>, |
| ConvolutionRewritePattern<lmhlo_gpu::ConvForwardFusedOp>, |
| ConvolutionRewritePattern<lmhlo_gpu::ConvForwardFusedSideInputOp>>( |
| converter, patterns.getContext()); |
| } |
| |
| } // namespace tensorflow |