CUDNNv8: Add support in stream executor

Adds cuDNN v8 execution-plan-based convolutions to the stream executor:
a DoConvolve overload driven by dnn::ExecutionPlanConfig, plan enumeration
via GetConvolveExecutionPlans (cudnn_frontend heuristics plus the fallback
list, filtered by determinism and input down-conversion requirements), and
the supporting ExecutionPlanDesc, ExecutionPlanConfig, and
ProfileExecutionPlanResult classes. All new code paths are guarded by
CUDNN_VERSION >= 8100.
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index ace5aa8..e9c13c4 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -384,6 +384,7 @@
     name = "cudnn_plugin",
     srcs = if_cuda_is_configured(["cuda_dnn.cc"]),
     hdrs = if_cuda_is_configured(["cuda_dnn.h"]),
+    copts = ["-DNV_CUDNN_DISABLE_EXCEPTION"],
     visibility = ["//visibility:public"],
     deps = if_cuda_is_configured([
         ":cuda_activation",
@@ -400,6 +401,7 @@
         "//third_party/eigen3",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_config_cuda//cuda:cudnn_header",
+        "@cudnn_frontend_archive//:cudnn_frontend",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/platform:tensor_float_32_utils",
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 593619f..531b46b 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -46,6 +46,9 @@
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 // clang-format off
 #include "third_party/gpus/cudnn/cudnn.h"
+#if CUDNN_VERSION >= 8100
+#include "third_party/cudnn_frontend/include/cudnn_frontend.h"
+#endif // CUDNN_VERSION >= 8100
 #include "absl/strings/string_view.h"
 // clang-format on
 
@@ -80,6 +83,28 @@
     }                                                                    \
   } while (false)
 
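+// The two macros below check the status carried by a cudnn_frontend object
+// itself (via get_status()/get_error()) rather than a raw cudnnStatus_t
+// return value; they are meant for the builder-pattern calls of the
+// frontend API.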
+#define RETURN_MSG_IF_CUDNN_ERROR(expr)                                  \
+  do {                                                                   \
+    cudnnStatus_t _status = expr.get_status();                           \
+    if (!SE_PREDICT_TRUE(_status == CUDNN_STATUS_SUCCESS)) {             \
+      std::ostringstream oss;                                            \
+      oss << ToString(_status) << "\nin " << __FILE__ << "(" << __LINE__ \
+          << "): '" << #expr << "' " << expr.get_error();                \
+      return port::Status(port::error::UNKNOWN, oss.str().c_str());      \
+    }                                                                    \
+  } while (false)
+
+#define RETURN_FALSE_IF_CUDNN_ERROR(expr)                                \
+  do {                                                                   \
+    cudnnStatus_t _status = expr.get_status();                           \
+    if (!SE_PREDICT_TRUE(_status == CUDNN_STATUS_SUCCESS)) {             \
+      std::ostringstream oss;                                            \
+      oss << ToString(_status) << "\nin " << __FILE__ << "(" << __LINE__ \
+          << "): '" << #expr << "' " << expr.get_error();                \
+      return false;                                                      \
+    }                                                                    \
+  } while (false)
+
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -1584,6 +1609,26 @@
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnStateTensorDescriptor);
 };
 
+#if CUDNN_VERSION >= 8100
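+// Wraps a cudnn_frontend::ExecutionPlan so that it can be returned through
+// the platform-neutral dnn::ConvolveExecutionPlan interface.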
+class CudnnConvolveExecutionPlan : public dnn::ConvolveExecutionPlan {
+ public:
+  explicit CudnnConvolveExecutionPlan(cudnn_frontend::ExecutionPlan plan)
+      : plan_(std::move(plan)) {}
+  std::string getTag() override { return plan_.getTag(); }
+  void* get_raw_desc() override { return plan_.get_raw_desc(); }
+  int64_t getWorkspaceSize() override { return plan_.getWorkspaceSize(); }
+
+ private:
+  cudnn_frontend::ExecutionPlan plan_;
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolveExecutionPlan);
+};
+#endif // CUDNN_VERSION >= 8100
+
 namespace {
 
 struct RnnModelDims {
@@ -3082,6 +3127,8 @@
 // By default it is turned on, users can explicitly disable them through an
 // env-var "TF_ENABLE_WINOGRAD_NONFUSED=0".
 // https://github.com/tensorflow/tensorflow/pull/4901
+// For CUDNN v8.1, when this env-var is turned off, both the winograd and
+// winograd-non-fused engines will be ruled out.
 struct WinogradNonfused {
   static constexpr const char* kName = "TF_ENABLE_WINOGRAD_NONFUSED";
   // NVIDIA has fixed winograd nonfused bug for cudnn v>=7. For older versions,
@@ -3121,6 +3168,38 @@
   static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7500;
 };
 
+namespace {
+#if CUDNN_VERSION >= 8100
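+// Predicates for cudnn_frontend::filter(), which drops every engine config
+// for which the given predicate returns true.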
+bool isNonDeterministic(cudnnBackendDescriptor_t engine_config) {
+  return cudnn_frontend::hasNumericalNote<
+             CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(engine_config);
+}
+
+bool isWinograd(cudnnBackendDescriptor_t engine_config) {
+  return cudnn_frontend::hasNumericalNote<
+             CUDNN_NUMERICAL_NOTE_WINOGRAD>(engine_config);
+}
+
+bool isDownConvertingInputs(cudnnBackendDescriptor_t engine_config) {
+  if (CudnnEnvVar<WinogradNonfused>::IsEnabled()) {
+    return cudnn_frontend::hasNumericalNote<
+               CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(engine_config);
+  } else {
+    return isWinograd(engine_config) ||
+           cudnn_frontend::hasNumericalNote<
+               CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(engine_config);
+  }
+}
+
+bool isNonDeterministicOrIsDownConverting(
+    cudnnBackendDescriptor_t engine_config) {
+  return isNonDeterministic(engine_config) ||
+         isDownConvertingInputs(engine_config);
+}
+
+#endif // CUDNN_VERSION >= 8100
+}  // namespace
+
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
   switch (data_type) {
     case dnn::DataType::kFloat:
@@ -3154,6 +3233,198 @@
       LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
   }
 }
+
+#if CUDNN_VERSION >= 8100
+cudnnBackendDescriptorType_t GetCudnnConvolutionType(
+    dnn::ConvolutionKind kind) {
+  cudnnBackendDescriptorType_t conv_mode;
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      conv_mode = CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR;
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      conv_mode = CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR;
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      conv_mode =
+          CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR;
+      break;
+    }
+    default:
+      LOG(FATAL) << "Unexpected convolution kind " << static_cast<int>(kind);
+      break;
+  }
+  return conv_mode;
+}
+
+port::StatusOr<std::unique_ptr<cudnn_frontend::OperationGraph>>
+GetCudnnOperationGraph(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    CudnnHandle& cudnn) {
+  cudnnBackendDescriptorType_t conv_mode = GetCudnnConvolutionType(kind);
+  cudnnDataType_t cudnn_type = ToCudnnDataType(element_type);
+
+  // x tensor.
+  std::vector<int64> input_strides64 = input_descriptor.full_strides(
+                                           dnn::DataLayout::kBatchDepthYX);
+  std::vector<int64> input_dims64 = input_descriptor.full_dims(
+                                        dnn::DataLayout::kBatchDepthYX);
+  std::vector<int64_t> input_strides(input_strides64.cbegin(),
+                                     input_strides64.cend());
+  std::vector<int64_t> input_dims(input_dims64.cbegin(), input_dims64.cend());
+  auto tensor_x = cudnn_frontend::TensorBuilder()
+                      .setDim(input_dims.size(), &input_dims[0])
+                      .setStrides(input_dims.size(), &input_strides[0])
+                      .setId('x')
+                      .setAlignment(32)
+                      .setDataType(cudnn_type)
+                      .build();
+  RETURN_MSG_IF_CUDNN_ERROR(tensor_x);
+
+  // y tensor.
+  std::vector<int64> output_strides64 = output_descriptor.full_strides(
+                                            dnn::DataLayout::kBatchDepthYX);
+  std::vector<int64> output_dims64 = output_descriptor.full_dims(
+                                         dnn::DataLayout::kBatchDepthYX);
+  std::vector<int64_t> output_strides(output_strides64.cbegin(),
+                                      output_strides64.cend());
+  std::vector<int64_t> output_dims(output_dims64.cbegin(),
+                                   output_dims64.cend());
+  auto tensor_y = cudnn_frontend::TensorBuilder()
+                      .setDim(output_dims.size(), &output_dims[0])
+                      .setStrides(output_dims.size(), &output_strides[0])
+                      .setId('y')
+                      .setAlignment(32)
+                      .setDataType(cudnn_type)
+                      .build();
+  RETURN_MSG_IF_CUDNN_ERROR(tensor_y);
+
+  // w tensor: dims are in logical OIHW order; the strides below are derived
+  // from the physical filter layout.
+  std::vector<int64> filter_dims64(2 + filter_descriptor.ndims());
+  filter_dims64[0] = filter_descriptor.output_feature_map_count();
+  filter_dims64[1] = filter_descriptor.input_feature_map_count();
+  auto spatial_dims64 = filter_descriptor.input_filter_dims();
+  std::copy(spatial_dims64.begin(), spatial_dims64.end(),
+            filter_dims64.begin() + 2);
+  dnn::DataLayout tensor_format;
+  switch (filter_descriptor.layout()) {
+    case dnn::FilterLayout::kOutputInputYX:
+      tensor_format = dnn::DataLayout::kBatchDepthYX;
+      break;
+    case dnn::FilterLayout::kOutputYXInput:
+      tensor_format = dnn::DataLayout::kBatchYXDepth;
+      break;
+    case dnn::FilterLayout::kOutputInputYX4:
+      tensor_format = dnn::DataLayout::kBatchDepthYX4;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported filter format "
+                 << FilterLayoutString(filter_descriptor.layout());
+      break;
+  }
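+  // Compute fully-packed strides for the filter: reorder the logical OIHW
+  // dims into the physical layout, accumulate contiguous strides there, then
+  // map the strides back into logical order.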
+  std::vector<int64> phys_dims = dnn::ReorderDims(
+      filter_dims64, dnn::DataLayout::kBatchDepthYX, tensor_format);
+  std::vector<int64> phys_strides(phys_dims.size());
+  phys_strides[spatial_dims64.size() + 1] = 1;
+  for (int i = spatial_dims64.size(); i >= 0; i--) {
+    phys_strides[i] = phys_strides[i + 1] * phys_dims[i + 1];
+  }
+  std::vector<int64> filter_strides64 = dnn::ReorderDims(
+      phys_strides, tensor_format, dnn::DataLayout::kBatchDepthYX);
+  std::vector<int64_t> filter_dims(filter_dims64.cbegin(),
+                                   filter_dims64.cend());
+  std::vector<int64_t> filter_strides(filter_strides64.cbegin(),
+                                      filter_strides64.cend());
+  auto tensor_w = cudnn_frontend::TensorBuilder()
+                      .setDim(filter_dims.size(), &filter_dims[0])
+                      .setStrides(filter_dims.size(), &filter_strides[0])
+                      .setId('w')
+                      .setAlignment(32)
+                      .setDataType(cudnn_type)
+                      .build();
+  RETURN_MSG_IF_CUDNN_ERROR(tensor_w);
+
+  // conv_desc.
+  auto mode = convolution_descriptor.convolution_not_crosscorr()
+                  ? CUDNN_CONVOLUTION
+                  : CUDNN_CROSS_CORRELATION;
+  int convDim = convolution_descriptor.ndims();
+  auto accumulator_type = ToCudnnDataType(GetConvAccumulatorType(element_type));
+  absl::Span<const int64> strides64 = convolution_descriptor.strides();
+  absl::Span<const int64> padding64 = convolution_descriptor.padding();
+  absl::Span<const int64> dilations64 = convolution_descriptor.dilations();
+  CHECK_NE(convolution_descriptor.pad_alignment(),
+           dnn::PadAlignment::kTensorFlowPadding)
+      << "TensorFlow padding alignment is not supported.";
+  std::vector<int64_t> strides(convolution_descriptor.ndims());
+  std::vector<int64_t> padding(convolution_descriptor.ndims());
+  std::vector<int64_t> dilations(convolution_descriptor.ndims());
+  std::copy(strides64.cbegin(), strides64.cend(), strides.begin());
+  std::copy(padding64.cbegin(), padding64.cend(), padding.begin());
+  std::copy(dilations64.cbegin(), dilations64.cend(), dilations.begin());
+  auto conv_desc = cudnn_frontend::ConvDescBuilder()
+                       .setDataType(accumulator_type)
+                       .setMathMode(mode)
+                       .setNDims(convDim)
+                       .setStrides(convDim, &strides[0])
+                       .setPrePadding(convDim, &padding[0])
+                       .setPostPadding(convDim, &padding[0])
+                       .setDilation(convDim, &dilations[0])
+                       .build();
+  RETURN_MSG_IF_CUDNN_ERROR(conv_desc);
+
+  // Alpha is the scaling factor for input.
+  float falpha = 1.0;
+  double dalpha = 1.0;
+  // Beta is the scaling factor for output.
+  float fbeta = 0.0;
+  double dbeta = 0.0;
+
+  // CUDNN Operation
+  auto op_builder = cudnn_frontend::OperationBuilder(conv_mode);
+  op_builder.setxDesc(tensor_x)
+            .setyDesc(tensor_y)
+            .setwDesc(tensor_w)
+            .setcDesc(conv_desc);
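+  // The scaling factors must match the convolution's compute type: double
+  // for double-precision convolutions, float for everything else.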
+  if (cudnn_type == CUDNN_DATA_DOUBLE) {
+    op_builder.setAlpha(dalpha).setBeta(dbeta);
+  } else {
+    op_builder.setAlpha(falpha).setBeta(fbeta);
+  }
+  auto op = op_builder.build();
+  RETURN_MSG_IF_CUDNN_ERROR(op);
+
+  // CUDNN OperationGraph
+  std::array<cudnn_frontend::Operation const *, 1> ops = {&op};
+  auto opGraph = cudnn_frontend::OperationGraphBuilder()
+                   .setHandle(cudnn.handle())
+                   .setOperationGraph(ops.size(), ops.data())
+                   .build();
+  RETURN_MSG_IF_CUDNN_ERROR(opGraph);
+
+  VLOG(4) << "\nTensor_x: " << tensor_x.describe()
+          << "\nTensor_y: " << tensor_y.describe()
+          << "\nTensor_w: " << tensor_w.describe()
+          << "\nConv: " << conv_desc.describe()
+          << "\nOp: " << op.describe()
+          << "\nOpGraph: " << opGraph.describe();
+
+  return std::unique_ptr<cudnn_frontend::OperationGraph>(
+      new cudnn_frontend::OperationGraph(std::move(opGraph)));
+}
+#endif // CUDNN_VERSION >= 8100
+
 }  // namespace
 
 port::Status CudnnSupport::DoPrepareForConvolution(
@@ -3348,6 +3619,168 @@
   return port::Status::OK();
 }
 
+port::Status CudnnSupport::DoConvolve(
+    dnn::ConvolutionKind kind, dnn::DataType element_type,
+    dnn::DataType output_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const dnn::ExecutionPlanConfig& plan_config,
+    ScratchAllocator* scratch_allocator,
+    dnn::ProfileExecutionPlanResult* output_profile_result) {
+#if CUDNN_VERSION >= 8100
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  absl::optional<dnn::ExecutionPlanDesc> plan_or = plan_config.plan();
+  absl::optional<dnn::ExecutionPlanDesc> plan_no_scratch_or =
+      plan_config.plan_no_scratch();
+
+  std::unique_ptr<cudnn_frontend::ExecutionPlan> current_plan;
+  if (!plan_or.has_value()) {
+    SE_ASSIGN_OR_RETURN(
+        std::unique_ptr<cudnn_frontend::OperationGraph> op_graph,
+        GetCudnnOperationGraph(kind, element_type, stream, input_descriptor,
+                               filter_descriptor, output_descriptor,
+                               convolution_descriptor, cudnn));
+
+    auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+                          .setOperationGraph(*op_graph)
+                          .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
+                          .build();
+    RETURN_MSG_IF_CUDNN_ERROR(heuristics);
+
+    cudnnBackendDescriptorType_t conv_mode = GetCudnnConvolutionType(kind);
+    auto fallback = cudnn_frontend::EngineFallbackListBuilder()
+                        .setOperationGraph(*op_graph)
+                        .setOperation(conv_mode)
+                        .build();
+    RETURN_MSG_IF_CUDNN_ERROR(fallback);
+
+    auto engine_count = heuristics.getEngineConfigCount();
+    auto &engine_config = heuristics.getEngineConfig(engine_count);
+    auto &fallback_list = fallback.getFallbackList();
+
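+    // Merge heuristic and fallback configs into one list, dropping engine
+    // configs that down-convert inputs and, when determinism is required,
+    // non-deterministic ones.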
+    cudnn_frontend::EngineConfigList filtered_configs;
+    if (RequireCudnnDeterminism()) {
+      cudnn_frontend::filter(engine_config, filtered_configs,
+                             isNonDeterministicOrIsDownConverting);
+      cudnn_frontend::filter(fallback_list, filtered_configs,
+                             isNonDeterministicOrIsDownConverting);
+    } else {
+      cudnn_frontend::filter(engine_config, filtered_configs,
+                             isDownConvertingInputs);
+      cudnn_frontend::filter(fallback_list, filtered_configs,
+                             isDownConvertingInputs);
+    }
+    for (size_t i = 0; i < filtered_configs.size(); i++) {
+      auto plan = cudnn_frontend::ExecutionPlanBuilder()
+                      .setHandle(cudnn.handle())
+                      .setEngineConfig(filtered_configs[i], op_graph->getTag())
+                      .build();
+      if (plan.get_status() == CUDNN_STATUS_SUCCESS) {
+        bool specify_workspace_limit = scratch_allocator != nullptr;
+        auto memory_limit_bytes =
+            specify_workspace_limit
+                ? std::max(scratch_allocator->GetMemoryLimitInBytes(), int64{0})
+                : int64{0};
+        int64_t workspace_size = plan.getWorkspaceSize();
+        if (workspace_size <= memory_limit_bytes) {
+          current_plan =
+              std::unique_ptr<cudnn_frontend::ExecutionPlan>(
+                  new cudnn_frontend::ExecutionPlan(std::move(plan)));
+          break;
+        }
+      }
+    }
+    if (!current_plan) {
+      return port::Status(port::error::UNKNOWN,
+                          "CUDNN failed to get a working plan.");
+    }
+  }
+
+  size_t workspace_size = 0;
+  cudnnBackendDescriptor_t plan_desc;
+  std::string exec_plan_id = "unknown";
+  if (current_plan) {
+    exec_plan_id = current_plan->getTag();
+    workspace_size = current_plan->getWorkspaceSize();
+    plan_desc = current_plan->get_raw_desc();
+  } else {
+    exec_plan_id = plan_or->exec_plan_id();
+    auto workspace_size_or = plan_config.scratch_size();
+    if (workspace_size_or.has_value()) {
+      workspace_size = *workspace_size_or;
+    }
+    plan_desc = plan_or->exec_plan_desc();
+  }
+  dnn::ExecutionPlanDesc selected_plan(exec_plan_id, plan_desc);
+
+  DeviceMemory<uint8> scratch_memory;
+  if (workspace_size > 0) {
+    auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
+    if (scratch_or.ok()) {
+      scratch_memory = scratch_or.ValueOrDie();
+    } else if (plan_no_scratch_or.has_value()) {
+      selected_plan = {plan_no_scratch_or->exec_plan_id(),
+                       plan_no_scratch_or->exec_plan_desc()};
+    } else {
+      return port::Status(port::error::UNKNOWN,
+                          "CUDNN failed to allocate the scratch space for the "
+                          "plan or to find a working no-scratch plan.");
+    }
+  }
+
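+  // The tensor UIDs here must match the setId() values ('x', 'y', 'w') used
+  // when the tensors were built in GetCudnnOperationGraph.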
+  void* data_ptrs[] = {input_data.opaque(), output_data.opaque(),
+                       filter_data.opaque()};
+  int64_t uids[] = {'x', 'y', 'w'};
+  auto variantPack = cudnn_frontend::VariantPackBuilder()
+                         .setWorkspacePointer(scratch_memory.opaque())
+                         .setDataPointers(3, data_ptrs)
+                         .setUids(3, uids)
+                         .build();
+  RETURN_MSG_IF_CUDNN_ERROR(variantPack);
+
+  VLOG(4) << "\nDo convolution with plan tag: "
+          << selected_plan.exec_plan_id()
+          << "\nWorkspace size in bytes: " << workspace_size
+          << "\nVariantPack: " << variantPack.describe();
+
+  const bool is_profiling = output_profile_result != nullptr;
+
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
+  if (is_profiling) {
+    timer.reset(new GpuTimer(parent_));  // NOLINT
+    // The start and stop of the timer should be as close to the Cudnn call
+    // as possible. It is still possible for other threads to issue workloads
+    // onto this stream, so the measured time may include unrelated work.
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
+    }
+  }
+
+  cudnnStatus_t status = cudnnBackendExecute(cudnn.handle(),
+                                             selected_plan.exec_plan_desc(),
+                                             variantPack.get_raw_desc());
+  RETURN_IF_CUDNN_ERROR(status);
+
+  if (is_profiling) {
+    if (!timer->Stop(AsGpuStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
+    }
+    output_profile_result->set_plan(selected_plan);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+    output_profile_result->set_scratch_size(scratch_memory.size());
+  }
+
+  return port::Status::OK();
+#else
+  return port::InternalError("CUDNN version needs to be >= 8.1");
+#endif // CUDNN_VERSION >= 8100
+}
+
 template <typename ElementType, typename BiasType, typename ScaleType,
           typename OutputType>
 port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3467,6 +3900,82 @@
   return port::Status::OK();
 }
 
+bool CudnnSupport::GetConvolveExecutionPlans(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& output_descriptor,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    std::vector<std::unique_ptr<dnn::ConvolveExecutionPlan>>* out_exec_plans) {
+#if CUDNN_VERSION >= 8100
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto op_graph_status = GetCudnnOperationGraph(
+      kind, element_type, stream, input_descriptor, filter_descriptor,
+      output_descriptor, convolution_descriptor, cudnn);
+  if (!op_graph_status.status().ok()) {
+    return false;
+  }
+  auto op_graph = op_graph_status.ConsumeValueOrDie();
+
+  auto heur = cudnn_frontend::EngineHeuristicsBuilder()
+                  .setOperationGraph(*op_graph)
+                  .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
+                  .build();
+  RETURN_FALSE_IF_CUDNN_ERROR(heur);
+
+  auto fallback = cudnn_frontend::EngineFallbackListBuilder()
+                      .setOperationGraph(*op_graph)
+                      .setOperation(GetCudnnConvolutionType(kind))
+                      .build();
+  RETURN_FALSE_IF_CUDNN_ERROR(fallback);
+
+  auto &heur_configs = heur.getEngineConfig(heur.getEngineConfigCount());
+  auto &fallback_configs = fallback.getFallbackList();
+
+  VLOG(4) << "\nHeuristics engine configs size: " << heur_configs.size()
+          << "\nFallback engine configs size: " << fallback_configs.size();
+
+  cudnn_frontend::EngineConfigList filtered_configs;
+  if (RequireCudnnDeterminism()) {
+    cudnn_frontend::filter(heur_configs, filtered_configs,
+                           isNonDeterministicOrIsDownConverting);
+    cudnn_frontend::filter(fallback_configs, filtered_configs,
+                           isNonDeterministicOrIsDownConverting);
+  } else {
+    cudnn_frontend::filter(heur_configs, filtered_configs,
+                           isDownConvertingInputs);
+    cudnn_frontend::filter(fallback_configs, filtered_configs,
+                           isDownConvertingInputs);
+  }
+
+  VLOG(4) << "\nFiltered engine configs size: " << filtered_configs.size();
+
+  out_exec_plans->clear();
+  for (size_t i = 0; i < filtered_configs.size(); i++) {
+    auto plan = cudnn_frontend::ExecutionPlanBuilder()
+                    .setHandle(cudnn.handle())
+                    .setEngineConfig(filtered_configs[i], op_graph->getTag())
+                    .build();
+    if (plan.get_status() == CUDNN_STATUS_SUCCESS) {
+      out_exec_plans->push_back(
+          std::unique_ptr<dnn::ConvolveExecutionPlan>(
+              new CudnnConvolveExecutionPlan(std::move(plan))));
+      // We will use the first working plan when determinism is required.
+      if (RequireCudnnDeterminism()) {
+        break;
+      }
+    }
+  }
+
+  VLOG(4) << "\nReturned execution plans size: " << out_exec_plans->size();
+
+  return true;
+#else
+  return false;
+#endif // CUDNN_VERSION >= 8100
+}
+
 bool CudnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 941260e..4e70b4d 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -210,6 +210,15 @@
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
+  bool GetConvolveExecutionPlans(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      std::vector<std::unique_ptr<
+                      dnn::ConvolveExecutionPlan>>* out_exec_plans) override;
+
   bool GetRnnAlgorithms(
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
@@ -283,6 +292,19 @@
       dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
       dnn::ProfileResult* output_profile_result) override;
 
+  port::Status DoConvolve(
+      dnn::ConvolutionKind kind, dnn::DataType element_type,
+      dnn::DataType output_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::ExecutionPlanConfig& plan_config,
+      ScratchAllocator* scratch_allocator,
+      dnn::ProfileExecutionPlanResult* output_profile_result) override;
+
   port::Status DoFusedConvolve(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
       const DeviceMemory<double>& conv_input_data, double conv_input_scale,
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index bbc7f6a..22087e9 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -41,12 +41,32 @@
   }
 }
 
+uint64 ExecutionPlanDesc::hash() const {
+  auto p = exec_plan_id();
+  return absl::Hash<decltype(p)>()(p);
+}
+
+std::string ExecutionPlanDesc::ToString() const {
+  return absl::StrCat(exec_plan_id());
+}
+
 bool DnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
   return false;
 }
 
+bool DnnSupport::GetConvolveExecutionPlans(
+    dnn::ConvolutionKind /*kind*/, dnn::DataType /*element_type*/,
+    Stream* /*stream*/,
+    const dnn::BatchDescriptor& /*input_descriptor*/,
+    const dnn::FilterDescriptor& /*filter_descriptor*/,
+    const dnn::BatchDescriptor& /*output_descriptor*/,
+    const dnn::ConvolutionDescriptor& /*convolution_descriptor*/,
+    std::vector<std::unique_ptr<dnn::ConvolveExecutionPlan>>* /*exec_plans*/) {
+  return false;
+}
+
 bool DnnSupport::GetMIOpenConvolveAlgorithms(
     dnn::ConvolutionKind /*kind*/, dnn::DataType /*element_type*/,
     Stream* /*stream*/, const dnn::BatchDescriptor& /*input_descriptor*/,
@@ -264,6 +284,20 @@
   return absl::StrCat(algo, ", ", algo_no_scratch);
 }
 
+// -- ExecutionPlanConfig
+
+std::string ExecutionPlanConfig::ToString() const {
+  std::string plan_str = "none";
+  if (plan().has_value()) {
+    plan_str = plan()->ToString();
+  }
+  std::string plan_no_scratch_str = "none";
+  if (plan_no_scratch().has_value()) {
+    plan_no_scratch_str = plan_no_scratch()->ToString();
+  }
+  return absl::StrCat(plan_str, ", ", plan_no_scratch_str);
+}
+
 // -- BatchDescriptor
 
 BatchDescriptor::BatchDescriptor(int ndims)
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 6ca4234..7cac781 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -58,6 +58,10 @@
   Z = 2,
 };
 
+// Returns a copy of `input` reordered from layout `from` to layout `to`.
+std::vector<int64> ReorderDims(const std::vector<int64>& input,
+                               const DataLayout& from, const DataLayout& to);
+
 // Helper functions to make methods more readable.
 inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
   return data.rbegin()[static_cast<int64>(dim)];
@@ -185,6 +189,15 @@
   virtual ~RnnStateTensorDescriptor() {}
 };
 
+// Represents an execution plan for a convolution. The base implementations
+// are placeholders; DNN backends override them (see cuda_dnn.cc's
+// CudnnConvolveExecutionPlan).
+class ConvolveExecutionPlan {
+ public:
+  virtual ~ConvolveExecutionPlan() {}
+  virtual std::string getTag() { return "unknown"; }
+  virtual void* get_raw_desc() { return nullptr; }
+  virtual int64_t getWorkspaceSize() { return -1; }
+};
+
 // Returns a string representation of the given quantization mode.
 std::string QuantizedActivationModeString(QuantizedActivationMode mode);
 
@@ -766,6 +779,35 @@
   AlgorithmProto proto_;
 };
 
+// Collects parameters for DNN execution plans.
+class ExecutionPlanDesc {
+ public:
+  typedef std::string Index;
+  ExecutionPlanDesc() : ExecutionPlanDesc("unknown", nullptr) {}
+  ExecutionPlanDesc(Index plan_id, void* desc) {
+    proto_.set_exec_plan_id(plan_id);
+    exec_plan_desc_ = desc;
+  }
+  Index exec_plan_id() const { return proto_.exec_plan_id(); }
+  void* exec_plan_desc() const { return exec_plan_desc_; }
+  bool operator==(const ExecutionPlanDesc& other) const {
+    return exec_plan_id() == other.exec_plan_id();
+  }
+  // TODO(kaixih): Currently, hash() and ToString() only recognize the
+  // exec_plan_id. We might include more information in this class, such as the
+  // CUDNN numerical notes, which can tell whether the underlying engine uses
+  // deterministic algorithm, tensor cores, etc.
+  uint64 hash() const;
+
+  ExecutionPlanProto ToProto() const { return proto_; }
+
+  std::string ToString() const;
+
+ private:
+  ExecutionPlanProto proto_;
+  void* exec_plan_desc_;
+};
+
 // Describes the result from a perf experiment.
 //
 // Arguments:
@@ -795,6 +837,29 @@
   size_t scratch_size_ = 0;
 };
 
+class ProfileExecutionPlanResult {
+ public:
+  bool is_valid() const {
+    return plan_.has_value() &&
+           elapsed_time_in_ms() != std::numeric_limits<float>::max();
+  }
+
+  ExecutionPlanDesc plan() const { return *plan_; }
+  void set_plan(ExecutionPlanDesc val) { plan_ = val; }
+
+  float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
+  void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
+
+  size_t scratch_size() const { return scratch_size_; }
+  void set_scratch_size(size_t val) { scratch_size_ = val; }
+
+ private:
+  absl::optional<ExecutionPlanDesc> plan_;
+  float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
+  size_t scratch_size_ = 0;
+};
+
 // Describes the configuration for the algorithms that will used.
 //
 // Arguments:
@@ -845,6 +910,53 @@
   absl::optional<size_t> scratch_size_;
 };
 
+// Describes the configuration for the execution plans that will be used.
+//
+// Arguments:
+//  plan: the primary execution plan that should be used.
+//  plan_no_scratch: a secondary execution plan that should be used if
+//    allocation of the scratch memory fails.
+//  scratch_size: the size in bytes of the scratch memory needed by the
+//    primary execution plan.
+//
+// This class is only used on the CUDA platform with the cuDNN v8 library.
+// Given an execution plan, users can query its scratch size; for
+// convenience, we also cache that size in this class.
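+//
+// A minimal usage sketch (variable names are illustrative; in practice the
+// plan comes from DnnSupport::GetConvolveExecutionPlans):
+//
+//   dnn::ExecutionPlanDesc desc(plan->getTag(), plan->get_raw_desc());
+//   dnn::ExecutionPlanConfig plan_config(desc, plan->getWorkspaceSize());
+//   stream->ConvolveWithExecutionPlan(..., plan_config,
+//                                     /*output_profile_result=*/nullptr);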
+class ExecutionPlanConfig {
+ public:
+  ExecutionPlanConfig() {}
+  ExecutionPlanConfig(ExecutionPlanDesc plan, size_t scratch_size)
+      : plan_(plan), scratch_size_(scratch_size) {}
+  ExecutionPlanConfig(ExecutionPlanDesc plan, size_t scratch_size,
+                      ExecutionPlanDesc plan_no_scratch)
+      : plan_(plan), scratch_size_(scratch_size),
+        plan_no_scratch_(plan_no_scratch) {}
+  absl::optional<ExecutionPlanDesc> plan() const { return plan_; }
+  void set_plan(ExecutionPlanDesc val) { plan_ = val; }
+  absl::optional<ExecutionPlanDesc> plan_no_scratch() const {
+    return plan_no_scratch_;
+  }
+  void set_plan_no_scratch(ExecutionPlanDesc val) { plan_no_scratch_ = val; }
+  absl::optional<size_t> scratch_size() const { return scratch_size_; }
+  void set_scratch_size(size_t val) { scratch_size_ = val; }
+  bool operator==(const ExecutionPlanConfig& other) const {
+    return this->plan_ == other.plan_ &&
+           this->scratch_size_ == other.scratch_size_ &&
+           this->plan_no_scratch_ == other.plan_no_scratch_;
+  }
+  bool operator!=(const ExecutionPlanConfig& other) const {
+    return !(*this == other);
+  }
+  std::string ToString() const;
+
+ private:
+  absl::optional<ExecutionPlanDesc> plan_;
+  absl::optional<size_t> scratch_size_;
+  absl::optional<ExecutionPlanDesc> plan_no_scratch_;
+};
+
 // Describes a local response normalization (LRN). LRN is used e.g. in
 // dist_belief.
 //
@@ -1301,6 +1413,17 @@
       AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
+  virtual port::Status DoConvolve(
+      ConvolutionKind kind, DataType element_type, DataType output_type,
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      DeviceMemoryBase input_data, const FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data, const BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const ExecutionPlanConfig& plan_config,
+      ScratchAllocator* scratch_allocator,
+      ProfileExecutionPlanResult* output_profile_result) = 0;
+
   template <typename ElementType, typename OutputType>
   bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
                   const DeviceMemory<ElementType>& input_data,
@@ -1327,6 +1450,14 @@
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  virtual bool GetConvolveExecutionPlans(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      std::vector<std::unique_ptr<dnn::ConvolveExecutionPlan>>* out_exec_plans);
+
   virtual bool GetMIOpenConvolveAlgorithms(
       dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
       const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index f849b01..2671724 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -100,6 +100,12 @@
   MathType math_type = 2;
 }
 
+// CUDNN execution plan representation.
+message ExecutionPlanProto {
+  // A unique ID to represent which engine and knobs are used in the plan.
+  string exec_plan_id = 1;
+}
+
 // Convolution-specific parameters.
 message ConvolutionDescriptorProto {
   repeated int64 paddings = 1;
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index ccdb467..eab00e1 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -62,6 +62,10 @@
   return algo_config.ToString();
 }
 
+std::string ToVlogString(const dnn::ExecutionPlanConfig &plan_config) {
+  return plan_config.ToString();
+}
+
 std::string ToVlogString(dnn::ElementwiseOperation op) {
   return dnn::ElementwiseOperationString(op);
 }
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index e214ee4..59bca62 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -69,6 +69,7 @@
 class FilterDescriptor;
 class ConvolutionDescriptor;
 class ProfileResult;
+class ProfileExecutionPlanResult;
 class AlgorithmDesc;
 }  // namespace dnn
 
@@ -351,6 +352,28 @@
     return port::UnimplementedError("DNN library is not found.");
   }
 
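+  // Runs a forward convolution with a pre-selected cuDNN v8 execution plan
+  // (see dnn::ExecutionPlanConfig) instead of an algorithm. Returns an
+  // UnimplementedError when no DNN support is available.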
+  template <typename InputType, typename OutputType>
+  port::Status ConvolveWithExecutionPlan(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<InputType> &input_data,
+      const dnn::FilterDescriptor &filter_descriptor,
+      const DeviceMemory<InputType> &filter_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<OutputType> *output, ScratchAllocator *scratch_allocator,
+      const dnn::ExecutionPlanConfig &plan_config,
+      dnn::ProfileExecutionPlanResult *output_profile_result) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      return dnn->DoConvolve(
+          dnn::ConvolutionKind::FORWARD, dnn::ToDataType<InputType>::value,
+          dnn::ToDataType<OutputType>::value, this, input_descriptor,
+          input_data, filter_descriptor, filter_data,
+          output_descriptor, *output, convolution_descriptor, plan_config,
+          scratch_allocator, output_profile_result);
+    }
+    return port::UnimplementedError("DNN library is not found.");
+  }
+
   port::Status FusedConvolveWithAlgorithm(
       const dnn::BatchDescriptor &conv_input_descriptor,
       const DeviceMemory<double> &conv_input_data, double conv_input_scale,
@@ -433,6 +456,30 @@
       DeviceMemory<float> *output);
 
   template <typename ElementType>
+  port::Status ConvolveBackwardDataWithExecutionPlan(
+      const dnn::FilterDescriptor &filter_descriptor,
+      const DeviceMemory<ElementType> &filter_data,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<ElementType> backward_output_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::BatchDescriptor &input_descriptor,
+      DeviceMemory<ElementType> *backward_input_data,
+      ScratchAllocator *scratch_allocator,
+      const dnn::ExecutionPlanConfig &plan_config,
+      dnn::ProfileExecutionPlanResult *output_profile_result) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      return dnn->DoConvolve(
+          dnn::ConvolutionKind::BACKWARD_DATA,
+          dnn::ToDataType<ElementType>::value,
+          dnn::ToDataType<ElementType>::value, this, input_descriptor,
+          *backward_input_data, filter_descriptor, filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          plan_config, scratch_allocator, output_profile_result);
+    }
+    return port::UnimplementedError("DNN library is not found.");
+  }
+
+  template <typename ElementType>
   port::Status ConvolveBackwardDataWithAlgorithm(
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<ElementType> &filter_data,
@@ -496,6 +543,30 @@
     return port::UnimplementedError("DNN library is not found.");
   }
 
+  template <typename ElementType>
+  port::Status ConvolveBackwardFilterWithExecutionPlan(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<ElementType> &input_data,
+      const dnn::BatchDescriptor &output_descriptor,
+      DeviceMemory<ElementType> backward_output_data,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const dnn::FilterDescriptor &filter_descriptor,
+      DeviceMemory<ElementType> *backward_filter_data,
+      ScratchAllocator *scratch_allocator,
+      const dnn::ExecutionPlanConfig &plan_config,
+      dnn::ProfileExecutionPlanResult *output_profile_result) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      return dnn->DoConvolve(
+          dnn::ConvolutionKind::BACKWARD_FILTER,
+          dnn::ToDataType<ElementType>::value,
+          dnn::ToDataType<ElementType>::value, this, input_descriptor,
+          input_data, filter_descriptor, *backward_filter_data,
+          output_descriptor, backward_output_data, convolution_descriptor,
+          plan_config, scratch_allocator, output_profile_result);
+    }
+    return port::UnimplementedError("DNN library is not found.");
+  }
+
   Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,
                                    const DeviceMemory<double> &input_data,
                                    const dnn::BatchDescriptor &bias_descriptor,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 35b4844..8566585 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -274,6 +274,22 @@
                                             cc_minor, out_algorithms);
 }
 
+bool StreamExecutor::GetConvolveExecutionPlans(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream *stream,
+    const dnn::BatchDescriptor &input_descriptor,
+    const dnn::FilterDescriptor &filter_descriptor,
+    const dnn::BatchDescriptor &output_descriptor,
+    const dnn::ConvolutionDescriptor &convolution_descriptor,
+    std::vector<std::unique_ptr<dnn::ConvolveExecutionPlan>> *out_exec_plans) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetConvolveExecutionPlans(
+      kind, element_type, stream, input_descriptor, filter_descriptor,
+      output_descriptor, convolution_descriptor, out_exec_plans);
+}
+
 bool StreamExecutor::GetMIOpenConvolveAlgorithms(
     dnn::ConvolutionKind kind, dnn::DataType element_type, Stream *stream,
     const dnn::BatchDescriptor &input_descriptor, DeviceMemoryBase input_data,
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 43774cb..ed5e500 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -365,6 +365,15 @@
   bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                              std::vector<dnn::AlgorithmDesc> *out_algorithms);
 
+  // Returns the supported execution plans for the convolution operation.
+  bool GetConvolveExecutionPlans(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream *stream,
+      const dnn::BatchDescriptor &input_descriptor,
+      const dnn::FilterDescriptor &filter_descriptor,
+      const dnn::BatchDescriptor &output_descriptor,
+      const dnn::ConvolutionDescriptor &convolution_descriptor,
+      std::vector<std::unique_ptr<dnn::ConvolveExecutionPlan>> *out_exec_plans);
+
   // Returns the list of supported algorithms for the forward convolution
   // operation.
   bool GetMIOpenConvolveAlgorithms(