[ROCm] Disable the fast NHWC implementation on the ROCm platform.
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 7322b4e..c40e9ad 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -733,11 +733,16 @@
return;
}
+#if GOOGLE_CUDA
// Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16
// in NHWC data layout. In all other configurations it's more efficient to
// run computation in NCHW data format.
const bool compute_in_nhwc =
DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent());
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool compute_in_nhwc = false;
+#endif
// We only do one directional conversion: NHWC->NCHW. We never convert in the
// other direction. Grappler layout optimizer selects preferred layout and
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 5dc45fb..38debfa 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -514,9 +514,10 @@
}
};
-#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#if !GOOGLE_CUDA
namespace {
// See implementation under GOOGLE_CUDA #ifdef below.
+// This is a CUDA-specific feature; do not enable it for non-CUDA builds.
bool BatchnormSpatialPersistentEnabled() { return false; }
} // namespace
#endif
@@ -535,6 +536,7 @@
}
}
+#if GOOGLE_CUDA
// NOTE(ezhulenev): See `BatchnormSpatialPersistentEnabled` documentation in the
// `cuda_dnn.cc` for details.
bool BatchnormSpatialPersistentEnabled() {
@@ -551,6 +553,8 @@
return false;
#endif
}
+#endif
+
} // namespace
template <typename U, typename T>
@@ -679,6 +683,7 @@
// If use_reserved_space we have reserve_space_3 output (only in
// FusedBatchNormV3 op).
+#if GOOGLE_CUDA
// Check if cuDNN batch normalization has a fast NHWC implementation:
// (1) In inference mode it's always fast.
// (2) Tensorflow enabled batchnorm spatial persistence, we are called
@@ -688,6 +693,10 @@
!is_training ||
(BatchnormSpatialPersistentEnabled() &&
DataTypeToEnum<T>::value == DT_HALF && use_reserved_space);
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool fast_nhwc_batch_norm = false;
+#endif
// If input tensor is in NHWC format, and we have a fast cuDNN
// implementation, there is no need to do data format conversion.
@@ -898,12 +907,17 @@
const int64 height = GetTensorDim(x, tensor_format, 'H');
const int64 width = GetTensorDim(x, tensor_format, 'W');
+#if GOOGLE_CUDA
// Check if cuDNN batch normalization has a fast NHWC implementation:
// (1) Tensorflow enabled batchnorm spatial persistence, and
// FusedBatchNormGradV3 passed non-null reserve space and allocator.
const bool fast_nhwc_batch_norm = BatchnormSpatialPersistentEnabled() &&
DataTypeToEnum<T>::value == DT_HALF &&
use_reserved_space;
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool fast_nhwc_batch_norm = false;
+#endif
// If input tensor is in NHWC format, and we have a fast cuDNN
// implementation, there is no need to do data format conversion.