[ROCm] Disable the fast NHWC implementation on the ROCm platform.
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 7322b4e..c40e9ad 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -733,11 +733,16 @@
return;
}
+#if GOOGLE_CUDA
// Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16
// in NHWC data layout. In all other configurations it's more efficient to
// run computation in NCHW data format.
const bool compute_in_nhwc =
DataTypeToEnum<T>::value == DT_HALF && IsVoltaOrLater(*stream->parent());
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool compute_in_nhwc = false;
+#endif
// We only do one directional conversion: NHWC->NCHW. We never convert in the
// other direction. Grappler layout optimizer selects preferred layout and
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 5dc45fb..38debfa 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -514,9 +514,10 @@
}
};
-#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#if !GOOGLE_CUDA
namespace {
// See implementation under GOOGLE_CUDA #ifdef below.
+// This is a CUDA-specific feature; do not enable it for non-CUDA builds.
bool BatchnormSpatialPersistentEnabled() { return false; }
} // namespace
#endif
@@ -535,6 +536,7 @@
}
}
+#if GOOGLE_CUDA
// NOTE(ezhulenev): See `BatchnormSpatialPersistentEnabled` documentation in the
// `cuda_dnn.cc` for details.
bool BatchnormSpatialPersistentEnabled() {
@@ -551,6 +553,8 @@
return false;
#endif
}
+#endif
+
} // namespace
template <typename U, typename T>
@@ -679,6 +683,7 @@
// If use_reserved_space we have reserve_space_3 output (only in
// FusedBatchNormV3 op).
+#if GOOGLE_CUDA
// Check if cuDNN batch normalization has a fast NHWC implementation:
// (1) In inference mode it's always fast.
// (2) Tensorflow enabled batchnorm spatial persistence, we are called
@@ -688,6 +693,10 @@
!is_training ||
(BatchnormSpatialPersistentEnabled() &&
DataTypeToEnum<T>::value == DT_HALF && use_reserved_space);
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool fast_nhwc_batch_norm = false;
+#endif
// If input tensor is in NHWC format, and we have a fast cuDNN
// implementation, there is no need to do data format conversion.
@@ -898,12 +907,17 @@
const int64 height = GetTensorDim(x, tensor_format, 'H');
const int64 width = GetTensorDim(x, tensor_format, 'W');
+#if GOOGLE_CUDA
// Check if cuDNN batch normalization has a fast NHWC implementation:
// (1) Tensorflow enabled batchnorm spatial persistence, and
// FusedBatchNormGradV3 passed non-null reserve space and allocator.
const bool fast_nhwc_batch_norm = BatchnormSpatialPersistentEnabled() &&
DataTypeToEnum<T>::value == DT_HALF &&
use_reserved_space;
+#else
+ // The fast NHWC implementation is a CUDA-only feature.
+ const bool fast_nhwc_batch_norm = false;
+#endif
// If input tensor is in NHWC format, and we have a fast cuDNN
// implementation, there is no need to do data format conversion.