blob: c510dc204f71990e782d5bf086dea631a7e8bb4d [file] [log] [blame]
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <tuple>
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/denormal.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/platform.h"
// If we're on gcc 4.8 or older, there's a known bug that prevents the use of
// intrinsics when the architecture is not defined in the flags. See
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
#if !defined(__SSE3__) && !defined(__clang__) && \
(defined(__GNUC__) && (__GNUC__ < 4) || \
((__GNUC__ == 4) && (__GNUC_MINOR__ < 9)))
#define GCC_WITHOUT_INTRINSICS
#endif
// Only try to use SSE3 instructions if we're on an x86 platform, and it's not
// mobile, and we're not on a known bad gcc version.
#if defined(PLATFORM_IS_X86) && !defined(IS_MOBILE_PLATFORM) && \
!defined(GCC_WITHOUT_INTRINSICS)
#define DENORM_USE_INTRINSICS
#endif
#ifdef DENORM_USE_INTRINSICS
#include <pmmintrin.h>
#endif
namespace tensorflow {
namespace port {
static void SetDenormalState(bool flush_zero_mode, bool denormals_zero_mode) {
// For now, we flush denormals only on SSE 3. Other architectures such as ARM
// can be added as needed.
#ifdef DENORM_USE_INTRINSICS
if (TestCPUFeature(SSE3)) {
// Restore flags
_MM_SET_FLUSH_ZERO_MODE(flush_zero_mode ? _MM_FLUSH_ZERO_ON
: _MM_FLUSH_ZERO_OFF);
_MM_SET_DENORMALS_ZERO_MODE(denormals_zero_mode ? _MM_DENORMALS_ZERO_ON
: _MM_DENORMALS_ZERO_OFF);
}
#endif
}
static std::pair<bool, bool> GetDernormalState() {
// For now, we flush denormals only on SSE 3. Other architectures such as ARM
// can be added as needed.
#ifdef DENORM_USE_INTRINSICS
if (TestCPUFeature(SSE3)) {
// Save existing flags
bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
bool denormals_zero_mode =
_MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
return {flush_zero_mode, denormals_zero_mode};
}
#endif
return {false, false};
}
ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState() {
std::tie(flush_zero_mode_, denormals_zero_mode_) = GetDernormalState();
}
ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
SetDenormalState(flush_zero_mode_, denormals_zero_mode_);
}
ScopedFlushDenormal::ScopedFlushDenormal() {
SetDenormalState(/*flush_zero_mode=*/true, /*denormals_zero_mode=*/true);
}
ScopedDontFlushDenormal::ScopedDontFlushDenormal() {
SetDenormalState(/*flush_zero_mode=*/false, /*denormals_zero_mode=*/false);
}
} // namespace port
} // namespace tensorflow