// blob: 37f30df9d0004cd419bebf1e330c493387ff838a [file] [log] [blame]
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Temporary dotprod-detection until we can rely on proper feature-detection
such as getauxval on Linux (which requires a newer Linux kernel than we can
currently count on for Android).
There are two main ways that this could be implemented: using a signal
handler or a fork. The current implementation uses a signal handler.
This is because on current Android, an uncaught signal gives a latency
of over 100 ms. In order for the fork approach to be worthwhile, it would
have to save us the hassle of handling signals, and such an approach thus
has an unavoidable 100ms latency. By contrast, the present signal-handling
approach has low latency.
Downsides of the current signal-handling approach include:
1. Setting and restoring signal handlers is not thread-safe: we can't
prevent another thread from interfering with us. We at least prevent
other threads from calling our present code concurrently by using a lock,
but we can't do anything about other threads using their own code to
set signal handlers.
2. Signal handlers are not entirely portable, e.g. b/132973173 showed that
on Apple platforms the EXC_BAD_INSTRUCTION signal is not always caught
by a SIGILL handler (difference between Release and Debug builds).
3. The signal handler approach looks confusing in a debugger (one has to
tell the debugger to 'continue' past the signal every time). Fix:
```
(gdb) handle SIGILL nostop noprint pass
```
Here is what the nicer fork-based alternative would look like.
Its only downside, as discussed above, is high latency, 100 ms on Android.
```
bool TryAsmSnippet(bool (*asm_snippet)()) {
int child_pid = fork();
if (child_pid == -1) {
// Fork failed.
return false;
}
if (child_pid == 0) {
// Child process code path. Pass the raw boolean return value of
// asm_snippet as exit code (unconventional: 1 means true == success).
_exit(asm_snippet());
}
int child_status;
waitpid(child_pid, &child_status, 0);
if (WIFSIGNALED(child_status)) {
// Child process terminated by signal, meaning the instruction was
// not supported.
return false;
}
// Return the exit code of the child, which per child code above was
// the return value of asm_snippet().
return WEXITSTATUS(child_status);
}
```
*/
#include "ruy/detect_arm.h"
#if (defined __linux__) && (defined __aarch64__)
#define RUY_DETECT_DOTPROD
#endif
#ifdef RUY_DETECT_DOTPROD
#include <setjmp.h>
#include <signal.h>
#include <cstdlib>
#include <cstring>
#include <mutex>
#ifdef __linux__
#include <sys/auxv.h>
#endif
#endif
namespace ruy {
#ifdef RUY_DETECT_DOTPROD
namespace {
// Jump buffer shared between TryAsmSnippet, which arms it with sigsetjmp right
// before running a snippet, and SignalHandler, which jumps back through it.
sigjmp_buf global_sigjmp_buf;

// Handler installed for SIGILL while a snippet is being tried. It never returns
// normally: it long-jumps to the point recorded in global_sigjmp_buf, which
// makes the matching sigsetjmp call return 1. The signal number is not used.
void SignalHandler(int /*signal_number*/) {
  constexpr int kResumeValue = 1;
  siglongjmp(global_sigjmp_buf, kResumeValue);
}
// RAII helper for calling sigprocmask to unblock all signals temporarily.
//
// The constructor replaces the signal mask with an empty set and remembers the
// previous mask. The destructor puts the previous mask back, but only if the
// constructor's sigprocmask call succeeded. Callers should check success()
// before relying on signals being unblocked.
//
// The type is neither copyable nor movable: a copy would run the destructor a
// second time and re-apply a stale mask.
class ScopeUnblockSignals final {
 public:
  ScopeUnblockSignals() {
    sigset_t empty_mask;
    sigemptyset(&empty_mask);
    // sigprocmask returns 0 on success.
    success_ = sigprocmask(SIG_SETMASK, &empty_mask, &old_procmask_) == 0;
  }

  ScopeUnblockSignals(const ScopeUnblockSignals&) = delete;
  ScopeUnblockSignals& operator=(const ScopeUnblockSignals&) = delete;

  ~ScopeUnblockSignals() {
    if (success_) {
      sigprocmask(SIG_SETMASK, &old_procmask_, nullptr);
    }
  }

  // Returns true if the constructor managed to clear the signal mask.
  bool success() const { return success_; }

 private:
  // Mask that was in effect before construction. Only written by a successful
  // sigprocmask call, and only read when success_ is true.
  sigset_t old_procmask_;
  bool success_ = false;
};
// RAII helper to install and uninstall a signal handler.
//
// The constructor installs `handler_function` as the handler for
// `signal_number` (with an empty sa_mask and no flags) and remembers the
// previously installed action. The destructor reinstalls that previous action,
// but only if the installation succeeded. Callers should check success()
// before relying on the handler being in place.
//
// The type is neither copyable nor movable: a copy would run the destructor a
// second time and reinstall a stale action.
class ScopeSigaction final {
 public:
  ScopeSigaction(int signal_number, void (*handler_function)(int))
      : signal_number_(signal_number) {
    struct sigaction action;
    // Zero every field first so that sa_flags and any platform-specific
    // members start from a known state.
    std::memset(&action, 0, sizeof(action));
    sigemptyset(&action.sa_mask);
    action.sa_handler = handler_function;
    // sigaction returns 0 on success.
    success_ = sigaction(signal_number_, &action, &old_action_) == 0;
  }

  ScopeSigaction(const ScopeSigaction&) = delete;
  ScopeSigaction& operator=(const ScopeSigaction&) = delete;

  ~ScopeSigaction() {
    if (success_) {
      sigaction(signal_number_, &old_action_, nullptr);
    }
  }

  // Returns true if the constructor managed to install the handler.
  bool success() const { return success_; }

 private:
  const int signal_number_;
  // Action that was installed before construction. Only written by a
  // successful sigaction call, and only read when success_ is true.
  struct sigaction old_action_;
  bool success_ = false;
};
// Try an asm snippet. Returns true if it passed i.e. ran without generating
// an illegal-instruction signal and returned true. Returns false otherwise.
bool TryAsmSnippet(bool (*asm_snippet)()) {
// This function installs and restores signal handlers and the signal-blocking
// mask. We can't prevent another thread from interfering, but we can at least
// put a big lock here so that it works if, for whatever reason, another
// thread calls this function concurrently.
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
ScopeUnblockSignals unblock_signals;
if (!unblock_signals.success()) {
return false;
}
ScopeSigaction handle_sigill(SIGILL, SignalHandler);
if (!handle_sigill.success()) {
return false;
}
// Set the long jump buffer to this point in the code. This normally returns
// 0 so we don't take this branch...
if (sigsetjmp(global_sigjmp_buf, false)) {
// ... except in the fake return from sigsetjmp that is produced when
// the long-jump back to here actually happened, that is, in the signal
// handler. In this case, we know that the asm_snippet triggered an illegal
// instruction signal, so we return false.
return false;
}
return asm_snippet();
}
// Executes one udot instruction on known inputs and checks the numeric result.
// Returns true only if the instruction executed and produced the expected
// value. This is meant to be run through TryAsmSnippet, which turns an
// illegal-instruction signal raised here into a false result.
bool DotprodAsmSnippet() {
// maratek@ mentioned that for some other ISA extensions (fp16)
// there have been implementations that failed to generate SIGILL even
// though they did not correctly implement the instruction. Just in case
// a similar situation might exist here, we do a simple correctness test.
int result = 0;
// The sequence below:
//   1. loads 100 into w0;
//   2. broadcasts it to all 16 byte lanes of v0 (the udot inputs);
//   3. broadcasts it to all 4 32-bit lanes of v1 (the udot accumulator);
//   4. runs udot v1.4s, v0.16b, v0.16b, emitted as a raw instruction word
//      (presumably so that assemblers without dotprod support can still
//      build this file; TODO confirm);
//   5. copies lane 0 of v1 into `result`.
// x0, v0 and v1 are hard-coded in the snippet, hence the clobber list.
asm volatile(
"mov w0, #100\n"
"dup v0.16b, w0\n"
"dup v1.4s, w0\n"
".word 0x6e809401 // udot v1.4s, v0.16b, v0.16b\n"
"mov %w[result], v1.s[0]\n"
: [result] "=r"(result)
:
: "x0", "v0", "v1");
// Expecting 100 (input accumulator value) + 100 * 100 + ... (repeat 4 times),
// that is 100 + 4 * 10000 = 40100.
return result == 40100;
}
// Detects dotprod support by actually executing DotprodAsmSnippet through
// TryAsmSnippet, which reports false if the snippet raises SIGILL.
bool DetectDotprodBySignalMethod() {
  const bool snippet_passed = TryAsmSnippet(DotprodAsmSnippet);
  return snippet_passed;
}
#ifdef __linux__
// Detects dotprod support by testing one bit of the value returned by
// getauxval(AT_HWCAP).
bool DetectDotprodByLinuxAuxvMethod() {
  // Same value as HWCAP_ASIMDDP in sufficiently recent Linux headers. It is
  // spelled out locally because, for the time being, we still need to support
  // building against older headers.
  constexpr unsigned long kLocalHwcapAsimddp = 1UL << 20;
  const unsigned long hwcap = getauxval(AT_HWCAP);
  return (hwcap & kLocalHwcapAsimddp) != 0;
}
#endif
} // namespace
// Returns true if dotprod support was detected, either through the Linux auxv
// method or, failing that, through the signal method.
bool DetectDotprod() {
#ifdef __linux__
  // The auxv method is always tried first, without checking the Linux version
  // beforehand. It is in the mainline Linux tree only from 4.14.151, but it
  // has been backported to earlier versions in Android vendor device trees.
  // Trying it costs next to nothing and the benefit is large, because the
  // signal method has higher latency and a substantial crash potential. The
  // signal method is therefore only used when auxv does not report dotprod.
  const bool found_by_auxv = DetectDotprodByLinuxAuxvMethod();
  if (!found_by_auxv) {
    return DetectDotprodBySignalMethod();
  }
  return true;
#else
  return DetectDotprodBySignalMethod();
#endif
}
#else // not defined RUY_DETECT_DOTPROD
// Variant used when RUY_DETECT_DOTPROD is not defined: no detection is
// attempted and dotprod is always reported as unavailable.
bool DetectDotprod() {
  return false;
}
#endif // defined RUY_DETECT_DOTPROD
} // namespace ruy