blob: 5aa1e307c1f33b955c6a142c53c484876a9b00c8 [file] [log] [blame]
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Temporary dotprod-detection until we can rely on proper feature-detection
such as getauxval on Linux (requires a newer Linux kernel than we can
currently rely on on Android).
There are two main ways that this could be implemented: using a signal
handler or a fork. The current implementation uses a signal handler.
This is because on current Android, an uncaught signal gives a latency
of over 100 ms. In order for the fork approach to be worthwhile, it would
have to save us the hassle of handling signals, and such an approach thus
has an unavoidable 100ms latency. By contrast, the present signal-handling
approach has low latency.
Downsides of the current signal-handling approach include:
1. Setting and restoring signal handlers is not thread-safe: we can't
prevent another thread from interfering with us. We at least prevent
other threads from calling our present code concurrently by using a lock,
but we can't do anything about other threads using their own code to
set signal handlers.
2. Signal handlers are not entirely portable, e.g. b/132973173 showed that
on Apple platform the EXC_BAD_INSTRUCTION signal is not always caught
by a SIGILL handler (difference between Release and Debug builds).
3. The signal handler approach looks confusing in a debugger (has to
tell the debugger to 'continue' past the signal every time). Fix:
```
(gdb) handle SIGILL nostop noprint pass
```
Here is what the nicer fork-based alternative would look like.
Its only downside, as discussed above, is high latency, 100 ms on Android.
```
bool try_asm_snippet(bool (*asm_snippet)()) {
int child_pid = fork();
if (child_pid == -1) {
// Fork failed.
return false;
}
if (child_pid == 0) {
// Child process code path. Pass the raw boolean return value of
// asm_snippet as exit code (unconventional: 1 means true == success).
_exit(asm_snippet());
}
int child_status;
waitpid(child_pid, &child_status, 0);
if (WIFSIGNALED(child_status)) {
// Child process terminated by signal, meaning the instruction was
// not supported.
return false;
}
// Return the exit code of the child, which per child code above was
// the return value of asm_snippet().
return WEXITSTATUS(child_status);
}
```
*/
#if defined __aarch64__ && defined __linux__
#define RUY_IMPLEMENT_DETECT_DOTPROD
#endif
#ifdef RUY_IMPLEMENT_DETECT_DOTPROD
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <mutex>
// Intentionally keep checking for __linux__ here in case we want to
// extend RUY_IMPLEMENT_DETECT_DOTPROD outside of linux in the future.
#ifdef __linux__
#include <sys/auxv.h>
#include <sys/utsname.h>
#endif
#endif
namespace ruy {
#ifdef RUY_IMPLEMENT_DETECT_DOTPROD
namespace {
// Waits until there are no pending SIGILL's.
void wait_until_no_pending_sigill() {
sigset_t pending;
do {
sigemptyset(&pending);
sigpending(&pending);
} while (sigismember(&pending, SIGILL));
}
// long-jump buffer used to continue execution after a caught SIGILL.
sigjmp_buf& global_sigjmp_buf_just_before_trying_snippet() {
static sigjmp_buf g;
return g;
};
// SIGILL signal handler. Long-jumps to just before
// we ran the snippet that we know is the only thing that could have generated
// the SIGILL.
void sigill_handler(int) {
siglongjmp(global_sigjmp_buf_just_before_trying_snippet(), 1);
}
// Try an asm snippet. Returns true if it passed i.e. ran without generating
// a SIGILL and returned true. Returns false if a SIGILL was generated, or
// if it returned false.
// Other signals are not handled.
bool try_asm_snippet(bool (*asm_snippet)()) {
// This function installs and restores signal handlers. The only way it's ever
// going to be reentrant is with a big lock around it.
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
// Install the SIGILL signal handler. Save any existing signal handler for
// restoring later.
struct sigaction sigill_action;
memset(&sigill_action, 0, sizeof(sigill_action));
sigill_action.sa_handler = sigill_handler;
sigemptyset(&sigill_action.sa_mask);
struct sigaction old_action;
sigaction(SIGILL, &sigill_action, &old_action);
// Try the snippet.
bool got_sigill =
sigsetjmp(global_sigjmp_buf_just_before_trying_snippet(), true);
bool snippet_retval = false;
if (!got_sigill) {
snippet_retval = asm_snippet();
wait_until_no_pending_sigill();
}
// Restore the old signal handler.
sigaction(SIGILL, &old_action, nullptr);
return snippet_retval && !got_sigill;
}
bool dotprod_asm_snippet() {
// maratek@ mentioned that for some other ISA extensions (fp16)
// there have been implementations that failed to generate SIGILL even
// though they did not correctly implement the instruction. Just in case
// a similar situation might exist here, we do a simple correctness test.
int result = 0;
asm volatile(
"mov w0, #100\n"
"dup v0.16b, w0\n"
"dup v1.4s, w0\n"
".word 0x6e809401 // udot v1.4s, v0.16b, v0.16b\n"
"mov %w[result], v1.s[0]\n"
: [ result ] "=r"(result)
:
: "x0", "v0", "v1");
// Expecting 100 (input accumulator value) + 100 * 100 + ... (repeat 4 times)
return result == 40100;
};
bool DetectDotprodBySigIllMethod() {
return try_asm_snippet(dotprod_asm_snippet);
}
// Intentionally keep checking for __linux__ here in case we want to
// extend RUY_IMPLEMENT_DETECT_DOTPROD outside of linux in the future.
#ifdef __linux__
bool IsLinuxAuxvMethodAvailable() {
struct utsname utsbuf;
uname(&utsbuf);
int major, minor, patch;
if (3 != sscanf(utsbuf.release, "%d.%d.%d", &major, &minor, &patch)) {
return false;
}
// This is implemented in linux 4.14.111, not in 4.14.105.
return major > 4 ||
(major == 4 && (minor > 14 || (minor == 14 && patch >= 111)));
}
bool DetectDotprodByLinuxAuxvMethod() {
// This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
// however we need to support building against older headers for the time
// being.
const int kLocalHwcapAsimddp = 1 << 20;
return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
}
#endif
} // namespace
bool DetectDotprod() {
#ifdef __linux__
if (IsLinuxAuxvMethodAvailable()) {
return DetectDotprodByLinuxAuxvMethod();
}
#endif
return DetectDotprodBySigIllMethod();
}
#else
bool DetectDotprod() { return false; }
#endif
} // namespace ruy