blob: e522a3a6bea01dc21ccf495f20cb8e272624daf0 [file] [log] [blame]
#include "caffe2/utils/signal_handler.h"
#include "caffe2/core/logging.h"
#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
// Normal signal handler implementation.
#include <cxxabi.h>
#include <dirent.h>
#include <dlfcn.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <unwind.h>
#include <atomic>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <unordered_set>
#include "caffe2/core/init.h"
#include "caffe2/core/workspace.h"
// On Android builds, make sure the SYS_gettid / SYS_tgkill names used below
// exist by mapping them onto the raw __NR_* syscall numbers when they are
// missing.
// NOTE(review): presumably some Android NDK versions of <sys/syscall.h> omit
// the SYS_* aliases - confirm.
#ifdef C10_ANDROID
#ifndef SYS_gettid
#define SYS_gettid __NR_gettid
#endif
#ifndef SYS_tgkill
#define SYS_tgkill __NR_tgkill
#endif
#endif
namespace {
// Actions that were installed for SIGHUP / SIGINT before hookupHandler()
// replaced them. handleSignal() chains to them and unhookHandler() restores
// them.
struct sigaction previousSighup;
struct sigaction previousSigint;
// Number of SIGINT / SIGHUP deliveries seen by handleSignal(). SignalHandler
// instances compare these against their own snapshot to detect new signals.
std::atomic<int> sigintCount(0);
std::atomic<int> sighupCount(0);
// Reference count of hookupHandler() calls not yet matched by
// unhookHandler(); the handlers are only installed on the first hookup and
// only removed when the count drops back to zero.
std::atomic<int> hookedUpCount(0);
// Shared handler for SIGHUP and SIGINT: bumps the matching counter, then
// chains to the handler that was installed before ours (if any).
//
// The saved disposition may be SIG_DFL or SIG_IGN (for example, SIGHUP is
// SIG_IGN when the process is started under nohup). Those are sentinel values
// rather than callable functions, so they must never be invoked; a plain
// null check is not enough because SIG_IGN is non-null.
//
// Signals other than SIGHUP / SIGINT are ignored.
void handleSignal(int signal) {
  // TODO: what if the previous handler uses sa_sigaction?
  const struct sigaction* previous = nullptr;
  switch (signal) {
    case SIGHUP:
      sighupCount += 1;
      previous = &previousSighup;
      break;
    case SIGINT:
      sigintCount += 1;
      previous = &previousSigint;
      break;
    default:
      return;
  }
  const auto handler = previous->sa_handler;
  if (handler != SIG_DFL && handler != SIG_IGN) {
    handler(signal);
  }
}
// Installs handleSignal() for SIGHUP and SIGINT on the first call, saving the
// actions that were previously in place; every later call only increments the
// reference count. Aborts via LOG(FATAL) if installation fails.
void hookupHandler() {
  const bool alreadyHooked = hookedUpCount.fetch_add(1) != 0;
  if (alreadyHooked) {
    return;
  }
  struct sigaction action;
  // Block every signal while the handler runs.
  sigfillset(&action.sa_mask);
  // Restart interrupted system calls, if at all possible.
  action.sa_flags = SA_RESTART;
  action.sa_handler = &handleSignal;
  // Intercept SIGHUP, remembering whatever was installed before.
  const int hupStatus = sigaction(SIGHUP, &action, &previousSighup);
  if (hupStatus == -1) {
    LOG(FATAL) << "Cannot install SIGHUP handler.";
  }
  // Same for SIGINT.
  const int intStatus = sigaction(SIGINT, &action, &previousSigint);
  if (intStatus == -1) {
    LOG(FATAL) << "Cannot install SIGINT handler.";
  }
}
// Releases one reference taken by hookupHandler(). When the last reference is
// released, restores the SIGHUP and SIGINT actions that were in place before
// hookupHandler() installed handleSignal(). Aborts via LOG(FATAL) if an action
// cannot be restored.
void unhookHandler() {
  if (--hookedUpCount > 0) {
    return;
  }
  // Put back the previously saved actions (not SIG_DFL): whoever owned the
  // signals before us gets them back unchanged.
  if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
    LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
  }
  if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
    LOG(FATAL) << "Cannot uninstall SIGINT handler.";
  }
}
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// The mutex protects the bool: installFatalSignalHandlers(),
// uninstallFatalSignalHandlers() and printStackTracesOnFatalSignal() all take
// it before reading or writing fatalSignalHandlersInstalled.
std::mutex fatalSignalHandlersInstallationMutex;
bool fatalSignalHandlersInstalled;
// The SIGUSR2 action that was installed before ours. We keep it so the
// SIGUSR2 handler can forward signals that we did not send ourselves.
struct sigaction previousSigusr2;
// Flag dictating whether the SIGUSR2 handler falls back to the previous
// handler (false) or prints a stack trace (true). Set by fatalSignalHandler().
std::atomic<bool> fatalSignalReceived(false);
// Global state set when a fatal signal is received so that backtracing threads
// know why they're printing a stacktrace.
const char* fatalSignalName("<UNKNOWN>");
int fatalSignum(-1);
// This condition variable is used to wait for other threads to finish writing
// their stack trace while in the fatal signal handler (we can't use
// pthread_join because there's no way to convert from a tid to a pthread_t).
// writingMutex is the mutex paired with it and also serializes the writers.
pthread_cond_t writingCond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t writingMutex = PTHREAD_MUTEX_INITIALIZER;
// Table of the fatal signals handled by fatalSignalHandler(). Each entry pairs
// the signal's printable name and number with storage for the action that was
// installed before ours (filled in by installFatalSignalHandlers() and
// restored by uninstallFatalSignalHandlers()). The final entry, whose name is
// nullptr, terminates the table; lookups iterate until they reach it.
// NOTE(review): SIGINT is also hooked by hookupHandler(); whichever is
// installed last saves the other as its "previous" action - confirm this
// interaction is intended.
struct {
const char* name;
int signum;
struct sigaction previous;
} kSignalHandlers[] = {{"SIGABRT", SIGABRT, {}},
{"SIGINT", SIGINT, {}},
{"SIGILL", SIGILL, {}},
{"SIGFPE", SIGFPE, {}},
{"SIGBUS", SIGBUS, {}},
{"SIGSEGV", SIGSEGV, {}},
{nullptr, 0, {}}};
// Returns the storage slot holding the action that was installed for `signum`
// before ours, or nullptr when `signum` is not listed in kSignalHandlers.
struct sigaction* getPreviousSigaction(int signum) {
  for (auto& entry : kSignalHandlers) {
    if (entry.name == nullptr) {
      // Reached the terminating sentinel: signum is not in the table.
      break;
    }
    if (entry.signum == signum) {
      return &entry.previous;
    }
  }
  return nullptr;
}
// Returns the printable name registered for `signum` in kSignalHandlers, or
// nullptr when the signal is not one of ours.
const char* getSignalName(int signum) {
  const auto* entry = kSignalHandlers;
  // Advance until we either hit the matching row or the null-named sentinel.
  while (entry->name != nullptr && entry->signum != signum) {
    ++entry;
  }
  // For the sentinel row this is nullptr, which is exactly the "not found"
  // result.
  return entry->name;
}
// _Unwind_Backtrace callback: appends the instruction pointer of the frame
// described by `context` to the std::vector<uintptr_t> that `userInfo` points
// to, and asks the unwinder to keep walking.
_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
  auto* frames = static_cast<std::vector<uintptr_t>*>(userInfo);
  frames->push_back(_Unwind_GetIP(context));
  return _URC_NO_REASON;
}
// Collects the program counters of the calling thread's stack frames by
// running _Unwind_Backtrace with `unwinder` as the per-frame callback.
std::vector<uintptr_t> getBacktrace() {
  std::vector<uintptr_t> frames;
  void* const sink = &frames;
  _Unwind_Backtrace(unwinder, sink);
  return frames;
}
void printBlobSizes() {
::caffe2::Workspace::ForEach(
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
}
// Writes the calling thread's stack to std::cerr, one frame per line, as
//   [index] symbol+offset(pc) in object-path
//
// Symbol name and object path are resolved with dladdr(); C++ names are
// demangled when possible. Pieces that cannot be resolved are omitted, and
// the symbol falls back to "???".
//
// The offset is kept as a uintptr_t (not int) so it cannot be truncated on
// 64-bit targets, and it is only reported when dladdr() actually found a
// symbol address: with a null dli_saddr the subtraction would just yield the
// raw pc, which is not an offset.
void printStacktrace() {
  const std::vector<uintptr_t> pcs = getBacktrace();
  size_t frameIndex = 0;
  for (const uintptr_t pcAddr : pcs) {
    const void* pc = reinterpret_cast<const void*>(pcAddr);
    const char* path = nullptr;
    const char* name = "???";
    char* demangled = nullptr;
    bool hasOffset = false;
    uintptr_t offset = 0;
    std::cerr << "[" << frameIndex << "] ";
    Dl_info info;
    if (dladdr(pc, &info)) {
      path = info.dli_fname;
      if (info.dli_sname) {
        name = info.dli_sname;
        int status = -1;
        demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
        // On failure keep the mangled name; __cxa_demangle returns null then.
        if (status == 0 && demangled) {
          name = demangled;
        }
      }
      if (info.dli_saddr) {
        offset = pcAddr - reinterpret_cast<uintptr_t>(info.dli_saddr);
        hasOffset = true;
      }
    }
    std::cerr << name;
    if (hasOffset) {
      // Streamed as a pointer so it is rendered in hexadecimal.
      std::cerr << "+" << reinterpret_cast<void*>(offset);
    }
    std::cerr << "(" << pc << ")";
    if (path) {
      std::cerr << " in " << path;
    }
    std::cerr << std::endl;
    // __cxa_demangle allocates with malloc; free(nullptr) is a no-op.
    free(demangled);
    frameIndex += 1;
  }
}
// Forwards a signal to a previously installed action.
//
// action: the saved disposition to forward to (must not be null).
// signum/info/ctx: the arguments our own handler received; info and ctx are
//   only passed on when the saved action was registered with SA_SIGINFO.
//
// Nothing is called when the saved disposition is SIG_DFL or SIG_IGN: those
// are sentinel values, not function pointers. A plain null check is not
// sufficient because SIG_IGN is non-null, and invoking it would crash.
void callPreviousSignalHandler(
    struct sigaction* action,
    int signum,
    siginfo_t* info,
    void* ctx) {
  const auto handler = action->sa_handler;
  if (handler == SIG_DFL || handler == SIG_IGN) {
    return;
  }
  const bool wantsSiginfo = (action->sa_flags & SA_SIGINFO) == SA_SIGINFO;
  if (wantsSiginfo) {
    action->sa_sigaction(signum, info, ctx);
  } else {
    handler(signum);
  }
}
// Prints a header naming the fatal signal and the calling thread's id,
// followed by that thread's stack trace, to std::cerr.
//
// needsLock: when true, writingMutex is held while printing and writingCond
// is signalled afterwards. When false, neither is touched (the caller is
// expected to already hold the mutex).
void stacktraceSignalHandler(bool needsLock) {
  if (needsLock) {
    pthread_mutex_lock(&writingMutex);
  }
  const pid_t threadId = syscall(SYS_gettid);
  std::cerr << fatalSignalName << "(" << fatalSignum << "), Thread ";
  std::cerr << threadId << ": " << std::endl;
  printStacktrace();
  std::cerr << std::endl;
  if (!needsLock) {
    return;
  }
  pthread_mutex_unlock(&writingMutex);
  pthread_cond_signal(&writingCond);
}
// Our fatal signal entry point.
//
// For a signal listed in kSignalHandlers this:
//   1. marks the process as "aborting" so the SIGUSR2 handler prints stack
//      traces instead of forwarding to the previous SIGUSR2 handler,
//   2. walks /proc/self/task and has every thread print its stack trace (the
//      current thread directly, all others by sending them SIGUSR2 and
//      waiting on writingCond),
//   3. prints the workspace blob sizes,
//   4. restores the previously installed action for the signal and re-raises
//      it.
// Signals not in the table, and any fatal signal arriving after the first
// one, are ignored.
void fatalSignalHandler(int signum) {
  // Check if this is a proper signal that we declared above.
  const char* name = getSignalName(signum);
  if (!name) {
    return;
  }
  // Set the flag so that our SIGUSR2 handler knows that we're aborting and
  // that it should intercept any SIGUSR2 signal. exchange() makes the
  // "already handling?" test and the claim a single atomic step, so two
  // threads faulting at the same time cannot both proceed.
  if (fatalSignalReceived.exchange(true)) {
    return;
  }
  // Set state for other threads.
  fatalSignum = signum;
  fatalSignalName = name;
  // Linux doesn't have a nice userland API for enumerating threads so we
  // need to use the proc pseudo-filesystem.
  DIR* procDir = opendir("/proc/self/task");
  if (procDir) {
    const pid_t pid = getpid();
    const pid_t currentTid = syscall(SYS_gettid);
    struct dirent* entry;
    pthread_mutex_lock(&writingMutex);
    while ((entry = readdir(procDir)) != nullptr) {
      if (entry->d_name[0] == '.') {
        continue;
      }
      const pid_t tid = atoi(entry->d_name);
      if (tid == currentTid) {
        // Signalling ourselves would run the SIGUSR2 handler on this thread
        // before we reach pthread_cond_wait and deadlock on writingMutex, so
        // branch directly to the backtrace routine instead.
        stacktraceSignalHandler(false);
      } else if (syscall(SYS_tgkill, pid, tid, SIGUSR2) == 0) {
        // The target thread prints its trace and then signals writingCond.
        pthread_cond_wait(&writingCond, &writingMutex);
      }
      // If tgkill failed (e.g. the thread exited after readdir returned it)
      // nobody will ever signal writingCond for this tid, so waiting would
      // hang the crashing process forever; skip the thread instead.
    }
    pthread_mutex_unlock(&writingMutex);
    closedir(procDir);
  } else {
    perror("Failed to open /proc/self/task");
  }
  printBlobSizes();
  // Hand the signal back to whoever owned it before us and deliver it again.
  sigaction(signum, getPreviousSigaction(signum), nullptr);
  raise(signum);
}
// Our SIGUSR2 entry point. While a fatal signal is being handled, SIGUSR2
// means "print your stack trace"; otherwise the signal was not sent by us and
// is forwarded to the previously installed SIGUSR2 handler.
void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx) {
  if (!fatalSignalReceived) {
    // We deliberately stay installed (rather than swapping the old handler
    // back in) so that a later SIGUSR2 sent by fatalSignalHandler still
    // reaches us.
    callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
    return;
  }
  stacktraceSignalHandler(true);
}
// Installs fatalSignalHandler for every signal listed in kSignalHandlers so
// that we get stack traces from every thread when one of them terminates the
// process. Also installs a SIGUSR2 handler so that threads can communicate
// with each other. If you use SIGUSR2 yourself, install your handler before
// initializing caffe2: we fall back to the previous handler whenever we did
// not initiate the SIGUSR2.
//
// Calling this while the handlers are already installed is a no-op. Failures
// to install an individual handler are reported with perror() and do not stop
// the remaining installations.
void installFatalSignalHandlers() {
  std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
  if (fatalSignalHandlersInstalled) {
    return;
  }
  fatalSignalHandlersInstalled = true;
  // Value-initialize so that no field reaches sigaction() uninitialized.
  struct sigaction sa = {};
  sigemptyset(&sa.sa_mask);
  // Since we'll be in an exiting situation it's possible the regular stack is
  // corrupted, so ask for delivery on the alternate signal stack. Note that
  // SA_ONSTACK only has an effect if an alternate stack has been registered
  // with sigaltstack(); nothing in this function registers one.
  //
  // fatalSignalHandler takes a single int, so it is registered through
  // sa_handler WITHOUT SA_SIGINFO; SA_SIGINFO would tell the kernel to treat
  // the pointer as a three-argument sa_sigaction handler.
  sa.sa_flags = SA_ONSTACK;
  sa.sa_handler = ::fatalSignalHandler;
  for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
    if (sigaction(handler->signum, &sa, &handler->previous)) {
      std::string str("Failed to add ");
      str += handler->name;
      str += " handler!";
      perror(str.c_str());
    }
  }
  // The SIGUSR2 handler is a genuine three-argument handler (it forwards
  // info/ctx to the previous handler), so it does need SA_SIGINFO.
  sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
  sa.sa_sigaction = ::stacktraceSignalHandler;
  if (sigaction(SIGUSR2, &sa, &::previousSigusr2)) {
    perror("Failed to add SIGUSR2 handler!");
  }
}
// Undoes installFatalSignalHandlers(): restores the action that was in place
// before ours for every signal in kSignalHandlers and for SIGUSR2, clearing
// each saved action once it has been restored successfully.
//
// Calling this while the handlers are not installed is a no-op. Failures to
// restore an individual handler are reported with perror(); in that case the
// saved action is kept.
void uninstallFatalSignalHandlers() {
  std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
  if (!fatalSignalHandlersInstalled) {
    return;
  }
  fatalSignalHandlersInstalled = false;
  for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
    if (sigaction(handler->signum, &handler->previous, nullptr)) {
      std::string str("Failed to remove ");
      str += handler->name;
      str += " handler!";
      perror(str.c_str());
    } else {
      handler->previous = {};
    }
  }
  if (sigaction(SIGUSR2, &::previousSigusr2, nullptr)) {
    // This path removes the handler, so say so (the message used to claim
    // the failure happened while adding it).
    perror("Failed to remove SIGUSR2 handler!");
  } else {
    ::previousSigusr2 = {};
  }
}
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// Declares the boolean flag caffe2_print_stacktraces (default: false). It is
// read as FLAGS_caffe2_print_stacktraces by Caffe2InitFatalSignalHandler() to
// decide whether the fatal signal handlers get installed.
C10_DEFINE_bool(
caffe2_print_stacktraces,
false,
"If set, prints stacktraces when a fatal signal is raised.");
#endif
namespace caffe2 {
// Records the action to report for each signal, snapshots the current global
// SIGINT / SIGHUP counters so that only signals arriving from now on are
// noticed, and hooks up the process-wide handlers.
SignalHandler::SignalHandler(
    SignalHandler::Action SIGINT_action,
    SignalHandler::Action SIGHUP_action)
    : SIGINT_action_(SIGINT_action),
      SIGHUP_action_(SIGHUP_action),
      my_sigint_count_(sigintCount.load()),
      my_sighup_count_(sighupCount.load()) {
  hookupHandler();
}
// Releases this instance's reference on the process-wide SIGHUP / SIGINT
// handlers; unhookHandler() restores the previous handlers once the last
// reference is gone.
SignalHandler::~SignalHandler() {
unhookHandler();
}
// Return true iff a SIGINT has been received since the last time this
// function was called. The instance's snapshot of the global counter is
// brought up to date as a side effect.
bool SignalHandler::GotSIGINT() {
  const uint64_t latest = sigintCount;
  if (latest == my_sigint_count_) {
    return false;
  }
  my_sigint_count_ = latest;
  return true;
}
// Return true iff a SIGHUP has been received since the last time this
// function was called. The instance's snapshot of the global counter is
// brought up to date as a side effect.
bool SignalHandler::GotSIGHUP() {
  const uint64_t latest = sighupCount;
  if (latest == my_sighup_count_) {
    return false;
  }
  my_sighup_count_ = latest;
  return true;
}
// Reports the action configured for the most urgent pending signal: SIGHUP is
// checked first, then SIGINT. When a SIGHUP is pending, SIGINT is not checked
// during this call, so a simultaneous SIGINT stays pending for the next call.
// Returns Action::NONE when neither signal arrived.
SignalHandler::Action SignalHandler::CheckForSignals() {
  const bool sawHangup = GotSIGHUP();
  if (sawHangup) {
    return SIGHUP_action_;
  }
  return GotSIGINT() ? SIGINT_action_ : SignalHandler::Action::NONE;
}
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// Turns stack-trace printing on fatal signals on (print == true, installs the
// fatal signal handlers) or off (print == false, uninstalls them).
void setPrintStackTracesOnFatalSignal(bool print) {
  if (!print) {
    uninstallFatalSignalHandlers();
    return;
  }
  installFatalSignalHandlers();
}
// Returns whether the fatal signal handlers are currently installed. The
// flag is read under fatalSignalHandlersInstallationMutex.
bool printStackTracesOnFatalSignal() {
  std::lock_guard<std::mutex> guard(fatalSignalHandlersInstallationMutex);
  const bool installed = fatalSignalHandlersInstalled;
  return installed;
}
namespace internal {
// Init hook: installs the fatal signal handlers when the
// caffe2_print_stacktraces flag is set. Both arguments are unused and the
// function always returns true.
bool Caffe2InitFatalSignalHandler(int*, char***) {
  const bool wantStacktraces = FLAGS_caffe2_print_stacktraces;
  if (wantStacktraces) {
    setPrintStackTracesOnFatalSignal(true);
  }
  return true;
}
// Registers Caffe2InitFatalSignalHandler under that name, together with a
// human-readable description.
// NOTE(review): presumably the registered function is then run as part of
// caffe2's global initialization - confirm against caffe2/core/init.h.
REGISTER_CAFFE2_INIT_FUNCTION(
Caffe2InitFatalSignalHandler,
&Caffe2InitFatalSignalHandler,
"Inits signal handlers for fatal signals so we can see what if"
" caffe2_print_stacktraces is set.");
} // namespace internal
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace caffe2
#else // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
// TODO: Signal handling is not yet supported on non-Linux platforms; below is
// a minimal implementation that makes things compile.
namespace caffe2 {
// Stub constructor: remembers the requested actions and zeroes the counters.
// No signal handlers are installed in this configuration.
SignalHandler::SignalHandler(
    SignalHandler::Action SIGINT_action,
    SignalHandler::Action SIGHUP_action)
    : SIGINT_action_(SIGINT_action),
      SIGHUP_action_(SIGHUP_action),
      my_sigint_count_(0),
      my_sighup_count_(0) {}

// Nothing was hooked up, so there is nothing to undo.
SignalHandler::~SignalHandler() = default;

// Signals are never observed in this configuration.
bool SignalHandler::GotSIGINT() {
  return false;
}

// Signals are never observed in this configuration.
bool SignalHandler::GotSIGHUP() {
  return false;
}

// Always reports that no signal is pending.
SignalHandler::Action SignalHandler::CheckForSignals() {
  return SignalHandler::Action::NONE;
}
} // namespace caffe2
#endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)