Euthanize a process with timeout
Summary: vigneshr has been seeing the process randomly fail to exit at the end of a run. We don't know what causes this, so this change helps in two ways: (1) by calling timeout_guard.EuthanizeIfNecessary(600) at the end of the operator, you ensure that the process is killed after 10 minutes, allowing for a retry; (2) the kill causes Python stack traces to be dumped, which helps debug the real issue.
Differential Revision: D4635781
fbshipit-source-id: b558418c80671c00effdd514e4ddc01e935c95df
diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py
index d93fd69..f6c1122 100644
--- a/caffe2/python/timeout_guard.py
+++ b/caffe2/python/timeout_guard.py
@@ -76,12 +76,11 @@
import traceback
code = []
for threadId, stack in sys._current_frames().items():
- if threadId == self.caller_thread.ident:
- code.append("\n# ThreadID: %s" % threadId)
- for filename, lineno, name, line in traceback.extract_stack(stack):
- code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
- if line:
- code.append(" %s" % (line.strip()))
+ code.append("\n# ThreadID: %s" % threadId)
+ for filename, lineno, name, line in traceback.extract_stack(stack):
+ code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
+ if line:
+ code.append(" %s" % (line.strip()))
print("\n".join(code))
os.kill(os.getpid(), signal.SIGINT)
@@ -96,3 +95,12 @@
watcher.condition.acquire()
watcher.condition.notify()
watcher.condition.release()
+
+
+def EuthanizeIfNecessary(timeout_secs=120):
+ '''
+ Call this if you have a problem with the process getting stuck at shutdown.
+ It will kill the process if it does not terminate within timeout_secs seconds.
+ '''
+ watcher = WatcherThread(timeout_secs)
+ watcher.start()