remove timeout from RNN executor Summary: I had a 30 sec timeout in the RNN executor to find deadlock bugs, but it looks like people are occasionally bumping into it in the course of normal business -- perhaps when the CPU is heavily used, the threads don't get enough time and exceed the timeout. Removed the timeout but retained the warning logging. Reviewed By: salexspb Differential Revision: D6001960 fbshipit-source-id: 5b2293359ee68c1c24f0d9e0406d88391e531280

commit: ca392b7c76d1eb6a2fdd411179604b14f74758df [log] [tgz]
author: Aapo Kyrola <akyrola@fb.com> Thu Oct 12 10:40:39 2017 -0700
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> Thu Oct 12 10:59:41 2017 -0700
tree: 90259293b52f136615614a094848f7c4dd66562a
parent: 6b22f64d2cb710bf44de1b457f76c7b1a79e73a2 [diff]
diff --git a/caffe2/operators/recurrent_network_executor.cc b/caffe2/operators/recurrent_network_executor.cc
index 035f369..48be752 100644
--- a/caffe2/operators/recurrent_network_executor.cc
+++ b/caffe2/operators/recurrent_network_executor.cc

@@ -182,27 +182,24 @@
   // Start threads if not started
   std::unique_lock<std::mutex> lk(countdown_mtx_);
   while (workers_.size() < num_threads_) {
-    VLOG(1) << "Start RNN worker " << workers_.size() << " / "
-              << num_threads_;
+    VLOG(1) << "Start RNN worker " << workers_.size() << " / " << num_threads_;
     workers_.push_back(
         std::thread(&ThreadedRecurrentNetworkExecutor::WorkerFunction, this));
   }
 
   // Wait until threads finish.
   Timer t;
-  cv_.wait_for(lk, std::chrono::seconds(30), [&] {
-    // Log if we are still running, so that we catch deadlocks.. there
-    // should not be any deadlocks, but...
-    if (t.Seconds() > 10) {
-      LOG(INFO) << "RNN Executor still running, remaining ops: " << countdown_;
-    }
-    return failed_ || countdown_ == 0;
-  });
-
-  CAFFE_ENFORCE_EQ(false, failed_, "Recurrent network execution failed");
-  CAFFE_ENFORCE_EQ(
-      0, countdown_, "Recurrent network execution did not finish in time");
-  CAFFE_ENFORCE_EQ(job_queue_.size(), 0);
+  while (!failed_ && countdown_ > 0) {
+    cv_.wait_for(lk, std::chrono::seconds(30), [&] {
+      // Log if we are still running, so that we catch deadlocks.. there
+      // should not be any deadlocks, but...
+      if (t.Seconds() > 10) {
+        LOG(INFO) << "RNN Executor still running, remaining ops: "
+                  << countdown_;
+      }
+      return failed_ || countdown_ == 0;
+    });
+  }
 }
 
 } // namespace caffe2
commit	ca392b7c76d1eb6a2fdd411179604b14f74758df	[log] [tgz]
author	Aapo Kyrola <akyrola@fb.com>	Thu Oct 12 10:40:39 2017 -0700
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	Thu Oct 12 10:59:41 2017 -0700
tree	90259293b52f136615614a094848f7c4dd66562a
parent	6b22f64d2cb710bf44de1b457f76c7b1a79e73a2 [diff]