fix bug in dependency inference for RNNExecutor
Summary: RNN executor did not consider race condition -type of dependency where an op A reads blob X and following op writes blob X. This happened in beam search with a inplace-reshape following FC op.
Reviewed By: jamesr66a
Differential Revision: D5792018
fbshipit-source-id: a5590d80e1b7b127abcdf2b1c2854ea56018e12f
diff --git a/caffe2/operators/recurrent_network_executor.cc b/caffe2/operators/recurrent_network_executor.cc
index 40fb097..b287b7a 100644
--- a/caffe2/operators/recurrent_network_executor.cc
+++ b/caffe2/operators/recurrent_network_executor.cc
@@ -141,7 +141,9 @@
} catch (::caffe2::EnforceNotMet& enf) {
std::unique_lock<std::mutex> lk(countdown_mtx_);
LOG(ERROR) << "Crash at thread " << id << " timestep " << job.timestep
- << " " << enf.what();
+ << " op:"
+ << ProtoDebugString(step_net_def_.op(job.op_idx))
+ << enf.what();
job_queue_.NoMoreJobs();
failed_ = true;
cv_.notify_one();
diff --git a/caffe2/operators/recurrent_network_executor.h b/caffe2/operators/recurrent_network_executor.h
index 55a6f33..0236302 100644
--- a/caffe2/operators/recurrent_network_executor.h
+++ b/caffe2/operators/recurrent_network_executor.h
@@ -166,12 +166,18 @@
* Add dependencies to ops in the next timestep that would write an op
* that this op has as an input or output. This is special for RNNs,
* since we can have ops running in different timesteps concurrently.
+ * Also, we need to check ops that output a blob that is input of
+ * of the op in question.
*/
void add_race_conflict_dependencies(
int opidx,
std::vector<RNNNetOperator>& rnn_ops,
std::set<int>* dep_ops) {
- for (int i = 0; i < opidx; i++) {
+
+ for (int i = 0; i < rnn_ops.size(); i++) {
+ if (i == opidx) {
+ continue;
+ }
if (rnn_ops[i].link_op && this->ignoreLinkDependencies()) {
continue;
}
@@ -182,10 +188,12 @@
break;
}
}
- for (auto& outp : step_net_def_.op(opidx).output()) {
- if (outp == dep_blob) {
- dep_ops->insert(i);
- break;
+ if (i < opidx) {
+ for (auto& outp : step_net_def_.op(opidx).output()) {
+ if (outp == dep_blob) {
+ dep_ops->insert(i);
+ break;
+ }
}
}
}
@@ -231,11 +239,13 @@
timestep_ops_template_,
&dependent_ops);
+ // Race conditions arise when operator writes a blob that is
+ // being read by another.
if (!this->ignoreLinkDependencies()) {
- // Race conditions arise when link ops can be run out-of-order.
add_race_conflict_dependencies(
- rnn_op.order, timestep_ops_template_, &dependent_ops);
+ rnn_op.order, timestep_ops_template_, &dependent_ops);
}
+
for (int i : dependent_ops) {
rnn_op.dependencies.push_back(i);
}