| #include "torch/csrc/autograd/function.h" |
| |
| #include "torch/csrc/autograd/engine.h" |
| #include "torch/csrc/autograd/functions/special.h" |
| #include "torch/csrc/autograd/variable.h" |
| #include "torch/csrc/jit/ir.h" |
| |
| #include <ATen/ATen.h> |
| |
#include <algorithm>
#include <cstdint>
#include <deque>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace torch { namespace autograd {

thread_local uint64_t Function::next_sequence_nr_ = 0;

auto Function::name() const -> std::string {
  return at::demangle(typeid(*this).name());
}

// This function is analogous to make_trace, which operates on PythonOp, but
// this function instead works for C++-implemented autograd Functions, which
// don't have any backing Python class. We still need to trace them!
variable_list Function::traced_apply(variable_list inputs) {
  using namespace torch::jit;
  // Traceable Functions are completely transparent to the JIT.
  if (is_traceable()) {
    return apply(inputs);
  }
  auto state = tracer::getTracingState(inputs);
  auto state_lock = state->lock();

  // Insert a CppOp in the trace.
  auto& graph = state->graph;
  auto* this_node = graph->createCppOp(get_shared_ptr());
  jit::tracer::recordSourceLocation(this_node);
  for (auto& input : inputs) {
    this_node->addInput(tracer::getValueTrace(state, input));
  }
  graph->appendNode(this_node);

  // Finally apply this Function.
  state_lock.unlock();
  variable_list outputs = apply(inputs);
  state_lock.lock();

  // Set up output traces.
  auto num_outputs = outputs.size();
  for (size_t i = 0; i < num_outputs; ++i) {
    auto& output = outputs[i];
    auto sel = this_node->addOutput();
    // TODO: At the moment, C++ does not track shared storage. It
    // should. Update this when that happens.
    if (output.defined()) {
      sel->inferTypeFrom(output.data());
      tracer::setValueTrace(state, output, sel);
    }
  }

  if (!passes_state_transparently()) {
    auto this_eval = dynamic_cast<Eval*>(this);
    // Evals consume the handle from the context edge of the forward node.
    if (this_eval)
      this_node->addInput(this_eval->forward_ctx_select);
    // There's no point in wrapping functions in Eval if we know they are
    // already part of another Eval subgraph. This is both a small
    // optimization, and it allows us to not implement saved_variables() in
    // many functions.
    const bool should_trace_backward = tracing_state_->in_eval_subgraph;
    if (!should_trace_backward) {
      auto saved_vars = saved_variables();
      if (!saved_vars)
        throw std::runtime_error("saved_variables() needed but not implemented in " + name());
      variable_list bw_subgraph_inputs(inputs);
      for (auto& saved_var : *saved_vars) {
        bw_subgraph_inputs.emplace_back(saved_var.unpack(get_shared_ptr()));
      }
      tracer::nontraceableBackwardSubgraph(bw_subgraph_inputs, outputs);
    }
    bool has_backwards_eval = !should_trace_backward || this_eval;
    if (has_backwards_eval)
      set_up_context_edge(this_node, inputs, outputs);
  }
  return outputs;
}

void Function::set_up_context_edge(
    jit::Node* this_node,
    const variable_list& inputs,
    const variable_list& outputs) {
  auto ctx_select = this_node->addOutput();
  ctx_select->setType(jit::HandleType::get());
  auto backward_eval = Eval::getBackwardEval(inputs, outputs);
  if (backward_eval)
    backward_eval->forward_ctx_select = ctx_select;
}

AnomalyMetadata* Function::metadata() noexcept {
  if (!anomaly_metadata_) {
    anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
  }
  return anomaly_metadata_.get();
}

/*
 * Fix for #5534: prevent stack overflow on deletion of deep computation graph
 *
 * Sometimes one can end up with a very big computation graph of Functions
 * and Edges. Each Function holds a list of Edges, and each Edge holds a
 * std::shared_ptr<Function>. Deleting a std::shared_ptr<Function> can
 * therefore trigger the recursive deletion of other
 * std::shared_ptr<Function>'s: this can stack overflow if the graph
 * is deep enough. Here is an example of such a graph:
 *
 * shared_ptr<Function> -> Edge -> shared_ptr<Function> -> Edge -> ... -> shared_ptr<Function>
 *
 * The solution here is to use a custom deleter with each
 * std::shared_ptr<Function>. The custom deleter keeps track of how many
 * nested deleters it is in. When this number exceeds the maximum allowed
 * depth, the Function* pointers to be deleted are accumulated in a
 * per-thread delete queue and handled by one of the enclosing deleters.
 *
 * Note that these custom deleters are NOT necessary for deleting PyFunction.
 * This is because a THPFunction Python object owns a PyFunction that is in a
 * computation graph. When Python objects get recursively destroyed, they are
 * also queued into a delete list. This happens very early for them (at a
 * nesting depth of 50 deleters):
 * https://github.com/python/cpython/blob/f320be77ffb73e3b9e7fc98c37b8df3975d84b40/Include/object.h#L1024-L1063
 * so we don't need to worry about them.
 */
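
/*
 * A minimal sketch of how the custom deleter is meant to be attached
 * (illustrative only; the actual call sites live where graph nodes are
 * constructed, not in this file, and "MyFunction" is a made-up placeholder):
 *
 *   std::shared_ptr<Function> fn(new MyFunction(), deleteFunction);
 *
 * Every release of such a shared_ptr then goes through deleteFunction()
 * below instead of calling `delete` directly, which is what lets us bound
 * the recursion depth.
 */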

thread_local std::deque<Function*> deleteFunctionQueue;
thread_local size_t deleteFunctionRecursionDepth = 0;

/*
 * If this number is set too high, a deep computation graph can still
 * stack overflow. The procedure for setting this number was to
 * 1) find the smallest value of this limit that no longer guards against
 *    stack overflows on various machines
 * 2) take the minimum of all such values and subtract some leeway, because
 *    the memory used by these stack frames will probably grow over time.
 * Testing on a few machines, the magic numbers were:
 * - Mac OSX (Macbook Pro 15): ~60000
 * - A beefy Ubuntu 16.04 box: ~15000
 * - Windows AWS instance (g3.4xlarge): variable. My two attempts at
 *   different times gave ~8300 and 3669.
 */
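// Reading the numbers above (an interpretation, not an exact formula): the
// smallest non-Windows measurement (~15000) is rounded down to the 10000
// used below, and the smallest Windows measurement (3669) is rounded down
// to 3000.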
#ifdef _WIN32
constexpr size_t kDeleteFunctionMaxRecursionDepth = 3000;
#else
constexpr size_t kDeleteFunctionMaxRecursionDepth = 10000;
#endif

struct RecursionDepthCounter {
 public:
  explicit RecursionDepthCounter() {
    ++deleteFunctionRecursionDepth;
  }
  ~RecursionDepthCounter() {
    --deleteFunctionRecursionDepth;
  }

  size_t value() {
    return deleteFunctionRecursionDepth;
  }
};

/*
 * Note that the custom deleter deletes in BFS style. Without the custom
 * deleter, the computation graph is deleted in DFS style. The BFS deletion
 * is valid (and safe) because once a shared_ptr<Function>'s reference count
 * hits 0, nothing else will access it.
 */
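/*
 * A worked example (with a hypothetical kDeleteFunctionMaxRecursionDepth of
 * 2, purely for illustration): releasing the head of a chain A -> B -> C -> D
 * calls deleteFunction(A) at depth 1, which deletes A and thereby calls
 * deleteFunction(B) at depth 2, which deletes B and calls deleteFunction(C)
 * at depth 3. C exceeds the limit, so it is pushed onto deleteFunctionQueue
 * and that call returns immediately. Back at depth 2 (== the limit), B's
 * frame sees the non-empty queue and drains it iteratively: deleting C
 * queues D (its nested deleter is again past the limit), and the same loop
 * then pops and deletes D. The nesting of deleters therefore never exceeds
 * the limit by more than one level.
 */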
void deleteFunction(Function* function) {
  RecursionDepthCounter recursion_depth;

  if (recursion_depth.value() > kDeleteFunctionMaxRecursionDepth) {
    deleteFunctionQueue.push_back(function);
    return;
  }

  delete function;

  if (deleteFunctionQueue.size() == 0) {
    return;
  }
  if (recursion_depth.value() != kDeleteFunctionMaxRecursionDepth) {
    AT_ERROR("Only one deleter per thread should be able to process "
             "the delete queue. Please open an issue.");
  }
  while (deleteFunctionQueue.size() > 0) {
    auto queued_function = deleteFunctionQueue.front();
    deleteFunctionQueue.pop_front();
    delete queued_function;
  }
}

}} // namespace torch::autograd