| #ifndef NO_PYTHON |
| #include <Python.h> |
| #endif |
| #include "interpreter.h" |
| |
| #include "torch/csrc/autograd/edge.h" |
| #include "torch/csrc/autograd/function.h" |
| #include "torch/csrc/autograd/functions/special.h" |
| #include "torch/csrc/autograd/profiler.h" |
| #include "torch/csrc/autograd/variable.h" |
| #include "torch/csrc/jit/fusion_compiler.h" |
| #include "torch/csrc/jit/generated/aten_dispatch.h" |
| #include "torch/csrc/jit/graph_executor.h" |
| #include "torch/csrc/jit/ir.h" |
| #include "torch/csrc/jit/tensor_conversions.h" |
| |
| #include <typeinfo> |
| |
| #ifndef NO_PYTHON |
| #include "torch/csrc/autograd/python_engine.h" |
| #include "torch/csrc/autograd/python_variable.h" |
| #include "torch/csrc/jit/pybind.h" |
| #include "torch/csrc/utils/auto_gil.h" |
| |
| namespace py = pybind11; |
| #endif |
| |
| namespace torch { namespace jit { |
| |
| |
| // Before we translate to interpreter instructions, we do |
| // some preprocessing of the graph to turn it into a form that is closer |
| // to what the instructions will look like. |
| // In particular we: |
| // * desugar Loop trip counts into c = 0, c += 1 arithmetic carried |
| // through the loop (see desugarTripCounts) |
| // * flatten stages so that each stage starts with a load from the stack |
| // and ends with a store to the stack |
| // * compute move_flags (see Outputs) |
| // * insert Drop nodes for any value that is unused, creating a dummy use |
| // that will cause the interpreter to free the value. |
| // A Drop node is a node with no outputs that simply pops its inputs off |
| // the stack, ensuring the interpreter releases references to values that |
| // are never used. |
| // Drop nodes are also inserted when the last use of a value is inside |
| // conditionally run control flow (e.g. one side of an If), so that the |
| // interpreter frees the value only after the control flow has reconverged. |
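| // For example (a schematic sketch; names invented for illustration): |
| // %a = ... |
| // %r = prim::If(%c) |
| // block0: %b = f(%a) -> (%b)  (last use of %a is inside the If) |
| // block1: -> (%d) |
| // prim::Drop(%a)  (inserted after the If reconverges) |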
| // Outputs are: |
| // * graph - the post processed copy of g |
| // * move_flags[n] - a list of booleans, one for each input, |
| // indicating whether this is the last use of the value. The interpreter |
| // should generate a move rather than a copy in this case. |
| // * stage_input_types - the type annotations on the inputs to each stage; |
| // these can be removed once the backward tracer is no longer used |
| |
| namespace { |
| |
| // new_cond = (i < max_trip_count) && cond |
| Value* createTripCountConjunctiveCondition( |
| Graph* g, |
| Value* cur_trip_count, |
| Value* max_trip_count, |
| Value* cond) { |
| // Emit initial comparison -- initial_trip_count < max_trip_count |
| Value* initial_comparison_value = |
| g->insertNode(g->create(aten::lt, {cur_trip_count, max_trip_count}, 1)) |
| ->output(); |
| |
| // Replace initial condition with logical `and` of trip count and |
| // initial condition |
| Value* new_cond = |
| g->insertNode( |
| g->create(aten::__and__, {initial_comparison_value, cond}, 1)) |
| ->output(); |
| return new_cond; |
| } |
| |
| } // namespace |
| |
| // desugars the trip count into ordinary arithmetic: the count becomes a |
| // loop-carried dependency that starts at 0 and is incremented at the end |
| // of each loop body, and the loop condition becomes |
| // (i < max_trip_count) && cond |
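| // Schematically (illustrative IR; names are invented for exposition): |
| // %out = prim::Loop(%max, %cond, %init) |
| // block0(%i, %x): <body> -> (%cond_next, %x_next) |
| // becomes |
| // %zero = Constant[value=0] |
| // %start = (%zero < %max) && %cond |
| // %i_final, %out = prim::Loop(%start, %zero, %init) |
| // block0(%i, %x): <body>; %i_next = %i + 1 |
| // %cont = (%i_next < %max) && %cond_next |
| // -> (%cont, %i_next, %x_next) |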
| void desugarTripCounts(Block * b) { |
| for(auto n : b->nodes()) { |
| |
| if(n->kind() == prim::Loop) { |
| auto g = n->owningGraph(); |
| auto body_block = n->blocks()[0]; |
| |
| Value* block_trip_count_input = body_block->inputs()[0]; |
| // Treat loop iteration number as a loop-carried dependency. We emit an |
| // increment at the end of the body block. |
| n->insertOutput(0); |
| |
| Value* max_trip_count_value = n->input(0); |
| { |
| WithInsertPoint guard(n); |
| // int i = 0 |
| Value* initial_trip_count = |
| g->insertNode(g->createConstant(at::zeros(at::CPU(at::kLong), {1}))) |
| ->output(); |
| // Set up initial iteration number value for loop-carried dependency |
| n->removeInput(0); |
| // Input 0 is now the initial termination condition; insert the trip |
| // count after it. Loop-carried dependencies start at index 1. |
| n->insertInput(1, initial_trip_count); |
| |
| Value* new_cond = createTripCountConjunctiveCondition( |
| g, initial_trip_count, max_trip_count_value, n->input(0)); |
| n->replaceInput(0, new_cond); |
| } |
| |
| { |
| WithInsertPoint guard(body_block); |
| // Trip count is now a loop carried dependency. We emit an op to |
| // increment the trip count at the end of the body. Then, emit the same |
| // conjunctive stopping condition as above. |
| |
| Value* const_one = |
| g->insertNode(g->createConstant(at::ones(at::CPU(at::kLong), {1}))) |
| ->output(); |
| |
| Value* inc_trip_count = |
| g->insertNode(g->create( |
| aten::add, {block_trip_count_input, const_one, const_one}, 1)) |
| ->output(); |
| body_block->insertOutput(1, inc_trip_count); |
| |
| Value* body_cond = createTripCountConjunctiveCondition( |
| g, inc_trip_count, max_trip_count_value, body_block->outputs()[0]); |
| body_block->eraseOutput(0); |
| body_block->insertOutput(0, body_cond); |
| } |
| } |
| for(auto sb : n->blocks()) { |
| desugarTripCounts(sb); |
| } |
| } |
| } |
| |
| // removes all inputs and outputs of the graph, replacing them with |
| // prim::Store nodes (which pop each stage's inputs off the stack) at the |
| // start of each stage and prim::Load nodes (which push its outputs onto |
| // the stack) at the end of each stage |
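| // Schematically (an illustrative two-stage graph): |
| // graph(%x.0, %dy.1):  (.N marks the stage of each value) |
| // %y.0 = f(%x.0) |
| // %dx.1 = g(%dy.1, %x.0) |
| // return (%y.0, %dx.1) |
| // becomes |
| // graph(): |
| // %x = prim::Store()  (stage 0 begins: pop inputs off the stack) |
| // %y = f(%x) |
| // prim::Load(%y)  (stage 0 ends: push outputs onto the stack) |
| // %dy = prim::Store()  (stage 1) |
| // %dx = g(%dy, %x) |
| // prim::Load(%dx) |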
| static std::vector<std::vector<TypePtr>> flattenStages(Graph & graph) { |
| // because JIT classic needs this to fix up gradients, remove when possible |
| std::vector<std::vector<TypePtr>> stage_input_types; |
| |
| WithInsertPoint guard(*graph.nodes().begin()); |
| size_t input_pos = 0; |
| size_t output_pos = 0; |
| auto it = graph.nodes().begin(); |
| for(size_t i = 0; i <= graph.stage(); i++) { |
| stage_input_types.emplace_back(); |
| auto store = graph.create(prim::Store, 0)->insertBefore(*it); |
| while(input_pos < graph.inputs().size() && graph.inputs()[input_pos]->stage() == i) { |
| auto nv = store->addOutput(); |
| auto old_node = graph.inputs()[input_pos]; |
| stage_input_types[i].push_back(old_node->type()); |
| old_node->replaceAllUsesWith(nv); |
| input_pos++; |
| } |
| while(it != graph.nodes().end() && it->stage() == i) |
| ++it; |
| auto load = graph.create(prim::Load, 0)->insertBefore(*it); |
| while(output_pos < graph.outputs().size() && graph.outputs()[output_pos]->stage() == i) { |
| load->addInput(graph.outputs()[output_pos]); |
| output_pos++; |
| } |
| } |
| while (graph.inputs().size() > 0) |
| graph.eraseInput(graph.inputs().size() - 1); |
| while (graph.outputs().size() > 0) |
| graph.eraseOutput(graph.outputs().size() - 1); |
| |
| return stage_input_types; |
| } |
| |
| |
| // insert Drop nodes to kill references for anything unused: |
| // this can happen in a few places, e.g. when a node returns |
| // many values but only one is used |
| // a, b = foo() |
| // return a |
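| // for that example the pass inserts (schematically): |
| // %a, %b = foo() |
| // prim::Drop(%b)  (a dummy use that lets the interpreter free %b) |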
| void dropUnused(Block *b) { |
| auto createDropIfUnused = [&](ArrayRef<Value*> values) -> Node* { |
| std::vector<Value*> to_drop; |
| for(auto v : values) { |
| if(v->uses().size() == 0) |
| to_drop.push_back(v); |
| } |
| if(to_drop.size() == 0) |
| return nullptr; |
| return b->owningGraph()->create(prim::Drop, to_drop, 0); |
| }; |
| |
| if(auto d = createDropIfUnused(b->inputs())) { |
| b->prependNode(d); |
| } |
| for(auto n : b->nodes()) { |
| if(auto d = createDropIfUnused(n->outputs())) { |
| d->insertAfter(n); |
| } |
| for(auto b : n->blocks()) |
| dropUnused(b); |
| } |
| } |
| |
| |
| // for each input of each node, compute whether this is the last use of |
| // the value, in which case the interpreter should move it rather than |
| // copy it |
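| // e.g. (illustrative, assuming %a, %b, and %c have no other uses): |
| // %c = add(%a, %a)  -> move_flags {false, true}: only the final |
| // occurrence of %a in the input list is a move |
| // %d = add(%c, %b)  -> move_flags {true, true} |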
| std::unordered_map<Node*, std::vector<uint8_t>> findLastUses(Graph & g) { |
| // struct to share common data structures |
| struct FindLastUses { |
| Graph & graph; |
| // values we have already seen while scanning backwards; the first time |
| // we see a value in this reverse scan is its last use |
| std::unordered_set<Value*> seen; |
| |
| std::unordered_map<Node*, std::vector<uint8_t>> move_flags; |
| // A map from an If or Loop node to the optional Drop block that |
| // occurs directly after it to release any tensors that go out of scope |
| // when the If/Loop exits. These are created and inserted on demand. |
| std::unordered_map<Node*, Node*> drop_for_node; |
| |
| FindLastUses(Graph & g) |
| : graph(g) { |
| scanBlock(graph.block()); |
| } |
| void scanBlock(Block * b) { |
| scanNode(b->return_node()); |
| for(auto n : b->nodes().reverse()) { |
| scanNode(n); |
| } |
| } |
| void scanNode(Node * n) { |
| for(auto b : n->blocks()) { |
| scanBlock(b); |
| } |
| move_flags[n].resize(n->inputs().size()); |
| // scan the input list backwards so that if a value appears more than |
| // once, only its last occurrence is marked as a move |
| for(size_t i = n->inputs().size(); i > 0; --i) { |
| scanUse(n, i-1); |
| } |
| } |
| void scanUse(Node * n, size_t i) { |
| auto & move_flags_n = move_flags[n]; |
| auto v = n->inputs()[i]; |
| auto inserted = seen.insert(v).second; |
| if(!inserted) { |
| move_flags_n[i] = false; |
| return; |
| } |
| |
| // the last use of v may be in a nested block of an If or Loop statement |
| // find the node 'same_depth_node' at the same depth as the definition of v, |
| // and consider that node to be the last use of v. |
| // This ensures we do not free values inside nested scopes |
| // that may be executed multiple times, |
| // and that values used on one side of an if |
| // but not the other are freed regardless of which branch runs |
| // e.g. |
| // a = 4 |
| // while <...>: |
| // y = a + a |
| // drop(a) |
| // In other words, we find the first program point for v that |
| // _reverse_ dominates the definition of v, and add a drop point there. |
| Node * same_depth_node = findOwnerInBlock(n, v->node()->owningBlock()); |
| JIT_ASSERT(same_depth_node); // failure means v is not in scope for n, use lint! |
| |
| // In the case where v and n are in the same block, just mark |
| // its move_flags to be true |
| if(same_depth_node == n) { |
| move_flags_n[i] = true; |
| return; |
| } |
| |
| // in the case where the use is nested in a block |
| // add a Drop node after that block which will drop 'v'. |
| move_flags_n[i] = false; |
| addToDropIfNotExists(findOrCreateDropInstructionForNode(same_depth_node), v); |
| } |
| |
| // finds the node in block 'block' that contains 'n', |
| // or nullptr if no such node exists, e.g.: |
| // n0: a = 4 |
| // n1: if <cond>: |
| // n2: b = a + a |
| // findOwnerInBlock(n2, n0.block()) == n1 |
| Node * findOwnerInBlock(Node * n, Block * block) { |
| while(n != nullptr && block != n->owningBlock()) { |
| n = n->owningBlock()->owningNode(); |
| } |
| return n; |
| } |
| |
| Node * findOrCreateDropInstructionForNode(Node * n) { |
| auto it = drop_for_node.find(n); |
| if(it == drop_for_node.end()) { |
| auto drop_node = graph.create(prim::Drop, 0); |
| drop_node->insertAfter(n); |
| it = drop_for_node.emplace(n, drop_node).first; |
| } |
| return it->second; |
| } |
| |
| void addToDropIfNotExists(Node * drop, Value * v) { |
| for(auto i : drop->inputs()) { |
| // we already accounted for this use |
| if(i == v) |
| return; |
| } |
| drop->addInput(v); |
| move_flags[drop].push_back(true); |
| } |
| }; |
| |
| return FindLastUses(g).move_flags; |
| } |
| |
| // pre-processing that happens once per graph |
| struct PreprocessGraph { |
| PreprocessGraph(Graph & g) |
| : graph(g.copy()) { |
| desugarTripCounts(graph->block()); |
| stage_input_types = flattenStages(*graph); |
| dropUnused(graph->block()); |
| // fill in move_flags by scanning blocks |
| move_flags = findLastUses(*graph); |
| } |
| // Outputs of the preprocessing: |
| std::shared_ptr<Graph> graph; |
| // for each input of each node, whether the interpreter should move it |
| // rather than copy it (i.e. whether this is the value's last use) |
| std::unordered_map<Node*, std::vector<uint8_t>> move_flags; |
| std::vector<std::vector<TypePtr>> stage_input_types; |
| |
| }; |
| |
| // The interpreter previously worked with at::Retainable values, |
| // which are annoying to handle since 99% of values are at::Tensor anyway. |
| // Instead we create a fake subclass of TensorImpl that can be further |
| // subclassed to hold arbitrary things. |
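| // AutogradHandle below is one such subclass: HandleBuilder::writeTo wraps |
| // it in an at::Tensor to pass it through the interpreter's stack, and |
| // createEvalOperation recovers it with a dynamic_cast. |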
| struct ContainerTensor : public at::TensorImpl { |
| public: |
| ContainerTensor() |
| : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined))) {} |
| |
| virtual ~ContainerTensor() {} |
| virtual const char * toString() const override { |
| throw std::runtime_error("toString() on ContainerTensor"); |
| } |
| virtual at::IntList sizes() const override { |
| throw std::runtime_error("sizes() on ContainerTensor"); |
| } |
| virtual at::IntList strides() const override { |
| throw std::runtime_error("strides() on ContainerTensor"); |
| } |
| virtual int64_t dim() const override { |
| throw std::runtime_error("dim() on ContainerTensor"); |
| } |
| virtual at::Scalar localScalar() override { |
| throw std::runtime_error("localScalar() on ContainerTensor"); |
| } |
| virtual void * unsafeGetTH(bool retain) override { |
| throw std::runtime_error("unsafeGetTH() on ContainerTensor"); |
| } |
| virtual std::unique_ptr<at::Storage> storage() override { |
| throw std::runtime_error("storage() on ContainerTensor"); |
| } |
| }; |
| |
| |
| // DummyFunction is the last function that the autograd engine calls |
| // when evaluating Eval nodes. Its input tensors are the outputs that the |
| // Eval node needs to produce. |
| // We intercept these values using an autograd callback, so the function |
| // itself never runs. |
| struct DummyFunction : autograd::Function { |
| virtual autograd::variable_list apply(const autograd::variable_list& inputs) override { |
| throw std::logic_error("DummyFunction::apply() called, but it should be blocked by a callback returning false"); |
| } |
| }; |
| |
| // An AutogradHandle holds the information needed to run an Autograd backward pass |
| // after running a forward operator (such as PythonOp, CppOp, or for double-backwards another Eval Op) |
| // The EvalOperation uses AutogradHandle to perform this operation. |
| struct AutogradHandle : public ContainerTensor { |
| |
| // The inputs of DummyFunction are the gradients of the forward pass's |
| // inputs, i.e. the _outputs_ of the autograd engine's backward run. |
| // There is one entry in this list for each forward input that requires |
| // gradients. |
| std::shared_ptr<DummyFunction> forward_inputs; |
| |
| // There is one entry in this list for each output of the forward pass, |
| // representing the location in the backward pass where the gradient |
| // of that output should be inserted at the beginning of the backward pass. |
| autograd::edge_list forward_outputs; |
| }; |
| |
| // HandleBuilder is used to construct the correct Autograd Handle objects |
| // for use in a future stage. |
| // It is used even when the future stage does not require a handle since |
| // it also performs the conversions between Tensor and Variable, which |
| // behave differently depending on whether a future handle needs to be |
| // created. |
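| // Typical use (a sketch following createCppOperation below): |
| // HandleBuilder builder(has_handle); |
| // v_inputs.push_back(builder.addInput(std::move(tensor), flags)); |
| // ... run the forward function ... |
| // stack.push_back(builder.addOutput(variable)); |
| // builder.writeTo(stack);  (pushes the handle last, if one was created) |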
| struct HandleBuilder { |
| HandleBuilder(bool requires_handle) { |
| if(requires_handle) { |
| handle = new AutogradHandle(); |
| handle->forward_inputs = std::make_shared<DummyFunction>(); |
| } |
| } |
| autograd::Variable addInput(at::Tensor && input, const VariableFlags & flags_) { |
| if(handle && flags_.requires_grad) { |
| auto variable = autograd::make_variable(std::move(input), /*requires_grad=*/false); |
| autograd::create_gradient_edge(variable, handle->forward_inputs); |
| return variable; |
| } else { |
| return autograd::make_variable(std::move(input), /*requires_grad=*/false); |
| } |
| } |
| at::Tensor addOutput(const autograd::Variable & output) { |
| if(handle) { |
| handle->forward_outputs.push_back(output.gradient_edge()); |
| } |
| return output.data(); |
| } |
| void writeTo(Stack & outputs) { |
| // outputs takes ownership of handle |
| if(handle) { |
| outputs.push_back(at::Tensor(handle, /*retain=*/false)); |
| handle = nullptr; |
| } |
| } |
| private: |
| AutogradHandle* handle = nullptr; |
| }; |
| |
| bool hasHandleOutput(Node * n) { |
| if(n->outputs().size() == 0) |
| return false; |
| auto & last = n->outputs().back(); |
| return last->isHandle() && last->uses().size() > 0; // don't bother creating a handle if it is never used |
| } |
| |
| #ifndef NO_PYTHON |
| Operation createPythonOperation(PythonOp* op, bool values_are_variables) { |
| py::function func; |
| if (op->tracing_autograd_python_function) { |
| func = py::function(py::handle(op->pyobj.get()).attr("apply")); |
| } else { |
| func = py::reinterpret_borrow<py::function>(py::handle(op->pyobj.get())); |
| } |
| bool tracing_autograd_python_function = op->tracing_autograd_python_function; |
| bool has_handle = hasHandleOutput(op); |
| size_t num_inputs = 0; |
| for(auto arg_type : op->cconv) { |
| if(arg_type == 't') |
| num_inputs++; |
| } |
| return [=](Stack & stack) { |
| AutoGIL gil; |
| py::tuple py_inputs(op->cconv.size()); |
| size_t i = 0; |
| size_t next_scalar = 0; |
| size_t next_tensor = 0; |
| HandleBuilder builder(has_handle); |
| // Note: The first branch here should be considered deprecated and will |
| // probably be removed in the future. |
| // |
| // tracing_autograd_python_function indicates that we need to hook this |
| // PythonOp up to autograd with the HandleBuilder |
| if (tracing_autograd_python_function) { |
| for (auto arg_type : op->cconv) { |
| if (arg_type == 's') { |
| py_inputs[i] = py::reinterpret_borrow<py::object>( |
| op->scalar_args[next_scalar++].get()); |
| } else if (arg_type == 't') { |
| py_inputs[i] = py::reinterpret_steal<py::object>( |
| THPVariable_Wrap(builder.addInput( |
| std::move(peek(stack, next_tensor, num_inputs)), |
| op->var_flags.at(next_tensor)))); |
| next_tensor++; |
| } |
| i++; |
| } |
| drop(stack, num_inputs); |
| py::object py_outputs(func(*py_inputs)); |
| auto num_outputs = op->outputs().size(); |
| auto addOutput = [&](py::handle entry) { |
| if (!THPVariable_Check(entry.ptr())) { |
| throw std::runtime_error( |
| "Function.apply returned a non-Variable output"); |
| } |
| THPVariable* var = (THPVariable*)entry.ptr(); |
| stack.push_back(builder.addOutput(var->cdata)); |
| }; |
| if (!PyTuple_Check(py_outputs.ptr())) { |
| if (num_outputs != 1) { |
| throw std::runtime_error( |
| "Function.apply returned the wrong number of outputs."); |
| } |
| addOutput(py_outputs); |
| } else { |
| auto output_tuple = py::tuple(py_outputs); |
| if (output_tuple.size() != num_outputs) { |
| throw std::runtime_error( |
| "Function.apply returned the wrong number of outputs."); |
| } |
| for (py::handle entry : output_tuple) { |
| addOutput(entry); |
| } |
| } |
| builder.writeTo(stack); |
| return 0; |
| } else { |
| // In this case we're not hooking this PythonOp up to autograd. We always |
| // pass in and return Variables to the PythonOp. The flag |
| // values_are_variables indicates that the actual inputs and outputs are |
| // Variable types. In the case that this is false, we must wrap up inputs |
| // Tensors into Variables and we must unwrap the outputs to Tensors. In |
| // the other case, we pass in inputs and return outputs as-is |
| for (auto arg_type : op->cconv) { |
| if (arg_type == 's') { |
| py_inputs[i] = py::reinterpret_borrow<py::object>( |
| op->scalar_args[next_scalar++].get()); |
| } else if (arg_type == 't') { |
| auto var = peek(stack, next_tensor, num_inputs); |
| if (!values_are_variables) { |
| var = autograd::make_variable(var); |
| } |
| py_inputs[i] = |
| py::reinterpret_steal<py::object>(THPVariable_Wrap(var)); |
| next_tensor++; |
| } |
| i++; |
| } |
| drop(stack, num_inputs); |
| py::object py_outputs(func(*py_inputs)); |
| |
| auto num_outputs = op->outputs().size(); |
| auto addOutput = [&](py::handle entry) { |
| if (!THPVariable_Check(entry.ptr())) { |
| throw std::runtime_error( |
| "Function application returned a non-Variable output"); |
| } |
| THPVariable* var = (THPVariable*)entry.ptr(); |
| auto cdata = var->cdata; |
| stack.push_back(values_are_variables ? std::move(cdata) : cdata.data()); |
| }; |
| |
| if (!PyTuple_Check(py_outputs.ptr())) { |
| if (num_outputs != 1) { |
| throw std::runtime_error( |
| "Function.apply returned the wrong number of outputs."); |
| } |
| addOutput(py_outputs); |
| } else { |
| auto output_tuple = py::tuple(py_outputs); |
| if (output_tuple.size() != num_outputs) { |
| throw std::runtime_error( |
| "Function application returned the wrong number of outputs."); |
| } |
| for (py::handle entry : output_tuple) { |
| addOutput(entry); |
| } |
| } |
| return 0; |
| } |
| }; |
| } |
| #else |
| Operation createPythonOperation(PythonOp* op, bool values_are_variables) { |
| throw std::runtime_error("Trying to create Python operation from a C++ build"); |
| return [=](Stack & stack) { |
| return 0; |
| }; |
| } |
| #endif |
| |
| Operation createCppOperation(CppOp* op) { |
| std::shared_ptr<autograd::Function> func = op->fn; |
| bool has_handle = hasHandleOutput(op); |
| auto num_inputs = op->inputs().size(); |
| return [=](Stack & stack) { |
| HandleBuilder builder(has_handle); |
| autograd::variable_list v_inputs; |
| for(size_t i = 0; i < num_inputs; i++) { |
| v_inputs.push_back(builder.addInput(std::move(peek(stack, i, num_inputs)), op->var_flags[i])); |
| } |
| drop(stack, num_inputs); |
| autograd::variable_list v_outputs = (*func)(v_inputs); |
| for(auto & output : v_outputs) { |
| stack.push_back(builder.addOutput(output)); |
| } |
| builder.writeTo(stack); |
| return 0; |
| }; |
| } |
| |
| Operation createEvalOperation(CppOp * op) { |
| bool has_handle_output = hasHandleOutput(op); |
| auto num_inputs = op->inputs().size(); |
| return [=](Stack & stack) { |
| at::Tensor handle_t = std::move(stack.back()); |
| AutogradHandle * handle_in = dynamic_cast<AutogradHandle*>(handle_t.get()); |
| JIT_ASSERT(handle_in); |
| HandleBuilder builder(has_handle_output); |
| auto& engine = torch::autograd::Engine::getDefaultEngine(); |
| autograd::variable_list v_inputs; |
| for(size_t i = 0; i < num_inputs - 1; i++) { |
| v_inputs.push_back(builder.addInput(std::move(peek(stack, i, num_inputs)), op->var_flags[i])); |
| } |
| drop(stack, num_inputs); |
| // TODO: handle create_graph appropriately |
| bool create_graph = true; |
| // note: handle_in->use_count() == 1 means we are guaranteed to hold the |
| // only copy of the handle. This might make it seem OK to pass keep_graph=False. |
| // However, it is possible for 'copied_next_fns' to grab functions used by _other_ handles, |
| // and these functions will be executed in this run. Since these other handles |
| // may still be alive, it is not safe to release the graph. |
| // TODO: we could cache this list in AutogradHandle (it's read only) |
| autograd::edge_list output_edges; |
| const auto num_handle_inputs = handle_in->forward_inputs->num_inputs(); |
| output_edges.reserve(num_handle_inputs); |
| for (uint32_t i = 0; i < num_handle_inputs; ++i) |
| output_edges.emplace_back(handle_in->forward_inputs, i); |
| auto values = engine.execute(handle_in->forward_outputs, v_inputs, true, create_graph, output_edges); |
| for(auto & v : values) |
| stack.push_back(builder.addOutput(v)); |
| builder.writeTo(stack); |
| return 0; |
| }; |
| } |
| |
| // Returns a function implementing the functionality of a given node. |
| Operation getOperation(jit::Node* node, bool values_are_variables) { |
| IR_IFM(node, PythonOp) |
| return createPythonOperation(value, values_are_variables); |
| IR_ELSEIFM(CppOp) |
| if(dynamic_cast<autograd::Eval*>(value->fn.get())) { |
| return createEvalOperation(value); |
| } else { |
| return createCppOperation(value); |
| } |
| IR_ELSEIF(FusionGroup) |
| auto fusion_fn = sharedFusionCompiler().getOrCompile(value); |
| auto num_inputs = value->inputs().size(); |
| return [fusion_fn, num_inputs](Stack & stack) { |
| autograd::profiler::RecordFunction record("FusionGroup"); |
| std::vector<at::Tensor> toutputs; |
| // TODO: have fusion_fn work off of a stack as well |
| fusion_fn->launch(last(stack, num_inputs), toutputs); |
| drop(stack, num_inputs); |
| stack.insert(stack.end(), toutputs.begin(), toutputs.end()); |
| return 0; |
| }; |
| IR_ELSEIF(Constant) |
| if (values_are_variables) { |
| auto t = torch::autograd::make_variable(value->t(attr::value), false); |
| return [t](Stack& stack) { |
| stack.push_back(t); |
| return 0; |
| }; |
| } else { |
| auto t = value->t(attr::value); |
| return [t](Stack & stack) { |
| stack.push_back(t); |
| return 0; |
| }; |
| } |
| IR_ELSEIF(Undefined) |
| return [](Stack & stack) { |
| stack.push_back(at::Tensor()); |
| return 0; |
| }; |
| IR_ELSEIF(ReplaceIfUndef) |
| return [](Stack & stack) { |
| auto alternate = pop(stack); |
| auto result = pop(stack); |
| if(result.defined()) { |
| stack.push_back(std::move(result)); |
| } else { |
| stack.push_back(std::move(alternate)); |
| } |
| return 0; |
| }; |
| IR_ELSEIF(Print) |
| size_t num_inputs = value->inputs().size(); |
| return [num_inputs](Stack & stack) { |
| bool first = true; |
| for (at::Tensor i : last(stack, num_inputs)) { |
| if (!first) std::cout << " "; |
| first = false; |
| if (auto tensor_impl = dynamic_cast<at::TensorImpl*>(i.get())) { |
| std::cout << at::Tensor(tensor_impl, true); |
| } else if (!i.defined()) { |
| std::cout << "<undefined tensor>"; |
| } else { |
| auto& r = *i.get(); |
| std::cout << "<" << typeid(r).name() << " at " << i << ">"; |
| } |
| } |
| drop(stack, num_inputs); |
| std::cout << std::endl; |
| return 0; |
| }; |
| IR_ELSEIF(GraphExecutor) |
| GraphExecutor executor(value->g(attr::Subgraph)); |
| auto num_inputs = value->inputs().size(); |
| return [=](Stack& stack) mutable { |
| autograd::profiler::RecordFunction record("GraphExecutor"); |
| auto inputs = last(stack, num_inputs); |
| variable_tensor_list tinputs(inputs.begin(), inputs.end()); |
| drop(stack, num_inputs); |
| //TODO: have graph executor work from a stack as well |
| variable_tensor_list toutputs = executor.run(variable_tensor_list(std::move(tinputs))); |
| stack.insert(stack.end(), toutputs.begin(), toutputs.end()); |
| return 0; |
| }; |
| |
| |
| // Load x, y |
| // loads values from registers onto the stack; the actual callback does |
| // nothing, since the stack manipulation is already encoded in inst.inputs |
| // and inst.outputs |
| IR_ELSEIF(Load) |
| return [=](Stack& stack) { |
| return 0; |
| }; |
| |
| // x, y = Store |
| // stores values from the stack into registers; the actual callback does |
| // nothing, since the stack manipulation is already encoded in inst.inputs |
| // and inst.outputs |
| IR_ELSEIF(Store) |
| return [=](Stack& stack) { |
| return 0; |
| }; |
| IR_ELSEIF(Drop) |
| auto N = value->inputs().size(); |
| return [=](Stack& stack) { |
| drop(stack, N); |
| return 0; |
| }; |
| IR_ELSE() |
| return getTensorOp(node).op; |
| IR_END() |
| } |
| |
| |
| // We need some lists for inputs and outputs. To keep all the memory |
| // contiguous we allocate a single vector and use offsets into that vector, |
| // which are stored in the ListHandle struct. |
| // 'start' is an offset into Code's int_data for ListHandle<int> |
| // and into Code's bool_data for ListHandle<bool> |
| template<typename T> |
| struct ListHandle { |
| int start; |
| int size; |
| }; |
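| // e.g. if int_data is {7, 3, 0, 4, 1}, a ListHandle<int> with start = 2 |
| // and size = 3 denotes the register list {0, 4, 1} |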
| |
| struct UseList { |
| // values to be used |
| ListHandle<int> values; |
| // boolean flags indicating whether to free the Tensor after this use |
| ListHandle<bool> free_flags; |
| }; |
| |
| // one instruction plus meta-data |
| struct Instruction { |
| Operation callback; |
| UseList inputs; |
| ListHandle<int> outputs; |
| Symbol debug_name; // used in dump to understand the generated code |
| std::shared_ptr<SourceLocation> debug_location; // for error reporting |
| }; |
| |
| |
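| // The interpreter advances with pc = pc + 1 + callback(stack) (see |
| // runOneStage), so an instruction at from_inst whose callback returns |
| // relativeJump(from_inst, to_inst) lands exactly on to_inst. |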
| int relativeJump(int from_inst, int to_inst) { |
| return to_inst - (from_inst + 1); |
| } |
| |
| struct CodeImpl { |
| CodeImpl(std::shared_ptr<Graph>& graph_, bool values_are_variables) |
| : values_are_variables(values_are_variables), preprocess(*graph_) { |
| graph = preprocess.graph; |
| //std::cout << "into code graph:\n" << *graph << "\n"; |
| insertNodesFromBlock(graph->block()); |
| } |
| |
| // jump when input is 0 |
| void createJumpZ(int from_inst, int to_inst) { |
| auto & inst = instructions[from_inst]; |
| JIT_ASSERT(inst.debug_name == prim::Placeholder); |
| auto offset = relativeJump(from_inst, to_inst); |
| inst.callback = [offset](Stack & stack) { |
| auto t = tensor_as<int64_t>(pop(stack)); |
| return (t == 0) ? offset : 0; |
| }; |
| inst.debug_name = prim::JumpZ; |
| } |
| |
| // jump when input is not 0 |
| void createJumpNZ(int from_inst, int to_inst) { |
| auto & inst = instructions[from_inst]; |
| JIT_ASSERT(inst.debug_name == prim::Placeholder); |
| auto offset = relativeJump(from_inst, to_inst); |
| inst.callback = [offset](Stack & stack) { |
| auto t = tensor_as<int64_t>(pop(stack)); |
| return (t != 0) ? offset : 0; |
| }; |
| inst.debug_name = prim::JumpNZ; |
| } |
| |
| void createJump(int from_inst, int to_inst) { |
| auto & inst = instructions[from_inst]; |
| JIT_ASSERT(inst.debug_name == prim::Placeholder); |
| auto offset = relativeJump(from_inst, to_inst); |
| inst.callback = [=](Stack & stack) { |
| return offset; |
| }; |
| inst.debug_name = prim::Jump; |
| } |
| |
| void insertNodesFromBlock(Block* block) { |
| for(auto node : block->nodes()) { |
| const auto & source_location = node->getSourceLocation(); |
| switch(node->kind()) { |
| case prim::If: { |
| // x = if c: |
| // <then_block> |
| // -> (vt) |
| // else: |
| // <else_block> |
| // -> (vf) |
| |
| // turns into: |
| // JumpNZ c, then |
| // <else_block> |
| // x = vf |
| // Jump end |
| // then: |
| // <then_block> |
| // x = vt |
| // end: |
| |
| // prim::Placeholder instructions are replaced with branch instructions |
| // when the branch target locations are known |
| auto cond_branch = insertInstruction(prim::Placeholder, source_location, node->inputs(), moveFlags(node), {}); |
| auto then_block = node->blocks()[0]; |
| auto else_block = node->blocks()[1]; |
| insertNodesFromBlock(else_block); |
| insertAssign(source_location,else_block->outputs(), moveFlags(else_block), node->outputs()); |
| auto jump = insertInstruction(prim::Placeholder, source_location, {}, {}, {}); |
| auto then_block_start = instructions.size(); |
| insertNodesFromBlock(then_block); |
| insertAssign(source_location, then_block->outputs(), moveFlags(then_block), node->outputs()); |
| createJump(jump, instructions.size()); |
| createJumpNZ(cond_branch, then_block_start); |
| } break; |
| case prim::Loop: { |
| // o0 = while c i0 |
| // block 0: l0 |
| // <body> |
| // -> (v0, v1) |
| |
| // turns into: |
| // l0 = i0 |
| // JumpZ c, end |
| // begin: |
| // <body> |
| // c, l0 = v0, v1 |
| // JumpNZ c, begin |
| // end: |
| |
| auto body_block = node->blocks()[0]; |
| |
| // before assign op: stack: ... <cond> <loop-carried dependencies> |
| insertAssign(source_location, node->inputs(), moveFlags(node), body_block->inputs()); |
| // after assign op: stack: ... <cond> |
| // cond_branch consumes <cond> from top of the stack |
| auto cond_branch = insertInstruction(prim::Placeholder, source_location,{}, {}, {}); |
| // after branch: stack: ... |
| |
| auto entry = instructions.size(); |
| insertNodesFromBlock(body_block); |
| // before assign op: stack: ... <cond> <loop-carried dependencies> |
| insertAssign(source_location, body_block->outputs(), moveFlags(body_block), body_block->inputs()); |
| // after assign op: stack: ... <cond> |
| auto cond_branch_end = insertInstruction(prim::Placeholder, source_location, {}, {}, {}); |
| // after branch: stack: ... |
| |
| aliasRegistersTo(node->outputs(), body_block->inputs()); |
| createJumpZ(cond_branch, instructions.size()); |
| createJumpNZ(cond_branch_end, entry); |
| } break; |
| default: { |
| insertInstruction(node); |
| } break; |
| } |
| // each stage ends with a load instruction |
| // we record where these instructions occur, and use them to |
| // exit the interpreter |
| if(node->kind() == prim::Load) { |
| stage_end.push_back(instructions.size()); |
| } |
| } |
| } |
| |
| size_t insertInstruction(Node * n) { |
| auto inst = insertInstruction(n->kind(), n->getSourceLocation(), n->inputs(), moveFlags(n) , n->outputs()); |
| instructions[inst].callback = getOperation(n, values_are_variables); |
| return inst; |
| } |
| size_t insertInstruction(Symbol sym, |
| std::shared_ptr<SourceLocation> debug_location, |
| ArrayRef<Value*> inputs, |
| ArrayRef<uint8_t> move_flags, |
| ArrayRef<Value*> outputs) { |
| instructions.emplace_back(); |
| auto & inst = instructions.back(); |
| inst.debug_name = sym; |
| inst.debug_location = std::move(debug_location); |
| listBegin(inst.inputs.values); |
| for(auto input : inputs) { |
| listInsert(inst.inputs.values, getOrAllocateRegister(input, true)); |
| } |
| listBegin(inst.inputs.free_flags); |
| for(auto flag : move_flags) { |
| listInsert(inst.inputs.free_flags, flag); |
| } |
| listBegin(inst.outputs); |
| for(auto output : outputs) { |
| listInsert(inst.outputs, getOrAllocateRegister(output)); |
| } |
| return instructions.size() - 1; |
| } |
| ArrayRef<uint8_t> moveFlags(Node * n) { |
| return preprocess.move_flags.at(n); |
| } |
| ArrayRef<uint8_t> moveFlags(Block *b) { |
| return moveFlags(b->return_node()); |
| } |
| |
| size_t insertAssign(std::shared_ptr<SourceLocation> debug_location, ArrayRef<Value*> inputs, ArrayRef<uint8_t> move_flags, ArrayRef<Value*> outputs) { |
| auto inst = insertInstruction(prim::Assign, std::move(debug_location),inputs, move_flags, outputs); |
| // This node effectively forwards its inputs into different places in a register list. |
| // We don't need to manipulate the stack in any way, because all inputs are also outputs, |
| // and the interpreter will take care of putting them in correct places. |
| instructions[inst].callback = [](Stack& stack) { return 0; }; |
| return inst; |
| } |
| |
| // helpers to build/access the ListHandle lists |
| int get(const ListHandle<int> & list, int i) const { |
| return int_data[list.start + i]; |
| } |
| bool get(const ListHandle<bool> & list, int i) const { |
| return bool_data[list.start + i]; |
| } |
| void listBegin(ListHandle<int> & list) { |
| list.start = int_data.size(); |
| list.size = 0; |
| } |
| void listInsert(ListHandle<int> & list, int value) { |
| JIT_ASSERTM(list.start + list.size == (int)int_data.size(), "another list already started"); |
| int_data.push_back(value); |
| list.size++; |
| } |
| void listBegin(ListHandle<bool> & list) { |
| list.start = bool_data.size(); |
| list.size = 0; |
| } |
| void listInsert(ListHandle<bool> & list, int value) { |
| JIT_ASSERTM(list.start + list.size == (int)bool_data.size(), "another list already started"); |
| bool_data.push_back(value); |
| list.size++; |
| } |
| // must be called before any new_allocations are used, otherwise they will |
| // already have registers assigned |
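| // e.g. the prim::Loop case in insertNodesFromBlock aliases the loop |
| // node's outputs to the body block's inputs, so the loop's consumers |
| // read from the same registers the body writes on each iteration |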
| void aliasRegistersTo(ArrayRef<Value*> new_allocations, ArrayRef<Value*> existing_allocations) { |
| JIT_ASSERT(new_allocations.size() == existing_allocations.size()); |
| for(size_t i = 0; i < new_allocations.size(); ++i) { |
| auto n = new_allocations[i]->unique(); |
| auto e = existing_allocations[i]->unique(); |
| JIT_ASSERT(unique_to_reg.count(e) > 0 && unique_to_reg.count(n) == 0); |
| unique_to_reg[n] = unique_to_reg[e]; |
| } |
| } |
| int getOrAllocateRegister(Value * n, bool required = false) { |
| size_t u = n->unique(); |
| if(unique_to_reg.count(u) > 0) |
| return unique_to_reg[u]; |
| JIT_ASSERT(!required); |
| int r = register_size++; |
| unique_to_reg[u] = r; |
| return r; |
| } |
| |
| void dumpInstruction(std::ostream & out, size_t pc) const { |
| auto writeList = [&](const ListHandle<int> & list) { |
| for(int i = 0; i < list.size; i++) { |
| if(i > 0) |
| out << ", "; |
| out << get(list, i); |
| } |
| }; |
| auto writeUseList = [&](const UseList & list) { |
| for(int i = 0; i < list.values.size; i++) { |
| if(i > 0) |
| out << ", "; |
| if(get(list.free_flags, i)) |
| out << "move(" << get(list.values, i) << ")"; |
| else |
| out << get(list.values, i); |
| } |
| }; |
| auto & inst = instructions.at(pc); |
| writeList(inst.outputs); |
| // NB: debug names record the node kind that was used to select the |
| // operation at dispatch time |
| out << " = " << inst.debug_name.toUnqualString() << " "; |
| writeUseList(inst.inputs); |
| } |
| void dump(std::ostream & out) const { |
| for(size_t i = 0; i < instructions.size(); ++i) { |
| dumpInstruction(out, i); |
| out << "\n"; |
| } |
| } |
| |
| // We MUST hold onto graph here because some Operators stored in the |
| // instruction lists have dependencies on meta-data stored in the graph |
| // that would be dead otherwise. |
| // It is also very useful for debugging interpreter problems to |
| // keep this around. |
| std::shared_ptr<Graph> graph; |
| bool values_are_variables; |
| PreprocessGraph preprocess; |
| |
| std::unordered_map<size_t, int> unique_to_reg; // map from a value's unique id to its register in the register table |
| |
| friend struct InterpreterState; |
| std::vector<Instruction> instructions; |
| std::vector<size_t> stage_end; // each stage runs while(pc < stage_end[stage]) |
| int register_size = 0; |
| |
| // all ListHandle lists are slices of these vectors, to make sure |
| // the interpreter is mostly linearly scanning through memory |
| std::vector<int> int_data; |
| std::vector<bool> bool_data; |
| }; |
| |
| // InterpreterStateImpl holds the interpreter state that persists across |
| // stages while running a Code |
| struct InterpreterStateImpl { |
| InterpreterStateImpl(const Code & function_) |
| : function(function_.pImpl), |
| int_data(function->int_data.data()), |
| bool_data(function->bool_data), |
| registers(function->register_size) { |
| } |
| void runOneStage(Stack & stack) { |
| // std::cout << "running stage: " << current_stage << " of " << function->stage_end.size() << "\n"; |
| // std::cout << *function->graph << "\n"; |
| // function->dump(std::cout); |
| size_t pc = current_pc; |
| size_t last = function->stage_end[current_stage]; |
| auto & instructions = function->instructions; |
| while(pc < last) { |
| // std::cout << "executing " << pc << ": "; |
| // function->dumpInstruction(std::cout, pc); |
| // std::cout << "\n"; |
| try { |
| auto & inst = instructions[pc]; |
| loadTensorsFromRegisters(inst.inputs, stack); |
| size_t new_pc = pc + 1 + inst.callback(stack); |
| for(int i = inst.outputs.size - 1; i >= 0; i--) { |
| int reg = get(inst.outputs,i); |
| registers[reg] = pop(stack); |
| // std::cout << "pop reg[" << reg << "];\n" << registers[reg].pImpl << "\n"; |
| } |
| pc = new_pc; |
| } catch(std::exception & e) { |
| if(!instructions[pc].debug_location) |
| throw; // rethrow original exception |
| // throw a new exception with enhanced debugging information |
| instructions[pc].debug_location->wrapAndRethrowException(e, "operation failed in interpreter"); |
| } |
| } |
| current_pc = pc; |
| current_stage++; |
| } |
| const TensorType & tensorTypeForInput(size_t i) const { |
| return *function->preprocess.stage_input_types.at(current_stage).at(i)->expect<TensorType>(); |
| } |
| int get(const ListHandle<int> & list, int i) { |
| return int_data[list.start + i]; |
| }; |
| bool get(const ListHandle<bool> & list, int i) { |
| return bool_data[list.start + i]; |
| } |
| void loadTensorsFromRegisters(const UseList & uses, Stack & stack) { |
| for(int i = 0; i < uses.values.size; i++) { |
| int reg = get(uses.values,i); |
| // std::cout << "push reg[" << reg << "];\n" << registers[reg] << "\n\n"; |
| if(get(uses.free_flags,i)) { |
| stack.push_back(std::move(registers[reg])); |
| } else { |
| stack.push_back(registers[reg]); |
| } |
| |
| } |
| } |
| size_t current_stage = 0; |
| size_t current_pc = 0; |
| std::shared_ptr<CodeImpl> function; // keep function alive |
| // these alias data owned by 'function' to avoid indirection in the |
| // interpreter |
| int * int_data; |
| const std::vector<bool> & bool_data; |
| |
| |
| // this holds all the tensors for this interpreter run |
| // we don't bother minimizing the size of this vector, since the extra |
| // memory used by the pointers in this will be small. |
| // instead we are very aggressive about releasing tensors when they become |
| // dead to make sure memory management happens efficiently. |
| |
| // We optimize for the case where derivatives are run with retain_graph=False; |
| // when it is true, the interpreter and this array get copied. |
| // If this ever becomes a bottleneck, we _should_ consider minimizing the |
| // total number of registers. |
| std::vector<at::Tensor> registers; |
| |
| // single buffer for input/output calls to ATen functions, so that we do not reallocate |
| Stack stack; |
| }; |
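| // Typical driver (a sketch; assumes 'stack' is pre-loaded with the |
| // current stage's inputs): |
| // Code code(graph, /*values_are_variables=*/true); |
| // InterpreterState state(code); |
| // state.runOneStage(stack);  (consumes inputs, leaves outputs on stack) |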
| |
| std::ostream & operator<<(std::ostream & out, const Code & code) { |
| out << *code.pImpl->graph << "\n"; |
| code.pImpl->dump(out); |
| return out; |
| } |
| |
| Code::Code(std::shared_ptr<Graph>& graph, bool values_are_variables) |
| : pImpl(new CodeImpl(graph, values_are_variables)) {} |
| Code::~Code() {} |
| InterpreterState::InterpreterState(const Code & function) |
| : pImpl(new InterpreterStateImpl(function)) {} |
| InterpreterState::~InterpreterState() {} |
| void InterpreterState::runOneStage(Stack & stack) { |
| return pImpl->runOneStage(stack); |
| } |
| const TensorType & InterpreterState::tensorTypeForInput(size_t i) const { |
| return pImpl->tensorTypeForInput(i); |
| } |
| InterpreterState InterpreterState::clone() const { |
| return InterpreterState(new InterpreterStateImpl(*pImpl)); |
| } |
| InterpreterState::InterpreterState(InterpreterStateImpl * pImpl) : pImpl(pImpl) {} |
| |
| }} |