| #include <torch/csrc/jit/runtime/interpreter.h> |
| |
| #include <ATen/Parallel.h> |
| #include <ATen/core/ivalue.h> |
| #include <c10/core/thread_pool.h> |
| #include <c10/util/Exception.h> |
| #include <torch/csrc/autograd/edge.h> |
| #include <torch/csrc/autograd/grad_mode.h> |
| #include <torch/csrc/autograd/record_function.h> |
| #include <torch/csrc/autograd/variable.h> |
| #include <torch/csrc/jit/api/compilation_unit.h> |
| #include <torch/csrc/jit/api/function_impl.h> |
| #include <torch/csrc/jit/ir/constants.h> |
| #include <torch/csrc/jit/ir/ir.h> |
| #include <torch/csrc/jit/jit_log.h> |
| #include <torch/csrc/jit/passes/bailout_graph.h> |
| #include <torch/csrc/jit/runtime/exception_message.h> |
| #include <torch/csrc/jit/runtime/graph_executor.h> |
| #include <torch/csrc/jit/runtime/instruction.h> |
| #include <torch/csrc/jit/runtime/jit_exception.h> |
| #include <torch/csrc/jit/runtime/operator.h> |
| #include <torch/csrc/jit/runtime/vararg_functions.h> |
| |
| #ifdef USE_DISTRIBUTED |
| #include <torch/csrc/distributed/autograd/context/container.h> |
| using torch::distributed::autograd::DistAutogradContainer; |
| #endif |
| |
| #include <exception> |
| #include <iostream> |
| #include <memory> |
| #include <mutex> |
| #include <ostream> |
| #include <stdexcept> |
| #include <typeinfo> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <utility> |
| #include <vector> |
| |
| namespace torch { |
| namespace jit { |
| |
| // Before we translate to interpreter instructions, we do |
| // some preprocessing of the graph to turn it into a form that is closer |
| // to what the instructions will look like. |
| // In particular we: |
| // * Compute whether an input to a node is the last use, so we can issue MOVE |
| // rather than LOAD instructions. |
| // * Drop nodes are inserted for any node that is unused to create a dummy use |
| // that will cause the interpreter to free the node. |
| // A drop node just pops its input off the stack to ensure the interpreter |
| // releases references to nodes that are never used. Drop nodes are also |
| // inserted when the last use of a node is in some conditionally run control |
| // flow (e.g. one side of an If) and the interpreter must free the node only |
| //   after the control flow has reconverged. |
| // Outputs are: |
| // * graph - the post-processed copy of g |
| // * can_emit_inline[n] - true if node n can be emitted inline as part of |
| //   the expression tree rooted at its single use, so that no register |
| //   store/load is generated for it (see CanEmitInline below). |
| |
| TensorTypePtr tensorTypeInCurrentExecutionContext(const at::Tensor& t) { |
| if (!t.defined()) { |
| return TensorType::get()->withUndefined(); |
| } |
| auto r = TensorType::create(t); |
| if (!at::GradMode::is_enabled()) { |
| return r->withRequiresGrad(false); |
| } |
| return r; |
| } |
| |
| namespace { |
| |
| // insert Drop nodes to kill references for anything unused: |
| // this can happen in a few places, e.g. when a node returns |
| // many values but only one is used |
| // a, b = foo() |
| // return a |
| void dropUnused(Block* b) { |
| auto createDropIfUnused = [&](ArrayRef<Value*> values) -> Node* { |
| std::vector<Value*> to_drop; |
| for (auto v : values) { |
| if (v->uses().size() == 0 && v->node()->kind() != prim::Constant) |
| to_drop.push_back(v); |
| } |
| if (to_drop.size() == 0) |
| return nullptr; |
| return b->owningGraph()->create(prim::Drop, to_drop, 0); |
| }; |
| |
| if (auto d = createDropIfUnused(b->inputs())) { |
| b->prependNode(d); |
| } |
| for (auto n : b->nodes()) { |
| if (auto d = createDropIfUnused(n->outputs())) { |
| d->insertAfter(n); |
| } |
| for (auto b : n->blocks()) |
| dropUnused(b); |
| } |
| } |
| |
| // ensure every value has a final use in the same block where it is defined. |
| // This is already true for most nodes. The exceptions are: |
| // 1. A value that is unused. |
| // 2. A value whose last use is nested in some control flow. |
| // For (1) we simply add a prim::Drop node that uses the value right after |
| // it is defined. For (2), we insert a prim::Drop right after the control |
| // flow node where the last use occurs. |
| void insertLastUses(Graph& g) { |
| // struct to share common data structures |
| struct InsertLastUses { |
| Graph& graph; |
| // Have we seen this value yet? If not, this use is the last use of the value. |
| std::unordered_set<Value*> seen; |
| |
| // A map from an If or Loop node to the optional Drop block that |
| // occurs directly after it to release any tensors that go out of scope |
| // when the If/Loop exits. These are created and inserted on demand. |
| std::unordered_map<Node*, Node*> drop_for_node; |
| |
| InsertLastUses(Graph& g) : graph(g) { |
| scanBlock(graph.block()); |
| } |
| void scanBlock(Block* b) { |
| scanNode(b->return_node()); |
| for (auto n : b->nodes().reverse()) { |
| scanNode(n); |
| } |
| } |
| void scanNode(Node* n) { |
| for (auto b : n->blocks()) { |
| scanBlock(b); |
| } |
| // scan backwards so that if a value is used twice in the input list, |
| // its rightmost use is the one treated as the last use (the move) |
| for (size_t i = n->inputs().size(); i > 0; --i) { |
| scanUse(n, i - 1); |
| } |
| } |
| void scanUse(Node* n, size_t i) { |
| auto v = n->inputs()[i]; |
| auto inserted = seen.insert(v).second; |
| if (!inserted) { |
| return; |
| } |
| |
| // the last use of v may be in a nested block of an If or Loop statement. |
| // Find the node 'same_depth_node' at the same depth as the definition of |
| // v, and consider that node to be the last use of v. This ensures we do |
| // not free values in nested scopes that may be executed multiple times, |
| // and that values used on only one side of an If still get freed no |
| // matter which branch was taken, e.g. |
| // a = 4 |
| // while <...>: |
| // y = a + a |
| // drop(a) |
| // In other words, we find the first program point for v that |
| // _reverse_ dominates the definition of v, and add a drop point there. |
| Node* same_depth_node = findOwnerInBlock(n, v->node()->owningBlock()); |
| AT_ASSERT( |
| same_depth_node); // failure means v is not in scope for n, use lint! |
| |
| // In the case where v and n are in the same block, |
| // we have a legit final use already. |
| if (same_depth_node == n) { |
| return; |
| } |
| |
| // in the case where the use is nested in a block |
| // add a Drop node after that block which will drop 'v'. |
| addToDropIfNotExists( |
| findOrCreateDropInstructionForNode(same_depth_node), v); |
| } |
| |
| // finds the node in block 'block' that contains 'n' |
| // or nullptr if no such node exists, e.g.: |
| // n0: a = 4 |
| // n1: if <cond>: |
| // n2: b = a + a |
| // findOwnerInBlock(n2, n0.block()) == n1 |
| Node* findOwnerInBlock(Node* n, Block* block) { |
| while (n != nullptr && block != n->owningBlock()) { |
| n = n->owningBlock()->owningNode(); |
| } |
| return n; |
| } |
| |
| Node* findOrCreateDropInstructionForNode(Node* n) { |
| auto it = drop_for_node.find(n); |
| if (it == drop_for_node.end()) { |
| auto drop_node = graph.create(prim::Drop, 0); |
| drop_node->insertAfter(n); |
| it = drop_for_node.emplace(n, drop_node).first; |
| } |
| return it->second; |
| } |
| |
| void addToDropIfNotExists(Node* drop, Value* v) { |
| if (v->node()->kind() == prim::Constant) { |
| return; |
| } |
| for (auto i : drop->inputs()) { |
| // we already accounted for this use |
| if (i == v) |
| return; |
| } |
| drop->addInput(v); |
| } |
| }; |
| |
| InsertLastUses ilu(g); |
| } |
| |
| inline int64_t getDistAutogradContextId() { |
| #ifdef USE_DISTRIBUTED |
| return DistAutogradContainer::currentContextId(); |
| #else |
| return 0; |
| #endif |
| } |
| } // namespace |
| |
| std::ostream& operator<<(std::ostream& out, Instruction inst); |
| |
| /* |
| This is an optimization that reduces the number of store/load/move nodes needed |
| by recognizing that parts of the graph are simple trees like a*x + b*y. When |
| this happens it is possible to work directly off of the stack by emitting the |
| tree in a depth-first left-to-right manner: |
| load a |
| load x |
| mul # stack now is a*x |
| load b |
| load y |
| mul # stack now is a*x, b*y |
| add |
| |
| can_emit_inline_[node] == true means that this node participates as a non-root |
| member of one of these trees. The code emitter will not emit this node when |
| it is encountered in the node list. Instead the node is emitted in a depth-first |
| traversal from where it is used in a tree. |
| |
| To participate in a tree a node must have a single use (otherwise it is not |
| tree-like) and output a single value (for simplicity). If our IR were functional, |
| these would be the only constraints. However, many nodes have side effects, so |
| we must ensure that emitting the nodes in depth first order from the tree's root |
| _does not reorder the emission of the nodes_. To ensure this, we work backward |
| from the root of a potential tree, visiting its inputs in reverse depth first |
| order, while scanning the node list backward (with the block_point node). When |
| these traversals line up, we know it is safe to emit the tree in this way. We |
| ignore constant nodes, which do not have side effects. |
| */ |
| struct CanEmitInline { |
| CanEmitInline(const std::shared_ptr<Graph>& graph) { |
| scanBlock(graph->block()); |
| } |
| bool canInline(Value* v) { |
| return v->node()->kind() != prim::Param && |
| // without this a BailOut may float downstream past some later |
| // BailOut and receive a higher jf_index. Then a GUARD instruction |
| // we generated for the floated BailOut will get popped up from the |
| // instruction stack by the later BailOut in createBailoutBlock |
| // and its jf_index will become invalid. |
| v->node()->kind() != prim::CudaFusionGroup && |
| v->node()->kind() != prim::FusionGroup && |
| v->node()->kind() != prim::BailOut && v->uses().size() == 1 && |
| v->node()->outputs().size() == 1; |
| } |
| |
| Node* previousNonConstant(Node* n) { |
| do { |
| n = n->prev(); |
| } while (n->kind() == prim::Constant); |
| return n; |
| } |
| |
| Node* scanValue(Node* block_point, Value* v) { |
| // this node is a candidate for inlining: if our reverse scan of the |
| // node list lines up with the use of v, we know it will be emitted in |
| // tree order and we can inline it. The scan continues for further nodes. |
| if (v->node() == block_point && canInline(v)) { |
| // since we inlined this node, we may be able to recursively inline |
| // its inputs, so we continue scanning it |
| block_point = scanNode(v->node()); |
| can_emit_inline_[v->node()] = true; |
| } |
| // if it does not line up, we can't inline 'v', and will just generate |
| // a load/move for it. However, other inputs may still appear in tree |
| // order so we continue the scan of the inputs. |
| return block_point; |
| } |
| |
| Node* scanNode(Node* n) { |
| // don't bother to scan nodes we have already determined to be inline |
| if (can_emit_inline_.count(n)) { |
| return nullptr; |
| } |
| for (auto b : n->blocks()) { |
| scanBlock(b); |
| } |
| Node* block_point = previousNonConstant(n); |
| for (auto it = n->inputs().rbegin(), end = n->inputs().rend(); it != end; |
| ++it) { |
| block_point = scanValue(block_point, *it); |
| } |
| return block_point; |
| } |
| |
| void scanBlock(Block* b) { |
| scanNode(b->return_node()); |
| for (auto node : b->nodes().reverse()) { |
| scanNode(node); |
| } |
| } |
| std::unordered_map<Node*, bool> can_emit_inline_; |
| }; |
| |
| // pre-processing that happens once per graph |
| struct PreprocessGraph { |
| PreprocessGraph(Graph& g) : graph(g.copy()) { |
| dropUnused(graph->block()); |
| // ensure every value has a final use in the block where it is defined |
| insertLastUses(*graph); |
| can_emit_inline = std::move(CanEmitInline(graph).can_emit_inline_); |
| } |
| |
| // Outputs of the preprocessing: |
| std::shared_ptr<Graph> graph; |
| std::unordered_map<Node*, bool> can_emit_inline; |
| }; |
| |
| // for keeping track of the current node |
| struct WithCurrentNode { |
| WithCurrentNode(Node** loc, Node* new_value) : loc_(loc), old_value_(*loc_) { |
| *loc = new_value; |
| } |
| ~WithCurrentNode() { |
| *loc_ = old_value_; |
| } |
| |
| private: |
| Node** loc_; |
| Node* old_value_; |
| }; |
| |
| // BailoutBlocks are used to temporarily store |
| // instructions (typically, argument LOADs and TAIL_CALL) |
| // generated for prim::BailOut nodes |
| // before they are merged back into |
| // CodeImpl::instructions_ by insertBailoutBlocks |
| struct BailoutBlock { |
| size_t jf_instruction_index; // this node gets patched to jump here on failure |
| std::vector<Instruction> instructions; // ends in a TAIL_CALL |
| }; |
| |
| struct CodeImpl { |
| friend struct InterpreterState; |
| std::vector<Instruction> instructions_; |
| |
| // same length as instructions. |
| // what node in the graph caused this |
| // instruction to be emitted? |
| std::vector<Node*> instructions_source_; |
| |
| std::vector<IValue> constant_table_; |
| std::vector<Operation> operator_table_; |
| std::vector<Function*> function_table_; |
| std::vector<std::unique_ptr<GraphFunction>> forked_functions_; |
| std::vector<TypePtr> type_table_; |
| std::vector<std::function<void(std::vector<IValue>&)>> |
| profile_function_table_; |
| |
| int register_size_ = 0; |
| size_t n_outputs; |
| size_t n_inputs; |
| TypePtr return_type_; |
| std::string function_name_; |
| |
| // We MUST hold onto graph here because some Operators stored in the |
| // instruction lists have dependencies on meta-data stored in the graph |
| // that would be dead otherwise. |
| // It is also very useful for debugging interpreter problems to |
| // keep this around. |
| std::shared_ptr<Graph> graph_; |
| c10::optional<std::vector<GraphExecutor*>> grad_executors_; |
| PreprocessGraph preprocess_; |
| |
| // map from values to their register in the register table |
| std::unordered_map<Value*, int> value_to_reg_; |
| |
| // running count of uses as we emit. When we reach use_count_[v] = |
| // v.uses().size() we know it is the final use and we can move rather than |
| // load. |
| std::unordered_map<Value*, size_t> use_count_; |
| |
| Node* current_node_; // used in creation of code to keep track |
| // of node being emitted |
| Node* last_inserted_op_ = nullptr; |
| |
| // out-of-line jumps for bailouts that are patched in at the end |
| std::vector<BailoutBlock> bailout_blocks_; |
| std::vector<std::unique_ptr<Function>> bailout_functions_; |
| size_t remaining_bailout_depth_; |
| |
| CodeImpl( |
| const std::shared_ptr<Graph>& graph, |
| std::string function_name, |
| size_t remaining_bailout_depth) |
| : function_name_(std::move(function_name)), |
| preprocess_(*graph), |
| current_node_(preprocess_.graph->return_node()), |
| remaining_bailout_depth_(remaining_bailout_depth) { |
| graph_ = preprocess_.graph; |
| n_outputs = graph_->outputs().size(); |
| if (n_outputs == 1) { |
| return_type_ = graph->outputs().at(0)->type(); |
| } else { |
| return_type_ = TupleType::create( |
| fmap(graph->outputs(), [](const Value* v) { return v->type(); })); |
| } |
| n_inputs = graph_->inputs().size(); |
| // std::cout << *graph_ << "\n"; |
| emitCodeForBlock(graph_->block()); |
| insertInstruction(RET); |
| // we deferred the emission of bailout blocks so they appear at the end |
| // emit them now and patch up the jumps |
| insertBailoutBlocks(); |
| } |
| |
| const std::vector<c10::IValue>& constant_table() const { |
| return constant_table_; |
| } |
| |
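| // request_bailout(index) deoptimizes the index-th guard: it scans the |
| // emitted instructions and patches that GUARD into FAIL_GUARD, so the next |
| // run takes the bailout path there (FAIL_GUARD then patches itself back to |
| // GUARD; see the interpreter loop below). |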
| void request_bailout(size_t index) { |
| auto count = index; |
| for (size_t instr_index = 0; instr_index < instructions_.size(); |
| instr_index++) { |
| if (instructions_[instr_index].op == GUARD || |
| instructions_[instr_index].op == FAIL_GUARD) { |
| if (count-- == 0) { |
| // patching GUARD to FAIL_GUARD |
| instructions_[instr_index].op = FAIL_GUARD; |
| GRAPH_DEBUG( |
| "Added a bailout request for ", |
| index, |
| " at instruction ", |
| instr_index); |
| break; |
| } |
| } |
| } |
| } |
| |
| const std::vector<Instruction>& instructions() const { |
| return instructions_; |
| } |
| |
| const std::vector<Node*>& instructions_source() const { |
| return instructions_source_; |
| } |
| |
| void insertInstruction(OpCode op, int64_t X = 0, uint64_t N = 0) { |
| instructions_.emplace_back(op, X, N); |
| instructions_source_.emplace_back(current_node_); |
| |
| // check that we didn't accidentally emit nodes out of topological order |
| if (op == OP) { |
| if (last_inserted_op_ != nullptr && current_node_ != last_inserted_op_ && |
| current_node_->owningBlock() == last_inserted_op_->owningBlock()) { |
| TORCH_INTERNAL_ASSERT( |
| current_node_->isAfter(last_inserted_op_), |
| *current_node_, |
| " is not after ", |
| *last_inserted_op_); |
| } |
| last_inserted_op_ = current_node_; |
| } |
| } |
| |
| void truncateInstructions(size_t size) { |
| while (instructions_.size() > size) { |
| instructions_.pop_back(); |
| instructions_source_.pop_back(); |
| } |
| } |
| |
| void createBailoutBlock(size_t jf_index) { |
| bailout_blocks_.emplace_back(BailoutBlock{jf_index}); |
| auto& bailout_instructions = bailout_blocks_.back().instructions; |
| |
| bailout_instructions.insert( |
| bailout_instructions.end(), |
| instructions_.begin() + jf_index + 1, |
| instructions_.end()); |
| truncateInstructions(jf_index + 1); |
| } |
| |
| int allocRegs(at::ArrayRef<Value*> vs) { |
| int result = register_size_ + 1; |
| for (Value* v : vs) { |
| AT_ASSERT(value_to_reg_.count(v) == 0); |
| value_to_reg_[v] = ++register_size_; |
| } |
| return result; |
| } |
| |
| int registerFor(Value* v) { |
| return value_to_reg_.at(v); |
| } |
| |
| void emitUse(Value* input, bool drop) { |
| // drop - if true, we are not actually going to use this value, |
| // so we can short-circuit by either clearing the register (DROPR) |
| // or just popping the stack (DROP) |
| if (preprocess_.can_emit_inline[input->node()]) { |
| emitNode(input->node()); |
| if (drop) { |
| insertInstruction(DROP); |
| } |
| } else { |
| int reg = registerFor(input); |
| bool moved = input->uses().size() == ++use_count_[input]; |
| |
| OpCode op; |
| if (input->node()->kind() == prim::Constant) { |
| op = LOADC; |
| } else if (drop) { |
| op = DROPR; |
| } else if (moved) { |
| op = MOVE; |
| } else { |
| op = LOAD; |
| } |
| insertInstruction(op, reg); |
| } |
| } |
| |
| void emitLoadInputs(at::ArrayRef<Value*> inputs) { |
| for (Value* input : inputs) { |
| emitUse(input, false); |
| } |
| } |
| |
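| // Emits OP (fixed arity) or OPN (vararg) for a regular operator node. |
| // For vararg schemas the actual argument count is encoded in N; at runtime |
| // OPN pushes N onto the stack before invoking the operation (see the OPN |
| // case in the interpreter loop). |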
| void emitOperator(Node* node) { |
| emitLoadInputs(node->inputs()); |
| const Operator& op = node->getOperator(); |
| if (op.hasOperation() && op.schema().is_vararg()) { |
| insertInstruction(OPN, operator_table_.size(), node->inputs().size()); |
| } else { |
| insertInstruction(OP, operator_table_.size()); |
| } |
| operator_table_.emplace_back(op.getOperation(node)); |
| } |
| |
| void emitWait(Node* node) { |
| emitLoadInputs(node->inputs()); |
| insertInstruction(WAIT); |
| } |
| |
| void emitDrop(at::ArrayRef<Value*> to_drop) { |
| for (Value* input : to_drop) { |
| emitUse(input, true); |
| } |
| } |
| |
| void emitStoreOutputs(Node* node) { |
| size_t N = node->outputs().size(); |
| if (N == 0) |
| return; |
| int regs = allocRegs(node->outputs()); |
| if (N == 1) { |
| insertInstruction(STORE, regs); |
| } else { |
| insertInstruction(STOREN, regs, node->outputs().size()); |
| } |
| } |
| |
| int insertConstant(IValue value) { |
| int result = constant_table_.size(); |
| constant_table_.emplace_back(std::move(value)); |
| return result; |
| } |
| |
| void emitConstant(Node* node) { |
| if (node->output()->type()->kind() == FunctionType::Kind) { |
| return; |
| } |
| // constants are just put in the constant table |
| value_to_reg_[node->output()] = |
| insertConstant(toIValue(node->output()).value()); |
| } |
| |
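| // For reference, the instruction layout emitted for a prim::If is roughly: |
| //   <load condition> |
| //   JF  offset_to_else    # pops the condition, jumps on false |
| //   <then block> |
| //   JMP offset_past_else |
| //   <else block> |
| //   <fall through> |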
| void emitIf(Node* node) { |
| emitLoadInputs(node->inputs()); |
| size_t start_if = instructions_.size(); |
| insertInstruction(JF, 0); // dummy offset to be filled in |
| emitCodeForBlock(node->blocks().at(0)); |
| insertInstruction(JMP, 0); // dummy offset |
| size_t start_else = instructions_.size(); |
| instructions_[start_if].X = start_else - start_if; |
| emitCodeForBlock(node->blocks().at(1)); |
| instructions_[start_else - 1].X = instructions_.size() - (start_else - 1); |
| } |
| |
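| // For reference, the layout emitted for a prim::Loop is roughly: |
| //   LOADC 0              # initial trip count |
| //   <load max_trip_count, initial cond, loop-carried inputs> |
| //   LOOP exit_offset, N  # N = number of loop inputs; jumps out when done |
| //   <body block> |
| //   JMP back_to_LOOP |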
| void emitLoop(Node* loop) { |
| insertInstruction(LOADC, insertConstant(0)); |
| emitLoadInputs(loop->inputs()); |
| size_t start = instructions_.size(); |
| insertInstruction(LOOP, 0, loop->inputs().size()); // dummy offset |
| emitCodeForBlock(loop->blocks().at(0)); |
| insertInstruction(JMP, start - instructions_.size()); |
| instructions_[start].X = instructions_.size() - start; |
| } |
| |
| void emitCall(Function* func, at::ArrayRef<Value*> inputs) { |
| emitLoadInputs(inputs); |
| insertInstruction(CALL, function_table_.size()); |
| function_table_.emplace_back(std::move(func)); |
| } |
| |
| void emitNodeAtBlockLevel(Node* node) { |
| WithCurrentNode guard(¤t_node_, node); |
| switch (node->kind()) { |
| case prim::Constant: |
| emitConstant(node); |
| break; |
| case prim::Return: |
| emitLoadInputs(node->inputs()); |
| break; |
| default: |
| if (!preprocess_.can_emit_inline[node]) { |
| emitNode(node); |
| emitStoreOutputs(node); |
| } |
| break; |
| } |
| } |
| |
| size_t emitType(TypePtr t) { |
| size_t r = type_table_.size(); |
| type_table_.emplace_back(std::move(t)); |
| return r; |
| } |
| |
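| // A prim::BailOut node compiles, roughly, to: |
| //   <load guarded input> |
| //   GUARD type_index   # pushes whether the tensor matches the expected type |
| //   JF    offset       # patched later to jump to this guard's bailout block |
| //   ...                # optimized code continues here |
| // The bailout block itself (appended at the end of the instruction list by |
| // insertBailoutBlocks) loads the remaining inputs and TAIL_CALLs a function |
| // built from the unoptimized graph. |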
| size_t emitGuard(Node* node) { |
| // unoptimized graph is at index 0 |
| // guarded input is at index 1 |
| // the rest of args follow |
| emitLoadInputs(node->inputs().slice(1, 1)); |
| insertInstruction(GUARD, emitType(node->outputs().at(0)->type())); |
| insertInstruction(JF, 0 /* to be patched */); |
| return instructions_.size() - 1; |
| } |
| |
| void emitBailOut(Node* node) { |
| auto jf_index = emitGuard(node); |
| auto unoptimized_graph = node->inputs().at(0)->node()->g(attr::Subgraph); |
| // note: the guarded input is already loaded onto the stack |
| // for GUARD instruction |
| emitLoadInputs(node->inputs().slice(2)); |
| insertInstruction(TAIL_CALL, function_table_.size()); |
| TORCH_INTERNAL_ASSERT(node->kind() == prim::BailOut); |
| auto bailout_index = node->i(attr::index); |
| TORCH_INTERNAL_ASSERT(bailout_index >= 0); |
| |
| auto build_bailout_graph = [bailout_index, |
| unoptimized_graph](Function& func) { |
| BuildBailOutGraphFrom(bailout_index, unoptimized_graph, func.graph()); |
| }; |
| |
| auto empty_graph = std::make_shared<Graph>(); |
| auto func = torch::make_unique<GraphFunction>( |
| "bailout", empty_graph, build_bailout_graph); |
| function_table_.emplace_back(func.get()); |
| bailout_functions_.emplace_back(std::move(func)); |
| createBailoutBlock(jf_index); |
| } |
| |
| void emitProfile(Node* node) { |
| emitLoadInputs(node->inputs()); |
| insertInstruction(PROFILE_OP, profile_function_table_.size()); |
| profile_function_table_.push_back(node->cast<ProfileOp>()->getCallback()); |
| } |
| |
| void emitGetAttr(Node* node) { |
| emitLoadInputs(node->inputs()); |
| const auto type = node->input()->type()->expect<ClassType>(); |
| const auto& field = node->s(attr::name); |
| const auto slot = type->getAttributeSlot(field); |
| insertInstruction(GET_ATTR, slot); |
| } |
| |
| void emitSetAttr(Node* node) { |
| emitLoadInputs(node->inputs()); |
| const auto type = node->inputs().at(0)->type()->expect<ClassType>(); |
| const auto& field = node->s(attr::name); |
| const auto slot = type->getAttributeSlot(field); |
| insertInstruction(SET_ATTR, slot); |
| } |
| |
| void insertBailoutBlocks() { |
| for (const BailoutBlock& block : bailout_blocks_) { |
| TORCH_INTERNAL_ASSERT(instructions_[block.jf_instruction_index].op == JF); |
| instructions_[block.jf_instruction_index].X = |
| instructions_.size() - block.jf_instruction_index; |
| instructions_.insert( |
| instructions_.end(), |
| block.instructions.begin(), |
| block.instructions.end()); |
| instructions_source_.insert( |
| instructions_source_.end(), |
| block.instructions.size(), |
| instructions_source_[block.jf_instruction_index]); |
| } |
| } |
| void emitInterfaceCall( |
| std::string method_name_str, |
| c10::ArrayRef<Value*> inputs) { |
| emitLoadInputs(inputs); |
| auto method_name = insertConstant(std::move(method_name_str)); |
| insertInstruction(INTERFACE_CALL, method_name, inputs.size()); |
| } |
| |
| void emitListUnpack(Node* node) { |
| emitLoadInputs(node->inputs()); |
| insertInstruction(LIST_UNPACK, node->outputs().size()); |
| } |
| |
| void emitTupleConstruct(Node* node) { |
| bool named = |
| node->output()->type()->expect<TupleType>()->name().has_value(); |
| if (named) { |
| emitContainerConstruct(NAMED_TUPLE_CONSTRUCT, node); |
| } else { |
| emitLoadInputs(node->inputs()); |
| insertInstruction(TUPLE_CONSTRUCT, node->inputs().size()); |
| } |
| } |
| |
| void emitContainerConstruct(OpCode op, Node* node) { |
| emitLoadInputs(node->inputs()); |
| insertInstruction( |
| op, emitType(node->output()->type()), node->inputs().size()); |
| } |
| |
| void emitCreateObject(Node* node) { |
| insertInstruction(CREATE_OBJECT, emitType(node->output()->type())); |
| } |
| void emitIsinstance(Node* node) { |
| emitLoadInputs(node->inputs()); |
| std::vector<TypePtr> types = node->tys(attr::types); |
| size_t types_start = type_table_.size(); |
| for (const auto& typ : types) { |
| emitType(typ); |
| } |
| insertInstruction(ISINSTANCE, types_start, types.size()); |
| } |
| |
| void emitTupleSlice(Node* node) { |
| emitLoadInputs(node->inputs()); |
| int64_t beg_ind = node->i(attr::beg); |
| int64_t end_ind = node->i(attr::end); |
| insertInstruction(TUPLE_SLICE, beg_ind, end_ind - beg_ind); |
| } |
| |
| void emitFork(Node* node) { |
| emitLoadInputs(node->inputs()); |
| std::unique_ptr<GraphFunction> forked_fn(new GraphFunction( |
| "<forked function>", node->g(attr::Subgraph), nullptr)); |
| forked_functions_.emplace_back(std::move(forked_fn)); |
| function_table_.emplace_back(forked_functions_.back().get()); |
| insertInstruction(FORK, function_table_.size() - 1, node->inputs().size()); |
| } |
| |
| void emitWarn(Node* node) { |
| emitLoadInputs(node->inputs()); |
| insertInstruction(WARN); |
| } |
| |
| void emitNode(Node* node) { |
| WithCurrentNode guard(¤t_node_, node); |
| switch (node->kind()) { |
| default: |
| emitOperator(node); |
| break; |
| case prim::Drop: |
| emitDrop(node->inputs()); |
| break; |
| case prim::Constant: |
| emitConstant(node); |
| break; |
| case prim::If: |
| emitIf(node); |
| break; |
| case prim::Loop: |
| emitLoop(node); |
| break; |
| case aten::wait: |
| emitWait(node); |
| break; |
| case prim::Param: |
| break; |
| case prim::CallFunction: |
| emitCall( |
| node->inputs().at(0)->type()->expect<FunctionType>()->function(), |
| node->inputs().slice(1)); |
| break; |
| case prim::CallMethod: |
| if (auto class_type = node->inputs().at(0)->type()->cast<ClassType>()) { |
| emitCall(class_type->getMethod(node->s(attr::name)), node->inputs()); |
| } else { |
| emitInterfaceCall(node->s(attr::name), node->inputs()); |
| } |
| break; |
| case prim::BailOut: |
| emitBailOut(node); |
| break; |
| case prim::profile: |
| emitProfile(node); |
| break; |
| case prim::GetAttr: |
| emitGetAttr(node); |
| break; |
| case prim::SetAttr: |
| emitSetAttr(node); |
| break; |
| case prim::ListUnpack: |
| emitListUnpack(node); |
| break; |
| case prim::TupleConstruct: |
| emitTupleConstruct(node); |
| break; |
| case prim::ListConstruct: |
| emitContainerConstruct(LIST_CONSTRUCT, node); |
| break; |
| case prim::DictConstruct: |
| emitContainerConstruct(DICT_CONSTRUCT, node); |
| break; |
| case prim::CreateObject: |
| emitCreateObject(node); |
| break; |
| case prim::isinstance: |
| emitIsinstance(node); |
| break; |
| case prim::TupleSlice: |
| emitTupleSlice(node); |
| break; |
| case prim::fork: |
| emitFork(node); |
| break; |
| case aten::warn: |
| emitWarn(node); |
| break; |
| } |
| } |
| |
| void emitCodeForBlock(Block* block) { |
| emitNodeAtBlockLevel(block->param_node()); |
| for (auto node : block->nodes()) { |
| emitNodeAtBlockLevel(node); |
| } |
| emitNodeAtBlockLevel(block->return_node()); |
| } |
| |
| const std::vector<GraphExecutor*>& grad_executors() { |
| if (!grad_executors_) { |
| grad_executors_.emplace(); |
| for (Operation& op : operator_table_) { |
| if (auto executor = detail::getGradExecutor(op)) { |
| grad_executors_->push_back(executor); |
| } |
| } |
| } |
| return *grad_executors_; |
| } |
| |
| void dump(std::ostream& out, size_t i) const { |
| out << i << " " << instructions_[i]; |
| if (instructions_[i].op == OP || instructions_[i].op == CALL || |
| instructions_[i].op == OPN) { |
| out << " # " << *instructions_source_[i]; |
| } else { |
| out << "\n"; |
| } |
| } |
| |
| void dump(std::ostream& out) const { |
| out << *graph_ << "\n"; |
| for (size_t i = 0; i < instructions_.size(); ++i) { |
| dump(out, i); |
| } |
| } |
| }; |
| |
| // The InterpreterState implementation: the runtime state used to run a Code |
| struct InterpreterStateImpl : c10::intrusive_ptr_target { |
| InterpreterStateImpl(const Code& code) { |
| enterFrame(code, 0); |
| } |
| |
| private: |
| // if we need to suspend, where do we reset the stack? |
| // answer: to where it was when we were called, not |
| // including any inputs to this function |
| int64_t stack_start_ = -1; |
| c10::intrusive_ptr<Future> future_; |
| |
| // this holds all the tensors for this interpreter run |
| // we don't bother minimizing the size of this vector, since the extra |
| // memory used by the pointers in this will be small. |
| // instead we are very aggressive about releasing tensors when they become |
| // dead to make sure memory management happens efficiently. |
| // We optimize for the case where derivatives are run with retain_graph=False; |
| // in the case where it is true, the interpreter and this array get copied. |
| // If this ever becomes a bottleneck, then we _should_ consider |
| // minimizing the total number of registers. |
| std::vector<IValue> registers; |
| |
| // A Frame captures a function's state |
| // (e.g. `pc` and `base_pointer`) |
| // Each Frame corresponds to a call to `Frame::function` |
| // which has not yet returned. |
| // The arguments for `Frame::function` |
| // are located at [base_pointer + arg_number] |
| struct Frame { |
| std::shared_ptr<CodeImpl> function; |
| // program counter corresponds to the index |
| // of the currently executed instruction |
| size_t pc; |
| // marks the start index of the frame |
| // base_pointer is used by TAIL_CALL |
| // to replace the current frame |
| // with a frame of a bailout graph |
| size_t base_pointer; |
| |
| // unique to every frame with prim::profile across all threads |
| c10::optional<size_t> id; |
| static std::atomic<size_t> num_frames; |
| }; |
| |
| // frame state saved by value so it can live on the C++ stack inside runImpl |
| struct ActiveFrame { |
| size_t pc; |
| Instruction* instructions; |
| IValue* constants; |
| Operation* operators; |
| Function** functions; |
| std::function<void(std::vector<IValue>&)>* profile_functions; |
| TypePtr* types; |
| |
| ActiveFrame(const Frame& frame) |
| : pc(frame.pc), |
| instructions(frame.function->instructions_.data()), |
| constants(frame.function->constant_table_.data()), |
| operators(frame.function->operator_table_.data()), |
| functions(frame.function->function_table_.data()), |
| profile_functions(frame.function->profile_function_table_.data()), |
| types(frame.function->type_table_.data()) {} |
| }; |
| |
| std::vector<Frame> frames; |
| |
| c10::intrusive_ptr<InterpreterStateImpl> intrusive_from_this() { |
| c10::raw::intrusive_ptr::incref(this); |
| return c10::intrusive_ptr<InterpreterStateImpl>::reclaim(this); |
| } |
| |
| void enterFrame(const Code& code, size_t base_pointer) { |
| frames.emplace_back(Frame{code.pImpl, 0, base_pointer, c10::nullopt}); |
| registers.resize(registers.size() + code.pImpl->register_size_); |
| // frames.back().function->dump(std::cout); |
| } |
| |
| void leaveFrame() { |
| registers.resize(registers.size() - frames.back().function->register_size_); |
| frames.pop_back(); |
| } |
| |
| // relative to the end of the register list so that when we call |
| // functions we are referring to the registers of the currently executing |
| // function. |
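| // For example, with a single frame and register_size_ == 3, reg(1) is the |
| // last entry of `registers` and reg(3) is the first entry of this frame. |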
| IValue& reg(size_t reg) { |
| return *(registers.end() - reg); |
| } |
| |
| void dump(std::ostream& out, const Stack& stack) const { |
| out << "Stack:\n"; |
| for (const auto& val : stack) { |
| out << val; |
| out << "\n"; |
| } |
| } |
| |
| void runBuiltinFunction(Stack& stack, Function* fn, ActiveFrame* af) { |
| // BuiltinOpFunction directly invokes a void(Stack&) to implement |
| // custom C++ classes. Call run() here with the stack, and we will |
| // get the results from that C++ method back in the stack. Advance |
| // the PC by 1 without adding any new frame. |
| fn->run(stack); |
| ++af->pc; |
| } |
| |
| void runGraphFunction(Stack& stack, Function* fn, ActiveFrame* af) { |
| const Code& code = |
| // consider passing |
| // `frames.back().function->remaining_bailout_depth_` into |
| // `get_executor().getPlanFor()` to propagate the caller's depth |
| // restrictions onto children. While this strategy has the |
| // potential to reduce the number of compilations for overly |
| // dynamic callers, we might miss opportunities where a caller is |
| // dynamic but a callee gets stable arguments. |
| fn->get_executor() |
| .getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts()) |
| .code; |
| frames.back().pc = af->pc + 1; |
| RECORD_TORCHSCRIPT_FUNCTION(fn->name(), last(stack, code.num_inputs())); |
| enterFrame(code, stack.size() - code.num_inputs()); |
| *af = ActiveFrame(frames.back()); |
| } |
| |
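| // runs the interpreter loop; returns true if execution suspended on an |
| // incomplete future (a callback will resume it later) and false if this |
| // invocation ran to completion (or raised an error) |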
| bool runImpl(Stack& stack) { |
| // if we have never run before, then we might have to return the |
| // stack when we suspend, so record where it starts so that we |
| // return the right stack |
| if (stack_start_ == -1) { |
| TORCH_INTERNAL_ASSERT(stack.size() >= frames.back().function->n_inputs); |
| stack_start_ = stack.size() - frames.back().function->n_inputs; |
| } else { |
| // during restarts, all of the stack is always our own, so we leave |
| // nothing |
| stack_start_ = 0; |
| } |
| |
| ActiveFrame af(frames.back()); |
| try { |
| while (true) { |
| // std::cout << "RUNNING "; |
| // frames.back().function->dump(std::cout, af.pc); |
| Instruction inst = af.instructions[af.pc]; |
| switch (inst.op) { |
| case OP: |
| af.operators[inst.X](stack); |
| ++af.pc; |
| break; |
| case OPN: |
| stack.push_back(inst.N); |
| af.operators[inst.X](stack); |
| ++af.pc; |
| break; |
| case LOAD: |
| stack.emplace_back(reg(inst.X)); |
| ++af.pc; |
| break; |
| case MOVE: |
| stack.emplace_back(std::move(reg(inst.X))); |
| ++af.pc; |
| break; |
| case STORE: |
| reg(inst.X) = pop(stack); |
| ++af.pc; |
| break; |
| case STOREN: |
| for (size_t i = inst.N; i > 0; --i) { |
| reg(inst.X + i - 1) = pop(stack); |
| } |
| ++af.pc; |
| break; |
| case DROP: |
| pop(stack); |
| ++af.pc; |
| break; |
| case DROPR: |
| reg(inst.X) = IValue(); |
| ++af.pc; |
| break; |
| case LOADC: |
| stack.emplace_back(af.constants[inst.X]); |
| ++af.pc; |
| break; |
| case GET_ATTR: { |
| auto userObj = pop(stack).toObject(); |
| auto value = userObj->getSlot(inst.X); |
| push(stack, std::move(value)); |
| ++af.pc; |
| } break; |
| case SET_ATTR: { |
| auto v = pop(stack); |
| auto userObj = pop(stack).toObject(); |
| userObj->setSlot(inst.X, std::move(v)); |
| ++af.pc; |
| } break; |
| case JF: |
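| // fall through on true, otherwise jump X instructions forward |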
| af.pc += (pop(stack).toBool()) ? 1 : inst.X; |
| break; |
| case JMP: |
| af.pc += inst.X; |
| break; |
| case LOOP: { |
| // stack: iteration_count, max_iter, cond, loop_carried_deps... |
| auto frame = stack.end() - (inst.N + 1); |
| int64_t trip_count = frame[0].toInt(); |
| int64_t max_trip_count = frame[1].toInt(); |
| bool cond = frame[2].toBool(); |
| if (trip_count < max_trip_count && cond) { |
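| // continue: the body expects (trip_count, loop_carried...) on top of |
| // the stack, so overwrite the consumed condition slot with the current |
| // trip count and bump the stored iteration count for the next check |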
| frame[2] = trip_count; |
| frame[0] = trip_count + 1; |
| ++af.pc; |
| } else { |
| size_t n_loop_carried = inst.N - 2; |
| for (size_t i = 0; i < n_loop_carried; ++i) { |
| frame[i] = std::move(frame[i + 3]); |
| } |
| drop(stack, 3); // iteration_count, max_iter, cond |
| af.pc += inst.X; |
| } |
| } break; |
| case CALL: { |
| Function* fn = af.functions[inst.X]; |
| if (!fn->isGraphFunction()) { |
| runBuiltinFunction(stack, fn, &af); |
| } else { |
| runGraphFunction(stack, fn, &af); |
| } |
| } break; |
| case INTERFACE_CALL: { |
| // note the hash table lookup to find the function. |
| // this could be optimized further if necessary, e.g. by caching |
| // parts of the hashing computation or storing the offset when |
| // the object is turned into an interface |
| |
| // consider passing |
| // `frames.back().function->remaining_bailout_depth_` into |
| // `get_executor().getPlanFor()` to propagate the caller's depth |
| // restrictions onto children. While this strategy has the potential to |
| // reduce the number of compilations for overly dynamic callers, we |
| // might miss opportunities where a caller is dynamic but a callee |
| // gets stable arguments. |
| auto function = peek(stack, 0, inst.N) |
| .toObject() |
| ->type() |
| ->getMethod(af.constants[inst.X].toStringRef()); |
| if (!function->isGraphFunction()) { |
| runBuiltinFunction(stack, function, &af); |
| } else { |
| runGraphFunction(stack, function, &af); |
| } |
| } break; |
| case RET: |
| if (frames.size() > 1) { |
| leaveFrame(); |
| af = ActiveFrame(frames.back()); |
| break; |
| } |
| if (future_) { |
| auto num_outputs = frames.back().function->n_outputs; |
| if (num_outputs == 1) { |
| future_->markCompleted(stack.back()); |
| } else { |
| future_->markCompleted(c10::ivalue::Tuple::create( |
| jit::last(stack, num_outputs).vec())); |
| } |
| } |
| return false; |
| case WAIT: { |
| auto future = stack.back().toFuture(); |
| if (!future->completed()) { |
| getOrCreateFuture(); |
| |
| // callback needs to be a struct rather than a lambda so that |
| // we can move the stack to the other thread |
| struct Callback { |
| Callback( |
| c10::intrusive_ptr<InterpreterStateImpl> state, |
| Stack stack) |
| : state_(std::move(state)), stack_(std::move(stack)) { |
| dist_autograd_context_id_ = getDistAutogradContextId(); |
| } |
| void operator()() { |
| at::launch(InterpreterContinuation( |
| state_, std::move(stack_), dist_autograd_context_id_)); |
| } |
| |
| private: |
| InterpreterState state_; |
| Stack stack_; |
| int64_t dist_autograd_context_id_; |
| }; |
| |
| // we are suspending, so we need to reset the stack to where it was |
| // when we were called (stack_start_), moving everything past that |
| // point into `copied`. If this run owns the whole stack, we can |
| // avoid a true copy by swapping, which leaves the original stack empty. |
| Stack copied; |
| if (stack_start_ == 0) { |
| copied.swap(stack); |
| } else { |
| copied.insert( |
| copied.begin(), |
| std::make_move_iterator(stack.begin() + stack_start_), |
| std::make_move_iterator(stack.end())); |
| stack.resize(stack_start_); |
| } |
| // save pc into the frame so we continue here when restored |
| frames.back().pc = af.pc; |
| future->addCallback( |
| Callback(intrusive_from_this(), std::move(copied))); |
| |
| return true; |
| } |
| stack.pop_back(); |
| stack.emplace_back(future->value()); |
| ++af.pc; |
| } break; |
| case PROFILE_OP: { |
| auto& frame_id_ref = frames.back().id; |
| if (!frame_id_ref.has_value()) { |
| frame_id_ref = Frame::num_frames++; |
| } |
| auto callback = af.profile_functions[inst.X]; |
| push(stack, c10::IValue{static_cast<int64_t>(*frame_id_ref)}); |
| callback(stack); |
| ++af.pc; |
| break; |
| } |
| case FAIL_GUARD: { |
| // patch FAIL_GUARD back to GUARD |
| GRAPH_DEBUG( |
| "Bailout ", inst.X, " triggered via bailout_requests_!"); |
| af.instructions[af.pc].op = GUARD; |
| push(stack, false); |
| ++af.pc; |
| break; |
| } |
| case GUARD: { |
| if (!stack.back().isTensor()) { |
| // stack.back() is an Uninitialized IValue and this is a guard |
| // on a block output. Uninitialized IValues are never used |
| // so it's safe to pass this guard check |
| push(stack, true); |
| } else { |
| auto t = stack.back().toTensor(); |
| const TypePtr& expected = af.types[inst.X]; |
| bool comp = expected->cast<TensorType>() |
| ->isCompatibleWithInCurrentExecutionContext(t); |
| push(stack, comp); |
| } |
| ++af.pc; |
| } break; |
| case TAIL_CALL: { |
| GRAPH_DEBUG("running TAIL_CALL for ", inst.X); |
| af.functions[inst.X]->ensure_defined(); |
| size_t remaining_bailout_depth = |
| frames.back().function->remaining_bailout_depth_ > 0 |
| ? frames.back().function->remaining_bailout_depth_ - 1 |
| : 0; |
| const Code& code = af.functions[inst.X] |
| ->get_executor() |
| .getPlanFor(stack, remaining_bailout_depth) |
| .code; |
| size_t num_inputs = code.num_inputs(); |
| size_t base_pointer = frames.back().base_pointer; |
| TORCH_INTERNAL_ASSERT(stack.size() >= num_inputs); |
| size_t inputs_start = stack.size() - num_inputs; |
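| // slide the inputs down to this frame's base pointer so the bailout |
| // function sees them at the same stack positions the original call |
| // did, then replace the current frame with one for the new code |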
| for (size_t i = 0; i < num_inputs; ++i) { |
| stack.at(base_pointer + i) = |
| std::move(stack.at(inputs_start + i)); |
| } |
| stack.resize(base_pointer + num_inputs); |
| leaveFrame(); |
| enterFrame(code, base_pointer); |
| af = ActiveFrame(frames.back()); |
| } break; |
| case LIST_UNPACK: { |
| listUnpack(stack, inst.X); |
| ++af.pc; |
| } break; |
| case TUPLE_CONSTRUCT: { |
| tupleConstruct(stack, inst.X); |
| ++af.pc; |
| } break; |
| case TUPLE_SLICE: { |
| tupleSlice(stack, inst.X, inst.X + inst.N); |
| ++af.pc; |
| } break; |
| case NAMED_TUPLE_CONSTRUCT: { |
| auto type = af.types[inst.X]->expect<TupleType>(); |
| namedTupleConstruct(stack, type, inst.N); |
| ++af.pc; |
| } break; |
| case LIST_CONSTRUCT: { |
| auto type = af.types[inst.X]->expect<ListType>(); |
| listConstruct(stack, type, inst.N); |
| ++af.pc; |
| } break; |
| case DICT_CONSTRUCT: { |
| auto type = af.types[inst.X]->expect<DictType>(); |
| dictConstruct(stack, type, inst.N); |
| ++af.pc; |
| } break; |
| case CREATE_OBJECT: { |
| auto type = af.types[inst.X]->expect<ClassType>(); |
| createObject(stack, type); |
| ++af.pc; |
| } break; |
| case ISINSTANCE: { |
| at::ArrayRef<TypePtr> types( |
| af.types + inst.X, af.types + inst.X + inst.N); |
| isinstance(stack, types); |
| ++af.pc; |
| } break; |
| case FORK: { |
| // Move inputs to a separate stack |
| Function* forked_fn = af.functions[inst.X]; |
| InterpreterState forked_interpreter( |
| forked_fn->get_executor() |
| .getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts()) |
| .code); |
| InterpreterContinuation continuation( |
| forked_interpreter, |
| Stack(stack.end() - inst.N, stack.end()), |
| getDistAutogradContextId()); |
| drop(stack, inst.N); |
| push(stack, forked_interpreter.getFuture()); |
| at::launch(std::move(continuation)); |
| ++af.pc; |
| } break; |
| case WARN: { |
| Node* node = frames.back().function->instructions_source_.at(af.pc); |
| auto range = node->sourceRange().source(); |
| if (range->filename()) { |
| auto line = range->starting_line_no() + |
| range->lineno_for_offset(node->sourceRange().start()); |
| drop(stack, 1); |
| c10::SourceLocation location{ |
| "", range->filename()->c_str(), uint32_t(line)}; |
| c10::Warning::warn(location, pop(stack).toStringRef()); |
| } else { |
| TORCH_WARN(pop(stack).toStringRef()); |
| } |
| ++af.pc; |
| } break; |
| } |
| } |
| } catch (std::exception& e) { |
| frames.back().pc = af.pc; |
| bool is_jit_exception = dynamic_cast<JITException*>(&e); |
| handleError(ExceptionMessage(e), is_jit_exception); |
| return false; |
| } |
| } |
| |
| void formatStackTrace(std::ostream& out) { |
| std::vector<StackEntry> entries; |
| for (size_t i = 0; i < frames.size(); ++i) { |
| const Frame& frame = frames[i]; |
| std::string previous_fn_name = frame.function->function_name_; |
| size_t pc = frame.pc; |
| // CALL nodes have already advanced the pc, so |
| // undo that to report the call node |
| if (i + 1 < frames.size()) { |
| --pc; |
| } |
| |
| Node* node = frame.function->instructions_source_[pc]; |
| if (node->callstack()) { |
| for (const auto& p : (*node->callstack())->vec()) { |
| entries.emplace_back(StackEntry{previous_fn_name, p.second}); |
| previous_fn_name = p.first->name(); |
| } |
| } |
| entries.emplace_back(StackEntry{previous_fn_name, node->sourceRange()}); |
| } |
| format_stack_trace(out, entries); |
| } |
| |
| void handleError(const ExceptionMessage& msg, bool is_jit_exception) { |
| std::ostringstream ss; |
| ss << "The following operation failed in the TorchScript interpreter.\n"; |
| formatStackTrace(ss); |
| ss << "RuntimeError: " << msg << "\n"; |
| if (future_) { |
| future_->setError(Future::FutureError(ss.str())); |
| } else if (is_jit_exception) { |
| throw JITException(ss.str()); |
| } else { |
| throw std::runtime_error(ss.str()); |
| } |
| } |
| |
| public: |
| c10::intrusive_ptr<Future> getOrCreateFuture() { |
| if (!future_) { |
| future_ = |
| c10::make_intrusive<Future>(frames.front().function->return_type_); |
| } |
| return future_; |
| } |
| |
| c10::intrusive_ptr<Future> runAsync(Stack& stack) { |
| getOrCreateFuture(); |
| runImpl(stack); |
| return future_; |
| } |
| |
| void run(Stack& stack) { |
| if (runImpl(stack)) { |
| future_->wait(); |
| |
| auto num_outputs = frames.front().function->n_outputs; |
| if (num_outputs == 1) { |
| push(stack, future_->value()); |
| } else { |
| auto tuple = future_->value().toTuple(); |
| for (const IValue& value : tuple->elements()) { |
| push(stack, value); |
| } |
| } |
| } |
| } |
| }; |
| |
| std::atomic<size_t> InterpreterStateImpl::Frame::num_frames; |
| |
| std::ostream& operator<<(std::ostream& out, const Code& code) { |
| out << *code.pImpl->graph_ << "\n"; |
| code.pImpl->dump(out); |
| return out; |
| } |
| |
| Code::Code( |
| const std::shared_ptr<Graph>& graph, |
| std::string function_name, |
| size_t remaining_bailout_depth) |
| : pImpl(new CodeImpl( |
| graph, |
| std::move(function_name), |
| remaining_bailout_depth)) {} |
| Code::~Code() = default; |
| |
| const std::vector<GraphExecutor*>& Code::grad_executors() { |
| return pImpl->grad_executors(); |
| } |
| |
| size_t Code::num_bailouts() const { |
| return pImpl->type_table_.size(); |
| } |
| |
| void Code::request_bailout(size_t index) { |
| pImpl->request_bailout(index); |
| } |
| |
| size_t Code::num_inputs() const { |
| return pImpl->n_inputs; |
| } |
| |
| size_t Code::num_outputs() const { |
| return pImpl->n_outputs; |
| } |
| |
| const std::vector<c10::IValue>& Code::constant_table() const { |
| return pImpl->constant_table(); |
| } |
| |
| const std::vector<Instruction>& Code::instructions() const { |
| return pImpl->instructions(); |
| } |
| |
| const std::vector<Node*>& Code::instructions_source() const { |
| return pImpl->instructions_source(); |
| } |
| |
| const std::vector<TypePtr>& Code::type_table() const { |
| return pImpl->type_table_; |
| } |
| |
| size_t Code::register_size() const { |
| return pImpl->register_size_; |
| } |
| |
| InterpreterState::InterpreterState(const Code& code) |
| : pImpl(c10::make_intrusive<InterpreterStateImpl>(code)) {} |
| InterpreterState::~InterpreterState() = default; |
| |
| void InterpreterState::run(Stack& stack) { |
| static_cast<InterpreterStateImpl*>(pImpl.get())->run(stack); |
| } |
| |
| c10::intrusive_ptr<Future> InterpreterState::runAsync(Stack& stack) { |
| return static_cast<InterpreterStateImpl*>(pImpl.get())->runAsync(stack); |
| } |
| |
| c10::intrusive_ptr<Future> InterpreterState::getFuture() { |
| return static_cast<InterpreterStateImpl*>(pImpl.get())->getOrCreateFuture(); |
| } |
| |
| InterpreterState::InterpreterState( |
| c10::intrusive_ptr<c10::intrusive_ptr_target> pImpl_) |
| : pImpl(std::move(pImpl_)) {} |
| |
| void InterpreterContinuation::operator()() { |
| #ifdef USE_DISTRIBUTED |
| auto prev_dist_id = DistAutogradContainer::currentContextId(); |
| DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); |
| #endif |
| state.runAsync(stack); |
| #ifdef USE_DISTRIBUTED |
| DistAutogradContainer::forceCurrentContextId(prev_dist_id); |
| #endif |
| } |
| } // namespace jit |
| } // namespace torch |