#pragma once

#include <ATen/core/jit_type.h>
#include <ATen/core/stack.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/jit/ir.h>
#include <torch/csrc/utils/hash.h>
#include <iostream>
#include <vector>

namespace torch {
namespace jit {
// GraphExecutor creates specializations of Graphs for different
// dimensionalities and types of inputs.

inline static at::Device ConvertIntToCPUOrCUDA(int device) {
  return device < 0 ? at::kCPU : at::Device(at::DeviceType::CUDA, device);
}
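// For example (illustrative only): ConvertIntToCPUOrCUDA(-1) yields at::kCPU,
// while ConvertIntToCPUOrCUDA(1) yields at::Device(at::DeviceType::CUDA, 1).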
struct ArgumentInfo {
  friend struct ArgumentSpec;
  using plain_data_type = uint32_t;

  bool defined() const {
    return defined_;
  }
  int device() const {
    return device_;
  }
  // XXX: It is guaranteed that this will return false when called on
  // non-tensor arguments
  bool requires_grad() const {
    return requires_grad_;
  }
  int dim() const {
    return dim_;
  }
  at::ScalarType type() const {
    return at::ScalarType(type_);
  }
  TypePtr toType() const {
    if (!defined())
      return TensorType::get();
    return TensorType::create(
        type(),
        ConvertIntToCPUOrCUDA(device()),
        c10::VaryingShape(dim()),
        c10::VaryingShape(dim()),
        requires_grad());
  }
  operator TypePtr() const {
    return toType();
  }

 private:
  unsigned defined_ : 1;
  unsigned requires_grad_ : 1;
  unsigned : 5;
  unsigned dim_ : 8;
  int device_ : 8; // NOTE: this needs to be signed because we use -1 to
                   // represent CPU
  unsigned type_ : 8;
};

static_assert(
    std::is_pod<ArgumentInfo>::value,
    "ArgumentInfo is expected to be a POD struct");
static_assert(
    sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type),
    "ArgumentInfo is expected to be a 32-bit struct");

struct ArgumentSpec {
  ArgumentSpec(size_t num_flat_tensor_inputs, size_t num_flat_optional_inputs) {
    hash_code = hash_combine(num_flat_tensor_inputs, num_flat_optional_inputs);
    tensor_args.reserve(num_flat_tensor_inputs);
    optional_presence.reserve(num_flat_optional_inputs);
  }

  void addOptional(const IValue& input) {
    bool is_present = !input.isNone();
    optional_presence.push_back(is_present);
    hash_code = hash_combine(hash_code, is_present);
  }

  void addTensor(const IValue& input, bool with_grad) {
    AT_ASSERT(input.isTensor(), "Expected Tensor but found ", input.tagKind());
    tensor_args.emplace_back();
    auto& arg = tensor_args.back();
    // Initialize all fields to 0. This is convenient because e.g.
    // requires_grad() can be checked even on undefined tensors, AND it makes
    // the padding bits all 0s.
    std::memset(&arg, 0, sizeof(ArgumentInfo));

    // [argspec refcounting] reinterpret the IValue to avoid having to refcount
    // the Tensor; microbenchmarks
    // (https://github.com/zdevito/pytorch/commit/21e7200a0a0fc456bea2f10e95b1781f83933d10)
    // show overhead from the extra refcounting along this path.
    const at::Tensor* t = reinterpret_cast<const at::Tensor*>(&input);
    if ((arg.defined_ = t->defined())) {
      arg.requires_grad_ = with_grad && autograd::Variable(*t).requires_grad();
      arg.dim_ = t->dim();
      arg.device_ = t->is_cuda() ? t->get_device() : -1;
      arg.type_ = static_cast<unsigned>(t->scalar_type());
    }
    combineHash(arg);
  }

  void combineHash(const ArgumentInfo& arg) {
    ArgumentInfo::plain_data_type arg_data;
    std::memcpy(&arg_data, &arg, sizeof(ArgumentInfo));
    hash_code = hash_combine(hash_code, arg_data);
  }

  // Equality is fast: compare the optional-presence flags and the number of
  // tensor args, then compare the raw ArgumentInfo array data; there are no
  // size/stride indirections. (std::vector<bool> comparison is expected to be
  // fast.)
  bool operator==(const ArgumentSpec& spec) const {
    if (optional_presence != spec.optional_presence) {
      return false;
    }
    if (tensor_args.size() != spec.tensor_args.size())
      return false;
    // NB: we need to break out early when there are no elements, because
    // passing a nullptr to memcmp is UB.
    if (tensor_args.size() == 0)
      return true;
    return std::memcmp(
               tensor_args.data(),
               spec.tensor_args.data(),
               tensor_args.size() * sizeof(ArgumentInfo)) == 0;
  }
  bool operator!=(const ArgumentSpec& spec) const {
    return !(*this == spec);
  }
  size_t numTensors() const {
    return tensor_args.size();
  }
  const ArgumentInfo& tensorAt(size_t i) const {
    return tensor_args[i];
  }
  size_t numOptionals() const {
    return optional_presence.size();
  }
  bool isPresent(size_t i) const {
    return optional_presence[i];
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  size_t hash_code; // precomputed on construction
  std::vector<ArgumentInfo> tensor_args;
  std::vector<bool> optional_presence;
};
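
// A minimal usage sketch (illustrative; assumes `stack` holds the flattened
// inputs and that the tensor/optional counts were computed ahead of time,
// e.g. by ArgumentSpecCreator):
//
//   ArgumentSpec spec(/*num_flat_tensor_inputs=*/2,
//                     /*num_flat_optional_inputs=*/1);
//   spec.addTensor(stack[0], /*with_grad=*/true);
//   spec.addTensor(stack[1], /*with_grad=*/true);
//   spec.addOptional(stack[2]);
//   // spec.hashCode() and operator== can now be used to look up a cached
//   // specialization for this input signature.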

// ArgumentSpecCreator takes an initial graph and comes up with a set
// of simple instructions to compute the ArgumentSpec given a set of
// input tensors.
struct TORCH_API ArgumentSpecCreator {
  // The instructions act on a stack of lists of input IValues. At the
  // beginning, the stack contains a single list of the inputs to the
  // function. The ENTER_ instructions descend into subobjects and push new
  // lists onto the stack.
  enum Inst : char {
    ENTER_TUPLE, // consume a tuple ivalue from the top-most list, and push the
                 // list of its elements onto the stack as a new list
    ENTER_OBJECT, // same as ENTER_TUPLE, but the input is a class
    LEAVE, // pop the top-most list from the stack
    SKIP, // consume an element from the top-most list, and discard it
    SPECIALIZE_OPTIONAL_TENSOR, // consume an optional tensor from the top-most
                                // list, and add it to the ArgSpec key being
                                // created
    SPECIALIZE_TENSOR, // consume a tensor from the top-most list,
                       // and add it to the ArgSpec key being created
    SPECIALIZE_OPTIONAL,
    // consume a non-tensor optional from the top-most list,
    // and add it to the ArgSpec key being created
  };
  ArgumentSpecCreator(Graph& graph);
  ArgumentSpec create(bool with_grad, const Stack& stack) const;
  void specializeTypes(Graph& g, const ArgumentSpec& spec) const;
  void dump() const;
  using WrittenSlots = std::unordered_set<std::string>;

 private:
  static constexpr size_t DEPTH_LIMIT = 128;
  void scan(
      const TypePtr& typ,
      size_t depth,
      const WrittenSlots& written_slots);
  size_t num_inputs_;
  size_t num_tensors_ = 0;
  size_t num_optionals_ = 0;
  std::vector<Inst> instructions_;
};
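
// Illustration (hypothetical; the actual stream is produced by scan() from the
// graph's input types): for a graph whose inputs are
//   (Tensor, Tuple[Tensor, int], Optional[Tensor])
// a plausible instruction sequence would be
//   SPECIALIZE_TENSOR,           // input 0
//   ENTER_TUPLE,                 // descend into input 1
//     SPECIALIZE_TENSOR,         //   tuple element 0
//     SKIP,                      //   tuple element 1 (int is not specialized)
//   LEAVE,                       // done with the tuple
//   SPECIALIZE_OPTIONAL_TENSOR   // input 2
// create() then interprets such a stream over the runtime Stack to build the
// ArgumentSpec key.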

// CompleteArgumentSpec represents one particular specialization.
// It is designed so that it can be created, hashed, and compared quickly
// since it is used along the hot-path of the JIT to check if the code
// we have created is valid for the given inputs.

// CompleteArgumentInfoPOD is only used internally in CompleteArgumentSpec;
// API users should use CompleteArgumentInfo.
struct CompleteArgumentInfoPOD {
  // total size is 64-bit
  unsigned is_tensor : 8; // all other fields are invalid if this is false
  unsigned type : 8; // scalar type
  unsigned defined : 1;
  unsigned requires_grad : 1;
  signed device : 14;
  uint32_t total_dims; // all CompleteArgumentInfoPODs are in
                       // CompleteArgumentSpec's tensor_info() array.
                       // total_dims is the total number of dimensions seen so
                       // far in all previous members of tensor_info(),
                       // including this tensor. 2*total_dims becomes the
                       // offset into the sizes_strides list for the _next_
                       // tensor in the tensor_info array. For tensor 0, the
                       // offset is always 0.
};

static_assert(
    sizeof(CompleteArgumentInfoPOD) == sizeof(int64_t),
    "CompleteArgumentInfoPOD must be 64-bit struct for CompleteArgumentSpec encoding to work");

struct CompleteArgumentInfo;

struct CompleteArgumentSpec {
  CompleteArgumentSpec(bool with_grad, at::ArrayRef<IValue> inputs)
      : hash_code(0), ninputs(inputs.size()) {
    int32_t all_dims = 0;
    const int32_t num_inputs = inputs.size();
    for (int32_t i = 0; i < num_inputs; i++) {
      if (!inputs[i].isTensor())
        continue;
      auto tensor = inputs[i].toTensor();
      all_dims += tensor.defined() ? tensor.ndimension() : 0;
    }
    // allocate enough room for all TensorPODs and dimensions
    data.resize(ninputs + all_dims * 2);

    // and reinterpret our data array as these structs
    auto* pods = reinterpret_cast<CompleteArgumentInfoPOD*>(data.data());
    int64_t* next_dim = sizes_strides();
    int32_t total_dims = 0;
    for (int32_t i = 0; i < num_inputs; i++) {
      auto& pod = pods[i];
      pod.is_tensor = static_cast<uint32_t>(inputs[i].isTensor());
      if (pod.is_tensor) {
        at::Tensor t = inputs[i].toTensor();
        pod.defined = t.defined();
        if (pod.defined) {
          pod.type = static_cast<int>(t.scalar_type());
          pod.device = (!t.is_cuda()) ? -1 : t.get_device();
          pod.requires_grad =
              with_grad && autograd::as_variable_ref(t).requires_grad();
          total_dims += t.ndimension();
          auto sizes = t.sizes();
          std::copy(sizes.begin(), sizes.end(), next_dim);
          next_dim += sizes.size();
          auto strides = t.strides();
          std::copy(strides.begin(), strides.end(), next_dim);
          next_dim += strides.size();
        }
      }
      // each POD has a running tally of all dimensions including its own
      pod.total_dims = total_dims;
    }
    // we precompute the hash_code to minimize the time inside of hash
    // table operations where we may need to hold a compiler cache lock.
    hash_code = hash_combine(0, ninputs);
    for (auto d : data) {
      hash_code = hash_combine(hash_code, d);
    }
  }

  // equality is fast: check ninputs, and then check the raw array data,
  // there are no size/stride indirections
  bool operator==(const CompleteArgumentSpec& spec) const {
    return ninputs == spec.ninputs && data == spec.data;
  }
  bool operator!=(const CompleteArgumentSpec& spec) const {
    return !(*this == spec);
  }
  friend struct CompleteArgumentInfo;
  CompleteArgumentInfo at(size_t i) const;
  size_t size() const {
    return ninputs;
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  ArrayRef<CompleteArgumentInfoPOD> tensor_info() const {
    return ArrayRef<CompleteArgumentInfoPOD>(
        reinterpret_cast<const CompleteArgumentInfoPOD*>(data.data()), ninputs);
  }
  // the start of the sizes_strides information, which comes after the
  // CompleteArgumentInfoPOD list.
  const int64_t* sizes_strides() const {
    return data.data() + ninputs;
  }
  int64_t* sizes_strides() {
    return data.data() + ninputs;
  }
  size_t hash_code; // precomputed on construction
  int32_t ninputs;
  // layout is ninputs of CompleteArgumentInfoPOD (each 64-bit) followed by
  // their size and stride info, e.g. for 3 tensors:
  // [t0POD][t1POD][t2POD]...
  // [t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides]
  std::vector<int64_t> data;
};
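
// Worked example of the encoding (illustrative): for inputs (t0, t1) where t0
// is a defined 2-dimensional tensor and t1 a defined 3-dimensional tensor,
// `data` holds ninputs (= 2) CompleteArgumentInfoPOD entries, followed by
// 2 sizes + 2 strides for t0 and 3 sizes + 3 strides for t1, i.e.
// data.size() == 2 + 2 * (2 + 3) == 12 int64_t values, with
// pods[0].total_dims == 2 and pods[1].total_dims == 5.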

// public view of compressed CompleteArgumentInfo
struct CompleteArgumentInfo {
  CompleteArgumentInfo(const CompleteArgumentSpec& spec, const int i)
      : spec(spec), i(i) {}
  bool isTensor() const {
    return pod(i).is_tensor;
  }
  at::ScalarType type() const {
    return at::ScalarType(pod(i).type);
  }
  bool defined() const {
    return pod(i).defined;
  }
  bool requires_grad() const {
    return pod(i).requires_grad;
  }
  int device() const {
    return pod(i).device;
  }
  int ndimension() const {
    // See [valid range]: it is always valid to ask for the offset of (i + 1)
    return (sizes_strides_offset(i + 1) - sizes_strides_offset(i)) / 2;
  }
  at::IntArrayRef sizes() const {
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i), ndimension());
  }
  at::IntArrayRef strides() const {
    int ndim = ndimension();
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i) + ndim, ndim);
  }
  operator TypePtr() const {
    if (!defined())
      return TensorType::get();
    return TensorType::create(
        type(), ConvertIntToCPUOrCUDA(device()), sizes(), strides());
  }

 private:
  // offset into the sizes_strides() array where the sizes start for tensor j
  // [valid range] valid range is [0, ninputs]
  // (i.e. you can ask for the offset at ninputs, which would be the offset of
  // the next tensor if it existed)
  int sizes_strides_offset(int j) const {
    if (j == 0)
      return 0;
    return 2 * pod(j - 1).total_dims;
  }
  const CompleteArgumentInfoPOD& pod(int j) const {
    return spec.tensor_info().at(j);
  }
  const CompleteArgumentSpec& spec;
  const int i;
};
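
// Continuing the two-tensor layout example above (illustrative): for the
// second tensor (i == 1), sizes_strides_offset(1) == 2 * pod(0).total_dims
// == 4, so its sizes start right after t0's 2 sizes and 2 strides, and
// ndimension() == (sizes_strides_offset(2) - sizes_strides_offset(1)) / 2
//              == (10 - 4) / 2 == 3.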

inline std::ostream& operator<<(std::ostream& out, const ArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device() << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad() << ", dims=" << info.dim()
      << ")";
  return out;
}

inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.numTensors(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.tensorAt(i);
  }
  out << "; ";
  for (size_t i = 0; i < spec.numOptionals(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.isPresent(i);
  }
  out << "}";
  return out;
}
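
// For example (illustrative), an ArgumentSpec holding one defined 2-dim CPU
// float tensor plus one present optional prints roughly as:
//   {Tensor(device=-1, type=Float, requires_grad=0, dims=2); 1}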

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device() << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad()
      << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ")";
  return out;
}

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.size(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.at(i);
  }
  out << "}";
  return out;
}

inline CompleteArgumentInfo CompleteArgumentSpec::at(size_t i) const {
  return CompleteArgumentInfo(*this, i);
}

inline c10::optional<int8_t> convertOptional(
    c10::optional<c10::ScalarType> const& from) {
  return (from) ? c10::optional<int8_t>(static_cast<int8_t>(*from))
                : c10::optional<int8_t>{};
}

} // namespace jit
} // namespace torch

namespace std {

template <>
struct hash<c10::VaryingShape> {
  size_t operator()(const c10::VaryingShape& vs) const {
    return torch::get_hash(
        vs.size(),
        vs.size() ? vs.sizes().value() : std::vector<c10::optional<int64_t>>());
  }
};

template <>
struct hash<c10::TensorType> {
  size_t operator()(const c10::TensorType& ptt) const {
    return torch::get_hash<
        c10::optional<int8_t>,
        c10::VaryingShape,
        c10::VaryingShape,
        c10::optional<bool>>(
        torch::jit::convertOptional(ptt.scalarType()),
        ptt.sizes(),
        ptt.strides(),
        ptt.requiresGrad());
  }
};

template <>
struct hash<torch::jit::ArgumentSpec> {
  size_t operator()(const torch::jit::ArgumentSpec& spec) const {
    return spec.hashCode();
  }
};
template <>
struct hash<torch::jit::CompleteArgumentSpec> {
  size_t operator()(const torch::jit::CompleteArgumentSpec& spec) const {
    return spec.hashCode();
  }
};
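
// These hash specializations let the spec types serve directly as keys in
// standard containers. A sketch (PlanCacheEntry is a hypothetical value type):
//
//   std::unordered_map<torch::jit::ArgumentSpec, PlanCacheEntry> plan_cache;
//   auto it = plan_cache.find(spec); // O(1) lookup via spec.hashCode()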
} // namespace std