| #define PY_SSIZE_T_CLEAN |
| #include <ATen/EmptyTensor.h> |
| #include <c10/util/flat_hash_map.h> |
| #include <torch/csrc/autograd/grad_mode.h> |
| #include <torch/csrc/dynamo/guards.h> |
| #include <torch/csrc/utils/disable_torch_function.h> |
| #include <torch/csrc/utils/python_compat.h> |
| #include <torch/csrc/utils/python_numbers.h> |
| #include <torch/csrc/utils/python_symnode.h> |
| #include <torch/extension.h> |
| |
| #ifdef USE_CUDA |
| #include <ATen/cuda/EmptyTensor.h> |
| #endif |
| |
| #include <sstream> |
| |
| namespace { |
| |
| struct LocalState { |
  // TLS state that changes operator behavior
| c10::impl::LocalDispatchKeySet dispatch_modifier; |
| bool grad_mode_enabled; |
| |
| at::DispatchKeySet apply(at::DispatchKeySet ks) const { |
| return (ks | dispatch_modifier.included_) - dispatch_modifier.excluded_; |
| } |
| |
| LocalState() |
| : dispatch_modifier(c10::impl::tls_local_dispatch_key_set()), |
| grad_mode_enabled(at::GradMode::is_enabled()) {} |
| }; |
| |
| class TensorCheck { |
| public: |
| TensorCheck( |
| const LocalState& state, |
| PyTypeObject* pt, |
| const at::Tensor& v, |
| std::vector<std::optional<c10::SymInt>> dynamic_dims_sizes, |
| std::vector<std::optional<c10::SymInt>> dynamic_dims_strides) |
| : pytype(pt), |
| dispatch_key_(state.apply(v.key_set()).raw_repr()), |
| dtype_(v.dtype().toScalarType()), |
| device_index_(v.device().index()), |
| requires_grad_(v.requires_grad()), |
| sizes_(std::move(dynamic_dims_sizes)), |
| strides_(std::move(dynamic_dims_strides)), |
| dim_(static_cast<int64_t>(sizes_.size())) { |
| // TODO(voz): In cases where sizes_ and strides_ are fully dynamic, should |
| // we just treat this as optional? |
| } |
| |
  // See note in guards.py [Note - On Export Tensor Guards]
  // The logic here must be kept in sync with its Python counterpart.
| bool check(const LocalState& state, const at::Tensor& v) { |
| if (dispatch_key_ != state.apply(v.key_set()).raw_repr() || |
| dtype_ != v.dtype().toScalarType() || |
| device_index_ != v.device().index() || |
| requires_grad_ != v.requires_grad()) { |
| return false; |
| } |
| auto ndim = v.ndimension(); |
| if (ndim != dim_) { |
| return false; |
| } |
| const auto& sizes = v.sym_sizes(); |
| const auto& strides = v.sym_strides(); |
| for (auto i : c10::irange(ndim)) { |
| auto known_size = sizes_[i]; |
| auto known_stride = strides_[i]; |
| if (known_size.has_value()) { |
| if (known_size.value() != sizes[i]) { |
| return false; |
| } |
| } |
| if (known_stride.has_value()) { |
| if (known_stride.value() != strides[i]) { |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| std::string check_verbose( |
| const LocalState& state, |
| const at::Tensor& v, |
| const std::string& tensor_name) { |
| std::stringstream fail_reason; |
| fail_reason << "tensor '" << tensor_name << "' "; |
| if (dispatch_key_ != state.apply(v.key_set()).raw_repr()) { |
| // return fmt::format("tensor dispatch key mismatch. expected {}, actual |
| // {}", dispatch_key_, state.apply(v.key_set()).raw_repr()); |
| fail_reason << "dispatch key set mismatch. expected " |
| << c10::DispatchKeySet( |
| c10::DispatchKeySet::RAW, dispatch_key_) |
| << ", actual " << state.apply(v.key_set()); |
| return fail_reason.str(); |
| } else if (dtype_ != v.dtype().toScalarType()) { |
| // return fmt::format("tensor dtype mismatch. expected {}, actual {}", |
| // dtype_, v.dtype().toScalarType()); |
| fail_reason << "dtype mismatch. expected " << dtype_ << ", actual " |
| << v.dtype().toScalarType(); |
| return fail_reason.str(); |
| } else if (device_index_ != v.device().index()) { |
      fail_reason << "device index mismatch. expected " << device_index_
                  << ", actual " << v.device().index();
| return fail_reason.str(); |
| } else if (requires_grad_ != v.requires_grad()) { |
| // return fmt::format("tensor requires_grad mismatch. expected {}", |
| // requires_grad_); |
| fail_reason << "requires_grad mismatch. expected requires_grad=" |
| << requires_grad_; |
| return fail_reason.str(); |
| } |
| auto ndim = v.ndimension(); |
| if (ndim != dim_) { |
| // return fmt::format("tensor rank mismatch. expected {}, actual {}", |
| // sizes_.size(), ndim); |
| fail_reason << "rank mismatch. expected " << sizes_.size() << ", actual " |
| << ndim; |
| return fail_reason.str(); |
| } |
| const auto& sizes = v.sym_sizes(); |
| const auto& strides = v.sym_strides(); |
| for (auto i : c10::irange(ndim)) { |
| auto known_size = sizes_[i]; |
| auto known_stride = strides_[i]; |
| if (known_size.has_value() && (known_size.value() != sizes[i])) { |
| fail_reason << "size mismatch at index " << i << ". expected " |
| << known_size.value() << ", actual " << sizes[i]; |
| return fail_reason.str(); |
| } |
| if (known_stride.has_value() && known_stride.value() != strides[i]) { |
| fail_reason << "stride mismatch at index " << i << ". expected " |
| << known_stride.value() << ", actual " << strides[i]; |
| return fail_reason.str(); |
| } |
| } |
| return ""; |
| } |
| |
| PyTypeObject* pytype; |
| |
| private: |
| uint64_t dispatch_key_; // DispatchKeySet includes device/layout |
| at::ScalarType dtype_; |
  // Note(voz): While dispatch_key_ is sufficiently representative of a device
  // (keys are more granular AND device specific), it does not necessarily
  // capture device indices correctly, hence the separate field below.
| at::DeviceIndex device_index_; |
| bool requires_grad_; |
| // NB: These are unset if dynamic shapes is enabled. |
| std::vector<std::optional<c10::SymInt>> sizes_; |
| std::vector<std::optional<c10::SymInt>> strides_; |
| // Not strictly required for dense tensors, but nested tensors need it. |
| int64_t dim_; |
| }; |
| |
| typedef std::vector<TensorCheck> ChecksList; |
| |
| typedef struct { |
| PyObject_HEAD; |
| ChecksList* checks; |
| } TensorGuards; |
| |
| static void TensorGuards_dealloc(TensorGuards* self) { |
| if (self->checks != nullptr) { |
| delete self->checks; |
| self->checks = nullptr; |
| } |
| Py_TYPE(self)->tp_free((PyObject*)self); |
| } |
| |
| static PyObject* TensorGuards_new( |
| PyTypeObject* type, |
| PyObject* args, |
| PyObject* kwds) { |
| TensorGuards* self = (TensorGuards*)type->tp_alloc(type, 0); |
| if (self != nullptr) { |
| self->checks = new ChecksList(); |
| } |
| return (PyObject*)self; |
| } |
| |
| static std::vector<std::optional<c10::SymInt>> wrapIntegersInOptional( |
| const c10::SymIntArrayRef& intArray) { |
| std::vector<std::optional<c10::SymInt>> optVec(intArray.size()); |
| std::transform( |
| intArray.begin(), |
| intArray.end(), |
| optVec.begin(), |
| [](const c10::SymInt& value) { return std::make_optional(value); }); |
| return optVec; |
| } |
| |
| static std::vector<std::optional<c10::SymInt>> pyListToVecOptInt( |
| PyObject* pyList) { |
| std::vector<std::optional<c10::SymInt>> vec; |
| Py_ssize_t size = PyList_Size(pyList); |
| for (Py_ssize_t i = 0; i < size; i++) { |
| PyObject* item = PyList_GetItem(pyList, i); |
| auto handle = py::handle(item); |
| if (item == Py_None) { |
| vec.emplace_back(std::nullopt); |
| } else if (torch::is_symint(handle)) { |
| vec.emplace_back(py::cast<c10::SymInt>(handle)); |
| } else { |
| int64_t value = PyLong_AsLongLong(item); |
| if (value == -1 && PyErr_Occurred()) { |
| PyErr_SetString( |
| PyExc_TypeError, |
| "Size or stride list item is not a valid integer."); |
| TORCH_CHECK(false, "Size or stride list item is not a valid integer."); |
| } |
| vec.emplace_back(c10::SymInt(value)); |
| } |
| } |
| return vec; |
| } |
| |
| static std::vector<std::vector<std::optional<c10::SymInt>>> get_dynamic_dims( |
| PyObject* dynamic_dims_py) { |
| std::vector<std::vector<std::optional<c10::SymInt>>> per_tensor_dynamic_dims; |
| if (dynamic_dims_py != Py_None) { |
| Py_ssize_t size = PyList_Size(dynamic_dims_py); |
| for (Py_ssize_t i = 0; i < size; i++) { |
| PyObject* py_list = PyList_GetItem(dynamic_dims_py, i); |
| std::vector<std::optional<c10::SymInt>> vec = pyListToVecOptInt(py_list); |
| per_tensor_dynamic_dims.push_back(std::move(vec)); |
| } |
| } |
| return per_tensor_dynamic_dims; |
| } |
| |
| static int TensorGuards_init( |
| TensorGuards* self, |
| PyObject* args, |
| PyObject* kwds) { |
| if (!PyTuple_CheckExact(args)) { |
| PyErr_SetString(PyExc_TypeError, "expected tuple()"); |
| return -1; |
| } |
  if (kwds == nullptr) {
    // PyDict_GetItemString below would crash on a missing kwargs dict.
    PyErr_SetString(PyExc_TypeError, "missing dynamic_dims_sizes=...");
    return -1;
  }
  // Top level structure is List[List[Union[int, None]]]
  PyObject* dynamic_dims_sizes_py =
      PyDict_GetItemString(kwds, "dynamic_dims_sizes");
| if (dynamic_dims_sizes_py == nullptr) { |
| PyErr_SetString(PyExc_TypeError, "missing dynamic_dims_sizes=..."); |
| return -1; |
| } |
| PyObject* dynamic_dims_strides_py = |
| PyDict_GetItemString(kwds, "dynamic_dims_strides"); |
| if (dynamic_dims_strides_py == nullptr) { |
| PyErr_SetString(PyExc_TypeError, "missing dynamic_dims_strides=..."); |
| return -1; |
| } |
| |
  // dynamic_dims_sizes_py/strides_py are None when dynamic_shapes=False; this
  // is an optimization to avoid invoking .size()/.stride() in Python
  // needlessly
| std::vector<std::vector<std::optional<c10::SymInt>>> |
| per_tensor_dynamic_dims_sizes = get_dynamic_dims(dynamic_dims_sizes_py); |
| std::vector<std::vector<std::optional<c10::SymInt>>> |
| per_tensor_dynamic_dims_strides = |
| get_dynamic_dims(dynamic_dims_strides_py); |
| |
| auto& checks = *self->checks; |
| auto len = PyTuple_GET_SIZE(args); |
| checks.reserve(len); |
| LocalState state; |
| |
| for (auto i : c10::irange(len)) { |
| PyObject* item = PyTuple_GET_ITEM(args, i); |
| if (!THPVariable_CheckExact(item) && !THPVariable_Check(item)) { |
| PyErr_SetString(PyExc_TypeError, "expected Tensor()"); |
| return -1; |
| } |
| auto tensor = THPVariable_Unpack(item); |
| std::vector<std::optional<c10::SymInt>> tensor_dims_size = |
| per_tensor_dynamic_dims_sizes.empty() |
| ? wrapIntegersInOptional(tensor.sym_sizes()) |
| : per_tensor_dynamic_dims_sizes[i]; |
| std::vector<std::optional<c10::SymInt>> tensor_dims_stride = |
| per_tensor_dynamic_dims_strides.empty() |
| ? wrapIntegersInOptional(tensor.sym_strides()) |
| : per_tensor_dynamic_dims_strides[i]; |
| |
| checks.emplace_back( |
| state, |
| Py_TYPE(item), |
| std::move(tensor), |
| std::move(tensor_dims_size), |
| std::move(tensor_dims_stride)); |
| } |
| return 0; |
| } |
| |
| PyObject* TensorGuards_check( |
| TensorGuards* self, |
| PyObject* args, |
| PyObject* kwargs) { |
| if (!PyTuple_CheckExact(args)) { |
| PyErr_SetString(PyExc_TypeError, "expected tuple()"); |
| return nullptr; |
| } |
| auto& checks = *self->checks; |
| auto len = PyTuple_GET_SIZE(args); |
| |
| // kwargs is just ignored here |
| |
| if (static_cast<decltype(len)>(checks.size()) != len) { |
| PyErr_SetString(PyExc_TypeError, "wrong length"); |
| return nullptr; |
| } |
| |
| LocalState state; |
  // Note - all the tensors that make it to guards must be unique. Dynamo
  // builder handles guarding for positive aliases (X is Y). However, we do
  // not create guards for negative aliases (X is not Y) as that is an N^2
  // relationship. Instead, we rely on uniqueness established upstream and
  // verify it at check_fn time (in this function).
| ska::flat_hash_map<PyObject*, std::nullptr_t> unique_tensors; |
| for (auto i : c10::irange(len)) { |
| PyObject* item = PyTuple_GET_ITEM(args, i); |
| |
| if (Py_TYPE(item) != checks[i].pytype) { |
| Py_RETURN_FALSE; |
| } |
| auto insertion = unique_tensors.insert({item, nullptr}); |
| if (!insertion.second) { |
| // Violates uniqueness |
| Py_RETURN_FALSE; |
| } |
| if (!checks[i].check(state, THPVariable_Unpack(item))) { |
| Py_RETURN_FALSE; |
| } |
| } |
| |
| Py_RETURN_TRUE; |
| } |
| |
| PyObject* TensorGuards_check_verbose( |
| TensorGuards* self, |
| PyObject* args, |
| PyObject* kwargs) { |
| if (!PyTuple_CheckExact(args)) { |
| PyErr_SetString(PyExc_TypeError, "expected tuple()"); |
| return nullptr; |
| } |
| auto& checks = *self->checks; |
| auto len = PyTuple_GET_SIZE(args); |
| |
| if (static_cast<decltype(len)>(checks.size()) != len) { |
| PyErr_SetString(PyExc_TypeError, "wrong length"); |
| return nullptr; |
| } |
| |
  PyObject* tensor_check_names_py = kwargs == nullptr
      ? nullptr
      : PyDict_GetItemString(kwargs, "tensor_check_names");
  if (tensor_check_names_py == nullptr) {
    PyErr_SetString(PyExc_TypeError, "missing tensor_check_names kwarg");
    return nullptr;
  }
| |
| if (!PyList_Check(tensor_check_names_py)) { |
| PyErr_SetString(PyExc_TypeError, "tensor_check_names kwarg must be a list"); |
| return nullptr; |
| } |
| |
| auto names_size = PyList_Size(tensor_check_names_py); |
| if (names_size != static_cast<decltype(names_size)>(checks.size())) { |
| PyErr_SetString( |
| PyExc_TypeError, |
| "tensor_check_names should be the same size as # tensors"); |
| return nullptr; |
| } |
| |
| std::vector<std::string> tensor_check_names; |
| tensor_check_names.reserve(names_size); |
| for (auto i : c10::irange(names_size)) { |
| PyObject* value = PyList_GetItem(tensor_check_names_py, i); |
| if (!PyUnicode_Check(value)) { |
| PyErr_SetString( |
| PyExc_TypeError, "tensor_check_names must only contain strings"); |
| return nullptr; |
| } |
| tensor_check_names.emplace_back(PyUnicode_AsUTF8(value)); |
| } |
| |
| LocalState state; |
| ska::flat_hash_map<PyObject*, std::nullptr_t> unique_tensors; |
| for (auto i : c10::irange(len)) { |
| PyObject* item = PyTuple_GET_ITEM(args, i); |
    if (Py_TYPE(item) != checks[i].pytype) {
      std::stringstream fail_reason;
      PyObject* type = PyObject_Type(item); // new ref
      PyObject* type_str = type ? PyObject_Str(type) : nullptr; // new ref
      fail_reason << "expected type of '" << tensor_check_names[i]
                  << "' to be a tensor type, ";
      if (!type_str) {
        fail_reason << "but found a different type";
      } else {
        fail_reason << "but found " << PyUnicode_AsUTF8(type_str);
      }
      Py_XDECREF(type_str);
      Py_XDECREF(type);
      return Py_BuildValue("s", fail_reason.str().c_str());
    }
| |
| auto insertion = unique_tensors.insert({item, nullptr}); |
| if (!insertion.second) { |
| std::stringstream fail_reason; |
| fail_reason << "Duplicate tensor found where not expected! "; |
      fail_reason << tensor_check_names[i]
                  << " should not alias to anything, but is aliased";
| return Py_BuildValue("s", fail_reason.str().c_str()); |
| } |
| std::string fail_reason = checks[i].check_verbose( |
| state, THPVariable_Unpack(item), tensor_check_names[i]); |
| if (fail_reason.length() > 0) { |
| return Py_BuildValue("s", fail_reason.c_str()); |
| } |
| } |
| |
| Py_RETURN_TRUE; |
| } |
| |
| // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) |
| static PyMethodDef TensorGuards_methods[] = { |
| {"check", |
| (PyCFunction)(void*)TensorGuards_check, |
| METH_VARARGS | METH_KEYWORDS, |
| ""}, |
| {"check_verbose", |
| (PyCFunction)(void*)TensorGuards_check_verbose, |
| METH_VARARGS | METH_KEYWORDS, |
| "verbose fail reasons for failed checks"}, |
| {nullptr} /* Sentinel */ |
| }; |
| |
| static PyTypeObject TensorGuardsType = {PyVarObject_HEAD_INIT(nullptr, 0)}; |
| |
| struct GlobalStateGuard { |
| PyObject_HEAD; |
| |
| inline void init() { |
| auto& ctx = at::globalContext(); |
| _grad_mode = at::GradMode::is_enabled(); |
| _torch_function = torch::torch_function_enabled(); |
| _deterministic_algorithms = ctx.deterministicAlgorithms(); |
| _deterministic_algorithms_warn_only = ctx.deterministicAlgorithmsWarnOnly(); |
| _allow_tf32 = ctx.allowTF32CuBLAS(); |
| _allow_fp16_reduce = ctx.allowFP16ReductionCuBLAS(); |
| _allow_bf16_reduce = ctx.allowBF16ReductionCuBLAS(); |
| _num_threads = at::get_num_threads(); |
| _default_dtype = at::get_default_dtype(); |
| } |
| |
  inline bool check() {
    auto& ctx = at::globalContext();
    return _grad_mode == at::GradMode::is_enabled() &&
        _torch_function == torch::torch_function_enabled() &&
        _deterministic_algorithms == ctx.deterministicAlgorithms() &&
        _deterministic_algorithms_warn_only ==
            ctx.deterministicAlgorithmsWarnOnly() &&
        _allow_tf32 == ctx.allowTF32CuBLAS() &&
        _allow_fp16_reduce == ctx.allowFP16ReductionCuBLAS() &&
        _allow_bf16_reduce == ctx.allowBF16ReductionCuBLAS() &&
        _num_threads == at::get_num_threads() &&
        _default_dtype == at::get_default_dtype();
  }
| |
| bool _grad_mode; |
| bool _torch_function; |
| bool _deterministic_algorithms; |
| bool _deterministic_algorithms_warn_only; |
| bool _allow_tf32; |
| bool _allow_fp16_reduce; |
| bool _allow_bf16_reduce; |
| int _num_threads; |
| caffe2::TypeMeta _default_dtype; |
| // TODO(jansel): we should guard on more state as inductor starts using it |
| }; |
| |
| int GlobalStateGuard_init( |
| GlobalStateGuard* self, |
| PyObject* args, |
| PyObject* kwargs) { |
| self->init(); |
| return 0; |
| } |
| |
| PyObject* GlobalStateGuard_check( |
| GlobalStateGuard* self, |
| PyObject* args, |
| PyObject* kwargs) { |
| if (self->check()) { |
| Py_RETURN_TRUE; |
| } else { |
| Py_RETURN_FALSE; |
| } |
| } |
| |
| static PyMethodDef GlobalStateGuard_methods[] = { |
| {"check", |
| (PyCFunction)(void*)GlobalStateGuard_check, |
| METH_NOARGS, |
| "Return true if global state was the same as at creation time"}, |
| {nullptr}}; |
| static PyTypeObject GlobalStateGuardType = {PyVarObject_HEAD_INIT(nullptr, 0)}; |
| |
| static PyObject* check_type_id(PyObject* dummy, PyObject* args) { |
| // faster `lambda obj, expected: id(type(obj)) == expected` |
| PyObject* obj = nullptr; |
| unsigned long long expected = 0; |
| if (!PyArg_ParseTuple(args, "OK", &obj, &expected)) { |
| return nullptr; |
| } |
| // NOLINTNEXTLINE(performance-no-int-to-ptr) |
| if (Py_TYPE(obj) == (void*)expected) { |
| Py_RETURN_TRUE; |
| } else { |
| Py_RETURN_FALSE; |
| } |
| } |
| |
| static PyObject* check_obj_id(PyObject* dummy, PyObject* args) { |
| // faster `lambda obj, expected: id(obj) == expected` |
| PyObject* obj = nullptr; |
| unsigned long long expected = 0; |
| if (!PyArg_ParseTuple(args, "OK", &obj, &expected)) { |
| return nullptr; |
| } |
| // NOLINTNEXTLINE(performance-no-int-to-ptr) |
| if (obj == (void*)expected) { |
| Py_RETURN_TRUE; |
| } else { |
| Py_RETURN_FALSE; |
| } |
| } |
| |
| static PyObject* dict_version(PyObject* dummy, PyObject* args) { |
| // Retrieves the version of a dictionary. |
| PyObject* obj = nullptr; |
| if (!PyArg_ParseTuple(args, "O", &obj)) { |
| return nullptr; |
| } |
| if (!PyDict_Check(obj)) { |
| return nullptr; |
| } |
| #if IS_PYTHON_3_12_PLUS |
| TORCH_CHECK(false, "Dynamo does not support CPython 3.12 yet."); |
| return nullptr; |
| #else |
| // ma_version_tag is deprecated since 3.12. We will need to transition |
| // to use the appropriate API for later versions. |
| // This warning is an error on some clang builds, so we have to ifdef it |
| // away for now. |
| return THPUtils_packUInt64(((PyDictObject*)obj)->ma_version_tag); |
| #endif |
| } |
| |
| static PyObject* assert_size_stride(PyObject* dummy, PyObject* args) { |
| /* |
| Assert that a given tensor has a given size/stride, but ignore strides |
| of size==1 dimensions. Implemented in C++ as this is on the hot path. |
| */ |
| PyObject* item = nullptr; |
| PyObject* size = nullptr; |
| PyObject* stride = nullptr; |
| if (!PyArg_ParseTuple(args, "OOO", &item, &size, &stride)) { |
| return nullptr; |
| } |
| if (!THPVariable_CheckExact(item) && !THPVariable_Check(item)) { |
| PyErr_SetString(PyExc_TypeError, "expected Tensor()"); |
| return nullptr; |
| } |
| if (!PyTuple_CheckExact(size) || !PyTuple_CheckExact(stride)) { |
| PyErr_SetString(PyExc_TypeError, "expected tuple()"); |
| return nullptr; |
| } |
| at::Tensor tensor = THPVariable_Unpack(item); |
| int64_t ndim = tensor.ndimension(); |
| if (PyTuple_GET_SIZE(size) != ndim || PyTuple_GET_SIZE(stride) != ndim) { |
| PyErr_SetString(PyExc_AssertionError, "wrong number of dimensions"); |
| return nullptr; |
| } |
| for (auto i : c10::irange(ndim)) { |
| int64_t want_size = THPUtils_unpackLong(PyTuple_GET_ITEM(size, i)); |
| int64_t want_stride = THPUtils_unpackLong(PyTuple_GET_ITEM(stride, i)); |
| int64_t actual_size = tensor.size(i); |
| int64_t actual_stride = tensor.stride(i); |
| if (want_size != actual_size || |
| // ignore stride differences when size is 1 |
| (want_stride != actual_stride && actual_size > 1)) { |
| std::stringstream msg; |
| msg << "expected size " << actual_size << "==" << want_size << ", stride " |
| << actual_stride << "==" << want_stride << " at dim=" << i; |
| PyErr_SetString(PyExc_AssertionError, msg.str().c_str()); |
| return nullptr; |
| } |
| } |
| Py_RETURN_TRUE; |
| } |
| |
| template <typename T> |
| inline static void unwrap_size_tuple(PyObject* obj, T& output) { |
| TORCH_CHECK(PyTuple_CheckExact(obj)); |
| size_t len = PyTuple_GET_SIZE(obj); |
| output.reserve(len); |
| for (size_t i = 0; i < len; ++i) { |
| auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(obj, i)); |
| TORCH_CHECK(result >= 0); |
| output.emplace_back(result); |
| } |
| } |
| |
| template <typename T> |
| inline static void _parse_empty_strided_args( |
| PyObject* args, |
| T& sizes, |
| T& strides, |
| at::ScalarType& dtype) { |
| TORCH_CHECK(PyTuple_CheckExact(args)); |
| TORCH_CHECK(PyTuple_GET_SIZE(args) == 3); |
| // note PyTuple_GET_ITEM returns a borrowed ref, so no need for refcounts |
| unwrap_size_tuple(PyTuple_GET_ITEM(args, 0), sizes); |
| unwrap_size_tuple(PyTuple_GET_ITEM(args, 1), strides); |
| PyObject* py_dtype = PyTuple_GET_ITEM(args, 2); |
| TORCH_CHECK(THPDtype_Check(py_dtype)); |
| dtype = reinterpret_cast<THPDtype*>(py_dtype)->scalar_type; |
| } |
| |
| static PyObject* _empty_strided_cpu(PyObject* dummy, PyObject* args) { |
  // at::empty_strided is surprisingly slow. This is a lower-overhead
  // version that saves ~2us on every allocation.
| HANDLE_TH_ERRORS; |
| at::SmallVector<int64_t, 8> sizes; |
| at::SmallVector<int64_t, 8> strides; |
| at::ScalarType dtype; |
| _parse_empty_strided_args(args, sizes, strides, dtype); |
| return THPVariable_Wrap(at::detail::empty_strided_cpu(sizes, strides, dtype)); |
| END_HANDLE_TH_ERRORS; |
| } |
| |
| static PyObject* _empty_strided_cuda(PyObject* dummy, PyObject* args) { |
  // at::empty_strided is surprisingly slow. This is lower-overhead.
| HANDLE_TH_ERRORS; |
| #ifdef USE_CUDA |
| at::SmallVector<int64_t, 8> sizes; |
| at::SmallVector<int64_t, 8> strides; |
| at::ScalarType dtype; |
| _parse_empty_strided_args(args, sizes, strides, dtype); |
| return THPVariable_Wrap(at::detail::empty_strided_cuda( |
| sizes, strides, dtype, c10::DeviceType::CUDA)); |
| #else |
| TORCH_CHECK(false, "PyTorch compiled without USE_CUDA"); |
| #endif |
| END_HANDLE_TH_ERRORS; |
| } |
| |
| // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) |
| static PyMethodDef _methods[] = { |
| {"check_type_id", check_type_id, METH_VARARGS, nullptr}, |
| {"check_obj_id", check_obj_id, METH_VARARGS, nullptr}, |
| {"assert_size_stride", assert_size_stride, METH_VARARGS, nullptr}, |
| {"dict_version", dict_version, METH_VARARGS, nullptr}, |
| {"_empty_strided_cpu", _empty_strided_cpu, METH_VARARGS, nullptr}, |
| {"_empty_strided_cuda", _empty_strided_cuda, METH_VARARGS, nullptr}, |
| {nullptr, nullptr, 0, nullptr}}; |
| |
| static struct PyModuleDef _module = { |
| PyModuleDef_HEAD_INIT, |
| "torch._C._dynamo.guards", |
| "Module containing checks on tensors", |
| -1, |
| _methods}; |
| |
| /** |
| * Stores relevant guard debug information, e.g., failure str for a LeafGuard |
| * failure. The data structure is also accessible in Python. |
| */ |
| |
| class GuardDebugInfo { |
| public: |
| GuardDebugInfo( |
| bool result, |
| py::list verbose_code_parts, |
| int num_guards_executed) |
| : result(result), |
| verbose_code_parts(verbose_code_parts), |
| num_guards_executed(num_guards_executed) {} |
| |
  // This constructor is used when the guard succeeds.
| GuardDebugInfo(bool result, int num_guards_executed) |
| : result(result), num_guards_executed(num_guards_executed) {} |
| |
| GuardDebugInfo( |
| bool result, |
| std::string failed_reason, |
| int num_guards_executed) |
| : GuardDebugInfo(result, num_guards_executed) { |
| verbose_code_parts.append(failed_reason); |
| } |
| |
| std::string to_string() { |
| std::stringstream ss; |
| ss << "GuardDebugInfo(\n" |
| << "result=" << result << ",\n" |
| << "verbose_code_parts=" << verbose_code_parts << ",\n" |
| << "num_guards_executed=" << num_guards_executed << ")\n"; |
| return ss.str(); |
| } |
| |
| // Whether the guard passed or failed. |
| bool result; |
| |
  // This is a list of verbose code parts for the failed guard. When there is
  // more than one verbose code part, the recompilation reasoning infra on the
  // Python side can iterate over this list and eval each string to pinpoint
  // the exact code part that failed.
| py::list verbose_code_parts; |
| |
  // Total number of executed guards so far. This is helpful for debugging
  // whether shuffling is working.
| int num_guards_executed; |
| }; |
| |
| /** |
| * Base class for the leaf guard in the GuardManager hierarchy. |
| */ |
| class LeafGuard { |
| public: |
| LeafGuard(py::object verbose_code_parts) |
| : _verbose_code_parts(verbose_code_parts) {} |
| |
  // The check function can also be called from Python. This is useful for
  // debugging purposes.
| bool check(py::handle value) { |
| return check_nopybind(value.ptr()); |
| } |
| |
| GuardDebugInfo check_verbose(py::handle value) { |
| return check_verbose_nopybind(value.ptr()); |
| } |
| |
| virtual GuardDebugInfo check_verbose_nopybind( |
| PyObject* value) { // borrowed ref |
| bool result = check_nopybind(value); |
| if (!result) { |
| return GuardDebugInfo(result, _verbose_code_parts, 0); |
| } |
| return GuardDebugInfo(true, 0); |
| } |
| |
| py::list verbose_code_parts() { |
| return _verbose_code_parts; |
| } |
| |
| // This is on the hot path and avoids any refcounting code from pybind. This |
| // is not exposed to Python and can only be called from C++. |
| virtual bool check_nopybind(PyObject* value) = 0; |
| virtual ~LeafGuard() = default; |
| |
| private: |
| // This is set while constructing the leaf guard. This is used for identifying |
| // the cause of recompilation. |
| py::list _verbose_code_parts; |
| }; |
| |
| /** |
 * Represents a leaf guard that accepts the python guard check function. We
 * would like to have most of the guards in C++ (to avoid a Python function
 * call). But it will take some time to reach that goal. Also, there might be
 * cases where it's too tedious to write an equivalent C++ guard.
 *
 * LAMBDA_GUARD allows us to gradually move to C++. We can start with all
 * guards of type PythonLambdaGuard and incrementally move expensive guards to
 * C++.
| */ |
| class LAMBDA_GUARD : public LeafGuard { |
| public: |
| LAMBDA_GUARD(py::object guard_check_fn, py::object verbose_code_parts) |
| : LeafGuard(verbose_code_parts) { |
| if (py::isinstance<py::function>(guard_check_fn)) { |
| _guard_check_fn = py::cast<py::function>(guard_check_fn); |
| } else { |
| throw py::type_error("LAMBDA_GUARD expects (callable, str)"); |
| } |
| } |
| |
| // Runs the lambda function with the current f_locals value. |
| bool check_nopybind(PyObject* value) override { // borrowed ref |
| PyObject* x = PyObject_CallOneArg(_guard_check_fn.ptr(), value); // new ref |
| if (x == nullptr) { |
| // An exception is caught in the lambda function. |
| PyErr_Clear(); |
| return false; |
| } |
| bool result = PyObject_IsTrue(x); |
| Py_DECREF(x); |
| return result; |
| } |
| |
| private: |
| // The user provided lambda function for check_fn. |
| py::function _guard_check_fn; |
| }; |
| |
| class TYPE_MATCH : public LeafGuard { |
| public: |
| // type_id = id(type(obj)) |
| TYPE_MATCH(py::object type_id, py::object verbose_code_parts) |
| : LeafGuard(verbose_code_parts), _expected(py::cast<intptr_t>(type_id)) {} |
| |
| bool check_nopybind(PyObject* value) override { // borrowed ref |
| return Py_TYPE(value) == (void*)_expected; |
| } |
| |
| private: |
| // id of the type of the original object. |
| intptr_t _expected; |
| }; |
| |
| class ID_MATCH : public LeafGuard { |
| public: |
| // obj_id = id(obj) |
| ID_MATCH(py::object obj_id, py::object verbose_code_parts) |
| : LeafGuard(verbose_code_parts), _expected(py::cast<intptr_t>(obj_id)) {} |
| |
| bool check_nopybind(PyObject* value) override { // borrowed ref |
| return value == (void*)_expected; |
| } |
| |
| private: |
| // id of the original object. |
| intptr_t _expected; |
| }; |
| |
| class EQUALS_MATCH : public LeafGuard { |
| public: |
| EQUALS_MATCH(py::object value, py::object verbose_code_parts) |
| : LeafGuard(verbose_code_parts), |
| _value(value), |
| _value_type(Py_TYPE(value.ptr())) {} |
| |
| bool check_nopybind(PyObject* value) override { // borrowed ref |
| // Fast path - pointer equality check. |
| if (value != _value.ptr()) { |
| // Check type |
| if (Py_TYPE(value) != _value_type) { |
| return false; |
| } |
| int result = PyObject_RichCompareBool(value, _value.ptr(), Py_EQ); |
| // Check for exception |
| if (result == -1) { |
| PyErr_Clear(); |
| return false; |
| } |
| return result; |
| } |
| return true; |
| } |
| |
| private: |
  // Value to compare against. This is a py::object so that we hold on to the
  // original value and prevent garbage collection. We run EQUALS_MATCH only
  // on selected objects which do not have a high memory footprint, so holding
  // on to these objects is ok.
| py::object _value; |
| |
| // Type of the value |
| PyTypeObject* _value_type; |
| }; |
| |
| /** |
| * Relational guards compare more than one value. We implement Relational |
| * guards by capturing some state in the guard object. For example for tensor |
| * aliasing guards - tensor X is not tensor Y - we construct one leaf guard |
| * and and install it at as a leaf of two guard managers (one for X and |
| * another for Y). Therefore, this guard is run twice. In the first |
| * invocation, it saves the first value (state) and returns True. In the |
| * second invocation, it compares the saved value with the new value and |
| * returns True if they do not alias. |
| * |
| * We have to be careful about resetting in case the other guards fail and we |
| * have some state in the relational guard. This is done by virtual method |
| * reset_state(). This is called by the GuardManager whenever |
| * there is a guard failure. In the event that the Guard evals to true, we do |
| * not need to reset the state. THe check_nopybind method should itself reset |
| * the state if it was called N times. So, fast path is unaffected. |
| * |
| * There is a question on which GuardManager node calls the |
| * reset_state. This is done by registering the guard as a |
| * relational_guard_resetter on the root node, which calls the resets all the |
| * relational guards on guard evaluation to False. |
| */ |
| class RelationalGuard : public LeafGuard { |
| public: |
| RelationalGuard(py::object verbose_code_parts) |
| : LeafGuard(verbose_code_parts) {} |
| |
| // reset the relational guard state on guard failure. This is called by the |
| // guard manager. |
| virtual void reset_state() = 0; |
| }; |
| |
| class GuardManager; |
| class RootGuardManager; |
| class DictGuardManager; |
| // GuardManager can be a pointer to DictGuardManager, but at this point the |
| // compiler does not know that DictGuardManager is a derived class of |
| // GuardManager (no way to define inheritance relationships in forward |
| // declarations), so we forward declare a factory function and define it when |
| // both DictGuardManager and GuardManager are fully defined. |
| std::unique_ptr<GuardManager> make_guard_manager( |
| RootGuardManager* root, |
| py::handle example_value); |
| |
| /** |
| * Base class representing a pair of accessor and the associated guard |
| * manager. The accessor defines how to access the child value from the |
| * py::object given to the parent check function. |
| * |
| * GuardAccessors can be considered equivalent to name() method of Source |
| * objects in guards.py. In python, name() method returns a str which we can |
| * then eval in f_locals and f_globals to retrieve the actual py object. |
| * GuardAccessor serves the same purpose. The minor difference is that |
| * GuardManager is a tree structure, so a GuardAccessor just has to retrieve |
| * the value in the next level in this tree and pass it to the child |
| * GuardAccessor. |
| * |
| * GuardAccessor also owns the GuardManager associated with the retrieved |
| * value from the GuardAccessor. |
| */ |
| class GuardAccessor { |
| public: |
| GuardAccessor( |
| RootGuardManager* root, |
| py::object accessor_key, |
| py::handle example_value) |
| : _guard_manager(make_guard_manager(root, example_value)), |
| _accessor_key(std::move(accessor_key)) {} |
| |
| // Return by reference as GuardAccessor owns the GuardManager. |
| std::unique_ptr<GuardManager>& get_guard_manager() { |
| return _guard_manager; |
| } |
| |
| bool matches_key(const py::handle key) const { |
| return _accessor_key.equal(key); |
| } |
| |
| virtual bool check_nopybind(PyObject* obj) = 0; |
| virtual GuardDebugInfo check_verbose_nopybind(PyObject* obj) = 0; |
| virtual std::string repr() const = 0; |
| |
| virtual ~GuardAccessor() = default; |
| |
| protected: |
| // Guard manager corresponding to the retrieved value from the |
| // GuardAccessor. |
| std::unique_ptr<GuardManager> _guard_manager; |
  // The accessor key could be a py::str for getattr/getitem, or a
  // py::function for a lambda accessor. It is a py::object because we need to
  // keep these accessor keys alive.
| py::object _accessor_key; |
| }; |
| |
| /** |
| * GuardManager encapsulates all the guards related to a particular |
| * py::object. It is a tree structure and consists of 1) Leaf guards - Guards |
| * that are run on the user given object 2) Accessors - Guard accessors (like |
| * getattr, getitem) to access the next value in the tree hierarchy. Accessor |
| * object also holds the child GuardManager. |
| * |
 * Let's look at an example to understand how it works.
 * class Pair:
 *     int x = 1;
 *     int y = 2;
 *
 * At compile time
 * >> guard_manager = GuardManager()
 * >> guard_manager.x.add_lambda_guard(
 *        lambda x: isinstance(x, Pair),
 *        lambda x: f"expected Pair, found {type(x)}"
 *    )
 * >> guard_manager.x.add_lambda_guard(lambda x: x == 1, lambda x: f"found
 * {x}, expected 1")
 * >> guard_manager.y.add_lambda_guard(lambda x: x == 2, lambda x: f"found
 * {x}, expected 2")
 *
 * At runtime
 * >> guard_manager.check(Pair())
| * |
| * At compile time we build the tree structure. When we do `guard_manager.x`, |
| * it creates an AttrGuardAccessorNode, initializes a child guard manager with |
| * this accessor node, and adds it as a child. When we do |
| * `guard_manager.x.add_lambda_guard`, we call add_lambda_guard on the newly |
| * created guard manager and register a new leaf guard on it. |
| * |
| * At runtime, the accessor node has an important function of providing a way |
| * to access the value for the child guard. In the above example, |
| * guard_manager.x adds an AttrGuardAccessorNode with attr_name x. When check |
| * function is called, parent GuardManager calls getattr(value, "x") on its |
| * value passed to the check function to call the check function of the child |
| * guard manager. |
| * |
 * Performance optimization for fail fast - An optimization for runtime here
 * is to sort the execution of child guards depending on the failure count.
 * This ensures that we run the guards that are statistically more prone to
 * fail first. This can improve the cache lookup time when we have multiple
 * cache entries.
| */ |
| |
| class GuardManager { |
| public: |
| GuardManager() = delete; |
| GuardManager(RootGuardManager* root) : _root(root) {} |
| GuardManager(const GuardManager& m) = delete; |
| GuardManager& operator=(const GuardManager&) = delete; |
| virtual ~GuardManager() {} |
| |
| RootGuardManager* get_root() { |
| return _root; |
| } |
| |
| void add_leaf_guard(std::shared_ptr<LeafGuard> leaf_guard) { |
| _leaf_guards.emplace_back(std::move(leaf_guard)); |
| } |
| /** |
| * Adds a new guard manager with appropriate Accessor. If the accessor is |
| * already present, we just return the guard manager. |
| */ |
| template <typename GuardAccessorT> |
| GuardManager* get_child_manager( |
| const py::object& accessor_key, |
| py::handle example_value) { |
| // accessor_key type depends on the GuardAccessorT |
| // for example for GetAttrGuardAccessor - py::str name |
| |
| // Return the manager if the guard accessor exists |
| for (const auto& accessor : _accessors) { |
| if (accessor->matches_key(accessor_key)) { |
| return accessor->get_guard_manager().get(); |
| } |
| } |
| |
| // Construct a new guard accessor |
| _accessors.emplace_back( |
| std::make_unique<GuardAccessorT>(_root, accessor_key, example_value)); |
| return _accessors.back()->get_guard_manager().get(); |
| } |
| |
| // Runs the leaf guards check and then child managers check function. |
| // |
| // NB: There is some code DUPLICATION between this and check_verbose |
| // function. This is intentional. check function is in the hot path and is |
| // kept very simple. The purpose of check_verbose function is to get guard |
| // failure reasoning to understand recompilations. check_verbose function |
| // does not change the state of the guard, e.g., it does not shuffle the |
| // guards and does not change the fail count. For simplicity, we duplicate |
| // the code here. |
| virtual bool check_nopybind(PyObject* value) { // borrowed ref |
| // Iterate over leaf guards |
| for (const auto& guard : _leaf_guards) { |
| if (!guard->check_nopybind(value)) { // early exit |
| _fail_count += 1; |
| // no need of sorting, just return. |
| return false; |
| } |
| } |
| |
| // Iterate over accessors. |
| bool result = true; |
| bool failed_on_first = true; |
| for (const auto& accessor : _accessors) { |
| if (!accessor->check_nopybind(value)) { // early exit |
| _fail_count += 1; |
| result = false; |
| // need to sort, so break the loop. |
| break; |
| } |
| failed_on_first = false; |
| } |
| |
    // failed_on_first is just an optimization to avoid sorting if we are
    // failing on the first accessor itself. This is helpful when we have
    // already sorted the guards once, and don't need to sort again.
| if (!result && !failed_on_first) { |
| // Inplace sort the child guards by fail count. This moves the guard |
| // with higher fail count earlier in the queue, and enables fail fast |
| // for the next check_verbose. |
| |
      // An alternate implementation was to use a priority queue directly on
      // _accessors, but it was rejected because of the complexity of popping
      // and creating a new priority queue on each run. Moreover, this sort
      // happens on the unhappy path, when a guard fails. So, it's probably
      // ok.
| std::sort( |
| _accessors.begin(), |
| _accessors.end(), |
| [](const std::unique_ptr<GuardAccessor>& a, |
| const std::unique_ptr<GuardAccessor>& b) { |
| return a->get_guard_manager()->fail_count() > |
| b->get_guard_manager()->fail_count(); |
| }); |
| } |
| |
| return result; |
| } |
| |
| // This function has some code duplication with function check. This is |
| // deliberate to keep check function simple and fast. |
| virtual GuardDebugInfo check_verbose_nopybind( |
| PyObject* value) { // borrowed ref |
| int num_guards_executed = 0; |
| // Iterate over leaf guards |
| for (const auto& guard : _leaf_guards) { |
| const GuardDebugInfo& debug_info = guard->check_verbose_nopybind(value); |
| num_guards_executed++; |
| if (!debug_info.result) { |
| return GuardDebugInfo( |
| false, debug_info.verbose_code_parts, num_guards_executed); |
| } |
| } |
| |
| // Iterate over accessors |
| for (const auto& accessor : _accessors) { |
| const GuardDebugInfo& debug_info = |
| accessor->check_verbose_nopybind(value); |
| num_guards_executed += debug_info.num_guards_executed; |
| if (!debug_info.result) { |
| return GuardDebugInfo( |
| false, debug_info.verbose_code_parts, num_guards_executed); |
| } |
| } |
| |
| return GuardDebugInfo(true, num_guards_executed); |
| } |
| |
| int fail_count() const { |
| return _fail_count; |
| } |
| |
| // DEBUG function - Returning raw pointers because we can't return unique_ptr |
| // and pybind does not accept a unique_ptr reference return type. |
| std::vector<GuardAccessor*> get_accessors() const { |
| std::vector<GuardAccessor*> ret; |
| for (const auto& accessor : _accessors) { |
| ret.emplace_back(accessor.get()); |
| } |
| return ret; |
| } |
| |
| // DEBUG function - Returning raw pointers because we can't return unique_ptr |
| // and pybind does not accept a unique_ptr reference return type. |
| virtual std::vector<GuardManager*> get_child_managers() { |
| std::vector<GuardManager*> ret; |
| for (const auto& accessor : _accessors) { |
| ret.emplace_back(accessor->get_guard_manager().get()); |
| } |
| return ret; |
| } |
| |
| // DEBUG function - Returning raw pointers because we can't return unique_ptr |
| // and pybind does not accept a unique_ptr reference return type. |
| std::vector<LeafGuard*> get_leaf_guards() const { |
| std::vector<LeafGuard*> ret; |
| for (const auto& guard : _leaf_guards) { |
| ret.push_back(guard.get()); |
| } |
| return ret; |
| } |
| |
| protected: |
| // Keeps a count of how many times this guard manager check function returns |
| // False. This is used for sorting optimization. |
| int64_t _fail_count{0}; |
| |
| private: |
  // Root of the guard manager; this is used to install the relational guard
  // resetters.
| RootGuardManager* _root; |
| |
| // Leaf guards are the terminal guards on this object, e.g, type check on a |
| // list. These guards have to be run before any children are run. |
| // |
| // These leaf guards are not shufflable. In almost all cases, these guards |
| // will have an order, e,g., type(x) is int guard and x == 5 guard. We also |
| // expect very few leaf guards per GuardManager node. |
| // |
| // NB: Why are leaf guards shared ptr? This is primarily to enable relational |
| // guards like `tensor X is not tensor Y`. These guards require multiple |
| // values. We handle it by creating one guard object that holds state and this |
| // guard is installed in many guard managers, hence a shared ptr. |
| std::vector<std::shared_ptr<LeafGuard>> _leaf_guards; |
| |
| // GuardAccessors nodes to access the child guards. These guards are |
| // shufflable. On a guard failure, they are sorted based on their fail count |
| // to enable fail fast for the next check. |
| std::vector<std::unique_ptr<GuardAccessor>> _accessors; |
| }; |
| |
| /** |
| * RootGuardManager is the root of the guard tree. This is primarily |
| * constructed to hold the relational guard pointers so that we can reset the |
| * state of those guards on guard failure. All the other important |
| * implementation is in GuardManager class. |
| */ |
| |
| class RootGuardManager : public GuardManager { |
| public: |
  // This is the root node, so its _root member points to itself.
| RootGuardManager() : GuardManager(this) {} |
| |
| // Adds the relational guard resetter |
| void add_relational_guard_resetter( |
| std::shared_ptr<RelationalGuard> relational_guard) { |
| _relational_guard_resetters.emplace_back(std::move(relational_guard)); |
| } |
| |
| // Python visible API to check guard function. |
| bool check(py::handle value) { |
| return check_nopybind(value.ptr()); |
| } |
| |
| // Python visible API to check_verbose guard function. |
| GuardDebugInfo check_verbose(py::handle value) { |
| return check_verbose_nopybind(value.ptr()); |
| } |
| |
| // Fast check function. |
  bool check_nopybind(PyObject* value) override { // borrowed ref
    // Check [Note on GIL interaction with mutex lock] for details on why we
    // need the mutex and how it interacts with the GIL.
    PyThreadState* _save = nullptr;
    Py_UNBLOCK_THREADS; // ; is added to avoid clang-formatting
| std::lock_guard<std::mutex> lock_guard(_lock); |
| Py_BLOCK_THREADS; // ; is added to avoid clang-formatting |
| |
| if (!GuardManager::check_nopybind(value)) { |
| _reset_relational_guard_state(); |
| return false; |
| } |
| |
| // Iterate over epilogue leaf guards. |
| for (const auto& guard : _epilogue_lambda_guards) { |
| if (!guard->check_nopybind(value)) { // early exit |
| _reset_relational_guard_state(); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Fast check_verbose function. |
  GuardDebugInfo check_verbose_nopybind(
      PyObject* value) override { // borrowed ref
    // Check [Note on GIL interaction with mutex lock] for details on why we
    // need the mutex and how it interacts with the GIL.
    PyThreadState* _save = nullptr;
    Py_UNBLOCK_THREADS; // ; is added to avoid clang-formatting
| std::lock_guard<std::mutex> lock_guard(_lock); |
| Py_BLOCK_THREADS; // ; is added to avoid clang-formatting |
| |
| GuardDebugInfo debug_info = GuardManager::check_verbose_nopybind(value); |
| if (!debug_info.result) { |
| _reset_relational_guard_state(); |
| return debug_info; |
| } |
| |
| int num_guards_executed = debug_info.num_guards_executed; |
| |
| // Iterate over epilogue leaf guards |
| for (const auto& guard : _epilogue_lambda_guards) { |
| const GuardDebugInfo& tmp_debug_info = |
| guard->check_verbose_nopybind(value); |
| num_guards_executed++; |
| if (!tmp_debug_info.result) { |
| _reset_relational_guard_state(); |
| return GuardDebugInfo( |
| false, tmp_debug_info.verbose_code_parts, num_guards_executed); |
| } |
| } |
| return GuardDebugInfo(true, num_guards_executed); |
| } |
| |
| void add_epilogue_lambda_guard(std::unique_ptr<LeafGuard> leaf_guard) { |
| _epilogue_lambda_guards.emplace_back(std::move(leaf_guard)); |
| } |
| |
| // DEBUG function - Returning raw pointers because we can't return unique_ptr |
| // and pybind does not accept a unique_ptr reference return type. |
| std::vector<LeafGuard*> get_epilogue_lambda_guards() const { |
| std::vector<LeafGuard*> ret; |
| for (const auto& guard : _epilogue_lambda_guards) { |
| ret.push_back(guard.get()); |
| } |
| return ret; |
| } |
| |
| private: |
| // Reset the state of all the relational guards on failure. |
| void _reset_relational_guard_state() { |
| for (auto& guard : _relational_guard_resetters) { |
| guard->reset_state(); |
| } |
| } |
| |
| private: |
  // All the relational guards under this guard manager. We only use these
  // when the guard evaluates to False. This ensures that guard state is reset
  // on guard failure so that the next invocation is clean.
| std::vector<std::shared_ptr<RelationalGuard>> _relational_guard_resetters; |
| |
  // These guards are lambda guards, i.e., guards that lack a C++
  // implementation. For simplicity, we add these guards at the root. They
  // MUST be run after all other guard managers have finished, to ensure that
  // the epilogue guards do not step on some nonexistent getattr or getitem.
| std::vector<std::unique_ptr<LeafGuard>> _epilogue_lambda_guards; |
| |
| // [Note on GIL interaction with mutex lock] |
| // We use std::mutex to prevent multiple threads from running |
| // check/check_verbose simultaneously. This is to prevent race condition due |
| // to state changes in RelationalGuard. |
| // |
  // However, we also need to be careful about GIL interaction with the mutex.
  // There is a chance of deadlock:
  //
  //    Thread 1: has GIL, waiting for lock
  //    Thread 2: has lock, waiting for GIL
  //
  // This can happen when Thread 2 earlier acquired the mutex lock, started
  // running the critical section of the check function, and then called some
  // Python function (like LAMBDA_GUARD), reaching CPython code that checks
  // whether it should release the GIL (this typically happens after every few
  // bytecode instructions). Thread 2 can then decide to release the GIL.
  // Thread 1 can acquire the GIL and reach the mutex, where it will wait
  // forever.
| // |
| // To avoid this, each thread releases the GIL before acquiring the mutex and |
| // then acquires the GIL again after acquiring the mutex lock by using |
| // Py_BLOCK_THREADS and Py_UNBLOCK_THREADS. This avoids the deadlock. |
| std::mutex _lock; |
| }; |
| |
| std::unique_ptr<GuardManager> make_guard_manager( |
| RootGuardManager* root, |
| py::handle example_value) { |
| // TODO(janimesh) - Remove comment when DictGuardManager is introduced. |
| // // Check if example_value is a dict |
| // if (py::isinstance<py::dict>(example_value)) { |
| // return std::make_unique<DictGuardManager>(root); |
| // } |
| return std::make_unique<GuardManager>(root); |
| } |
| |
| /** |
| * Represents __getattr__ acccessor. |
| */ |
| class GetAttrGuardAccessor : public GuardAccessor { |
| public: |
| GetAttrGuardAccessor( |
| RootGuardManager* root, |
| py::str name, |
| py::handle example_value) |
| : GuardAccessor(root, name, example_value), _attr_name(name.ptr()) {} |
| |
| // NB: Intentional duplication between check_nopybind and |
| // check_verbose_nopybind. |
| bool check_nopybind(PyObject* obj) override { // borrowed ref |
| PyObject* x = PyObject_GetAttr(obj, _attr_name); // new ref |
| if (x == nullptr) { |
| // Attribute absent, clear the exception and return false. |
| PyErr_Clear(); |
| return false; |
| } |
| bool result = _guard_manager->check_nopybind(x); |
| Py_DECREF(x); |
| return result; |
| } |
| |
| GuardDebugInfo check_verbose_nopybind( |
| PyObject* obj) override { // borrowed ref |
| PyObject* x = PyObject_GetAttr(obj, _attr_name); // new ref |
| if (x == nullptr) { |
| // Attribute absent, clear the exception and return false. |
| PyErr_Clear(); |
| return GuardDebugInfo( |
| false, |
| std::string("get attr failed for attr name ") + |
| py::str(_attr_name).cast<std::string>(), |
| 0); |
| } |
| GuardDebugInfo result = _guard_manager->check_verbose_nopybind(x); |
| Py_DECREF(x); |
| return result; |
| } |
| |
| std::string repr() const override { |
    // Helpful when printing the GuardManager tree structure.
| return "GetAttrGuardAccessor(" + py::str(_attr_name).cast<std::string>() + |
| ")"; |
| } |
| |
| private: |
  // No need for a py::object here because attr_name is already passed to the
  // base class as accessor_key, which is a py::object.
| PyObject* _attr_name; |
| }; |
| |
| } // namespace |
| |
| static void* _torchinductor_pyobject_tensor_data_ptr(PyObject* obj) { |
| if (C10_UNLIKELY( |
| obj == nullptr || |
| (!THPVariable_CheckExact(obj) && !THPVariable_Check(obj)))) { |
| throw std::runtime_error( |
| "_torchinductor_pyobject_tensor_data_ptr: non-tensor input"); |
| } |
| return THPVariable_Unpack(obj).data_ptr(); |
| } |
| |
| PyObject* torch_c_dynamo_guards_init() { |
| // initialize TensorGuardsType |
| TensorGuardsType.tp_name = "torch._C._dynamo.guards.TensorGuards"; |
| TensorGuardsType.tp_basicsize = sizeof(TensorGuards); |
| TensorGuardsType.tp_itemsize = 0; |
| TensorGuardsType.tp_dealloc = (destructor)TensorGuards_dealloc; |
| TensorGuardsType.tp_flags = Py_TPFLAGS_DEFAULT; |
| TensorGuardsType.tp_doc = "Check properties of a torch.Tensor"; |
| TensorGuardsType.tp_methods = TensorGuards_methods; |
| TensorGuardsType.tp_init = (initproc)TensorGuards_init; |
| TensorGuardsType.tp_new = TensorGuards_new; |
| |
| if (PyType_Ready(&TensorGuardsType) < 0) |
| return nullptr; |
| |
| GlobalStateGuardType.tp_name = "torch._C._dynamo.guards.GlobalStateGuard"; |
| GlobalStateGuardType.tp_basicsize = sizeof(GlobalStateGuard); |
| GlobalStateGuardType.tp_itemsize = 0; |
| GlobalStateGuardType.tp_flags = Py_TPFLAGS_DEFAULT; |
| GlobalStateGuardType.tp_doc = "Guard on PyTorch global flags such as no_grad"; |
| GlobalStateGuardType.tp_methods = GlobalStateGuard_methods; |
| GlobalStateGuardType.tp_init = (initproc)GlobalStateGuard_init; |
| GlobalStateGuardType.tp_new = PyType_GenericNew; |
| |
| if (PyType_Ready(&GlobalStateGuardType) < 0) |
| return nullptr; |
| |
| auto m = PyModule_Create(&_module); |
| if (m == nullptr) |
| return nullptr; |
| |
| Py_INCREF(&TensorGuardsType); |
| if (PyModule_AddObject(m, "TensorGuards", (PyObject*)&TensorGuardsType) < 0) { |
| Py_DECREF(&TensorGuardsType); |
| Py_DECREF(m); |
| return nullptr; |
| } |
| |
| Py_INCREF(&GlobalStateGuardType); |
| if (PyModule_AddObject( |
| m, "GlobalStateGuard", (PyObject*)&GlobalStateGuardType) < 0) { |
| Py_DECREF(&GlobalStateGuardType); |
| Py_DECREF(m); |
| return nullptr; |
| } |
| |
| // We expose the address of _torchinductor_pyobject_tensor_data_ptr in order |
| // to allow manual linking in our generated TorchInductor Python bindings. |
| // While regular linking works in most cases, it does not work properly in |
| // fbcode due to janky build setup there. |
| if (PyModule_AddObject( |
| m, |
| "_torchinductor_pyobject_tensor_data_ptr", |
| PyLong_FromVoidPtr(reinterpret_cast<void*>( |
| &_torchinductor_pyobject_tensor_data_ptr))) < 0) { |
| return nullptr; |
| } |
| |
| auto py_m = py::handle(m).cast<py::module>(); |
| py::class_<GuardDebugInfo, std::unique_ptr<GuardDebugInfo>>( |
| py_m, "GuardDebugInfo") |
| .def(py::init<bool, py::list, int>()) |
| .def("__str__", &GuardDebugInfo::to_string) |
| .def_readonly("result", &GuardDebugInfo::result) |
| .def_readonly("verbose_code_parts", &GuardDebugInfo::verbose_code_parts) |
| .def_readonly( |
| "num_guards_executed", &GuardDebugInfo::num_guards_executed); |
| |
| // Leaf Guards |
| py::class_<LeafGuard, std::shared_ptr<LeafGuard>>(py_m, "LeafGuard") |
| .def("verbose_code_parts", &LeafGuard::verbose_code_parts); |
| py::class_<LAMBDA_GUARD, LeafGuard, std::shared_ptr<LAMBDA_GUARD>>( |
| py_m, "LAMBDA_GUARD") |
| .def(py::init<py::function, py::list>()) |
| .def("__call__", &LAMBDA_GUARD::check); |
| py::class_<TYPE_MATCH, LeafGuard, std::shared_ptr<TYPE_MATCH>>( |
| py_m, "TYPE_MATCH") |
| .def(py::init<py::object, py::list>()) |
| .def("__call__", &TYPE_MATCH::check); |
| py::class_<ID_MATCH, LeafGuard, std::shared_ptr<ID_MATCH>>(py_m, "ID_MATCH") |
| .def(py::init<py::object, py::list>()) |
| .def("__call__", &ID_MATCH::check); |
| py::class_<EQUALS_MATCH, LeafGuard, std::shared_ptr<EQUALS_MATCH>>( |
| py_m, "EQUALS_MATCH") |
| .def(py::init<py::object, py::list>()) |
| .def("__call__", &EQUALS_MATCH::check); |
| |
| // Guard Accessors - These are present so that we can iterate over the |
| // GuardManager hierarchy. We intentionally do not provide even an init |
| // function on these, because these should be constructed from within C++. |
| py::class_<GuardAccessor, std::unique_ptr<GuardAccessor>>( |
| py_m, "GuardAccessor") |
| .def("repr", &GuardAccessor::repr); |
| py::class_< |
| GetAttrGuardAccessor, |
| GuardAccessor, |
| std::unique_ptr<GetAttrGuardAccessor>>(py_m, "GetAttrGuardAccessor"); |
| |
| // Guard Manager - No constructor in python, python should use |
| // RootGuardManager. |
| py::class_<GuardManager, std::unique_ptr<GuardManager>>(py_m, "GuardManager") |
| // return by reference because GuardManager has the ownership of accessors |
| .def( |
| "get_accessors", |
| &GuardManager::get_accessors, |
| py::return_value_policy::reference) |
| // return by reference because GuardManager has the ownership of child |
| // managers |
| .def( |
| "get_child_managers", |
| &GuardManager::get_child_managers, |
| py::return_value_policy::reference) |
| // return by reference because GuardManager has the ownership of leaf |
| // guards |
| .def( |
| "get_leaf_guards", |
| &GuardManager::get_leaf_guards, |
| py::return_value_policy::reference) |
| .def( |
| "add_lambda_guard", |
| [](GuardManager& self, |
| py::object lambda, |
| py::object verbose_code_parts) -> void { |
| self.add_leaf_guard( |
| std::make_shared<LAMBDA_GUARD>(lambda, verbose_code_parts)); |
| }) |
| .def( |
| "add_type_match_guard", |
| [](GuardManager& self, |
| py::object value, |
| py::object verbose_code_parts) -> void { |
| self.add_leaf_guard( |
| std::make_shared<TYPE_MATCH>(value, verbose_code_parts)); |
| }) |
| .def( |
| "add_id_match_guard", |
| [](GuardManager& self, |
| py::object value, |
| py::object verbose_code_parts) -> void { |
| self.add_leaf_guard( |
| std::make_shared<ID_MATCH>(value, verbose_code_parts)); |
| }) |
| .def( |
| "add_equals_match_guard", |
| [](GuardManager& self, |
| py::object value, |
| py::object verbose_code_parts) -> void { |
| self.add_leaf_guard( |
| std::make_shared<EQUALS_MATCH>(value, verbose_code_parts)); |
| }) |
| // return by reference because C++ GuardManager has the ownership of |
| // accessors and guard managers |
| .def( |
| "getattr_manager", |
| &GuardManager::get_child_manager<GetAttrGuardAccessor>, |
| py::return_value_policy::reference); |
| |
| // Root Guard Manager |
| py::class_<RootGuardManager, GuardManager, std::unique_ptr<RootGuardManager>>( |
| py_m, "RootGuardManager") |
| .def(py::init<>()) |
| .def("check", &RootGuardManager::check) |
| .def("check_verbose", &RootGuardManager::check_verbose) |
| // return by reference because GuardManager has the ownership of leaf |
| // guards |
| .def( |
| "get_epilogue_lambda_guards", |
| &RootGuardManager::get_epilogue_lambda_guards, |
| py::return_value_policy::reference) |
| .def( |
| "add_epilogue_lambda_guard", |
| [](RootGuardManager& self, |
| py::object lambda, |
| py::object verbose_code_parts) -> void { |
| self.add_epilogue_lambda_guard( |
| std::make_unique<LAMBDA_GUARD>(lambda, verbose_code_parts)); |
| }); |
| |
| return m; |
| } |