#include <torch/csrc/python_headers.h>

#include <c10/core/DeviceType.h>
#include <c10/core/InferenceMode.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/autograd/autograd.h>
#include <torch/csrc/autograd/grad_mode.h>
#include <ATen/autocast_mode.h>
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/autograd/python_function.h>
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/autograd/saved_variable.h>
#include <torch/csrc/autograd/python_saved_variable_hooks.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/autograd/utils/python_arg_parsing.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#include <c10/core/ScalarType.h>

#include <set>

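// RAII guard that adds c10::DispatchKey::Python to the thread-local excluded
// set for as long as an instance is alive, bypassing Python-level dispatch
// (e.g. __torch_dispatch__ handlers) in that scope.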
struct DisableTorchDispatch {
  DisableTorchDispatch() : guard_(c10::DispatchKey::Python) {}
  c10::impl::ExcludeDispatchKeyGuard guard_;
};

PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
  using namespace torch::autograd::profiler;
  auto tensor_module = THPObjectPtr(PyImport_ImportModule("torch._tensor"));
  if (!tensor_module)
    return nullptr;

| // NOTE: "leaks" THPVariableClass |
| THPVariableClass = PyObject_GetAttrString(tensor_module, "Tensor"); |
| if (!THPVariableClass) |
| return nullptr; |
| |
  auto autograd_module = THPObjectPtr(PyImport_ImportModule("torch.autograd"));
  if (!autograd_module)
    return nullptr;

  // NOTE: "leaks" Function
  THPFunctionClass = PyObject_GetAttrString(autograd_module, "Function");
  if (!THPFunctionClass)
    return nullptr;

  auto torch_C_module = THPObjectPtr(PyImport_ImportModule("torch._C"));
  if (!torch_C_module)
    return nullptr;
  auto _C_m = py::handle(torch_C_module).cast<py::module>();
  auto m = _C_m.def_submodule("_autograd", "autograd bindings");
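  // Everything defined on `m` below is importable from Python as
  // torch._C._autograd.<name>.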

  auto parameter_module = THPObjectPtr(PyImport_ImportModule("torch.nn.parameter"));
  if (!parameter_module)
    return nullptr;

  // NOTE: "leaks" ParameterClass
  ParameterClass = PyObject_GetAttrString(parameter_module, "Parameter");
  if (!ParameterClass)
    return nullptr;

  py::enum_<ProfilerState>(m, "ProfilerState")
      .value("Disabled", ProfilerState::Disabled)
      .value("CPU", ProfilerState::CPU)
      .value("CUDA", ProfilerState::CUDA)
      .value("NVTX", ProfilerState::NVTX)
      .value("KINETO", ProfilerState::KINETO)
      .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK);

  py::enum_<ActivityType>(m, "ProfilerActivity")
      .value("CPU", ActivityType::CPU)
      .value("CUDA", ActivityType::CUDA);

  py::class_<ProfilerConfig>(m, "ProfilerConfig")
      .def(py::init<
          ProfilerState,
          bool, /* record_input_shapes */
          bool, /* profile_memory */
          bool, /* with_stack */
          bool, /* with_flops */
          bool /* with_modules */
          >());

  py::class_<LegacyEvent>(m, "ProfilerEvent")
      .def("kind", &LegacyEvent::kindStr)
      .def("name", [](const LegacyEvent& e) { return e.name(); })
      .def("thread_id", &LegacyEvent::threadId)
      .def("fwd_thread_id", &LegacyEvent::fwdThreadId)
      .def("device", &LegacyEvent::device)
      .def("cpu_elapsed_us", &LegacyEvent::cpuElapsedUs)
      .def("cuda_elapsed_us", &LegacyEvent::cudaElapsedUs)
      .def("has_cuda", &LegacyEvent::hasCuda)
      .def("shapes", &LegacyEvent::shapes)
      .def("cpu_memory_usage", &LegacyEvent::cpuMemoryUsage)
      .def("cuda_memory_usage", &LegacyEvent::cudaMemoryUsage)
      .def("handle", &LegacyEvent::handle)
      .def("node_id", &LegacyEvent::nodeId)
      .def("is_remote", &LegacyEvent::isRemote)
      .def("sequence_nr", &LegacyEvent::sequenceNr)
      .def("stack", &LegacyEvent::stack)
      .def("scope", &LegacyEvent::scope)
      .def("correlation_id", &LegacyEvent::correlationId)
      .def("start_us", &LegacyEvent::cpuUs)
      .def("flops", &LegacyEvent::flops)
      .def("is_async", &LegacyEvent::isAsync);

  py::enum_<c10::DeviceType>(m, "DeviceType")
      .value("CPU", c10::DeviceType::CPU)
      .value("CUDA", c10::DeviceType::CUDA)
      .value("MKLDNN", c10::DeviceType::MKLDNN)
      .value("OPENGL", c10::DeviceType::OPENGL)
      .value("OPENCL", c10::DeviceType::OPENCL)
      .value("IDEEP", c10::DeviceType::IDEEP)
      .value("HIP", c10::DeviceType::HIP)
      .value("FPGA", c10::DeviceType::FPGA)
      .value("MSNPU", c10::DeviceType::MSNPU)
      .value("XLA", c10::DeviceType::XLA)
      .value("Lazy", c10::DeviceType::Lazy)
      .value("MLC", c10::DeviceType::MLC)
      .value("HPU", c10::DeviceType::HPU)
      .value("Meta", c10::DeviceType::Meta)
      .value("Vulkan", c10::DeviceType::Vulkan)
      .value("Metal", c10::DeviceType::Metal);

  py::class_<KinetoEvent>(m, "_KinetoEvent")
      // name of the event
      .def("name", [](const KinetoEvent& e) { return e.name(); })
      // PyTorch thread id of the start callback
      .def("start_thread_id",
          [](const KinetoEvent& e) { return e.startThreadId(); })
      // PyTorch thread id of the end callback
      .def("end_thread_id",
          [](const KinetoEvent& e) { return e.endThreadId(); })
      // for events of scope BACKWARD_FUNCTION - PyTorch thread id
      // of the corresponding forward op
      .def("fwd_thread_id",
          [](const KinetoEvent& e) { return e.fwdThreadId(); })
      // together with fwd_thread_id, used to uniquely identify
      // the forward op
      .def("sequence_nr", [](const KinetoEvent& e) { return e.sequenceNr(); })
      // absolute start time (since the Unix epoch) in us
      .def("start_us", [](const KinetoEvent& e) { return e.startUs(); })
      // duration in us
      .def("duration_us", [](const KinetoEvent& e) { return e.durationUs(); })
      // used for correlation between high-level PyTorch events
      // and low-level device events
      .def("correlation_id",
          [](const KinetoEvent& e) { return e.correlationId(); })
      // shapes of input tensors
      .def("shapes", [](const KinetoEvent& e) {
        if (e.hasShapes()) {
          return e.shapes();
        } else {
          return std::vector<std::vector<int64_t>>();
        }
      })
      .def("dtypes", [](const KinetoEvent& e) {
        if (e.hasTypes()) {
          return e.dtypes();
        } else {
          return std::vector<std::string>();
        }
      })
      // stack traces of the PyTorch CPU events
      .def("stack", [](const KinetoEvent& e) {
        if (e.hasStack()) {
          return e.stack();
        } else {
          return std::vector<std::string>();
        }
      })
      // type of the RecordFunction that generated a PyTorch CPU event
      // (op, TorchScript function, user label, etc.)
      .def("scope", [](const KinetoEvent& e) { return e.scope(); })
      // device number; for CPU - process id
      .def("device_index", [](const KinetoEvent& e) { return e.deviceIndex(); })
      // for CUDA - stream id, for CPU - start thread id
      .def("device_resource_id",
          [](const KinetoEvent& e) { return e.deviceResourceId(); })
      // device type
      .def("device_type", [](const KinetoEvent& e) { return e.deviceType(); })
      // correlation id of a linked event
      .def("linked_correlation_id",
          [](const KinetoEvent& e) { return e.linkedCorrelationId(); })
      // estimated number of floating point operations for the op
      .def("flops", [](const KinetoEvent& e) { return e.flops(); })
      // whether this is an async event
      .def("is_async", [](const KinetoEvent& e) { return e.isAsync(); })
      .def("cuda_elapsed_us", &KinetoEvent::cudaElapsedUs)
      .def("nbytes", [](const KinetoEvent& e) { return e.nBytes(); });

  py::class_<ProfilerResult>(m, "_ProfilerResult")
      .def("trace_start_us", &ProfilerResult::trace_start_us)
      .def("events", &ProfilerResult::events)
#ifdef USE_KINETO
      .def("save", &ProfilerResult::save)
#endif // USE_KINETO
      ;

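  // Typical flow from Python: _prepare_profiler -> _enable_profiler, run the
  // workload, then _disable_profiler, which returns a _ProfilerResult whose
  // events() yields the _KinetoEvent objects bound above.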
| m.def("_enable_profiler", enableProfiler); |
| m.def("_disable_profiler", disableProfiler); |
| m.def("_prepare_profiler", prepareProfiler); |

  m.def("_add_metadata_json",
      [](const std::string& key, const std::string& value) {
#ifdef USE_KINETO
        addMetadataJson(key, value);
#else
        LOG(WARNING) << "Adding profiling metadata requires using "
                     << "torch.profiler with Kineto support (USE_KINETO=1)";
#endif // USE_KINETO
      });

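  // Illustrative Python check: torch._C._autograd.kineto_available() reports
  // whether this build was compiled with Kineto (USE_KINETO).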
| m.def("kineto_available", []() { |
| #ifdef USE_KINETO |
| return true; |
| #else |
| return false; |
| #endif |
| }); |
| |
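  // CUDA profiling is only advertised when Kineto is built with CUPTI support
  // and a CUDA (non-HIP) GPU is present; CPU profiling is always available.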
| m.def("_supported_activities", []() { |
| std::set<ActivityType> activities {ActivityType::CPU}; |
| #if defined(USE_KINETO) && !defined(LIBKINETO_NOCUPTI) |
| if (at::getNumGPUs() > 0 && !at::hasHIP()) { |
| activities.insert(ActivityType::CUDA); |
| } |
| #endif |
| return activities; |
| }); |

  m.def("_enable_profiler_legacy", enableProfilerLegacy);
  py::class_<ProfilerDisableOptions>(m, "_ProfilerDisableOptions")
      .def(py::init<bool, bool>());
  m.def(
      "_disable_profiler_legacy",
      disableProfilerLegacy,
      py::arg("profiler_disable_options") = ProfilerDisableOptions());
  m.def("_profiler_enabled", profilerEnabled);
  m.def("_enable_record_function", [](bool enable) {
    at::enableRecordFunction(enable);
  });
  m.def("_set_empty_test_observer", [](bool is_global, double sampling_prob) {
    auto cb = at::RecordFunctionCallback(nullptr)
        .needsInputs(true)
        .samplingProb(sampling_prob);
    if (is_global) {
      at::addGlobalCallback(cb);
    } else {
      at::addThreadLocalCallback(cb);
    }
  });
  m.def("_clear_callbacks", []() {
    at::clearCallbacks();
  });
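  // Illustrative Python usage (hypothetical hooks that offload saved tensors
  // to CPU during the forward pass and bring them back during backward):
  //   torch._C._autograd._register_saved_tensors_default_hooks(
  //       lambda t: t.cpu(), lambda t: t.cuda())
  //   torch._C._autograd._reset_saved_tensors_default_hooks()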
| m.def("_register_saved_tensors_default_hooks", [](py::function &pack_hook, py::function &unpack_hook) { |
| torch::autograd::PyDefaultSavedVariableHooks::set_hooks(pack_hook, unpack_hook); |
| }); |
| m.def("_reset_saved_tensors_default_hooks", []() { |
| torch::autograd::PyDefaultSavedVariableHooks::reset_hooks(); |
| }); |

  py::class_<c10::InferenceMode>(_C_m, "_InferenceMode")
      .def(py::init<bool>());

  py::class_<DisableTorchDispatch>(_C_m, "_DisableTorchDispatch")
      .def(py::init<>());

  py::class_<torch::autograd::SavedVariable>(m, "SavedTensor")
      .def(py::init([]() -> torch::autograd::SavedVariable {
        TORCH_CHECK(false,
            "Trying to create a SavedTensor object from Python is forbidden.");
      }))
      .def("register_hooks",
          [](torch::autograd::SavedVariable& s,
             py::function& pack_hook, py::function& unpack_hook) {
            // the hooks are py::function (a py::object), so pybind11 bumps
            // their refcounts and keeps them alive for us
            s.register_hooks(std::make_unique<
                torch::autograd::PySavedVariableHooks>(pack_hook, unpack_hook));
          });

  Py_RETURN_TRUE;
}

namespace torch { namespace autograd {

static PyObject* set_autocast_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!PyBool_Check(arg)) {
    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
  }
  at::autocast::set_enabled(arg == Py_True);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* is_autocast_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (at::autocast::is_enabled()) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

static PyObject* set_autocast_cpu_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!PyBool_Check(arg)) {
    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
  }
  at::autocast::set_cpu_enabled(arg == Py_True);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* is_autocast_cpu_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (at::autocast::is_cpu_enabled()) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

static PyObject* set_autocast_gpu_dtype(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!THPDtype_Check(arg)) {
    throw TypeError(
        "dtype must be a torch.dtype (got %s)", Py_TYPE(arg)->tp_name);
  }
  at::ScalarType targetType = reinterpret_cast<THPDtype*>(arg)->scalar_type;
  at::autocast::set_autocast_gpu_dtype(targetType);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* set_autocast_cpu_dtype(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!THPDtype_Check(arg)) {
    throw TypeError(
        "dtype must be a torch.dtype (got %s)", Py_TYPE(arg)->tp_name);
  }
  at::ScalarType targetType = reinterpret_cast<THPDtype*>(arg)->scalar_type;
  at::autocast::set_autocast_cpu_dtype(targetType);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static const char* scalarTypeName(const at::ScalarType type) {
  switch (type) {
#define DEFINE_CASE(ctype, name) \
  case at::ScalarType::name:     \
    return #ctype;
    AT_FORAUTOCAST_SCALAR_TYPES(DEFINE_CASE)
#undef DEFINE_CASE
    default:
      throw std::runtime_error("unknown scalar type for autocast");
  }
}

static PyObject* get_autocast_gpu_dtype(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  at::ScalarType current_dtype = at::autocast::get_autocast_gpu_dtype();
  return THPDtype_New(current_dtype, scalarTypeName(current_dtype));
  END_HANDLE_TH_ERRORS
}

static PyObject* get_autocast_cpu_dtype(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  at::ScalarType current_dtype = at::autocast::get_autocast_cpu_dtype();
  return THPDtype_New(current_dtype, scalarTypeName(current_dtype));
  END_HANDLE_TH_ERRORS
}

static PyObject* clear_autocast_cache(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  at::autocast::clear_cache();
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* autocast_increment_nesting(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  return THPUtils_packInt64(at::autocast::increment_nesting());
  END_HANDLE_TH_ERRORS
}

static PyObject* autocast_decrement_nesting(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  return THPUtils_packInt64(at::autocast::decrement_nesting());
  END_HANDLE_TH_ERRORS
}

static PyObject* set_grad_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!PyBool_Check(arg)) {
    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
  }
  GradMode::set_enabled(arg == Py_True);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* is_grad_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (GradMode::is_enabled()) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

static PyObject* is_inference_mode_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (c10::InferenceMode::is_enabled()) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}


static PyObject* set_anomaly_mode_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (!PyBool_Check(arg)) {
    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
  }
  AnomalyMode::set_enabled(arg == Py_True);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

static PyObject* is_anomaly_mode_enabled(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  if (AnomalyMode::is_enabled()) {
    Py_RETURN_TRUE;
  } else {
    Py_RETURN_FALSE;
  }
  END_HANDLE_TH_ERRORS
}

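// These bindings back the forward-mode AD dual-level bookkeeping exposed in
// Python via torch.autograd.forward_ad; levels are managed as a stack and are
// expected to be exited in LIFO order.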
static PyObject* python_enter_dual_level(PyObject* _unused, PyObject* arg) {
  HANDLE_TH_ERRORS
  // It is unlikely that the depth of forward nesting will overflow int64_t,
  // so we simply static_cast here.
  return utils::wrap(static_cast<int64_t>(forward_ad::enter_dual_level()));
  END_HANDLE_TH_ERRORS
}

static PyObject* python_exit_dual_level(
    PyObject* _unused, PyObject* args, PyObject* kwargs) {
  HANDLE_TH_ERRORS
  static PythonArgParser parser({"exit_dual_level(int64_t level)"});

  ParsedArgs<1> parsed_args;
  auto _r = parser.parse(args, kwargs, parsed_args);

  auto idx = _r.toInt64(0);
  // Make sure the given index is valid before casting it to unsigned
  TORCH_CHECK(idx >= 0, "Dual level must be a non-negative number.");
  forward_ad::exit_dual_level(static_cast<uint64_t>(idx));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

// autograd methods on torch._C
static PyMethodDef methods[] = { // NOLINT
    {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr},
    {"is_grad_enabled", is_grad_enabled, METH_NOARGS, nullptr},
    {"is_inference_mode_enabled", is_inference_mode_enabled, METH_NOARGS, nullptr},
    {"set_autocast_enabled", set_autocast_enabled, METH_O, nullptr},
    {"is_autocast_enabled", is_autocast_enabled, METH_NOARGS, nullptr},
    {"clear_autocast_cache", clear_autocast_cache, METH_NOARGS, nullptr},
    {"set_autocast_cpu_enabled", set_autocast_cpu_enabled, METH_O, nullptr},
    {"is_autocast_cpu_enabled", is_autocast_cpu_enabled, METH_NOARGS, nullptr},
    {"set_autocast_cpu_dtype", set_autocast_cpu_dtype, METH_O, nullptr},
    {"get_autocast_cpu_dtype", get_autocast_cpu_dtype, METH_NOARGS, nullptr},
    {"set_autocast_gpu_dtype", set_autocast_gpu_dtype, METH_O, nullptr},
    {"get_autocast_gpu_dtype", get_autocast_gpu_dtype, METH_NOARGS, nullptr},
    {"autocast_increment_nesting", autocast_increment_nesting, METH_NOARGS, nullptr},
    {"autocast_decrement_nesting", autocast_decrement_nesting, METH_NOARGS, nullptr},
    {"set_anomaly_enabled", set_anomaly_mode_enabled, METH_O, nullptr},
    {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr},
    {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr},
    {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr},
    {nullptr, nullptr, 0, nullptr}};

PyMethodDef* python_functions() {
  return methods;
}

}} // namespace torch::autograd