| #pragma once |
| |
| #include <c10/macros/Macros.h> |
| #include <c10/util/ArrayRef.h> |
| #include <c10/util/Exception.h> |
| #include <ostream> |
| #include <string> |
| #include <vector> |
| |
| namespace c10 { |
| |
| // Semantically, a dispatch key identifies a possible "level" in our |
| // dispatch, for which a handler may be registered. Traditional |
| // backends like CPU and CUDA get dispatch keys; however, so do |
| // "wrapping" layers like Variable (for autograd handling). |
| // |
| // In implementation terms, the dispatch key identifies a specific "bit" in a |
| // DispatchKeySet. Higher bit indexes get handled by dispatching first (because |
| // we "count leading zeros" when we extract the highest priority dispatch |
| // key.) |
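//
// For example (a rough sketch; exact helper names may differ slightly by
// version): a DispatchKeySet containing both CPU and AutogradCPU reports
// AutogradCPU as its highest priority key, because AutogradCPU occupies a
// higher bit:
//
//   c10::DispatchKeySet ks({c10::DispatchKey::AutogradCPU,
//                           c10::DispatchKey::CPU});
//   ks.highestPriorityTypeId();  // == DispatchKey::AutogradCPU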
| // |
| // NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py |
| enum class DispatchKey : uint8_t { |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // This is not a "real" tensor id, but it exists to give us a "nullopt" |
| // element we can return for cases when a DispatchKeySet contains no elements. |
// You can think of a more semantically accurate definition of DispatchKey as:
| // |
| // using DispatchKey = optional<RealDispatchKey> |
| // |
| // and Undefined == nullopt. We didn't actually represent |
| // it this way because optional<RealDispatchKey> would take two |
| // words, when DispatchKey fits in eight bits. |
| |
| Undefined = 0, |
| |
| // Define an alias for Undefined to represent CatchAll (long term |
| // this will get eliminated, but for now it's convenient) |
| CatchAll = Undefined, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // A "backend" is colloquially used to refer to handlers for dispatch |
| // which actually implement the numerics of an operation in question. |
| // |
| // Due to the nature of the enum, these backends are specified in |
| // an ordered way, but for most backends this order is not semantically |
| // meaningful (e.g., it's valid to reorder these backends without changing |
| // semantics). The only situation when backend ordering is meaningful |
| // is when the backend participates in multiple dispatch with another |
| // backend; e.g., CPU and SparseCPU (sparse must have |
| // higher priority). |
| |
// Here are the backends that you would traditionally think of as specifying
// how to implement operations on some device.
| CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp |
| CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp |
| HIP, // NB: I think this is not actually used, due to Note [Masquerading as |
| // CUDA] |
| FPGA, // Xilinx support lives out of tree at |
| // https://gitlab.com/pytorch-complex/vitis_kernels |
| |
| // ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and |
| // https://github.com/microsoft/onnxruntime, and is also used to test general |
| // backend/extension machinery in the core. cf: |
| // - test/cpp_extensions/ort_extension.cpp |
| // - test/test_torch.py |
| // - aten/src/ATen/test/extension_backend_test.cpp |
| ORT, |
| |
| XLA, // lives out of tree at https://github.com/pytorch/xla |
| MLC, // lives out of tree at https://github.com/pytorch/MLCompute |
| Vulkan, |
| Metal, |
| XPU, // For out of tree Intel's heterogeneous computing plug-in |
| HPU, // For out of tree & closed source integration of HPU / Habana |
| VE, // For out of tree & closed source integration of SX-Aurora / NEC |
| Lazy, // For lazy tensor backends |
| |
| // A meta tensor is a tensor without any data associated with it. (They |
| // have also colloquially been referred to as tensors on the "null" device). |
| // A meta tensor can be used to dry run operators without actually doing any |
| // computation, e.g., add on two meta tensors would give you another meta |
| // tensor with the output shape and dtype, but wouldn't actually add anything. |
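// For illustration (a sketch; the factory call here is just one way to get
// a meta tensor):
//
//   auto a = at::empty({2, 3}, at::kMeta);
//   auto b = at::empty({2, 3}, at::kMeta);
//   auto c = at::add(a, b);  // meta tensor of shape [2, 3]; no data is
//                            // read or written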
| Meta, |
| |
| // Here are backends which specify more specialized operators |
| // based on the dtype of the tensor. |
| QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp |
| QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp |
| QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in |
| |
// This backend exists to support custom RNGs; it lets you go
| // to a different kernel if you pass in a generator that is not a |
| // traditional CPUGeneratorImpl/CUDAGeneratorImpl. To make use of this |
| // key: |
| // 1) set it as a second parameter of at::Generator constructor call in |
| // the user-defined PRNG class. |
| // 2) use it as a dispatch key while registering custom kernels |
| // (templatized kernels specialized for user-defined PRNG class) |
| // intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp |
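//
// A rough sketch of both steps (the names MyGeneratorImpl and my_uniform_
// are hypothetical; see rng_test.cpp for the real pattern):
//
//   struct MyGeneratorImpl : public c10::GeneratorImpl {
//     MyGeneratorImpl()
//         : c10::GeneratorImpl(
//               c10::Device(c10::DeviceType::CPU),
//               c10::DispatchKeySet(c10::DispatchKey::CustomRNGKeyId)) {}
//     ...
//   };
//
//   TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) {
//     m.impl("uniform_", my_uniform_);  // kernel templatized on MyGeneratorImpl
//   }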
| CustomRNGKeyId, |
| |
| // Here are backends which specify more specialized operators |
| // based on the layout of the tensor. Note that the sparse backends |
| // are one case where ordering matters: sparse multi-dispatches with |
| // the corresponding dense tensors, and must be handled before them. |
| MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp |
| // NB: not to be confused with MKLDNN, which is Caffe2 only |
| SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp |
| SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp |
| SparseHIP, // TODO: I think this is not actually used, due to Note |
| // [Masquerading as CUDA] |
| SparseXPU, // For out of tree Intel's heterogeneous computing plug-in |
| SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC |
| |
| SparseCsrCPU, |
| SparseCsrCUDA, |
| |
| NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor |
| |
| // Here are reserved backends for user-defined backends, see Note [Private use |
| // DispatchKey] |
| // To see some example about how to use this, check out ORT |
| PrivateUse1, |
| PrivateUse2, |
| PrivateUse3, |
| |
// Define an alias key to represent the end of backend dispatch keys.
| // If you add new backend keys after PrivateUse3, please also update it here. |
| // (But you shouldn't: private use keys should have higher precedence than |
| // all built-in keys) |
| EndOfBackendKeys = PrivateUse3, |
| |
| // In some situations, it is not immediately obvious what the correct |
// backend for a function is, because the function in question doesn't
| // have any "tensor" arguments. In this case, a BackendSelect function |
| // can be registered to implement the custom determination of the |
| // correct backend. |
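//
// As a rough sketch (my_empty_backend_select is hypothetical), a factory op
// such as at::empty has no tensor arguments, so a BackendSelect kernel can
// be registered for it that inspects the TensorOptions, computes the backend
// dispatch key, and redispatches there:
//
//   TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
//     m.impl("empty.memory_format", my_empty_backend_select);
//   }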
| BackendSelect, |
| |
| Python, |
| FuncTorchPython, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // The named dispatch key is set for any tensors with named dimensions. |
| // Although we have a dispatch key for named tensors, for historical reasons, |
| // this dispatch key doesn't do any of the substantive functionality for named |
// tensors (though, hypothetically, it could!). At the moment, it's just
| // responsible for letting us give good error messages when operations |
| // don't support named tensors. |
| // |
| // NB: If you ever consider moving named tensor functionality into |
// this dispatch key, note that it might be necessary to add another dispatch
| // key that triggers before composite operators, in case a composite operator |
| // has named dimension propagation that doesn't match that of its |
| // constituent parts. |
| Named, |
| |
| // The Conjugate dispatch key is set for any tensors that need to perform |
// conjugation.
// This is implemented at a dispatch level right before any backends run.
| Conjugate, |
| |
| // The Negative dispatch key is set for any tensors that need to perform |
// negation.
// This is implemented at a dispatch level right before any backends run.
| Negative, |
| |
| // See Note [Out-of-tree vmap+grad prototype]. The purpose of this key |
| // is to insert code after the "autograd subsystem" runs, so this key should |
| // be directly after ADInplaceOrView and all of the autograd keys. |
| FuncTorchDynamicLayerBackMode, |
| |
| // Note [ADInplaceOrView key] |
| // ADInplaceOrView key is used by inplace or view ops to register a kernel |
| // that does additional setup for future autograd computation. |
| // |
// 1. For inplace ops this kernel does a version bump
// 2. For view ops this kernel does `as_view` setup where we properly set up
//    DifferentiableViewMeta on the view tensors.
//
// For other ops it's a fallthrough kernel, since there's no extra
// work to do.
| // |
| // Note [Dream: skip VariableType kernel when requires_grad=false] |
| // |
| // In an ideal world where we can skip VariableType kernel for inputs |
| // with requires_grad=false, instead of a fallthrough kernel, we'll |
| // register a kernel shown below to all functional ops as well: |
| // torch::Tensor my_functional_op(...) { |
| // { |
| // // Note for every op in VariableType, you need to go through |
| // // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the |
| // // key to TLS excluded set. If you don't go through it at all, |
| // // inplace/view ops called through `at::` inside your backend |
| // // kernel will dispatch to ADInplaceOrView kernels and do a lot |
| // // of extra work. |
| // at::AutoDispatchBelowADInplaceOrView guard; |
| // at::redispatch::my_functional_op(...); |
| // } |
| // } |
// But this work is currently blocked, since it adds an extra dispatch
// for all ops, which is a non-trivial overhead at the model level (a few
// percent). Thus our current approach takes advantage of the fact that
// every kernel goes through the VariableType kernel first, and pulls the
// `at::AutoDispatchBelowADInplaceOrView` guard of functional ops
// up into the `VariableType` kernel. Thus we only add the extra dispatch
// to view/inplace ops to minimize its perf impact on real models.
| ADInplaceOrView, |
| |
| // Note [Alias Dispatch Key : Autograd] |
| // All backends are oblivious to autograd; autograd is handled as a |
| // layer which happens on top of all backends. It inspects the autograd |
| // metadata of all inputs, determines what autograd metadata should be |
// constructed for the output, and otherwise defers to the backend to
| // actually do the numeric computation. Autograd contains |
| // the bulk of this logic. |
| |
| // Autograd is now an alias dispatch key which by default maps to all |
| // backend-specific autograd keys. |
// Backend-specific keys allow backends to override the default kernel
// registered to the Autograd key as needed.
// For example, XLA wants to define autograd for einsum directly.
// Registering a custom autograd implementation at the XLA key won't work
// because we process Autograd before XLA; instead, the implementation goes
// under AutogradXLA, which has higher priority and gets processed first.
// You generally should NOT redispatch after handling
| // autograd here (since that would result in execution of the Autograd |
| // operator, which you're trying to skip). In AutogradXLA implementations, |
| // you are responsible for handling autograd yourself, or deferring to other |
| // operators which support autograd. |
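//
// A minimal sketch of such a registration (xla_einsum_with_backward is a
// hypothetical kernel that both computes the forward and sets up its own
// backward):
//
//   TORCH_LIBRARY_IMPL(aten, AutogradXLA, m) {
//     m.impl("einsum", xla_einsum_with_backward);
//   }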
| |
// Currently we only have backend-specific autograd keys for a handful of
// backends (listed below) and the reserved user-defined backends. All other
// in-tree backends share the AutogradOther key. We can add specific autograd
// keys for those backends upon request.
| AutogradOther, |
| AutogradCPU, |
| AutogradCUDA, |
| AutogradXLA, |
| AutogradLazy, |
| AutogradXPU, |
| AutogradMLC, |
| AutogradHPU, |
| AutogradNestedTensor, // lives out of tree at |
| // https://github.com/pytorch/nestedtensor |
| // Here are some reserved pre-autograd keys for user-defined backends, see |
| // Note [Private use DispatchKey] |
| AutogradPrivateUse1, |
| AutogradPrivateUse2, |
| AutogradPrivateUse3, |
| |
| Tracer, |
| |
| // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed |
| // and inputs are saved for backward in the post-autocast type. |
| AutocastCPU, |
| // Naughtily, AutocastCUDA is also being used for XLA. In the terminal state, |
| // it probably should get its own Autocast key |
| AutocastCUDA, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// There are a number of alternative modes which may want to run before
// autograd; for example, error checking, tracing, profiling or vmap. They
| // go here. |
| |
| FuncTorchBatched, // See Note [Out-of-tree vmap+grad prototype] |
| FuncTorchVmapMode, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // This is the dispatch key for BatchedTensorImpl, which is used to implement |
| // batching rules for vmap. |
| Batched, |
| |
| // When we are inside a vmap, all tensors dispatch on this key. |
| // See Note: [DispatchKey::VmapMode usage] for more details. |
| VmapMode, |
| |
| FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] |
| FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
| // Don't use it for anything real; its only acceptable use is within a single |
| // process test. Use it by creating a TensorImpl with this DispatchKey, and |
| // then registering operators to operate on this type id. See |
| // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example. |
| TESTING_ONLY_GenericWrapper, |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
// Don't use it for anything real; its only acceptable use is within a single
| // process test. Use it by toggling the mode on and off via |
| // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators |
| // to operate on this type id. See |
| // aten/src/ATen/core/dispatch/backend_fallback_test.cpp |
| // for a usage example |
| TESTING_ONLY_GenericMode, |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| NumDispatchKeys, // Sentinel, end of runtime keys. |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// Alias dispatch keys are synthetic dispatch keys which map to multiple
// runtime dispatch keys. Alias keys have precedence, but they are always
// lower precedence than runtime keys. You can register a kernel to an
// alias key; the kernel might be populated into the mapped runtime keys
// during dispatch table computation.
// If a runtime dispatch key has multiple kernels from alias keys, which
// kernel wins is decided based on the precedence of the alias keys (but
// runtime keys always have precedence over alias keys).
// Alias keys won't be directly called during runtime.
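//
// For illustration (a sketch; myns::my_op and my_composite_kernel are
// hypothetical), registering to an alias key such as
// CompositeImplicitAutograd fans the kernel out to the mapped runtime keys
// that don't already have a more specific kernel:
//
//   TORCH_LIBRARY_IMPL(myns, CompositeImplicitAutograd, m) {
//     m.impl("my_op", my_composite_kernel);
//   }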
| |
| // See Note [Alias Dispatch Key : Autograd] |
| Autograd, |
| CompositeImplicitAutograd, // registered at |
| // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp |
| CompositeExplicitAutograd, // registered at |
| // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp |
| |
// Define an alias key to represent the end of alias dispatch keys.
| // If you add new alias keys after Autograd, please also update it here. |
| EndOfAliasKeys = CompositeExplicitAutograd, // |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
// These aliases exist for backwards compatibility reasons; they shouldn't
// be used.
| CPUTensorId = CPU, |
| CUDATensorId = CUDA, |
| DefaultBackend = CompositeExplicitAutograd, |
| PrivateUse1_PreAutograd = AutogradPrivateUse1, |
| PrivateUse2_PreAutograd = AutogradPrivateUse2, |
| PrivateUse3_PreAutograd = AutogradPrivateUse3, |
| Autocast = AutocastCUDA, |
| }; |
| |
| // Note [Private use DispatchKey] |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| // Private use tensor IDs are preallocated tensor type IDs for use in user |
| // applications. Similar to private use fields in HTTP, they can be used |
| // by end users for experimental or private applications, without needing |
| // to "standardize" the tensor ID (which would be done by submitting a PR |
| // to PyTorch to add your type ID). |
| // |
| // Private use tensor IDs are appropriate to use if you want to experiment |
| // with adding a new tensor type (without having to patch PyTorch first) or |
| // have a private, non-distributed application that needs to make use of a |
| // new tensor type. Private use tensor IDs are NOT appropriate to use for |
| // libraries intended to be distributed to further users: please contact |
| // the PyTorch developers to get a type ID registered in this case. |
| // |
// We provide two classes of private use tensor id: regular DispatchKeys
| // and Autograd DispatchKeys. DispatchKeys serve the role of ordinary "backend" |
| // DispatchKeys; if you were adding support for a new type of accelerator, you |
| // would use a backend DispatchKey, and ideally automatically reuse |
| // AutogradOther definitions already defined in PyTorch. AutogradPrivateUse |
| // DispatchKeys serve as "wrapper" DispatchKeys: they are only necessary for |
| // tensors that compose multiple internal tensors, and for cases when the |
| // built-in autograd formulas for operators are not appropriate. |
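//
// For illustration, a hedged sketch of how an out-of-tree extension might
// use both classes of key (my_add and my_autograd_add are hypothetical):
//
//   TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
//     m.impl("add.Tensor", my_add);           // backend numerics
//   }
//   TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
//     m.impl("add.Tensor", my_autograd_add);  // custom autograd wrapper
//   }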
| |
| static_assert( |
| static_cast<uint8_t>(DispatchKey::NumDispatchKeys) < 64, |
| "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); |
| |
| C10_API const char* toString(DispatchKey); |
| C10_API std::ostream& operator<<(std::ostream&, DispatchKey); |
| |
// Returns the autograd dispatch key corresponding to a backend key, e.g.
// CPU -> AutogradCPU (backends without a dedicated autograd key map to
// AutogradOther).
C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t);
| |
| // Parses a string into a dispatch key. |
| // If the string cannot be correctly parsed, throws an exception. |
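// For example, parseDispatchKey("CPU") is expected to return
// DispatchKey::CPU (i.e. it round-trips with toString above).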
| C10_API c10::DispatchKey parseDispatchKey(const std::string& k); |
| |
| // These are some convenience identifiers for dispatch keys which are |
| // shorter to type than their long counterparts. Note that some of these |
| // dispatch keys directly correspond to DeviceType; and most APIs that |
| // accept DispatchKey also accept DeviceType; e.g., |
| // torch::dispatch(torch::kCPU, ...) is also valid. |
| constexpr DispatchKey kAutograd = DispatchKey::Autograd; |
| |
| // Check if a DispatchKey is an alias mapping to other runtime keys. |
| inline bool isAliasDispatchKey(DispatchKey k) { |
| return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys; |
| } |
| } // namespace c10 |
| |
| namespace torch { |
| // Expose the constant, but not the TYPE (DispatchKey is an implementation |
| // detail!) |
| using c10::kAutograd; |
| } // namespace torch |
| |
| // NB: You really shouldn't use this instance; this enum is guaranteed |
| // to be pretty small so a regular array should be acceptable. |
| namespace std { |
| template <> |
| struct hash<c10::DispatchKey> { |
| typedef size_t result_type; |
| typedef c10::DispatchKey argument_type; |
| |
| size_t operator()(c10::DispatchKey x) const { |
| return static_cast<size_t>(x); |
| } |
| }; |
| } // namespace std |