| #pragma once |
| |
| #include <iostream> |
| #include <string> |
| #include "c10/macros/Macros.h" |
| |
| namespace c10 { |
| |
| // Semantically, a dispatch key identifies a possible "level" in our |
| // dispatch, for which a handler may be registered. Traditional |
| // backends like CPU and CUDA get dispatch keys; however, so do |
| // "wrapping" layers like Variable (for autograd handling). |
| // |
| // In implementation terms, the dispatch key identifies a specific "bit" in a |
| // DispatchKeySet. Higher bit indexes get handled by dispatching first (because |
| // we "count leading zeros" when we extract the highest priority dispatch |
| // key). |
| // |
| // NB: Only Undefined has an explicit value; every other enumerator is |
| // numbered implicitly by declaration order. Moving, inserting or removing an |
| // entry therefore changes the numeric value (and hence the bit index and |
| // priority) of every entry declared after it. |
| enum class DispatchKey : uint8_t { |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // This is not a "real" tensor id, but it exists to give us a "nullopt" |
| // element we can return for cases when a DispatchKeySet contains no elements. |
| // You can think of a more semantically accurate definition of DispatchKey as: |
| // |
| // using DispatchKey = optional<RealDispatchKey> |
| // |
| // and Undefined == nullopt. We didn't actually represent |
| // it this way because optional<RealDispatchKey> would take two |
| // words, when DispatchKey fits in eight bits. |
| |
| Undefined = 0, |
| |
| |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // A "backend" is colloquially used to refer to handlers for dispatch |
| // which actually implement the numerics of an operation in question. |
| // |
| // Due to the nature of the enum, these backends are specified in |
| // an ordered way, but for most backends this order is not semantically |
| // meaningful (e.g., it's valid to reorder these backends without changing |
| // semantics). The only situation when backend ordering is meaningful |
| // is when the backend participates in multiple dispatch with another |
| // backend; e.g., CPUTensorId and SparseCPUTensorId (sparse must have |
| // higher priority). |
| |
| // Here are backends which you traditionally think of as specifying |
| // how to implement operations on some device. |
| CPUTensorId, // registered at build/aten/src/ATen/CPUType.cpp |
| CUDATensorId, // registered at build/aten/src/ATen/CUDAType.cpp |
| HIPTensorId, // NB: I think this is not actually used, due to Note [Masquerading as CUDA] |
| MSNPUTensorId, // unused externally, but tested at test/cpp_extensions/msnpu_extension.cpp |
| XLATensorId, // lives out of tree at https://github.com/pytorch/xla |
| |
| // These are Caffe2 device types which we grandfathered into |
| // DispatchKey. |
| // TODO: Caffe2-only DispatchKeys actually should be removed from this enum |
| // and just simply be undispatchable. |
| MKLDNNTensorId, // (MKLDNN is treated as another "device" in Caffe2) |
| OpenGLTensorId, |
| OpenCLTensorId, |
| IDEEPTensorId, |
| |
| // Here are backends which specify more specialized operators |
| // based on the dtype of the tensor. |
| QuantizedCPUTensorId, // registered at build/aten/src/ATen/QuantizedCPUType.cpp |
| ComplexCPUTensorId, // lives out of tree at https://gitlab.com/pytorch-complex/pytorch-cpu-strided-complex |
| ComplexCUDATensorId, // and https://gitlab.com/pytorch-complex/pytorch-cuda-strided-complex |
| // tested at test/cpp_extensions/complex_registration_extension.cpp |
| // TODO: Remove Complex dispatch keys when Complex is moved in tree |
| |
| // This backend is to support custom RNGs; it lets you go |
| // to a different kernel if you pass in a generator that is not a |
| // traditional CPUGenerator/CUDAGenerator. To make use of this |
| // key: |
| // 1) set it as the second parameter of the at::Generator constructor call |
| // in the user-defined PRNG class. |
| // 2) use it as a dispatch key while registering custom kernels |
| // (templatized kernels specialized for the user-defined PRNG class). |
| // Intended for out-of-tree use; tested by aten/src/ATen/test/rng_test.cpp |
| CustomRNGKeyId, |
| |
| // Here are backends which specify more specialized operators |
| // based on the layout of the tensor. Note that the sparse backends |
| // are one case where ordering matters: sparse multi-dispatches with |
| // the corresponding dense tensors, and must be handled before them. |
| MkldnnCPUTensorId, // registered at build/aten/src/ATen/MkldnnCPUType.cpp |
| // NB: not to be confused with MKLDNNTensorId, which is Caffe2 only |
| SparseCPUTensorId, // registered at build/aten/src/ATen/SparseCPUType.cpp |
| SparseCUDATensorId, // registered at build/aten/src/ATen/SparseCUDAType.cpp |
| SparseHIPTensorId, // TODO: I think this is not actually used, due to Note [Masquerading as CUDA] |
| |
| // Here are reserved keys for user-defined backends; see Note [Private use TensorId]. |
| // For an example of how to use these, check out MSNPU. |
| PrivateUse1_TensorId, |
| PrivateUse2_TensorId, |
| PrivateUse3_TensorId, |
| |
| // In some situations, it is not immediately obvious what the correct |
| // backend for a function is, because the function in question doesn't |
| // have any "tensor" arguments. In this case, a BackendSelect function |
| // can be registered to implement the custom determination of the |
| // correct backend. |
| BackendSelect, |
| |
| |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ AUTOGRAD ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // All backends are oblivious to autograd; autograd is handled as a |
| // layer which happens on top of all backends. It inspects the autograd |
| // metadata of all inputs, determines what autograd metadata should be |
| // constructed by the output, and otherwise defers to the backend to |
| // actually do the numeric computation. VariableTensorId contains |
| // the bulk of this logic. |
| VariableTensorId, |
| |
| // Pre-autograd dispatch keys allow backends to override the autograd behavior |
| // (aka VariableTensorId) for operators which have a Variable kernel |
| // already registered. For example, XLA wants to define autograd for |
| // einsum directly. Registering a custom autograd implementation at the |
| // XLATensorId key won't work because we process VariableTensorId |
| // before XLATensorId. This key has higher priority and gets processed |
| // first. You generally should NOT redispatch after handling autograd |
| // here (since that would result in execution of the VariableTensorId |
| // operator, which you're trying to skip). In PreAutograd implementations, |
| // you are responsible for handling autograd yourself, or deferring to other |
| // operators which support autograd. |
| XLAPreAutograd, |
| |
| // Here are some reserved pre-autograd keys for user-defined backends; see |
| // Note [Private use TensorId]. |
| PrivateUse1_PreAutogradTensorId, |
| PrivateUse2_PreAutogradTensorId, |
| PrivateUse3_PreAutogradTensorId, |
| |
| |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| // There are a number of alternative modes which may want to handle an |
| // operator before autograd; for example, error checking, tracing, profiling |
| // or vmap. They go here. |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
| // Don't use it for anything real; its only acceptable use is within a single |
| // process test. Use it by creating a TensorImpl with this DispatchKey, and |
| // then registering operators to operate on this type id. See |
| // aten/src/ATen/test/backend_fallback_test.cpp for a usage example. |
| TESTING_ONLY_GenericWrapperTensorId, |
| |
| // TESTING: This is intended to be a generic testing tensor type id. |
| // Don't use it for anything real; its only acceptable use is within a single |
| // process test. Use it by toggling the mode on and off via |
| // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators |
| // to operate on this type id. See aten/src/ATen/test/backend_fallback_test.cpp |
| // for a usage example. |
| TESTING_ONLY_GenericModeTensorId, |
| |
| |
| |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // |
| NumDispatchKeys, // Sentinel: equals the number of enumerators declared above; keep it last |
| }; |
| |
| |
| |
| // Note [Private use TensorId] |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| // Private use tensor IDs are preallocated tensor type IDs for use in user |
| // applications. Similar to private use fields in HTTP, they can be used |
| // by end users for experimental or private applications, without needing |
| // to "standardize" the tensor ID (which would be done by submitting a PR |
| // to PyTorch to add your type ID). |
| // |
| // Private use tensor IDs are appropriate to use if you want to experiment |
| // with adding a new tensor type (without having to patch PyTorch first) or |
| // have a private, non-distributed application that needs to make use of a |
| // new tensor type. Private use tensor IDs are NOT appropriate to use for |
| // libraries intended to be distributed to further users: please contact |
| // the PyTorch developers to get a type ID registered in this case. |
| // |
| // We provide two classes of private use tensor IDs: regular TensorIds |
| // and PreAutogradTensorIds. TensorIds serve the role of ordinary "backend" |
| // TensorIds; if you were adding support for a new type of accelerator, you |
| // would use a TensorId, and reuse autograd definitions already defined in |
| // PyTorch for operators you define. PreAutogradTensorIds serve as "wrapper" |
| // TensorIds: they are most appropriate for tensors that compose multiple |
| // internal tensors, and for cases when the built-in autograd formulas for |
| // operators are not appropriate. |
| |
| // Compile-time guard: each DispatchKey value is used as a bit position in a |
| // 64-bit mask, so the sentinel (and thus every real key) must stay below 64. |
| static_assert( |
| DispatchKey::NumDispatchKeys < static_cast<DispatchKey>(64), |
| "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); |
| |
| // Returns a C string for the given dispatch key. Only declared here; the |
| // definition lives outside this header (presumably a human-readable name of |
| // the enumerator -- confirm against the implementation). |
| C10_API const char* toString(DispatchKey key); |
| |
| // Stream-insertion overload for DispatchKey; returns a std::ostream reference |
| // so that insertions can be chained. Only declared here. |
| C10_API std::ostream& operator<<(std::ostream& os, DispatchKey key); |
| |
| // Backwards-compatibility shim for the XLA repository: exposes |
| // DispatchKey::XLATensorId through a free function. |
| // (Deliberately not fixed on the XLA side yet, since more renaming may be |
| // coming in the future.) |
| static inline DispatchKey XLATensorId() { |
| constexpr DispatchKey xla_key = DispatchKey::XLATensorId; |
| return xla_key; |
| } |
| |
| } // namespace c10 |
| |
| // std::hash specialization so c10::DispatchKey can be used as a key in |
| // hash-based containers. The hash is simply the key's integer value. |
| // NB: You really shouldn't need this specialization; the enum is guaranteed |
| // to be pretty small, so a regular array indexed by the key should be |
| // acceptable. |
| namespace std { |
| template <> |
| struct hash<c10::DispatchKey> { |
| using argument_type = c10::DispatchKey; |
| using result_type = size_t; |
| |
| result_type operator()(argument_type key) const { |
| return static_cast<result_type>(key); |
| } |
| }; |
| } // namespace std |