Renaming CAFFE2_API to TORCH_API (#49496)

Summary:
Since caffe2 and torch have been consolidated, CAFFE2_API should be merged with TORCH_API. Addresses a TODO.

Manually edited the remaining references to the removed `CAFFE2_API` macro in:
* `CONTRIBUTING.md`
* `caffe2/proto/CMakeLists.txt`
* `cmake/ProtoBuf.cmake`
* `c10/macros/Export.h`
* `torch/csrc/WindowsTorchApiMacro.h`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/49496

Reviewed By: malfet, samestep

Differential Revision: D25600726

Pulled By: janeyx99

fbshipit-source-id: 7e068d959e397ac183c097d7e9a9afeca5ddd782
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 51e9d13..0933180 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -754,7 +754,7 @@
 1. Symbols are NOT exported by default on Windows; instead, you have to explicitly
    mark a symbol as exported/imported in a header file with `__declspec(dllexport)` /
    `__declspec(dllimport)`. We have codified this pattern into a set of macros
-   which follow the convention `*_API`, e.g., `CAFFE2_API` inside Caffe2 and ATen.
+   which follow the convention `*_API`, e.g., `TORCH_API` inside Caffe2, ATen, and Torch.
    (Every separate shared library needs a unique macro name, because symbol visibility
    is on a per shared library basis. See c10/macros/Macros.h for more details.)
 
diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h
index 04119d1..eceb338 100644
--- a/aten/src/ATen/CPUGeneratorImpl.h
+++ b/aten/src/ATen/CPUGeneratorImpl.h
@@ -7,7 +7,7 @@
 
 namespace at {
 
-struct CAFFE2_API CPUGeneratorImpl : public c10::GeneratorImpl {
+struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
   // Constructors
   CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
   ~CPUGeneratorImpl() = default;
@@ -36,8 +36,8 @@
 
 namespace detail {
 
-CAFFE2_API const Generator& getDefaultCPUGenerator();
-CAFFE2_API Generator createCPUGenerator(uint64_t seed_val = default_rng_seed_val);
+TORCH_API const Generator& getDefaultCPUGenerator();
+TORCH_API Generator createCPUGenerator(uint64_t seed_val = default_rng_seed_val);
 
 } // namespace detail
 
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 0597f99..276bf16 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -21,7 +21,7 @@
 
 class Tensor;
 
-class CAFFE2_API Context {
+class TORCH_API Context {
  public:
   Context();
 
@@ -225,13 +225,13 @@
   std::unique_ptr<THHState, void(*)(THHState*)> thh_state;
 };
 
-CAFFE2_API Context& globalContext();
+TORCH_API Context& globalContext();
 
 static inline void init() {
   globalContext();
 }
 
-CAFFE2_API Allocator* getCPUAllocator();
+TORCH_API Allocator* getCPUAllocator();
 
 static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(Backend p, ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h
index 8458e6e..a34d4b3 100644
--- a/aten/src/ATen/DLConvertor.h
+++ b/aten/src/ATen/DLConvertor.h
@@ -10,10 +10,10 @@
 
 namespace at {
 
-CAFFE2_API ScalarType toScalarType(const DLDataType& dtype);
-CAFFE2_API DLManagedTensor* toDLPack(const Tensor& src);
-CAFFE2_API Tensor fromDLPack(const DLManagedTensor* src);
-CAFFE2_API DLDataType getDLDataType(const Tensor& t);
-CAFFE2_API DLContext getDLContext(const Tensor& tensor, const int64_t& device_id);
+TORCH_API ScalarType toScalarType(const DLDataType& dtype);
+TORCH_API DLManagedTensor* toDLPack(const Tensor& src);
+TORCH_API Tensor fromDLPack(const DLManagedTensor* src);
+TORCH_API DLDataType getDLDataType(const Tensor& t);
+TORCH_API DLContext getDLContext(const Tensor& tensor, const int64_t& device_id);
 
 } //namespace at
diff --git a/aten/src/ATen/DynamicLibrary.h b/aten/src/ATen/DynamicLibrary.h
index ea919a7..089503c 100644
--- a/aten/src/ATen/DynamicLibrary.h
+++ b/aten/src/ATen/DynamicLibrary.h
@@ -8,11 +8,11 @@
 struct DynamicLibrary {
   AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);
 
-  CAFFE2_API DynamicLibrary(const char* name);
+  TORCH_API DynamicLibrary(const char* name);
 
-  CAFFE2_API void* sym(const char* name);
+  TORCH_API void* sym(const char* name);
 
-  CAFFE2_API ~DynamicLibrary();
+  TORCH_API ~DynamicLibrary();
 
  private:
   void* handle = nullptr;
diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 41cc320..b03c293 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -9,14 +9,14 @@
 
 namespace at {
 
-CAFFE2_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
-CAFFE2_API std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+TORCH_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
+TORCH_API std::tuple<std::vector<int64_t>, std::vector<int64_t>>
 inferExpandGeometry(
     IntArrayRef tensor_sizes,
     IntArrayRef tensor_strides,
     IntArrayRef sizes);
 
-CAFFE2_API std::vector<int64_t> infer_dense_strides(
+TORCH_API std::vector<int64_t> infer_dense_strides(
     IntArrayRef tensor_sizes,
     IntArrayRef tensor_strides);
 
diff --git a/aten/src/ATen/MemoryOverlap.h b/aten/src/ATen/MemoryOverlap.h
index 5cd4eab..f7437c6 100644
--- a/aten/src/ATen/MemoryOverlap.h
+++ b/aten/src/ATen/MemoryOverlap.h
@@ -15,19 +15,19 @@
 
 enum class MemOverlapStatus { FULL, PARTIAL, NO, TOO_HARD };
 
-CAFFE2_API MemOverlap has_internal_overlap(const Tensor& t);
-CAFFE2_API MemOverlap has_internal_overlap(TensorImpl* t);
+TORCH_API MemOverlap has_internal_overlap(const Tensor& t);
+TORCH_API MemOverlap has_internal_overlap(TensorImpl* t);
 
-CAFFE2_API void assert_no_internal_overlap(const Tensor& t);
-CAFFE2_API void assert_no_internal_overlap(TensorImpl* t);
+TORCH_API void assert_no_internal_overlap(const Tensor& t);
+TORCH_API void assert_no_internal_overlap(TensorImpl* t);
 
-CAFFE2_API MemOverlapStatus get_overlap_status(const Tensor& a, const Tensor& b);
-CAFFE2_API MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b);
+TORCH_API MemOverlapStatus get_overlap_status(const Tensor& a, const Tensor& b);
+TORCH_API MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b);
 
-CAFFE2_API void assert_no_partial_overlap(const Tensor& a, const Tensor& b);
+TORCH_API void assert_no_partial_overlap(const Tensor& a, const Tensor& b);
 void assert_no_partial_overlap(TensorImpl* a, TensorImpl* b);
 
-CAFFE2_API void assert_no_overlap(const Tensor& a, const Tensor& b);
-CAFFE2_API void assert_no_overlap(TensorImpl* a, TensorImpl* b);
+TORCH_API void assert_no_overlap(const Tensor& a, const Tensor& b);
+TORCH_API void assert_no_overlap(TensorImpl* a, TensorImpl* b);
 
 }
diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h
index 47dfd58..af55841 100644
--- a/aten/src/ATen/NamedTensorUtils.h
+++ b/aten/src/ATen/NamedTensorUtils.h
@@ -17,8 +17,8 @@
 
 // Converts dim to an positional index. Errors if `dim` cannot be used to
 // refer to any dimension of tensor.
-CAFFE2_API int64_t dimname_to_position(const Tensor& tensor, Dimname dim);
-CAFFE2_API std::vector<int64_t> dimnames_to_positions(const Tensor& tensor, DimnameList dims);
+TORCH_API int64_t dimname_to_position(const Tensor& tensor, Dimname dim);
+TORCH_API std::vector<int64_t> dimnames_to_positions(const Tensor& tensor, DimnameList dims);
 
 // Unifies two DimnameList to produce a third. This is useful for implementing
 // the named inference rule for binary broadcasting operations like add.
@@ -28,7 +28,7 @@
 // 2) Check misaligned: If a name `n` is in `names`, then it must appear at
 //    the same index from the right in other.
 // 3) The output names are obtained by unifying the names individually from the right.
-CAFFE2_API std::vector<Dimname>
+TORCH_API std::vector<Dimname>
 unify_from_right(DimnameList names, DimnameList other, const char* action = "broadcast");
 
 [[noreturn]] inline void reportNYIDimnameOverload(const char* op_name) {
@@ -75,50 +75,50 @@
 // `names` can be empty; see [NOTE] Writing name inference rules
 // If `names` is not empty, `names.size()` should equal `result.dim()`.
 // When in doubt, use this overload instead of the others.
-CAFFE2_API Tensor& propagate_names_if_nonempty(
+TORCH_API Tensor& propagate_names_if_nonempty(
     Tensor& result,
     DimnameList maybe_names,
     bool validate_names = false);
 
 // Propagates `names` to `result`. Only use this if we are certain that there are
 // names to propagate (that names is not empty).
-CAFFE2_API Tensor& propagate_names(
+TORCH_API Tensor& propagate_names(
     Tensor& result,
     DimnameList names,
     bool validate_names = false);
 
 // Propagates all names from src to result.
-CAFFE2_API void propagate_names(Tensor& result, const Tensor& src);
+TORCH_API void propagate_names(Tensor& result, const Tensor& src);
 
 // Propagates all names except for those at the excluded_idxs.
-CAFFE2_API void propagate_names_except(Tensor& result, const Tensor& src, IntArrayRef excluded_idxs);
+TORCH_API void propagate_names_except(Tensor& result, const Tensor& src, IntArrayRef excluded_idxs);
 
 // Used for reduction ops that have a `keepdim` arg.
-CAFFE2_API void propagate_names_for_reduction(Tensor& result, const Tensor& src, IntArrayRef excluded_idxs, bool keepdim);
+TORCH_API void propagate_names_for_reduction(Tensor& result, const Tensor& src, IntArrayRef excluded_idxs, bool keepdim);
 
-CAFFE2_API void propagate_names_for_expand(Tensor& result, const Tensor& self);
+TORCH_API void propagate_names_for_expand(Tensor& result, const Tensor& self);
 
-CAFFE2_API std::vector<Dimname> compute_cat_outnames(TensorList tensors);
+TORCH_API std::vector<Dimname> compute_cat_outnames(TensorList tensors);
 
-CAFFE2_API std::vector<Dimname> compute_broadcast_outnames(
+TORCH_API std::vector<Dimname> compute_broadcast_outnames(
     const Tensor& self,
     const Tensor& other);
 
-CAFFE2_API std::vector<Dimname> broadcast_to_outnames(
+TORCH_API std::vector<Dimname> broadcast_to_outnames(
     const Tensor& tensor,
     const Tensor& reference_tensor,
     const char* op_name);
 
-CAFFE2_API std::vector<Dimname> compute_matmul_outnames(const Tensor& self, const Tensor& other);
+TORCH_API std::vector<Dimname> compute_matmul_outnames(const Tensor& self, const Tensor& other);
 
-CAFFE2_API std::vector<Dimname> compute_cdist_outnames(const Tensor& self, const Tensor& other);
+TORCH_API std::vector<Dimname> compute_cdist_outnames(const Tensor& self, const Tensor& other);
 
-CAFFE2_API std::vector<Dimname> compute_bmm_outnames(
+TORCH_API std::vector<Dimname> compute_bmm_outnames(
     Tensor& result,
     const Tensor& self,
     const Tensor& other);
 
-CAFFE2_API std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor);
+TORCH_API std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor);
 
 std::vector<Dimname> compute_diagonal_outnames(
     const Tensor& tensor,
@@ -127,40 +127,40 @@
 
 // TensorImpl* overloads for Legacy TH/THC code. Use these sparingly.
 
-CAFFE2_API TensorImpl* propagate_names_if_nonempty(
+TORCH_API TensorImpl* propagate_names_if_nonempty(
     TensorImpl* result,
     DimnameList maybe_names,
     bool validate_names = false);
 
-CAFFE2_API TensorImpl* propagate_names(
+TORCH_API TensorImpl* propagate_names(
     TensorImpl* result,
     DimnameList names,
     bool validate_names = false);
 
-CAFFE2_API void propagate_names(TensorImpl* result, /*const */TensorImpl* src);
+TORCH_API void propagate_names(TensorImpl* result, /*const */TensorImpl* src);
 
 // result = m1 @ m2 + bias
-CAFFE2_API void propagate_names_for_addmm(
+TORCH_API void propagate_names_for_addmm(
     Tensor& result,
     const Tensor& m1,
     const Tensor& m2,
     const Tensor& bias);
 
-CAFFE2_API void propagate_names_for_addmv(
+TORCH_API void propagate_names_for_addmv(
     Tensor& result,
     const Tensor& mat,
     const Tensor& vec,
     const Tensor& bias);
 
-CAFFE2_API void check_names_for_dot(TensorImpl* vec1, TensorImpl* vec2);
+TORCH_API void check_names_for_dot(TensorImpl* vec1, TensorImpl* vec2);
 
-CAFFE2_API std::vector<Dimname> compute_baddbmm_outnames(
+TORCH_API std::vector<Dimname> compute_baddbmm_outnames(
     Tensor& result,
     const Tensor& self,
     const Tensor& other,
     const Tensor& bias);
 
-CAFFE2_API bool are_names_equal(TensorImpl* self, TensorImpl* other);
+TORCH_API bool are_names_equal(TensorImpl* self, TensorImpl* other);
 
 } // namespace namedinference
 
diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h
index d9831b5..f23a097 100644
--- a/aten/src/ATen/OpaqueTensorImpl.h
+++ b/aten/src/ATen/OpaqueTensorImpl.h
@@ -17,7 +17,7 @@
 // "shallow copy" in order to add support.
 
 template <typename OpaqueHandle>
-struct CAFFE2_API OpaqueTensorImpl : public TensorImpl {
+struct TORCH_API OpaqueTensorImpl : public TensorImpl {
   // public constructor for now...
   OpaqueTensorImpl(
       at::DispatchKeySet key_set,
diff --git a/aten/src/ATen/PTThreadPool.h b/aten/src/ATen/PTThreadPool.h
index f5e8a1a..7015f7c 100644
--- a/aten/src/ATen/PTThreadPool.h
+++ b/aten/src/ATen/PTThreadPool.h
@@ -5,7 +5,7 @@
 
 namespace at {
 
-class CAFFE2_API PTThreadPool : public c10::ThreadPool {
+class TORCH_API PTThreadPool : public c10::ThreadPool {
 public:
   explicit PTThreadPool(
       int pool_size,
diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h
index e21401c..122b8ea 100644
--- a/aten/src/ATen/Parallel.h
+++ b/aten/src/ATen/Parallel.h
@@ -10,25 +10,25 @@
 }
 
 // Called during new thread initialization
-CAFFE2_API void init_num_threads();
+TORCH_API void init_num_threads();
 
 // Sets the number of threads to be used in parallel region
-CAFFE2_API void set_num_threads(int);
+TORCH_API void set_num_threads(int);
 
 // Returns the maximum number of threads that may be used in a parallel region
-CAFFE2_API int get_num_threads();
+TORCH_API int get_num_threads();
 
 // Returns the current thread number (starting from 0)
 // in the current parallel region, or 0 in the sequential region
-CAFFE2_API int get_thread_num();
+TORCH_API int get_thread_num();
 
 // Checks whether the code runs in parallel region
-CAFFE2_API bool in_parallel_region();
+TORCH_API bool in_parallel_region();
 
 namespace internal {
 
 // Initialise num_threads lazily at first parallel call
-inline CAFFE2_API void lazy_init_num_threads() {
+inline TORCH_API void lazy_init_num_threads() {
   thread_local bool init = false;
   if (C10_UNLIKELY(!init)) {
     at::init_num_threads();
@@ -110,29 +110,29 @@
     const SF& sf);
 
 // Returns a detailed string describing parallelization settings
-CAFFE2_API std::string get_parallel_info();
+TORCH_API std::string get_parallel_info();
 
 // Sets number of threads used for inter-op parallelism
-CAFFE2_API void set_num_interop_threads(int);
+TORCH_API void set_num_interop_threads(int);
 
 // Returns the number of threads used for inter-op parallelism
-CAFFE2_API int get_num_interop_threads();
+TORCH_API int get_num_interop_threads();
 
 // Launches inter-op parallel task
-CAFFE2_API void launch(std::function<void()> func);
+TORCH_API void launch(std::function<void()> func);
 namespace internal {
 void launch_no_thread_state(std::function<void()> fn);
 } // namespace internal
 
 // Launches intra-op parallel task
-CAFFE2_API void intraop_launch(std::function<void()> func);
+TORCH_API void intraop_launch(std::function<void()> func);
 
 // Launches intra-op parallel task, returns a future
-CAFFE2_API std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
+TORCH_API std::shared_ptr<c10::ivalue::Future> intraop_launch_future(
     std::function<void()> func);
 
 // Returns number of intra-op threads used by default
-CAFFE2_API int intraop_default_num_threads();
+TORCH_API int intraop_default_num_threads();
 
 } // namespace at
 
diff --git a/aten/src/ATen/ParallelNative.h b/aten/src/ATen/ParallelNative.h
index 58d3445..3a8d263 100644
--- a/aten/src/ATen/ParallelNative.h
+++ b/aten/src/ATen/ParallelNative.h
@@ -22,7 +22,7 @@
   return std::make_tuple(num_tasks, chunk_size);
 }
 
-CAFFE2_API void _parallel_run(
+TORCH_API void _parallel_run(
   const int64_t begin,
   const int64_t end,
   const int64_t grain_size,
diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h
index 4373c9b..db97b22 100644
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@@ -5,7 +5,7 @@
 #include <c10/util/Exception.h>
 
 namespace at {
-struct CAFFE2_API SparseTensorImpl : public TensorImpl {
+struct TORCH_API SparseTensorImpl : public TensorImpl {
   // Stored in COO format, indices + values.
 
   // INVARIANTS:
diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h
index 291892a..ad3e16d 100644
--- a/aten/src/ATen/TensorGeometry.h
+++ b/aten/src/ATen/TensorGeometry.h
@@ -5,7 +5,7 @@
 
 namespace at {
 
-struct CAFFE2_API TensorGeometry {
+struct TORCH_API TensorGeometry {
   TensorGeometry() : storage_offset_(0) {}
 
   explicit TensorGeometry(IntArrayRef sizes)
diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h
index 162efd1..3890662 100644
--- a/aten/src/ATen/TensorIndexing.h
+++ b/aten/src/ATen/TensorIndexing.h
@@ -20,10 +20,10 @@
 
 constexpr c10::nullopt_t None = c10::nullopt;
 
-struct CAFFE2_API EllipsisIndexType final { EllipsisIndexType() {} };
-CAFFE2_API extern const EllipsisIndexType Ellipsis;
+struct TORCH_API EllipsisIndexType final { EllipsisIndexType() {} };
+TORCH_API extern const EllipsisIndexType Ellipsis;
 
-struct CAFFE2_API Slice final {
+struct TORCH_API Slice final {
  public:
   // This mirrors `__PySlice_Unpack` in torch/csrc/utils/python_compat.h
   Slice(
@@ -73,7 +73,7 @@
   int64_t step_;
 };
 
-CAFFE2_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
+TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
 
 // `at::indexing::TensorIndex` is used for converting C++ tensor indices such as
 // `{None, "...", Ellipsis, 0, true, Slice(1, None, 2), torch::tensor({1, 2})}`
@@ -100,7 +100,7 @@
 // `:3:2`                  | `Slice(None, 3, 2)`
 // `1:3:2`                 | `Slice(1, 3, 2)`
 // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
-struct CAFFE2_API TensorIndex final {
+struct TORCH_API TensorIndex final {
   // Case 1: `at::indexing::None`
   TensorIndex(c10::nullopt_t) : type_(TensorIndexType::None) {}
 
@@ -175,8 +175,8 @@
   TensorIndexType type_;
 };
 
-CAFFE2_API std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index);
-CAFFE2_API std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices);
+TORCH_API std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index);
+TORCH_API std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices);
 
 namespace impl {
 static inline Tensor applySlice(
diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h
index ba781d7..5132fb0 100644
--- a/aten/src/ATen/TensorIterator.h
+++ b/aten/src/ATen/TensorIterator.h
@@ -70,7 +70,7 @@
   int64_t offset;
 };
 
-struct CAFFE2_API OperandInfo {
+struct TORCH_API OperandInfo {
   using StrideVector = SmallVector<int64_t, 6>;
   OperandInfo() {}
   explicit OperandInfo(Tensor t) : tensor(std::move(t)) {
@@ -141,7 +141,7 @@
 class TensorIteratorConfig;
 struct TensorIterator;
 
-struct CAFFE2_API TensorIteratorBase : public impl::MetaBase {
+struct TORCH_API TensorIteratorBase : public impl::MetaBase {
   using DimMask = std::bitset<64>;
   using PtrVector = SmallVector<char*, 4>;
   using StrideVector = SmallVector<int64_t, 6>;
@@ -408,7 +408,7 @@
   bool is_meta_ = false;
 };
 
-struct CAFFE2_API TensorIterator final : public TensorIteratorBase {
+struct TORCH_API TensorIterator final : public TensorIteratorBase {
   TensorIterator() : TensorIteratorBase() {}
   // Slicing is OK, TensorIterator guaranteed NOT to have any fields
   TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {}
@@ -426,7 +426,7 @@
   void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override;
 };
 
-class CAFFE2_API TensorIteratorConfig final {
+class TORCH_API TensorIteratorConfig final {
 public:
   friend struct TensorIteratorBase;
   friend struct TensorIterator;
@@ -532,8 +532,8 @@
 /// A container-like struct that acts as if it contains splits of a
 /// TensorIterator that can use 32-bit indexing. Taken together the splits cover
 /// the original TensorIterator.
-struct CAFFE2_API SplitUntil32Bit {
-  struct CAFFE2_API iterator {
+struct TORCH_API SplitUntil32Bit {
+  struct TORCH_API iterator {
     iterator() {};
     iterator(const TensorIteratorBase& iter);
     iterator(iterator&&) = default;
diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h
index 134bb37..1b05b69 100644
--- a/aten/src/ATen/TensorMeta.h
+++ b/aten/src/ATen/TensorMeta.h
@@ -46,7 +46,7 @@
 // (although presently it isn't).
 //
 // A notable subclass of this interface is TensorIteratorBase.
-struct CAFFE2_API MetaBase {
+struct TORCH_API MetaBase {
   virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0;
   virtual const Tensor& maybe_get_output(int64_t output_idx) = 0;
   void set_output(IntArrayRef sizes, TensorOptions options) {
diff --git a/aten/src/ATen/TensorNames.h b/aten/src/ATen/TensorNames.h
index eeb8ec1..64bad7c 100644
--- a/aten/src/ATen/TensorNames.h
+++ b/aten/src/ATen/TensorNames.h
@@ -26,7 +26,7 @@
 // None (in tensor) cannot match A (in other) because if the None were refined
 // to A, `tensor` would have duplicate names [A, A]. Therefore we need to check
 // tensor.names [A, None] for the existence of A.
-struct CAFFE2_API TensorName {
+struct TORCH_API TensorName {
   explicit TensorName(ArrayRef<Dimname> origin, int origin_idx)
     : origin_(origin),
       name_(origin[maybe_wrap_dim(origin_idx, origin.size())]),
@@ -41,14 +41,14 @@
   Dimname name_;
   int origin_idx_; // A named tensor can have at most 64 dims.
 
-  CAFFE2_API friend std::ostream& operator<<(
+  TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
       const TensorName& tensorname);
 };
 
 using TensorNameVec = SmallVector<TensorName, 10>;
 
-struct CAFFE2_API TensorNames {
+struct TORCH_API TensorNames {
   explicit TensorNames(ArrayRef<Dimname> names);
 
   // Create TensorNames from names[start:end]. Each individual TensorName stores
diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h
index 0882eb4..c182cd6 100644
--- a/aten/src/ATen/TensorUtils.h
+++ b/aten/src/ATen/TensorUtils.h
@@ -12,7 +12,7 @@
 // make sense.  These are particularly useful for native functions,
 // which do NO argument checking by default.
 
-struct CAFFE2_API TensorArg {
+struct TORCH_API TensorArg {
   Tensor tensor;
   const char* name;
   int pos; // 1-indexed
@@ -22,7 +22,7 @@
   const Tensor& operator*() const { return tensor; }
 };
 
-struct CAFFE2_API TensorGeometryArg {
+struct TORCH_API TensorGeometryArg {
   TensorGeometry tensor;
   const char* name;
   int pos; // 1-indexed
@@ -49,104 +49,104 @@
 // not TensorGeometryArg, because the Tensor to TensorGeometry
 // conversion will blow up if you have undefined tensors.
 
-CAFFE2_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t);
-CAFFE2_API void checkDim(
+TORCH_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t);
+TORCH_API void checkDim(
     CheckedFrom c,
     const TensorGeometryArg& t,
     int64_t dim);
 // NB: this is an inclusive-exclusive range
-CAFFE2_API void checkDimRange(
+TORCH_API void checkDimRange(
     CheckedFrom c,
     const TensorGeometryArg& t,
     int64_t dim_start,
     int64_t dim_end);
-CAFFE2_API void checkSameDim(
+TORCH_API void checkSameDim(
     CheckedFrom c,
     const TensorGeometryArg& t1,
     const TensorGeometryArg& t2);
-CAFFE2_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t);
-CAFFE2_API void checkAllContiguous(CheckedFrom c, at::ArrayRef<TensorArg> ts);
-CAFFE2_API void checkSize(
+TORCH_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t);
+TORCH_API void checkAllContiguous(CheckedFrom c, at::ArrayRef<TensorArg> ts);
+TORCH_API void checkSize(
     CheckedFrom c,
     const TensorGeometryArg& t,
     IntArrayRef sizes);
-CAFFE2_API void checkSize(
+TORCH_API void checkSize(
     CheckedFrom c,
     const TensorGeometryArg& t,
     int64_t dim,
     int64_t size);
-CAFFE2_API void checkNumel(
+TORCH_API void checkNumel(
     CheckedFrom c,
     const TensorGeometryArg& t,
     int64_t numel);
-CAFFE2_API void checkSameNumel(
+TORCH_API void checkSameNumel(
     CheckedFrom c,
     const TensorGeometryArg& t1,
     const TensorGeometryArg& t2);
-CAFFE2_API void checkAllSameNumel(CheckedFrom c, ArrayRef<TensorArg> tensors);
-CAFFE2_API void checkScalarType(
+TORCH_API void checkAllSameNumel(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkScalarType(
     CheckedFrom c,
     const TensorArg& t,
     ScalarType s);
-CAFFE2_API void checkScalarTypes(
+TORCH_API void checkScalarTypes(
     CheckedFrom c,
     const TensorArg& t,
     at::ArrayRef<ScalarType> l);
-CAFFE2_API void checkSameGPU(
+TORCH_API void checkSameGPU(
     CheckedFrom c,
     const TensorArg& t1,
     const TensorArg& t2);
-CAFFE2_API void checkAllSameGPU(CheckedFrom c, ArrayRef<TensorArg> tensors);
-CAFFE2_API void checkSameType(
+TORCH_API void checkAllSameGPU(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkSameType(
     CheckedFrom c,
     const TensorArg& t1,
     const TensorArg& t2);
-CAFFE2_API void checkAllSameType(CheckedFrom c, ArrayRef<TensorArg> tensors);
-CAFFE2_API void checkSameSize(
+TORCH_API void checkAllSameType(CheckedFrom c, ArrayRef<TensorArg> tensors);
+TORCH_API void checkSameSize(
     CheckedFrom c,
     const TensorArg& t1,
     const TensorArg& t2);
-CAFFE2_API void checkDefined(CheckedFrom c, const TensorArg& t);
-CAFFE2_API void checkAllDefined(CheckedFrom c, at::ArrayRef<TensorArg> t);
+TORCH_API void checkDefined(CheckedFrom c, const TensorArg& t);
+TORCH_API void checkAllDefined(CheckedFrom c, at::ArrayRef<TensorArg> t);
 
 // FixMe: does TensorArg slow things down?
-CAFFE2_API void checkBackend(
+TORCH_API void checkBackend(
     CheckedFrom c,
     at::ArrayRef<Tensor> t,
     at::Backend backend);
 
-CAFFE2_API void checkDeviceType(
+TORCH_API void checkDeviceType(
     CheckedFrom c,
     at::ArrayRef<Tensor> tensors,
     at::DeviceType device_type);
 
-CAFFE2_API void checkLayout(CheckedFrom c, const Tensor& t, Layout layout);
+TORCH_API void checkLayout(CheckedFrom c, const Tensor& t, Layout layout);
 
-CAFFE2_API void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout);
+TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef<Tensor> tensors, at::Layout layout);
 
 // Methods for getting data_ptr if tensor is defined
-CAFFE2_API void* maybe_data_ptr(const Tensor& tensor);
-CAFFE2_API void* maybe_data_ptr(const TensorArg& tensor);
+TORCH_API void* maybe_data_ptr(const Tensor& tensor);
+TORCH_API void* maybe_data_ptr(const TensorArg& tensor);
 
 // Return if the tensor geometry represented by `sizes` and `strides` is contiguous
 // Although we cache is_contiguous in tensor now, this is till useful because it
 // allows checking if a particular geometry is contiguous without explicitly
 // constructing a tensor, e.g., when you want to choose a kernel strategy based
 // on whether a subgeometry is contiguous.
-CAFFE2_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
+TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
 
 // Correspond to THCUNN_check_dim_size/THNN_check_dim_size
-CAFFE2_API void check_dim_size(
+TORCH_API void check_dim_size(
     const Tensor& tensor,
     int64_t dim,
     int64_t dim_size,
     int64_t size);
 
 namespace detail {
-CAFFE2_API std::vector<int64_t> defaultStrides(IntArrayRef sizes);
-CAFFE2_API size_t
+TORCH_API std::vector<int64_t> defaultStrides(IntArrayRef sizes);
+TORCH_API size_t
 computeStorageNbytes(IntArrayRef sizes, IntArrayRef strides, size_t itemsize);
-CAFFE2_API c10::optional<std::vector<int64_t>> computeStride(
+TORCH_API c10::optional<std::vector<int64_t>> computeStride(
     IntArrayRef oldshape,
     IntArrayRef oldstride,
     IntArrayRef newshape);
diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h
index e814d52..e100bb1 100644
--- a/aten/src/ATen/Utils.h
+++ b/aten/src/ATen/Utils.h
@@ -22,7 +22,7 @@
 
 namespace at {
 
-CAFFE2_API int _crash_if_asan(int);
+TORCH_API int _crash_if_asan(int);
 
 // TODO: This unwrapping code is ONLY used for TH bindings; once TH goes
 // away, we can delete this function
@@ -135,24 +135,24 @@
 }
 
 namespace detail {
-CAFFE2_API
+TORCH_API
 Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
                  c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt);
 
 template <typename T>
-CAFFE2_API
+TORCH_API
 Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options);
 
 template <typename T>
-CAFFE2_API
+TORCH_API
 Tensor tensor_backend(ArrayRef<T> values, const TensorOptions& options);
 
 template <typename T>
-CAFFE2_API
+TORCH_API
 Tensor tensor_complex_cpu(ArrayRef<T> values, const TensorOptions& options);
 
 template <typename T>
-CAFFE2_API
+TORCH_API
 Tensor tensor_complex_backend(ArrayRef<T> values, const TensorOptions& options);
 } // namespace detail
 
diff --git a/aten/src/ATen/Version.h b/aten/src/ATen/Version.h
index 3ac7a58..88d010c 100644
--- a/aten/src/ATen/Version.h
+++ b/aten/src/ATen/Version.h
@@ -3,14 +3,14 @@
 namespace at {
 
 /// Returns a detailed string describing the configuration PyTorch.
-CAFFE2_API std::string show_config();
+TORCH_API std::string show_config();
 
-CAFFE2_API std::string get_mkl_version();
+TORCH_API std::string get_mkl_version();
 
-CAFFE2_API std::string get_mkldnn_version();
+TORCH_API std::string get_mkldnn_version();
 
-CAFFE2_API std::string get_openmp_version();
+TORCH_API std::string get_openmp_version();
 
-CAFFE2_API std::string get_cxx_flags();
+TORCH_API std::string get_cxx_flags();
 
 }  // namespace at
diff --git a/aten/src/ATen/VmapMode.h b/aten/src/ATen/VmapMode.h
index 8e59aac..c50f57a 100644
--- a/aten/src/ATen/VmapMode.h
+++ b/aten/src/ATen/VmapMode.h
@@ -11,7 +11,7 @@
 //
 // NOTE: this is NOT the c++ api for torch.vmap. That doesn't exist yet.
 
-struct CAFFE2_API VmapMode {
+struct TORCH_API VmapMode {
   // Returns the vmap level, aka the count of how many nested vmaps we're in.
   static int64_t current_vmap_level();
 
diff --git a/aten/src/ATen/core/ATenOpList.h b/aten/src/ATen/core/ATenOpList.h
index 880a690..1419376 100644
--- a/aten/src/ATen/core/ATenOpList.h
+++ b/aten/src/ATen/core/ATenOpList.h
@@ -9,5 +9,5 @@
 namespace at {
 
 // check if an op is a custom op (i.e. did not come from native_functions.yaml)
-CAFFE2_API bool is_custom_op(const c10::OperatorName& opName);
+TORCH_API bool is_custom_op(const c10::OperatorName& opName);
 }
diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h
index 719cd9a..0c30444 100644
--- a/aten/src/ATen/core/DeprecatedTypeProperties.h
+++ b/aten/src/ATen/core/DeprecatedTypeProperties.h
@@ -17,7 +17,7 @@
 // serves as a replacement return value for Tensor::type(). Previously,
 // Tensor::type() returned Type&, but we are changing Type to not be
 // dtype-specific.
-class CAFFE2_API DeprecatedTypeProperties {
+class TORCH_API DeprecatedTypeProperties {
  public:
   DeprecatedTypeProperties(Backend backend, ScalarType scalar_type)
     : backend_(backend), scalar_type_(scalar_type) {}
diff --git a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h
index d9b29a3..a21f1ab 100644
--- a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h
+++ b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h
@@ -10,11 +10,11 @@
 
 class DeprecatedTypeProperties;
 
-struct CAFFE2_API DeprecatedTypePropertiesDeleter {
+struct TORCH_API DeprecatedTypePropertiesDeleter {
   void operator()(DeprecatedTypeProperties * ptr);
 };
 
-class CAFFE2_API DeprecatedTypePropertiesRegistry {
+class TORCH_API DeprecatedTypePropertiesRegistry {
  public:
   DeprecatedTypePropertiesRegistry();
 
@@ -26,6 +26,6 @@
     [static_cast<int>(ScalarType::NumOptions)];
 };
 
-CAFFE2_API DeprecatedTypePropertiesRegistry& globalDeprecatedTypePropertiesRegistry();
+TORCH_API DeprecatedTypePropertiesRegistry& globalDeprecatedTypePropertiesRegistry();
 
 } // namespace at
diff --git a/aten/src/ATen/core/Dimname.h b/aten/src/ATen/core/Dimname.h
index 8010614..c68ee86 100644
--- a/aten/src/ATen/core/Dimname.h
+++ b/aten/src/ATen/core/Dimname.h
@@ -9,7 +9,7 @@
 
 enum class NameType: uint8_t { BASIC, WILDCARD };
 
-struct CAFFE2_API Dimname {
+struct TORCH_API Dimname {
   static Dimname fromSymbol(Symbol name);
   static Dimname wildcard();
   static bool isValidName(const std::string& name);
@@ -35,7 +35,7 @@
 
 using DimnameList = c10::ArrayRef<Dimname>;
 
-CAFFE2_API std::ostream& operator<<(std::ostream& out, const Dimname& dimname);
+TORCH_API std::ostream& operator<<(std::ostream& out, const Dimname& dimname);
 
 inline bool operator==(const Dimname& lhs, const Dimname& rhs) {
   return lhs.symbol() == rhs.symbol();
diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h
index 63c5e12..4a5545a 100644
--- a/aten/src/ATen/core/Formatting.h
+++ b/aten/src/ATen/core/Formatting.h
@@ -6,12 +6,12 @@
 
 
 namespace c10 {
-CAFFE2_API std::ostream& operator<<(std::ostream& out, Backend b);
+TORCH_API std::ostream& operator<<(std::ostream& out, Backend b);
 }
 namespace at {
 
-CAFFE2_API std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t);
-CAFFE2_API std::ostream& print(
+TORCH_API std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t);
+TORCH_API std::ostream& print(
     std::ostream& stream,
     const Tensor& tensor,
     int64_t linesize);
diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h
index 62f43fd..de3f6e4 100644
--- a/aten/src/ATen/core/Generator.h
+++ b/aten/src/ATen/core/Generator.h
@@ -56,7 +56,7 @@
 
 namespace at {
 
-struct CAFFE2_API Generator {
+struct TORCH_API Generator {
   Generator() {}
 
   explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h
index 925a87a..85f771a 100644
--- a/aten/src/ATen/core/LegacyTypeDispatch.h
+++ b/aten/src/ATen/core/LegacyTypeDispatch.h
@@ -43,7 +43,7 @@
 // trace).  To unify the two, we would first have to move profiling and tracing
 // out of VariableType.
 
-struct CAFFE2_API AutoNonVariableTypeMode {
+struct TORCH_API AutoNonVariableTypeMode {
   // NB: The enabled parameter must ALWAYS be black, as Henry Ford used to say.
   // TODO: Eliminate this parameter entirely
   AutoNonVariableTypeMode(bool enabled = true) :
diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h
index b67e24a..5b064ca7 100644
--- a/aten/src/ATen/core/NamedTensor.h
+++ b/aten/src/ATen/core/NamedTensor.h
@@ -19,7 +19,7 @@
 //
 // This class has an important invariant: there must be at least ONE
 // non-wildcard
-struct CAFFE2_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
+struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
   // This enum is to remind people that the invariant on constructors is that
   // the list of dimnames must have at least one non-wildcard
   enum HAS_NON_WILDCARD {
@@ -69,7 +69,7 @@
 
 // When NamesMode is disabled, then all operations ignore tensors' names fields.
 // Concretely speaking, all tensors are treated as having nullopt names.
-struct CAFFE2_API NamesMode {
+struct TORCH_API NamesMode {
   static bool is_enabled();
   static void set_enabled(bool enabled);
 };
@@ -77,7 +77,7 @@
 
 // A RAII, thread local (!) guard that enables or disables names upon
 // construction, and sets it back to the original value upon destruction.
-struct CAFFE2_API NoNamesGuard {
+struct TORCH_API NoNamesGuard {
   NoNamesGuard() : prev_mode(NamesMode::is_enabled()), initialized(true) {
     NamesMode::set_enabled(false);
   }
@@ -99,8 +99,8 @@
 void check_names_valid_for(size_t tensor_dim, DimnameList names);
 
 // Sets the names of `tensor` to be `names`.
-CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, c10::optional<DimnameList> names);
-CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, std::vector<Dimname>&& names, bool validate_names);
+TORCH_API Tensor& internal_set_names_inplace(Tensor& tensor, c10::optional<DimnameList> names);
+TORCH_API Tensor& internal_set_names_inplace(Tensor& tensor, std::vector<Dimname>&& names, bool validate_names);
 
 constexpr size_t kMaxNamedTensorDim = 64;
 
@@ -110,8 +110,8 @@
 
 // Some helper functions on TensorImpl. Useful for working with names in TH.
 // XXX: Ideally these would exist as methods on TensorImpl
-CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, c10::optional<DimnameList> names, bool validate_names);
-CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, c10::optional<DimnameList> names, bool validate_names);
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);
 
 void check_names_valid_for(TensorImpl* impl, DimnameList names);
 
@@ -119,19 +119,19 @@
 // Returns false if the tensor's names don't exist (were not allocated),
 // or if all names are 'None'.
 // We treat not-allocated-names the same as allocated names that are all 'None'.
-CAFFE2_API bool has_names(const TensorImpl* impl);
+TORCH_API bool has_names(const TensorImpl* impl);
 
 // Returns the names of the tensor's dimensions.
 // Unnamed tensors are treated as having 'None' in all dimension; this method
 // would return a DimnameList of all 'None's for an unnamed tensor.
-CAFFE2_API DimnameList get_names(const TensorImpl* impl);
+TORCH_API DimnameList get_names(const TensorImpl* impl);
 
 // This is more of an implementation detail; one should use impl::get_names /
 // Tensor::names() whenever possible because it provides a cleaner API.
 // Returns the names of the tensor if they have been allocated; returns nullopt
 // instead if the haven't been. The names of a tensor are not allocated if a
 // tensor is constructed with names=None.
-CAFFE2_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl);
+TORCH_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl);
 
 
 } // namespace impl
diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h
index fa796e5..0103c81 100644
--- a/aten/src/ATen/core/QuantizerBase.h
+++ b/aten/src/ATen/core/QuantizerBase.h
@@ -32,7 +32,7 @@
  * Quantized Tensor holds an intrusive_ptr to Quantizer, and multiple Tensor can
  * share the same Quantizer. Quantizer should be immutable.
  */
-struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target {
+struct TORCH_API Quantizer : public c10::intrusive_ptr_target {
   const ScalarType scalar_type_;
   explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {}
   virtual ~Quantizer();
diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h
index e510471..3a85919 100644
--- a/aten/src/ATen/core/VariableHooksInterface.h
+++ b/aten/src/ATen/core/VariableHooksInterface.h
@@ -16,7 +16,7 @@
 // merge the libraries inside Facebook".  Well, the problem is that there
 // are some downstream applications which are at binary size limit, and
 // incorporating all of the extra code from libtorch would push them
-// over (admarket/adreview/service:adreviewservice, see also 
+// over (admarket/adreview/service:adreviewservice, see also
 // https://github.com/pytorch/pytorch/pull/29299)  So if you want to do that,
 // we have to fix all of the services like this.
 //
@@ -38,7 +38,7 @@
 namespace at {
 namespace impl {
 
-struct CAFFE2_API VariableHooksInterface {
+struct TORCH_API VariableHooksInterface {
   virtual ~VariableHooksInterface() = default;
   virtual Tensor tensor_data(const Tensor&) const = 0;
   virtual Tensor variable_data(const Tensor&) const = 0;
@@ -50,10 +50,10 @@
   virtual const std::string& name(const Tensor&) const = 0;
 };
 
-CAFFE2_API void SetVariableHooks(VariableHooksInterface* hooks);
-CAFFE2_API VariableHooksInterface* GetVariableHooks();
+TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
+TORCH_API VariableHooksInterface* GetVariableHooks();
 
-struct CAFFE2_API VariableHooksRegisterer {
+struct TORCH_API VariableHooksRegisterer {
   explicit VariableHooksRegisterer(VariableHooksInterface* hooks) {
     SetVariableHooks(hooks);
   }
diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h
index 3b6bafa..1c59ac0 100644
--- a/aten/src/ATen/core/blob.h
+++ b/aten/src/ATen/core/blob.h
@@ -21,7 +21,7 @@
  * properly when the blob is deallocated or re-allocated with a new type. A blob
  * could contain anything, although the most common case is to contain a Tensor.
  */
-class CAFFE2_API Blob final : public c10::intrusive_ptr_target {
+class TORCH_API Blob final : public c10::intrusive_ptr_target {
  public:
   /**
    * Initializes an empty Blob.
diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h
index a528745..6817907 100644
--- a/aten/src/ATen/core/boxing/KernelFunction.h
+++ b/aten/src/ATen/core/boxing/KernelFunction.h
@@ -15,7 +15,7 @@
 // no overhead to fallthrough to the next key.  See cpp file for some more
 // implementation notes; notably, this does NOT actually go through the
 // boxing/unboxing codepath.
-CAFFE2_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
+TORCH_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
 
 // Note [Ambiguity in AutogradOther kernel]
 // This kernel implements reporting an error message when there're kernels registered
@@ -27,7 +27,7 @@
 //   See c10/core/DispatchKeySet.cpp for a list of backends mapped to AutogradOther.
 // Thus if backend extender indeed want to override Math kernel behavior, they should request
 // a dedicated Autograd key for their backend to resolve the ambiguity.
-CAFFE2_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
+TORCH_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
 
 // Note [named_not_supported_kernel]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -36,7 +36,7 @@
 // cased in the dispatcher to be triggered before we attempt boxing (so we can
 // give a good error message in cases when boxing is not supported).  When
 // boxing is universally supported this can be removed.
-[[noreturn]] CAFFE2_API void named_not_supported_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
+[[noreturn]] TORCH_API void named_not_supported_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
 
 /**
  * KernelFunction is similar to std::function but stores a kernel function.
@@ -44,7 +44,7 @@
  * and call it in a boxed or unboxed way. If the way it was created doesn't
  * match the way it was called, it will do boxing or unboxing as necessary.
  */
-class CAFFE2_API KernelFunction final {
+class TORCH_API KernelFunction final {
 public:
   // This is how boxed kernels are actually stored
   using InternalBoxedKernelFunction = void(OperatorKernel*, const OperatorHandle&, Stack*);
diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
index 7bdb0d9..b9f59b3 100644
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -26,7 +26,7 @@
  *
  * See below for how to register this kernel with PyTorch.
  */
-struct CAFFE2_API OperatorKernel {
+struct TORCH_API OperatorKernel {
   virtual ~OperatorKernel() = default;
 };
 
diff --git a/aten/src/ATen/core/dispatch/CppSignature.h b/aten/src/ATen/core/dispatch/CppSignature.h
index 9cfc7b3..b5a41ca 100644
--- a/aten/src/ATen/core/dispatch/CppSignature.h
+++ b/aten/src/ATen/core/dispatch/CppSignature.h
@@ -10,7 +10,7 @@
 
 // A CppSignature object holds RTTI information about a C++ function signature at runtime
 // and can compare them or get a debug-printable name.
-class CAFFE2_API CppSignature final {
+class TORCH_API CppSignature final {
 public:
     CppSignature(const CppSignature&) = default;
     CppSignature(CppSignature&&) noexcept = default;
diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
index f8d401e..1bc1f1d 100644
--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
@@ -102,7 +102,7 @@
  *    varies from operator, as some operators may have overridden the
  *    fallthrough with custom behavior.
  */
-struct CAFFE2_API DispatchKeyExtractor final {
+struct TORCH_API DispatchKeyExtractor final {
 public:
   static DispatchKeyExtractor make(const FunctionSchema& schema) {
     return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema));
diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h
index f83302e..60f9f9b 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@@ -16,7 +16,7 @@
 
 namespace c10 {
 
-class CAFFE2_API OperatorHandle;
+class TORCH_API OperatorHandle;
 template<class FuncType> class TypedOperatorHandle;
 
 /**
@@ -27,7 +27,7 @@
  * NB: registration events only occur when a 'def' occurs; we don't trigger
  * on 'impl' or 'fallback' calls.
  */
-class CAFFE2_API OpRegistrationListener {
+class TORCH_API OpRegistrationListener {
 public:
   virtual ~OpRegistrationListener();
 
@@ -45,7 +45,7 @@
  * Most end users shouldn't use this directly; if you're trying to register
  * ops look in op_registration
  */
-class CAFFE2_API Dispatcher final {
+class TORCH_API Dispatcher final {
 private:
   // For direct access to backend fallback information
   friend class impl::OperatorEntry;
@@ -267,7 +267,7 @@
  * This handle can be used to register kernels with the dispatcher or
  * to lookup a kernel for a certain set of arguments.
  */
-class CAFFE2_API OperatorHandle {
+class TORCH_API OperatorHandle {
 public:
   OperatorHandle(OperatorHandle&&) noexcept = default;
   OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
diff --git a/aten/src/ATen/core/dispatch/ObservedOperators.h b/aten/src/ATen/core/dispatch/ObservedOperators.h
index 45db9d1..b8919d0 100644
--- a/aten/src/ATen/core/dispatch/ObservedOperators.h
+++ b/aten/src/ATen/core/dispatch/ObservedOperators.h
@@ -4,7 +4,7 @@
 
 namespace c10 {
 
-struct CAFFE2_API ObservedOperators {
+struct TORCH_API ObservedOperators {
   ObservedOperators() = delete;
 
   static bool isObserved(const OperatorName& name);
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h
index 79af224..5098fd0 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.h
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.h
@@ -61,7 +61,7 @@
 // Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher
 // lock (this is important because some methods in OperatorEntry access
 // dispatcher state)
-class CAFFE2_API OperatorEntry final {
+class TORCH_API OperatorEntry final {
 public:
   explicit OperatorEntry(OperatorName&& operator_name);
 
diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h
index 9191dad..75592bf 100644
--- a/aten/src/ATen/core/function.h
+++ b/aten/src/ATen/core/function.h
@@ -9,7 +9,7 @@
 };
 
 namespace at {
-CAFFE2_API void launch(std::function<void()> func);
+TORCH_API void launch(std::function<void()> func);
 }
 
 namespace torch {
diff --git a/aten/src/ATen/core/grad_mode.h b/aten/src/ATen/core/grad_mode.h
index acd5fd0..84f8c6d 100644
--- a/aten/src/ATen/core/grad_mode.h
+++ b/aten/src/ATen/core/grad_mode.h
@@ -4,14 +4,14 @@
 
 namespace at {
 
-struct CAFFE2_API GradMode {
+struct TORCH_API GradMode {
   static bool is_enabled();
   static void set_enabled(bool enabled);
 };
 
 // A RAII, thread local (!) guard that enables or disables grad mode upon
 // construction, and sets it back to the original value upon destruction.
-struct CAFFE2_API AutoGradMode {
+struct TORCH_API AutoGradMode {
   AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) {
     GradMode::set_enabled(enabled);
   }
@@ -23,7 +23,7 @@
 
 // A RAII, thread local (!) guard that stops future operations from building
 // gradients.
-struct CAFFE2_API NoGradGuard : public AutoGradMode {
+struct TORCH_API NoGradGuard : public AutoGradMode {
   NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
 };
 
diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 7e9f3a2..8065300 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -435,7 +435,7 @@
 // A Symbol is like an interned string, but with a little extra
 // structure; it is namespaced via SymbolNamespace and the resulting
 // intern pointers support efficient namespace testing.
-struct CAFFE2_API Symbol {
+struct TORCH_API Symbol {
   explicit constexpr Symbol() : value(0) {};
   explicit constexpr Symbol(unique_t uniq)
   : value(uniq) {}
diff --git a/aten/src/ATen/core/interned_strings_class.h b/aten/src/ATen/core/interned_strings_class.h
index b13e3f1..54303e0 100644
--- a/aten/src/ATen/core/interned_strings_class.h
+++ b/aten/src/ATen/core/interned_strings_class.h
@@ -11,7 +11,7 @@
 
 namespace c10 {
 
-struct CAFFE2_API InternedStrings {
+struct TORCH_API InternedStrings {
   InternedStrings();
   Symbol symbol(const std::string& s);
   std::pair<const char*, const char*> string(Symbol sym);
diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 51b6d082..85e176b 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -33,7 +33,7 @@
               expected_type->repr_str());
 }
 
-CAFFE2_API c10::intrusive_ptr<ConstantString> ConstantString::create(
+TORCH_API c10::intrusive_ptr<ConstantString> ConstantString::create(
     std::string str_) {
   return c10::make_intrusive<ConstantString>(std::move(str_));
 }
@@ -887,7 +887,7 @@
   return classConverter;
 }
 
-CAFFE2_API intrusive_ptr<ivalue::Future> collectAll(
+TORCH_API intrusive_ptr<ivalue::Future> collectAll(
     List<intrusive_ptr<ivalue::Future>> srcs) {
   struct Ctx {
     explicit Ctx(List<intrusive_ptr<ivalue::Future>> srcs)
@@ -919,7 +919,7 @@
   return ctx->dstFuture;
 }
 
-CAFFE2_API intrusive_ptr<ivalue::Future> collectAny(
+TORCH_API intrusive_ptr<ivalue::Future> collectAny(
     List<intrusive_ptr<ivalue::Future>> srcs) {
   if (srcs.empty()) {
     auto res = make_intrusive<ivalue::Future>(NoneType::get());
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 152c28d..4a7e15c 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -157,7 +157,7 @@
 ///   // `my_ivalue` is tagged as an int and cannot be used as another type
 ///   torch::Tensor my_tensor = my_ivalue.toTensor()
 /// \endrst
-struct CAFFE2_API IValue final {
+struct TORCH_API IValue final {
   IValue(const IValue& rhs)
       : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) {
     if (is_intrusive_ptr) {
@@ -744,7 +744,7 @@
   // This is different from `repr()` in that there is no expectation that we can
   // exactly reconstruct an IValue from the output; feel free to use a
   // concise/pretty form
-  CAFFE2_API friend std::ostream& operator<<(
+  TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
       const IValue& v);
 
@@ -847,7 +847,7 @@
   friend struct WeakIValue;
 };
 
-struct CAFFE2_API WeakIValue final {
+struct TORCH_API WeakIValue final {
   WeakIValue() : payload{0}, tag(IValue::Tag::None), is_intrusive_ptr(false) {}
 
   WeakIValue(const WeakIValue& rhs)
diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h
index a1e0491..9aab8e6 100644
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@@ -180,14 +180,14 @@
 
 namespace ivalue {
 
-void CAFFE2_API
+void TORCH_API
 checkCustomClassType(const Type* expected_type, const Type* actual_type);
 
 template <typename T>
 using Shared = c10::intrusive_ptr<T>;
 
 // string
-struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target {
+struct TORCH_API ConstantString final : c10::intrusive_ptr_target {
  private:
   const std::string str_;
 
@@ -200,14 +200,14 @@
   operator const std::string&() const {
     return string();
   }
-  CAFFE2_API friend std::ostream& operator<<(
+  TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
       const ConstantString& v);
 };
 
 struct Future;
 
-struct CAFFE2_API Tuple : c10::intrusive_ptr_target {
+struct TORCH_API Tuple : c10::intrusive_ptr_target {
  private:
   std::vector<IValue> elements_;
   mutable std::shared_ptr<TupleType>
@@ -254,7 +254,7 @@
     return c10::get_hash(t.elements());
   }
 
-  CAFFE2_API friend bool operator==(
+  TORCH_API friend bool operator==(
       const ivalue::Tuple& lhs,
       const ivalue::Tuple& rhs);
 
@@ -283,7 +283,7 @@
 
  public:
   explicit Future(TypePtr type) : type_(type) {}
-  struct CAFFE2_API FutureError final : public std::exception {
+  struct TORCH_API FutureError final : public std::exception {
     explicit FutureError(std::string&& error_msg_)
         : error_msg(std::move(error_msg_)) {}
 
@@ -485,7 +485,7 @@
     return eptr_;
   }
 
-  CAFFE2_API friend std::ostream& operator<<(
+  TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
       const Future& v);
 
@@ -573,11 +573,11 @@
 
 // Input is a list of Futures with the same target type.
 // Output is a Future to the List of completed Futures.
-CAFFE2_API intrusive_ptr<ivalue::Future> collectAll(
+TORCH_API intrusive_ptr<ivalue::Future> collectAll(
     c10::List<c10::intrusive_ptr<ivalue::Future>> srcs);
 // Input is a List of Futures with the same target type.
 // Output is a Future that will be updated with a seen value.
-CAFFE2_API intrusive_ptr<ivalue::Future> collectAny(
+TORCH_API intrusive_ptr<ivalue::Future> collectAny(
     c10::List<c10::intrusive_ptr<ivalue::Future>> srcs);
 
 // User-defined object.
@@ -692,11 +692,11 @@
       const ivalue::EnumHolder& lhs,
       const ivalue::EnumHolder& rhs);
 
-  CAFFE2_API friend std::ostream& operator<<(
+  TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
       const EnumHolder& v);
 
-  CAFFE2_API const std::string qualifiedClassName() const;
+  TORCH_API const std::string qualifiedClassName() const;
 
   const std::string unqualifiedClassName() const;
 
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index f0c93ca..ae06a8a 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -67,7 +67,7 @@
 #undef DEFINE_TYPE
 };
 
-CAFFE2_API const char* typeKindToString(TypeKind kind);
+TORCH_API const char* typeKindToString(TypeKind kind);
 
 struct Type;
 using TypePtr = std::shared_ptr<Type>;
@@ -79,7 +79,7 @@
 using TypePrinter =
     std::function<c10::optional<std::string>(const ConstTypePtr&)>;
 
-struct CAFFE2_API Type : std::enable_shared_from_this<Type> {
+struct TORCH_API Type : std::enable_shared_from_this<Type> {
  private:
   TypeKind kind_;
 
@@ -212,7 +212,7 @@
 using AnyTypePtr = std::shared_ptr<AnyType>;
 // Any is the top of the type hierarchy, all other types are subtypes
 // T <: Any, forall T
-struct CAFFE2_API AnyType : public Type {
+struct TORCH_API AnyType : public Type {
   static AnyTypePtr create() {
     return AnyTypePtr(
         new AnyType()); // NOLINT(modernize-make-shared)
@@ -284,7 +284,7 @@
 // 1. Optional[T] <: Optional[R] iff T <: R
 // 2. T <: Optional[R] if T <: R
 // 3. None <: Optional[T] for all T
-struct CAFFE2_API OptionalType
+struct TORCH_API OptionalType
     : public SingleElementType<TypeKind::OptionalType, OptionalType> {
   static OptionalTypePtr create(TypePtr element) {
     TORCH_INTERNAL_ASSERT(element, "OptionalType requires valid TypePtr");
@@ -356,7 +356,7 @@
 // `stride_indices` A contiguity marker on the smallest stride (c0) indicates
 // the stride is precisely 1, otherwise a contiguity marker means that $stride_n
 // = size_{n-1}*stride_{n-1}$
-struct CAFFE2_API Stride {
+struct TORCH_API Stride {
   Stride() {}
   Stride(
       const c10::optional<size_t>& stride_index,
@@ -401,7 +401,7 @@
   return r;
 }
 
-struct CAFFE2_API ShapeSymbol {
+struct TORCH_API ShapeSymbol {
   // needed for use in `std::map`
   ShapeSymbol() : value_(-1) {}
   // is this symbol a fixed/static dimension
@@ -426,7 +426,7 @@
   static ShapeSymbol newSymbol() {
     return fromStaticSize(-static_cast<int64_t>(++num_symbols));
   };
-  friend CAFFE2_API std::ostream& operator<<(
+  friend TORCH_API std::ostream& operator<<(
       std::ostream& os,
       const ShapeSymbol& s);
 
@@ -447,7 +447,7 @@
 
 // Shape of a Tensor represented with ShapeSymbol's. Unranked, ranked unknown
 // dims, partially known and fully known shapes are all supported.
-struct CAFFE2_API SymbolicShape {
+struct TORCH_API SymbolicShape {
   // Unranked shape constructor.
   SymbolicShape() : dims_(c10::nullopt) {}
 
@@ -576,7 +576,7 @@
     return dims_;
   }
 
-  CAFFE2_API VaryingShape merge(const VaryingShape& other) const;
+  TORCH_API VaryingShape merge(const VaryingShape& other) const;
 
   c10::optional<std::vector<T>> concrete_sizes() const {
     if (!dims_) {
@@ -611,7 +611,7 @@
 struct TensorType;
 using TensorTypePtr = std::shared_ptr<TensorType>;
 // This type represents a single Tensor with a specific size
-struct CAFFE2_API TensorType : public Type {
+struct TORCH_API TensorType : public Type {
   static TensorTypePtr create(const at::Tensor& t);
 
   // used by TensorType::create(size_t dim) which in turn used by
@@ -864,7 +864,7 @@
 
 struct ListType;
 using ListTypePtr = std::shared_ptr<ListType>;
-struct CAFFE2_API ListType
+struct TORCH_API ListType
     : public SingleElementType<TypeKind::ListType, ListType> {
   // It's not exactly a singleton, but there should be exactly one instance of
   // List[T] for every T
@@ -906,7 +906,7 @@
 
 struct DictType;
 using DictTypePtr = std::shared_ptr<DictType>;
-struct CAFFE2_API DictType : public Type {
+struct TORCH_API DictType : public Type {
   friend struct Type;
   static const TypeKind Kind = TypeKind::DictType;
 
@@ -988,7 +988,7 @@
 struct FutureType;
 using FutureTypePtr = std::shared_ptr<FutureType>;
 
-struct CAFFE2_API FutureType
+struct TORCH_API FutureType
     : public SingleElementType<TypeKind::FutureType, FutureType> {
   friend struct Type;
   template <typename... T>
@@ -1030,7 +1030,7 @@
 struct RRefType;
 using RRefTypePtr = std::shared_ptr<RRefType>;
 
-struct CAFFE2_API RRefType
+struct TORCH_API RRefType
     : public SingleElementType<TypeKind::RRefType, RRefType> {
   friend struct Type;
   template <typename... T>
@@ -1064,7 +1064,7 @@
 using NamedTypePtr = std::shared_ptr<NamedType>;
 using ConstNamedTypePtr = std::shared_ptr<const NamedType>;
 
-struct CAFFE2_API NamedType : public Type {
+struct TORCH_API NamedType : public Type {
   NamedType(TypeKind tk, c10::optional<QualifiedName> name)
       : Type(tk), name_(std::move(name)) {
     TORCH_INTERNAL_ASSERT(
@@ -1091,7 +1091,7 @@
 // static types in named types to reconstruct type tags of loaded
 // values. Lifting this restriction requires solving the serialization
 // problem first.
-CAFFE2_API void checkNoAny(
+TORCH_API void checkNoAny(
     const Type& base,
     const char* what,
     const std::string& attrname,
@@ -1101,7 +1101,7 @@
 using TupleTypePtr = std::shared_ptr<TupleType>;
 using NameList = std::vector<std::string>;
 // This type represents a Tuple
-struct CAFFE2_API TupleType : public NamedType {
+struct TORCH_API TupleType : public NamedType {
   static TupleTypePtr createNamed(const c10::optional<c10::QualifiedName>& name,
       const std::vector<std::string>& field_names,
       const std::vector<TypePtr>& types);
@@ -1172,7 +1172,7 @@
 struct EnumType;
 using EnumTypePtr = std::shared_ptr<EnumType>;
 using EnumNameValue = std::pair<std::string, IValue>;
-struct CAFFE2_API EnumType : public NamedType {
+struct TORCH_API EnumType : public NamedType {
   friend struct Type;
   static const TypeKind Kind = TypeKind::EnumType;
 
@@ -1258,7 +1258,7 @@
 // EnumType <: AnyEnumType for all Enums
 struct AnyEnumType;
 using AnyEnumTypePtr = std::shared_ptr<AnyEnumType>;
-struct CAFFE2_API AnyEnumType : public Type {
+struct TORCH_API AnyEnumType : public Type {
   static AnyEnumTypePtr create() {
     return AnyEnumTypePtr(
         new AnyEnumType()); // NOLINT(modernize-make-shared)
@@ -1284,7 +1284,7 @@
 // Subtype hierarchy for Number Types (NumberType as the base type):
 // IntType <: NumberType
 // FloatType <: NumberType
-struct CAFFE2_API NumberType : public Type {
+struct TORCH_API NumberType : public Type {
   static NumberTypePtr create() {
     return NumberTypePtr(new NumberType()); // NOLINT(modernize-make-shared)
   }
@@ -1311,7 +1311,7 @@
 struct FloatType;
 using FloatTypePtr = std::shared_ptr<FloatType>;
 // This type represents a Python float number
-struct CAFFE2_API FloatType : public NumberType {
+struct TORCH_API FloatType : public NumberType {
   static FloatTypePtr create() {
     return FloatTypePtr(new FloatType()); // NOLINT(modernize-make-shared)
   }
@@ -1338,7 +1338,7 @@
 struct IntType;
 using IntTypePtr = std::shared_ptr<IntType>;
 // This type represents a Python int number
-struct CAFFE2_API IntType : public NumberType {
+struct TORCH_API IntType : public NumberType {
   static IntTypePtr create() {
     return IntTypePtr(new IntType()); // NOLINT(modernize-make-shared)
   }
@@ -1365,7 +1365,7 @@
 struct BoolType;
 using BoolTypePtr = std::shared_ptr<BoolType>;
 // This node represents a Python bool value
-struct CAFFE2_API BoolType : public Type {
+struct TORCH_API BoolType : public Type {
   static BoolTypePtr create() {
     return BoolTypePtr(new BoolType());
   }
@@ -1386,7 +1386,7 @@
 struct StringType;
 using StringTypePtr = std::shared_ptr<StringType>;
 // This type represents a Python string
-struct CAFFE2_API StringType : public Type {
+struct TORCH_API StringType : public Type {
   static StringTypePtr create() {
     return StringTypePtr(new StringType()); // NOLINT(modernize-make-shared)
   }
@@ -1410,7 +1410,7 @@
 
 struct StorageType;
 using StorageTypePtr = std::shared_ptr<StorageType>;
-struct CAFFE2_API StorageType : public Type {
+struct TORCH_API StorageType : public Type {
   static StorageTypePtr create() {
     return StorageTypePtr(new StorageType()); // NOLINT(modernize-make-shared)
   }
@@ -1433,7 +1433,7 @@
 
 struct FunctionType;
 using FunctionTypePtr = std::shared_ptr<FunctionType>;
-struct CAFFE2_API FunctionType : public NamedType {
+struct TORCH_API FunctionType : public NamedType {
   static FunctionTypePtr create(torch::jit::Function* function) {
     return FunctionTypePtr(
         new FunctionType(function)); // NOLINT(modernize-make-shared)
@@ -1465,7 +1465,7 @@
 struct NoneType;
 using NoneTypePtr = std::shared_ptr<NoneType>;
 // This type represents a Python None
-struct CAFFE2_API NoneType : public Type {
+struct TORCH_API NoneType : public Type {
   static NoneTypePtr create() {
     return NoneTypePtr(new NoneType()); // NOLINT(modernize-make-shared)
   }
@@ -1492,7 +1492,7 @@
 struct GeneratorType;
 using GeneratorTypePtr = std::shared_ptr<GeneratorType>;
 // This type represents a Generator
-struct CAFFE2_API GeneratorType : public Type {
+struct TORCH_API GeneratorType : public Type {
   static GeneratorTypePtr create() {
     return GeneratorTypePtr(
         new GeneratorType()); // NOLINT(modernize-make-shared)
@@ -1514,7 +1514,7 @@
 struct QuantizerType;
 using QuantizerTypePtr = std::shared_ptr<QuantizerType>;
 // This type represents a Quantizer
-struct CAFFE2_API QuantizerType : public Type {
+struct TORCH_API QuantizerType : public Type {
   static QuantizerTypePtr create() {
     return QuantizerTypePtr(
         new QuantizerType()); // NOLINT(modernize-make-shared)
@@ -1536,7 +1536,7 @@
 struct QSchemeType;
 using QSchemeTypePtr = std::shared_ptr<QSchemeType>;
 // This type represents a QScheme
-struct CAFFE2_API QSchemeType : public Type {
+struct TORCH_API QSchemeType : public Type {
   static QSchemeTypePtr create() {
     return QSchemeTypePtr(
         new QSchemeType()); // NOLINT(modernize-make-shared)
@@ -1558,7 +1558,7 @@
 struct DeviceObjType;
 using DeviceObjTypePtr = std::shared_ptr<DeviceObjType>;
 // This type represents a Device
-struct CAFFE2_API DeviceObjType : public Type {
+struct TORCH_API DeviceObjType : public Type {
   static DeviceObjTypePtr create() {
     return DeviceObjTypePtr(
         new DeviceObjType()); // NOLINT(modernize-make-shared)
@@ -1580,7 +1580,7 @@
 struct StreamObjType;
 using StreamObjTypePtr = std::shared_ptr<StreamObjType>;
 // This type represents a Generator
-struct CAFFE2_API StreamObjType : public Type {
+struct TORCH_API StreamObjType : public Type {
   static StreamObjTypePtr create() {
     return StreamObjTypePtr(
       new StreamObjType()); // NOLINT(modernize-make-shared)
@@ -1630,7 +1630,7 @@
 using CapsuleTypePtr = std::shared_ptr<CapsuleType>;
 // This type represents a Python Capsule.
 // It does not appear in the IR and is only used during runtime
-struct CAFFE2_API CapsuleType : public Type {
+struct TORCH_API CapsuleType : public Type {
   static CapsuleTypePtr create() {
     return CapsuleTypePtr(new CapsuleType()); // NOLINT(modernize-make-shared)
   }
@@ -1651,7 +1651,7 @@
 struct PyObjectType;
 using PyObjectTypePtr = std::shared_ptr<PyObjectType>;
 // This type represents a PyObject Type
-struct CAFFE2_API PyObjectType : public Type {
+struct TORCH_API PyObjectType : public Type {
   static PyObjectTypePtr create() {
     return PyObjectTypePtr(new PyObjectType()); // NOLINT(modernize-make-shared)
   }
@@ -1677,16 +1677,16 @@
   Default = Full,
 };
 
-CAFFE2_API TypeVerbosity type_verbosity();
+TORCH_API TypeVerbosity type_verbosity();
 
-CAFFE2_API std::ostream& operator<<(std::ostream& out, const Type& t);
+TORCH_API std::ostream& operator<<(std::ostream& out, const Type& t);
 template <typename T>
-CAFFE2_API std::ostream& operator<<(
+TORCH_API std::ostream& operator<<(
     std::ostream& out,
     const VaryingShape<T>& t);
-CAFFE2_API std::ostream& operator<<(std::ostream& os, const SymbolicShape& s);
-CAFFE2_API std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s);
-CAFFE2_API std::ostream& operator<<(std::ostream& os, const Stride& s);
+TORCH_API std::ostream& operator<<(std::ostream& os, const SymbolicShape& s);
+TORCH_API std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s);
+TORCH_API std::ostream& operator<<(std::ostream& os, const Stride& s);
 // what is the type, ignoring extra size/shape information?
 // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...)
 
@@ -1738,12 +1738,12 @@
 // Two different tensortypes will return dynamic.
 // Currently we chose not to support returning a NumberType for a float & int
 // input because of a lack of operator support for NumberType
-CAFFE2_API c10::optional<TypePtr> unifyTypes(
+TORCH_API c10::optional<TypePtr> unifyTypes(
     const TypePtr& t1,
     const TypePtr& t2,
     bool default_to_any = false);
 
-CAFFE2_API c10::optional<TypePtr> unifyTypeList(
+TORCH_API c10::optional<TypePtr> unifyTypeList(
     at::ArrayRef<TypePtr> elements,
     std::ostream& why_not);
 
@@ -1963,15 +1963,15 @@
 // note: It is possible to successfully match a formal, but for type variables
 // in the formal to still not be defined. In particular, None matches Optional[T]
 // but does not define the value of T.
-CAFFE2_API MatchTypeReturn
+TORCH_API MatchTypeReturn
 matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv& type_env);
 
 // replace type variables appearing in `type` with the values in
 // `type_env`. Returns nullptr if a variable used in `type`
 // does not appear in `type_env`
-CAFFE2_API TypePtr tryEvalTypeVariables(TypePtr type, TypeEnv& type_env);
+TORCH_API TypePtr tryEvalTypeVariables(TypePtr type, TypeEnv& type_env);
 
-CAFFE2_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type);
+TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type);
 
 // This enumerator represents the 'kind' of an attribute - a buffer, a paramter, or neither.
 // This state is mutually exclusive. Buffers and Parameters can only appear on modules.
@@ -1983,7 +1983,7 @@
 
 // This structure represents all notional booking entities in a class attribute: name, kind (see: AttributeKind), and type (see: TypePtr).
 // Note: This structure does not represent the value of the attribute.
-struct CAFFE2_API ClassAttribute {
+struct TORCH_API ClassAttribute {
   public:
   ClassAttribute(AttributeKind kind,
   TypePtr attributeType,
@@ -2019,7 +2019,7 @@
 using ::torch::jit::CompilationUnit;
 
 // This represents a class in TorchScript.
-struct CAFFE2_API ClassType : public NamedType {
+struct TORCH_API ClassType : public NamedType {
   // This represents an attribute of a class; a name associated with an attribute, and a
   // getter and (optional) setter for that attribute.
   struct Property {
@@ -2377,7 +2377,7 @@
 // lhs (ClassType or InterfaceType) is a subtype of rhs if:
 // 1. lhs methods are a superset of rhs methods
 // 2. if rhs is module interface, the lhs must be module interface or module itself
-struct CAFFE2_API InterfaceType : public NamedType {
+struct TORCH_API InterfaceType : public NamedType {
   static InterfaceTypePtr create(
       QualifiedName qualifiedName, bool is_module=false);
 
@@ -2441,7 +2441,7 @@
 struct LayoutType;
 using LayoutTypePtr = std::shared_ptr<LayoutType>;
 // This type represents a Generator
-struct CAFFE2_API LayoutType : public EnumerationType<TypeKind::LayoutType> {
+struct TORCH_API LayoutType : public EnumerationType<TypeKind::LayoutType> {
 static LayoutTypePtr create() {
 return LayoutTypePtr(
     new LayoutType()); // NOLINT(modernize-make-shared)
@@ -2460,7 +2460,7 @@
 struct ScalarTypeType;
 using ScalarTypeTypePtr = std::shared_ptr<ScalarTypeType>;
 // This type represents a Generator
-struct CAFFE2_API ScalarTypeType : public EnumerationType<TypeKind::ScalarTypeType> {
+struct TORCH_API ScalarTypeType : public EnumerationType<TypeKind::ScalarTypeType> {
 static ScalarTypeTypePtr create() {
 return ScalarTypeTypePtr(
     new ScalarTypeType()); // NOLINT(modernize-make-shared)
@@ -2480,7 +2480,7 @@
 // List[T] <: AnyList for all T
 struct AnyListType;
 using AnyListTypePtr = std::shared_ptr<AnyListType>;
-struct CAFFE2_API AnyListType : public Type {
+struct TORCH_API AnyListType : public Type {
   static AnyListTypePtr create() {
     return AnyListTypePtr(
         new AnyListType()); // NOLINT(modernize-make-shared)
@@ -2503,7 +2503,7 @@
 // Tuple[T...] <: AnyTuple for all T
 struct AnyTupleType;
 using AnyTupleTypePtr = std::shared_ptr<AnyTupleType>;
-struct CAFFE2_API AnyTupleType : public Type {
+struct TORCH_API AnyTupleType : public Type {
   static AnyTupleTypePtr create() {
     return AnyTupleTypePtr(
         new AnyTupleType()); // NOLINT(modernize-make-shared)
@@ -2528,7 +2528,7 @@
 // ClassType <: AnyClassType for all classes
 struct AnyClassType;
 using AnyClassTypePtr = std::shared_ptr<AnyClassType>;
-struct CAFFE2_API AnyClassType : public Type {
+struct TORCH_API AnyClassType : public Type {
   static AnyClassTypePtr create() {
     return AnyClassTypePtr(
         new AnyClassType()); // NOLINT(modernize-make-shared)
diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h
index f0746ef..17bf6bb 100644
--- a/aten/src/ATen/core/op_registration/infer_schema.h
+++ b/aten/src/ATen/core/op_registration/infer_schema.h
@@ -153,6 +153,6 @@
   return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn<guts::infer_function_traits_t<FuncType>>(std::move(name), std::move(overload_name));
 }
 
-CAFFE2_API c10::optional<std::string> findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified);
+TORCH_API c10::optional<std::string> findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified);
 
 }
diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h
index 0045221..2d68ff0 100644
--- a/aten/src/ATen/core/op_registration/op_registration.h
+++ b/aten/src/ATen/core/op_registration/op_registration.h
@@ -43,7 +43,7 @@
  * >         .schema("my_op")
  * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
  */
-class CAFFE2_API RegisterOperators final {
+class TORCH_API RegisterOperators final {
 public:
   RegisterOperators();
   ~RegisterOperators();
@@ -53,7 +53,7 @@
   RegisterOperators(RegisterOperators&&) noexcept;
   RegisterOperators& operator=(RegisterOperators&&) noexcept;
 
-  class CAFFE2_API Options final {
+  class TORCH_API Options final {
   public:
     Options(const Options&) = delete;
     Options(Options&&) noexcept = delete;
diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h
index b120a07..2a92697 100644
--- a/aten/src/ATen/core/operator_name.h
+++ b/aten/src/ATen/core/operator_name.h
@@ -72,8 +72,8 @@
   return !operator==(lhs, rhs);
 }
 
-CAFFE2_API std::string toString(const OperatorName& opName);
-CAFFE2_API std::ostream& operator<<(std::ostream&, const OperatorName&);
+TORCH_API std::string toString(const OperatorName& opName);
+TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
 
 } // namespace c10
 
diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index d84dc5e..fa2e85b 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -469,7 +469,7 @@
 }
 
 // change return types like List[List[t]] into List[List[int]]
-CAFFE2_API TypePtr tryEvalTypeVariables(TypePtr type, std::unordered_map<std::string, TypePtr>& type_env) {
+TORCH_API TypePtr tryEvalTypeVariables(TypePtr type, std::unordered_map<std::string, TypePtr>& type_env) {
   if (!type->hasFreeVariables()) {
     return type;
   }
@@ -494,7 +494,7 @@
   }
 }
 
-CAFFE2_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
+TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
   if (elem_type->kind() == OptionalType::Kind ||
       elem_type->kind() == NumberType::Kind) {
     // Builtin Union types
diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h
index f57ad03..af4eb6f 100644
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@@ -67,7 +67,7 @@
 // TODO: Consider putting the stub definitions in another class, so that one
 // never forgets to implement each virtual function in the real implementation
 // in CUDAHooks.  This probably doesn't buy us much though.
-struct CAFFE2_API CUDAHooksInterface {
+struct TORCH_API CUDAHooksInterface {
   // This should never actually be implemented, but it is used to
   // squelch -Werror=non-virtual-dtor
   virtual ~CUDAHooksInterface() {}
@@ -185,13 +185,13 @@
 
 // NB: dummy argument to suppress "ISO C++11 requires at least one argument
 // for the "..." in a variadic macro"
-struct CAFFE2_API CUDAHooksArgs {};
+struct TORCH_API CUDAHooksArgs {};
 
 C10_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs);
 #define REGISTER_CUDA_HOOKS(clsname) \
   C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname)
 
 namespace detail {
-CAFFE2_API const CUDAHooksInterface& getCUDAHooks();
+TORCH_API const CUDAHooksInterface& getCUDAHooks();
 } // namespace detail
 } // namespace at
diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h
index e5099a8..876d7ca 100644
--- a/aten/src/ATen/detail/HIPHooksInterface.h
+++ b/aten/src/ATen/detail/HIPHooksInterface.h
@@ -24,7 +24,7 @@
 // which we may want to call into from CPU code (and thus must be dynamically
 // dispatched, to allow for separate compilation of HIP code).  See
 // CUDAHooksInterface for more detailed motivation.
-struct CAFFE2_API HIPHooksInterface {
+struct TORCH_API HIPHooksInterface {
   // This should never actually be implemented, but it is used to
   // squelch -Werror=non-virtual-dtor
   virtual ~HIPHooksInterface() {}
@@ -61,14 +61,14 @@
 
 // NB: dummy argument to suppress "ISO C++11 requires at least one argument
 // for the "..." in a variadic macro"
-struct CAFFE2_API HIPHooksArgs {};
+struct TORCH_API HIPHooksArgs {};
 
 C10_DECLARE_REGISTRY(HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs);
 #define REGISTER_HIP_HOOKS(clsname) \
   C10_REGISTER_CLASS(HIPHooksRegistry, clsname, clsname)
 
 namespace detail {
-CAFFE2_API const HIPHooksInterface& getHIPHooks();
+TORCH_API const HIPHooksInterface& getHIPHooks();
 
 } // namespace detail
 } // namespace at
diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h
index 0368fa9..b5de0f5 100644
--- a/aten/src/ATen/native/DispatchStub.h
+++ b/aten/src/ATen/native/DispatchStub.h
@@ -59,10 +59,10 @@
 CPUCapability get_cpu_capability();
 
 template <typename FnPtr, typename T>
-struct CAFFE2_API DispatchStub;
+struct TORCH_API DispatchStub;
 
 template <typename rT, typename T, typename... Args>
-struct CAFFE2_API DispatchStub<rT (*)(Args...), T> {
+struct TORCH_API DispatchStub<rT (*)(Args...), T> {
   using FnPtr = rT (*) (Args...);
 
   DispatchStub() = default;
@@ -167,7 +167,7 @@
     name(const name&) = delete;            \
     name& operator=(const name&) = delete; \
   };                                       \
-  extern CAFFE2_API struct name name
+  extern TORCH_API struct name name
 
 #define DEFINE_DISPATCH(name) struct name name
 
diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h
index d3d8faf..bde91c6 100644
--- a/aten/src/ATen/native/Resize.h
+++ b/aten/src/ATen/native/Resize.h
@@ -14,7 +14,7 @@
 // Issues a warning if the output tensor has one or more elements and
 //   needs resizing
 // NOTE: In the future the warning will become an error
-CAFFE2_API void resize_output(Tensor& output, IntArrayRef shape);
+TORCH_API void resize_output(Tensor& output, IntArrayRef shape);
 
 // These functions are called by native::resize_ as well as (legacy) TH resize.
 // They are not in TH/THTensor.cpp because the at namespace is easier
diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h
index 52aab66..bd38257 100644
--- a/aten/src/ATen/native/SpectralOpsUtils.h
+++ b/aten/src/ATen/native/SpectralOpsUtils.h
@@ -75,6 +75,6 @@
 // self should be the shape of the full signal and dims.back() should be the
 // one-sided dimension.
 // See NOTE [ Fourier Transform Conjugate Symmetry ]
-CAFFE2_API void _fft_fill_with_conjugate_symmetry_(const Tensor& self, IntArrayRef dims);
+TORCH_API void _fft_fill_with_conjugate_symmetry_(const Tensor& self, IntArrayRef dims);
 
 }} // at::native
diff --git a/aten/src/ATen/native/TypeProperties.h b/aten/src/ATen/native/TypeProperties.h
index 2e0c750..85ffed1 100644
--- a/aten/src/ATen/native/TypeProperties.h
+++ b/aten/src/ATen/native/TypeProperties.h
@@ -10,9 +10,9 @@
   c10::ScalarType zeroResult = ScalarType::Undefined;
 };
 
-CAFFE2_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state);
-CAFFE2_API ScalarType result_type(const ResultTypeState& state);
+TORCH_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state);
+TORCH_API ScalarType result_type(const ResultTypeState& state);
 
-CAFFE2_API ScalarType result_type(TensorList tensors);
+TORCH_API ScalarType result_type(TensorList tensors);
 
 }}
diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp
index 3d3b54a..ce397aa 100644
--- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp
+++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp
@@ -21,7 +21,7 @@
  * NOTE: if this is generally useful we may want to move this to its own header.
  */
 template <typename T>
-struct CAFFE2_API IntrusivePtrTargetWrapper : c10::intrusive_ptr_target {
+struct TORCH_API IntrusivePtrTargetWrapper : c10::intrusive_ptr_target {
 private:
   T target_;
 
diff --git a/aten/src/ATen/native/quantized/affine_quantizer.cpp b/aten/src/ATen/native/quantized/affine_quantizer.cpp
index ac2d31b..ecbe1de 100644
--- a/aten/src/ATen/native/quantized/affine_quantizer.cpp
+++ b/aten/src/ATen/native/quantized/affine_quantizer.cpp
@@ -396,7 +396,7 @@
 }
 
 template <typename T>
-CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value) {
+TORCH_API float dequantize_val(double scale, int64_t zero_point, T value) {
   // We need to convert the qint8 value to float to ensure the subtraction
   // subexpression returns a float
   return (static_cast<float>(value.val_) - zero_point) * scale;
@@ -441,67 +441,67 @@
       std::min<int64_t>(std::max<int64_t>(quantize_down, min), max));
 }
 
-template CAFFE2_API qint8
+template TORCH_API qint8
 quantize_val<qint8>(double scale, int64_t zero_point, float value);
-template CAFFE2_API quint8
+template TORCH_API quint8
 quantize_val<quint8>(double scale, int64_t zero_point, float value);
-template CAFFE2_API qint32
+template TORCH_API qint32
 quantize_val<qint32>(double scale, int64_t zero_point, float value);
-template CAFFE2_API void quantize_vec<c10::qint8>(
+template TORCH_API void quantize_vec<c10::qint8>(
     double scale,
     int64_t zero_point,
     const float* src,
     c10::qint8* dst,
     size_t count);
-template CAFFE2_API void quantize_vec<c10::quint8>(
+template TORCH_API void quantize_vec<c10::quint8>(
     double scale,
     int64_t zero_point,
     const float* src,
     c10::quint8* dst,
     size_t count);
-template CAFFE2_API void quantize_vec<c10::qint32, 32>(
+template TORCH_API void quantize_vec<c10::qint32, 32>(
     double scale,
     int64_t zero_point,
     const float* src,
     c10::qint32* dst,
     size_t count);
 
-template CAFFE2_API float dequantize_val<qint8>(
+template TORCH_API float dequantize_val<qint8>(
     double scale,
     int64_t zero_point,
     qint8 value);
-template CAFFE2_API float dequantize_val<quint8>(
+template TORCH_API float dequantize_val<quint8>(
     double scale,
     int64_t zero_point,
     quint8 value);
-template CAFFE2_API float dequantize_val<qint32>(
+template TORCH_API float dequantize_val<qint32>(
     double scale,
     int64_t zero_point,
     qint32 value);
 
-template CAFFE2_API qint8
+template TORCH_API qint8
 requantize_val<qint8, qint8>(double, int64_t, double, int64_t, qint8);
-template CAFFE2_API quint8
+template TORCH_API quint8
 requantize_val<qint8, quint8>(double, int64_t, double, int64_t, qint8);
-template CAFFE2_API qint32
+template TORCH_API qint32
 requantize_val<qint8, qint32>(double, int64_t, double, int64_t, qint8);
-template CAFFE2_API qint8
+template TORCH_API qint8
 requantize_val<quint8, qint8>(double, int64_t, double, int64_t, quint8);
-template CAFFE2_API quint8
+template TORCH_API quint8
 requantize_val<quint8, quint8>(double, int64_t, double, int64_t, quint8);
-template CAFFE2_API qint32
+template TORCH_API qint32
 requantize_val<quint8, qint32>(double, int64_t, double, int64_t, quint8);
-template CAFFE2_API qint8
+template TORCH_API qint8
 requantize_val<qint32, qint8>(double, int64_t, double, int64_t, qint32);
-template CAFFE2_API quint8
+template TORCH_API quint8
 requantize_val<qint32, quint8>(double, int64_t, double, int64_t, qint32);
-template CAFFE2_API qint32
+template TORCH_API qint32
 requantize_val<qint32, qint32>(double, int64_t, double, int64_t, qint32);
 
-template CAFFE2_API qint8 requantize_from_int<qint8>(double, int64_t, int64_t);
-template CAFFE2_API quint8
+template TORCH_API qint8 requantize_from_int<qint8>(double, int64_t, int64_t);
+template TORCH_API quint8
 requantize_from_int<quint8>(double, int64_t, int64_t);
-template CAFFE2_API qint32
+template TORCH_API qint32
 requantize_from_int<qint32>(double, int64_t, int64_t);
 
 } // namespace native
diff --git a/aten/src/ATen/native/quantized/affine_quantizer.h b/aten/src/ATen/native/quantized/affine_quantizer.h
index 670b119..d583106 100644
--- a/aten/src/ATen/native/quantized/affine_quantizer.h
+++ b/aten/src/ATen/native/quantized/affine_quantizer.h
@@ -113,7 +113,7 @@
 
 // Quantize a float value into a uint value given scale and zero_point
 template <typename T>
-CAFFE2_API T quantize_val(double scale, int64_t zero_point, float value);
+TORCH_API T quantize_val(double scale, int64_t zero_point, float value);
 // TODO combine this with quantize_val once the numerics for ARM are aligned
 // with it
 uint8_t quantize_val_arm(
@@ -128,34 +128,34 @@
     T* dst,
     size_t count = 8);
 template <typename T>
-CAFFE2_API Tensor quantize_tensor(
+TORCH_API Tensor quantize_tensor(
     Tensor rtensor,
     Tensor qtensor,
     double scale,
     int64_t zero_point);
 template <typename T>
-CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value);
+TORCH_API float dequantize_val(double scale, int64_t zero_point, T value);
 template <typename T>
-CAFFE2_API float dequantize_vec(
+TORCH_API float dequantize_vec(
     double scale,
     int64_t zero_point,
     const T* src,
     float* dst,
     size_t count = 8);
 template <typename T>
-CAFFE2_API Tensor dequantize_tensor(
+TORCH_API Tensor dequantize_tensor(
     Tensor qtensor,
     Tensor rtensor,
     double scale,
     int64_t zero_point);
 template <typename SRC_T, typename DST_T>
-CAFFE2_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src);
+TORCH_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src);
 
 // Given a multiplier and a zero_point, requantize int32_t computed values back
 // to quantized values. See comment above
 // make_per_tensor_affine_quantizer function for the usage of int64_t
 template <typename DST_T>
-CAFFE2_API DST_T
+TORCH_API DST_T
 requantize_from_int(double multiplier, int64_t zero_point, int64_t src);
 
 int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax);
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
index 19e98ca..f25a301 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -357,7 +357,7 @@
 #endif // USE_FBGEMM
 
     template <int kSpatialDim = 2>
-    CAFFE2_API torch::class_<ConvPackedParamsBase<kSpatialDim>>
+    TORCH_API torch::class_<ConvPackedParamsBase<kSpatialDim>>
     register_conv_params() {
   static auto register_conv_params =
     torch::class_<ConvPackedParamsBase<kSpatialDim>>(
@@ -397,9 +397,9 @@
 }
 
 template
-CAFFE2_API torch::class_<ConvPackedParamsBase<2>> register_conv_params<2>();
+TORCH_API torch::class_<ConvPackedParamsBase<2>> register_conv_params<2>();
 template
-CAFFE2_API torch::class_<ConvPackedParamsBase<3>> register_conv_params<3>();
+TORCH_API torch::class_<ConvPackedParamsBase<3>> register_conv_params<3>();
 
 torch::class_<LinearPackedParamsBase> register_linear_params() {
   using SerializationType = std::tuple<at::Tensor, c10::optional<at::Tensor>>;
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
index b4cff64..916bf03 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -20,7 +20,7 @@
 // of the A rows. The column offsets are needed for the asymmetric quantization
 // (affine quantization) of input matrix.
 // Note that in JIT mode we can think of a way to fuse col_offsets with bias.
-struct CAFFE2_API PackedLinearWeight : public LinearPackedParamsBase {
+struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
   PackedLinearWeight(
       std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w,
       c10::optional<at::Tensor> bias,
@@ -74,7 +74,7 @@
   at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false);
 };
 
-struct CAFFE2_API PackedLinearWeightFp16 : public LinearPackedParamsBase {
+struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase {
   PackedLinearWeightFp16(
       std::unique_ptr<fbgemm::PackedGemmMatrixFP16> w,
       c10::optional<at::Tensor> bias)
@@ -117,7 +117,7 @@
 };
 
 template <int kSpatialDim = 2>
-struct CAFFE2_API PackedConvWeight : public ConvPackedParamsBase<kSpatialDim> {
+struct TORCH_API PackedConvWeight : public ConvPackedParamsBase<kSpatialDim> {
   PackedConvWeight(
       std::unique_ptr<fbgemm::PackWeightsForConv<kSpatialDim>> w,
       c10::optional<at::Tensor> bias,
@@ -306,7 +306,7 @@
 
 #endif // USE_FBGEMM
 
-struct CAFFE2_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
+struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
   PackedEmbeddingBagWeight(
       at::Tensor packed_w,
       std::vector<float> w_scale,
diff --git a/aten/src/ATen/quantized/QTensorImpl.h b/aten/src/ATen/quantized/QTensorImpl.h
index 1bd859e..9c5db9f 100644
--- a/aten/src/ATen/quantized/QTensorImpl.h
+++ b/aten/src/ATen/quantized/QTensorImpl.h
@@ -13,7 +13,7 @@
  *
  * We'll use QTensor in code or documentation to refer to a Tensor with QTensorImpl.
  */
-struct CAFFE2_API QTensorImpl : public c10::TensorImpl {
+struct TORCH_API QTensorImpl : public c10::TensorImpl {
  public:
   QTensorImpl(
       Storage&& storage,
diff --git a/aten/src/ATen/quantized/Quantizer.h b/aten/src/ATen/quantized/Quantizer.h
index c5c63c6..1c740b7 100644
--- a/aten/src/ATen/quantized/Quantizer.h
+++ b/aten/src/ATen/quantized/Quantizer.h
@@ -24,7 +24,7 @@
  * the quantized value. For example, affine quantizer is
  * the most commonly used scheme in this category.
  */
-struct CAFFE2_API UniformQuantizer : public Quantizer {
+struct TORCH_API UniformQuantizer : public Quantizer {
   explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
 };
 
@@ -33,7 +33,7 @@
  * These quantization scheme may map float value non-uniformly to the quantized
  * value. K-means quantization is a representative example in this category.
  */
-struct CAFFE2_API NonUniformQuantizer : public Quantizer {
+struct TORCH_API NonUniformQuantizer : public Quantizer {
   explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
 };
 
@@ -47,7 +47,7 @@
  * For dequantize:
  * X = (Y - zero_point) * scale
  */
-struct CAFFE2_API AffineQuantizer : public UniformQuantizer {
+struct TORCH_API AffineQuantizer : public UniformQuantizer {
   explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
 };
 
@@ -58,7 +58,7 @@
  * PerTensorAffineQuantizer stores a scale and a zero_point, which is used for
  * all the values in the Tensor.
  */
-struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer {
+struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
   explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
     : AffineQuantizer(scalar_type),
         scale_(scale),
@@ -107,7 +107,7 @@
  * processors since it requires each multiplication result within a single
  * dot-product to have a different scale.
  */
-struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer {
+struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
   explicit PerChannelAffineQuantizer(
       ScalarType scalar_type,
       Tensor scales,
@@ -169,7 +169,7 @@
  * be exactly represented in the quantized space. We can get additional precision by
  * using floating point values for zero point.
  */
-struct CAFFE2_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
+struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
   explicit PerChannelAffineFloatQParamsQuantizer(
       ScalarType scalar_type,
       Tensor scales,
@@ -205,26 +205,26 @@
 // setters/getters for QTensorImpl fields; otherwise, you should use
 // the low level setters/getters that were implemented using this.
 // This may be called repeatedly, so make sure it's pretty cheap.
-CAFFE2_API QTensorImpl* get_qtensorimpl(const Tensor& self);
+TORCH_API QTensorImpl* get_qtensorimpl(const Tensor& self);
 
 // double and int64_t are because of the native function API, we only have these
 // argument types right now in native functions
-CAFFE2_API QuantizerPtr
+TORCH_API QuantizerPtr
 make_per_tensor_affine_quantizer(
     double scale, int64_t zero_point, ScalarType scalar_type);
 
-CAFFE2_API QuantizerPtr make_per_channel_affine_quantizer(
+TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
     const Tensor& scales,
     const Tensor& zero_points,
     int64_t axis,
     ScalarType scalar_type);
 
 // Create a Quantized Tensor given arguments for normal Tensor and a quantizer
-CAFFE2_API Tensor new_qtensor(
+TORCH_API Tensor new_qtensor(
     IntArrayRef sizes,
     const TensorOptions& options,
     QuantizerPtr quantizer);
 
-CAFFE2_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);
+TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);
 
 } // namespace at
diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h
index e993966..96b2215 100644
--- a/aten/src/ATen/record_function.h
+++ b/aten/src/ATen/record_function.h
@@ -10,7 +10,7 @@
 #include <functional>
 
 namespace c10 {
-class CAFFE2_API OperatorHandle;
+class TORCH_API OperatorHandle;
 }
 
 namespace at {
diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h
index 8f5e35d..767ca01 100644
--- a/aten/src/ATen/templates/Functions.h
+++ b/aten/src/ATen/templates/Functions.h
@@ -20,7 +20,7 @@
 
 // These functions are defined in ATen/Utils.cpp.
 #define TENSOR(T, S)                                                          \
-  CAFFE2_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
+  TORCH_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
   inline Tensor tensor(                                                       \
       std::initializer_list<T> values, const TensorOptions& options) {        \
     return at::tensor(ArrayRef<T>(values), options);                          \
@@ -46,10 +46,10 @@
 // Special C++ only overloads for std()-like functions (See gh-40287)
 // These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
 // So, for example std(0) would select the std(unbiased=False) overload
-CAFFE2_API Tensor var(const Tensor& self, int dim);
-CAFFE2_API std::tuple<Tensor,Tensor> var_mean(const Tensor& self, int dim);
-CAFFE2_API Tensor std(const Tensor& self, int dim);
-CAFFE2_API std::tuple<Tensor,Tensor> std_mean(const Tensor& self, int dim);
+TORCH_API Tensor var(const Tensor& self, int dim);
+TORCH_API std::tuple<Tensor,Tensor> var_mean(const Tensor& self, int dim);
+TORCH_API Tensor std(const Tensor& self, int dim);
+TORCH_API std::tuple<Tensor,Tensor> std_mean(const Tensor& self, int dim);
 
 namespace {
   inline std::vector<int64_t> zero_sizes(const TensorOptions& options) {
diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h
index 75f614b..2375f6c 100644
--- a/aten/src/ATen/templates/TensorBody.h
+++ b/aten/src/ATen/templates/TensorBody.h
@@ -83,7 +83,7 @@
 //
 // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
 // special care must be taken to handle this.
-class CAFFE2_API Tensor {
+class TORCH_API Tensor {
  public:
   Tensor(){};
   // This constructor should not be used by end users and is an implementation
diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h
index d189bd1b..4a4e3852 100644
--- a/aten/src/TH/THAllocator.h
+++ b/aten/src/TH/THAllocator.h
@@ -21,7 +21,7 @@
 // the non-file descriptor constructor
 enum WithFd { WITH_FD };
 
-class CAFFE2_API THMapAllocator {
+class TORCH_API THMapAllocator {
  public:
   THMapAllocator(const char *filename, int flags, size_t size);
   THMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size);
@@ -71,11 +71,11 @@
 };
 
 // Base-from-member idiom
-struct CAFFE2_API THRefcountedMapAllocatorArgCheck {
+struct TORCH_API THRefcountedMapAllocatorArgCheck {
   THRefcountedMapAllocatorArgCheck(int flags);
 };
 
-class CAFFE2_API THRefcountedMapAllocator
+class TORCH_API THRefcountedMapAllocator
     : private THRefcountedMapAllocatorArgCheck,
       public THMapAllocator {
  public:
diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h
index fff105a..3af652a 100644
--- a/c10/core/GeneratorImpl.h
+++ b/c10/core/GeneratorImpl.h
@@ -42,7 +42,7 @@
  * Please use the public mutex_ when using any methods from these classes, except for the
  * read-only methods. You can learn about the usage by looking into the unittests
  * (aten/src/ATen/cpu_generator_test.cpp) and other places where we have used lock_guard.
- * 
+ *
  * TODO: Look into changing the threading semantics of Generators in ATen (e.g., making
  * them non-thread safe and instead making the generator state splittable, to accommodate
  * forks into other threads).
@@ -96,7 +96,7 @@
 
 namespace detail {
 
-CAFFE2_API uint64_t getNonDeterministicRandom(bool is_cuda = false);
+TORCH_API uint64_t getNonDeterministicRandom(bool is_cuda = false);
 
 } // namespace detail
 
diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h
index e825654..41802b3 100644
--- a/c10/cuda/CUDAStream.h
+++ b/c10/cuda/CUDAStream.h
@@ -179,7 +179,7 @@
  * isHighPriority to true, or a stream for a specific device by setting device
  * (defaulting to the current CUDA stream.)
  */
-CAFFE2_API CUDAStream
+TORCH_API CUDAStream
 getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
 
 /**
@@ -188,7 +188,7 @@
  * where most computation occurs when you aren't explicitly using
  * streams.
  */
-CAFFE2_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
+TORCH_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
 
 /**
  * Get the current CUDA stream, for the passed CUDA device, or for the
@@ -197,7 +197,7 @@
  * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
  * or 'CUDAStreamGuard'.
  */
-CAFFE2_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
+TORCH_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
 
 /**
  * Set the current stream on the device of the passed in stream to be
@@ -209,7 +209,7 @@
  * (which will switch both your current device and current stream in the way you
  * expect, and reset it back to its original state afterwards).
  */
-CAFFE2_API void setCurrentCUDAStream(CUDAStream stream);
+TORCH_API void setCurrentCUDAStream(CUDAStream stream);
 
 C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
 
diff --git a/c10/macros/Export.h b/c10/macros/Export.h
index 966dd22..64d1037 100644
--- a/c10/macros/Export.h
+++ b/c10/macros/Export.h
@@ -92,11 +92,10 @@
 #endif
 
 // This one is being used by libtorch.so
-// TODO: rename this to TORCH_API
 #ifdef CAFFE2_BUILD_MAIN_LIB
-#define CAFFE2_API C10_EXPORT
+#define TORCH_API C10_EXPORT
 #else
-#define CAFFE2_API C10_IMPORT
+#define TORCH_API C10_IMPORT
 #endif
 
 // NB: For now, HIP is overloaded to use the same macro, but ideally
diff --git a/c10/util/UniqueVoidPtr.h b/c10/util/UniqueVoidPtr.h
index cf51871..c4e3158 100644
--- a/c10/util/UniqueVoidPtr.h
+++ b/c10/util/UniqueVoidPtr.h
@@ -10,7 +10,7 @@
 namespace detail {
 
 // Does not delete anything
-CAFFE2_API void deleteNothing(void*);
+TORCH_API void deleteNothing(void*);
 
 // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but
 // with three major differences:
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index 97c575e..f3a42db 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -19,7 +19,7 @@
 using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...)
 
 namespace internal {
-CAFFE2_API at::Tensor index_with_uint8_handling(
+TORCH_API at::Tensor index_with_uint8_handling(
     const at::Tensor& self,
     at::TensorList indices);
 }
diff --git a/caffe2/contrib/gloo/common.h b/caffe2/contrib/gloo/common.h
index 0c56bf9..f258775 100644
--- a/caffe2/contrib/gloo/common.h
+++ b/caffe2/contrib/gloo/common.h
@@ -11,7 +11,7 @@
 namespace caffe2 {
 namespace gloo {
 
-CAFFE2_API void signalFailure(Blob* status_blob, std::exception& exception);
+TORCH_API void signalFailure(Blob* status_blob, std::exception& exception);
 
 struct createDeviceAttr {
     // "tcp" or "ibverbs"
@@ -22,7 +22,7 @@
     std::string interface;
 };
 
-CAFFE2_API std::shared_ptr<::gloo::transport::Device> createDevice(
+TORCH_API std::shared_ptr<::gloo::transport::Device> createDevice(
     const createDeviceAttr attr);
 
 // Captures the parameters passed to Gloo.
diff --git a/caffe2/contrib/gloo/store_handler.h b/caffe2/contrib/gloo/store_handler.h
index 00b651c..a68f01e 100644
--- a/caffe2/contrib/gloo/store_handler.h
+++ b/caffe2/contrib/gloo/store_handler.h
@@ -8,7 +8,7 @@
 namespace caffe2 {
 namespace gloo {
 
-class CAFFE2_API StoreHandlerWrapper : public ::gloo::rendezvous::Store {
+class TORCH_API StoreHandlerWrapper : public ::gloo::rendezvous::Store {
  public:
   explicit StoreHandlerWrapper(StoreHandler& handler) : handler_(handler) {}
 
diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.h b/caffe2/contrib/tensorrt/tensorrt_tranformer.h
index ec7786e..4d4e92d 100644
--- a/caffe2/contrib/tensorrt/tensorrt_tranformer.h
+++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.h
@@ -14,12 +14,12 @@
 
 namespace caffe2 {
 
-CAFFE2_API void BuildInitializationList(
+TORCH_API void BuildInitializationList(
     Workspace* ws,
     ::ONNX_NAMESPACE::GraphProto* g,
     std::unordered_set<std::string>* initialization_list);
 
-class CAFFE2_API TensorRTTransformer {
+class TORCH_API TensorRTTransformer {
  public:
   TensorRTTransformer(
       size_t max_batch_size,
diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h
index 72d148c..43c0549 100644
--- a/caffe2/core/blob_serialization.h
+++ b/caffe2/core/blob_serialization.h
@@ -40,7 +40,7 @@
  * approaches for specific classes. Acceptor should take care of writing data
  * to the actual storage.
  */
-CAFFE2_API void SerializeBlob(
+TORCH_API void SerializeBlob(
     const Blob& blob,
     const string& name,
     BlobSerializerBase::SerializationAcceptor acceptor,
@@ -56,15 +56,15 @@
  *
  * NOTE: this function doesn't do chunking and might break with big tensors.
  */
-CAFFE2_API string SerializeBlob(const Blob& blob, const string& name);
+TORCH_API string SerializeBlob(const Blob& blob, const string& name);
 
 /**
  * Deserializes from a string containing either BlobProto or TensorProto. If
  * the deserialization fails, the content in the blob should no longer be
  * trusted.
  */
-CAFFE2_API void DeserializeBlob(const string& content, Blob* result);
-CAFFE2_API void DeserializeBlob(const BlobProto& proto, Blob* result);
+TORCH_API void DeserializeBlob(const string& content, Blob* result);
+TORCH_API void DeserializeBlob(const BlobProto& proto, Blob* result);
 
 /*
  * Get an empty Tensor from the TensorProto given the meta data in proto (data
@@ -86,7 +86,7 @@
  * these function calls. e.g. mutable_data will allocate memory on the first
  * call and it will return a pointer to the allocated memory on later calls.
  */
-CAFFE2_API Tensor EmptyTensorFromProto(const TensorProto& proto);
+TORCH_API Tensor EmptyTensorFromProto(const TensorProto& proto);
 
 /**
  * @brief TensorSerializer is the serializer for Tensors.
@@ -94,7 +94,7 @@
  * TensorSerializer takes in a blob that contains a Tensor, and serializes it
  * into a TensorProto protocol buffer.
  */
-class CAFFE2_API TensorSerializer : public BlobSerializerBase {
+class TORCH_API TensorSerializer : public BlobSerializerBase {
  public:
   TensorSerializer() {}
   ~TensorSerializer() override {}
@@ -136,7 +136,7 @@
  * tensor, change the TensorProto's corresponding fields before calling
  * Deserialize.
  */
-class CAFFE2_API TensorDeserializer : public BlobDeserializerBase {
+class TORCH_API TensorDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override;
 
@@ -240,7 +240,7 @@
 // Converts MessageLite to string while also checking that SerializeAsString
 // succeeds. Pass description of class/function of the call if you'd
 // like it appended to the error message.
-CAFFE2_API std::string SerializeAsString_EnforceCheck(
+TORCH_API std::string SerializeAsString_EnforceCheck(
     const google::protobuf::MessageLite&,
     const char* error_location = nullptr);
 
diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h
index ad282f3..969fb92 100644
--- a/caffe2/core/blob_serializer_base.h
+++ b/caffe2/core/blob_serializer_base.h
@@ -78,7 +78,7 @@
  * @brief BlobDeserializerBase is an abstract class that deserializes a blob
  * from a BlobProto or a TensorProto.
  */
-class CAFFE2_API BlobDeserializerBase {
+class TORCH_API BlobDeserializerBase {
  public:
   virtual ~BlobDeserializerBase() {}
 
diff --git a/caffe2/core/blob_stats.h b/caffe2/core/blob_stats.h
index e05b451..547897e 100644
--- a/caffe2/core/blob_stats.h
+++ b/caffe2/core/blob_stats.h
@@ -41,6 +41,6 @@
  * Return size in bytes of the blob, if available for a blob of given type.
  * If not available, return 0.
  */
-CAFFE2_API size_t sizeBytes(const Blob& blob);
+TORCH_API size_t sizeBytes(const Blob& blob);
 }
 }
diff --git a/caffe2/core/common.h b/caffe2/core/common.h
index 076d83b..1b71eab 100644
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@@ -124,18 +124,18 @@
 // linked. This function should not be used in static initialization functions
 // as the underlying boolean variable is going to be switched on when one
 // loads libtorch_gpu.so.
-CAFFE2_API bool HasCudaRuntime();
-CAFFE2_API bool HasHipRuntime();
+TORCH_API bool HasCudaRuntime();
+TORCH_API bool HasHipRuntime();
 namespace internal {
 // Sets the Cuda Runtime flag that is used by HasCudaRuntime(). You should
 // never use this function - it is only used by the Caffe2 gpu code to notify
 // Caffe2 core that cuda runtime has been loaded.
-CAFFE2_API void SetCudaRuntimeFlag();
-CAFFE2_API void SetHipRuntimeFlag();
+TORCH_API void SetCudaRuntimeFlag();
+TORCH_API void SetHipRuntimeFlag();
 } // namespace internal
 // Returns which setting Caffe2 was configured and built with (exported from
 // CMake)
-CAFFE2_API const std::map<string, string>& GetBuildOptions();
+TORCH_API const std::map<string, string>& GetBuildOptions();
 
 } // namespace caffe2
 
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
index b0e99ef..d5fe108 100644
--- a/caffe2/core/context.h
+++ b/caffe2/core/context.h
@@ -30,7 +30,7 @@
  * A function to generate a random number seed that is unique in a best-effort
  * basis, using an ever-incrementing seed and the current time.
  */
-CAFFE2_API uint32_t RandomNumberSeed();
+TORCH_API uint32_t RandomNumberSeed();
 
 /**
  * The CPU Context, representing the bare minimum of what a Context class in
@@ -44,7 +44,7 @@
  * computation it has.
  *
  */
-class CAFFE2_API CPUContext final : public BaseContext {
+class TORCH_API CPUContext final : public BaseContext {
  public:
 #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   typedef at::CPUGeneratorImpl rand_gen_type;
diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
index 036ac98..dfc1504 100644
--- a/caffe2/core/context_base.h
+++ b/caffe2/core/context_base.h
@@ -33,7 +33,7 @@
  * functions in the BaseContext class.
  * TODO: add docs after this is finalized.
  */
-class CAFFE2_API BaseContext {
+class TORCH_API BaseContext {
  public:
   virtual ~BaseContext() noexcept {}
 
diff --git a/caffe2/core/db.h b/caffe2/core/db.h
index 2d04b3c..9765779 100644
--- a/caffe2/core/db.h
+++ b/caffe2/core/db.h
@@ -19,7 +19,7 @@
 /**
  * An abstract class for the cursor of the database while reading.
  */
-class CAFFE2_API Cursor {
+class TORCH_API Cursor {
  public:
   Cursor() {}
   virtual ~Cursor() {}
@@ -60,7 +60,7 @@
 /**
  * An abstract class for the current database transaction while writing.
  */
-class CAFFE2_API Transaction {
+class TORCH_API Transaction {
  public:
   Transaction() {}
   virtual ~Transaction() {}
@@ -79,7 +79,7 @@
 /**
  * An abstract class for accessing a database of key-value pairs.
  */
-class CAFFE2_API DB {
+class TORCH_API DB {
  public:
   DB(const string& /*source*/, Mode mode) : mode_(mode) {}
   virtual ~DB() {}
@@ -143,7 +143,7 @@
 /**
  * A reader wrapper for DB that also allows us to serialize it.
  */
-class CAFFE2_API DBReader {
+class TORCH_API DBReader {
  public:
   friend class DBReaderSerializer;
   DBReader() {}
@@ -296,7 +296,7 @@
   C10_DISABLE_COPY_AND_ASSIGN(DBReader);
 };
 
-class CAFFE2_API DBReaderSerializer : public BlobSerializerBase {
+class TORCH_API DBReaderSerializer : public BlobSerializerBase {
  public:
   /**
    * Serializes a DBReader. Note that this blob has to contain DBReader,
@@ -309,7 +309,7 @@
       BlobSerializerBase::SerializationAcceptor acceptor) override;
 };
 
-class CAFFE2_API DBReaderDeserializer : public BlobDeserializerBase {
+class TORCH_API DBReaderDeserializer : public BlobDeserializerBase {
  public:
   void Deserialize(const BlobProto& proto, Blob* blob) override;
 };
diff --git a/caffe2/core/event.cc b/caffe2/core/event.cc
index b643385..919ff11 100644
--- a/caffe2/core/event.cc
+++ b/caffe2/core/event.cc
@@ -2,19 +2,19 @@
 
 namespace caffe2 {
 
-CAFFE2_API EventCreateFunction Event::event_creator_[MaxDeviceTypes];
-CAFFE2_API EventRecordFunction Event::event_recorder_[MaxDeviceTypes];
-CAFFE2_API EventWaitFunction
+TORCH_API EventCreateFunction Event::event_creator_[MaxDeviceTypes];
+TORCH_API EventRecordFunction Event::event_recorder_[MaxDeviceTypes];
+TORCH_API EventWaitFunction
     Event::event_waiter_[MaxDeviceTypes][MaxDeviceTypes];
-CAFFE2_API EventFinishFunction Event::event_finisher_[MaxDeviceTypes];
+TORCH_API EventFinishFunction Event::event_finisher_[MaxDeviceTypes];
 
-CAFFE2_API EventQueryFunction Event::event_querier_[MaxDeviceTypes];
-CAFFE2_API EventErrorMessageFunction
+TORCH_API EventQueryFunction Event::event_querier_[MaxDeviceTypes];
+TORCH_API EventErrorMessageFunction
     Event::event_err_msg_getter_[MaxDeviceTypes];
-CAFFE2_API EventSetFinishedFunction
+TORCH_API EventSetFinishedFunction
     Event::event_finished_setter_[MaxDeviceTypes];
-CAFFE2_API EventResetFunction Event::event_resetter_[MaxDeviceTypes];
-CAFFE2_API EventSetCallbackFunction
+TORCH_API EventResetFunction Event::event_resetter_[MaxDeviceTypes];
+TORCH_API EventSetCallbackFunction
     Event::event_callback_setter_[MaxDeviceTypes];
 
 namespace {
diff --git a/caffe2/core/event.h b/caffe2/core/event.h
index 77e3b19..0bbb701 100644
--- a/caffe2/core/event.h
+++ b/caffe2/core/event.h
@@ -55,7 +55,7 @@
 typedef std::function<void()> EventCallbackFunction;
 typedef void (*EventSetCallbackFunction)(Event*, EventCallbackFunction);
 
-class CAFFE2_API Event {
+class TORCH_API Event {
  public:
   explicit Event(const DeviceOption& option)
       : event_(), type_(option.device_type()), option_(option) {
diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h
index 21bc79d..814ee05 100644
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@@ -180,7 +180,7 @@
 #define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)   \
   namespace caffe2 {                                        \
   namespace _c10_ops {                                      \
-  CAFFE2_API const FunctionSchema& schema_##OperatorName(); \
+  TORCH_API const FunctionSchema& schema_##OperatorName(); \
   }                                                         \
   }
 
diff --git a/caffe2/core/graph.h b/caffe2/core/graph.h
index 6162b08..dfee4b7 100644
--- a/caffe2/core/graph.h
+++ b/caffe2/core/graph.h
@@ -16,7 +16,7 @@
 /**
  *  Graph representation of an operator.
  */
-struct CAFFE2_API Node {
+struct TORCH_API Node {
  public:
   // Empty constructor for resize
   Node() {}
@@ -45,7 +45,7 @@
 /**
  *  Graph representation of a Netdef.
  */
-struct CAFFE2_API Graph {
+struct TORCH_API Graph {
  public:
   /**
    * Given a subgraph, gets all of the parents of the subgraph, as well as
@@ -155,7 +155,7 @@
 
 // Adds an operator def to a netdef.
 // Returns the ptr, if you want to add anything extra (such as device_option)
-CAFFE2_API OperatorDef* AddOp(
+TORCH_API OperatorDef* AddOp(
     NetDef* netdef_ptr,
     string op_type,
     std::vector<string> inputs,
@@ -168,12 +168,12 @@
  * For example, if we wanted to match an operator to Conv or FC, we can give:
  * "Conv|FC" as the type() of that op.
  */
-CAFFE2_API bool MatchStrings(string p, string s);
+TORCH_API bool MatchStrings(string p, string s);
 
 /**
  * This ensures that each named arg that exists in the pattern exists in g_op,
  * is equal in value.
  */
-CAFFE2_API bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op);
+TORCH_API bool MatchArguments(const OperatorDef& p_op, const OperatorDef& g_op);
 
 } // namespace caffe2
diff --git a/caffe2/core/init.h b/caffe2/core/init.h
index 634b601..8d0fbd3 100644
--- a/caffe2/core/init.h
+++ b/caffe2/core/init.h
@@ -8,7 +8,7 @@
 namespace caffe2 {
 
 namespace internal {
-class CAFFE2_API Caffe2InitializeRegistry {
+class TORCH_API Caffe2InitializeRegistry {
  public:
   typedef bool (*InitFunction)(int*, char***);
   // Registry() is defined in .cpp file to make registration work across
@@ -96,12 +96,12 @@
 };
 }  // namespace internal
 
-CAFFE2_API bool unsafeRunCaffe2InitFunction(
+TORCH_API bool unsafeRunCaffe2InitFunction(
     const char* name,
     int* pargc = nullptr,
     char*** pargv = nullptr);
 
-class CAFFE2_API InitRegisterer {
+class TORCH_API InitRegisterer {
  public:
   InitRegisterer(
       internal::Caffe2InitializeRegistry::InitFunction function,
@@ -128,9 +128,9 @@
 /**
  * @brief Determine whether GlobalInit has already been run
  */
-CAFFE2_API bool GlobalInitAlreadyRun();
+TORCH_API bool GlobalInitAlreadyRun();
 
-class CAFFE2_API GlobalInitIsCalledGuard {
+class TORCH_API GlobalInitIsCalledGuard {
  public:
   GlobalInitIsCalledGuard() {
     if (!GlobalInitAlreadyRun()) {
@@ -165,7 +165,7 @@
  *
  * GlobalInit is also thread-safe and can be called concurrently.
  */
-CAFFE2_API bool GlobalInit(int* pargc, char*** argv);
+TORCH_API bool GlobalInit(int* pargc, char*** argv);
 
 /**
  * @brief Initialize the global environment without command line arguments
@@ -174,6 +174,6 @@
  * On mobile devices, use this global init, since we cannot pass the
  * command line options to caffe2, no arguments are passed.
  */
-CAFFE2_API bool GlobalInit();
+TORCH_API bool GlobalInit();
 }  // namespace caffe2
 #endif  // CAFFE2_CORE_INIT_H_
diff --git a/caffe2/core/memonger.h b/caffe2/core/memonger.h
index b740ccc..b015a23 100644
--- a/caffe2/core/memonger.h
+++ b/caffe2/core/memonger.h
@@ -10,15 +10,15 @@
 namespace caffe2 {
 
 // op schema check
-CAFFE2_API void run_schema_check(const NetDef& net);
+TORCH_API void run_schema_check(const NetDef& net);
 
 namespace memonger {
 
-CAFFE2_API NetDef optimize_inference_net(
+TORCH_API NetDef optimize_inference_net(
     const NetDef& net,
     const std::set<string>& static_blobs);
 
-CAFFE2_API NetDef compute_blob_recycling_for_dag(
+TORCH_API NetDef compute_blob_recycling_for_dag(
     const NetDef& net,
     const std::vector<string>& heads,
     const std::vector<int>& op_indices,
diff --git a/caffe2/core/module.h b/caffe2/core/module.h
index 88f8730..bb5dceb 100644
--- a/caffe2/core/module.h
+++ b/caffe2/core/module.h
@@ -23,7 +23,7 @@
  * different modules. Currently, we only store the name and a simple
  * description of what this module does.
  */
-class CAFFE2_API ModuleSchema {
+class TORCH_API ModuleSchema {
  public:
   ModuleSchema(const char* name, const char* description);
 };
@@ -41,12 +41,12 @@
  *       the reason we do not include ".so" is for cross-platform compatibility
  *       on platforms like mac os.
  */
-CAFFE2_API const CaffeMap<string, const ModuleSchema*>& CurrentModules();
+TORCH_API const CaffeMap<string, const ModuleSchema*>& CurrentModules();
 
 /**
  * @brief Checks whether a module is already present in the current binary.
  */
-CAFFE2_API bool HasModule(const string& name);
+TORCH_API bool HasModule(const string& name);
 
 /**
  * @brief Load a module.
@@ -56,7 +56,7 @@
  *       full path option to only experimental modules.
  *   filename: (optional) a filename that serves as a hint to load the module.
  */
-CAFFE2_API void LoadModule(const string& name, const string& filename="");
+TORCH_API void LoadModule(const string& name, const string& filename="");
 
 
 #define CAFFE2_MODULE(name, description)                                    \
diff --git a/caffe2/core/net.h b/caffe2/core/net.h
index 49333b1..0726d8e 100644
--- a/caffe2/core/net.h
+++ b/caffe2/core/net.h
@@ -34,7 +34,7 @@
 
 // Net is a thin struct that owns all the operators together with the operator
 // contexts.
-class CAFFE2_API NetBase : public Observable<NetBase> {
+class TORCH_API NetBase : public Observable<NetBase> {
  public:
   NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
   virtual ~NetBase() noexcept {}
@@ -135,7 +135,7 @@
   C10_DISABLE_COPY_AND_ASSIGN(NetBase);
 };
 
-class CAFFE2_API ExecutorHelper {
+class TORCH_API ExecutorHelper {
  public:
   ExecutorHelper() {}
   virtual TaskThreadPoolBase* GetPool(const DeviceOption& option) const;
@@ -161,14 +161,14 @@
  * created net object to the workspace's net map, while this function returns
  * a standalone net object.
  */
-CAFFE2_API unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
-CAFFE2_API unique_ptr<NetBase> CreateNet(
+TORCH_API unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
+TORCH_API unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const NetDef>& net_def,
     Workspace* ws);
 
-CAFFE2_API void AddGlobalNetObserverCreator(NetObserverCreator creator);
+TORCH_API void AddGlobalNetObserverCreator(NetObserverCreator creator);
 
-CAFFE2_API void ClearGlobalNetObservers();
+TORCH_API void ClearGlobalNetObservers();
 
 } // namespace caffe2
 
diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h
index 20e3a69..b80ef98 100644
--- a/caffe2/core/net_async_base.h
+++ b/caffe2/core/net_async_base.h
@@ -57,13 +57,13 @@
   bool run_root_tasks_inline_ = false;
 };
 
-struct CAFFE2_API AsyncNetCancelled : public std::exception {
+struct TORCH_API AsyncNetCancelled : public std::exception {
   const char* what() const noexcept override {
     return "Cancelled";
   }
 };
 
-class CAFFE2_API AsyncNetBase : public NetBase {
+class TORCH_API AsyncNetBase : public NetBase {
  public:
   AsyncNetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
   ~AsyncNetBase() override;
diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h
index 3751669..7a557ce 100644
--- a/caffe2/core/net_async_scheduling.h
+++ b/caffe2/core/net_async_scheduling.h
@@ -5,7 +5,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase {
+class TORCH_API AsyncSchedulingNet : public AsyncNetBase {
  public:
   AsyncSchedulingNet(
       const std::shared_ptr<const NetDef>& net_def,
diff --git a/caffe2/core/net_async_tracing.h b/caffe2/core/net_async_tracing.h
index 43665b1..33e91c7 100644
--- a/caffe2/core/net_async_tracing.h
+++ b/caffe2/core/net_async_tracing.h
@@ -29,7 +29,7 @@
 namespace caffe2 {
 namespace tracing {
 
-struct CAFFE2_API TracerEvent {
+struct TORCH_API TracerEvent {
   int op_id_ = -1;
   int task_id_ = -1;
   int stream_id_ = -1;
@@ -70,7 +70,7 @@
   int64_t trace_for_n_ms = 1000; // 1sec
 };
 
-class CAFFE2_API Tracer {
+class TORCH_API Tracer {
  public:
   Tracer(
       const NetBase* net,
@@ -111,7 +111,7 @@
   friend class TracerGuard;
 };
 
-class CAFFE2_API TracerGuard {
+class TORCH_API TracerGuard {
  public:
   TracerGuard() {}
 
@@ -142,16 +142,16 @@
 
 // Extract the shard id from name of the form "...shard:123..."
 // Return -1 if there is no shard found
-CAFFE2_API int extractShardId(const std::string& name);
+TORCH_API int extractShardId(const std::string& name);
 
 // Check if the net name is white-listed for tracing (specified via a command
 // line flag)
-CAFFE2_API bool isTraceableNetName(const std::string& net_name);
+TORCH_API bool isTraceableNetName(const std::string& net_name);
 
-CAFFE2_API std::shared_ptr<Tracer> create(
+TORCH_API std::shared_ptr<Tracer> create(
     const NetBase* net,
     const std::string& net_name);
-CAFFE2_API bool startIter(const std::shared_ptr<Tracer>& tracer);
+TORCH_API bool startIter(const std::shared_ptr<Tracer>& tracer);
 
 } // namespace tracing
 
diff --git a/caffe2/core/net_parallel.h b/caffe2/core/net_parallel.h
index 756637f..60030c3 100644
--- a/caffe2/core/net_parallel.h
+++ b/caffe2/core/net_parallel.h
@@ -10,7 +10,7 @@
 
 class ParallelNetExecutorHelper;
 
-class CAFFE2_API ParallelNet : public NetBase {
+class TORCH_API ParallelNet : public NetBase {
  public:
   ParallelNet(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
 
diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h
index 5b8bc29..c6b25ea 100644
--- a/caffe2/core/net_simple.h
+++ b/caffe2/core/net_simple.h
@@ -16,7 +16,7 @@
 // This is the very basic structure you need to run a network - all it
 // does is simply to run everything in sequence. If you want more fancy control
 // such as a DAG-like execution, check out other better net implementations.
-class CAFFE2_API SimpleNet : public NetBase {
+class TORCH_API SimpleNet : public NetBase {
  public:
   SimpleNet(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
   bool SupportsAsync() override {
diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h
index d8e9c10..d5a019f 100644
--- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h
@@ -8,7 +8,7 @@
 namespace nom {
 namespace repr {
 
-class CAFFE2_API Value {
+class TORCH_API Value {
  public:
   enum class ValueKind { Value, Instruction, Data };
   Value(ValueKind K) : kind_(K) {}
@@ -22,7 +22,7 @@
   const ValueKind kind_;
 };
 
-class CAFFE2_API Data : public Value {
+class TORCH_API Data : public Value {
  public:
   Data() : Value(ValueKind::Data) {}
   static bool classof(const Value* V) {
@@ -41,7 +41,7 @@
   size_t version_ = 0;
 };
 
-class CAFFE2_API Instruction : public Value {
+class TORCH_API Instruction : public Value {
  public:
   /// \brief All the different types of execution.
   enum class Opcode {
@@ -66,7 +66,7 @@
   Opcode op_;
 };
 
-class CAFFE2_API Terminator : public Instruction {
+class TORCH_API Terminator : public Instruction {
  public:
   Terminator(Instruction::Opcode op) : Instruction(op) {}
 
@@ -80,17 +80,17 @@
   }
 };
 
-class CAFFE2_API Branch : public Terminator {
+class TORCH_API Branch : public Terminator {
  public:
   Branch() : Terminator(Instruction::Opcode::Branch) {}
 };
 
-class CAFFE2_API Return : public Terminator {
+class TORCH_API Return : public Terminator {
  public:
   Return() : Terminator(Instruction::Opcode::Return) {}
 };
 
-class CAFFE2_API Phi : public Instruction {
+class TORCH_API Phi : public Instruction {
  public:
   Phi() : Instruction(Instruction::Opcode::Phi) {}
 };
diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h
index 812fea7..e3eb90a 100644
--- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h
+++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h
@@ -41,7 +41,7 @@
 /// a saved void* pointer for external use.  Derived classes
 /// add richer semantics to the annotation and it is encouraged
 /// to use them.
-class CAFFE2_API Annotation {
+class TORCH_API Annotation {
  public:
   enum class AnnotationKind { Generic, Caffe2 };
 
@@ -57,7 +57,7 @@
   const AnnotationKind kind_;
 };
 
-class CAFFE2_API NeuralNetOperator : public Instruction {
+class TORCH_API NeuralNetOperator : public Instruction {
  public:
   /// Discriminator for LLVM-style RTTI (isa<>)
   enum class NNKind {
@@ -132,7 +132,7 @@
   std::unique_ptr<Annotation> extraAnnotation_;
 };
 
-class CAFFE2_API NeuralNetData : public Data {
+class TORCH_API NeuralNetData : public Data {
  public:
   /// Discriminator for LLVM-style RTTI (isa<>)
   enum class NNDataKind { Generic, Tensor };
@@ -155,7 +155,7 @@
   NNDataKind kind_;
 };
 
-class CAFFE2_API Tensor : public NeuralNetData {
+class TORCH_API Tensor : public NeuralNetData {
  public:
   enum class DataType { Generic, Float, Half, Int8 };
   enum class Layout { Generic, NCHW, NHWC };
@@ -208,21 +208,21 @@
 
 #include "nomnigraph/Generated/OpClasses.h"
 
-class CAFFE2_API While : public NeuralNetOperator {
+class TORCH_API While : public NeuralNetOperator {
  public:
   While() : NeuralNetOperator(NNKind::While, Opcode::Branch) {}
   NOMNIGRAPH_DEFINE_NN_RTTI(While);
   ~While() {}
 };
 
-class CAFFE2_API NNPhi : public NeuralNetOperator {
+class TORCH_API NNPhi : public NeuralNetOperator {
  public:
   NNPhi() : NeuralNetOperator(NNKind::NNPhi, Opcode::Phi) {}
   NOMNIGRAPH_DEFINE_NN_RTTI(NNPhi);
   ~NNPhi() {}
 };
 
-class CAFFE2_API GenericOperator : public NeuralNetOperator {
+class TORCH_API GenericOperator : public NeuralNetOperator {
  public:
   GenericOperator() : NeuralNetOperator(NNKind::GenericOperator) {}
   GenericOperator(std::string name)
@@ -244,7 +244,7 @@
 using NNSubgraph = nom::Subgraph<std::unique_ptr<nom::repr::Value>>;
 using NNCFGraph = nom::repr::ControlFlowGraph<NNGraph>;
 
-struct CAFFE2_API NNModule {
+struct TORCH_API NNModule {
   NNGraph dataFlow;
   NNCFGraph controlFlow;
   std::unordered_set<NNGraph::NodeRef> inputs;
@@ -464,41 +464,41 @@
 }
 
 /// NeuralNetData specific helpers.
-CAFFE2_API bool hasProducer(NNGraph::NodeRef n);
-CAFFE2_API NNGraph::NodeRef getProducer(NNGraph::NodeRef n);
-CAFFE2_API bool hasConsumer(NNGraph::NodeRef n);
-CAFFE2_API std::vector<NNGraph::NodeRef> getConsumers(NNGraph::NodeRef n);
+TORCH_API bool hasProducer(NNGraph::NodeRef n);
+TORCH_API NNGraph::NodeRef getProducer(NNGraph::NodeRef n);
+TORCH_API bool hasConsumer(NNGraph::NodeRef n);
+TORCH_API std::vector<NNGraph::NodeRef> getConsumers(NNGraph::NodeRef n);
 
-CAFFE2_API bool hasInputs(NNGraph::NodeRef n);
-CAFFE2_API std::vector<NNGraph::NodeRef> getInputs(NNGraph::NodeRef n);
-CAFFE2_API std::vector<NNGraph::NodeRef> getOutputs(NNGraph::NodeRef n);
+TORCH_API bool hasInputs(NNGraph::NodeRef n);
+TORCH_API std::vector<NNGraph::NodeRef> getInputs(NNGraph::NodeRef n);
+TORCH_API std::vector<NNGraph::NodeRef> getOutputs(NNGraph::NodeRef n);
 
-CAFFE2_API std::set<NNGraph::NodeRef> getInputs(const NNSubgraph& sg);
-CAFFE2_API std::set<NNGraph::NodeRef> getOutputs(const NNSubgraph& sg);
+TORCH_API std::set<NNGraph::NodeRef> getInputs(const NNSubgraph& sg);
+TORCH_API std::set<NNGraph::NodeRef> getOutputs(const NNSubgraph& sg);
 
 // Get the name of the node regardless of underlying type.
-CAFFE2_API std::string getName(NNGraph::NodeRef n);
+TORCH_API std::string getName(NNGraph::NodeRef n);
 
 // Replace the producer of the first argument with the second argument
-CAFFE2_API void replaceProducer(
+TORCH_API void replaceProducer(
     NNGraph::NodeRef tensorNode,
     NNGraph::NodeRef newProducer);
 // Set all consumers of first argument to consume the second argument
-CAFFE2_API void replaceAllUsesWith(
+TORCH_API void replaceAllUsesWith(
     NNGraph::NodeRef oldTensorNode,
     NNGraph::NodeRef newTensorNode);
 // Set the second argument to consume the inputs of the first argument
-CAFFE2_API void replaceAsConsumer(
+TORCH_API void replaceAsConsumer(
     NNGraph::NodeRef oldConsumer,
     NNGraph::NodeRef newConsumer);
 
 // Create an output tensor node
-CAFFE2_API NNGraph::NodeRef
+TORCH_API NNGraph::NodeRef
 createOutput(NNModule* nn, NNGraph::NodeRef producer, std::string name);
 
 // Hack for windows compiler.
 template <typename T, typename... Args>
-CAFFE2_API NNGraph::NodeRef createOperator(NNModule* nn, Args... args);
+TORCH_API NNGraph::NodeRef createOperator(NNModule* nn, Args... args);
 
 // Create an operator
 template <typename T, typename... Args>
@@ -506,7 +506,7 @@
   return nn->dataFlow.createNode(util::make_unique<T>(args...));
 }
 
-CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m);
+TORCH_API void coalesceInsertedDataDependencies(repr::NNModule* m);
 
 template <NNGraph* G>
 struct C10_EXPORT NodeHelper {};
@@ -517,12 +517,12 @@
 // Commonly used node predicate.
 
 // The node has a single output and the output has a single consumer.
-CAFFE2_API bool hasSingleOutputAndConsumer(NNGraph::NodeRef nodeRef);
+TORCH_API bool hasSingleOutputAndConsumer(NNGraph::NodeRef nodeRef);
 // The node has a unique consumer (there may be multiple edges from output
 // to the single consumer).
-CAFFE2_API bool hasUniqueConsumer(NNGraph::NodeRef nodeRef);
+TORCH_API bool hasUniqueConsumer(NNGraph::NodeRef nodeRef);
 
-CAFFE2_API NNMatchPredicate matchExternalTensorNode();
+TORCH_API NNMatchPredicate matchExternalTensorNode();
 
 } // namespace nn
 
diff --git a/caffe2/core/nomnigraph/tests/test_util.h b/caffe2/core/nomnigraph/tests/test_util.h
index f19f75f..f60e73f 100644
--- a/caffe2/core/nomnigraph/tests/test_util.h
+++ b/caffe2/core/nomnigraph/tests/test_util.h
@@ -102,9 +102,9 @@
  *      return labelMap;
  *    });
  */
-CAFFE2_API nom::Graph<std::string> createGraph();
+TORCH_API nom::Graph<std::string> createGraph();
 
-CAFFE2_API nom::Graph<std::string> createGraphWithCycle();
+TORCH_API nom::Graph<std::string> createGraphWithCycle();
 
 std::map<std::string, std::string> BBPrinter(typename nom::repr::NNCFGraph::NodeRef node);
 
@@ -112,9 +112,9 @@
 
 std::map<std::string, std::string> NNPrinter(typename nom::repr::NNGraph::NodeRef node);
 
-CAFFE2_API nom::Graph<TestClass>::NodeRef createTestNode(
+TORCH_API nom::Graph<TestClass>::NodeRef createTestNode(
     nom::Graph<TestClass>& g);
 
-CAFFE2_API std::map<std::string, std::string> TestNodePrinter(
+TORCH_API std::map<std::string, std::string> TestNodePrinter(
     nom::Graph<TestClass>::NodeRef node);
 #endif // NOM_TESTS_TEST_UTIL_H
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 8674f3d..8b2a6b5 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -48,10 +48,10 @@
 
 namespace caffe2 {
 
-class CAFFE2_API OperatorBase;
+class TORCH_API OperatorBase;
 typedef ObserverBase<OperatorBase> OperatorObserver;
 
-class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
+class TORCH_API OperatorBase : public Observable<OperatorBase> {
  public:
   explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
 
@@ -1325,9 +1325,9 @@
     std::unique_ptr<OperatorBase>,
     const OperatorDef&,
     Workspace*>* (*RegistryFunction)();
-CAFFE2_API std::map<DeviceType, OperatorRegistry*>* gDeviceTypeRegistry();
+TORCH_API std::map<DeviceType, OperatorRegistry*>* gDeviceTypeRegistry();
 
-struct CAFFE2_API DeviceTypeRegisterer {
+struct TORCH_API DeviceTypeRegisterer {
   explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func) {
     if (gDeviceTypeRegistry()->count(type)) {
       std::cerr << "Device type " << DeviceTypeName(type)
@@ -1467,7 +1467,7 @@
 // specific engines that only implement a subset of the features required by
 // the original operator schema.
 // TODO(jiayq): make more feature-complete exception message.
-class CAFFE2_API UnsupportedOperatorFeature : public std::exception {
+class TORCH_API UnsupportedOperatorFeature : public std::exception {
  public:
   UnsupportedOperatorFeature(const string& msg) : msg_(msg) {}
   const char* what() const noexcept override {
@@ -1488,12 +1488,12 @@
 
 // Creates an operator with the given operator definition.
 // Throws on error and never returns nullptr
-CAFFE2_API unique_ptr<OperatorBase> CreateOperator(
+TORCH_API unique_ptr<OperatorBase> CreateOperator(
     const OperatorDef& operator_def,
     Workspace* ws,
     int net_position = OperatorBase::kNoNetPositionSet);
 
-CAFFE2_API const std::string OpRegistryKey(
+TORCH_API const std::string OpRegistryKey(
     const std::string& op_type,
     const std::string& engine = "");
 
@@ -1505,50 +1505,50 @@
     CaffeMap<DeviceType, CaffeMap<std::string, EnginePrefType>>;
 // {device_type -> EnginePrefType}
 using GlobalEnginePrefType = CaffeMap<DeviceType, EnginePrefType>;
-CAFFE2_API void SetPerOpEnginePref(
+TORCH_API void SetPerOpEnginePref(
     const PerOpEnginePrefType& per_op_engine_pref);
-CAFFE2_API void SetGlobalEnginePref(
+TORCH_API void SetGlobalEnginePref(
     const GlobalEnginePrefType& global_engine_pref);
-CAFFE2_API void SetEnginePref(
+TORCH_API void SetEnginePref(
     const PerOpEnginePrefType& per_op_engine_pref,
     const GlobalEnginePrefType& global_engine_pref);
-CAFFE2_API void SetOpEnginePref(
+TORCH_API void SetOpEnginePref(
     const std::string& op_type,
     const CaffeMap<DeviceType, EnginePrefType>& op_pref);
 
-CAFFE2_API void LoadInt8TensorInfoOfBlob(
+TORCH_API void LoadInt8TensorInfoOfBlob(
     std::vector<float>* scale,
     std::vector<float>* offset,
     uint32_t* axis,
     const Blob* b);
 
-CAFFE2_API TensorShape GetTensorShapeOfBlob(const Blob* b);
+TORCH_API TensorShape GetTensorShapeOfBlob(const Blob* b);
 
-CAFFE2_API TensorShapes InferBlobShapesAndTypes(
+TORCH_API TensorShapes InferBlobShapesAndTypes(
     CaffeMap<string, TensorShape>& blob_desc,
     const vector<NetDef*>& nets);
 
-CAFFE2_API TensorShapes InferBlobShapesAndTypesFromWorkspace(
+TORCH_API TensorShapes InferBlobShapesAndTypesFromWorkspace(
     Workspace* ws,
     const vector<NetDef*>& nets);
 
-CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap(
+TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
     const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
     const vector<NetDef*>& nets);
 
-CAFFE2_API TensorShapes InferBlobShapesAndTypesFromMap(
+TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
     const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
     const CaffeMap<std::string, TensorProto_DataType>& blob_types,
     const vector<NetDef*>& nets);
 
-CAFFE2_API std::map<string, std::pair<DeviceOption, DeviceOption>>
+TORCH_API std::map<string, std::pair<DeviceOption, DeviceOption>>
 ValidateTensorDevices(OperatorBase& op, const OperatorDef& op_def);
 
 // Get a set of registered operator names
-CAFFE2_API std::set<std::string> GetRegisteredOperators();
+TORCH_API std::set<std::string> GetRegisteredOperators();
 
 // Operator logging capabilities
-CAFFE2_API void SetOperatorLogger(
+TORCH_API void SetOperatorLogger(
     std::function<void(const OperatorDef&)> tracer);
 std::function<void(const OperatorDef&)> GetOperatorLogger();
 
diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h
index b444c28..5c8d97a 100644
--- a/caffe2/core/operator_gradient.h
+++ b/caffe2/core/operator_gradient.h
@@ -14,7 +14,7 @@
  * a sparse blob, its gradient name should be written into indice_ for
  * the sparse indices and value_ for the values.
  */
-struct CAFFE2_API GradientWrapper {
+struct TORCH_API GradientWrapper {
   string dense_;
   string indices_;
   string values_;
@@ -33,7 +33,7 @@
 /**
  * A struct that holds the gradient operators and related gradient maps.
  */
-struct CAFFE2_API GradientOpsMeta {
+struct TORCH_API GradientOpsMeta {
   vector<OperatorDef> ops_;
   vector<GradientWrapper> g_input_;
 
@@ -44,7 +44,7 @@
       : ops_(ops), g_input_(v) {}
 };
 
-class CAFFE2_API GradientMakerBase {
+class TORCH_API GradientMakerBase {
  public:
   GradientMakerBase(
       const OperatorDef& def,
@@ -256,7 +256,7 @@
  * that the gradient computation should not flow through it at all, and throws
  * an error if it is called.
  */
-class CAFFE2_API NoGradient : public GradientMakerBase {
+class TORCH_API NoGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return vector<OperatorDef>();
@@ -328,7 +328,7 @@
 /**
  * @brief Gets the GradientOpsMeta for the given operator def.
  */
-CAFFE2_API GradientOpsMeta GetGradientForOp(
+TORCH_API GradientOpsMeta GetGradientForOp(
     const OperatorDef& def,
     const vector<GradientWrapper>& g_output);
 
diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc
index 3009ba4..9ff8dfd 100644
--- a/caffe2/core/operator_schema.cc
+++ b/caffe2/core/operator_schema.cc
@@ -307,8 +307,8 @@
 }
 
 #define DEFINE_STANDARG_ARG(name, str)                                \
-  CAFFE2_API const char* OpSchema::Arg_##name = #str;                 \
-  CAFFE2_API OpSchema& OpSchema::Arg##name(const char* description) { \
+  TORCH_API const char* OpSchema::Arg_##name = #str;                  \
+  TORCH_API OpSchema& OpSchema::Arg##name(const char* description) {  \
     return Arg(#str, description, true);                              \
   }
 
diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
index deca56a..00834fa 100644
--- a/caffe2/core/operator_schema.h
+++ b/caffe2/core/operator_schema.h
@@ -37,7 +37,7 @@
  *     OPERATOR_SCHEMA(name)
  *         .NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
  */
-class CAFFE2_API OpSchema {
+class TORCH_API OpSchema {
  public:
   OpSchema() : OpSchema("unknown", "unknown", 0) {}
   OpSchema(const string& type, const string& file, const int line);
@@ -339,7 +339,7 @@
     return inplace_enforced_(x, y);
   }
 
-  CAFFE2_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
+  TORCH_API friend std::ostream& operator<<(std::ostream& out, const OpSchema& schema);
 
   const std::vector<Argument>& args() const {
     return args_;
@@ -457,7 +457,7 @@
 /**
  * @brief A registry to hold all the operator schemas.
  */
-class CAFFE2_API OpSchemaRegistry {
+class TORCH_API OpSchemaRegistry {
  public:
   static OpSchema&
   NewSchema(const string& key, const string& file, const int line) {
diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h
index f037ca6..a2ba948 100644
--- a/caffe2/core/stats.h
+++ b/caffe2/core/stats.h
@@ -11,7 +11,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API StatValue {
+class TORCH_API StatValue {
   std::atomic<int64_t> v_{0};
 
  public:
@@ -28,7 +28,7 @@
   }
 };
 
-struct CAFFE2_API ExportedStatValue {
+struct TORCH_API ExportedStatValue {
   std::string key;
   int64_t value;
   std::chrono::time_point<std::chrono::high_resolution_clock> ts;
@@ -40,7 +40,7 @@
 using ExportedStatList = std::vector<ExportedStatValue>;
 using ExportedStatMap = std::unordered_map<std::string, int64_t>;
 
-CAFFE2_API ExportedStatMap toMap(const ExportedStatList& stats);
+TORCH_API ExportedStatMap toMap(const ExportedStatList& stats);
 
 /**
  * @brief Holds a map of atomic counters keyed by name.
@@ -114,7 +114,7 @@
  * structure by calling StatRegistry::update().
  *
  */
-class CAFFE2_API StatRegistry {
+class TORCH_API StatRegistry {
   std::mutex mutex_;
   std::unordered_map<std::string, std::unique_ptr<StatValue>> stats_;
 
@@ -153,7 +153,7 @@
   ~StatRegistry();
 };
 
-struct CAFFE2_API Stat {
+struct TORCH_API Stat {
   std::string groupName;
   std::string name;
   Stat(const std::string& gn, const std::string& n) : groupName(gn), name(n) {}
@@ -164,7 +164,7 @@
   }
 };
 
-class CAFFE2_API ExportedStat : public Stat {
+class TORCH_API ExportedStat : public Stat {
   StatValue* value_;
 
  public:
@@ -181,7 +181,7 @@
   }
 };
 
-class CAFFE2_API AvgExportedStat : public ExportedStat {
+class TORCH_API AvgExportedStat : public ExportedStat {
  private:
   ExportedStat count_;
 
@@ -200,7 +200,7 @@
   }
 };
 
-class CAFFE2_API StdDevExportedStat : public ExportedStat {
+class TORCH_API StdDevExportedStat : public ExportedStat {
   // Uses an offset (first_) to remove issue of cancellation
   // Variance is then (sumsqoffset_ - (sumoffset_^2) / count_) / (count_ - 1)
  private:
@@ -234,7 +234,7 @@
   }
 };
 
-class CAFFE2_API DetailedExportedStat : public ExportedStat {
+class TORCH_API DetailedExportedStat : public ExportedStat {
  private:
   std::vector<ExportedStat> details_;
 
@@ -258,7 +258,7 @@
   }
 };
 
-class CAFFE2_API StaticStat : public Stat {
+class TORCH_API StaticStat : public Stat {
  private:
   StatValue* value_;
 
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
index 83df530..77b8d2b 100644
--- a/caffe2/core/tensor.h
+++ b/caffe2/core/tensor.h
@@ -26,7 +26,7 @@
  *
  * NB: See TensorImpl for documentation on these methods.
  */
-class CAFFE2_API Tensor final {
+class TORCH_API Tensor final {
  private:
   enum Unsafe { IDoWantAliasing };
   Tensor(const Tensor& other, Unsafe _) : impl_(other.getIntrusivePtr()) {}
@@ -530,10 +530,10 @@
  * this will not do anything if the
  * Tensor already has correct size and data type
  */
-CAFFE2_API void
+TORCH_API void
 ReinitializeTensor(Tensor* t, at::IntArrayRef dims, at::TensorOptions options);
 
-CAFFE2_API void ReinitializeAndCopyFrom(
+TORCH_API void ReinitializeAndCopyFrom(
     Tensor* t,
     at::TensorOptions options,
     const Tensor& src,
@@ -564,7 +564,7 @@
     DeviceType type);
 
 // Tensor factory function
-CAFFE2_API Tensor empty(at::IntArrayRef dims, at::TensorOptions options);
+TORCH_API Tensor empty(at::IntArrayRef dims, at::TensorOptions options);
 
 /**
  * @brief Creates a CPU tensor, and fills its contents with the given values.
@@ -585,7 +585,7 @@
 vector<int64_t>
 GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device);
 
-class CAFFE2_API TensorPrinter {
+class TORCH_API TensorPrinter {
  public:
   explicit TensorPrinter(
       const std::string& tensor_name = "",
diff --git a/caffe2/core/test_utils.h b/caffe2/core/test_utils.h
index 47226c9..89f21c1 100644
--- a/caffe2/core/test_utils.h
+++ b/caffe2/core/test_utils.h
@@ -18,13 +18,13 @@
 namespace testing {
 
 // Asserts that the values of two tensors are the same.
-CAFFE2_API void assertTensorEquals(
+TORCH_API void assertTensorEquals(
     const TensorCPU& tensor1,
     const TensorCPU& tensor2,
     float eps = 1e-6);
 
 // Asserts that two float values are close within epsilon.
-CAFFE2_API void assertNear(float value1, float value2, float epsilon);
+TORCH_API void assertNear(float value1, float value2, float epsilon);
 
 // Asserts that the numeric values of a tensor is equal to a data vector.
 template <typename T>
@@ -55,23 +55,23 @@
 }
 
 // Asserts a list of tensors presented in two workspaces are equal.
-CAFFE2_API void assertTensorListEquals(
+TORCH_API void assertTensorListEquals(
     const std::vector<std::string>& tensorNames,
     const Workspace& workspace1,
     const Workspace& workspace2);
 
 // Read a tensor from the workspace.
-CAFFE2_API const caffe2::Tensor& getTensor(
+TORCH_API const caffe2::Tensor& getTensor(
     const caffe2::Workspace& workspace,
     const std::string& name);
 
 // Create a new tensor in the workspace.
-CAFFE2_API caffe2::Tensor* createTensor(
+TORCH_API caffe2::Tensor* createTensor(
     const std::string& name,
     caffe2::Workspace* workspace);
 
 // Create a new operator in the net.
-CAFFE2_API caffe2::OperatorDef* createOperator(
+TORCH_API caffe2::OperatorDef* createOperator(
     const std::string& type,
     const std::vector<std::string>& inputs,
     const std::vector<std::string>& outputs,
@@ -154,7 +154,7 @@
 }
 
 // Concise util class to mutate a net in a chaining fashion.
-class CAFFE2_API NetMutator {
+class TORCH_API NetMutator {
  public:
   explicit NetMutator(caffe2::NetDef* net) : net_(net) {}
 
@@ -184,7 +184,7 @@
 };
 
 // Concise util class to mutate a workspace in a chaining fashion.
-class CAFFE2_API WorkspaceMutator {
+class TORCH_API WorkspaceMutator {
  public:
   explicit WorkspaceMutator(caffe2::Workspace* workspace)
       : workspace_(workspace) {}
diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h
index 723e147..7f8971c 100644
--- a/caffe2/core/transform.h
+++ b/caffe2/core/transform.h
@@ -31,7 +31,7 @@
  * own transform, write your implementations for PatternRule, ValidatorRule, and
  * ReplaceRule.
  */
-class CAFFE2_API Transform {
+class TORCH_API Transform {
  public:
   Transform() {}
 
@@ -148,7 +148,7 @@
 };
 
 // Creates a Transform based on a key, which should be defined in registry.
-CAFFE2_API unique_ptr<Transform> CreateTransform(string key);
+TORCH_API unique_ptr<Transform> CreateTransform(string key);
 
 C10_DECLARE_REGISTRY(TransformRegistry, Transform);
 #define REGISTER_TRANSFORM(name, ...) \
@@ -156,14 +156,14 @@
 
 // Create a Transform object from registry,
 // and immediately apply it to a Netdef.
-CAFFE2_API NetDef ApplyTransform(const string& key, const NetDef& netdef);
+TORCH_API NetDef ApplyTransform(const string& key, const NetDef& netdef);
 
 // Create a Transform object from registry, apply it to a NetDef.
 // Will only return the transformed net if it is faster than the old net.
 // This will run the init net first, will run the two nets warmup_runs times.
 // Then, we will take the average time of main_runs runs, and only keep the
 // transformed net if it is faster by a factor of improvement_threshold.
-CAFFE2_API NetDef ApplyTransformIfFaster(
+TORCH_API NetDef ApplyTransformIfFaster(
     const string& key,
     const NetDef& netdef,
     const NetDef& init_netdef,
diff --git a/caffe2/core/types.h b/caffe2/core/types.h
index 5dda5a5..7a74abe 100644
--- a/caffe2/core/types.h
+++ b/caffe2/core/types.h
@@ -47,10 +47,10 @@
 inline constexpr char NameScopeSeparator() { return '/'; }
 
 // From TypeMeta to caffe2::DataType protobuffer enum.
-CAFFE2_API TensorProto::DataType TypeMetaToDataType(const TypeMeta meta);
+TORCH_API TensorProto::DataType TypeMetaToDataType(const TypeMeta meta);
 
 // From caffe2::DataType protobuffer enum to TypeMeta
-CAFFE2_API const TypeMeta DataTypeToTypeMeta(const TensorProto::DataType& dt);
+TORCH_API const TypeMeta DataTypeToTypeMeta(const TensorProto::DataType& dt);
 
 }  // namespace caffe2
 
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
index 793b5f6..25805d0 100644
--- a/caffe2/core/workspace.h
+++ b/caffe2/core/workspace.h
@@ -24,7 +24,7 @@
 
 class NetBase;
 
-struct CAFFE2_API StopOnSignal {
+struct TORCH_API StopOnSignal {
   StopOnSignal()
       : handler_(std::make_shared<SignalHandler>(
             SignalHandler::Action::STOP,
@@ -44,7 +44,7 @@
  * runtime: (1) all blobs, and (2) all instantiated networks. It is the owner of
  * all these objects and deals with the scaffolding logistics.
  */
-class CAFFE2_API Workspace {
+class TORCH_API Workspace {
  public:
   typedef std::function<bool(int)> ShouldContinue;
   typedef CaffeMap<string, unique_ptr<Blob> > BlobMap;
diff --git a/caffe2/distributed/file_store_handler.h b/caffe2/distributed/file_store_handler.h
index 9ca81e4..43d86fe 100644
--- a/caffe2/distributed/file_store_handler.h
+++ b/caffe2/distributed/file_store_handler.h
@@ -4,7 +4,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API FileStoreHandler : public StoreHandler {
+class TORCH_API FileStoreHandler : public StoreHandler {
  public:
   explicit FileStoreHandler(const std::string& path, const std::string& prefix);
   virtual ~FileStoreHandler();
diff --git a/caffe2/distributed/redis_store_handler.h b/caffe2/distributed/redis_store_handler.h
index d5fa767..1ff7591 100644
--- a/caffe2/distributed/redis_store_handler.h
+++ b/caffe2/distributed/redis_store_handler.h
@@ -10,7 +10,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API RedisStoreHandler : public StoreHandler {
+class TORCH_API RedisStoreHandler : public StoreHandler {
  public:
   explicit RedisStoreHandler(std::string& host, int port, std::string& prefix);
   virtual ~RedisStoreHandler();
diff --git a/caffe2/distributed/store_handler.h b/caffe2/distributed/store_handler.h
index 951fe26..d4d9b80 100644
--- a/caffe2/distributed/store_handler.h
+++ b/caffe2/distributed/store_handler.h
@@ -10,7 +10,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API StoreHandler {
+class TORCH_API StoreHandler {
  public:
   static constexpr std::chrono::milliseconds kDefaultTimeout =
       std::chrono::seconds(30);
@@ -67,7 +67,7 @@
 /*
  * The backing store is no longer available. It may have been deleted.
  */
-struct CAFFE2_API StoreHandlerNotAvailableException
+struct TORCH_API StoreHandlerNotAvailableException
     : public std::runtime_error {
   explicit StoreHandlerNotAvailableException(const std::string& msg)
       : std::runtime_error(msg) {}
@@ -80,7 +80,7 @@
 /*
  * Timeout accessing the store.
  */
-struct CAFFE2_API StoreHandlerTimeoutException : public std::runtime_error {
+struct TORCH_API StoreHandlerTimeoutException : public std::runtime_error {
   explicit StoreHandlerTimeoutException(const std::string& msg)
       : std::runtime_error(msg) {}
 };
diff --git a/caffe2/mpi/mpi_common.h b/caffe2/mpi/mpi_common.h
index fab89ed..ab04afb 100644
--- a/caffe2/mpi/mpi_common.h
+++ b/caffe2/mpi/mpi_common.h
@@ -34,7 +34,7 @@
 #undef MPI_DATATYPE_WRAPPER
 
 // For all Caffe MPI calls, we will wrap it inside an MPI mutex lock guard.
-CAFFE2_API std::mutex& MPIMutex();
+TORCH_API std::mutex& MPIMutex();
 
 #define MPI_CHECK(condition)                                 \
   do {                                                       \
@@ -54,23 +54,23 @@
  * @brief Gets the global MPI communicator used by Caffe2. In default, this
  * is MPI_COMM_WORLD unless you call SetGlobalMPIComm().
  */
-CAFFE2_API MPI_Comm GlobalMPIComm();
+TORCH_API MPI_Comm GlobalMPIComm();
 
 /**
  * @brief Sets the global MPI communicator. Caffe2 takes over the ownership
  * of the passed in communicator.
  */
-CAFFE2_API void SetGlobalMPIComm(MPI_Comm new_comm);
+TORCH_API void SetGlobalMPIComm(MPI_Comm new_comm);
 
 /**
  * @brief A helper function to return the size of the given communicator.
  */
-CAFFE2_API int MPICommSize(MPI_Comm comm);
+TORCH_API int MPICommSize(MPI_Comm comm);
 
 /**
  * @brief A helper function to return the rank of the given communicator.
  */
-CAFFE2_API int MPICommRank(MPI_Comm comm);
+TORCH_API int MPICommRank(MPI_Comm comm);
 
 /**
  * @brief A simple wrapper over an MPI common world.
diff --git a/caffe2/observers/profile_observer.h b/caffe2/observers/profile_observer.h
index 89cd83f..8f39710 100644
--- a/caffe2/observers/profile_observer.h
+++ b/caffe2/observers/profile_observer.h
@@ -46,7 +46,7 @@
   float run_time_ = 0.0f;
 };
 
-class CAFFE2_API ProfileOperatorObserver final
+class TORCH_API ProfileOperatorObserver final
     : public ProfileCounter,
       public ObserverBase<OperatorBase> {
  public:
@@ -94,7 +94,7 @@
   void Stop() override;
 };
 
-class CAFFE2_API ProfileObserver final : public OperatorAttachingNetObserver<
+class TORCH_API ProfileObserver final : public OperatorAttachingNetObserver<
                                              ProfileOperatorObserver,
                                              ProfileObserver> {
  public:
diff --git a/caffe2/observers/runcnt_observer.h b/caffe2/observers/runcnt_observer.h
index 76a0e40..93bf4e4 100644
--- a/caffe2/observers/runcnt_observer.h
+++ b/caffe2/observers/runcnt_observer.h
@@ -9,7 +9,7 @@
 
 class RunCountNetObserver;
 
-class CAFFE2_API RunCountOperatorObserver final
+class TORCH_API RunCountOperatorObserver final
     : public ObserverBase<OperatorBase> {
  public:
   explicit RunCountOperatorObserver(OperatorBase* op) = delete;
@@ -27,7 +27,7 @@
   RunCountNetObserver* netObserver_;
 };
 
-class CAFFE2_API RunCountNetObserver final
+class TORCH_API RunCountNetObserver final
     : public OperatorAttachingNetObserver<
           RunCountOperatorObserver,
           RunCountNetObserver> {
diff --git a/caffe2/observers/time_observer.h b/caffe2/observers/time_observer.h
index fa54e1f..84de8ef 100644
--- a/caffe2/observers/time_observer.h
+++ b/caffe2/observers/time_observer.h
@@ -14,7 +14,7 @@
 
 class TimeObserver;
 
-class CAFFE2_API TimeCounter {
+class TORCH_API TimeCounter {
  public:
   explicit TimeCounter() {}
   inline float average_time() const {
@@ -28,7 +28,7 @@
   int iterations_ = 0;
 };
 
-class CAFFE2_API TimeOperatorObserver final
+class TORCH_API TimeOperatorObserver final
     : public TimeCounter,
       public ObserverBase<OperatorBase> {
  public:
@@ -46,7 +46,7 @@
   void Stop() override;
 };
 
-class CAFFE2_API TimeObserver final
+class TORCH_API TimeObserver final
     : public TimeCounter,
       public OperatorAttachingNetObserver<TimeOperatorObserver, TimeObserver> {
  public:
diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h
index 05d5678..5fec9dc 100644
--- a/caffe2/onnx/backend.h
+++ b/caffe2/onnx/backend.h
@@ -25,7 +25,7 @@
 
 using ValueInfoMap = std::unordered_map<std::string, ValueInfoProto>;
 
-class CAFFE2_API ConversionContext {
+class TORCH_API ConversionContext {
  public:
   ConversionContext(const ValueInfoMap& value_infos, int opset_version)
       : value_infos_(value_infos), opset_version_(opset_version) {}
@@ -44,7 +44,7 @@
 // \brief This struct holds the converted ops after the onnx->c2 conversion.
 // Notice that for RNN ops, it may create ops in init_net. Hence we have the
 // `init_ops` field.
-struct CAFFE2_API Caffe2Ops {
+struct TORCH_API Caffe2Ops {
   ::google::protobuf::RepeatedPtrField<caffe2::OperatorDef> init_ops;
   ::google::protobuf::RepeatedPtrField<caffe2::OperatorDef> ops;
   ::google::protobuf::RepeatedPtrField<std::string> interface_blobs;
@@ -52,7 +52,7 @@
 
 // A convenient class to query attributes of a NodeProto. Note that the
 // NodeProto can not be modified during the query of OnnxAttributes object
-class CAFFE2_API OnnxAttributes {
+class TORCH_API OnnxAttributes {
  public:
   OnnxAttributes(const NodeProto& node);
 
@@ -120,7 +120,7 @@
 const TensorProto* OnnxAttributes::get(const std::string& key) const;
 
 // convenient class for onnx node
-struct CAFFE2_API OnnxNode {
+struct TORCH_API OnnxNode {
   OnnxNode(const NodeProto& node_in) : node(node_in), attributes(node_in) {}
 
   const NodeProto& node;
@@ -128,7 +128,7 @@
   OnnxAttributes attributes;
 };
 
-class CAFFE2_API Caffe2Backend {
+class TORCH_API Caffe2Backend {
  public:
   // Since we still have this Python-C++ hybrid flow, we will need to take the
   // DummyName generator from Python as a pointer. In this case, Python env owns
diff --git a/caffe2/onnx/backend_rep.h b/caffe2/onnx/backend_rep.h
index eb91ea6..8618995 100644
--- a/caffe2/onnx/backend_rep.h
+++ b/caffe2/onnx/backend_rep.h
@@ -9,7 +9,7 @@
 
 namespace caffe2 {
 namespace onnx {
-class CAFFE2_API Caffe2BackendRep {
+class TORCH_API Caffe2BackendRep {
  public:
   void Run(
       const caffe2::Predictor::TensorList& inputs,
diff --git a/caffe2/onnx/helper.h b/caffe2/onnx/helper.h
index c310aa4..5f706b2 100644
--- a/caffe2/onnx/helper.h
+++ b/caffe2/onnx/helper.h
@@ -14,7 +14,7 @@
 using ::ONNX_NAMESPACE::NodeProto;
 
 // \brief This class generates unique dummy names
-class CAFFE2_API DummyName {
+class TORCH_API DummyName {
  public:
   std::string NewDummyName();
 
@@ -98,7 +98,7 @@
   return ret;
 }
 
-CAFFE2_API NodeProto MakeNode(
+TORCH_API NodeProto MakeNode(
     const std::string& type,
     const std::vector<std::string>& inputs,
     const std::vector<std::string>& outputs,
diff --git a/caffe2/onnx/offline_tensor.h b/caffe2/onnx/offline_tensor.h
index 094df7d..9c6b85d 100644
--- a/caffe2/onnx/offline_tensor.h
+++ b/caffe2/onnx/offline_tensor.h
@@ -7,7 +7,7 @@
 namespace caffe2 {
 
 #ifndef C10_MOBILE
-struct CAFFE2_API OfflineTensor {
+struct TORCH_API OfflineTensor {
   // A shell tensor to record shape and dtype
   Tensor shape_tensor{CPU};
 
diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h
index 6416336..c0040e5 100644
--- a/caffe2/onnx/onnx_exporter.h
+++ b/caffe2/onnx/onnx_exporter.h
@@ -31,7 +31,7 @@
 
 // Rewrite Caffe2 nets into SSA forms. Notice that we will preserve the external
 // output names for predict net.
-CAFFE2_API std::unordered_map<std::string, std::string> SsaRewrite(
+TORCH_API std::unordered_map<std::string, std::string> SsaRewrite(
     caffe2::NetDef* init_net,
     caffe2::NetDef* pred_net,
     bool PreserveInPlaceOps = true);
@@ -39,7 +39,7 @@
 ::ONNX_NAMESPACE::TensorProto::DataType Caffe2TypeToOnnxType(
     caffe2::TensorProto::DataType t);
 
-class CAFFE2_API OnnxExporter {
+class TORCH_API OnnxExporter {
   using SpecialOpConverter = ConvertedResult (OnnxExporter::*)(
       const caffe2::OperatorDef&,
       const std::unordered_map<std::string, caffe2::TensorShape>&);
diff --git a/caffe2/operators/counter_ops.h b/caffe2/operators/counter_ops.h
index aea0136..bc90f9c 100644
--- a/caffe2/operators/counter_ops.h
+++ b/caffe2/operators/counter_ops.h
@@ -9,7 +9,7 @@
 
 namespace caffe2 {
 template <typename T>
-class CAFFE2_API Counter {
+class TORCH_API Counter {
  public:
   explicit Counter(T count) : count_(count) {}
   bool countDown() {
diff --git a/caffe2/operators/create_scope_op.h b/caffe2/operators/create_scope_op.h
index b5d75a8..474b1c1 100644
--- a/caffe2/operators/create_scope_op.h
+++ b/caffe2/operators/create_scope_op.h
@@ -20,7 +20,7 @@
  * Keeps track of forward and backward gradient workspaces in stack,
  * reuses previously created workspaces, non-thread safe
  */
-class CAFFE2_API WorkspaceStack {
+class TORCH_API WorkspaceStack {
  public:
   explicit WorkspaceStack() : parent_ws_(nullptr), top_(-1) {}
 
diff --git a/caffe2/operators/cross_entropy_op.h b/caffe2/operators/cross_entropy_op.h
index 932ed0d..ec587b7 100644
--- a/caffe2/operators/cross_entropy_op.h
+++ b/caffe2/operators/cross_entropy_op.h
@@ -125,7 +125,7 @@
 };
 
 template <typename T, class Context>
-class CAFFE2_API CrossEntropyOp final : public Operator<Context> {
+class TORCH_API CrossEntropyOp final : public Operator<Context> {
  public:
   USE_SIMPLE_CTOR_DTOR(CrossEntropyOp);
   USE_OPERATOR_CONTEXT_FUNCTIONS;
@@ -140,7 +140,7 @@
 };
 
 template <typename T, class Context>
-class CAFFE2_API CrossEntropyGradientOp final : public Operator<Context> {
+class TORCH_API CrossEntropyGradientOp final : public Operator<Context> {
  public:
   USE_SIMPLE_CTOR_DTOR(CrossEntropyGradientOp);
   USE_OPERATOR_CONTEXT_FUNCTIONS;
diff --git a/caffe2/operators/elementwise_ops_utils.h b/caffe2/operators/elementwise_ops_utils.h
index 93ef400..104e7a8 100644
--- a/caffe2/operators/elementwise_ops_utils.h
+++ b/caffe2/operators/elementwise_ops_utils.h
@@ -10,20 +10,20 @@
 namespace caffe2 {
 namespace elementwise_ops_utils {
 
-CAFFE2_API std::tuple<size_t, size_t, size_t>
+TORCH_API std::tuple<size_t, size_t, size_t>
 ComputeLegacyBroadcastSizes(const Tensor& A, const Tensor& B, int axis);
 
-CAFFE2_API std::vector<int> ComputeBinaryBroadcastForwardDims(
+TORCH_API std::vector<int> ComputeBinaryBroadcastForwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims);
 
-CAFFE2_API void ComputeBinaryBroadcastBackwardAxes(
+TORCH_API void ComputeBinaryBroadcastBackwardAxes(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims,
     std::vector<int>* A_axes,
     std::vector<int>* B_axes);
 
-CAFFE2_API void ComputeBinaryBroadcastBackwardDims(
+TORCH_API void ComputeBinaryBroadcastBackwardDims(
     const std::vector<int>& A_dims,
     const std::vector<int>& B_dims,
     std::vector<int>* A_back_dims,
diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h
index 0b239a3..b783b3d 100644
--- a/caffe2/operators/generate_proposals_op.h
+++ b/caffe2/operators/generate_proposals_op.h
@@ -49,7 +49,7 @@
 // anchors: predefined anchors, size(A, 4)
 // Return: all_anchors_vec: (H * W, A * 4)
 // Need to reshape to (H * W * A, 4) to match the format in python
-CAFFE2_API ERMatXf ComputeAllAnchors(
+TORCH_API ERMatXf ComputeAllAnchors(
     const TensorCPU& anchors,
     int height,
     int width,
@@ -59,7 +59,7 @@
 // spatial location, only computes anchors for the already sorted and filtered
 // positions after NMS is applied to avoid unnecessary computation.
 // `order` is a raveled array of sorted indices in (A, H, W) format.
-CAFFE2_API ERArrXXf ComputeSortedAnchors(
+TORCH_API ERArrXXf ComputeSortedAnchors(
     const Eigen::Map<const ERArrXXf>& anchors,
     int height,
     int width,
diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.h b/caffe2/operators/generate_proposals_op_util_nms_gpu.h
index 10d081f..697a1dd 100644
--- a/caffe2/operators/generate_proposals_op_util_nms_gpu.h
+++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.h
@@ -23,7 +23,7 @@
 // by NMS
 //    Those tensors will be resized to the necessary size
 // context : current CUDA context
-CAFFE2_API void nms_gpu_upright(
+TORCH_API void nms_gpu_upright(
     const float* d_desc_sorted_boxes,
     const int N,
     const float thresh,
@@ -42,7 +42,7 @@
 // d_desc_sorted_boxes : pixel coordinates of proposed bounding boxes
 //    size: (N,5), format: [x_ct; y_ctr; width; height; angle]
 //    the boxes are sorted by scores in descending order
-CAFFE2_API void nms_gpu_rotated(
+TORCH_API void nms_gpu_rotated(
     const float* d_desc_sorted_boxes,
     const int N,
     const float thresh,
@@ -52,7 +52,7 @@
     TensorCPU& host_delete_mask,
     CUDAContext* context);
 
-CAFFE2_API void nms_gpu(
+TORCH_API void nms_gpu(
     const float* d_desc_sorted_boxes,
     const int N,
     const float thresh,
diff --git a/caffe2/operators/load_save_op_util.h b/caffe2/operators/load_save_op_util.h
index b99bf73..f0978d9 100644
--- a/caffe2/operators/load_save_op_util.h
+++ b/caffe2/operators/load_save_op_util.h
@@ -26,32 +26,32 @@
         is_tensor(is_tensor) {}
 };
 
-CAFFE2_API std::string buildBlobNameFromDbKey(
+TORCH_API std::string buildBlobNameFromDbKey(
     const std::string& dbKey,
     const std::string& strip_prefix = "",
     const std::string& add_prefix = "");
 
 // We are tracking sizes of already read tensor parts while reading data
 // chunks. This way we can make sure that all chunks were loaded in the end.
-CAFFE2_API void ProcessBlob(
+TORCH_API void ProcessBlob(
     Blob* blob,
     const BlobProto& proto,
     std::unordered_map<std::string, BlobState>* blob_states_ptr,
     const std::string& key,
     int* loaded_blobs);
 
-CAFFE2_API void prepareBlob(
+TORCH_API void prepareBlob(
     Blob* blob,
     std::unordered_map<std::string, BlobState>* blob_states_ptr,
     const std::string& key);
 
-CAFFE2_API void updateBlobStates(
+TORCH_API void updateBlobStates(
     const BlobProto& proto,
     std::unordered_map<std::string, BlobState>* blob_states_ptr,
     const std::string& key,
     int* loaded_blobs);
 
-CAFFE2_API void validateBlobStates(
+TORCH_API void validateBlobStates(
     const std::unordered_map<std::string, BlobState>& blob_states);
 
 } // namespace load_save_op_util
diff --git a/caffe2/operators/locally_connected_op_util.h b/caffe2/operators/locally_connected_op_util.h
index e9eb900..d1fd77f 100644
--- a/caffe2/operators/locally_connected_op_util.h
+++ b/caffe2/operators/locally_connected_op_util.h
@@ -35,7 +35,7 @@
   int Y_W;
 };
 
-CAFFE2_API void SetColumnBufferShape(
+TORCH_API void SetColumnBufferShape(
     int N,
     int kernel_dim,
     int output_image_size,
@@ -46,7 +46,7 @@
     std::vector<int>* column_transposed_dims,
     std::vector<int>* column_axes);
 
-CAFFE2_API void SetYBufferShape(
+TORCH_API void SetYBufferShape(
     int N,
     int M,
     int output_image_size,
diff --git a/caffe2/operators/pad_op.h b/caffe2/operators/pad_op.h
index fc13899..8ba352c 100644
--- a/caffe2/operators/pad_op.h
+++ b/caffe2/operators/pad_op.h
@@ -16,7 +16,7 @@
   EDGE = 2, // pads with the edge values, with string "edge"
 };
 
-CAFFE2_API PadMode StringToPadMode(const string&);
+TORCH_API PadMode StringToPadMode(const string&);
 
 template <typename T, class Context>
 class PadImageOp final : public ConvPoolOpBase<Context> {
diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h
index 95197ee..eecccf7 100644
--- a/caffe2/operators/rnn/recurrent_network_executor.h
+++ b/caffe2/operators/rnn/recurrent_network_executor.h
@@ -476,7 +476,7 @@
     std::string timestep_blob,
     ArgumentHelper rnn_args);
 
-class CAFFE2_API ThreadedRecurrentNetworkExecutor : public RecurrentNetworkExecutorBase {
+class TORCH_API ThreadedRecurrentNetworkExecutor : public RecurrentNetworkExecutorBase {
  public:
   ThreadedRecurrentNetworkExecutor(
       const NetDef& step_net_def,
diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h
index 8484b68..86b6e45 100644
--- a/caffe2/operators/rnn/recurrent_network_op.h
+++ b/caffe2/operators/rnn/recurrent_network_op.h
@@ -46,7 +46,7 @@
   int32_t window{1};
 };
 
-struct CAFFE2_API ScratchWorkspaces {
+struct TORCH_API ScratchWorkspaces {
   std::vector<std::shared_ptr<Workspace>> stepWorkspaces;
   std::shared_ptr<Workspace> sharedBlobsWs = nullptr;
 };
@@ -59,7 +59,7 @@
       t;
 }
 
-CAFFE2_API std::map<string, string> GetRecurrentMapping(
+TORCH_API std::map<string, string> GetRecurrentMapping(
     const std::vector<detail::Link>& links,
     bool backward);
 
@@ -158,15 +158,15 @@
   }
 }
 
-CAFFE2_API void PrependOps(std::vector<OperatorDef> ops, NetDef* netdef);
+TORCH_API void PrependOps(std::vector<OperatorDef> ops, NetDef* netdef);
 
-CAFFE2_API void AddApplyLinkOps(
+TORCH_API void AddApplyLinkOps(
     const vector<Link>& links,
     std::string timestep,
     const DeviceOption& device_option,
     NetDef* netdef);
 
-CAFFE2_API void extractLinks(
+TORCH_API void extractLinks(
     OperatorBase* op,
     const std::string& internalArg,
     const std::string& externalArg,
@@ -174,7 +174,7 @@
     const std::string& windowArg,
     std::vector<detail::Link>* links);
 
-CAFFE2_API NetDef
+TORCH_API NetDef
 extractNetDef(const OperatorDef& op, const std::string& argName);
 } // namespace detail
 
diff --git a/caffe2/operators/sparse_lp_regularizer_op.h b/caffe2/operators/sparse_lp_regularizer_op.h
index 95a33e0..b2e1965 100644
--- a/caffe2/operators/sparse_lp_regularizer_op.h
+++ b/caffe2/operators/sparse_lp_regularizer_op.h
@@ -6,7 +6,7 @@
 namespace caffe2 {
 
 template <typename T, class Context>
-class CAFFE2_API SparseLpRegularizerOp final : public Operator<Context> {
+class TORCH_API SparseLpRegularizerOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   template <class... Args>
diff --git a/caffe2/operators/sparse_normalize_op.h b/caffe2/operators/sparse_normalize_op.h
index de2fba4..44434b2 100644
--- a/caffe2/operators/sparse_normalize_op.h
+++ b/caffe2/operators/sparse_normalize_op.h
@@ -6,7 +6,7 @@
 namespace caffe2 {
 
 template <typename T, class Context>
-class CAFFE2_API SparseNormalizeOp final : public Operator<Context> {
+class TORCH_API SparseNormalizeOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   template <class... Args>
diff --git a/caffe2/operators/text_file_reader_utils.h b/caffe2/operators/text_file_reader_utils.h
index 558c733..01b4743 100644
--- a/caffe2/operators/text_file_reader_utils.h
+++ b/caffe2/operators/text_file_reader_utils.h
@@ -9,13 +9,13 @@
 
 namespace caffe2 {
 
-struct CAFFE2_API Token {
+struct TORCH_API Token {
   int startDelimId;
   const char* start;
   const char* end;
 };
 
-class CAFFE2_API TokenizedString {
+class TORCH_API TokenizedString {
   // holder for strings that have been modified
   std::vector<std::shared_ptr<std::string>> modifiedStrings_;
   std::vector<Token> tokens_;
@@ -31,7 +31,7 @@
   friend class Tokenizer;
 };
 
-class CAFFE2_API Tokenizer {
+class TORCH_API Tokenizer {
  private:
   int startDelimId_;
   // state of the tokenizer
@@ -48,18 +48,18 @@
   void next(char* start, char* end, TokenizedString& tokenized);
 };
 
-struct CAFFE2_API CharRange {
+struct TORCH_API CharRange {
   char* start;
   char* end;
 };
 
-struct CAFFE2_API StringProvider {
+struct TORCH_API StringProvider {
   virtual void operator()(CharRange&) = 0;
   virtual void reset() = 0;
   virtual ~StringProvider() {}
 };
 
-class CAFFE2_API BufferedTokenizer {
+class TORCH_API BufferedTokenizer {
  public:
   BufferedTokenizer(const Tokenizer& t, StringProvider* p, int numPasses = 1)
       : provider_(p), tokenizer_(t), tokenIndex_(0), numPasses_(numPasses) {}
@@ -104,7 +104,7 @@
   int pass_{0};
 };
 
-class CAFFE2_API FileReader : public StringProvider {
+class TORCH_API FileReader : public StringProvider {
  public:
   explicit FileReader(const std::string& path, size_t bufferSize = 65536);
   ~FileReader();
diff --git a/caffe2/opt/annotations.h b/caffe2/opt/annotations.h
index 9bc1f1e..89ff7c3 100644
--- a/caffe2/opt/annotations.h
+++ b/caffe2/opt/annotations.h
@@ -7,7 +7,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation {
+class TORCH_API Caffe2Annotation : public nom::repr::Annotation {
  public:
   Caffe2Annotation() : Annotation(AnnotationKind::Caffe2) {}
   Caffe2Annotation(std::string device)
diff --git a/caffe2/opt/backend_cutting.h b/caffe2/opt/backend_cutting.h
index c4c0a68..5b4df14 100644
--- a/caffe2/opt/backend_cutting.h
+++ b/caffe2/opt/backend_cutting.h
@@ -8,8 +8,8 @@
 
 namespace caffe2 {
 namespace opt {
-CAFFE2_API void DumpGraph(nom::repr::NNGraph* g, const std::string& fname);
-CAFFE2_API caffe2::NetDef OptimizeForBackend(
+TORCH_API void DumpGraph(nom::repr::NNGraph* g, const std::string& fname);
+TORCH_API caffe2::NetDef OptimizeForBackend(
     caffe2::NetDef& net,
     std::function<bool(const caffe2::OperatorDef&)> supports,
     std::function<caffe2::NetDef(const caffe2::NetDef&)> transform_func,
diff --git a/caffe2/opt/bound_shape_inferencer.h b/caffe2/opt/bound_shape_inferencer.h
index 662121b..54290c5 100644
--- a/caffe2/opt/bound_shape_inferencer.h
+++ b/caffe2/opt/bound_shape_inferencer.h
@@ -15,7 +15,7 @@
 // max_seq_size is the upper bound of length of every item in a batch.
 // Upper bound of length of a batch of items should be max_batch_size *
 // max_seq_size.
-struct CAFFE2_API BoundShapeSpec {
+struct TORCH_API BoundShapeSpec {
   explicit BoundShapeSpec(int64_t b, int64_t q)
       : max_batch_size(b),
         max_seq_size(q),
@@ -86,7 +86,7 @@
   bool extract_feature_len_;
 };
 
-class CAFFE2_API BoundShapeInferencer : public BoundShapeInferencerBase {
+class TORCH_API BoundShapeInferencer : public BoundShapeInferencerBase {
  public:
   explicit BoundShapeInferencer(const BoundShapeSpec& spec)
       : BoundShapeInferencerBase(spec) {}
@@ -149,7 +149,7 @@
   int64_t current_max_batch_size_{0};
 };
 
-CAFFE2_API std::shared_ptr<BoundShapeInferencerBase> getBoundShapeInferencer(
+TORCH_API std::shared_ptr<BoundShapeInferencerBase> getBoundShapeInferencer(
     const BoundShapeSpec& spec);
 
 C10_DECLARE_SHARED_REGISTRY(
diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h
index 5cd69f1..7341899 100644
--- a/caffe2/opt/converter.h
+++ b/caffe2/opt/converter.h
@@ -13,38 +13,38 @@
 
 namespace caffe2 {
 
-CAFFE2_API void injectDataEdgeIndicators(caffe2::NetDef* net);
-CAFFE2_API void removeDataEdgeIndicators(caffe2::NetDef* net);
+TORCH_API void injectDataEdgeIndicators(caffe2::NetDef* net);
+TORCH_API void removeDataEdgeIndicators(caffe2::NetDef* net);
 
 // Default conversion to a NNModule
 // Optionally strict -- which checks for various input and output conditions.
 // Optionally this function will update a vector that maps operators in the
 // netdef positionally to NodeRefs in the resultant NNModule.
-CAFFE2_API nom::repr::NNModule convertToNNModule(
+TORCH_API nom::repr::NNModule convertToNNModule(
     const caffe2::NetDef& net,
     bool strict = false,
     std::vector<nom::repr::NNGraph::NodeRef>* = nullptr);
-CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&);
+TORCH_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&);
 
 // Pass in an oldNet to copy all the attributes of that network.
 // Be warned that transformations that modify the graph's inputs or outputs
 // are not reflected in changes to external_input or external_output.
-CAFFE2_API caffe2::NetDef convertToCaffe2Proto(
+TORCH_API caffe2::NetDef convertToCaffe2Proto(
     nom::repr::NNModule&,
     const caffe2::NetDef& oldNet);
 
 // Use these functions instead of the registry directly.
-CAFFE2_API std::unique_ptr<nom::repr::NeuralNetOperator>
+TORCH_API std::unique_ptr<nom::repr::NeuralNetOperator>
 convertToNeuralNetOperator(const caffe2::OperatorDef& op);
 
-CAFFE2_API caffe2::OperatorDef convertToOperatorDef(
+TORCH_API caffe2::OperatorDef convertToOperatorDef(
     const nom::repr::NNGraph::NodeRef& instrNode);
 
 // If the annotation doesn't exist, attempt to add it
-CAFFE2_API Caffe2Annotation* getOrAddCaffe2Annotation(
+TORCH_API Caffe2Annotation* getOrAddCaffe2Annotation(
     nom::repr::NNGraph::NodeRef& instrNode);
 
-class CAFFE2_API Converter {
+class TORCH_API Converter {
  public:
   explicit Converter() = default;
   virtual std::unique_ptr<nom::repr::NeuralNetOperator>
diff --git a/caffe2/opt/device.h b/caffe2/opt/device.h
index daa634d..b2425cc 100644
--- a/caffe2/opt/device.h
+++ b/caffe2/opt/device.h
@@ -4,7 +4,7 @@
 namespace caffe2 {
 namespace opt {
 
-CAFFE2_API void insertCopies(
+TORCH_API void insertCopies(
     nom::repr::NNModule* nn,
     std::function<bool(nom::repr::NNGraph::NodeRef)> supported,
     std::function<nom::repr::NNGraph::NodeRef(nom::repr::NNGraph&)> copyToFn,
diff --git a/caffe2/opt/distributed.h b/caffe2/opt/distributed.h
index 27e57f5..8089612 100644
--- a/caffe2/opt/distributed.h
+++ b/caffe2/opt/distributed.h
@@ -16,7 +16,7 @@
 ///
 /// Throws an exception if the passed in blobMap contains
 /// blobs that are not present in the NNModule.
-CAFFE2_API nom::repr::NNModule convertToNNModule(
+TORCH_API nom::repr::NNModule convertToNNModule(
     caffe2::NetDef&,
     std::map<std::string, caffe2::DeviceOption>);
 
@@ -24,10 +24,10 @@
 /// if you already have an NNModule.
 /// You probably don't want to use these
 /// if you can use convertToNNModule instead.
-CAFFE2_API void addBlobDeviceOptions(
+TORCH_API void addBlobDeviceOptions(
     std::map<std::string, caffe2::DeviceOption> blobMap,
     nom::repr::NNModule* nn);
-CAFFE2_API void injectDataEdgeIndicators(nom::repr::NNModule* nn);
-CAFFE2_API void removeDataEdgeIndicators(nom::repr::NNModule* nn);
+TORCH_API void injectDataEdgeIndicators(nom::repr::NNModule* nn);
+TORCH_API void removeDataEdgeIndicators(nom::repr::NNModule* nn);
 
 } // namespace caffe2
diff --git a/caffe2/opt/fakefp16_transform.h b/caffe2/opt/fakefp16_transform.h
index 9697381..22729a0 100644
--- a/caffe2/opt/fakefp16_transform.h
+++ b/caffe2/opt/fakefp16_transform.h
@@ -12,14 +12,14 @@
 namespace opt {
 
 // Mapping from fp32 ops to fakefp16 ops
-CAFFE2_API std::unordered_map<std::string, std::string> getFakeFp16OpMapping(
+TORCH_API std::unordered_map<std::string, std::string> getFakeFp16OpMapping(
     bool use_fp16_acc = false,
     bool use_nnpi = false);
 
-CAFFE2_API void fakeFp16FuseOps(NetDef* net);
+TORCH_API void fakeFp16FuseOps(NetDef* net);
 
 // Transform normal fp32 operators to fakefp16 operators.
-CAFFE2_API void fakeFp16Transform(NetDef* net);
+TORCH_API void fakeFp16Transform(NetDef* net);
 
 } // namespace opt
 } // namespace caffe2
diff --git a/caffe2/opt/fusion.h b/caffe2/opt/fusion.h
index 0973ade..7dde163 100644
--- a/caffe2/opt/fusion.h
+++ b/caffe2/opt/fusion.h
@@ -25,7 +25,7 @@
 
 using namespace nom;
 
-CAFFE2_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws);
+TORCH_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws);
 
 // Generic activation fusion helper.
 //
diff --git a/caffe2/opt/mobile.h b/caffe2/opt/mobile.h
index 78e9876..d31a3f8 100644
--- a/caffe2/opt/mobile.h
+++ b/caffe2/opt/mobile.h
@@ -7,8 +7,8 @@
 namespace caffe2 {
 namespace opt {
 
-CAFFE2_API void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false);
-CAFFE2_API void fuseNNPACKConvRelu(nom::repr::NNModule* nn);
+TORCH_API void addNNPACK(nom::repr::NNModule* nn, bool low_memory = false);
+TORCH_API void fuseNNPACKConvRelu(nom::repr::NNModule* nn);
 
 } // namespace opt
 } // namespace caffe2
diff --git a/caffe2/opt/onnx_convert.h b/caffe2/opt/onnx_convert.h
index 707d413..89bf209 100644
--- a/caffe2/opt/onnx_convert.h
+++ b/caffe2/opt/onnx_convert.h
@@ -1,6 +1,6 @@
 #include "caffe2/core/common.h"
 
-class CAFFE2_API OnnxAnnotation : public nom::repr::Annotation {
+class TORCH_API OnnxAnnotation : public nom::repr::Annotation {
 public:
   OnnxAnnotation() : Annotation(AnnotationKind::Onnx) {}
   OnnxAnnotation(std::string device)
@@ -30,8 +30,8 @@
   caffe2::OperatorDef* OpDef = nullptr;
 };
 
-CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map<std::string, nom::repr::NNGraph::NodeRef>* blobMapOut = nullptr);
+TORCH_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map<std::string, nom::repr::NNGraph::NodeRef>* blobMapOut = nullptr);
 
-CAFFE2_API caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&);
+TORCH_API caffe2::NetDef convertToOnnxProto(nom::repr::NNModule&);
 
-CAFFE2_API std::unique_ptr<nom::repr::NeuralNetOperator> convertToOperatorDef(caffe2::OperatorDef op);
+TORCH_API std::unique_ptr<nom::repr::NeuralNetOperator> convertToOperatorDef(caffe2::OperatorDef op);
diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h
index d86f112..d88eb73 100644
--- a/caffe2/opt/onnxifi_transformer.h
+++ b/caffe2/opt/onnxifi_transformer.h
@@ -18,7 +18,7 @@
 
 // Split SparseLengthsSumSparse into SparseLengthsSumSparseLookup +
 // SparseLengthsSum
-CAFFE2_API void splitSparseLengthsSumSparse(NetDef* net, const Workspace& ws);
+TORCH_API void splitSparseLengthsSumSparse(NetDef* net, const Workspace& ws);
 
 struct OnnxifiTransformerOptions final : public BackendTransformOptions {
   explicit OnnxifiTransformerOptions() : BackendTransformOptions() {}
@@ -49,7 +49,7 @@
   std::unordered_map<int, ShapeInfoMap> shape_hints_per_bs;
 };
 
-class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase {
+class TORCH_API OnnxifiTransformer final : public BackendTransformerBase {
  public:
   explicit OnnxifiTransformer(const OnnxifiTransformerOptions& opts);
   ~OnnxifiTransformer() override;
diff --git a/caffe2/opt/optimize_ideep.h b/caffe2/opt/optimize_ideep.h
index 85b86bf..280ef1b 100644
--- a/caffe2/opt/optimize_ideep.h
+++ b/caffe2/opt/optimize_ideep.h
@@ -8,7 +8,7 @@
 namespace caffe2 {
 namespace opt {
 
-CAFFE2_API void OptimizeForMkldnn(
+TORCH_API void OptimizeForMkldnn(
     nom::repr::NNModule* nn,
     caffe2::Workspace* ws,
     bool training_mode = false);
diff --git a/caffe2/opt/optimizer.h b/caffe2/opt/optimizer.h
index 326f371..72245b4 100644
--- a/caffe2/opt/optimizer.h
+++ b/caffe2/opt/optimizer.h
@@ -8,8 +8,8 @@
 namespace caffe2 {
 namespace opt {
 
-CAFFE2_API NetDef optimize(NetDef net, Workspace* ws, int level = 1);
-CAFFE2_API NetDef optimize(NetDef net, int level = 1);
+TORCH_API NetDef optimize(NetDef net, Workspace* ws, int level = 1);
+TORCH_API NetDef optimize(NetDef net, int level = 1);
 
 } // namespace opt
 } // namespace caffe2
diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h
index fc15dca..b2ef81c 100644
--- a/caffe2/opt/passes.h
+++ b/caffe2/opt/passes.h
@@ -21,7 +21,7 @@
  * use a different registry and inherit from WorkspaceOptimizationPass.
  */
 
-class CAFFE2_API OptimizationPass {
+class TORCH_API OptimizationPass {
  public:
   OptimizationPass(NNModule* nn) : nn_(nn) {}
   virtual void run() = 0;
@@ -31,7 +31,7 @@
   NNModule* nn_;
 };
 
-class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass {
+class TORCH_API WorkspaceOptimizationPass : public OptimizationPass {
  public:
   WorkspaceOptimizationPass(NNModule* nn, Workspace* ws) : OptimizationPass(nn), ws_(ws) {}
   virtual ~WorkspaceOptimizationPass() {}
diff --git a/caffe2/opt/shape_info.h b/caffe2/opt/shape_info.h
index e92f28e..b843963 100644
--- a/caffe2/opt/shape_info.h
+++ b/caffe2/opt/shape_info.h
@@ -4,7 +4,7 @@
 
 namespace caffe2 {
 
-struct CAFFE2_API QShapeInfo {
+struct TORCH_API QShapeInfo {
   QShapeInfo(float o = 0, float s = 1, uint32_t a = 1) {
     offset.clear();
     scale.clear();
@@ -18,7 +18,7 @@
   vector<float> scale;
 };
 
-struct CAFFE2_API ShapeInfo {
+struct TORCH_API ShapeInfo {
   ShapeInfo(bool q = false) : is_quantized(q) {}
   ShapeInfo(
       std::vector<TensorBoundShape_DimType>&& t,
@@ -133,23 +133,23 @@
 // since they are already inserted as CONSTANT, it will take effect here.
 // For SEQ typed tensors, there are only a few of them and they will be
 // handled by BoundShapeInferencer.
-CAFFE2_API ShapeInfo constructShapeInfoWithDefaultDimType(
+TORCH_API ShapeInfo constructShapeInfoWithDefaultDimType(
     TensorShape shape,
     TensorBoundShape_DimType defaultFirstDimType =
         TensorBoundShape_DimType_BATCH);
 
-CAFFE2_API void parseShapeInfoMapFromString(const std::string&, ShapeInfoMap&);
+TORCH_API void parseShapeInfoMapFromString(const std::string&, ShapeInfoMap&);
 
 // Extract shape info from tensorBoundShapes to a ShapeInfoMap.
 // Change shape according to new max_batch_size and max_feature_len
 // at the same time if necessary.
-CAFFE2_API ShapeInfoMap extractShapeInfoFromTensorBoundShapes(
+TORCH_API ShapeInfoMap extractShapeInfoFromTensorBoundShapes(
     TensorBoundShapes tensor_bound_shapes,
     int64_t new_max_batch_size = -1,
     int64_t new_max_feature_len = -1);
 
 // In-place modify TensorBoundShape to change shape size based on type
-CAFFE2_API void changeTensorBoundShapes(
+TORCH_API void changeTensorBoundShapes(
     TensorBoundShape& tensor_shape_and_type,
     const int64_t old_batch_size,
     const int64_t old_seq_size,
@@ -157,7 +157,7 @@
     const int64_t new_seq_size);
 
 // In-place modify TensorShape's shape at a specific dimension
-CAFFE2_API void modifyTensorShapeDimSize(
+TORCH_API void modifyTensorShapeDimSize(
     TensorShape* tensor_shape,
     int dim_index,
     const int64_t old_size,
diff --git a/caffe2/opt/tvm_transformer.h b/caffe2/opt/tvm_transformer.h
index 8ff29ba..6a4a345 100644
--- a/caffe2/opt/tvm_transformer.h
+++ b/caffe2/opt/tvm_transformer.h
@@ -13,7 +13,7 @@
   bool profiling_based_jit{false};
 };
 
-class CAFFE2_API TvmTransformer final : public BackendTransformerBase {
+class TORCH_API TvmTransformer final : public BackendTransformerBase {
  public:
   explicit TvmTransformer(const TvmTransformOptions& opts)
       : BackendTransformerBase(), opts_(opts) {}
@@ -68,7 +68,7 @@
 };
 
 // Helper function to clean up a net and run tvm transform.
-CAFFE2_API void tvmTransform(
+TORCH_API void tvmTransform(
     NetDef* net,
     Workspace* ws,
     const std::vector<std::string>& input_names,
@@ -84,7 +84,7 @@
     bool tvm_profiling_based_jit,
     bool debug);
 
-CAFFE2_API void cleanUpPredictNet(
+TORCH_API void cleanUpPredictNet(
     NetDef* net,
     const std::vector<std::string>& input_names,
     const std::vector<std::string>& output_names,
diff --git a/caffe2/predictor/emulator/data_filler.h b/caffe2/predictor/emulator/data_filler.h
index b893a18..e3021f6 100644
--- a/caffe2/predictor/emulator/data_filler.h
+++ b/caffe2/predictor/emulator/data_filler.h
@@ -144,7 +144,7 @@
 };
 
 // Convenient helpers to fill data to workspace.
-CAFFE2_API void fillRandomNetworkInputs(
+TORCH_API void fillRandomNetworkInputs(
     const NetDef& net,
     const std::vector<std::vector<std::vector<int64_t>>>& inputDims,
     const std::vector<std::vector<std::string>>& inputTypes,
diff --git a/caffe2/predictor/predictor.h b/caffe2/predictor/predictor.h
index fd16eb5..f49de20 100644
--- a/caffe2/predictor/predictor.h
+++ b/caffe2/predictor/predictor.h
@@ -7,7 +7,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API Predictor {
+class TORCH_API Predictor {
  public:
   using TensorList = std::vector<TensorCPU>;
   using TensorMap = std::unordered_map<std::string, TensorCPU>;
diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h
index 243729b..ad3711e 100644
--- a/caffe2/predictor/predictor_config.h
+++ b/caffe2/predictor/predictor_config.h
@@ -17,7 +17,7 @@
 /**
  * Stores parameters nessasary for creating a PredictorInterface object.
  */
-struct CAFFE2_API PredictorConfig {
+struct TORCH_API PredictorConfig {
   // A map of parameter name to Tensor object. Predictor is supposed to
   // guarantee constness of all these Tensor objects.
   std::shared_ptr<PredictorParameters> parameters;
@@ -41,14 +41,14 @@
   std::shared_ptr<Workspace> ws;
 };
 
-CAFFE2_API Workspace makeWorkspace(std::shared_ptr<PredictorParameters> parameters);
+TORCH_API Workspace makeWorkspace(std::shared_ptr<PredictorParameters> parameters);
 
-CAFFE2_API PredictorConfig makePredictorConfig(
+TORCH_API PredictorConfig makePredictorConfig(
     const MetaNetDef& net,
     Workspace* parent = nullptr,
     bool run_init = true);
 
-CAFFE2_API PredictorConfig makePredictorConfig(
+TORCH_API PredictorConfig makePredictorConfig(
     const NetDef& init_net,
     const NetDef& run_net,
     Workspace* parent = nullptr,
diff --git a/caffe2/predictor/predictor_utils.cc b/caffe2/predictor/predictor_utils.cc
index e38d51d..44b2868 100644
--- a/caffe2/predictor/predictor_utils.cc
+++ b/caffe2/predictor/predictor_utils.cc
@@ -9,7 +9,7 @@
 namespace caffe2 {
 namespace predictor_utils {
 
-CAFFE2_API const NetDef& getNet(
+TORCH_API const NetDef& getNet(
     const MetaNetDef& def,
     const std::string& name) {
   for (const auto& n : def.nets()) {
diff --git a/caffe2/predictor/predictor_utils.h b/caffe2/predictor/predictor_utils.h
index 8c9cb4a..e7405e6 100644
--- a/caffe2/predictor/predictor_utils.h
+++ b/caffe2/predictor/predictor_utils.h
@@ -8,18 +8,18 @@
 namespace caffe2 {
 namespace predictor_utils {
 
-CAFFE2_API const NetDef& getNet(const MetaNetDef& def, const std::string& name);
+TORCH_API const NetDef& getNet(const MetaNetDef& def, const std::string& name);
 const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs(
     const MetaNetDef& def,
     const std::string& name);
 
-CAFFE2_API std::unique_ptr<MetaNetDef> extractMetaNetDef(
+TORCH_API std::unique_ptr<MetaNetDef> extractMetaNetDef(
     db::Cursor* cursor,
     const std::string& key);
 
 // Extract the MetaNetDef from `db`, and run the global init net on the
 // `master` workspace.
-CAFFE2_API std::unique_ptr<MetaNetDef> runGlobalInitialization(
+TORCH_API std::unique_ptr<MetaNetDef> runGlobalInitialization(
     std::unique_ptr<db::DBReader> db,
     Workspace* master);
 
diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt
index 9dc4b4a..ba6b696 100644
--- a/caffe2/proto/CMakeLists.txt
+++ b/caffe2/proto/CMakeLists.txt
@@ -10,14 +10,14 @@
 
 if(MSVC)
   if(BUILD_SHARED_LIBS)
-    set(Caffe2_API_DEFINE "-DCAFFE2_API=__declspec(dllexport)")
+    set(TORCH_API_DEFINE "-DTORCH_API=__declspec(dllexport)")
   else()
-    set(Caffe2_API_DEFINE "-DCAFFE2_API=")
+    set(TORCH_API_DEFINE "-DTORCH_API=")
   endif()
 else()
-  set(Caffe2_API_DEFINE "-DCAFFE2_API=")
+  set(TORCH_API_DEFINE "-DTORCH_API=")
 endif()
 target_compile_definitions(
-    Caffe2_PROTO PRIVATE ${Caffe2_API_DEFINE})
+    Caffe2_PROTO PRIVATE ${TORCH_API_DEFINE})
 
 install(FILES ${Caffe2_PROTO_HEADERS} DESTINATION include/caffe2/proto)
diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h
index 23af2be..fc8acab 100644
--- a/caffe2/proto/caffe2_pb.h
+++ b/caffe2/proto/caffe2_pb.h
@@ -16,7 +16,7 @@
 constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES =
     DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
 
-inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
+inline TORCH_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
   switch (p) {
     case caffe2::PROTO_CPU:
       return DeviceType::CPU;
@@ -44,11 +44,11 @@
   }
 }
 
-inline CAFFE2_API DeviceType ProtoToType(int p) {
+inline TORCH_API DeviceType ProtoToType(int p) {
   return ProtoToType(static_cast<caffe2::DeviceTypeProto>(p));
 }
 
-inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) {
+inline TORCH_API DeviceTypeProto TypeToProto(const DeviceType& t) {
   switch (t) {
     case DeviceType::CPU:
       return caffe2::PROTO_CPU;
@@ -76,7 +76,7 @@
   }
 }
 
-inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
+inline TORCH_API caffe2::DeviceOption DeviceToOption(
     const at::Device& device) {
   caffe2::DeviceOption option;
   auto type = device.type();
@@ -109,7 +109,7 @@
   return option;
 }
 
-inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) {
+inline TORCH_API at::Device OptionToDevice(const caffe2::DeviceOption option) {
   auto type = option.device_type();
   int32_t id = -1;
   switch (type) {
diff --git a/caffe2/queue/blobs_queue.h b/caffe2/queue/blobs_queue.h
index 5ad5c93..a60cc15 100644
--- a/caffe2/queue/blobs_queue.h
+++ b/caffe2/queue/blobs_queue.h
@@ -20,7 +20,7 @@
 // Containing blobs are owned by the workspace.
 // On read, we swap out the underlying data for the blob passed in for blobs
 
-class CAFFE2_API BlobsQueue : public std::enable_shared_from_this<BlobsQueue> {
+class TORCH_API BlobsQueue : public std::enable_shared_from_this<BlobsQueue> {
  public:
   BlobsQueue(
       Workspace* ws,
diff --git a/caffe2/serialize/file_adapter.h b/caffe2/serialize/file_adapter.h
index 416208e..ee68b79 100644
--- a/caffe2/serialize/file_adapter.h
+++ b/caffe2/serialize/file_adapter.h
@@ -10,7 +10,7 @@
 namespace caffe2 {
 namespace serialize {
 
-class CAFFE2_API FileAdapter final : public ReadAdapterInterface {
+class TORCH_API FileAdapter final : public ReadAdapterInterface {
  public:
   C10_DISABLE_COPY_AND_ASSIGN(FileAdapter);
   explicit FileAdapter(const std::string& file_name);
diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h
index ee7e971..a34a6db 100644
--- a/caffe2/serialize/inline_container.h
+++ b/caffe2/serialize/inline_container.h
@@ -152,7 +152,7 @@
 // handle an updated operator.
 constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L;
 
-class CAFFE2_API PyTorchStreamReader final {
+class TORCH_API PyTorchStreamReader final {
  public:
   explicit PyTorchStreamReader(const std::string& file_name);
   explicit PyTorchStreamReader(std::istream* in);
@@ -184,7 +184,7 @@
   int64_t version_;
 };
 
-class CAFFE2_API PyTorchStreamWriter final {
+class TORCH_API PyTorchStreamWriter final {
  public:
   explicit PyTorchStreamWriter(std::string archive_name);
   explicit PyTorchStreamWriter(
diff --git a/caffe2/serialize/istream_adapter.h b/caffe2/serialize/istream_adapter.h
index b7a0444..8960d55 100644
--- a/caffe2/serialize/istream_adapter.h
+++ b/caffe2/serialize/istream_adapter.h
@@ -9,7 +9,7 @@
 namespace serialize {
 
 // this is a reader implemented by std::istream
-class CAFFE2_API IStreamAdapter final : public ReadAdapterInterface {
+class TORCH_API IStreamAdapter final : public ReadAdapterInterface {
  public:
   C10_DISABLE_COPY_AND_ASSIGN(IStreamAdapter);
   explicit IStreamAdapter(std::istream* istream);
diff --git a/caffe2/serialize/read_adapter_interface.h b/caffe2/serialize/read_adapter_interface.h
index 556c005..0a6b5b7 100644
--- a/caffe2/serialize/read_adapter_interface.h
+++ b/caffe2/serialize/read_adapter_interface.h
@@ -11,7 +11,7 @@
 // this is the interface for the (file/stream/memory) reader in
 // PyTorchStreamReader. with this interface, we can extend the support
 // besides standard istream
-class CAFFE2_API ReadAdapterInterface {
+class TORCH_API ReadAdapterInterface {
  public:
   virtual size_t size() const = 0;
   virtual size_t read(uint64_t pos, void* buf, size_t n, const char* what = "")
diff --git a/caffe2/transforms/common_subexpression_elimination.h b/caffe2/transforms/common_subexpression_elimination.h
index fdec50a..6e54f81 100644
--- a/caffe2/transforms/common_subexpression_elimination.h
+++ b/caffe2/transforms/common_subexpression_elimination.h
@@ -25,7 +25,7 @@
  *
  * TODO(benz): Fix the error to not match nodes that write to external output.
  */
-class CAFFE2_API CommonSubexpressionEliminationTransform : public Transform {
+class TORCH_API CommonSubexpressionEliminationTransform : public Transform {
  public:
   CommonSubexpressionEliminationTransform() {
     SetPatternMatchType(SORTED_WRT_EXECUTION_ORDER);
diff --git a/caffe2/transforms/conv_to_nnpack_transform.h b/caffe2/transforms/conv_to_nnpack_transform.h
index 8563732..0e19989 100644
--- a/caffe2/transforms/conv_to_nnpack_transform.h
+++ b/caffe2/transforms/conv_to_nnpack_transform.h
@@ -7,7 +7,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API ConvToNNPackTransform : public SingleOpTransform {
+class TORCH_API ConvToNNPackTransform : public SingleOpTransform {
  protected:
   // Specify what the op needs to be to match the pattern.
   bool MatchOperator(const OperatorDef& op) override {
diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h
index 397258f..95638f4 100644
--- a/caffe2/transforms/pattern_net_transform.h
+++ b/caffe2/transforms/pattern_net_transform.h
@@ -15,7 +15,7 @@
  * and this Transform will find subgraphs which fit the pattern net,
  * and replace it with the replace net.
  */
-class CAFFE2_API PatternNetTransform : public Transform {
+class TORCH_API PatternNetTransform : public Transform {
  public:
   PatternNetTransform(const NetDef& pattern_net, const NetDef& replace_net)
       : p_(transform::Graph(pattern_net)), r_(transform::Graph(replace_net)) {
diff --git a/caffe2/transforms/single_op_transform.h b/caffe2/transforms/single_op_transform.h
index 45f93cb..096c064 100644
--- a/caffe2/transforms/single_op_transform.h
+++ b/caffe2/transforms/single_op_transform.h
@@ -15,7 +15,7 @@
  * Transforms which derive from SingleOpTransform need to override:
  * ReplaceOperator and MatchOperator.
  */
-class CAFFE2_API SingleOpTransform : public Transform {
+class TORCH_API SingleOpTransform : public Transform {
  protected:
   bool PatternRule(
       const transform::Graph& g,
diff --git a/caffe2/utils/bench_utils.h b/caffe2/utils/bench_utils.h
index b879ccc..59997ed 100644
--- a/caffe2/utils/bench_utils.h
+++ b/caffe2/utils/bench_utils.h
@@ -23,7 +23,7 @@
 
 namespace caffe2 {
 
-CAFFE2_API uint32_t wipe_cache();
+TORCH_API uint32_t wipe_cache();
 
 } // namespace caffe2
 
diff --git a/caffe2/utils/cpuid.cc b/caffe2/utils/cpuid.cc
index b2e6b89..7ef47dd 100644
--- a/caffe2/utils/cpuid.cc
+++ b/caffe2/utils/cpuid.cc
@@ -7,10 +7,10 @@
   return cpuid_singleton;
 }
 
-CAFFE2_API uint32_t CpuId::f1c_ = 0;
-CAFFE2_API uint32_t CpuId::f1d_ = 0;
-CAFFE2_API uint32_t CpuId::f7b_ = 0;
-CAFFE2_API uint32_t CpuId::f7c_ = 0;
+TORCH_API uint32_t CpuId::f1c_ = 0;
+TORCH_API uint32_t CpuId::f1d_ = 0;
+TORCH_API uint32_t CpuId::f7b_ = 0;
+TORCH_API uint32_t CpuId::f7c_ = 0;
 
 CpuId::CpuId() {
 #ifdef _MSC_VER
diff --git a/caffe2/utils/cpuid.h b/caffe2/utils/cpuid.h
index 7cc0900..598e1bd 100644
--- a/caffe2/utils/cpuid.h
+++ b/caffe2/utils/cpuid.h
@@ -12,7 +12,7 @@
 
 class CpuId;
 
-CAFFE2_API const CpuId& GetCpuId();
+TORCH_API const CpuId& GetCpuId();
 
 ///////////////////////////////////////////////////////////////////////////////
 // Implementation of CpuId that is borrowed from folly.
@@ -137,10 +137,10 @@
 #undef X
 
  private:
-  CAFFE2_API static uint32_t f1c_;
-  CAFFE2_API static uint32_t f1d_;
-  CAFFE2_API static uint32_t f7b_;
-  CAFFE2_API static uint32_t f7c_;
+  TORCH_API static uint32_t f1c_;
+  TORCH_API static uint32_t f1d_;
+  TORCH_API static uint32_t f7b_;
+  TORCH_API static uint32_t f7c_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index 4ad285d..07911a3 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -28,7 +28,7 @@
 
 // An empty class as a placeholder for a math function that has no specific
 // engine specified.
-class CAFFE2_API DefaultEngine {};
+class TORCH_API DefaultEngine {};
 
 namespace math {
 
@@ -118,7 +118,7 @@
 
 // Broadcasts X with X_dims to Y with Y_dims.
 template <typename T, class Context>
-CAFFE2_API void Broadcast(
+TORCH_API void Broadcast(
     const int X_ndim,
     const int* X_dims,
     const int Y_ndim,
@@ -130,7 +130,7 @@
 
 // Computes inv_std from variance.
 template <typename T, class Context>
-CAFFE2_API void InvStd(
+TORCH_API void InvStd(
     const int N,
     const T epsilon,
     const T* var,
@@ -140,7 +140,7 @@
 // Adds batch sub-tensors elementwise to output. Stripe is the stripe length
 // and N is the number of elements to add (size of Y).
 template <typename T, class Context>
-CAFFE2_API void AddStripedBatch(
+TORCH_API void AddStripedBatch(
     const int N,
     const T* first,
     T* y,
@@ -151,24 +151,24 @@
 // Compute the row-wise max of a N*D matrix X, and write it to a N
 // dimensional vector y.
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 RowwiseMax(const int N, const int D, const T* x, T* y, Context* context);
 
 // Compute the column-wise max of a N*D matrix X, and write it to a D
 // dimensional vector y.
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 ColwiseMax(const int N, const int D, const T* x, T* y, Context* context);
 
 // Elemwise maximum of vector x and scalar alpha. y[i] = max(x[i], alpha)
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 Maximum(const int N, const float alpha, const T* x, T* y, Context* context);
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
 template <typename T, class Context, class Engine = DefaultEngine>
-CAFFE2_API void Gemm(
+TORCH_API void Gemm(
     const CBLAS_TRANSPOSE trans_A,
     const CBLAS_TRANSPOSE trans_B,
     const int M,
@@ -185,7 +185,7 @@
 // We also provide a gemm that has explicit lda, ldb and ldc specified.
 // In most cases you probably want to use the function above, though.
 template <typename T, class Context, class Engine = DefaultEngine>
-CAFFE2_API void GemmEx(
+TORCH_API void GemmEx(
     const CBLAS_TRANSPOSE trans_A,
     const CBLAS_TRANSPOSE trans_B,
     const int M,
@@ -203,7 +203,7 @@
 
 // GemmBatched provides a simple abstraction into library routines
 template <typename T, class Context, class Engine = DefaultEngine>
-CAFFE2_API void GemmBatched(
+TORCH_API void GemmBatched(
     const CBLAS_TRANSPOSE trans_A,
     const CBLAS_TRANSPOSE trans_B,
     const int batch_size,
@@ -219,7 +219,7 @@
     TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
 
 template <typename T, class Context, class Engine = DefaultEngine>
-CAFFE2_API void GemmStridedBatched(
+TORCH_API void GemmStridedBatched(
     const CBLAS_TRANSPOSE trans_A,
     const CBLAS_TRANSPOSE trans_B,
     const int batch_size,
@@ -242,7 +242,7 @@
 // CblasNoTrans: x is an N dim vector and y is an M dim vector.
 // CblasTrans:   x is an M dim vector and y is an N dim vector.
 template <typename T, class Context, class Engine = DefaultEngine>
-CAFFE2_API void Gemv(
+TORCH_API void Gemv(
     const CBLAS_TRANSPOSE trans_A,
     const int M,
     const int N,
@@ -255,13 +255,13 @@
     TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
 
 // Generate n values that sum up to a fixed sum
 // and subject to a restriction a <= x <= b for each x generated
 template <typename T, class Context>
-CAFFE2_API void RandFixedSum(
+TORCH_API void RandFixedSum(
     const size_t n,
     const T a,
     const T b,
@@ -270,7 +270,7 @@
     Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void RandUniformUnique(
+TORCH_API void RandUniformUnique(
     const size_t n,
     const T a,
     const T b,
@@ -282,21 +282,21 @@
 // Generate n values from synthetic data distribution,
 // define by unique accesses and stack distances
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 RandSyntheticData(const size_t n, const T a, const T b, T* r, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 RandGaussian(const size_t n, const T mean, const T std, T* r, Context* context);
 
 // Dot matrix of vector a and b, and writes the result to a single value y.
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 Dot(const int N, const T* a, const T* b, T* y, Context* context);
 
 // Sum of vector x, and writes the result to a single value y.
 template <typename T, class Context>
-CAFFE2_API void Sum(
+TORCH_API void Sum(
     const int N,
     const T* x,
     T* y,
@@ -305,7 +305,7 @@
 
 // Sum of squares of vector x, and writes the result to a single value y.
 template <typename T, class Context>
-CAFFE2_API void SumSqr(
+TORCH_API void SumSqr(
     const int N,
     const T* x,
     T* y,
@@ -315,7 +315,7 @@
 // Select does index selection of the rows a N*D matrix x, and gives the N
 // dimensional vector y that contains the selected data.
 template <typename T, class Context>
-CAFFE2_API void Select(
+TORCH_API void Select(
     const int N,
     const int D,
     const T* x,
@@ -329,7 +329,7 @@
 // For NCHW order, groups doesn't make any difference because we're doing Im2Col
 // for each N and C is the slowest moving dimension among CHW.
 template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void Im2Col(
+TORCH_API void Im2Col(
     const int channels,
     const int height,
     const int width,
@@ -350,7 +350,7 @@
 
 // groups must be 1 for GPU
 template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void Im2ColNd(
+TORCH_API void Im2ColNd(
     const int N,
     const int img_size,
     const int col_size,
@@ -371,7 +371,7 @@
 // For NCHW order, groups doesn't make any difference because we're doing Im2Col
 // for each N and C is the slowest moving dimension among CHW.
 template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void Col2Im(
+TORCH_API void Col2Im(
     const int channels,
     const int height,
     const int width,
@@ -396,7 +396,7 @@
 // For NCHW order, groups doesn't make any difference because we're doing Im2Col
 // for each N and C is the slowest moving dimension among CHW.
 template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void Col2ImNd(
+TORCH_API void Col2ImNd(
     const int N,
     const int img_size,
     const int col_size,
@@ -414,7 +414,7 @@
 // Applies a per-channel bias value to each channel of the input
 // image. image_size is H * W
 template <typename T, class Context>
-CAFFE2_API void BiasCHW(
+TORCH_API void BiasCHW(
     const T* bias,
     const T* bias_multiplier,
     const int bias_channels,
@@ -423,7 +423,7 @@
     Context* context);
 
 template <class Context>
-CAFFE2_API void CopyMatrix(
+TORCH_API void CopyMatrix(
     const size_t item_size,
     const int M,
     const int N,
@@ -435,7 +435,7 @@
     TypeMeta::Copy copy = nullptr);
 
 template <typename T, class Context>
-CAFFE2_API void CopyMatrix(
+TORCH_API void CopyMatrix(
     const int M,
     const int N,
     const T* A,
@@ -445,7 +445,7 @@
     Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void CopyMatrix(
+TORCH_API void CopyMatrix(
     const int M,
     const int N,
     const T* A,
@@ -457,7 +457,7 @@
     Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void CopyVector(const int N, const T* A, T* B, Context* context);
+TORCH_API void CopyVector(const int N, const T* A, T* B, Context* context);
 
 } // namespace math
 } // namespace caffe2
diff --git a/caffe2/utils/math/broadcast.h b/caffe2/utils/math/broadcast.h
index 67e37d1..16b98c7 100644
--- a/caffe2/utils/math/broadcast.h
+++ b/caffe2/utils/math/broadcast.h
@@ -8,7 +8,7 @@
 namespace math {
 
 template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void AffineChannel(
+TORCH_API void AffineChannel(
     const int N,
     const int C,
     const int HxW,
diff --git a/caffe2/utils/math/elementwise.h b/caffe2/utils/math/elementwise.h
index 40b275f..7947093 100644
--- a/caffe2/utils/math/elementwise.h
+++ b/caffe2/utils/math/elementwise.h
@@ -8,69 +8,69 @@
 namespace math {
 
 template <typename T, class Context>
-CAFFE2_API void Exp(int N, const T* X, T* Y, Context* context);
+TORCH_API void Exp(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Log(int N, const T* X, T* Y, Context* context);
+TORCH_API void Log(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Log1p(int N, const T* X, T* Y, Context* context);
+TORCH_API void Log1p(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sin(int N, const T* X, T* Y, Context* context);
+TORCH_API void Sin(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Asin(int N, const T* X, T* Y, Context* context);
+TORCH_API void Asin(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Cos(int N, const T* X, T* Y, Context* context);
+TORCH_API void Cos(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Acos(int N, const T* X, T* Y, Context* context);
+TORCH_API void Acos(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Tan(int N, const T* X, T* Y, Context* context);
+TORCH_API void Tan(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Atan(int N, const T* X, T* Y, Context* context);
+TORCH_API void Atan(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sinh(int N, const T* X, T* Y, Context* context);
+TORCH_API void Sinh(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Cosh(int N, const T* X, T* Y, Context* context);
+TORCH_API void Cosh(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void SinCos(int N, const T* X, T* S, T* C, Context* context);
+TORCH_API void SinCos(int N, const T* X, T* S, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Tanh(int N, const T* X, T* Y, Context* context);
+TORCH_API void Tanh(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Abs(int N, const T* X, T* Y, Context* context);
+TORCH_API void Abs(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sqr(int N, const T* X, T* Y, Context* context);
+TORCH_API void Sqr(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sqrt(int N, const T* X, T* Y, Context* context);
+TORCH_API void Sqrt(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Rsqrt(int N, const T* X, T* Y, Context* context);
+TORCH_API void Rsqrt(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Cube(int N, const T* X, T* Y, Context* context);
+TORCH_API void Cube(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Cbrt(int N, const T* X, T* Y, Context* context);
+TORCH_API void Cbrt(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Neg(int N, const T* X, T* Y, Context* context);
+TORCH_API void Neg(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sign(int N, const T* X, T* Y, Context* context);
+TORCH_API void Sign(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Not(int N, const T* X, T* Y, Context* context);
+TORCH_API void Not(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Powx(int N, const T* A, const T b, T* Y, Context* context);
+TORCH_API void Powx(int N, const T* A, const T b, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Inv(int N, const T* X, T* Y, Context* context);
+TORCH_API void Inv(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
+TORCH_API void Erf(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
-CAFFE2_API void CdfNorm(int N, const T* X, T* Y, Context* context);
+TORCH_API void CdfNorm(int N, const T* X, T* Y, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void Set(std::int64_t N, T alpha, T* X, Context* context);
+TORCH_API void Set(std::int64_t N, T alpha, T* X, Context* context);
 
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void
+TORCH_API void
 Scale(std::int64_t N, TAlpha alpha, const TData* X, TData* Y, Context* context);
 
 // Different from the Scale function above, if alpha is passed in as a pointer,
 // we will assume that it lives on the Context device, for example on GPU.
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
+TORCH_API void Scale(
     std::int64_t N,
     const TAlpha* alpha,
     const TData* X,
@@ -78,58 +78,58 @@
     Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Add(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Sub(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Sub(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Mul(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Mul(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Div(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Div(int N, const T* A, const T* B, T* C, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void Min(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Min(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Max(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Max(int N, const T* A, const T* B, T* C, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void And(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void And(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Or(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Or(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void Xor(int N, const T* A, const T* B, T* C, Context* context);
+TORCH_API void Xor(int N, const T* A, const T* B, T* C, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 BitwiseAnd(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 BitwiseOr(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 BitwiseXor(int N, const T* A, const T* B, T* C, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void EQ(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void EQ(int N, const T* A, const T* B, bool* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void NE(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void NE(int N, const T* A, const T* B, bool* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void LT(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void LT(int N, const T* A, const T* B, bool* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void LE(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void LE(int N, const T* A, const T* B, bool* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void GT(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void GT(int N, const T* A, const T* B, bool* C, Context* context);
 template <typename T, class Context>
-CAFFE2_API void GE(int N, const T* A, const T* B, bool* C, Context* context);
+TORCH_API void GE(int N, const T* A, const T* B, bool* C, Context* context);
 
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void
+TORCH_API void
 Axpy(std::int64_t N, TAlpha alpha, const TData* X, TData* Y, Context* context);
 
 // Different from the Axpy function above, if alpha is passed in
 // as a pointer, we will assume that it lives on the Context device,
 // for example on GPU.
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Axpy(
+TORCH_API void Axpy(
     std::int64_t N,
     const TAlpha* alpha,
     const TData* X,
@@ -137,7 +137,7 @@
     Context* context);
 
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Axpby(
+TORCH_API void Axpby(
     std::int64_t N,
     TAlpha alpha,
     const TData* X,
@@ -146,7 +146,7 @@
     Context* context);
 
 template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Axpby(
+TORCH_API void Axpby(
     std::int64_t N,
     const TAlpha* alpha,
     const TData* X,
diff --git a/caffe2/utils/math/reduce.h b/caffe2/utils/math/reduce.h
index 7f8b835..52d056d 100644
--- a/caffe2/utils/math/reduce.h
+++ b/caffe2/utils/math/reduce.h
@@ -11,11 +11,11 @@
 namespace math {
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
 
 // In all of the reduce functions, X_dims and Y_dims should have ndim elements.
@@ -25,7 +25,7 @@
 
 // Y = alpha * ReduceMin(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceMin(
+TORCH_API void ReduceMin(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -36,7 +36,7 @@
 
 // Y = alpha * ReduceMax(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceMax(
+TORCH_API void ReduceMax(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -47,7 +47,7 @@
 
 // Y = alpha * ReduceSum(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceSum(
+TORCH_API void ReduceSum(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -58,7 +58,7 @@
 
 // Y = alpha * ReduceMean(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceMean(
+TORCH_API void ReduceMean(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -69,7 +69,7 @@
 
 // Y = alpha * ReduceL1(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceL1(
+TORCH_API void ReduceL1(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -80,7 +80,7 @@
 
 // Y = alpha * ReduceL2(X)
 template <typename T, class Context>
-CAFFE2_API void ReduceL2(
+TORCH_API void ReduceL2(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -91,7 +91,7 @@
 
 // Computes mean and variance over axes.
 template <typename T, class Context>
-CAFFE2_API void Moments(
+TORCH_API void Moments(
     const int ndims,
     const int* X_dims,
     const int* Y_dims,
diff --git a/caffe2/utils/math/transpose.h b/caffe2/utils/math/transpose.h
index a01caa2..ca3d7fd 100644
--- a/caffe2/utils/math/transpose.h
+++ b/caffe2/utils/math/transpose.h
@@ -9,7 +9,7 @@
 
 // Transpose tensor X with dims by axes and write the result to tensor Y.
 template <typename TIndex, typename TData, class Context>
-CAFFE2_API void Transpose(
+TORCH_API void Transpose(
     int ndim,
     const TIndex* dims,
     const int* axes,
@@ -18,11 +18,11 @@
     Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, Context* context);
 
 template <typename T, class Context>
-CAFFE2_API void
+TORCH_API void
 NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, Context* context);
 
 } // namespace math
diff --git a/caffe2/utils/math/utils.h b/caffe2/utils/math/utils.h
index 473db41..88b9f7c 100644
--- a/caffe2/utils/math/utils.h
+++ b/caffe2/utils/math/utils.h
@@ -61,35 +61,35 @@
 
 // Increase the index digits by one based on dims.
 template <typename TIndex>
-CAFFE2_API void
+TORCH_API void
 IncreaseIndexInDims(int ndim, const TIndex* dims, TIndex* index);
 
 // Get index value from dims and index digits.
 template <typename TIndex>
-CAFFE2_API TIndex
+TORCH_API TIndex
 GetIndexFromDims(const int n, const TIndex* dims, const TIndex* index);
 
 // Checks if the input permutation is an identity permutation;
-CAFFE2_API bool IsIdentityPermutation(const int n, const int* perm);
+TORCH_API bool IsIdentityPermutation(const int n, const int* perm);
 
-CAFFE2_API bool
+TORCH_API bool
 CheckReduceDims(const int ndim, const int* X_dims, const int* Y_dims);
 
-CAFFE2_API bool IsRowwiseReduce(
+TORCH_API bool IsRowwiseReduce(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
     int* rows,
     int* cols);
 
-CAFFE2_API bool IsColwiseReduce(
+TORCH_API bool IsColwiseReduce(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
     int* rows,
     int* cols);
 
-CAFFE2_API bool IsBothEndsReduce(
+TORCH_API bool IsBothEndsReduce(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
@@ -99,7 +99,7 @@
 
 // Computest the broadcast binary operation dims.
 template <typename TIndex>
-CAFFE2_API void ComputeBroadcastBinaryOpDims(
+TORCH_API void ComputeBroadcastBinaryOpDims(
     const int A_ndim,
     const TIndex* A_dims,
     const int B_ndim,
@@ -108,7 +108,7 @@
     TIndex* B_broadcast_dims,
     TIndex* C_broadcast_dims);
 
-CAFFE2_API bool IsRowwiseBroadcastBinaryOp(
+TORCH_API bool IsRowwiseBroadcastBinaryOp(
     const int ndim,
     const int* A_dims,
     const int* B_dims,
@@ -116,7 +116,7 @@
     int* cols,
     bool* broadcast_1st);
 
-CAFFE2_API bool IsColwiseBroadcastBinaryOp(
+TORCH_API bool IsColwiseBroadcastBinaryOp(
     const int ndim,
     const int* A_dims,
     const int* B_dims,
@@ -124,7 +124,7 @@
     int* cols,
     bool* broadcast_1st);
 
-CAFFE2_API bool IsBothEndsBroadcastBinaryOp(
+TORCH_API bool IsBothEndsBroadcastBinaryOp(
     const int ndim,
     const int* A_dims,
     const int* B_dims,
@@ -133,19 +133,19 @@
     int* nxt,
     bool* broadcast_1st);
 
-CAFFE2_API bool IsBatchTranspose2D(const int ndim, const int* axes);
+TORCH_API bool IsBatchTranspose2D(const int ndim, const int* axes);
 
-CAFFE2_API void ComputeTransposeAxesForReduceOp(
+TORCH_API void ComputeTransposeAxesForReduceOp(
     const int num_dims,
     const int num_reduce_axes,
     const int* reduce_axes,
     int* transpose_axes);
 
-CAFFE2_API void
+TORCH_API void
 ComputeTransposeAxesForReduceOp(const int ndim, const int* dims, int* axes);
 
 template <typename TIndex>
-CAFFE2_API void ComputeTransposedStrides(
+TORCH_API void ComputeTransposedStrides(
     int ndim,
     const TIndex* dims,
     const int* axes,
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
index 3502326..47f6c25 100644
--- a/caffe2/utils/proto_utils.h
+++ b/caffe2/utils/proto_utils.h
@@ -23,27 +23,27 @@
 // Note that we can't use DeviceType_Name, because that is only available in
 // protobuf-full, and some platforms (like mobile) may want to use
 // protobuf-lite instead.
-CAFFE2_API std::string DeviceTypeName(const int32_t& d);
+TORCH_API std::string DeviceTypeName(const int32_t& d);
 
-CAFFE2_API int DeviceId(const DeviceOption& option);
+TORCH_API int DeviceId(const DeviceOption& option);
 
 // Returns if the two DeviceOptions are pointing to the same device.
-CAFFE2_API bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);
+TORCH_API bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);
 
-CAFFE2_API bool IsCPUDeviceType(int device_type);
-CAFFE2_API bool IsGPUDeviceType(int device_type);
+TORCH_API bool IsCPUDeviceType(int device_type);
+TORCH_API bool IsGPUDeviceType(int device_type);
 
 // Common interfaces that reads file contents into a string.
-CAFFE2_API bool ReadStringFromFile(const char* filename, string* str);
-CAFFE2_API bool WriteStringToFile(const string& str, const char* filename);
+TORCH_API bool ReadStringFromFile(const char* filename, string* str);
+TORCH_API bool WriteStringToFile(const string& str, const char* filename);
 
 // Common interfaces that are supported by both lite and full protobuf.
-CAFFE2_API bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
+TORCH_API bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
 inline bool ReadProtoFromBinaryFile(const string filename, MessageLite* proto) {
   return ReadProtoFromBinaryFile(filename.c_str(), proto);
 }
 
-CAFFE2_API void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
+TORCH_API void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
 inline void WriteProtoToBinaryFile(const MessageLite& proto,
                                    const string& filename) {
   return WriteProtoToBinaryFile(proto, filename.c_str());
@@ -60,9 +60,9 @@
 } // namespace TextFormat
 
 
-CAFFE2_API string ProtoDebugString(const MessageLite& proto);
+TORCH_API string ProtoDebugString(const MessageLite& proto);
 
-CAFFE2_API bool ParseProtoFromLargeString(const string& str, MessageLite* proto);
+TORCH_API bool ParseProtoFromLargeString(const string& str, MessageLite* proto);
 
 // Text format MessageLite wrappers: these functions do nothing but just
 // allowing things to compile. It will produce a runtime error if you are using
@@ -105,19 +105,19 @@
 using ::google::protobuf::Message;
 
 namespace TextFormat {
-CAFFE2_API bool ParseFromString(const string& spec, Message* proto);
+TORCH_API bool ParseFromString(const string& spec, Message* proto);
 } // namespace TextFormat
 
-CAFFE2_API string ProtoDebugString(const Message& proto);
+TORCH_API string ProtoDebugString(const Message& proto);
 
-CAFFE2_API bool ParseProtoFromLargeString(const string& str, Message* proto);
+TORCH_API bool ParseProtoFromLargeString(const string& str, Message* proto);
 
-CAFFE2_API bool ReadProtoFromTextFile(const char* filename, Message* proto);
+TORCH_API bool ReadProtoFromTextFile(const char* filename, Message* proto);
 inline bool ReadProtoFromTextFile(const string filename, Message* proto) {
   return ReadProtoFromTextFile(filename.c_str(), proto);
 }
 
-CAFFE2_API void WriteProtoToTextFile(const Message& proto, const char* filename, bool throwIfError = true);
+TORCH_API void WriteProtoToTextFile(const Message& proto, const char* filename, bool throwIfError = true);
 inline void WriteProtoToTextFile(const Message& proto, const string& filename, bool throwIfError = true) {
   return WriteProtoToTextFile(proto, filename.c_str(), throwIfError);
 }
@@ -189,8 +189,8 @@
       engine);
 }
 
-CAFFE2_API bool HasOutput(const OperatorDef& op, const std::string& output);
-CAFFE2_API bool HasInput(const OperatorDef& op, const std::string& input);
+TORCH_API bool HasOutput(const OperatorDef& op, const std::string& output);
+TORCH_API bool HasInput(const OperatorDef& op, const std::string& input);
 
 /**
  * @brief A helper class to index into arguments.
@@ -299,36 +299,36 @@
 
 // Helper methods to get an argument from OperatorDef or NetDef given argument
 // name. Throws if argument does not exist.
-CAFFE2_API const Argument& GetArgument(const OperatorDef& def, const string& name);
-CAFFE2_API const Argument& GetArgument(const NetDef& def, const string& name);
+TORCH_API const Argument& GetArgument(const OperatorDef& def, const string& name);
+TORCH_API const Argument& GetArgument(const NetDef& def, const string& name);
 // Helper methods to get an argument from OperatorDef or NetDef given argument
 // name. Returns nullptr if argument does not exist.
-CAFFE2_API const Argument* GetArgumentPtr(const OperatorDef& def, const string& name);
-CAFFE2_API const Argument* GetArgumentPtr(const NetDef& def, const string& name);
+TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, const string& name);
+TORCH_API const Argument* GetArgumentPtr(const NetDef& def, const string& name);
 
 // Helper methods to query a boolean argument flag from OperatorDef or NetDef
 // given argument name. If argument does not exist, return default value.
 // Throws if argument exists but the type is not boolean.
-CAFFE2_API bool GetFlagArgument(
+TORCH_API bool GetFlagArgument(
     const OperatorDef& def,
     const string& name,
     bool default_value = false);
-CAFFE2_API bool GetFlagArgument(
+TORCH_API bool GetFlagArgument(
     const NetDef& def,
     const string& name,
     bool default_value = false);
 
-CAFFE2_API Argument* GetMutableArgument(
+TORCH_API Argument* GetMutableArgument(
     const string& name,
     const bool create_if_missing,
     OperatorDef* def);
-CAFFE2_API Argument* GetMutableArgument(
+TORCH_API Argument* GetMutableArgument(
     const string& name,
     const bool create_if_missing,
     NetDef* def);
 
 template <typename T>
-CAFFE2_API Argument MakeArgument(const string& name, const T& value);
+TORCH_API Argument MakeArgument(const string& name, const T& value);
 
 template <typename T, typename Def>
 inline void AddArgument(const string& name, const T& value, Def* def) {
@@ -347,7 +347,7 @@
 // - Going through list of ops in order, all op inputs must be outputs
 // from other ops, or registered as external inputs.
 // - All external outputs must be outputs of some operators.
-CAFFE2_API void cleanupExternalInputsAndOutputs(NetDef* net);
+TORCH_API void cleanupExternalInputsAndOutputs(NetDef* net);
 
 } // namespace caffe2
 
diff --git a/caffe2/utils/proto_wrap.cc b/caffe2/utils/proto_wrap.cc
index eb06524..6899a5d 100644
--- a/caffe2/utils/proto_wrap.cc
+++ b/caffe2/utils/proto_wrap.cc
@@ -9,7 +9,7 @@
 // ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
@@ -20,7 +20,7 @@
 // Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
@@ -35,7 +35,7 @@
 // Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function
 // used to avoid duplicated global variable in the case when protobuf
 // is built with hidden visibility.
-CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() {
+TORCH_API const ::std::string& GetEmptyStringAlreadyInited() {
   return ::google::protobuf::internal::GetEmptyStringAlreadyInited();
 }
 
diff --git a/caffe2/utils/proto_wrap.h b/caffe2/utils/proto_wrap.h
index 92cb2b4..bcbce663 100644
--- a/caffe2/utils/proto_wrap.h
+++ b/caffe2/utils/proto_wrap.h
@@ -7,7 +7,7 @@
 
 // A wrapper function to shut down protobuf library (this is needed in ASAN
 // testing and valgrind cases to avoid protobuf appearing to "leak" memory).
-CAFFE2_API void ShutdownProtobufLibrary();
+TORCH_API void ShutdownProtobufLibrary();
 
 } // namespace caffe2
 
diff --git a/caffe2/utils/signal_handler.h b/caffe2/utils/signal_handler.h
index c773bdd..9e0bc2a 100644
--- a/caffe2/utils/signal_handler.h
+++ b/caffe2/utils/signal_handler.h
@@ -11,7 +11,7 @@
 
 namespace caffe2 {
 
-class CAFFE2_API SignalHandler {
+class TORCH_API SignalHandler {
  public:
   enum class Action {
     NONE,
@@ -38,8 +38,8 @@
 // This works by setting up certain fatal signal handlers. Previous fatal
 // signal handlers will still be called when the signal is raised. Defaults
 // to being off.
-CAFFE2_API void setPrintStackTracesOnFatalSignal(bool print);
-CAFFE2_API bool printStackTracesOnFatalSignal();
+TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
+TORCH_API bool printStackTracesOnFatalSignal();
 #endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
 
 }  // namespace caffe2
diff --git a/caffe2/utils/smart_tensor_printer.h b/caffe2/utils/smart_tensor_printer.h
index 48e1e47..e6d96ef 100644
--- a/caffe2/utils/smart_tensor_printer.h
+++ b/caffe2/utils/smart_tensor_printer.h
@@ -8,7 +8,7 @@
 // explicit specify the type of the tensor while calling the Print() method.
 // It also supports a convenience function with a default constructed printer as
 // a static method.
-class CAFFE2_API SmartTensorPrinter {
+class TORCH_API SmartTensorPrinter {
  public:
   // The proliferation of constructors is to give the feature parity with
   // TensorPrinter
diff --git a/caffe2/utils/string_utils.h b/caffe2/utils/string_utils.h
index bd13b72..e959a46 100644
--- a/caffe2/utils/string_utils.h
+++ b/caffe2/utils/string_utils.h
@@ -9,17 +9,17 @@
 
 namespace caffe2 {
 
-CAFFE2_API std::vector<std::string>
+TORCH_API std::vector<std::string>
 split(char separator, const std::string& string, bool ignore_empty = false);
 
-CAFFE2_API std::string trim(const std::string& str);
+TORCH_API std::string trim(const std::string& str);
 
-CAFFE2_API size_t editDistance(
+TORCH_API size_t editDistance(
     const std::string& s1,
     const std::string& s2,
     size_t max_distance = 0);
 
-CAFFE2_API inline bool StartsWith(
+TORCH_API inline bool StartsWith(
     const std::string& str,
     const std::string& prefix) {
   return str.length() >= prefix.length() &&
@@ -27,7 +27,7 @@
       prefix.end();
 }
 
-CAFFE2_API inline bool EndsWith(
+TORCH_API inline bool EndsWith(
     const std::string& full,
     const std::string& ending) {
   if (full.length() >= ending.length()) {
@@ -39,7 +39,7 @@
   }
 }
 
-CAFFE2_API int32_t editDistanceHelper(
+TORCH_API int32_t editDistanceHelper(
     const char* s1,
     size_t s1_len,
     const char* s2,
diff --git a/caffe2/utils/threadpool/ThreadPool.h b/caffe2/utils/threadpool/ThreadPool.h
index 5165764..951b8f7 100644
--- a/caffe2/utils/threadpool/ThreadPool.h
+++ b/caffe2/utils/threadpool/ThreadPool.h
@@ -29,8 +29,8 @@
 // misaligned intrinsics, no SSE instructions shall be involved in
 // the ThreadPool implementation.
 // Note: alignas is disabled because some compilers do not deal with
-// CAFFE2_API and alignas annotations at the same time.
-class CAFFE2_API /*alignas(kCacheLineSize)*/ ThreadPool {
+// TORCH_API and alignas annotations at the same time.
+class TORCH_API /*alignas(kCacheLineSize)*/ ThreadPool {
  public:
   static std::unique_ptr<ThreadPool> defaultThreadPool();
   ThreadPool(int numThreads);
diff --git a/caffe2/video/video_decoder.h b/caffe2/video/video_decoder.h
index 5286d52..a091142 100644
--- a/caffe2/video/video_decoder.h
+++ b/caffe2/video/video_decoder.h
@@ -477,11 +477,11 @@
       Callback& callback);
 };
 
-CAFFE2_API void FreeDecodedData(
+TORCH_API void FreeDecodedData(
     std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames,
     std::vector<std::unique_ptr<DecodedAudio>>& sampledAudio);
 
-CAFFE2_API bool DecodeMultipleClipsFromVideo(
+TORCH_API bool DecodeMultipleClipsFromVideo(
     const char* video_buffer,
     const std::string& video_filename,
     const int encoded_size,
diff --git a/caffe2/video/video_io.h b/caffe2/video/video_io.h
index a25e87e..beefd7b 100644
--- a/caffe2/video/video_io.h
+++ b/caffe2/video/video_io.h
@@ -12,7 +12,7 @@
 
 namespace caffe2 {
 
-CAFFE2_API void ClipTransformRGB(
+TORCH_API void ClipTransformRGB(
     const unsigned char* buffer_rgb,
     const int crop_size,
     const int length_rgb,
@@ -27,7 +27,7 @@
     const std::vector<float>& inv_std_rgb,
     float* transformed_clip);
 
-CAFFE2_API void ClipTransformOpticalFlow(
+TORCH_API void ClipTransformOpticalFlow(
     const unsigned char* buffer_rgb,
     const int crop_size,
     const int length_of,
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
index 9a4ad35..d8a2c27 100644
--- a/cmake/ProtoBuf.cmake
+++ b/cmake/ProtoBuf.cmake
@@ -39,7 +39,7 @@
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
   if(MSVC)
-    foreach(flag_var 
+    foreach(flag_var
         CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
         CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
       if(${flag_var} MATCHES "/Z[iI7]")
@@ -172,8 +172,8 @@
     list(APPEND ${hdrs_var} "${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h")
     list(APPEND ${python_var} "${CMAKE_CURRENT_BINARY_DIR}/${fil_we}_pb2.py")
 
-    # Add CAFFE2_API prefix to protobuf classes and methods in all cases
-    set(DLLEXPORT_STR "dllexport_decl=CAFFE2_API:")
+    # Add TORCH_API prefix to protobuf classes and methods in all cases
+    set(DLLEXPORT_STR "dllexport_decl=TORCH_API:")
 
     # Note: the following depends on PROTOBUF_PROTOC_EXECUTABLE. This
     # is done to make sure protoc is built before attempting to
diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
index 0507106..37f4ea7 100644
--- a/tools/codegen/gen.py
+++ b/tools/codegen/gen.py
@@ -599,9 +599,9 @@
         sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=f.manual_cpp_binding)
 
         if self.target is Target.DECLARATION:
-            result = f"CAFFE2_API {sig_group.signature.decl()};\n"
+            result = f"TORCH_API {sig_group.signature.decl()};\n"
             if sig_group.faithful_signature is not None:
-                result += f"CAFFE2_API {sig_group.faithful_signature.decl()};\n"
+                result += f"TORCH_API {sig_group.faithful_signature.decl()};\n"
             return result
 
         assert self.target is Target.DEFINITION
@@ -714,7 +714,7 @@
                 continue
             seen.add(n)
             rs.append(f"""\
-struct CAFFE2_API structured_{n} : public at::meta::{meta_name} {{
+struct TORCH_API structured_{n} : public at::meta::{meta_name} {{
     void impl({', '.join(a.decl() for a in out_args)});
 }};
 """)
@@ -740,7 +740,7 @@
                     args_str = ', '.join(a.defn() for a in args)
                 else:
                     args_str = ', '.join(a.decl() for a in args)
-                rs.append(f"CAFFE2_API {returns_type} {n}({args_str});")
+                rs.append(f"TORCH_API {returns_type} {n}({args_str});")
 
         return rs
 
@@ -760,7 +760,7 @@
             seen.add(n)
             returns_type = native.returns_type(f.func.returns)
             args = native.arguments(f.func)
-            rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(a.decl() for a in args)});")
+            rs.append(f"TORCH_API {returns_type} {n}({', '.join(a.decl() for a in args)});")
 
         return rs
 
@@ -774,7 +774,7 @@
         if parent_class is None:
             parent_class = "at::impl::MetaBase"
         return f"""\
-struct CAFFE2_API {name} : public {parent_class} {{
+struct TORCH_API {name} : public {parent_class} {{
     void meta({args_str});
 }};
 """
diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h
index 7f44db0..44ae3b3 100644
--- a/torch/csrc/WindowsTorchApiMacro.h
+++ b/torch/csrc/WindowsTorchApiMacro.h
@@ -2,12 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-// There's no difference between aten, torch and caffe2 libs any more
-// TODO: clean up the naming for consistency
-#define TORCH_API CAFFE2_API
-
 #ifdef _WIN32
 #define TORCH_PYTHON_API
 #else
-#define TORCH_PYTHON_API CAFFE2_API
+#define TORCH_PYTHON_API TORCH_API
 #endif
diff --git a/torch/csrc/jit/frontend/error_report.h b/torch/csrc/jit/frontend/error_report.h
index 1d7f4fc..a07f5e4 100644
--- a/torch/csrc/jit/frontend/error_report.h
+++ b/torch/csrc/jit/frontend/error_report.h
@@ -11,7 +11,7 @@
   SourceRange caller_range;
 };
 
-struct CAFFE2_API ErrorReport : public std::exception {
+struct TORCH_API ErrorReport : public std::exception {
   ErrorReport(const ErrorReport& e);
 
   explicit ErrorReport(SourceRange r);
@@ -20,7 +20,7 @@
 
   const char* what() const noexcept override;
 
-  struct CAFFE2_API CallStack {
+  struct TORCH_API CallStack {
     // These functions are used to report why a function was being compiled
     // (i.e. what was the call stack of user functions at compilation time that
     // led to this error)
diff --git a/torch/csrc/jit/frontend/function_schema_parser.h b/torch/csrc/jit/frontend/function_schema_parser.h
index e4fcf1e..bdfaec6 100644
--- a/torch/csrc/jit/frontend/function_schema_parser.h
+++ b/torch/csrc/jit/frontend/function_schema_parser.h
@@ -8,10 +8,10 @@
 namespace torch {
 namespace jit {
 
-CAFFE2_API c10::either<c10::OperatorName, c10::FunctionSchema> parseSchemaOrName(
+TORCH_API c10::either<c10::OperatorName, c10::FunctionSchema> parseSchemaOrName(
     const std::string& schemaOrName);
-CAFFE2_API c10::FunctionSchema parseSchema(const std::string& schema);
-CAFFE2_API c10::OperatorName parseName(const std::string& name);
+TORCH_API c10::FunctionSchema parseSchema(const std::string& schema);
+TORCH_API c10::OperatorName parseName(const std::string& name);
 
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/frontend/lexer.h b/torch/csrc/jit/frontend/lexer.h
index 4e8cc42..34b92a9 100644
--- a/torch/csrc/jit/frontend/lexer.h
+++ b/torch/csrc/jit/frontend/lexer.h
@@ -125,8 +125,8 @@
 #undef DEFINE_TOKEN
 };
 
-CAFFE2_API std::string kindToString(int kind);
-CAFFE2_API int stringToKind(const std::string& str);
+TORCH_API std::string kindToString(int kind);
+TORCH_API int stringToKind(const std::string& str);
 
 // nested hash tables that indicate char-by-char what is a valid token.
 struct TokenTrie;
@@ -159,7 +159,7 @@
 
 // stuff that is shared against all TC lexers/parsers and is initialized only
 // once.
-struct CAFFE2_API SharedParserData {
+struct TORCH_API SharedParserData {
   SharedParserData() : head(new TokenTrie()) {
     std::stringstream ss;
     for (const char* c = valid_single_char_tokens; *c; c++) {
@@ -363,7 +363,7 @@
   TokenTrieRef head;
 };
 
-CAFFE2_API SharedParserData& sharedParserData();
+TORCH_API SharedParserData& sharedParserData();
 
 struct Token {
   int kind;
diff --git a/torch/csrc/jit/frontend/schema_type_parser.h b/torch/csrc/jit/frontend/schema_type_parser.h
index fe6089d..1778247 100644
--- a/torch/csrc/jit/frontend/schema_type_parser.h
+++ b/torch/csrc/jit/frontend/schema_type_parser.h
@@ -10,7 +10,7 @@
 
 using TypePtr = c10::TypePtr;
 
-struct CAFFE2_API SchemaTypeParser {
+struct TORCH_API SchemaTypeParser {
   TypePtr parseBaseType();
   c10::optional<c10::AliasInfo> parseAliasAnnotation();
   std::pair<TypePtr, c10::optional<c10::AliasInfo>> parseType();
diff --git a/torch/csrc/jit/frontend/source_range.h b/torch/csrc/jit/frontend/source_range.h
index 24b5ce6..36772807 100644
--- a/torch/csrc/jit/frontend/source_range.h
+++ b/torch/csrc/jit/frontend/source_range.h
@@ -106,7 +106,7 @@
 
 // A SourceRange is a view into a Source, that points to a subset of the source,
 // specified by `start` and `end` byte offsets into the source text.
-struct CAFFE2_API SourceRange {
+struct TORCH_API SourceRange {
   SourceRange(std::shared_ptr<Source> source_, size_t start_, size_t end_)
       : source_(std::move(source_)), start_(start_), end_(end_) {}
   SourceRange() : source_(nullptr), start_(0), end_(0) {}
diff --git a/torch/csrc/jit/frontend/strtod.h b/torch/csrc/jit/frontend/strtod.h
index f257a36..c333ed0 100644
--- a/torch/csrc/jit/frontend/strtod.h
+++ b/torch/csrc/jit/frontend/strtod.h
@@ -5,8 +5,8 @@
 namespace torch {
 namespace jit {
 
-CAFFE2_API double strtod_c(const char* nptr, char** endptr);
-CAFFE2_API float strtof_c(const char* nptr, char** endptr);
+TORCH_API double strtod_c(const char* nptr, char** endptr);
+TORCH_API float strtof_c(const char* nptr, char** endptr);
 
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h
index 279d41c..120a3ff 100644
--- a/torch/csrc/jit/runtime/interpreter.h
+++ b/torch/csrc/jit/runtime/interpreter.h
@@ -12,7 +12,7 @@
 
 namespace at {
 class Tensor;
-CAFFE2_API void launch(std::function<void()> func);
+TORCH_API void launch(std::function<void()> func);
 } // namespace at
 namespace c10 {
 struct IValue;
diff --git a/torch/library.h b/torch/library.h
index ac936d2..d86c1af 100644
--- a/torch/library.h
+++ b/torch/library.h
@@ -81,7 +81,7 @@
 ///
 /// This class erases the type of the passed in function, but durably records
 /// the type via an inferred schema for the function.
-class CAFFE2_API CppFunction final {
+class TORCH_API CppFunction final {
   // TODO: This is morally the same thing as KernelRegistrationConfig, but it's
   // opaque to the user.
 
@@ -367,7 +367,7 @@
 /// }
 /// ```
 ///
-class CAFFE2_API Library final {
+class TORCH_API Library final {
 public:
   /// \private
   ///