Remove more caffe2 files (#127511) Remove more caffe2 files. Pull Request resolved: https://github.com/pytorch/pytorch/pull/127511 Approved by: https://github.com/r-barnes

commit: a6bae1f6db3bb86c521dd3c2417f42b8f5e8d705 [log] [tgz]
author: cyy <cyyever@outlook.com> Fri May 31 11:26:24 2024 +0000
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Fri May 31 11:26:27 2024 +0000
tree: 1424b9fd43a2f2fe54edf39f4dc2d47be356945d
parent: df0c69f32d269f8cdc136c9c65d791b6b86ef5e3 [diff]
diff --git a/BUILD.bazel b/BUILD.bazel
index ecbeaab..7a2c3a5 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel

@@ -488,10 +488,7 @@
 filegroup(
     name = "caffe2_utils_srcs",
     srcs = [
-        "caffe2/utils/bench_utils.cc",
         "caffe2/utils/cpuid.cc",
-        "caffe2/utils/murmur_hash3.cc",
-        "caffe2/utils/proto_utils.cc",
         "caffe2/utils/proto_wrap.cc",
         "caffe2/utils/string_utils.cc",
         "caffe2/utils/threadpool/ThreadPool.cc",
@@ -544,7 +541,6 @@
         ],
     ) + if_cuda(glob([
         "caffe2/**/*.cuh",
-        "caffe2/image/*.h",
     ])),
     copts = CAFFE2_COPTS,
     visibility = ["//visibility:public"],

diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
deleted file mode 100644
index 5823280..0000000
--- a/caffe2/core/blob.h
+++ /dev/null

@@ -1,130 +0,0 @@
-#ifndef CAFFE2_CORE_BLOB_H_
-#define CAFFE2_CORE_BLOB_H_
-
-#include <cstddef>
-#include <sstream>
-#include <typeinfo>
-#include <type_traits>
-#include <vector>
-#include "caffe2/core/common.h"
-
-#include <ATen/core/blob.h>
-#include <c10/util/typeid.h>
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/tensor_int8.h"
-
-namespace caffe2 {
-
-inline bool BlobIsInt8TensorCPUType(const Blob& blob) {
-  return blob.meta().Match<int8::Int8TensorCPU>();
-}
-
-inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) {
-  bool is_match = blob.meta().Match<Tensor>();
-  if (!is_match) {
-    return false;
-  }
-  const Tensor* tensor = &blob.Get<Tensor>();
-  return tensor && *tensor && tensor->GetDeviceType() == device_type;
-}
-
-inline Tensor* BlobSetTensor(Blob* blob, Tensor&& tensor) {
-  return blob->Reset<Tensor>(new Tensor(std::move(tensor)));
-}
-
-inline Tensor GetSizedTensorWithOptions(
-    Tensor&& previous_tensor,
-    at::IntArrayRef dims,
-    at::TensorOptions options) {
-  Tensor tensor = std::move(previous_tensor);
-  if (!tensor.defined()) {
-    return caffe2::empty(dims, options);
-  }
-  if (tensor.GetDevice() == options.device() ||
-      (!tensor.GetDevice().has_index() &&
-       tensor.GetDeviceType() == options.device().type())) {
-    if (tensor.sizes() != dims) {
-      // Resize when the dims doesn't match
-      tensor.Resize(dims);
-    }
-    if (tensor.dtype() == options.dtype()) {
-      tensor.raw_mutable_data();
-    } else {
-      // create a new Tensor when the data_type doesn't match
-      return caffe2::empty(dims, options);
-    }
-    return tensor;
-  }
-  return caffe2::empty(dims, options);
-}
-
-// need to keep both functions that returns Tensor* and the one
-// returns Tensor for clangr codemod
-inline Tensor*
-BlobGetMutableTensor(Blob* blob, at::IntArrayRef dims, at::TensorOptions options) {
-  if (blob->IsType<Tensor>()) {
-    Tensor* tensor = blob->GetMutable<Tensor>();
-    if (*tensor) {
-      // We only compare device_type if the index is not set since there are Tensors
-      // TODO: remove the extra check when all the Tensors are properly initialized
-      const auto tensorDevice = tensor->GetDevice();
-      if (tensorDevice == options.device() || (!tensorDevice.has_index() && tensor->GetDeviceType() == options.device().type())) {
-        if (tensor->sizes() != dims) {
-          // Resize when the dims doesn't match
-          tensor->Resize(dims);
-        }
-        tensor->raw_mutable_data(options.dtype());
-        return tensor;
-      }
-      // create a new Tensor when device doesn't match
-    }
-  }
-
-  VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
-          << " dims: " << dims;
-  // << " options: " << options; (operator<< for Options is in at:: now)
-  return BlobSetTensor(blob, caffe2::empty(dims, options));
-}
-
-inline Tensor
-XBlobGetMutableTensor(Blob* blob, at::IntArrayRef dims, at::TensorOptions options) {
-  return BlobGetMutableTensor(blob, dims, options)->UnsafeSharedInstance();
-}
-
-inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) {
-  if (blob->IsType<Tensor>()) {
-    Tensor* tensor = blob->GetMutable<Tensor>();
-    if (*tensor && tensor->GetDeviceType() == device_type) {
-      return tensor;
-    }
-  }
-
-  // if we're here, then either Blob didn't hold a Tensor
-  // or that Tensor had the wrong DeviceType.
-  VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
-          << " DeviceType:" << device_type;
-
-  return BlobSetTensor(blob, Tensor(device_type));
-}
-
-inline const Tensor& BlobGetTensor(const Blob& blob, DeviceType device_type) {
-  if (blob.IsType<Tensor>()) {
-    const auto& tensor = blob.Get<Tensor>();
-    if (tensor.GetDeviceType() == device_type) {
-      return tensor;
-    }
-  }
-  CAFFE_THROW("Blob didn't contain a Tensor or the device_type doesn't match");
-}
-
-inline Tensor BlobGetTensorOrUndefined(const Blob& blob) {
-  if (blob.IsType<Tensor>()) {
-    return blob.Get<Tensor>().UnsafeSharedInstance();
-  } else {
-    return Tensor();
-  }
-}
-
-}  // namespace caffe2
-#endif  // CAFFE2_CORE_BLOB_H_

diff --git a/caffe2/core/blob_serialization_gpu.cc b/caffe2/core/blob_serialization_gpu.cc
deleted file mode 100644
index 4d67535..0000000
--- a/caffe2/core/blob_serialization_gpu.cc
+++ /dev/null

@@ -1,10 +0,0 @@
-#include "caffe2/core/blob.h"
-#include "caffe2/core/blob_serialization.h"
-#include "caffe2/core/context_gpu.h"
-
-namespace caffe2 {
-
-namespace {
-REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
-}
-}  // namespace caffe2

diff --git a/caffe2/core/common_cudnn.cc b/caffe2/core/common_cudnn.cc
deleted file mode 100644
index f818654..0000000
--- a/caffe2/core/common_cudnn.cc
+++ /dev/null

@@ -1,26 +0,0 @@
-#include "caffe2/core/common_cudnn.h"
-#include "caffe2/core/cudnn_wrappers.h"
-
-#include "caffe2/core/init.h"
-
-namespace caffe2 {
-
-CuDNNWrapper::PerGPUCuDNNStates& CuDNNWrapper::cudnn_states() {
-  // New it (never delete) to avoid calling the destructors on process
-  // exit and racing against the CUDA shutdown sequence.
-  static auto* p = new CuDNNWrapper::PerGPUCuDNNStates();
-  TORCH_CHECK_NOTNULL(p);
-  return *p;
-}
-
-namespace {
-bool PrintCuDNNInfo(int*, char***) {
-  VLOG(1) << "Caffe2 is built with CuDNN version " << CUDNN_VERSION;
-  return true;
-}
-
-REGISTER_CAFFE2_INIT_FUNCTION(PrintCuDNNInfo, &PrintCuDNNInfo,
-                              "Print CuDNN Info.");
-
-}  // namespace
-}  // namespace caffe2

diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h
deleted file mode 100644
index b130103..0000000
--- a/caffe2/core/common_cudnn.h
+++ /dev/null

@@ -1,314 +0,0 @@
-#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
-#define CAFFE2_CORE_COMMON_CUDNN_H_
-
-#include <array>
-#include <mutex>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/types.h"
-
-#ifndef CAFFE2_USE_CUDNN
-#error("This Caffe2 install is not built with cudnn, so you should not include this file.");
-#endif
-
-#include <cudnn.h>
-
-static_assert(
-    CUDNN_VERSION >= 8200,
-    "Caffe2 requires cudnn version 8.2 or above.");
-
-#define CUDNN_VERSION_MIN(major, minor, patch) \
-    (major >= 9 ? CUDNN_VERSION >= ((major) * 10000 + (minor) * 100 + (patch)) : \
-                  CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
-
-namespace caffe2 {
-
-namespace internal {
-/**
- * A helper function to obtain cudnn error strings.
- */
-inline const char* cudnnGetErrorString(cudnnStatus_t status) {
-  switch (status) {
-    case CUDNN_STATUS_SUCCESS:
-      return "CUDNN_STATUS_SUCCESS";
-    case CUDNN_STATUS_NOT_INITIALIZED:
-      return "CUDNN_STATUS_NOT_INITIALIZED";
-    case CUDNN_STATUS_ALLOC_FAILED:
-      return "CUDNN_STATUS_ALLOC_FAILED";
-    case CUDNN_STATUS_BAD_PARAM:
-      return "CUDNN_STATUS_BAD_PARAM";
-    case CUDNN_STATUS_INTERNAL_ERROR:
-      return "CUDNN_STATUS_INTERNAL_ERROR";
-    case CUDNN_STATUS_INVALID_VALUE:
-      return "CUDNN_STATUS_INVALID_VALUE";
-    case CUDNN_STATUS_ARCH_MISMATCH:
-      return "CUDNN_STATUS_ARCH_MISMATCH";
-    case CUDNN_STATUS_MAPPING_ERROR:
-      return "CUDNN_STATUS_MAPPING_ERROR";
-    case CUDNN_STATUS_EXECUTION_FAILED:
-      return "CUDNN_STATUS_EXECUTION_FAILED";
-    case CUDNN_STATUS_NOT_SUPPORTED:
-      return "CUDNN_STATUS_NOT_SUPPORTED";
-    case CUDNN_STATUS_LICENSE_ERROR:
-      return "CUDNN_STATUS_LICENSE_ERROR";
-    default:
-      return "Unknown cudnn error number";
-  }
-}
-} // namespace internal
-
-// A macro that wraps around a cudnn statement so we can check if the cudnn
-// execution finishes or not.
-#define CUDNN_ENFORCE(condition)                          \
-  do {                                                    \
-    cudnnStatus_t status = condition;                     \
-    CAFFE_ENFORCE_EQ(                                     \
-        status,                                           \
-        CUDNN_STATUS_SUCCESS,                             \
-        ", Error at: ",                                   \
-        __FILE__,                                         \
-        ":",                                              \
-        __LINE__,                                         \
-        ": ",                                             \
-        ::caffe2::internal::cudnnGetErrorString(status)); \
-  } while (0)
-#define CUDNN_CHECK(condition)                              \
-  do {                                                      \
-    cudnnStatus_t status = condition;                       \
-    CHECK(status == CUDNN_STATUS_SUCCESS)                   \
-        << ::caffe2::internal::cudnnGetErrorString(status); \
-  } while (0)
-
-// report the version of cuDNN Caffe2 was compiled with
-inline size_t cudnnCompiledVersion() {
-  return CUDNN_VERSION;
-}
-// report the runtime version of cuDNN
-inline size_t cudnnRuntimeVersion() {
-  return cudnnGetVersion();
-}
-
-// Check compatibility of compiled and runtime cuDNN versions
-inline void CheckCuDNNVersions() {
-  // Version format is major*1000 + minor*100 + patch
-  // If compiled with version < 7, major, minor and patch must all match
-  // If compiled with version >= 7, then either
-  //    runtime_version > compiled_version
-  //    major and minor match
-  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
-  bool compiled_with_7 = cudnnCompiledVersion() >= 7000;
-  bool backwards_compatible_7 = compiled_with_7 && cudnnRuntimeVersion() >= cudnnCompiledVersion();
-  bool patch_compatible = compiled_with_7 && (cudnnRuntimeVersion() / 100) == (cudnnCompiledVersion() / 100);
-  CAFFE_ENFORCE(version_match || backwards_compatible_7 || patch_compatible,
-                "cuDNN compiled (", cudnnCompiledVersion(), ") and "
-                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
-}
-
-/**
- * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
- * in a template function. The class is specialized explicitly for different
- * data types below.
- */
-template <typename T>
-class cudnnTypeWrapper;
-
-template <>
-class cudnnTypeWrapper<float> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  typedef const float ScalingParamType;
-  typedef float BNParamType;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static const ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-template <>
-class cudnnTypeWrapper<int> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_INT32;
-  typedef const int ScalingParamType;
-  typedef int BNParamType;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1;
-    return &v;
-  }
-  static const ScalingParamType* kZero() {
-    static ScalingParamType v = 0;
-    return &v;
-  }
-};
-
-template <>
-class cudnnTypeWrapper<double> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-  typedef const double ScalingParamType;
-  typedef double BNParamType;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-template <>
-class cudnnTypeWrapper<at::Half> {
- public:
-  static const cudnnDataType_t type = CUDNN_DATA_HALF;
-  typedef const float ScalingParamType;
-  typedef float BNParamType;
-  static ScalingParamType* kOne() {
-    static ScalingParamType v = 1.0;
-    return &v;
-  }
-  static ScalingParamType* kZero() {
-    static ScalingParamType v = 0.0;
-    return &v;
-  }
-};
-
-/**
- * A wrapper function to convert the Caffe storage order to cudnn storage order
- * enum values.
- */
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
-  switch (order) {
-    case StorageOrder::NHWC:
-      return CUDNN_TENSOR_NHWC;
-    case StorageOrder::NCHW:
-      return CUDNN_TENSOR_NCHW;
-    default:
-      LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
-  }
-  // Just to suppress compiler warnings
-  return CUDNN_TENSOR_NCHW;
-}
-
-/**
- * cudnnTensorDescWrapper is the placeholder that wraps around a
- * cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed during
- * runtime.
- */
-class cudnnTensorDescWrapper {
- public:
-  cudnnTensorDescWrapper() {
-    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc_));
-  }
-  ~cudnnTensorDescWrapper() noexcept {
-    CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
-  }
-
-  inline cudnnTensorDescriptor_t Descriptor(
-      const cudnnTensorFormat_t format,
-      const cudnnDataType_t type,
-      const vector<int>& dims,
-      bool* changed) {
-    if (type_ == type && format_ == format && dims_ == dims) {
-      // if not changed, simply return the current descriptor.
-      if (changed)
-        *changed = false;
-      return desc_;
-    }
-    CAFFE_ENFORCE_EQ(
-        dims.size(), 4U, "Currently only 4-dimensional descriptor supported.");
-    format_ = format;
-    type_ = type;
-    dims_ = dims;
-    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
-        desc_,
-        format,
-        type,
-        dims_[0],
-        (format == CUDNN_TENSOR_NCHW ? dims_[1] : dims_[3]),
-        (format == CUDNN_TENSOR_NCHW ? dims_[2] : dims_[1]),
-        (format == CUDNN_TENSOR_NCHW ? dims_[3] : dims_[2])));
-    if (changed)
-      *changed = true;
-    return desc_;
-  }
-
-  template <typename T>
-  inline cudnnTensorDescriptor_t Descriptor(
-      const StorageOrder& order,
-      const vector<int>& dims) {
-    return Descriptor(
-        GetCudnnTensorFormat(order), cudnnTypeWrapper<T>::type, dims, nullptr);
-  }
-
- private:
-  cudnnTensorDescriptor_t desc_;
-  cudnnTensorFormat_t format_;
-  cudnnDataType_t type_;
-  vector<int> dims_;
-  C10_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper);
-};
-
-class cudnnFilterDescWrapper {
- public:
-  cudnnFilterDescWrapper() {
-    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&desc_));
-  }
-  ~cudnnFilterDescWrapper() noexcept {
-    CUDNN_CHECK(cudnnDestroyFilterDescriptor(desc_));
-  }
-
-  inline cudnnFilterDescriptor_t Descriptor(
-      const StorageOrder& order,
-      const cudnnDataType_t type,
-      const vector<int>& dims,
-      bool* changed) {
-    if (type_ == type && order_ == order && dims_ == dims) {
-      // if not changed, simply return the current descriptor.
-      if (changed)
-        *changed = false;
-      return desc_;
-    }
-    CAFFE_ENFORCE_EQ(
-        dims.size(), 4U, "Currently only 4-dimensional descriptor supported.");
-    order_ = order;
-    type_ = type;
-    dims_ = dims;
-    CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
-        desc_,
-        type,
-        GetCudnnTensorFormat(order),
-        dims_[0],
-        // TODO - confirm that this is correct for NHWC
-        (order == StorageOrder::NCHW ? dims_[1] : dims_[3]),
-        (order == StorageOrder::NCHW ? dims_[2] : dims_[1]),
-        (order == StorageOrder::NCHW ? dims_[3] : dims_[2])));
-    if (changed)
-      *changed = true;
-    return desc_;
-  }
-
-  template <typename T>
-  inline cudnnFilterDescriptor_t Descriptor(
-      const StorageOrder& order,
-      const vector<int>& dims) {
-    return Descriptor(order, cudnnTypeWrapper<T>::type, dims, nullptr);
-  }
-
- private:
-  cudnnFilterDescriptor_t desc_;
-  StorageOrder order_;
-  cudnnDataType_t type_;
-  vector<int> dims_;
-  C10_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
-};
-
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_CUDNN_H_

diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc
deleted file mode 100644
index e5a2635..0000000
--- a/caffe2/core/common_gpu.cc
+++ /dev/null

@@ -1,253 +0,0 @@
-#include "caffe2/core/common_gpu.h"
-
-#include <atomic>
-#include <cstdlib>
-#include <iostream>
-#include <sstream>
-
-#include <c10/cuda/CUDAFunctions.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/init.h"
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-int NumCudaDevices() {
-  if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
-    static bool first = true;
-    if (first) {
-      first = false;
-      std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
-                << std::endl;
-    }
-  }
-  // It logs warnings on first run
-  return c10::cuda::device_count();
-}
-
-namespace {
-int gDefaultGPUID = 0;
-}  // namespace
-
-void SetDefaultGPUID(const int deviceid) {
-  CAFFE_ENFORCE_LT(
-      deviceid,
-      NumCudaDevices(),
-      "The default gpu id should be smaller than the number of gpus "
-      "on this machine: ",
-      deviceid,
-      " vs ",
-      NumCudaDevices());
-  gDefaultGPUID = deviceid;
-}
-
-int GetDefaultGPUID() { return gDefaultGPUID; }
-
-int CaffeCudaGetDevice() {
-  int gpu_id = 0;
-  CUDA_ENFORCE(cudaGetDevice(&gpu_id));
-  return gpu_id;
-}
-
-void CaffeCudaSetDevice(const int id) {
-  CUDA_ENFORCE(cudaSetDevice(id));
-}
-
-int GetGPUIDForPointer(const void* ptr) {
-  cudaPointerAttributes attr;
-  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
-
-  if (err == cudaErrorInvalidValue) {
-    // Occurs when the pointer is in the CPU address space that is
-    // unmanaged by CUDA; make sure the last error state is cleared,
-    // since it is persistent
-    err = cudaGetLastError();
-    CHECK(err == cudaErrorInvalidValue);
-    return -1;
-  }
-
-  // Otherwise, there must be no error
-  CUDA_ENFORCE(err);
-
-  if (attr.type == cudaMemoryTypeHost) {
-    return -1;
-  }
-
-  return attr.device;
-}
-
-struct CudaDevicePropWrapper {
-  CudaDevicePropWrapper() : props(NumCudaDevices()) {
-    for (int i = 0; i < NumCudaDevices(); ++i) {
-      CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
-    }
-  }
-
-  vector<cudaDeviceProp> props;
-};
-
-const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
-  // According to C++11 standard section 6.7, static local variable init is
-  // thread safe. See
-  //   https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
-  // for details.
-  static CudaDevicePropWrapper props;
-  CAFFE_ENFORCE_LT(
-      deviceid,
-      NumCudaDevices(),
-      "The gpu id should be smaller than the number of gpus ",
-      "on this machine: ",
-      deviceid,
-      " vs ",
-      NumCudaDevices());
-  return props.props[deviceid];
-}
-
-void DeviceQuery(const int device) {
-  const cudaDeviceProp& prop = GetDeviceProperty(device);
-  std::stringstream ss;
-  ss << std::endl;
-  ss << "Device id:                     " << device << std::endl;
-  ss << "Major revision number:         " << prop.major << std::endl;
-  ss << "Minor revision number:         " << prop.minor << std::endl;
-  ss << "Name:                          " << prop.name << std::endl;
-  ss << "Total global memory:           " << prop.totalGlobalMem << std::endl;
-  ss << "Total shared memory per block: " << prop.sharedMemPerBlock
-     << std::endl;
-  ss << "Total registers per block:     " << prop.regsPerBlock << std::endl;
-  ss << "Warp size:                     " << prop.warpSize << std::endl;
-#if !defined(USE_ROCM)
-  ss << "Maximum memory pitch:          " << prop.memPitch << std::endl;
-#endif
-  ss << "Maximum threads per block:     " << prop.maxThreadsPerBlock
-     << std::endl;
-  ss << "Maximum dimension of block:    "
-     << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
-     << prop.maxThreadsDim[2] << std::endl;
-  ss << "Maximum dimension of grid:     "
-     << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
-     << prop.maxGridSize[2] << std::endl;
-  ss << "Clock rate:                    " << prop.clockRate << std::endl;
-  ss << "Total constant memory:         " << prop.totalConstMem << std::endl;
-#if !defined(USE_ROCM)
-  ss << "Texture alignment:             " << prop.textureAlignment << std::endl;
-  ss << "Concurrent copy and execution: "
-     << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
-#endif
-  ss << "Number of multiprocessors:     " << prop.multiProcessorCount
-     << std::endl;
-#if !defined(USE_ROCM)
-  ss << "Kernel execution timeout:      "
-     << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
-#endif
-  LOG(INFO) << ss.str();
-  return;
-}
-
-bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
-  int gpu_count;
-  if (cudaGetDeviceCount(&gpu_count) != cudaSuccess) return false;
-  pattern->clear();
-  pattern->resize(gpu_count, vector<bool>(gpu_count, false));
-  for (int i = 0; i < gpu_count; ++i) {
-    for (int j = 0; j < gpu_count; ++j) {
-      int can_access = true;
-      if (i != j) {
-        if (cudaDeviceCanAccessPeer(&can_access, i, j)
-                 != cudaSuccess) {
-          return false;
-        }
-      }
-      (*pattern)[i][j] = static_cast<bool>(can_access);
-    }
-  }
-  return true;
-}
-
-bool TensorCoreAvailable() {
-  int device = CaffeCudaGetDevice();
-  auto& prop = GetDeviceProperty(device);
-
-  return prop.major >= 7;
-}
-
-const char* cublasGetErrorString(cublasStatus_t error) {
-  switch (error) {
-  case CUBLAS_STATUS_SUCCESS:
-    return "CUBLAS_STATUS_SUCCESS";
-  case CUBLAS_STATUS_NOT_INITIALIZED:
-    return "CUBLAS_STATUS_NOT_INITIALIZED";
-  case CUBLAS_STATUS_ALLOC_FAILED:
-    return "CUBLAS_STATUS_ALLOC_FAILED";
-  case CUBLAS_STATUS_INVALID_VALUE:
-    return "CUBLAS_STATUS_INVALID_VALUE";
-  case CUBLAS_STATUS_ARCH_MISMATCH:
-    return "CUBLAS_STATUS_ARCH_MISMATCH";
-  case CUBLAS_STATUS_INTERNAL_ERROR:
-    return "CUBLAS_STATUS_INTERNAL_ERROR";
-  case CUBLAS_STATUS_MAPPING_ERROR:
-    return "CUBLAS_STATUS_MAPPING_ERROR";
-  case CUBLAS_STATUS_EXECUTION_FAILED:
-    return "CUBLAS_STATUS_EXECUTION_FAILED";
-  case CUBLAS_STATUS_NOT_SUPPORTED:
-    return "CUBLAS_STATUS_NOT_SUPPORTED";
-#if !defined(USE_ROCM)
-  case CUBLAS_STATUS_LICENSE_ERROR:
-    return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif
-  }
-  // To suppress compiler warning.
-  return "Unrecognized cublas error string";
-}
-
-const char* curandGetErrorString(curandStatus_t error) {
-  switch (error) {
-  case CURAND_STATUS_SUCCESS:
-    return "CURAND_STATUS_SUCCESS";
-  case CURAND_STATUS_VERSION_MISMATCH:
-    return "CURAND_STATUS_VERSION_MISMATCH";
-  case CURAND_STATUS_NOT_INITIALIZED:
-    return "CURAND_STATUS_NOT_INITIALIZED";
-  case CURAND_STATUS_ALLOCATION_FAILED:
-    return "CURAND_STATUS_ALLOCATION_FAILED";
-  case CURAND_STATUS_TYPE_ERROR:
-    return "CURAND_STATUS_TYPE_ERROR";
-  case CURAND_STATUS_OUT_OF_RANGE:
-    return "CURAND_STATUS_OUT_OF_RANGE";
-  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-  case CURAND_STATUS_LAUNCH_FAILURE:
-    return "CURAND_STATUS_LAUNCH_FAILURE";
-  case CURAND_STATUS_PREEXISTING_FAILURE:
-    return "CURAND_STATUS_PREEXISTING_FAILURE";
-  case CURAND_STATUS_INITIALIZATION_FAILED:
-    return "CURAND_STATUS_INITIALIZATION_FAILED";
-  case CURAND_STATUS_ARCH_MISMATCH:
-    return "CURAND_STATUS_ARCH_MISMATCH";
-  case CURAND_STATUS_INTERNAL_ERROR:
-    return "CURAND_STATUS_INTERNAL_ERROR";
-#if defined(USE_ROCM)
-  case HIPRAND_STATUS_NOT_IMPLEMENTED:
-    return "HIPRAND_STATUS_NOT_IMPLEMENTED";
-#endif
-  }
-  // To suppress compiler warning.
-  return "Unrecognized curand error string";
-}
-
-// Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
-// function.
-namespace {
-class CudaRuntimeFlagFlipper {
- public:
-  CudaRuntimeFlagFlipper() {
-    internal::SetCudaRuntimeFlag();
-  }
-};
-static CudaRuntimeFlagFlipper g_flipper;
-} // namespace
-
-}  // namespace caffe2

diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h
deleted file mode 100644
index 011f462..0000000
--- a/caffe2/core/common_gpu.h
+++ /dev/null

@@ -1,475 +0,0 @@
-#ifndef CAFFE2_CORE_COMMON_GPU_H_
-#define CAFFE2_CORE_COMMON_GPU_H_
-
-#include <assert.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#if !defined(USE_ROCM)
-#ifdef __GNUC__
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
-#pragma GCC diagnostic push
-#endif
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#endif // __GNUC__
-#endif // USE_ROCM
-
-#include <cublas_v2.h>
-#include <curand.h>
-#include <driver_types.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-
-#include "c10/cuda/CUDAMacros.h"
-#include "c10/cuda/CUDAMathCompat.h"
-#include <c10/cuda/CUDAGuard.h>
-
-#define CAFFE2_CUDA_EXPORT C10_EXPORT
-
-// CAFFE2_CUDA_API gets translated to CAFFE2_HIP_API in hipify script, which
-// causes a marco redefinition issue with the later definition of
-// CAFFE2_HIP_API, so we exclude this definition when HIP is specified
-#if !defined(USE_ROCM)
-#define CAFFE2_CUDA_API TORCH_CUDA_CPP_API
-#endif // USE_ROCM
-
-//TODO: [ROCm] Need to remove this after CUDA->HIP mapping is updated.
-#define CAFFE2_HIP_EXPORT C10_EXPORT
-#define CAFFE2_HIP_API TORCH_HIP_API
-
-// This is a macro defined for cuda fp16 support. In default, cuda fp16 is
-// supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
-// a (custom?) NVCC 7.0. As a result, we would normally just check the cuda
-// version here, but would also allow a use to pass in the flag
-// CAFFE_HAS_CUDA_FP16 manually.
-
-#ifndef CAFFE_HAS_CUDA_FP16
-#define CAFFE_HAS_CUDA_FP16
-#endif // CAFFE_HAS_CUDA_FP16
-
-#ifdef CAFFE_HAS_CUDA_FP16
-#include <cuda_fp16.h>
-#endif
-
-// cuda major revision number below which fp16 compute is not supoorted
-#if !defined(USE_ROCM)
-constexpr int kFp16CUDADevicePropMajor = 6;
-#else
-constexpr int kFp16CUDADevicePropMajor = 3;
-#endif
-
-// Re-enable strict aliasing diagnostic if it was disabled.
-#if !defined(USE_ROCM)
-#ifdef __GNUC__
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
-#pragma GCC diagnostic pop
-#endif
-#endif // __GNUC__
-#endif // USE_ROCM
-
-/**
- * The maximum number of peers that each gpu can have when doing p2p setup.
- * Currently, according to NVidia documentation, each device can support a
- * system-wide maximum of eight peer connections.
- * When Caffe2 sets up peer access resources, if we have more than 8 gpus,
- * we will enable peer access in groups of 8.
- */
-#define CAFFE2_CUDA_MAX_PEER_SIZE 8
-
-namespace caffe2 {
-
-#if !defined(USE_ROCM)
-/**
- * Empty class to identify TensorCore-based math
- */
-class TensorCoreEngine {};
-#endif // USE_ROCM
-
-/**
- * A runtime function to report the cuda version that Caffe2 is built with.
- */
-inline int CudaVersion() {
-#if defined(USE_ROCM)
-  return ROCM_VERSION;
-#else
-  return CUDA_VERSION;
-#endif
-}
-
-/**
- * Returns the number of devices.
- */
-CAFFE2_CUDA_API int NumCudaDevices();
-
-/**
- * Check if the current running session has a cuda gpu present.
- *
- * Note that this is different from having caffe2 built with cuda. Building
- * Caffe2 with cuda only guarantees that this function exists. If there are no
- * cuda gpus present in the machine, or there are hardware configuration
- * problems like an insufficient driver, this function will still return false,
- * meaning that there is no usable GPU present.
- *
- * In the open source build, it is possible that Caffe2's GPU code is
- * dynamically loaded, and as a result a library could be only linked to the
- * CPU code, but want to test if cuda is later available or not. In this case,
- * one should use HasCudaRuntime() from common.h.
- */
-inline bool HasCudaGPU() {
-  return NumCudaDevices() > 0;
-}
-
-/**
- * Gets the current GPU id. This is a simple wrapper around cudaGetDevice().
- */
-CAFFE2_CUDA_API int CaffeCudaGetDevice();
-
-/**
- * Gets the current GPU id. This is a simple wrapper around cudaGetDevice().
- */
-CAFFE2_CUDA_API void CaffeCudaSetDevice(const int id);
-
-/**
- * Gets the GPU id that the current pointer is located at.
- */
-CAFFE2_CUDA_API int GetGPUIDForPointer(const void* ptr);
-
-/**
- * Gets the device property for the given device. This function is thread safe.
- * The initial run on this function is ~1ms/device; however, the results are
- * cached so subsequent runs should be much faster.
- */
-CAFFE2_CUDA_API const cudaDeviceProp& GetDeviceProperty(const int device);
-
-/**
- * Runs a device query function and prints out the results to LOG(INFO).
- */
-CAFFE2_CUDA_API void DeviceQuery(const int deviceid);
-
-/**
- * Return a peer access pattern by returning a matrix (in the format of a
- * nested vector) of boolean values specifying whether peer access is possible.
- *
- * This function returns false if anything wrong happens during the query of
- * the GPU access pattern.
- */
-CAFFE2_CUDA_API bool GetCudaPeerAccessPattern(vector<vector<bool>>* pattern);
-
-/**
- * Return the availability of TensorCores for math
- */
-CAFFE2_CUDA_API bool TensorCoreAvailable();
-
-/**
- * Return a human readable cublas error string.
- */
-CAFFE2_CUDA_API const char* cublasGetErrorString(cublasStatus_t error);
-
-/**
- * Return a human readable curand error string.
- */
-CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
-
-// CUDA: various checks for different function calls.
-#define CUDA_ENFORCE(condition, ...) \
-  do {                               \
-    cudaError_t error = condition;   \
-    CAFFE_ENFORCE_EQ(                \
-        error,                       \
-        cudaSuccess,                 \
-        "Error at: ",                \
-        __FILE__,                    \
-        ":",                         \
-        __LINE__,                    \
-        ": ",                        \
-        cudaGetErrorString(error),   \
-        ##__VA_ARGS__);              \
-  } while (0)
-#define CUDA_CHECK(condition)                                 \
-  do {                                                        \
-    cudaError_t error = condition;                            \
-    CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
-  } while (0)
-
-#define CUDA_DRIVERAPI_ENFORCE(condition)                            \
-  do {                                                               \
-    CUresult result = condition;                                     \
-    if (result != CUDA_SUCCESS) {                                    \
-      const char* msg;                                               \
-      cuGetErrorName(result, &msg);                                  \
-      CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
-    }                                                                \
-  } while (0)
-#define CUDA_DRIVERAPI_CHECK(condition)                                 \
-  do {                                                                  \
-    CUresult result = condition;                                        \
-    if (result != CUDA_SUCCESS) {                                       \
-      const char* msg;                                                  \
-      cuGetErrorName(result, &msg);                                     \
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
-                 << msg;                                                \
-    }                                                                   \
-  } while (0)
-
-#define CUBLAS_ENFORCE(condition)                \
-  do {                                           \
-    cublasStatus_t status = condition;           \
-    CAFFE_ENFORCE_EQ(                            \
-        status,                                  \
-        CUBLAS_STATUS_SUCCESS,                   \
-        "Error at: ",                            \
-        __FILE__,                                \
-        ":",                                     \
-        __LINE__,                                \
-        ": ",                                    \
-        ::caffe2::cublasGetErrorString(status)); \
-  } while (0)
-#define CUBLAS_CHECK(condition)                    \
-  do {                                             \
-    cublasStatus_t status = condition;             \
-    CHECK(status == CUBLAS_STATUS_SUCCESS)         \
-        << ::caffe2::cublasGetErrorString(status); \
-  } while (0)
-
-#define CURAND_ENFORCE(condition)                \
-  do {                                           \
-    curandStatus_t status = condition;           \
-    CAFFE_ENFORCE_EQ(                            \
-        status,                                  \
-        CURAND_STATUS_SUCCESS,                   \
-        "Error at: ",                            \
-        __FILE__,                                \
-        ":",                                     \
-        __LINE__,                                \
-        ": ",                                    \
-        ::caffe2::curandGetErrorString(status)); \
-  } while (0)
-#define CURAND_CHECK(condition)                    \
-  do {                                             \
-    curandStatus_t status = condition;             \
-    CHECK(status == CURAND_STATUS_SUCCESS)         \
-        << ::caffe2::curandGetErrorString(status); \
-  } while (0)
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                                 \
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
-#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
-       i += blockDim.x * gridDim.x)                                 \
-    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
-         j += blockDim.y * gridDim.y)
-
-// The following helper functions are here so that you can write a kernel call
-// when you are not particularly interested in maxing out the kernels'
-// performance. Usually, this will give you a reasonable speed, but if you
-// really want to find the best performance, it is advised that you tune the
-// size of the blocks and grids more reasonably.
-// A legacy note: this is derived from the old good Caffe days, when I simply
-// hard-coded the number of threads and wanted to keep backward compatibility
-// for different computation capabilities.
-// For more info on CUDA compute capabilities, visit the NVidia website at:
-//    http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-
-// The number of cuda threads to use. Since work is assigned to SMs at the
-// granularity of a block, 128 is chosen to allow utilizing more SMs for
-// smaller input sizes.
-// 1D grid
-constexpr int CAFFE_CUDA_NUM_THREADS = 128;
-// 2D grid
-constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMX = 16;
-constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMY = 16;
-
-// The maximum number of blocks to use in the default kernel call. We set it to
-// 4096 which would work for compute capability 2.x (where 65536 is the limit).
-// This number is very carelessly chosen. Ideally, one would like to look at
-// the hardware at runtime, and pick the number of blocks that makes most
-// sense for the specific runtime environment. This is a todo item.
-// 1D grid
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;
-// 2D grid
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX = 128;
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY = 128;
-
-constexpr int kCUDAGridDimMaxX = 2147483647;
-constexpr int kCUDAGridDimMaxY = 65535;
-constexpr int kCUDAGridDimMaxZ = 65535;
-
-/**
- * @brief Compute the number of blocks needed to run N threads.
- */
-inline int CAFFE_GET_BLOCKS(const int N) {
-  return std::max(
-      std::min(
-          (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
-          CAFFE_MAXIMUM_NUM_BLOCKS),
-      // Use at least 1 block, since CUDA does not allow empty block
-      1);
-}
-
-/**
- * @brief Compute the number of blocks needed to run N threads for a 2D grid
- */
-inline dim3 CAFFE_GET_BLOCKS_2D(const int N, const int /* M */) {
-  dim3 grid;
-  // Not calling the 1D version for each dim to keep all constants as literals
-
-  grid.x = std::max(
-      std::min(
-          (N + CAFFE_CUDA_NUM_THREADS_2D_DIMX - 1) /
-              CAFFE_CUDA_NUM_THREADS_2D_DIMX,
-          CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX),
-      // Use at least 1 block, since CUDA does not allow empty block
-      1);
-
-  grid.y = std::max(
-      std::min(
-          (N + CAFFE_CUDA_NUM_THREADS_2D_DIMY - 1) /
-              CAFFE_CUDA_NUM_THREADS_2D_DIMY,
-          CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY),
-      // Use at least 1 block, since CUDA does not allow empty block
-      1);
-
-  return grid;
-}
-
-using CUDAGuard = c10::cuda::CUDAGuard;
-
-template <typename T, int N>
-struct SimpleArray {
-  T data[N];
-};
-
-constexpr int kCUDATensorMaxDims = 8;
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) \
-  do {                                                            \
-    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                    \
-    switch (val) {                                                \
-      case 1: {                                                   \
-        Func<T, 1>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 2: {                                                   \
-        Func<T, 2>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 3: {                                                   \
-        Func<T, 3>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 4: {                                                   \
-        Func<T, 4>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 5: {                                                   \
-        Func<T, 5>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 6: {                                                   \
-        Func<T, 6>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 7: {                                                   \
-        Func<T, 7>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      case 8: {                                                   \
-        Func<T, 8>(__VA_ARGS__);                                  \
-        break;                                                    \
-      }                                                           \
-      default: {                                                  \
-        break;                                                    \
-      }                                                           \
-    }                                                             \
-  } while (false)
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \
-  do {                                                                 \
-    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                         \
-    switch (val) {                                                     \
-      case 1: {                                                        \
-        Func<T1, T2, 1>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 2: {                                                        \
-        Func<T1, T2, 2>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 3: {                                                        \
-        Func<T1, T2, 3>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 4: {                                                        \
-        Func<T1, T2, 4>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 5: {                                                        \
-        Func<T1, T2, 5>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 6: {                                                        \
-        Func<T1, T2, 6>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 7: {                                                        \
-        Func<T1, T2, 7>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      case 8: {                                                        \
-        Func<T1, T2, 8>(__VA_ARGS__);                                  \
-        break;                                                         \
-      }                                                                \
-      default: {                                                       \
-        break;                                                         \
-      }                                                                \
-    }                                                                  \
-  } while (false)
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \
-  do {                                                                     \
-    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                             \
-    switch (val) {                                                         \
-      case 1: {                                                            \
-        Func<T1, T2, T3, 1>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 2: {                                                            \
-        Func<T1, T2, T3, 2>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 3: {                                                            \
-        Func<T1, T2, T3, 3>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 4: {                                                            \
-        Func<T1, T2, T3, 4>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 5: {                                                            \
-        Func<T1, T2, T3, 5>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 6: {                                                            \
-        Func<T1, T2, T3, 6>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 7: {                                                            \
-        Func<T1, T2, T3, 7>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      case 8: {                                                            \
-        Func<T1, T2, T3, 8>(__VA_ARGS__);                                  \
-        break;                                                             \
-      }                                                                    \
-      default: {                                                           \
-        break;                                                             \
-      }                                                                    \
-    }                                                                      \
-  } while (false)
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_GPU_H_

diff --git a/caffe2/core/context.h b/caffe2/core/context.h
deleted file mode 100644
index eb46f78..0000000
--- a/caffe2/core/context.h
+++ /dev/null

@@ -1,227 +0,0 @@
-#ifndef CAFFE2_CORE_CONTEXT_H_
-#define CAFFE2_CORE_CONTEXT_H_
-
-#include <cstdlib>
-#include <ctime>
-#include <random>
-#include <unordered_map>
-
-#include <c10/util/typeid.h>
-#include "caffe2/core/allocator.h"
-#include "caffe2/core/context_base.h"
-#include "caffe2/core/event.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#include <c10/util/ArrayRef.h>
-
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-#include <c10/core/GeneratorImpl.h>
-#include <c10/util/irange.h>
-#include <ATen/core/DistributionsHelper.h>
-#include <ATen/core/MT19937RNGEngine.h>
-#else
-#include "caffe2/core/distributions_stubs.h"
-#endif
-
-C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
-
-namespace caffe2 {
-
-/**
- * A function to generate a random number seed that is unique in a best-effort
- * basis, using an ever-incrementing seed and the current time.
- */
-TORCH_API uint32_t RandomNumberSeed();
-
-/**
- * The CPU Context, representing the bare minimum of what a Context class in
- * Caffe2 should implement.
- *
- * // TODO modify docs
- * See operator.h, especially Operator<Context>, for how Context are used in
- * actual operator implementations that are associated with specific devices.
- * In general, the Context class is passed in as a template argument, and
- * the operator can use the functions defined in the context to execute whatever
- * computation it has.
- *
- */
-class TORCH_API CPUContext final : public BaseContext {
- public:
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  class rand_gen_type {
-   public:
-    explicit rand_gen_type(uint64_t seed_in = default_rng_seed_val)
-        : engine_{seed_in} {}
-
-    uint32_t random() {
-      return engine_();
-    }
-    uint64_t random64() {
-      uint32_t random1 = engine_();
-      uint32_t random2 = engine_();
-      return (static_cast<uint64_t>(random1) << 32) | random2;
-    }
-
-    std::optional<float> next_float_normal_sample() {
-      return next_float_normal_sample_;
-    }
-    std::optional<double> next_double_normal_sample() {
-      return next_double_normal_sample_;
-    }
-    void set_next_float_normal_sample(std::optional<float> randn) {
-      next_float_normal_sample_ = randn;
-    }
-    void set_next_double_normal_sample(std::optional<double> randn) {
-      next_double_normal_sample_ = randn;
-    }
-
-   private:
-    at::mt19937 engine_;
-    std::optional<float> next_float_normal_sample_;
-    std::optional<double> next_double_normal_sample_;
-  };
-#else
-  typedef std::mt19937 rand_gen_type;
-#endif
-
-  CPUContext() {}
-  explicit CPUContext(const DeviceOption& option)
-      : random_seed_(option.has_random_seed() ? option.random_seed() : 1701),
-        random_seed_set_(option.has_random_seed() ? true : false) {
-    CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU);
-  }
-  explicit CPUContext(const at::Device& device)
-      : CPUContext(DeviceToOption(device)) {}
-
-  ~CPUContext() noexcept override {}
-
-  inline void SwitchToDevice(int64_t /*stream_id*/) override {}
-
-  using BaseContext::SwitchToDevice;
-
-  inline void WaitEvent(const Event& ev) override {
-    ev.Wait(CPU, this);
-  }
-
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
-    CAFFE_ENFORCE(ev, "Event must not be null.");
-    ev->Record(CPU, this, err_msg);
-  }
-
-  inline void FinishDeviceComputation() override {}
-
-  inline rand_gen_type* RandGenerator() {
-    if (!random_generator_.get()) {
-      random_generator_.reset(new rand_gen_type(RandSeed()));
-    }
-    return random_generator_.get();
-  }
-
-  inline uint32_t RandSeed() {
-    if (!random_seed_set_) {
-      random_seed_ = RandomNumberSeed();
-      random_seed_set_ = true;
-    }
-    return static_cast<uint32_t>(random_seed_);
-  }
-
-  inline static at::DataPtr New(size_t nbytes) {
-    return GetCPUAllocator()->allocate(nbytes);
-  }
-
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override;
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
-    CopyBytesSameDevice(nbytes, src, dst);
-  }
-
-  bool SupportsNonFundamentalTypes() const override {
-    // CPU non fumdamental type copy OK
-    return true;
-  }
-
-  template <class SrcContext, class DstContext>
-  inline void CopyBytes(size_t nbytes, const void* src, void* dst);
-
-  template <typename T, class SrcContext, class DstContext>
-  inline void Copy(size_t n, const T* src, T* dst) {
-    if (c10::guts::is_fundamental<T>::value) {
-      CopyBytes<SrcContext, DstContext>(
-          n * sizeof(T),
-          static_cast<const void*>(src),
-          static_cast<void*>(dst));
-    } else {
-      for (const auto i : c10::irange(n)) {
-        dst[i] = src[i];
-      }
-    }
-  }
-
-  template <class SrcContext, class DstContext>
-  inline void
-  CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
-    if (meta.copy()) {
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  // By default CPU operators don't have async device parts
-  static bool HasAsyncPartDefault() {
-    return false;
-  }
-
-  static bool SupportsAsyncScheduling() {
-    return false;
-  }
-
-  // CPU streams are not implemented and are silently ignored by CPU ops,
-  // return true to signal executor to schedule a CPU op
-  static bool IsStreamFree(
-      const DeviceOption& /* option */,
-      int /* stream_id */) {
-    return true;
-  }
-
-  at::Device device() const override {
-    // TODO: numa?
-    return at::Device(CPU);
-  }
-
-  DeviceType device_type() const override {
-    return CPU;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return CPU;
-  }
-
- protected:
-  // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
-  int random_seed_{1701};
-  bool random_seed_set_{false};
-  std::unique_ptr<rand_gen_type> random_generator_;
-};
-
-template <>
-inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  if (nbytes == 0) {
-    return;
-  }
-  CAFFE_ENFORCE(src);
-  CAFFE_ENFORCE(dst);
-  memcpy(dst, src, nbytes);
-}
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_CONTEXT_H_

diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
deleted file mode 100644
index cc8cc4c..0000000
--- a/caffe2/core/context_base.h
+++ /dev/null

@@ -1,168 +0,0 @@
-#pragma once
-
-#include <array>
-#include <cstdlib>
-#include <ctime>
-#include <memory>
-#include <unordered_map>
-
-#include <c10/macros/Macros.h>
-#include <c10/core/Allocator.h>
-#include <c10/util/typeid.h>
-#include <c10/util/Exception.h>
-#include <c10/util/Registry.h>
-#include <c10/core/CopyBytes.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-class Event;
-
-} // namespace caffe2
-namespace at {
-
-class BaseContext;
-
-/**
- * Virtual interface for the Context class in Caffe2.
- *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes needs to implement all the pure virtual
- * functions in the BaseContext class.
- * TODO: add docs after this is finalized.
- */
-class TORCH_API BaseContext {
- public:
-  virtual ~BaseContext() noexcept {}
-
-  virtual Device device() const = 0;
-
-  /* Sorry for the naming, will get rid of this in future diff */
-  virtual DeviceType device_type() const = 0;
-
-  virtual void SwitchToDevice(int64_t /*stream_id*/) = 0;
-
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
-
-  virtual void WaitEvent(const caffe2::Event& ev) = 0;
-
-  virtual void Record(caffe2::Event* ev, const char* err_msg = nullptr)
-      const = 0;
-
-  virtual void FinishDeviceComputation() = 0;
-
-  // This used to be arbitrary cross-device copy, but it turns out everyone
-  // did direct CPU-X copy, so we just make three functions for it (to avoid
-  // double dispatch).  This will get obsoleted by C10. where copies
-  // will be proper operators (and get to rely on multiple dispatch there.)
-  virtual void CopyBytesSameDevice(
-      size_t nbytes,
-      const void* src,
-      void* dst) = 0;
-
-  virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  template <typename T>
-  inline void CopySameDevice(size_t n, const T* src, T* dst) {
-    static_assert(
-        c10::guts::is_fundamental<T>::value,
-        "CopySameDevice requires fundamental types");
-    CopyBytesSameDevice(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyFromCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        c10::guts::is_fundamental<T>::value,
-        "CopyFromCPU requires fundamental types");
-    CopyBytesFromCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyToCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        c10::guts::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
-    CopyBytesToCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  virtual bool SupportsNonFundamentalTypes() const {
-    return false;
-  }
-
-  inline void EnforceMetaCopyOK() {
-    AT_ASSERTM(
-        SupportsNonFundamentalTypes(), "Context requires fundamental types");
-  }
-
-  void CopyItemsSameDevice(
-      const caffe2::TypeMeta meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesSameDevice(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  void CopyItemsFromCPU(
-      const caffe2::TypeMeta meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesFromCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  void CopyItemsToCPU(
-      const caffe2::TypeMeta meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesToCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-};
-
-// Context constructor registry
-C10_DECLARE_TYPED_REGISTRY(
-    ContextRegistry,
-    at::DeviceType,
-    at::BaseContext,
-    std::unique_ptr,
-    at::Device);
-
-#define REGISTER_CONTEXT(type, ...) \
-  C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__)
-
-inline std::unique_ptr<at::BaseContext> CreateContext(
-    const at::Device& device) {
-  return at::ContextRegistry()->Create(device.type(), device);
-}
-
-} // namespace at
-
-namespace caffe2 {
-
-using at::BaseContext;
-using at::CreateContext;
-} // namespace caffe2

diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
deleted file mode 100644
index ecc933a..0000000
--- a/caffe2/core/context_gpu.cu
+++ /dev/null

@@ -1,669 +0,0 @@
-#include <algorithm>
-#include <atomic>
-#include <cstdlib>
-#include <string>
-#include <unordered_map>
-
-#include <ATen/Context.h>
-#include <c10/cuda/CUDAFunctions.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include "cub/util_allocator.cuh"
-
-// Needed to be included first to check the CAFFE2_USE_CUDNN macros.
-#include "caffe2/core/macros.h"
-
-#include "caffe2/core/blob_stats.h"
-#ifdef CAFFE2_USE_CUDNN
-#include "caffe2/core/common_cudnn.h"
-#endif // CAFFE2_USE_CUDNN
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/core/init.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/utils/string_utils.h"
-#include "caffe2/utils/cub_namespace.cuh"
-
-C10_DEFINE_string(
-    caffe2_cuda_memory_pool,
-    "",
-    "Sets the memory pool used by caffe2. Possible values are "
-    "none, cnmem, thc and cub.");
-
-// For description of CUB caching allocator configuration, see
-// https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html
-C10_DEFINE_int(
-    caffe2_cub_bin_growth,
-    8,
-    "If using cub as the memory allocator, sets the growth of bins "
-    "used by the cub pool.");
-C10_DEFINE_int(
-    caffe2_cub_min_bin,
-    3,
-    "If using cub as the memory allocator, sets the min number of "
-    "bins.");
-C10_DEFINE_int(
-    caffe2_cub_max_bin,
-    10,
-    "If using cub as the memory allocator, sets the max number of "
-    "bins.");
-C10_DEFINE_int(
-    caffe2_cub_max_managed_mb,
-    10 * 1024,
-    "If using cub as the memory allocators, sets the maximum amount "
-    "of memory managed in gigabytes");
-
-C10_DEFINE_bool(
-    caffe2_cub_print_allocation_events,
-    false,
-    "If true CachingDeviceAllocator will print allocation and deallocation "
-    "events to stdout.");
-
-C10_DEFINE_bool(
-    caffe2_gpu_memory_tracking,
-    false,
-    "If set, logs changes in GPU memory allocations");
-C10_DEFINE_int(
-    caffe2_gpu_memory_report_interval_mb,
-    128,
-    "The threshold in MB on how frequently to report memory changes");
-
-namespace at {
-
-REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext);
-} // namespace at
-
-namespace caffe2 {
-
-// Generic implementation - CUDA will handle the right function to call for us
-void CUDAContext::CopyBytesAsync(
-    size_t nbytes,
-    const void* src,
-    Device src_device,
-    void* dst,
-    Device dst_device) {
-  // TODO: verify that the CUDA handles copy from device to device correctly
-  // even without SetDevice()
-  // TODO: verify whether source or dest device should be a priority in picking
-  // the stream
-  // NB: right now the cross-device copy logic is invoked only in the contexts
-  // when surrounding code explicitly manages data dependencies and sets up
-  // events, so it's fine.  In order to make it a standalone function proper
-  // synchronization between stream is required
-  int gpu_id = 0;
-  if (dst_device.is_cuda()) {
-    gpu_id = dst_device.index();
-  } else if (src_device.is_cuda()) {
-    gpu_id = src_device.index();
-  } else {
-    LOG(FATAL) << "shouldn't be called with non-cuda device";
-  }
-  CUDA_ENFORCE(cudaMemcpyAsync(
-      dst,
-      src,
-      nbytes,
-      cudaMemcpyDefault,
-      CUDAContext::getCudaObjects().GetStream(gpu_id)));
-}
-
-void CUDAContext::CopyBytesSync(
-    size_t nbytes,
-    const void* src,
-    Device src_device,
-    void* dst,
-    Device dst_device) {
-  // This emulates Caffe2 original behavior where sync copy doesn't change the
-  // device. It's probably better for clarity to switch to the target device
-  // explicitly here, but in the worst case CUDA would sync for us.
-  // TODO: change it to CUDAGuard
-  CUDAContext context(-1); // take current device
-  CUDA_ENFORCE(cudaMemcpyAsync(
-      dst, src, nbytes, cudaMemcpyDefault, context.cuda_stream()));
-  // destructor of context synchronizes
-}
-
-// For the CPU context, we also allow a (probably expensive) function
-// to copy the data from a cuda context. Inside the function, we create
-// a temporary CUDAContext object to carry out the copy. From the caller's
-// side, these functions are synchronous with respect to the host, similar
-// to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
-template <>
-inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  CUDAContext context(GetGPUIDForPointer(src));
-  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
-}
-template <>
-inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
-    size_t nbytes,
-    const void* src,
-    void* dst) {
-  CUDAContext context(GetGPUIDForPointer(dst));
-  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
-}
-
-} // namespace caffe2
-
-namespace caffe2 {
-
-ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() {
-  static thread_local ThreadLocalCUDAObjects cuda_objects_;
-  return cuda_objects_;
-}
-
-// TODO(jiayq): these variables shouldn't be currently accessed during static
-// initialization. We should consider moving them to a Mayer's singleton to
-// be totally safe against SIOF.
-
-// Static global variables for setting up the memory pool.
-CudaMemoryPoolType g_cuda_memory_pool_type;
-
-std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
-
-// an unordered map that holds the map from the cuda memory pointer to the
-// device id that it is allocated from. This is used in the cuda memory pool
-// cases, where we need the device id to carry out the deletion.
-// Note(jiayq): an alternate approach is to use cudaGetPointerAttributes, but
-// that is usually quite slow. We might want to benchmark the speed difference
-// though.
-// Note(jiayq): another alternate approach is to augment the Tensor class that
-// would allow one to record the device id. However, this does not address any
-// non-tensor allocation and deallocation.
-// Ideally, a memory pool should already have the device id information, as
-// long as we are using UVA (as of CUDA 5 and later) so the addresses are
-// unique.
-static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;
-
-// Data structures for optional memory tracking. Access to these structures
-// is guarded by the CUDAContext::mutex.
-static std::unordered_map<void*, long> g_size_map;
-static std::vector<long> g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
-static std::vector<long> g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
-
-static long g_total_mem = 0;
-static long g_last_rep = 0;
-
-CudaMemoryPoolType GetCudaMemoryPoolType() {
-  return g_cuda_memory_pool_type;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// A wrapper to allow us to lazily initialize all cuda environments that Caffe
-// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
-// which is probably the decisive indication that this caffe2 run is going to
-// use GPUs. We avoid cuda initialization with core/init.h functionalities so
-// that we have minimal resource impact in case we will need to run multiple
-// caffe2 instances on a GPU machine.
-///////////////////////////////////////////////////////////////////////////////
-
-static void Caffe2InitializeCuda() {
-  // If the current run does not have any cuda devices, do nothing.
-  if (!HasCudaGPU()) {
-    VLOG(1) << "No cuda gpu present. Skipping.";
-    return;
-  }
-  C10_LOG_API_USAGE_ONCE("caffe2.init.cuda");
-  // Check if the number of GPUs matches the expected compile-time max number
-  // of GPUs.
-  CAFFE_ENFORCE_LE(
-      NumCudaDevices(),
-      C10_COMPILE_TIME_MAX_GPUS,
-      "Number of CUDA devices on the machine is larger than the compiled "
-      "max number of gpus expected (",
-      C10_COMPILE_TIME_MAX_GPUS,
-      "). Increase that and recompile.");
-
-  for (DeviceIndex i = 0; i < NumCudaDevices(); ++i) {
-    CUDAGuard g(i);
-    // Enable peer access.
-    const int peer_group = i / CAFFE2_CUDA_MAX_PEER_SIZE;
-    const int peer_start = peer_group * CAFFE2_CUDA_MAX_PEER_SIZE;
-    const int peer_end = std::min(
-        NumCudaDevices(), (peer_group + 1) * CAFFE2_CUDA_MAX_PEER_SIZE);
-    VLOG(1) << "Enabling peer access within group #" << peer_group
-            << ", from gpuid " << peer_start << " to " << peer_end - 1
-            << ", for gpuid " << i << ".";
-
-    for (int j = peer_start; j < peer_end; ++j) {
-      if (i == j) continue;
-      int can_access;
-      CUDA_ENFORCE(cudaDeviceCanAccessPeer(&can_access, i, j));
-      if (can_access) {
-        VLOG(1) << "Enabling peer access from " << i << " to " << j;
-        // Note: just for future reference, the 0 here is not a gpu id, it is
-        // a reserved flag for cudaDeviceEnablePeerAccess that should always be
-        // zero currently.
-        // It is ok if peer access is already enabled...
-        cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0));
-        if ((err != cudaErrorPeerAccessAlreadyEnabled) &&
-            (err != cudaSuccess)) {
-          CAFFE_THROW(cudaGetErrorString(err));
-        }
-        cudaGetLastError(); // reset cuda error code
-      }
-    }
-  }
-
-#ifdef CAFFE2_USE_CUDNN
-  // Check the versions of cuDNN that were compiled and linked with are compatible
-  CheckCuDNNVersions();
-#endif // CAFFE2_USE_CUDNN
-}
-
-static void SetUpCub() {
-  VLOG(1) << "Setting up cub memory pool.";
-  // Sets up the cub memory pool
-  try {
-    g_cub_allocator.reset(new cub::CachingDeviceAllocator(
-        FLAGS_caffe2_cub_bin_growth,
-        FLAGS_caffe2_cub_min_bin,
-        FLAGS_caffe2_cub_max_bin,
-        size_t(FLAGS_caffe2_cub_max_managed_mb) * 1024L * 1024L,
-        false,
-        FLAGS_caffe2_cub_print_allocation_events));
-  } catch (...) {
-    CAFFE_THROW("Some error happened at cub initialization.");
-  }
-  VLOG(1) << "Done setting up cub memory pool.";
-}
-
-static void Caffe2SetCUDAMemoryPool() {
-  if (FLAGS_caffe2_cuda_memory_pool == "" ||
-      FLAGS_caffe2_cuda_memory_pool == "none") {
-    g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
-  } else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
-    CAFFE_THROW("CNMEM is no longer used by Caffe2. Use cub instead. "
-                "This error message may go away in the future.");
-  } else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
-    // Sets up cub.
-    g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
-    SetUpCub();
-  } else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
-    g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
-    // Initialize caching allocator
-    at::globalContext().lazyInitCUDA();
-  } else {
-    CAFFE_THROW(
-        "Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
-  }
-}
-
-/**
- * An allocator that does the CPU memory allocation with pinned memory.
- *
- * This is needed because if we want to do any asynchronous cuda memcpy,
- * the underlying CPU memory also needs to be allocated into pinned memory
- * space. As a result, whenever Caffe2 is built with GPU and there is
- * GPU present during runtime, at global initialization time we will set
- * the CPU memory allocator to allocate pinned memory.
- *
- * NB: This behavior is probably too aggressive. We should consider asking users
- * to do on-demand memory pinning (like exposed in PyTorch APIs) instead.
- */
-struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
-  PinnedCPUAllocator() {
-    baseAllocator_ = GetDefaultCPUAllocator();
-  }
-  ~PinnedCPUAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) override {
-    if (nbytes == 0) {
-      // replicate c10::alloc_cpu behavior - return nullptr
-      return {nullptr, nullptr, &Delete, at::Device(CPU)};
-    }
-    void* data;
-    at::DataPtr data_ptr;
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    if (IsNUMAEnabled()) {
-      at::DeleterFnPtr expected_deleter = baseAllocator_->raw_deleter();
-      data_ptr = baseAllocator_->allocate(nbytes);
-      data = data_ptr.get();
-      CAFFE_ENFORCE(data);
-      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
-      CAFFE_ENFORCE(
-          data_ptr.compare_exchange_deleter(expected_deleter, &Delete),
-          "Failed to swap deleter (already swapped?)");
-    } else {
-      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
-      profiledCPUMemoryReporter().New(data, nbytes);
-      data_ptr = {data, data, &Delete, at::Device(CPU)};
-    }
-    memset(data, 0, nbytes);
-    return data_ptr;
-  }
-
-  at::DeleterFnPtr raw_deleter() const override {
-    return &Delete;
-  }
-
-  void copy_data(void* dest, const void* src, std::size_t count) const final {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for PinnedCPUAllocator");
-  }
-
- private:
-  static void Delete(void* data) {
-    if (!data) {
-      return;
-    }
-    // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
-    // or not. If a CUDAContext::New() call is made, inside the CUDAContext
-    // function we will switch the cpu side allocator to a PinnedCPUAllocator.
-    // But, if one calls CPUContext::New() before any cuda allocations,
-    // PinnedCPUAllocator can still delete the corresponding memory.
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    if (IsNUMAEnabled()) {
-      CUDA_ENFORCE(cudaHostUnregister(data));
-      GetDefaultCPUAllocator()->raw_deleter()(data);
-    } else {
-      cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaFreeHost(data));
-      profiledCPUMemoryReporter().Delete(data);
-      if (err == cudaErrorInvalidValue) {
-        free(data);
-        // Calling cudaGetLastError will reset the cuda error.
-        cudaError_t _err = cudaGetLastError();
-      } else {
-        // For all other errors, still do a cuda check.
-        CUDA_ENFORCE(err);
-      }
-    }
-  }
-
-  at::Allocator* baseAllocator_;
-};
-
-static PinnedCPUAllocator g_pinned_cpu_alloc;
-
-// An initialization function that sets the CPU side to use pinned cpu
-// allocator.
-void Caffe2UsePinnedCPUAllocator() {
-#if C10_ASAN_ENABLED
-  // Note(jiayq): for more details, see
-  //     https://github.com/google/sanitizers/issues/629
-  LOG(WARNING) << "There are known issues between address sanitizer and "
-                  "cudaMallocHost. As a result, caffe2 will not enable pinned "
-                  "memory allocation in asan mode. If you are expecting any "
-                  "behavior that depends on asan, be advised that it is not "
-                  "turned on.";
-#else
-  if (!HasCudaGPU()) {
-    VLOG(1) << "No GPU present. I won't use pinned allocator then.";
-    return;
-  }
-  VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
-
-  // If CUDA is enabled, using CPU allocators other than PinnedCPUAllocator
-  // will cause memory corruptions. Therefore, we need to set the priority
-  // to highest to avoid being overwritten.
-  SetCPUAllocator(
-      &g_pinned_cpu_alloc,
-      std::numeric_limits<uint8_t>::max() /* priority */);
-#endif
-}
-
-// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
-// detect the first hint that this Caffe2 run is going to use GPU: either
-// CUDAContext is initialized or CUDAContext::New is called. It then runs
-// all the related cuda initialization functions.
-namespace {
-struct Caffe2CudaInitializerHelper {
-  Caffe2CudaInitializerHelper() {
-    // We cannot use bool because nvcc changes bool to __nv_bool which does
-    // not have a std::atomic instantiation.
-    static std::atomic<char> first_call(1);
-    if (first_call.fetch_and((char)0)) {
-      Caffe2InitializeCuda();
-      Caffe2SetCUDAMemoryPool();
-      Caffe2UsePinnedCPUAllocator();
-    }
-  }
-};
-} // namespace
-
-/**
- * A utility function to rectify the gpu id. If the context specifies the
- * gpu id to be -1, it means that we will just use the current gpu id when
- * the function is being called.
- */
-static inline DeviceIndex RectifyGPUID(DeviceIndex gpu_id) {
-  return gpu_id == -1 ? CaffeCudaGetDevice() : gpu_id;
-}
-
-CUDAContext::CUDAContext(DeviceIndex gpu_id)
-    : gpu_id_(RectifyGPUID(gpu_id)), random_seed_(RandomNumberSeed()) {
-  static Caffe2CudaInitializerHelper g_cuda_initializer_;
-}
-
-CUDAContext::CUDAContext(const DeviceOption& option)
-    : gpu_id_(
-          option.has_device_id() ? RectifyGPUID(option.device_id())
-                                   : CaffeCudaGetDevice()),
-      random_seed_(
-          option.has_random_seed() ? option.random_seed()
-                                   : RandomNumberSeed()) {
-  static Caffe2CudaInitializerHelper g_cuda_initializer_;
-  TORCH_DCHECK_EQ(option.device_type(), PROTO_CUDA);
-}
-
-CUDAContext::~CUDAContext() {
-  try {
-    if (curand_generator_) {
-      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
-    }
-    // CUDAContext is used in 2 cases now:
-    // - long-lived instance inside OperatorBase in which case what happens in
-    //   destructor doesn't really matter
-    // - short-lived on-the-fly instances that are utilized as CUDAGuard - in
-    //   this case there's only one stream id (passed to SwitchToDevice) and
-    //   it's preferrable to synchronize in the destructor
-    FinishDeviceComputation();
-  } catch (const std::exception& e)  {
-    LOG(ERROR) << "Encountered following in " << __FUNCTION__ << ": " << e.what();
-  }
-}
-
-// shared mutex to lock out alloc / free during NCCL launches
-std::mutex& CUDAContext::mutex() {
-  static std::mutex m;
-  return m;
-}
-
-std::vector<long> CUDAContext::TotalMemoryByGpu() {
-  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-  CAFFE_ENFORCE(
-      FLAGS_caffe2_gpu_memory_tracking,
-      "Pass --caffe2_gpu_memory_tracking to enable memory stats");
-  return g_total_by_gpu_map;
-}
-
-std::vector<long> CUDAContext::MaxMemoryByGpu() {
-  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-  CAFFE_ENFORCE(
-      FLAGS_caffe2_gpu_memory_tracking,
-      "Pass --caffe2_gpu_memory_tracking to enable memory stats");
-  return g_max_by_gpu_map;
-}
-
-namespace {
-void TrackMemoryAlloc(size_t nbytes) {
-  int this_gpu = CaffeCudaGetDevice();
-  g_total_by_gpu_map[this_gpu] += nbytes;
-  g_max_by_gpu_map[this_gpu] =
-      std::max(g_max_by_gpu_map[this_gpu], g_total_by_gpu_map[this_gpu]);
-  g_total_mem += nbytes;
-  if (g_total_mem - g_last_rep >
-      FLAGS_caffe2_gpu_memory_report_interval_mb * 1024 * 1024) {
-    for (int gpu = 0; gpu < g_total_by_gpu_map.size(); gpu++) {
-      long t = g_total_by_gpu_map[gpu];
-      long max_t = g_max_by_gpu_map[gpu];
-      if (max_t > 0) {
-        if (max_t != t) {
-          VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB"
-                  << " (max: " << max_t / 1024 / 1024 << " MB)";
-        } else {
-          VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB";
-        }
-      }
-    }
-    VLOG(1) << "Total: " << g_total_mem / 1024 / 1024 << " MB";
-    g_last_rep = g_total_mem;
-  }
-}
-}
-
-struct DefaultCUDAAllocator final : public at::Allocator {
-  DefaultCUDAAllocator() {}
-  ~DefaultCUDAAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) override {
-    // Lock the mutex
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    // A one-time caffe2 cuda initializer.
-    static Caffe2CudaInitializerHelper g_cuda_initializer_;
-    void* ptr = nullptr;
-
-    if (FLAGS_caffe2_gpu_memory_tracking) {
-      TrackMemoryAlloc(nbytes);
-    }
-    switch (g_cuda_memory_pool_type) {
-      case CudaMemoryPoolType::NONE:
-        if (nbytes != 0) {
-          CUDA_ENFORCE(cudaMalloc(&ptr, nbytes));
-        }
-        if (FLAGS_caffe2_gpu_memory_tracking) {
-          g_size_map[ptr] = nbytes;
-          g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
-        }
-        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
-      case CudaMemoryPoolType::CUB:
-        if (nbytes != 0) {
-          CUDA_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
-        }
-        g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
-        VLOG(2) << "CUB allocating pointer " << ptr << " on device "
-                << CaffeCudaGetDevice();
-        if (FLAGS_caffe2_gpu_memory_tracking) {
-          g_size_map[ptr] = nbytes;
-        }
-        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
-      case CudaMemoryPoolType::THC:
-        {
-          // The reason we have this stream guard here is to preserve
-          // the historical behavior of the 'thc' allocator in Caffe2,
-          // which is to put all allocations on the same (default)
-          // stream.  This behavior is morally wrong (since passing
-          // allocations between streams allows for the possibility
-          // of you handing out some memory that an old stream
-          // is still working on), but it doesn't seem to cause issues
-          // in Caffe2 today.  Our hypothesis for why this is the case
-          // is that Caffe2 doesn't really do very many allocations
-          // on the fly; instead they allocate once and then reuse
-          // the allocations for the whole program.  In this case,
-          // the hazard is avoided.
-          //
-          // We intend to remove this stream guard, but the benefit
-          // to putting all allocations on the same stream is it
-          // reduces per-stream fragmentation, and this helps
-          // some models that are currently running with the thc
-          // allocator fit in memory.  We will need to find some
-          // way of resolving this problem.
-          c10::cuda::CUDAStreamGuard g(
-            Stream(
-              Stream::DEFAULT,
-              Device(kCUDA, CaffeCudaGetDevice())
-            ));
-          ptr = c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
-        }
-        if (FLAGS_caffe2_gpu_memory_tracking) {
-          g_size_map[ptr] = nbytes;
-          g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
-        }
-        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
-    }
-    return {nullptr, nullptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
-  }
-
-  at::DeleterFnPtr raw_deleter() const override {
-    return &Delete;
-  }
-
-  void copy_data(void* dest, const void* src, std::size_t count) const final {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for DefaultCUDAAllocator");
-  }
-
- private:
-  static void Delete(void* ptr) {
-    // lock the mutex
-    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    if (FLAGS_caffe2_gpu_memory_tracking) {
-      auto sz_it = g_size_map.find(ptr);
-      DCHECK(sz_it != g_size_map.end());
-      auto aff_it = g_cuda_device_affiliation.find(ptr);
-      DCHECK(aff_it != g_cuda_device_affiliation.end());
-      g_total_mem -= sz_it->second;
-      g_total_by_gpu_map[aff_it->second] -= sz_it->second;
-      g_size_map.erase(sz_it);
-    }
-
-    switch (g_cuda_memory_pool_type) {
-      case CudaMemoryPoolType::NONE: {
-        // If memory pool is not set up, use simple cudaFree.
-        cudaError_t error = C10_CUDA_ERROR_HANDLED(cudaFree(ptr));
-        // For some reason, in Python runtime we sometimes delete a data pointer
-        // after the cuda runtime exits - this is odd but is probably caused by
-        // a static workspace that pycaffe2 uses, and the destruction got
-        // entangled in some race condition. Anyway, since cuda runtime is
-        // exiting anyway, we will not need to worry about memory leak, so we
-        // basically ignore it. This is definitely not ideal but works for now.
-        if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
-          LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
-                     << cudaGetErrorString(error);
-        }
-
-        if (FLAGS_caffe2_gpu_memory_tracking) {
-          g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
-        }
-
-        break;
-      }
-      case CudaMemoryPoolType::CUB: {
-        auto it = g_cuda_device_affiliation.find(ptr);
-        DCHECK(it != g_cuda_device_affiliation.end());
-        VLOG(2) << "CUB freeing pointer " << ptr << " on device " << it->second;
-        CUDA_ENFORCE(g_cub_allocator->DeviceFree(it->second, ptr));
-        g_cuda_device_affiliation.erase(it);
-        break;
-      }
-      case CudaMemoryPoolType::THC: {
-        c10::cuda::CUDACachingAllocator::raw_delete(ptr);
-        if (FLAGS_caffe2_gpu_memory_tracking) {
-          g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
-        }
-        break;
-      }
-    }
-  }
-};
-
-static DefaultCUDAAllocator g_cuda_alloc;
-REGISTER_ALLOCATOR(CUDA, &g_cuda_alloc);
-
-} // namespace caffe2
-
-namespace at {
-REGISTER_COPY_BYTES_FUNCTION(
-    DeviceType::CUDA,
-    DeviceType::CUDA,
-    caffe2::CUDAContext::CopyBytesSync,
-    caffe2::CUDAContext::CopyBytesAsync);
-
-REGISTER_COPY_BYTES_FUNCTION(
-    DeviceType::CUDA,
-    DeviceType::CPU,
-    caffe2::CUDAContext::CopyBytesSync,
-    caffe2::CUDAContext::CopyBytesAsync);
-
-REGISTER_COPY_BYTES_FUNCTION(
-    DeviceType::CPU,
-    DeviceType::CUDA,
-    caffe2::CUDAContext::CopyBytesSync,
-    caffe2::CUDAContext::CopyBytesAsync);
-} // namespace at

diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
deleted file mode 100644
index 8490a50..0000000
--- a/caffe2/core/context_gpu.h
+++ /dev/null

@@ -1,354 +0,0 @@
-#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
-#define CAFFE2_CORE_CONTEXT_GPU_H_
-
-#include <ctime>
-#include <mutex>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/common_gpu.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/context_base.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/numa.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/types.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-// Since we are using the macro CAFFE2_USE_CUDNN, we will need to include this
-// file after common.h is included.
-#ifdef CAFFE2_USE_CUDNN
-#include "caffe2/core/common_cudnn.h"
-#endif // CAFFE2_USE_CUDNN
-
-#include <c10/core/Device.h>
-#include <c10/core/Stream.h>
-#include <c10/cuda/CUDAStream.h>
-#include <c10/cuda/CUDAGuard.h>
-
-namespace caffe2 {
-
-enum class CudaMemoryPoolType {
-  NONE = 0,
-  CUB = 1,
-  THC = 2,
-};
-
-/**
- * Gets the current memory pool type used by Caffe2.
- *
- * The memory pool is set up during caffe2's global initialization time.
- */
-CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType();
-
-/**
- * A struct to host thread-local cuda objects.
- *
- * In Caffe2, each thread has its own non-default cuda stream as well as
- * related objects such as cublas and curand handles. This is achieved by
- * having the ThreadLocalCUDAObjects wrapper that takes care of allocating
- * and deallocating these objects at the thread scope. This class is solely
- * used inside CUDAContext and should not be used externally.
- *
- * This class manages the mapping from logical stream ID (int stream_id
- * passed around in Caffe2) and CUDAStream objects.  We intend to eventually
- * deprecate the logical stream ID interface, but not for now.
- */
-class CAFFE2_CUDA_API ThreadLocalCUDAObjects {
-  friend class CUDAContext;
-
- private:
-  ThreadLocalCUDAObjects() {
-    for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) {
-      cuda_streams_[i] = vector<c10::cuda::CUDAStream>();
-    }
-  }
-
-  // Record current stream id for the current thread.
-  // This is the new API we're trying to migrate use cases to and get rid of
-  // explicit stream id passing. For now it's invoked in
-  // CUDAContext::SwitchToDevice
-  void SetCurrentStreamId(DeviceIndex gpu, StreamId stream_id) {
-    // TODO: use current device id from thread local instead of passing gpu in
-    if (stream_id != -1) {
-      c10::cuda::setCurrentCUDAStream(GetCUDAStream(gpu, stream_id));
-    }
-  }
-
-  // Retrieves the CUDAStream corresponding to a logical stream ID, ensuring
-  // that it exists in cuda_streams_ if it has not been allocated yet.
-  c10::cuda::CUDAStream GetCUDAStream(DeviceIndex gpu, StreamId stream_id) {
-    vector<c10::cuda::CUDAStream>& gpu_streams = cuda_streams_[gpu];
-    while (gpu_streams.size() <= static_cast<size_t>(stream_id)) {
-      // NB: This streams are not guaranteed to be unique; we'll
-      // wrap around once we run out of streams in the pool.
-      gpu_streams.emplace_back(c10::cuda::getStreamFromPool(/* high priority */ false, gpu));
-    }
-    return gpu_streams[stream_id];
-  }
-
-  // Uses the logical stream id from the thread local to pick the stream
-  // We're going to migrate all usages to this case API instead of passing the
-  // stream id directly
-  cudaStream_t GetStream(DeviceIndex gpu) {
-    return c10::cuda::getCurrentCUDAStream(gpu).stream();
-  }
-
-  cudaStream_t GetStream(DeviceIndex gpu, StreamId stream_id) {
-    return GetCUDAStream(gpu, stream_id).stream();
-  }
-
-  // Uses the logical stream id from the thread local to pick the stream
-  // We're going to migrate all usages to this case API instead of passing the
-  // stream id directly
-  cublasHandle_t GetHandle(DeviceIndex gpu) {
-    return GetHandle(c10::cuda::getCurrentCUDAStream(gpu));
-  }
-
-  cublasHandle_t GetHandle(c10::cuda::CUDAStream cuda_stream) {
-    CUDAGuard guard(cuda_stream.device_index());
-    // Default construct in the map if it doesn't exist, and return a mutable
-    // reference to it.
-    auto& r = cublas_handles_[cuda_stream];
-    if (r == nullptr) {
-      CUBLAS_ENFORCE(cublasCreate(&r));
-      // The default is CUBLAS_POINTER_MODE_HOST. You can override
-      // it after obtaining the cublas handle, but do that with
-      // caution.
-      CUBLAS_ENFORCE(cublasSetPointerMode(r, CUBLAS_POINTER_MODE_HOST));
-      CUBLAS_ENFORCE(cublasSetStream(r, cuda_stream));
-    }
-    return r;
-  }
-
-#ifdef CAFFE2_USE_CUDNN
-  // Uses the logical stream id from the thread local to pick the stream
-  // We're going to migrate all usages to this case API instead of passing the
-  // stream id directly
-  cudnnHandle_t GetCudnnHandle(DeviceIndex gpu) {
-    return GetCudnnHandle(c10::cuda::getCurrentCUDAStream(gpu));
-  }
-
-  cudnnHandle_t GetCudnnHandle(c10::cuda::CUDAStream cuda_stream) {
-    CUDAGuard guard(cuda_stream.device_index());
-    auto& r = cudnn_handles_[cuda_stream];
-    if (r == nullptr) {
-      CUDNN_ENFORCE(cudnnCreate(&r));
-      CUDNN_ENFORCE(cudnnSetStream(r, cuda_stream));
-    }
-    return r;
-  }
-#endif // CAFFE2_USE_CUDNN
-
-  ~ThreadLocalCUDAObjects() noexcept {
-    for (auto element : cublas_handles_) {
-      if (element.second) {
-        CUBLAS_CHECK(cublasDestroy(element.second));
-      }
-    }
-#ifdef CAFFE2_USE_CUDNN
-    for (auto element : cudnn_handles_) {
-      if (element.second) {
-#ifdef _WIN32
-        // this is because of something dumb in the ordering of
-        // destruction. Sometimes at exit, the cuda context would already
-        // be destroyed by the time this gets destroyed. This happens on
-        // windows with cuda 11 and cuda 12.
-        cudnnDestroy(element.second);
-#else
-        CUDNN_CHECK(cudnnDestroy(element.second));
-#endif // _WIN32
-      }
-    }
-#endif // CAFFE2_USE_CUDNN
-  }
-  // WARNING: mapping from logical stream ID to c10::cuda::CUDAStream
-  // is NOT bijective; multiple logical stream IDs may map to the
-  // same underlying stream ID.
-  vector<c10::cuda::CUDAStream> cuda_streams_[C10_COMPILE_TIME_MAX_GPUS];
-  std::unordered_map<c10::cuda::CUDAStream, cublasHandle_t> cublas_handles_;
-#ifdef CAFFE2_USE_CUDNN
-  std::unordered_map<c10::cuda::CUDAStream, cudnnHandle_t> cudnn_handles_;
-#endif // CAFFE2_USE_CUDNN
-};
-
-class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
- public:
-  // The default cuda context constructor.
-  explicit CUDAContext(DeviceIndex gpu_id = -1);
-  explicit CUDAContext(const DeviceOption& option);
-  explicit CUDAContext(Device device)
-      : CUDAContext(DeviceToOption(device)) {}
-
-  ~CUDAContext() override;
-
-  inline void SwitchToDevice(StreamId stream_id) override {
-    getCudaObjects().SetCurrentStreamId(gpu_id_, stream_id);
-    CaffeCudaSetDevice(gpu_id_);
-  }
-
-  // void SwitchToDevice()
-  using BaseContext::SwitchToDevice;
-
-  inline void WaitEvent(const Event& ev) override {
-    ev.Wait(CUDA, this);
-  }
-
-  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
-    CAFFE_ENFORCE(ev, "Event must not be null.");
-    ev->Record(CUDA, this, err_msg);
-  }
-
-  // Note on current use cases:
-  // FinishDeviceComputation must be called on the same cpu thread as
-  // SwitchToDevice()
-  void FinishDeviceComputation() override {
-    CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
-  }
-
-  inline int device_id() const {
-    return gpu_id_;
-  }
-
-  inline c10::cuda::CUDAStream stream() const {
-    return at::cuda::getStreamFromExternal(getCudaObjects().GetStream(gpu_id_), gpu_id_);
-  }
-
-  inline cudaStream_t cuda_stream() const {
-    return getCudaObjects().GetStream(gpu_id_);
-  }
-
-  static cudaStream_t cuda_stream(DeviceIndex gpu_id, StreamId stream_id) {
-    return getCudaObjects().GetStream(gpu_id, stream_id);
-  }
-
-  cublasHandle_t cublas_handle() {
-    return getCudaObjects().GetHandle(gpu_id_);
-  }
-
-#ifdef CAFFE2_USE_CUDNN
-  cudnnHandle_t cudnn_handle() {
-    return getCudaObjects().GetCudnnHandle(gpu_id_);
-  }
-#endif // CAFFE2_USE_CUDNN
-
-  curandGenerator_t& curand_generator() {
-    if (!curand_generator_) {
-      CUDAGuard guard(gpu_id_);
-      CURAND_ENFORCE(
-          curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
-      CURAND_ENFORCE(
-          curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
-      TORCH_CHECK_NOTNULL(curand_generator_);
-    }
-    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
-    return curand_generator_;
-  }
-
-  inline static at::DataPtr New(size_t nbytes) {
-    return GetAllocator(CUDA)->allocate(nbytes);
-  }
-
-  // Get a mutex to lock out cudaMalloc / cudaFree calls when
-  // NCCL kernels are being launched. Should remove threat of
-  // deadlocks
-  static std::mutex& mutex();
-
-  // Functions to query memory stats. Only available if flag
-  // --caffe2_gpu_memory_tracking is enabled.
-  static std::vector<long> TotalMemoryByGpu();
-  static std::vector<long> MaxMemoryByGpu();
-
-  template <class SrcContext, class DstContext>
-  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
-    CUDA_ENFORCE(cudaMemcpyAsync(
-        dst,
-        src,
-        nbytes,
-        cudaMemcpyDefault,
-        getCudaObjects().GetStream(gpu_id_)));
-  }
-
-  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
-    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
-    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
-  }
-
-  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
-    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
-  }
-
-  template <typename T, class SrcContext, class DstContext>
-  inline void Copy(int n, const T* src, T* dst) {
-    CopyBytes<SrcContext, DstContext>(n * sizeof(T),
-                                 static_cast<const void*>(src),
-                                 static_cast<void*>(dst));
-  }
-
-  template <class SrcContext, class DstContext>
-  inline void
-  CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
-    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
-    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
-  }
-
-  static void CopyBytesAsync(
-      size_t nbytes,
-      const void* src,
-      Device src_device,
-      void* dst,
-      Device dst_device);
-  static void CopyBytesSync(
-      size_t nbytes,
-      const void* src,
-      Device src_device,
-      void* dst,
-      Device dst_device);
-
-  // By default CUDA operators have async device parts
-  static bool HasAsyncPartDefault() {
-    return true;
-  }
-
-  static bool SupportsAsyncScheduling() {
-    return true;
-  }
-
-  static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
-    const auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
-    const auto status = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream));
-    if (status == cudaErrorNotReady) {
-      // ignore and clear the error if not ready
-      C10_CUDA_CLEAR_ERROR();
-    } else {
-      C10_CUDA_CHECK(status); // Reraise error
-    }
-    return status == cudaSuccess;
-  }
-
-  at::Device device() const override {
-    return at::Device(CUDA, gpu_id_);
-  }
-
-  DeviceType device_type() const override {
-    return CUDA;
-  }
-
-  static constexpr DeviceType GetDeviceType() {
-    return CUDA;
-  }
-
- protected:
-  int gpu_id_;
-  int random_seed_;
-  curandGenerator_t curand_generator_{nullptr};
-  static ThreadLocalCUDAObjects& getCudaObjects();
-};
-
-using TensorCUDA = Tensor;
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_CORE_CONTEXT_GPU_H_

diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc
deleted file mode 100644
index 82000de..0000000
--- a/caffe2/core/event_gpu.cc
+++ /dev/null

@@ -1,227 +0,0 @@
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/core/event_cpu.h"
-#include "caffe2/core/operator.h"
-
-#include <atomic>
-#include <iostream>
-
-namespace caffe2 {
-
-struct CudaEventWrapper {
-  explicit CudaEventWrapper(const DeviceOption& option)
-      : cuda_stream_(nullptr),
-        device_id_(option.device_id()),
-        status_(EventStatus::EVENT_INITIALIZED) {
-    CAFFE_ENFORCE(option.device_type(), PROTO_CUDA);
-    CUDAGuard g(device_id_);
-    try {
-      CUDA_ENFORCE(cudaEventCreateWithFlags(
-          &cuda_event_, cudaEventDefault | cudaEventDisableTiming));
-    } catch (const Error&) {
-      std::cerr << "ERROR: Failed to load CUDA.\n"
-                << "HINT: Check that this binary contains GPU code."
-                << std::endl;
-      throw;
-    }
-  }
-  ~CudaEventWrapper() {
-    CUDAGuard g(device_id_);
-    CUDA_CHECK(cudaEventDestroy(cuda_event_));
-  }
-
-  cudaEvent_t cuda_event_;
-  cudaStream_t cuda_stream_;
-  int device_id_;
-
-  std::atomic<int> status_;
-  std::mutex mutex_recorded_;
-  std::condition_variable cv_recorded_;
-  std::string err_msg_;
-};
-
-namespace {
-const std::string kNoError = "No error";
-}
-
-void EventCreateCUDA(const DeviceOption& option, Event* event) {
-  event->event_ = std::make_shared<CudaEventWrapper>(option);
-}
-
-void EventRecordCUDA(Event* event, const void* context, const char* err_msg) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  {
-    std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-
-    // Possible state changes:
-    //  INITIALIZED -> SCHEDULED/FAILED
-    //  SCHEDULED -> SUCCESS/FAILED
-    //  SUCCESS/FAILED - terminal
-    //
-    // No further changes to cuda_event_ and cuda_stream_ after transitioning
-    // from INITIALIZED
-    // No further changes to err_msg_ after transitioning into FAILED
-
-    CAFFE_ENFORCE_EQ(
-        wrapper->status_,
-        EventStatus::EVENT_INITIALIZED,
-        "Calling Record multiple times");
-
-    if (!err_msg) {
-      // When recording, one needs to make sure that the current gpu id is
-      // correct.
-      // TODO(jiayq): move the enforce logic to the caller?
-      const auto& current_device = CaffeCudaGetDevice();
-      CAFFE_ENFORCE_EQ(
-          current_device,
-          wrapper->device_id_,
-          "When you call EventRecordCUDA, your current device should be the same "
-          "as the device specified by the event.");
-      CAFFE_ENFORCE_EQ(
-          current_device,
-          static_cast<const CUDAContext*>(context)->device_id());
-      CUDA_ENFORCE(cudaEventRecord(
-          wrapper->cuda_event_,
-          static_cast<const CUDAContext*>(context)->cuda_stream()));
-      wrapper->cuda_stream_ =
-          static_cast<const CUDAContext*>(context)->cuda_stream();
-      wrapper->status_ = EventStatus::EVENT_SCHEDULED;
-    } else {
-      wrapper->err_msg_ = err_msg;
-      wrapper->status_ = EventStatus::EVENT_FAILED;
-    }
-  }
-  wrapper->cv_recorded_.notify_all();
-}
-
-void EventFinishCUDA(const Event* event) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  {
-    std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-    while (wrapper->status_ == EventStatus::EVENT_INITIALIZED) {
-      wrapper->cv_recorded_.wait(lock);
-    }
-  }
-
-  if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
-    // ok, even if event is already completed and status was not yet updated
-    CUDAGuard g(wrapper->device_id_);
-    auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_);
-    if (cudaResult == cudaSuccess) {
-      wrapper->status_ = EventStatus::EVENT_SUCCESS;
-    } else {
-      const auto& err_msg = cudaGetErrorString(cudaResult);
-
-      std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-      wrapper->err_msg_ = err_msg;
-      wrapper->status_ = EventStatus::EVENT_FAILED;
-    }
-  }
-}
-
-// Both waiter and event are CUDA. Non-blocking
-void EventWaitCUDACUDA(const Event* event, void* context) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  {
-    std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-    while (wrapper->status_ == EventStatus::EVENT_INITIALIZED) {
-      wrapper->cv_recorded_.wait(lock);
-    }
-  }
-
-  if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
-    // ok, even if event is already completed and status was not yet updated
-    auto context_stream = static_cast<CUDAContext*>(context)->cuda_stream();
-    auto event_stream = wrapper->cuda_stream_;
-    if (context_stream != event_stream) {
-      // CAFFE_ENFORCE_EQ(
-      //    CaffeCudaGetDevice(),
-      //    static_cast<const CUDAContext*>(context)->device_id());
-      CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0));
-    }
-  }
-}
-
-// Waiter is CPU, event is CUDA
-void EventWaitCPUCUDA(const Event* event, void* context) {
-  EventFinishCUDA(event);
-}
-
-// Waiter is CUDA, event is CPU
-void EventWaitCUDACPU(const Event* event, void* context) {
-  event->Finish(); // calls EventFinishCPU
-}
-
-EventStatus EventQueryCUDA(const Event* event) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
-    auto cudaResult = cudaEventQuery(wrapper->cuda_event_);
-    if (cudaResult == cudaSuccess) {
-      wrapper->status_ = EventStatus::EVENT_SUCCESS;
-    } else if (cudaResult != cudaErrorNotReady) {
-      const auto& err_msg = cudaGetErrorString(cudaResult);
-
-      std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-      wrapper->err_msg_ = err_msg;
-      wrapper->status_ = EventStatus::EVENT_FAILED;
-    } else {
-      // ignore and clear the error if not ready
-      (void)cudaGetLastError();
-    }
-  }
-  return static_cast<EventStatus>(wrapper->status_.load());
-}
-
-const std::string& EventErrorMessageCUDA(const Event* event) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  // supposed to be called after EventQueryCUDA to update status first
-  if (wrapper->status_ == EventStatus::EVENT_FAILED) {
-    return wrapper->err_msg_;
-  } else {
-    return kNoError;
-  }
-}
-
-void EventSetFinishedCUDA(const Event* event, const char* err_msg) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  {
-    std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-
-    CAFFE_ENFORCE_EQ(
-        wrapper->status_,
-        EventStatus::EVENT_INITIALIZED,
-        "Calling SetFinished on recorded CUDA event");
-
-    if (!err_msg) {
-      wrapper->status_ = EventStatus::EVENT_SUCCESS;
-    } else {
-      wrapper->err_msg_ = err_msg;
-      wrapper->status_ = EventStatus::EVENT_FAILED;
-    }
-  }
-  wrapper->cv_recorded_.notify_all();
-}
-
-void EventResetCUDA(Event* event) {
-  auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
-  std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-  wrapper->status_ = EventStatus::EVENT_INITIALIZED;
-  wrapper->err_msg_ = "";
-  wrapper->cuda_stream_ = nullptr;
-}
-
-REGISTER_EVENT_CREATE_FUNCTION(CUDA, EventCreateCUDA);
-REGISTER_EVENT_RECORD_FUNCTION(CUDA, EventRecordCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, CUDA, EventWaitCUDACUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CPU, CUDA, EventWaitCPUCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, CPU, EventWaitCUDACPU);
-REGISTER_EVENT_FINISH_FUNCTION(CUDA, EventFinishCUDA);
-
-REGISTER_EVENT_QUERY_FUNCTION(CUDA, EventQueryCUDA);
-REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(CUDA, EventErrorMessageCUDA);
-REGISTER_EVENT_SET_FINISHED_FUNCTION(CUDA, EventSetFinishedCUDA);
-REGISTER_EVENT_RESET_FUNCTION(CUDA, EventResetCUDA);
-
-REGISTER_EVENT_WAIT_FUNCTION(MKLDNN, CUDA, EventWaitCPUCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, MKLDNN, EventWaitCUDACPU);
-
-} // namespace caffe2

diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h
deleted file mode 100644
index 54f1f41..0000000
--- a/caffe2/core/flags.h
+++ /dev/null

@@ -1,4 +0,0 @@
-#pragma once
-
-#include "c10/util/Flags.h"
-#include "caffe2/core/common.h"

diff --git a/caffe2/core/hip/common_miopen.h b/caffe2/core/hip/common_miopen.h
deleted file mode 100644
index 6901055..0000000
--- a/caffe2/core/hip/common_miopen.h
+++ /dev/null

@@ -1,178 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef CAFFE2_CORE_COMMON_MIOPEN_H_
-#define CAFFE2_CORE_COMMON_MIOPEN_H_
-
-#include <array>
-#include <mutex>
-#include "miopen/miopen.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/types.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#define MIOPEN_VERSION 1399
-
-namespace caffe2 {
-
-namespace internal {
-/**
- * A helper function to obtain miopen error strings.
- */
-inline const char* miopenGetErrorString(miopenStatus_t status)
-{
-    switch(status)
-    {
-    case miopenStatusSuccess: return "MIOPEN_STATUS_SUCCESS";
-    case miopenStatusNotInitialized: return "MIOPEN_STATUS_NOT_INITIALIZED";
-    case miopenStatusAllocFailed: return "MIOPEN_STATUS_ALLOC_FAILED";
-    case miopenStatusBadParm: return "MIOPEN_STATUS_BAD_PARAM";
-    case miopenStatusInternalError: return "MIOPEN_STATUS_INTERNAL_ERROR";
-    case miopenStatusInvalidValue: return "MIOPEN_STATUS_INVALID_VALUE";
-    case miopenStatusNotImplemented: return "MIOPEN_STATUS_NOT_SUPPORTED";
-    case miopenStatusUnknownError: return "MIOPEN_STATUS_UNKNOWN_ERROR";
-    default: return "MIOPEN_STATUS_UNKNOWN_ERROR";
-    }
-}
-} // namespace internal
-
-// A macro that wraps around a miopen statement so we can check if the miopen
-// execution finishes or not.
-#define MIOPEN_ENFORCE(condition)                                           \
-    do                                                                      \
-    {                                                                       \
-        miopenStatus_t status = condition;                                  \
-        CAFFE_ENFORCE_EQ(status,                                            \
-                         miopenStatusSuccess,                               \
-                         ", Error at: ",                                    \
-                         __FILE__,                                          \
-                         ":",                                               \
-                         __LINE__,                                          \
-                         ": ",                                              \
-                         ::caffe2::internal::miopenGetErrorString(status)); \
-    } while(0)
-#define MIOPEN_CHECK(condition)                                                                   \
-    do                                                                                            \
-    {                                                                                             \
-        miopenStatus_t status = condition;                                                        \
-        CHECK(status == miopenStatusSuccess) << ::caffe2::internal::miopenGetErrorString(status); \
-    } while(0)
-
-// report the version of miopen Caffe2 was compiled with
-inline size_t miopenCompiledVersion() { return MIOPEN_VERSION; }
-
-// report the runtime version of miopen
-inline size_t miopenRuntimeVersion() { return MIOPEN_VERSION; }
-
-// Check compatibility of compiled and runtime miopen versions
-inline void CheckMIOPENVersions() {}
-
-/**
- * miopenTypeWrapper is a wrapper class that allows us to refer to the miopen type
- * in a template function. The class is specialized explicitly for different
- * data types below.
- */
-template <typename T>
-class miopenTypeWrapper;
-
-template <>
-class miopenTypeWrapper<float>
-{
-    public:
-    static const miopenDataType_t type = miopenFloat;
-    typedef const float ScalingParamType;
-    typedef float BNParamType;
-    static ScalingParamType* kOne()
-    {
-        static ScalingParamType v = 1.0;
-        return &v;
-    }
-    static const ScalingParamType* kZero()
-    {
-        static ScalingParamType v = 0.0;
-        return &v;
-    }
-};
-
-template <>
-class miopenTypeWrapper<at::Half>
-{
-    public:
-    static const miopenDataType_t type = miopenHalf;
-    typedef const float ScalingParamType;
-    typedef float BNParamType;
-    static ScalingParamType* kOne()
-    {
-        static ScalingParamType v = 1.0;
-        return &v;
-    }
-    static ScalingParamType* kZero()
-    {
-        static ScalingParamType v = 0.0;
-        return &v;
-    }
-};
-
-/**
- * miopenTensorDescWrapper is the placeholder that wraps around a
- * miopenTensorDescriptor_t, allowing us to do descriptor change as-needed during
- * runtime.
- */
-class miopenTensorDescWrapper
-{
-    public:
-    miopenTensorDescWrapper() { MIOPEN_ENFORCE(miopenCreateTensorDescriptor(&desc_)); }
-    ~miopenTensorDescWrapper() noexcept { MIOPEN_CHECK(miopenDestroyTensorDescriptor(desc_)); }
-
-    inline miopenTensorDescriptor_t
-    Descriptor(const miopenDataType_t type, const vector<int>& dims, bool* changed)
-    {
-        if(type_ == type && dims_ == dims)
-        {
-            // if not changed, simply return the current descriptor.
-            if(changed)
-                *changed = false;
-            return desc_;
-        }
-        CAFFE_ENFORCE_EQ(
-            dims.size(), 4, "MIOPEN currently only support 4-dimensional tensor descriptor");
-
-        type_ = type;
-        dims_ = dims;
-        MIOPEN_ENFORCE(
-            miopenSet4dTensorDescriptor(desc_, type, dims_[0], dims_[1], dims_[2], dims_[3]));
-        if(changed)
-            *changed = true;
-        return desc_;
-    }
-
-    template <typename T>
-    inline miopenTensorDescriptor_t Descriptor(const StorageOrder& order, const vector<int>& dims)
-    {
-        return Descriptor(miopenTypeWrapper<T>::type, dims, nullptr);
-    }
-
-    private:
-    miopenTensorDescriptor_t desc_;
-    miopenDataType_t type_;
-    vector<int> dims_;
-    C10_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper);
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_MIOPEN_H_

diff --git a/caffe2/core/hip/common_miopen.hip b/caffe2/core/hip/common_miopen.hip
deleted file mode 100644
index a617bad..0000000
--- a/caffe2/core/hip/common_miopen.hip
+++ /dev/null

@@ -1,42 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "caffe2/core/hip/common_miopen.h"
-#include "caffe2/core/hip/miopen_wrapper.h"
-
-#include "caffe2/core/init.h"
-
-namespace caffe2 {
-
-MIOPENWrapper::PerGPUMIOPENStates& MIOPENWrapper::miopen_states()
-{
-    // New it (never delete) to avoid calling the destructors on process
-    // exit and racing against the CUDA shutdown sequence.
-    static auto* p = new MIOPENWrapper::PerGPUMIOPENStates();
-    TORCH_CHECK_NOTNULL(p);
-    return *p;
-}
-
-namespace {
-bool PrintMIOPENInfo(int*, char***)
-{
-    VLOG(1) << "Caffe2 is built with MIOPEN version " << MIOPEN_VERSION;
-    return true;
-}
-
-REGISTER_CAFFE2_INIT_FUNCTION(PrintMIOPENInfo, &PrintMIOPENInfo, "Print MIOPEN Info.");
-
-} // namespace
-} // namespace caffe2

diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h
deleted file mode 100644
index f60bed6..0000000
--- a/caffe2/core/hip/miopen_wrapper.h
+++ /dev/null

@@ -1,166 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-#ifndef CAFFE2_CORE_MIOPEN_WRAPPERS_H_
-#define CAFFE2_CORE_MIOPEN_WRAPPERS_H_
-
-#include "caffe2/core/hip/common_miopen.h"
-#include "caffe2/core/hip/context_gpu.h"
-
-#include <c10/hip/HIPGuard.h>
-
-namespace caffe2 {
-
-class MIOPENWrapper;
-
-/**
- * MIOPENWorkspace is a wrapper around a raw cuda pointer that holds the miopen
- * scratch space. This struct is meant to be only used in MIOPENWrapper to
- * provide a program-wide scratch space for MIOPEN. The reason behind it is that
- * miopen function calls are usually very efficient, hence one probably does not
- * want to run multiple miopen calls at the same time. As a result, one should
- * not need more than one miopen workspace per device.
- */
-struct MIOPENWorkspace
-{
-    ~MIOPENWorkspace() noexcept {}
-
-    void* get(size_t nbytes)
-    {
-        if(nbytes_ < nbytes)
-        {
-            reset();
-            data_ = HIPContext::New(nbytes);
-            nbytes_               = nbytes;
-        }
-        CAFFE_ENFORCE_GE(nbytes_, nbytes);
-        return data_.get();
-    }
-
-    void reset()
-    {
-      data_.clear();
-      nbytes_ = 0;
-    }
-
-    private:
-     at::DataPtr data_;
-     size_t nbytes_{0};
-};
-
-// MIOPENState is the owner of the MIOPENWorkspace, and serializes all
-// executions of operations that use the state onto it's own stream
-// (so multiple Net workers can reuse the same workspace from
-// different threads and HIP streams).
-class MIOPENState
-{
-    public:
-    explicit MIOPENState(size_t gpu_id) : gpu_id_(gpu_id)
-    {
-        HIPGuard g(gpu_id_);
-        MIOPEN_ENFORCE(miopenCreate(&miopen_handle_));
-        HIP_ENFORCE(hipEventCreate(&before_));
-        HIP_ENFORCE(hipEventCreate(&after_));
-        HIP_ENFORCE(hipStreamCreate(&stream_));
-        MIOPEN_ENFORCE(miopenSetStream(miopen_handle_, stream_));
-    }
-
-    ~MIOPENState() noexcept
-    {
-        HIPGuard g(gpu_id_);
-        MIOPEN_CHECK(miopenDestroy(miopen_handle_));
-        HIP_CHECK(hipStreamDestroy(stream_));
-        HIP_CHECK(hipEventDestroy(after_));
-        HIP_CHECK(hipEventDestroy(before_));
-    }
-
-    miopenHandle_t& miopen_handle() { return miopen_handle_; }
-
-    MIOPENWorkspace& workspace() { return workspace_; }
-
-    template <typename F>
-    void execute(hipStream_t stream, F&& f)
-    {
-        HIP_ENFORCE(hipEventRecord(before_, stream));
-        HIP_ENFORCE(hipStreamWaitEvent(stream_, before_, 0));
-        f(this);
-        HIP_ENFORCE(hipEventRecord(after_, stream_));
-        HIP_ENFORCE(hipStreamWaitEvent(stream, after_, 0));
-    }
-
-    private:
-    miopenHandle_t miopen_handle_{nullptr};
-    hipEvent_t before_{nullptr};
-    hipEvent_t after_{nullptr};
-    hipStream_t stream_{nullptr};
-    MIOPENWorkspace workspace_;
-    size_t gpu_id_{0};
-    C10_DISABLE_COPY_AND_ASSIGN(MIOPENState);
-};
-
-/**
- * MIOPENWrapper is a class that wraps the miopen handles and miopen workspaces.
- *
- * The wrapper ensures that for each thread and each gpu, there is one
- * identical miopen handle, which is also associated with the thread-local
- * per-device hip stream. The wrapper also hosts the device-specific miopen
- * workspace (scratch space for some miopen functions).
- *
- */
-class MIOPENWrapper
-{
-    public:
-    /**
-     * Creates a miopen wrapper associated with a HIPContext object. Note that
-     * the HIPContext object should outlive the MIOPENWrapper.
-     */
-    explicit MIOPENWrapper(HIPContext* context) : context_(context) {}
-
-    /**
-     * Returns the inline miopen handle that executes on the current
-     * thread's hip_stream.
-     */
-    miopenHandle_t inline_miopen_handle() { return context_->miopen_handle(); }
-
-    // Executes the closure F on the MIOPENState associated with state_idx
-    template <typename F>
-    void with_miopen_state(size_t state_idx, F&& f)
-    {
-        CAFFE_ENFORCE(state_idx < CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES, "Invalid state_idx");
-        auto& sync_state = miopen_states()[context_->device_id()][state_idx];
-
-        HIPGuard dg(context_->device_id());
-
-        // We need to serialize execution on the MIOPENState as we can't
-        // allow multiple threads to race through the cudaEventRecord
-        // calls (so a worker thread might wait on another worker thread's
-        // execution)
-        std::lock_guard<std::mutex> g(sync_state.mutex);
-        if(!sync_state.state.get())
-        {
-          sync_state.state.reset(new MIOPENState(context_->device_id()));
-        }
-        TORCH_CHECK_NOTNULL(sync_state.state.get())->execute(context_->hip_stream(), f);
-    }
-
-    protected:
-    // Pointer to an external cuda context that the miopen wrapper will use.
-    HIPContext* context_;
-
-    static constexpr size_t CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES = 4;
-
-    struct SyncedMIOPENState
-    {
-        std::mutex mutex;
-        std::unique_ptr<MIOPENState> state;
-    };
-
-    using PerGPUMIOPENStates = std::array<
-        std::array<SyncedMIOPENState, CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES>,
-        C10_COMPILE_TIME_MAX_GPUS>;
-    static PerGPUMIOPENStates& miopen_states();
-
-    C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper);
-};
-
-}; // namespace caffe2
-
-#endif

diff --git a/caffe2/core/init.h b/caffe2/core/init.h
deleted file mode 100644
index 8d0fbd3..0000000
--- a/caffe2/core/init.h
+++ /dev/null

@@ -1,179 +0,0 @@
-#ifndef CAFFE2_CORE_INIT_H_
-#define CAFFE2_CORE_INIT_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/flags.h"
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-namespace internal {
-class TORCH_API Caffe2InitializeRegistry {
- public:
-  typedef bool (*InitFunction)(int*, char***);
-  // Registry() is defined in .cpp file to make registration work across
-  // multiple shared libraries loaded with RTLD_LOCAL
-  static Caffe2InitializeRegistry* Registry();
-
-  void Register(
-      InitFunction function,
-      bool run_early,
-      const char* description,
-      const char* name = nullptr) {
-    if (name) {
-      named_functions_[name] = function;
-    }
-    if (run_early) {
-      // Disallow registration after GlobalInit of early init functions
-      CAFFE_ENFORCE(!early_init_functions_run_yet_);
-      early_init_functions_.emplace_back(function, description);
-    } else {
-      if (init_functions_run_yet_) {
-        // Run immediately, since GlobalInit already ran. This should be
-        // rare but we want to allow it in some cases.
-        LOG(WARNING) << "Running init function after GlobalInit: "
-                     << description;
-        // TODO(orionr): Consider removing argc and argv for non-early
-        // registration. Unfortunately that would require a new InitFunction
-        // typedef, so not making the change right now.
-        //
-        // Note that init doesn't receive argc and argv, so the function
-        // might fail and we want to raise an error in that case.
-        int argc = 0;
-        char** argv = nullptr;
-        bool success = (function)(&argc, &argv);
-        CAFFE_ENFORCE(success);
-      } else {
-        // Wait until GlobalInit to run
-        init_functions_.emplace_back(function, description);
-      }
-    }
-  }
-
-  bool RunRegisteredEarlyInitFunctions(int* pargc, char*** pargv) {
-    CAFFE_ENFORCE(!early_init_functions_run_yet_);
-    early_init_functions_run_yet_ = true;
-    return RunRegisteredInitFunctionsInternal(
-        early_init_functions_, pargc, pargv);
-  }
-
-  bool RunRegisteredInitFunctions(int* pargc, char*** pargv) {
-    CAFFE_ENFORCE(!init_functions_run_yet_);
-    init_functions_run_yet_ = true;
-    return RunRegisteredInitFunctionsInternal(init_functions_, pargc, pargv);
-  }
-
-  bool RunNamedFunction(const char* name, int* pargc, char*** pargv) {
-    if (named_functions_.count(name)) {
-      return named_functions_[name](pargc, pargv);
-    }
-    return false;
-  }
-
- private:
-  // Run all registered initialization functions. This has to be called AFTER
-  // all static initialization are finished and main() has started, since we are
-  // using logging.
-  bool RunRegisteredInitFunctionsInternal(
-      vector<std::pair<InitFunction, const char*>>& functions,
-      int* pargc, char*** pargv) {
-    for (const auto& init_pair : functions) {
-      VLOG(1) << "Running init function: " << init_pair.second;
-      if (!(*init_pair.first)(pargc, pargv)) {
-        LOG(ERROR) << "Initialization function failed.";
-        return false;
-      }
-    }
-    return true;
-  }
-
-  Caffe2InitializeRegistry() {}
-  vector<std::pair<InitFunction, const char*> > early_init_functions_;
-  vector<std::pair<InitFunction, const char*> > init_functions_;
-  std::unordered_map<std::string, InitFunction> named_functions_;
-  bool early_init_functions_run_yet_ = false;
-  bool init_functions_run_yet_ = false;
-};
-}  // namespace internal
-
-TORCH_API bool unsafeRunCaffe2InitFunction(
-    const char* name,
-    int* pargc = nullptr,
-    char*** pargv = nullptr);
-
-class TORCH_API InitRegisterer {
- public:
-  InitRegisterer(
-      internal::Caffe2InitializeRegistry::InitFunction function,
-      bool run_early,
-      const char* description,
-      const char* name = nullptr) {
-    internal::Caffe2InitializeRegistry::Registry()->Register(
-        function, run_early, description, name);
-  }
-};
-
-#define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description)         \
-  namespace {                                                              \
-  ::caffe2::InitRegisterer                                                 \
-      g_caffe2_initregisterer_##name(function, false, description, #name); \
-  } // namespace
-
-#define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description)  \
-  namespace {                                                             \
-  ::caffe2::InitRegisterer                                                \
-      g_caffe2_initregisterer_##name(function, true, description, #name); \
-  } // namespace
-
-/**
- * @brief Determine whether GlobalInit has already been run
- */
-TORCH_API bool GlobalInitAlreadyRun();
-
-class TORCH_API GlobalInitIsCalledGuard {
- public:
-  GlobalInitIsCalledGuard() {
-    if (!GlobalInitAlreadyRun()) {
-      LOG(WARNING)
-          << "Caffe2 GlobalInit should be run before any other API calls.";
-    }
-  }
-};
-
-/**
- * @brief Initialize the global environment of caffe2.
- *
- * Caffe2 uses a registration pattern for initialization functions. Custom
- * initialization functions should take the signature
- *     bool (*func)(int*, char***)
- * where the pointers to argc and argv are passed in. Caffe2 then runs the
- * initialization in three phases:
- * (1) Functions registered with REGISTER_CAFFE2_EARLY_INIT_FUNCTION. Note that
- *     since it is possible the logger is not initialized yet, any logging in
- *     such early init functions may not be printed correctly.
- * (2) Parses Caffe-specific commandline flags, and initializes caffe logging.
- * (3) Functions registered with REGISTER_CAFFE2_INIT_FUNCTION.
- * If there is something wrong at each stage, the function returns false. If
- * the global initialization has already been run, the function returns false
- * as well.
- *
- * GlobalInit is re-entrant safe; a re-entrant call will no-op and exit.
- *
- * GlobalInit is safe to call multiple times but not idempotent;
- * successive calls will parse flags and re-set caffe2 logging levels from
- * flags as needed, but NOT re-run early init and init functions.
- *
- * GlobalInit is also thread-safe and can be called concurrently.
- */
-TORCH_API bool GlobalInit(int* pargc, char*** argv);
-
-/**
- * @brief Initialize the global environment without command line arguments
- *
- * This is a version of the GlobalInit where no argument is passed in.
- * On mobile devices, use this global init, since we cannot pass the
- * command line options to caffe2, no arguments are passed.
- */
-TORCH_API bool GlobalInit();
-}  // namespace caffe2
-#endif  // CAFFE2_CORE_INIT_H_

diff --git a/caffe2/core/net.h b/caffe2/core/net.h
deleted file mode 100644
index 0726d8e..0000000
--- a/caffe2/core/net.h
+++ /dev/null

@@ -1,175 +0,0 @@
-#ifndef CAFFE2_CORE_NET_H_
-#define CAFFE2_CORE_NET_H_
-
-#include <atomic>
-#include <climits>
-#include <cstddef>
-#include <thread> // NOLINT
-#include <typeinfo>
-#include <unordered_map>
-#include <vector>
-
-#include "c10/core/thread_pool.h"
-#include "c10/util/Registry.h"
-#include "caffe2/core/blob.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/observer.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/simple_queue.h"
-
-C10_DECLARE_string(caffe2_override_executor);
-
-namespace caffe2 {
-
-class NetBase;
-typedef ObserverBase<NetBase> NetObserver;
-typedef std::function<std::unique_ptr<NetObserver>(NetBase*)>
-    NetObserverCreator;
-
-class OperatorBase;
-class Workspace;
-
-// Net is a thin struct that owns all the operators together with the operator
-// contexts.
-class TORCH_API NetBase : public Observable<NetBase> {
- public:
-  NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
-  virtual ~NetBase() noexcept {}
-
-  virtual bool SupportsAsync() = 0;
-  inline const vector<const Event*>& events() const {
-    return events_;
-  }
-
-  virtual void Wait() {
-    // by default just wait till all events are finished
-    for (const auto& event : events_) {
-      event->Finish();
-    }
-  }
-
-  virtual bool Run() {
-    if (!RunAsync()) {
-      LOG(ERROR) << "Failed to execute async run";
-      return false;
-    }
-    Wait();
-    return handleRunError();
-  }
-
-  virtual bool RunAsync();
-
-  virtual void Cancel();
-
-  /* Benchmarks a network for one individual run so that we can feed new
-   * inputs on additional calls.
-   * This function returns the number of microseconds spent
-   * during the benchmark
-   */
-  virtual float TEST_Benchmark_One_Run();
-
-  /**
-   * Benchmarks a network.
-   *
-   * This function returns a vector of float recording the number of milli-
-   * seconds spent during the benchmark. The 0-th item is the time spent per
-   * each network run, and if a net instantiation supports run_individual,
-   * the remainder of the vector returns the number of milliseconds spent per
-   * operator.
-   */
-  virtual vector<float> TEST_Benchmark(
-      const int /*warmup_runs*/,
-      const int /*main_runs*/,
-      const bool /*run_individual*/);
-
-  inline const vector<string>& external_output() const {
-    return external_output_;
-  }
-
-  inline const vector<string>& external_input() const {
-    return external_input_;
-  }
-
-  /* Used to attach Observers to operators of a Net
-   *
-   * Returns pointers to objects owned with unique_ptrs.
-   * Use with caution.
-   */
-  virtual vector<OperatorBase*> GetOperators() const = 0;
-
-  const string& Name() const {
-    return name_;
-  }
-
-  inline const NetDef& debug_def() const {
-    CAFFE_ENFORCE(has_debug_def(), "net_def was null!");
-    return *net_def_;
-  }
-
-  inline bool has_debug_def() const {
-    return net_def_ != nullptr;
-  }
-
- protected:
-  virtual bool DoRunAsync() {
-    CAFFE_THROW("Not implemented");
-  };
-
-  virtual bool handleRunError() {
-    for (const Event* event : events_) {
-      if (event->Query() != EventStatus::EVENT_SUCCESS) {
-        CAFFE_THROW(event->ErrorMessage());
-      }
-    }
-    return true;
-  }
-
-  vector<string> external_input_;
-  vector<string> external_output_;
-  string name_;
-  vector<const Event*> events_;
-  std::shared_ptr<const NetDef> net_def_;
-  C10_DISABLE_COPY_AND_ASSIGN(NetBase);
-};
-
-class TORCH_API ExecutorHelper {
- public:
-  ExecutorHelper() {}
-  virtual TaskThreadPoolBase* GetPool(const DeviceOption& option) const;
-  virtual std::vector<OperatorBase*> GetOperators() const;
-  virtual int GetNumWorkers() const;
-  virtual ~ExecutorHelper() {}
-};
-
-C10_DECLARE_REGISTRY(
-    NetRegistry,
-    NetBase,
-    const std::shared_ptr<const NetDef>&,
-    Workspace*);
-#define REGISTER_NET_CREATOR(key, ...) \
-  C10_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__)
-#define REGISTER_NET(name, ...) \
-  C10_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__)
-
-/**
- * @brief Creates a network, accessing / creating blobs in the given workspace.
- *
- * Note that this is different from Workspace::CreateNet. The latter adds the
- * created net object to the workspace's net map, while this function returns
- * a standalone net object.
- */
-TORCH_API unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
-TORCH_API unique_ptr<NetBase> CreateNet(
-    const std::shared_ptr<const NetDef>& net_def,
-    Workspace* ws);
-
-TORCH_API void AddGlobalNetObserverCreator(NetObserverCreator creator);
-
-TORCH_API void ClearGlobalNetObservers();
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_NET_H_

diff --git a/caffe2/core/numa.h b/caffe2/core/numa.h
deleted file mode 100644
index 8424d54..0000000
--- a/caffe2/core/numa.h
+++ /dev/null

@@ -1,3 +0,0 @@
-#pragma once
-#include "c10/util/numa.h"
-#include "caffe2/core/common.h"

diff --git a/caffe2/core/observer.h b/caffe2/core/observer.h
deleted file mode 100644
index 3897bb7..0000000
--- a/caffe2/core/observer.h
+++ /dev/null

@@ -1,164 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <unordered_set>
-
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-/**
- *  Use this to implement a Observer using the Observer Pattern template.
- */
-
-template <class T>
-class ObserverBase {
- public:
-  explicit ObserverBase(T* subject) : subject_(subject) {}
-
-  virtual void Start() {}
-  virtual void Stop() {}
-
-  virtual std::string debugInfo() {
-    return "Not implemented.";
-  }
-
-  virtual ~ObserverBase() noexcept {}
-
-  T* subject() const {
-    return subject_;
-  }
-
-  virtual std::unique_ptr<ObserverBase<T>> rnnCopy(T* subject, int rnn_order)
-      const {
-    return nullptr;
-  }
-
- protected:
-  T* subject_;
-};
-
-/**
- *  Inherit to make your class observable.
- */
-template <class T>
-class Observable {
- public:
-  Observable() = default;
-
-  Observable(Observable&&) = default;
-  Observable& operator =(Observable&&) = default;
-
-  virtual ~Observable() = default;
-
-  C10_DISABLE_COPY_AND_ASSIGN(Observable);
-
-  using Observer = ObserverBase<T>;
-
-  /* Returns a reference to the observer after addition. */
-  const Observer* AttachObserver(std::unique_ptr<Observer> observer) {
-    CAFFE_ENFORCE(observer, "Couldn't attach a null observer.");
-    std::unordered_set<const Observer*> observers;
-    for (auto& ob : observers_list_) {
-      observers.insert(ob.get());
-    }
-
-    const auto* observer_ptr = observer.get();
-    if (observers.count(observer_ptr)) {
-      return observer_ptr;
-    }
-    observers_list_.push_back(std::move(observer));
-    UpdateCache();
-
-    return observer_ptr;
-  }
-
-  /**
-   * Returns a unique_ptr to the removed observer. If not found, return a
-   * nullptr
-   */
-  std::unique_ptr<Observer> DetachObserver(const Observer* observer_ptr) {
-    for (auto it = observers_list_.begin(); it != observers_list_.end(); ++it) {
-      if (it->get() == observer_ptr) {
-        auto res = std::move(*it);
-        observers_list_.erase(it);
-        UpdateCache();
-        return res;
-      }
-    }
-    return nullptr;
-  }
-
-  virtual size_t NumObservers() {
-    return num_observers_;
-  }
-
- private:
-  inline static void StartObserver(Observer* observer) {
-    try {
-      observer->Start();
-    } catch (const std::exception& e) {
-      LOG(ERROR) << "Exception from observer: " << e.what();
-    } catch (...) {
-      LOG(ERROR) << "Exception from observer: unknown";
-    }
-  }
-
-  inline static void StopObserver(Observer* observer) {
-    try {
-      observer->Stop();
-    } catch (const std::exception& e) {
-      LOG(ERROR) << "Exception from observer: " << e.what();
-    } catch (...) {
-      LOG(ERROR) << "Exception from observer: unknown";
-    }
-  }
-
-  void UpdateCache() {
-    num_observers_ = observers_list_.size();
-    if (num_observers_ != 1) {
-      // we cannot take advantage of the cache
-      return;
-    }
-    observer_cache_ = observers_list_[0].get();
-  }
-
- public:
-  void StartAllObservers() {
-    // do not access observers_list_ unless necessary
-    if (num_observers_ == 0) {
-      return;
-    } else if (num_observers_ == 1) {
-      StartObserver(observer_cache_);
-    } else {
-      for (auto& observer : observers_list_) {
-        StartObserver(observer.get());
-      }
-    }
-  }
-
-  void StopAllObservers() {
-    // do not access observers_list_ unless necessary
-    if (num_observers_ == 0) {
-      return;
-    } else if (num_observers_ == 1) {
-      StopObserver(observer_cache_);
-    } else {
-      for (auto& observer : observers_list_) {
-        StopObserver(observer.get());
-      }
-    }
-  }
-
- private:
-  // an on-stack cache for fast iteration;
-  // ideally, inside StartAllObservers and StopAllObservers,
-  // we should never access observers_list_
-  Observer* observer_cache_;
-  size_t num_observers_ = 0;
-
- protected:
-  std::vector<std::unique_ptr<Observer>> observers_list_;
-};
-
-} // namespace caffe2

diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
deleted file mode 100644
index 3277357..0000000
--- a/caffe2/core/operator.h
+++ /dev/null

@@ -1,1600 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_H_
-#define CAFFE2_CORE_OPERATOR_H_
-
-#include <array>
-#include <cfenv>
-#include <climits>
-#include <cstddef>
-#include <exception>
-#include <functional>
-#include <set>
-#include <sstream>
-#include <string>
-#include <typeinfo>
-#include <vector>
-
-#include <c10/macros/Macros.h>
-#include <c10/util/Registry.h>
-#include <c10/util/string_view.h>
-#include <c10/util/typeid.h>
-#include <c10/core/Stream.h>
-#include "caffe2/core/blob.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/net.h"
-#include "caffe2/core/observer.h"
-#include "caffe2/core/operator_gradient.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/tensor_int8.h"
-#include "caffe2/core/types.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/proto_utils.h"
-
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-#include <ATen/core/TensorBody.h>
-#include <ATen/core/function_schema.h>
-#include <ATen/core/ivalue.h>
-#endif
-
-C10_CLANG_DIAGNOSTIC_PUSH()
-#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32")
-C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
-#endif
-
-C10_DECLARE_bool(caffe2_operator_throw_if_fp_exceptions);
-C10_DECLARE_bool(caffe2_operator_throw_if_fp_overflow_exceptions);
-#ifdef __GNU_LIBRARY__
-C10_DECLARE_bool(caffe2_operator_throw_on_first_occurrence_if_fp_exceptions);
-#endif
-
-namespace c10 {
-struct FunctionSchema;
-}
-
-namespace caffe2 {
-
-class TORCH_API OperatorBase;
-typedef ObserverBase<OperatorBase> OperatorObserver;
-
-class TORCH_API OperatorBase : public Observable<OperatorBase> {
- public:
-  explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
-
-  /*
-   * Notes: All outputs ivalues must be tensors. Input ivalue list must start
-   * with all tensors ("inputs" in caffe2 terminology),
-   * followed by non-tensors ("arguments" in caffe2 terminology).
-   * Alternatively, inputs can be one tensor list ivalue followed by non-tensors
-   * to represent operators with a variable number of inputs.
-   */
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  explicit OperatorBase(
-      const c10::FunctionSchema& schema,
-      std::vector<c10::IValue> inputs,
-      std::vector<caffe2::Tensor> outputs);
-#endif
-
-  virtual ~OperatorBase() noexcept;
-
-  /** @brief Return true if the operator was instantiated with OperatorDef
-   * New operators should be instantiated with FunctionSchema
-   */
-  bool isLegacyOperator() const {
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    return !fn_schema_;
-#else
-    return true;
-#endif
-  }
-
-  const c10::FunctionSchema& getFunctionSchema() const {
-    CAFFE_ENFORCE(!isLegacyOperator());
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    return *fn_schema_.get();
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-
-  /** @brief Checks if the operator has an argument of the given name.
-   */
-  inline bool HasArgument(c10::string_view name) const {
-    if (isLegacyOperator()) {
-      CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-      return ArgumentHelper::HasArgument(*operator_def_, name);
-    }
-    return argumentIndexWithName(name).has_value();
-  }
-
-  // Functions that deal with arguments. Basically, this allows us to map an
-  // argument name to a specific type of argument that we are trying to access.
-  template <typename T>
-  inline T GetSingleArgument(c10::string_view name, const T& default_value) const {
-    if (isLegacyOperator()) {
-      CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-      return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
-          *operator_def_, name, default_value);
-    }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    auto index = argumentIndexWithName(name);
-    CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name);
-    const auto& value = newstyle_inputs_[index.value()];
-    return value.template to<T>();
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-
-  template <typename T>
-  inline bool HasSingleArgumentOfType(c10::string_view name) const {
-    CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-    return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
-        *operator_def_, name);
-  }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  template <typename T>
-  inline vector<T> GetVectorFromIValueList(const c10::IValue& value) const {
-    return value.template to<List<T>>().vec();
-  }
-#endif
-
-  template <typename T>
-  inline vector<T> GetRepeatedArgument(
-      c10::string_view name,
-      const vector<T>& default_value = {}) const;
-
-  // Get the inputs and outputs as specific types.
-  template <typename T>
-  inline const T& Input(int idx) {
-    static_assert(
-        !std::is_same<T, Tensor>::value,
-        "You should use Input<Tensor>(int, DeviceType) for "
-        "Tensor.");
-    TORCH_DCHECK_LT((size_t)idx, inputs_.size());
-    try {
-      return inputs_.at(idx)->template Get<T>();
-    } catch (::caffe2::EnforceNotMet& enf) {
-      if (has_debug_def()) {
-        TORCH_RETHROW(enf, "Offending Blob name: ", debug_def().input(idx), ".");
-      }
-      throw enf;
-    }
-  }
-
-  // TODO(jerryzh): Remove template
-  // and the type argument?
-  // This is to keep the API changes minimal and make refactoring
-  // a bit easier
-  template <typename T>
-  inline const T& Input(int idx, DeviceType type) {
-    if (isLegacyOperator()) {
-      static_assert(
-          std::is_same<T, Tensor>::value,
-          "Input(int, DeviceType) is only available for Tensor");
-      TORCH_DCHECK_LT((size_t)idx, inputs_.size());
-      try {
-        // TODO(jerryzh): We'll need to check device type in Get<T>() later
-        // Get<T>() -> Get<T>(type)
-        const auto& tensor = inputs_.at(idx)->template Get<T>();
-        return tensor;
-      } catch (::caffe2::EnforceNotMet& enf) {
-        if (has_debug_def()) {
-          TORCH_RETHROW(enf, "Offending Blob name: ", debug_def().input(idx), ".");
-        }
-        throw enf;
-      }
-    }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    TORCH_DCHECK_LT(0U, newstyle_inputs_.size());
-    IValue ival;
-    if (newstyle_inputs_[0].isTensorList()) {
-      // if the first input is a tensor list, we get input tensors by indexing
-      // into that list. currently, this means that only tensors from that list
-      // are accessible as inputs. any hypothetical input tensors that come
-      // after the list are not accessible.
-      auto tensorList = newstyle_inputs_[0].toTensorVector();
-      TORCH_DCHECK_LT((size_t)idx, tensorList.size());
-      ival = tensorList[idx];
-    } else {
-      // if the first input is not a tensor list, we get input tensors by
-      // indexing into the inputs.
-      TORCH_DCHECK_LT((size_t)idx, newstyle_inputs_.size());
-      ival = newstyle_inputs_[idx];
-    }
-    CAFFE_ENFORCE(
-        ival.isTensor(),
-        "Input(int, DeviceType) is only available for IValues that store Tensors");
-    auto t = ival.toTensor();
-    if (!t.is_contiguous()) {
-      t = t.contiguous();
-    }
-    Tensor tensor = caffe2::Tensor(std::move(t));
-    CAFFE_ENFORCE_EQ(tensor.GetDeviceType(), type);
-    input_tensors_[idx] = std::move(tensor);
-    return input_tensors_[idx];
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-
-  template <typename T>
-  inline T* Output(int idx) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "Output(idx) not supported for operators exported to c10. Please use XOutput instead.");
-
-    static_assert(
-        !std::is_same<T, Tensor>::value,
-        "You should use Output<Tensor>(int, DeviceType) for "
-        "Tensor.");
-    return outputs_.at(idx)->template GetMutable<T>();
-  }
-
-  // TODO(jerryzh): Remove this template
-  template <typename T>
-  inline T* Output(int idx, DeviceType type) {
-    if (isLegacyOperator()) {
-      static_assert(
-          std::is_same<T, Tensor>::value,
-          "Output(int, DeviceType) is only available for Tensor");
-      // When you get a Tensor here it is not fully initialized
-      return BlobGetMutableTensor(outputs_.at(idx), type);
-    }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    auto &output = output_tensors_[idx];
-    if (!output.defined() || output.GetDeviceType() != type) {
-      // Fix tensor type
-      output = Tensor(type);
-    }
-    return &output;
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-
-  inline Tensor
-  XOutputTensor(int idx, at::IntArrayRef dims, at::TensorOptions options) {
-    CAFFE_ENFORCE_WITH_CALLER(
-        options.device_opt() != c10::nullopt,
-        "device must be provided in option.");
-    if (isLegacyOperator()) {
-      return XBlobGetMutableTensor(outputs_.at(idx), dims, options);
-    }
-
-    return OutputTensor(idx, dims, options)->UnsafeSharedInstance();
-  }
-
-  void SetOutputTensor(int idx, Tensor tensor) {
-    if (!isLegacyOperator()) {
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      output_tensors_[idx] = std::move(tensor);
-#else
-      CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-    } else {
-      // update the tensor in the workspace
-      BlobSetTensor(outputs_.at(idx), std::move(tensor));
-    }
-  }
-
-  Tensor OutputTensorOrUndefined(int idx) {
-    if (isLegacyOperator()) {
-      return BlobGetTensorOrUndefined(*outputs_.at(idx));
-    }
-    return output_tensors_[idx].UnsafeSharedInstance();
-  }
-
-  inline Tensor*
-  OutputTensor(int idx, at::IntArrayRef dims, at::TensorOptions options) {
-    if (isLegacyOperator()) {
-      CAFFE_ENFORCE_WITH_CALLER(
-          options.device_opt() != c10::nullopt,
-          "device must be provided in options.");
-      return BlobGetMutableTensor(outputs_.at(idx), dims, options);
-    }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    auto &output = output_tensors_[idx];
-    output = output.defined()
-        ? GetSizedTensorWithOptions(std::move(output), dims, options)
-        : caffe2::empty(dims, options);
-
-    return &output;
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-
-  // Get output Tensor of the operator and CopyFrom the given Tensor
-  Tensor* OutputTensorCopyFrom(
-      int idx,
-      at::TensorOptions options,
-      const Tensor& src,
-      bool async = false) {
-    CAFFE_ENFORCE_WITH_CALLER(
-        options.device_opt() != c10::nullopt,
-        "device must be provided in options.");
-    // Ouptut Tensor will always have the same data type as `src`
-    if (!options.has_dtype()) {
-      options = options.dtype(src.dtype());
-    }
-    CAFFE_ENFORCE_WITH_CALLER(
-        options.dtype() == src.dtype(),
-        "We don't allow change of src data type in OutputTensorCopyFrom");
-    Tensor* t = OutputTensor(idx, src.sizes(), options);
-    t->CopyFrom(src, async);
-    return t;
-  }
-
-  Tensor* OutputTensorAlias(int idx, const Tensor& src) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "OutputTensorAlias(idx, src) not (yet) supported for operators exported to c10.");
-    return BlobSetTensor(OutputBlob(idx), src.Alias());
-  }
-
-  template <typename T>
-  inline T* Output(int idx, T* allocated) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "Output(idx, allocated) not supported for operators exported to c10. Please use XOutput.");
-    outputs_.at(idx)->Reset(allocated);
-    return allocated;
-  }
-
-  inline const Blob& InputBlob(int idx) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "InputBlob(idx) not (yet) supported for operators exported to c10.");
-    return *inputs_.at(idx);
-  }
-
-  inline Blob* OutputBlob(int idx) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "OutputBlob(idx) not (yet) supported for operators exported to c10.");
-    return outputs_.at(idx);
-  }
-
-  // Check whether output j is an alias of input i by comparing Blob pointers,
-  // note this does not check if the two Blobs points to the same Tensor, or if
-  // the Tensor pointers point to the same TensorImpl, or if the Storages alias
-  inline bool IsInputOutputAlias(int i, int j) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "IsInputOutputAlias(i, j) not (yet) supported for operators exported to c10.");
-    return inputs_.at(i) == outputs_.at(j);
-  }
-
-  template <typename T>
-  inline bool InputIsType(int idx) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "InputIsType(idx) not (yet) supported for operators exported to c10.");
-    static_assert(
-        !std::is_same<T, Tensor>::value,
-        "You should use InputIsTensorType(int, DeviceType) for "
-        "Tensor.");
-    return inputs_.at(idx)->template IsType<T>();
-  }
-
-  inline bool InputIsTensorType(int idx, DeviceType device_type) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "InputIsTensorType(idx, device_type) not (yet) supported for operators exported to c10.");
-    return BlobIsTensorType(*inputs_.at(idx), device_type);
-  }
-
-  template <typename T>
-  inline bool OutputIsType(int idx) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "OutputIsType(idx) not (yet) supported for operators exported to c10.");
-    static_assert(
-        !std::is_same<T, Tensor>::value,
-        "You should use OutputIsTensorType(int, DeviceType) for "
-        "Tensor.");
-    return outputs_.at(idx)->template IsType<T>();
-  }
-
-  inline bool OutputIsTensorType(int idx, DeviceType type) {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "OutputIsTensorType(idx, type) not (yet) supported for operators exported to c10.");
-    return BlobIsTensorType(*outputs_.at(idx), type);
-  }
-
-  inline int InputSize() const {
-    return input_size_;
-  }
-
-  inline int OutputSize() const {
-    if (isLegacyOperator()) {
-      return outputs_.size();
-    }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    return output_tensors_.size();
-#else
-    CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-  }
-  inline const vector<const Blob*>& Inputs() const {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "Inputs() not supported for operators exported to c10.");
-    return inputs_;
-  }
-  inline const vector<Blob*>& Outputs() {
-    CAFFE_ENFORCE(
-        isLegacyOperator(),
-        "Outputs() not supported for operators exported to c10.");
-    return outputs_;
-  }
-  vector<TensorShape> InputTensorShapes() const;
-
-  virtual void WaitEvent(const Event& ev, int /*stream_id */ = -1) {
-    ev.Finish();
-  }
-
-  inline void Wait(const OperatorBase& other, int stream_id = -1) {
-    if (!other.IsEventDisabled()) {
-      WaitEvent(other.event(), stream_id);
-    }
-  }
-
-  virtual void WaitEvents(
-      const std::vector<const Event*>& events,
-      int /*stream_id*/ = -1) {
-    for (const auto& ev : events) {
-      ev->Finish();
-    }
-  }
-
-  virtual void Finish() {
-    if (event_) {
-      event_->Finish();
-    }
-  }
-
-  /// Not implemented in the base class (CAFFE_NOT_IMPLEMENTED).
-  virtual bool Run(int /* unused */ /*stream_id*/ = 0) {
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
-  /// Base default: returns false.
-  virtual bool HasAsyncPart() const {
-    return false;
-  }
-
-  /// Base default: returns false.
-  virtual bool SupportsAsyncScheduling() const {
-    return false;
-  }
-
-  /// Base default: no-op.
-  virtual void CancelAsyncCallback() {}
-
-  /// Base default: no-op.
-  virtual void Cancel() {}
-
-  // RunAsync, if implemented by the specific operators, will schedule the
-  // computation on the corresponding context and record the event in its
-  // event_ member object. If the specific operator does not support RunAsync,
-  // it will simply be synchronous as a fallback.
-  virtual bool RunAsync(int stream_id = 0);
-
-  virtual void AddRelatedBlobInfo(EnforceNotMet* err);
-
-  /// Base default: returns an empty string.
-  virtual std::string debug_info_string() const {
-    return "";
-  }
-
-  /// Returns the stored OperatorDef. Enforces that one is present (see
-  /// has_debug_def()).
-  inline const OperatorDef& debug_def() const {
-    CAFFE_ENFORCE(has_debug_def(), "operator_def was null!");
-    return *operator_def_;
-  }
-
-  /// Replaces the stored OperatorDef pointer with `operator_def`.
-  inline void set_debug_def(
-      const std::shared_ptr<const OperatorDef>& operator_def) {
-    operator_def_ = operator_def;
-  }
-
-  /// True when an OperatorDef pointer is stored (operator_def_ is non-null).
-  inline bool has_debug_def() const {
-    return operator_def_ != nullptr;
-  }
-
- public:
-  /// Stores this operator's net position into the workspace's
-  /// last_failed_op_net_position field. When no position has been set, only a
-  /// verbose log line is emitted.
-  void RecordLastFailedOpNetPosition() {
-    if (net_position_ == kNoNetPositionSet) {
-      VLOG(1) << "Failed operator doesn't have id set";
-      return;
-    }
-    VLOG(1) << "Operator with id " << net_position_ << " failed";
-    operator_ws_->last_failed_op_net_position = net_position_;
-  }
-
-  /// Returns the stored net position (kNoNetPositionSet until one is set).
-  int net_position() const {
-    return net_position_;
-  }
-
-  /// Stores `idx` as this operator's net position.
-  void set_net_position(int idx) {
-    net_position_ = idx;
-  }
-
-  /// Returns the stored DeviceOption.
-  const DeviceOption& device_option() const {
-    return device_option_;
-  }
-
-  /// Returns this operator's event. Enforces that the event is not disabled.
-  const Event& event() const {
-    CAFFE_ENFORCE(event_, "Event is disabled");
-    return *event_;
-  }
-
-  /// Mutable overload of event(). Enforces that the event is not disabled.
-  Event& event() {
-    CAFFE_ENFORCE(event_, "Event is disabled");
-    return *event_;
-  }
-
-  /// Calls Reset() on the event; does nothing when the event is disabled.
-  void ResetEvent() {
-    if (!event_) {
-      return;
-    }
-    event_->Reset();
-  }
-
-  /// Disables the event by resetting the owning pointer to null, which
-  /// destroys any event currently held.
-  void DisableEvent() {
-    event_ = nullptr;
-  }
-
-  /// True when no event is held (event_ is null).
-  bool IsEventDisabled() const {
-    return !event_;
-  }
-
-  // Internal API invoked by observers. Normal callers shouldn't invoke it.
-  // Not implemented in the base class (CAFFE_NOT_IMPLEMENTED).
-  virtual void SyncDeviceBarrierForObservers() {
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
-  // Checks whether a stream is ready to execute new computation. Used in
-  // stream allocation optimization to skip a stream that is currently busy.
-  // The answer depends on the context and the operator's device; this base
-  // implementation ignores its argument and returns true.
-  virtual bool IsStreamFree(int /* unused */) const {
-    return true;
-  }
-
-  /// Returns the stored operator type string.
-  const std::string& type() const {
-    return type_;
-  }
-
-  /// Stores `engine` as this operator's engine string.
-  void annotate_engine(const std::string& engine) {
-    engine_ = engine;
-  }
-
-  /// Returns the engine string last stored by annotate_engine().
-  const std::string& engine() const {
-    return engine_;
-  }
-
-  /// Stores the raw `helper` pointer; nothing here deletes it.
-  void SetExecutorHelper(ExecutorHelper* helper) {
-    helper_ = helper;
-  }
-
-  /// Returns the stored helper pointer (nullptr until one is set).
-  ExecutorHelper* GetExecutorHelper() const {
-    return helper_;
-  }
-
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  /// Moves the output tensor vector out of this operator. Rvalue-qualified, so
-  /// it can only be called on an operator that is itself being given up.
-  std::vector<caffe2::Tensor> move_output_tensors() && {
-    return std::move(output_tensors_);
-  }
-#endif
-
- public:
-  static const int kNoNetPositionSet = -1;
-
- private:
-  // Workspace written to by RecordLastFailedOpNetPosition().
-  Workspace* operator_ws_;
-  // Backs debug_def() / has_debug_def(); may be null.
-  std::shared_ptr<const OperatorDef> operator_def_;
-  DeviceOption device_option_;
-  // Set through annotate_engine().
-  std::string engine_;
-  std::string type_;
-  // Blob pointers exposed by Inputs() / Outputs() for legacy operators.
-  vector<const Blob*> inputs_;
-  vector<Blob*> outputs_;
-  // Preferably use std::optional, but nvcc doesn't work
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  std::unique_ptr<const c10::FunctionSchema> fn_schema_;
-  // IValue arguments read by GetRepeatedArgument() on the non-legacy path.
-  vector<c10::IValue> newstyle_inputs_;
-#endif
-  // HACK
-  // We preserve the fact that Output() returns Tensor*
-  // by storing Tensor in a vector owned by the
-  // operator.
-  vector<caffe2::Tensor> input_tensors_;
-  vector<caffe2::Tensor> output_tensors_;
-
-  // Value returned by InputSize().
-  int input_size_;
-
-  // Position of this operator in its net; kNoNetPositionSet until assigned.
-  int net_position_{kNoNetPositionSet};
-
-  // Raw pointer stored by SetExecutorHelper(); nothing here deletes it.
-  ExecutorHelper* helper_ = nullptr;
-
- protected:
-  /// Not implemented in the base class (CAFFE_NOT_IMPLEMENTED).
-  virtual void RecordEvent(const char* /*err_msg*/ = nullptr) {
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
-  /// Forwards `err_msg` to Event::SetFinished(); does nothing when the event
-  /// is disabled.
-  void SetEventFinished(const char* err_msg = nullptr) {
-    if (!event_) {
-      return;
-    }
-    event_->SetFinished(err_msg);
-  }
-
-  /// Forwards `err_msg` to Event::SetFinishedWithException(); does nothing
-  /// when the event is disabled.
-  void SetEventFinishedWithException(const char* err_msg = nullptr) {
-    if (!event_) {
-      return;
-    }
-    event_->SetFinishedWithException(err_msg);
-  }
-
-  /// Builds an error string for this operator: includes the debug string of
-  /// the OperatorDef when one is stored, otherwise a fixed placeholder.
-  std::string getErrorMsg() {
-    if (!has_debug_def()) {
-      return "Error from operator: no op def";
-    }
-    return "Error from operator: " + ProtoDebugString(debug_def());
-  }
-
-  std::optional<int> argumentIndexWithName(c10::string_view name) const;
-
-  // An event used by asynchronous execution.
-  std::unique_ptr<Event> event_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(OperatorBase);
-};
-
-// NetDef arguments can only be read from a legacy operator's OperatorDef;
-// the non-legacy (IValue) path throws.
-template <>
-inline NetDef OperatorBase::GetSingleArgument<NetDef>(
-    c10::string_view name,
-    const NetDef& default_value) const {
-  if (!isLegacyOperator()) {
-    CAFFE_THROW("Cannot get NetDefs from IValue");
-    return NetDef();
-  }
-  CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-  return ArgumentHelper::GetSingleArgument<OperatorDef, NetDef>(
-      *operator_def_, name, default_value);
-}
-
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-// Reads the IValue as an int vector (64-bit elements) and converts each
-// element to int.
-template <>
-inline vector<int> OperatorBase::GetVectorFromIValueList<int>(
-    const c10::IValue& value) const {
-  auto wide_values = value.toIntVector();
-  vector<int> narrowed;
-  narrowed.reserve(wide_values.size());
-  for (int64_t wide : wide_values) {
-    narrowed.emplace_back(static_cast<int>(wide));
-  }
-  return narrowed;
-}
-
-// Reads the IValue as a double vector and converts each element to float.
-template <>
-inline vector<float> OperatorBase::GetVectorFromIValueList<float>(
-    const c10::IValue& value) const {
-  const auto& doubles = value.toDoubleVector();
-  vector<float> floats;
-  floats.reserve(doubles.size());
-  for (double element : doubles) {
-    floats.emplace_back(static_cast<float>(element));
-  }
-  return floats;
-}
-
-// Reads the IValue as a c10::List<string> and returns its elements as a
-// vector<string>.
-template <>
-inline vector<string> OperatorBase::GetVectorFromIValueList<string>(
-    const c10::IValue& value) const {
-  auto vs = value.template to<c10::List<string>>();
-  vector<string> out;
-  out.reserve(vs.size());
-  for (string v : vs) {
-    // `v` is already a private copy of the list element; move it into the
-    // vector instead of copying the string a second time.
-    out.emplace_back(std::move(v));
-  }
-  return out;
-}
-
-// IValue-based lists have no int16_t element type, so this specialisation
-// loads the value as List<int64_t> and narrows every element to int16_t.
-template <>
-inline vector<int16_t> OperatorBase::GetVectorFromIValueList<int16_t>(
-    const c10::IValue& value) const {
-  auto wide_list = value.template to<c10::List<int64_t>>();
-  std::vector<int16_t> narrowed;
-  narrowed.reserve(wide_list.size());
-  for (int64_t wide : wide_list) {
-    narrowed.push_back(static_cast<int16_t>(wide));
-  }
-  return narrowed;
-}
-#endif
-
-// OP_SINGLE_ARG is shorthand for a constructor member-initializer: it
-// initializes `variable` from the single argument `name` of the given `type`,
-// falling back to `default_value`.
-#define OP_SINGLE_ARG(type, name, variable, default_value) \
-  variable(OperatorBase::GetSingleArgument<type>(name, (default_value)))
-
-// INPUT_TAGS and OUTPUT_TAGS optionally give names to the indices of an
-// operator's inputs and outputs. Each expands to an enum whose first
-// enumerator is 0. For example, a convolution layer with input, weight and
-// bias can declare
-//     INPUT_TAGS(INPUT, WEIGHT, BIAS);
-// and then write
-//     auto& weight = Input(WEIGHT);
-// instead of
-//     auto& weight = Input(1);
-#define INPUT_TAGS(first_input, ...) \
-  enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OUTPUT_TAGS(first_output, ...) \
-  enum _OutputTags { first_output = 0, __VA_ARGS__ }
-
-// Returns the repeated argument `name` as a vector<T>.
-//  - Legacy operators read it from the stored OperatorDef via ArgumentHelper,
-//    passing `default_value` through.
-//  - Other operators locate the argument by name among the IValue inputs and
-//    convert it with GetVectorFromIValueList<T>; `default_value` is not used
-//    on that path, and a missing argument fails the enforce.
-//  - Builds that exclude non-legacy operators throw instead.
-template <typename T>
-inline vector<T> OperatorBase::GetRepeatedArgument(
-    c10::string_view name,
-    const vector<T>& default_value) const {
-  if (isLegacyOperator()) {
-    CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-    return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
-        *operator_def_, name, default_value);
-  }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  auto index = argumentIndexWithName(name);
-  // Separate the argument name from the message text; it was previously
-  // appended directly after the '!'.
-  CAFFE_ENFORCE(
-      index.has_value(), "Couldn't get index for argument: ", name);
-  const auto& value = newstyle_inputs_[index.value()];
-  return GetVectorFromIValueList<T>(value);
-#else
-  CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-}
-
-// We need this specialisation because IValue based lists don't support
-// int16_t. The value is loaded as List<int64_t> and narrowed to int16_t; that
-// conversion is done by the GetVectorFromIValueList<int16_t> specialisation,
-// so it is reused here rather than repeated.
-// Legacy operators read the argument from the OperatorDef instead, and builds
-// that exclude non-legacy operators throw.
-template <>
-inline vector<int16_t> OperatorBase::GetRepeatedArgument<int16_t>(
-    c10::string_view name,
-    const vector<int16_t>& default_value) const {
-  if (isLegacyOperator()) {
-    CAFFE_ENFORCE(operator_def_, "operator_def was null!");
-    return ArgumentHelper::GetRepeatedArgument<OperatorDef, int16_t>(
-        *operator_def_, name, default_value);
-  }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  auto index = argumentIndexWithName(name);
-  // Separate the argument name from the message text; it was previously
-  // appended directly after the '!'.
-  CAFFE_ENFORCE(
-      index.has_value(), "Couldn't get index for argument: ", name);
-  const auto& value = newstyle_inputs_[index.value()];
-  return GetVectorFromIValueList<int16_t>(value);
-#else
-  CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-}
-
-// Operator is the class that you usually want to derive, if your operator will
-// run on different devices. You should then implement the RunOnDevice()
-// function.
-//
-// The class owns a Context instance (context_) and routes device switching,
-// event waiting/recording and synchronization through it.
-template <class Context>
-class Operator : public OperatorBase {
- public:
-  explicit Operator(const OperatorDef& operator_def, Workspace* ws, StreamId stream = 0)
-      : OperatorBase(operator_def, ws), context_(operator_def.device_option()) {
-    // In the constructor, we switch to the device so that the child class
-    // constructors will run on that device.
-    context_.SwitchToDevice(stream);
-  }
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  // Constructor for operators created from a function schema plus IValue
-  // inputs; context_ is default-constructed on this path.
-  explicit Operator(
-      const c10::FunctionSchema& fn_schema,
-      std::vector<c10::IValue> inputs,
-      std::vector<caffe2::Tensor> outputs,
-      StreamId stream = 0)
-      : OperatorBase(fn_schema, std::move(inputs), std::move(outputs)) {
-    // In the constructor, we switch to the device so that the child class
-    // constructors will run on that device.
-    context_.SwitchToDevice(stream);
-  }
-#endif
-  ~Operator() noexcept override {}
-
-  /// Retrieve a non-owning reference to the input at position 'idx' for this
-  /// operator.  The returned reference is valid for the duration of the
-  /// RunOnDevice call.  The optional 'type' parameter can be used to assert a
-  /// required device type for the input (by default, we assert that the tensor
-  /// is consistent with the device type implied by the Context parameter of an
-  /// Operator.)
-  inline const Tensor& Input(
-      int idx,
-      DeviceType type = Context::GetDeviceType()) {
-    return OperatorBase::template Input<Tensor>(idx, type);
-  }
-
-  /// XOutput is a modernized version of Output which returns a Tensor
-  /// rather than a Tensor* (the raw pointer in the latter case is
-  /// useless, as Tensor is a pointer type.)
-  Tensor XOutput(int idx, at::IntArrayRef dims, at::TensorOptions options) {
-    // We'll default device to the device of the current Operator Context
-    if (options.device_opt() == c10::nullopt) {
-      return OperatorBase::XOutputTensor(
-          idx, dims, options.device(context_.device()));
-    }
-    return OperatorBase::XOutputTensor(idx, dims, options);
-  }
-
-  /// Retrieve a non-owning pointer to the output at position 'idx',
-  /// initializing it to have size 'dims' and properties 'options' if
-  /// there is no pre-existing output or the pre-existing output does
-  /// not have the correct options.  The returned pointer is valid for
-  /// the duration of the RunOnDevice call.  If device is not explicitly
-  /// specified in options, we default to allocating output on the
-  /// current device of the device type implied by the Context parameter
-  /// of this Operator.
-  ///
-  /// Note [Operator::Output what?]
-  /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  /// The contract of Operator::Output is somewhat complex; it is perhaps better
-  /// understood in terms of what was historically an idiomatic Caffe2 operator
-  /// implementation:
-  ///
-  ///     void RunOnDevice() override {
-  ///         auto* output = Output(0, output_size, dtype<float>());
-  ///         float* output_ptr = output->data<float>();
-  ///         // write into output_ptr
-  ///     }
-  ///
-  /// In the simple case, this code does the following things:
-  ///
-  ///   1. Allocates a new tensor with size 'output_size' and dtype 'float'
-  ///      (and device type whatever the Operator's device type is)
-  ///   2. "Registers" this tensor as the 0th output tensor of this operator
-  ///      (Caffe2 operators don't "return" outputs; instead, outputs
-  ///      are shoved into an output vector which the executor reads out.)
-  ///   3. Returns the tensor, so the operator implementation can write
-  ///      the actual output data into the tensor.
-  ///
-  /// So what's this business with "pre-existing" outputs?  Caffe2
-  /// commonly applies an optimization whereby it reuses tensors on
-  /// subsequent runs of operators in a graph.  It doesn't know ahead
-  /// of time what intermediate tensors it will need, so the first
-  /// time it runs a graph it has all of the operators create the outputs
-  /// necessary (as described above).  However, the second time around,
-  /// it will reuse all of the tensors created from the first time.
-  /// If they are lucky, this time the Output() call is a no-op and
-  /// just returns the old tensor.
-  ///
-  /// However, we cannot /guarantee/ that the output size will be the
-  /// same the next time the Operator is called; for example, output
-  /// size may be data dependent and vary between runs.  In this case,
-  /// we have to resize it to the correct size.  Resizing is still
-  /// helpful, as we may be able to fit the output in the same
-  /// space that was previously used.
-  ///
-  Tensor* Output(int idx, at::IntArrayRef dims, at::TensorOptions options) {
-    // We'll default device to the device of the current Operator Context
-    if (options.device_opt() == c10::nullopt) {
-      return OperatorBase::OutputTensor(
-          idx, dims, options.device(context_.device()));
-    }
-    return OperatorBase::OutputTensor(idx, dims, options);
-  }
-
-  /// Legacy: please consider using the version of Output() which also takes
-  /// dtype and size as arguments.
-  inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
-    return OperatorBase::template Output<Tensor>(idx, type);
-  }
-
-  /// Get the output Tensor of an operator (allocating it if it is not
-  /// already initialized), and copy the contents of src into it.
-  /// You probably don't actually want to use this function (the fact
-  /// that you have a Tensor to copy from is probably a mistake:
-  /// you should have written the output into the output tensor,
-  /// from Output, directly in the first place), but this method
-  /// is situationally useful.
-  Tensor* OutputTensorCopyFrom(
-      int idx,
-      at::TensorOptions options,
-      const Tensor& src,
-      bool async = false) {
-    // Same device defaulting as Output(): fall back to the context's device.
-    if (options.device_opt() == c10::nullopt) {
-      return OperatorBase::OutputTensorCopyFrom(
-          idx, options.device(context_.device()), src, async);
-    }
-    return OperatorBase::OutputTensorCopyFrom(idx, options, src, async);
-  }
-
-  /// Waits for `ev` through the context. A non-negative stream_id first
-  /// switches the context to that stream.
-  void WaitEvent(const Event& ev, int stream_id = -1) final {
-    if (stream_id >= 0) {
-      context_.SwitchToDevice(stream_id);
-    }
-    context_.WaitEvent(ev);
-  }
-
-  /// Waits for every event in `events`, in order, through the context. A
-  /// non-negative stream_id first switches the context to that stream.
-  void WaitEvents(const std::vector<const Event*>& events, int stream_id = -1)
-      final {
-    if (stream_id >= 0) {
-      context_.SwitchToDevice(stream_id);
-    }
-    for (const auto& ev : events) {
-      context_.WaitEvent(*ev);
-    }
-  }
-
-  // The run function of Operator switches to the device, and then carries out
-  // the actual computation with RunOnDevice(). You should implement RunOnDevice
-  // instead of Run().
-  // Note: Run does not update operator's event and can be used only with
-  // non-async executors that do not rely on events
-  //
-  // Returns the result of RunOnDevice(). Exceptions are rethrown after the
-  // failed-op position has been recorded and observers stopped.
-  bool Run(int stream_id = 0) final {
-    try {
-      StartAllObservers();
-
-      context_.SwitchToDevice(stream_id);
-
-      // Clear floating point exception flags before RunOnDevice. We will test
-      // exception flags afterwards, and raise an error if an exception has
-      // happened.
-      if (FLAGS_caffe2_operator_throw_if_fp_exceptions ||
-          FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
-        std::feclearexcept(FE_ALL_EXCEPT);
-      }
-
-#ifdef __GNU_LIBRARY__
-      // If glibc is available, use feenableexcept that will raise exception
-      // right away.
-      int old_enabled_exceptions = 0;
-      bool fp_traps_enabled = false;
-      if (FLAGS_caffe2_operator_throw_on_first_occurrence_if_fp_exceptions) {
-        if (FLAGS_caffe2_operator_throw_if_fp_exceptions ||
-            FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
-          int flag = 0;
-          if (FLAGS_caffe2_operator_throw_if_fp_exceptions) {
-            flag |= FE_DIVBYZERO | FE_INVALID;
-          }
-          if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
-            flag |= FE_OVERFLOW;
-          }
-          old_enabled_exceptions = feenableexcept(flag);
-          fp_traps_enabled = true;
-        }
-      }
-      // Puts the trap mask back to what it was before the feenableexcept()
-      // call above. Does nothing when no traps were enabled by this call.
-      const auto restore_fp_traps = [&]() {
-        if (fp_traps_enabled) {
-          fedisableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
-          std::feclearexcept(FE_ALL_EXCEPT);
-          feenableexcept(old_enabled_exceptions);
-        }
-      };
-      bool result = false;
-      try {
-        result = RunOnDevice();
-      } catch (...) {
-        // RunOnDevice threw: restore the trap mask before the exception
-        // reaches the handlers below, otherwise the traps enabled above would
-        // stay active after Run() exits.
-        restore_fp_traps();
-        throw;
-      }
-      restore_fp_traps();
-#else
-      bool result = RunOnDevice();
-#endif
-      if (FLAGS_caffe2_operator_throw_if_fp_exceptions) {
-        CAFFE_ENFORCE(
-            !std::fetestexcept(FE_DIVBYZERO),
-            "Division by zero floating point exception (FE_DIVBYZERO) reported.");
-        CAFFE_ENFORCE(
-            !std::fetestexcept(FE_INVALID),
-            "Invalid floating point exception (FE_INVALID) reported.");
-      }
-      if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
-        CAFFE_ENFORCE(
-            !std::fetestexcept(FE_OVERFLOW),
-            "Overflow floating point exception (FE_OVERFLOW) reported.");
-      }
-      if (!result) {
-        this->RecordLastFailedOpNetPosition();
-      }
-      context_.FinishDeviceComputation(); // throws on error
-
-      StopAllObservers();
-
-      return result;
-    } catch (EnforceNotMet& err) {
-      if (has_debug_def()) {
-        err.add_context(
-            "Error from operator: \n" + ProtoDebugString(debug_def()));
-        AddRelatedBlobInfo(&err);
-      }
-      this->RecordLastFailedOpNetPosition();
-      StopAllObservers();
-      throw;
-    } catch (...) {
-      this->RecordLastFailedOpNetPosition();
-      StopAllObservers();
-      throw;
-    }
-  }
-
-  /// Event-aware counterpart of Run(): runs RunOnDevice() and then updates the
-  /// operator's event. On success the event is recorded through the context
-  /// when the operator has an async part, otherwise marked finished. On a
-  /// false result or an exception the event is marked finished with an error
-  /// message, and exceptions are rethrown.
-  bool RunAsync(int stream_id = 0) final {
-    try {
-      StartAllObservers();
-
-      context_.SwitchToDevice(stream_id);
-      auto result = RunOnDevice();
-      if (result) {
-        if (HasAsyncPart()) {
-          RecordEvent();
-        } else {
-          // Manually set CPU operator's event status to finished,
-          // unless this is an async CPU operator
-          SetEventFinished();
-        }
-      } else {
-        SetEventFinished(getErrorMsg().c_str());
-        this->RecordLastFailedOpNetPosition();
-      }
-
-      StopAllObservers();
-
-      return result;
-    } catch (EnforceNotMet& err) {
-      if (has_debug_def()) {
-        err.add_context(
-            "Error from operator: \n" + ProtoDebugString(debug_def()));
-        AddRelatedBlobInfo(&err);
-      }
-      SetEventFinishedWithException(err.what());
-      this->RecordLastFailedOpNetPosition();
-      StopAllObservers();
-      throw;
-    } catch (const std::exception& err) {
-      SetEventFinishedWithException(err.what());
-      this->RecordLastFailedOpNetPosition();
-      StopAllObservers();
-      throw;
-    } catch (...) {
-      SetEventFinishedWithException(getErrorMsg().c_str());
-      this->RecordLastFailedOpNetPosition();
-      StopAllObservers();
-      throw;
-    }
-  }
-
-  /// Asks the context whether `stream_id` is free for this operator's device
-  /// option.
-  bool IsStreamFree(int stream_id) const override {
-    return context_.IsStreamFree(device_option(), stream_id);
-  }
-
-  /// The actual computation; implemented by concrete operators. The value
-  /// returned is what Run() / RunAsync() return.
-  virtual bool RunOnDevice() = 0;
-
-  // Returns whether operator has async on device part.
-  // CUDA operators by default have async parts, CPU operators by default
-  // don't have async parts and are finished after RunOnDevice call.
-  // Events of operators that don't have async parts are automatically set
-  // to finished state by RunAsync.
-  // Defaulting to the value from context (true for CUDA, false for CPU).
-  // Override in case of async CPU operators
-  // Async CPU operators are expected to catch all exceptions in async parts
-  // and set Event to finished/failed state with Event::SetFinished or
-  // SetFinishedWithException call.
-  bool HasAsyncPart() const override {
-    return context_.HasAsyncPartDefault();
-  }
-
-  // Returns whether operator's RunOnDevice schedules async on device part and
-  // can be run without waiting for parent operator's async part to be finished
-  // on the same device.
-  // Note: when true, RunOnDevice must not access the content of the input blobs
-  // as they might not be computed yet
-  // Note: when true, operator's device needs to support async scheduling:
-  //  - supports concept of streams: async ops scheduled on the same stream are
-  //    guaranteed to be executed in the same order they were scheduled
-  //  - provides non-blocking cross device/cross stream synchronization
-  //    primitives
-  //
-  // By default, assuming an op with an async part can be scheduled
-  // asynchronously if device supports async scheduling
-  bool SupportsAsyncScheduling() const override {
-    return HasAsyncPart() && context_.SupportsAsyncScheduling();
-  }
-
-  /// Calls FinishDeviceComputation() on the context.
-  void SyncDeviceBarrierForObservers() override {
-    context_.FinishDeviceComputation();
-  }
-
-  /// Non-owning access to the operator's context.
-  const Context* getContext() const {
-    return &context_;
-  }
-  Context* getContext() {
-    return &context_;
-  }
-
- protected:
-  /// Records the operator's event through the context; does nothing when the
-  /// event is disabled.
-  void RecordEvent(const char* err_msg = nullptr) final {
-    if (event_) {
-      context_.Record(event_.get(), err_msg);
-    }
-  }
-
-  Context context_;
-};
-
-// Using-declarations for the listed OperatorBase members, for use inside a
-// class derived from it. The final declaration has no trailing semicolon;
-// the user of the macro supplies it.
-#define USE_OPERATOR_BASE_FUNCTIONS            \
-  using OperatorBase::HasArgument;             \
-  using OperatorBase::GetSingleArgument;       \
-  using OperatorBase::HasSingleArgumentOfType; \
-  using OperatorBase::GetRepeatedArgument;     \
-  using OperatorBase::InputIsType;             \
-  using OperatorBase::InputSize;               \
-  using OperatorBase::Output;                  \
-  using OperatorBase::Input;                   \
-  using OperatorBase::OutputSize;              \
-  using OperatorBase::IsInputOutputAlias;      \
-  using OperatorBase::OutputTensorAlias
-
-// Expands USE_OPERATOR_BASE_FUNCTIONS and adds using-declarations for the
-// listed Operator<context> members. The final declaration has no trailing
-// semicolon; the user of the macro supplies it.
-#define USE_OPERATOR_FUNCTIONS(context) \
-  USE_OPERATOR_BASE_FUNCTIONS;          \
-  using Operator<context>::context_;    \
-  using Operator<context>::Input;       \
-  using Operator<context>::InputBlob;   \
-  using Operator<context>::Output;      \
-  using Operator<context>::OutputBlob;  \
-  using Operator<context>::OutputTensorCopyFrom
-
-// USE_OPERATOR_FUNCTIONS for a class whose context type is named `Context`.
-#define USE_OPERATOR_CONTEXT_FUNCTIONS USE_OPERATOR_FUNCTIONS(Context)
-
-// Declares, for class `name`, a constructor that perfectly forwards all of its
-// arguments to Operator<Context> and an empty noexcept destructor override.
-#define USE_SIMPLE_CTOR_DTOR(name)                               \
-  template <class... Args>                                       \
-  explicit name(Args&&... args)                                  \
-      : Operator<Context>(std::forward<Args>(args)...) {}        \
-  ~name() noexcept override {}
-
-// Helpers to implement runtime op polymorphism. Often it's convenient to make
-// an op work on different input types (e.g. i32 vs i64 indices) or special-case
-// it for particular input size (e.g. ScatterWeightedSum for block size of 1
-// doesn't need to call Eigen).
-//
-// DispatchHelper provides compile-time generation of nested "if" statements,
-// e.g. `DispatchHelper<FixedValues<1, 4>>::call(this, block_size);`
-// unrolls into:
-//   if (block_size == 1) {
-//     return DoRunWithValue<1>();
-//   } else if (block_size == 4) {
-//     return DoRunWithValue<4>();
-//   } else {
-//     return DoRunWithValue<-1>();
-//   }
-//
-// DoRunWithValue implementation can use template arguments to do "if"
-// statements
-// or proxy to functions in math.h which often provide fixed size
-// implementation.
-//
-// Similarly,
-// `DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0))`
-// branches on the type of the first input and calls DoRunWithType.
-//
-// Note that the same instance of the Op class is used: it is the method, not
-// the class, that is templated. We might consider adding static class-level
-// polymorphism later.
-//
-// Convenient macro USE_DISPATCH_HELPER is provided for declaring friendship in
-// case DoRunWithValue or DoRunWithType are declared non-public.
-
-// Befriends every DispatchHelper specialization so it can call DoRunWithValue
-// / DoRunWithType members that are not public.
-#define USE_DISPATCH_HELPER                           \
-  template <typename FirstArg, typename... ExtraArgs> \
-  friend struct DispatchHelper
-
-// Tag type carrying the list of integer values DispatchHelper compares
-// against.
-template <int... Values>
-struct FixedValues {};
-
-// Tag type carrying the list of element types DispatchHelper matches against;
-// a match calls DoRunWithType.
-template <typename... Types>
-struct TensorTypes {};
-
-// Special tag that can be listed in TensorTypes to denote that a special
-// implementation in 'DoRunWithOtherType' needs to be called instead of
-// failing. This needs to be the last item in the list (enforced by a
-// static_assert in the dispatcher), e.g.
-// TensorTypes<float, double, GenericTensorImplementation>
-struct GenericTensorImplementation {};
-
-// Same as TensorTypes, but a match calls DoRunWithType2 (and the generic
-// fallback calls DoRunWithOtherType2).
-template <typename... Types>
-struct TensorTypes2 {};
-
-// Primary template; only the specializations below are defined. `Sizes` is
-// one of the tag types above (FixedValues, TensorTypes, TensorTypes2).
-template <typename Sizes, typename... ExtraArgs>
-struct DispatchHelper;
-
-// Recursive case: compares `value` with the first listed constant and either
-// runs DoRunWithValue<..., FirstVal> or recurses on the remaining constants.
-template <int FirstVal, int... Values, typename... ExtraArgs>
-struct DispatchHelper<FixedValues<FirstVal, Values...>, ExtraArgs...> {
-  template <typename Op>
-  static bool call(Op* op, int value) {
-    if (FirstVal != value) {
-      using Remaining = DispatchHelper<FixedValues<Values...>, ExtraArgs...>;
-      return Remaining::template call<Op>(op, value);
-    }
-    return op->template DoRunWithValue<ExtraArgs..., FirstVal>();
-  }
-};
-
-// Terminal case: no listed constant matched, so the runtime value is ignored
-// and DoRunWithValue is invoked with -1.
-template <typename... ExtraArgs>
-struct DispatchHelper<FixedValues<>, ExtraArgs...> {
-  template <typename Op>
-  static bool call(Op* op, int64_t /*size*/) {
-    constexpr int kNoFixedValue = -1;
-    return op->template DoRunWithValue<ExtraArgs..., kNoFixedValue>();
-  }
-};
-
-// Generates the three DispatchHelper specializations for a type-list tag:
-//   * <TensorTypes<FirstType, Types...>>: runs DoRunWithType<..., FirstType>
-//     when the TypeMeta matches FirstType, otherwise recurses on the rest.
-//   * <TensorTypes<>>: nothing matched; throws "Unsupported type of tensor".
-//   * <TensorTypes<GenericTensorImplementation>>: runs DoRunWithOtherType.
-// Each specialization also has Tensor and Blob overloads that forward the
-// object's TypeMeta (tensor.dtype() / blob.meta()) to the TypeMeta overload.
-// The macro is instantiated for (TensorTypes, DoRunWithType,
-// DoRunWithOtherType) and (TensorTypes2, DoRunWithType2,
-// DoRunWithOtherType2), then undefined.
-#define C10_DEFINE_TENSOR_TYPES_DISPATCHER(                                    \
-    TensorTypes, DoRunWithType, DoRunWithOtherType)                            \
-  template <typename FirstType, typename... Types, typename... ExtraArgs>      \
-  struct DispatchHelper<TensorTypes<FirstType, Types...>, ExtraArgs...> {      \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const TypeMeta meta) {                           \
-      static_assert(                                                           \
-          !std::is_same<GenericTensorImplementation, FirstType>::value,        \
-          "GenericTensorImplementation must be the last in TensorTypes list"); \
-      if (meta.Match<FirstType>()) {                                           \
-        return op->template DoRunWithType<ExtraArgs..., FirstType>();          \
-      }                                                                        \
-      return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>::             \
-          template call<Op>(op, meta);                                         \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
-      return call<Op>(op, tensor.dtype());                                     \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Blob& blob) {                               \
-      return call<Op>(op, blob.meta());                                        \
-    }                                                                          \
-  };                                                                           \
-                                                                               \
-  template <typename... ExtraArgs>                                             \
-  struct DispatchHelper<TensorTypes<>, ExtraArgs...> {                         \
-    template <typename Op>                                                     \
-    static bool call(Op* /* unused */, const TypeMeta meta) {                 \
-      CAFFE_THROW("Unsupported type of tensor: ", meta.name());                \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
-      return call<Op>(op, tensor.dtype());                                     \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Blob& blob) {                               \
-      return call<Op>(op, blob.meta());                                        \
-    }                                                                          \
-  };                                                                           \
-                                                                               \
-  template <typename... ExtraArgs>                                             \
-  struct DispatchHelper<                                                       \
-      TensorTypes<GenericTensorImplementation>,                                \
-      ExtraArgs...> {                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const TypeMeta) {                                \
-      return op->template DoRunWithOtherType<ExtraArgs...>();                  \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Tensor& tensor) {                           \
-      return call<Op>(op, tensor.dtype());                                     \
-    }                                                                          \
-    template <typename Op>                                                     \
-    static bool call(Op* op, const Blob& blob) {                               \
-      return call<Op>(op, blob.meta());                                        \
-    }                                                                          \
-  };
-C10_DEFINE_TENSOR_TYPES_DISPATCHER(
-    TensorTypes,
-    DoRunWithType,
-    DoRunWithOtherType)
-C10_DEFINE_TENSOR_TYPES_DISPATCHER(
-    TensorTypes2,
-    DoRunWithType2,
-    DoRunWithOtherType2)
-#undef C10_DEFINE_TENSOR_TYPES_DISPATCHER
-
-// The device type registry. This works in two phases:
-// (1) gDeviceTypeRegistry() maps the device types values to the actual operator
-//     registry function.
-// (2) Then, one can call the operator registry function to further create the
-//     operators.
-typedef c10::Registry<
-    std::string,
-    std::unique_ptr<OperatorBase>,
-    const OperatorDef&,
-    Workspace*>
-    OperatorRegistry;
-typedef c10::Registry<
-    std::string,
-    std::unique_ptr<OperatorBase>,
-    const OperatorDef&,
-    Workspace*>* (*RegistryFunction)();
-TORCH_API std::map<DeviceType, OperatorRegistry*>* gDeviceTypeRegistry();
-
-struct TORCH_API DeviceTypeRegisterer {
-  explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func);
-};
-
-#if defined(_MSC_VER)
-#define IMPORT_IF_NOT_MSVC
-#else
-#define IMPORT_IF_NOT_MSVC C10_IMPORT
-#endif
-
-#define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \
-  namespace {                                               \
-  static DeviceTypeRegisterer C10_ANONYMOUS_VARIABLE(       \
-      DeviceType)(type, &registry_function);                \
-  }
-
-// The operator registry. Since we are not expecting a great number of devices,
-// we will simply have an if-then type command and allocate the actual
-// generation to device-specific registerers.
-// Note that although we have CUDA and CUDNN here, the registerers themselves do
-// not depend on specific cuda or cudnn libraries. This means that we will be
-// able to compile it even when there is no cuda available - we simply do not
-// link any cuda or cudnn operators.
-C10_DECLARE_REGISTRY(
-    CPUOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
-#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
-  C10_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_CPU_OPERATOR(name, ...)                                   \
-  IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();  \
-  static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() {         \
-    CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();                        \
-  }                                                                        \
-  C10_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_CPU_OPERATOR_STR(str_name, ...) \
-  C10_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) \
-  C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-// Use these macros to register gradient operators.  They can be automatically
-// excluded from builds that don't need them (e.g., mobile).
-#ifdef CAFFE2_NO_GRADIENT_OPS
-#define REGISTER_CPU_GRADIENT_OPERATOR(...) /* No gradients. */
-#else
-#define REGISTER_CPU_GRADIENT_OPERATOR(...) \
-  C10_MACRO_EXPAND(REGISTER_CPU_OPERATOR(__VA_ARGS__))
-#endif
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-#define REGISTER_CPU_GRADIENT_OPERATOR_WITH_ENGINE(...) /* No gradients. */
-#else
-#define REGISTER_CPU_GRADIENT_OPERATOR_WITH_ENGINE(...) \
-  C10_MACRO_EXPAND(REGISTER_CPU_OPERATOR_WITH_ENGINE(__VA_ARGS__))
-#endif
-
-C10_DECLARE_REGISTRY(
-    CUDAOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
-#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
-  C10_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_CUDA_OPERATOR(name, ...)                                   \
-  IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();   \
-  static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() {         \
-    CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();                         \
-  }                                                                         \
-  C10_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_CUDA_OPERATOR_STR(str_name, ...) \
-  C10_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, engine, ...) \
-  C10_REGISTER_CLASS(CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-// Macros for cudnn since we use it often
-#define REGISTER_CUDNN_OPERATOR(name, ...) \
-  REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__)
-
-// Macros for HIP operators
-C10_DECLARE_REGISTRY(
-    HIPOperatorRegistry,
-    OperatorBase,
-    const OperatorDef&,
-    Workspace*);
-#define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \
-  C10_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_HIP_OPERATOR(name, ...)                                   \
-  IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();  \
-  static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() {         \
-    CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();                        \
-  }                                                                        \
-  C10_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_HIP_OPERATOR_STR(str_name, ...) \
-  C10_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_HIP_OPERATOR_WITH_ENGINE(name, engine, ...) \
-  C10_REGISTER_CLASS(HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-#define REGISTER_MIOPEN_OPERATOR(name, ...)                    \
-  REGISTER_HIP_OPERATOR_WITH_ENGINE(name, MIOPEN, __VA_ARGS__) \
-  REGISTER_HIP_OPERATOR_WITH_ENGINE(                           \
-      name, CUDNN, __VA_ARGS__) // Make CUDNN an alias of MIOPEN for HIP ops
-
-// StaticLinkingProtector is a helper class that ensures that the Caffe2
-// library is linked correctly with whole archives (in the case of static
-// linking). What happens is that when CreateOperator is called for the first
-// time, it instantiates an OperatorLinkingProtector object to check if the
-// operator registry is empty. If it is empty, this means that we are not
-// properly linking the library.
-//
-// You should not need to use this class.
-struct StaticLinkingProtector {
-  StaticLinkingProtector() {
-    const auto registered_ops = CPUOperatorRegistry()->Keys().size();
-    // Note: this is a check failure instead of an exception, because if
-    // the linking is wrong, Caffe2 won't be able to run properly anyway,
-    // so it's better to fail loud.
-    // If Caffe2 is properly linked with whole archive, there should be more
-    // than zero registered ops.
-    if (registered_ops == 0) {
-      LOG(FATAL)
-          << "You might have made a build error: the Caffe2 library does not seem "
-             "to be linked with whole-static library option. To do so, use "
-             "-Wl,-force_load (clang) or -Wl,--whole-archive (gcc) to link the "
-             "Caffe2 library.";
-    }
-  }
-};
-
-// An exception that can be thrown by an operator constructor that notifies
-// that it does not support the given setting. This can be usually used for
-// specific engines that only implement a subset of the features required by
-// the original operator schema.
-// TODO(jiayq): make more feature-complete exception message.
-class TORCH_API UnsupportedOperatorFeature : public std::exception {
- public:
-  UnsupportedOperatorFeature(const string& msg) : msg_(msg) {}
-  const char* what() const noexcept override {
-    return msg_.c_str();
-  }
-
- private:
-  string msg_;
-};
-
-// A helper macro that should ONLY be used in the operator constructor to check
-// if needed features are met. If not, throws the UnsupportedOperatorFeature
-// exception with the given message.
-#define OPERATOR_NEEDS_FEATURE(condition, ...)                 \
-  if (!(condition)) {                                          \
-    throw UnsupportedOperatorFeature(::c10::str(__VA_ARGS__)); \
-  }
-
-// Creates an operator with the given operator definition.
-// Throws on error and never returns nullptr
-TORCH_API unique_ptr<OperatorBase> CreateOperator(
-    const OperatorDef& operator_def,
-    Workspace* ws,
-    int net_position = OperatorBase::kNoNetPositionSet);
-
-TORCH_API const std::string OpRegistryKey(
-    const std::string& op_type,
-    const std::string& engine = "");
-
-// User can set the preferred engines as a list of engine names, in
-// descending order of preference.
-using EnginePrefType = std::vector<std::string>;
-// {device_type -> {operator_name -> EnginePrefType}}
-using PerOpEnginePrefType =
-    CaffeMap<DeviceType, CaffeMap<std::string, EnginePrefType>>;
-// {device_type -> EnginePrefType}
-using GlobalEnginePrefType = CaffeMap<DeviceType, EnginePrefType>;
-TORCH_API void SetPerOpEnginePref(
-    const PerOpEnginePrefType& per_op_engine_pref);
-TORCH_API void SetGlobalEnginePref(
-    const GlobalEnginePrefType& global_engine_pref);
-TORCH_API void SetEnginePref(
-    const PerOpEnginePrefType& per_op_engine_pref,
-    const GlobalEnginePrefType& global_engine_pref);
-TORCH_API void SetOpEnginePref(
-    const std::string& op_type,
-    const CaffeMap<DeviceType, EnginePrefType>& op_pref);
-
-TORCH_API void LoadInt8TensorInfoOfBlob(
-    std::vector<float>* scale,
-    std::vector<float>* offset,
-    uint32_t* axis,
-    const Blob* b);
-
-TORCH_API TensorShape GetTensorShapeOfBlob(const Blob* b);
-
-TORCH_API TensorShapes InferBlobShapesAndTypes(
-    CaffeMap<string, TensorShape>& blob_desc,
-    const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromWorkspace(
-    Workspace* ws,
-    const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
-    const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
-    const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
-    const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
-    const CaffeMap<std::string, TensorProto_DataType>& blob_types,
-    const vector<NetDef*>& nets);
-
-TORCH_API std::map<string, std::pair<DeviceOption, DeviceOption>>
-ValidateTensorDevices(OperatorBase& op, const OperatorDef& op_def);
-
-// Get a set of registered operator names
-TORCH_API std::set<std::string> GetRegisteredOperators();
-
-// Operator logging capabilities
-TORCH_API void SetOperatorLogger(
-    std::function<void(const OperatorDef&)> tracer);
-std::function<void(const OperatorDef&)> GetOperatorLogger();
-
-#ifndef C10_MOBILE
-// This is for transferring tensor data between C2 and backends.
-struct ExternalTensorDescriptor {
-  uint64_t dataType;
-  uint32_t dimensions;
-  const uint64_t* shape;
-  uint8_t isOffline = 0;
-  uint32_t quantizationAxis;
-  uint64_t quantizationParams;
-  const float* scales;
-  const int32_t* biases;
-  uint64_t buffer;
-};
-
-class ExternalTensorFunctionsBase {
- public:
-  explicit ExternalTensorFunctionsBase() {}
-  virtual ~ExternalTensorFunctionsBase() {}
-  virtual bool isQuantized() const = 0;
-  virtual bool IsSameMetaType(TypeIdentifier id) = 0;
-  virtual void SetupExternalTensorDescriptor(
-      const Blob* blob,
-      std::vector<std::vector<uint64_t>>* shapes,
-      std::vector<std::vector<float>>* all_scales,
-      std::vector<std::vector<int32_t>>* all_offsets,
-      ExternalTensorDescriptor* desc) = 0;
-  virtual void LoadInfoOfBlob(
-      const Blob* blob,
-      std::vector<float>* scale,
-      std::vector<float>* offset,
-      uint32_t* axis) = 0;
-  virtual TypeIdentifier GetTypeMetaId() = 0;
-  virtual TypeMeta GetExternalTensorType(const void* c) = 0;
-  virtual vector<int64_t> GetExternalTensorInfo(
-      const void* c,
-      size_t* capacity,
-      DeviceOption* device) = 0;
-};
-
-C10_DECLARE_TYPED_REGISTRY(
-    ExternalTensorFunctionsBaseRegistry,
-    TypeIdentifier,
-    ExternalTensorFunctionsBase,
-    std::unique_ptr);
-
-#define REGISTER_EXTERNAL_TENSOR_FUNCTIONS(id, ...) \
-  C10_REGISTER_TYPED_CLASS(ExternalTensorFunctionsBaseRegistry, id, __VA_ARGS__)
-inline unique_ptr<ExternalTensorFunctionsBase> CreateExternalTensorFunctions(
-    TypeIdentifier id) {
-  return ExternalTensorFunctionsBaseRegistry()->Create(id);
-}
-#endif // C10_MOBILE
-
-} // namespace caffe2
-
-C10_CLANG_DIAGNOSTIC_POP()
-
-#endif // CAFFE2_CORE_OPERATOR_H_

diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h
deleted file mode 100644
index 5c8d97a..0000000
--- a/caffe2/core/operator_gradient.h
+++ /dev/null

@@ -1,337 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_GRADIENT_H_
-#define CAFFE2_CORE_OPERATOR_GRADIENT_H_
-
-#include "c10/util/Registry.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/proto_utils.h"
-
-namespace caffe2 {
-
-/* @brief A struct that abstracts on top of dense and sparse blobs.
- *
- * For a dense blob, its gradient name should be written into dense_, and for
- * a sparse blob, its gradient name should be written into indice_ for
- * the sparse indices and value_ for the values.
- */
-struct TORCH_API GradientWrapper {
-  string dense_;
-  string indices_;
-  string values_;
-
-  inline bool IsDense() const {
-    return (dense_.size() != 0);
-  }
-  inline bool IsSparse() const {
-    return (indices_.size() != 0 || values_.size() != 0);
-  }
-  inline bool IsEmpty() const {
-    return (!IsDense() && !IsSparse());
-  }
-};
-
-/**
- * A struct that holds the gradient operators and related gradient maps.
- */
-struct TORCH_API GradientOpsMeta {
-  vector<OperatorDef> ops_;
-  vector<GradientWrapper> g_input_;
-
-  GradientOpsMeta() {}
-  GradientOpsMeta(
-      const vector<OperatorDef>& ops,
-      const vector<GradientWrapper>& v)
-      : ops_(ops), g_input_(v) {}
-};
-
-class TORCH_API GradientMakerBase {
- public:
-  GradientMakerBase(
-      const OperatorDef& def,
-      const vector<GradientWrapper>& g_output)
-      : def_(def), g_output_(g_output), g_input_(def.input_size()){};
-  virtual ~GradientMakerBase() {}
-  virtual bool CopyDeviceOption() const {
-    return true;
-  }
-  virtual bool CopyEngine() const {
-    return true;
-  }
-  virtual bool CopyArguments() const {
-    return true;
-  }
-
-  virtual void VerifyOp() const {
-    auto* schema = OpSchemaRegistry::Schema(def_.type());
-    if (schema) {
-      CAFFE_ENFORCE(
-          schema->Verify(def_),
-          "(GradientMaker) Operator def did not pass schema checking: ",
-          ProtoDebugString(def_));
-    }
-  }
-
-  /**
-   * @brief Returns the gradient ops meta.
-   *
-   * If your gradient op generator only use standard input and output
-   * manipulations, you can simply implement GetGradientDefs() that
-   * returns vector<OperatorDef>. In that, you can call GI, GI_V and GI_I
-   * that will automatically create the gradient registration for you.
-   *
-   * If you need to do custom gradient name registration, overload this
-   * function directly.
-   */
-  virtual GradientOpsMeta Get() {
-    VerifyOp();
-    vector<OperatorDef> new_defs = GetGradientDefs();
-    for (auto& opdef : new_defs) {
-      opdef.set_is_gradient_op(true);
-    }
-    return GradientOpsMeta(new_defs, g_input_);
-  };
-
-  const OperatorDef& Def() const {
-    return def_;
-  }
-
- protected:
-  virtual vector<OperatorDef> GetGradientDefs() {
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
-  // Helper functions to return names for the gradient computation.
-  // I(idx), O(idx): return the input and output names.
-  // GO(idx): return the name of the gradient for output idx.
-  // GI(idx), GI_I(idx), GI_V(idx): return the name of the gradient for
-  //     input idx, and also registers that name into the gradient
-  //     registry to be returned.
-  string I(const int i) {
-    CAFFE_ENFORCE((i >= 0) && (i < def_.input().size()));
-    return def_.input(i);
-  }
-  string O(const int i) {
-    CAFFE_ENFORCE((i >= 0) && (i < def_.output().size()));
-    return def_.output(i);
-  }
-  string GI(const int i) {
-    CAFFE_ENFORCE(
-        !g_input_.at(i).IsSparse(),
-        "Input ",
-        def_.input(i),
-        " already set to sparse.");
-    g_input_.at(i).dense_ = GradientName(def_.input(i));
-    return GradientName(def_.input(i));
-  }
-  string GI_I(const int i) {
-    CAFFE_ENFORCE(
-        !g_input_.at(i).IsDense(),
-        "Input ",
-        def_.input(i),
-        " already set to dense.");
-    g_input_.at(i).indices_ = GradientSliceIndices(def_.input(i));
-    return GradientSliceIndices(def_.input(i));
-  }
-  string GI_V(const int i) {
-    CAFFE_ENFORCE(
-        !g_input_.at(i).IsDense(),
-        "Input ",
-        def_.input(i),
-        " already set to dense.");
-    g_input_.at(i).values_ = GradientSliceValues(def_.input(i));
-    return GradientSliceValues(def_.input(i));
-  }
-  string GO(const int i) {
-    CAFFE_ENFORCE(
-        g_output_.at(i).IsDense(),
-        "Gradient of output ",
-        def_.output(i),
-        (g_output_.at(i).IsSparse() ? " is sparse (expected dense)."
-                                    : " is not provided!"));
-    return g_output_.at(i).dense_;
-  }
-  string GO_I(const int i) {
-    CAFFE_ENFORCE(
-        g_output_.at(i).IsSparse(),
-        "Gradient of output ",
-        def_.output(i),
-        (g_output_.at(i).IsDense() ? " is dense (expected sparse)."
-                                   : " is not provided!"));
-    return g_output_.at(i).indices_;
-  }
-  string GO_V(const int i) {
-    CAFFE_ENFORCE(
-        g_output_.at(i).IsSparse(),
-        "Gradient of output ",
-        def_.output(i),
-        (g_output_.at(i).IsDense() ? " is dense (expected sparse)."
-                                   : " is not provided!"));
-    return g_output_.at(i).values_;
-  }
-  const GradientWrapper& GradOut(int i) {
-    return g_output_.at(i);
-  }
-
-  // Function to add a gradient pair to map.
-  void SetDense(const int i, const string& name) {
-    CAFFE_ENFORCE(
-        !g_input_.at(i).IsSparse(),
-        "Input ",
-        def_.input(i),
-        " already set to sparse.");
-    g_input_.at(i).dense_ = name;
-  }
-  void SetSparse(const int i, const string& indices, const string& values) {
-    CAFFE_ENFORCE(
-        !g_input_.at(i).IsDense(),
-        "Input ",
-        def_.input(i),
-        " already set to dense.");
-    g_input_.at(i).indices_ = indices;
-    g_input_.at(i).values_ = values;
-  }
-
-  /**
-   * @brief a helper function to allow one to create one single operator
-   * def, which is usually the case for many simple operators.
-   */
-  template <class... Args>
-  inline static vector<OperatorDef> SingleGradientDef(const Args&... args) {
-    return vector<OperatorDef>{CreateOperatorDef(args...)};
-  }
-
- public:
-  /**
-    * Returns map that returns the parameters that the gradients are for.
-    */
-  static CaffeMap<string, string> MatchGradsToParams(const OperatorDef& op) {
-    // NOTE: how to go beyond string-matching?
-    CaffeMap<string, string> m;
-    for (auto& out : op.output()) {
-      if (IsGradientBlob(out)) {
-        m[out] = out.substr(0, out.length() - 5);
-      }
-    }
-    return m;
-  }
-
- private:
-  // Utility functions for gradient name computation. We don't expose them
-  // in order to discourage the use of such names explicitly.
-  static string GradientName(const string& name) {
-    return name + "_grad";
-  }
-
-  static bool IsGradientBlob(const string& name) {
-    return name.length() > 5 && name.find("_grad") == name.length() - 5;
-  }
-
-  static string GradientNameToParam(const string& name) {
-    CHECK(IsGradientBlob(name));
-    return name.substr(0, name.length() - 5);
-  }
-
-  static string GradientSliceIndices(const string& name) {
-    return name + "_grad_indices";
-  }
-
-  static string GradientSliceValues(const string& name) {
-    return name + "_grad_values";
-  }
-
- protected:
-  // We make the member variables protected in case someone wants to write
-  // a fully custom Get() function.
-  const OperatorDef& def_;
-  const vector<GradientWrapper>& g_output_;
-  vector<GradientWrapper> g_input_;
-};
-
-/**
- * @brief A helper class to indicate that the operator does not need gradient
- * computation.
- *
- * Use the macro NO_GRADIENT to register operators that do not have gradients.
- * Note that this is different fron SHOULD_NOT_DO_GRADIENT: the latter means
- * that the gradient computation should not flow through it at all, and throws
- * an error if it is called.
- */
-class TORCH_API NoGradient : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  vector<OperatorDef> GetGradientDefs() override {
-    return vector<OperatorDef>();
-  }
-};
-
-/**
- * @brief A helper class to indicate that the operator should have no gradient.
- *
- * This is used when the operator definition is designed to not have a gradient.
- * Calling a gradient on this operator def will cause Caffe2 to quit.
- */
-struct ThrowInTheTowelIfGradientIsCalled : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  GradientOpsMeta Get() override {
-    CAFFE_THROW("One should not call gradient for operator ", def_.type(), ".");
-  }
-};
-
-/**
- * @brief A helper class to indicate that the gradient mechanism is not ready.
- *
- * This should only be used sparsely when the gradient does exist, but we have
- * not implemented it yet and are using this as a lazy excuse. Eventually, a
- * gradient operator should be implemented.
- */
-struct GradientNotImplementedYet : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  GradientOpsMeta Get() override {
-    CAFFE_THROW(
-        "Operator ",
-        def_.type(),
-        " should have a gradient but is not implemented yet.");
-  }
-};
-
-C10_DECLARE_REGISTRY(
-    GradientRegistry,
-    GradientMakerBase,
-    const OperatorDef&,
-    const vector<GradientWrapper>&);
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-
-#define REGISTER_GRADIENT(name, ...) /* No gradients. */
-#define REGISTER_GRADIENT_STR(str_name, ...) /* No gradients. */
-
-#else
-
-#define REGISTER_GRADIENT(name, ...) \
-  C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__)
-#define REGISTER_GRADIENT_STR(str_name, ...) \
-  C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__)
-
-#endif
-
-// NO_GRADIENT means that the operator does not need any gradient computation.
-#define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient)
-
-// SHOULD_NOT_DO_GRADIENT means that the operator is not designed to have
-// gradient operators. If you attempt to call the gradient, a log fatal will
-// occur.
-#define SHOULD_NOT_DO_GRADIENT(name) \
-  REGISTER_GRADIENT(name, ThrowInTheTowelIfGradientIsCalled)
-
-#define GRADIENT_NOT_IMPLEMENTED_YET(name) \
-  REGISTER_GRADIENT(name, GradientNotImplementedYet)
-
-/**
- * @brief Gets the GradientOpsMeta for the given operator def.
- */
-TORCH_API GradientOpsMeta GetGradientForOp(
-    const OperatorDef& def,
-    const vector<GradientWrapper>& g_output);
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_OPERATOR_GRADIENT_H_

diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
deleted file mode 100644
index f5b9d0d..0000000
--- a/caffe2/core/operator_schema.h
+++ /dev/null

@@ -1,612 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_SCHEMA_H_
-#define CAFFE2_CORE_OPERATOR_SCHEMA_H_
-
-#include <climits>
-#include <functional>
-#include <initializer_list>
-#include <ostream>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
-#include <c10/util/irange.h>
-#include <c10/util/Registry.h>
-#include <caffe2/core/common.h>
-#include <caffe2/core/logging.h>
-#include <caffe2/core/types.h>
-#include <caffe2/proto/caffe2_pb.h>
-#include <caffe2/utils/filler.h>
-#include <caffe2/utils/proto_utils.h>
-
-namespace caffe2 {
-
-// A const value returned by OpSchema::CalculateOutput() if the number of
-// output cannot be determined.
-constexpr int kCannotComputeNumOutputs = -1;
-
-/**
- * @brief A class to record the schema of an op.
- *
- * OpSchema records the common interface of an op specified by its name. This
- * is optional for each operator implemented in Caffe2 but is strongly
- * recommended.
- *
- * To register an OpSchema, one can use the macro OPERATOR_SCHEMA(name) and
- * then append the various functions in the class. For example, for an op
- * that takes in two inputs, one output, and the first input and output
- * could be in-place, can be written as
- *
- *     OPERATOR_SCHEMA(name)
- *         .NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
- */
-class TORCH_API OpSchema {
- public:
-  OpSchema() : OpSchema("unknown", "unknown", 0) {}
-  OpSchema(const string& type, const string& file, const int line);
-
-  /**
-   * @brief Returns the file that the op schema is registered from.
-   */
-  inline const string& file() const {
-    return file_;
-  }
-
-  /**
-   * @brief Returns the line in file that the op schema is registered from.
-   */
-  inline int line() const {
-    return line_;
-  }
-
-  /**
-   * @brief Returns the docstring of the op schema.
-   */
-  inline const char* doc() const {
-    return doc_.empty() ? nullptr : doc_.c_str();
-  }
-
-  /**
-   * @brief Verifies if an operator definition protobuf matches the pattern
-   * specified in the schema.
-   */
-  bool Verify(const OperatorDef& def) const;
-
-  // Functions to set the property of the operator schemas.
-  // Sets the number of inputs, either a fixed number or a min and a max.
-
-  /**
-   * @brief A single input.
-   */
-  OpSchema& NumInputs(int n);
-  /**
-   * @brief Input could be in range [min, max], inclusive.
-   */
-  OpSchema& NumInputs(int min, int max);
-  /**
-   * @brief Input could be one of the values specified in allowed_input_nums.
-   */
-  OpSchema& NumInputs(set<int> allowed_input_nums);
-  /**
-   * @brief Input is checked with a specified function.
-   */
-  OpSchema& NumInputs(std::function<bool(int)> func);
-
-  // Sets the number of outputs, either a fixed number, a min and a max,
-  // or a function that takes in the input number and produces an output
-  // number. Use only one function in the set below.
-  /**
-   * @brief A single output.
-   */
-  OpSchema& NumOutputs(int n);
-  /**
-   * @brief Output could be in range [min, max], inclusive.
-   */
-  OpSchema& NumOutputs(int min, int max);
-  /**
-   * @brief Output could be one of the values specified in allowed_output_nums.
-   */
-  OpSchema& NumOutputs(set<int> allowed_output_nums);
-  /**
-   * @brief Output is checked with a specified function.
-   */
-  OpSchema& NumOutputs(std::function<bool(int)> func);
-
-  /**
-   * @brief Relationship between inputs and outputs is checked with a specified
-   * function.
-   */
-  OpSchema& NumInputsOutputs(std::function<bool(int, int)> func);
-
-  // Set the function that can calculate the number of output based on the
-  // number of input. Use only one function in the set below.
-  /**
-   * @brief Set the output calculator to a user-defined function.
-   */
-  OpSchema& OutputCalculator(std::function<int(int)> calc);
-  /**
-   * @brief Set the number of outputs to be the same as the number of inputs.
-   */
-  OpSchema& SameNumberOfOutput();
-
-  // Sets the rule to allow optional in-place operation.
-  OpSchema& AllowInplace(std::function<bool(int, int)> inplace);
-  OpSchema& AllowInplace(set<std::pair<int, int>> inplace);
-  OpSchema& AllowOneToOneInplace();
-  // Sets the rule to enforce in-place operation.
-  OpSchema& EnforceInplace(std::function<bool(int, int)> inplace);
-  OpSchema& EnforceInplace(set<std::pair<int, int>> inplace);
-  OpSchema& EnforceOneToOneInplace();
-
-  // Functions to deal with type and shape inference. Basically, this registers
-  // a function that takes in an OperatorDef and a series of input type and
-  // shape specified by TensorProto objects (whose data fields are empty), and
-  // produces a series of output type and shape.
-  typedef std::function<
-      vector<TensorShape>(const OperatorDef&, const vector<TensorShape>&)>
-      TensorInferenceFunctionType;
-
-  /**
-   * @brief Sets the tensor inference function, which is a std::function object
-   * defined in operator_schema.h.
-   */
-  OpSchema& TensorInferenceFunction(TensorInferenceFunctionType function);
-
-  /**
-   * A wrapper that makes an infer tensor function to return unknown
-   * shape for all outputs if any one of the inputs has unknown shape
-   */
-
-  static TensorInferenceFunctionType NeedsAllInputShapes(
-      TensorInferenceFunctionType f);
-
-  /**
-   * @brief Sets the corresponding onnx schema name
-   */
-  OpSchema& InheritOnnxSchema(const std::string& onnx_schema_name);
-
-  /**
-   * @brief Shortcut to InheritOnnxSchema(type_)
-   */
-  OpSchema& InheritOnnxSchema() {
-    return InheritOnnxSchema(type_);
-  }
-
-  /**
-   * @brief Sets the tensor inference function to produce the same output as
-   * the input.
-   */
-  OpSchema& IdenticalTypeAndShape();
-  OpSchema& IdenticalTypeAndShapeOfInput(int idx);
-  OpSchema& IdenticalTypeAndShapeOfInputDim(int idx, int dim);
-  OpSchema& IdenticalTypeAndShapeOfMultipleInputs(const vector<int>& indices);
-  OpSchema& ScalarType(::caffe2::TensorProto_DataType dt);
-
-  /**
-   * @brief A function to allow one to infer the type and shape from the op
-   * schema.
-   */
-  inline vector<TensorShape> InferTensor(
-      const OperatorDef& def,
-      const vector<TensorShape>& input_type_shape) const {
-    CAFFE_ENFORCE(
-        Verify(def),
-        "(InferTensor) Operator def did not pass schema checking: ",
-        ProtoDebugString(def));
-    return tensor_inference_function_(def, input_type_shape);
-  }
-
-  /*
-   * @brief A struct to store various cost information about
-   * an operator such as FLOPs, total memory use and parameters.
-   */
-  struct Cost {
-    uint64_t flops{0}; // Floating point operations.
-    uint64_t bytes_read{0}; // Total memory read.
-    uint64_t bytes_written{0}; // Total memory written.
-    uint64_t params_bytes{0}; // Memory read for parameters.
-  };
-  /**
-   * @brief Registers a function that takes in an OperatorDef
-   * and a series of input shapes and returns the total "cost"
-   * required to run the operator via struct by value.
-   */
-  typedef std::function<
-      struct Cost(const OperatorDef&, const vector<TensorShape>&)>
-      CostInferenceFunctionType;
-
-  /**
-   * @brief Register the Cost inference function.
-   */
-  OpSchema& CostInferenceFunction(CostInferenceFunctionType function);
-
-#if 0 // def _MSC_VER
-  /**
-   * @brief Register the Cost inference function via a pointer.
-   */
-  template <typename T,
-            typename = std::enable_if<
-                std::is_same<CostInferenceFunctionType&&, T>:value
-                >:type>
-  inline OpSchema& CostInferenceFunction(T func) {
-    // Note: This is here in order to resolve an MSVC compiler issue: it
-    // does not automatically convert a function pointer to a std::function,
-    // and needs an explicit conversion.
-    return CostInferenceFunction(CostInferenceFunctionType(func));
-  }
-#endif // _MSC_VER
-
-  bool HasCostInferenceFunction() const {
-    return !!cost_inference_function_;
-  }
-
-  inline struct Cost InferCost(
-      const OperatorDef& def,
-      const vector<TensorShape>& input_tensor_shape) const {
-    CAFFE_ENFORCE(
-        cost_inference_function_, "Cost inference function not defined.");
-    return (*cost_inference_function_)(def, input_tensor_shape);
-  }
-
-  // Functions to do documentation for the operator schema.
-  OpSchema& SetDoc(const string& doc);
-
-  struct Argument {
-    Argument(const char* name, const char* description, bool required)
-        : name_{name}, description_{description}, required_{required} {}
-
-    const char* name() const {
-      return name_;
-    }
-
-    const char* description() const {
-      return description_;
-    }
-
-    bool is_required() const {
-      return required_;
-    }
-
-   private:
-    const char* name_;
-    const char* description_;
-    const bool required_;
-  };
-
-  OpSchema&
-  Arg(const char* name, const char* description, bool required = false);
-
-#define DECLARE_STANDARD_ARG(name, str) \
-  static const char* Arg_##name;        \
-  OpSchema& Arg##name(const char* description);
-
-  DECLARE_STANDARD_ARG(IsTest, is_test)
-
-#undef DECLARE_STANDARD_ARG
-
-  OpSchema& Input(const int n, const char* name, const char* description);
-  OpSchema& Output(const int n, const char* name, const char* description);
-  // Calls the passed function with `this` as an argument. Useful for
-  // adding docs for templated/macro ops.
-  OpSchema& FillUsing(std::function<void(OpSchema&)> populator);
-
-  // Remove from documentation
-  OpSchema& Private();
-
-  // This op can pass data across devices
-  OpSchema& InputsCanCrossDevices();
-
-  /**
-   * @brief A function to allow one to get the number of outputs based on the
-   * number of inputs, if this schema supports it.
-   */
-  int CalculateOutput(int num_input) const;
-
-  const std::string& onnx_schema() const {
-    return onnx_schema_;
-  }
-
-  int min_input() const {
-    return min_input_;
-  }
-
-  int max_input() const {
-    return max_input_;
-  }
-
-  int min_output() const {
-    return min_output_;
-  }
-
-  int max_output() const {
-    return max_output_;
-  }
-
-  bool num_inputs_allowed(int x) const {
-    return num_inputs_allowed_(x);
-  }
-
-  bool num_outputs_allowed(int x) const {
-    return num_outputs_allowed_(x);
-  }
-
-  bool num_inputs_outputs_allowed(int x, int y) const {
-    return num_inputs_outputs_allowed_(x, y);
-  }
-
-  int inf() const {
-    return std::numeric_limits<int>::max();
-  }
-
-  bool inplace_enforced(int x, int y) const {
-    return inplace_enforced_(x, y);
-  }
-
-  TORCH_API friend std::ostream& operator<<(
-      std::ostream& out,
-      const OpSchema& schema);
-
-  const std::vector<Argument>& args() const {
-    return args_;
-  }
-
-  const std::vector<std::pair<const char*, const char*>>& input_desc() const {
-    return input_desc_;
-  }
-  const std::vector<std::pair<const char*, const char*>>& output_desc() const {
-    return output_desc_;
-  }
-  bool private_op() {
-    return private_;
-  }
-  bool inputs_can_cross_devices() const {
-    return inputs_can_cross_devices_;
-  }
-
-  /**
-   * @brief Returns the required device location of inputs and outputs.
-   */
-  using DeviceInferenceFunctionType = std::function<
-      std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>(
-          const OperatorDef& def)>;
-
-  OpSchema& DeviceInferenceFunction(DeviceInferenceFunctionType function);
-
-  /**
-   * @brief Infer required device location of an op's inputs and outputs
-   */
-  inline std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
-  InferDevice(const OperatorDef& def) const {
-    return device_inference_function_(def);
-  }
-
-  // The helper is build sparse input with values, keys, weights and lengths;
-  // e.g.:
-  // values  = [1, 2, 3, 2, 4, 6, 7, 3, 6]
-  // keys    = [0, 1, 4, 0, 1, 2, 5, 1, 2]
-  // weights = [1, 2, 3, 4, 5, 6, 7, 8, 9]
-  //            \_____/  \________/  \__/
-  // lengths =    [3,        4,       2]
-  OpSchema& WeightedValueKeyLengthInputFillers(
-      size_t value_index,
-      size_t key_index,
-      size_t length_index,
-      size_t weight_index);
-
-  // The helper is build sparse input with values, keys, weights and lengths;
-  // e.g.:
-  // values  = [1, 2, 3, 2, 4, 6, 7, 3, 6]
-  // keys    = [0, 1, 4, 0, 1, 2, 5, 1, 2]
-  //            \_____/  \________/  \__/
-  // lengths =    [3,        4,       2]
-  OpSchema& ValueKeyLengthInputFillers(
-      size_t value_index,
-      size_t key_index,
-      size_t length_index);
-
-  // The helper is build sparse input with values and lengths; e.g.:
-  // values  = [1, 2, 3, 2, 4, 6, 7, 3, 6]
-  //            \_____/  \________/  \__/
-  // lengths =    [3,        4,       2]
-  OpSchema& ValueLengthInputFillers(size_t value_index, size_t length_index);
-
-  OpSchema& DisallowInputFillers();
-
-  std::vector<TensorFiller> InputFillers(
-      const std::vector<std::vector<int64_t>>& shapes) const;
-
- private:
-  std::vector<TensorFiller> SupplyDenseFillers(
-      const std::vector<std::vector<int64_t>>& shapes);
-
- private:
-  string type_;
-  string file_;
-  string doc_;
-  string onnx_schema_;
-  std::vector<Argument> args_{};
-  std::vector<std::pair<const char*, const char*>> input_desc_{};
-  std::vector<std::pair<const char*, const char*>> output_desc_{};
-  int line_ = 0;
-  int min_input_ = 0;
-  int max_input_ = std::numeric_limits<int>::max();
-  int min_output_ = 0;
-  int max_output_ = std::numeric_limits<int>::max();
-  bool private_ = false;
-  bool inputs_can_cross_devices_ = false;
-  std::function<bool(int)> num_inputs_allowed_ = [](int) { return true; };
-  std::function<bool(int)> num_outputs_allowed_ = [](int) { return true; };
-  std::function<bool(int, int)> num_inputs_outputs_allowed_ = [](int, int) {
-    return true;
-  };
-  std::function<int(int)> calculate_output_;
-  // In default, any in-place operation is neither allowed nor enforced.
-  std::function<bool(int, int)> inplace_allowed_ = [](int, int) {
-    return false;
-  };
-  std::function<bool(int, int)> inplace_enforced_ = [](int, int) {
-    return false;
-  };
-  TensorInferenceFunctionType tensor_inference_function_;
-  std::unique_ptr<CostInferenceFunctionType> cost_inference_function_ = nullptr;
-  DeviceInferenceFunctionType device_inference_function_;
-
-  std::function<std::vector<TensorFiller>(
-      const std::vector<std::vector<int64_t>>&)>
-      filler_supplier_ =
-          [this](const std::vector<std::vector<int64_t>>& shapes) {
-            return SupplyDenseFillers(shapes);
-          };
-};
-
-/**
- * @brief A registry to hold all the operator schemas.
- */
-class TORCH_API OpSchemaRegistry {
- public:
-  static OpSchema&
-  NewSchema(const string& key, const string& file, const int line);
-
-  static const OpSchema* Schema(const string& key) {
-    auto& m = map();
-    auto it = m.find(key);
-    if (it != m.end()) {
-      return &it->second;
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  // OpSchemaRegistry should not need to be instantiated.
-  OpSchemaRegistry() = delete;
-
-  /**
-   * @brief Returns the underlying string to OpSchema map.
-   *
-   * You should not manually manipulate the map object returned. Instead, use
-   * the macros defined such as OPERATOR_SCHEMA to register your operator
-   * schema.
-   *
-   * We wrap it inside a function to avoid the static initialization order
-   * fiasco.
-   */
-  static CaffeMap<string, OpSchema>& map();
-};
-
-// Helper function for creating simple tensorproto with dimension and type
-template <typename T_I = int>
-inline TensorShape CreateTensorShape(
-    vector<T_I> dims,
-    ::caffe2::TensorProto_DataType dt) {
-  TensorShape ts;
-  for (T_I d : dims) {
-    ts.add_dims(d);
-  }
-  ts.set_data_type(dt);
-  return ts;
-}
-
-// Helper function
-inline vector<int64_t> GetDimsVector(const TensorShape& shape) {
-  vector<int64_t> dims;
-  for (auto d : shape.dims()) {
-    dims.push_back(d);
-  }
-  return dims;
-}
-
-// Helper function
-inline uint64_t nElemFromDim(const TensorShape& X, int dim = 0) {
-  CAFFE_ENFORCE_GE(dim, 0, "Invalid maximum index specified");
-
-  uint64_t nElem = 1;
-  for (const auto i : c10::irange(dim, X.dims_size())) {
-    nElem *= X.dims(i);
-  }
-  return nElem;
-}
-
-// Helper function
-inline uint64_t nElemBetweenDim(const TensorShape& X, int start, int stop) {
-  CAFFE_ENFORCE_GE(start, 0, "Invalid maximum index specified");
-  CAFFE_ENFORCE_LE(stop, X.dims_size(), "Invalid maximum index specified");
-
-  uint64_t nElem = 1;
-  for (const auto i : c10::irange(start, stop)) {
-    nElem *= X.dims(i);
-  }
-  return nElem;
-}
-
-// Helper function for infer op inputs and outputs device information.
-inline std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
-InferOpInputOutputDevice(const OperatorDef& op) {
-  auto op_schema = OpSchemaRegistry::Schema(op.type());
-  if (op_schema) {
-    // op_schema found
-    return op_schema->InferDevice(op);
-
-  } else {
-    // No schema for op.type registered
-    auto temp_schema = OpSchema();
-    return temp_schema.InferDevice(op);
-  }
-}
-
-template <uint64_t OpsPerPoint>
-OpSchema::Cost PointwiseCostInference(
-    const OperatorDef& /* unused */,
-    const vector<TensorShape>& inputs) {
-  struct OpSchema::Cost c;
-  const TensorShape X = inputs[0];
-  uint64_t nElemX = nElemFromDim(X);
-  uint64_t nElemRead = 0;
-  for (const auto i : c10::irange(inputs.size())) {
-    nElemRead += nElemFromDim(inputs[i]);
-  }
-
-  c.flops = nElemX * OpsPerPoint;
-  auto const& X_element_size_byte =
-      DataTypeToTypeMeta(X.data_type()).itemsize();
-  c.bytes_read = nElemRead * X_element_size_byte;
-  c.bytes_written = nElemX * X_element_size_byte;
-  return c;
-}
-
-} // namespace caffe2
-
-#if defined(_MSC_VER)
-#define EXPORT_IF_NOT_MSVC
-#else
-#define EXPORT_IF_NOT_MSVC C10_EXPORT
-#endif
-
-#ifndef CAFFE2_NO_OPERATOR_SCHEMA
-
-#define OPERATOR_SCHEMA(name)                                               \
-  EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
-  static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED =             \
-      &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#else // CAFFE2_NO_OPERATOR_SCHEMA
-
-#define OPERATOR_SCHEMA(name)                                               \
-  EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
-  static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED =             \
-      1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#endif // CAFFE2_NO_OPERATOR_SCHEMA
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-
-#define GRADIENT_OPERATOR_SCHEMA(name)                                      \
-  EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
-  static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED =             \
-      1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#else
-
-#define GRADIENT_OPERATOR_SCHEMA(name) OPERATOR_SCHEMA(name)
-
-#endif
-#endif // CAFFE2_CORE_OPERATOR_SCHEMA_H_

diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h
deleted file mode 100644
index e9bd6ed..0000000
--- a/caffe2/core/storage.h
+++ /dev/null

@@ -1,33 +0,0 @@
-#ifndef CAFFE2_CORE_STORAGE_H_
-#define CAFFE2_CORE_STORAGE_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <fstream>
-#include <sstream>
-#include <type_traits>
-#include <typeinfo>
-#include <vector>
-
-#include "caffe2/core/allocator.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/flags.h"
-#include "caffe2/core/logging.h"
-#include <c10/util/typeid.h>
-
-#include <c10/core/Allocator.h>
-#include <c10/core/Device.h>
-#include <c10/core/DeviceType.h>
-#include <c10/util/intrusive_ptr.h>
-#include <c10/core/Storage.h>
-#include <c10/core/StorageImpl.h>
-
-namespace caffe2 {
-
-using StorageImpl = at::StorageImpl;
-using Storage = at::Storage;
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_STORAGE_H_

diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
deleted file mode 100644
index 1171605..0000000
--- a/caffe2/core/tensor.h
+++ /dev/null

@@ -1,674 +0,0 @@
-#ifndef CAFFE2_CORE_TENSOR_H_
-#define CAFFE2_CORE_TENSOR_H_
-
-#include <c10/macros/Macros.h>
-#include "caffe2/core/storage.h"
-
-#include <c10/core/SymIntArrayRef.h>
-#include <ATen/core/UndefinedTensorImpl.h>
-#include <c10/core/TensorOptions.h>
-#include <c10/util/ExclusivelyOwned.h>
-#include <c10/util/ExclusivelyOwnedTensorTraits.h>
-#include <c10/util/intrusive_ptr.h>
-
-C10_CLANG_DIAGNOSTIC_PUSH()
-#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32")
-C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
-#endif
-
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-namespace at {
-class Tensor;
-};
-#endif
-namespace caffe2 {
-
-using at::UndefinedTensorImpl;
-
-/**
- * @brief Tensor class holds a shared pointer to the implementation TensorImpl,
- * redirects API calls to TensorImpl;
- * Copying of Tensor results in sharing the same underlying implementation
- * object
- *
- * NB: See TensorImpl for documentation on these methods.
- */
-class TORCH_API Tensor final {
- private:
-  enum Unsafe { IDoWantAliasing };
-  Tensor(const Tensor& other, Unsafe _) : impl_(other.getIntrusivePtr()) {}
-
- protected:
-  using TensorImplPtr = c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
-  TensorImplPtr impl_;
-
-  void enforce_invariants();
-
- public:
-  Tensor() : impl_() {}
-
-  Tensor(const Tensor& t) : impl_(t.impl_) {}
-  Tensor& operator=(const Tensor& t) {
-    impl_ = t.impl_;
-    return *this;
-  }
-
-  Tensor(Tensor&&) = default;
-  Tensor& operator=(Tensor&&) = default;
-
-  operator bool() const {
-    return impl_.defined();
-  }
-
-  TensorImpl* unsafeGetTensorImpl() const {
-    return impl_.get();
-  }
-
-  TensorImpl* unsafeReleaseTensorImpl() {
-    return impl_.release();
-  }
-
-  Tensor UnsafeSharedInstance() const {
-    return Tensor(*this, IDoWantAliasing);
-  }
-
-  /**
-   * @brief Creates a tensor of the given device type.
-   *
-   * Note that the actual data allocation is not going to be carried out until
-   * you resize the tensor and then call mutable_data().
-   */
-  explicit Tensor(at::Device device)
-      : impl_(c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(
-            Storage::create_legacy(device),
-            c10::computeDispatchKey(c10::nullopt, at::kStrided, device),
-            TypeMeta())) {}
-
-  /**
-   * @brief Creates a tensor of the given dimension.
-   *
-   * Note that the actual data allocation is not going to be carried out until
-   * the first time mutable_data() is called.
-   */
-  explicit Tensor(at::IntArrayRef dims, DeviceType type) : Tensor(type) {
-    // TODO: here, we create a Storage
-    // and immediately discard it in Resize() since
-    // reset_tensor will be true and FreeMemory will be called,
-    // we might want to avoid creating Storage twice?
-    Resize(dims);
-  }
-
-  // we want to preserve index information
-  explicit Tensor(at::IntArrayRef dims, at::Device device) : Tensor(device) {
-    Resize(dims);
-  }
-
-  // TODO: remove?
-  explicit Tensor(const vector<int>& dims, DeviceType type) : Tensor(type) {
-    Resize(dims);
-  }
-
-  /**
-   * @brief: Create a Tensor of at::DeviceType `type` and initialize it with
-   * src Tensor
-   */
-  Tensor(const Tensor& src, DeviceType type) : Tensor(type) {
-    CopyFrom(src);
-  }
-
-  /**
-   * @brief Mutual conversion with at::Tensor
-   *
-   * The tensor will share the same instance (data, strides, sizes, etc) but
-   * a different subset of APIs would be available
-   */
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  explicit Tensor(at::Tensor tensor);
-
-  explicit operator at::Tensor() const&;
-
-  explicit operator at::Tensor() &&;
-#endif
-
-  bool is_same(const Tensor& other) const noexcept {
-    return impl_ == other.impl_;
-  }
-
-  Tensor Clone() const {
-    Tensor x(GetDevice());
-    x.CopyFrom(*this);
-    return x;
-  }
-
-  /**
-   * Clone self as a Tensor that share the same Storage,
-   * that is, both Tensors are views on the same Storage.
-   * If we change the sizes or strides of one Tensor, it
-   * does not affect the other Tensor that it shares Storage
-   * with.
-   * A similar yet different usage is `Tensor x = y;`, this
-   * will make x and y pointing to the same Tensor and resizing
-   * one of them will resize the other as well.
-   *
-   * TODO: Deduplicate this with THTensor_(newWithTensor)
-   * (exposed in ATen as at::alias but not otherwise available)
-   */
-  Tensor Alias() const {
-    Tensor x(sizes(), GetDevice());
-    if (!dtype_initialized()) {
-      C10_LOG_EVERY_MS(WARNING, 1000)
-          << "Cloning a tensor that don't have a data type (did you call mutable_data<T> on the tensor?)";
-    }
-    AT_ASSERTM(
-        storage_initialized(),
-        "Cloning a tensor that has no content and has size > 0");
-    // set_storage already sets data_type_ of TensorImpl
-    x.impl_->set_storage_and_dtype(storage(), impl_->dtype());
-    x.impl_->set_storage_offset(impl_->storage_offset());
-    x.impl_->set_sizes_and_strides(sizes(), strides());
-    return x;
-  }
-
-  DeviceType GetDeviceType() const {
-    return impl_->device_type();
-  }
-
-  at::Device GetDevice() const {
-    return impl_.get()->device();
-  }
-
-  /**
-   * @brief Copies the data from a source tensor, with a context provided to
-   * carry out the underlying memcpy operation.  This method respects
-   * caffe2_keep_on_shrink.
-   *
-   * After CopyFrom, this function guarantees that the destination tensor will
-   * have the same initialization state and dtype as src.  This function
-   * preserves the DeviceType of the source tensor (so, e.g., if you allocate
-   * a tensor on CPU and then CopyFrom a CUDA tensor, that will to a
-   * CUDA-to-CPU transfer).
-   *
-   * 'async' parameter triggers async copy for CUDA tensors
-   */
-  void CopyFrom(const Tensor& src, bool async = false);
-
-  /**
-   * @brief Extend the outer-most dimension of this tensor
-   *        to dimension of `num`.
-   */
-  void ExtendTo(int64_t num, float growthPct) const {
-    CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1);
-    CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
-    Extend(num - impl_->size(0), growthPct);
-  }
-
-  void Extend(int64_t num, float growthPct) const {
-    impl_.get()->Extend(num, growthPct);
-  }
-
-  /**
-   * @brief Shrinks the outer-most dimension to given size, keeping the data.
-   *
-   * This method guarantees that no re-allocations are carried out, which means
-   * that the extra capacity after the end of the shrunk tensor is maintained.
-   * Notably, this function does NOT respect caffe2_keep_on_shrink.
-   */
-  void ShrinkTo(int64_t outer_dim) const {
-    CAFFE_ENFORCE_WITH_CALLER(
-        impl_->is_contiguous(),
-        "Right now ShrinkTo is only supported on contiguous Tensor.");
-    CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D");
-    CAFFE_ENFORCE_WITH_CALLER(
-        outer_dim <= impl_->size(0),
-        "New outer dimension must be smaller than current.");
-    CAFFE_ENFORCE(
-        impl_->storage().unique(),
-        "Can't call ShrinkTo on shared storage, please call Resize instead.");
-    impl_.get()->set_size(0, outer_dim);
-  }
-
-  template <class T>
-  void ReserveSpace(const T& outer_dim) const {
-    impl_.get()->ReserveSpace(outer_dim);
-  }
-
-  template <typename... Ts>
-  void Resize(Ts... dim_source) const {
-    impl_.get()->Resize(dim_source...);
-  }
-
-  template <typename T>
-  void Resize(const std::vector<T>& dim_source) const {
-    impl_.get()->Resize(ArrayRef<T>(dim_source));
-  }
-
-  /**
-   * Resize the tensor like the source tensor. Note that this is just a
-   * sugar wrapper that essentially calls Resize(src_tensor.dims()).
-   * This method respects caffe2_keep_on_shrink.
-   */
-  inline void ResizeLike(const Tensor& src_tensor) const {
-    CAFFE_ENFORCE_WITH_CALLER(
-        src_tensor.is_contiguous(),
-        "Right now ResizeLike is only supported for contiguous Tensor.");
-    if (impl_ != src_tensor.impl_) {
-      impl_.get()->Resize(src_tensor.sizes());
-    }
-  }
-
-  inline void Reshape(const vector<int64_t>& dims) const {
-    impl_.get()->Reshape(dims);
-  }
-
-  inline void Reshape(const vector<int>& dims) const {
-    impl_.get()->Reshape(ToVectorint64_t(dims));
-  }
-
-  inline void FreeMemory() const {
-    impl_.get()->FreeMemory();
-  }
-
-  /**
-   * A utility function to print the debug string for the tensor. Note that this
-   * is very slow since it involves quite some string operations, so do not use
-   * it in your performance-critical code.
-   */
-  string DebugString() const {
-    std::stringstream ss;
-    ss << "A Tensor of item size " << impl_->dtype().itemsize() << " and type "
-       << impl_->dtype().name() << " and dimension (";
-    for (int d : impl_->sizes()) {
-      ss << d << ",";
-    }
-    ss << ").";
-    return ss.str();
-  }
-
-  // To be deprecated
-  void ShareData(const Tensor& src) const {
-    impl_.get()->ShareData(*src.impl_.get());
-  }
-
-  /**
-   * @brief Shares the data with an externally managed pointer.
-   *
-   * This is similar to ShareData() but the source is a pointer with an advanced
-   * deleter option. In default, no deletion takes place, and one needs to make
-   * sure that the external memory is deallocated only after the tensor finishes
-   * using it. If a Deleter object is passed in, when this tensor is reallocated
-   * or freed, the deleter function is going to be called.
-   */
-  template <typename T>
-  void ShareExternalPointer(
-      T* src,
-      size_t nbytes = 0,
-      MemoryDeleter d = nullptr) const {
-    ShareExternalPointer((void*)src, caffe2::TypeMeta::Make<T>(), nbytes, d);
-  }
-
-  template <typename T>
-  void ShareExternalPointer(at::DataPtr&& data_ptr, size_t nbytes = 0) const {
-    ShareExternalPointer(
-        std::move(data_ptr), caffe2::TypeMeta::Make<T>(), nbytes);
-  }
-
-  void ShareExternalPointer(
-      void* src,
-      const TypeMeta data_type,
-      size_t nbytes = 0,
-      MemoryDeleter d = nullptr) const {
-    CAFFE_ENFORCE_WITH_CALLER(
-        impl_->is_contiguous(),
-        "Right now ShareExternalPointer is only supported for contiguous Tensor.");
-    CAFFE_ENFORCE_WITH_CALLER(
-        data_type != ScalarType::Undefined,
-        "To share with a raw external pointer you need to pass in an "
-        "initialized data_type(TypeMeta).");
-    impl_.get()->ShareExternalPointer(
-        at::DataPtr(src, src, d, impl_->device_type()), data_type, nbytes);
-  }
-
-  void ShareExternalPointer(
-      at::DataPtr&& data_ptr,
-      const TypeMeta data_type,
-      size_t nbytes) {
-    impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, nbytes);
-  }
-
-  const c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>& getIntrusivePtr()
-      const {
-    return impl_;
-  }
-
-  bool defined() const {
-    return impl_;
-  }
-
-  /**
-   * Returns a raw void* pointer of the underlying storage. mutable_data()
-   * or raw_mutable_data() must have been called prior to this function call.
-   */
-  inline void* raw_data() const {
-    return impl_->mutable_data();
-  }
-
-  template <typename T>
-  inline T* data() const {
-    return impl_.get()->mutable_data_dtype_initialized<T>();
-  }
-
-  inline void* raw_mutable_data(const TypeMeta meta) const {
-    return impl_.get()->raw_mutable_data(meta);
-  }
-
-  /**
-   * Returns a mutable raw pointer of the underlying storage. This can only be
-   * used when you know for sure that the underlying storage of the tensor is
-   * already created via an earlier raw_mutable_data(meta) call or a
-   * mutable_data<T>() call.
-   *
-   * If the existing data does not match the desired type, it will be deleted
-   * and a new storage will be created.
-   */
-  inline void* raw_mutable_data() const {
-    const auto& data_type = impl_->dtype();
-    CAFFE_ENFORCE_WITH_CALLER(
-        data_type != ScalarType::Undefined,
-        "Calling raw_mutable_data() without meta, but the current meta is "
-        "of unknown type.");
-    return raw_mutable_data(data_type);
-  }
-
-  template <typename T>
-  inline T* mutable_data() const {
-    return impl_.get()->mutable_data<T>();
-  }
-
-  /**
-   * Returns the number of dimensions of the data.
-   */
-  inline int dim() const {
-    return impl_->dim();
-  }
-
-  /**
-   * (To be deprecated) Returns the number of dimensions of the data.
-   */
-  inline int ndim() const {
-    return impl_->dim();
-  }
-
-  /**
-   * (To be deprecated) Returns the size (i.e. the number of items) of the
-   * tensor.
-   */
-  inline int64_t size() const {
-    return impl_->numel();
-  }
-
-  /**
-   * Returns the number of items of the tensor.
-   */
-  inline int64_t numel() const {
-    return impl_->numel();
-  }
-
-  /**
-   * Return the number of bytes each item takes in the tensor.
-   */
-  inline size_t itemsize() const {
-    return impl_->dtype().itemsize();
-  }
-
-  /**
-   * Returns the total number of bytes of the storage.
-   *
-   * This is equivalent to calling size() * itemsize().
-   */
-  inline size_t nbytes() const {
-    return impl_->numel() * itemsize();
-  }
-
-  inline at::IntArrayRef sizes() const {
-    return impl_.get()->sizes();
-  }
-
-  inline c10::SymIntArrayRef sym_sizes() const {
-    return impl_->sym_sizes();
-  }
-
-  inline c10::SymInt sym_numel() const {
-    return impl_->sym_numel();
-  }
-
-  inline c10::SymIntArrayRef sym_strides() const {
-    return impl_->sym_strides();
-  }
-
-  inline int64_t size_from_dim(int k) const {
-    return size_from_dim_(k, impl_->sizes());
-  }
-
-  inline int64_t size_to_dim(int k) const {
-    return size_to_dim_(k, impl_->sizes());
-  }
-
-  inline int64_t size_between_dim(int k, int l) const {
-    return size_between_dim_(k, l, impl_->sizes());
-  }
-
-  /**
-   * Returns the 'canonical' version of a (usually)  user-specified axis,
-   * allowing for negative indexing (e.g., -1 for the last axis).
-   *
-   * @param axis_index the axis index.
-   *        If 0 <= index < dim(), return index.
-   *        If -ndim <= index <= -1, return (dim() - (-index)),
-   *        e.g., the last axis index (dim() - 1) if index == -1,
-   *        the second to last if index == -2, etc.
-   *        Dies on out of range index.
-   */
-  inline int canonical_axis_index(int axis_index) const {
-    return canonical_axis_index_(axis_index, impl_->dim());
-  }
-
-  inline int64_t stride(int64_t dim) const {
-    return impl_.get()->stride(dim);
-  }
-
-  inline at::IntArrayRef strides() const {
-    return impl_.get()->strides();
-  }
-
-  inline bool is_contiguous(
-      at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const {
-    return impl_.get()->is_contiguous(memory_format);
-  }
-
-  /**
-   * Checks if the tensor content is of the given data type.
-   */
-  template <typename T>
-  inline bool IsType() const {
-    return impl_->dtype().Match<T>();
-  }
-
-  /**
-   * Returns the TypeMeta object associated with the current data type.
-   */
-  inline const TypeMeta dtype() const {
-    return impl_->dtype();
-  }
-
-  /**
-   * (To be deprecated) Returns the TypeMeta object associated with the current
-   * data type.
-   */
-  inline const TypeMeta meta() const {
-    return impl_->dtype();
-  }
-
-  /**
-   * Returns the i-th dimension of the tensor in int.
-   *
-   * This function returns an int value instead of int64_t, which depending on
-   * the typedef could be int64. If you want int64 dim values, make sure you
-   * call dim() instead.
-   */
-  inline int dim32(const int i) const {
-#ifndef NDEBUG
-    CAFFE_ENFORCE_LT_WITH_CALLER(
-        i, static_cast<int>(impl_->dim()), "Exceeding ndim limit");
-    CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
-#endif
-    // Avoid TensorImpl::size() because it is a virtual call that
-    // supports out-of-range indexing like Python.
-    auto s = impl_->sizes()[i];
-    CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits<int>::max());
-    return static_cast<int>(s);
-  }
-
-  inline int64_t size(const int i) const {
-    return impl_->size(i);
-  }
-
-  // To be deprecated
-  inline int64_t dim(const int i) const {
-    return impl_->size(i);
-  }
-
-  const Storage& storage() {
-    return impl_->storage();
-  }
-
-  const Storage& storage() const {
-    return impl_->storage();
-  }
-
-  bool storage_initialized() const {
-    return impl_->storage_initialized();
-  }
-
-  bool dtype_initialized() const {
-    return impl_->dtype_initialized();
-  }
-};
-
-/**
- * Reinitialize a Tensor to given dims and options if necessary, note that
- * this will not do anything if the
- * Tensor already has correct size and data type
- */
-TORCH_API void
-ReinitializeTensor(Tensor* t, at::IntArrayRef dims, at::TensorOptions options);
-
-TORCH_API void ReinitializeAndCopyFrom(
-    Tensor* t,
-    at::TensorOptions options,
-    const Tensor& src,
-    bool async = false);
-
-using TensorCPU = Tensor;
-
-constexpr int k_limit_default_ = 1000;
-
-// TODO: the following logic can be merged into regular Tensor class methods
-// after MKLMemory starts to implement Tensor interface
-
-// Type call registry
-typedef TypeMeta (*TypeCall)(const void*);
-TypeCall GetTypeCallFunction(TypeIdentifier id);
-void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c);
-
-// Shape call registry
-typedef vector<int64_t> (
-    *TensorInfoCall)(const void*, size_t* capacity, DeviceOption* device);
-TensorInfoCall GetTensorInfoFunction(TypeIdentifier id);
-void RegisterTensorInfoFunction(TypeIdentifier id, TensorInfoCall c);
-
-// resize helper function
-void TensorVectorResize(
-    std::vector<Tensor>& tensors,
-    int size,
-    DeviceType type);
-
-// Tensor factory function
-TORCH_API Tensor empty(at::IntArrayRef dims, at::TensorOptions options);
-
-/**
- * @brief Creates a CPU tensor, and fills its contents with the given values.
- * Values are copied in
- */
-// TODO: can be unified with at::from_blob when Tensor is merged and string
-// types are supported
-template <typename T>
-Tensor TensorCPUFromValues(at::IntArrayRef dims, at::ArrayRef<T> values) {
-  Tensor r = empty(dims, at::device(CPU).dtype<T>());
-  CAFFE_ENFORCE_EQ(values.size(), r.numel());
-  CPUContext context;
-  context.CopyItemsFromCPU(
-      r.dtype(), values.size(), values.data(), r.mutable_data<T>());
-  return r;
-}
-
-vector<int64_t>
-GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device);
-
-class TORCH_API TensorPrinter {
- public:
-  explicit TensorPrinter(
-      const std::string& tensor_name = "",
-      const std::string& file_name = "",
-      int limit = k_limit_default_);
-  ~TensorPrinter();
-
-  template <class T>
-  void Print(const Tensor& tensor);
-
-  void PrintMeta(const Tensor& tensor);
-
-  string MetaStr(const Tensor& tensor);
-
- private:
-  bool to_file_;
-  int limit_;
-  std::unique_ptr<std::ofstream> log_file_;
-  std::string tensor_name_;
-};
-
-template <class T>
-void TensorPrinter::Print(const Tensor& tensor) {
-  std::stringstream values_stream;
-  // One most likely doesn't want to print int64-number of items for visual
-  // inspection, so we cast down to int here.
-  int total_count = static_cast<int>(std::min(tensor.numel(), int64_t(limit_)));
-
-  const T* tensor_data = tensor.template data<T>();
-  for (int i = 0; i < total_count - 1; ++i) {
-    values_stream << tensor_data[i] << ",";
-  }
-  if (total_count) {
-    // We do not add a comma after the last item.
-    values_stream << tensor_data[total_count - 1];
-  }
-
-  if (to_file_) {
-    (*log_file_) << MetaStr(tensor) << values_stream.str() << std::endl;
-  } else {
-    // Log to console.
-    LOG(INFO) << MetaStr(tensor) << values_stream.str();
-  }
-}
-
-CAFFE_DECLARE_KNOWN_TYPE(Tensor, Caffe2Tensor)
-} // namespace caffe2
-
-C10_CLANG_DIAGNOSTIC_POP()
-
-namespace c10 {
-template <>
-struct ExclusivelyOwnedTraits<caffe2::Tensor> : public c10::ExclusivelyOwnedTensorTraits<caffe2::Tensor> {};
-} // namespace c10
-#endif // CAFFE2_CORE_TENSOR_H_

diff --git a/caffe2/core/tensor_int8.h b/caffe2/core/tensor_int8.h
deleted file mode 100644
index b95b7b8..0000000
--- a/caffe2/core/tensor_int8.h
+++ /dev/null

@@ -1,21 +0,0 @@
-#ifndef CAFFE2_TENSOR_INT8_H_
-#define CAFFE2_TENSOR_INT8_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-namespace int8 {
-
-struct Int8TensorCPU {
-  float scale{1.0};
-  int32_t zero_point{0};
-  // Generally stores uint8_t data, but sometimes int32_t (e.g. bias
-  // parameters).
-  Tensor t{CPU};
-};
-} // namespace int8
-} // namespace caffe2
-
-#endif // CAFFE2_TENSOR_INT8_H_

diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
deleted file mode 100644
index 04fa86f..0000000
--- a/caffe2/core/workspace.h
+++ /dev/null

@@ -1,342 +0,0 @@
-#ifndef CAFFE2_CORE_WORKSPACE_H_
-#define CAFFE2_CORE_WORKSPACE_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/observer.h"
-
-#include <climits>
-#include <cstddef>
-#include <mutex>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "c10/util/Registry.h"
-#include "caffe2/core/blob.h"
-#include "caffe2/core/net.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/signal_handler.h"
-#include "caffe2/utils/threadpool/ThreadPool.h"
-
-C10_DECLARE_bool(caffe2_print_blob_sizes_at_exit);
-
-namespace caffe2 {
-
-class NetBase;
-
-struct TORCH_API StopOnSignal {
-  StopOnSignal()
-      : handler_(std::make_shared<SignalHandler>(
-            SignalHandler::Action::STOP,
-            SignalHandler::Action::STOP)) {}
-
-  StopOnSignal(const StopOnSignal& other) : handler_(other.handler_) {}
-
-  bool operator()(int /*iter*/) {
-    return handler_->CheckForSignals() != SignalHandler::Action::STOP;
-  }
-
-  std::shared_ptr<SignalHandler> handler_;
-};
-
-/**
- * Workspace is a class that holds all the related objects created during
- * runtime: (1) all blobs, and (2) all instantiated networks. It is the owner of
- * all these objects and deals with the scaffolding logistics.
- */
-class TORCH_API Workspace {
- public:
-  typedef std::function<bool(int)> ShouldContinue;
-  /**
-   * Initializes an empty workspace.
-   */
-  Workspace() : Workspace(".", nullptr) {}
-
-  /**
-   * Initializes an empty workspace with the given root folder.
-   *
-   * For any operators that are going to interface with the file system, such
-   * as load operators, they will write things under this root folder given
-   * by the workspace.
-   */
-  explicit Workspace(const string& root_folder)
-      : Workspace(root_folder, nullptr) {}
-
-  /**
-   * Initializes a workspace with a shared workspace.
-   *
-   * When we access a Blob, we will first try to access the blob that exists
-   * in the local workspace, and if not, access the blob that exists in the
-   * shared workspace. The caller keeps the ownership of the shared workspace
-   * and is responsible for making sure that its lifetime is longer than the
-   * created workspace.
-   */
-  explicit Workspace(const Workspace* shared) : Workspace(".", shared) {}
-
-  /**
-   * Initializes workspace with parent workspace, blob name remapping
-   * (new name -> parent blob name), no other blobs are inherited from
-   * parent workspace
-   */
-  Workspace(
-      const Workspace* shared,
-      const std::unordered_map<string, string>& forwarded_blobs)
-      : Workspace(".", nullptr) {
-    CAFFE_ENFORCE(shared, "Parent workspace must be specified");
-    for (const auto& forwarded : forwarded_blobs) {
-      CAFFE_ENFORCE(
-          shared->HasBlob(forwarded.second),
-          "Invalid parent workspace blob: ",
-          forwarded.second);
-      forwarded_blobs_[forwarded.first] =
-          std::make_pair(shared, forwarded.second);
-    }
-  }
-
-  /**
-   * Initializes a workspace with a root folder and a shared workspace.
-   */
-  Workspace(const string& root_folder, const Workspace* shared)
-      : root_folder_(root_folder), shared_(shared), bookkeeper_(bookkeeper()) {
-    std::lock_guard<std::mutex> guard(bookkeeper_->wsmutex);
-    bookkeeper_->workspaces.insert(this);
-  }
-
-  ~Workspace() {
-    if (FLAGS_caffe2_print_blob_sizes_at_exit) {
-      PrintBlobSizes();
-    }
-    // This is why we have a bookkeeper_ shared_ptr instead of a naked static! A
-    // naked static makes us vulnerable to out-of-order static destructor bugs.
-    std::lock_guard<std::mutex> guard(bookkeeper_->wsmutex);
-    bookkeeper_->workspaces.erase(this);
-  }
-
-  /**
-   * Adds blob mappings from workspace to the blobs from parent workspace.
-   * Creates blobs under possibly new names that redirect read/write operations
-   * to the blobs in the parent workspace.
-   * Arguments:
-   *  parent - pointer to parent workspace
-   *  forwarded_blobs - map from new blob name to blob name in parent's
-   * workspace skip_defined_blob - if set skips blobs with names that already
-   * exist in the workspace, otherwise throws exception
-   */
-  void AddBlobMapping(
-      const Workspace* parent,
-      const std::unordered_map<string, string>& forwarded_blobs,
-      bool skip_defined_blobs = false);
-
-  /**
-   * Converts previously mapped tensor blobs to local blobs, copies values from
-   * parent workspace blobs into new local blobs. Ignores undefined blobs.
-   */
-  template <class Context>
-  void CopyForwardedTensors(const std::unordered_set<std::string>& blobs) {
-    for (const auto& blob : blobs) {
-      auto it = forwarded_blobs_.find(blob);
-      if (it == forwarded_blobs_.end()) {
-        continue;
-      }
-      const auto& ws_blob = it->second;
-      const auto* parent_ws = ws_blob.first;
-      auto* from_blob = parent_ws->GetBlob(ws_blob.second);
-      CAFFE_ENFORCE(from_blob);
-      CAFFE_ENFORCE(
-          from_blob->template IsType<Tensor>(),
-          "Expected blob with tensor value",
-          ws_blob.second);
-      forwarded_blobs_.erase(blob);
-      auto* to_blob = CreateBlob(blob);
-      CAFFE_ENFORCE(to_blob);
-      const auto& from_tensor = from_blob->template Get<Tensor>();
-      auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType());
-      to_tensor->CopyFrom(from_tensor);
-    }
-  }
-
-  /**
-   * Return list of blobs owned by this Workspace, not including blobs
-   * shared from parent workspace.
-   */
-  vector<string> LocalBlobs() const;
-
-  /**
-   * Return a list of blob names. This may be a bit slow since it will involve
-   * creation of multiple temp variables. For best performance, simply use
-   * HasBlob() and GetBlob().
-   */
-  vector<string> Blobs() const;
-
-  /**
-   * Return the root folder of the workspace.
-   */
-  const string& RootFolder() { return root_folder_; }
-  /**
-   * Checks if a blob with the given name is present in the current workspace.
-   */
-  inline bool HasBlob(const string& name) const {
-    // First, check the local workspace,
-    // Then, check the forwarding map, then the parent workspace
-    if (blob_map_.count(name)) {
-      return true;
-    }
-
-    auto it = forwarded_blobs_.find(name);
-    if (it != forwarded_blobs_.end()) {
-      const auto parent_ws = it->second.first;
-      const auto& parent_name = it->second.second;
-      return parent_ws->HasBlob(parent_name);
-    }
-
-    if (shared_) {
-      return shared_->HasBlob(name);
-    }
-
-    return false;
-  }
-
-  void PrintBlobSizes();
-
-  /**
-   * Creates a blob of the given name. The pointer to the blob is returned, but
-   * the workspace keeps ownership of the pointer. If a blob of the given name
-   * already exists, the creation is skipped and the existing blob is returned.
-   */
-  Blob* CreateBlob(const string& name);
-  /**
-   * Similar to CreateBlob(), but it creates a blob in the local workspace even
-   * if another blob with the same name already exists in the parent workspace
-   * -- in such case the new blob hides the blob in parent workspace. If a blob
-   * of the given name already exists in the local workspace, the creation is
-   * skipped and the existing blob is returned.
-   */
-  Blob* CreateLocalBlob(const string& name);
-  /**
-   * Remove the blob of the given name. Return true if removed and false if
-   * not exist.
-   * Will NOT remove from the shared workspace.
-   */
-  bool RemoveBlob(const string& name);
-  /**
-   * Gets the blob with the given name as a const pointer. If the blob does not
-   * exist, a nullptr is returned.
-   */
-  const Blob* GetBlob(const string& name) const;
-  /**
-   * Gets the blob with the given name as a mutable pointer. If the blob does
-   * not exist, a nullptr is returned.
-   */
-  Blob* GetBlob(const string& name);
-
-  /**
-   * Renames a local workspace blob. If blob is not found in the local blob list
-   * or if the target name is already present in local or any parent blob list
-   * the function will throw.
-   */
-  Blob* RenameBlob(const string& old_name, const string& new_name);
-
-  /**
-   * Creates a network with the given NetDef, and returns the pointer to the
-   * network. If there is anything wrong during the creation of the network, a
-   * nullptr is returned. The Workspace keeps ownership of the pointer.
-   *
-   * If there is already a net created in the workspace with the given name,
-   * CreateNet will overwrite it if overwrite=true is specified. Otherwise, an
-   * exception is thrown.
-   */
-  NetBase* CreateNet(const NetDef& net_def, bool overwrite = false);
-  NetBase* CreateNet(
-      const std::shared_ptr<const NetDef>& net_def,
-      bool overwrite = false);
-  /**
-   * Gets the pointer to a created net. The workspace keeps ownership of the
-   * network.
-   */
-  NetBase* GetNet(const string& net_name);
-  /**
-   * Deletes the instantiated network with the given name.
-   */
-  void DeleteNet(const string& net_name);
-  /**
-   * Finds and runs the instantiated network with the given name. If the network
-   * does not exist or there are errors running the network, the function
-   * returns false.
-   */
-  bool RunNet(const string& net_name);
-
-  /**
-   * Returns a list of names of the currently instantiated networks.
-   */
-  vector<string> Nets() const {
-    vector<string> names;
-    for (auto& entry : net_map_) {
-      names.push_back(entry.first);
-    }
-    return names;
-  }
-
-  /**
-   * Runs a plan that has multiple nets and execution steps.
-   */
-  bool RunPlan(const PlanDef& plan_def,
-               ShouldContinue should_continue = StopOnSignal{});
-
-  /*
-   * Returns a CPU threadpool instance for parallel execution of
-   * work. The threadpool is created lazily; if no operators use it,
-   * then no threadpool will be created.
-   */
-  ThreadPool* GetThreadPool();
-
-  // RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
-  // between RunNet and RunNetOnce lies in the fact that RunNet allows you to
-  // have a persistent net object, while RunNetOnce creates a net and discards
-  // it on the fly - this may make things like database read and random number
-  // generators repeat the same thing over multiple calls.
-  bool RunOperatorOnce(const OperatorDef& op_def);
-  bool RunNetOnce(const NetDef& net_def);
-
-  /**
-   * Applies a function f on each workspace that currently exists.
-   *
-   * This function is thread safe and there is no race condition between
-   * workspaces being passed to f in this thread and destroyed in another.
-   */
-  template <typename F>
-  static void ForEach(F f) {
-    auto bk = bookkeeper();
-    std::lock_guard<std::mutex> guard(bk->wsmutex);
-    for (Workspace* ws : bk->workspaces) {
-      f(ws);
-    }
-  }
-
- public:
-  std::atomic<int> last_failed_op_net_position{};
-
- private:
-  struct Bookkeeper {
-    std::mutex wsmutex;
-    std::unordered_set<Workspace*> workspaces;
-  };
-
-  static std::shared_ptr<Bookkeeper> bookkeeper();
-
-  std::unordered_map<string, unique_ptr<Blob>> blob_map_;
-  const string root_folder_;
-  const Workspace* shared_;
-  std::unordered_map<string, std::pair<const Workspace*, string>>
-      forwarded_blobs_;
-  std::unique_ptr<ThreadPool> thread_pool_;
-  std::mutex thread_pool_creation_mutex_;
-  std::shared_ptr<Bookkeeper> bookkeeper_;
-  std::unordered_map<string, unique_ptr<NetBase>> net_map_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(Workspace);
-};
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_CORE_WORKSPACE_H_

diff --git a/caffe2/utils/GpuAtomics.cuh b/caffe2/utils/GpuAtomics.cuh
deleted file mode 100644
index 2bbcc14..0000000
--- a/caffe2/utils/GpuAtomics.cuh
+++ /dev/null

@@ -1,28 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_ATOMICS_H_
-#define CAFFE2_UTILS_GPU_ATOMICS_H_
-
-#include <cuda_runtime.h>
-
-namespace caffe2 {
-
-namespace {
-
-template <typename T>
-inline __device__ void gpu_atomic_add(T* address, const T val) {
-  atomicAdd(address, val);
-}
-
-template <>
-inline __device__ void gpu_atomic_add(float* address, const float val) {
-#if defined(USE_ROCM) && defined(__gfx908__)
-  atomicAddNoRet(address, val);
-#else
-  atomicAdd(address, val);
-#endif
-}
-
-} // namespace
-
-} // namespace caffe2
-
-#endif  // CAFFE2_UTILS_GPU_ATOMICS_H_

diff --git a/caffe2/utils/GpuBitonicSort.cuh b/caffe2/utils/GpuBitonicSort.cuh
deleted file mode 100644
index 45cb298..0000000
--- a/caffe2/utils/GpuBitonicSort.cuh
+++ /dev/null

@@ -1,178 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_BITONIC_SORT_H_
-#define CAFFE2_UTILS_GPU_BITONIC_SORT_H_
-
-#include "caffe2/utils/math.h"
-#include "caffe2/utils/GpuDefs.cuh"
-
-namespace caffe2 {
-
-// Returns true if the given integer type is a power-of-2 (positive only)
-// Note(jiayq): windows reported an error per
-//     https://github.com/caffe2/caffe2/issues/997
-// and as a result will make it a macro.
-#ifdef _MSC_VER
-#define integerIsPowerOf2(v) ((v) && !((v) & ((v) - 1)))
-#else // _MSC_VER
-template <typename T>
-constexpr bool integerIsPowerOf2(T v) {
-  return (v && !(v & (v - 1)));
-}
-#endif // _MSC_VER
-
-/// The maximum in-block bitonic sort we support
-constexpr int kMaxBitonicSortSize = 4096;
-
-template <typename T>
-__device__ inline void swapVars(T& t1, T& t2) {
-  T tmp = t1;
-  t1 = t2;
-  t2 = tmp;
-}
-
-template <typename Comparator, typename K, typename V>
-__device__ inline void bitonicSwap(K& kA, V& vA,
-                                   K& kB, V& vB,
-                                   bool dir,
-                                   const Comparator& comp) {
-  bool swap = comp(kA, vA, kB, vB);
-  if (swap == dir) {
-    swapVars(kA, kB);
-    swapVars(vA, vB);
-  }
-};
-
-template <typename Comparator, typename K, typename V,
-          int Power2SortSize,
-          int ThreadsPerBlock>
-__device__ inline void bitonicSort(K* keys,
-                                   V* values,
-                                   const Comparator& comp) {
-  static_assert(Power2SortSize <= kMaxBitonicSortSize,
-                "sort size <= 4096 only supported");
-  // Assume the sort is taking place in shared memory
-  // static_assert(Power2SortSize * (sizeof(K) + sizeof(V)) < 32768,
-  //               "sort data too large (>32768 bytes)");
-  static_assert(integerIsPowerOf2(Power2SortSize),
-                "sort size must be power of 2");
-  static_assert(integerIsPowerOf2(ThreadsPerBlock),
-                "threads in block must be power of 2");
-
-  // If what we are sorting is too small, then not all threads
-  // participate
-  constexpr int numThreadsForSort = Power2SortSize / 2;
-  constexpr bool allThreads = numThreadsForSort >= ThreadsPerBlock;
-
-  // If what we are sorting is too large, then threads must loop more
-  // than once
-  constexpr int loopPerThread =
-    allThreads ? numThreadsForSort / ThreadsPerBlock : 1;
-
-#pragma unroll
-  for (int size = 2; size < Power2SortSize; size *= 2) {
-
-#pragma unroll
-    for (int stride = size / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
-      for (int loop = 0; loop < loopPerThread; ++loop) {
-        int threadId = loop * ThreadsPerBlock + threadIdx.x;
-        bool flag = ((threadId & (size / 2)) != 0);
-
-        int pos = 2 * threadId - (threadId & (stride - 1));
-
-        if (allThreads || (threadId < numThreadsForSort)) {
-          bitonicSwap<Comparator, K, V>(
-            keys[pos], values[pos],
-            keys[pos + stride], values[pos + stride],
-            flag, comp);
-        }
-
-        __syncthreads();
-      }
-    }
-  }
-
-#pragma unroll
-  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
-    for (int loop = 0; loop < loopPerThread; ++loop) {
-      int threadId = loop * ThreadsPerBlock + threadIdx.x;
-
-      int pos = 2 * threadId - (threadId & (stride - 1));
-
-      if (allThreads || (threadId < numThreadsForSort)) {
-        bitonicSwap<Comparator, K, V>(
-          keys[pos], values[pos],
-          keys[pos + stride], values[pos + stride],
-          false, comp);
-      }
-
-      __syncthreads();
-    }
-  }
-}
-
-template <typename Comparator, typename K, typename V, int Power2SortSize>
-__device__ inline void warpBitonicSort(K* keys,
-                                       V* values,
-                                       const Comparator& comp) {
-  // Smaller sorts should use a warp shuffle sort
-  static_assert(Power2SortSize > kWarpSize,
-                "sort not large enough");
-  static_assert(integerIsPowerOf2(Power2SortSize),
-                "sort size must be power of 2");
-  static_assert(Power2SortSize <= kMaxBitonicSortSize,
-                "sort size <= 4096 only supported");
-
-  // If what we are sorting is too large, then lanes must loop more
-  // than once
-  constexpr int loopPerThread = (Power2SortSize / 2) / kWarpSize;
-  int laneId = getLaneId();
-
-#pragma unroll
-  for (int size = 2; size < Power2SortSize; size *= 2) {
-
-#pragma unroll
-    for (int stride = size / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
-      for (int loop = 0; loop < loopPerThread; ++loop) {
-        int threadId = loop * kWarpSize + laneId;
-        bool flag = ((threadId & (size / 2)) != 0);
-
-        int pos = 2 * threadId - (threadId & (stride - 1));
-
-        bitonicSwap<Comparator, K, V>(
-          keys[pos], values[pos],
-          keys[pos + stride], values[pos + stride],
-          flag, comp);
-
-        __threadfence_block();
-      }
-    }
-  }
-
-#pragma unroll
-  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
-    for (int loop = 0; loop < loopPerThread; ++loop) {
-      int threadId = loop * kWarpSize + laneId;
-
-      int pos = 2 * threadId - (threadId & (stride - 1));
-
-      bitonicSwap<Comparator, K, V>(
-        keys[pos], values[pos],
-        keys[pos + stride], values[pos + stride],
-        false, comp);
-
-      __threadfence_block();
-    }
-  }
-}
-
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_UTILS_GPU_BITONIC_SORT_H_

diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh
deleted file mode 100644
index fcf2c64..0000000
--- a/caffe2/utils/GpuDefs.cuh
+++ /dev/null

@@ -1,158 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_DEFS_H_
-#define CAFFE2_UTILS_GPU_DEFS_H_
-
-#include <cuda_runtime.h>
-
-namespace caffe2 {
-
-// Static definition of GPU warp size for unrolling and code generation
-
-#if defined(USE_ROCM)
-constexpr int kWarpSize = warpSize;   // = 64 (Defined in hip_runtime.h)
-#else
-constexpr int kWarpSize = 32;
-#endif // __CUDA_ARCH__
-
-//
-// Interfaces to PTX instructions for which there appears to be no
-// intrinsic
-//
-
-template <typename T>
-struct Bitfield {};
-
-template <>
-struct Bitfield<unsigned int> {
-  static __device__ __forceinline__
-  unsigned int getBitfield(unsigned int val, int pos, int len) {
-#if defined(USE_ROCM)
-    pos &= 0xff;
-    len &= 0xff;
-
-    unsigned int m = (1u << len) - 1u;
-    return (val >> pos) & m;
-#else
-    unsigned int ret;
-    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
-    return ret;
-#endif // USE_ROCM
-  }
-
-  static __device__ __forceinline__
-  unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
-#if defined(USE_ROCM)
-    pos &= 0xff;
-    len &= 0xff;
-
-    unsigned int m = (1u << len) - 1u;
-    toInsert &= m;
-    toInsert <<= pos;
-    m <<= pos;
-
-    return (val & ~m) | toInsert;
-#else
-    unsigned int ret;
-    asm("bfi.b32 %0, %1, %2, %3, %4;" :
-        "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
-    return ret;
-#endif // USE_ROCM
-  }
-};
-
-template <>
-struct Bitfield<unsigned long long int> {
-  static __device__ __forceinline__
-  unsigned long long int getBitfield(unsigned long long int val, int pos, int len) {
-#if defined(USE_ROCM)
-    pos &= 0xff;
-    len &= 0xff;
-
-    unsigned long long int m = (1u << len) - 1u;
-    return (val >> pos) & m;
-#else
-    unsigned long long int ret;
-    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
-    return ret;
-#endif // USE_ROCM
-  }
-
-  static __device__ __forceinline__
-  unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
-#if defined(USE_ROCM)
-    pos &= 0xff;
-    len &= 0xff;
-
-    unsigned long long int m = (1u << len) - 1u;
-    toInsert &= m;
-    toInsert <<= pos;
-    m <<= pos;
-
-    return (val & ~m) | toInsert;
-#else
-    unsigned long long int ret;
-    asm("bfi.b64 %0, %1, %2, %3, %4;" :
-        "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
-    return ret;
-#endif // USE_ROCM
-  }
-};
-
-__device__ __forceinline__ int getLaneId() {
-#if defined(USE_ROCM)
-  return __lane_id();
-#else
-  int laneId;
-  asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
-  return laneId;
-#endif // USE_ROCM
-}
-
-#if defined(USE_ROCM)
-__device__ __forceinline__ unsigned long long int getLaneMaskLt() {
-  unsigned long long int m = (1ull << getLaneId()) - 1ull;
-  return m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
-  unsigned long long int m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
-  return m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskGt() {
-  unsigned long long int m = getLaneMaskLe();
-  return m ? ~m : m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskGe() {
-  unsigned long long int m = getLaneMaskLt();
-  return ~m;
-}
-#else
-__device__ __forceinline__ unsigned getLaneMaskLt() {
-  unsigned mask;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
-  return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskLe() {
-  unsigned mask;
-  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
-  return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskGt() {
-  unsigned mask;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
-  return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskGe() {
-  unsigned mask;
-  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
-  return mask;
-}
-#endif // USE_ROCM
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_UTILS_GPU_DEFS_H_

diff --git a/caffe2/utils/GpuScanUtils.cuh b/caffe2/utils/GpuScanUtils.cuh
deleted file mode 100644
index 0f6823d..0000000
--- a/caffe2/utils/GpuScanUtils.cuh
+++ /dev/null

@@ -1,133 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_SCAN_UTILS_H_
-#define CAFFE2_UTILS_GPU_SCAN_UTILS_H_
-
-#include "caffe2/utils/GpuDefs.cuh"
-
-namespace caffe2 {
-
-// from the cutorch library; can probably be replaced with their CUB
-// equivalents
-// Collection of in-kernel scan / prefix sum utilities
-
-// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
-  // FIXME: this is a slow, simple implementation; need up/down sweep,
-  // prevent smem conflicts
-  smem[threadIdx.x] = in;
-
-  __syncthreads();
-
-  for (int offset = 1; offset < blockDim.x; offset *= 2) {
-    T val = 0;
-
-    if (threadIdx.x >= offset) {
-      val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
-    }
-
-    __syncthreads();
-    if (threadIdx.x >= offset) {
-      smem[threadIdx.x] = val;
-    }
-
-    __syncthreads();
-  }
-
-  *out = smem[threadIdx.x];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
-  // FIXME: crappy implementation
-  // We kill write-after-read dependencies separately below, hence the `false`
-  inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
-  *out -= in;
-  *carry = smem[blockDim.x - 1];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-// Inclusive prefix sum for binary vars using intra-warp voting +
-// shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
-  // Within-warp, we use warp voting.
-#if defined(USE_ROCM)
-  unsigned long long int vote = __ballot(in);
-
-  T index = __popcll(getLaneMaskLe() & vote);
-  T carry = __popcll(vote);
-#else
-  T vote = __ballot_sync(__activemask(), in);
-  T index = __popc(getLaneMaskLe() & vote);
-  T carry = __popc(vote);
-#endif  // USE_ROCM
-
-  int warp = threadIdx.x / kWarpSize;
-
-  // Per each warp, write out a value
-  if (getLaneId() == 0) {
-    smem[warp] = carry;
-  }
-
-  __syncthreads();
-
-  // Sum across warps in one thread. This appears to be faster than a
-  // warp shuffle scan for CC 3.0+
-  if (threadIdx.x == 0) {
-    int current = 0;
-    for (int i = 0; i < blockDim.x / kWarpSize; ++i) {
-      T v = smem[i];
-      smem[i] = binop(smem[i], current);
-      current = binop(current, v);
-    }
-  }
-
-  __syncthreads();
-
-  // load the carry from the preceding warp
-  if (warp >= 1) {
-    index = binop(index, smem[warp - 1]);
-  }
-
-  *out = index;
-
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-// Exclusive prefix sum for binary vars using intra-warp voting +
-// shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
-  inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
-  // Inclusive to exclusive
-  *out -= (T) in;
-
-  // The outgoing carry for all threads is the last warp's sum
-#if defined(USE_ROCM)
-  *carry = smem[math::DivUp<int>(blockDim.x, kWarpSize) - 1];
-#else
-  *carry = smem[(blockDim.x / kWarpSize) - 1];
-#endif  // USE_ROCM
-
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_UTILS_GPU_SCAN_UTILS_H_

diff --git a/caffe2/utils/bench_utils.cc b/caffe2/utils/bench_utils.cc
deleted file mode 100644
index baa8d34..0000000
--- a/caffe2/utils/bench_utils.cc
+++ /dev/null

@@ -1,120 +0,0 @@
-#if !defined(__s390x__) && !defined(__powerpc__)
-#include <cpuinfo.h>
-#else
-#include <unistd.h>
-#endif
-// NOLINTNEXTLINE(modernize-deprecated-headers)
-#include <stdint.h>
-// NOLINTNEXTLINE(modernize-deprecated-headers)
-#include <stdlib.h>
-
-#include "caffe2/core/logging.h"
-#include "caffe2/utils/bench_utils.h"
-
-namespace caffe2 {
-
-uint32_t wipe_cache() {
-  static uint32_t* wipe_buffer = nullptr;
-  static size_t wipe_size = 0;
-
-  if (wipe_buffer == nullptr) {
-#if !defined(__s390x__) && !defined(__powerpc__)
-    CAFFE_ENFORCE(cpuinfo_initialize(), "failed to initialize cpuinfo");
-    const cpuinfo_processor* processor = cpuinfo_get_processor(0);
-    if (processor->cache.l4 != nullptr) {
-      wipe_size = processor->cache.l4->size;
-    } else if (processor->cache.l3 != nullptr) {
-      wipe_size = processor->cache.l3->size;
-    } else if (processor->cache.l2 != nullptr) {
-      wipe_size = processor->cache.l2->size;
-    } else {
-      wipe_size = processor->cache.l1d->size;
-    }
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
-    /*
-     * On ARM precise cache size is not available, and cpuinfo may
-     * underestimate. Use max for uArch (see src/arm/cache.c)
-     */
-    switch (processor->core->uarch) {
-      case cpuinfo_uarch_cortex_a5:
-        wipe_size = 512 * 1024; /* Max observed */
-        break;
-      case cpuinfo_uarch_cortex_a7:
-        wipe_size = 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a8:
-        wipe_size = 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a9:
-        wipe_size = 1024 * 1024; /* Max observed */
-        break;
-      case cpuinfo_uarch_cortex_a12:
-      case cpuinfo_uarch_cortex_a17:
-        wipe_size = 8 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a15:
-        wipe_size = 4 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a35:
-        wipe_size = 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a53:
-        wipe_size = 2 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a57:
-        wipe_size = 2 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a72:
-        wipe_size = 4 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a73:
-        wipe_size = 8 * 1024 * 1024; /* uArch max */
-        break;
-      case cpuinfo_uarch_cortex_a55:
-      case cpuinfo_uarch_cortex_a75:
-      case cpuinfo_uarch_meerkat_m3:
-        wipe_size = 4 * 1024 * 1024; /* DynamIQ max */
-        break;
-      default:
-        wipe_size = 60 * 1024 * 1024;
-        break;
-    }
-#endif
-#elif defined (__s390x__)
-    wipe_size = sysconf(_SC_LEVEL4_CACHE_SIZE);
-    if (wipe_size <= 0)
-    {
-      /*
-      * Take current max L4 cache size for s390x
-      */
-      wipe_size = 1024 * 1024 * 1024;
-    }
-#else
-    /* ppc64le */
-    wipe_size = sysconf(_SC_LEVEL4_CACHE_SIZE);
-    if (wipe_size <= 0) {
-      wipe_size = sysconf(_SC_LEVEL3_CACHE_SIZE);
-      if (wipe_size <= 0) {
-        wipe_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-        if(wipe_size <= 0) {
-          wipe_size = sysconf(_SC_LEVEL1D_CACHE_SIZE);
-        }
-      }
-    }
-#endif
-    LOG(INFO) << "Allocating cache wipe buffer of size " << wipe_size;
-    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
-    wipe_buffer = static_cast<uint32_t*>(malloc(wipe_size));
-    CAFFE_ENFORCE(wipe_buffer != nullptr);
-  }
-  uint32_t hash = 0;
-  for (uint32_t i = 0; i * sizeof(uint32_t) < wipe_size; i += 8) {
-    // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign)
-    hash ^= wipe_buffer[i];
-    wipe_buffer[i] = hash;
-  }
-  /* Make sure compiler doesn't optimize the loop away */
-  return hash;
-}
-
-} /* namespace caffe2 */

diff --git a/caffe2/utils/bench_utils.h b/caffe2/utils/bench_utils.h
deleted file mode 100644
index 59997ed..0000000
--- a/caffe2/utils/bench_utils.h
+++ /dev/null

@@ -1,30 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CAFFE2_UTILS_BENCH_UTILS_H_
-#define CAFFE2_UTILS_BENCH_UTILS_H_
-
-#include <stdint.h>
-
-#include <c10/macros/Export.h>
-
-namespace caffe2 {
-
-TORCH_API uint32_t wipe_cache();
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_BENCH_UTILS_H_

diff --git a/caffe2/utils/cast.h b/caffe2/utils/cast.h
deleted file mode 100644
index 6f9db083..0000000
--- a/caffe2/utils/cast.h
+++ /dev/null

@@ -1,49 +0,0 @@
-#pragma once
-
-#include <caffe2/utils/proto_utils.h>
-
-namespace caffe2 {
-
-namespace cast {
-
-inline TensorProto_DataType GetCastDataType(const ArgumentHelper& helper, std::string arg) {
-  TensorProto_DataType to;
-  if (helper.HasSingleArgumentOfType<string>(arg)) {
-    string s = helper.GetSingleArgument<string>(arg, "float");
-    std::transform(s.begin(), s.end(), s.begin(), ::toupper);
-#ifndef CAFFE2_USE_LITE_PROTO
-    CAFFE_ENFORCE(TensorProto_DataType_Parse(s, &to), "Unknown 'to' argument: ", s);
-#else
-
-// Manually implement in the lite proto case.
-#define X(t)                         \
-  if (s == #t) {                     \
-    return TensorProto_DataType_##t; \
-  }
-
-    X(FLOAT);
-    X(INT32);
-    X(BYTE);
-    X(STRING);
-    X(BOOL);
-    X(UINT8);
-    X(INT8);
-    X(UINT16);
-    X(INT16);
-    X(INT64);
-    X(FLOAT16);
-    X(DOUBLE);
-#undef X
-    CAFFE_THROW("Unhandled type argument: ", s);
-
-#endif
-  } else {
-    to = static_cast<TensorProto_DataType>(
-        helper.GetSingleArgument<int>(arg, TensorProto_DataType_FLOAT));
-  }
-  return to;
-}
-
-};  // namespace cast
-
-};  // namespace caffe2

diff --git a/caffe2/utils/cast_test.cc b/caffe2/utils/cast_test.cc
deleted file mode 100644
index 680e87b..0000000
--- a/caffe2/utils/cast_test.cc
+++ /dev/null

@@ -1,39 +0,0 @@
-#include <memory>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include "caffe2/utils/cast.h"
-
-namespace caffe2 {
-
-TEST(CastTest, GetCastDataType) {
-  auto castOp = [](std::string t) {
-    // Ensure lowercase.
-    std::transform(t.begin(), t.end(), t.begin(), ::tolower);
-    auto op = CreateOperatorDef("Cast", "", {}, {});
-    AddArgument("to", t, &op);
-    return op;
-  };
-
-#define X(t)                    \
-  EXPECT_EQ(                    \
-      TensorProto_DataType_##t, \
-      cast::GetCastDataType(ArgumentHelper(castOp(#t)), "to"));
-
-  X(FLOAT);
-  X(INT32);
-  X(BYTE);
-  X(STRING);
-  X(BOOL);
-  X(UINT8);
-  X(INT8);
-  X(UINT16);
-  X(INT16);
-  X(INT64);
-  X(FLOAT16);
-  X(DOUBLE);
-#undef X
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/cblas.h b/caffe2/utils/cblas.h
deleted file mode 100644
index c91b8bf..0000000
--- a/caffe2/utils/cblas.h
+++ /dev/null

@@ -1,606 +0,0 @@
-// This is the exact cblas.h header file, placed here purely in order to get
-// the enums.
-
-#include "caffe2/core/macros.h"
-
-#ifndef CBLAS_H
-#ifdef CAFFE2_USE_MKL
-#include <mkl_cblas.h>
-#else  // CAFFE2_USE_MKL
-
-#ifndef CBLAS_ENUM_DEFINED_H
-   #define CBLAS_ENUM_DEFINED_H
-   enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
-   enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
-                         AtlasConj=114};
-   enum CBLAS_UPLO  {CblasUpper=121, CblasLower=122};
-   enum CBLAS_DIAG  {CblasNonUnit=131, CblasUnit=132};
-   enum CBLAS_SIDE  {CblasLeft=141, CblasRight=142};
-#endif
-
-#ifndef CBLAS_ENUM_ONLY
-#define CBLAS_H
-#define CBLAS_INDEX int
-
-int cblas_errprn(int ierr, int info, char *form, ...);
-void cblas_xerbla(int p, const char *rout, const char *form, ...);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS functions (complex are recast as routines)
- * ===========================================================================
- */
-float  cblas_sdsdot(const int N, const float alpha, const float *X,
-                    const int incX, const float *Y, const int incY);
-double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
-                   const int incY);
-float  cblas_sdot(const int N, const float  *X, const int incX,
-                  const float  *Y, const int incY);
-double cblas_ddot(const int N, const double *X, const int incX,
-                  const double *Y, const int incY);
-/*
- * Functions having prefixes Z and C only
- */
-void   cblas_cdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_cdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-void   cblas_zdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu);
-void   cblas_zdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc);
-
-
-/*
- * Functions having prefixes S D SC DZ
- */
-float  cblas_snrm2(const int N, const float *X, const int incX);
-float  cblas_sasum(const int N, const float *X, const int incX);
-
-double cblas_dnrm2(const int N, const double *X, const int incX);
-double cblas_dasum(const int N, const double *X, const int incX);
-
-float  cblas_scnrm2(const int N, const void *X, const int incX);
-float  cblas_scasum(const int N, const void *X, const int incX);
-
-double cblas_dznrm2(const int N, const void *X, const int incX);
-double cblas_dzasum(const int N, const void *X, const int incX);
-
-
-/*
- * Functions having standard 4 prefixes (S D C Z)
- */
-CBLAS_INDEX cblas_isamax(const int N, const float  *X, const int incX);
-CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
-CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
-CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS routines
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (s, d, c, z)
- */
-void cblas_sswap(const int N, float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_scopy(const int N, const float *X, const int incX,
-                 float *Y, const int incY);
-void cblas_saxpy(const int N, const float alpha, const float *X,
-                 const int incX, float *Y, const int incY);
-void catlas_saxpby(const int N, const float alpha, const float *X,
-                  const int incX, const float beta, float *Y, const int incY);
-void catlas_sset
-   (const int N, const float alpha, float *X, const int incX);
-
-void cblas_dswap(const int N, double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_dcopy(const int N, const double *X, const int incX,
-                 double *Y, const int incY);
-void cblas_daxpy(const int N, const double alpha, const double *X,
-                 const int incX, double *Y, const int incY);
-void catlas_daxpby(const int N, const double alpha, const double *X,
-                  const int incX, const double beta, double *Y, const int incY);
-void catlas_dset
-   (const int N, const double alpha, double *X, const int incX);
-
-void cblas_cswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_ccopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_caxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-void catlas_caxpby(const int N, const void *alpha, const void *X,
-                  const int incX, const void *beta, void *Y, const int incY);
-void catlas_cset
-   (const int N, const void *alpha, void *X, const int incX);
-
-void cblas_zswap(const int N, void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zcopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY);
-void catlas_zaxpby(const int N, const void *alpha, const void *X,
-                  const int incX, const void *beta, void *Y, const int incY);
-void catlas_zset
-   (const int N, const void *alpha, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefix only
- */
-void cblas_srotg(float *a, float *b, float *c, float *s);
-void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
-void cblas_srot(const int N, float *X, const int incX,
-                float *Y, const int incY, const float c, const float s);
-void cblas_srotm(const int N, float *X, const int incX,
-                float *Y, const int incY, const float *P);
-
-void cblas_drotg(double *a, double *b, double *c, double *s);
-void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
-void cblas_drot(const int N, double *X, const int incX,
-                double *Y, const int incY, const double c, const double s);
-void cblas_drotm(const int N, double *X, const int incX,
-                double *Y, const int incY, const double *P);
-
-
-/*
- * Routines with S D C Z CS and ZD prefixes
- */
-void cblas_sscal(const int N, const float alpha, float *X, const int incX);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX);
-void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_csscal(const int N, const float alpha, void *X, const int incX);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
-
-/*
- * Extra reference routines provided by ATLAS, but not mandated by the standard
- */
-void cblas_crotg(void *a, void *b, void *c, void *s);
-void cblas_zrotg(void *a, void *b, void *c, void *s);
-void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
-                 const float c, const float s);
-void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
-                 const double c, const double s);
-
-/*
- * ===========================================================================
- * Prototypes for level 2 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *X, const int incX, const float beta,
-                 float *Y, const int incY);
-void cblas_sgbmv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const float alpha,
-                 const float *A, const int lda, const float *X,
-                 const int incX, const float beta, float *Y, const int incY);
-void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *A, const int lda, float *X,
-                 const int incX);
-void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const float *A, const int lda,
-                 float *X, const int incX);
-void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const float *Ap, float *X, const int incX);
-
-void cblas_dgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *X, const int incX, const double beta,
-                 double *Y, const int incY);
-void cblas_dgbmv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const double alpha,
-                 const double *A, const int lda, const double *X,
-                 const int incX, const double beta, double *Y, const int incY);
-void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *A, const int lda, double *X,
-                 const int incX);
-void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const double *A, const int lda,
-                 double *X, const int incX);
-void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const double *Ap, double *X, const int incX);
-
-void cblas_cgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_cgbmv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-void cblas_zgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY);
-void cblas_zgbmv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const int KL, const int KU, const void *alpha,
-                 const void *A, const int lda, const void *X,
-                 const int incX, const void *beta, void *Y, const int incY);
-void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *A, const int lda, void *X,
-                 const int incX);
-void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const int K, const void *A, const int lda,
-                 void *X, const int incX);
-void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
-                 const int N, const void *Ap, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefixes only
- */
-void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const float alpha, const float *Ap,
-                 const float *X, const int incX,
-                 const float beta, float *Y, const int incY);
-void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
-                const float alpha, const float *X, const int incX,
-                const float *Y, const int incY, float *A, const int lda);
-void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *A, const int lda);
-void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, float *Ap);
-void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A,
-                const int lda);
-void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const float *X,
-                const int incX, const float *Y, const int incY, float *A);
-
-void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const double alpha, const double *A,
-                 const int lda, const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *Ap,
-                 const double *X, const int incX,
-                 const double beta, double *Y, const int incY);
-void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
-                const double alpha, const double *X, const int incX,
-                const double *Y, const int incY, double *A, const int lda);
-void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *A, const int lda);
-void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, double *Ap);
-void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A,
-                const int lda);
-void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const double *X,
-                const int incX, const double *Y, const int incY, double *A);
-
-
-/*
- * Routines with C and Z prefixes only
- */
-void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const float alpha, const void *X,
-                const int incX, void *A);
-void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const int K, const void *alpha, const void *A,
-                 const int lda, const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY);
-void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda);
-void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X, const int incX,
-                void *A, const int lda);
-void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                const int N, const double alpha, const void *X,
-                const int incX, void *A);
-void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *A, const int lda);
-void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
-                const void *alpha, const void *X, const int incX,
-                const void *Y, const int incY, void *Ap);
-
-/*
- * ===========================================================================
- * Prototypes for level 3 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const float alpha, const float *A,
-                 const int lda, const float *B, const int ldb,
-                 const float beta, float *C, const int ldc);
-void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 const float *B, const int ldb, const float beta,
-                 float *C, const int ldc);
-void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const float *A, const int lda,
-                 const float beta, float *C, const int ldc);
-void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const float alpha, const float *A, const int lda,
-                  const float *B, const int ldb, const float beta,
-                  float *C, const int ldc);
-void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const float alpha, const float *A, const int lda,
-                 float *B, const int ldb);
-
-void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const double alpha, const double *A,
-                 const int lda, const double *B, const int ldb,
-                 const double beta, double *C, const int ldc);
-void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *B, const int ldb, const double beta,
-                 double *C, const int ldc);
-void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const double *A, const int lda,
-                 const double beta, double *C, const int ldc);
-void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const double alpha, const double *A, const int lda,
-                  const double *B, const int ldb, const double beta,
-                  double *C, const int ldc);
-void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 double *B, const int ldb);
-
-void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const void *alpha, const void *A, const int lda,
-                 const void *beta, void *C, const int ldc);
-void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const void *beta,
-                  void *C, const int ldc);
-void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_DIAG Diag, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 void *B, const int ldb);
-
-
-/*
- * Routines with prefixes C and Z only
- */
-void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const float alpha, const void *A, const int lda,
-                 const float beta, void *C, const int ldc);
-void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const float beta,
-                  void *C, const int ldc);
-void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
-                 const enum CBLAS_UPLO Uplo, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *B, const int ldb, const void *beta,
-                 void *C, const int ldc);
-void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                 const double alpha, const void *A, const int lda,
-                 const double beta, void *C, const int ldc);
-void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
-                  const void *alpha, const void *A, const int lda,
-                  const void *B, const int ldb, const double beta,
-                  void *C, const int ldc);
-
-int cblas_errprn(int ierr, int info, char *form, ...);
-
-#endif  /* end #ifdef CBLAS_ENUM_ONLY */
-#endif  // CAFFE2_USE_MKL
-#endif

diff --git a/caffe2/utils/cpu_neon.h b/caffe2/utils/cpu_neon.h
deleted file mode 100644
index 7e68d73..0000000
--- a/caffe2/utils/cpu_neon.h
+++ /dev/null

@@ -1,53 +0,0 @@
-#ifndef CAFFE2_UTILS_CPU_NEON_H_
-#define CAFFE2_UTILS_CPU_NEON_H_
-
-// Provides a variety of ARM NEON-specific utility functions
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-
-namespace caffe2 {
-
-template <typename T>
-inline bool isPointerAligned(T* p, size_t align) {
-  return (reinterpret_cast<uintptr_t>(p) % align == 0);
-}
-
-inline float32x4_t vert_sum_f32(float32x4_t v0,
-                                float32x4_t v1,
-                                float32x4_t v2,
-                                float32x4_t v3) {
-  v0 = vaddq_f32(v0, v1);
-  v2 = vaddq_f32(v2, v3);
-  return vaddq_f32(v0, v2);
-}
-
-inline float horizontal_sum_f32(float32x4_t v0,
-                                float32x4_t v1,
-                                float32x4_t v2,
-                                float32x4_t v3) {
-  v0 = vert_sum_f32(v0, v1, v2, v3);
-  float32x2_t v = vadd_f32(vget_high_f32(v0), vget_low_f32(v0));
-  return vget_lane_f32(vpadd_f32(v, v), 0);
-}
-
-// Load/store functions that assume alignment
-
-inline float32x4_t vld1q_f32_aligned(const float* p) {
-  return vld1q_f32((const float*)
-                   __builtin_assume_aligned(p, sizeof(float32x4_t)));
-}
-
-inline void vst1q_f32_aligned(float* p, float32x4_t v) {
-  vst1q_f32((float*) __builtin_assume_aligned(p, sizeof(float32x4_t)), v);
-}
-
-inline void vst4_u8_aligned(uint8_t* p, uint8x8x4_t v) {
-  vst4_u8((uint8_t*)
-          __builtin_assume_aligned(p, sizeof(uint8x8x4_t)), v);
-}
-
-}  // namespace caffe2
-
-#endif //  defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#endif  // CAFFE2_UTILS_CPU_NEON_H_

diff --git a/caffe2/utils/cpuid_test.cc b/caffe2/utils/cpuid_test.cc
deleted file mode 100644
index f3694f5..0000000
--- a/caffe2/utils/cpuid_test.cc
+++ /dev/null

@@ -1,10 +0,0 @@
-#include <gtest/gtest.h>
-#include "caffe2/utils/cpuid.h"
-
-namespace caffe2 {
-
-TEST(CpuIdTest, ShouldAlwaysHaveMMX) {
-  EXPECT_TRUE(GetCpuId().mmx());
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh
deleted file mode 100644
index 188a993..0000000
--- a/caffe2/utils/cub_namespace.cuh
+++ /dev/null

@@ -1,17 +0,0 @@
-#pragma once
-
-// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
-// https://github.com/NVIDIA/cub/pull/326
-// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
-// starting from CUDA 11.5
-#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
-#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
-#else
-#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
-#endif
-
-#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
-namespace caffe2 {
-namespace cub = ::CUB_WRAPPED_NAMESPACE::cub;
-}
-#endif

diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h
deleted file mode 100644
index c6c34db..0000000
--- a/caffe2/utils/eigen_utils.h
+++ /dev/null

@@ -1,205 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#ifndef CAFFE2_OPERATORS_UTILS_EIGEN_H_
-#define CAFFE2_OPERATORS_UTILS_EIGEN_H_
-
-#include "Eigen/Core"
-#include "Eigen/Dense"
-
-#include <c10/util/Logging.h>
-#include <c10/util/irange.h>
-
-namespace caffe2 {
-
-// Common Eigen types that we will often use
-template <typename T>
-using EigenMatrixMap =
-    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenMatrixMap =
-    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenVectorMap =
-    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-using EigenOuterStride = Eigen::OuterStride<Eigen::Dynamic>;
-using EigenInnerStride = Eigen::InnerStride<Eigen::Dynamic>;
-using EigenStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
-template <typename T>
-using EigenOuterStridedMatrixMap = Eigen::
-    Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenOuterStride>;
-template <typename T>
-using EigenOuterStridedArrayMap = Eigen::
-    Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenOuterStride>;
-template <typename T>
-using ConstEigenOuterStridedMatrixMap = Eigen::Map<
-    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>,
-    0,
-    EigenOuterStride>;
-template <typename T>
-using ConstEigenOuterStridedArrayMap = Eigen::Map<
-    const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>,
-    0,
-    EigenOuterStride>;
-template <typename T>
-using EigenStridedMatrixMap = Eigen::
-    Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using EigenStridedArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using ConstEigenStridedMatrixMap = Eigen::
-    Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using ConstEigenStridedArrayMap = Eigen::
-    Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-
-// 1-d array
-template <typename T>
-using EArrXt = Eigen::Array<T, Eigen::Dynamic, 1>;
-using EArrXf = Eigen::ArrayXf;
-using EArrXd = Eigen::ArrayXd;
-using EArrXi = Eigen::ArrayXi;
-using EArrXb = EArrXt<bool>;
-using EArrXI32 = EArrXt<int32_t>;
-using EArrXU16 = EArrXt<uint16_t>;
-using EArrXU8 = EArrXt<uint8_t>;
-using EArr3U8 = Eigen::Array<uint8_t, 3, 1>;
-
-// 2-d array, column major
-template <typename T>
-using EArrXXt = Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>;
-using EArrXXf = Eigen::ArrayXXf;
-using EArrXXI32 = EArrXXt<int32_t>;
-using EArrXXU16 = EArrXXt<uint16_t>;
-using EArrXXU8 = EArrXXt<uint8_t>;
-using EArrXXi = EArrXXt<int>;
-
-// 2-d array, row major
-template <typename T>
-using ERArrXXt =
-    Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-using ERArrXXf = ERArrXXt<float>;
-using ERArrXXI32t = ERArrXXt<int32_t>;
-using ERArrXXU16t = ERArrXXt<uint16_t>;
-using ERArrXXU8t = ERArrXXt<uint8_t>;
-using ERArrXXi = ERArrXXt<int>;
-using ERArrXXi64t = ERArrXXt<int64_t>;
-using ERArrXXi32t = ERArrXXt<int32_t>;
-
-// 1-d vector
-template <typename T>
-using EVecXt = Eigen::Matrix<T, Eigen::Dynamic, 1>;
-using EVecXd = Eigen::VectorXd;
-using EVecXf = Eigen::VectorXf;
-
-// 1-d row vector
-using ERVecXd = Eigen::RowVectorXd;
-using ERVecXf = Eigen::RowVectorXf;
-
-// 2-d matrix, column major
-template <typename T>
-using EMatXt = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
-using EMatXd = Eigen::MatrixXd;
-using EMatXf = Eigen::MatrixXf;
-using EMatXU8 = EMatXt<uint8_t>;
-using EMatXU16 = EMatXt<uint16_t>;
-
-// 2-d matrix, row major
-template <typename T>
-using ERMatXt =
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-using ERMatXd = ERMatXt<double>;
-using ERMatXf = ERMatXt<float>;
-using ERMatXU8 = ERMatXt<uint8_t>;
-
-namespace utils {
-
-template <typename T>
-Eigen::Map<const EArrXt<T>> AsEArrXt(const std::vector<T>& arr) {
-  return {arr.data(), static_cast<int>(arr.size())};
-}
-template <typename T>
-Eigen::Map<EArrXt<T>> AsEArrXt(std::vector<T>& arr) {
-  return {arr.data(), static_cast<int>(arr.size())};
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived, class Derived1, class Derived2>
-void GetSubArray(
-    const Eigen::ArrayBase<Derived>& array,
-    const Eigen::ArrayBase<Derived1>& indices,
-    Eigen::ArrayBase<Derived2>* out_array) {
-  CAFFE_ENFORCE_EQ(array.cols(), 1);
-  // using T = typename Derived::Scalar;
-
-  out_array->derived().resize(indices.size());
-  for (const auto i : c10::irange(indices.size())) {
-    TORCH_DCHECK_LT(indices[i], array.size());
-    (*out_array)[i] = array[indices[i]];
-  }
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived, class Derived1>
-EArrXt<typename Derived::Scalar> GetSubArray(
-    const Eigen::ArrayBase<Derived>& array,
-    const Eigen::ArrayBase<Derived1>& indices) {
-  using T = typename Derived::Scalar;
-  EArrXt<T> ret(indices.size());
-  GetSubArray(array, indices, &ret);
-  return ret;
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived>
-EArrXt<typename Derived::Scalar> GetSubArray(
-    const Eigen::ArrayBase<Derived>& array,
-    const std::vector<int>& indices) {
-  return GetSubArray(array, AsEArrXt(indices));
-}
-
-// return 2d sub array of 'array' based on row indices 'row_indices'
-template <class Derived, class Derived1, class Derived2>
-void GetSubArrayRows(
-    const Eigen::ArrayBase<Derived>& array2d,
-    const Eigen::ArrayBase<Derived1>& row_indices,
-    Eigen::ArrayBase<Derived2>* out_array) {
-  out_array->derived().resize(row_indices.size(), array2d.cols());
-
-  for (const auto i : c10::irange(row_indices.size())) {
-    TORCH_DCHECK_LT(row_indices[i], array2d.size());
-    out_array->row(i) =
-        array2d.row(row_indices[i]).template cast<typename Derived2::Scalar>();
-  }
-}
-
-// return indices of 1d array for elements evaluated to true
-template <class Derived>
-std::vector<int> GetArrayIndices(const Eigen::ArrayBase<Derived>& array) {
-  std::vector<int> ret;
-  for (const auto i : c10::irange(array.size())) {
-    if (array[i]) {
-      ret.push_back(i);
-    }
-  }
-  return ret;
-}
-
-} // namespace utils
-} // namespace caffe2
-
-#endif

diff --git a/caffe2/utils/fatal_signal_asan_no_sig_test.cc b/caffe2/utils/fatal_signal_asan_no_sig_test.cc
deleted file mode 100644
index 9c64102..0000000
--- a/caffe2/utils/fatal_signal_asan_no_sig_test.cc
+++ /dev/null

@@ -1,148 +0,0 @@
-#include "caffe2/utils/signal_handler.h"
-#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
-#include <gtest/gtest.h>
-#include <pthread.h>
-#include <unistd.h>
-
-#include <functional>
-#include <iostream>
-#include <array>
-
-#include "caffe2/core/common.h"
-
-namespace {
-void* dummy_thread(void*) {
-  while (1) {
-  }
-  return nullptr;
-}
-
-bool forkAndPipe(
-    std::string& stderrBuffer,
-    std::function<void(void)> callback) {
-  std::array<int, 2> stderrPipe;
-  if (pipe(stderrPipe.data()) != 0) {
-    perror("STDERR pipe");
-    return false;
-  }
-  pid_t child = fork();
-  if (child == 0) {
-    // Replace this process' stderr so we can read it.
-    if (dup2(stderrPipe[1], STDERR_FILENO) < 0) {
-      close(stderrPipe[0]);
-      close(stderrPipe[1]);
-      perror("dup2 STDERR");
-      exit(5);
-    }
-
-    // This is for the parent to work with.
-    close(stderrPipe[0]);
-    close(stderrPipe[1]);
-
-    callback();
-    exit(7);
-  } else if (child > 0) {
-    const int bufferSize = 128;
-    std::array<char, bufferSize> buffer;
-
-    // We want to close the writing end of the pipe right away so our
-    // read actually gets an EOF.
-    close(stderrPipe[1]);
-
-    // wait for child to finish crashing.
-    int statloc;
-    if (wait(&statloc) < 0) {
-      close(stderrPipe[0]);
-      perror("wait");
-      return false;
-    }
-
-    ssize_t bytesRead;
-    while ((bytesRead = read(stderrPipe[0], buffer.data(), bufferSize)) > 0) {
-      const std::string tmp(buffer.data(), bytesRead);
-      std::cout << tmp;
-      stderrBuffer += tmp;
-    }
-
-    // The child should have exited due to signal.
-    if (!WIFSIGNALED(statloc)) {
-      fprintf(stderr, "Child didn't exit because it received a signal\n");
-      if (WIFEXITED(statloc)) {
-        fprintf(stderr, "Exited with code: %d\n", WEXITSTATUS(statloc) & 0xff);
-      }
-      return false;
-    }
-
-    if (bytesRead < 0) {
-      perror("read");
-      return false;
-    }
-
-    close(stderrPipe[0]);
-    return true;
-  } else {
-    perror("fork");
-    return false;
-  }
-}
-} // namespace
-
-#define _TEST_FATAL_SIGNAL(signum, name, threadCount, print, expected)       \
-  do {                                                                       \
-    std::string stderrBuffer;                                                \
-    ASSERT_TRUE(forkAndPipe(stderrBuffer, [=]() {                            \
-      caffe2::setPrintStackTracesOnFatalSignal(print);                       \
-      pthread_t pt;                                                          \
-      for (int i = 0; i < threadCount; i++) {                                \
-        if (pthread_create(&pt, nullptr, ::dummy_thread, nullptr)) {         \
-          perror("pthread_create");                                          \
-        }                                                                    \
-      }                                                                      \
-      raise(signum);                                                         \
-    }));                                                                     \
-    int keyPhraseCount = 0;                                                  \
-    std::string keyPhrase =                                                  \
-        std::string(name) + "(" + c10::to_string(signum) + ")";              \
-    size_t loc = 0;                                                          \
-    while ((loc = stderrBuffer.find(keyPhrase, loc)) != std::string::npos) { \
-      keyPhraseCount += 1;                                                   \
-      loc += 1;                                                              \
-    }                                                                        \
-    EXPECT_GE(keyPhraseCount, expected);                                     \
-  } while (0)
-
-#define TEST_FATAL_SIGNAL(signum, name, threadCount) \
-  _TEST_FATAL_SIGNAL(signum, name, threadCount, true, threadCount + 1)
-
-#define TEST_FATAL_SIGNAL_NO_PRINT(signum, name, threadCount) \
-  _TEST_FATAL_SIGNAL(signum, name, threadCount, false, 0)
-
-TEST(fatalSignalTest, SIGABRT8) {
-  TEST_FATAL_SIGNAL(SIGABRT, "SIGABRT", 8);
-}
-
-TEST(fatalSignalTest, SIGINT8) {
-  TEST_FATAL_SIGNAL(SIGINT, "SIGINT", 8);
-}
-
-TEST(fatalSignalTest, SIGILL8) {
-  TEST_FATAL_SIGNAL(SIGILL, "SIGILL", 8);
-}
-
-TEST(fatalSignalTest, SIGFPE8) {
-  TEST_FATAL_SIGNAL(SIGFPE, "SIGFPE", 8);
-}
-
-TEST(fatalSignalTest, SIGBUS8) {
-  TEST_FATAL_SIGNAL(SIGBUS, "SIGBUS", 8);
-}
-
-TEST(fatalSignalTest, SIGSEGV8) {
-  TEST_FATAL_SIGNAL(SIGSEGV, "SIGSEGV", 8);
-}
-
-// Test that if we don't enable printing stack traces then we don't get any.
-TEST(fatalSignalTest, SIGABRT8_NOPRINT) {
-  TEST_FATAL_SIGNAL_NO_PRINT(SIGABRT, "SIGABRT", 8);
-}
-#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)

diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h
deleted file mode 100644
index 3d0e399..0000000
--- a/caffe2/utils/filler.h
+++ /dev/null

@@ -1,140 +0,0 @@
-#ifndef CAFFE2_FILLER_H_
-#define CAFFE2_FILLER_H_
-
-#include <sstream>
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/utils/math.h"
-
-namespace caffe2 {
-
-// TODO: replace filler distribution enum with a better abstraction
-enum FillerDistribution { FD_UNIFORM, FD_FIXEDSUM, FD_SYNTHETIC };
-
-class TensorFiller {
- public:
-  template <class Type, class Context>
-  void Fill(Tensor* tensor, Context* context) const {
-    CAFFE_ENFORCE(context, "context is null");
-    CAFFE_ENFORCE(tensor, "tensor is null");
-    auto min = (min_ < (double)std::numeric_limits<Type>::min())
-        ? std::numeric_limits<Type>::min()
-        : static_cast<Type>(min_);
-    auto max = (max_ > (double)std::numeric_limits<Type>::max())
-        ? std::numeric_limits<Type>::max()
-        : static_cast<Type>(max_);
-    CAFFE_ENFORCE_LE(min, max);
-
-    Tensor temp_tensor(shape_, Context::GetDeviceType());
-    std::swap(*tensor, temp_tensor);
-    Type* data = tensor->template mutable_data<Type>();
-
-    // select distribution
-    switch (dist_) {
-      case FD_UNIFORM: {
-        math::RandUniform<Type, Context>(
-            tensor->numel(), min, max, data, context);
-        break;
-      }
-      case FD_FIXEDSUM: {
-        auto fixed_sum = static_cast<Type>(fixed_sum_);
-        CAFFE_ENFORCE_LE(min * tensor->numel(), fixed_sum);
-        CAFFE_ENFORCE_GE(max * tensor->numel(), fixed_sum);
-        math::RandFixedSum<Type, Context>(
-            tensor->numel(), min, max, fixed_sum_, data, context);
-        break;
-      }
-      case FD_SYNTHETIC: {
-        math::RandSyntheticData<Type, Context>(
-            tensor->numel(), min, max, data, context);
-        break;
-      }
-    }
-  }
-
-  TensorFiller& Dist(FillerDistribution dist) {
-    dist_ = dist;
-    return *this;
-  }
-
-  template <class Type>
-  TensorFiller& Min(Type min) {
-    min_ = (double)min;
-    return *this;
-  }
-
-  template <class Type>
-  TensorFiller& Max(Type max) {
-    max_ = (double)max;
-    return *this;
-  }
-
-  template <class Type>
-  TensorFiller& FixedSum(Type fixed_sum) {
-    dist_ = FD_FIXEDSUM;
-    fixed_sum_ = (double)fixed_sum;
-    return *this;
-  }
-
-  // A helper function to construct the lengths vector for sparse features
-  // We try to pad least one index per batch unless the total_length is 0
-  template <class Type>
-  TensorFiller& SparseLengths(Type total_length) {
-    return FixedSum(total_length)
-        .Min(std::min(static_cast<Type>(1), total_length))
-        .Max(total_length);
-  }
-
-  // a helper function to construct the segments vector for sparse features
-  template <class Type>
-  TensorFiller& SparseSegments(Type max_segment) {
-    CAFFE_ENFORCE(dist_ != FD_FIXEDSUM);
-    return Min(0).Max(max_segment).Dist(FD_SYNTHETIC);
-  }
-
-  TensorFiller& Shape(const std::vector<int64_t>& shape) {
-    shape_ = shape;
-    return *this;
-  }
-
-  template <class Type>
-  TensorFiller(const std::vector<int64_t>& shape, Type fixed_sum)
-      : shape_(shape), dist_(FD_FIXEDSUM), fixed_sum_((double)fixed_sum) {}
-
-  TensorFiller(const std::vector<int64_t>& shape)
-      : shape_(shape), dist_(FD_UNIFORM), fixed_sum_(0) {}
-
-  TensorFiller() : TensorFiller(std::vector<int64_t>()) {}
-
-  std::string DebugString() const {
-    std::stringstream stream;
-    stream << "shape = [" << shape_ << "]; min = " << min_
-           << "; max = " << max_;
-    switch (dist_) {
-      case FD_FIXEDSUM:
-        stream << "; dist = FD_FIXEDSUM";
-        break;
-      case FD_SYNTHETIC:
-        stream << "; dist = FD_SYNTHETIC";
-        break;
-      default:
-        stream << "; dist = FD_UNIFORM";
-        break;
-    }
-    return stream.str();
-  }
-
- private:
-  std::vector<int64_t> shape_;
-  // TODO: type is unknown until a user starts to fill data;
-  // cast everything to double for now.
-  double min_ = 0.0;
-  double max_ = 1.0;
-  FillerDistribution dist_;
-  double fixed_sum_;
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_FILLER_H_

diff --git a/caffe2/utils/fixed_divisor_test.cc b/caffe2/utils/fixed_divisor_test.cc
deleted file mode 100644
index 6093bc7..0000000
--- a/caffe2/utils/fixed_divisor_test.cc
+++ /dev/null

@@ -1,80 +0,0 @@
-#include "caffe2/utils/fixed_divisor.h"
-
-#include <gtest/gtest.h>
-
-#include <random>
-
-namespace caffe2 {
-
-namespace {
-
-void CompareDivMod(int32_t v, int32_t divisor) {
-  auto fixed = FixedDivisor<int32_t>(divisor);
-
-  int native_q = v / divisor;
-  int native_r = v % divisor;
-
-  int fixed_q = fixed.Div(v);
-  int fixed_r = fixed.Mod(v);
-
-#if !defined(USE_ROCM)
-  EXPECT_EQ(native_q, fixed_q)
-      << v << " / " << divisor << " magic " << fixed.magic() << " shift "
-      << fixed.shift() << " quot " << fixed_q << " " << native_q;
-
-  EXPECT_EQ(native_r, fixed_r)
-      << v << " / " << divisor << " magic " << fixed.magic() << " shift "
-      << fixed.shift() << " rem " << fixed_r << " " << native_r;
-#endif
-}
-
-} // namespace
-
-TEST(FixedDivisorTest, FixedDivisorInt32Test) {
-  constexpr int32_t kMax = std::numeric_limits<int32_t>::max();
-
-  // divide by 1
-  CompareDivMod(kMax, 1);
-  CompareDivMod(0, 1);
-  CompareDivMod(1, 1);
-
-  // divide by max
-  CompareDivMod(kMax, kMax);
-  CompareDivMod(0, kMax);
-  CompareDivMod(1, kMax);
-
-  // divide by random positive values
-  std::random_device rd;
-  std::uniform_int_distribution<int32_t> v_dist(0, kMax);
-  std::uniform_int_distribution<int32_t> q_dist(1, kMax);
-
-  std::uniform_int_distribution<int32_t> v_small_dist(0, 1000);
-  std::uniform_int_distribution<int32_t> q_small_dist(1, 1000);
-  for (int i = 0; i < 10000; ++i) {
-    auto q = q_dist(rd);
-    auto v = v_dist(rd);
-    auto q_small = q_small_dist(rd);
-    auto v_small = v_small_dist(rd);
-
-    // random value
-    CompareDivMod(v_small, q_small);
-    CompareDivMod(v_small, q);
-    CompareDivMod(v, q_small);
-    CompareDivMod(v, q);
-
-    // special values
-    CompareDivMod(kMax, q_small);
-    CompareDivMod(0, q_small);
-    CompareDivMod(1, q_small);
-    CompareDivMod(kMax, q);
-    CompareDivMod(0, q);
-    CompareDivMod(1, q);
-
-    CompareDivMod(v_small, 1);
-    CompareDivMod(v_small, kMax);
-    CompareDivMod(v, 1);
-    CompareDivMod(v, kMax);
-  }
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/knob_patcher.cc b/caffe2/utils/knob_patcher.cc
deleted file mode 100644
index e099ea6..0000000
--- a/caffe2/utils/knob_patcher.cc
+++ /dev/null

@@ -1,137 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and its affiliates.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <map>
-
-#include <c10/util/string_view.h>
-#include <c10/util/Flags.h>
-#include <c10/util/Logging.h>
-
-#include "caffe2/utils/knobs.h"
-#include "caffe2/utils/knob_patcher.h"
-
-namespace caffe2 {
-namespace detail {
-std::map<c10::string_view, bool*>& getRegisteredKnobs();
-} // namespace detail
-
-namespace {
-class PatchNode {
- public:
-  PatchNode(c10::string_view name, bool value);
-  ~PatchNode();
-
-  std::string name;
-  bool oldValue{false};
-  // Nodes to form a linked list of existing PatchState objects for this knob.
-  // This allows us to restore state correctly even if KnobPatcher objects
-  // are destroyed in any arbitrary order.
-  PatchNode* prev{nullptr};
-  PatchNode* next{nullptr};
-};
-} // namespace
-
-class KnobPatcher::PatchState : public PatchNode {
-  using PatchNode::PatchNode;
-};
-
-KnobPatcher::KnobPatcher(c10::string_view name, bool value)
-  : state_{std::make_unique<PatchState>(name, value)} {}
-
-KnobPatcher::~KnobPatcher() = default;
-KnobPatcher::KnobPatcher(KnobPatcher&&) noexcept = default;
-KnobPatcher& KnobPatcher::operator=(KnobPatcher&&) noexcept = default;
-
-namespace {
-
-class Patcher {
- public:
-  void patch(PatchNode* node, bool value) {
-    std::lock_guard<std::mutex> lock{mutex_};
-
-    node->oldValue = setKnobValue(node->name, value);
-    auto ret = patches_.emplace(node->name, node);
-    if (!ret.second) {
-      // There was already another patcher for this knob
-      // Append the new node to the linked list.
-      node->prev = ret.first->second;
-      CHECK(!node->prev->next);
-      node->prev->next = node;
-      ret.first->second = node;
-    }
-  }
-
-  void unpatch(PatchNode* node) {
-    std::lock_guard<std::mutex> lock{mutex_};
-
-    // Remove this PatchNode from the linked list
-    if (node->prev) {
-      node->prev->next = node->next;
-    }
-    if (node->next) {
-      // There was another patch applied after this one.
-      node->next->prev = node->prev;
-      node->next->oldValue = node->oldValue;
-    } else {
-      // This was the most recently applied patch for this knob,
-      // so restore the knob value.
-      setKnobValue(node->name, node->oldValue);
-
-      // The patches_ map should point to this node.
-      // Update it to point to the previous patch, if there is one.
-      auto iter = patches_.find(node->name);
-      if (iter == patches_.end()) {
-        LOG(FATAL) << "patch node not found when unpatching knob value";
-      }
-      TORCH_CHECK_EQ(iter->second, node);
-      if (node->prev) {
-        iter->second = node->prev;
-      } else {
-        patches_.erase(iter);
-      }
-    }
-  }
-
- private:
-  bool setKnobValue(c10::string_view name, bool value) {
-    auto& knobs = caffe2::detail::getRegisteredKnobs();
-    auto iter = knobs.find(name);
-    if (iter == knobs.end()) {
-      throw std::invalid_argument(
-          "attempted to patch unknown knob \"" + std::string(name) + "\"");
-    }
-    bool oldValue = *(iter->second);
-    *iter->second = value;
-    return oldValue;
-  }
-
-  std::mutex mutex_;
-  std::map<std::string, PatchNode*> patches_;
-};
-
-Patcher& getPatcher() {
-  static Patcher patcher;
-  return patcher;
-}
-
-PatchNode::PatchNode(c10::string_view knobName, bool value)
-    : name{knobName} {
-  getPatcher().patch(this, value);
-}
-
-PatchNode::~PatchNode() {
-  try {
-    getPatcher().unpatch(this);
-  } catch (const std::exception& ex) {
-    // This shouldn't ever happen unless we have a programming bug, but it keeps
-    // clang-tidy happy if we put a catch block here to handle the theoretical
-    // error if unpatch() calls setKnobValue() and it throws due to not finding
-    // the knob by name.
-    LOG(FATAL) << "error removing knob patch: " << ex.what();
-  }
-}
-
-} // namespace
-} // namespace caffe2

diff --git a/caffe2/utils/knob_patcher.h b/caffe2/utils/knob_patcher.h
deleted file mode 100644
index ec2b627..0000000
--- a/caffe2/utils/knob_patcher.h
+++ /dev/null

@@ -1,32 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include <c10/util/string_view.h>
-
-namespace caffe2 {
-
-/**
- * Patch the value of a knob during a unit test.
- *
- * This forces the knob to the specified value for as long as the KnobPatcher
- * object exists.  When the KnobPatcher object is destroyed the knob will revert
- * to its previous value.
- */
-class KnobPatcher {
- public:
-  KnobPatcher(c10::string_view name, bool value);
-  ~KnobPatcher();
-
-  KnobPatcher(KnobPatcher&&) noexcept;
-  KnobPatcher& operator=(KnobPatcher&&) noexcept;
-  KnobPatcher(const KnobPatcher&) = delete;
-  KnobPatcher& operator=(const KnobPatcher&) = delete;
-
- private:
-  class PatchState;
-
-  std::unique_ptr<PatchState> state_;
-};
-
-} // namespace caffe2

diff --git a/caffe2/utils/knobs.cc b/caffe2/utils/knobs.cc
deleted file mode 100644
index 63941a5..0000000
--- a/caffe2/utils/knobs.cc
+++ /dev/null

@@ -1,76 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-// This is a very basic knob implementation that purely uses command line flags.
-// This can be replaced with a more sophisticated implementation for use in
-// other production environments.
-
-#include <map>
-
-#include <c10/util/string_view.h>
-#include <c10/util/Flags.h>
-
-#include "caffe2/utils/knobs.h"
-
-namespace caffe2 {
-
-namespace detail {
-// Get the map of knob names to pointers to their command-line controlled
-// boolean value.
-std::map<c10::string_view, bool*>& getRegisteredKnobs() {
-  // It's safe to store the keys as string_view, since DEFINE_KNOB() ensures
-  // that these views always point to string literals.
-  static std::map<c10::string_view, bool*> registeredKnobs;
-  return registeredKnobs;
-}
-} // namespace detail
-
-bool CheckKnob(c10::string_view name) {
-  const auto& knobs = detail::getRegisteredKnobs();
-  auto iter = knobs.find(name);
-  if (iter == knobs.end()) {
-      throw std::invalid_argument(
-          "attempted to check unknown knob \"" + std::string(name) + "\"");
-  }
-  return *iter->second;
-}
-
-namespace {
-class RegisterKnob {
- public:
-  RegisterKnob(c10::string_view name, bool* cmdlineFlag) {
-    auto ret = caffe2::detail::getRegisteredKnobs().emplace(name, cmdlineFlag);
-    if (!ret.second) {
-      throw std::runtime_error("duplicate knob name: " + std::string(name));
-    }
-  }
-};
-} // namespace
-} // namespace caffe2
-
-/**
- * Define a knob.
- *
- * This will define a --caffe2_knob_<name> command line flag to control the
- * knob.
- *
- * The knob can be checked in code by calling CheckKnob(name)
- * or CheckKnob<check_fn_name>()
- */
-#define DEFINE_KNOB(name, check_fn_name, default_value, docstring) \
-  C10_DEFINE_bool(caffe2_knob_##name, default_value, docstring);   \
-  namespace caffe2 {                                               \
-  bool CheckKnob##check_fn_name() {                                \
-    return FLAGS_caffe2_knob_##name;                               \
-  }                                                                \
-  }                                                                \
-  static caffe2::RegisterKnob _knob_##name(#name, &FLAGS_caffe2_knob_##name)
-
-/*
- * Definitions of well-known knobs.
- */
-
-DEFINE_KNOB(
-    example_knob,
-    ExampleKnob,
-    false,
-    "An example knob, mainly intended for use in unit tests");

diff --git a/caffe2/utils/knobs.h b/caffe2/utils/knobs.h
deleted file mode 100644
index fbebd90..0000000
--- a/caffe2/utils/knobs.h
+++ /dev/null

@@ -1,26 +0,0 @@
-#pragma once
-
-// This file contains functions for checking rollout knobs to enable staged
-// roll out of specific code functionality.
-
-#include <memory>
-
-#include <c10/util/string_view.h>
-
-namespace caffe2 {
-
-/**
- * Check an arbitrary knob by name.
- */
-bool CheckKnob(c10::string_view name);
-
-/*
- * The following are functions for checking specific known knob values.
- *
- * These APIs are more efficient than checking by name.
- */
-
-// An example knob, just for use in unit tests.
-bool CheckKnobExampleKnob();
-
-} // namespace caffe2

diff --git a/caffe2/utils/knobs_test.cc b/caffe2/utils/knobs_test.cc
deleted file mode 100644
index 95f29cf..0000000
--- a/caffe2/utils/knobs_test.cc
+++ /dev/null

@@ -1,34 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "caffe2/utils/knobs.h"
-#include "caffe2/utils/knob_patcher.h"
-
-using namespace caffe2;
-
-TEST(KnobsTest, TestKnob) {
-  auto p = KnobPatcher("example_knob", false);
-  EXPECT_FALSE(CheckKnobExampleKnob());
-  EXPECT_FALSE(CheckKnob("example_knob"));
-
-  p = KnobPatcher("example_knob", true);
-  EXPECT_TRUE(CheckKnobExampleKnob());
-  EXPECT_TRUE(CheckKnob("example_knob"));
-
-  // Test nested patchers
-  {
-    auto p2 = KnobPatcher("example_knob", false);
-    EXPECT_FALSE(CheckKnobExampleKnob());
-    EXPECT_FALSE(CheckKnob("example_knob"));
-
-    auto p3 = KnobPatcher("example_knob", true);
-    EXPECT_TRUE(CheckKnobExampleKnob());
-    EXPECT_TRUE(CheckKnob("example_knob"));
-  }
-  EXPECT_TRUE(CheckKnobExampleKnob());
-  EXPECT_TRUE(CheckKnob("example_knob"));
-}
-
-TEST(KnobsTest, TestUnknownKnob) {
-  // Unknown knob names should throw an exception
-  EXPECT_THROW(CheckKnob("this_knob_does_not_exist"), std::exception);
-}

diff --git a/caffe2/utils/map_utils.h b/caffe2/utils/map_utils.h
deleted file mode 100644
index ef8ff0c..0000000
--- a/caffe2/utils/map_utils.h
+++ /dev/null

@@ -1,19 +0,0 @@
-#pragma once
-
-namespace caffe2 {
-
-// Get value from map given key. Return supplied default value if not found
-// This is a stripped down version from folly:
-// https://github.com/facebook/folly/blob/5a07e203d79324b68d69f294fa38e43b9671e9b1/folly/MapUtil.h#L35-L45
-template <
-    class Map,
-    typename Key = typename Map::key_type,
-    typename Value = typename Map::mapped_type>
-typename Map::mapped_type
-get_default(const Map& map, const Key& key, Value&& dflt) {
-  using M = typename Map::mapped_type;
-  auto pos = map.find(key);
-  return (pos != map.end()) ? (pos->second) : M(std::forward<Value>(dflt));
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/murmur_hash3.cc b/caffe2/utils/murmur_hash3.cc
deleted file mode 100644
index 68cce1f..0000000
--- a/caffe2/utils/murmur_hash3.cc
+++ /dev/null

@@ -1,450 +0,0 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
-
-#include "caffe2/utils/murmur_hash3.h"
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER)
-
-#define FORCE_INLINE __forceinline
-
-#include <stdlib.h>
-
-#define ROTL32(x, y) _rotl(x, y)
-#define ROTL64(x, y) _rotl64(x, y)
-
-#define BIG_CONSTANT(x) (x)
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#define FORCE_INLINE inline __attribute__((__always_inline__))
-
-inline uint32_t rotl32(uint32_t x, int8_t r) {
-  return (x << r) | (x >> (32 - r));
-}
-
-inline uint64_t rotl64(uint64_t x, int8_t r) {
-  return (x << r) | (x >> (64 - r));
-}
-
-#define ROTL32(x, y) rotl32(x, y)
-#define ROTL64(x, y) rotl64(x, y)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-// Block read - if your platform needs to do endian-swapping or can only
-// handle aligned reads, do the conversion here
-
-FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) {
-  return p[i];
-}
-
-FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) {
-  return p[i];
-}
-
-//-----------------------------------------------------------------------------
-// Finalization mix - force all bits of a hash block to avalanche
-
-FORCE_INLINE uint32_t fmix32(uint32_t h) {
-  h ^= h >> 16;
-  h *= 0x85ebca6b;
-  h ^= h >> 13;
-  h *= 0xc2b2ae35;
-  h ^= h >> 16;
-
-  return h;
-}
-
-//----------
-
-FORCE_INLINE uint64_t fmix64(uint64_t k) {
-  k ^= k >> 33;
-  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
-  k ^= k >> 33;
-  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
-  k ^= k >> 33;
-
-  return k;
-}
-
-namespace caffe2 {
-
-void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out) {
-  const uint8_t* data = (const uint8_t*)key;
-  const int nblocks = len / 4;
-
-  uint32_t h1 = seed;
-
-  const uint32_t c1 = 0xcc9e2d51;
-  const uint32_t c2 = 0x1b873593;
-
-  //----------
-  // body
-
-  const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4);
-
-  for (int i = -nblocks; i; i++) {
-    uint32_t k1 = getblock32(blocks, i);
-
-    k1 *= c1;
-    k1 = ROTL32(k1, 15);
-    k1 *= c2;
-
-    h1 ^= k1;
-    h1 = ROTL32(h1, 13);
-    h1 = h1 * 5 + 0xe6546b64;
-  }
-
-  //----------
-  // tail
-
-  const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
-
-  uint32_t k1 = 0;
-
-  switch (len & 3) {
-    case 3:
-      k1 ^= tail[2] << 16;
-      [[fallthrough]];
-    case 2:
-      k1 ^= tail[1] << 8;
-      [[fallthrough]];
-    case 1:
-      k1 ^= tail[0];
-      k1 *= c1;
-      k1 = ROTL32(k1, 15);
-      k1 *= c2;
-      h1 ^= k1;
-  };
-
-  //----------
-  // finalization
-
-  h1 ^= len;
-
-  h1 = fmix32(h1);
-
-  *(uint32_t*)out = h1;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x86_128(
-    const void* key,
-    const int len,
-    uint32_t seed,
-    void* out) {
-  const uint8_t* data = (const uint8_t*)key;
-  const int nblocks = len / 16;
-
-  uint32_t h1 = seed;
-  uint32_t h2 = seed;
-  uint32_t h3 = seed;
-  uint32_t h4 = seed;
-
-  const uint32_t c1 = 0x239b961b;
-  const uint32_t c2 = 0xab0e9789;
-  const uint32_t c3 = 0x38b34ae5;
-  const uint32_t c4 = 0xa1e38b93;
-
-  //----------
-  // body
-
-  const uint32_t* blocks = (const uint32_t*)(data + nblocks * 16);
-
-  for (int i = -nblocks; i; i++) {
-    uint32_t k1 = getblock32(blocks, i * 4 + 0);
-    uint32_t k2 = getblock32(blocks, i * 4 + 1);
-    uint32_t k3 = getblock32(blocks, i * 4 + 2);
-    uint32_t k4 = getblock32(blocks, i * 4 + 3);
-
-    k1 *= c1;
-    k1 = ROTL32(k1, 15);
-    k1 *= c2;
-    h1 ^= k1;
-
-    h1 = ROTL32(h1, 19);
-    h1 += h2;
-    h1 = h1 * 5 + 0x561ccd1b;
-
-    k2 *= c2;
-    k2 = ROTL32(k2, 16);
-    k2 *= c3;
-    h2 ^= k2;
-
-    h2 = ROTL32(h2, 17);
-    h2 += h3;
-    h2 = h2 * 5 + 0x0bcaa747;
-
-    k3 *= c3;
-    k3 = ROTL32(k3, 17);
-    k3 *= c4;
-    h3 ^= k3;
-
-    h3 = ROTL32(h3, 15);
-    h3 += h4;
-    h3 = h3 * 5 + 0x96cd1c35;
-
-    k4 *= c4;
-    k4 = ROTL32(k4, 18);
-    k4 *= c1;
-    h4 ^= k4;
-
-    h4 = ROTL32(h4, 13);
-    h4 += h1;
-    h4 = h4 * 5 + 0x32ac3b17;
-  }
-
-  //----------
-  // tail
-
-  const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
-  uint32_t k1 = 0;
-  uint32_t k2 = 0;
-  uint32_t k3 = 0;
-  uint32_t k4 = 0;
-
-  switch (len & 15) {
-    case 15:
-      k4 ^= tail[14] << 16;
-      [[fallthrough]];
-    case 14:
-      k4 ^= tail[13] << 8;
-      [[fallthrough]];
-    case 13:
-      k4 ^= tail[12] << 0;
-      k4 *= c4;
-      k4 = ROTL32(k4, 18);
-      k4 *= c1;
-      h4 ^= k4;
-      [[fallthrough]];
-
-    case 12:
-      k3 ^= tail[11] << 24;
-      [[fallthrough]];
-    case 11:
-      k3 ^= tail[10] << 16;
-      [[fallthrough]];
-    case 10:
-      k3 ^= tail[9] << 8;
-      [[fallthrough]];
-    case 9:
-      k3 ^= tail[8] << 0;
-      k3 *= c3;
-      k3 = ROTL32(k3, 17);
-      k3 *= c4;
-      h3 ^= k3;
-      [[fallthrough]];
-
-    case 8:
-      k2 ^= tail[7] << 24;
-      [[fallthrough]];
-    case 7:
-      k2 ^= tail[6] << 16;
-      [[fallthrough]];
-    case 6:
-      k2 ^= tail[5] << 8;
-      [[fallthrough]];
-    case 5:
-      k2 ^= tail[4] << 0;
-      k2 *= c2;
-      k2 = ROTL32(k2, 16);
-      k2 *= c3;
-      h2 ^= k2;
-      [[fallthrough]];
-
-    case 4:
-      k1 ^= tail[3] << 24;
-      [[fallthrough]];
-    case 3:
-      k1 ^= tail[2] << 16;
-      [[fallthrough]];
-    case 2:
-      k1 ^= tail[1] << 8;
-      [[fallthrough]];
-    case 1:
-      k1 ^= tail[0] << 0;
-      k1 *= c1;
-      k1 = ROTL32(k1, 15);
-      k1 *= c2;
-      h1 ^= k1;
-  };
-
-  //----------
-  // finalization
-
-  h1 ^= len;
-  h2 ^= len;
-  h3 ^= len;
-  h4 ^= len;
-
-  h1 += h2;
-  h1 += h3;
-  h1 += h4;
-  h2 += h1;
-  h3 += h1;
-  h4 += h1;
-
-  h1 = fmix32(h1);
-  h2 = fmix32(h2);
-  h3 = fmix32(h3);
-  h4 = fmix32(h4);
-
-  h1 += h2;
-  h1 += h3;
-  h1 += h4;
-  h2 += h1;
-  h3 += h1;
-  h4 += h1;
-
-  ((uint32_t*)out)[0] = h1;
-  ((uint32_t*)out)[1] = h2;
-  ((uint32_t*)out)[2] = h3;
-  ((uint32_t*)out)[3] = h4;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x64_128(
-    const void* key,
-    const int len,
-    const uint32_t seed,
-    void* out) {
-  const uint8_t* data = (const uint8_t*)key;
-  const int nblocks = len / 16;
-
-  uint64_t h1 = seed;
-  uint64_t h2 = seed;
-
-  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
-  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
-  //----------
-  // body
-
-  const uint64_t* blocks = (const uint64_t*)(data);
-
-  for (int i = 0; i < nblocks; i++) {
-    uint64_t k1 = getblock64(blocks, i * 2 + 0);
-    uint64_t k2 = getblock64(blocks, i * 2 + 1);
-
-    k1 *= c1;
-    k1 = ROTL64(k1, 31);
-    k1 *= c2;
-    h1 ^= k1;
-
-    h1 = ROTL64(h1, 27);
-    h1 += h2;
-    h1 = h1 * 5 + 0x52dce729;
-
-    k2 *= c2;
-    k2 = ROTL64(k2, 33);
-    k2 *= c1;
-    h2 ^= k2;
-
-    h2 = ROTL64(h2, 31);
-    h2 += h1;
-    h2 = h2 * 5 + 0x38495ab5;
-  }
-
-  //----------
-  // tail
-
-  const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
-  uint64_t k1 = 0;
-  uint64_t k2 = 0;
-
-  switch (len & 15) {
-    case 15:
-      k2 ^= ((uint64_t)tail[14]) << 48;
-      [[fallthrough]];
-    case 14:
-      k2 ^= ((uint64_t)tail[13]) << 40;
-      [[fallthrough]];
-    case 13:
-      k2 ^= ((uint64_t)tail[12]) << 32;
-      [[fallthrough]];
-    case 12:
-      k2 ^= ((uint64_t)tail[11]) << 24;
-      [[fallthrough]];
-    case 11:
-      k2 ^= ((uint64_t)tail[10]) << 16;
-      [[fallthrough]];
-    case 10:
-      k2 ^= ((uint64_t)tail[9]) << 8;
-      [[fallthrough]];
-    case 9:
-      k2 ^= ((uint64_t)tail[8]) << 0;
-      k2 *= c2;
-      k2 = ROTL64(k2, 33);
-      k2 *= c1;
-      h2 ^= k2;
-      [[fallthrough]];
-
-    case 8:
-      k1 ^= ((uint64_t)tail[7]) << 56;
-      [[fallthrough]];
-    case 7:
-      k1 ^= ((uint64_t)tail[6]) << 48;
-      [[fallthrough]];
-    case 6:
-      k1 ^= ((uint64_t)tail[5]) << 40;
-      [[fallthrough]];
-    case 5:
-      k1 ^= ((uint64_t)tail[4]) << 32;
-      [[fallthrough]];
-    case 4:
-      k1 ^= ((uint64_t)tail[3]) << 24;
-      [[fallthrough]];
-    case 3:
-      k1 ^= ((uint64_t)tail[2]) << 16;
-      [[fallthrough]];
-    case 2:
-      k1 ^= ((uint64_t)tail[1]) << 8;
-      [[fallthrough]];
-    case 1:
-      k1 ^= ((uint64_t)tail[0]) << 0;
-      k1 *= c1;
-      k1 = ROTL64(k1, 31);
-      k1 *= c2;
-      h1 ^= k1;
-  };
-
-  //----------
-  // finalization
-
-  h1 ^= len;
-  h2 ^= len;
-
-  h1 += h2;
-  h2 += h1;
-
-  h1 = fmix64(h1);
-  h2 = fmix64(h2);
-
-  h1 += h2;
-  h2 += h1;
-
-  ((uint64_t*)out)[0] = h1;
-  ((uint64_t*)out)[1] = h2;
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/murmur_hash3.h b/caffe2/utils/murmur_hash3.h
deleted file mode 100644
index ea67e71..0000000
--- a/caffe2/utils/murmur_hash3.h
+++ /dev/null

@@ -1,34 +0,0 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-#pragma once
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned __int64 uint64_t;
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#include <stdint.h>
-
-#endif // !defined(_MSC_VER)
-
-namespace caffe2 {
-
-void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out);
-
-void MurmurHash3_x86_128(const void* key, int len, uint32_t seed, void* out);
-
-void MurmurHash3_x64_128(const void* key, int len, uint32_t seed, void* out);
-
-} // namespace caffe2

diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
deleted file mode 100644
index 8fc8158..0000000
--- a/caffe2/utils/proto_utils.cc
+++ /dev/null

@@ -1,715 +0,0 @@
-#include "caffe2/utils/proto_utils.h"
-
-#include <c10/core/DeviceType.h>
-
-#include <fcntl.h>
-#include <cerrno>
-#include <fstream>
-#include <unordered_set>
-
-#if defined(_MSC_VER)
-#include <io.h>
-#else
-#include <unistd.h>
-#endif
-
-#include <google/protobuf/io/coded_stream.h>
-
-#ifndef CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/text_format.h>
-#else
-#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
-#endif // !CAFFE2_USE_LITE_PROTO
-
-#include <c10/util/Logging.h>
-
-using ::google::protobuf::MessageLite;
-
-namespace caffe2 {
-
-C10_EXPORT std::string DeviceTypeName(const int32_t& d) {
-  return at::DeviceTypeName(static_cast<at::DeviceType>(d));
-}
-
-void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream, int bytes_limit, int warning_threshold) {
-  #if GOOGLE_PROTOBUF_VERSION >= 3011000
-    // Only take one parameter since protobuf 3.11
-    stream.SetTotalBytesLimit(bytes_limit);
-  #else
-    stream.SetTotalBytesLimit(bytes_limit, warning_threshold);
-  #endif
-}
-
-C10_EXPORT int DeviceId(const DeviceOption& option) {
-  switch (option.device_type()) {
-    case PROTO_CPU:
-      return option.numa_node_id();
-    case PROTO_CUDA:
-    case PROTO_HIP:
-      return option.device_id();
-    case PROTO_MKLDNN:
-      return option.numa_node_id();
-    default:
-      CAFFE_THROW("Unknown device id for device type: ", option.device_type());
-  }
-}
-
-C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) {
-  return (
-      lhs.device_type() == rhs.device_type() &&
-      lhs.device_id() == rhs.device_id() &&
-      lhs.node_name() == rhs.node_name() &&
-      lhs.numa_node_id() == rhs.numa_node_id());
-}
-
-C10_EXPORT bool IsCPUDeviceType(int device_type) {
-  static const std::unordered_set<int> cpu_types{
-      PROTO_CPU,
-      PROTO_MKLDNN,
-      PROTO_IDEEP,
-  };
-  return cpu_types.count(device_type);
-}
-
-C10_EXPORT bool IsGPUDeviceType(int device_type) {
-  static const std::unordered_set<int> gpu_types{
-      PROTO_CUDA,
-      PROTO_HIP,
-  };
-  return gpu_types.count(device_type);
-}
-
-C10_EXPORT bool ReadStringFromFile(const char* filename, string* str) {
-  std::ifstream ifs(filename, std::ios::in);
-  if (!ifs) {
-    VLOG(1) << "File cannot be opened: " << filename
-            << " error: " << ifs.rdstate();
-    return false;
-  }
-  ifs.seekg(0, std::ios::end);
-  size_t n = ifs.tellg();
-  str->resize(n);
-  ifs.seekg(0);
-  ifs.read(&(*str)[0], n);
-  return true;
-}
-
-C10_EXPORT bool WriteStringToFile(const string& str, const char* filename) {
-  std::ofstream ofs(filename, std::ios::out | std::ios::trunc);
-  if (!ofs.is_open()) {
-    VLOG(1) << "File cannot be created: " << filename
-            << " error: " << ofs.rdstate();
-    return false;
-  }
-  ofs << str;
-  return true;
-}
-
-// IO-specific proto functions: we will deal with the protocol buffer lite and
-// full versions differently.
-
-#ifdef CAFFE2_USE_LITE_PROTO
-
-// Lite runtime.
-
-namespace {
-class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream {
- public:
-  explicit IfstreamInputStream(const string& filename)
-      : ifs_(filename.c_str(), std::ios::in | std::ios::binary) {}
-  ~IfstreamInputStream() {
-    ifs_.close();
-  }
-
-  int Read(void* buffer, int size) {
-    if (!ifs_) {
-      return -1;
-    }
-    ifs_.read(static_cast<char*>(buffer), size);
-    return ifs_.gcount();
-  }
-
- private:
-  std::ifstream ifs_;
-};
-} // namespace
-
-C10_EXPORT string ProtoDebugString(const MessageLite& proto) {
-  string serialized = proto.SerializeAsString();
-  for (char& c : serialized) {
-    if (c < 0x20 || c >= 0x7f) {
-      c = '?';
-    }
-  }
-  return serialized;
-}
-
-C10_EXPORT bool ParseProtoFromLargeString(
-    const string& str,
-    MessageLite* proto) {
-  ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
-  ::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
-  // Set PlanDef message size limit to 2G.
-  setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
-  return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT bool ReadProtoFromBinaryFile(
-    const char* filename,
-    MessageLite* proto) {
-  ::google::protobuf::io::CopyingInputStreamAdaptor stream(
-      new IfstreamInputStream(filename));
-  stream.SetOwnsCopyingStream(true);
-  // Total bytes hard limit / warning limit are set to 2GB and 512MB
-  // respectively.
-  ::google::protobuf::io::CodedInputStream coded_stream(&stream);
-  setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
-  return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT void WriteProtoToBinaryFile(
-    const MessageLite& /*proto*/,
-    const char* /*filename*/) {
-  LOG(FATAL) << "Not implemented yet.";
-}
-
-#else // CAFFE2_USE_LITE_PROTO
-
-// Full protocol buffer.
-
-using ::google::protobuf::Message;
-using ::google::protobuf::io::CodedInputStream;
-using ::google::protobuf::io::CodedOutputStream;
-using ::google::protobuf::io::FileInputStream;
-using ::google::protobuf::io::FileOutputStream;
-using ::google::protobuf::io::ZeroCopyInputStream;
-using ::google::protobuf::io::ZeroCopyOutputStream;
-
-namespace TextFormat {
-C10_EXPORT bool ParseFromString(const string& spec, Message* proto) {
-  string bc_spec = spec;
-
-  {
-    auto num_replaced = c10::ReplaceAll(bc_spec, "cuda_gpu_id", "device_id");
-    if (num_replaced) {
-      LOG(ERROR) << "Your model was serialized in Protobuf TextFormat and "
-                 << "it has " << num_replaced
-                 << " places using the deprecated field name 'cuda_gpu_id'!\n"
-                 << spec
-                 << "\nPlease re-export your model in Protobuf binary format "
-                 << "to make it backward compatible for field renaming.";
-    }
-  }
-
-  return ::google::protobuf::TextFormat::ParseFromString(
-      // NOLINTNEXTLINE(performance-move-const-arg)
-      std::move(bc_spec), proto);
-}
-} // namespace TextFormat
-
-C10_EXPORT string ProtoDebugString(const Message& proto) {
-  return proto.ShortDebugString();
-}
-
-C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) {
-  ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
-  ::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
-  // Set PlanDef message size limit to 2G.
-  setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
-  return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) {
-  int fd = open(filename, O_RDONLY);
-  CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename);
-  FileInputStream* input = new FileInputStream(fd);
-  bool success = google::protobuf::TextFormat::Parse(input, proto);
-  delete input;
-  close(fd);
-  return success;
-}
-
-C10_EXPORT void WriteProtoToTextFile(
-    const Message& proto,
-    const char* filename,
-    bool throwIfError) {
-  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-  FileOutputStream* output = new FileOutputStream(fd);
-  if(!google::protobuf::TextFormat::Print(proto, output)) {
-     if (throwIfError) {
-       CAFFE_THROW("Cannot write proto to text file: ", filename);
-     } else {
-       LOG(ERROR) << "Cannot write proto to text file: " << filename;
-     }
-  }
-  delete output;
-  close(fd);
-}
-
-C10_EXPORT bool ReadProtoFromBinaryFile(
-    const char* filename,
-    MessageLite* proto) {
-#if defined(_MSC_VER) // for MSC compiler binary flag needs to be specified
-  int fd = open(filename, O_RDONLY | O_BINARY);
-#else
-  int fd = open(filename, O_RDONLY);
-#endif
-  CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename);
-  std::unique_ptr<ZeroCopyInputStream> raw_input(new FileInputStream(fd));
-  std::unique_ptr<CodedInputStream> coded_input(
-      new CodedInputStream(raw_input.get()));
-  // A hack to manually allow using very large protocol buffers.
-  #if GOOGLE_PROTOBUF_VERSION >= 3011000
-    // Only take one parameter since protobuf 3.11
-    coded_input->SetTotalBytesLimit(2147483647);
-  #else
-    // Total bytes hard limit / warning limit are set to 2GB and 512MB respectively.
-    coded_input->SetTotalBytesLimit(2147483647, 536870912);
-  #endif
-  bool success = proto->ParseFromCodedStream(coded_input.get());
-  coded_input.reset();
-  raw_input.reset();
-  close(fd);
-  return success;
-}
-
-C10_EXPORT void WriteProtoToBinaryFile(
-    const MessageLite& proto,
-    const char* filename) {
-  int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-  CAFFE_ENFORCE_NE(
-      fd, -1, "File cannot be created: ", filename, " error number: ", errno);
-  std::unique_ptr<ZeroCopyOutputStream> raw_output(new FileOutputStream(fd));
-  std::unique_ptr<CodedOutputStream> coded_output(
-      new CodedOutputStream(raw_output.get()));
-  CAFFE_ENFORCE(proto.SerializeToCodedStream(coded_output.get()));
-  coded_output.reset();
-  raw_output.reset();
-  close(fd);
-}
-
-#endif // CAFFE2_USE_LITE_PROTO
-
-C10_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) {
-  for (auto& arg : def.arg()) {
-    if (arg_map_.count(arg.name())) {
-      if (arg.SerializeAsString() != arg_map_[arg.name()].SerializeAsString()) {
-        // If there are two arguments of the same name but different contents,
-        // we will throw an error.
-        CAFFE_THROW(
-            "Found argument of the same name ",
-            arg.name(),
-            "but with different contents.",
-            ProtoDebugString(def));
-      } else {
-        LOG(WARNING) << "Duplicated argument name [" << arg.name()
-                     << "] found in operator def: " << ProtoDebugString(def);
-      }
-    }
-    arg_map_[arg.name()] = arg;
-  }
-}
-
-C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) {
-  for (auto& arg : netdef.arg()) {
-    CAFFE_ENFORCE(
-        arg_map_.count(arg.name()) == 0,
-        "Duplicated argument name [",
-        arg.name(),
-        "] found in net def: ",
-        ProtoDebugString(netdef));
-    arg_map_[arg.name()] = arg;
-  }
-}
-
-C10_EXPORT bool ArgumentHelper::HasArgument(c10::string_view name) const {
-#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
-  return arg_map_.count(name);
-#else
-  return arg_map_.count(std::string(name));
-#endif
-}
-
-namespace {
-// Helper function to verify that conversion between types won't loose any
-// significant bit.
-template <typename InputType, typename TargetType>
-bool SupportsLosslessConversion(const InputType& value) {
-  return static_cast<InputType>(static_cast<TargetType>(value)) == value;
-}
-} // namespace
-bool operator==(const TensorProto& l, const TensorProto& r) {
-  return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const TensorProto& n) {
-  output << n.SerializeAsString();
-  return output;
-}
-bool operator==(const QTensorProto& l, const QTensorProto& r) {
-  return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const QTensorProto& n) {
-  output << n.SerializeAsString();
-  return output;
-}
-bool operator==(const NetDef& l, const NetDef& r) {
-  return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const NetDef& n) {
-  output << n.SerializeAsString();
-  return output;
-}
-
-#define INSTANTIATE_GET_SINGLE_ARGUMENT(                               \
-    T, fieldname, enforce_lossless_conversion)                         \
-  template <>                                                          \
-  C10_EXPORT T ArgumentHelper::GetSingleArgument<T>(                   \
-      c10::string_view name, const T& default_value) const {           \
-    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                     \
-    if (it == arg_map_.end()) {                                        \
-      VLOG(1) << "Using default parameter value " << default_value     \
-              << " for parameter " << name;                            \
-      return default_value;                                            \
-    }                                                                  \
-    CAFFE_ENFORCE(                                                     \
-        it->second.has_##fieldname(),                                  \
-        "Argument ",                                                   \
-        name,                                                          \
-        " does not have the right field: expected field " #fieldname); \
-    const auto& value = it->second.fieldname();                        \
-    if (enforce_lossless_conversion) {                                 \
-      auto supportsConversion =                                        \
-          SupportsLosslessConversion<decltype(value), T>(value);       \
-      CAFFE_ENFORCE(                                                   \
-          supportsConversion,                                          \
-          "Value",                                                     \
-          value,                                                       \
-          " of argument ",                                             \
-          name,                                                        \
-          "cannot be represented correctly in a target type");         \
-    }                                                                  \
-    return static_cast<T>(value);                                      \
-  }                                                                    \
-  template <>                                                          \
-  C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType<T>(          \
-      c10::string_view name) const {                                   \
-    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                     \
-    if (it == arg_map_.end()) {                                        \
-      return false;                                                    \
-    }                                                                  \
-    return it->second.has_##fieldname();                               \
-  }
-
-INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(double, f, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(bool, i, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int8_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int16_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(uint8_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(uint16_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(size_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false)
-#undef INSTANTIATE_GET_SINGLE_ARGUMENT
-
-#define INSTANTIATE_GET_REPEATED_ARGUMENT(                             \
-    T, fieldname, enforce_lossless_conversion)                         \
-  template <>                                                          \
-  C10_EXPORT std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
-      c10::string_view name, const std::vector<T>& default_value) const { \
-    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);                      \
-    if (it == arg_map_.end()) {                                         \
-      return default_value;                                            \
-    }                                                                  \
-    std::vector<T> values;                                           \
-    for (const auto& v : it->second.fieldname()) {                     \
-      if (enforce_lossless_conversion) {                               \
-        auto supportsConversion =                                      \
-            SupportsLosslessConversion<decltype(v), T>(v);             \
-        CAFFE_ENFORCE(                                                 \
-            supportsConversion,                                        \
-            "Value",                                                   \
-            v,                                                         \
-            " of argument ",                                           \
-            name,                                                      \
-            "cannot be represented correctly in a target type");       \
-      }                                                                \
-      values.push_back(static_cast<T>(v));                             \
-    }                                                                  \
-    return values;                                                     \
-  }
-
-INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(double, floats, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(bool, ints, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int8_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int16_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(uint8_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(uint16_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(size_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(NetDef, nets, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(TensorProto, tensors, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(QTensorProto, qtensors, false)
-#undef INSTANTIATE_GET_REPEATED_ARGUMENT
-
-#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname)                      \
-  template <>                                                            \
-  C10_EXPORT Argument MakeArgument(const string& name, const T& value) { \
-    Argument arg;                                                        \
-    arg.set_name(name);                                                  \
-    arg.set_##fieldname(value);                                          \
-    return arg;                                                          \
-  }
-
-CAFFE2_MAKE_SINGULAR_ARGUMENT(bool, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(float, f)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int16_t, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int64_t, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(string, s)
-#undef CAFFE2_MAKE_SINGULAR_ARGUMENT
-
-template <>
-C10_EXPORT Argument MakeArgument(const string& name, const NetDef& value) {
-  Argument arg;
-  arg.set_name(name);
-  *arg.mutable_n() = value;
-  return arg;
-}
-
-template <>
-C10_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index);
-template <>
-bool ArgumentHelper::RemoveArgument(NetDef& def, int index);
-
-template <>
-C10_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) {
-  Argument arg;
-  arg.set_name(name);
-  arg.set_s(value.SerializeAsString());
-  return arg;
-}
-
-#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \
-  template <>                                       \
-  C10_EXPORT Argument MakeArgument(                 \
-      const string& name, const std::vector<T>& value) { \
-    Argument arg;                                   \
-    arg.set_name(name);                             \
-    for (const auto& v : value) {                   \
-      arg.add_##fieldname(v);                       \
-    }                                               \
-    return arg;                                     \
-  }
-
-CAFFE2_MAKE_REPEATED_ARGUMENT(float, floats)
-CAFFE2_MAKE_REPEATED_ARGUMENT(int, ints)
-CAFFE2_MAKE_REPEATED_ARGUMENT(int64_t, ints)
-CAFFE2_MAKE_REPEATED_ARGUMENT(string, strings)
-#undef CAFFE2_MAKE_REPEATED_ARGUMENT
-
-C10_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) {
-  for (const auto& outp : op.output()) {
-    if (outp == output) {
-      return true;
-    }
-  }
-  return false;
-}
-
-C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) {
-  for (const auto& inp : op.input()) {
-    if (inp == input) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Return the argument index or -1 if it does not exist.
-C10_EXPORT int GetArgumentIndex(
-    const google::protobuf::RepeatedPtrField<Argument>& args,
-    c10::string_view name) {
-  int index = 0;
-  for (const Argument& arg : args) {
-    if (arg.name() == name) {
-      return index;
-    }
-    index++;
-  }
-  return -1;
-}
-
-C10_EXPORT const Argument& GetArgument(
-    const OperatorDef& def,
-    c10::string_view name) {
-  int index = GetArgumentIndex(def.arg(), name);
-  if (index != -1) {
-    return def.arg(index);
-  } else {
-    CAFFE_THROW(
-        "Argument named ",
-        name,
-        " does not exist in operator ",
-        ProtoDebugString(def));
-  }
-}
-
-C10_EXPORT const Argument& GetArgument(const NetDef& def, c10::string_view name) {
-  int index = GetArgumentIndex(def.arg(), name);
-  if (index != -1) {
-    return def.arg(index);
-  } else {
-    CAFFE_THROW(
-        "Argument named ",
-        name,
-        " does not exist in net ",
-        ProtoDebugString(def));
-  }
-}
-
-C10_EXPORT const Argument* GetArgumentPtr(
-    const OperatorDef& def,
-    c10::string_view name) {
-  int index = GetArgumentIndex(def.arg(), name);
-  if (index != -1) {
-    return &def.arg(index);
-  } else {
-    return nullptr;
-  }
-}
-
-C10_EXPORT const Argument* GetArgumentPtr(
-    const NetDef& def,
-    c10::string_view name) {
-  int index = GetArgumentIndex(def.arg(), name);
-  if (index != -1) {
-    return &def.arg(index);
-  } else {
-    return nullptr;
-  }
-}
-
-C10_EXPORT bool GetFlagArgument(
-    const google::protobuf::RepeatedPtrField<Argument>& args,
-    c10::string_view name,
-    bool default_value) {
-  int index = GetArgumentIndex(args, name);
-  if (index != -1) {
-    // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
-    auto arg = args.Get(index);
-    CAFFE_ENFORCE(
-        arg.has_i(), "Can't parse argument as bool: ", ProtoDebugString(arg));
-    return arg.i();
-  }
-  return default_value;
-}
-
-C10_EXPORT bool GetFlagArgument(
-    const OperatorDef& def,
-    c10::string_view name,
-    bool default_value) {
-  return GetFlagArgument(def.arg(), name, default_value);
-}
-
-C10_EXPORT bool
-GetFlagArgument(const NetDef& def, c10::string_view name, bool default_value) {
-  return GetFlagArgument(def.arg(), name, default_value);
-}
-
-template <typename Def>
-Argument* GetMutableArgumentImpl(
-    const string& name,
-    const bool create_if_missing,
-    Def* def) {
-  for (int i = 0; i < def->arg_size(); ++i) {
-    if (def->arg(i).name() == name) {
-      return def->mutable_arg(i);
-    }
-  }
-  // If no argument of the right name is found...
-  if (create_if_missing) {
-    Argument* arg = def->add_arg();
-    arg->set_name(name);
-    return arg;
-  } else {
-    return nullptr;
-  }
-}
-
-C10_EXPORT Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    OperatorDef* def) {
-  return GetMutableArgumentImpl(name, create_if_missing, def);
-}
-
-C10_EXPORT Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    NetDef* def) {
-  return GetMutableArgumentImpl(name, create_if_missing, def);
-}
-
-C10_EXPORT void cleanupExternalInputsAndOutputs(NetDef* net) {
-  std::vector<std::string> oldExternalInputs;
-  for (const auto& input : net->external_input()) {
-    oldExternalInputs.emplace_back(input);
-  }
-  std::vector<std::string> oldExternalOutputs;
-  for (const auto& output : net->external_output()) {
-    oldExternalOutputs.emplace_back(output);
-  }
-
-  net->clear_external_input();
-  net->clear_external_output();
-
-  std::set<std::string> inputSet;
-  for (const auto& input : oldExternalInputs) {
-    if (inputSet.count(input)) {
-      // Prevent duplicate external inputs.
-      continue;
-    }
-    inputSet.insert(input);
-    net->add_external_input(input);
-  }
-
-  // Set of blobs that are external inputs or outputs of some operators.
-  std::set<std::string> allOutputs(inputSet.begin(), inputSet.end());
-  for (const auto& op : net->op()) {
-    for (const auto& input : op.input()) {
-      if (inputSet.count(input) || allOutputs.count(input)) {
-        continue;
-      }
-      // Add missing external inputs.
-      inputSet.insert(input);
-      net->add_external_input(input);
-    }
-    for (const auto& output : op.output()) {
-      allOutputs.insert(output);
-    }
-  }
-
-  std::set<std::string> outputSet;
-  for (const auto& output : oldExternalOutputs) {
-    if (!allOutputs.count(output)) {
-      continue;
-    }
-    if (outputSet.count(output)) {
-      continue;
-    }
-    outputSet.insert(output);
-    net->add_external_output(output);
-  }
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
deleted file mode 100644
index a690342..0000000
--- a/caffe2/utils/proto_utils.h
+++ /dev/null

@@ -1,383 +0,0 @@
-#ifndef CAFFE2_UTILS_PROTO_UTILS_H_
-#define CAFFE2_UTILS_PROTO_UTILS_H_
-
-#ifdef CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/message_lite.h>
-#else // CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/message.h>
-#endif  // !CAFFE2_USE_LITE_PROTO
-
-#include <c10/util/Logging.h>
-#include <c10/util/string_view.h>
-#include <c10/util/irange.h>
-
-#include "caffe2/utils/proto_wrap.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#ifndef C10_ANDROID
-#define CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
-#define CAFFE2_ARG_MAP_FIND(map, key) map.find(key)
-#else
-#define CAFFE2_ARG_MAP_FIND(map, key) map.find(std::string(key))
-#endif
-
-namespace caffe2 {
-
-using std::string;
-using ::google::protobuf::MessageLite;
-
-// A wrapper function to return device name string for use in blob serialization
-// / deserialization. This should have one to one correspondence with
-// caffe2/proto/caffe2.proto: enum DeviceType.
-//
-// Note that we can't use DeviceType_Name, because that is only available in
-// protobuf-full, and some platforms (like mobile) may want to use
-// protobuf-lite instead.
-TORCH_API std::string DeviceTypeName(const int32_t& d);
-
-TORCH_API int DeviceId(const DeviceOption& option);
-
-// Returns if the two DeviceOptions are pointing to the same device.
-TORCH_API bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);
-
-TORCH_API bool IsCPUDeviceType(int device_type);
-TORCH_API bool IsGPUDeviceType(int device_type);
-
-// Common interfaces that reads file contents into a string.
-TORCH_API bool ReadStringFromFile(const char* filename, string* str);
-TORCH_API bool WriteStringToFile(const string& str, const char* filename);
-
-// Common interfaces that are supported by both lite and full protobuf.
-TORCH_API bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
-inline bool ReadProtoFromBinaryFile(const string filename, MessageLite* proto) {
-  return ReadProtoFromBinaryFile(filename.c_str(), proto);
-}
-
-TORCH_API void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
-inline void WriteProtoToBinaryFile(const MessageLite& proto,
-                                   const string& filename) {
-  return WriteProtoToBinaryFile(proto, filename.c_str());
-}
-
-#ifdef CAFFE2_USE_LITE_PROTO
-
-namespace TextFormat {
-inline bool ParseFromString(const string& spec, MessageLite* proto) {
-  LOG(FATAL) << "If you are running lite version, you should not be "
-             << "calling any text-format protobuffers.";
-  return false;
-}
-} // namespace TextFormat
-
-
-TORCH_API string ProtoDebugString(const MessageLite& proto);
-
-TORCH_API bool ParseProtoFromLargeString(const string& str, MessageLite* proto);
-
-// Text format MessageLite wrappers: these functions do nothing but just
-// allowing things to compile. It will produce a runtime error if you are using
-// MessageLite but still want text support.
-inline bool ReadProtoFromTextFile(
-    const char* /*filename*/,
-    MessageLite* /*proto*/) {
-  LOG(FATAL) << "If you are running lite version, you should not be "
-                  << "calling any text-format protobuffers.";
-  return false;  // Just to suppress compiler warning.
-}
-inline bool ReadProtoFromTextFile(const string filename, MessageLite* proto) {
-  return ReadProtoFromTextFile(filename.c_str(), proto);
-}
-
-inline void WriteProtoToTextFile(
-    const MessageLite& /*proto*/,
-    const char* /*filename*/,
-    bool throwIfError = true) {
-  LOG(FATAL) << "If you are running lite version, you should not be "
-                  << "calling any text-format protobuffers.";
-}
-inline void WriteProtoToTextFile(const MessageLite& proto,
-                                 const string& filename,
-                                 bool throwIfError = true) {
-  return WriteProtoToTextFile(proto, filename.c_str(), throwIfError);
-}
-
-inline bool ReadProtoFromFile(const char* filename, MessageLite* proto) {
-  return (ReadProtoFromBinaryFile(filename, proto) ||
-          ReadProtoFromTextFile(filename, proto));
-}
-
-inline bool ReadProtoFromFile(const string& filename, MessageLite* proto) {
-  return ReadProtoFromFile(filename.c_str(), proto);
-}
-
-#else  // CAFFE2_USE_LITE_PROTO
-
-using ::google::protobuf::Message;
-
-namespace TextFormat {
-TORCH_API bool ParseFromString(const string& spec, Message* proto);
-} // namespace TextFormat
-
-TORCH_API string ProtoDebugString(const Message& proto);
-
-TORCH_API bool ParseProtoFromLargeString(const string& str, Message* proto);
-
-TORCH_API bool ReadProtoFromTextFile(const char* filename, Message* proto);
-inline bool ReadProtoFromTextFile(const string filename, Message* proto) {
-  return ReadProtoFromTextFile(filename.c_str(), proto);
-}
-
-TORCH_API void WriteProtoToTextFile(const Message& proto, const char* filename, bool throwIfError = true);
-inline void WriteProtoToTextFile(const Message& proto, const string& filename, bool throwIfError = true) {
-  return WriteProtoToTextFile(proto, filename.c_str(), throwIfError);
-}
-
-// Read Proto from a file, letting the code figure out if it is text or binary.
-inline bool ReadProtoFromFile(const char* filename, Message* proto) {
-  return (ReadProtoFromBinaryFile(filename, proto) ||
-          ReadProtoFromTextFile(filename, proto));
-}
-
-inline bool ReadProtoFromFile(const string& filename, Message* proto) {
-  return ReadProtoFromFile(filename.c_str(), proto);
-}
-
-#endif  // CAFFE2_USE_LITE_PROTO
-
-template <
-    class IterableInputs = std::initializer_list<string>,
-    class IterableOutputs = std::initializer_list<string>,
-    class IterableArgs = std::initializer_list<Argument>>
-OperatorDef CreateOperatorDef(
-    const string& type,
-    const string& name,
-    const IterableInputs& inputs,
-    const IterableOutputs& outputs,
-    const IterableArgs& args,
-    const DeviceOption& device_option = DeviceOption(),
-    const string& engine = "") {
-  OperatorDef def;
-  def.set_type(type);
-  def.set_name(name);
-  for (const string& in : inputs) {
-    def.add_input(in);
-  }
-  for (const string& out : outputs) {
-    def.add_output(out);
-  }
-  for (const Argument& arg : args) {
-    def.add_arg()->CopyFrom(arg);
-  }
-  if (device_option.has_device_type()) {
-    def.mutable_device_option()->CopyFrom(device_option);
-  }
-  if (engine.size()) {
-    def.set_engine(engine);
-  }
-  return def;
-}
-
-// A simplified version compared to the full CreateOperator, if you do not need
-// to specify args.
-template <
-    class IterableInputs = std::initializer_list<string>,
-    class IterableOutputs = std::initializer_list<string>>
-inline OperatorDef CreateOperatorDef(
-    const string& type,
-    const string& name,
-    const IterableInputs& inputs,
-    const IterableOutputs& outputs,
-    const DeviceOption& device_option = DeviceOption(),
-    const string& engine = "") {
-  return CreateOperatorDef(
-      type,
-      name,
-      inputs,
-      outputs,
-      std::vector<Argument>(),
-      device_option,
-      engine);
-}
-
-TORCH_API bool HasOutput(const OperatorDef& op, const std::string& output);
-TORCH_API bool HasInput(const OperatorDef& op, const std::string& input);
-
-/**
- * @brief A helper class to index into arguments.
- *
- * This helper helps us to more easily index into a set of arguments
- * that are present in the operator. To save memory, the argument helper
- * does not copy the operator def, so one would need to make sure that the
- * lifetime of the OperatorDef object outlives that of the ArgumentHelper.
- */
-class C10_EXPORT ArgumentHelper {
- public:
-  template <typename Def>
-  static bool HasArgument(const Def& def, c10::string_view name) {
-    return ArgumentHelper(def).HasArgument(name);
-  }
-
-  template <typename Def, typename T>
-  static T GetSingleArgument(
-      const Def& def,
-      c10::string_view name,
-      const T& default_value) {
-    return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
-  }
-
-  template <typename Def, typename T>
-  static bool HasSingleArgumentOfType(const Def& def, c10::string_view name) {
-    return ArgumentHelper(def).HasSingleArgumentOfType<T>(name);
-  }
-
-  template <typename Def, typename T>
-  static std::vector<T> GetRepeatedArgument(
-      const Def& def,
-      c10::string_view name,
-      const std::vector<T>& default_value = std::vector<T>()) {
-    return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
-  }
-
-  template <typename Def, typename MessageType>
-  static MessageType GetMessageArgument(const Def& def, c10::string_view name) {
-    return ArgumentHelper(def).GetMessageArgument<MessageType>(name);
-  }
-
-  template <typename Def, typename MessageType>
-  static std::vector<MessageType> GetRepeatedMessageArgument(
-      const Def& def,
-      c10::string_view name) {
-    return ArgumentHelper(def).GetRepeatedMessageArgument<MessageType>(name);
-  }
-
-  template <typename Def>
-  static bool RemoveArgument(Def& def, int index) {
-    if (index >= def.arg_size()) {
-      return false;
-    }
-    if (index < def.arg_size() - 1) {
-      def.mutable_arg()->SwapElements(index, def.arg_size() - 1);
-    }
-    def.mutable_arg()->RemoveLast();
-    return true;
-  }
-
-  explicit ArgumentHelper(const OperatorDef& def);
-  explicit ArgumentHelper(const NetDef& netdef);
-  bool HasArgument(c10::string_view name) const;
-
-  template <typename T>
-  T GetSingleArgument(c10::string_view name, const T& default_value) const;
-  template <typename T>
-  bool HasSingleArgumentOfType(c10::string_view name) const;
-  template <typename T>
-  std::vector<T> GetRepeatedArgument(
-      c10::string_view name,
-      const std::vector<T>& default_value = std::vector<T>()) const;
-
-  template <typename MessageType>
-  MessageType GetMessageArgument(c10::string_view name) const {
-    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
-    CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
-    MessageType message;
-    if (it->second.has_s()) {
-      CAFFE_ENFORCE(
-          message.ParseFromString(it->second.s()),
-          "Failed to parse content from the string");
-    } else {
-      VLOG(1) << "Return empty message for parameter " << name;
-    }
-    return message;
-  }
-
-  template <typename MessageType>
-  std::vector<MessageType> GetRepeatedMessageArgument(c10::string_view name) const {
-    auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
-    CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
-    std::vector<MessageType> messages(it->second.strings_size());
-    for (int i = 0; i < messages.size(); ++i) {
-      CAFFE_ENFORCE(
-          messages[i].ParseFromString(it->second.strings(i)),
-          "Failed to parse content from the string");
-    }
-    return messages;
-  }
-
- private:
-  std::map<string, Argument
-#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
-  , std::less<>
-#endif
-  > arg_map_;
-};
-
-// **** Arguments Utils *****
-
-// Helper methods to get an argument from OperatorDef or NetDef given argument
-// name. Throws if argument does not exist.
-TORCH_API const Argument& GetArgument(const OperatorDef& def, c10::string_view name);
-TORCH_API const Argument& GetArgument(const NetDef& def, c10::string_view name);
-// Helper methods to get an argument from OperatorDef or NetDef given argument
-// name. Returns nullptr if argument does not exist.
-TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, c10::string_view name);
-TORCH_API const Argument* GetArgumentPtr(const NetDef& def, c10::string_view name);
-
-// Helper methods to query a boolean argument flag from OperatorDef or NetDef
-// given argument name. If argument does not exist, return default value.
-// Throws if argument exists but the type is not boolean.
-TORCH_API bool GetFlagArgument(
-    const OperatorDef& def,
-    c10::string_view name,
-    bool default_value = false);
-TORCH_API bool GetFlagArgument(
-    const NetDef& def,
-    c10::string_view name,
-    bool default_value = false);
-
-TORCH_API Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    OperatorDef* def);
-TORCH_API Argument* GetMutableArgument(
-    const string& name,
-    const bool create_if_missing,
-    NetDef* def);
-
-template <typename T>
-TORCH_API Argument MakeArgument(const string& name, const T& value);
-
-template <typename T, typename Def>
-inline void AddArgument(const string& name, const T& value, Def* def) {
-  GetMutableArgument(name, true, def)->CopyFrom(MakeArgument(name, value));
-}
-// **** End Arguments Utils *****
-
-bool inline operator==(const DeviceOption& dl, const DeviceOption& dr) {
-  return IsSameDevice(dl, dr);
-}
-
-// Given a net, modify the external inputs/outputs if necessary so that
-// the following conditions are met
-// - No duplicate external inputs
-// - No duplicate external outputs
-// - Going through list of ops in order, all op inputs must be outputs
-// from other ops, or registered as external inputs.
-// - All external outputs must be outputs of some operators.
-TORCH_API void cleanupExternalInputsAndOutputs(NetDef* net);
-
-} // namespace caffe2
-
-namespace std {
-template <>
-struct hash<caffe2::DeviceOption> {
-  typedef caffe2::DeviceOption argument_type;
-  typedef std::size_t result_type;
-  result_type operator()(argument_type const& device_option) const {
-    std::string serialized;
-    CAFFE_ENFORCE(device_option.SerializeToString(&serialized));
-    return std::hash<std::string>{}(serialized);
-  }
-};
-} // namespace std
-
-#endif // CAFFE2_UTILS_PROTO_UTILS_H_

diff --git a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc
deleted file mode 100644
index 1a68769..0000000
--- a/caffe2/utils/proto_utils_test.cc
+++ /dev/null

@@ -1,63 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "caffe2/core/test_utils.h"
-#include "caffe2/utils/proto_utils.h"
-
-namespace caffe2 {
-
-TEST(ProtoUtilsTest, IsSameDevice) {
-  DeviceOption a;
-  DeviceOption b;
-  EXPECT_TRUE(IsSameDevice(a, b));
-  a.set_node_name("my_node");
-  EXPECT_FALSE(IsSameDevice(a, b));
-  b.set_node_name("my_node");
-  EXPECT_TRUE(IsSameDevice(a, b));
-  b.set_device_id(2);
-  EXPECT_FALSE(IsSameDevice(a, b));
-  a.set_device_id(2);
-  EXPECT_TRUE(IsSameDevice(a, b));
-  a.set_device_type(DeviceTypeProto::PROTO_CUDA);
-  b.set_device_type(DeviceTypeProto::PROTO_CPU);
-  EXPECT_FALSE(IsSameDevice(a, b));
-}
-
-TEST(ProtoUtilsTest, SimpleReadWrite) {
-  string content("The quick brown fox jumps over the lazy dog.");
-  string name = std::tmpnam(nullptr);
-  EXPECT_TRUE(WriteStringToFile(content, name.c_str()));
-  string read_back;
-  EXPECT_TRUE(ReadStringFromFile(name.c_str(), &read_back));
-  EXPECT_EQ(content, read_back);
-}
-
-TEST(ProtoUtilsTest, CleanupExternalInputsAndOutputs) {
-  caffe2::NetDef net;
-  caffe2::testing::NetMutator(&net)
-      .newOp("op1", {"X1", "X2"}, {"Y"})
-      .newOp("op2", {"W", "Y"}, {"Z1", "Z2"})
-      .newOp("op3", {"Z2", "W"}, {"O"})
-      .externalInputs({"X1", "X3", "X1", "W"})
-      .externalOutputs({"O", "Z2", "Z3", "O", "X3"});
-  cleanupExternalInputsAndOutputs(&net);
-
-  std::vector<std::string> externalInputs;
-  for (const auto& inputName : net.external_input()) {
-    externalInputs.emplace_back(inputName);
-  }
-  // The 2nd X1 is removed because of duplication.
-  // X2 is added because it should be a missing external input.
-  std::vector<std::string> expectedExternalInputs{"X1", "X3", "W", "X2"};
-  EXPECT_EQ(externalInputs, expectedExternalInputs);
-
-  std::vector<std::string> externalOutputs;
-  for (const auto& outputName : net.external_output()) {
-    externalOutputs.emplace_back(outputName);
-  }
-  // Z3 is removed because it's not an output of any operator in the net.
-  // The 2nd O is removed because of duplication.
-  std::vector<std::string> expectedexternalOutputs{"O", "Z2", "X3"};
-  EXPECT_EQ(externalOutputs, expectedexternalOutputs);
-}
-
-} // namespace caffe2

diff --git a/caffe2/utils/signal_handler.h b/caffe2/utils/signal_handler.h
deleted file mode 100644
index 14d93a0..0000000
--- a/caffe2/utils/signal_handler.h
+++ /dev/null

@@ -1,24 +0,0 @@
-#pragma once
-
-#include <c10/util/signal_handler.h>
-
-namespace caffe2 {
-
-#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
-class TORCH_API C2FatalSignalHandler : public c10::FatalSignalHandler {
- public:
-  void fatalSignalHandlerPostProcess() override;
-  static C2FatalSignalHandler& getInstance();
-
- private:
-  explicit C2FatalSignalHandler();
-};
-
-// This works by setting up certain fatal signal handlers. Previous fatal
-// signal handlers will still be called when the signal is raised. Defaults
-// to being off.
-TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
-TORCH_API bool printStackTracesOnFatalSignal();
-#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLER)
-
-} // namespace caffe2

diff --git a/caffe2/utils/simple_queue.h b/caffe2/utils/simple_queue.h
deleted file mode 100644
index c16f552..0000000
--- a/caffe2/utils/simple_queue.h
+++ /dev/null

@@ -1,79 +0,0 @@
-#ifndef CAFFE2_UTILS_SIMPLE_QUEUE_H_
-#define CAFFE2_UTILS_SIMPLE_QUEUE_H_
-
-#include <condition_variable>  // NOLINT
-#include <mutex>  // NOLINT
-#include <queue>
-
-#include <c10/util/Logging.h>
-
-namespace caffe2 {
-
-// This is a very simple queue that Yangqing wrote when bottlefeeding the baby,
-// so don't take it seriously. What it does is a minimal thread-safe queue that
-// allows me to run network as a DAG.
-//
-// A usual work pattern looks like this: one or multiple producers push jobs
-// into this queue, and one or multiple workers pops jobs from this queue. If
-// nothing is in the queue but NoMoreJobs() is not called yet, the pop calls
-// will wait. If NoMoreJobs() has been called, pop calls will return false,
-// which serves as a message to the workers that they should exit.
-template <typename T>
-class SimpleQueue {
- public:
-  SimpleQueue() : no_more_jobs_(false) {}
-
-  // Pops a value and writes it to the value pointer. If there is nothing in the
-  // queue, this will wait till a value is inserted to the queue. If there are
-  // no more jobs to pop, the function returns false. Otherwise, it returns
-  // true.
-  bool Pop(T* value) {
-    std::unique_lock<std::mutex> mutex_lock(mutex_);
-    while (queue_.size() == 0 && !no_more_jobs_) cv_.wait(mutex_lock);
-    if (queue_.size() == 0 && no_more_jobs_) return false;
-    *value = queue_.front();
-    queue_.pop();
-    return true;
-  }
-
-  int size() {
-    std::unique_lock<std::mutex> mutex_lock(mutex_);
-    return queue_.size();
-  }
-
-  // Push pushes a value to the queue.
-  void Push(const T& value) {
-    {
-      std::lock_guard<std::mutex> mutex_lock(mutex_);
-      CAFFE_ENFORCE(!no_more_jobs_, "Cannot push to a closed queue.");
-      queue_.push(value);
-    }
-    cv_.notify_one();
-  }
-
-  // NoMoreJobs() marks the close of this queue. It also notifies all waiting
-  // Pop() calls so that they either check out remaining jobs, or return false.
-  // After NoMoreJobs() is called, this queue is considered closed - no more
-  // Push() functions are allowed, and once existing items are all checked out
-  // by the Pop() functions, any more Pop() function will immediately return
-  // false with nothing set to the value.
-  void NoMoreJobs() {
-    {
-      std::lock_guard<std::mutex> mutex_lock(mutex_);
-      no_more_jobs_ = true;
-    }
-    cv_.notify_all();
-  }
-
- private:
-  std::mutex mutex_;
-  std::condition_variable cv_;
-  std::queue<T> queue_;
-  bool no_more_jobs_{};
-  // We do not allow copy constructors.
-  SimpleQueue(const SimpleQueue& /*src*/) {}
-};
-
-}  // namespace caffe2
-
-#endif  // CAFFE2_UTILS_SIMPLE_QUEUE_H_

diff --git a/caffe2/utils/simple_queue_test.cc b/caffe2/utils/simple_queue_test.cc
deleted file mode 100644
index e59f699..0000000
--- a/caffe2/utils/simple_queue_test.cc
+++ /dev/null

@@ -1,76 +0,0 @@
-#include <thread>  // NOLINT
-
-#include "caffe2/utils/simple_queue.h"
-#include <gtest/gtest.h>
-
-namespace caffe2 {
-
-static std::unique_ptr<SimpleQueue<int> > gQueue;
-
-static void ConsumerFunction(int thread_idx) {
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int value;
-  while (true) {
-    if (!gQueue->Pop(&value)) return;
-    VLOG(1) << "Emitting " << value << " from thread " << thread_idx;
-  }
-}
-
-static void ProducerFunction(int thread_idx, int start, int count) {
-  for (int i = 0; i < count; ++i) {
-    VLOG(1) << "Pushing " << i + start << " from thread " << thread_idx;
-    gQueue->Push(i + start);
-  }
-}
-
-
-TEST(SimpleQueueTest, SingleProducerSingleConsumer) {
-  // NOLINTNEXTLINE(modernize-make-unique)
-  gQueue.reset(new SimpleQueue<int>());
-  std::thread consumer(ConsumerFunction, 0);
-  for (int i = 0; i < 10; ++i) {
-    gQueue->Push(i);
-  }
-  gQueue->NoMoreJobs();
-  consumer.join();
-}
-
-TEST(SimpleQueueTest, SingleProducerDoubleConsumer) {
-  // NOLINTNEXTLINE(modernize-make-unique)
-  gQueue.reset(new SimpleQueue<int>());
-  std::thread consumer0(ConsumerFunction, 0);
-  std::thread consumer1(ConsumerFunction, 1);
-  for (int i = 0; i < 10; ++i) {
-    gQueue->Push(i);
-  }
-  gQueue->NoMoreJobs();
-  consumer0.join();
-  consumer1.join();
-}
-
-
-TEST(SimpleQueueTest, DoubleProducerDoubleConsumer) {
-  // NOLINTNEXTLINE(modernize-make-unique)
-  gQueue.reset(new SimpleQueue<int>());
-  std::thread producer0(ProducerFunction, 0, 0, 10);
-  std::thread producer1(ProducerFunction, 0, 10, 10);
-  std::thread consumer0(ConsumerFunction, 2);
-  std::thread consumer1(ConsumerFunction, 3);
-  producer0.join();
-  producer1.join();
-  gQueue->NoMoreJobs();
-  consumer0.join();
-  consumer1.join();
-}
-
-TEST(SimpleQueueDeathTest, CannotAddAfterQueueFinished) {
-  // NOLINTNEXTLINE(modernize-make-unique)
-  gQueue.reset(new SimpleQueue<int>());
-  gQueue->Push(0);
-  gQueue->NoMoreJobs();
-  // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto)
-  ASSERT_THROW(gQueue->Push(0), EnforceNotMet);
-}
-
-
-}  // namespace caffe2

diff --git a/caffe2/utils/smart_tensor_printer.h b/caffe2/utils/smart_tensor_printer.h
deleted file mode 100644
index e6d96ef..0000000
--- a/caffe2/utils/smart_tensor_printer.h
+++ /dev/null

@@ -1,50 +0,0 @@
-#pragma once
-
-#include "caffe2/core/tensor.h"
-
-namespace caffe2 {
-
-// This is a wrapper around the TensorPrinter that doesn't require the user to
-// explicit specify the type of the tensor while calling the Print() method.
-// It also supports a convenience function with a default constructed printer as
-// a static method.
-class TORCH_API SmartTensorPrinter {
- public:
-  // The proliferation of constructors is to give the feature parity with
-  // TensorPrinter
-  // yet not repeat the default arguments explicitly in case they change in the
-  // future.
-  SmartTensorPrinter() = default;
-
-  explicit SmartTensorPrinter(const std::string& tensor_name);
-
-  SmartTensorPrinter(
-      const std::string& tensor_name,
-      const std::string& file_name);
-
-  SmartTensorPrinter(
-      const std::string& tensor_name,
-      const std::string& file_name,
-      int limit);
-
-  void Print(const Tensor& tensor);
-
-  void PrintMeta(const Tensor& tensor) {
-    tensorPrinter_.PrintMeta(tensor);
-  }
-
-  // Uses a default constructed SmartTensorPrinter
-  static void PrintTensor(const Tensor& tensor);
-
-  // Uses a default constructed SmartTensorPrinter
-  void PrintTensorMeta(const Tensor& tensor) {
-    DefaultTensorPrinter().PrintMeta(tensor);
-  }
-
- private:
-  // Returns a thread local default constructed TensorPrinter
-  static SmartTensorPrinter& DefaultTensorPrinter();
-
-  TensorPrinter tensorPrinter_;
-};
-}

diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc
deleted file mode 100644
index a455730..0000000
--- a/caffe2/utils/smart_tensor_printer_test.cc
+++ /dev/null

@@ -1,53 +0,0 @@
-#include "caffe2/utils/smart_tensor_printer.h"
-
-#include "caffe2/core/common.h"
-
-#include <gtest/gtest.h>
-
-namespace caffe2 {
-
-template <typename T>
-std::string my_to_string(const T& value) {
-  return to_string(value);
-}
-
-template <>
-std::string my_to_string<std::string>(const std::string& value) {
-  return value;
-}
-
-template <typename T>
-void expect_stderr_contains(const std::vector<T>& values) {
-  std::string captured_stderr = testing::internal::GetCapturedStderr();
-  for (const auto& value : values) {
-    std::string stringValue = my_to_string(value);
-    EXPECT_TRUE(captured_stderr.find(stringValue) != std::string::npos);
-  }
-}
-
-template <typename T>
-void printTensorAndCheck(const std::vector<T>& values) {
-  testing::internal::CaptureStderr();
-
-  Tensor tensor =
-      TensorCPUFromValues<T>({static_cast<int64_t>(values.size())}, values);
-
-  SmartTensorPrinter::PrintTensor(tensor);
-  expect_stderr_contains(values);
-}
-
-// We need real glog for this test to pass
-#ifdef CAFFE2_USE_GOOGLE_GLOG
-
-#if !(__APPLE__) // TODO(janusz): thread_local does not work under mac.
-
-TEST(SmartTensorPrinterTest, SimpleTest) {
-  printTensorAndCheck(std::vector<int>{1, 2, 3, 4, 5});
-  printTensorAndCheck(std::vector<std::string>{"bob", "alice", "facebook"});
-}
-
-#endif // !(__APPLE__)
-
-#endif // CAFFE2_USE_GOOGLE_GLOG
-
-} // namespace caffe2

diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h
deleted file mode 100644
index 05bc22a..0000000
--- a/caffe2/utils/zmq_helper.h
+++ /dev/null

@@ -1,137 +0,0 @@
-#ifndef CAFFE2_UTILS_ZMQ_HELPER_H_
-#define CAFFE2_UTILS_ZMQ_HELPER_H_
-
-#include <zmq.h>
-
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-class ZmqContext {
- public:
-  explicit ZmqContext(int io_threads) : ptr_(zmq_ctx_new()) {
-    CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq context.");
-    int rc = zmq_ctx_set(ptr_, ZMQ_IO_THREADS, io_threads);
-    CAFFE_ENFORCE_EQ(rc, 0);
-    rc = zmq_ctx_set(ptr_, ZMQ_MAX_SOCKETS, ZMQ_MAX_SOCKETS_DFLT);
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-  ~ZmqContext() {
-    int rc = zmq_ctx_destroy(ptr_);
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  void* ptr() { return ptr_; }
-
- private:
-  void* ptr_;
-
-  C10_DISABLE_COPY_AND_ASSIGN(ZmqContext);
-};
-
-class ZmqMessage {
- public:
-  ZmqMessage() {
-    int rc = zmq_msg_init(&msg_);
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  ~ZmqMessage() {
-    int rc = zmq_msg_close(&msg_);
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  zmq_msg_t* msg() { return &msg_; }
-
-  void* data() { return zmq_msg_data(&msg_); }
-  size_t size() { return zmq_msg_size(&msg_); }
-
- private:
-  zmq_msg_t msg_;
-  C10_DISABLE_COPY_AND_ASSIGN(ZmqMessage);
-};
-
-class ZmqSocket {
- public:
-  explicit ZmqSocket(int type)
-      : context_(1), ptr_(zmq_socket(context_.ptr(), type)) {
-    CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq socket.");
-  }
-
-  ~ZmqSocket() {
-    int rc = zmq_close(ptr_);
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  void Bind(const string& addr) {
-    int rc = zmq_bind(ptr_, addr.c_str());
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  void Unbind(const string& addr) {
-    int rc = zmq_unbind(ptr_, addr.c_str());
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  void Connect(const string& addr) {
-    int rc = zmq_connect(ptr_, addr.c_str());
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  void Disconnect(const string& addr) {
-    int rc = zmq_disconnect(ptr_, addr.c_str());
-    CAFFE_ENFORCE_EQ(rc, 0);
-  }
-
-  int Send(const string& msg, int flags) {
-    int nbytes = zmq_send(ptr_, msg.c_str(), msg.size(), flags);
-    if (nbytes) {
-      return nbytes;
-    } else if (zmq_errno() == EAGAIN) {
-      return 0;
-    } else {
-      LOG(FATAL) << "Cannot send zmq message. Error number: "
-                      << zmq_errno();
-      return 0;
-    }
-  }
-
-  int SendTillSuccess(const string& msg, int flags) {
-    CAFFE_ENFORCE(msg.size(), "You cannot send an empty message.");
-    int nbytes = 0;
-    do {
-      nbytes = Send(msg, flags);
-    } while (nbytes == 0);
-    return nbytes;
-  }
-
-  int Recv(ZmqMessage* msg) {
-    int nbytes = zmq_msg_recv(msg->msg(), ptr_, 0);
-    if (nbytes >= 0) {
-      return nbytes;
-    } else if (zmq_errno() == EAGAIN || zmq_errno() == EINTR) {
-      return 0;
-    } else {
-      LOG(FATAL) << "Cannot receive zmq message. Error number: "
-                      << zmq_errno();
-      return 0;
-    }
-  }
-
-  int RecvTillSuccess(ZmqMessage* msg) {
-    int nbytes = 0;
-    do {
-      nbytes = Recv(msg);
-    } while (nbytes == 0);
-    return nbytes;
-  }
-
- private:
-  ZmqContext context_;
-  void* ptr_;
-};
-
-}  // namespace caffe2
-
-
-#endif  // CAFFE2_UTILS_ZMQ_HELPER_H_
commit	a6bae1f6db3bb86c521dd3c2417f42b8f5e8d705	[log] [tgz]
author	cyy <cyyever@outlook.com>	Fri May 31 11:26:24 2024 +0000
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Fri May 31 11:26:27 2024 +0000
tree	1424b9fd43a2f2fe54edf39f4dc2d47be356945d
parent	df0c69f32d269f8cdc136c9c65d791b6b86ef5e3 [diff]