Remove more caffe2 files (#127511)
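Remove more caffe2 files: drop bench_utils.cc, murmur_hash3.cc and proto_utils.cc from the caffe2_utils_srcs filegroup and the caffe2/image/*.h glob in BUILD.bazel, and delete files under caffe2/core, including blob.h, blob_serialization_gpu.cc, common_cudnn.{cc,h}, common_gpu.{cc,h} and context.h.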
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127511
Approved by: https://github.com/r-barnes
diff --git a/BUILD.bazel b/BUILD.bazel
index ecbeaab..7a2c3a5 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -488,10 +488,7 @@
filegroup(
name = "caffe2_utils_srcs",
srcs = [
- "caffe2/utils/bench_utils.cc",
"caffe2/utils/cpuid.cc",
- "caffe2/utils/murmur_hash3.cc",
- "caffe2/utils/proto_utils.cc",
"caffe2/utils/proto_wrap.cc",
"caffe2/utils/string_utils.cc",
"caffe2/utils/threadpool/ThreadPool.cc",
@@ -544,7 +541,6 @@
],
) + if_cuda(glob([
"caffe2/**/*.cuh",
- "caffe2/image/*.h",
])),
copts = CAFFE2_COPTS,
visibility = ["//visibility:public"],
diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h
deleted file mode 100644
index 5823280..0000000
--- a/caffe2/core/blob.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef CAFFE2_CORE_BLOB_H_
-#define CAFFE2_CORE_BLOB_H_
-
-#include <cstddef>
-#include <sstream>
-#include <typeinfo>
-#include <type_traits>
-#include <vector>
-#include "caffe2/core/common.h"
-
-#include <ATen/core/blob.h>
-#include <c10/util/typeid.h>
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/tensor_int8.h"
-
-namespace caffe2 {
-
-inline bool BlobIsInt8TensorCPUType(const Blob& blob) {
- return blob.meta().Match<int8::Int8TensorCPU>();
-}
-
-inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) {
- bool is_match = blob.meta().Match<Tensor>();
- if (!is_match) {
- return false;
- }
- const Tensor* tensor = &blob.Get<Tensor>();
- return tensor && *tensor && tensor->GetDeviceType() == device_type;
-}
-
-inline Tensor* BlobSetTensor(Blob* blob, Tensor&& tensor) {
- return blob->Reset<Tensor>(new Tensor(std::move(tensor)));
-}
-
-inline Tensor GetSizedTensorWithOptions(
- Tensor&& previous_tensor,
- at::IntArrayRef dims,
- at::TensorOptions options) {
- Tensor tensor = std::move(previous_tensor);
- if (!tensor.defined()) {
- return caffe2::empty(dims, options);
- }
- if (tensor.GetDevice() == options.device() ||
- (!tensor.GetDevice().has_index() &&
- tensor.GetDeviceType() == options.device().type())) {
- if (tensor.sizes() != dims) {
- // Resize when the dims doesn't match
- tensor.Resize(dims);
- }
- if (tensor.dtype() == options.dtype()) {
- tensor.raw_mutable_data();
- } else {
- // create a new Tensor when the data_type doesn't match
- return caffe2::empty(dims, options);
- }
- return tensor;
- }
- return caffe2::empty(dims, options);
-}
-
-// need to keep both functions that returns Tensor* and the one
-// returns Tensor for clangr codemod
-inline Tensor*
-BlobGetMutableTensor(Blob* blob, at::IntArrayRef dims, at::TensorOptions options) {
- if (blob->IsType<Tensor>()) {
- Tensor* tensor = blob->GetMutable<Tensor>();
- if (*tensor) {
- // We only compare device_type if the index is not set since there are Tensors
- // TODO: remove the extra check when all the Tensors are properly initialized
- const auto tensorDevice = tensor->GetDevice();
- if (tensorDevice == options.device() || (!tensorDevice.has_index() && tensor->GetDeviceType() == options.device().type())) {
- if (tensor->sizes() != dims) {
- // Resize when the dims doesn't match
- tensor->Resize(dims);
- }
- tensor->raw_mutable_data(options.dtype());
- return tensor;
- }
- // create a new Tensor when device doesn't match
- }
- }
-
- VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
- << " dims: " << dims;
- // << " options: " << options; (operator<< for Options is in at:: now)
- return BlobSetTensor(blob, caffe2::empty(dims, options));
-}
-
-inline Tensor
-XBlobGetMutableTensor(Blob* blob, at::IntArrayRef dims, at::TensorOptions options) {
- return BlobGetMutableTensor(blob, dims, options)->UnsafeSharedInstance();
-}
-
-inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) {
- if (blob->IsType<Tensor>()) {
- Tensor* tensor = blob->GetMutable<Tensor>();
- if (*tensor && tensor->GetDeviceType() == device_type) {
- return tensor;
- }
- }
-
- // if we're here, then either Blob didn't hold a Tensor
- // or that Tensor had the wrong DeviceType.
- VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<Tensor>()
- << " DeviceType:" << device_type;
-
- return BlobSetTensor(blob, Tensor(device_type));
-}
-
-inline const Tensor& BlobGetTensor(const Blob& blob, DeviceType device_type) {
- if (blob.IsType<Tensor>()) {
- const auto& tensor = blob.Get<Tensor>();
- if (tensor.GetDeviceType() == device_type) {
- return tensor;
- }
- }
- CAFFE_THROW("Blob didn't contain a Tensor or the device_type doesn't match");
-}
-
-inline Tensor BlobGetTensorOrUndefined(const Blob& blob) {
- if (blob.IsType<Tensor>()) {
- return blob.Get<Tensor>().UnsafeSharedInstance();
- } else {
- return Tensor();
- }
-}
-
-} // namespace caffe2
-#endif // CAFFE2_CORE_BLOB_H_
diff --git a/caffe2/core/blob_serialization_gpu.cc b/caffe2/core/blob_serialization_gpu.cc
deleted file mode 100644
index 4d67535..0000000
--- a/caffe2/core/blob_serialization_gpu.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "caffe2/core/blob.h"
-#include "caffe2/core/blob_serialization.h"
-#include "caffe2/core/context_gpu.h"
-
-namespace caffe2 {
-
-namespace {
-REGISTER_BLOB_DESERIALIZER(TensorCUDA, TensorDeserializer);
-}
-} // namespace caffe2
diff --git a/caffe2/core/common_cudnn.cc b/caffe2/core/common_cudnn.cc
deleted file mode 100644
index f818654..0000000
--- a/caffe2/core/common_cudnn.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "caffe2/core/common_cudnn.h"
-#include "caffe2/core/cudnn_wrappers.h"
-
-#include "caffe2/core/init.h"
-
-namespace caffe2 {
-
-CuDNNWrapper::PerGPUCuDNNStates& CuDNNWrapper::cudnn_states() {
- // New it (never delete) to avoid calling the destructors on process
- // exit and racing against the CUDA shutdown sequence.
- static auto* p = new CuDNNWrapper::PerGPUCuDNNStates();
- TORCH_CHECK_NOTNULL(p);
- return *p;
-}
-
-namespace {
-bool PrintCuDNNInfo(int*, char***) {
- VLOG(1) << "Caffe2 is built with CuDNN version " << CUDNN_VERSION;
- return true;
-}
-
-REGISTER_CAFFE2_INIT_FUNCTION(PrintCuDNNInfo, &PrintCuDNNInfo,
- "Print CuDNN Info.");
-
-} // namespace
-} // namespace caffe2
diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h
deleted file mode 100644
index b130103..0000000
--- a/caffe2/core/common_cudnn.h
+++ /dev/null
@@ -1,314 +0,0 @@
-#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
-#define CAFFE2_CORE_COMMON_CUDNN_H_
-
-#include <array>
-#include <mutex>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/types.h"
-
-#ifndef CAFFE2_USE_CUDNN
-#error("This Caffe2 install is not built with cudnn, so you should not include this file.");
-#endif
-
-#include <cudnn.h>
-
-static_assert(
- CUDNN_VERSION >= 8200,
- "Caffe2 requires cudnn version 8.2 or above.");
-
-#define CUDNN_VERSION_MIN(major, minor, patch) \
- (major >= 9 ? CUDNN_VERSION >= ((major) * 10000 + (minor) * 100 + (patch)) : \
- CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
-
-namespace caffe2 {
-
-namespace internal {
-/**
- * A helper function to obtain cudnn error strings.
- */
-inline const char* cudnnGetErrorString(cudnnStatus_t status) {
- switch (status) {
- case CUDNN_STATUS_SUCCESS:
- return "CUDNN_STATUS_SUCCESS";
- case CUDNN_STATUS_NOT_INITIALIZED:
- return "CUDNN_STATUS_NOT_INITIALIZED";
- case CUDNN_STATUS_ALLOC_FAILED:
- return "CUDNN_STATUS_ALLOC_FAILED";
- case CUDNN_STATUS_BAD_PARAM:
- return "CUDNN_STATUS_BAD_PARAM";
- case CUDNN_STATUS_INTERNAL_ERROR:
- return "CUDNN_STATUS_INTERNAL_ERROR";
- case CUDNN_STATUS_INVALID_VALUE:
- return "CUDNN_STATUS_INVALID_VALUE";
- case CUDNN_STATUS_ARCH_MISMATCH:
- return "CUDNN_STATUS_ARCH_MISMATCH";
- case CUDNN_STATUS_MAPPING_ERROR:
- return "CUDNN_STATUS_MAPPING_ERROR";
- case CUDNN_STATUS_EXECUTION_FAILED:
- return "CUDNN_STATUS_EXECUTION_FAILED";
- case CUDNN_STATUS_NOT_SUPPORTED:
- return "CUDNN_STATUS_NOT_SUPPORTED";
- case CUDNN_STATUS_LICENSE_ERROR:
- return "CUDNN_STATUS_LICENSE_ERROR";
- default:
- return "Unknown cudnn error number";
- }
-}
-} // namespace internal
-
-// A macro that wraps around a cudnn statement so we can check if the cudnn
-// execution finishes or not.
-#define CUDNN_ENFORCE(condition) \
- do { \
- cudnnStatus_t status = condition; \
- CAFFE_ENFORCE_EQ( \
- status, \
- CUDNN_STATUS_SUCCESS, \
- ", Error at: ", \
- __FILE__, \
- ":", \
- __LINE__, \
- ": ", \
- ::caffe2::internal::cudnnGetErrorString(status)); \
- } while (0)
-#define CUDNN_CHECK(condition) \
- do { \
- cudnnStatus_t status = condition; \
- CHECK(status == CUDNN_STATUS_SUCCESS) \
- << ::caffe2::internal::cudnnGetErrorString(status); \
- } while (0)
-
-// report the version of cuDNN Caffe2 was compiled with
-inline size_t cudnnCompiledVersion() {
- return CUDNN_VERSION;
-}
-// report the runtime version of cuDNN
-inline size_t cudnnRuntimeVersion() {
- return cudnnGetVersion();
-}
-
-// Check compatibility of compiled and runtime cuDNN versions
-inline void CheckCuDNNVersions() {
- // Version format is major*1000 + minor*100 + patch
- // If compiled with version < 7, major, minor and patch must all match
- // If compiled with version >= 7, then either
- // runtime_version > compiled_version
- // major and minor match
- bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
- bool compiled_with_7 = cudnnCompiledVersion() >= 7000;
- bool backwards_compatible_7 = compiled_with_7 && cudnnRuntimeVersion() >= cudnnCompiledVersion();
- bool patch_compatible = compiled_with_7 && (cudnnRuntimeVersion() / 100) == (cudnnCompiledVersion() / 100);
- CAFFE_ENFORCE(version_match || backwards_compatible_7 || patch_compatible,
- "cuDNN compiled (", cudnnCompiledVersion(), ") and "
- "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
-}
-
-/**
- * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type
- * in a template function. The class is specialized explicitly for different
- * data types below.
- */
-template <typename T>
-class cudnnTypeWrapper;
-
-template <>
-class cudnnTypeWrapper<float> {
- public:
- static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
- typedef const float ScalingParamType;
- typedef float BNParamType;
- static ScalingParamType* kOne() {
- static ScalingParamType v = 1.0;
- return &v;
- }
- static const ScalingParamType* kZero() {
- static ScalingParamType v = 0.0;
- return &v;
- }
-};
-
-template <>
-class cudnnTypeWrapper<int> {
- public:
- static const cudnnDataType_t type = CUDNN_DATA_INT32;
- typedef const int ScalingParamType;
- typedef int BNParamType;
- static ScalingParamType* kOne() {
- static ScalingParamType v = 1;
- return &v;
- }
- static const ScalingParamType* kZero() {
- static ScalingParamType v = 0;
- return &v;
- }
-};
-
-template <>
-class cudnnTypeWrapper<double> {
- public:
- static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
- typedef const double ScalingParamType;
- typedef double BNParamType;
- static ScalingParamType* kOne() {
- static ScalingParamType v = 1.0;
- return &v;
- }
- static ScalingParamType* kZero() {
- static ScalingParamType v = 0.0;
- return &v;
- }
-};
-
-template <>
-class cudnnTypeWrapper<at::Half> {
- public:
- static const cudnnDataType_t type = CUDNN_DATA_HALF;
- typedef const float ScalingParamType;
- typedef float BNParamType;
- static ScalingParamType* kOne() {
- static ScalingParamType v = 1.0;
- return &v;
- }
- static ScalingParamType* kZero() {
- static ScalingParamType v = 0.0;
- return &v;
- }
-};
-
-/**
- * A wrapper function to convert the Caffe storage order to cudnn storage order
- * enum values.
- */
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
- switch (order) {
- case StorageOrder::NHWC:
- return CUDNN_TENSOR_NHWC;
- case StorageOrder::NCHW:
- return CUDNN_TENSOR_NCHW;
- default:
- LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
- }
- // Just to suppress compiler warnings
- return CUDNN_TENSOR_NCHW;
-}
-
-/**
- * cudnnTensorDescWrapper is the placeholder that wraps around a
- * cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed during
- * runtime.
- */
-class cudnnTensorDescWrapper {
- public:
- cudnnTensorDescWrapper() {
- CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc_));
- }
- ~cudnnTensorDescWrapper() noexcept {
- CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
- }
-
- inline cudnnTensorDescriptor_t Descriptor(
- const cudnnTensorFormat_t format,
- const cudnnDataType_t type,
- const vector<int>& dims,
- bool* changed) {
- if (type_ == type && format_ == format && dims_ == dims) {
- // if not changed, simply return the current descriptor.
- if (changed)
- *changed = false;
- return desc_;
- }
- CAFFE_ENFORCE_EQ(
- dims.size(), 4U, "Currently only 4-dimensional descriptor supported.");
- format_ = format;
- type_ = type;
- dims_ = dims;
- CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
- desc_,
- format,
- type,
- dims_[0],
- (format == CUDNN_TENSOR_NCHW ? dims_[1] : dims_[3]),
- (format == CUDNN_TENSOR_NCHW ? dims_[2] : dims_[1]),
- (format == CUDNN_TENSOR_NCHW ? dims_[3] : dims_[2])));
- if (changed)
- *changed = true;
- return desc_;
- }
-
- template <typename T>
- inline cudnnTensorDescriptor_t Descriptor(
- const StorageOrder& order,
- const vector<int>& dims) {
- return Descriptor(
- GetCudnnTensorFormat(order), cudnnTypeWrapper<T>::type, dims, nullptr);
- }
-
- private:
- cudnnTensorDescriptor_t desc_;
- cudnnTensorFormat_t format_;
- cudnnDataType_t type_;
- vector<int> dims_;
- C10_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper);
-};
-
-class cudnnFilterDescWrapper {
- public:
- cudnnFilterDescWrapper() {
- CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&desc_));
- }
- ~cudnnFilterDescWrapper() noexcept {
- CUDNN_CHECK(cudnnDestroyFilterDescriptor(desc_));
- }
-
- inline cudnnFilterDescriptor_t Descriptor(
- const StorageOrder& order,
- const cudnnDataType_t type,
- const vector<int>& dims,
- bool* changed) {
- if (type_ == type && order_ == order && dims_ == dims) {
- // if not changed, simply return the current descriptor.
- if (changed)
- *changed = false;
- return desc_;
- }
- CAFFE_ENFORCE_EQ(
- dims.size(), 4U, "Currently only 4-dimensional descriptor supported.");
- order_ = order;
- type_ = type;
- dims_ = dims;
- CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
- desc_,
- type,
- GetCudnnTensorFormat(order),
- dims_[0],
- // TODO - confirm that this is correct for NHWC
- (order == StorageOrder::NCHW ? dims_[1] : dims_[3]),
- (order == StorageOrder::NCHW ? dims_[2] : dims_[1]),
- (order == StorageOrder::NCHW ? dims_[3] : dims_[2])));
- if (changed)
- *changed = true;
- return desc_;
- }
-
- template <typename T>
- inline cudnnFilterDescriptor_t Descriptor(
- const StorageOrder& order,
- const vector<int>& dims) {
- return Descriptor(order, cudnnTypeWrapper<T>::type, dims, nullptr);
- }
-
- private:
- cudnnFilterDescriptor_t desc_;
- StorageOrder order_;
- cudnnDataType_t type_;
- vector<int> dims_;
- C10_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
-};
-
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_CUDNN_H_
diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc
deleted file mode 100644
index e5a2635..0000000
--- a/caffe2/core/common_gpu.cc
+++ /dev/null
@@ -1,253 +0,0 @@
-#include "caffe2/core/common_gpu.h"
-
-#include <atomic>
-#include <cstdlib>
-#include <iostream>
-#include <sstream>
-
-#include <c10/cuda/CUDAFunctions.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/init.h"
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-int NumCudaDevices() {
- if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
- static bool first = true;
- if (first) {
- first = false;
- std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
- << std::endl;
- }
- }
- // It logs warnings on first run
- return c10::cuda::device_count();
-}
-
-namespace {
-int gDefaultGPUID = 0;
-} // namespace
-
-void SetDefaultGPUID(const int deviceid) {
- CAFFE_ENFORCE_LT(
- deviceid,
- NumCudaDevices(),
- "The default gpu id should be smaller than the number of gpus "
- "on this machine: ",
- deviceid,
- " vs ",
- NumCudaDevices());
- gDefaultGPUID = deviceid;
-}
-
-int GetDefaultGPUID() { return gDefaultGPUID; }
-
-int CaffeCudaGetDevice() {
- int gpu_id = 0;
- CUDA_ENFORCE(cudaGetDevice(&gpu_id));
- return gpu_id;
-}
-
-void CaffeCudaSetDevice(const int id) {
- CUDA_ENFORCE(cudaSetDevice(id));
-}
-
-int GetGPUIDForPointer(const void* ptr) {
- cudaPointerAttributes attr;
- cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
-
- if (err == cudaErrorInvalidValue) {
- // Occurs when the pointer is in the CPU address space that is
- // unmanaged by CUDA; make sure the last error state is cleared,
- // since it is persistent
- err = cudaGetLastError();
- CHECK(err == cudaErrorInvalidValue);
- return -1;
- }
-
- // Otherwise, there must be no error
- CUDA_ENFORCE(err);
-
- if (attr.type == cudaMemoryTypeHost) {
- return -1;
- }
-
- return attr.device;
-}
-
-struct CudaDevicePropWrapper {
- CudaDevicePropWrapper() : props(NumCudaDevices()) {
- for (int i = 0; i < NumCudaDevices(); ++i) {
- CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
- }
- }
-
- vector<cudaDeviceProp> props;
-};
-
-const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
- // According to C++11 standard section 6.7, static local variable init is
- // thread safe. See
- // https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
- // for details.
- static CudaDevicePropWrapper props;
- CAFFE_ENFORCE_LT(
- deviceid,
- NumCudaDevices(),
- "The gpu id should be smaller than the number of gpus ",
- "on this machine: ",
- deviceid,
- " vs ",
- NumCudaDevices());
- return props.props[deviceid];
-}
-
-void DeviceQuery(const int device) {
- const cudaDeviceProp& prop = GetDeviceProperty(device);
- std::stringstream ss;
- ss << std::endl;
- ss << "Device id: " << device << std::endl;
- ss << "Major revision number: " << prop.major << std::endl;
- ss << "Minor revision number: " << prop.minor << std::endl;
- ss << "Name: " << prop.name << std::endl;
- ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
- ss << "Total shared memory per block: " << prop.sharedMemPerBlock
- << std::endl;
- ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
- ss << "Warp size: " << prop.warpSize << std::endl;
-#if !defined(USE_ROCM)
- ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
-#endif
- ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
- << std::endl;
- ss << "Maximum dimension of block: "
- << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
- << prop.maxThreadsDim[2] << std::endl;
- ss << "Maximum dimension of grid: "
- << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
- << prop.maxGridSize[2] << std::endl;
- ss << "Clock rate: " << prop.clockRate << std::endl;
- ss << "Total constant memory: " << prop.totalConstMem << std::endl;
-#if !defined(USE_ROCM)
- ss << "Texture alignment: " << prop.textureAlignment << std::endl;
- ss << "Concurrent copy and execution: "
- << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
-#endif
- ss << "Number of multiprocessors: " << prop.multiProcessorCount
- << std::endl;
-#if !defined(USE_ROCM)
- ss << "Kernel execution timeout: "
- << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
-#endif
- LOG(INFO) << ss.str();
- return;
-}
-
-bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
- int gpu_count;
- if (cudaGetDeviceCount(&gpu_count) != cudaSuccess) return false;
- pattern->clear();
- pattern->resize(gpu_count, vector<bool>(gpu_count, false));
- for (int i = 0; i < gpu_count; ++i) {
- for (int j = 0; j < gpu_count; ++j) {
- int can_access = true;
- if (i != j) {
- if (cudaDeviceCanAccessPeer(&can_access, i, j)
- != cudaSuccess) {
- return false;
- }
- }
- (*pattern)[i][j] = static_cast<bool>(can_access);
- }
- }
- return true;
-}
-
-bool TensorCoreAvailable() {
- int device = CaffeCudaGetDevice();
- auto& prop = GetDeviceProperty(device);
-
- return prop.major >= 7;
-}
-
-const char* cublasGetErrorString(cublasStatus_t error) {
- switch (error) {
- case CUBLAS_STATUS_SUCCESS:
- return "CUBLAS_STATUS_SUCCESS";
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "CUBLAS_STATUS_NOT_INITIALIZED";
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "CUBLAS_STATUS_ALLOC_FAILED";
- case CUBLAS_STATUS_INVALID_VALUE:
- return "CUBLAS_STATUS_INVALID_VALUE";
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "CUBLAS_STATUS_ARCH_MISMATCH";
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "CUBLAS_STATUS_INTERNAL_ERROR";
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "CUBLAS_STATUS_MAPPING_ERROR";
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "CUBLAS_STATUS_EXECUTION_FAILED";
- case CUBLAS_STATUS_NOT_SUPPORTED:
- return "CUBLAS_STATUS_NOT_SUPPORTED";
-#if !defined(USE_ROCM)
- case CUBLAS_STATUS_LICENSE_ERROR:
- return "CUBLAS_STATUS_LICENSE_ERROR";
-#endif
- }
- // To suppress compiler warning.
- return "Unrecognized cublas error string";
-}
-
-const char* curandGetErrorString(curandStatus_t error) {
- switch (error) {
- case CURAND_STATUS_SUCCESS:
- return "CURAND_STATUS_SUCCESS";
- case CURAND_STATUS_VERSION_MISMATCH:
- return "CURAND_STATUS_VERSION_MISMATCH";
- case CURAND_STATUS_NOT_INITIALIZED:
- return "CURAND_STATUS_NOT_INITIALIZED";
- case CURAND_STATUS_ALLOCATION_FAILED:
- return "CURAND_STATUS_ALLOCATION_FAILED";
- case CURAND_STATUS_TYPE_ERROR:
- return "CURAND_STATUS_TYPE_ERROR";
- case CURAND_STATUS_OUT_OF_RANGE:
- return "CURAND_STATUS_OUT_OF_RANGE";
- case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
- return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
- case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
- return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
- case CURAND_STATUS_LAUNCH_FAILURE:
- return "CURAND_STATUS_LAUNCH_FAILURE";
- case CURAND_STATUS_PREEXISTING_FAILURE:
- return "CURAND_STATUS_PREEXISTING_FAILURE";
- case CURAND_STATUS_INITIALIZATION_FAILED:
- return "CURAND_STATUS_INITIALIZATION_FAILED";
- case CURAND_STATUS_ARCH_MISMATCH:
- return "CURAND_STATUS_ARCH_MISMATCH";
- case CURAND_STATUS_INTERNAL_ERROR:
- return "CURAND_STATUS_INTERNAL_ERROR";
-#if defined(USE_ROCM)
- case HIPRAND_STATUS_NOT_IMPLEMENTED:
- return "HIPRAND_STATUS_NOT_IMPLEMENTED";
-#endif
- }
- // To suppress compiler warning.
- return "Unrecognized curand error string";
-}
-
-// Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
-// function.
-namespace {
-class CudaRuntimeFlagFlipper {
- public:
- CudaRuntimeFlagFlipper() {
- internal::SetCudaRuntimeFlag();
- }
-};
-static CudaRuntimeFlagFlipper g_flipper;
-} // namespace
-
-} // namespace caffe2
diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h
deleted file mode 100644
index 011f462..0000000
--- a/caffe2/core/common_gpu.h
+++ /dev/null
@@ -1,475 +0,0 @@
-#ifndef CAFFE2_CORE_COMMON_GPU_H_
-#define CAFFE2_CORE_COMMON_GPU_H_
-
-#include <assert.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#if !defined(USE_ROCM)
-#ifdef __GNUC__
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
-#pragma GCC diagnostic push
-#endif
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#endif // __GNUC__
-#endif // USE_ROCM
-
-#include <cublas_v2.h>
-#include <curand.h>
-#include <driver_types.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-
-#include "c10/cuda/CUDAMacros.h"
-#include "c10/cuda/CUDAMathCompat.h"
-#include <c10/cuda/CUDAGuard.h>
-
-#define CAFFE2_CUDA_EXPORT C10_EXPORT
-
-// CAFFE2_CUDA_API gets translated to CAFFE2_HIP_API in hipify script, which
-// causes a marco redefinition issue with the later definition of
-// CAFFE2_HIP_API, so we exclude this definition when HIP is specified
-#if !defined(USE_ROCM)
-#define CAFFE2_CUDA_API TORCH_CUDA_CPP_API
-#endif // USE_ROCM
-
-//TODO: [ROCm] Need to remove this after CUDA->HIP mapping is updated.
-#define CAFFE2_HIP_EXPORT C10_EXPORT
-#define CAFFE2_HIP_API TORCH_HIP_API
-
-// This is a macro defined for cuda fp16 support. In default, cuda fp16 is
-// supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
-// a (custom?) NVCC 7.0. As a result, we would normally just check the cuda
-// version here, but would also allow a use to pass in the flag
-// CAFFE_HAS_CUDA_FP16 manually.
-
-#ifndef CAFFE_HAS_CUDA_FP16
-#define CAFFE_HAS_CUDA_FP16
-#endif // CAFFE_HAS_CUDA_FP16
-
-#ifdef CAFFE_HAS_CUDA_FP16
-#include <cuda_fp16.h>
-#endif
-
-// cuda major revision number below which fp16 compute is not supoorted
-#if !defined(USE_ROCM)
-constexpr int kFp16CUDADevicePropMajor = 6;
-#else
-constexpr int kFp16CUDADevicePropMajor = 3;
-#endif
-
-// Re-enable strict aliasing diagnostic if it was disabled.
-#if !defined(USE_ROCM)
-#ifdef __GNUC__
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
-#pragma GCC diagnostic pop
-#endif
-#endif // __GNUC__
-#endif // USE_ROCM
-
-/**
- * The maximum number of peers that each gpu can have when doing p2p setup.
- * Currently, according to NVidia documentation, each device can support a
- * system-wide maximum of eight peer connections.
- * When Caffe2 sets up peer access resources, if we have more than 8 gpus,
- * we will enable peer access in groups of 8.
- */
-#define CAFFE2_CUDA_MAX_PEER_SIZE 8
-
-namespace caffe2 {
-
-#if !defined(USE_ROCM)
-/**
- * Empty class to identify TensorCore-based math
- */
-class TensorCoreEngine {};
-#endif // USE_ROCM
-
-/**
- * A runtime function to report the cuda version that Caffe2 is built with.
- */
-inline int CudaVersion() {
-#if defined(USE_ROCM)
- return ROCM_VERSION;
-#else
- return CUDA_VERSION;
-#endif
-}
-
-/**
- * Returns the number of devices.
- */
-CAFFE2_CUDA_API int NumCudaDevices();
-
-/**
- * Check if the current running session has a cuda gpu present.
- *
- * Note that this is different from having caffe2 built with cuda. Building
- * Caffe2 with cuda only guarantees that this function exists. If there are no
- * cuda gpus present in the machine, or there are hardware configuration
- * problems like an insufficient driver, this function will still return false,
- * meaning that there is no usable GPU present.
- *
- * In the open source build, it is possible that Caffe2's GPU code is
- * dynamically loaded, and as a result a library could be only linked to the
- * CPU code, but want to test if cuda is later available or not. In this case,
- * one should use HasCudaRuntime() from common.h.
- */
-inline bool HasCudaGPU() {
- return NumCudaDevices() > 0;
-}
-
-/**
- * Gets the current GPU id. This is a simple wrapper around cudaGetDevice().
- */
-CAFFE2_CUDA_API int CaffeCudaGetDevice();
-
-/**
- * Gets the current GPU id. This is a simple wrapper around cudaGetDevice().
- */
-CAFFE2_CUDA_API void CaffeCudaSetDevice(const int id);
-
-/**
- * Gets the GPU id that the current pointer is located at.
- */
-CAFFE2_CUDA_API int GetGPUIDForPointer(const void* ptr);
-
-/**
- * Gets the device property for the given device. This function is thread safe.
- * The initial run on this function is ~1ms/device; however, the results are
- * cached so subsequent runs should be much faster.
- */
-CAFFE2_CUDA_API const cudaDeviceProp& GetDeviceProperty(const int device);
-
-/**
- * Runs a device query function and prints out the results to LOG(INFO).
- */
-CAFFE2_CUDA_API void DeviceQuery(const int deviceid);
-
-/**
- * Return a peer access pattern by returning a matrix (in the format of a
- * nested vector) of boolean values specifying whether peer access is possible.
- *
- * This function returns false if anything wrong happens during the query of
- * the GPU access pattern.
- */
-CAFFE2_CUDA_API bool GetCudaPeerAccessPattern(vector<vector<bool>>* pattern);
-
-/**
- * Return the availability of TensorCores for math
- */
-CAFFE2_CUDA_API bool TensorCoreAvailable();
-
-/**
- * Return a human readable cublas error string.
- */
-CAFFE2_CUDA_API const char* cublasGetErrorString(cublasStatus_t error);
-
-/**
- * Return a human readable curand error string.
- */
-CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
-
-// CUDA: various checks for different function calls.
-#define CUDA_ENFORCE(condition, ...) \
- do { \
- cudaError_t error = condition; \
- CAFFE_ENFORCE_EQ( \
- error, \
- cudaSuccess, \
- "Error at: ", \
- __FILE__, \
- ":", \
- __LINE__, \
- ": ", \
- cudaGetErrorString(error), \
- ##__VA_ARGS__); \
- } while (0)
-#define CUDA_CHECK(condition) \
- do { \
- cudaError_t error = condition; \
- CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
- } while (0)
-
-#define CUDA_DRIVERAPI_ENFORCE(condition) \
- do { \
- CUresult result = condition; \
- if (result != CUDA_SUCCESS) { \
- const char* msg; \
- cuGetErrorName(result, &msg); \
- CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
- } \
- } while (0)
-#define CUDA_DRIVERAPI_CHECK(condition) \
- do { \
- CUresult result = condition; \
- if (result != CUDA_SUCCESS) { \
- const char* msg; \
- cuGetErrorName(result, &msg); \
- LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
- << msg; \
- } \
- } while (0)
-
-#define CUBLAS_ENFORCE(condition) \
- do { \
- cublasStatus_t status = condition; \
- CAFFE_ENFORCE_EQ( \
- status, \
- CUBLAS_STATUS_SUCCESS, \
- "Error at: ", \
- __FILE__, \
- ":", \
- __LINE__, \
- ": ", \
- ::caffe2::cublasGetErrorString(status)); \
- } while (0)
-#define CUBLAS_CHECK(condition) \
- do { \
- cublasStatus_t status = condition; \
- CHECK(status == CUBLAS_STATUS_SUCCESS) \
- << ::caffe2::cublasGetErrorString(status); \
- } while (0)
-
-#define CURAND_ENFORCE(condition) \
- do { \
- curandStatus_t status = condition; \
- CAFFE_ENFORCE_EQ( \
- status, \
- CURAND_STATUS_SUCCESS, \
- "Error at: ", \
- __FILE__, \
- ":", \
- __LINE__, \
- ": ", \
- ::caffe2::curandGetErrorString(status)); \
- } while (0)
-#define CURAND_CHECK(condition) \
- do { \
- curandStatus_t status = condition; \
- CHECK(status == CURAND_STATUS_SUCCESS) \
- << ::caffe2::curandGetErrorString(status); \
- } while (0)
-
-#define CUDA_1D_KERNEL_LOOP(i, n) \
- for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x)
-
-#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
- for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
- i += blockDim.x * gridDim.x) \
- for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
- j += blockDim.y * gridDim.y)
-
-// The following helper functions are here so that you can write a kernel call
-// when you are not particularly interested in maxing out the kernels'
-// performance. Usually, this will give you a reasonable speed, but if you
-// really want to find the best performance, it is advised that you tune the
-// size of the blocks and grids more reasonably.
-// A legacy note: this is derived from the old good Caffe days, when I simply
-// hard-coded the number of threads and wanted to keep backward compatibility
-// for different computation capabilities.
-// For more info on CUDA compute capabilities, visit the NVidia website at:
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
-
-// The number of cuda threads to use. Since work is assigned to SMs at the
-// granularity of a block, 128 is chosen to allow utilizing more SMs for
-// smaller input sizes.
-// 1D grid
-constexpr int CAFFE_CUDA_NUM_THREADS = 128;
-// 2D grid
-constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMX = 16;
-constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMY = 16;
-
-// The maximum number of blocks to use in the default kernel call. We set it to
-// 4096 which would work for compute capability 2.x (where 65536 is the limit).
-// This number is very carelessly chosen. Ideally, one would like to look at
-// the hardware at runtime, and pick the number of blocks that makes most
-// sense for the specific runtime environment. This is a todo item.
-// 1D grid
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;
-// 2D grid
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX = 128;
-constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY = 128;
-
-constexpr int kCUDAGridDimMaxX = 2147483647;
-constexpr int kCUDAGridDimMaxY = 65535;
-constexpr int kCUDAGridDimMaxZ = 65535;
-
-/**
- * @brief Compute the number of blocks needed to run N threads.
- */
-inline int CAFFE_GET_BLOCKS(const int N) {
- return std::max(
- std::min(
- (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
- CAFFE_MAXIMUM_NUM_BLOCKS),
- // Use at least 1 block, since CUDA does not allow empty block
- 1);
-}
-
-/**
- * @brief Compute the number of blocks needed to run N threads for a 2D grid
- */
-inline dim3 CAFFE_GET_BLOCKS_2D(const int N, const int /* M */) {
- dim3 grid;
- // Not calling the 1D version for each dim to keep all constants as literals
-
- grid.x = std::max(
- std::min(
- (N + CAFFE_CUDA_NUM_THREADS_2D_DIMX - 1) /
- CAFFE_CUDA_NUM_THREADS_2D_DIMX,
- CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX),
- // Use at least 1 block, since CUDA does not allow empty block
- 1);
-
- grid.y = std::max(
- std::min(
- (N + CAFFE_CUDA_NUM_THREADS_2D_DIMY - 1) /
- CAFFE_CUDA_NUM_THREADS_2D_DIMY,
- CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY),
- // Use at least 1 block, since CUDA does not allow empty block
- 1);
-
- return grid;
-}
-
-using CUDAGuard = c10::cuda::CUDAGuard;
-
-template <typename T, int N>
-struct SimpleArray {
- T data[N];
-};
-
-constexpr int kCUDATensorMaxDims = 8;
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) \
- do { \
- CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
- switch (val) { \
- case 1: { \
- Func<T, 1>(__VA_ARGS__); \
- break; \
- } \
- case 2: { \
- Func<T, 2>(__VA_ARGS__); \
- break; \
- } \
- case 3: { \
- Func<T, 3>(__VA_ARGS__); \
- break; \
- } \
- case 4: { \
- Func<T, 4>(__VA_ARGS__); \
- break; \
- } \
- case 5: { \
- Func<T, 5>(__VA_ARGS__); \
- break; \
- } \
- case 6: { \
- Func<T, 6>(__VA_ARGS__); \
- break; \
- } \
- case 7: { \
- Func<T, 7>(__VA_ARGS__); \
- break; \
- } \
- case 8: { \
- Func<T, 8>(__VA_ARGS__); \
- break; \
- } \
- default: { \
- break; \
- } \
- } \
- } while (false)
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \
- do { \
- CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
- switch (val) { \
- case 1: { \
- Func<T1, T2, 1>(__VA_ARGS__); \
- break; \
- } \
- case 2: { \
- Func<T1, T2, 2>(__VA_ARGS__); \
- break; \
- } \
- case 3: { \
- Func<T1, T2, 3>(__VA_ARGS__); \
- break; \
- } \
- case 4: { \
- Func<T1, T2, 4>(__VA_ARGS__); \
- break; \
- } \
- case 5: { \
- Func<T1, T2, 5>(__VA_ARGS__); \
- break; \
- } \
- case 6: { \
- Func<T1, T2, 6>(__VA_ARGS__); \
- break; \
- } \
- case 7: { \
- Func<T1, T2, 7>(__VA_ARGS__); \
- break; \
- } \
- case 8: { \
- Func<T1, T2, 8>(__VA_ARGS__); \
- break; \
- } \
- default: { \
- break; \
- } \
- } \
- } while (false)
-
-#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \
- do { \
- CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
- switch (val) { \
- case 1: { \
- Func<T1, T2, T3, 1>(__VA_ARGS__); \
- break; \
- } \
- case 2: { \
- Func<T1, T2, T3, 2>(__VA_ARGS__); \
- break; \
- } \
- case 3: { \
- Func<T1, T2, T3, 3>(__VA_ARGS__); \
- break; \
- } \
- case 4: { \
- Func<T1, T2, T3, 4>(__VA_ARGS__); \
- break; \
- } \
- case 5: { \
- Func<T1, T2, T3, 5>(__VA_ARGS__); \
- break; \
- } \
- case 6: { \
- Func<T1, T2, T3, 6>(__VA_ARGS__); \
- break; \
- } \
- case 7: { \
- Func<T1, T2, T3, 7>(__VA_ARGS__); \
- break; \
- } \
- case 8: { \
- Func<T1, T2, T3, 8>(__VA_ARGS__); \
- break; \
- } \
- default: { \
- break; \
- } \
- } \
- } while (false)
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_GPU_H_
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
deleted file mode 100644
index eb46f78..0000000
--- a/caffe2/core/context.h
+++ /dev/null
@@ -1,227 +0,0 @@
-#ifndef CAFFE2_CORE_CONTEXT_H_
-#define CAFFE2_CORE_CONTEXT_H_
-
-#include <cstdlib>
-#include <ctime>
-#include <random>
-#include <unordered_map>
-
-#include <c10/util/typeid.h>
-#include "caffe2/core/allocator.h"
-#include "caffe2/core/context_base.h"
-#include "caffe2/core/event.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#include <c10/util/ArrayRef.h>
-
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-#include <c10/core/GeneratorImpl.h>
-#include <c10/util/irange.h>
-#include <ATen/core/DistributionsHelper.h>
-#include <ATen/core/MT19937RNGEngine.h>
-#else
-#include "caffe2/core/distributions_stubs.h"
-#endif
-
-C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
-
-namespace caffe2 {
-
-/**
- * A function to generate a random number seed that is unique in a best-effort
- * basis, using an ever-incrementing seed and the current time.
- */
-TORCH_API uint32_t RandomNumberSeed();
-
-/**
- * The CPU Context, representing the bare minimum of what a Context class in
- * Caffe2 should implement.
- *
- * // TODO modify docs
- * See operator.h, especially Operator<Context>, for how Context are used in
- * actual operator implementations that are associated with specific devices.
- * In general, the Context class is passed in as a template argument, and
- * the operator can use the functions defined in the context to execute whatever
- * computation it has.
- *
- */
-class TORCH_API CPUContext final : public BaseContext {
- public:
-#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- class rand_gen_type {
- public:
- explicit rand_gen_type(uint64_t seed_in = default_rng_seed_val)
- : engine_{seed_in} {}
-
- uint32_t random() {
- return engine_();
- }
- uint64_t random64() {
- uint32_t random1 = engine_();
- uint32_t random2 = engine_();
- return (static_cast<uint64_t>(random1) << 32) | random2;
- }
-
- std::optional<float> next_float_normal_sample() {
- return next_float_normal_sample_;
- }
- std::optional<double> next_double_normal_sample() {
- return next_double_normal_sample_;
- }
- void set_next_float_normal_sample(std::optional<float> randn) {
- next_float_normal_sample_ = randn;
- }
- void set_next_double_normal_sample(std::optional<double> randn) {
- next_double_normal_sample_ = randn;
- }
-
- private:
- at::mt19937 engine_;
- std::optional<float> next_float_normal_sample_;
- std::optional<double> next_double_normal_sample_;
- };
-#else
- typedef std::mt19937 rand_gen_type;
-#endif
-
- CPUContext() {}
- explicit CPUContext(const DeviceOption& option)
- : random_seed_(option.has_random_seed() ? option.random_seed() : 1701),
- random_seed_set_(option.has_random_seed() ? true : false) {
- CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU);
- }
- explicit CPUContext(const at::Device& device)
- : CPUContext(DeviceToOption(device)) {}
-
- ~CPUContext() noexcept override {}
-
- inline void SwitchToDevice(int64_t /*stream_id*/) override {}
-
- using BaseContext::SwitchToDevice;
-
- inline void WaitEvent(const Event& ev) override {
- ev.Wait(CPU, this);
- }
-
- inline void Record(Event* ev, const char* err_msg = nullptr) const override {
- CAFFE_ENFORCE(ev, "Event must not be null.");
- ev->Record(CPU, this, err_msg);
- }
-
- inline void FinishDeviceComputation() override {}
-
- inline rand_gen_type* RandGenerator() {
- if (!random_generator_.get()) {
- random_generator_.reset(new rand_gen_type(RandSeed()));
- }
- return random_generator_.get();
- }
-
- inline uint32_t RandSeed() {
- if (!random_seed_set_) {
- random_seed_ = RandomNumberSeed();
- random_seed_set_ = true;
- }
- return static_cast<uint32_t>(random_seed_);
- }
-
- inline static at::DataPtr New(size_t nbytes) {
- return GetCPUAllocator()->allocate(nbytes);
- }
-
- void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override;
-
- void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
- CopyBytesSameDevice(nbytes, src, dst);
- }
-
- void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
- CopyBytesSameDevice(nbytes, src, dst);
- }
-
- bool SupportsNonFundamentalTypes() const override {
- // CPU non fumdamental type copy OK
- return true;
- }
-
- template <class SrcContext, class DstContext>
- inline void CopyBytes(size_t nbytes, const void* src, void* dst);
-
- template <typename T, class SrcContext, class DstContext>
- inline void Copy(size_t n, const T* src, T* dst) {
- if (c10::guts::is_fundamental<T>::value) {
- CopyBytes<SrcContext, DstContext>(
- n * sizeof(T),
- static_cast<const void*>(src),
- static_cast<void*>(dst));
- } else {
- for (const auto i : c10::irange(n)) {
- dst[i] = src[i];
- }
- }
- }
-
- template <class SrcContext, class DstContext>
- inline void
- CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
- if (meta.copy()) {
- meta.copy()(src, dst, n);
- } else {
- CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
- }
- }
-
- // By default CPU operators don't have async device parts
- static bool HasAsyncPartDefault() {
- return false;
- }
-
- static bool SupportsAsyncScheduling() {
- return false;
- }
-
- // CPU streams are not implemented and are silently ignored by CPU ops,
- // return true to signal executor to schedule a CPU op
- static bool IsStreamFree(
- const DeviceOption& /* option */,
- int /* stream_id */) {
- return true;
- }
-
- at::Device device() const override {
- // TODO: numa?
- return at::Device(CPU);
- }
-
- DeviceType device_type() const override {
- return CPU;
- }
-
- static constexpr DeviceType GetDeviceType() {
- return CPU;
- }
-
- protected:
- // TODO(jiayq): instead of hard-coding a generator, make it more flexible.
- int random_seed_{1701};
- bool random_seed_set_{false};
- std::unique_ptr<rand_gen_type> random_generator_;
-};
-
-template <>
-inline void CPUContext::CopyBytes<CPUContext, CPUContext>(
- size_t nbytes,
- const void* src,
- void* dst) {
- if (nbytes == 0) {
- return;
- }
- CAFFE_ENFORCE(src);
- CAFFE_ENFORCE(dst);
- memcpy(dst, src, nbytes);
-}
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_CONTEXT_H_
diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
deleted file mode 100644
index cc8cc4c..0000000
--- a/caffe2/core/context_base.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#pragma once
-
-#include <array>
-#include <cstdlib>
-#include <ctime>
-#include <memory>
-#include <unordered_map>
-
-#include <c10/macros/Macros.h>
-#include <c10/core/Allocator.h>
-#include <c10/util/typeid.h>
-#include <c10/util/Exception.h>
-#include <c10/util/Registry.h>
-#include <c10/core/CopyBytes.h>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-class Event;
-
-} // namespace caffe2
-namespace at {
-
-class BaseContext;
-
-/**
- * Virtual interface for the Context class in Caffe2.
- *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes needs to implement all the pure virtual
- * functions in the BaseContext class.
- * TODO: add docs after this is finalized.
- */
-class TORCH_API BaseContext {
- public:
- virtual ~BaseContext() noexcept {}
-
- virtual Device device() const = 0;
-
- /* Sorry for the naming, will get rid of this in future diff */
- virtual DeviceType device_type() const = 0;
-
- virtual void SwitchToDevice(int64_t /*stream_id*/) = 0;
-
- inline void SwitchToDevice() {
- SwitchToDevice(0);
- }
-
- virtual void WaitEvent(const caffe2::Event& ev) = 0;
-
- virtual void Record(caffe2::Event* ev, const char* err_msg = nullptr)
- const = 0;
-
- virtual void FinishDeviceComputation() = 0;
-
- // This used to be arbitrary cross-device copy, but it turns out everyone
- // did direct CPU-X copy, so we just make three functions for it (to avoid
- // double dispatch). This will get obsoleted by C10. where copies
- // will be proper operators (and get to rely on multiple dispatch there.)
- virtual void CopyBytesSameDevice(
- size_t nbytes,
- const void* src,
- void* dst) = 0;
-
- virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
-
- virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
-
- template <typename T>
- inline void CopySameDevice(size_t n, const T* src, T* dst) {
- static_assert(
- c10::guts::is_fundamental<T>::value,
- "CopySameDevice requires fundamental types");
- CopyBytesSameDevice(
- n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
- }
-
- template <typename T>
- inline void CopyFromCPU(size_t n, const T* src, T* dst) {
- static_assert(
- c10::guts::is_fundamental<T>::value,
- "CopyFromCPU requires fundamental types");
- CopyBytesFromCPU(
- n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
- }
-
- template <typename T>
- inline void CopyToCPU(size_t n, const T* src, T* dst) {
- static_assert(
- c10::guts::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
- CopyBytesToCPU(
- n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
- }
-
- virtual bool SupportsNonFundamentalTypes() const {
- return false;
- }
-
- inline void EnforceMetaCopyOK() {
- AT_ASSERTM(
- SupportsNonFundamentalTypes(), "Context requires fundamental types");
- }
-
- void CopyItemsSameDevice(
- const caffe2::TypeMeta meta,
- size_t n,
- const void* src,
- void* dst) {
- if (meta.copy()) {
- EnforceMetaCopyOK();
- meta.copy()(src, dst, n);
- } else {
- CopyBytesSameDevice(n * meta.itemsize(), src, dst);
- }
- }
-
- void CopyItemsFromCPU(
- const caffe2::TypeMeta meta,
- size_t n,
- const void* src,
- void* dst) {
- if (meta.copy()) {
- EnforceMetaCopyOK();
- meta.copy()(src, dst, n);
- } else {
- CopyBytesFromCPU(n * meta.itemsize(), src, dst);
- }
- }
-
- void CopyItemsToCPU(
- const caffe2::TypeMeta meta,
- size_t n,
- const void* src,
- void* dst) {
- if (meta.copy()) {
- EnforceMetaCopyOK();
- meta.copy()(src, dst, n);
- } else {
- CopyBytesToCPU(n * meta.itemsize(), src, dst);
- }
- }
-};
-
-// Context constructor registry
-C10_DECLARE_TYPED_REGISTRY(
- ContextRegistry,
- at::DeviceType,
- at::BaseContext,
- std::unique_ptr,
- at::Device);
-
-#define REGISTER_CONTEXT(type, ...) \
- C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__)
-
-inline std::unique_ptr<at::BaseContext> CreateContext(
- const at::Device& device) {
- return at::ContextRegistry()->Create(device.type(), device);
-}
-
-} // namespace at
-
-namespace caffe2 {
-
-using at::BaseContext;
-using at::CreateContext;
-} // namespace caffe2
diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
deleted file mode 100644
index ecc933a..0000000
--- a/caffe2/core/context_gpu.cu
+++ /dev/null
@@ -1,669 +0,0 @@
-#include <algorithm>
-#include <atomic>
-#include <cstdlib>
-#include <string>
-#include <unordered_map>
-
-#include <ATen/Context.h>
-#include <c10/cuda/CUDAFunctions.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include "cub/util_allocator.cuh"
-
-// Needed to be included first to check the CAFFE2_USE_CUDNN macros.
-#include "caffe2/core/macros.h"
-
-#include "caffe2/core/blob_stats.h"
-#ifdef CAFFE2_USE_CUDNN
-#include "caffe2/core/common_cudnn.h"
-#endif // CAFFE2_USE_CUDNN
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/core/init.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/utils/string_utils.h"
-#include "caffe2/utils/cub_namespace.cuh"
-
-C10_DEFINE_string(
- caffe2_cuda_memory_pool,
- "",
- "Sets the memory pool used by caffe2. Possible values are "
- "none, cnmem, thc and cub.");
-
-// For description of CUB caching allocator configuration, see
-// https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html
-C10_DEFINE_int(
- caffe2_cub_bin_growth,
- 8,
- "If using cub as the memory allocator, sets the growth of bins "
- "used by the cub pool.");
-C10_DEFINE_int(
- caffe2_cub_min_bin,
- 3,
- "If using cub as the memory allocator, sets the min number of "
- "bins.");
-C10_DEFINE_int(
- caffe2_cub_max_bin,
- 10,
- "If using cub as the memory allocator, sets the max number of "
- "bins.");
-C10_DEFINE_int(
- caffe2_cub_max_managed_mb,
- 10 * 1024,
- "If using cub as the memory allocators, sets the maximum amount "
- "of memory managed in gigabytes");
-
-C10_DEFINE_bool(
- caffe2_cub_print_allocation_events,
- false,
- "If true CachingDeviceAllocator will print allocation and deallocation "
- "events to stdout.");
-
-C10_DEFINE_bool(
- caffe2_gpu_memory_tracking,
- false,
- "If set, logs changes in GPU memory allocations");
-C10_DEFINE_int(
- caffe2_gpu_memory_report_interval_mb,
- 128,
- "The threshold in MB on how frequently to report memory changes");
-
-namespace at {
-
-REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext);
-} // namespace at
-
-namespace caffe2 {
-
-// Generic implementation - CUDA will handle the right function to call for us
-void CUDAContext::CopyBytesAsync(
- size_t nbytes,
- const void* src,
- Device src_device,
- void* dst,
- Device dst_device) {
- // TODO: verify that the CUDA handles copy from device to device correctly
- // even without SetDevice()
- // TODO: verify whether source or dest device should be a priority in picking
- // the stream
- // NB: right now the cross-device copy logic is invoked only in the contexts
- // when surrounding code explicitly manages data dependencies and sets up
- // events, so it's fine. In order to make it a standalone function proper
- // synchronization between stream is required
- int gpu_id = 0;
- if (dst_device.is_cuda()) {
- gpu_id = dst_device.index();
- } else if (src_device.is_cuda()) {
- gpu_id = src_device.index();
- } else {
- LOG(FATAL) << "shouldn't be called with non-cuda device";
- }
- CUDA_ENFORCE(cudaMemcpyAsync(
- dst,
- src,
- nbytes,
- cudaMemcpyDefault,
- CUDAContext::getCudaObjects().GetStream(gpu_id)));
-}
-
-void CUDAContext::CopyBytesSync(
- size_t nbytes,
- const void* src,
- Device src_device,
- void* dst,
- Device dst_device) {
- // This emulates Caffe2 original behavior where sync copy doesn't change the
- // device. It's probably better for clarity to switch to the target device
- // explicitly here, but in the worst case CUDA would sync for us.
- // TODO: change it to CUDAGuard
- CUDAContext context(-1); // take current device
- CUDA_ENFORCE(cudaMemcpyAsync(
- dst, src, nbytes, cudaMemcpyDefault, context.cuda_stream()));
- // destructor of context synchronizes
-}
-
-// For the CPU context, we also allow a (probably expensive) function
-// to copy the data from a cuda context. Inside the function, we create
-// a temporary CUDAContext object to carry out the copy. From the caller's
-// side, these functions are synchronous with respect to the host, similar
-// to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
-template <>
-inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
- size_t nbytes,
- const void* src,
- void* dst) {
- CUDAContext context(GetGPUIDForPointer(src));
- context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
-}
-template <>
-inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
- size_t nbytes,
- const void* src,
- void* dst) {
- CUDAContext context(GetGPUIDForPointer(dst));
- context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
-}
-
-} // namespace caffe2
-
-namespace caffe2 {
-
-ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() {
- static thread_local ThreadLocalCUDAObjects cuda_objects_;
- return cuda_objects_;
-}
-
-// TODO(jiayq): these variables shouldn't be currently accessed during static
-// initialization. We should consider moving them to a Mayer's singleton to
-// be totally safe against SIOF.
-
-// Static global variables for setting up the memory pool.
-CudaMemoryPoolType g_cuda_memory_pool_type;
-
-std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
-
-// an unordered map that holds the map from the cuda memory pointer to the
-// device id that it is allocated from. This is used in the cuda memory pool
-// cases, where we need the device id to carry out the deletion.
-// Note(jiayq): an alternate approach is to use cudaGetPointerAttributes, but
-// that is usually quite slow. We might want to benchmark the speed difference
-// though.
-// Note(jiayq): another alternate approach is to augment the Tensor class that
-// would allow one to record the device id. However, this does not address any
-// non-tensor allocation and deallocation.
-// Ideally, a memory pool should already have the device id information, as
-// long as we are using UVA (as of CUDA 5 and later) so the addresses are
-// unique.
-static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;
-
-// Data structures for optional memory tracking. Access to these structures
-// is guarded by the CUDAContext::mutex.
-static std::unordered_map<void*, long> g_size_map;
-static std::vector<long> g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
-static std::vector<long> g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
-
-static long g_total_mem = 0;
-static long g_last_rep = 0;
-
-CudaMemoryPoolType GetCudaMemoryPoolType() {
- return g_cuda_memory_pool_type;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// A wrapper to allow us to lazily initialize all cuda environments that Caffe
-// uses. This gets done the first time a caffe2::CUDAContext::New() gets called
-// which is probably the decisive indication that this caffe2 run is going to
-// use GPUs. We avoid cuda initialization with core/init.h functionalities so
-// that we have minimal resource impact in case we will need to run multiple
-// caffe2 instances on a GPU machine.
-///////////////////////////////////////////////////////////////////////////////
-
-static void Caffe2InitializeCuda() {
- // If the current run does not have any cuda devices, do nothing.
- if (!HasCudaGPU()) {
- VLOG(1) << "No cuda gpu present. Skipping.";
- return;
- }
- C10_LOG_API_USAGE_ONCE("caffe2.init.cuda");
- // Check if the number of GPUs matches the expected compile-time max number
- // of GPUs.
- CAFFE_ENFORCE_LE(
- NumCudaDevices(),
- C10_COMPILE_TIME_MAX_GPUS,
- "Number of CUDA devices on the machine is larger than the compiled "
- "max number of gpus expected (",
- C10_COMPILE_TIME_MAX_GPUS,
- "). Increase that and recompile.");
-
- for (DeviceIndex i = 0; i < NumCudaDevices(); ++i) {
- CUDAGuard g(i);
- // Enable peer access.
- const int peer_group = i / CAFFE2_CUDA_MAX_PEER_SIZE;
- const int peer_start = peer_group * CAFFE2_CUDA_MAX_PEER_SIZE;
- const int peer_end = std::min(
- NumCudaDevices(), (peer_group + 1) * CAFFE2_CUDA_MAX_PEER_SIZE);
- VLOG(1) << "Enabling peer access within group #" << peer_group
- << ", from gpuid " << peer_start << " to " << peer_end - 1
- << ", for gpuid " << i << ".";
-
- for (int j = peer_start; j < peer_end; ++j) {
- if (i == j) continue;
- int can_access;
- CUDA_ENFORCE(cudaDeviceCanAccessPeer(&can_access, i, j));
- if (can_access) {
- VLOG(1) << "Enabling peer access from " << i << " to " << j;
- // Note: just for future reference, the 0 here is not a gpu id, it is
- // a reserved flag for cudaDeviceEnablePeerAccess that should always be
- // zero currently.
- // It is ok if peer access is already enabled...
- cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0));
- if ((err != cudaErrorPeerAccessAlreadyEnabled) &&
- (err != cudaSuccess)) {
- CAFFE_THROW(cudaGetErrorString(err));
- }
- cudaGetLastError(); // reset cuda error code
- }
- }
- }
-
-#ifdef CAFFE2_USE_CUDNN
- // Check the versions of cuDNN that were compiled and linked with are compatible
- CheckCuDNNVersions();
-#endif // CAFFE2_USE_CUDNN
-}
-
-static void SetUpCub() {
- VLOG(1) << "Setting up cub memory pool.";
- // Sets up the cub memory pool
- try {
- g_cub_allocator.reset(new cub::CachingDeviceAllocator(
- FLAGS_caffe2_cub_bin_growth,
- FLAGS_caffe2_cub_min_bin,
- FLAGS_caffe2_cub_max_bin,
- size_t(FLAGS_caffe2_cub_max_managed_mb) * 1024L * 1024L,
- false,
- FLAGS_caffe2_cub_print_allocation_events));
- } catch (...) {
- CAFFE_THROW("Some error happened at cub initialization.");
- }
- VLOG(1) << "Done setting up cub memory pool.";
-}
-
-static void Caffe2SetCUDAMemoryPool() {
- if (FLAGS_caffe2_cuda_memory_pool == "" ||
- FLAGS_caffe2_cuda_memory_pool == "none") {
- g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
- } else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
- CAFFE_THROW("CNMEM is no longer used by Caffe2. Use cub instead. "
- "This error message may go away in the future.");
- } else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
- // Sets up cub.
- g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
- SetUpCub();
- } else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
- g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
- // Initialize caching allocator
- at::globalContext().lazyInitCUDA();
- } else {
- CAFFE_THROW(
- "Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
- }
-}
-
-/**
- * An allocator that does the CPU memory allocation with pinned memory.
- *
- * This is needed because if we want to do any asynchronous cuda memcpy,
- * the underlying CPU memory also needs to be allocated into pinned memory
- * space. As a result, whenever Caffe2 is built with GPU and there is
- * GPU present during runtime, at global initialization time we will set
- * the CPU memory allocator to allocate pinned memory.
- *
- * NB: This behavior is probably too aggressive. We should consider asking users
- * to do on-demand memory pinning (like exposed in PyTorch APIs) instead.
- */
-struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
- PinnedCPUAllocator() {
- baseAllocator_ = GetDefaultCPUAllocator();
- }
- ~PinnedCPUAllocator() override {}
- at::DataPtr allocate(size_t nbytes) override {
- if (nbytes == 0) {
- // replicate c10::alloc_cpu behavior - return nullptr
- return {nullptr, nullptr, &Delete, at::Device(CPU)};
- }
- void* data;
- at::DataPtr data_ptr;
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- if (IsNUMAEnabled()) {
- at::DeleterFnPtr expected_deleter = baseAllocator_->raw_deleter();
- data_ptr = baseAllocator_->allocate(nbytes);
- data = data_ptr.get();
- CAFFE_ENFORCE(data);
- CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
- CAFFE_ENFORCE(
- data_ptr.compare_exchange_deleter(expected_deleter, &Delete),
- "Failed to swap deleter (already swapped?)");
- } else {
- CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
- profiledCPUMemoryReporter().New(data, nbytes);
- data_ptr = {data, data, &Delete, at::Device(CPU)};
- }
- memset(data, 0, nbytes);
- return data_ptr;
- }
-
- at::DeleterFnPtr raw_deleter() const override {
- return &Delete;
- }
-
- void copy_data(void* dest, const void* src, std::size_t count) const final {
- TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for PinnedCPUAllocator");
- }
-
- private:
- static void Delete(void* data) {
- if (!data) {
- return;
- }
- // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
- // or not. If a CUDAContext::New() call is made, inside the CUDAContext
- // function we will switch the cpu side allocator to a PinnedCPUAllocator.
- // But, if one calls CPUContext::New() before any cuda allocations,
- // PinnedCPUAllocator can still delete the corresponding memory.
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- if (IsNUMAEnabled()) {
- CUDA_ENFORCE(cudaHostUnregister(data));
- GetDefaultCPUAllocator()->raw_deleter()(data);
- } else {
- cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaFreeHost(data));
- profiledCPUMemoryReporter().Delete(data);
- if (err == cudaErrorInvalidValue) {
- free(data);
- // Calling cudaGetLastError will reset the cuda error.
- cudaError_t _err = cudaGetLastError();
- } else {
- // For all other errors, still do a cuda check.
- CUDA_ENFORCE(err);
- }
- }
- }
-
- at::Allocator* baseAllocator_;
-};
-
-static PinnedCPUAllocator g_pinned_cpu_alloc;
-
-// An initialization function that sets the CPU side to use pinned cpu
-// allocator.
-void Caffe2UsePinnedCPUAllocator() {
-#if C10_ASAN_ENABLED
- // Note(jiayq): for more details, see
- // https://github.com/google/sanitizers/issues/629
- LOG(WARNING) << "There are known issues between address sanitizer and "
- "cudaMallocHost. As a result, caffe2 will not enable pinned "
- "memory allocation in asan mode. If you are expecting any "
- "behavior that depends on asan, be advised that it is not "
- "turned on.";
-#else
- if (!HasCudaGPU()) {
- VLOG(1) << "No GPU present. I won't use pinned allocator then.";
- return;
- }
- VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
-
- // If CUDA is enabled, using CPU allocators other than PinnedCPUAllocator
- // will cause memory corruptions. Therefore, we need to set the priority
- // to highest to avoid being overwritten.
- SetCPUAllocator(
- &g_pinned_cpu_alloc,
- std::numeric_limits<uint8_t>::max() /* priority */);
-#endif
-}
-
-// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
-// detect the first hint that this Caffe2 run is going to use GPU: either
-// CUDAContext is initialized or CUDAContext::New is called. It then runs
-// all the related cuda initialization functions.
-namespace {
-struct Caffe2CudaInitializerHelper {
- Caffe2CudaInitializerHelper() {
- // We cannot use bool because nvcc changes bool to __nv_bool which does
- // not have a std::atomic instantiation.
- static std::atomic<char> first_call(1);
- if (first_call.fetch_and((char)0)) {
- Caffe2InitializeCuda();
- Caffe2SetCUDAMemoryPool();
- Caffe2UsePinnedCPUAllocator();
- }
- }
-};
-} // namespace
-
-/**
- * A utility function to rectify the gpu id. If the context specifies the
- * gpu id to be -1, it means that we will just use the current gpu id when
- * the function is being called.
- */
-static inline DeviceIndex RectifyGPUID(DeviceIndex gpu_id) {
- return gpu_id == -1 ? CaffeCudaGetDevice() : gpu_id;
-}
-
-CUDAContext::CUDAContext(DeviceIndex gpu_id)
- : gpu_id_(RectifyGPUID(gpu_id)), random_seed_(RandomNumberSeed()) {
- static Caffe2CudaInitializerHelper g_cuda_initializer_;
-}
-
-CUDAContext::CUDAContext(const DeviceOption& option)
- : gpu_id_(
- option.has_device_id() ? RectifyGPUID(option.device_id())
- : CaffeCudaGetDevice()),
- random_seed_(
- option.has_random_seed() ? option.random_seed()
- : RandomNumberSeed()) {
- static Caffe2CudaInitializerHelper g_cuda_initializer_;
- TORCH_DCHECK_EQ(option.device_type(), PROTO_CUDA);
-}
-
-CUDAContext::~CUDAContext() {
- try {
- if (curand_generator_) {
- CURAND_CHECK(curandDestroyGenerator(curand_generator_));
- }
- // CUDAContext is used in 2 cases now:
- // - long-lived instance inside OperatorBase in which case what happens in
- // destructor doesn't really matter
- // - short-lived on-the-fly instances that are utilized as CUDAGuard - in
- // this case there's only one stream id (passed to SwitchToDevice) and
- // it's preferrable to synchronize in the destructor
- FinishDeviceComputation();
- } catch (const std::exception& e) {
- LOG(ERROR) << "Encountered following in " << __FUNCTION__ << ": " << e.what();
- }
-}
-
-// shared mutex to lock out alloc / free during NCCL launches
-std::mutex& CUDAContext::mutex() {
- static std::mutex m;
- return m;
-}
-
-std::vector<long> CUDAContext::TotalMemoryByGpu() {
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- CAFFE_ENFORCE(
- FLAGS_caffe2_gpu_memory_tracking,
- "Pass --caffe2_gpu_memory_tracking to enable memory stats");
- return g_total_by_gpu_map;
-}
-
-std::vector<long> CUDAContext::MaxMemoryByGpu() {
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- CAFFE_ENFORCE(
- FLAGS_caffe2_gpu_memory_tracking,
- "Pass --caffe2_gpu_memory_tracking to enable memory stats");
- return g_max_by_gpu_map;
-}
-
-namespace {
-void TrackMemoryAlloc(size_t nbytes) {
- int this_gpu = CaffeCudaGetDevice();
- g_total_by_gpu_map[this_gpu] += nbytes;
- g_max_by_gpu_map[this_gpu] =
- std::max(g_max_by_gpu_map[this_gpu], g_total_by_gpu_map[this_gpu]);
- g_total_mem += nbytes;
- if (g_total_mem - g_last_rep >
- FLAGS_caffe2_gpu_memory_report_interval_mb * 1024 * 1024) {
- for (int gpu = 0; gpu < g_total_by_gpu_map.size(); gpu++) {
- long t = g_total_by_gpu_map[gpu];
- long max_t = g_max_by_gpu_map[gpu];
- if (max_t > 0) {
- if (max_t != t) {
- VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB"
- << " (max: " << max_t / 1024 / 1024 << " MB)";
- } else {
- VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB";
- }
- }
- }
- VLOG(1) << "Total: " << g_total_mem / 1024 / 1024 << " MB";
- g_last_rep = g_total_mem;
- }
-}
-}
-
-struct DefaultCUDAAllocator final : public at::Allocator {
- DefaultCUDAAllocator() {}
- ~DefaultCUDAAllocator() override {}
- at::DataPtr allocate(size_t nbytes) override {
- // Lock the mutex
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- // A one-time caffe2 cuda initializer.
- static Caffe2CudaInitializerHelper g_cuda_initializer_;
- void* ptr = nullptr;
-
- if (FLAGS_caffe2_gpu_memory_tracking) {
- TrackMemoryAlloc(nbytes);
- }
- switch (g_cuda_memory_pool_type) {
- case CudaMemoryPoolType::NONE:
- if (nbytes != 0) {
- CUDA_ENFORCE(cudaMalloc(&ptr, nbytes));
- }
- if (FLAGS_caffe2_gpu_memory_tracking) {
- g_size_map[ptr] = nbytes;
- g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
- }
- return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
- case CudaMemoryPoolType::CUB:
- if (nbytes != 0) {
- CUDA_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
- }
- g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
- VLOG(2) << "CUB allocating pointer " << ptr << " on device "
- << CaffeCudaGetDevice();
- if (FLAGS_caffe2_gpu_memory_tracking) {
- g_size_map[ptr] = nbytes;
- }
- return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
- case CudaMemoryPoolType::THC:
- {
- // The reason we have this stream guard here is to preserve
- // the historical behavior of the 'thc' allocator in Caffe2,
- // which is to put all allocations on the same (default)
- // stream. This behavior is morally wrong (since passing
- // allocations between streams allows for the possibility
- // of you handing out some memory that an old stream
- // is still working on), but it doesn't seem to cause issues
- // in Caffe2 today. Our hypothesis for why this is the case
- // is that Caffe2 doesn't really do very many allocations
- // on the fly; instead they allocate once and then reuse
- // the allocations for the whole program. In this case,
- // the hazard is avoided.
- //
- // We intend to remove this stream guard, but the benefit
- // to putting all allocations on the same stream is it
- // reduces per-stream fragmentation, and this helps
- // some models that are currently running with the thc
- // allocator fit in memory. We will need to find some
- // way of resolving this problem.
- c10::cuda::CUDAStreamGuard g(
- Stream(
- Stream::DEFAULT,
- Device(kCUDA, CaffeCudaGetDevice())
- ));
- ptr = c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
- }
- if (FLAGS_caffe2_gpu_memory_tracking) {
- g_size_map[ptr] = nbytes;
- g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
- }
- return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
- }
- return {nullptr, nullptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
- }
-
- at::DeleterFnPtr raw_deleter() const override {
- return &Delete;
- }
-
- void copy_data(void* dest, const void* src, std::size_t count) const final {
- TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for DefaultCUDAAllocator");
- }
-
- private:
- static void Delete(void* ptr) {
- // lock the mutex
- std::lock_guard<std::mutex> lock(CUDAContext::mutex());
- if (FLAGS_caffe2_gpu_memory_tracking) {
- auto sz_it = g_size_map.find(ptr);
- DCHECK(sz_it != g_size_map.end());
- auto aff_it = g_cuda_device_affiliation.find(ptr);
- DCHECK(aff_it != g_cuda_device_affiliation.end());
- g_total_mem -= sz_it->second;
- g_total_by_gpu_map[aff_it->second] -= sz_it->second;
- g_size_map.erase(sz_it);
- }
-
- switch (g_cuda_memory_pool_type) {
- case CudaMemoryPoolType::NONE: {
- // If memory pool is not set up, use simple cudaFree.
- cudaError_t error = C10_CUDA_ERROR_HANDLED(cudaFree(ptr));
- // For some reason, in Python runtime we sometimes delete a data pointer
- // after the cuda runtime exits - this is odd but is probably caused by
- // a static workspace that pycaffe2 uses, and the destruction got
- // entangled in some race condition. Anyway, since cuda runtime is
- // exiting anyway, we will not need to worry about memory leak, so we
- // basically ignore it. This is definitely not ideal but works for now.
- if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
- LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
- << cudaGetErrorString(error);
- }
-
- if (FLAGS_caffe2_gpu_memory_tracking) {
- g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
- }
-
- break;
- }
- case CudaMemoryPoolType::CUB: {
- auto it = g_cuda_device_affiliation.find(ptr);
- DCHECK(it != g_cuda_device_affiliation.end());
- VLOG(2) << "CUB freeing pointer " << ptr << " on device " << it->second;
- CUDA_ENFORCE(g_cub_allocator->DeviceFree(it->second, ptr));
- g_cuda_device_affiliation.erase(it);
- break;
- }
- case CudaMemoryPoolType::THC: {
- c10::cuda::CUDACachingAllocator::raw_delete(ptr);
- if (FLAGS_caffe2_gpu_memory_tracking) {
- g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
- }
- break;
- }
- }
- }
-};
-
-static DefaultCUDAAllocator g_cuda_alloc;
-REGISTER_ALLOCATOR(CUDA, &g_cuda_alloc);
-
-} // namespace caffe2
-
-namespace at {
-REGISTER_COPY_BYTES_FUNCTION(
- DeviceType::CUDA,
- DeviceType::CUDA,
- caffe2::CUDAContext::CopyBytesSync,
- caffe2::CUDAContext::CopyBytesAsync);
-
-REGISTER_COPY_BYTES_FUNCTION(
- DeviceType::CUDA,
- DeviceType::CPU,
- caffe2::CUDAContext::CopyBytesSync,
- caffe2::CUDAContext::CopyBytesAsync);
-
-REGISTER_COPY_BYTES_FUNCTION(
- DeviceType::CPU,
- DeviceType::CUDA,
- caffe2::CUDAContext::CopyBytesSync,
- caffe2::CUDAContext::CopyBytesAsync);
-} // namespace at
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
deleted file mode 100644
index 8490a50..0000000
--- a/caffe2/core/context_gpu.h
+++ /dev/null
@@ -1,354 +0,0 @@
-#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
-#define CAFFE2_CORE_CONTEXT_GPU_H_
-
-#include <ctime>
-#include <mutex>
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/common_gpu.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/context_base.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/numa.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/types.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-// Since we are using the macro CAFFE2_USE_CUDNN, we will need to include this
-// file after common.h is included.
-#ifdef CAFFE2_USE_CUDNN
-#include "caffe2/core/common_cudnn.h"
-#endif // CAFFE2_USE_CUDNN
-
-#include <c10/core/Device.h>
-#include <c10/core/Stream.h>
-#include <c10/cuda/CUDAStream.h>
-#include <c10/cuda/CUDAGuard.h>
-
-namespace caffe2 {
-
-enum class CudaMemoryPoolType {
- NONE = 0,
- CUB = 1,
- THC = 2,
-};
-
-/**
- * Gets the current memory pool type used by Caffe2.
- *
- * The memory pool is set up during caffe2's global initialization time.
- */
-CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType();
-
-/**
- * A struct to host thread-local cuda objects.
- *
- * In Caffe2, each thread has its own non-default cuda stream as well as
- * related objects such as cublas and curand handles. This is achieved by
- * having the ThreadLocalCUDAObjects wrapper that takes care of allocating
- * and deallocating these objects at the thread scope. This class is solely
- * used inside CUDAContext and should not be used externally.
- *
- * This class manages the mapping from logical stream ID (int stream_id
- * passed around in Caffe2) and CUDAStream objects. We intend to eventually
- * deprecate the logical stream ID interface, but not for now.
- */
-class CAFFE2_CUDA_API ThreadLocalCUDAObjects {
- friend class CUDAContext;
-
- private:
- ThreadLocalCUDAObjects() {
- for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) {
- cuda_streams_[i] = vector<c10::cuda::CUDAStream>();
- }
- }
-
- // Record current stream id for the current thread.
- // This is the new API we're trying to migrate use cases to and get rid of
- // explicit stream id passing. For now it's invoked in
- // CUDAContext::SwitchToDevice
- void SetCurrentStreamId(DeviceIndex gpu, StreamId stream_id) {
- // TODO: use current device id from thread local instead of passing gpu in
- if (stream_id != -1) {
- c10::cuda::setCurrentCUDAStream(GetCUDAStream(gpu, stream_id));
- }
- }
-
- // Retrieves the CUDAStream corresponding to a logical stream ID, ensuring
- // that it exists in cuda_streams_ if it has not been allocated yet.
- c10::cuda::CUDAStream GetCUDAStream(DeviceIndex gpu, StreamId stream_id) {
- vector<c10::cuda::CUDAStream>& gpu_streams = cuda_streams_[gpu];
- while (gpu_streams.size() <= static_cast<size_t>(stream_id)) {
- // NB: This streams are not guaranteed to be unique; we'll
- // wrap around once we run out of streams in the pool.
- gpu_streams.emplace_back(c10::cuda::getStreamFromPool(/* high priority */ false, gpu));
- }
- return gpu_streams[stream_id];
- }
-
- // Uses the logical stream id from the thread local to pick the stream
- // We're going to migrate all usages to this case API instead of passing the
- // stream id directly
- cudaStream_t GetStream(DeviceIndex gpu) {
- return c10::cuda::getCurrentCUDAStream(gpu).stream();
- }
-
- cudaStream_t GetStream(DeviceIndex gpu, StreamId stream_id) {
- return GetCUDAStream(gpu, stream_id).stream();
- }
-
- // Uses the logical stream id from the thread local to pick the stream
- // We're going to migrate all usages to this case API instead of passing the
- // stream id directly
- cublasHandle_t GetHandle(DeviceIndex gpu) {
- return GetHandle(c10::cuda::getCurrentCUDAStream(gpu));
- }
-
- cublasHandle_t GetHandle(c10::cuda::CUDAStream cuda_stream) {
- CUDAGuard guard(cuda_stream.device_index());
- // Default construct in the map if it doesn't exist, and return a mutable
- // reference to it.
- auto& r = cublas_handles_[cuda_stream];
- if (r == nullptr) {
- CUBLAS_ENFORCE(cublasCreate(&r));
- // The default is CUBLAS_POINTER_MODE_HOST. You can override
- // it after obtaining the cublas handle, but do that with
- // caution.
- CUBLAS_ENFORCE(cublasSetPointerMode(r, CUBLAS_POINTER_MODE_HOST));
- CUBLAS_ENFORCE(cublasSetStream(r, cuda_stream));
- }
- return r;
- }
-
-#ifdef CAFFE2_USE_CUDNN
- // Uses the logical stream id from the thread local to pick the stream
- // We're going to migrate all usages to this case API instead of passing the
- // stream id directly
- cudnnHandle_t GetCudnnHandle(DeviceIndex gpu) {
- return GetCudnnHandle(c10::cuda::getCurrentCUDAStream(gpu));
- }
-
- cudnnHandle_t GetCudnnHandle(c10::cuda::CUDAStream cuda_stream) {
- CUDAGuard guard(cuda_stream.device_index());
- auto& r = cudnn_handles_[cuda_stream];
- if (r == nullptr) {
- CUDNN_ENFORCE(cudnnCreate(&r));
- CUDNN_ENFORCE(cudnnSetStream(r, cuda_stream));
- }
- return r;
- }
-#endif // CAFFE2_USE_CUDNN
-
- ~ThreadLocalCUDAObjects() noexcept {
- for (auto element : cublas_handles_) {
- if (element.second) {
- CUBLAS_CHECK(cublasDestroy(element.second));
- }
- }
-#ifdef CAFFE2_USE_CUDNN
- for (auto element : cudnn_handles_) {
- if (element.second) {
-#ifdef _WIN32
- // this is because of something dumb in the ordering of
- // destruction. Sometimes at exit, the cuda context would already
- // be destroyed by the time this gets destroyed. This happens on
- // windows with cuda 11 and cuda 12.
- cudnnDestroy(element.second);
-#else
- CUDNN_CHECK(cudnnDestroy(element.second));
-#endif // _WIN32
- }
- }
-#endif // CAFFE2_USE_CUDNN
- }
- // WARNING: mapping from logical stream ID to c10::cuda::CUDAStream
- // is NOT bijective; multiple logical stream IDs may map to the
- // same underlying stream ID.
- vector<c10::cuda::CUDAStream> cuda_streams_[C10_COMPILE_TIME_MAX_GPUS];
- std::unordered_map<c10::cuda::CUDAStream, cublasHandle_t> cublas_handles_;
-#ifdef CAFFE2_USE_CUDNN
- std::unordered_map<c10::cuda::CUDAStream, cudnnHandle_t> cudnn_handles_;
-#endif // CAFFE2_USE_CUDNN
-};
-
-class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
- public:
- // The default cuda context constructor.
- explicit CUDAContext(DeviceIndex gpu_id = -1);
- explicit CUDAContext(const DeviceOption& option);
- explicit CUDAContext(Device device)
- : CUDAContext(DeviceToOption(device)) {}
-
- ~CUDAContext() override;
-
- inline void SwitchToDevice(StreamId stream_id) override {
- getCudaObjects().SetCurrentStreamId(gpu_id_, stream_id);
- CaffeCudaSetDevice(gpu_id_);
- }
-
- // void SwitchToDevice()
- using BaseContext::SwitchToDevice;
-
- inline void WaitEvent(const Event& ev) override {
- ev.Wait(CUDA, this);
- }
-
- inline void Record(Event* ev, const char* err_msg = nullptr) const override {
- CAFFE_ENFORCE(ev, "Event must not be null.");
- ev->Record(CUDA, this, err_msg);
- }
-
- // Note on current use cases:
- // FinishDeviceComputation must be called on the same cpu thread as
- // SwitchToDevice()
- void FinishDeviceComputation() override {
- CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
- }
-
- inline int device_id() const {
- return gpu_id_;
- }
-
- inline c10::cuda::CUDAStream stream() const {
- return at::cuda::getStreamFromExternal(getCudaObjects().GetStream(gpu_id_), gpu_id_);
- }
-
- inline cudaStream_t cuda_stream() const {
- return getCudaObjects().GetStream(gpu_id_);
- }
-
- static cudaStream_t cuda_stream(DeviceIndex gpu_id, StreamId stream_id) {
- return getCudaObjects().GetStream(gpu_id, stream_id);
- }
-
- cublasHandle_t cublas_handle() {
- return getCudaObjects().GetHandle(gpu_id_);
- }
-
-#ifdef CAFFE2_USE_CUDNN
- cudnnHandle_t cudnn_handle() {
- return getCudaObjects().GetCudnnHandle(gpu_id_);
- }
-#endif // CAFFE2_USE_CUDNN
-
- curandGenerator_t& curand_generator() {
- if (!curand_generator_) {
- CUDAGuard guard(gpu_id_);
- CURAND_ENFORCE(
- curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
- CURAND_ENFORCE(
- curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
- TORCH_CHECK_NOTNULL(curand_generator_);
- }
- CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
- return curand_generator_;
- }
-
- inline static at::DataPtr New(size_t nbytes) {
- return GetAllocator(CUDA)->allocate(nbytes);
- }
-
- // Get a mutex to lock out cudaMalloc / cudaFree calls when
- // NCCL kernels are being launched. Should remove threat of
- // deadlocks
- static std::mutex& mutex();
-
- // Functions to query memory stats. Only available if flag
- // --caffe2_gpu_memory_tracking is enabled.
- static std::vector<long> TotalMemoryByGpu();
- static std::vector<long> MaxMemoryByGpu();
-
- template <class SrcContext, class DstContext>
- inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
- CUDA_ENFORCE(cudaMemcpyAsync(
- dst,
- src,
- nbytes,
- cudaMemcpyDefault,
- getCudaObjects().GetStream(gpu_id_)));
- }
-
- void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
- CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
- }
-
- void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
- CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
- }
-
- void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
- CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
- }
-
- template <typename T, class SrcContext, class DstContext>
- inline void Copy(int n, const T* src, T* dst) {
- CopyBytes<SrcContext, DstContext>(n * sizeof(T),
- static_cast<const void*>(src),
- static_cast<void*>(dst));
- }
-
- template <class SrcContext, class DstContext>
- inline void
- CopyItems(const TypeMeta meta, size_t n, const void* src, void* dst) {
- CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
- CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
- }
-
- static void CopyBytesAsync(
- size_t nbytes,
- const void* src,
- Device src_device,
- void* dst,
- Device dst_device);
- static void CopyBytesSync(
- size_t nbytes,
- const void* src,
- Device src_device,
- void* dst,
- Device dst_device);
-
- // By default CUDA operators have async device parts
- static bool HasAsyncPartDefault() {
- return true;
- }
-
- static bool SupportsAsyncScheduling() {
- return true;
- }
-
- static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
- const auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
- const auto status = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream));
- if (status == cudaErrorNotReady) {
- // ignore and clear the error if not ready
- C10_CUDA_CLEAR_ERROR();
- } else {
- C10_CUDA_CHECK(status); // Reraise error
- }
- return status == cudaSuccess;
- }
-
- at::Device device() const override {
- return at::Device(CUDA, gpu_id_);
- }
-
- DeviceType device_type() const override {
- return CUDA;
- }
-
- static constexpr DeviceType GetDeviceType() {
- return CUDA;
- }
-
- protected:
- int gpu_id_;
- int random_seed_;
- curandGenerator_t curand_generator_{nullptr};
- static ThreadLocalCUDAObjects& getCudaObjects();
-};
-
-using TensorCUDA = Tensor;
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_CONTEXT_GPU_H_
diff --git a/caffe2/core/event_gpu.cc b/caffe2/core/event_gpu.cc
deleted file mode 100644
index 82000de..0000000
--- a/caffe2/core/event_gpu.cc
+++ /dev/null
@@ -1,227 +0,0 @@
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/core/event_cpu.h"
-#include "caffe2/core/operator.h"
-
-#include <atomic>
-#include <iostream>
-
-namespace caffe2 {
-
-struct CudaEventWrapper {
- explicit CudaEventWrapper(const DeviceOption& option)
- : cuda_stream_(nullptr),
- device_id_(option.device_id()),
- status_(EventStatus::EVENT_INITIALIZED) {
- CAFFE_ENFORCE(option.device_type(), PROTO_CUDA);
- CUDAGuard g(device_id_);
- try {
- CUDA_ENFORCE(cudaEventCreateWithFlags(
- &cuda_event_, cudaEventDefault | cudaEventDisableTiming));
- } catch (const Error&) {
- std::cerr << "ERROR: Failed to load CUDA.\n"
- << "HINT: Check that this binary contains GPU code."
- << std::endl;
- throw;
- }
- }
- ~CudaEventWrapper() {
- CUDAGuard g(device_id_);
- CUDA_CHECK(cudaEventDestroy(cuda_event_));
- }
-
- cudaEvent_t cuda_event_;
- cudaStream_t cuda_stream_;
- int device_id_;
-
- std::atomic<int> status_;
- std::mutex mutex_recorded_;
- std::condition_variable cv_recorded_;
- std::string err_msg_;
-};
-
-namespace {
-const std::string kNoError = "No error";
-}
-
-void EventCreateCUDA(const DeviceOption& option, Event* event) {
- event->event_ = std::make_shared<CudaEventWrapper>(option);
-}
-
-void EventRecordCUDA(Event* event, const void* context, const char* err_msg) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- {
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-
- // Possible state changes:
- // INITIALIZED -> SCHEDULED/FAILED
- // SCHEDULED -> SUCCESS/FAILED
- // SUCCESS/FAILED - terminal
- //
- // No further changes to cuda_event_ and cuda_stream_ after transitioning
- // from INITIALIZED
- // No further changes to err_msg_ after transitioning into FAILED
-
- CAFFE_ENFORCE_EQ(
- wrapper->status_,
- EventStatus::EVENT_INITIALIZED,
- "Calling Record multiple times");
-
- if (!err_msg) {
- // When recording, one needs to make sure that the current gpu id is
- // correct.
- // TODO(jiayq): move the enforce logic to the caller?
- const auto& current_device = CaffeCudaGetDevice();
- CAFFE_ENFORCE_EQ(
- current_device,
- wrapper->device_id_,
- "When you call EventRecordCUDA, your current device should be the same "
- "as the device specified by the event.");
- CAFFE_ENFORCE_EQ(
- current_device,
- static_cast<const CUDAContext*>(context)->device_id());
- CUDA_ENFORCE(cudaEventRecord(
- wrapper->cuda_event_,
- static_cast<const CUDAContext*>(context)->cuda_stream()));
- wrapper->cuda_stream_ =
- static_cast<const CUDAContext*>(context)->cuda_stream();
- wrapper->status_ = EventStatus::EVENT_SCHEDULED;
- } else {
- wrapper->err_msg_ = err_msg;
- wrapper->status_ = EventStatus::EVENT_FAILED;
- }
- }
- wrapper->cv_recorded_.notify_all();
-}
-
-void EventFinishCUDA(const Event* event) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- {
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
- while (wrapper->status_ == EventStatus::EVENT_INITIALIZED) {
- wrapper->cv_recorded_.wait(lock);
- }
- }
-
- if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
- // ok, even if event is already completed and status was not yet updated
- CUDAGuard g(wrapper->device_id_);
- auto cudaResult = cudaEventSynchronize(wrapper->cuda_event_);
- if (cudaResult == cudaSuccess) {
- wrapper->status_ = EventStatus::EVENT_SUCCESS;
- } else {
- const auto& err_msg = cudaGetErrorString(cudaResult);
-
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
- wrapper->err_msg_ = err_msg;
- wrapper->status_ = EventStatus::EVENT_FAILED;
- }
- }
-}
-
-// Both waiter and event are CUDA. Non-blocking
-void EventWaitCUDACUDA(const Event* event, void* context) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- {
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
- while (wrapper->status_ == EventStatus::EVENT_INITIALIZED) {
- wrapper->cv_recorded_.wait(lock);
- }
- }
-
- if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
- // ok, even if event is already completed and status was not yet updated
- auto context_stream = static_cast<CUDAContext*>(context)->cuda_stream();
- auto event_stream = wrapper->cuda_stream_;
- if (context_stream != event_stream) {
- // CAFFE_ENFORCE_EQ(
- // CaffeCudaGetDevice(),
- // static_cast<const CUDAContext*>(context)->device_id());
- CUDA_CHECK(cudaStreamWaitEvent(context_stream, wrapper->cuda_event_, 0));
- }
- }
-}
-
-// Waiter is CPU, event is CUDA
-void EventWaitCPUCUDA(const Event* event, void* context) {
- EventFinishCUDA(event);
-}
-
-// Waiter is CUDA, event is CPU
-void EventWaitCUDACPU(const Event* event, void* context) {
- event->Finish(); // calls EventFinishCPU
-}
-
-EventStatus EventQueryCUDA(const Event* event) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- if (wrapper->status_ == EventStatus::EVENT_SCHEDULED) {
- auto cudaResult = cudaEventQuery(wrapper->cuda_event_);
- if (cudaResult == cudaSuccess) {
- wrapper->status_ = EventStatus::EVENT_SUCCESS;
- } else if (cudaResult != cudaErrorNotReady) {
- const auto& err_msg = cudaGetErrorString(cudaResult);
-
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
- wrapper->err_msg_ = err_msg;
- wrapper->status_ = EventStatus::EVENT_FAILED;
- } else {
- // ignore and clear the error if not ready
- (void)cudaGetLastError();
- }
- }
- return static_cast<EventStatus>(wrapper->status_.load());
-}
-
-const std::string& EventErrorMessageCUDA(const Event* event) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- // supposed to be called after EventQueryCUDA to update status first
- if (wrapper->status_ == EventStatus::EVENT_FAILED) {
- return wrapper->err_msg_;
- } else {
- return kNoError;
- }
-}
-
-void EventSetFinishedCUDA(const Event* event, const char* err_msg) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- {
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
-
- CAFFE_ENFORCE_EQ(
- wrapper->status_,
- EventStatus::EVENT_INITIALIZED,
- "Calling SetFinished on recorded CUDA event");
-
- if (!err_msg) {
- wrapper->status_ = EventStatus::EVENT_SUCCESS;
- } else {
- wrapper->err_msg_ = err_msg;
- wrapper->status_ = EventStatus::EVENT_FAILED;
- }
- }
- wrapper->cv_recorded_.notify_all();
-}
-
-void EventResetCUDA(Event* event) {
- auto* wrapper = static_cast<CudaEventWrapper*>(event->event_.get());
- std::unique_lock<std::mutex> lock(wrapper->mutex_recorded_);
- wrapper->status_ = EventStatus::EVENT_INITIALIZED;
- wrapper->err_msg_ = "";
- wrapper->cuda_stream_ = nullptr;
-}
-
-REGISTER_EVENT_CREATE_FUNCTION(CUDA, EventCreateCUDA);
-REGISTER_EVENT_RECORD_FUNCTION(CUDA, EventRecordCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, CUDA, EventWaitCUDACUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CPU, CUDA, EventWaitCPUCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, CPU, EventWaitCUDACPU);
-REGISTER_EVENT_FINISH_FUNCTION(CUDA, EventFinishCUDA);
-
-REGISTER_EVENT_QUERY_FUNCTION(CUDA, EventQueryCUDA);
-REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(CUDA, EventErrorMessageCUDA);
-REGISTER_EVENT_SET_FINISHED_FUNCTION(CUDA, EventSetFinishedCUDA);
-REGISTER_EVENT_RESET_FUNCTION(CUDA, EventResetCUDA);
-
-REGISTER_EVENT_WAIT_FUNCTION(MKLDNN, CUDA, EventWaitCPUCUDA);
-REGISTER_EVENT_WAIT_FUNCTION(CUDA, MKLDNN, EventWaitCUDACPU);
-
-} // namespace caffe2
diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h
deleted file mode 100644
index 54f1f41..0000000
--- a/caffe2/core/flags.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#pragma once
-
-#include "c10/util/Flags.h"
-#include "caffe2/core/common.h"
diff --git a/caffe2/core/hip/common_miopen.h b/caffe2/core/hip/common_miopen.h
deleted file mode 100644
index 6901055..0000000
--- a/caffe2/core/hip/common_miopen.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef CAFFE2_CORE_COMMON_MIOPEN_H_
-#define CAFFE2_CORE_COMMON_MIOPEN_H_
-
-#include <array>
-#include <mutex>
-#include "miopen/miopen.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/types.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#define MIOPEN_VERSION 1399
-
-namespace caffe2 {
-
-namespace internal {
-/**
- * A helper function to obtain miopen error strings.
- */
-inline const char* miopenGetErrorString(miopenStatus_t status)
-{
- switch(status)
- {
- case miopenStatusSuccess: return "MIOPEN_STATUS_SUCCESS";
- case miopenStatusNotInitialized: return "MIOPEN_STATUS_NOT_INITIALIZED";
- case miopenStatusAllocFailed: return "MIOPEN_STATUS_ALLOC_FAILED";
- case miopenStatusBadParm: return "MIOPEN_STATUS_BAD_PARAM";
- case miopenStatusInternalError: return "MIOPEN_STATUS_INTERNAL_ERROR";
- case miopenStatusInvalidValue: return "MIOPEN_STATUS_INVALID_VALUE";
- case miopenStatusNotImplemented: return "MIOPEN_STATUS_NOT_SUPPORTED";
- case miopenStatusUnknownError: return "MIOPEN_STATUS_UNKNOWN_ERROR";
- default: return "MIOPEN_STATUS_UNKNOWN_ERROR";
- }
-}
-} // namespace internal
-
-// A macro that wraps around a miopen statement so we can check if the miopen
-// execution finishes or not.
-#define MIOPEN_ENFORCE(condition) \
- do \
- { \
- miopenStatus_t status = condition; \
- CAFFE_ENFORCE_EQ(status, \
- miopenStatusSuccess, \
- ", Error at: ", \
- __FILE__, \
- ":", \
- __LINE__, \
- ": ", \
- ::caffe2::internal::miopenGetErrorString(status)); \
- } while(0)
-#define MIOPEN_CHECK(condition) \
- do \
- { \
- miopenStatus_t status = condition; \
- CHECK(status == miopenStatusSuccess) << ::caffe2::internal::miopenGetErrorString(status); \
- } while(0)
-
-// report the version of miopen Caffe2 was compiled with
-inline size_t miopenCompiledVersion() { return MIOPEN_VERSION; }
-
-// report the runtime version of miopen
-inline size_t miopenRuntimeVersion() { return MIOPEN_VERSION; }
-
-// Check compatibility of compiled and runtime miopen versions
-inline void CheckMIOPENVersions() {}
-
-/**
- * miopenTypeWrapper is a wrapper class that allows us to refer to the miopen type
- * in a template function. The class is specialized explicitly for different
- * data types below.
- */
-template <typename T>
-class miopenTypeWrapper;
-
-template <>
-class miopenTypeWrapper<float>
-{
- public:
- static const miopenDataType_t type = miopenFloat;
- typedef const float ScalingParamType;
- typedef float BNParamType;
- static ScalingParamType* kOne()
- {
- static ScalingParamType v = 1.0;
- return &v;
- }
- static const ScalingParamType* kZero()
- {
- static ScalingParamType v = 0.0;
- return &v;
- }
-};
-
-template <>
-class miopenTypeWrapper<at::Half>
-{
- public:
- static const miopenDataType_t type = miopenHalf;
- typedef const float ScalingParamType;
- typedef float BNParamType;
- static ScalingParamType* kOne()
- {
- static ScalingParamType v = 1.0;
- return &v;
- }
- static ScalingParamType* kZero()
- {
- static ScalingParamType v = 0.0;
- return &v;
- }
-};
-
-/**
- * miopenTensorDescWrapper is the placeholder that wraps around a
- * miopenTensorDescriptor_t, allowing us to do descriptor change as-needed during
- * runtime.
- */
-class miopenTensorDescWrapper
-{
- public:
- miopenTensorDescWrapper() { MIOPEN_ENFORCE(miopenCreateTensorDescriptor(&desc_)); }
- ~miopenTensorDescWrapper() noexcept { MIOPEN_CHECK(miopenDestroyTensorDescriptor(desc_)); }
-
- inline miopenTensorDescriptor_t
- Descriptor(const miopenDataType_t type, const vector<int>& dims, bool* changed)
- {
- if(type_ == type && dims_ == dims)
- {
- // if not changed, simply return the current descriptor.
- if(changed)
- *changed = false;
- return desc_;
- }
- CAFFE_ENFORCE_EQ(
- dims.size(), 4, "MIOPEN currently only support 4-dimensional tensor descriptor");
-
- type_ = type;
- dims_ = dims;
- MIOPEN_ENFORCE(
- miopenSet4dTensorDescriptor(desc_, type, dims_[0], dims_[1], dims_[2], dims_[3]));
- if(changed)
- *changed = true;
- return desc_;
- }
-
- template <typename T>
- inline miopenTensorDescriptor_t Descriptor(const StorageOrder& order, const vector<int>& dims)
- {
- return Descriptor(miopenTypeWrapper<T>::type, dims, nullptr);
- }
-
- private:
- miopenTensorDescriptor_t desc_;
- miopenDataType_t type_;
- vector<int> dims_;
- C10_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper);
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_COMMON_MIOPEN_H_
diff --git a/caffe2/core/hip/common_miopen.hip b/caffe2/core/hip/common_miopen.hip
deleted file mode 100644
index a617bad..0000000
--- a/caffe2/core/hip/common_miopen.hip
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "caffe2/core/hip/common_miopen.h"
-#include "caffe2/core/hip/miopen_wrapper.h"
-
-#include "caffe2/core/init.h"
-
-namespace caffe2 {
-
-MIOPENWrapper::PerGPUMIOPENStates& MIOPENWrapper::miopen_states()
-{
- // New it (never delete) to avoid calling the destructors on process
- // exit and racing against the CUDA shutdown sequence.
- static auto* p = new MIOPENWrapper::PerGPUMIOPENStates();
- TORCH_CHECK_NOTNULL(p);
- return *p;
-}
-
-namespace {
-bool PrintMIOPENInfo(int*, char***)
-{
- VLOG(1) << "Caffe2 is built with MIOPEN version " << MIOPEN_VERSION;
- return true;
-}
-
-REGISTER_CAFFE2_INIT_FUNCTION(PrintMIOPENInfo, &PrintMIOPENInfo, "Print MIOPEN Info.");
-
-} // namespace
-} // namespace caffe2
diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h
deleted file mode 100644
index f60bed6..0000000
--- a/caffe2/core/hip/miopen_wrapper.h
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-#ifndef CAFFE2_CORE_MIOPEN_WRAPPERS_H_
-#define CAFFE2_CORE_MIOPEN_WRAPPERS_H_
-
-#include "caffe2/core/hip/common_miopen.h"
-#include "caffe2/core/hip/context_gpu.h"
-
-#include <c10/hip/HIPGuard.h>
-
-namespace caffe2 {
-
-class MIOPENWrapper;
-
-/**
- * MIOPENWorkspace is a wrapper around a raw cuda pointer that holds the miopen
- * scratch space. This struct is meant to be only used in MIOPENWrapper to
- * provide a program-wide scratch space for MIOPEN. The reason behind it is that
- * miopen function calls are usually very efficient, hence one probably does not
- * want to run multiple miopen calls at the same time. As a result, one should
- * not need more than one miopen workspace per device.
- */
-struct MIOPENWorkspace
-{
- ~MIOPENWorkspace() noexcept {}
-
- void* get(size_t nbytes)
- {
- if(nbytes_ < nbytes)
- {
- reset();
- data_ = HIPContext::New(nbytes);
- nbytes_ = nbytes;
- }
- CAFFE_ENFORCE_GE(nbytes_, nbytes);
- return data_.get();
- }
-
- void reset()
- {
- data_.clear();
- nbytes_ = 0;
- }
-
- private:
- at::DataPtr data_;
- size_t nbytes_{0};
-};
-
-// MIOPENState is the owner of the MIOPENWorkspace, and serializes all
-// executions of operations that use the state onto it's own stream
-// (so multiple Net workers can reuse the same workspace from
-// different threads and HIP streams).
-class MIOPENState
-{
- public:
- explicit MIOPENState(size_t gpu_id) : gpu_id_(gpu_id)
- {
- HIPGuard g(gpu_id_);
- MIOPEN_ENFORCE(miopenCreate(&miopen_handle_));
- HIP_ENFORCE(hipEventCreate(&before_));
- HIP_ENFORCE(hipEventCreate(&after_));
- HIP_ENFORCE(hipStreamCreate(&stream_));
- MIOPEN_ENFORCE(miopenSetStream(miopen_handle_, stream_));
- }
-
- ~MIOPENState() noexcept
- {
- HIPGuard g(gpu_id_);
- MIOPEN_CHECK(miopenDestroy(miopen_handle_));
- HIP_CHECK(hipStreamDestroy(stream_));
- HIP_CHECK(hipEventDestroy(after_));
- HIP_CHECK(hipEventDestroy(before_));
- }
-
- miopenHandle_t& miopen_handle() { return miopen_handle_; }
-
- MIOPENWorkspace& workspace() { return workspace_; }
-
- template <typename F>
- void execute(hipStream_t stream, F&& f)
- {
- HIP_ENFORCE(hipEventRecord(before_, stream));
- HIP_ENFORCE(hipStreamWaitEvent(stream_, before_, 0));
- f(this);
- HIP_ENFORCE(hipEventRecord(after_, stream_));
- HIP_ENFORCE(hipStreamWaitEvent(stream, after_, 0));
- }
-
- private:
- miopenHandle_t miopen_handle_{nullptr};
- hipEvent_t before_{nullptr};
- hipEvent_t after_{nullptr};
- hipStream_t stream_{nullptr};
- MIOPENWorkspace workspace_;
- size_t gpu_id_{0};
- C10_DISABLE_COPY_AND_ASSIGN(MIOPENState);
-};
-
-/**
- * MIOPENWrapper is a class that wraps the miopen handles and miopen workspaces.
- *
- * The wrapper ensures that for each thread and each gpu, there is one
- * identical miopen handle, which is also associated with the thread-local
- * per-device hip stream. The wrapper also hosts the device-specific miopen
- * workspace (scratch space for some miopen functions).
- *
- */
-class MIOPENWrapper
-{
- public:
- /**
- * Creates a miopen wrapper associated with a HIPContext object. Note that
- * the HIPContext object should outlive the MIOPENWrapper.
- */
- explicit MIOPENWrapper(HIPContext* context) : context_(context) {}
-
- /**
- * Returns the inline miopen handle that executes on the current
- * thread's hip_stream.
- */
- miopenHandle_t inline_miopen_handle() { return context_->miopen_handle(); }
-
- // Executes the closure F on the MIOPENState associated with state_idx
- template <typename F>
- void with_miopen_state(size_t state_idx, F&& f)
- {
- CAFFE_ENFORCE(state_idx < CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES, "Invalid state_idx");
- auto& sync_state = miopen_states()[context_->device_id()][state_idx];
-
- HIPGuard dg(context_->device_id());
-
- // We need to serialize execution on the MIOPENState as we can't
- // allow multiple threads to race through the cudaEventRecord
- // calls (so a worker thread might wait on another worker thread's
- // execution)
- std::lock_guard<std::mutex> g(sync_state.mutex);
- if(!sync_state.state.get())
- {
- sync_state.state.reset(new MIOPENState(context_->device_id()));
- }
- TORCH_CHECK_NOTNULL(sync_state.state.get())->execute(context_->hip_stream(), f);
- }
-
- protected:
- // Pointer to an external cuda context that the miopen wrapper will use.
- HIPContext* context_;
-
- static constexpr size_t CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES = 4;
-
- struct SyncedMIOPENState
- {
- std::mutex mutex;
- std::unique_ptr<MIOPENState> state;
- };
-
- using PerGPUMIOPENStates = std::array<
- std::array<SyncedMIOPENState, CAFFE2_COMPILE_TIME_MAX_MIOPEN_STATES>,
- C10_COMPILE_TIME_MAX_GPUS>;
- static PerGPUMIOPENStates& miopen_states();
-
- C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper);
-};
-
-}; // namespace caffe2
-
-#endif
diff --git a/caffe2/core/init.h b/caffe2/core/init.h
deleted file mode 100644
index 8d0fbd3..0000000
--- a/caffe2/core/init.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef CAFFE2_CORE_INIT_H_
-#define CAFFE2_CORE_INIT_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/flags.h"
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-namespace internal {
-class TORCH_API Caffe2InitializeRegistry {
- public:
- typedef bool (*InitFunction)(int*, char***);
- // Registry() is defined in .cpp file to make registration work across
- // multiple shared libraries loaded with RTLD_LOCAL
- static Caffe2InitializeRegistry* Registry();
-
- void Register(
- InitFunction function,
- bool run_early,
- const char* description,
- const char* name = nullptr) {
- if (name) {
- named_functions_[name] = function;
- }
- if (run_early) {
- // Disallow registration after GlobalInit of early init functions
- CAFFE_ENFORCE(!early_init_functions_run_yet_);
- early_init_functions_.emplace_back(function, description);
- } else {
- if (init_functions_run_yet_) {
- // Run immediately, since GlobalInit already ran. This should be
- // rare but we want to allow it in some cases.
- LOG(WARNING) << "Running init function after GlobalInit: "
- << description;
- // TODO(orionr): Consider removing argc and argv for non-early
- // registration. Unfortunately that would require a new InitFunction
- // typedef, so not making the change right now.
- //
- // Note that init doesn't receive argc and argv, so the function
- // might fail and we want to raise an error in that case.
- int argc = 0;
- char** argv = nullptr;
- bool success = (function)(&argc, &argv);
- CAFFE_ENFORCE(success);
- } else {
- // Wait until GlobalInit to run
- init_functions_.emplace_back(function, description);
- }
- }
- }
-
- bool RunRegisteredEarlyInitFunctions(int* pargc, char*** pargv) {
- CAFFE_ENFORCE(!early_init_functions_run_yet_);
- early_init_functions_run_yet_ = true;
- return RunRegisteredInitFunctionsInternal(
- early_init_functions_, pargc, pargv);
- }
-
- bool RunRegisteredInitFunctions(int* pargc, char*** pargv) {
- CAFFE_ENFORCE(!init_functions_run_yet_);
- init_functions_run_yet_ = true;
- return RunRegisteredInitFunctionsInternal(init_functions_, pargc, pargv);
- }
-
- bool RunNamedFunction(const char* name, int* pargc, char*** pargv) {
- if (named_functions_.count(name)) {
- return named_functions_[name](pargc, pargv);
- }
- return false;
- }
-
- private:
- // Run all registered initialization functions. This has to be called AFTER
- // all static initialization are finished and main() has started, since we are
- // using logging.
- bool RunRegisteredInitFunctionsInternal(
- vector<std::pair<InitFunction, const char*>>& functions,
- int* pargc, char*** pargv) {
- for (const auto& init_pair : functions) {
- VLOG(1) << "Running init function: " << init_pair.second;
- if (!(*init_pair.first)(pargc, pargv)) {
- LOG(ERROR) << "Initialization function failed.";
- return false;
- }
- }
- return true;
- }
-
- Caffe2InitializeRegistry() {}
- vector<std::pair<InitFunction, const char*> > early_init_functions_;
- vector<std::pair<InitFunction, const char*> > init_functions_;
- std::unordered_map<std::string, InitFunction> named_functions_;
- bool early_init_functions_run_yet_ = false;
- bool init_functions_run_yet_ = false;
-};
-} // namespace internal
-
-TORCH_API bool unsafeRunCaffe2InitFunction(
- const char* name,
- int* pargc = nullptr,
- char*** pargv = nullptr);
-
-class TORCH_API InitRegisterer {
- public:
- InitRegisterer(
- internal::Caffe2InitializeRegistry::InitFunction function,
- bool run_early,
- const char* description,
- const char* name = nullptr) {
- internal::Caffe2InitializeRegistry::Registry()->Register(
- function, run_early, description, name);
- }
-};
-
-#define REGISTER_CAFFE2_INIT_FUNCTION(name, function, description) \
- namespace { \
- ::caffe2::InitRegisterer \
- g_caffe2_initregisterer_##name(function, false, description, #name); \
- } // namespace
-
-#define REGISTER_CAFFE2_EARLY_INIT_FUNCTION(name, function, description) \
- namespace { \
- ::caffe2::InitRegisterer \
- g_caffe2_initregisterer_##name(function, true, description, #name); \
- } // namespace
-
-/**
- * @brief Determine whether GlobalInit has already been run
- */
-TORCH_API bool GlobalInitAlreadyRun();
-
-class TORCH_API GlobalInitIsCalledGuard {
- public:
- GlobalInitIsCalledGuard() {
- if (!GlobalInitAlreadyRun()) {
- LOG(WARNING)
- << "Caffe2 GlobalInit should be run before any other API calls.";
- }
- }
-};
-
-/**
- * @brief Initialize the global environment of caffe2.
- *
- * Caffe2 uses a registration pattern for initialization functions. Custom
- * initialization functions should take the signature
- * bool (*func)(int*, char***)
- * where the pointers to argc and argv are passed in. Caffe2 then runs the
- * initialization in three phases:
- * (1) Functions registered with REGISTER_CAFFE2_EARLY_INIT_FUNCTION. Note that
- * since it is possible the logger is not initialized yet, any logging in
- * such early init functions may not be printed correctly.
- * (2) Parses Caffe-specific commandline flags, and initializes caffe logging.
- * (3) Functions registered with REGISTER_CAFFE2_INIT_FUNCTION.
- * If there is something wrong at each stage, the function returns false. If
- * the global initialization has already been run, the function returns false
- * as well.
- *
- * GlobalInit is re-entrant safe; a re-entrant call will no-op and exit.
- *
- * GlobalInit is safe to call multiple times but not idempotent;
- * successive calls will parse flags and re-set caffe2 logging levels from
- * flags as needed, but NOT re-run early init and init functions.
- *
- * GlobalInit is also thread-safe and can be called concurrently.
- */
-TORCH_API bool GlobalInit(int* pargc, char*** argv);
-
-/**
- * @brief Initialize the global environment without command line arguments
- *
- * This is a version of the GlobalInit where no argument is passed in.
- * On mobile devices, use this global init, since we cannot pass the
- * command line options to caffe2, no arguments are passed.
- */
-TORCH_API bool GlobalInit();
-} // namespace caffe2
-#endif // CAFFE2_CORE_INIT_H_
diff --git a/caffe2/core/net.h b/caffe2/core/net.h
deleted file mode 100644
index 0726d8e..0000000
--- a/caffe2/core/net.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CAFFE2_CORE_NET_H_
-#define CAFFE2_CORE_NET_H_
-
-#include <atomic>
-#include <climits>
-#include <cstddef>
-#include <thread> // NOLINT
-#include <typeinfo>
-#include <unordered_map>
-#include <vector>
-
-#include "c10/core/thread_pool.h"
-#include "c10/util/Registry.h"
-#include "caffe2/core/blob.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/observer.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/simple_queue.h"
-
-C10_DECLARE_string(caffe2_override_executor);
-
-namespace caffe2 {
-
-class NetBase;
-typedef ObserverBase<NetBase> NetObserver;
-typedef std::function<std::unique_ptr<NetObserver>(NetBase*)>
- NetObserverCreator;
-
-class OperatorBase;
-class Workspace;
-
-// Net is a thin struct that owns all the operators together with the operator
-// contexts.
-class TORCH_API NetBase : public Observable<NetBase> {
- public:
- NetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
- virtual ~NetBase() noexcept {}
-
- virtual bool SupportsAsync() = 0;
- inline const vector<const Event*>& events() const {
- return events_;
- }
-
- virtual void Wait() {
- // by default just wait till all events are finished
- for (const auto& event : events_) {
- event->Finish();
- }
- }
-
- virtual bool Run() {
- if (!RunAsync()) {
- LOG(ERROR) << "Failed to execute async run";
- return false;
- }
- Wait();
- return handleRunError();
- }
-
- virtual bool RunAsync();
-
- virtual void Cancel();
-
- /* Benchmarks a network for one individual run so that we can feed new
- * inputs on additional calls.
- * This function returns the number of microseconds spent
- * during the benchmark
- */
- virtual float TEST_Benchmark_One_Run();
-
- /**
- * Benchmarks a network.
- *
- * This function returns a vector of float recording the number of milli-
- * seconds spent during the benchmark. The 0-th item is the time spent per
- * each network run, and if a net instantiation supports run_individual,
- * the remainder of the vector returns the number of milliseconds spent per
- * operator.
- */
- virtual vector<float> TEST_Benchmark(
- const int /*warmup_runs*/,
- const int /*main_runs*/,
- const bool /*run_individual*/);
-
- inline const vector<string>& external_output() const {
- return external_output_;
- }
-
- inline const vector<string>& external_input() const {
- return external_input_;
- }
-
- /* Used to attach Observers to operators of a Net
- *
- * Returns pointers to objects owned with unique_ptrs.
- * Use with caution.
- */
- virtual vector<OperatorBase*> GetOperators() const = 0;
-
- const string& Name() const {
- return name_;
- }
-
- inline const NetDef& debug_def() const {
- CAFFE_ENFORCE(has_debug_def(), "net_def was null!");
- return *net_def_;
- }
-
- inline bool has_debug_def() const {
- return net_def_ != nullptr;
- }
-
- protected:
- virtual bool DoRunAsync() {
- CAFFE_THROW("Not implemented");
- };
-
- virtual bool handleRunError() {
- for (const Event* event : events_) {
- if (event->Query() != EventStatus::EVENT_SUCCESS) {
- CAFFE_THROW(event->ErrorMessage());
- }
- }
- return true;
- }
-
- vector<string> external_input_;
- vector<string> external_output_;
- string name_;
- vector<const Event*> events_;
- std::shared_ptr<const NetDef> net_def_;
- C10_DISABLE_COPY_AND_ASSIGN(NetBase);
-};
-
-class TORCH_API ExecutorHelper {
- public:
- ExecutorHelper() {}
- virtual TaskThreadPoolBase* GetPool(const DeviceOption& option) const;
- virtual std::vector<OperatorBase*> GetOperators() const;
- virtual int GetNumWorkers() const;
- virtual ~ExecutorHelper() {}
-};
-
-C10_DECLARE_REGISTRY(
- NetRegistry,
- NetBase,
- const std::shared_ptr<const NetDef>&,
- Workspace*);
-#define REGISTER_NET_CREATOR(key, ...) \
- C10_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__)
-#define REGISTER_NET(name, ...) \
- C10_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__)
-
-/**
- * @brief Creates a network, accessing / creating blobs in the given workspace.
- *
- * Note that this is different from Workspace::CreateNet. The latter adds the
- * created net object to the workspace's net map, while this function returns
- * a standalone net object.
- */
-TORCH_API unique_ptr<NetBase> CreateNet(const NetDef& net_def, Workspace* ws);
-TORCH_API unique_ptr<NetBase> CreateNet(
- const std::shared_ptr<const NetDef>& net_def,
- Workspace* ws);
-
-TORCH_API void AddGlobalNetObserverCreator(NetObserverCreator creator);
-
-TORCH_API void ClearGlobalNetObservers();
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_NET_H_
diff --git a/caffe2/core/numa.h b/caffe2/core/numa.h
deleted file mode 100644
index 8424d54..0000000
--- a/caffe2/core/numa.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#pragma once
-#include "c10/util/numa.h"
-#include "caffe2/core/common.h"
diff --git a/caffe2/core/observer.h b/caffe2/core/observer.h
deleted file mode 100644
index 3897bb7..0000000
--- a/caffe2/core/observer.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <unordered_set>
-
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-/**
- * Use this to implement a Observer using the Observer Pattern template.
- */
-
-template <class T>
-class ObserverBase {
- public:
- explicit ObserverBase(T* subject) : subject_(subject) {}
-
- virtual void Start() {}
- virtual void Stop() {}
-
- virtual std::string debugInfo() {
- return "Not implemented.";
- }
-
- virtual ~ObserverBase() noexcept {}
-
- T* subject() const {
- return subject_;
- }
-
- virtual std::unique_ptr<ObserverBase<T>> rnnCopy(T* subject, int rnn_order)
- const {
- return nullptr;
- }
-
- protected:
- T* subject_;
-};
-
-/**
- * Inherit to make your class observable.
- */
-template <class T>
-class Observable {
- public:
- Observable() = default;
-
- Observable(Observable&&) = default;
- Observable& operator =(Observable&&) = default;
-
- virtual ~Observable() = default;
-
- C10_DISABLE_COPY_AND_ASSIGN(Observable);
-
- using Observer = ObserverBase<T>;
-
- /* Returns a reference to the observer after addition. */
- const Observer* AttachObserver(std::unique_ptr<Observer> observer) {
- CAFFE_ENFORCE(observer, "Couldn't attach a null observer.");
- std::unordered_set<const Observer*> observers;
- for (auto& ob : observers_list_) {
- observers.insert(ob.get());
- }
-
- const auto* observer_ptr = observer.get();
- if (observers.count(observer_ptr)) {
- return observer_ptr;
- }
- observers_list_.push_back(std::move(observer));
- UpdateCache();
-
- return observer_ptr;
- }
-
- /**
- * Returns a unique_ptr to the removed observer. If not found, return a
- * nullptr
- */
- std::unique_ptr<Observer> DetachObserver(const Observer* observer_ptr) {
- for (auto it = observers_list_.begin(); it != observers_list_.end(); ++it) {
- if (it->get() == observer_ptr) {
- auto res = std::move(*it);
- observers_list_.erase(it);
- UpdateCache();
- return res;
- }
- }
- return nullptr;
- }
-
- virtual size_t NumObservers() {
- return num_observers_;
- }
-
- private:
- inline static void StartObserver(Observer* observer) {
- try {
- observer->Start();
- } catch (const std::exception& e) {
- LOG(ERROR) << "Exception from observer: " << e.what();
- } catch (...) {
- LOG(ERROR) << "Exception from observer: unknown";
- }
- }
-
- inline static void StopObserver(Observer* observer) {
- try {
- observer->Stop();
- } catch (const std::exception& e) {
- LOG(ERROR) << "Exception from observer: " << e.what();
- } catch (...) {
- LOG(ERROR) << "Exception from observer: unknown";
- }
- }
-
- void UpdateCache() {
- num_observers_ = observers_list_.size();
- if (num_observers_ != 1) {
- // we cannot take advantage of the cache
- return;
- }
- observer_cache_ = observers_list_[0].get();
- }
-
- public:
- void StartAllObservers() {
- // do not access observers_list_ unless necessary
- if (num_observers_ == 0) {
- return;
- } else if (num_observers_ == 1) {
- StartObserver(observer_cache_);
- } else {
- for (auto& observer : observers_list_) {
- StartObserver(observer.get());
- }
- }
- }
-
- void StopAllObservers() {
- // do not access observers_list_ unless necessary
- if (num_observers_ == 0) {
- return;
- } else if (num_observers_ == 1) {
- StopObserver(observer_cache_);
- } else {
- for (auto& observer : observers_list_) {
- StopObserver(observer.get());
- }
- }
- }
-
- private:
- // an on-stack cache for fast iteration;
- // ideally, inside StartAllObservers and StopAllObservers,
- // we should never access observers_list_
- Observer* observer_cache_;
- size_t num_observers_ = 0;
-
- protected:
- std::vector<std::unique_ptr<Observer>> observers_list_;
-};
-
-} // namespace caffe2
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
deleted file mode 100644
index 3277357..0000000
--- a/caffe2/core/operator.h
+++ /dev/null
@@ -1,1600 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_H_
-#define CAFFE2_CORE_OPERATOR_H_
-
-#include <array>
-#include <cfenv>
-#include <climits>
-#include <cstddef>
-#include <exception>
-#include <functional>
-#include <set>
-#include <sstream>
-#include <string>
-#include <typeinfo>
-#include <vector>
-
-#include <c10/macros/Macros.h>
-#include <c10/util/Registry.h>
-#include <c10/util/string_view.h>
-#include <c10/util/typeid.h>
-#include <c10/core/Stream.h>
-#include "caffe2/core/blob.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/net.h"
-#include "caffe2/core/observer.h"
-#include "caffe2/core/operator_gradient.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/core/tensor_int8.h"
-#include "caffe2/core/types.h"
-#include "caffe2/core/workspace.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/proto_utils.h"
-
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-#include <ATen/core/TensorBody.h>
-#include <ATen/core/function_schema.h>
-#include <ATen/core/ivalue.h>
-#endif
-
-C10_CLANG_DIAGNOSTIC_PUSH()
-#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32")
-C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
-#endif
-
-C10_DECLARE_bool(caffe2_operator_throw_if_fp_exceptions);
-C10_DECLARE_bool(caffe2_operator_throw_if_fp_overflow_exceptions);
-#ifdef __GNU_LIBRARY__
-C10_DECLARE_bool(caffe2_operator_throw_on_first_occurrence_if_fp_exceptions);
-#endif
-
-namespace c10 {
-struct FunctionSchema;
-}
-
-namespace caffe2 {
-
-class TORCH_API OperatorBase;
-typedef ObserverBase<OperatorBase> OperatorObserver;
-
-class TORCH_API OperatorBase : public Observable<OperatorBase> {
- public:
- explicit OperatorBase(const OperatorDef& operator_def, Workspace* ws);
-
- /*
- * Notes: All outputs ivalues must be tensors. Input ivalue list must start
- * with all tensors ("inputs" in caffe2 terminology),
- * followed by non-tensors ("arguments" in caffe2 terminology).
- * Alternatively, inputs can be one tensor list ivalue followed by non-tensors
- * to represent operators with a variable number of inputs.
- */
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- explicit OperatorBase(
- const c10::FunctionSchema& schema,
- std::vector<c10::IValue> inputs,
- std::vector<caffe2::Tensor> outputs);
-#endif
-
- virtual ~OperatorBase() noexcept;
-
- /** @brief Return true if the operator was instantiated with OperatorDef
- * New operators should be instantiated with FunctionSchema
- */
- bool isLegacyOperator() const {
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- return !fn_schema_;
-#else
- return true;
-#endif
- }
-
- const c10::FunctionSchema& getFunctionSchema() const {
- CAFFE_ENFORCE(!isLegacyOperator());
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- return *fn_schema_.get();
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
-
- /** @brief Checks if the operator has an argument of the given name.
- */
- inline bool HasArgument(c10::string_view name) const {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::HasArgument(*operator_def_, name);
- }
- return argumentIndexWithName(name).has_value();
- }
-
- // Functions that deal with arguments. Basically, this allows us to map an
- // argument name to a specific type of argument that we are trying to access.
- template <typename T>
- inline T GetSingleArgument(c10::string_view name, const T& default_value) const {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::GetSingleArgument<OperatorDef, T>(
- *operator_def_, name, default_value);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- auto index = argumentIndexWithName(name);
- CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name);
- const auto& value = newstyle_inputs_[index.value()];
- return value.template to<T>();
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
-
- template <typename T>
- inline bool HasSingleArgumentOfType(c10::string_view name) const {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::HasSingleArgumentOfType<OperatorDef, T>(
- *operator_def_, name);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- template <typename T>
- inline vector<T> GetVectorFromIValueList(const c10::IValue& value) const {
- return value.template to<List<T>>().vec();
- }
-#endif
-
- template <typename T>
- inline vector<T> GetRepeatedArgument(
- c10::string_view name,
- const vector<T>& default_value = {}) const;
-
- // Get the inputs and outputs as specific types.
- template <typename T>
- inline const T& Input(int idx) {
- static_assert(
- !std::is_same<T, Tensor>::value,
- "You should use Input<Tensor>(int, DeviceType) for "
- "Tensor.");
- TORCH_DCHECK_LT((size_t)idx, inputs_.size());
- try {
- return inputs_.at(idx)->template Get<T>();
- } catch (::caffe2::EnforceNotMet& enf) {
- if (has_debug_def()) {
- TORCH_RETHROW(enf, "Offending Blob name: ", debug_def().input(idx), ".");
- }
- throw enf;
- }
- }
-
- // TODO(jerryzh): Remove template
- // and the type argument?
- // This is to keep the API changes minimal and make refactoring
- // a bit easier
- template <typename T>
- inline const T& Input(int idx, DeviceType type) {
- if (isLegacyOperator()) {
- static_assert(
- std::is_same<T, Tensor>::value,
- "Input(int, DeviceType) is only available for Tensor");
- TORCH_DCHECK_LT((size_t)idx, inputs_.size());
- try {
- // TODO(jerryzh): We'll need to check device type in Get<T>() later
- // Get<T>() -> Get<T>(type)
- const auto& tensor = inputs_.at(idx)->template Get<T>();
- return tensor;
- } catch (::caffe2::EnforceNotMet& enf) {
- if (has_debug_def()) {
- TORCH_RETHROW(enf, "Offending Blob name: ", debug_def().input(idx), ".");
- }
- throw enf;
- }
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- TORCH_DCHECK_LT(0U, newstyle_inputs_.size());
- IValue ival;
- if (newstyle_inputs_[0].isTensorList()) {
- // if the first input is a tensor list, we get input tensors by indexing
- // into that list. currently, this means that only tensors from that list
- // are accessible as inputs. any hypothetical input tensors that come
- // after the list are not accessible.
- auto tensorList = newstyle_inputs_[0].toTensorVector();
- TORCH_DCHECK_LT((size_t)idx, tensorList.size());
- ival = tensorList[idx];
- } else {
- // if the first input is not a tensor list, we get input tensors by
- // indexing into the inputs.
- TORCH_DCHECK_LT((size_t)idx, newstyle_inputs_.size());
- ival = newstyle_inputs_[idx];
- }
- CAFFE_ENFORCE(
- ival.isTensor(),
- "Input(int, DeviceType) is only available for IValues that store Tensors");
- auto t = ival.toTensor();
- if (!t.is_contiguous()) {
- t = t.contiguous();
- }
- Tensor tensor = caffe2::Tensor(std::move(t));
- CAFFE_ENFORCE_EQ(tensor.GetDeviceType(), type);
- input_tensors_[idx] = std::move(tensor);
- return input_tensors_[idx];
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
-
- template <typename T>
- inline T* Output(int idx) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "Output(idx) not supported for operators exported to c10. Please use XOutput instead.");
-
- static_assert(
- !std::is_same<T, Tensor>::value,
- "You should use Output<Tensor>(int, DeviceType) for "
- "Tensor.");
- return outputs_.at(idx)->template GetMutable<T>();
- }
-
- // TODO(jerryzh): Remove this template
- template <typename T>
- inline T* Output(int idx, DeviceType type) {
- if (isLegacyOperator()) {
- static_assert(
- std::is_same<T, Tensor>::value,
- "Output(int, DeviceType) is only available for Tensor");
- // When you get a Tensor here it is not fully initialized
- return BlobGetMutableTensor(outputs_.at(idx), type);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- auto &output = output_tensors_[idx];
- if (!output.defined() || output.GetDeviceType() != type) {
- // Fix tensor type
- output = Tensor(type);
- }
- return &output;
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
-
- inline Tensor
- XOutputTensor(int idx, at::IntArrayRef dims, at::TensorOptions options) {
- CAFFE_ENFORCE_WITH_CALLER(
- options.device_opt() != c10::nullopt,
- "device must be provided in option.");
- if (isLegacyOperator()) {
- return XBlobGetMutableTensor(outputs_.at(idx), dims, options);
- }
-
- return OutputTensor(idx, dims, options)->UnsafeSharedInstance();
- }
-
- void SetOutputTensor(int idx, Tensor tensor) {
- if (!isLegacyOperator()) {
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- output_tensors_[idx] = std::move(tensor);
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- } else {
- // update the tensor in the workspace
- BlobSetTensor(outputs_.at(idx), std::move(tensor));
- }
- }
-
- Tensor OutputTensorOrUndefined(int idx) {
- if (isLegacyOperator()) {
- return BlobGetTensorOrUndefined(*outputs_.at(idx));
- }
- return output_tensors_[idx].UnsafeSharedInstance();
- }
-
- inline Tensor*
- OutputTensor(int idx, at::IntArrayRef dims, at::TensorOptions options) {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE_WITH_CALLER(
- options.device_opt() != c10::nullopt,
- "device must be provided in options.");
- return BlobGetMutableTensor(outputs_.at(idx), dims, options);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- auto &output = output_tensors_[idx];
- output = output.defined()
- ? GetSizedTensorWithOptions(std::move(output), dims, options)
- : caffe2::empty(dims, options);
-
- return &output;
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
-
- // Get output Tensor of the operator and CopyFrom the given Tensor
- Tensor* OutputTensorCopyFrom(
- int idx,
- at::TensorOptions options,
- const Tensor& src,
- bool async = false) {
- CAFFE_ENFORCE_WITH_CALLER(
- options.device_opt() != c10::nullopt,
- "device must be provided in options.");
- // Ouptut Tensor will always have the same data type as `src`
- if (!options.has_dtype()) {
- options = options.dtype(src.dtype());
- }
- CAFFE_ENFORCE_WITH_CALLER(
- options.dtype() == src.dtype(),
- "We don't allow change of src data type in OutputTensorCopyFrom");
- Tensor* t = OutputTensor(idx, src.sizes(), options);
- t->CopyFrom(src, async);
- return t;
- }
-
- Tensor* OutputTensorAlias(int idx, const Tensor& src) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "OutputTensorAlias(idx, src) not (yet) supported for operators exported to c10.");
- return BlobSetTensor(OutputBlob(idx), src.Alias());
- }
-
- template <typename T>
- inline T* Output(int idx, T* allocated) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "Output(idx, allocated) not supported for operators exported to c10. Please use XOutput.");
- outputs_.at(idx)->Reset(allocated);
- return allocated;
- }
-
- inline const Blob& InputBlob(int idx) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "InputBlob(idx) not (yet) supported for operators exported to c10.");
- return *inputs_.at(idx);
- }
-
- inline Blob* OutputBlob(int idx) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "OutputBlob(idx) not (yet) supported for operators exported to c10.");
- return outputs_.at(idx);
- }
-
- // Check whether output j is an alias of input i by comparing Blob pointers,
- // note this does not check if the two Blobs points to the same Tensor, or if
- // the Tensor pointers point to the same TensorImpl, or if the Storages alias
- inline bool IsInputOutputAlias(int i, int j) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "IsInputOutputAlias(i, j) not (yet) supported for operators exported to c10.");
- return inputs_.at(i) == outputs_.at(j);
- }
-
- template <typename T>
- inline bool InputIsType(int idx) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "InputIsType(idx) not (yet) supported for operators exported to c10.");
- static_assert(
- !std::is_same<T, Tensor>::value,
- "You should use InputIsTensorType(int, DeviceType) for "
- "Tensor.");
- return inputs_.at(idx)->template IsType<T>();
- }
-
- inline bool InputIsTensorType(int idx, DeviceType device_type) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "InputIsTensorType(idx, device_type) not (yet) supported for operators exported to c10.");
- return BlobIsTensorType(*inputs_.at(idx), device_type);
- }
-
- template <typename T>
- inline bool OutputIsType(int idx) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "OutputIsType(idx) not (yet) supported for operators exported to c10.");
- static_assert(
- !std::is_same<T, Tensor>::value,
- "You should use OutputIsTensorType(int, DeviceType) for "
- "Tensor.");
- return outputs_.at(idx)->template IsType<T>();
- }
-
- inline bool OutputIsTensorType(int idx, DeviceType type) {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "OutputIsTensorType(idx, type) not (yet) supported for operators exported to c10.");
- return BlobIsTensorType(*outputs_.at(idx), type);
- }
-
- inline int InputSize() const {
- return input_size_;
- }
-
- inline int OutputSize() const {
- if (isLegacyOperator()) {
- return outputs_.size();
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- return output_tensors_.size();
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
- }
- inline const vector<const Blob*>& Inputs() const {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "Inputs() not supported for operators exported to c10.");
- return inputs_;
- }
- inline const vector<Blob*>& Outputs() {
- CAFFE_ENFORCE(
- isLegacyOperator(),
- "Outputs() not supported for operators exported to c10.");
- return outputs_;
- }
- vector<TensorShape> InputTensorShapes() const;
-
- virtual void WaitEvent(const Event& ev, int /*stream_id */ = -1) {
- ev.Finish();
- }
-
- inline void Wait(const OperatorBase& other, int stream_id = -1) {
- if (!other.IsEventDisabled()) {
- WaitEvent(other.event(), stream_id);
- }
- }
-
- virtual void WaitEvents(
- const std::vector<const Event*>& events,
- int /*stream_id*/ = -1) {
- for (const auto& ev : events) {
- ev->Finish();
- }
- }
-
- virtual void Finish() {
- if (event_) {
- event_->Finish();
- }
- }
-
- virtual bool Run(int /* unused */ /*stream_id*/ = 0) {
- CAFFE_NOT_IMPLEMENTED;
- }
-
- virtual bool HasAsyncPart() const {
- return false;
- }
-
- virtual bool SupportsAsyncScheduling() const {
- return false;
- }
-
- virtual void CancelAsyncCallback() {}
-
- virtual void Cancel() {}
-
- // RunAsync, if implemented by the specific operators, will schedule the
- // computation on the corresponding context and record the event in its
- // event_ member object. If the specific operator does not support RunAsync,
- // it will simply be synchronous as a fallback.
- virtual bool RunAsync(int stream_id = 0);
-
- virtual void AddRelatedBlobInfo(EnforceNotMet* err);
-
- virtual std::string debug_info_string() const {
- return "";
- }
-
- inline const OperatorDef& debug_def() const {
- CAFFE_ENFORCE(has_debug_def(), "operator_def was null!");
- return *operator_def_;
- }
-
- inline void set_debug_def(
- const std::shared_ptr<const OperatorDef>& operator_def) {
- operator_def_ = operator_def;
- }
-
- inline bool has_debug_def() const {
- return operator_def_ != nullptr;
- }
-
- public:
- void RecordLastFailedOpNetPosition() {
- if (net_position_ != kNoNetPositionSet) {
- VLOG(1) << "Operator with id " << net_position_ << " failed";
- operator_ws_->last_failed_op_net_position = net_position_;
- } else {
- VLOG(1) << "Failed operator doesn't have id set";
- }
- }
-
- int net_position() const {
- return net_position_;
- }
-
- void set_net_position(int idx) {
- net_position_ = idx;
- }
-
- const DeviceOption& device_option() const {
- return device_option_;
- }
-
- const Event& event() const {
- CAFFE_ENFORCE(event_, "Event is disabled");
- return *event_;
- }
-
- Event& event() {
- CAFFE_ENFORCE(event_, "Event is disabled");
- return *event_;
- }
-
- void ResetEvent() {
- if (event_) {
- event_->Reset();
- }
- }
-
- void DisableEvent() {
- event_ = nullptr;
- }
-
- bool IsEventDisabled() const {
- return !event_;
- }
-
- // Internal API invoked by observers. Normal callers shouldn't invoke it.
- virtual void SyncDeviceBarrierForObservers() {
- CAFFE_NOT_IMPLEMENTED;
- }
-
- // Checks whether stream is ready to execute new computation,
- // used in stream allocation optimization to skip stream that is currently
- // busy. Depends on context and operator's device, returns true by default
- virtual bool IsStreamFree(int /* unused */) const {
- return true;
- }
-
- const std::string& type() const {
- return type_;
- }
-
- void annotate_engine(const std::string& engine) {
- engine_ = engine;
- }
-
- const std::string& engine() const {
- return engine_;
- }
-
- void SetExecutorHelper(ExecutorHelper* helper) {
- helper_ = helper;
- }
-
- ExecutorHelper* GetExecutorHelper() const {
- return helper_;
- }
-
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- std::vector<caffe2::Tensor> move_output_tensors() && {
- return std::move(output_tensors_);
- }
-#endif
-
- public:
- static const int kNoNetPositionSet = -1;
-
- private:
- Workspace* operator_ws_;
- std::shared_ptr<const OperatorDef> operator_def_;
- DeviceOption device_option_;
- std::string engine_;
- std::string type_;
- vector<const Blob*> inputs_;
- vector<Blob*> outputs_;
- // Preferably use std::optional, but nvcc doesn't work
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- std::unique_ptr<const c10::FunctionSchema> fn_schema_;
- vector<c10::IValue> newstyle_inputs_;
-#endif
- // HACK
- // We preserve the fact that Output() returns Tensor*
- // by storing Tensor in a vector owned by the
- // operator.
- vector<caffe2::Tensor> input_tensors_;
- vector<caffe2::Tensor> output_tensors_;
-
- int input_size_;
-
- int net_position_{kNoNetPositionSet};
-
- ExecutorHelper* helper_ = nullptr;
-
- protected:
- virtual void RecordEvent(const char* /*err_msg*/ = nullptr) {
- CAFFE_NOT_IMPLEMENTED;
- }
-
- void SetEventFinished(const char* err_msg = nullptr) {
- if (event_) {
- event_->SetFinished(err_msg);
- }
- }
-
- void SetEventFinishedWithException(const char* err_msg = nullptr) {
- if (event_) {
- event_->SetFinishedWithException(err_msg);
- }
- }
-
- std::string getErrorMsg() {
- if (has_debug_def()) {
- return "Error from operator: " + ProtoDebugString(debug_def());
- } else {
- return "Error from operator: no op def";
- }
- }
-
- std::optional<int> argumentIndexWithName(c10::string_view name) const;
-
- // An event used by asynchronous execution.
- std::unique_ptr<Event> event_;
-
- C10_DISABLE_COPY_AND_ASSIGN(OperatorBase);
-};
-
-template <>
-inline NetDef OperatorBase::GetSingleArgument<NetDef>(
- c10::string_view name,
- const NetDef& default_value) const {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::GetSingleArgument<OperatorDef, NetDef>(
- *operator_def_, name, default_value);
- }
- CAFFE_THROW("Cannot get NetDefs from IValue");
- return NetDef();
-}
-
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-template <>
-inline vector<int> OperatorBase::GetVectorFromIValueList<int>(
- const c10::IValue& value) const {
- auto vs = value.toIntVector();
- vector<int> out;
- out.reserve(vs.size());
- for (int64_t v : vs) {
- out.emplace_back(v);
- }
- return out;
-}
-
-template <>
-inline vector<float> OperatorBase::GetVectorFromIValueList<float>(
- const c10::IValue& value) const {
- const auto& vs = value.toDoubleVector();
- vector<float> out;
- out.reserve(vs.size());
- for (double v : vs) {
- out.emplace_back(v);
- }
- return out;
-}
-
-template <>
-inline vector<string> OperatorBase::GetVectorFromIValueList<string>(
- const c10::IValue& value) const {
- auto vs = value.template to<c10::List<string>>();
- vector<string> out;
- out.reserve(vs.size());
- for (string v : vs) {
- out.emplace_back(v);
- }
- return out;
-}
-
-// We need this specialisation because IValue based lists don't support
-// int16_t. We need to load it as List<int64_t> and transform to int16_t.
-template <>
-inline vector<int16_t> OperatorBase::GetVectorFromIValueList<int16_t>(
- const c10::IValue& value) const {
- auto list = value.template to<c10::List<int64_t>>();
- std::vector<int16_t> result;
- result.reserve(list.size());
- for (int64_t elem : list) {
- result.push_back(static_cast<int16_t>(elem));
- }
- return result;
-}
-#endif
-
-// OP_SINGLE_ARG provides a shorter initialization choice for initialization of
-// member variables for the class constructors.
-#define OP_SINGLE_ARG(type, name, variable, default) \
- variable(OperatorBase::GetSingleArgument<type>(name, (default)))
-
-// INPUT_TAGS and OUTPUT_TAGS are optional features to name the indices of the
-// operator's inputs and outputs, in order to avoid confusion. For example, for
-// a fully convolution layer that has input, weight and bias, you can define its
-// input tags as:
-// INPUT_TAGS(INPUT, WEIGHT, BIAS);
-// And in the code, instead of doing
-// auto& weight = Input(1);
-// you can now do
-// auto& weight = Input(WEIGHT);
-// to make it more clear.
-#define INPUT_TAGS(first_input, ...) \
- enum _InputTags { first_input = 0, __VA_ARGS__ }
-#define OUTPUT_TAGS(first_input, ...) \
- enum _OutputTags { first_input = 0, __VA_ARGS__ }
-
-template <typename T>
-inline vector<T> OperatorBase::GetRepeatedArgument(
- c10::string_view name,
- const vector<T>& default_value) const {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
- *operator_def_, name, default_value);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- auto index = argumentIndexWithName(name);
- CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name);
- const auto& value = newstyle_inputs_[index.value()];
- return GetVectorFromIValueList<T>(value);
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-}
-
-// We need this specialisation because IValue based lists don't support
-// int16_t. We need to load it as List<int64_t> and transform to int16_t.
-template <>
-inline vector<int16_t> OperatorBase::GetRepeatedArgument<int16_t>(
- c10::string_view name,
- const vector<int16_t>& default_value) const {
- if (isLegacyOperator()) {
- CAFFE_ENFORCE(operator_def_, "operator_def was null!");
- return ArgumentHelper::GetRepeatedArgument<OperatorDef, int16_t>(
- *operator_def_, name, default_value);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- auto index = argumentIndexWithName(name);
- CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name);
- const auto& value = newstyle_inputs_[index.value()];
- auto vec = GetVectorFromIValueList<int64_t>(value);
- std::vector<int16_t> result;
- result.reserve(vec.size());
- for (int64_t elem : vec) {
- result.push_back(static_cast<int16_t>(elem));
- }
- return result;
-#else
- CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
-#endif
-}
-
-// Operator is the class that you usually want to derive, if your operator will
-// run on different devices. You should then implement the RunOnDevice()
-// function.
-template <class Context>
-class Operator : public OperatorBase {
- public:
- explicit Operator(const OperatorDef& operator_def, Workspace* ws, StreamId stream = 0)
- : OperatorBase(operator_def, ws), context_(operator_def.device_option()) {
- // In the constructor, we switch to the device so that the child class
- // constructors will run on that device.
- context_.SwitchToDevice(stream);
- }
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- explicit Operator(
- const c10::FunctionSchema& fn_schema,
- std::vector<c10::IValue> inputs,
- std::vector<caffe2::Tensor> outputs,
- StreamId stream = 0)
- : OperatorBase(fn_schema, std::move(inputs), std::move(outputs)) {
- // In the constructor, we switch to the device so that the child class
- // constructors will run on that device.
- context_.SwitchToDevice(stream);
- }
-#endif
- ~Operator() noexcept override {}
-
- /// Retrieve a non-owning reference to the input at position 'idx' for this
- /// operator. The returned reference is valid for the duration of the
- /// RunOnDevice call. The optional 'type' parameter can be used to assert a
- /// required device type for the input (by default, we assert that the tensor
- /// is consistent with the device type implied by the Context parameter of an
- /// Operator.)
- inline const Tensor& Input(
- int idx,
- DeviceType type = Context::GetDeviceType()) {
- return OperatorBase::template Input<Tensor>(idx, type);
- }
-
- /// XOutput is a modernized version of Output which returns a Tensor
- /// rather than a Tensor* (the raw pointer in the latter case is
- /// useless, as Tensor is a pointer type.)
- Tensor XOutput(int idx, at::IntArrayRef dims, at::TensorOptions options) {
- // We'll default device to the device of the current Operator Context
- if (options.device_opt() == c10::nullopt) {
- return OperatorBase::XOutputTensor(
- idx, dims, options.device(context_.device()));
- }
- return OperatorBase::XOutputTensor(idx, dims, options);
- }
-
- /// Retrieve a non-owning pointer to the output at position 'idx',
- /// initializing it to have size 'dims' and properties 'options' if
- /// there is no pre-existing output or the pre-existing output does
- /// not have the correct options. The returned pointer is valid for
- /// the duration of the RunOnDevice call. If device is not explicitly
- /// specified in options, we default to allocating output on the
- /// current device of the device type implied by the Context parameter
- /// of this Operator.
- ///
- /// Note [Operator::Output what?]
- /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- /// The contract of Operator::Output is somewhat complex; it is perhaps better
- /// understood in terms of what was historically an idiomatic Caffe2 operator
- /// implementation:
- ///
- /// void RunOnDevice() override {
- /// auto* output = Output(0, output_size, dtype<float>());
- /// float* output_ptr = output->data<float>();
- /// // write into output_ptr
- /// }
- ///
- /// In the simple case, this code does the following things:
- ///
- /// 1. Allocates a new tensor with size 'output_size' and dtype 'float'
- /// (and device type whatever the Operator's device type is)
- /// 2. "Registers" this tensor as the 0th output tensor of this operator
- /// (Caffe2 operators don't "return" outputs; instead, outputs
- /// are shoved into an output vector which the executor reads out.)
- /// 3. Returns the tensor, so the operator implementation can write
- /// the actual output data into the tensor.
- ///
- /// So what's this business with "pre-existing" outputs? Caffe2
- /// commonly applies an optimization whereby it reuses tensors on
- /// subsequent runs of operators in a graph. It doesn't know ahead
- /// of time what intermediate tensors it will need, so the first
- /// time it runs a graph it has all of the operators create the outputs
- /// necessary (as described above). However, the second time around,
- /// it will reuse all of the tensors created from the first time.
- /// If they are lucky, this time the Output() call is a no-op and
- /// just returns the old tensor.
- ///
- /// However, we cannot /guarantee/ that the output size will be the
- /// same the next time the Operator is called; for example, output
- /// size may be data dependent and vary between runs. In this case,
- /// we have to resize it to the correct size. Resizing is still
- /// helpful, as we may be able to fit the output in the same
- /// space that was previously used.
- ///
- Tensor* Output(int idx, at::IntArrayRef dims, at::TensorOptions options) {
- // We'll default device to the device of the current Operator Context
- if (options.device_opt() == c10::nullopt) {
- return OperatorBase::OutputTensor(
- idx, dims, options.device(context_.device()));
- }
- return OperatorBase::OutputTensor(idx, dims, options);
- }
-
- /// Legacy: please consider using the version of Output() which also takes
- /// dtype and size as arguments.
- inline Tensor* Output(int idx, DeviceType type = Context::GetDeviceType()) {
- return OperatorBase::template Output<Tensor>(idx, type);
- }
-
- /// Get the output Tensor of an operator (allocating it if it is not
- /// already initialized), and copy the contents of src into it.
- /// You probably don't actually want to use this function (the fact
- /// that you have a Tensor to copy from is probably a mistake:
- /// you should have written the output into the output tensor,
- /// from Output, directly in the first place), but this method
- /// is situationally useful.
- Tensor* OutputTensorCopyFrom(
- int idx,
- at::TensorOptions options,
- const Tensor& src,
- bool async = false) {
- if (options.device_opt() == c10::nullopt) {
- return OperatorBase::OutputTensorCopyFrom(
- idx, options.device(context_.device()), src, async);
- }
- return OperatorBase::OutputTensorCopyFrom(idx, options, src, async);
- }
-
- void WaitEvent(const Event& ev, int stream_id = -1) final {
- if (stream_id >= 0) {
- context_.SwitchToDevice(stream_id);
- }
- context_.WaitEvent(ev);
- }
-
- void WaitEvents(const std::vector<const Event*>& events, int stream_id = -1)
- final {
- if (stream_id >= 0) {
- context_.SwitchToDevice(stream_id);
- }
- for (const auto& ev : events) {
- context_.WaitEvent(*ev);
- }
- }
-
- // The run function of Operator switches to the device, and then carries out
- // the actual computation with RunOnDevice(). You should implement RunOnDevice
- // instead of Run().
- // Note: Run does not update operator's event and can be used only with
- // non-async executors that do not rely on events
- bool Run(int stream_id = 0) final {
- try {
- StartAllObservers();
-
- context_.SwitchToDevice(stream_id);
-
- // Clear floating point exception flags before RunOnDevice. We will test
- // exception flags afterwards, and raise an error if an exception has
- // happened.
- if (FLAGS_caffe2_operator_throw_if_fp_exceptions ||
- FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
- std::feclearexcept(FE_ALL_EXCEPT);
- }
-
-#ifdef __GNU_LIBRARY__
- // If glibc is available, use feenableexcept that will raise exception
- // right away.
- int old_enabled_exceptions = 0;
- if (FLAGS_caffe2_operator_throw_on_first_occurrence_if_fp_exceptions) {
- if (FLAGS_caffe2_operator_throw_if_fp_exceptions ||
- FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
- int flag = 0;
- if (FLAGS_caffe2_operator_throw_if_fp_exceptions) {
- flag |= FE_DIVBYZERO | FE_INVALID;
- }
- if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
- flag |= FE_OVERFLOW;
- }
- old_enabled_exceptions = feenableexcept(flag);
- }
- }
-#endif
- bool result = RunOnDevice();
-#ifdef __GNU_LIBRARY__
- if (FLAGS_caffe2_operator_throw_on_first_occurrence_if_fp_exceptions) {
- if (FLAGS_caffe2_operator_throw_if_fp_exceptions ||
- FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
- fedisableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
- std::feclearexcept(FE_ALL_EXCEPT);
- feenableexcept(old_enabled_exceptions);
- }
- }
-#endif
- if (FLAGS_caffe2_operator_throw_if_fp_exceptions) {
- CAFFE_ENFORCE(
- !std::fetestexcept(FE_DIVBYZERO),
- "Division by zero floating point exception (FE_DIVBYZERO) reported.");
- CAFFE_ENFORCE(
- !std::fetestexcept(FE_INVALID),
- "Invalid floating point exception (FE_INVALID) reported.");
- }
- if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) {
- CAFFE_ENFORCE(
- !std::fetestexcept(FE_OVERFLOW),
- "Overflow floating point exception (FE_OVERFLOW) reported.");
- }
- if (!result) {
- this->RecordLastFailedOpNetPosition();
- }
- context_.FinishDeviceComputation(); // throws on error
-
- StopAllObservers();
-
- return result;
- } catch (EnforceNotMet& err) {
- if (has_debug_def()) {
- err.add_context(
- "Error from operator: \n" + ProtoDebugString(debug_def()));
- AddRelatedBlobInfo(&err);
- }
- this->RecordLastFailedOpNetPosition();
- StopAllObservers();
- throw;
- } catch (...) {
- this->RecordLastFailedOpNetPosition();
- StopAllObservers();
- throw;
- }
- }
-
- bool RunAsync(int stream_id = 0) final {
- try {
- StartAllObservers();
-
- context_.SwitchToDevice(stream_id);
- auto result = RunOnDevice();
- if (result) {
- if (HasAsyncPart()) {
- RecordEvent();
- } else {
- // Manually set CPU operator's event status to finished,
- // unless this is an async CPU operator
- SetEventFinished();
- }
- } else {
- SetEventFinished(getErrorMsg().c_str());
- this->RecordLastFailedOpNetPosition();
- }
-
- StopAllObservers();
-
- return result;
- } catch (EnforceNotMet& err) {
- if (has_debug_def()) {
- err.add_context(
- "Error from operator: \n" + ProtoDebugString(debug_def()));
- AddRelatedBlobInfo(&err);
- }
- SetEventFinishedWithException(err.what());
- this->RecordLastFailedOpNetPosition();
- StopAllObservers();
- throw;
- } catch (const std::exception& err) {
- SetEventFinishedWithException(err.what());
- this->RecordLastFailedOpNetPosition();
- StopAllObservers();
- throw;
- } catch (...) {
- SetEventFinishedWithException(getErrorMsg().c_str());
- this->RecordLastFailedOpNetPosition();
- StopAllObservers();
- throw;
- }
- }
-
- bool IsStreamFree(int stream_id) const override {
- return context_.IsStreamFree(device_option(), stream_id);
- }
-
- virtual bool RunOnDevice() = 0;
-
- // Returns whether operator has async on device part.
- // CUDA operators by default have async parts, CPU operators by default
- // don't have async parts and are finished after RunOnDevice call.
- // Events of operators that don't have async parts are automatically set
- // to finished state by RunAsync.
- // Defaulting to the value from context (true for CUDA, false for CPU).
- // Override in case of async CPU operators
- // Async CPU operators are expected to catch all exceptions in async parts
- // and set Event to finished/failed state with Event::SetFinished or
- // SetFinishedWithException call.
- bool HasAsyncPart() const override {
- return context_.HasAsyncPartDefault();
- }
-
- // Returns whether operator's RunOnDevice schedules async on device part and
- // can be run without waiting for parent operator's async part to be finished
- // on the same device.
- // Note: when true, RunOnDevice must not access the content of the input blobs
- // as they might not be computed yet
- // Note: when true, operator's device needs to support async scheduling:
- // - supports concept of streams: async ops scheduled on the same stream are
- // guaranteed to be executed in the same order they were scheduled
- // - provides non-blocking cross device/cross stream synchronization
- // primitives
- //
- // By default, assuming an op with an async part can be scheduled
- // asynchronously if device supports async scheduling
- bool SupportsAsyncScheduling() const override {
- return HasAsyncPart() && context_.SupportsAsyncScheduling();
- }
-
- void SyncDeviceBarrierForObservers() override {
- context_.FinishDeviceComputation();
- }
-
- const Context* getContext() const {
- return &context_;
- }
- Context* getContext() {
- return &context_;
- }
-
- protected:
- void RecordEvent(const char* err_msg = nullptr) final {
- if (event_) {
- context_.Record(event_.get(), err_msg);
- }
- }
-
- Context context_;
-};
-
-#define USE_OPERATOR_BASE_FUNCTIONS \
- /* using override */ using OperatorBase::HasArgument; \
- /* using override */ using OperatorBase::GetSingleArgument; \
- /* using override */ using OperatorBase::HasSingleArgumentOfType; \
- /* using override */ using OperatorBase::GetRepeatedArgument; \
- /* using override */ using OperatorBase::InputIsType; \
- /* using override */ using OperatorBase::InputSize; \
- /* using override */ using OperatorBase::Output; \
- /* using override */ using OperatorBase::Input; \
- /* using override */ using OperatorBase::OutputSize; \
- /* using override */ using OperatorBase::IsInputOutputAlias; \
- /* using override */ using OperatorBase::OutputTensorAlias
-
-#define USE_OPERATOR_FUNCTIONS(context) \
- USE_OPERATOR_BASE_FUNCTIONS; \
- /* using override */ using Operator<context>::context_; \
- /* using override */ using Operator<context>::Input; \
- /* using override */ using Operator<context>::InputBlob; \
- /* using override */ using Operator<context>::Output; \
- /* using override */ using Operator<context>::OutputBlob; \
- /* using override */ using Operator<context>::OutputTensorCopyFrom
-
-#define USE_OPERATOR_CONTEXT_FUNCTIONS USE_OPERATOR_FUNCTIONS(Context)
-
-#define USE_SIMPLE_CTOR_DTOR(name) \
- template <class... Args> \
- explicit name(Args&&... args) \
- : Operator<Context>(std::forward<Args>(args)...) {} \
- virtual ~name() noexcept override {}
-
-// Helpers to implement runtime op polymorphism. Often it's convenient to make
-// an op work on different input types (e.g. i32 vs i64 indices) or special-case
-// it for particular input size (e.g. ScatterWeightedSum for block size of 1
-// doesn't need to call Eigen).
-//
-// DispatchHelper provides compile-time generation of nested "if" statements,
-// e.g. `DispatchHelper<FixedValues<1, 4>>::call(this, block_size);`
-// unrolls into:
-// if (block_size == 1) {
-// return DoRunWithValue<1>();
-// } else if (block_size = 4) {
-// return DoRunWithValue<4>();
-// } else {
-// return DoRunWithValue<-1>();
-// }`
-//
-// DoRunWithValue implementation can use template arguments to do "if"
-// statements
-// or proxy to functions in math.h which often provide fixed size
-// implementation.
-//
-// Similarly `TensorTypes<int32_t, int64_t>(this, Input(0))` provides branching
-// based on type of the first input and calls DoRunWithType.
-//
-// Note, that the same instance of Op class is used as the method, not class is
-// templated. We might consider adding static class-level polymorphism later.
-//
-// Convenient macro USE_DISPATCH_HELPER is provided for declaring friendship in
-// case DoRunWithValue or DoRunWithType are declared non-public.
-
-#define USE_DISPATCH_HELPER \
- template <typename FirstArg, typename... ExtraArgs> \
- friend struct DispatchHelper
-
-template <int... Values>
-struct FixedValues {};
-
-template <typename... Types>
-struct TensorTypes {};
-
-// Special tag that can be listed in TensorTypes to denote that a special
-// implementation in 'RunWithOtherType' needs to be called instead of failing
-// Obviously this needs to be the last item in lists, e.g.
-// TensorTypes<float, double, GenericTensorImplementation>
-struct GenericTensorImplementation {};
-
-// Same as TensorTypes but call DoRunWithType2
-template <typename... Types>
-struct TensorTypes2 {};
-
-template <typename Sizes, typename... ExtraArgs>
-struct DispatchHelper;
-
-template <int FirstVal, int... Values, typename... ExtraArgs>
-struct DispatchHelper<FixedValues<FirstVal, Values...>, ExtraArgs...> {
- template <typename Op>
- static bool call(Op* op, int value) {
- if (FirstVal == value) {
- return op->template DoRunWithValue<ExtraArgs..., FirstVal>();
- }
- return DispatchHelper<FixedValues<Values...>, ExtraArgs...>::template call<
- Op>(op, value);
- }
-};
-
-template <typename... ExtraArgs>
-struct DispatchHelper<FixedValues<>, ExtraArgs...> {
- template <typename Op>
- static bool call(Op* op, int64_t /*size*/) {
- return op->template DoRunWithValue<ExtraArgs..., -1>();
- }
-};
-
-#define C10_DEFINE_TENSOR_TYPES_DISPATCHER( \
- TensorTypes, DoRunWithType, DoRunWithOtherType) \
- template <typename FirstType, typename... Types, typename... ExtraArgs> \
- struct DispatchHelper<TensorTypes<FirstType, Types...>, ExtraArgs...> { \
- template <typename Op> \
- static bool call(Op* op, const TypeMeta meta) { \
- static_assert( \
- !std::is_same<GenericTensorImplementation, FirstType>::value, \
- "GenericTensorImplementation must be the last in TensorTypes list"); \
- if (meta.Match<FirstType>()) { \
- return op->template DoRunWithType<ExtraArgs..., FirstType>(); \
- } \
- return DispatchHelper<TensorTypes<Types...>, ExtraArgs...>:: \
- template call<Op>(op, meta); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Tensor& tensor) { \
- return call<Op>(op, tensor.dtype()); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Blob& blob) { \
- return call<Op>(op, blob.meta()); \
- } \
- }; \
- \
- template <typename... ExtraArgs> \
- struct DispatchHelper<TensorTypes<>, ExtraArgs...> { \
- template <typename Op> \
- static bool call(Op* /* unused */, const TypeMeta meta) { \
- CAFFE_THROW("Unsupported type of tensor: ", meta.name()); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Tensor& tensor) { \
- return call<Op>(op, tensor.dtype()); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Blob& blob) { \
- return call<Op>(op, blob.meta()); \
- } \
- }; \
- \
- template <typename... ExtraArgs> \
- struct DispatchHelper< \
- TensorTypes<GenericTensorImplementation>, \
- ExtraArgs...> { \
- template <typename Op> \
- static bool call(Op* op, const TypeMeta) { \
- return op->template DoRunWithOtherType<ExtraArgs...>(); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Tensor& tensor) { \
- return call<Op>(op, tensor.dtype()); \
- } \
- template <typename Op> \
- static bool call(Op* op, const Blob& blob) { \
- return call<Op>(op, blob.meta()); \
- } \
- };
-C10_DEFINE_TENSOR_TYPES_DISPATCHER(
- TensorTypes,
- DoRunWithType,
- DoRunWithOtherType)
-C10_DEFINE_TENSOR_TYPES_DISPATCHER(
- TensorTypes2,
- DoRunWithType2,
- DoRunWithOtherType2)
-#undef C10_DEFINE_TENSOR_TYPES_DISPATCHER
-
-// The device type registry. This works in two phases:
-// (1) gDeviceTypeRegistry() maps the device types values to the actual operator
-// registry function.
-// (2) Then, one can call the operator registry function to further create the
-// operators.
-typedef c10::Registry<
- std::string,
- std::unique_ptr<OperatorBase>,
- const OperatorDef&,
- Workspace*>
- OperatorRegistry;
-typedef c10::Registry<
- std::string,
- std::unique_ptr<OperatorBase>,
- const OperatorDef&,
- Workspace*>* (*RegistryFunction)();
-TORCH_API std::map<DeviceType, OperatorRegistry*>* gDeviceTypeRegistry();
-
-struct TORCH_API DeviceTypeRegisterer {
- explicit DeviceTypeRegisterer(DeviceType type, RegistryFunction func);
-};
-
-#if defined(_MSC_VER)
-#define IMPORT_IF_NOT_MSVC
-#else
-#define IMPORT_IF_NOT_MSVC C10_IMPORT
-#endif
-
-#define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \
- namespace { \
- static DeviceTypeRegisterer C10_ANONYMOUS_VARIABLE( \
- DeviceType)(type, &registry_function); \
- }
-
-// The operator registry. Since we are not expecting a great number of devices,
-// we will simply have an if-then type command and allocate the actual
-// generation to device-specific registerers.
-// Note that although we have CUDA and CUDNN here, the registerers themselves do
-// not depend on specific cuda or cudnn libraries. This means that we will be
-// able to compile it even when there is no cuda available - we simply do not
-// link any cuda or cudnn operators.
-C10_DECLARE_REGISTRY(
- CPUOperatorRegistry,
- OperatorBase,
- const OperatorDef&,
- Workspace*);
-#define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \
- C10_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_CPU_OPERATOR(name, ...) \
- IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \
- CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- } \
- C10_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_CPU_OPERATOR_STR(str_name, ...) \
- C10_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) \
- C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-// Use these macros to register gradient operators. They can be automatically
-// excluded from builds that don't need them (e.g., mobile).
-#ifdef CAFFE2_NO_GRADIENT_OPS
-#define REGISTER_CPU_GRADIENT_OPERATOR(...) /* No gradients. */
-#else
-#define REGISTER_CPU_GRADIENT_OPERATOR(...) \
- C10_MACRO_EXPAND(REGISTER_CPU_OPERATOR(__VA_ARGS__))
-#endif
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-#define REGISTER_CPU_GRADIENT_OPERATOR_WITH_ENGINE(...) /* No gradients. */
-#else
-#define REGISTER_CPU_GRADIENT_OPERATOR_WITH_ENGINE(...) \
- C10_MACRO_EXPAND(REGISTER_CPU_OPERATOR_WITH_ENGINE(__VA_ARGS__))
-#endif
-
-C10_DECLARE_REGISTRY(
- CUDAOperatorRegistry,
- OperatorBase,
- const OperatorDef&,
- Workspace*);
-#define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \
- C10_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_CUDA_OPERATOR(name, ...) \
- IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \
- CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- } \
- C10_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_CUDA_OPERATOR_STR(str_name, ...) \
- C10_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, engine, ...) \
- C10_REGISTER_CLASS(CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-// Macros for cudnn since we use it often
-#define REGISTER_CUDNN_OPERATOR(name, ...) \
- REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__)
-
-// Macros for HIP operators
-C10_DECLARE_REGISTRY(
- HIPOperatorRegistry,
- OperatorBase,
- const OperatorDef&,
- Workspace*);
-#define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \
- C10_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__)
-#define REGISTER_HIP_OPERATOR(name, ...) \
- IMPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \
- CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
- } \
- C10_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__)
-#define REGISTER_HIP_OPERATOR_STR(str_name, ...) \
- C10_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__)
-
-#define REGISTER_HIP_OPERATOR_WITH_ENGINE(name, engine, ...) \
- C10_REGISTER_CLASS(HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
-
-#define REGISTER_MIOPEN_OPERATOR(name, ...) \
- REGISTER_HIP_OPERATOR_WITH_ENGINE(name, MIOPEN, __VA_ARGS__) \
- REGISTER_HIP_OPERATOR_WITH_ENGINE( \
- name, CUDNN, __VA_ARGS__) // Make CUDNN an alias of MIOPEN for HIP ops
-
-// StaticLinkingProtector is a helper class that ensures that the Caffe2
-// library is linked correctly with whole archives (in the case of static
-// linking). What happens is that when CreateOperator is called for the first
-// time, it instantiates an OperatorLinkingProtector object to check if the
-// operator registry is empty. If it is empty, this means that we are not
-// properly linking the library.
-//
-// You should not need to use this class.
-struct StaticLinkingProtector {
- StaticLinkingProtector() {
- const auto registered_ops = CPUOperatorRegistry()->Keys().size();
- // Note: this is a check failure instead of an exception, because if
- // the linking is wrong, Caffe2 won't be able to run properly anyway,
- // so it's better to fail loud.
- // If Caffe2 is properly linked with whole archive, there should be more
- // than zero registered ops.
- if (registered_ops == 0) {
- LOG(FATAL)
- << "You might have made a build error: the Caffe2 library does not seem "
- "to be linked with whole-static library option. To do so, use "
- "-Wl,-force_load (clang) or -Wl,--whole-archive (gcc) to link the "
- "Caffe2 library.";
- }
- }
-};
-
-// An exception that can be thrown by an operator constructor that notifies
-// that it does not support the given setting. This can be usually used for
-// specific engines that only implement a subset of the features required by
-// the original operator schema.
-// TODO(jiayq): make more feature-complete exception message.
-class TORCH_API UnsupportedOperatorFeature : public std::exception {
- public:
- UnsupportedOperatorFeature(const string& msg) : msg_(msg) {}
- const char* what() const noexcept override {
- return msg_.c_str();
- }
-
- private:
- string msg_;
-};
-
-// A helper macro that should ONLY be used in the operator constructor to check
-// if needed features are met. If not, throws the UnsupportedOperatorFeature
-// exception with the given message.
-#define OPERATOR_NEEDS_FEATURE(condition, ...) \
- if (!(condition)) { \
- throw UnsupportedOperatorFeature(::c10::str(__VA_ARGS__)); \
- }
-
-// Creates an operator with the given operator definition.
-// Throws on error and never returns nullptr
-TORCH_API unique_ptr<OperatorBase> CreateOperator(
- const OperatorDef& operator_def,
- Workspace* ws,
- int net_position = OperatorBase::kNoNetPositionSet);
-
-TORCH_API const std::string OpRegistryKey(
- const std::string& op_type,
- const std::string& engine = "");
-
-// User can set the preferred engines as a list of engine names, in
-// descending order of preference.
-using EnginePrefType = std::vector<std::string>;
-// {device_type -> {operator_name -> EnginePrefType}}
-using PerOpEnginePrefType =
- CaffeMap<DeviceType, CaffeMap<std::string, EnginePrefType>>;
-// {device_type -> EnginePrefType}
-using GlobalEnginePrefType = CaffeMap<DeviceType, EnginePrefType>;
-TORCH_API void SetPerOpEnginePref(
- const PerOpEnginePrefType& per_op_engine_pref);
-TORCH_API void SetGlobalEnginePref(
- const GlobalEnginePrefType& global_engine_pref);
-TORCH_API void SetEnginePref(
- const PerOpEnginePrefType& per_op_engine_pref,
- const GlobalEnginePrefType& global_engine_pref);
-TORCH_API void SetOpEnginePref(
- const std::string& op_type,
- const CaffeMap<DeviceType, EnginePrefType>& op_pref);
-
-TORCH_API void LoadInt8TensorInfoOfBlob(
- std::vector<float>* scale,
- std::vector<float>* offset,
- uint32_t* axis,
- const Blob* b);
-
-TORCH_API TensorShape GetTensorShapeOfBlob(const Blob* b);
-
-TORCH_API TensorShapes InferBlobShapesAndTypes(
- CaffeMap<string, TensorShape>& blob_desc,
- const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromWorkspace(
- Workspace* ws,
- const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
- const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
- const vector<NetDef*>& nets);
-
-TORCH_API TensorShapes InferBlobShapesAndTypesFromMap(
- const CaffeMap<std::string, std::vector<int64_t>>& blob_dimensions,
- const CaffeMap<std::string, TensorProto_DataType>& blob_types,
- const vector<NetDef*>& nets);
-
-TORCH_API std::map<string, std::pair<DeviceOption, DeviceOption>>
-ValidateTensorDevices(OperatorBase& op, const OperatorDef& op_def);
-
-// Get a set of registered operator names
-TORCH_API std::set<std::string> GetRegisteredOperators();
-
-// Operator logging capabilities
-TORCH_API void SetOperatorLogger(
- std::function<void(const OperatorDef&)> tracer);
-std::function<void(const OperatorDef&)> GetOperatorLogger();
-
-#ifndef C10_MOBILE
-// This is for transferring tensor data between C2 and backends.
-struct ExternalTensorDescriptor {
- uint64_t dataType;
- uint32_t dimensions;
- const uint64_t* shape;
- uint8_t isOffline = 0;
- uint32_t quantizationAxis;
- uint64_t quantizationParams;
- const float* scales;
- const int32_t* biases;
- uint64_t buffer;
-};
-
-class ExternalTensorFunctionsBase {
- public:
- explicit ExternalTensorFunctionsBase() {}
- virtual ~ExternalTensorFunctionsBase() {}
- virtual bool isQuantized() const = 0;
- virtual bool IsSameMetaType(TypeIdentifier id) = 0;
- virtual void SetupExternalTensorDescriptor(
- const Blob* blob,
- std::vector<std::vector<uint64_t>>* shapes,
- std::vector<std::vector<float>>* all_scales,
- std::vector<std::vector<int32_t>>* all_offsets,
- ExternalTensorDescriptor* desc) = 0;
- virtual void LoadInfoOfBlob(
- const Blob* blob,
- std::vector<float>* scale,
- std::vector<float>* offset,
- uint32_t* axis) = 0;
- virtual TypeIdentifier GetTypeMetaId() = 0;
- virtual TypeMeta GetExternalTensorType(const void* c) = 0;
- virtual vector<int64_t> GetExternalTensorInfo(
- const void* c,
- size_t* capacity,
- DeviceOption* device) = 0;
-};
-
-C10_DECLARE_TYPED_REGISTRY(
- ExternalTensorFunctionsBaseRegistry,
- TypeIdentifier,
- ExternalTensorFunctionsBase,
- std::unique_ptr);
-
-#define REGISTER_EXTERNAL_TENSOR_FUNCTIONS(id, ...) \
- C10_REGISTER_TYPED_CLASS(ExternalTensorFunctionsBaseRegistry, id, __VA_ARGS__)
-inline unique_ptr<ExternalTensorFunctionsBase> CreateExternalTensorFunctions(
- TypeIdentifier id) {
- return ExternalTensorFunctionsBaseRegistry()->Create(id);
-}
-#endif // C10_MOBILE
-
-} // namespace caffe2
-
-C10_CLANG_DIAGNOSTIC_POP()
-
-#endif // CAFFE2_CORE_OPERATOR_H_
diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h
deleted file mode 100644
index 5c8d97a..0000000
--- a/caffe2/core/operator_gradient.h
+++ /dev/null
@@ -1,337 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_GRADIENT_H_
-#define CAFFE2_CORE_OPERATOR_GRADIENT_H_
-
-#include "c10/util/Registry.h"
-#include "caffe2/core/operator_schema.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/proto_utils.h"
-
-namespace caffe2 {
-
-/* @brief A struct that abstracts on top of dense and sparse blobs.
- *
- * For a dense blob, its gradient name should be written into dense_, and for
- * a sparse blob, its gradient name should be written into indice_ for
- * the sparse indices and value_ for the values.
- */
-struct TORCH_API GradientWrapper {
- string dense_;
- string indices_;
- string values_;
-
- inline bool IsDense() const {
- return (dense_.size() != 0);
- }
- inline bool IsSparse() const {
- return (indices_.size() != 0 || values_.size() != 0);
- }
- inline bool IsEmpty() const {
- return (!IsDense() && !IsSparse());
- }
-};
-
-/**
- * A struct that holds the gradient operators and related gradient maps.
- */
-struct TORCH_API GradientOpsMeta {
- vector<OperatorDef> ops_;
- vector<GradientWrapper> g_input_;
-
- GradientOpsMeta() {}
- GradientOpsMeta(
- const vector<OperatorDef>& ops,
- const vector<GradientWrapper>& v)
- : ops_(ops), g_input_(v) {}
-};
-
-class TORCH_API GradientMakerBase {
- public:
- GradientMakerBase(
- const OperatorDef& def,
- const vector<GradientWrapper>& g_output)
- : def_(def), g_output_(g_output), g_input_(def.input_size()){};
- virtual ~GradientMakerBase() {}
- virtual bool CopyDeviceOption() const {
- return true;
- }
- virtual bool CopyEngine() const {
- return true;
- }
- virtual bool CopyArguments() const {
- return true;
- }
-
- virtual void VerifyOp() const {
- auto* schema = OpSchemaRegistry::Schema(def_.type());
- if (schema) {
- CAFFE_ENFORCE(
- schema->Verify(def_),
- "(GradientMaker) Operator def did not pass schema checking: ",
- ProtoDebugString(def_));
- }
- }
-
- /**
- * @brief Returns the gradient ops meta.
- *
- * If your gradient op generator only use standard input and output
- * manipulations, you can simply implement GetGradientDefs() that
- * returns vector<OperatorDef>. In that, you can call GI, GI_V and GI_I
- * that will automatically create the gradient registration for you.
- *
- * If you need to do custom gradient name registration, overload this
- * function directly.
- */
- virtual GradientOpsMeta Get() {
- VerifyOp();
- vector<OperatorDef> new_defs = GetGradientDefs();
- for (auto& opdef : new_defs) {
- opdef.set_is_gradient_op(true);
- }
- return GradientOpsMeta(new_defs, g_input_);
- };
-
- const OperatorDef& Def() const {
- return def_;
- }
-
- protected:
- virtual vector<OperatorDef> GetGradientDefs() {
- CAFFE_NOT_IMPLEMENTED;
- }
-
- // Helper functions to return names for the gradient computation.
- // I(idx), O(idx): return the input and output names.
- // GO(idx): return the name of the gradient for output idx.
- // GI(idx), GI_I(idx), GI_V(idx): return the name of the gradient for
- // input idx, and also registers that name into the gradient
- // registry to be returned.
- string I(const int i) {
- CAFFE_ENFORCE((i >= 0) && (i < def_.input().size()));
- return def_.input(i);
- }
- string O(const int i) {
- CAFFE_ENFORCE((i >= 0) && (i < def_.output().size()));
- return def_.output(i);
- }
- string GI(const int i) {
- CAFFE_ENFORCE(
- !g_input_.at(i).IsSparse(),
- "Input ",
- def_.input(i),
- " already set to sparse.");
- g_input_.at(i).dense_ = GradientName(def_.input(i));
- return GradientName(def_.input(i));
- }
- string GI_I(const int i) {
- CAFFE_ENFORCE(
- !g_input_.at(i).IsDense(),
- "Input ",
- def_.input(i),
- " already set to dense.");
- g_input_.at(i).indices_ = GradientSliceIndices(def_.input(i));
- return GradientSliceIndices(def_.input(i));
- }
- string GI_V(const int i) {
- CAFFE_ENFORCE(
- !g_input_.at(i).IsDense(),
- "Input ",
- def_.input(i),
- " already set to dense.");
- g_input_.at(i).values_ = GradientSliceValues(def_.input(i));
- return GradientSliceValues(def_.input(i));
- }
- string GO(const int i) {
- CAFFE_ENFORCE(
- g_output_.at(i).IsDense(),
- "Gradient of output ",
- def_.output(i),
- (g_output_.at(i).IsSparse() ? " is sparse (expected dense)."
- : " is not provided!"));
- return g_output_.at(i).dense_;
- }
- string GO_I(const int i) {
- CAFFE_ENFORCE(
- g_output_.at(i).IsSparse(),
- "Gradient of output ",
- def_.output(i),
- (g_output_.at(i).IsDense() ? " is dense (expected sparse)."
- : " is not provided!"));
- return g_output_.at(i).indices_;
- }
- string GO_V(const int i) {
- CAFFE_ENFORCE(
- g_output_.at(i).IsSparse(),
- "Gradient of output ",
- def_.output(i),
- (g_output_.at(i).IsDense() ? " is dense (expected sparse)."
- : " is not provided!"));
- return g_output_.at(i).values_;
- }
- const GradientWrapper& GradOut(int i) {
- return g_output_.at(i);
- }
-
- // Function to add a gradient pair to map.
- void SetDense(const int i, const string& name) {
- CAFFE_ENFORCE(
- !g_input_.at(i).IsSparse(),
- "Input ",
- def_.input(i),
- " already set to sparse.");
- g_input_.at(i).dense_ = name;
- }
- void SetSparse(const int i, const string& indices, const string& values) {
- CAFFE_ENFORCE(
- !g_input_.at(i).IsDense(),
- "Input ",
- def_.input(i),
- " already set to dense.");
- g_input_.at(i).indices_ = indices;
- g_input_.at(i).values_ = values;
- }
-
- /**
- * @brief a helper function to allow one to create one single operator
- * def, which is usually the case for many simple operators.
- */
- template <class... Args>
- inline static vector<OperatorDef> SingleGradientDef(const Args&... args) {
- return vector<OperatorDef>{CreateOperatorDef(args...)};
- }
-
- public:
- /**
- * Returns map that returns the parameters that the gradients are for.
- */
- static CaffeMap<string, string> MatchGradsToParams(const OperatorDef& op) {
- // NOTE: how to go beyond string-matching?
- CaffeMap<string, string> m;
- for (auto& out : op.output()) {
- if (IsGradientBlob(out)) {
- m[out] = out.substr(0, out.length() - 5);
- }
- }
- return m;
- }
-
- private:
- // Utility functions for gradient name computation. We don't expose them
- // in order to discourage the use of such names explicitly.
- static string GradientName(const string& name) {
- return name + "_grad";
- }
-
- static bool IsGradientBlob(const string& name) {
- return name.length() > 5 && name.find("_grad") == name.length() - 5;
- }
-
- static string GradientNameToParam(const string& name) {
- CHECK(IsGradientBlob(name));
- return name.substr(0, name.length() - 5);
- }
-
- static string GradientSliceIndices(const string& name) {
- return name + "_grad_indices";
- }
-
- static string GradientSliceValues(const string& name) {
- return name + "_grad_values";
- }
-
- protected:
- // We make the member variables protected in case someone wants to write
- // a fully custom Get() function.
- const OperatorDef& def_;
- const vector<GradientWrapper>& g_output_;
- vector<GradientWrapper> g_input_;
-};
-
-/**
- * @brief A helper class to indicate that the operator does not need gradient
- * computation.
- *
- * Use the macro NO_GRADIENT to register operators that do not have gradients.
- * Note that this is different fron SHOULD_NOT_DO_GRADIENT: the latter means
- * that the gradient computation should not flow through it at all, and throws
- * an error if it is called.
- */
-class TORCH_API NoGradient : public GradientMakerBase {
- using GradientMakerBase::GradientMakerBase;
- vector<OperatorDef> GetGradientDefs() override {
- return vector<OperatorDef>();
- }
-};
-
-/**
- * @brief A helper class to indicate that the operator should have no gradient.
- *
- * This is used when the operator definition is designed to not have a gradient.
- * Calling a gradient on this operator def will cause Caffe2 to quit.
- */
-struct ThrowInTheTowelIfGradientIsCalled : public GradientMakerBase {
- using GradientMakerBase::GradientMakerBase;
- GradientOpsMeta Get() override {
- CAFFE_THROW("One should not call gradient for operator ", def_.type(), ".");
- }
-};
-
-/**
- * @brief A helper class to indicate that the gradient mechanism is not ready.
- *
- * This should only be used sparsely when the gradient does exist, but we have
- * not implemented it yet and are using this as a lazy excuse. Eventually, a
- * gradient operator should be implemented.
- */
-struct GradientNotImplementedYet : public GradientMakerBase {
- using GradientMakerBase::GradientMakerBase;
- GradientOpsMeta Get() override {
- CAFFE_THROW(
- "Operator ",
- def_.type(),
- " should have a gradient but is not implemented yet.");
- }
-};
-
-C10_DECLARE_REGISTRY(
- GradientRegistry,
- GradientMakerBase,
- const OperatorDef&,
- const vector<GradientWrapper>&);
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-
-#define REGISTER_GRADIENT(name, ...) /* No gradients. */
-#define REGISTER_GRADIENT_STR(str_name, ...) /* No gradients. */
-
-#else
-
-#define REGISTER_GRADIENT(name, ...) \
- C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__)
-#define REGISTER_GRADIENT_STR(str_name, ...) \
- C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__)
-
-#endif
-
-// NO_GRADIENT means that the operator does not need any gradient computation.
-#define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient)
-
-// SHOULD_NOT_DO_GRADIENT means that the operator is not designed to have
-// gradient operators. If you attempt to call the gradient, a log fatal will
-// occur.
-#define SHOULD_NOT_DO_GRADIENT(name) \
- REGISTER_GRADIENT(name, ThrowInTheTowelIfGradientIsCalled)
-
-#define GRADIENT_NOT_IMPLEMENTED_YET(name) \
- REGISTER_GRADIENT(name, GradientNotImplementedYet)
-
-/**
- * @brief Gets the GradientOpsMeta for the given operator def.
- */
-TORCH_API GradientOpsMeta GetGradientForOp(
- const OperatorDef& def,
- const vector<GradientWrapper>& g_output);
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_OPERATOR_GRADIENT_H_
diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h
deleted file mode 100644
index f5b9d0d..0000000
--- a/caffe2/core/operator_schema.h
+++ /dev/null
@@ -1,612 +0,0 @@
-#ifndef CAFFE2_CORE_OPERATOR_SCHEMA_H_
-#define CAFFE2_CORE_OPERATOR_SCHEMA_H_
-
-#include <climits>
-#include <functional>
-#include <initializer_list>
-#include <ostream>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
-#include <c10/util/irange.h>
-#include <c10/util/Registry.h>
-#include <caffe2/core/common.h>
-#include <caffe2/core/logging.h>
-#include <caffe2/core/types.h>
-#include <caffe2/proto/caffe2_pb.h>
-#include <caffe2/utils/filler.h>
-#include <caffe2/utils/proto_utils.h>
-
-namespace caffe2 {
-
-// A const value returned by OpSchema::CalculateOutput() if the number of
-// output cannot be determined.
-constexpr int kCannotComputeNumOutputs = -1;
-
-/**
- * @brief A class to record the schema of an op.
- *
- * OpSchema records the common interface of an op specified by its name. This
- * is optional for each operator implemented in Caffe2 but is strongly
- * recommended.
- *
- * To register an OpSchema, one can use the macro OPERATOR_SCHEMA(name) and
- * then append the various functions in the class. For example, for an op
- * that takes in two inputs, one output, and the first input and output
- * could be in-place, can be written as
- *
- * OPERATOR_SCHEMA(name)
- * .NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
- */
-class TORCH_API OpSchema {
- public:
- OpSchema() : OpSchema("unknown", "unknown", 0) {}
- OpSchema(const string& type, const string& file, const int line);
-
- /**
- * @brief Returns the file that the op schema is registered from.
- */
- inline const string& file() const {
- return file_;
- }
-
- /**
- * @brief Returns the line in file that the op schema is registered from.
- */
- inline int line() const {
- return line_;
- }
-
- /**
- * @brief Returns the docstring of the op schema.
- */
- inline const char* doc() const {
- return doc_.empty() ? nullptr : doc_.c_str();
- }
-
- /**
- * @brief Verifies if an operator definition protobuf matches the pattern
- * specified in the schema.
- */
- bool Verify(const OperatorDef& def) const;
-
- // Functions to set the property of the operator schemas.
- // Sets the number of inputs, either a fixed number or a min and a max.
-
- /**
- * @brief A single input.
- */
- OpSchema& NumInputs(int n);
- /**
- * @brief Input could be in range [min, max], inclusive.
- */
- OpSchema& NumInputs(int min, int max);
- /**
- * @brief Input could be one of the values specified in allowed_input_nums.
- */
- OpSchema& NumInputs(set<int> allowed_input_nums);
- /**
- * @brief Input is checked with a specified function.
- */
- OpSchema& NumInputs(std::function<bool(int)> func);
-
- // Sets the number of outputs, either a fixed number, a min and a max,
- // or a function that takes in the input number and produces an output
- // number. Use only one function in the set below.
- /**
- * @brief A single output.
- */
- OpSchema& NumOutputs(int n);
- /**
- * @brief Output could be in range [min, max], inclusive.
- */
- OpSchema& NumOutputs(int min, int max);
- /**
- * @brief Output could be one of the values specified in allowed_output_nums.
- */
- OpSchema& NumOutputs(set<int> allowed_output_nums);
- /**
- * @brief Output is checked with a specified function.
- */
- OpSchema& NumOutputs(std::function<bool(int)> func);
-
- /**
- * @brief Relationship between inputs and outputs is checked with a specified
- * function.
- */
- OpSchema& NumInputsOutputs(std::function<bool(int, int)> func);
-
- // Set the function that can calculate the number of output based on the
- // number of input. Use only one function in the set below.
- /**
- * @brief Set the output calculator to a user-defined function.
- */
- OpSchema& OutputCalculator(std::function<int(int)> calc);
- /**
- * @brief Set the number of outputs to be the same as the number of inputs.
- */
- OpSchema& SameNumberOfOutput();
-
- // Sets the rule to allow optional in-place operation.
- OpSchema& AllowInplace(std::function<bool(int, int)> inplace);
- OpSchema& AllowInplace(set<std::pair<int, int>> inplace);
- OpSchema& AllowOneToOneInplace();
- // Sets the rule to enforce in-place operation.
- OpSchema& EnforceInplace(std::function<bool(int, int)> inplace);
- OpSchema& EnforceInplace(set<std::pair<int, int>> inplace);
- OpSchema& EnforceOneToOneInplace();
-
- // Functions to deal with type and shape inference. Basically, this registers
- // a function that takes in an OperatorDef and a series of input type and
- // shape specified by TensorProto objects (whose data fields are empty), and
- // produces a series of output type and shape.
- typedef std::function<
- vector<TensorShape>(const OperatorDef&, const vector<TensorShape>&)>
- TensorInferenceFunctionType;
-
- /**
- * @brief Sets the tensor inference function, which is a std::function object
- * defined in operator_schema.h.
- */
- OpSchema& TensorInferenceFunction(TensorInferenceFunctionType function);
-
- /**
- * A wrapper that makes an infer tensor function to return unknown
- * shape for all outputs if any one of the inputs has unknown shape
- */
-
- static TensorInferenceFunctionType NeedsAllInputShapes(
- TensorInferenceFunctionType f);
-
- /**
- * @brief Sets the corresponding onnx schema name
- */
- OpSchema& InheritOnnxSchema(const std::string& onnx_schema_name);
-
- /**
- * @brief Shortcut to InheritOnnxSchema(type_)
- */
- OpSchema& InheritOnnxSchema() {
- return InheritOnnxSchema(type_);
- }
-
- /**
- * @brief Sets the tensor inference function to produce the same output as
- * the input.
- */
- OpSchema& IdenticalTypeAndShape();
- OpSchema& IdenticalTypeAndShapeOfInput(int idx);
- OpSchema& IdenticalTypeAndShapeOfInputDim(int idx, int dim);
- OpSchema& IdenticalTypeAndShapeOfMultipleInputs(const vector<int>& indices);
- OpSchema& ScalarType(::caffe2::TensorProto_DataType dt);
-
- /**
- * @brief A function to allow one to infer the type and shape from the op
- * schema.
- */
- inline vector<TensorShape> InferTensor(
- const OperatorDef& def,
- const vector<TensorShape>& input_type_shape) const {
- CAFFE_ENFORCE(
- Verify(def),
- "(InferTensor) Operator def did not pass schema checking: ",
- ProtoDebugString(def));
- return tensor_inference_function_(def, input_type_shape);
- }
-
- /*
- * @brief A struct to store various cost information about
- * an operator such as FLOPs, total memory use and parameters.
- */
- struct Cost {
- uint64_t flops{0}; // Floating point operations.
- uint64_t bytes_read{0}; // Total memory read.
- uint64_t bytes_written{0}; // Total memory written.
- uint64_t params_bytes{0}; // Memory read for parameters.
- };
- /**
- * @brief Registers a function that takes in an OperatorDef
- * and a series of input shapes and returns the total "cost"
- * required to run the operator via struct by value.
- */
- typedef std::function<
- struct Cost(const OperatorDef&, const vector<TensorShape>&)>
- CostInferenceFunctionType;
-
- /**
- * @brief Register the Cost inference function.
- */
- OpSchema& CostInferenceFunction(CostInferenceFunctionType function);
-
-#if 0 // def _MSC_VER
- /**
- * @brief Register the Cost inference function via a pointer.
- */
- template <typename T,
- typename = std::enable_if<
- std::is_same<CostInferenceFunctionType&&, T>:value
- >:type>
- inline OpSchema& CostInferenceFunction(T func) {
- // Note: This is here in order to resolve an MSVC compiler issue: it
- // does not automatically convert a function pointer to a std::function,
- // and needs an explicit conversion.
- return CostInferenceFunction(CostInferenceFunctionType(func));
- }
-#endif // _MSC_VER
-
- bool HasCostInferenceFunction() const {
- return !!cost_inference_function_;
- }
-
- inline struct Cost InferCost(
- const OperatorDef& def,
- const vector<TensorShape>& input_tensor_shape) const {
- CAFFE_ENFORCE(
- cost_inference_function_, "Cost inference function not defined.");
- return (*cost_inference_function_)(def, input_tensor_shape);
- }
-
- // Functions to do documentation for the operator schema.
- OpSchema& SetDoc(const string& doc);
-
- struct Argument {
- Argument(const char* name, const char* description, bool required)
- : name_{name}, description_{description}, required_{required} {}
-
- const char* name() const {
- return name_;
- }
-
- const char* description() const {
- return description_;
- }
-
- bool is_required() const {
- return required_;
- }
-
- private:
- const char* name_;
- const char* description_;
- const bool required_;
- };
-
- OpSchema&
- Arg(const char* name, const char* description, bool required = false);
-
-#define DECLARE_STANDARD_ARG(name, str) \
- static const char* Arg_##name; \
- OpSchema& Arg##name(const char* description);
-
- DECLARE_STANDARD_ARG(IsTest, is_test)
-
-#undef DECLARE_STANDARD_ARG
-
- OpSchema& Input(const int n, const char* name, const char* description);
- OpSchema& Output(const int n, const char* name, const char* description);
- // Calls the passed function with `this` as an argument. Useful for
- // adding docs for templated/macro ops.
- OpSchema& FillUsing(std::function<void(OpSchema&)> populator);
-
- // Remove from documentation
- OpSchema& Private();
-
- // This op can pass data across devices
- OpSchema& InputsCanCrossDevices();
-
- /**
- * @brief A function to allow one to get the number of outputs based on the
- * number of inputs, if this schema supports it.
- */
- int CalculateOutput(int num_input) const;
-
- const std::string& onnx_schema() const {
- return onnx_schema_;
- }
-
- int min_input() const {
- return min_input_;
- }
-
- int max_input() const {
- return max_input_;
- }
-
- int min_output() const {
- return min_output_;
- }
-
- int max_output() const {
- return max_output_;
- }
-
- bool num_inputs_allowed(int x) const {
- return num_inputs_allowed_(x);
- }
-
- bool num_outputs_allowed(int x) const {
- return num_outputs_allowed_(x);
- }
-
- bool num_inputs_outputs_allowed(int x, int y) const {
- return num_inputs_outputs_allowed_(x, y);
- }
-
- int inf() const {
- return std::numeric_limits<int>::max();
- }
-
- bool inplace_enforced(int x, int y) const {
- return inplace_enforced_(x, y);
- }
-
- TORCH_API friend std::ostream& operator<<(
- std::ostream& out,
- const OpSchema& schema);
-
- const std::vector<Argument>& args() const {
- return args_;
- }
-
- const std::vector<std::pair<const char*, const char*>>& input_desc() const {
- return input_desc_;
- }
- const std::vector<std::pair<const char*, const char*>>& output_desc() const {
- return output_desc_;
- }
- bool private_op() {
- return private_;
- }
- bool inputs_can_cross_devices() const {
- return inputs_can_cross_devices_;
- }
-
- /**
- * @brief Returns the required device location of inputs and outputs.
- */
- using DeviceInferenceFunctionType = std::function<
- std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>(
- const OperatorDef& def)>;
-
- OpSchema& DeviceInferenceFunction(DeviceInferenceFunctionType function);
-
- /**
- * @brief Infer required device location of an op's inputs and outputs
- */
- inline std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
- InferDevice(const OperatorDef& def) const {
- return device_inference_function_(def);
- }
-
- // The helper is build sparse input with values, keys, weights and lengths;
- // e.g.:
- // values = [1, 2, 3, 2, 4, 6, 7, 3, 6]
- // keys = [0, 1, 4, 0, 1, 2, 5, 1, 2]
- // weights = [1, 2, 3, 4, 5, 6, 7, 8, 9]
- // \_____/ \________/ \__/
- // lengths = [3, 4, 2]
- OpSchema& WeightedValueKeyLengthInputFillers(
- size_t value_index,
- size_t key_index,
- size_t length_index,
- size_t weight_index);
-
- // The helper is build sparse input with values, keys, weights and lengths;
- // e.g.:
- // values = [1, 2, 3, 2, 4, 6, 7, 3, 6]
- // keys = [0, 1, 4, 0, 1, 2, 5, 1, 2]
- // \_____/ \________/ \__/
- // lengths = [3, 4, 2]
- OpSchema& ValueKeyLengthInputFillers(
- size_t value_index,
- size_t key_index,
- size_t length_index);
-
- // The helper is build sparse input with values and lengths; e.g.:
- // values = [1, 2, 3, 2, 4, 6, 7, 3, 6]
- // \_____/ \________/ \__/
- // lengths = [3, 4, 2]
- OpSchema& ValueLengthInputFillers(size_t value_index, size_t length_index);
-
- OpSchema& DisallowInputFillers();
-
- std::vector<TensorFiller> InputFillers(
- const std::vector<std::vector<int64_t>>& shapes) const;
-
- private:
- std::vector<TensorFiller> SupplyDenseFillers(
- const std::vector<std::vector<int64_t>>& shapes);
-
- private:
- string type_;
- string file_;
- string doc_;
- string onnx_schema_;
- std::vector<Argument> args_{};
- std::vector<std::pair<const char*, const char*>> input_desc_{};
- std::vector<std::pair<const char*, const char*>> output_desc_{};
- int line_ = 0;
- int min_input_ = 0;
- int max_input_ = std::numeric_limits<int>::max();
- int min_output_ = 0;
- int max_output_ = std::numeric_limits<int>::max();
- bool private_ = false;
- bool inputs_can_cross_devices_ = false;
- std::function<bool(int)> num_inputs_allowed_ = [](int) { return true; };
- std::function<bool(int)> num_outputs_allowed_ = [](int) { return true; };
- std::function<bool(int, int)> num_inputs_outputs_allowed_ = [](int, int) {
- return true;
- };
- std::function<int(int)> calculate_output_;
- // In default, any in-place operation is neither allowed nor enforced.
- std::function<bool(int, int)> inplace_allowed_ = [](int, int) {
- return false;
- };
- std::function<bool(int, int)> inplace_enforced_ = [](int, int) {
- return false;
- };
- TensorInferenceFunctionType tensor_inference_function_;
- std::unique_ptr<CostInferenceFunctionType> cost_inference_function_ = nullptr;
- DeviceInferenceFunctionType device_inference_function_;
-
- std::function<std::vector<TensorFiller>(
- const std::vector<std::vector<int64_t>>&)>
- filler_supplier_ =
- [this](const std::vector<std::vector<int64_t>>& shapes) {
- return SupplyDenseFillers(shapes);
- };
-};
-
-/**
- * @brief A registry to hold all the operator schemas.
- */
-class TORCH_API OpSchemaRegistry {
- public:
- static OpSchema&
- NewSchema(const string& key, const string& file, const int line);
-
- static const OpSchema* Schema(const string& key) {
- auto& m = map();
- auto it = m.find(key);
- if (it != m.end()) {
- return &it->second;
- } else {
- return nullptr;
- }
- }
-
- private:
- // OpSchemaRegistry should not need to be instantiated.
- OpSchemaRegistry() = delete;
-
- /**
- * @brief Returns the underlying string to OpSchema map.
- *
- * You should not manually manipulate the map object returned. Instead, use
- * the macros defined such as OPERATOR_SCHEMA to register your operator
- * schema.
- *
- * We wrap it inside a function to avoid the static initialization order
- * fiasco.
- */
- static CaffeMap<string, OpSchema>& map();
-};
-
-// Helper function for creating simple tensorproto with dimension and type
-template <typename T_I = int>
-inline TensorShape CreateTensorShape(
- vector<T_I> dims,
- ::caffe2::TensorProto_DataType dt) {
- TensorShape ts;
- for (T_I d : dims) {
- ts.add_dims(d);
- }
- ts.set_data_type(dt);
- return ts;
-}
-
-// Helper function
-inline vector<int64_t> GetDimsVector(const TensorShape& shape) {
- vector<int64_t> dims;
- for (auto d : shape.dims()) {
- dims.push_back(d);
- }
- return dims;
-}
-
-// Helper function
-inline uint64_t nElemFromDim(const TensorShape& X, int dim = 0) {
- CAFFE_ENFORCE_GE(dim, 0, "Invalid maximum index specified");
-
- uint64_t nElem = 1;
- for (const auto i : c10::irange(dim, X.dims_size())) {
- nElem *= X.dims(i);
- }
- return nElem;
-}
-
-// Helper function
-inline uint64_t nElemBetweenDim(const TensorShape& X, int start, int stop) {
- CAFFE_ENFORCE_GE(start, 0, "Invalid maximum index specified");
- CAFFE_ENFORCE_LE(stop, X.dims_size(), "Invalid maximum index specified");
-
- uint64_t nElem = 1;
- for (const auto i : c10::irange(start, stop)) {
- nElem *= X.dims(i);
- }
- return nElem;
-}
-
-// Helper function for infer op inputs and outputs device information.
-inline std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
-InferOpInputOutputDevice(const OperatorDef& op) {
- auto op_schema = OpSchemaRegistry::Schema(op.type());
- if (op_schema) {
- // op_schema found
- return op_schema->InferDevice(op);
-
- } else {
- // No schema for op.type registered
- auto temp_schema = OpSchema();
- return temp_schema.InferDevice(op);
- }
-}
-
-template <uint64_t OpsPerPoint>
-OpSchema::Cost PointwiseCostInference(
- const OperatorDef& /* unused */,
- const vector<TensorShape>& inputs) {
- struct OpSchema::Cost c;
- const TensorShape X = inputs[0];
- uint64_t nElemX = nElemFromDim(X);
- uint64_t nElemRead = 0;
- for (const auto i : c10::irange(inputs.size())) {
- nElemRead += nElemFromDim(inputs[i]);
- }
-
- c.flops = nElemX * OpsPerPoint;
- auto const& X_element_size_byte =
- DataTypeToTypeMeta(X.data_type()).itemsize();
- c.bytes_read = nElemRead * X_element_size_byte;
- c.bytes_written = nElemX * X_element_size_byte;
- return c;
-}
-
-} // namespace caffe2
-
-#if defined(_MSC_VER)
-#define EXPORT_IF_NOT_MSVC
-#else
-#define EXPORT_IF_NOT_MSVC C10_EXPORT
-#endif
-
-#ifndef CAFFE2_NO_OPERATOR_SCHEMA
-
-#define OPERATOR_SCHEMA(name) \
- EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
- static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \
- &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#else // CAFFE2_NO_OPERATOR_SCHEMA
-
-#define OPERATOR_SCHEMA(name) \
- EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
- static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \
- 1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#endif // CAFFE2_NO_OPERATOR_SCHEMA
-
-#ifdef CAFFE2_NO_GRADIENT_OPS
-
-#define GRADIENT_OPERATOR_SCHEMA(name) \
- EXPORT_IF_NOT_MSVC void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \
- static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \
- 1 ? nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__)
-
-#else
-
-#define GRADIENT_OPERATOR_SCHEMA(name) OPERATOR_SCHEMA(name)
-
-#endif
-#endif // CAFFE2_CORE_OPERATOR_SCHEMA_H_
diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h
deleted file mode 100644
index e9bd6ed..0000000
--- a/caffe2/core/storage.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef CAFFE2_CORE_STORAGE_H_
-#define CAFFE2_CORE_STORAGE_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <fstream>
-#include <sstream>
-#include <type_traits>
-#include <typeinfo>
-#include <vector>
-
-#include "caffe2/core/allocator.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/context.h"
-#include "caffe2/core/flags.h"
-#include "caffe2/core/logging.h"
-#include <c10/util/typeid.h>
-
-#include <c10/core/Allocator.h>
-#include <c10/core/Device.h>
-#include <c10/core/DeviceType.h>
-#include <c10/util/intrusive_ptr.h>
-#include <c10/core/Storage.h>
-#include <c10/core/StorageImpl.h>
-
-namespace caffe2 {
-
-using StorageImpl = at::StorageImpl;
-using Storage = at::Storage;
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_STORAGE_H_
diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h
deleted file mode 100644
index 1171605..0000000
--- a/caffe2/core/tensor.h
+++ /dev/null
@@ -1,674 +0,0 @@
-#ifndef CAFFE2_CORE_TENSOR_H_
-#define CAFFE2_CORE_TENSOR_H_
-
-#include <c10/macros/Macros.h>
-#include "caffe2/core/storage.h"
-
-#include <c10/core/SymIntArrayRef.h>
-#include <ATen/core/UndefinedTensorImpl.h>
-#include <c10/core/TensorOptions.h>
-#include <c10/util/ExclusivelyOwned.h>
-#include <c10/util/ExclusivelyOwnedTensorTraits.h>
-#include <c10/util/intrusive_ptr.h>
-
-C10_CLANG_DIAGNOSTIC_PUSH()
-#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32")
-C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32")
-#endif
-
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-namespace at {
-class Tensor;
-};
-#endif
-namespace caffe2 {
-
-using at::UndefinedTensorImpl;
-
-/**
- * @brief Tensor class holds a shared pointer to the implementation TensorImpl,
- * redirects API calls to TensorImpl;
- * Copying of Tensor results in sharing the same underlying implementation
- * object
- *
- * NB: See TensorImpl for documentation on these methods.
- */
-class TORCH_API Tensor final {
- private:
- enum Unsafe { IDoWantAliasing };
- Tensor(const Tensor& other, Unsafe _) : impl_(other.getIntrusivePtr()) {}
-
- protected:
- using TensorImplPtr = c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
- TensorImplPtr impl_;
-
- void enforce_invariants();
-
- public:
- Tensor() : impl_() {}
-
- Tensor(const Tensor& t) : impl_(t.impl_) {}
- Tensor& operator=(const Tensor& t) {
- impl_ = t.impl_;
- return *this;
- }
-
- Tensor(Tensor&&) = default;
- Tensor& operator=(Tensor&&) = default;
-
- operator bool() const {
- return impl_.defined();
- }
-
- TensorImpl* unsafeGetTensorImpl() const {
- return impl_.get();
- }
-
- TensorImpl* unsafeReleaseTensorImpl() {
- return impl_.release();
- }
-
- Tensor UnsafeSharedInstance() const {
- return Tensor(*this, IDoWantAliasing);
- }
-
- /**
- * @brief Creates a tensor of the given device type.
- *
- * Note that the actual data allocation is not going to be carried out until
- * you resize the tensor and then call mutable_data().
- */
- explicit Tensor(at::Device device)
- : impl_(c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(
- Storage::create_legacy(device),
- c10::computeDispatchKey(c10::nullopt, at::kStrided, device),
- TypeMeta())) {}
-
- /**
- * @brief Creates a tensor of the given dimension.
- *
- * Note that the actual data allocation is not going to be carried out until
- * the first time mutable_data() is called.
- */
- explicit Tensor(at::IntArrayRef dims, DeviceType type) : Tensor(type) {
- // TODO: here, we create a Storage
- // and immediately discard it in Resize() since
- // reset_tensor will be true and FreeMemory will be called,
- // we might want to avoid creating Storage twice?
- Resize(dims);
- }
-
- // we want to preserve index information
- explicit Tensor(at::IntArrayRef dims, at::Device device) : Tensor(device) {
- Resize(dims);
- }
-
- // TODO: remove?
- explicit Tensor(const vector<int>& dims, DeviceType type) : Tensor(type) {
- Resize(dims);
- }
-
- /**
- * @brief: Create a Tensor of at::DeviceType `type` and initialize it with
- * src Tensor
- */
- Tensor(const Tensor& src, DeviceType type) : Tensor(type) {
- CopyFrom(src);
- }
-
- /**
- * @brief Mutual conversion with at::Tensor
- *
- * The tensor will share the same instance (data, strides, sizes, etc) but
- * a different subset of APIs would be available
- */
-#if defined(EXPOSE_C2_OPS) || \
- !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
- explicit Tensor(at::Tensor tensor);
-
- explicit operator at::Tensor() const&;
-
- explicit operator at::Tensor() &&;
-#endif
-
- bool is_same(const Tensor& other) const noexcept {
- return impl_ == other.impl_;
- }
-
- Tensor Clone() const {
- Tensor x(GetDevice());
- x.CopyFrom(*this);
- return x;
- }
-
- /**
- * Clone self as a Tensor that share the same Storage,
- * that is, both Tensors are views on the same Storage.
- * If we change the sizes or strides of one Tensor, it
- * does not affect the other Tensor that it shares Storage
- * with.
- * A similar yet different usage is `Tensor x = y;`, this
- * will make x and y pointing to the same Tensor and resizing
- * one of them will resize the other as well.
- *
- * TODO: Deduplicate this with THTensor_(newWithTensor)
- * (exposed in ATen as at::alias but not otherwise available)
- */
- Tensor Alias() const {
- Tensor x(sizes(), GetDevice());
- if (!dtype_initialized()) {
- C10_LOG_EVERY_MS(WARNING, 1000)
- << "Cloning a tensor that don't have a data type (did you call mutable_data<T> on the tensor?)";
- }
- AT_ASSERTM(
- storage_initialized(),
- "Cloning a tensor that has no content and has size > 0");
- // set_storage already sets data_type_ of TensorImpl
- x.impl_->set_storage_and_dtype(storage(), impl_->dtype());
- x.impl_->set_storage_offset(impl_->storage_offset());
- x.impl_->set_sizes_and_strides(sizes(), strides());
- return x;
- }
-
- DeviceType GetDeviceType() const {
- return impl_->device_type();
- }
-
- at::Device GetDevice() const {
- return impl_.get()->device();
- }
-
- /**
- * @brief Copies the data from a source tensor, with a context provided to
- * carry out the underlying memcpy operation. This method respects
- * caffe2_keep_on_shrink.
- *
- * After CopyFrom, this function guarantees that the destination tensor will
- * have the same initialization state and dtype as src. This function
- * preserves the DeviceType of the source tensor (so, e.g., if you allocate
- * a tensor on CPU and then CopyFrom a CUDA tensor, that will to a
- * CUDA-to-CPU transfer).
- *
- * 'async' parameter triggers async copy for CUDA tensors
- */
- void CopyFrom(const Tensor& src, bool async = false);
-
- /**
- * @brief Extend the outer-most dimension of this tensor
- * to dimension of `num`.
- */
- void ExtendTo(int64_t num, float growthPct) const {
- CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1);
- CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0);
- Extend(num - impl_->size(0), growthPct);
- }
-
- void Extend(int64_t num, float growthPct) const {
- impl_.get()->Extend(num, growthPct);
- }
-
- /**
- * @brief Shrinks the outer-most dimension to given size, keeping the data.
- *
- * This method guarantees that no re-allocations are carried out, which means
- * that the extra capacity after the end of the shrunk tensor is maintained.
- * Notably, this function does NOT respect caffe2_keep_on_shrink.
- */
- void ShrinkTo(int64_t outer_dim) const {
- CAFFE_ENFORCE_WITH_CALLER(
- impl_->is_contiguous(),
- "Right now ShrinkTo is only supported on contiguous Tensor.");
- CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D");
- CAFFE_ENFORCE_WITH_CALLER(
- outer_dim <= impl_->size(0),
- "New outer dimension must be smaller than current.");
- CAFFE_ENFORCE(
- impl_->storage().unique(),
- "Can't call ShrinkTo on shared storage, please call Resize instead.");
- impl_.get()->set_size(0, outer_dim);
- }
-
- template <class T>
- void ReserveSpace(const T& outer_dim) const {
- impl_.get()->ReserveSpace(outer_dim);
- }
-
- template <typename... Ts>
- void Resize(Ts... dim_source) const {
- impl_.get()->Resize(dim_source...);
- }
-
- template <typename T>
- void Resize(const std::vector<T>& dim_source) const {
- impl_.get()->Resize(ArrayRef<T>(dim_source));
- }
-
- /**
- * Resize the tensor like the source tensor. Note that this is just a
- * sugar wrapper that essentially calls Resize(src_tensor.dims()).
- * This method respects caffe2_keep_on_shrink.
- */
- inline void ResizeLike(const Tensor& src_tensor) const {
- CAFFE_ENFORCE_WITH_CALLER(
- src_tensor.is_contiguous(),
- "Right now ResizeLike is only supported for contiguous Tensor.");
- if (impl_ != src_tensor.impl_) {
- impl_.get()->Resize(src_tensor.sizes());
- }
- }
-
- inline void Reshape(const vector<int64_t>& dims) const {
- impl_.get()->Reshape(dims);
- }
-
- inline void Reshape(const vector<int>& dims) const {
- impl_.get()->Reshape(ToVectorint64_t(dims));
- }
-
- inline void FreeMemory() const {
- impl_.get()->FreeMemory();
- }
-
- /**
- * A utility function to print the debug string for the tensor. Note that this
- * is very slow since it involves quite some string operations, so do not use
- * it in your performance-critical code.
- */
- string DebugString() const {
- std::stringstream ss;
- ss << "A Tensor of item size " << impl_->dtype().itemsize() << " and type "
- << impl_->dtype().name() << " and dimension (";
- for (int d : impl_->sizes()) {
- ss << d << ",";
- }
- ss << ").";
- return ss.str();
- }
-
- // To be deprecated
- void ShareData(const Tensor& src) const {
- impl_.get()->ShareData(*src.impl_.get());
- }
-
- /**
- * @brief Shares the data with an externally managed pointer.
- *
- * This is similar to ShareData() but the source is a pointer with an advanced
- * deleter option. In default, no deletion takes place, and one needs to make
- * sure that the external memory is deallocated only after the tensor finishes
- * using it. If a Deleter object is passed in, when this tensor is reallocated
- * or freed, the deleter function is going to be called.
- */
- template <typename T>
- void ShareExternalPointer(
- T* src,
- size_t nbytes = 0,
- MemoryDeleter d = nullptr) const {
- ShareExternalPointer((void*)src, caffe2::TypeMeta::Make<T>(), nbytes, d);
- }
-
- template <typename T>
- void ShareExternalPointer(at::DataPtr&& data_ptr, size_t nbytes = 0) const {
- ShareExternalPointer(
- std::move(data_ptr), caffe2::TypeMeta::Make<T>(), nbytes);
- }
-
- void ShareExternalPointer(
- void* src,
- const TypeMeta data_type,
- size_t nbytes = 0,
- MemoryDeleter d = nullptr) const {
- CAFFE_ENFORCE_WITH_CALLER(
- impl_->is_contiguous(),
- "Right now ShareExternalPointer is only supported for contiguous Tensor.");
- CAFFE_ENFORCE_WITH_CALLER(
- data_type != ScalarType::Undefined,
- "To share with a raw external pointer you need to pass in an "
- "initialized data_type(TypeMeta).");
- impl_.get()->ShareExternalPointer(
- at::DataPtr(src, src, d, impl_->device_type()), data_type, nbytes);
- }
-
- void ShareExternalPointer(
- at::DataPtr&& data_ptr,
- const TypeMeta data_type,
- size_t nbytes) {
- impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, nbytes);
- }
-
- const c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>& getIntrusivePtr()
- const {
- return impl_;
- }
-
- bool defined() const {
- return impl_;
- }
-
- /**
- * Returns a raw void* pointer of the underlying storage. mutable_data()
- * or raw_mutable_data() must have been called prior to this function call.
- */
- inline void* raw_data() const {
- return impl_->mutable_data();
- }
-
- template <typename T>
- inline T* data() const {
- return impl_.get()->mutable_data_dtype_initialized<T>();
- }
-
- inline void* raw_mutable_data(const TypeMeta meta) const {
- return impl_.get()->raw_mutable_data(meta);
- }
-
- /**
- * Returns a mutable raw pointer of the underlying storage. This can only be
- * used when you know for sure that the underlying storage of the tensor is
- * already created via an earlier raw_mutable_data(meta) call or a
- * mutable_data<T>() call.
- *
- * If the existing data does not match the desired type, it will be deleted
- * and a new storage will be created.
- */
- inline void* raw_mutable_data() const {
- const auto& data_type = impl_->dtype();
- CAFFE_ENFORCE_WITH_CALLER(
- data_type != ScalarType::Undefined,
- "Calling raw_mutable_data() without meta, but the current meta is "
- "of unknown type.");
- return raw_mutable_data(data_type);
- }
-
- template <typename T>
- inline T* mutable_data() const {
- return impl_.get()->mutable_data<T>();
- }
-
- /**
- * Returns the number of dimensions of the data.
- */
- inline int dim() const {
- return impl_->dim();
- }
-
- /**
- * (To be deprecated) Returns the number of dimensions of the data.
- */
- inline int ndim() const {
- return impl_->dim();
- }
-
- /**
- * (To be deprecated) Returns the size (i.e. the number of items) of the
- * tensor.
- */
- inline int64_t size() const {
- return impl_->numel();
- }
-
- /**
- * Returns the number of items of the tensor.
- */
- inline int64_t numel() const {
- return impl_->numel();
- }
-
- /**
- * Return the number of bytes each item takes in the tensor.
- */
- inline size_t itemsize() const {
- return impl_->dtype().itemsize();
- }
-
- /**
- * Returns the total number of bytes of the storage.
- *
- * This is equivalent to calling size() * itemsize().
- */
- inline size_t nbytes() const {
- return impl_->numel() * itemsize();
- }
-
- inline at::IntArrayRef sizes() const {
- return impl_.get()->sizes();
- }
-
- inline c10::SymIntArrayRef sym_sizes() const {
- return impl_->sym_sizes();
- }
-
- inline c10::SymInt sym_numel() const {
- return impl_->sym_numel();
- }
-
- inline c10::SymIntArrayRef sym_strides() const {
- return impl_->sym_strides();
- }
-
- inline int64_t size_from_dim(int k) const {
- return size_from_dim_(k, impl_->sizes());
- }
-
- inline int64_t size_to_dim(int k) const {
- return size_to_dim_(k, impl_->sizes());
- }
-
- inline int64_t size_between_dim(int k, int l) const {
- return size_between_dim_(k, l, impl_->sizes());
- }
-
- /**
- * Returns the 'canonical' version of a (usually) user-specified axis,
- * allowing for negative indexing (e.g., -1 for the last axis).
- *
- * @param axis_index the axis index.
- * If 0 <= index < dim(), return index.
- * If -ndim <= index <= -1, return (dim() - (-index)),
- * e.g., the last axis index (dim() - 1) if index == -1,
- * the second to last if index == -2, etc.
- * Dies on out of range index.
- */
- inline int canonical_axis_index(int axis_index) const {
- return canonical_axis_index_(axis_index, impl_->dim());
- }
-
- inline int64_t stride(int64_t dim) const {
- return impl_.get()->stride(dim);
- }
-
- inline at::IntArrayRef strides() const {
- return impl_.get()->strides();
- }
-
- inline bool is_contiguous(
- at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const {
- return impl_.get()->is_contiguous(memory_format);
- }
-
- /**
- * Checks if the tensor content is of the given data type.
- */
- template <typename T>
- inline bool IsType() const {
- return impl_->dtype().Match<T>();
- }
-
- /**
- * Returns the TypeMeta object associated with the current data type.
- */
- inline const TypeMeta dtype() const {
- return impl_->dtype();
- }
-
- /**
- * (To be deprecated) Returns the TypeMeta object associated with the current
- * data type.
- */
- inline const TypeMeta meta() const {
- return impl_->dtype();
- }
-
- /**
- * Returns the i-th dimension of the tensor in int.
- *
- * This function returns an int value instead of int64_t, which depending on
- * the typedef could be int64. If you want int64 dim values, make sure you
- * call dim() instead.
- */
- inline int dim32(const int i) const {
-#ifndef NDEBUG
- CAFFE_ENFORCE_LT_WITH_CALLER(
- i, static_cast<int>(impl_->dim()), "Exceeding ndim limit");
- CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index");
-#endif
- // Avoid TensorImpl::size() because it is a virtual call that
- // supports out-of-range indexing like Python.
- auto s = impl_->sizes()[i];
- CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits<int>::max());
- return static_cast<int>(s);
- }
-
- inline int64_t size(const int i) const {
- return impl_->size(i);
- }
-
- // To be deprecated
- inline int64_t dim(const int i) const {
- return impl_->size(i);
- }
-
- const Storage& storage() {
- return impl_->storage();
- }
-
- const Storage& storage() const {
- return impl_->storage();
- }
-
- bool storage_initialized() const {
- return impl_->storage_initialized();
- }
-
- bool dtype_initialized() const {
- return impl_->dtype_initialized();
- }
-};
-
-/**
- * Reinitialize a Tensor to given dims and options if necessary, note that
- * this will not do anything if the
- * Tensor already has correct size and data type
- */
-TORCH_API void
-ReinitializeTensor(Tensor* t, at::IntArrayRef dims, at::TensorOptions options);
-
-TORCH_API void ReinitializeAndCopyFrom(
- Tensor* t,
- at::TensorOptions options,
- const Tensor& src,
- bool async = false);
-
-using TensorCPU = Tensor;
-
-constexpr int k_limit_default_ = 1000;
-
-// TODO: the following logic can be merged into regular Tensor class methods
-// after MKLMemory starts to implement Tensor interface
-
-// Type call registry
-typedef TypeMeta (*TypeCall)(const void*);
-TypeCall GetTypeCallFunction(TypeIdentifier id);
-void RegisterTypeCallFunction(TypeIdentifier id, TypeCall c);
-
-// Shape call registry
-typedef vector<int64_t> (
- *TensorInfoCall)(const void*, size_t* capacity, DeviceOption* device);
-TensorInfoCall GetTensorInfoFunction(TypeIdentifier id);
-void RegisterTensorInfoFunction(TypeIdentifier id, TensorInfoCall c);
-
-// resize helper function
-void TensorVectorResize(
- std::vector<Tensor>& tensors,
- int size,
- DeviceType type);
-
-// Tensor factory function
-TORCH_API Tensor empty(at::IntArrayRef dims, at::TensorOptions options);
-
-/**
- * @brief Creates a CPU tensor, and fills its contents with the given values.
- * Values are copied in
- */
-// TODO: can be unified with at::from_blob when Tensor is merged and string
-// types are supported
-template <typename T>
-Tensor TensorCPUFromValues(at::IntArrayRef dims, at::ArrayRef<T> values) {
- Tensor r = empty(dims, at::device(CPU).dtype<T>());
- CAFFE_ENFORCE_EQ(values.size(), r.numel());
- CPUContext context;
- context.CopyItemsFromCPU(
- r.dtype(), values.size(), values.data(), r.mutable_data<T>());
- return r;
-}
-
-vector<int64_t>
-GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device);
-
-class TORCH_API TensorPrinter {
- public:
- explicit TensorPrinter(
- const std::string& tensor_name = "",
- const std::string& file_name = "",
- int limit = k_limit_default_);
- ~TensorPrinter();
-
- template <class T>
- void Print(const Tensor& tensor);
-
- void PrintMeta(const Tensor& tensor);
-
- string MetaStr(const Tensor& tensor);
-
- private:
- bool to_file_;
- int limit_;
- std::unique_ptr<std::ofstream> log_file_;
- std::string tensor_name_;
-};
-
-template <class T>
-void TensorPrinter::Print(const Tensor& tensor) {
- std::stringstream values_stream;
- // One most likely doesn't want to print int64-number of items for visual
- // inspection, so we cast down to int here.
- int total_count = static_cast<int>(std::min(tensor.numel(), int64_t(limit_)));
-
- const T* tensor_data = tensor.template data<T>();
- for (int i = 0; i < total_count - 1; ++i) {
- values_stream << tensor_data[i] << ",";
- }
- if (total_count) {
- // We do not add a comma after the last item.
- values_stream << tensor_data[total_count - 1];
- }
-
- if (to_file_) {
- (*log_file_) << MetaStr(tensor) << values_stream.str() << std::endl;
- } else {
- // Log to console.
- LOG(INFO) << MetaStr(tensor) << values_stream.str();
- }
-}
-
-CAFFE_DECLARE_KNOWN_TYPE(Tensor, Caffe2Tensor)
-} // namespace caffe2
-
-C10_CLANG_DIAGNOSTIC_POP()
-
-namespace c10 {
-template <>
-struct ExclusivelyOwnedTraits<caffe2::Tensor> : public c10::ExclusivelyOwnedTensorTraits<caffe2::Tensor> {};
-} // namespace c10
-#endif // CAFFE2_CORE_TENSOR_H_
diff --git a/caffe2/core/tensor_int8.h b/caffe2/core/tensor_int8.h
deleted file mode 100644
index b95b7b8..0000000
--- a/caffe2/core/tensor_int8.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef CAFFE2_TENSOR_INT8_H_
-#define CAFFE2_TENSOR_INT8_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-namespace caffe2 {
-namespace int8 {
-
-struct Int8TensorCPU {
- float scale{1.0};
- int32_t zero_point{0};
- // Generally stores uint8_t data, but sometimes int32_t (e.g. bias
- // parameters).
- Tensor t{CPU};
-};
-} // namespace int8
-} // namespace caffe2
-
-#endif // CAFFE2_TENSOR_INT8_H_
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h
deleted file mode 100644
index 04fa86f..0000000
--- a/caffe2/core/workspace.h
+++ /dev/null
@@ -1,342 +0,0 @@
-#ifndef CAFFE2_CORE_WORKSPACE_H_
-#define CAFFE2_CORE_WORKSPACE_H_
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/observer.h"
-
-#include <climits>
-#include <cstddef>
-#include <mutex>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "c10/util/Registry.h"
-#include "caffe2/core/blob.h"
-#include "caffe2/core/net.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/utils/signal_handler.h"
-#include "caffe2/utils/threadpool/ThreadPool.h"
-
-C10_DECLARE_bool(caffe2_print_blob_sizes_at_exit);
-
-namespace caffe2 {
-
-class NetBase;
-
-struct TORCH_API StopOnSignal {
- StopOnSignal()
- : handler_(std::make_shared<SignalHandler>(
- SignalHandler::Action::STOP,
- SignalHandler::Action::STOP)) {}
-
- StopOnSignal(const StopOnSignal& other) : handler_(other.handler_) {}
-
- bool operator()(int /*iter*/) {
- return handler_->CheckForSignals() != SignalHandler::Action::STOP;
- }
-
- std::shared_ptr<SignalHandler> handler_;
-};
-
-/**
- * Workspace is a class that holds all the related objects created during
- * runtime: (1) all blobs, and (2) all instantiated networks. It is the owner of
- * all these objects and deals with the scaffolding logistics.
- */
-class TORCH_API Workspace {
- public:
- typedef std::function<bool(int)> ShouldContinue;
- /**
- * Initializes an empty workspace.
- */
- Workspace() : Workspace(".", nullptr) {}
-
- /**
- * Initializes an empty workspace with the given root folder.
- *
- * For any operators that are going to interface with the file system, such
- * as load operators, they will write things under this root folder given
- * by the workspace.
- */
- explicit Workspace(const string& root_folder)
- : Workspace(root_folder, nullptr) {}
-
- /**
- * Initializes a workspace with a shared workspace.
- *
- * When we access a Blob, we will first try to access the blob that exists
- * in the local workspace, and if not, access the blob that exists in the
- * shared workspace. The caller keeps the ownership of the shared workspace
- * and is responsible for making sure that its lifetime is longer than the
- * created workspace.
- */
- explicit Workspace(const Workspace* shared) : Workspace(".", shared) {}
-
- /**
- * Initializes workspace with parent workspace, blob name remapping
- * (new name -> parent blob name), no other blobs are inherited from
- * parent workspace
- */
- Workspace(
- const Workspace* shared,
- const std::unordered_map<string, string>& forwarded_blobs)
- : Workspace(".", nullptr) {
- CAFFE_ENFORCE(shared, "Parent workspace must be specified");
- for (const auto& forwarded : forwarded_blobs) {
- CAFFE_ENFORCE(
- shared->HasBlob(forwarded.second),
- "Invalid parent workspace blob: ",
- forwarded.second);
- forwarded_blobs_[forwarded.first] =
- std::make_pair(shared, forwarded.second);
- }
- }
-
- /**
- * Initializes a workspace with a root folder and a shared workspace.
- */
- Workspace(const string& root_folder, const Workspace* shared)
- : root_folder_(root_folder), shared_(shared), bookkeeper_(bookkeeper()) {
- std::lock_guard<std::mutex> guard(bookkeeper_->wsmutex);
- bookkeeper_->workspaces.insert(this);
- }
-
- ~Workspace() {
- if (FLAGS_caffe2_print_blob_sizes_at_exit) {
- PrintBlobSizes();
- }
- // This is why we have a bookkeeper_ shared_ptr instead of a naked static! A
- // naked static makes us vulnerable to out-of-order static destructor bugs.
- std::lock_guard<std::mutex> guard(bookkeeper_->wsmutex);
- bookkeeper_->workspaces.erase(this);
- }
-
- /**
- * Adds blob mappings from workspace to the blobs from parent workspace.
- * Creates blobs under possibly new names that redirect read/write operations
- * to the blobs in the parent workspace.
- * Arguments:
- * parent - pointer to parent workspace
- * forwarded_blobs - map from new blob name to blob name in parent's
- * workspace skip_defined_blob - if set skips blobs with names that already
- * exist in the workspace, otherwise throws exception
- */
- void AddBlobMapping(
- const Workspace* parent,
- const std::unordered_map<string, string>& forwarded_blobs,
- bool skip_defined_blobs = false);
-
- /**
- * Converts previously mapped tensor blobs to local blobs, copies values from
- * parent workspace blobs into new local blobs. Ignores undefined blobs.
- */
- template <class Context>
- void CopyForwardedTensors(const std::unordered_set<std::string>& blobs) {
- for (const auto& blob : blobs) {
- auto it = forwarded_blobs_.find(blob);
- if (it == forwarded_blobs_.end()) {
- continue;
- }
- const auto& ws_blob = it->second;
- const auto* parent_ws = ws_blob.first;
- auto* from_blob = parent_ws->GetBlob(ws_blob.second);
- CAFFE_ENFORCE(from_blob);
- CAFFE_ENFORCE(
- from_blob->template IsType<Tensor>(),
- "Expected blob with tensor value",
- ws_blob.second);
- forwarded_blobs_.erase(blob);
- auto* to_blob = CreateBlob(blob);
- CAFFE_ENFORCE(to_blob);
- const auto& from_tensor = from_blob->template Get<Tensor>();
- auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType());
- to_tensor->CopyFrom(from_tensor);
- }
- }
-
- /**
- * Return list of blobs owned by this Workspace, not including blobs
- * shared from parent workspace.
- */
- vector<string> LocalBlobs() const;
-
- /**
- * Return a list of blob names. This may be a bit slow since it will involve
- * creation of multiple temp variables. For best performance, simply use
- * HasBlob() and GetBlob().
- */
- vector<string> Blobs() const;
-
- /**
- * Return the root folder of the workspace.
- */
- const string& RootFolder() { return root_folder_; }
- /**
- * Checks if a blob with the given name is present in the current workspace.
- */
- inline bool HasBlob(const string& name) const {
- // First, check the local workspace,
- // Then, check the forwarding map, then the parent workspace
- if (blob_map_.count(name)) {
- return true;
- }
-
- auto it = forwarded_blobs_.find(name);
- if (it != forwarded_blobs_.end()) {
- const auto parent_ws = it->second.first;
- const auto& parent_name = it->second.second;
- return parent_ws->HasBlob(parent_name);
- }
-
- if (shared_) {
- return shared_->HasBlob(name);
- }
-
- return false;
- }
-
- void PrintBlobSizes();
-
- /**
- * Creates a blob of the given name. The pointer to the blob is returned, but
- * the workspace keeps ownership of the pointer. If a blob of the given name
- * already exists, the creation is skipped and the existing blob is returned.
- */
- Blob* CreateBlob(const string& name);
- /**
- * Similar to CreateBlob(), but it creates a blob in the local workspace even
- * if another blob with the same name already exists in the parent workspace
- * -- in such case the new blob hides the blob in parent workspace. If a blob
- * of the given name already exists in the local workspace, the creation is
- * skipped and the existing blob is returned.
- */
- Blob* CreateLocalBlob(const string& name);
- /**
- * Remove the blob of the given name. Return true if removed and false if
- * not exist.
- * Will NOT remove from the shared workspace.
- */
- bool RemoveBlob(const string& name);
- /**
- * Gets the blob with the given name as a const pointer. If the blob does not
- * exist, a nullptr is returned.
- */
- const Blob* GetBlob(const string& name) const;
- /**
- * Gets the blob with the given name as a mutable pointer. If the blob does
- * not exist, a nullptr is returned.
- */
- Blob* GetBlob(const string& name);
-
- /**
- * Renames a local workspace blob. If blob is not found in the local blob list
- * or if the target name is already present in local or any parent blob list
- * the function will throw.
- */
- Blob* RenameBlob(const string& old_name, const string& new_name);
-
- /**
- * Creates a network with the given NetDef, and returns the pointer to the
- * network. If there is anything wrong during the creation of the network, a
- * nullptr is returned. The Workspace keeps ownership of the pointer.
- *
- * If there is already a net created in the workspace with the given name,
- * CreateNet will overwrite it if overwrite=true is specified. Otherwise, an
- * exception is thrown.
- */
- NetBase* CreateNet(const NetDef& net_def, bool overwrite = false);
- NetBase* CreateNet(
- const std::shared_ptr<const NetDef>& net_def,
- bool overwrite = false);
- /**
- * Gets the pointer to a created net. The workspace keeps ownership of the
- * network.
- */
- NetBase* GetNet(const string& net_name);
- /**
- * Deletes the instantiated network with the given name.
- */
- void DeleteNet(const string& net_name);
- /**
- * Finds and runs the instantiated network with the given name. If the network
- * does not exist or there are errors running the network, the function
- * returns false.
- */
- bool RunNet(const string& net_name);
-
- /**
- * Returns a list of names of the currently instantiated networks.
- */
- vector<string> Nets() const {
- vector<string> names;
- for (auto& entry : net_map_) {
- names.push_back(entry.first);
- }
- return names;
- }
-
- /**
- * Runs a plan that has multiple nets and execution steps.
- */
- bool RunPlan(const PlanDef& plan_def,
- ShouldContinue should_continue = StopOnSignal{});
-
- /*
- * Returns a CPU threadpool instance for parallel execution of
- * work. The threadpool is created lazily; if no operators use it,
- * then no threadpool will be created.
- */
- ThreadPool* GetThreadPool();
-
- // RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
- // between RunNet and RunNetOnce lies in the fact that RunNet allows you to
- // have a persistent net object, while RunNetOnce creates a net and discards
- // it on the fly - this may make things like database read and random number
- // generators repeat the same thing over multiple calls.
- bool RunOperatorOnce(const OperatorDef& op_def);
- bool RunNetOnce(const NetDef& net_def);
-
- /**
- * Applies a function f on each workspace that currently exists.
- *
- * This function is thread safe and there is no race condition between
- * workspaces being passed to f in this thread and destroyed in another.
- */
- template <typename F>
- static void ForEach(F f) {
- auto bk = bookkeeper();
- std::lock_guard<std::mutex> guard(bk->wsmutex);
- for (Workspace* ws : bk->workspaces) {
- f(ws);
- }
- }
-
- public:
- std::atomic<int> last_failed_op_net_position{};
-
- private:
- struct Bookkeeper {
- std::mutex wsmutex;
- std::unordered_set<Workspace*> workspaces;
- };
-
- static std::shared_ptr<Bookkeeper> bookkeeper();
-
- std::unordered_map<string, unique_ptr<Blob>> blob_map_;
- const string root_folder_;
- const Workspace* shared_;
- std::unordered_map<string, std::pair<const Workspace*, string>>
- forwarded_blobs_;
- std::unique_ptr<ThreadPool> thread_pool_;
- std::mutex thread_pool_creation_mutex_;
- std::shared_ptr<Bookkeeper> bookkeeper_;
- std::unordered_map<string, unique_ptr<NetBase>> net_map_;
-
- C10_DISABLE_COPY_AND_ASSIGN(Workspace);
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_CORE_WORKSPACE_H_
diff --git a/caffe2/utils/GpuAtomics.cuh b/caffe2/utils/GpuAtomics.cuh
deleted file mode 100644
index 2bbcc14..0000000
--- a/caffe2/utils/GpuAtomics.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_ATOMICS_H_
-#define CAFFE2_UTILS_GPU_ATOMICS_H_
-
-#include <cuda_runtime.h>
-
-namespace caffe2 {
-
-namespace {
-
-template <typename T>
-inline __device__ void gpu_atomic_add(T* address, const T val) {
- atomicAdd(address, val);
-}
-
-template <>
-inline __device__ void gpu_atomic_add(float* address, const float val) {
-#if defined(USE_ROCM) && defined(__gfx908__)
- atomicAddNoRet(address, val);
-#else
- atomicAdd(address, val);
-#endif
-}
-
-} // namespace
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_GPU_ATOMICS_H_
diff --git a/caffe2/utils/GpuBitonicSort.cuh b/caffe2/utils/GpuBitonicSort.cuh
deleted file mode 100644
index 45cb298..0000000
--- a/caffe2/utils/GpuBitonicSort.cuh
+++ /dev/null
@@ -1,178 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_BITONIC_SORT_H_
-#define CAFFE2_UTILS_GPU_BITONIC_SORT_H_
-
-#include "caffe2/utils/math.h"
-#include "caffe2/utils/GpuDefs.cuh"
-
-namespace caffe2 {
-
-// Returns true if the given integer type is a power-of-2 (positive only)
-// Note(jiayq): windows reported an error per
-// https://github.com/caffe2/caffe2/issues/997
-// and as a result will make it a macro.
-#ifdef _MSC_VER
-#define integerIsPowerOf2(v) ((v) && !((v) & ((v) - 1)))
-#else // _MSC_VER
-template <typename T>
-constexpr bool integerIsPowerOf2(T v) {
- return (v && !(v & (v - 1)));
-}
-#endif // _MSC_VER
-
-/// The maximum in-block bitonic sort we support
-constexpr int kMaxBitonicSortSize = 4096;
-
-template <typename T>
-__device__ inline void swapVars(T& t1, T& t2) {
- T tmp = t1;
- t1 = t2;
- t2 = tmp;
-}
-
-template <typename Comparator, typename K, typename V>
-__device__ inline void bitonicSwap(K& kA, V& vA,
- K& kB, V& vB,
- bool dir,
- const Comparator& comp) {
- bool swap = comp(kA, vA, kB, vB);
- if (swap == dir) {
- swapVars(kA, kB);
- swapVars(vA, vB);
- }
-};
-
-template <typename Comparator, typename K, typename V,
- int Power2SortSize,
- int ThreadsPerBlock>
-__device__ inline void bitonicSort(K* keys,
- V* values,
- const Comparator& comp) {
- static_assert(Power2SortSize <= kMaxBitonicSortSize,
- "sort size <= 4096 only supported");
- // Assume the sort is taking place in shared memory
- // static_assert(Power2SortSize * (sizeof(K) + sizeof(V)) < 32768,
- // "sort data too large (>32768 bytes)");
- static_assert(integerIsPowerOf2(Power2SortSize),
- "sort size must be power of 2");
- static_assert(integerIsPowerOf2(ThreadsPerBlock),
- "threads in block must be power of 2");
-
- // If what we are sorting is too small, then not all threads
- // participate
- constexpr int numThreadsForSort = Power2SortSize / 2;
- constexpr bool allThreads = numThreadsForSort >= ThreadsPerBlock;
-
- // If what we are sorting is too large, then threads must loop more
- // than once
- constexpr int loopPerThread =
- allThreads ? numThreadsForSort / ThreadsPerBlock : 1;
-
-#pragma unroll
- for (int size = 2; size < Power2SortSize; size *= 2) {
-
-#pragma unroll
- for (int stride = size / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
- for (int loop = 0; loop < loopPerThread; ++loop) {
- int threadId = loop * ThreadsPerBlock + threadIdx.x;
- bool flag = ((threadId & (size / 2)) != 0);
-
- int pos = 2 * threadId - (threadId & (stride - 1));
-
- if (allThreads || (threadId < numThreadsForSort)) {
- bitonicSwap<Comparator, K, V>(
- keys[pos], values[pos],
- keys[pos + stride], values[pos + stride],
- flag, comp);
- }
-
- __syncthreads();
- }
- }
- }
-
-#pragma unroll
- for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
- for (int loop = 0; loop < loopPerThread; ++loop) {
- int threadId = loop * ThreadsPerBlock + threadIdx.x;
-
- int pos = 2 * threadId - (threadId & (stride - 1));
-
- if (allThreads || (threadId < numThreadsForSort)) {
- bitonicSwap<Comparator, K, V>(
- keys[pos], values[pos],
- keys[pos + stride], values[pos + stride],
- false, comp);
- }
-
- __syncthreads();
- }
- }
-}
-
-template <typename Comparator, typename K, typename V, int Power2SortSize>
-__device__ inline void warpBitonicSort(K* keys,
- V* values,
- const Comparator& comp) {
- // Smaller sorts should use a warp shuffle sort
- static_assert(Power2SortSize > kWarpSize,
- "sort not large enough");
- static_assert(integerIsPowerOf2(Power2SortSize),
- "sort size must be power of 2");
- static_assert(Power2SortSize <= kMaxBitonicSortSize,
- "sort size <= 4096 only supported");
-
- // If what we are sorting is too large, then lanes must loop more
- // than once
- constexpr int loopPerThread = (Power2SortSize / 2) / kWarpSize;
- int laneId = getLaneId();
-
-#pragma unroll
- for (int size = 2; size < Power2SortSize; size *= 2) {
-
-#pragma unroll
- for (int stride = size / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
- for (int loop = 0; loop < loopPerThread; ++loop) {
- int threadId = loop * kWarpSize + laneId;
- bool flag = ((threadId & (size / 2)) != 0);
-
- int pos = 2 * threadId - (threadId & (stride - 1));
-
- bitonicSwap<Comparator, K, V>(
- keys[pos], values[pos],
- keys[pos + stride], values[pos + stride],
- flag, comp);
-
- __threadfence_block();
- }
- }
- }
-
-#pragma unroll
- for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
-
-#pragma unroll
- for (int loop = 0; loop < loopPerThread; ++loop) {
- int threadId = loop * kWarpSize + laneId;
-
- int pos = 2 * threadId - (threadId & (stride - 1));
-
- bitonicSwap<Comparator, K, V>(
- keys[pos], values[pos],
- keys[pos + stride], values[pos + stride],
- false, comp);
-
- __threadfence_block();
- }
- }
-}
-
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_GPU_BITONIC_SORT_H_
diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh
deleted file mode 100644
index fcf2c64..0000000
--- a/caffe2/utils/GpuDefs.cuh
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_DEFS_H_
-#define CAFFE2_UTILS_GPU_DEFS_H_
-
-#include <cuda_runtime.h>
-
-namespace caffe2 {
-
-// Static definition of GPU warp size for unrolling and code generation
-
-#if defined(USE_ROCM)
-constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h)
-#else
-constexpr int kWarpSize = 32;
-#endif // __CUDA_ARCH__
-
-//
-// Interfaces to PTX instructions for which there appears to be no
-// intrinsic
-//
-
-template <typename T>
-struct Bitfield {};
-
-template <>
-struct Bitfield<unsigned int> {
- static __device__ __forceinline__
- unsigned int getBitfield(unsigned int val, int pos, int len) {
-#if defined(USE_ROCM)
- pos &= 0xff;
- len &= 0xff;
-
- unsigned int m = (1u << len) - 1u;
- return (val >> pos) & m;
-#else
- unsigned int ret;
- asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
- return ret;
-#endif // USE_ROCM
- }
-
- static __device__ __forceinline__
- unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
-#if defined(USE_ROCM)
- pos &= 0xff;
- len &= 0xff;
-
- unsigned int m = (1u << len) - 1u;
- toInsert &= m;
- toInsert <<= pos;
- m <<= pos;
-
- return (val & ~m) | toInsert;
-#else
- unsigned int ret;
- asm("bfi.b32 %0, %1, %2, %3, %4;" :
- "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
- return ret;
-#endif // USE_ROCM
- }
-};
-
-template <>
-struct Bitfield<unsigned long long int> {
- static __device__ __forceinline__
- unsigned long long int getBitfield(unsigned long long int val, int pos, int len) {
-#if defined(USE_ROCM)
- pos &= 0xff;
- len &= 0xff;
-
- unsigned long long int m = (1u << len) - 1u;
- return (val >> pos) & m;
-#else
- unsigned long long int ret;
- asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
- return ret;
-#endif // USE_ROCM
- }
-
- static __device__ __forceinline__
- unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) {
-#if defined(USE_ROCM)
- pos &= 0xff;
- len &= 0xff;
-
- unsigned long long int m = (1u << len) - 1u;
- toInsert &= m;
- toInsert <<= pos;
- m <<= pos;
-
- return (val & ~m) | toInsert;
-#else
- unsigned long long int ret;
- asm("bfi.b64 %0, %1, %2, %3, %4;" :
- "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
- return ret;
-#endif // USE_ROCM
- }
-};
-
-__device__ __forceinline__ int getLaneId() {
-#if defined(USE_ROCM)
- return __lane_id();
-#else
- int laneId;
- asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
- return laneId;
-#endif // USE_ROCM
-}
-
-#if defined(USE_ROCM)
-__device__ __forceinline__ unsigned long long int getLaneMaskLt() {
- unsigned long long int m = (1ull << getLaneId()) - 1ull;
- return m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
- unsigned long long int m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
- return m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskGt() {
- unsigned long long int m = getLaneMaskLe();
- return m ? ~m : m;
-}
-
-__device__ __forceinline__ unsigned long long int getLaneMaskGe() {
- unsigned long long int m = getLaneMaskLt();
- return ~m;
-}
-#else
-__device__ __forceinline__ unsigned getLaneMaskLt() {
- unsigned mask;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
- return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskLe() {
- unsigned mask;
- asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
- return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskGt() {
- unsigned mask;
- asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
- return mask;
-}
-
-__device__ __forceinline__ unsigned getLaneMaskGe() {
- unsigned mask;
- asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
- return mask;
-}
-#endif // USE_ROCM
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_GPU_DEFS_H_
diff --git a/caffe2/utils/GpuScanUtils.cuh b/caffe2/utils/GpuScanUtils.cuh
deleted file mode 100644
index 0f6823d..0000000
--- a/caffe2/utils/GpuScanUtils.cuh
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef CAFFE2_UTILS_GPU_SCAN_UTILS_H_
-#define CAFFE2_UTILS_GPU_SCAN_UTILS_H_
-
-#include "caffe2/utils/GpuDefs.cuh"
-
-namespace caffe2 {
-
-// from the cutorch library; can probably be replaced with their CUB
-// equivalents
-// Collection of in-kernel scan / prefix sum utilities
-
-// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
- // FIXME: this is a slow, simple implementation; need up/down sweep,
- // prevent smem conflicts
- smem[threadIdx.x] = in;
-
- __syncthreads();
-
- for (int offset = 1; offset < blockDim.x; offset *= 2) {
- T val = 0;
-
- if (threadIdx.x >= offset) {
- val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
- }
-
- __syncthreads();
- if (threadIdx.x >= offset) {
- smem[threadIdx.x] = val;
- }
-
- __syncthreads();
- }
-
- *out = smem[threadIdx.x];
-
- // Prevent write-after-read dependencies on smem usage above if necessary
- if (KillWARDependency) {
- __syncthreads();
- }
-}
-
-// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
- // FIXME: crappy implementation
- // We kill write-after-read dependencies separately below, hence the `false`
- inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
- *out -= in;
- *carry = smem[blockDim.x - 1];
-
- // Prevent write-after-read dependencies on smem usage above if necessary
- if (KillWARDependency) {
- __syncthreads();
- }
-}
-
-// Inclusive prefix sum for binary vars using intra-warp voting +
-// shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
- // Within-warp, we use warp voting.
-#if defined(USE_ROCM)
- unsigned long long int vote = __ballot(in);
-
- T index = __popcll(getLaneMaskLe() & vote);
- T carry = __popcll(vote);
-#else
- T vote = __ballot_sync(__activemask(), in);
- T index = __popc(getLaneMaskLe() & vote);
- T carry = __popc(vote);
-#endif // USE_ROCM
-
- int warp = threadIdx.x / kWarpSize;
-
- // Per each warp, write out a value
- if (getLaneId() == 0) {
- smem[warp] = carry;
- }
-
- __syncthreads();
-
- // Sum across warps in one thread. This appears to be faster than a
- // warp shuffle scan for CC 3.0+
- if (threadIdx.x == 0) {
- int current = 0;
- for (int i = 0; i < blockDim.x / kWarpSize; ++i) {
- T v = smem[i];
- smem[i] = binop(smem[i], current);
- current = binop(current, v);
- }
- }
-
- __syncthreads();
-
- // load the carry from the preceding warp
- if (warp >= 1) {
- index = binop(index, smem[warp - 1]);
- }
-
- *out = index;
-
- if (KillWARDependency) {
- __syncthreads();
- }
-}
-
-// Exclusive prefix sum for binary vars using intra-warp voting +
-// shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
- inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
- // Inclusive to exclusive
- *out -= (T) in;
-
- // The outgoing carry for all threads is the last warp's sum
-#if defined(USE_ROCM)
- *carry = smem[math::DivUp<int>(blockDim.x, kWarpSize) - 1];
-#else
- *carry = smem[(blockDim.x / kWarpSize) - 1];
-#endif // USE_ROCM
-
- if (KillWARDependency) {
- __syncthreads();
- }
-}
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_GPU_SCAN_UTILS_H_
diff --git a/caffe2/utils/bench_utils.cc b/caffe2/utils/bench_utils.cc
deleted file mode 100644
index baa8d34..0000000
--- a/caffe2/utils/bench_utils.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-#if !defined(__s390x__) && !defined(__powerpc__)
-#include <cpuinfo.h>
-#else
-#include <unistd.h>
-#endif
-// NOLINTNEXTLINE(modernize-deprecated-headers)
-#include <stdint.h>
-// NOLINTNEXTLINE(modernize-deprecated-headers)
-#include <stdlib.h>
-
-#include "caffe2/core/logging.h"
-#include "caffe2/utils/bench_utils.h"
-
-namespace caffe2 {
-
-uint32_t wipe_cache() {
- static uint32_t* wipe_buffer = nullptr;
- static size_t wipe_size = 0;
-
- if (wipe_buffer == nullptr) {
-#if !defined(__s390x__) && !defined(__powerpc__)
- CAFFE_ENFORCE(cpuinfo_initialize(), "failed to initialize cpuinfo");
- const cpuinfo_processor* processor = cpuinfo_get_processor(0);
- if (processor->cache.l4 != nullptr) {
- wipe_size = processor->cache.l4->size;
- } else if (processor->cache.l3 != nullptr) {
- wipe_size = processor->cache.l3->size;
- } else if (processor->cache.l2 != nullptr) {
- wipe_size = processor->cache.l2->size;
- } else {
- wipe_size = processor->cache.l1d->size;
- }
-#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
- /*
- * On ARM precise cache size is not available, and cpuinfo may
- * underestimate. Use max for uArch (see src/arm/cache.c)
- */
- switch (processor->core->uarch) {
- case cpuinfo_uarch_cortex_a5:
- wipe_size = 512 * 1024; /* Max observed */
- break;
- case cpuinfo_uarch_cortex_a7:
- wipe_size = 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a8:
- wipe_size = 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a9:
- wipe_size = 1024 * 1024; /* Max observed */
- break;
- case cpuinfo_uarch_cortex_a12:
- case cpuinfo_uarch_cortex_a17:
- wipe_size = 8 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a15:
- wipe_size = 4 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a35:
- wipe_size = 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a53:
- wipe_size = 2 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a57:
- wipe_size = 2 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a72:
- wipe_size = 4 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a73:
- wipe_size = 8 * 1024 * 1024; /* uArch max */
- break;
- case cpuinfo_uarch_cortex_a55:
- case cpuinfo_uarch_cortex_a75:
- case cpuinfo_uarch_meerkat_m3:
- wipe_size = 4 * 1024 * 1024; /* DynamIQ max */
- break;
- default:
- wipe_size = 60 * 1024 * 1024;
- break;
- }
-#endif
-#elif defined (__s390x__)
- wipe_size = sysconf(_SC_LEVEL4_CACHE_SIZE);
- if (wipe_size <= 0)
- {
- /*
- * Take current max L4 cache size for s390x
- */
- wipe_size = 1024 * 1024 * 1024;
- }
-#else
- /* ppc64le */
- wipe_size = sysconf(_SC_LEVEL4_CACHE_SIZE);
- if (wipe_size <= 0) {
- wipe_size = sysconf(_SC_LEVEL3_CACHE_SIZE);
- if (wipe_size <= 0) {
- wipe_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
- if(wipe_size <= 0) {
- wipe_size = sysconf(_SC_LEVEL1D_CACHE_SIZE);
- }
- }
- }
-#endif
- LOG(INFO) << "Allocating cache wipe buffer of size " << wipe_size;
- // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
- wipe_buffer = static_cast<uint32_t*>(malloc(wipe_size));
- CAFFE_ENFORCE(wipe_buffer != nullptr);
- }
- uint32_t hash = 0;
- for (uint32_t i = 0; i * sizeof(uint32_t) < wipe_size; i += 8) {
- // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.Assign)
- hash ^= wipe_buffer[i];
- wipe_buffer[i] = hash;
- }
- /* Make sure compiler doesn't optimize the loop away */
- return hash;
-}
-
-} /* namespace caffe2 */
diff --git a/caffe2/utils/bench_utils.h b/caffe2/utils/bench_utils.h
deleted file mode 100644
index 59997ed..0000000
--- a/caffe2/utils/bench_utils.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CAFFE2_UTILS_BENCH_UTILS_H_
-#define CAFFE2_UTILS_BENCH_UTILS_H_
-
-#include <stdint.h>
-
-#include <c10/macros/Export.h>
-
-namespace caffe2 {
-
-TORCH_API uint32_t wipe_cache();
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_BENCH_UTILS_H_
diff --git a/caffe2/utils/cast.h b/caffe2/utils/cast.h
deleted file mode 100644
index 6f9db083..0000000
--- a/caffe2/utils/cast.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include <caffe2/utils/proto_utils.h>
-
-namespace caffe2 {
-
-namespace cast {
-
-inline TensorProto_DataType GetCastDataType(const ArgumentHelper& helper, std::string arg) {
- TensorProto_DataType to;
- if (helper.HasSingleArgumentOfType<string>(arg)) {
- string s = helper.GetSingleArgument<string>(arg, "float");
- std::transform(s.begin(), s.end(), s.begin(), ::toupper);
-#ifndef CAFFE2_USE_LITE_PROTO
- CAFFE_ENFORCE(TensorProto_DataType_Parse(s, &to), "Unknown 'to' argument: ", s);
-#else
-
-// Manually implement in the lite proto case.
-#define X(t) \
- if (s == #t) { \
- return TensorProto_DataType_##t; \
- }
-
- X(FLOAT);
- X(INT32);
- X(BYTE);
- X(STRING);
- X(BOOL);
- X(UINT8);
- X(INT8);
- X(UINT16);
- X(INT16);
- X(INT64);
- X(FLOAT16);
- X(DOUBLE);
-#undef X
- CAFFE_THROW("Unhandled type argument: ", s);
-
-#endif
- } else {
- to = static_cast<TensorProto_DataType>(
- helper.GetSingleArgument<int>(arg, TensorProto_DataType_FLOAT));
- }
- return to;
-}
-
-}; // namespace cast
-
-}; // namespace caffe2
diff --git a/caffe2/utils/cast_test.cc b/caffe2/utils/cast_test.cc
deleted file mode 100644
index 680e87b..0000000
--- a/caffe2/utils/cast_test.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <memory>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include "caffe2/utils/cast.h"
-
-namespace caffe2 {
-
-TEST(CastTest, GetCastDataType) {
- auto castOp = [](std::string t) {
- // Ensure lowercase.
- std::transform(t.begin(), t.end(), t.begin(), ::tolower);
- auto op = CreateOperatorDef("Cast", "", {}, {});
- AddArgument("to", t, &op);
- return op;
- };
-
-#define X(t) \
- EXPECT_EQ( \
- TensorProto_DataType_##t, \
- cast::GetCastDataType(ArgumentHelper(castOp(#t)), "to"));
-
- X(FLOAT);
- X(INT32);
- X(BYTE);
- X(STRING);
- X(BOOL);
- X(UINT8);
- X(INT8);
- X(UINT16);
- X(INT16);
- X(INT64);
- X(FLOAT16);
- X(DOUBLE);
-#undef X
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/cblas.h b/caffe2/utils/cblas.h
deleted file mode 100644
index c91b8bf..0000000
--- a/caffe2/utils/cblas.h
+++ /dev/null
@@ -1,606 +0,0 @@
-// This is the exact cblas.h header file, placed here purely in order to get
-// the enums.
-
-#include "caffe2/core/macros.h"
-
-#ifndef CBLAS_H
-#ifdef CAFFE2_USE_MKL
-#include <mkl_cblas.h>
-#else // CAFFE2_USE_MKL
-
-#ifndef CBLAS_ENUM_DEFINED_H
- #define CBLAS_ENUM_DEFINED_H
- enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
- enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
- AtlasConj=114};
- enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
- enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
- enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
-#endif
-
-#ifndef CBLAS_ENUM_ONLY
-#define CBLAS_H
-#define CBLAS_INDEX int
-
-int cblas_errprn(int ierr, int info, char *form, ...);
-void cblas_xerbla(int p, const char *rout, const char *form, ...);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS functions (complex are recast as routines)
- * ===========================================================================
- */
-float cblas_sdsdot(const int N, const float alpha, const float *X,
- const int incX, const float *Y, const int incY);
-double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
- const int incY);
-float cblas_sdot(const int N, const float *X, const int incX,
- const float *Y, const int incY);
-double cblas_ddot(const int N, const double *X, const int incX,
- const double *Y, const int incY);
-/*
- * Functions having prefixes Z and C only
- */
-void cblas_cdotu_sub(const int N, const void *X, const int incX,
- const void *Y, const int incY, void *dotu);
-void cblas_cdotc_sub(const int N, const void *X, const int incX,
- const void *Y, const int incY, void *dotc);
-
-void cblas_zdotu_sub(const int N, const void *X, const int incX,
- const void *Y, const int incY, void *dotu);
-void cblas_zdotc_sub(const int N, const void *X, const int incX,
- const void *Y, const int incY, void *dotc);
-
-
-/*
- * Functions having prefixes S D SC DZ
- */
-float cblas_snrm2(const int N, const float *X, const int incX);
-float cblas_sasum(const int N, const float *X, const int incX);
-
-double cblas_dnrm2(const int N, const double *X, const int incX);
-double cblas_dasum(const int N, const double *X, const int incX);
-
-float cblas_scnrm2(const int N, const void *X, const int incX);
-float cblas_scasum(const int N, const void *X, const int incX);
-
-double cblas_dznrm2(const int N, const void *X, const int incX);
-double cblas_dzasum(const int N, const void *X, const int incX);
-
-
-/*
- * Functions having standard 4 prefixes (S D C Z)
- */
-CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX);
-CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
-CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX);
-CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX);
-
-/*
- * ===========================================================================
- * Prototypes for level 1 BLAS routines
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (s, d, c, z)
- */
-void cblas_sswap(const int N, float *X, const int incX,
- float *Y, const int incY);
-void cblas_scopy(const int N, const float *X, const int incX,
- float *Y, const int incY);
-void cblas_saxpy(const int N, const float alpha, const float *X,
- const int incX, float *Y, const int incY);
-void catlas_saxpby(const int N, const float alpha, const float *X,
- const int incX, const float beta, float *Y, const int incY);
-void catlas_sset
- (const int N, const float alpha, float *X, const int incX);
-
-void cblas_dswap(const int N, double *X, const int incX,
- double *Y, const int incY);
-void cblas_dcopy(const int N, const double *X, const int incX,
- double *Y, const int incY);
-void cblas_daxpy(const int N, const double alpha, const double *X,
- const int incX, double *Y, const int incY);
-void catlas_daxpby(const int N, const double alpha, const double *X,
- const int incX, const double beta, double *Y, const int incY);
-void catlas_dset
- (const int N, const double alpha, double *X, const int incX);
-
-void cblas_cswap(const int N, void *X, const int incX,
- void *Y, const int incY);
-void cblas_ccopy(const int N, const void *X, const int incX,
- void *Y, const int incY);
-void cblas_caxpy(const int N, const void *alpha, const void *X,
- const int incX, void *Y, const int incY);
-void catlas_caxpby(const int N, const void *alpha, const void *X,
- const int incX, const void *beta, void *Y, const int incY);
-void catlas_cset
- (const int N, const void *alpha, void *X, const int incX);
-
-void cblas_zswap(const int N, void *X, const int incX,
- void *Y, const int incY);
-void cblas_zcopy(const int N, const void *X, const int incX,
- void *Y, const int incY);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
- const int incX, void *Y, const int incY);
-void catlas_zaxpby(const int N, const void *alpha, const void *X,
- const int incX, const void *beta, void *Y, const int incY);
-void catlas_zset
- (const int N, const void *alpha, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefix only
- */
-void cblas_srotg(float *a, float *b, float *c, float *s);
-void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
-void cblas_srot(const int N, float *X, const int incX,
- float *Y, const int incY, const float c, const float s);
-void cblas_srotm(const int N, float *X, const int incX,
- float *Y, const int incY, const float *P);
-
-void cblas_drotg(double *a, double *b, double *c, double *s);
-void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
-void cblas_drot(const int N, double *X, const int incX,
- double *Y, const int incY, const double c, const double s);
-void cblas_drotm(const int N, double *X, const int incX,
- double *Y, const int incY, const double *P);
-
-
-/*
- * Routines with S D C Z CS and ZD prefixes
- */
-void cblas_sscal(const int N, const float alpha, float *X, const int incX);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX);
-void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
-void cblas_csscal(const int N, const float alpha, void *X, const int incX);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
-
-/*
- * Extra reference routines provided by ATLAS, but not mandated by the standard
- */
-void cblas_crotg(void *a, void *b, void *c, void *s);
-void cblas_zrotg(void *a, void *b, void *c, void *s);
-void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
- const float c, const float s);
-void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
- const double c, const double s);
-
-/*
- * ===========================================================================
- * Prototypes for level 2 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const float alpha, const float *A, const int lda,
- const float *X, const int incX, const float beta,
- float *Y, const int incY);
-void cblas_sgbmv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const int KL, const int KU, const float alpha,
- const float *A, const int lda, const float *X,
- const int incX, const float beta, float *Y, const int incY);
-void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const float *A, const int lda,
- float *X, const int incX);
-void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const float *A, const int lda,
- float *X, const int incX);
-void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const float *Ap, float *X, const int incX);
-void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const float *A, const int lda, float *X,
- const int incX);
-void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const float *A, const int lda,
- float *X, const int incX);
-void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const float *Ap, float *X, const int incX);
-
-void cblas_dgemv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const double alpha, const double *A, const int lda,
- const double *X, const int incX, const double beta,
- double *Y, const int incY);
-void cblas_dgbmv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const int KL, const int KU, const double alpha,
- const double *A, const int lda, const double *X,
- const int incX, const double beta, double *Y, const int incY);
-void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const double *A, const int lda,
- double *X, const int incX);
-void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const double *A, const int lda,
- double *X, const int incX);
-void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const double *Ap, double *X, const int incX);
-void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const double *A, const int lda, double *X,
- const int incX);
-void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const double *A, const int lda,
- double *X, const int incX);
-void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const double *Ap, double *X, const int incX);
-
-void cblas_cgemv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *X, const int incX, const void *beta,
- void *Y, const int incY);
-void cblas_cgbmv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const int KL, const int KU, const void *alpha,
- const void *A, const int lda, const void *X,
- const int incX, const void *beta, void *Y, const int incY);
-void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *Ap, void *X, const int incX);
-void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *A, const int lda, void *X,
- const int incX);
-void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *Ap, void *X, const int incX);
-
-void cblas_zgemv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *X, const int incX, const void *beta,
- void *Y, const int incY);
-void cblas_zgbmv(const enum CBLAS_ORDER Order,
- const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
- const int KL, const int KU, const void *alpha,
- const void *A, const int lda, const void *X,
- const int incX, const void *beta, void *Y, const int incY);
-void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *Ap, void *X, const int incX);
-void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *A, const int lda, void *X,
- const int incX);
-void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const int K, const void *A, const int lda,
- void *X, const int incX);
-void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
- const int N, const void *Ap, void *X, const int incX);
-
-
-/*
- * Routines with S and D prefixes only
- */
-void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *A,
- const int lda, const float *X, const int incX,
- const float beta, float *Y, const int incY);
-void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const int K, const float alpha, const float *A,
- const int lda, const float *X, const int incX,
- const float beta, float *Y, const int incY);
-void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *Ap,
- const float *X, const int incX,
- const float beta, float *Y, const int incY);
-void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
- const float alpha, const float *X, const int incX,
- const float *Y, const int incY, float *A, const int lda);
-void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *X,
- const int incX, float *A, const int lda);
-void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *X,
- const int incX, float *Ap);
-void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *X,
- const int incX, const float *Y, const int incY, float *A,
- const int lda);
-void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const float *X,
- const int incX, const float *Y, const int incY, float *A);
-
-void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *A,
- const int lda, const double *X, const int incX,
- const double beta, double *Y, const int incY);
-void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const int K, const double alpha, const double *A,
- const int lda, const double *X, const int incX,
- const double beta, double *Y, const int incY);
-void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *Ap,
- const double *X, const int incX,
- const double beta, double *Y, const int incY);
-void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
- const double alpha, const double *X, const int incX,
- const double *Y, const int incY, double *A, const int lda);
-void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *X,
- const int incX, double *A, const int lda);
-void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *X,
- const int incX, double *Ap);
-void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *X,
- const int incX, const double *Y, const int incY, double *A,
- const int lda);
-void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const double *X,
- const int incX, const double *Y, const int incY, double *A);
-
-
-/*
- * Routines with C and Z prefixes only
- */
-void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const void *alpha, const void *A,
- const int lda, const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const int K, const void *alpha, const void *A,
- const int lda, const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const void *alpha, const void *Ap,
- const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const void *X, const int incX,
- void *A, const int lda);
-void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const float alpha, const void *X,
- const int incX, void *A);
-void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *Ap);
-
-void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const void *alpha, const void *A,
- const int lda, const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const int K, const void *alpha, const void *A,
- const int lda, const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const void *alpha, const void *Ap,
- const void *X, const int incX,
- const void *beta, void *Y, const int incY);
-void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const void *X, const int incX,
- void *A, const int lda);
-void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const int N, const double alpha, const void *X,
- const int incX, void *A);
-void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *A, const int lda);
-void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
- const void *alpha, const void *X, const int incX,
- const void *Y, const int incY, void *Ap);
-
-/*
- * ===========================================================================
- * Prototypes for level 3 BLAS
- * ===========================================================================
- */
-
-/*
- * Routines with standard 4 prefixes (S, D, C, Z)
- */
-void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
- const int K, const float alpha, const float *A,
- const int lda, const float *B, const int ldb,
- const float beta, float *C, const int ldc);
-void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const float alpha, const float *A, const int lda,
- const float *B, const int ldb, const float beta,
- float *C, const int ldc);
-void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const float alpha, const float *A, const int lda,
- const float beta, float *C, const int ldc);
-void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const float alpha, const float *A, const int lda,
- const float *B, const int ldb, const float beta,
- float *C, const int ldc);
-void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const float alpha, const float *A, const int lda,
- float *B, const int ldb);
-void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const float alpha, const float *A, const int lda,
- float *B, const int ldb);
-
-void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
- const int K, const double alpha, const double *A,
- const int lda, const double *B, const int ldb,
- const double beta, double *C, const int ldc);
-void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const double alpha, const double *A, const int lda,
- const double *B, const int ldb, const double beta,
- double *C, const int ldc);
-void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const double alpha, const double *A, const int lda,
- const double beta, double *C, const int ldc);
-void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const double alpha, const double *A, const int lda,
- const double *B, const int ldb, const double beta,
- double *C, const int ldc);
-void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const double alpha, const double *A, const int lda,
- double *B, const int ldb);
-void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const double alpha, const double *A, const int lda,
- double *B, const int ldb);
-
-void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
- const int K, const void *alpha, const void *A,
- const int lda, const void *B, const int ldb,
- const void *beta, void *C, const int ldc);
-void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *beta, void *C, const int ldc);
-void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- void *B, const int ldb);
-void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- void *B, const int ldb);
-
-void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
- const int K, const void *alpha, const void *A,
- const int lda, const void *B, const int ldb,
- const void *beta, void *C, const int ldc);
-void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *beta, void *C, const int ldc);
-void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- void *B, const int ldb);
-void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
- const enum CBLAS_DIAG Diag, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- void *B, const int ldb);
-
-
-/*
- * Routines with prefixes C and Z only
- */
-void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const float alpha, const void *A, const int lda,
- const float beta, void *C, const int ldc);
-void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const float beta,
- void *C, const int ldc);
-void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
- const enum CBLAS_UPLO Uplo, const int M, const int N,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const void *beta,
- void *C, const int ldc);
-void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const double alpha, const void *A, const int lda,
- const double beta, void *C, const int ldc);
-void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
- const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
- const void *alpha, const void *A, const int lda,
- const void *B, const int ldb, const double beta,
- void *C, const int ldc);
-
-int cblas_errprn(int ierr, int info, char *form, ...);
-
-#endif /* end #ifdef CBLAS_ENUM_ONLY */
-#endif // CAFFE2_USE_MKL
-#endif
diff --git a/caffe2/utils/cpu_neon.h b/caffe2/utils/cpu_neon.h
deleted file mode 100644
index 7e68d73..0000000
--- a/caffe2/utils/cpu_neon.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef CAFFE2_UTILS_CPU_NEON_H_
-#define CAFFE2_UTILS_CPU_NEON_H_
-
-// Provides a variety of ARM NEON-specific utility functions
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-
-namespace caffe2 {
-
-template <typename T>
-inline bool isPointerAligned(T* p, size_t align) {
- return (reinterpret_cast<uintptr_t>(p) % align == 0);
-}
-
-inline float32x4_t vert_sum_f32(float32x4_t v0,
- float32x4_t v1,
- float32x4_t v2,
- float32x4_t v3) {
- v0 = vaddq_f32(v0, v1);
- v2 = vaddq_f32(v2, v3);
- return vaddq_f32(v0, v2);
-}
-
-inline float horizontal_sum_f32(float32x4_t v0,
- float32x4_t v1,
- float32x4_t v2,
- float32x4_t v3) {
- v0 = vert_sum_f32(v0, v1, v2, v3);
- float32x2_t v = vadd_f32(vget_high_f32(v0), vget_low_f32(v0));
- return vget_lane_f32(vpadd_f32(v, v), 0);
-}
-
-// Load/store functions that assume alignment
-
-inline float32x4_t vld1q_f32_aligned(const float* p) {
- return vld1q_f32((const float*)
- __builtin_assume_aligned(p, sizeof(float32x4_t)));
-}
-
-inline void vst1q_f32_aligned(float* p, float32x4_t v) {
- vst1q_f32((float*) __builtin_assume_aligned(p, sizeof(float32x4_t)), v);
-}
-
-inline void vst4_u8_aligned(uint8_t* p, uint8x8x4_t v) {
- vst4_u8((uint8_t*)
- __builtin_assume_aligned(p, sizeof(uint8x8x4_t)), v);
-}
-
-} // namespace caffe2
-
-#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#endif // CAFFE2_UTILS_CPU_NEON_H_
diff --git a/caffe2/utils/cpuid_test.cc b/caffe2/utils/cpuid_test.cc
deleted file mode 100644
index f3694f5..0000000
--- a/caffe2/utils/cpuid_test.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <gtest/gtest.h>
-#include "caffe2/utils/cpuid.h"
-
-namespace caffe2 {
-
-TEST(CpuIdTest, ShouldAlwaysHaveMMX) {
- EXPECT_TRUE(GetCpuId().mmx());
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/cub_namespace.cuh b/caffe2/utils/cub_namespace.cuh
deleted file mode 100644
index 188a993..0000000
--- a/caffe2/utils/cub_namespace.cuh
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
-// https://github.com/NVIDIA/cub/pull/326
-// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
-// starting from CUDA 11.5
-#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
-#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
-#else
-#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
-#endif
-
-#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
-namespace caffe2 {
-namespace cub = ::CUB_WRAPPED_NAMESPACE::cub;
-}
-#endif
diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h
deleted file mode 100644
index c6c34db..0000000
--- a/caffe2/utils/eigen_utils.h
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#ifndef CAFFE2_OPERATORS_UTILS_EIGEN_H_
-#define CAFFE2_OPERATORS_UTILS_EIGEN_H_
-
-#include "Eigen/Core"
-#include "Eigen/Dense"
-
-#include <c10/util/Logging.h>
-#include <c10/util/irange.h>
-
-namespace caffe2 {
-
-// Common Eigen types that we will often use
-template <typename T>
-using EigenMatrixMap =
- Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenArrayMap =
- Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorMap = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenMatrixMap =
- Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
- Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenVectorMap =
- Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
- Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
-using EigenOuterStride = Eigen::OuterStride<Eigen::Dynamic>;
-using EigenInnerStride = Eigen::InnerStride<Eigen::Dynamic>;
-using EigenStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
-template <typename T>
-using EigenOuterStridedMatrixMap = Eigen::
- Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenOuterStride>;
-template <typename T>
-using EigenOuterStridedArrayMap = Eigen::
- Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenOuterStride>;
-template <typename T>
-using ConstEigenOuterStridedMatrixMap = Eigen::Map<
- const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>,
- 0,
- EigenOuterStride>;
-template <typename T>
-using ConstEigenOuterStridedArrayMap = Eigen::Map<
- const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>,
- 0,
- EigenOuterStride>;
-template <typename T>
-using EigenStridedMatrixMap = Eigen::
- Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using EigenStridedArrayMap =
- Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using ConstEigenStridedMatrixMap = Eigen::
- Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-template <typename T>
-using ConstEigenStridedArrayMap = Eigen::
- Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, 0, EigenStride>;
-
-// 1-d array
-template <typename T>
-using EArrXt = Eigen::Array<T, Eigen::Dynamic, 1>;
-using EArrXf = Eigen::ArrayXf;
-using EArrXd = Eigen::ArrayXd;
-using EArrXi = Eigen::ArrayXi;
-using EArrXb = EArrXt<bool>;
-using EArrXI32 = EArrXt<int32_t>;
-using EArrXU16 = EArrXt<uint16_t>;
-using EArrXU8 = EArrXt<uint8_t>;
-using EArr3U8 = Eigen::Array<uint8_t, 3, 1>;
-
-// 2-d array, column major
-template <typename T>
-using EArrXXt = Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>;
-using EArrXXf = Eigen::ArrayXXf;
-using EArrXXI32 = EArrXXt<int32_t>;
-using EArrXXU16 = EArrXXt<uint16_t>;
-using EArrXXU8 = EArrXXt<uint8_t>;
-using EArrXXi = EArrXXt<int>;
-
-// 2-d array, row major
-template <typename T>
-using ERArrXXt =
- Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-using ERArrXXf = ERArrXXt<float>;
-using ERArrXXI32t = ERArrXXt<int32_t>;
-using ERArrXXU16t = ERArrXXt<uint16_t>;
-using ERArrXXU8t = ERArrXXt<uint8_t>;
-using ERArrXXi = ERArrXXt<int>;
-using ERArrXXi64t = ERArrXXt<int64_t>;
-using ERArrXXi32t = ERArrXXt<int32_t>;
-
-// 1-d vector
-template <typename T>
-using EVecXt = Eigen::Matrix<T, Eigen::Dynamic, 1>;
-using EVecXd = Eigen::VectorXd;
-using EVecXf = Eigen::VectorXf;
-
-// 1-d row vector
-using ERVecXd = Eigen::RowVectorXd;
-using ERVecXf = Eigen::RowVectorXf;
-
-// 2-d matrix, column major
-template <typename T>
-using EMatXt = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>;
-using EMatXd = Eigen::MatrixXd;
-using EMatXf = Eigen::MatrixXf;
-using EMatXU8 = EMatXt<uint8_t>;
-using EMatXU16 = EMatXt<uint16_t>;
-
-// 2-d matrix, row major
-template <typename T>
-using ERMatXt =
- Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-using ERMatXd = ERMatXt<double>;
-using ERMatXf = ERMatXt<float>;
-using ERMatXU8 = ERMatXt<uint8_t>;
-
-namespace utils {
-
-template <typename T>
-Eigen::Map<const EArrXt<T>> AsEArrXt(const std::vector<T>& arr) {
- return {arr.data(), static_cast<int>(arr.size())};
-}
-template <typename T>
-Eigen::Map<EArrXt<T>> AsEArrXt(std::vector<T>& arr) {
- return {arr.data(), static_cast<int>(arr.size())};
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived, class Derived1, class Derived2>
-void GetSubArray(
- const Eigen::ArrayBase<Derived>& array,
- const Eigen::ArrayBase<Derived1>& indices,
- Eigen::ArrayBase<Derived2>* out_array) {
- CAFFE_ENFORCE_EQ(array.cols(), 1);
- // using T = typename Derived::Scalar;
-
- out_array->derived().resize(indices.size());
- for (const auto i : c10::irange(indices.size())) {
- TORCH_DCHECK_LT(indices[i], array.size());
- (*out_array)[i] = array[indices[i]];
- }
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived, class Derived1>
-EArrXt<typename Derived::Scalar> GetSubArray(
- const Eigen::ArrayBase<Derived>& array,
- const Eigen::ArrayBase<Derived1>& indices) {
- using T = typename Derived::Scalar;
- EArrXt<T> ret(indices.size());
- GetSubArray(array, indices, &ret);
- return ret;
-}
-
-// return a sub array of 'array' based on indices 'indices'
-template <class Derived>
-EArrXt<typename Derived::Scalar> GetSubArray(
- const Eigen::ArrayBase<Derived>& array,
- const std::vector<int>& indices) {
- return GetSubArray(array, AsEArrXt(indices));
-}
-
-// return 2d sub array of 'array' based on row indices 'row_indices'
-template <class Derived, class Derived1, class Derived2>
-void GetSubArrayRows(
- const Eigen::ArrayBase<Derived>& array2d,
- const Eigen::ArrayBase<Derived1>& row_indices,
- Eigen::ArrayBase<Derived2>* out_array) {
- out_array->derived().resize(row_indices.size(), array2d.cols());
-
- for (const auto i : c10::irange(row_indices.size())) {
- TORCH_DCHECK_LT(row_indices[i], array2d.size());
- out_array->row(i) =
- array2d.row(row_indices[i]).template cast<typename Derived2::Scalar>();
- }
-}
-
-// return indices of 1d array for elements evaluated to true
-template <class Derived>
-std::vector<int> GetArrayIndices(const Eigen::ArrayBase<Derived>& array) {
- std::vector<int> ret;
- for (const auto i : c10::irange(array.size())) {
- if (array[i]) {
- ret.push_back(i);
- }
- }
- return ret;
-}
-
-} // namespace utils
-} // namespace caffe2
-
-#endif
diff --git a/caffe2/utils/fatal_signal_asan_no_sig_test.cc b/caffe2/utils/fatal_signal_asan_no_sig_test.cc
deleted file mode 100644
index 9c64102..0000000
--- a/caffe2/utils/fatal_signal_asan_no_sig_test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "caffe2/utils/signal_handler.h"
-#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
-#include <gtest/gtest.h>
-#include <pthread.h>
-#include <unistd.h>
-
-#include <functional>
-#include <iostream>
-#include <array>
-
-#include "caffe2/core/common.h"
-
-namespace {
-void* dummy_thread(void*) {
- while (1) {
- }
- return nullptr;
-}
-
-bool forkAndPipe(
- std::string& stderrBuffer,
- std::function<void(void)> callback) {
- std::array<int, 2> stderrPipe;
- if (pipe(stderrPipe.data()) != 0) {
- perror("STDERR pipe");
- return false;
- }
- pid_t child = fork();
- if (child == 0) {
- // Replace this process' stderr so we can read it.
- if (dup2(stderrPipe[1], STDERR_FILENO) < 0) {
- close(stderrPipe[0]);
- close(stderrPipe[1]);
- perror("dup2 STDERR");
- exit(5);
- }
-
- // This is for the parent to work with.
- close(stderrPipe[0]);
- close(stderrPipe[1]);
-
- callback();
- exit(7);
- } else if (child > 0) {
- const int bufferSize = 128;
- std::array<char, bufferSize> buffer;
-
- // We want to close the writing end of the pipe right away so our
- // read actually gets an EOF.
- close(stderrPipe[1]);
-
- // wait for child to finish crashing.
- int statloc;
- if (wait(&statloc) < 0) {
- close(stderrPipe[0]);
- perror("wait");
- return false;
- }
-
- ssize_t bytesRead;
- while ((bytesRead = read(stderrPipe[0], buffer.data(), bufferSize)) > 0) {
- const std::string tmp(buffer.data(), bytesRead);
- std::cout << tmp;
- stderrBuffer += tmp;
- }
-
- // The child should have exited due to signal.
- if (!WIFSIGNALED(statloc)) {
- fprintf(stderr, "Child didn't exit because it received a signal\n");
- if (WIFEXITED(statloc)) {
- fprintf(stderr, "Exited with code: %d\n", WEXITSTATUS(statloc) & 0xff);
- }
- return false;
- }
-
- if (bytesRead < 0) {
- perror("read");
- return false;
- }
-
- close(stderrPipe[0]);
- return true;
- } else {
- perror("fork");
- return false;
- }
-}
-} // namespace
-
-#define _TEST_FATAL_SIGNAL(signum, name, threadCount, print, expected) \
- do { \
- std::string stderrBuffer; \
- ASSERT_TRUE(forkAndPipe(stderrBuffer, [=]() { \
- caffe2::setPrintStackTracesOnFatalSignal(print); \
- pthread_t pt; \
- for (int i = 0; i < threadCount; i++) { \
- if (pthread_create(&pt, nullptr, ::dummy_thread, nullptr)) { \
- perror("pthread_create"); \
- } \
- } \
- raise(signum); \
- })); \
- int keyPhraseCount = 0; \
- std::string keyPhrase = \
- std::string(name) + "(" + c10::to_string(signum) + ")"; \
- size_t loc = 0; \
- while ((loc = stderrBuffer.find(keyPhrase, loc)) != std::string::npos) { \
- keyPhraseCount += 1; \
- loc += 1; \
- } \
- EXPECT_GE(keyPhraseCount, expected); \
- } while (0)
-
-#define TEST_FATAL_SIGNAL(signum, name, threadCount) \
- _TEST_FATAL_SIGNAL(signum, name, threadCount, true, threadCount + 1)
-
-#define TEST_FATAL_SIGNAL_NO_PRINT(signum, name, threadCount) \
- _TEST_FATAL_SIGNAL(signum, name, threadCount, false, 0)
-
-TEST(fatalSignalTest, SIGABRT8) {
- TEST_FATAL_SIGNAL(SIGABRT, "SIGABRT", 8);
-}
-
-TEST(fatalSignalTest, SIGINT8) {
- TEST_FATAL_SIGNAL(SIGINT, "SIGINT", 8);
-}
-
-TEST(fatalSignalTest, SIGILL8) {
- TEST_FATAL_SIGNAL(SIGILL, "SIGILL", 8);
-}
-
-TEST(fatalSignalTest, SIGFPE8) {
- TEST_FATAL_SIGNAL(SIGFPE, "SIGFPE", 8);
-}
-
-TEST(fatalSignalTest, SIGBUS8) {
- TEST_FATAL_SIGNAL(SIGBUS, "SIGBUS", 8);
-}
-
-TEST(fatalSignalTest, SIGSEGV8) {
- TEST_FATAL_SIGNAL(SIGSEGV, "SIGSEGV", 8);
-}
-
-// Test that if we don't enable printing stack traces then we don't get any.
-TEST(fatalSignalTest, SIGABRT8_NOPRINT) {
- TEST_FATAL_SIGNAL_NO_PRINT(SIGABRT, "SIGABRT", 8);
-}
-#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h
deleted file mode 100644
index 3d0e399..0000000
--- a/caffe2/utils/filler.h
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef CAFFE2_FILLER_H_
-#define CAFFE2_FILLER_H_
-
-#include <sstream>
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/tensor.h"
-#include "caffe2/utils/math.h"
-
-namespace caffe2 {
-
-// TODO: replace filler distribution enum with a better abstraction
-enum FillerDistribution { FD_UNIFORM, FD_FIXEDSUM, FD_SYNTHETIC };
-
-class TensorFiller {
- public:
- template <class Type, class Context>
- void Fill(Tensor* tensor, Context* context) const {
- CAFFE_ENFORCE(context, "context is null");
- CAFFE_ENFORCE(tensor, "tensor is null");
- auto min = (min_ < (double)std::numeric_limits<Type>::min())
- ? std::numeric_limits<Type>::min()
- : static_cast<Type>(min_);
- auto max = (max_ > (double)std::numeric_limits<Type>::max())
- ? std::numeric_limits<Type>::max()
- : static_cast<Type>(max_);
- CAFFE_ENFORCE_LE(min, max);
-
- Tensor temp_tensor(shape_, Context::GetDeviceType());
- std::swap(*tensor, temp_tensor);
- Type* data = tensor->template mutable_data<Type>();
-
- // select distribution
- switch (dist_) {
- case FD_UNIFORM: {
- math::RandUniform<Type, Context>(
- tensor->numel(), min, max, data, context);
- break;
- }
- case FD_FIXEDSUM: {
- auto fixed_sum = static_cast<Type>(fixed_sum_);
- CAFFE_ENFORCE_LE(min * tensor->numel(), fixed_sum);
- CAFFE_ENFORCE_GE(max * tensor->numel(), fixed_sum);
- math::RandFixedSum<Type, Context>(
- tensor->numel(), min, max, fixed_sum_, data, context);
- break;
- }
- case FD_SYNTHETIC: {
- math::RandSyntheticData<Type, Context>(
- tensor->numel(), min, max, data, context);
- break;
- }
- }
- }
-
- TensorFiller& Dist(FillerDistribution dist) {
- dist_ = dist;
- return *this;
- }
-
- template <class Type>
- TensorFiller& Min(Type min) {
- min_ = (double)min;
- return *this;
- }
-
- template <class Type>
- TensorFiller& Max(Type max) {
- max_ = (double)max;
- return *this;
- }
-
- template <class Type>
- TensorFiller& FixedSum(Type fixed_sum) {
- dist_ = FD_FIXEDSUM;
- fixed_sum_ = (double)fixed_sum;
- return *this;
- }
-
- // A helper function to construct the lengths vector for sparse features
- // We try to pad least one index per batch unless the total_length is 0
- template <class Type>
- TensorFiller& SparseLengths(Type total_length) {
- return FixedSum(total_length)
- .Min(std::min(static_cast<Type>(1), total_length))
- .Max(total_length);
- }
-
- // a helper function to construct the segments vector for sparse features
- template <class Type>
- TensorFiller& SparseSegments(Type max_segment) {
- CAFFE_ENFORCE(dist_ != FD_FIXEDSUM);
- return Min(0).Max(max_segment).Dist(FD_SYNTHETIC);
- }
-
- TensorFiller& Shape(const std::vector<int64_t>& shape) {
- shape_ = shape;
- return *this;
- }
-
- template <class Type>
- TensorFiller(const std::vector<int64_t>& shape, Type fixed_sum)
- : shape_(shape), dist_(FD_FIXEDSUM), fixed_sum_((double)fixed_sum) {}
-
- TensorFiller(const std::vector<int64_t>& shape)
- : shape_(shape), dist_(FD_UNIFORM), fixed_sum_(0) {}
-
- TensorFiller() : TensorFiller(std::vector<int64_t>()) {}
-
- std::string DebugString() const {
- std::stringstream stream;
- stream << "shape = [" << shape_ << "]; min = " << min_
- << "; max = " << max_;
- switch (dist_) {
- case FD_FIXEDSUM:
- stream << "; dist = FD_FIXEDSUM";
- break;
- case FD_SYNTHETIC:
- stream << "; dist = FD_SYNTHETIC";
- break;
- default:
- stream << "; dist = FD_UNIFORM";
- break;
- }
- return stream.str();
- }
-
- private:
- std::vector<int64_t> shape_;
- // TODO: type is unknown until a user starts to fill data;
- // cast everything to double for now.
- double min_ = 0.0;
- double max_ = 1.0;
- FillerDistribution dist_;
- double fixed_sum_;
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_FILLER_H_
diff --git a/caffe2/utils/fixed_divisor_test.cc b/caffe2/utils/fixed_divisor_test.cc
deleted file mode 100644
index 6093bc7..0000000
--- a/caffe2/utils/fixed_divisor_test.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "caffe2/utils/fixed_divisor.h"
-
-#include <gtest/gtest.h>
-
-#include <random>
-
-namespace caffe2 {
-
-namespace {
-
-void CompareDivMod(int32_t v, int32_t divisor) {
- auto fixed = FixedDivisor<int32_t>(divisor);
-
- int native_q = v / divisor;
- int native_r = v % divisor;
-
- int fixed_q = fixed.Div(v);
- int fixed_r = fixed.Mod(v);
-
-#if !defined(USE_ROCM)
- EXPECT_EQ(native_q, fixed_q)
- << v << " / " << divisor << " magic " << fixed.magic() << " shift "
- << fixed.shift() << " quot " << fixed_q << " " << native_q;
-
- EXPECT_EQ(native_r, fixed_r)
- << v << " / " << divisor << " magic " << fixed.magic() << " shift "
- << fixed.shift() << " rem " << fixed_r << " " << native_r;
-#endif
-}
-
-} // namespace
-
-TEST(FixedDivisorTest, FixedDivisorInt32Test) {
- constexpr int32_t kMax = std::numeric_limits<int32_t>::max();
-
- // divide by 1
- CompareDivMod(kMax, 1);
- CompareDivMod(0, 1);
- CompareDivMod(1, 1);
-
- // divide by max
- CompareDivMod(kMax, kMax);
- CompareDivMod(0, kMax);
- CompareDivMod(1, kMax);
-
- // divide by random positive values
- std::random_device rd;
- std::uniform_int_distribution<int32_t> v_dist(0, kMax);
- std::uniform_int_distribution<int32_t> q_dist(1, kMax);
-
- std::uniform_int_distribution<int32_t> v_small_dist(0, 1000);
- std::uniform_int_distribution<int32_t> q_small_dist(1, 1000);
- for (int i = 0; i < 10000; ++i) {
- auto q = q_dist(rd);
- auto v = v_dist(rd);
- auto q_small = q_small_dist(rd);
- auto v_small = v_small_dist(rd);
-
- // random value
- CompareDivMod(v_small, q_small);
- CompareDivMod(v_small, q);
- CompareDivMod(v, q_small);
- CompareDivMod(v, q);
-
- // special values
- CompareDivMod(kMax, q_small);
- CompareDivMod(0, q_small);
- CompareDivMod(1, q_small);
- CompareDivMod(kMax, q);
- CompareDivMod(0, q);
- CompareDivMod(1, q);
-
- CompareDivMod(v_small, 1);
- CompareDivMod(v_small, kMax);
- CompareDivMod(v, 1);
- CompareDivMod(v, kMax);
- }
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/knob_patcher.cc b/caffe2/utils/knob_patcher.cc
deleted file mode 100644
index e099ea6..0000000
--- a/caffe2/utils/knob_patcher.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and its affiliates.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <map>
-
-#include <c10/util/string_view.h>
-#include <c10/util/Flags.h>
-#include <c10/util/Logging.h>
-
-#include "caffe2/utils/knobs.h"
-#include "caffe2/utils/knob_patcher.h"
-
-namespace caffe2 {
-namespace detail {
-std::map<c10::string_view, bool*>& getRegisteredKnobs();
-} // namespace detail
-
-namespace {
-class PatchNode {
- public:
- PatchNode(c10::string_view name, bool value);
- ~PatchNode();
-
- std::string name;
- bool oldValue{false};
- // Nodes to form a linked list of existing PatchState objects for this knob.
- // This allows us to restore state correctly even if KnobPatcher objects
- // are destroyed in any arbitrary order.
- PatchNode* prev{nullptr};
- PatchNode* next{nullptr};
-};
-} // namespace
-
-class KnobPatcher::PatchState : public PatchNode {
- using PatchNode::PatchNode;
-};
-
-KnobPatcher::KnobPatcher(c10::string_view name, bool value)
- : state_{std::make_unique<PatchState>(name, value)} {}
-
-KnobPatcher::~KnobPatcher() = default;
-KnobPatcher::KnobPatcher(KnobPatcher&&) noexcept = default;
-KnobPatcher& KnobPatcher::operator=(KnobPatcher&&) noexcept = default;
-
-namespace {
-
-class Patcher {
- public:
- void patch(PatchNode* node, bool value) {
- std::lock_guard<std::mutex> lock{mutex_};
-
- node->oldValue = setKnobValue(node->name, value);
- auto ret = patches_.emplace(node->name, node);
- if (!ret.second) {
- // There was already another patcher for this knob
- // Append the new node to the linked list.
- node->prev = ret.first->second;
- CHECK(!node->prev->next);
- node->prev->next = node;
- ret.first->second = node;
- }
- }
-
- void unpatch(PatchNode* node) {
- std::lock_guard<std::mutex> lock{mutex_};
-
- // Remove this PatchNode from the linked list
- if (node->prev) {
- node->prev->next = node->next;
- }
- if (node->next) {
- // There was another patch applied after this one.
- node->next->prev = node->prev;
- node->next->oldValue = node->oldValue;
- } else {
- // This was the most recently applied patch for this knob,
- // so restore the knob value.
- setKnobValue(node->name, node->oldValue);
-
- // The patches_ map should point to this node.
- // Update it to point to the previous patch, if there is one.
- auto iter = patches_.find(node->name);
- if (iter == patches_.end()) {
- LOG(FATAL) << "patch node not found when unpatching knob value";
- }
- TORCH_CHECK_EQ(iter->second, node);
- if (node->prev) {
- iter->second = node->prev;
- } else {
- patches_.erase(iter);
- }
- }
- }
-
- private:
- bool setKnobValue(c10::string_view name, bool value) {
- auto& knobs = caffe2::detail::getRegisteredKnobs();
- auto iter = knobs.find(name);
- if (iter == knobs.end()) {
- throw std::invalid_argument(
- "attempted to patch unknown knob \"" + std::string(name) + "\"");
- }
- bool oldValue = *(iter->second);
- *iter->second = value;
- return oldValue;
- }
-
- std::mutex mutex_;
- std::map<std::string, PatchNode*> patches_;
-};
-
-Patcher& getPatcher() {
- static Patcher patcher;
- return patcher;
-}
-
-PatchNode::PatchNode(c10::string_view knobName, bool value)
- : name{knobName} {
- getPatcher().patch(this, value);
-}
-
-PatchNode::~PatchNode() {
- try {
- getPatcher().unpatch(this);
- } catch (const std::exception& ex) {
- // This shouldn't ever happen unless we have a programming bug, but it keeps
- // clang-tidy happy if we put a catch block here to handle the theoretical
- // error if unpatch() calls setKnobValue() and it throws due to not finding
- // the knob by name.
- LOG(FATAL) << "error removing knob patch: " << ex.what();
- }
-}
-
-} // namespace
-} // namespace caffe2
diff --git a/caffe2/utils/knob_patcher.h b/caffe2/utils/knob_patcher.h
deleted file mode 100644
index ec2b627..0000000
--- a/caffe2/utils/knob_patcher.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include <c10/util/string_view.h>
-
-namespace caffe2 {
-
-/**
- * Patch the value of a knob during a unit test.
- *
- * This forces the knob to the specified value for as long as the KnobPatcher
- * object exists. When the KnobPatcher object is destroyed the knob will revert
- * to its previous value.
- */
-class KnobPatcher {
- public:
- KnobPatcher(c10::string_view name, bool value);
- ~KnobPatcher();
-
- KnobPatcher(KnobPatcher&&) noexcept;
- KnobPatcher& operator=(KnobPatcher&&) noexcept;
- KnobPatcher(const KnobPatcher&) = delete;
- KnobPatcher& operator=(const KnobPatcher&) = delete;
-
- private:
- class PatchState;
-
- std::unique_ptr<PatchState> state_;
-};
-
-} // namespace caffe2
diff --git a/caffe2/utils/knobs.cc b/caffe2/utils/knobs.cc
deleted file mode 100644
index 63941a5..0000000
--- a/caffe2/utils/knobs.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-//
-// This is a very basic knob implementation that purely uses command line flags.
-// This can be replaced with a more sophisticated implementation for use in
-// other production environments.
-
-#include <map>
-
-#include <c10/util/string_view.h>
-#include <c10/util/Flags.h>
-
-#include "caffe2/utils/knobs.h"
-
-namespace caffe2 {
-
-namespace detail {
-// Get the map of knob names to pointers to their command-line controlled
-// boolean value.
-std::map<c10::string_view, bool*>& getRegisteredKnobs() {
- // It's safe to store the keys as string_view, since DEFINE_KNOB() ensures
- // that these views always point to string literals.
- static std::map<c10::string_view, bool*> registeredKnobs;
- return registeredKnobs;
-}
-} // namespace detail
-
-bool CheckKnob(c10::string_view name) {
- const auto& knobs = detail::getRegisteredKnobs();
- auto iter = knobs.find(name);
- if (iter == knobs.end()) {
- throw std::invalid_argument(
- "attempted to check unknown knob \"" + std::string(name) + "\"");
- }
- return *iter->second;
-}
-
-namespace {
-class RegisterKnob {
- public:
- RegisterKnob(c10::string_view name, bool* cmdlineFlag) {
- auto ret = caffe2::detail::getRegisteredKnobs().emplace(name, cmdlineFlag);
- if (!ret.second) {
- throw std::runtime_error("duplicate knob name: " + std::string(name));
- }
- }
-};
-} // namespace
-} // namespace caffe2
-
-/**
- * Define a knob.
- *
- * This will define a --caffe2_knob_<name> command line flag to control the
- * knob.
- *
- * The knob can be checked in code by calling CheckKnob(name)
- * or CheckKnob<check_fn_name>()
- */
-#define DEFINE_KNOB(name, check_fn_name, default_value, docstring) \
- C10_DEFINE_bool(caffe2_knob_##name, default_value, docstring); \
- namespace caffe2 { \
- bool CheckKnob##check_fn_name() { \
- return FLAGS_caffe2_knob_##name; \
- } \
- } \
- static caffe2::RegisterKnob _knob_##name(#name, &FLAGS_caffe2_knob_##name)
-
-/*
- * Definitions of well-known knobs.
- */
-
-DEFINE_KNOB(
- example_knob,
- ExampleKnob,
- false,
- "An example knob, mainly intended for use in unit tests");
diff --git a/caffe2/utils/knobs.h b/caffe2/utils/knobs.h
deleted file mode 100644
index fbebd90..0000000
--- a/caffe2/utils/knobs.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-// This file contains functions for checking rollout knobs to enable staged
-// roll out of specific code functionality.
-
-#include <memory>
-
-#include <c10/util/string_view.h>
-
-namespace caffe2 {
-
-/**
- * Check an arbitrary knob by name.
- */
-bool CheckKnob(c10::string_view name);
-
-/*
- * The following are functions for checking specific known knob values.
- *
- * These APIs are more efficient than checking by name.
- */
-
-// An example knob, just for use in unit tests.
-bool CheckKnobExampleKnob();
-
-} // namespace caffe2
diff --git a/caffe2/utils/knobs_test.cc b/caffe2/utils/knobs_test.cc
deleted file mode 100644
index 95f29cf..0000000
--- a/caffe2/utils/knobs_test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "caffe2/utils/knobs.h"
-#include "caffe2/utils/knob_patcher.h"
-
-using namespace caffe2;
-
-TEST(KnobsTest, TestKnob) {
- auto p = KnobPatcher("example_knob", false);
- EXPECT_FALSE(CheckKnobExampleKnob());
- EXPECT_FALSE(CheckKnob("example_knob"));
-
- p = KnobPatcher("example_knob", true);
- EXPECT_TRUE(CheckKnobExampleKnob());
- EXPECT_TRUE(CheckKnob("example_knob"));
-
- // Test nested patchers
- {
- auto p2 = KnobPatcher("example_knob", false);
- EXPECT_FALSE(CheckKnobExampleKnob());
- EXPECT_FALSE(CheckKnob("example_knob"));
-
- auto p3 = KnobPatcher("example_knob", true);
- EXPECT_TRUE(CheckKnobExampleKnob());
- EXPECT_TRUE(CheckKnob("example_knob"));
- }
- EXPECT_TRUE(CheckKnobExampleKnob());
- EXPECT_TRUE(CheckKnob("example_knob"));
-}
-
-TEST(KnobsTest, TestUnknownKnob) {
- // Unknown knob names should throw an exception
- EXPECT_THROW(CheckKnob("this_knob_does_not_exist"), std::exception);
-}
diff --git a/caffe2/utils/map_utils.h b/caffe2/utils/map_utils.h
deleted file mode 100644
index ef8ff0c..0000000
--- a/caffe2/utils/map_utils.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-namespace caffe2 {
-
-// Get value from map given key. Return supplied default value if not found
-// This is a stripped down version from folly:
-// https://github.com/facebook/folly/blob/5a07e203d79324b68d69f294fa38e43b9671e9b1/folly/MapUtil.h#L35-L45
-template <
- class Map,
- typename Key = typename Map::key_type,
- typename Value = typename Map::mapped_type>
-typename Map::mapped_type
-get_default(const Map& map, const Key& key, Value&& dflt) {
- using M = typename Map::mapped_type;
- auto pos = map.find(key);
- return (pos != map.end()) ? (pos->second) : M(std::forward<Value>(dflt));
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/murmur_hash3.cc b/caffe2/utils/murmur_hash3.cc
deleted file mode 100644
index 68cce1f..0000000
--- a/caffe2/utils/murmur_hash3.cc
+++ /dev/null
@@ -1,450 +0,0 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
-
-#include "caffe2/utils/murmur_hash3.h"
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER)
-
-#define FORCE_INLINE __forceinline
-
-#include <stdlib.h>
-
-#define ROTL32(x, y) _rotl(x, y)
-#define ROTL64(x, y) _rotl64(x, y)
-
-#define BIG_CONSTANT(x) (x)
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#define FORCE_INLINE inline __attribute__((__always_inline__))
-
-inline uint32_t rotl32(uint32_t x, int8_t r) {
- return (x << r) | (x >> (32 - r));
-}
-
-inline uint64_t rotl64(uint64_t x, int8_t r) {
- return (x << r) | (x >> (64 - r));
-}
-
-#define ROTL32(x, y) rotl32(x, y)
-#define ROTL64(x, y) rotl64(x, y)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-#endif // !defined(_MSC_VER)
-
-//-----------------------------------------------------------------------------
-// Block read - if your platform needs to do endian-swapping or can only
-// handle aligned reads, do the conversion here
-
-FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) {
- return p[i];
-}
-
-FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) {
- return p[i];
-}
-
-//-----------------------------------------------------------------------------
-// Finalization mix - force all bits of a hash block to avalanche
-
-FORCE_INLINE uint32_t fmix32(uint32_t h) {
- h ^= h >> 16;
- h *= 0x85ebca6b;
- h ^= h >> 13;
- h *= 0xc2b2ae35;
- h ^= h >> 16;
-
- return h;
-}
-
-//----------
-
-FORCE_INLINE uint64_t fmix64(uint64_t k) {
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xff51afd7ed558ccd);
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
- k ^= k >> 33;
-
- return k;
-}
-
-namespace caffe2 {
-
-void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out) {
- const uint8_t* data = (const uint8_t*)key;
- const int nblocks = len / 4;
-
- uint32_t h1 = seed;
-
- const uint32_t c1 = 0xcc9e2d51;
- const uint32_t c2 = 0x1b873593;
-
- //----------
- // body
-
- const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4);
-
- for (int i = -nblocks; i; i++) {
- uint32_t k1 = getblock32(blocks, i);
-
- k1 *= c1;
- k1 = ROTL32(k1, 15);
- k1 *= c2;
-
- h1 ^= k1;
- h1 = ROTL32(h1, 13);
- h1 = h1 * 5 + 0xe6546b64;
- }
-
- //----------
- // tail
-
- const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
-
- uint32_t k1 = 0;
-
- switch (len & 3) {
- case 3:
- k1 ^= tail[2] << 16;
- [[fallthrough]];
- case 2:
- k1 ^= tail[1] << 8;
- [[fallthrough]];
- case 1:
- k1 ^= tail[0];
- k1 *= c1;
- k1 = ROTL32(k1, 15);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
-
- h1 = fmix32(h1);
-
- *(uint32_t*)out = h1;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x86_128(
- const void* key,
- const int len,
- uint32_t seed,
- void* out) {
- const uint8_t* data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint32_t h1 = seed;
- uint32_t h2 = seed;
- uint32_t h3 = seed;
- uint32_t h4 = seed;
-
- const uint32_t c1 = 0x239b961b;
- const uint32_t c2 = 0xab0e9789;
- const uint32_t c3 = 0x38b34ae5;
- const uint32_t c4 = 0xa1e38b93;
-
- //----------
- // body
-
- const uint32_t* blocks = (const uint32_t*)(data + nblocks * 16);
-
- for (int i = -nblocks; i; i++) {
- uint32_t k1 = getblock32(blocks, i * 4 + 0);
- uint32_t k2 = getblock32(blocks, i * 4 + 1);
- uint32_t k3 = getblock32(blocks, i * 4 + 2);
- uint32_t k4 = getblock32(blocks, i * 4 + 3);
-
- k1 *= c1;
- k1 = ROTL32(k1, 15);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL32(h1, 19);
- h1 += h2;
- h1 = h1 * 5 + 0x561ccd1b;
-
- k2 *= c2;
- k2 = ROTL32(k2, 16);
- k2 *= c3;
- h2 ^= k2;
-
- h2 = ROTL32(h2, 17);
- h2 += h3;
- h2 = h2 * 5 + 0x0bcaa747;
-
- k3 *= c3;
- k3 = ROTL32(k3, 17);
- k3 *= c4;
- h3 ^= k3;
-
- h3 = ROTL32(h3, 15);
- h3 += h4;
- h3 = h3 * 5 + 0x96cd1c35;
-
- k4 *= c4;
- k4 = ROTL32(k4, 18);
- k4 *= c1;
- h4 ^= k4;
-
- h4 = ROTL32(h4, 13);
- h4 += h1;
- h4 = h4 * 5 + 0x32ac3b17;
- }
-
- //----------
- // tail
-
- const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
- uint32_t k1 = 0;
- uint32_t k2 = 0;
- uint32_t k3 = 0;
- uint32_t k4 = 0;
-
- switch (len & 15) {
- case 15:
- k4 ^= tail[14] << 16;
- [[fallthrough]];
- case 14:
- k4 ^= tail[13] << 8;
- [[fallthrough]];
- case 13:
- k4 ^= tail[12] << 0;
- k4 *= c4;
- k4 = ROTL32(k4, 18);
- k4 *= c1;
- h4 ^= k4;
- [[fallthrough]];
-
- case 12:
- k3 ^= tail[11] << 24;
- [[fallthrough]];
- case 11:
- k3 ^= tail[10] << 16;
- [[fallthrough]];
- case 10:
- k3 ^= tail[9] << 8;
- [[fallthrough]];
- case 9:
- k3 ^= tail[8] << 0;
- k3 *= c3;
- k3 = ROTL32(k3, 17);
- k3 *= c4;
- h3 ^= k3;
- [[fallthrough]];
-
- case 8:
- k2 ^= tail[7] << 24;
- [[fallthrough]];
- case 7:
- k2 ^= tail[6] << 16;
- [[fallthrough]];
- case 6:
- k2 ^= tail[5] << 8;
- [[fallthrough]];
- case 5:
- k2 ^= tail[4] << 0;
- k2 *= c2;
- k2 = ROTL32(k2, 16);
- k2 *= c3;
- h2 ^= k2;
- [[fallthrough]];
-
- case 4:
- k1 ^= tail[3] << 24;
- [[fallthrough]];
- case 3:
- k1 ^= tail[2] << 16;
- [[fallthrough]];
- case 2:
- k1 ^= tail[1] << 8;
- [[fallthrough]];
- case 1:
- k1 ^= tail[0] << 0;
- k1 *= c1;
- k1 = ROTL32(k1, 15);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
- h3 ^= len;
- h4 ^= len;
-
- h1 += h2;
- h1 += h3;
- h1 += h4;
- h2 += h1;
- h3 += h1;
- h4 += h1;
-
- h1 = fmix32(h1);
- h2 = fmix32(h2);
- h3 = fmix32(h3);
- h4 = fmix32(h4);
-
- h1 += h2;
- h1 += h3;
- h1 += h4;
- h2 += h1;
- h3 += h1;
- h4 += h1;
-
- ((uint32_t*)out)[0] = h1;
- ((uint32_t*)out)[1] = h2;
- ((uint32_t*)out)[2] = h3;
- ((uint32_t*)out)[3] = h4;
-}
-
-//-----------------------------------------------------------------------------
-
-void MurmurHash3_x64_128(
- const void* key,
- const int len,
- const uint32_t seed,
- void* out) {
- const uint8_t* data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint64_t h1 = seed;
- uint64_t h2 = seed;
-
- const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
- const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
- //----------
- // body
-
- const uint64_t* blocks = (const uint64_t*)(data);
-
- for (int i = 0; i < nblocks; i++) {
- uint64_t k1 = getblock64(blocks, i * 2 + 0);
- uint64_t k2 = getblock64(blocks, i * 2 + 1);
-
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL64(h1, 27);
- h1 += h2;
- h1 = h1 * 5 + 0x52dce729;
-
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2;
-
- h2 = ROTL64(h2, 31);
- h2 += h1;
- h2 = h2 * 5 + 0x38495ab5;
- }
-
- //----------
- // tail
-
- const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
- uint64_t k1 = 0;
- uint64_t k2 = 0;
-
- switch (len & 15) {
- case 15:
- k2 ^= ((uint64_t)tail[14]) << 48;
- [[fallthrough]];
- case 14:
- k2 ^= ((uint64_t)tail[13]) << 40;
- [[fallthrough]];
- case 13:
- k2 ^= ((uint64_t)tail[12]) << 32;
- [[fallthrough]];
- case 12:
- k2 ^= ((uint64_t)tail[11]) << 24;
- [[fallthrough]];
- case 11:
- k2 ^= ((uint64_t)tail[10]) << 16;
- [[fallthrough]];
- case 10:
- k2 ^= ((uint64_t)tail[9]) << 8;
- [[fallthrough]];
- case 9:
- k2 ^= ((uint64_t)tail[8]) << 0;
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2;
- [[fallthrough]];
-
- case 8:
- k1 ^= ((uint64_t)tail[7]) << 56;
- [[fallthrough]];
- case 7:
- k1 ^= ((uint64_t)tail[6]) << 48;
- [[fallthrough]];
- case 6:
- k1 ^= ((uint64_t)tail[5]) << 40;
- [[fallthrough]];
- case 5:
- k1 ^= ((uint64_t)tail[4]) << 32;
- [[fallthrough]];
- case 4:
- k1 ^= ((uint64_t)tail[3]) << 24;
- [[fallthrough]];
- case 3:
- k1 ^= ((uint64_t)tail[2]) << 16;
- [[fallthrough]];
- case 2:
- k1 ^= ((uint64_t)tail[1]) << 8;
- [[fallthrough]];
- case 1:
- k1 ^= ((uint64_t)tail[0]) << 0;
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
- };
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix64(h1);
- h2 = fmix64(h2);
-
- h1 += h2;
- h2 += h1;
-
- ((uint64_t*)out)[0] = h1;
- ((uint64_t*)out)[1] = h2;
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/murmur_hash3.h b/caffe2/utils/murmur_hash3.h
deleted file mode 100644
index ea67e71..0000000
--- a/caffe2/utils/murmur_hash3.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-#pragma once
-
-//-----------------------------------------------------------------------------
-// Platform-specific functions and macros
-
-// Microsoft Visual Studio
-
-#if defined(_MSC_VER) && (_MSC_VER < 1600)
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned __int64 uint64_t;
-
-// Other compilers
-
-#else // defined(_MSC_VER)
-
-#include <stdint.h>
-
-#endif // !defined(_MSC_VER)
-
-namespace caffe2 {
-
-void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out);
-
-void MurmurHash3_x86_128(const void* key, int len, uint32_t seed, void* out);
-
-void MurmurHash3_x64_128(const void* key, int len, uint32_t seed, void* out);
-
-} // namespace caffe2
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
deleted file mode 100644
index 8fc8158..0000000
--- a/caffe2/utils/proto_utils.cc
+++ /dev/null
@@ -1,715 +0,0 @@
-#include "caffe2/utils/proto_utils.h"
-
-#include <c10/core/DeviceType.h>
-
-#include <fcntl.h>
-#include <cerrno>
-#include <fstream>
-#include <unordered_set>
-
-#if defined(_MSC_VER)
-#include <io.h>
-#else
-#include <unistd.h>
-#endif
-
-#include <google/protobuf/io/coded_stream.h>
-
-#ifndef CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/text_format.h>
-#else
-#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
-#endif // !CAFFE2_USE_LITE_PROTO
-
-#include <c10/util/Logging.h>
-
-using ::google::protobuf::MessageLite;
-
-namespace caffe2 {
-
-C10_EXPORT std::string DeviceTypeName(const int32_t& d) {
- return at::DeviceTypeName(static_cast<at::DeviceType>(d));
-}
-
-void setTotalBytesLimit(::google::protobuf::io::CodedInputStream& stream, int bytes_limit, int warning_threshold) {
- #if GOOGLE_PROTOBUF_VERSION >= 3011000
- // Only take one parameter since protobuf 3.11
- stream.SetTotalBytesLimit(bytes_limit);
- #else
- stream.SetTotalBytesLimit(bytes_limit, warning_threshold);
- #endif
-}
-
-C10_EXPORT int DeviceId(const DeviceOption& option) {
- switch (option.device_type()) {
- case PROTO_CPU:
- return option.numa_node_id();
- case PROTO_CUDA:
- case PROTO_HIP:
- return option.device_id();
- case PROTO_MKLDNN:
- return option.numa_node_id();
- default:
- CAFFE_THROW("Unknown device id for device type: ", option.device_type());
- }
-}
-
-C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) {
- return (
- lhs.device_type() == rhs.device_type() &&
- lhs.device_id() == rhs.device_id() &&
- lhs.node_name() == rhs.node_name() &&
- lhs.numa_node_id() == rhs.numa_node_id());
-}
-
-C10_EXPORT bool IsCPUDeviceType(int device_type) {
- static const std::unordered_set<int> cpu_types{
- PROTO_CPU,
- PROTO_MKLDNN,
- PROTO_IDEEP,
- };
- return cpu_types.count(device_type);
-}
-
-C10_EXPORT bool IsGPUDeviceType(int device_type) {
- static const std::unordered_set<int> gpu_types{
- PROTO_CUDA,
- PROTO_HIP,
- };
- return gpu_types.count(device_type);
-}
-
-C10_EXPORT bool ReadStringFromFile(const char* filename, string* str) {
- std::ifstream ifs(filename, std::ios::in);
- if (!ifs) {
- VLOG(1) << "File cannot be opened: " << filename
- << " error: " << ifs.rdstate();
- return false;
- }
- ifs.seekg(0, std::ios::end);
- size_t n = ifs.tellg();
- str->resize(n);
- ifs.seekg(0);
- ifs.read(&(*str)[0], n);
- return true;
-}
-
-C10_EXPORT bool WriteStringToFile(const string& str, const char* filename) {
- std::ofstream ofs(filename, std::ios::out | std::ios::trunc);
- if (!ofs.is_open()) {
- VLOG(1) << "File cannot be created: " << filename
- << " error: " << ofs.rdstate();
- return false;
- }
- ofs << str;
- return true;
-}
-
-// IO-specific proto functions: we will deal with the protocol buffer lite and
-// full versions differently.
-
-#ifdef CAFFE2_USE_LITE_PROTO
-
-// Lite runtime.
-
-namespace {
-class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream {
- public:
- explicit IfstreamInputStream(const string& filename)
- : ifs_(filename.c_str(), std::ios::in | std::ios::binary) {}
- ~IfstreamInputStream() {
- ifs_.close();
- }
-
- int Read(void* buffer, int size) {
- if (!ifs_) {
- return -1;
- }
- ifs_.read(static_cast<char*>(buffer), size);
- return ifs_.gcount();
- }
-
- private:
- std::ifstream ifs_;
-};
-} // namespace
-
-C10_EXPORT string ProtoDebugString(const MessageLite& proto) {
- string serialized = proto.SerializeAsString();
- for (char& c : serialized) {
- if (c < 0x20 || c >= 0x7f) {
- c = '?';
- }
- }
- return serialized;
-}
-
-C10_EXPORT bool ParseProtoFromLargeString(
- const string& str,
- MessageLite* proto) {
- ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
- ::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
- // Set PlanDef message size limit to 2G.
- setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
- return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT bool ReadProtoFromBinaryFile(
- const char* filename,
- MessageLite* proto) {
- ::google::protobuf::io::CopyingInputStreamAdaptor stream(
- new IfstreamInputStream(filename));
- stream.SetOwnsCopyingStream(true);
- // Total bytes hard limit / warning limit are set to 2GB and 512MB
- // respectively.
- ::google::protobuf::io::CodedInputStream coded_stream(&stream);
- setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
- return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT void WriteProtoToBinaryFile(
- const MessageLite& /*proto*/,
- const char* /*filename*/) {
- LOG(FATAL) << "Not implemented yet.";
-}
-
-#else // CAFFE2_USE_LITE_PROTO
-
-// Full protocol buffer.
-
-using ::google::protobuf::Message;
-using ::google::protobuf::io::CodedInputStream;
-using ::google::protobuf::io::CodedOutputStream;
-using ::google::protobuf::io::FileInputStream;
-using ::google::protobuf::io::FileOutputStream;
-using ::google::protobuf::io::ZeroCopyInputStream;
-using ::google::protobuf::io::ZeroCopyOutputStream;
-
-namespace TextFormat {
-C10_EXPORT bool ParseFromString(const string& spec, Message* proto) {
- string bc_spec = spec;
-
- {
- auto num_replaced = c10::ReplaceAll(bc_spec, "cuda_gpu_id", "device_id");
- if (num_replaced) {
- LOG(ERROR) << "Your model was serialized in Protobuf TextFormat and "
- << "it has " << num_replaced
- << " places using the deprecated field name 'cuda_gpu_id'!\n"
- << spec
- << "\nPlease re-export your model in Protobuf binary format "
- << "to make it backward compatible for field renaming.";
- }
- }
-
- return ::google::protobuf::TextFormat::ParseFromString(
- // NOLINTNEXTLINE(performance-move-const-arg)
- std::move(bc_spec), proto);
-}
-} // namespace TextFormat
-
-C10_EXPORT string ProtoDebugString(const Message& proto) {
- return proto.ShortDebugString();
-}
-
-C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) {
- ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size());
- ::google::protobuf::io::CodedInputStream coded_stream(&input_stream);
- // Set PlanDef message size limit to 2G.
- setTotalBytesLimit(coded_stream, 2147483647, 512LL << 20);
- return proto->ParseFromCodedStream(&coded_stream);
-}
-
-C10_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) {
- int fd = open(filename, O_RDONLY);
- CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename);
- FileInputStream* input = new FileInputStream(fd);
- bool success = google::protobuf::TextFormat::Parse(input, proto);
- delete input;
- close(fd);
- return success;
-}
-
-C10_EXPORT void WriteProtoToTextFile(
- const Message& proto,
- const char* filename,
- bool throwIfError) {
- int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
- FileOutputStream* output = new FileOutputStream(fd);
- if(!google::protobuf::TextFormat::Print(proto, output)) {
- if (throwIfError) {
- CAFFE_THROW("Cannot write proto to text file: ", filename);
- } else {
- LOG(ERROR) << "Cannot write proto to text file: " << filename;
- }
- }
- delete output;
- close(fd);
-}
-
-C10_EXPORT bool ReadProtoFromBinaryFile(
- const char* filename,
- MessageLite* proto) {
-#if defined(_MSC_VER) // for MSC compiler binary flag needs to be specified
- int fd = open(filename, O_RDONLY | O_BINARY);
-#else
- int fd = open(filename, O_RDONLY);
-#endif
- CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename);
- std::unique_ptr<ZeroCopyInputStream> raw_input(new FileInputStream(fd));
- std::unique_ptr<CodedInputStream> coded_input(
- new CodedInputStream(raw_input.get()));
- // A hack to manually allow using very large protocol buffers.
- #if GOOGLE_PROTOBUF_VERSION >= 3011000
- // Only take one parameter since protobuf 3.11
- coded_input->SetTotalBytesLimit(2147483647);
- #else
- // Total bytes hard limit / warning limit are set to 2GB and 512MB respectively.
- coded_input->SetTotalBytesLimit(2147483647, 536870912);
- #endif
- bool success = proto->ParseFromCodedStream(coded_input.get());
- coded_input.reset();
- raw_input.reset();
- close(fd);
- return success;
-}
-
-C10_EXPORT void WriteProtoToBinaryFile(
- const MessageLite& proto,
- const char* filename) {
- int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
- CAFFE_ENFORCE_NE(
- fd, -1, "File cannot be created: ", filename, " error number: ", errno);
- std::unique_ptr<ZeroCopyOutputStream> raw_output(new FileOutputStream(fd));
- std::unique_ptr<CodedOutputStream> coded_output(
- new CodedOutputStream(raw_output.get()));
- CAFFE_ENFORCE(proto.SerializeToCodedStream(coded_output.get()));
- coded_output.reset();
- raw_output.reset();
- close(fd);
-}
-
-#endif // CAFFE2_USE_LITE_PROTO
-
-C10_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) {
- for (auto& arg : def.arg()) {
- if (arg_map_.count(arg.name())) {
- if (arg.SerializeAsString() != arg_map_[arg.name()].SerializeAsString()) {
- // If there are two arguments of the same name but different contents,
- // we will throw an error.
- CAFFE_THROW(
- "Found argument of the same name ",
- arg.name(),
- "but with different contents.",
- ProtoDebugString(def));
- } else {
- LOG(WARNING) << "Duplicated argument name [" << arg.name()
- << "] found in operator def: " << ProtoDebugString(def);
- }
- }
- arg_map_[arg.name()] = arg;
- }
-}
-
-C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) {
- for (auto& arg : netdef.arg()) {
- CAFFE_ENFORCE(
- arg_map_.count(arg.name()) == 0,
- "Duplicated argument name [",
- arg.name(),
- "] found in net def: ",
- ProtoDebugString(netdef));
- arg_map_[arg.name()] = arg;
- }
-}
-
-C10_EXPORT bool ArgumentHelper::HasArgument(c10::string_view name) const {
-#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
- return arg_map_.count(name);
-#else
- return arg_map_.count(std::string(name));
-#endif
-}
-
-namespace {
-// Helper function to verify that conversion between types won't loose any
-// significant bit.
-template <typename InputType, typename TargetType>
-bool SupportsLosslessConversion(const InputType& value) {
- return static_cast<InputType>(static_cast<TargetType>(value)) == value;
-}
-} // namespace
-bool operator==(const TensorProto& l, const TensorProto& r) {
- return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const TensorProto& n) {
- output << n.SerializeAsString();
- return output;
-}
-bool operator==(const QTensorProto& l, const QTensorProto& r) {
- return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const QTensorProto& n) {
- output << n.SerializeAsString();
- return output;
-}
-bool operator==(const NetDef& l, const NetDef& r) {
- return l.SerializeAsString() == r.SerializeAsString();
-}
-
-std::ostream& operator<<(std::ostream& output, const NetDef& n) {
- output << n.SerializeAsString();
- return output;
-}
-
-#define INSTANTIATE_GET_SINGLE_ARGUMENT( \
- T, fieldname, enforce_lossless_conversion) \
- template <> \
- C10_EXPORT T ArgumentHelper::GetSingleArgument<T>( \
- c10::string_view name, const T& default_value) const { \
- auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \
- if (it == arg_map_.end()) { \
- VLOG(1) << "Using default parameter value " << default_value \
- << " for parameter " << name; \
- return default_value; \
- } \
- CAFFE_ENFORCE( \
- it->second.has_##fieldname(), \
- "Argument ", \
- name, \
- " does not have the right field: expected field " #fieldname); \
- const auto& value = it->second.fieldname(); \
- if (enforce_lossless_conversion) { \
- auto supportsConversion = \
- SupportsLosslessConversion<decltype(value), T>(value); \
- CAFFE_ENFORCE( \
- supportsConversion, \
- "Value", \
- value, \
- " of argument ", \
- name, \
- "cannot be represented correctly in a target type"); \
- } \
- return static_cast<T>(value); \
- } \
- template <> \
- C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType<T>( \
- c10::string_view name) const { \
- auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \
- if (it == arg_map_.end()) { \
- return false; \
- } \
- return it->second.has_##fieldname(); \
- }
-
-INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(double, f, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(bool, i, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int8_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int16_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(int64_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(uint8_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(uint16_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(size_t, i, true)
-INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
-INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false)
-#undef INSTANTIATE_GET_SINGLE_ARGUMENT
-
-#define INSTANTIATE_GET_REPEATED_ARGUMENT( \
- T, fieldname, enforce_lossless_conversion) \
- template <> \
- C10_EXPORT std::vector<T> ArgumentHelper::GetRepeatedArgument<T>( \
- c10::string_view name, const std::vector<T>& default_value) const { \
- auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name); \
- if (it == arg_map_.end()) { \
- return default_value; \
- } \
- std::vector<T> values; \
- for (const auto& v : it->second.fieldname()) { \
- if (enforce_lossless_conversion) { \
- auto supportsConversion = \
- SupportsLosslessConversion<decltype(v), T>(v); \
- CAFFE_ENFORCE( \
- supportsConversion, \
- "Value", \
- v, \
- " of argument ", \
- name, \
- "cannot be represented correctly in a target type"); \
- } \
- values.push_back(static_cast<T>(v)); \
- } \
- return values; \
- }
-
-INSTANTIATE_GET_REPEATED_ARGUMENT(float, floats, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(double, floats, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(bool, ints, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int8_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int16_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(int64_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(uint8_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(uint16_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(size_t, ints, true)
-INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(NetDef, nets, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(TensorProto, tensors, false)
-INSTANTIATE_GET_REPEATED_ARGUMENT(QTensorProto, qtensors, false)
-#undef INSTANTIATE_GET_REPEATED_ARGUMENT
-
-#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname) \
- template <> \
- C10_EXPORT Argument MakeArgument(const string& name, const T& value) { \
- Argument arg; \
- arg.set_name(name); \
- arg.set_##fieldname(value); \
- return arg; \
- }
-
-CAFFE2_MAKE_SINGULAR_ARGUMENT(bool, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(float, f)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int16_t, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(int64_t, i)
-CAFFE2_MAKE_SINGULAR_ARGUMENT(string, s)
-#undef CAFFE2_MAKE_SINGULAR_ARGUMENT
-
-template <>
-C10_EXPORT Argument MakeArgument(const string& name, const NetDef& value) {
- Argument arg;
- arg.set_name(name);
- *arg.mutable_n() = value;
- return arg;
-}
-
-template <>
-C10_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index);
-template <>
-bool ArgumentHelper::RemoveArgument(NetDef& def, int index);
-
-template <>
-C10_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) {
- Argument arg;
- arg.set_name(name);
- arg.set_s(value.SerializeAsString());
- return arg;
-}
-
-#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \
- template <> \
- C10_EXPORT Argument MakeArgument( \
- const string& name, const std::vector<T>& value) { \
- Argument arg; \
- arg.set_name(name); \
- for (const auto& v : value) { \
- arg.add_##fieldname(v); \
- } \
- return arg; \
- }
-
-CAFFE2_MAKE_REPEATED_ARGUMENT(float, floats)
-CAFFE2_MAKE_REPEATED_ARGUMENT(int, ints)
-CAFFE2_MAKE_REPEATED_ARGUMENT(int64_t, ints)
-CAFFE2_MAKE_REPEATED_ARGUMENT(string, strings)
-#undef CAFFE2_MAKE_REPEATED_ARGUMENT
-
-C10_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) {
- for (const auto& outp : op.output()) {
- if (outp == output) {
- return true;
- }
- }
- return false;
-}
-
-C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) {
- for (const auto& inp : op.input()) {
- if (inp == input) {
- return true;
- }
- }
- return false;
-}
-
-// Return the argument index or -1 if it does not exist.
-C10_EXPORT int GetArgumentIndex(
- const google::protobuf::RepeatedPtrField<Argument>& args,
- c10::string_view name) {
- int index = 0;
- for (const Argument& arg : args) {
- if (arg.name() == name) {
- return index;
- }
- index++;
- }
- return -1;
-}
-
-C10_EXPORT const Argument& GetArgument(
- const OperatorDef& def,
- c10::string_view name) {
- int index = GetArgumentIndex(def.arg(), name);
- if (index != -1) {
- return def.arg(index);
- } else {
- CAFFE_THROW(
- "Argument named ",
- name,
- " does not exist in operator ",
- ProtoDebugString(def));
- }
-}
-
-C10_EXPORT const Argument& GetArgument(const NetDef& def, c10::string_view name) {
- int index = GetArgumentIndex(def.arg(), name);
- if (index != -1) {
- return def.arg(index);
- } else {
- CAFFE_THROW(
- "Argument named ",
- name,
- " does not exist in net ",
- ProtoDebugString(def));
- }
-}
-
-C10_EXPORT const Argument* GetArgumentPtr(
- const OperatorDef& def,
- c10::string_view name) {
- int index = GetArgumentIndex(def.arg(), name);
- if (index != -1) {
- return &def.arg(index);
- } else {
- return nullptr;
- }
-}
-
-C10_EXPORT const Argument* GetArgumentPtr(
- const NetDef& def,
- c10::string_view name) {
- int index = GetArgumentIndex(def.arg(), name);
- if (index != -1) {
- return &def.arg(index);
- } else {
- return nullptr;
- }
-}
-
-C10_EXPORT bool GetFlagArgument(
- const google::protobuf::RepeatedPtrField<Argument>& args,
- c10::string_view name,
- bool default_value) {
- int index = GetArgumentIndex(args, name);
- if (index != -1) {
- // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
- auto arg = args.Get(index);
- CAFFE_ENFORCE(
- arg.has_i(), "Can't parse argument as bool: ", ProtoDebugString(arg));
- return arg.i();
- }
- return default_value;
-}
-
-C10_EXPORT bool GetFlagArgument(
- const OperatorDef& def,
- c10::string_view name,
- bool default_value) {
- return GetFlagArgument(def.arg(), name, default_value);
-}
-
-C10_EXPORT bool
-GetFlagArgument(const NetDef& def, c10::string_view name, bool default_value) {
- return GetFlagArgument(def.arg(), name, default_value);
-}
-
-template <typename Def>
-Argument* GetMutableArgumentImpl(
- const string& name,
- const bool create_if_missing,
- Def* def) {
- for (int i = 0; i < def->arg_size(); ++i) {
- if (def->arg(i).name() == name) {
- return def->mutable_arg(i);
- }
- }
- // If no argument of the right name is found...
- if (create_if_missing) {
- Argument* arg = def->add_arg();
- arg->set_name(name);
- return arg;
- } else {
- return nullptr;
- }
-}
-
-C10_EXPORT Argument* GetMutableArgument(
- const string& name,
- const bool create_if_missing,
- OperatorDef* def) {
- return GetMutableArgumentImpl(name, create_if_missing, def);
-}
-
-C10_EXPORT Argument* GetMutableArgument(
- const string& name,
- const bool create_if_missing,
- NetDef* def) {
- return GetMutableArgumentImpl(name, create_if_missing, def);
-}
-
-C10_EXPORT void cleanupExternalInputsAndOutputs(NetDef* net) {
- std::vector<std::string> oldExternalInputs;
- for (const auto& input : net->external_input()) {
- oldExternalInputs.emplace_back(input);
- }
- std::vector<std::string> oldExternalOutputs;
- for (const auto& output : net->external_output()) {
- oldExternalOutputs.emplace_back(output);
- }
-
- net->clear_external_input();
- net->clear_external_output();
-
- std::set<std::string> inputSet;
- for (const auto& input : oldExternalInputs) {
- if (inputSet.count(input)) {
- // Prevent duplicate external inputs.
- continue;
- }
- inputSet.insert(input);
- net->add_external_input(input);
- }
-
- // Set of blobs that are external inputs or outputs of some operators.
- std::set<std::string> allOutputs(inputSet.begin(), inputSet.end());
- for (const auto& op : net->op()) {
- for (const auto& input : op.input()) {
- if (inputSet.count(input) || allOutputs.count(input)) {
- continue;
- }
- // Add missing external inputs.
- inputSet.insert(input);
- net->add_external_input(input);
- }
- for (const auto& output : op.output()) {
- allOutputs.insert(output);
- }
- }
-
- std::set<std::string> outputSet;
- for (const auto& output : oldExternalOutputs) {
- if (!allOutputs.count(output)) {
- continue;
- }
- if (outputSet.count(output)) {
- continue;
- }
- outputSet.insert(output);
- net->add_external_output(output);
- }
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h
deleted file mode 100644
index a690342..0000000
--- a/caffe2/utils/proto_utils.h
+++ /dev/null
@@ -1,383 +0,0 @@
-#ifndef CAFFE2_UTILS_PROTO_UTILS_H_
-#define CAFFE2_UTILS_PROTO_UTILS_H_
-
-#ifdef CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/message_lite.h>
-#else // CAFFE2_USE_LITE_PROTO
-#include <google/protobuf/message.h>
-#endif // !CAFFE2_USE_LITE_PROTO
-
-#include <c10/util/Logging.h>
-#include <c10/util/string_view.h>
-#include <c10/util/irange.h>
-
-#include "caffe2/utils/proto_wrap.h"
-#include "caffe2/proto/caffe2_pb.h"
-
-#ifndef C10_ANDROID
-#define CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
-#define CAFFE2_ARG_MAP_FIND(map, key) map.find(key)
-#else
-#define CAFFE2_ARG_MAP_FIND(map, key) map.find(std::string(key))
-#endif
-
-namespace caffe2 {
-
-using std::string;
-using ::google::protobuf::MessageLite;
-
-// A wrapper function to return device name string for use in blob serialization
-// / deserialization. This should have one to one correspondence with
-// caffe2/proto/caffe2.proto: enum DeviceType.
-//
-// Note that we can't use DeviceType_Name, because that is only available in
-// protobuf-full, and some platforms (like mobile) may want to use
-// protobuf-lite instead.
-TORCH_API std::string DeviceTypeName(const int32_t& d);
-
-TORCH_API int DeviceId(const DeviceOption& option);
-
-// Returns if the two DeviceOptions are pointing to the same device.
-TORCH_API bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs);
-
-TORCH_API bool IsCPUDeviceType(int device_type);
-TORCH_API bool IsGPUDeviceType(int device_type);
-
-// Common interfaces that reads file contents into a string.
-TORCH_API bool ReadStringFromFile(const char* filename, string* str);
-TORCH_API bool WriteStringToFile(const string& str, const char* filename);
-
-// Common interfaces that are supported by both lite and full protobuf.
-TORCH_API bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto);
-inline bool ReadProtoFromBinaryFile(const string filename, MessageLite* proto) {
- return ReadProtoFromBinaryFile(filename.c_str(), proto);
-}
-
-TORCH_API void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename);
-inline void WriteProtoToBinaryFile(const MessageLite& proto,
- const string& filename) {
- return WriteProtoToBinaryFile(proto, filename.c_str());
-}
-
-#ifdef CAFFE2_USE_LITE_PROTO
-
-namespace TextFormat {
-inline bool ParseFromString(const string& spec, MessageLite* proto) {
- LOG(FATAL) << "If you are running lite version, you should not be "
- << "calling any text-format protobuffers.";
- return false;
-}
-} // namespace TextFormat
-
-
-TORCH_API string ProtoDebugString(const MessageLite& proto);
-
-TORCH_API bool ParseProtoFromLargeString(const string& str, MessageLite* proto);
-
-// Text format MessageLite wrappers: these functions do nothing but just
-// allowing things to compile. It will produce a runtime error if you are using
-// MessageLite but still want text support.
-inline bool ReadProtoFromTextFile(
- const char* /*filename*/,
- MessageLite* /*proto*/) {
- LOG(FATAL) << "If you are running lite version, you should not be "
- << "calling any text-format protobuffers.";
- return false; // Just to suppress compiler warning.
-}
-inline bool ReadProtoFromTextFile(const string filename, MessageLite* proto) {
- return ReadProtoFromTextFile(filename.c_str(), proto);
-}
-
-inline void WriteProtoToTextFile(
- const MessageLite& /*proto*/,
- const char* /*filename*/,
- bool throwIfError = true) {
- LOG(FATAL) << "If you are running lite version, you should not be "
- << "calling any text-format protobuffers.";
-}
-inline void WriteProtoToTextFile(const MessageLite& proto,
- const string& filename,
- bool throwIfError = true) {
- return WriteProtoToTextFile(proto, filename.c_str(), throwIfError);
-}
-
-inline bool ReadProtoFromFile(const char* filename, MessageLite* proto) {
- return (ReadProtoFromBinaryFile(filename, proto) ||
- ReadProtoFromTextFile(filename, proto));
-}
-
-inline bool ReadProtoFromFile(const string& filename, MessageLite* proto) {
- return ReadProtoFromFile(filename.c_str(), proto);
-}
-
-#else // CAFFE2_USE_LITE_PROTO
-
-using ::google::protobuf::Message;
-
-namespace TextFormat {
-TORCH_API bool ParseFromString(const string& spec, Message* proto);
-} // namespace TextFormat
-
-TORCH_API string ProtoDebugString(const Message& proto);
-
-TORCH_API bool ParseProtoFromLargeString(const string& str, Message* proto);
-
-TORCH_API bool ReadProtoFromTextFile(const char* filename, Message* proto);
-inline bool ReadProtoFromTextFile(const string filename, Message* proto) {
- return ReadProtoFromTextFile(filename.c_str(), proto);
-}
-
-TORCH_API void WriteProtoToTextFile(const Message& proto, const char* filename, bool throwIfError = true);
-inline void WriteProtoToTextFile(const Message& proto, const string& filename, bool throwIfError = true) {
- return WriteProtoToTextFile(proto, filename.c_str(), throwIfError);
-}
-
-// Read Proto from a file, letting the code figure out if it is text or binary.
-inline bool ReadProtoFromFile(const char* filename, Message* proto) {
- return (ReadProtoFromBinaryFile(filename, proto) ||
- ReadProtoFromTextFile(filename, proto));
-}
-
-inline bool ReadProtoFromFile(const string& filename, Message* proto) {
- return ReadProtoFromFile(filename.c_str(), proto);
-}
-
-#endif // CAFFE2_USE_LITE_PROTO
-
-template <
- class IterableInputs = std::initializer_list<string>,
- class IterableOutputs = std::initializer_list<string>,
- class IterableArgs = std::initializer_list<Argument>>
-OperatorDef CreateOperatorDef(
- const string& type,
- const string& name,
- const IterableInputs& inputs,
- const IterableOutputs& outputs,
- const IterableArgs& args,
- const DeviceOption& device_option = DeviceOption(),
- const string& engine = "") {
- OperatorDef def;
- def.set_type(type);
- def.set_name(name);
- for (const string& in : inputs) {
- def.add_input(in);
- }
- for (const string& out : outputs) {
- def.add_output(out);
- }
- for (const Argument& arg : args) {
- def.add_arg()->CopyFrom(arg);
- }
- if (device_option.has_device_type()) {
- def.mutable_device_option()->CopyFrom(device_option);
- }
- if (engine.size()) {
- def.set_engine(engine);
- }
- return def;
-}
-
-// A simplified version compared to the full CreateOperator, if you do not need
-// to specify args.
-template <
- class IterableInputs = std::initializer_list<string>,
- class IterableOutputs = std::initializer_list<string>>
-inline OperatorDef CreateOperatorDef(
- const string& type,
- const string& name,
- const IterableInputs& inputs,
- const IterableOutputs& outputs,
- const DeviceOption& device_option = DeviceOption(),
- const string& engine = "") {
- return CreateOperatorDef(
- type,
- name,
- inputs,
- outputs,
- std::vector<Argument>(),
- device_option,
- engine);
-}
-
-TORCH_API bool HasOutput(const OperatorDef& op, const std::string& output);
-TORCH_API bool HasInput(const OperatorDef& op, const std::string& input);
-
-/**
- * @brief A helper class to index into arguments.
- *
- * This helper helps us to more easily index into a set of arguments
- * that are present in the operator. To save memory, the argument helper
- * does not copy the operator def, so one would need to make sure that the
- * lifetime of the OperatorDef object outlives that of the ArgumentHelper.
- */
-class C10_EXPORT ArgumentHelper {
- public:
- template <typename Def>
- static bool HasArgument(const Def& def, c10::string_view name) {
- return ArgumentHelper(def).HasArgument(name);
- }
-
- template <typename Def, typename T>
- static T GetSingleArgument(
- const Def& def,
- c10::string_view name,
- const T& default_value) {
- return ArgumentHelper(def).GetSingleArgument<T>(name, default_value);
- }
-
- template <typename Def, typename T>
- static bool HasSingleArgumentOfType(const Def& def, c10::string_view name) {
- return ArgumentHelper(def).HasSingleArgumentOfType<T>(name);
- }
-
- template <typename Def, typename T>
- static std::vector<T> GetRepeatedArgument(
- const Def& def,
- c10::string_view name,
- const std::vector<T>& default_value = std::vector<T>()) {
- return ArgumentHelper(def).GetRepeatedArgument<T>(name, default_value);
- }
-
- template <typename Def, typename MessageType>
- static MessageType GetMessageArgument(const Def& def, c10::string_view name) {
- return ArgumentHelper(def).GetMessageArgument<MessageType>(name);
- }
-
- template <typename Def, typename MessageType>
- static std::vector<MessageType> GetRepeatedMessageArgument(
- const Def& def,
- c10::string_view name) {
- return ArgumentHelper(def).GetRepeatedMessageArgument<MessageType>(name);
- }
-
- template <typename Def>
- static bool RemoveArgument(Def& def, int index) {
- if (index >= def.arg_size()) {
- return false;
- }
- if (index < def.arg_size() - 1) {
- def.mutable_arg()->SwapElements(index, def.arg_size() - 1);
- }
- def.mutable_arg()->RemoveLast();
- return true;
- }
-
- explicit ArgumentHelper(const OperatorDef& def);
- explicit ArgumentHelper(const NetDef& netdef);
- bool HasArgument(c10::string_view name) const;
-
- template <typename T>
- T GetSingleArgument(c10::string_view name, const T& default_value) const;
- template <typename T>
- bool HasSingleArgumentOfType(c10::string_view name) const;
- template <typename T>
- std::vector<T> GetRepeatedArgument(
- c10::string_view name,
- const std::vector<T>& default_value = std::vector<T>()) const;
-
- template <typename MessageType>
- MessageType GetMessageArgument(c10::string_view name) const {
- auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
- CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
- MessageType message;
- if (it->second.has_s()) {
- CAFFE_ENFORCE(
- message.ParseFromString(it->second.s()),
- "Failed to parse content from the string");
- } else {
- VLOG(1) << "Return empty message for parameter " << name;
- }
- return message;
- }
-
- template <typename MessageType>
- std::vector<MessageType> GetRepeatedMessageArgument(c10::string_view name) const {
- auto it = CAFFE2_ARG_MAP_FIND(arg_map_, name);
- CAFFE_ENFORCE(it != arg_map_.end(), "Cannot find parameter named ", name);
- std::vector<MessageType> messages(it->second.strings_size());
- for (int i = 0; i < messages.size(); ++i) {
- CAFFE_ENFORCE(
- messages[i].ParseFromString(it->second.strings(i)),
- "Failed to parse content from the string");
- }
- return messages;
- }
-
- private:
- std::map<string, Argument
-#ifdef CAFFE2_ENABLE_REDUCED_STRINGS_IN_ARGUMENT_LOOKUP
- , std::less<>
-#endif
- > arg_map_;
-};
-
-// **** Arguments Utils *****
-
-// Helper methods to get an argument from OperatorDef or NetDef given argument
-// name. Throws if argument does not exist.
-TORCH_API const Argument& GetArgument(const OperatorDef& def, c10::string_view name);
-TORCH_API const Argument& GetArgument(const NetDef& def, c10::string_view name);
-// Helper methods to get an argument from OperatorDef or NetDef given argument
-// name. Returns nullptr if argument does not exist.
-TORCH_API const Argument* GetArgumentPtr(const OperatorDef& def, c10::string_view name);
-TORCH_API const Argument* GetArgumentPtr(const NetDef& def, c10::string_view name);
-
-// Helper methods to query a boolean argument flag from OperatorDef or NetDef
-// given argument name. If argument does not exist, return default value.
-// Throws if argument exists but the type is not boolean.
-TORCH_API bool GetFlagArgument(
- const OperatorDef& def,
- c10::string_view name,
- bool default_value = false);
-TORCH_API bool GetFlagArgument(
- const NetDef& def,
- c10::string_view name,
- bool default_value = false);
-
-TORCH_API Argument* GetMutableArgument(
- const string& name,
- const bool create_if_missing,
- OperatorDef* def);
-TORCH_API Argument* GetMutableArgument(
- const string& name,
- const bool create_if_missing,
- NetDef* def);
-
-template <typename T>
-TORCH_API Argument MakeArgument(const string& name, const T& value);
-
-template <typename T, typename Def>
-inline void AddArgument(const string& name, const T& value, Def* def) {
- GetMutableArgument(name, true, def)->CopyFrom(MakeArgument(name, value));
-}
-// **** End Arguments Utils *****
-
-bool inline operator==(const DeviceOption& dl, const DeviceOption& dr) {
- return IsSameDevice(dl, dr);
-}
-
-// Given a net, modify the external inputs/outputs if necessary so that
-// the following conditions are met
-// - No duplicate external inputs
-// - No duplicate external outputs
-// - Going through list of ops in order, all op inputs must be outputs
-// from other ops, or registered as external inputs.
-// - All external outputs must be outputs of some operators.
-TORCH_API void cleanupExternalInputsAndOutputs(NetDef* net);
-
-} // namespace caffe2
-
-namespace std {
-template <>
-struct hash<caffe2::DeviceOption> {
- typedef caffe2::DeviceOption argument_type;
- typedef std::size_t result_type;
- result_type operator()(argument_type const& device_option) const {
- std::string serialized;
- CAFFE_ENFORCE(device_option.SerializeToString(&serialized));
- return std::hash<std::string>{}(serialized);
- }
-};
-} // namespace std
-
-#endif // CAFFE2_UTILS_PROTO_UTILS_H_
diff --git a/caffe2/utils/proto_utils_test.cc b/caffe2/utils/proto_utils_test.cc
deleted file mode 100644
index 1a68769..0000000
--- a/caffe2/utils/proto_utils_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "caffe2/core/test_utils.h"
-#include "caffe2/utils/proto_utils.h"
-
-namespace caffe2 {
-
-TEST(ProtoUtilsTest, IsSameDevice) {
- DeviceOption a;
- DeviceOption b;
- EXPECT_TRUE(IsSameDevice(a, b));
- a.set_node_name("my_node");
- EXPECT_FALSE(IsSameDevice(a, b));
- b.set_node_name("my_node");
- EXPECT_TRUE(IsSameDevice(a, b));
- b.set_device_id(2);
- EXPECT_FALSE(IsSameDevice(a, b));
- a.set_device_id(2);
- EXPECT_TRUE(IsSameDevice(a, b));
- a.set_device_type(DeviceTypeProto::PROTO_CUDA);
- b.set_device_type(DeviceTypeProto::PROTO_CPU);
- EXPECT_FALSE(IsSameDevice(a, b));
-}
-
-TEST(ProtoUtilsTest, SimpleReadWrite) {
- string content("The quick brown fox jumps over the lazy dog.");
- string name = std::tmpnam(nullptr);
- EXPECT_TRUE(WriteStringToFile(content, name.c_str()));
- string read_back;
- EXPECT_TRUE(ReadStringFromFile(name.c_str(), &read_back));
- EXPECT_EQ(content, read_back);
-}
-
-TEST(ProtoUtilsTest, CleanupExternalInputsAndOutputs) {
- caffe2::NetDef net;
- caffe2::testing::NetMutator(&net)
- .newOp("op1", {"X1", "X2"}, {"Y"})
- .newOp("op2", {"W", "Y"}, {"Z1", "Z2"})
- .newOp("op3", {"Z2", "W"}, {"O"})
- .externalInputs({"X1", "X3", "X1", "W"})
- .externalOutputs({"O", "Z2", "Z3", "O", "X3"});
- cleanupExternalInputsAndOutputs(&net);
-
- std::vector<std::string> externalInputs;
- for (const auto& inputName : net.external_input()) {
- externalInputs.emplace_back(inputName);
- }
- // The 2nd X1 is removed because of duplication.
- // X2 is added because it should be a missing external input.
- std::vector<std::string> expectedExternalInputs{"X1", "X3", "W", "X2"};
- EXPECT_EQ(externalInputs, expectedExternalInputs);
-
- std::vector<std::string> externalOutputs;
- for (const auto& outputName : net.external_output()) {
- externalOutputs.emplace_back(outputName);
- }
- // Z3 is removed because it's not an output of any operator in the net.
- // The 2nd O is removed because of duplication.
- std::vector<std::string> expectedexternalOutputs{"O", "Z2", "X3"};
- EXPECT_EQ(externalOutputs, expectedexternalOutputs);
-}
-
-} // namespace caffe2
diff --git a/caffe2/utils/signal_handler.h b/caffe2/utils/signal_handler.h
deleted file mode 100644
index 14d93a0..0000000
--- a/caffe2/utils/signal_handler.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#pragma once
-
-#include <c10/util/signal_handler.h>
-
-namespace caffe2 {
-
-#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
-class TORCH_API C2FatalSignalHandler : public c10::FatalSignalHandler {
- public:
- void fatalSignalHandlerPostProcess() override;
- static C2FatalSignalHandler& getInstance();
-
- private:
- explicit C2FatalSignalHandler();
-};
-
-// This works by setting up certain fatal signal handlers. Previous fatal
-// signal handlers will still be called when the signal is raised. Defaults
-// to being off.
-TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
-TORCH_API bool printStackTracesOnFatalSignal();
-#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLER)
-
-} // namespace caffe2
diff --git a/caffe2/utils/simple_queue.h b/caffe2/utils/simple_queue.h
deleted file mode 100644
index c16f552..0000000
--- a/caffe2/utils/simple_queue.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef CAFFE2_UTILS_SIMPLE_QUEUE_H_
-#define CAFFE2_UTILS_SIMPLE_QUEUE_H_
-
-#include <condition_variable> // NOLINT
-#include <mutex> // NOLINT
-#include <queue>
-
-#include <c10/util/Logging.h>
-
-namespace caffe2 {
-
-// This is a very simple queue that Yangqing wrote when bottlefeeding the baby,
-// so don't take it seriously. What it does is a minimal thread-safe queue that
-// allows me to run network as a DAG.
-//
-// A usual work pattern looks like this: one or multiple producers push jobs
-// into this queue, and one or multiple workers pops jobs from this queue. If
-// nothing is in the queue but NoMoreJobs() is not called yet, the pop calls
-// will wait. If NoMoreJobs() has been called, pop calls will return false,
-// which serves as a message to the workers that they should exit.
-template <typename T>
-class SimpleQueue {
- public:
- SimpleQueue() : no_more_jobs_(false) {}
-
- // Pops a value and writes it to the value pointer. If there is nothing in the
- // queue, this will wait till a value is inserted to the queue. If there are
- // no more jobs to pop, the function returns false. Otherwise, it returns
- // true.
- bool Pop(T* value) {
- std::unique_lock<std::mutex> mutex_lock(mutex_);
- while (queue_.size() == 0 && !no_more_jobs_) cv_.wait(mutex_lock);
- if (queue_.size() == 0 && no_more_jobs_) return false;
- *value = queue_.front();
- queue_.pop();
- return true;
- }
-
- int size() {
- std::unique_lock<std::mutex> mutex_lock(mutex_);
- return queue_.size();
- }
-
- // Push pushes a value to the queue.
- void Push(const T& value) {
- {
- std::lock_guard<std::mutex> mutex_lock(mutex_);
- CAFFE_ENFORCE(!no_more_jobs_, "Cannot push to a closed queue.");
- queue_.push(value);
- }
- cv_.notify_one();
- }
-
- // NoMoreJobs() marks the close of this queue. It also notifies all waiting
- // Pop() calls so that they either check out remaining jobs, or return false.
- // After NoMoreJobs() is called, this queue is considered closed - no more
- // Push() functions are allowed, and once existing items are all checked out
- // by the Pop() functions, any more Pop() function will immediately return
- // false with nothing set to the value.
- void NoMoreJobs() {
- {
- std::lock_guard<std::mutex> mutex_lock(mutex_);
- no_more_jobs_ = true;
- }
- cv_.notify_all();
- }
-
- private:
- std::mutex mutex_;
- std::condition_variable cv_;
- std::queue<T> queue_;
- bool no_more_jobs_{};
- // We do not allow copy constructors.
- SimpleQueue(const SimpleQueue& /*src*/) {}
-};
-
-} // namespace caffe2
-
-#endif // CAFFE2_UTILS_SIMPLE_QUEUE_H_
diff --git a/caffe2/utils/simple_queue_test.cc b/caffe2/utils/simple_queue_test.cc
deleted file mode 100644
index e59f699..0000000
--- a/caffe2/utils/simple_queue_test.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-#include <thread> // NOLINT
-
-#include "caffe2/utils/simple_queue.h"
-#include <gtest/gtest.h>
-
-namespace caffe2 {
-
-static std::unique_ptr<SimpleQueue<int> > gQueue;
-
-static void ConsumerFunction(int thread_idx) {
- // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
- int value;
- while (true) {
- if (!gQueue->Pop(&value)) return;
- VLOG(1) << "Emitting " << value << " from thread " << thread_idx;
- }
-}
-
-static void ProducerFunction(int thread_idx, int start, int count) {
- for (int i = 0; i < count; ++i) {
- VLOG(1) << "Pushing " << i + start << " from thread " << thread_idx;
- gQueue->Push(i + start);
- }
-}
-
-
-TEST(SimpleQueueTest, SingleProducerSingleConsumer) {
- // NOLINTNEXTLINE(modernize-make-unique)
- gQueue.reset(new SimpleQueue<int>());
- std::thread consumer(ConsumerFunction, 0);
- for (int i = 0; i < 10; ++i) {
- gQueue->Push(i);
- }
- gQueue->NoMoreJobs();
- consumer.join();
-}
-
-TEST(SimpleQueueTest, SingleProducerDoubleConsumer) {
- // NOLINTNEXTLINE(modernize-make-unique)
- gQueue.reset(new SimpleQueue<int>());
- std::thread consumer0(ConsumerFunction, 0);
- std::thread consumer1(ConsumerFunction, 1);
- for (int i = 0; i < 10; ++i) {
- gQueue->Push(i);
- }
- gQueue->NoMoreJobs();
- consumer0.join();
- consumer1.join();
-}
-
-
-TEST(SimpleQueueTest, DoubleProducerDoubleConsumer) {
- // NOLINTNEXTLINE(modernize-make-unique)
- gQueue.reset(new SimpleQueue<int>());
- std::thread producer0(ProducerFunction, 0, 0, 10);
- std::thread producer1(ProducerFunction, 0, 10, 10);
- std::thread consumer0(ConsumerFunction, 2);
- std::thread consumer1(ConsumerFunction, 3);
- producer0.join();
- producer1.join();
- gQueue->NoMoreJobs();
- consumer0.join();
- consumer1.join();
-}
-
-TEST(SimpleQueueDeathTest, CannotAddAfterQueueFinished) {
- // NOLINTNEXTLINE(modernize-make-unique)
- gQueue.reset(new SimpleQueue<int>());
- gQueue->Push(0);
- gQueue->NoMoreJobs();
- // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto)
- ASSERT_THROW(gQueue->Push(0), EnforceNotMet);
-}
-
-
-} // namespace caffe2
diff --git a/caffe2/utils/smart_tensor_printer.h b/caffe2/utils/smart_tensor_printer.h
deleted file mode 100644
index e6d96ef..0000000
--- a/caffe2/utils/smart_tensor_printer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#pragma once
-
-#include "caffe2/core/tensor.h"
-
-namespace caffe2 {
-
-// This is a wrapper around the TensorPrinter that doesn't require the user to
-// explicit specify the type of the tensor while calling the Print() method.
-// It also supports a convenience function with a default constructed printer as
-// a static method.
-class TORCH_API SmartTensorPrinter {
- public:
- // The proliferation of constructors is to give the feature parity with
- // TensorPrinter
- // yet not repeat the default arguments explicitly in case they change in the
- // future.
- SmartTensorPrinter() = default;
-
- explicit SmartTensorPrinter(const std::string& tensor_name);
-
- SmartTensorPrinter(
- const std::string& tensor_name,
- const std::string& file_name);
-
- SmartTensorPrinter(
- const std::string& tensor_name,
- const std::string& file_name,
- int limit);
-
- void Print(const Tensor& tensor);
-
- void PrintMeta(const Tensor& tensor) {
- tensorPrinter_.PrintMeta(tensor);
- }
-
- // Uses a default constructed SmartTensorPrinter
- static void PrintTensor(const Tensor& tensor);
-
- // Uses a default constructed SmartTensorPrinter
- void PrintTensorMeta(const Tensor& tensor) {
- DefaultTensorPrinter().PrintMeta(tensor);
- }
-
- private:
- // Returns a thread local default constructed TensorPrinter
- static SmartTensorPrinter& DefaultTensorPrinter();
-
- TensorPrinter tensorPrinter_;
-};
-}
diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc
deleted file mode 100644
index a455730..0000000
--- a/caffe2/utils/smart_tensor_printer_test.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-#include "caffe2/utils/smart_tensor_printer.h"
-
-#include "caffe2/core/common.h"
-
-#include <gtest/gtest.h>
-
-namespace caffe2 {
-
-template <typename T>
-std::string my_to_string(const T& value) {
- return to_string(value);
-}
-
-template <>
-std::string my_to_string<std::string>(const std::string& value) {
- return value;
-}
-
-template <typename T>
-void expect_stderr_contains(const std::vector<T>& values) {
- std::string captured_stderr = testing::internal::GetCapturedStderr();
- for (const auto& value : values) {
- std::string stringValue = my_to_string(value);
- EXPECT_TRUE(captured_stderr.find(stringValue) != std::string::npos);
- }
-}
-
-template <typename T>
-void printTensorAndCheck(const std::vector<T>& values) {
- testing::internal::CaptureStderr();
-
- Tensor tensor =
- TensorCPUFromValues<T>({static_cast<int64_t>(values.size())}, values);
-
- SmartTensorPrinter::PrintTensor(tensor);
- expect_stderr_contains(values);
-}
-
-// We need real glog for this test to pass
-#ifdef CAFFE2_USE_GOOGLE_GLOG
-
-#if !(__APPLE__) // TODO(janusz): thread_local does not work under mac.
-
-TEST(SmartTensorPrinterTest, SimpleTest) {
- printTensorAndCheck(std::vector<int>{1, 2, 3, 4, 5});
- printTensorAndCheck(std::vector<std::string>{"bob", "alice", "facebook"});
-}
-
-#endif // !(__APPLE__)
-
-#endif // CAFFE2_USE_GOOGLE_GLOG
-
-} // namespace caffe2
diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h
deleted file mode 100644
index 05bc22a..0000000
--- a/caffe2/utils/zmq_helper.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef CAFFE2_UTILS_ZMQ_HELPER_H_
-#define CAFFE2_UTILS_ZMQ_HELPER_H_
-
-#include <zmq.h>
-
-#include "caffe2/core/logging.h"
-
-namespace caffe2 {
-
-class ZmqContext {
- public:
- explicit ZmqContext(int io_threads) : ptr_(zmq_ctx_new()) {
- CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq context.");
- int rc = zmq_ctx_set(ptr_, ZMQ_IO_THREADS, io_threads);
- CAFFE_ENFORCE_EQ(rc, 0);
- rc = zmq_ctx_set(ptr_, ZMQ_MAX_SOCKETS, ZMQ_MAX_SOCKETS_DFLT);
- CAFFE_ENFORCE_EQ(rc, 0);
- }
- ~ZmqContext() {
- int rc = zmq_ctx_destroy(ptr_);
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- void* ptr() { return ptr_; }
-
- private:
- void* ptr_;
-
- C10_DISABLE_COPY_AND_ASSIGN(ZmqContext);
-};
-
-class ZmqMessage {
- public:
- ZmqMessage() {
- int rc = zmq_msg_init(&msg_);
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- ~ZmqMessage() {
- int rc = zmq_msg_close(&msg_);
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- zmq_msg_t* msg() { return &msg_; }
-
- void* data() { return zmq_msg_data(&msg_); }
- size_t size() { return zmq_msg_size(&msg_); }
-
- private:
- zmq_msg_t msg_;
- C10_DISABLE_COPY_AND_ASSIGN(ZmqMessage);
-};
-
-class ZmqSocket {
- public:
- explicit ZmqSocket(int type)
- : context_(1), ptr_(zmq_socket(context_.ptr(), type)) {
- CAFFE_ENFORCE(ptr_ != nullptr, "Failed to create zmq socket.");
- }
-
- ~ZmqSocket() {
- int rc = zmq_close(ptr_);
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- void Bind(const string& addr) {
- int rc = zmq_bind(ptr_, addr.c_str());
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- void Unbind(const string& addr) {
- int rc = zmq_unbind(ptr_, addr.c_str());
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- void Connect(const string& addr) {
- int rc = zmq_connect(ptr_, addr.c_str());
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- void Disconnect(const string& addr) {
- int rc = zmq_disconnect(ptr_, addr.c_str());
- CAFFE_ENFORCE_EQ(rc, 0);
- }
-
- int Send(const string& msg, int flags) {
- int nbytes = zmq_send(ptr_, msg.c_str(), msg.size(), flags);
- if (nbytes) {
- return nbytes;
- } else if (zmq_errno() == EAGAIN) {
- return 0;
- } else {
- LOG(FATAL) << "Cannot send zmq message. Error number: "
- << zmq_errno();
- return 0;
- }
- }
-
- int SendTillSuccess(const string& msg, int flags) {
- CAFFE_ENFORCE(msg.size(), "You cannot send an empty message.");
- int nbytes = 0;
- do {
- nbytes = Send(msg, flags);
- } while (nbytes == 0);
- return nbytes;
- }
-
- int Recv(ZmqMessage* msg) {
- int nbytes = zmq_msg_recv(msg->msg(), ptr_, 0);
- if (nbytes >= 0) {
- return nbytes;
- } else if (zmq_errno() == EAGAIN || zmq_errno() == EINTR) {
- return 0;
- } else {
- LOG(FATAL) << "Cannot receive zmq message. Error number: "
- << zmq_errno();
- return 0;
- }
- }
-
- int RecvTillSuccess(ZmqMessage* msg) {
- int nbytes = 0;
- do {
- nbytes = Recv(msg);
- } while (nbytes == 0);
- return nbytes;
- }
-
- private:
- ZmqContext context_;
- void* ptr_;
-};
-
-} // namespace caffe2
-
-
-#endif // CAFFE2_UTILS_ZMQ_HELPER_H_