#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>

#include <c10/util/CallOnce.h>

#include <thread>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_nnpack_available_native.h>
#include <ATen/ops/_nnpack_spatial_convolution_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/zeros.h>
#endif

#if !AT_NNPACK_ENABLED()

namespace at::native {

at::Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const c10::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  throw std::runtime_error(
      "nnpack_spatial_convolution: ATen not compiled with NNPACK support");
}

bool _nnpack_available() {
  return false;
}

} // namespace at::native

#else

#include <nnpack.h>

#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
#include <c10/util/irange.h>

#include <cstdlib>
#include <cstring>
#include <sstream>

namespace at::native {

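// Initializes NNPACK exactly once per process via c10::call_once and caches
// whether nnp_initialize() succeeded; later calls simply return the cached
// result.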
static bool init_nnpack() {
  static c10::once_flag once_;
  static bool nnpack_successfully_initialized_ = false;

  c10::call_once(once_, []() {
    const nnp_status nnpack_status = nnp_initialize();
    nnpack_successfully_initialized_ = (nnp_status_success == nnpack_status);

    if (nnpack_status != nnp_status_success) {
      if (nnpack_status == nnp_status_out_of_memory) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Out of memory.";
      } else if (nnpack_status == nnp_status_unsupported_hardware) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unsupported hardware.";
      } else {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unknown error!";
      }
    }
  });

  return nnpack_successfully_initialized_;
}

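// Selects the pthreadpool NNPACK runs on. Mobile builds reuse the shared
// caffe2 threadpool; other builds lazily create a dedicated pool sized by the
// intra-op thread count (or by hardware concurrency when ATen is built
// without intra-op parallelism). A null pool makes NNPACK run
// single-threaded.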
static pthreadpool_t nnpack_threadpool() {
#ifdef C10_MOBILE
  return caffe2::pthreadpool_();
#else
  static pthreadpool_t nnpack_threadpool_ = nullptr;
  static bool called_nnpack_threadpool_ = false;

  if (!called_nnpack_threadpool_) {
    called_nnpack_threadpool_ = true;

#ifdef INTRA_OP_PARALLEL
    const uint32_t threads = at::get_num_threads();
#else
    const uint32_t threads = std::thread::hardware_concurrency();
#endif

    nnpack_threadpool_ = pthreadpool_create(threads);
    if (!nnpack_threadpool_) {
      LOG(WARNING) << "Failed to initialize pthreadpool! Running NNPACK in single-threaded mode.";
    }
  }

  return nnpack_threadpool_;
#endif
}

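// Reports whether NNPACK can actually be used on this machine: true only if
// the library initialized successfully.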
bool _nnpack_available() {
  return init_nnpack();
}

namespace {
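// Scratch buffer for NNPACK's internal workspace. NNPACK writes the size it
// needs into `size` during a query pass; `allocate()` then obtains a buffer
// with the 64-byte alignment NNPACK expects, and the buffer is reused across
// convolutions.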
struct Workspace {
  void* buffer = nullptr;
  size_t size = 0;

  void deallocate() {
    if (buffer) {
      // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
      std::free(buffer);
      buffer = nullptr;
    }
  }

  void allocate() {
    deallocate();

    // NNPack has alignment requirements
    constexpr size_t nnpack_memory_alignment_boundary = 64;

    // Won't work on Windows, but NNPACK doesn't support Windows either
    const auto res = posix_memalign(&buffer, nnpack_memory_alignment_boundary, size);
    if (res != 0) {
      // posix_memalign reports failure through its return value, not errno
      TORCH_CHECK(
          false, "posix_memalign failed: ", strerror(res), " (", res, ")");
    }
  }

  ~Workspace() {
    deallocate();
  }
};
} // namespace

// Make thread_local for safety in cases where we have multiple threads running
// Convs at once
static thread_local Workspace workspace;

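// Runs a 2D convolution through NNPACK. Expects float32 CPU tensors: input in
// N,C,H,W layout and weight in oC,iC,kH,kW. Strided or single-image cases are
// dispatched image-by-image to nnp_convolution_inference, while unit-stride
// batches use the batched nnp_convolution_output path.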
Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const c10::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  at::Tensor output = at::empty(
      conv_output_size(input.sizes(), weight.sizes(), padding, stride),
      input.options());

  // Our input Tensor must be in the form N,C,H,W
  if (input.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D input Tensor N,C,H,W");
  }
  // Our weight Tensor must be in the form oC,iC,kH,kW
  if (weight.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D weight Tensor oC,iC,kH,kW");
  }
  // Our output Tensor must be in the form N,oC,oH,oW
  if (output.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D output Tensor N,oC,oH,oW");
  }

  // Some basic shape checking, not comprehensive
  if (input.size(1) != weight.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of input channels in input Tensor ("
        << input.size(1) << ") and weight Tensor (" << weight.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (weight.size(0) != output.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of output channels in weight Tensor ("
        << weight.size(0) << ") and output Tensor (" << output.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (input.size(0) != output.size(0)) {
    std::stringstream err;
    err << "Mismatch between batch size in input Tensor (" << input.size(0)
        << ") and output Tensor (" << output.size(0)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }

  // All Tensors must be float Tensors
  if (input.device().type() != kCPU || input.scalar_type() != kFloat ||
      weight.device().type() != kCPU || weight.scalar_type() != kFloat ||
      output.device().type() != kCPU || output.scalar_type() != kFloat ||
      (bias.defined() && (bias.device().type() != kCPU || bias.scalar_type() != kFloat))) {
    throw std::runtime_error(
        "Mismatched Tensor types in NNPack convolutionOutput");
  }

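  // NNPACK describes spatial extents as {width, height} and padding as
  // top/right/bottom/left; padding[0] is applied symmetrically along H and
  // padding[1] along W, matching the (padH, padW) convention of the ATen op.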
  const auto algorithm = nnp_convolution_algorithm_auto;
  const size_t input_channels = input.size(1);
  const size_t output_channels = weight.size(0);
  const nnp_size input_size = {
      .width = static_cast<size_t>(input.size(3)),
      .height = static_cast<size_t>(input.size(2)),
  };
  const nnp_padding input_padding = {
      .top = static_cast<size_t>(padding[0]),
      .right = static_cast<size_t>(padding[1]),
      .bottom = static_cast<size_t>(padding[0]),
      .left = static_cast<size_t>(padding[1]),
  };
  const nnp_size kernel_size = {
      .width = static_cast<size_t>(weight.size(3)),
      .height = static_cast<size_t>(weight.size(2)),
  };
  const nnp_size output_size = {
      .width = static_cast<size_t>(output.size(3)),
      .height = static_cast<size_t>(output.size(2)),
  };
  const nnp_size output_subsample = {
      .width = static_cast<size_t>(stride[1]),
      .height = static_cast<size_t>(stride[0]),
  };

  const auto input_ = input.contiguous();
  const auto weight_ = weight.contiguous();
  // If we don't have a defined bias Tensor, we need to create one filled with zeroes
  const auto bias_ = bias.defined() ? bias.contiguous() : at::zeros({weight.size(0)}, input.options());

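  // Dispatches to the appropriate NNPACK entry point for this batch. Both
  // paths also report the workspace size NNPACK needs through &workspace.size,
  // which the sizing/retry logic below relies on.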
  const auto compute = [&](const size_t batch_size) -> nnp_status {
    if ((batch_size == 1) || (output_subsample.width != 1) || (output_subsample.height != 1)) {
      const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
      const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;

      for (const auto batch : c10::irange(batch_size)) {
        const nnp_status status = nnp_convolution_inference(
            algorithm,
            nnp_convolution_transform_strategy_compute,
            input_channels,
            output_channels,
            input_size,
            input_padding,
            kernel_size,
            output_subsample,
            input_.data_ptr<float>() + batch * input_size_per_batch,
            weight_.data_ptr<float>(),
            bias_.data_ptr<float>(),
            output.data_ptr<float>() + batch * output_size_per_batch,
            workspace.buffer,
            &workspace.size,
            nnp_activation_identity,
            nullptr,
            nnpack_threadpool(),
            nullptr);

        if (nnp_status_success != status) {
          return status;
        }
      }

      return nnp_status_success;
    } else {
      return nnp_convolution_output(
          algorithm,
          batch_size,
          input_channels,
          output_channels,
          input_size,
          input_padding,
          kernel_size,
          input_.data_ptr<float>(),
          weight_.data_ptr<float>(),
          bias_.data_ptr<float>(),
          output.data_ptr<float>(),
          workspace.buffer,
          &workspace.size,
          nnp_activation_identity,
          nullptr,
          nnpack_threadpool(),
          nullptr);
    }
  };

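  // Workspace protocol: the first compute() call runs before any buffer is
  // attached, so NNPACK only reports the required size; we then allocate and
  // rerun. If NNPACK later signals an insufficient buffer, the workspace is
  // reallocated and the call retried once.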
  const size_t batch_size = input.size(0);

  auto size_and_allocate_ws = [&]() {
    // Run a single pass to get the size of memory workspace buffer
    const auto status = compute(batch_size);
    if (status != nnp_status_success) {
      throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
    }
    workspace.allocate();
  };

  // If no workspace created yet, allocate it
  if (workspace.buffer == nullptr) {
    size_and_allocate_ws();
  }

  // Try to run with the newly created, or existing workspace
  auto status = compute(batch_size);

  if (status == nnp_status_insufficient_buffer) {
    // Need to reallocate the workspace
    workspace.deallocate();
    size_and_allocate_ws();

    // Try one more time
    status = compute(batch_size);
  }

  if (status != nnp_status_success) {
    throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
  }

  return output;
}

} // namespace at::native

#endif // AT_NNPACK_ENABLED