| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS |
| #include <ATen/core/Tensor.h> |
| #include <ATen/Context.h> |
| #include <ATen/Dispatch.h> |
| #include <ATen/Parallel.h> |
| #include <ATen/TensorIterator.h> |
| #include <ATen/cpu/vec/vec.h> |
| #include <ATen/native/UpSample.h> |
| #include <ATen/native/cpu/utils.h> |
| #include <c10/util/irange.h> |
| |
| #ifndef AT_PER_OPERATOR_HEADERS |
| #include <ATen/Functions.h> |
| #else |
| #include <ATen/ops/empty.h> |
| #include <ATen/ops/empty_native.h> |
| #include <ATen/ops/ones.h> |
| #endif |
| |
| namespace at { |
| namespace native { |
| namespace { |
| |
| using scale_t = std::vector<c10::optional<double>>; |
| |
| // Helper structs and methods for cpu_upsample_generic |
| // |
| // The interpolation methods used below are separable, so we can compute the interpolation |
| // independently per dimension in a recursive way. Please refer to #10482 for more context. |
| // |
| // Linear interpolation structure to compute the output value in the n-dimensional case. |
| // - recursively computes the interpolated output for each dimension |
| // - we rely a lot on the compiler's code optimization so that the implemented operations |
| // can be automatically factorized and vectorized using SSE and AVX2 |
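| // |
| // For illustration (not part of the dispatch logic): in the 2d bilinear case (n = 2, |
| // interp_size = 2) the recursion below effectively unrolls to |
| //   out = wH0 * (wW0 * src[iH0 + iW0] + wW1 * src[iH0 + iW1]) |
| //       + wH1 * (wW0 * src[iH1 + iW0] + wW1 * src[iH1 + iW1]) |
| // where iH*/iW* are per-dimension byte offsets and wH*/wW* the per-dimension weights |
| // read from the `data`/`strides` arrays (illustrative names). |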
| template <int n, typename scalar_t, typename index_t, int interp_size> |
| struct Interpolate { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t ids = *(index_t*)&data[0][i * strides[0]]; |
| scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; |
| scalar_t t = Interpolate<n - 1, scalar_t, index_t, interp_size>::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); |
| scalar_t output = t * wts; |
| for (const auto j : c10::irange(1, interp_size)) { |
| ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; |
| wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; |
| t = Interpolate<n - 1, scalar_t, index_t, interp_size>::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); |
| output += t * wts; |
| } |
| return output; |
| } |
| }; |
| |
| template <typename scalar_t, typename index_t, int interp_size> |
| struct Interpolate<1, scalar_t, index_t, interp_size> { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t ids = *(index_t*)&data[0][i * strides[0]]; |
| scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; |
| scalar_t t = *(scalar_t *)&src[ids]; |
| scalar_t output = t * wts; |
| for (const auto j : c10::irange(1, interp_size)) { |
| ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; |
| wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; |
| t = *(scalar_t *)&src[ids]; |
| output += t * wts; |
| } |
| return output; |
| } |
| }; |
| |
| template <int n, typename scalar_t, typename index_t> |
| struct Interpolate<n, scalar_t, index_t, 1> { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t ids = *(index_t*)&data[0][i * strides[0]]; |
| return Interpolate<n - 1, scalar_t, index_t, 1>::eval(src + ids, &data[2], &strides[2], i); |
| } |
| }; |
| |
| template <typename scalar_t, typename index_t> |
| struct Interpolate<1, scalar_t, index_t, 1> { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t ids = *(index_t*)&data[0][i * strides[0]]; |
| return *(scalar_t *)&src[ids]; |
| } |
| }; |
| |
| // There is an unexpected 2x slowdown for upsample_trilinear3d channels_first |
| // for both 1 and 6 threads. We have to specialize this case as below. |
| // Once the issue is fixed we can keep the generic implementation and remove: |
| // struct Interpolate<n, scalar_t, index_t, 2> and |
| // struct Interpolate<1, scalar_t, index_t, 2> |
| template <int n, typename scalar_t, typename index_t> |
| struct Interpolate<n, scalar_t, index_t, 2> { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t i0 = *(index_t*)&data[0][i * strides[0]]; |
| index_t i1 = *(index_t*)&data[2][i * strides[2]]; |
| scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; |
| scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; |
| |
| scalar_t t0 = Interpolate<n - 1, scalar_t, index_t, 2>::eval(src + i0, &data[4], &strides[4], i); |
| scalar_t t1 = Interpolate<n - 1, scalar_t, index_t, 2>::eval(src + i1, &data[4], &strides[4], i); |
| |
| return t0 * w0 + t1 * w1; |
| } |
| }; |
| |
| template <typename scalar_t, typename index_t> |
| struct Interpolate<1, scalar_t, index_t, 2> { |
| static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { |
| index_t i0 = *(index_t*)&data[0][i * strides[0]]; |
| index_t i1 = *(index_t*)&data[2][i * strides[2]]; |
| scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; |
| scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; |
| scalar_t t0 = *(scalar_t *)&src[i0]; |
| scalar_t t1 = *(scalar_t *)&src[i1]; |
| return t0 * w0 + t1 * w1; |
| } |
| }; |
| |
| template <int n, typename scalar_t, typename index_t, int interp_size> |
| static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { |
| return Interpolate<n, scalar_t, index_t, interp_size>::eval(src, data, strides, i); |
| } |
| |
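| // For the anti-aliasing helpers below, `data` is expected to follow the per-dimension layout |
| // produced by HelperInterpBase::_compute_indices_weights_aa further down, i.e.: |
| //   data[0]: xmin, in bytes (already multiplied by the input stride) |
| //   data[1]: number of taps (xsize) |
| //   data[2]: input stride, in bytes, for the interpolated dimension |
| //   data[3]: normalized filter weights |
| //   data[4]: byte offset of the first weight for the current output index |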
| template <typename scalar_t, typename index_t> |
| static inline scalar_t interpolate_aa_single_dim_zero_strides( |
| char* src, |
| char** data, |
| const index_t ids_stride) { |
| const index_t ids_min = *(index_t*)&data[0][0]; |
| const index_t ids_size = *(index_t*)&data[1][0]; |
| |
| char* src_min = src + ids_min; |
| |
| scalar_t t = *(scalar_t*)&src_min[0]; |
| index_t wts_idx = *(index_t*)&data[4][0]; |
| scalar_t* wts_ptr = (scalar_t*)&data[3][wts_idx]; |
| scalar_t wts = wts_ptr[0]; |
| |
| scalar_t output = t * wts; |
| for (const auto j : c10::irange(1, ids_size)) { |
| wts = wts_ptr[j]; |
| t = *(scalar_t*)&src_min[j * ids_stride]; |
| output += t * wts; |
| } |
| return output; |
| } |
| |
| template <typename scalar_t, typename index_t> |
| static inline scalar_t interpolate_aa_single_dim( |
| char* src, |
| char** data, |
| const int64_t* strides, |
| int64_t i, |
| const index_t ids_stride) { |
| index_t ids_min = *(index_t*)&data[0][i * strides[0]]; |
| index_t ids_size = *(index_t*)&data[1][i * strides[1]]; |
| |
| char* src_min = src + ids_min; |
| |
| scalar_t t = *(scalar_t*)&src_min[0]; |
| index_t wts_idx = *(index_t*)&data[4][i * strides[4]]; |
| scalar_t* wts_ptr = (scalar_t*)&data[3][wts_idx]; |
| scalar_t wts = wts_ptr[0]; |
| |
| scalar_t output = t * wts; |
| for (const auto j : c10::irange(1, ids_size)) { |
| wts = wts_ptr[j]; |
| t = *(scalar_t*)&src_min[j * ids_stride]; |
| output += t * wts; |
| } |
| return output; |
| } |
| |
| template<int m> |
| static inline bool is_zero_stride(const int64_t* strides) { |
| bool output = strides[0] == 0; |
| for (const auto i : c10::irange(1, m)) { |
| output &= (strides[i] == 0); |
| } |
| return output; |
| } |
| |
| template <typename scalar_t, typename index_t, int interp_size> |
| static inline bool is_contiguous_stride(const int64_t* strides) { |
| bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t)); |
| for (int i = 2; i < 2 * interp_size; i += 2) { |
| output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t)); |
| } |
| return output; |
| } |
| |
| // Helper class to recursively check if all input strides corresponding to interpolated dimensions |
| // are zero except for a single dimension. |
| // |
| // Inputs: an array of strides of size N and non_zero_stride_dim, which can be -1, 0, 1, 2, ... |
| // For every dimension other than non_zero_stride_dim we check that all strides are zero; for the |
| // non_zero_stride_dim dimension, the 4 strides corresponding to index_0, weight_0, index_1 and |
| // weight_1 should be non-zero (contiguous). |
| // |
| // The base case of the recursion verifies whether the 4 strides for one interpolated dimension are |
| // either all zero (see is_zero_stride) or equal to |
| // (sizeof(index_t), sizeof(scalar_t), sizeof(index_t), sizeof(scalar_t)) (see is_contiguous_stride). |
| // |
| // In practice, we have the following cases: |
| // - for ND, float32, channels-first, the strides are |
| // dimN-1, dim1, dim0 |
| // i0, w0, i1, w1, ..., i0, w0, i1, w1, i0, w0, i1, w1 |
| // strides=(0, 0, 0, 0, ..., 0, 0, 0, 0, 4, 4, 4, 4) |
| // |
| // if the size of dim0 is 1 then its strides are 0 and the dim1 strides are equal to 4 |
| // |
| // - for ND, float32, channels-last, the strides are |
| // dimN-1, dimN-2, dim0 |
| // i0, w0, i1, w1, i0, w0, i1, w1, ..., i0, w0, i1, w1 |
| // strides=(0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0) |
| // |
| // Using these checks we can hint the compiler to factorize constant indices and weights |
| // in the cpu_upsample_generic method. |
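| // |
| // For example, the channels-first branch in cpu_upsample_generic below calls |
| //   check_almost_all_zero_stride<out_ndims, 1, scalar_t, int64_t, interp_size>(&strides[2]) |
| // i.e. only the innermost interpolated dimension may have contiguous (index, weight) strides |
| // while all outer ones must be zero; the channels-last branch uses non_zero_stride_dim == -1, |
| // i.e. all of them must be zero. |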
| template <int N, int non_zero_stride_dim, typename scalar_t, typename index_t, int interp_size> |
| struct CheckAlmostAllZeroStrides { |
| static inline bool eval(const int64_t* strides) { |
| // N is dim index: N -> dim0, N-1 -> dim1, ... |
| // non_zero_stride_dim should be out_dims - dim |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| bool output; |
| if (N == non_zero_stride_dim) { |
| output = is_contiguous_stride<scalar_t, index_t, interp_size>(strides); |
| } else { |
| output = is_zero_stride<2 * interp_size>(strides); |
| } |
| return output && |
| CheckAlmostAllZeroStrides<N - 1, non_zero_stride_dim, scalar_t, index_t, interp_size>::eval( |
| &strides[2 * interp_size]); |
| } |
| }; |
| |
| template <int non_zero_stride_dim, typename scalar_t, typename index_t, int interp_size> |
| struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, interp_size> { |
| static inline bool eval(const int64_t* /*strides*/) { |
| return true; |
| } |
| }; |
| |
| template <int n, int s, typename scalar_t, typename index_t, int interp_size> |
| static inline bool check_almost_all_zero_stride(const int64_t* strides) { |
| return CheckAlmostAllZeroStrides<n, s, scalar_t, index_t, interp_size>::eval(strides); |
| } |
| |
| // Helper method to compute interpolation for nearest, linear, cubic modes |
| template <typename scalar_t, typename index_t, int out_ndims, int interp_size> |
| static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { |
| char* dst = data[0]; |
| char* src = data[1]; |
| for (const auto i : c10::irange(n)) { |
| *(scalar_t*)&dst[i * strides[0]] = interpolate<out_ndims, scalar_t, index_t, interp_size>( |
| src + i * strides[1], &data[2], &strides[2], i); |
| } |
| } |
| |
| template <typename scalar_t, typename index_t> |
| static inline void basic_loop_aa_single_dim_zero_strides( |
| char** data, |
| const int64_t* strides, |
| int64_t n) { |
| char* dst = data[0]; |
| char* src = data[1]; |
| // index stride is constant for the given dimension |
| const index_t ids_stride = *(index_t*)&data[2 + 2][0]; |
| |
| for (const auto i : c10::irange(n)) { |
| *(scalar_t*)&dst[i * strides[0]] = |
| interpolate_aa_single_dim_zero_strides<scalar_t, index_t>( |
| src + i * strides[1], &data[2], ids_stride); |
| } |
| } |
| |
| template <typename scalar_t, typename index_t> |
| static inline void basic_loop_aa_single_dim_nonzero_strides( |
| char** data, |
| const int64_t* strides, |
| int64_t n) { |
| char* dst = data[0]; |
| char* src = data[1]; |
| // index stride is constant for the given dimension |
| const index_t ids_stride = *(index_t*)&data[2 + 2][0]; |
| |
| if (strides[1] == 0) { |
| for (const auto i : c10::irange(n)) { |
| *(scalar_t*)&dst[i * strides[0]] = |
| interpolate_aa_single_dim<scalar_t, index_t>( |
| src, &data[2], &strides[2], i, ids_stride); |
| } |
| } else { |
| for (const auto i : c10::irange(n)) { |
| *(scalar_t*)&dst[i * strides[0]] = |
| interpolate_aa_single_dim<scalar_t, index_t>( |
| src + i * strides[1], &data[2], &strides[2], i, ids_stride); |
| } |
| } |
| } |
| |
| // Generic upsampling computation method using TensorIterator for the Nd case. |
| // Supports: nearest, linear, cubic modes with the interp_size template argument: 1, 2, 4 |
| // |
| // Single loop function for the 1d, 2d and 3d cases and for all modes. |
| // For N dimensions, the output value for dimension Di can be computed as |
| // |
| // output_i[a] = interpolate(output_{i+1}[a], w_{i+1}[a], output_{i+1}[a+1], w_{i+1}[a+1], ...) |
| // with |
| // output_DN[a] = interpolate(input_DN[a], w_DN[a], input_DN[a+1], w_DN[a+1], ...) |
| // where i is the dimension index and a is a linear index over the spatial coordinates. |
| // |
| // The recursive call is implemented with the Interpolate struct, using templates for |
| // loop unrolling at compile time. |
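| // |
| // Inside the TensorIterator loop, data[0] is the output and data[1] the restrided input; the |
| // remaining operands are the (index, weight) tensors produced by the helper structs below, |
| // interp_size pairs per interpolated dimension, outermost spatial dimension first. The index |
| // values are byte offsets into the input (the input strides are already baked in). |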
| template <typename scalar_t, int out_ndims, int interp_size> |
| void cpu_upsample_generic(at::TensorIterator& iter) |
| { |
| auto loop = [&](char** data, const int64_t* strides, int64_t n) { |
| // special-cases to let the compiler apply compile-time input-specific optimizations |
| if ((strides[0] == sizeof(scalar_t) && (strides[1] == 0) && |
| // NOLINTNEXTLINE(bugprone-branch-clone) |
| check_almost_all_zero_stride<out_ndims, 1, scalar_t, int64_t, interp_size>(&strides[2]))) { |
| // contiguous channels-first case |
| basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n); |
| } else if ((strides[0] == sizeof(scalar_t) && (strides[1] == sizeof(scalar_t)) && |
| check_almost_all_zero_stride<out_ndims, -1, scalar_t, int64_t, interp_size>(&strides[2]))) { |
| // contiguous channels-last case |
| basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n); |
| } else { |
| // fallback |
| basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n); |
| } |
| }; |
| iter.for_each(loop); |
| } |
| |
| template <typename scalar_t, typename scale_type, nearest_idx_fn_t nearest_idx_fn> |
| void cpu_upsample_nearest_channels_last( |
| const Tensor& output_, |
| const Tensor& input_, |
| const scale_type& scales) { |
| TORCH_CHECK(input_.dtype() == output_.dtype(), "expected dtype ", input_.dtype(), |
| " for `output` but got dtype ", output_.dtype()); |
| |
| auto input_sizes = input_.sizes().vec(); |
| auto output_sizes = output_.sizes().vec(); |
| auto ndim = input_sizes.size(); |
| TORCH_CHECK(ndim >= 4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims."); |
| |
| auto channels_last_memory_format = ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; |
| auto input = input_.contiguous(channels_last_memory_format); |
| auto output = output_.contiguous(channels_last_memory_format); |
| |
| auto input_data = input.data_ptr<scalar_t>(); |
| auto output_data = output.data_ptr<scalar_t>(); |
| |
| int64_t num_batches = input_sizes[0]; |
| int64_t channels = input_sizes[1]; |
| int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; |
| int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; |
| int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; |
| int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; |
| int64_t input_width = input_sizes[ndim - 1]; |
| int64_t output_width = output_sizes[ndim - 1]; |
| int64_t numel = output.numel(); |
| |
| TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); |
| |
| using Vec = vec::Vectorized<scalar_t>; |
| auto copy = [](scalar_t* out, scalar_t* in, int64_t size) { |
| int64_t d = 0; |
| for (; d < size - (size % Vec::size()); d += Vec::size()) { |
| Vec out_vec = Vec::loadu(in + d); |
| out_vec.store(out + d); |
| } |
| for (; d < size; d++) { |
| out[d] = in[d]; |
| } |
| }; |
| |
| auto loop2d = [&](int64_t begin, int64_t end) { |
| int64_t n = 0; |
| int64_t oh = 0; |
| int64_t ow = 0; |
| data_index_init(begin, n, num_batches, oh, output_height, ow, output_width); |
| |
| for (const auto i : c10::irange(begin, end)) { |
| int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); |
| int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); |
| scalar_t* output_ptr = output_data + i * channels; |
| scalar_t* input_ptr = input_data + n * input_height * input_width * channels + |
| ih * input_width * channels + iw * channels; |
| copy(output_ptr, input_ptr, channels); |
| data_index_step(n, num_batches, oh, output_height, ow, output_width); |
| } |
| }; |
| |
| auto loop3d = [&](int64_t begin, int64_t end) { |
| int64_t n = 0; |
| int64_t od = 0; |
| int64_t oh = 0; |
| int64_t ow = 0; |
| data_index_init(begin, n, num_batches, od, output_depth, oh, output_height, ow, output_width); |
| |
| for (const auto i : c10::irange(begin, end)) { |
| int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); |
| int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); |
| int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); |
| scalar_t* output_ptr = output_data + i * channels; |
| scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels + |
| id * input_height * input_width * channels + |
| ih * input_width * channels + iw * channels; |
| copy(output_ptr, input_ptr, channels); |
| data_index_step(n, num_batches, od, output_depth, oh, output_height, ow, output_width); |
| } |
| }; |
| |
| if (ndim == 4) { |
| // upsample nearest 2d |
| at::parallel_for(0, numel / channels, at::internal::GRAIN_SIZE / channels, loop2d); |
| } else { |
| // upsample nearest 3d |
| TORCH_INTERNAL_ASSERT(ndim == 5); |
| at::parallel_for(0, numel / channels, at::internal::GRAIN_SIZE / channels, loop3d); |
| } |
| |
| if (!output_.is_contiguous(channels_last_memory_format)) { |
| output_.copy_(output); |
| } |
| } |
| |
| // Vectorized type alias used by the SIMD interpolate() helpers below. |
| template <typename scalar_t> |
| using VecType = vec::Vectorized<scalar_t>; |
| |
| template <typename scalar_t, typename accscalar_t> |
| inline VecType<scalar_t> interpolate(const scalar_t* t, accscalar_t w) { |
| return VecType<scalar_t>::loadu(t) * VecType<scalar_t>(w); |
| } |
| |
| template <typename scalar_t, typename accscalar_t, typename... Args> |
| inline VecType<scalar_t> interpolate(const scalar_t* t, accscalar_t w, Args... args) { |
| return VecType<scalar_t>::loadu(t) * VecType<scalar_t>(w) + interpolate(args...); |
| } |
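| // E.g. the bilinear channels-last inner loop below calls |
| //   interpolate(i00 + d, w00, i01 + d, w01, i10 + d, w10, i11 + d, w11) |
| // which unrolls to |
| //   Vec::loadu(i00 + d) * Vec(w00) + Vec::loadu(i01 + d) * Vec(w01) + |
| //   Vec::loadu(i10 + d) * Vec(w10) + Vec::loadu(i11 + d) * Vec(w11) |
| // (with Vec = vec::Vectorized<scalar_t>). |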
| |
| template <typename scalar_t, typename scale_type> |
| void cpu_upsample_linear_channels_last( |
| const Tensor& output_, |
| const Tensor& input_, |
| bool align_corners, |
| const scale_type& scales) { |
| TORCH_CHECK(input_.dtype() == output_.dtype(), "expected dtype ", input_.dtype(), |
| " for `output` but got dtype ", output_.dtype()); |
| |
| auto input_sizes = input_.sizes().vec(); |
| auto output_sizes = output_.sizes().vec(); |
| auto ndim = input_sizes.size(); |
| TORCH_CHECK(ndim >= 4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims."); |
| |
| auto channels_last_memory_format = ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; |
| auto input = input_.contiguous(channels_last_memory_format); |
| auto output = output_.contiguous(channels_last_memory_format); |
| |
| auto input_data = input.data_ptr<scalar_t>(); |
| auto output_data = output.data_ptr<scalar_t>(); |
| |
| int64_t num_batches = input_sizes[0]; |
| int64_t channels = input_sizes[1]; |
| int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; |
| int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; |
| int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; |
| int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; |
| int64_t input_width = input_sizes[ndim - 1]; |
| int64_t output_width = output_sizes[ndim - 1]; |
| |
| TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); |
| int64_t output_slice_size = output_depth * output_height * output_width * channels; |
| |
| using accscalar_t = at::acc_type<scalar_t, false>; |
| using Vec = vec::Vectorized<scalar_t>; |
| auto loop2d = [&](int64_t begin, int64_t end) { |
| const scalar_t height_scale = area_pixel_compute_scale<scalar_t>( |
| input_height, output_height, align_corners, scales[0]); |
| const scalar_t width_scale = area_pixel_compute_scale<scalar_t>( |
| input_width, output_width, align_corners, scales[1]); |
| |
| auto input_indexr = [=](int64_t n, int64_t h, int64_t w) { |
| return input_data + n * input_height * input_width * channels + |
| h * input_width * channels + w * channels; |
| }; |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int64_t ih0, ih1, iw0, iw1; |
| scalar_t h0lambda, h1lambda, w0lambda, w1lambda; |
| for (const auto n : c10::irange(begin, end)) { |
| for (const auto oh : c10::irange(output_height)) { |
| compute_source_index_and_lambda( |
| ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); |
| for (const auto ow : c10::irange(output_width)) { |
| compute_source_index_and_lambda( |
| iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); |
| |
| scalar_t* out = output_data + n * output_slice_size + |
| oh * output_width * channels + ow * channels; |
| scalar_t* i00 = input_indexr(n, ih0, iw0); |
| scalar_t* i01 = input_indexr(n, ih0, iw1); |
| scalar_t* i10 = input_indexr(n, ih1, iw0); |
| scalar_t* i11 = input_indexr(n, ih1, iw1); |
| accscalar_t w00 = h0lambda * w0lambda; |
| accscalar_t w01 = h0lambda * w1lambda; |
| accscalar_t w10 = h1lambda * w0lambda; |
| accscalar_t w11 = h1lambda * w1lambda; |
| |
| int64_t size = channels; |
| int64_t d = 0; |
| for (; d < size - (size % Vec::size()); d += Vec::size()) { |
| auto out_vec = interpolate(i00 + d, w00, i01 + d, w01, i10 + d, w10, i11 + d, w11); |
| out_vec.store(out + d); |
| } |
| for (; d < size; d++) { |
| out[d] = i00[d] * w00 + i01[d] * w01 + i10[d] * w10 + i11[d] * w11; |
| } |
| } |
| } |
| } |
| }; |
| |
| auto loop3d = [&](int64_t begin, int64_t end) { |
| const scalar_t depth_scale = area_pixel_compute_scale<scalar_t>( |
| input_depth, output_depth, align_corners, scales[0]); |
| const scalar_t height_scale = area_pixel_compute_scale<scalar_t>( |
| input_height, output_height, align_corners, scales[1]); |
| const scalar_t width_scale = area_pixel_compute_scale<scalar_t>( |
| input_width, output_width, align_corners, scales[2]); |
| |
| auto input_indexr = [=](int64_t n, int64_t d, int64_t h, int64_t w) { |
| return input_data + n * input_depth * input_height * input_width * channels + |
| d * input_height * input_width * channels + |
| h * input_width * channels + w * channels; |
| }; |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int64_t id0, id1, ih0, ih1, iw0, iw1; |
| scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda; |
| for (const auto n : c10::irange(begin, end)) { |
| for (const auto od : c10::irange(output_depth)) { |
| compute_source_index_and_lambda( |
| id0, id1, d0lambda, d1lambda, depth_scale, od, input_depth, output_depth, align_corners); |
| for (const auto oh : c10::irange(output_height)) { |
| compute_source_index_and_lambda( |
| ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); |
| for (const auto ow : c10::irange(output_width)) { |
| compute_source_index_and_lambda( |
| iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); |
| |
| scalar_t* out = output_data + n * output_slice_size + |
| od * output_height * output_width * channels + |
| oh * output_width * channels + ow * channels; |
| scalar_t* i000 = input_indexr(n, id0, ih0, iw0); |
| scalar_t* i001 = input_indexr(n, id0, ih0, iw1); |
| scalar_t* i010 = input_indexr(n, id0, ih1, iw0); |
| scalar_t* i011 = input_indexr(n, id0, ih1, iw1); |
| scalar_t* i100 = input_indexr(n, id1, ih0, iw0); |
| scalar_t* i101 = input_indexr(n, id1, ih0, iw1); |
| scalar_t* i110 = input_indexr(n, id1, ih1, iw0); |
| scalar_t* i111 = input_indexr(n, id1, ih1, iw1); |
| accscalar_t w000 = d0lambda * h0lambda * w0lambda; |
| accscalar_t w001 = d0lambda * h0lambda * w1lambda; |
| accscalar_t w010 = d0lambda * h1lambda * w0lambda; |
| accscalar_t w011 = d0lambda * h1lambda * w1lambda; |
| accscalar_t w100 = d1lambda * h0lambda * w0lambda; |
| accscalar_t w101 = d1lambda * h0lambda * w1lambda; |
| accscalar_t w110 = d1lambda * h1lambda * w0lambda; |
| accscalar_t w111 = d1lambda * h1lambda * w1lambda; |
| |
| int64_t size = channels; |
| int64_t d = 0; |
| for (; d < size - (size % Vec::size()); d += Vec::size()) { |
| auto out_vec = interpolate( |
| i000 + d, w000, i001 + d, w001, i010 + d, w010, i011 + d, w011, |
| i100 + d, w100, i101 + d, w101, i110 + d, w110, i111 + d, w111); |
| out_vec.store(out + d); |
| } |
| for (; d < size; d++) { |
| out[d] = |
| i000[d] * w000 + i001[d] * w001 + i010[d] * w010 + i011[d] * w011 + |
| i100[d] * w100 + i101[d] * w101 + i110[d] * w110 + i111[d] * w111; |
| } |
| } |
| } |
| } |
| } |
| }; |
| |
| if (ndim == 4) { |
| // upsample bilinear 2d |
| at::parallel_for(0, num_batches, at::internal::GRAIN_SIZE / output_slice_size / 4, loop2d); |
| } else { |
| // upsample trilinear 3d |
| TORCH_INTERNAL_ASSERT(ndim == 5); |
| at::parallel_for(0, num_batches, at::internal::GRAIN_SIZE / output_slice_size / 8, loop3d); |
| } |
| |
| if (!output_.is_contiguous(channels_last_memory_format)) { |
| output_.copy_(output); |
| } |
| } |
| |
| // Helper structs to use with upsample_generic_Nd_kernel_impl |
| struct HelperInterpBase { |
| |
| static inline void init_indices_weights( |
| at::ScalarType output_type, |
| std::vector<Tensor> & output, int64_t output_size, int64_t ndims, |
| int64_t reshape_dim, int interp_size |
| ) { |
| |
| auto new_shape = std::vector<int64_t>(ndims, 1); |
| new_shape[reshape_dim] = output_size; |
| |
| for (const auto j C10_UNUSED : c10::irange(interp_size)) { |
| output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| output.emplace_back(empty(new_shape, CPU(output_type))); |
| } |
| } |
| |
| template <typename scalar_t, typename aa_filter_fn_t> |
| static inline void _compute_weights_aa( |
| const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support, |
| scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn, |
| int64_t& xmin, int64_t& xsize |
| ) { |
| |
| scalar_t center = scale * (i + 0.5); |
| scalar_t total_w = 0.0; |
| scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; |
| xmin = std::max( |
| static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0)); |
| xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) - |
| xmin; |
| |
| int64_t j = 0; |
| for (; j < xsize; j++) { |
| scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale); |
| wt_ptr[j] = w; |
| total_w += w; |
| } |
| for (j = 0; j < xsize; j++) { |
| if (total_w != 0.0) { |
| wt_ptr[j] /= total_w; |
| } |
| } |
| for (; j < interp_size; j++) { |
| wt_ptr[j] = static_cast<scalar_t>(0.0); |
| } |
| } |
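| |
| // Worked example (illustrative): downsampling with the linear (triangle) filter f(x) = 1 - |x|, |
| // interp_size = 2, input_size = 8, output_size = 4, scale = 2.0, support = 2.0. |
| // For i = 0: center = 1.0, xmin = 0, xsize = 3, raw weights f(-0.25) = 0.75, f(0.25) = 0.75, |
| // f(0.75) = 0.25, normalized to 3/7, 3/7, 1/7, i.e. output[0] = (3 * in[0] + 3 * in[1] + in[2]) / 7. |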
| |
| template <typename scalar_t, typename aa_filter_fn_t> |
| static inline std::vector<Tensor> _compute_indices_weights_aa( |
| int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, |
| int64_t reshape_dim, scalar_t scale, |
| int interp_size, aa_filter_fn_t aa_filter_fn |
| ) { |
| |
| std::vector<Tensor> output; |
| |
| scalar_t support = |
| (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5; |
| interp_size = (int)ceilf(support) * 2 + 1; |
| |
| auto new_shape = std::vector<int64_t>(ndims, 1); |
| new_shape[reshape_dim] = output_size; |
| |
| // Bounds approach as in PIL: xmin/xmax |
| output.emplace_back( |
| empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| output.emplace_back( |
| empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| output.emplace_back( |
| empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| |
| { |
| // Weights |
| new_shape[reshape_dim] = output_size * interp_size; |
| auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>())); |
| auto strides = wts.strides().vec(); |
| strides[reshape_dim] = 0; |
| new_shape[reshape_dim] = output_size; |
| wts = wts.as_strided(new_shape, strides); |
| output.emplace_back(wts); |
| // Weights indices |
| output.emplace_back( |
| empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| } |
| |
| int64_t* idx_ptr_xmin = output[0].data_ptr<int64_t>(); |
| int64_t* idx_ptr_size = output[1].data_ptr<int64_t>(); |
| int64_t* idx_ptr_stride = output[2].data_ptr<int64_t>(); |
| scalar_t* wt_ptr = output[3].data_ptr<scalar_t>(); |
| int64_t* wt_idx_ptr = output[4].data_ptr<int64_t>(); |
| |
| int64_t xmin, xmax; |
| |
| for (const auto i : c10::irange(output_size)) { |
| HelperInterpBase::_compute_weights_aa( |
| i, |
| input_size, |
| scale, |
| support, |
| wt_ptr + i * interp_size, |
| interp_size, |
| aa_filter_fn, |
| xmin, |
| xmax); |
| |
| idx_ptr_xmin[i] = xmin * stride; |
| idx_ptr_size[i] = xmax; |
| idx_ptr_stride[i] = stride; |
| wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t); |
| } |
| return output; |
| } |
| |
| }; |
| |
| struct HelperInterpNearest : public HelperInterpBase { |
| // This structure implements an outdated and buggy method to compute indices |
| // for nearest neighbor interpolation. |
| // We keep it for backward compatibility and consider it deprecated; |
| // see HelperInterpNearestExact for the replacement. |
| |
| static const int interp_size = 1; |
| |
| static inline void init_indices_weights( |
| at::ScalarType output_type, |
| std::vector<Tensor> & output, int64_t output_size, int64_t ndims, |
| int64_t reshape_dim, int interp_size |
| ) { |
| auto new_shape = std::vector<int64_t>(ndims, 1); |
| new_shape[reshape_dim] = output_size; |
| |
| for (const auto j C10_UNUSED : c10::irange(interp_size)) { |
| output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>()))); |
| // Defines weights for consistency, but not used |
| output.emplace_back(at::ones(new_shape, CPU(output_type))); |
| } |
| } |
| |
| // Compute nearest mode indices and weights for each interpolated dimension |
| // indices_weights = { |
| // {indices_0, 1.0, }, // dim -n |
| // {indices_0, 1.0, }, // dim -(n-1) |
| // ... |
| // {indices_0, 1.0, }, // dim -1 |
| // } |
| // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to |
| // fit input/output tensors. |
| // The indices already contain the strides to optimize the computations |
| static inline std::vector<Tensor> compute_indices_weights( |
| at::ScalarType scalar_type, |
| int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, |
| int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale |
| ) { |
| |
| TORCH_INTERNAL_ASSERT(!align_corners); |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| std::vector<Tensor> output; |
| HelperInterpNearest::init_indices_weights( |
| scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); |
| |
| AT_DISPATCH_FLOATING_TYPES_AND( |
| ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] { |
| scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale); |
| |
| auto input_index_ptr = output[0].data_ptr<int64_t>(); |
| int64_t input_index; |
| |
| // Indices are computed as follows: |
| // scale = 1.0 * isize / osize |
| // index_f32 = (output_index) * scale |
| // input_index = floor(index_f32) |
| // Same as OpenCV INTER_NEAREST |
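| // E.g. with isize = 3, osize = 4: scale = 0.75 and the computed indices are |
| // 0, 0, 1, 2 (before the multiplication by the input stride). |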
| using accscalar_t = at::acc_type<scalar_t, false>; |
| for (const auto i : c10::irange(output_size)) { |
| const accscalar_t real_input_index = |
| area_pixel_compute_source_index<accscalar_t>( |
| scale, i, /*align_corners=*/true, /*cubic=*/false); |
| input_index = static_cast<int64_t>(floorf(real_input_index)); |
| input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride; |
| } |
| } |
| ); |
| return output; |
| } |
| |
| }; |
| |
| struct HelperInterpNearestExact : public HelperInterpNearest { |
| |
| // Compute nearest mode indices and weights for each interpolated dimension |
| // indices_weights = { |
| // {indices_0, 1.0, }, // dim -n |
| // {indices_0, 1.0, }, // dim -(n-1) |
| // ... |
| // {indices_0, 1.0, }, // dim -1 |
| // } |
| // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to |
| // fit input/output tensors. |
| // The indices already contain the strides to optimize the computations |
| static inline std::vector<Tensor> compute_indices_weights( |
| at::ScalarType scalar_type, |
| int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, |
| int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale |
| ) { |
| |
| TORCH_INTERNAL_ASSERT(!align_corners); |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| std::vector<Tensor> output; |
| HelperInterpNearest::init_indices_weights( |
| scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); |
| |
| AT_DISPATCH_FLOATING_TYPES( |
| scalar_type, "compute_indices_weights_nearest", [&] { |
| scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale); |
| |
| auto input_index_ptr = output[0].data_ptr<int64_t>(); |
| int64_t input_index; |
| |
| // Indices should be computed as follows: |
| // scale = 1.0 * isize / osize |
| // index_f32 = (output_index + 0.5) * scale - 0.5 |
| // input_index = round(index_f32) |
| // Same as Pillow and Scikit-Image/Scipy ndi.zoom |
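| // E.g. with isize = 3, osize = 4: scale = 0.75 and the computed indices are |
| // 0, 1, 1, 2, vs. 0, 0, 1, 2 for the legacy nearest mode above. |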
| using accscalar_t = at::acc_type<scalar_t, false>; |
| for (const auto i : c10::irange(output_size)) { |
| const accscalar_t real_input_index = |
| area_pixel_compute_source_index<accscalar_t>( |
| scale, i, /*align_corners=*/align_corners, /*cubic=*/false); |
| input_index = static_cast<int64_t>(floorf(real_input_index + 0.5)); |
| input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride; |
| } |
| } |
| ); |
| return output; |
| } |
| }; |
| |
| struct HelperInterpLinear : public HelperInterpBase { |
| |
| static const int interp_size = 2; |
| |
| // Compute indices and weights for each interpolated dimension |
| // indices_weights = { |
| // {indices_0, weights_0, indices_1, weights_1}, // dim -n |
| // {indices_0, weights_0, indices_1, weights_1}, // dim -(n-1) |
| // ... |
| // {indices_0, weights_0, indices_1, weights_1}, // dim -1 |
| // } |
| // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to |
| // fit input/output tensors. |
| // The indices already contain the strides to optimize the computations |
| static inline std::vector<Tensor> compute_indices_weights( |
| at::ScalarType scalar_type, |
| int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, |
| bool align_corners, const c10::optional<double> opt_scale |
| ) { |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| std::vector<Tensor> output; |
| HelperInterpLinear::init_indices_weights( |
| scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size); |
| AT_DISPATCH_FLOATING_TYPES_AND( |
| ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] { |
| scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale); |
| |
| auto input_index0_ptr = output[0].data_ptr<int64_t>(); |
| auto lambda0_ptr = output[1].data_ptr<scalar_t>(); |
| auto input_index1_ptr = output[2].data_ptr<int64_t>(); |
| auto lambda1_ptr = output[3].data_ptr<scalar_t>(); |
| |
| for (const auto i : c10::irange(output_size)) { |
| |
| compute_source_index_and_lambda<scalar_t>( |
| input_index0_ptr[i], input_index1_ptr[i], |
| lambda0_ptr[i], lambda1_ptr[i], |
| scale, i, input_size, output_size, align_corners |
| ); |
| // Bake the stride into the indices: |
| // index values correspond to input indices (0, 1, 2, 3, ...); once multiplied |
| // by the input stride, the maximum possible value is |
| // input_size[dim-1] * input_size[dim-2] * ... for the given dimension. |
| input_index0_ptr[i] *= stride; |
| input_index1_ptr[i] *= stride; |
| } |
| } |
| ); |
| return output; |
| } |
| |
| // taken from |
| // https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ |
| // src/libImaging/Resample.c#L20-L29 |
| template<typename scalar_t> |
| static inline scalar_t aa_filter(scalar_t x) { |
| if (x < 0.0) { |
| x = -x; |
| } |
| if (x < 1.0) { |
| return 1.0 - x; |
| } |
| return 0.0; |
| } |
| |
| static inline std::vector<Tensor> compute_indices_weights_aa( |
| at::ScalarType scalar_type, |
| int64_t input_size, |
| int64_t output_size, |
| int64_t stride, |
| int64_t ndims, |
| int64_t reshape_dim, |
| bool align_corners, |
| const c10::optional<double> opt_scale |
| ) { |
| |
| std::vector<Tensor> indices_weights; |
| AT_DISPATCH_FLOATING_TYPES( |
| scalar_type, "compute_indices_weights_aa", [&] { |
| |
| scalar_t scale = area_pixel_compute_scale<scalar_t>( |
| input_size, output_size, align_corners, opt_scale); |
| |
| auto interp_size = HelperInterpLinear::interp_size; |
| |
| indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>( |
| input_size, |
| output_size, |
| stride, |
| ndims, |
| reshape_dim, |
| scale, |
| interp_size, |
| &HelperInterpLinear::aa_filter<scalar_t>); |
| } |
| ); |
| return indices_weights; |
| } |
| }; |
| |
| struct HelperInterpCubic : public HelperInterpBase { |
| |
| static const int interp_size = 4; |
| |
| // Compute indices and weights for each interpolated dimension |
| // indices_weights = { |
| // {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3}, // dim -n |
| // {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3}, // dim -(n-1) |
| // ... |
| // {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3}, // dim -1 |
| // } |
| // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to |
| // fit input/output tensors. |
| // The indices already contain the strides to optimize the computations |
| static inline std::vector<Tensor> compute_indices_weights( |
| at::ScalarType scalar_type, |
| int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, |
| bool align_corners, const c10::optional<double> opt_scale |
| ) { |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| std::vector<Tensor> output; |
| HelperInterpCubic::init_indices_weights( |
| scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size); |
| |
| AT_DISPATCH_FLOATING_TYPES_AND( |
| ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] { |
| scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale); |
| |
| int64_t input_index; |
| int64_t zero = static_cast<int64_t>(0); |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) |
| scalar_t coeffs[4]; |
| |
| int64_t * idx_ptr; |
| scalar_t * wt_ptr; |
| using accscalar_t = at::acc_type<scalar_t, false>; |
| for (const auto i : c10::irange(output_size)) { |
| const accscalar_t real_input_index = |
| area_pixel_compute_source_index<accscalar_t>( |
| scale, i, align_corners, /*cubic=*/true); |
| input_index = static_cast<int64_t>(floorf(real_input_index)); |
| get_cubic_upsample_coefficients<scalar_t>(coeffs, real_input_index - input_index); |
| |
| for (const auto j : c10::irange(interp_size)) { |
| idx_ptr = output[2 * j + 0].data_ptr<int64_t>(); |
| idx_ptr[i] = static_cast<int64_t>(std::max(std::min(input_index + j - 1, input_size - 1), zero)) * stride; |
| wt_ptr = output[2 * j + 1].data_ptr<scalar_t>(); |
| wt_ptr[i] = coeffs[j]; |
| } |
| } |
| } |
| ); |
| return output; |
| } |
| |
| // taken from |
| // https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ |
| // src/libImaging/Resample.c#L46-L62 |
| template<typename scalar_t> |
| static inline scalar_t aa_filter(scalar_t x) { |
| // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm |
| #define a -0.5 |
| if (x < 0.0) { |
| x = -x; |
| } |
| if (x < 1.0) { |
| return ((a + 2.0) * x - (a + 3.0)) * x * x + 1; |
| } |
| if (x < 2.0) { |
| return (((x - 5) * x + 8) * x - 4) * a; |
| } |
| return 0.0; |
| #undef a |
| } |
| |
| static inline std::vector<Tensor> compute_indices_weights_aa( |
| at::ScalarType scalar_type, |
| int64_t input_size, |
| int64_t output_size, |
| int64_t stride, |
| int64_t ndims, |
| int64_t reshape_dim, |
| bool align_corners, |
| const c10::optional<double> opt_scale |
| ) { |
| |
| std::vector<Tensor> indices_weights; |
| AT_DISPATCH_FLOATING_TYPES( |
| scalar_type, "compute_indices_weights_aa", [&] { |
| |
| scalar_t scale = area_pixel_compute_scale<scalar_t>( |
| input_size, output_size, align_corners, opt_scale); |
| |
| auto interp_size = HelperInterpCubic::interp_size; |
| |
| indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>( |
| input_size, |
| output_size, |
| stride, |
| ndims, |
| reshape_dim, |
| scale, |
| interp_size, |
| &HelperInterpCubic::aa_filter<scalar_t>); |
| } |
| ); |
| return indices_weights; |
| } |
| }; |
| |
| // Generic upsampling interpolation kernel for the N-d case. |
| // The input is assumed to be like NCHW, NCL or NCKHW - the interpolated spatial dimensions |
| // are those from the end up to the batch size N and the number of channels C. |
| // |
| // Internally, it uses TensorIterator to optimize the computations. |
| // - out_ndims is the number of interpolated dims: 1, 2, 3 |
| // - scale_type is the template type for scales, typically std::vector<c10::optional<double>> (see scale_t) |
| // - class F is one of the above helper structs that computes indices and weights |
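| // |
| // Sketch of the setup below for the 2d case (out_ndims = 2) with an NCHW input: |
| //   - the input is restrided so that its two spatial strides are 0 and its shape matches the output, |
| //   - for each spatial dimension, F::compute_indices_weights() returns index/weight tensors of |
| //     shape (1, 1, output_H, 1) resp. (1, 1, 1, output_W) that broadcast inside the TensorIterator, |
| //   - the inner loop (cpu_upsample_generic) then combines, for every output element, the input |
| //     values at the given byte offsets with the corresponding weights. |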
| template <int out_ndims, typename scale_type, class F> |
| void upsample_generic_Nd_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| const scale_type& scales) { |
| |
| // input can be NCHW, NCL or NCKHW |
| auto shape = input.sizes().vec(); |
| auto strides = input.strides().vec(); |
| auto oshape = output.sizes(); |
| |
| TORCH_INTERNAL_ASSERT( |
| shape.size() == oshape.size() && shape.size() == 2 + out_ndims |
| ); |
| TORCH_INTERNAL_ASSERT(strides.size() == 2 + out_ndims); |
| |
| for (const auto i : c10::irange(out_ndims)) { |
| shape[i + 2] = oshape[i + 2]; |
| strides[i + 2] = 0; |
| } |
| auto restrided_input = input.as_strided(shape, strides); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| std::vector<std::vector<Tensor>> indices_weights; |
| |
| constexpr int interp_size = F::interp_size; |
| auto input_scalar_type = input.scalar_type(); |
| if ((interp_size == 1 && input_scalar_type == at::ScalarType::Byte)) { |
| // nearest also supports uint8 tensor, but we have to use float |
| // with compute_indices_weights |
| input_scalar_type = at::ScalarType::Float; |
| } |
| |
| for (const auto i : c10::irange(out_ndims)) { |
| // NOLINTNEXTLINE(performance-inefficient-vector-operation) |
| indices_weights.emplace_back( |
| F::compute_indices_weights( |
| input_scalar_type, input.size(i + 2), oshape[i + 2], |
| input.stride(i + 2) * input.element_size(), |
| input.dim(), i + 2, align_corners, scales[i] |
| ) |
| ); |
| } |
| |
| TensorIteratorConfig config; |
| config.check_all_same_dtype(false) |
| .declare_static_dtype_and_device(input.scalar_type(), input.device()) |
| .add_output(output) |
| .add_input(restrided_input); |
| |
| for (auto & idx_weight: indices_weights) { |
| for (auto& tensor : idx_weight) { |
| config.add_input(tensor); |
| } |
| } |
| |
| auto iter = config.build(); |
| |
| if (interp_size > 1) { |
| // Nearest also supports uint8 tensors, so we need to handle it separately |
| AT_DISPATCH_FLOATING_TYPES_AND( |
| at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { |
| // MSVC cannot capture the constexpr int interp_size here |
| constexpr int mode = F::interp_size; |
| cpu_upsample_generic<scalar_t, out_ndims, mode>(iter); |
| }); |
| } else { |
| AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, |
| iter.dtype(), "upsample_generic_Nd", [&] { |
| constexpr int mode = F::interp_size; |
| cpu_upsample_generic<scalar_t, out_ndims, mode>(iter); |
| }); |
| } |
| } |
| |
| template <typename scalar_t> |
| void cpu_upsample_generic_aa(at::TensorIterator& iter) { |
| |
| auto loop = [&](char** data, const int64_t* strides, int64_t n) { |
| if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) && |
| is_zero_stride<3 + 2>(&strides[2])) { |
| basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>( |
| data, strides, n); |
| } else { |
| basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>( |
| data, strides, n); |
| } |
| }; |
| |
| iter.for_each(loop); |
| } |
| |
| // Generic separable upsampling interpolation kernels for N-d case with anti-aliasing |
| template <int out_ndims, typename scale_type, class F> |
| void _separable_upsample_generic_Nd_kernel_impl_single_dim( |
| const Tensor& output, |
| const Tensor& input, |
| int interp_dim, |
| bool align_corners, |
| const scale_type& scales) { |
| |
| // input can be NCHW, NCL or NCKHW |
| auto shape = input.sizes().vec(); |
| auto strides = input.strides().vec(); |
| auto oshape = output.sizes(); |
| |
| TORCH_INTERNAL_ASSERT( |
| shape.size() == oshape.size() && shape.size() == 2 + out_ndims); |
| TORCH_INTERNAL_ASSERT(strides.size() == 2 + out_ndims); |
| |
| for (const auto i : c10::irange(out_ndims)) { |
| shape[i + 2] = oshape[i + 2]; |
| } |
| strides[interp_dim] = 0; |
| auto restrided_input = input.as_strided(shape, strides); |
| |
| std::vector<std::vector<Tensor>> indices_weights; |
| |
| int interp_size = F::interp_size; |
| auto input_scalar_type = input.scalar_type(); |
| if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) { |
| // nearest also supports uint8 tensor, but we have to use float |
| // with compute_indices_weights |
| input_scalar_type = at::ScalarType::Float; |
| } |
| |
| indices_weights.emplace_back( |
| F::compute_indices_weights_aa( |
| input_scalar_type, input.size(interp_dim), oshape[interp_dim], |
| input.stride(interp_dim) * input.element_size(), |
| input.dim(), interp_dim, align_corners, scales[interp_dim - 2])); |
| |
| TensorIteratorConfig config; |
| config.check_all_same_dtype(false) |
| .declare_static_dtype_and_device(input.scalar_type(), input.device()) |
| .add_output(output) |
| .add_input(restrided_input); |
| |
| for (auto& idx_weight : indices_weights) { |
| for (auto& tensor : idx_weight) { |
| config.add_input(tensor); |
| } |
| } |
| |
| auto iter = config.build(); |
| |
| if (interp_size > 1) { |
| // Nearest also supports uint8 tensors, so we need to handle it separately |
| AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] { |
| cpu_upsample_generic_aa<scalar_t>(iter); |
| }); |
| } else { |
| AT_DISPATCH_FLOATING_TYPES_AND( |
| at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] { |
| cpu_upsample_generic_aa<scalar_t>(iter); |
| }); |
| } |
| } |
| |
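| // For the 2d anti-aliased case (out_ndims == 2), the kernel below runs two single-dimension |
| // passes: a first pass over the last (width) dimension into a temporary tensor, then a second |
| // pass over the height dimension writing into the final output. |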
| template <int out_ndims, typename scale_type, class F> |
| void separable_upsample_generic_Nd_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| const scale_type& scales) { |
| |
| auto temp_oshape = input.sizes().vec(); |
| at::Tensor temp_output, temp_input = input; |
| for (const auto i : c10::irange(out_ndims - 1)) { |
| int interp_dim = 2 + out_ndims - 1 - i; |
| temp_oshape[interp_dim] = output.sizes()[interp_dim]; |
| temp_output = at::empty(temp_oshape, input.options()); |
| _separable_upsample_generic_Nd_kernel_impl_single_dim< |
| out_ndims, |
| scale_t, |
| F>( |
| temp_output, temp_input, interp_dim, align_corners, scales); |
| temp_input = temp_output; |
| } |
| _separable_upsample_generic_Nd_kernel_impl_single_dim< |
| out_ndims, |
| scale_t, |
| F>(output, temp_input, 2, align_corners, scales); |
| } |
| |
| void upsample_nearest1d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_w) { |
| upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearest>( |
| output, input, false, {scales_w}); |
| } |
| |
| void _upsample_nearest_exact1d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_w) { |
| upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearestExact>( |
| output, input, false, {scales_w}); |
| } |
| |
| int _use_vectorized_kernel_cond( |
| const Tensor& output, |
| const Tensor& input) { |
| // This condition is used to know whether we should dispatch to a vectorized |
| // kernel, or to the more general upsample_generic_Nd_kernel_impl(). For now, |
| // the vectorized kernels are only optimized for channels_last and when C >= 4 |
| // (shape = NCHW). For a very wide range of use-cases (typically image or mask |
| // resizing where we have C < 4), using upsample_generic_Nd_kernel_impl() is |
| // actually faster. On top of that, benchmarks showed that this also depends on |
| // the *output* size (output_H + output_W), for both upsampling and |
| // downsampling. The current 128 threshold was determined through benchmarks. |
| return ((input.is_contiguous(at::MemoryFormat::ChannelsLast)) && (input.size(-3) > 3)) || ((output.size(-2) + output.size(-1)) <= 128); |
| } |
| |
| void upsample_nearest2d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| if (_use_vectorized_kernel_cond(output, input)) { |
| AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, |
| input.scalar_type(), "upsample_nearest2d_channels_last", [&] { |
| cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_idx>(output, input, {scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpNearest>( |
| output, input, false, {scales_h, scales_w}); |
| } |
| } |
| |
| void _upsample_nearest_exact2d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| if (_use_vectorized_kernel_cond(output, input)) { |
| AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { |
| cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_exact_idx>(output, input, {scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpNearestExact>( |
| output, input, false, {scales_h, scales_w}); |
| } |
| } |
| |
| void upsample_nearest3d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_d, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { |
| AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, |
| input.scalar_type(), "upsample_nearest3d_channels_last", [&] { |
| cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_idx>(output, input, {scales_d, scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpNearest>( |
| output, input, false, {scales_d, scales_h, scales_w}); |
| } |
| } |
| |
| void _upsample_nearest_exact3d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| c10::optional<double> scales_d, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { |
| AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { |
| cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_exact_idx>(output, input, {scales_d, scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpNearestExact>( |
| output, input, false, {scales_d, scales_h, scales_w}); |
| } |
| } |
| |
| void upsample_linear1d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_w) { |
| upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpLinear>( |
| output, input, align_corners, {scales_w}); |
| } |
| |
| void upsample_bilinear2d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| |
| // See note above about _use_vectorized_kernel_cond(output, input). The extra cond is present |
| // because benchmarks showed that with only 1 thread, images (C == 3) were |
| // slightly faster with the vectorized kernel than with the generic one. |
| // That's not the case for masks though (C == 1), which strongly benefit from |
| // using the generic kernel. |
| if ((_use_vectorized_kernel_cond(output, input)) || (at::get_num_threads() == 1 && input.size(-3) == 3)) { |
| AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { |
| cpu_upsample_linear_channels_last<scalar_t, scale_t>(output, input, align_corners, {scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( |
| output, input, align_corners, {scales_h, scales_w}); |
| } |
| } |
| |
| void upsample_bilinear2d_aa_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| |
| separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>( |
| output, input, align_corners, {scales_h, scales_w}); |
| } |
| |
| void upsample_trilinear3d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_d, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { |
| AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { |
| cpu_upsample_linear_channels_last<scalar_t, scale_t>(output, input, align_corners, {scales_d, scales_h, scales_w}); |
| }); |
| } else { |
| upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpLinear>( |
| output, input, align_corners, {scales_d, scales_h, scales_w}); |
| } |
| } |
| |
| void upsample_bicubic2d_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>( |
| output, input, align_corners, {scales_h, scales_w}); |
| } |
| |
| void upsample_bicubic2d_aa_kernel_impl( |
| const Tensor& output, |
| const Tensor& input, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| |
| separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>( |
| output, input, align_corners, {scales_h, scales_w}); |
| } |
| |
| template < |
| typename scalar_t, |
| typename scale_type, |
| class F> |
| void cpu_upsample_genNd_backward_aa( |
| const Tensor& grad_input_, |
| const Tensor& grad_output_, |
| bool align_corners, |
| const scale_type& scales) { |
| TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), |
| " for `grad_input` but got dtype ", grad_input_.dtype()); |
| |
| auto grad_output = grad_output_.contiguous(); |
| auto grad_input = grad_input_.contiguous(); |
| |
| auto grad_output_data = grad_output.data_ptr<scalar_t>(); |
| auto grad_input_data = grad_input.data_ptr<scalar_t>(); |
| auto input_sizes = grad_input.sizes().vec(); |
| auto output_sizes = grad_output.sizes().vec(); |
| auto ndim = input_sizes.size(); |
| |
| // treat nbatch and channels as one dimension |
| int64_t channels = input_sizes[0] * input_sizes[1]; |
| int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; |
| int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; |
| int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; |
| int64_t input_width = input_sizes[ndim - 1]; |
| int64_t output_width = output_sizes[ndim - 1]; |
| |
| int64_t output_slice_size = output_depth * output_height * output_width; |
| int interp_size = F::interp_size; |
| |
| auto loop2d = [&](int64_t begin, int64_t end) { |
| const scalar_t height_scale = area_pixel_compute_scale<scalar_t>( |
| input_height, output_height, align_corners, scales[0]); |
| const scalar_t width_scale = area_pixel_compute_scale<scalar_t>( |
| input_width, output_width, align_corners, scales[1]); |
| |
| auto input_indexr = [=](int64_t c, int64_t h, int64_t w) { |
| return grad_input_data + c * input_height * input_width + |
| h * input_width + w; |
| }; |
| |
| const scalar_t support_h = (height_scale >= 1.0) |
| ? (interp_size * 0.5) * height_scale |
| : interp_size * 0.5; |
| const scalar_t support_w = (width_scale >= 1.0) |
| ? (interp_size * 0.5) * width_scale |
| : interp_size * 0.5; |
| |
| const int interp_height = (int)ceilf(support_h) * 2 + 1; |
| const int interp_width = (int)ceilf(support_w) * 2 + 1; |
| |
| std::vector<scalar_t> wx(interp_width, 0.0); |
| std::vector<scalar_t> wy(interp_height, 0.0); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int64_t xmin, ymin; |
| int64_t xsize, ysize; |
| |
| typedef scalar_t (*aa_filter_fn_t)(scalar_t); |
| aa_filter_fn_t filter_fn = &F::aa_filter; |
| |
| for (const auto oh : c10::irange(output_height)) { |
| F::_compute_weights_aa( |
| oh, |
| input_height, |
| height_scale, |
| support_h, |
| wy.data(), |
| interp_height, |
| filter_fn, |
| ymin, |
| ysize); |
| |
| for (const auto ow : c10::irange(output_width)) { |
| F::_compute_weights_aa( |
| ow, |
| input_width, |
| width_scale, |
| support_w, |
| wx.data(), |
| interp_width, |
| filter_fn, |
| xmin, |
| xsize); |
| |
| for (const auto c : c10::irange(begin, end)) { |
| scalar_t grad_output_value = |
| grad_output_data[c * output_slice_size + oh * output_width + ow]; |
| |
| for (const auto y : c10::irange(ysize)) { |
| for (const auto x : c10::irange(xsize)) { |
| *input_indexr(c, ymin + y, xmin + x) += |
| wx[x] * wy[y] * grad_output_value; |
| } |
| } |
| } |
| } |
| } |
| }; |
| |
| if (ndim == 4) { |
| // upsample bilinear 2d |
| at::parallel_for( |
| 0, channels, at::internal::GRAIN_SIZE / output_slice_size / 4, loop2d); |
| } else { |
| TORCH_CHECK(false, "Unsupported tensor ndim"); |
| } |
| |
| if (!grad_input_.is_contiguous()) { |
| grad_input_.copy_(grad_input); |
| } |
| } |
| |
| void upsample_bilinear2d_aa_backward_kernel_impl( |
| const Tensor& grad_input, |
| const Tensor& grad_output, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| AT_DISPATCH_FLOATING_TYPES( |
| grad_output.scalar_type(), "upsample_bilinear2d_aa_backward_cpu", [&] { |
| cpu_upsample_genNd_backward_aa<scalar_t, scale_t, HelperInterpLinear>( |
| grad_input, grad_output, align_corners, {scales_h, scales_w}); |
| }); |
| } |
| |
| void upsample_bicubic2d_aa_backward_kernel_impl( |
| const Tensor& grad_input, |
| const Tensor& grad_output, |
| bool align_corners, |
| c10::optional<double> scales_h, |
| c10::optional<double> scales_w) { |
| AT_DISPATCH_FLOATING_TYPES( |
| grad_output.scalar_type(), "upsample_bicubic2d_aa_backward_cpu", [&] { |
| cpu_upsample_genNd_backward_aa<scalar_t, scale_t, HelperInterpCubic>( |
| grad_input, grad_output, align_corners, {scales_h, scales_w}); |
| }); |
| } |
| |
| } // anonymous namespace |
| |
| REGISTER_DISPATCH(upsample_nearest1d_kernel, &upsample_nearest1d_kernel_impl); |
| REGISTER_DISPATCH(_upsample_nearest_exact1d_kernel, &_upsample_nearest_exact1d_kernel_impl); |
| REGISTER_DISPATCH(upsample_nearest2d_kernel, &upsample_nearest2d_kernel_impl); |
| REGISTER_DISPATCH(_upsample_nearest_exact2d_kernel, &_upsample_nearest_exact2d_kernel_impl); |
| REGISTER_DISPATCH(upsample_nearest3d_kernel, &upsample_nearest3d_kernel_impl); |
| REGISTER_DISPATCH(_upsample_nearest_exact3d_kernel, &_upsample_nearest_exact3d_kernel_impl); |
| |
| REGISTER_DISPATCH(upsample_linear1d_kernel, &upsample_linear1d_kernel_impl); |
| REGISTER_DISPATCH(upsample_bilinear2d_kernel, &upsample_bilinear2d_kernel_impl); |
| REGISTER_DISPATCH(_upsample_bilinear2d_aa_kernel, &upsample_bilinear2d_aa_kernel_impl); |
| REGISTER_DISPATCH(_upsample_bilinear2d_aa_backward_kernel, &upsample_bilinear2d_aa_backward_kernel_impl); |
| REGISTER_DISPATCH(upsample_trilinear3d_kernel, &upsample_trilinear3d_kernel_impl); |
| |
| REGISTER_DISPATCH(upsample_bicubic2d_kernel, &upsample_bicubic2d_kernel_impl); |
| REGISTER_DISPATCH(_upsample_bicubic2d_aa_kernel, &upsample_bicubic2d_aa_kernel_impl); |
| REGISTER_DISPATCH(_upsample_bicubic2d_aa_backward_kernel, &upsample_bicubic2d_aa_backward_kernel_impl); |
| } // namespace native |
| } // namespace at |