| #pragma once |
| |
| #include <math.h> |
| |
| #include <ATen/OpMathType.h> |
| #include <ATen/TensorUtils.h> |
| #include <ATen/core/Tensor.h> |
| #include <ATen/cpu/vec/vec.h> |
| #include <ATen/native/DispatchStub.h> |
| |
| /** |
| * Note [compute_scales_value] |
| * Note [area_pixel_compute_scale] |
| * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| * Interpolate with scale_factor can have different behaviors |
| * depending on the value of recompute_scale_factor: |
| * |
| * - With recompute_scale_factor = True (current default behavior): |
| * the scale_factor, when provided by the user, is used to calculate |
| * the output size. The input size and the computed output_size |
| * are then used to infer new values for the scales which are |
| * used in the interpolation. Because floating-point math is not exact, |
| * these may differ from the user-supplied scales. |
| * |
| * - With recompute_scale_factor = False (which will be the default |
| * behavior starting 1.5.0): |
| * the behavior follows OpenCV logic, and the scales provided by |
| * the user are the ones used in the interpolation calculations. |
| * |
| * If the scales are not provided, or if they are provided but |
| * recompute_scale_factor is set to True (default behavior), the scales |
| * are computed from the input and the output size. |
| * |
| * When the scales are inferred from the input and output sizes, |
| * we view each pixel as an area, with idx + 0.5 as its center index. |
| * Here is an example formula in the 1D case. |
| * if align_corners: the centers of the two corner pixel areas are preserved, |
| * (0.5) -> (0.5), |
| * (input_size - 0.5) -> (output_size - 0.5) |
| * scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5) |
| * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) |
| * if not align_corners: the whole range is scaled accordingly |
| * scale = input_size / output_size |
| * src_index + 0.5 = scale * (dst_index + 0.5) |
| */ |
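| |
| // A worked numerical example of the note above (sizes chosen purely for |
| // illustration): upsampling a 1D input of size 4 to size 8 with inferred scales. |
| // if align_corners: |
| //     scale = (4 - 1) / (8 - 1) = 3/7 |
| //     src_index = 3/7 * dst_index, so dst_index 0 -> 0 and dst_index 7 -> 3, |
| //     i.e. both corner pixel centers are preserved. |
| // if not align_corners: |
| //     scale = 4 / 8 = 0.5 |
| //     src_index = 0.5 * (dst_index + 0.5) - 0.5, so dst_index 0 -> -0.25 and |
| //     dst_index 7 -> 3.25; the output range maps back onto the whole input. |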
| |
| namespace at::native { |
| |
| namespace upsample { |
| |
| TORCH_API c10::SmallVector<int64_t, 3> compute_output_size( |
| c10::IntArrayRef input_size, // Full input tensor size. |
| at::OptionalIntArrayRef output_size, |
| c10::optional<c10::ArrayRef<double>> scale_factors); |
| |
| inline c10::optional<double> get_scale_value(c10::optional<c10::ArrayRef<double>> scales, int idx) { |
| if (!scales) { |
| return c10::nullopt; |
| } |
| return scales->at(idx); |
| } |
| |
| } // namespace upsample |
| |
| using scale_t = c10::optional<double>; |
| using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); |
| using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); |
| using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); |
| using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); |
| using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); |
| using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); |
| using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w); |
| using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); |
| using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); |
| using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w); |
| using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); |
| using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); |
| DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel); |
| DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel); |
| DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel); |
| DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel); |
| DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel); |
| DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel); |
| DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel); |
| DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel); |
| DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel); |
| DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel); |
| DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel); |
| DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel); |
| DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel); |
| DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel); |
| DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel); |
| DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel); |
| DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel); |
| DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel); |
| |
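| // For example (illustrative sizes only): input_size = {2, 3, 10}, i.e. |
| // (nbatch, channels, input_width), with output_size = {20} passes the checks |
| // below and returns {2, 3, 20}. |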
| static C10_UNUSED std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) { |
| TORCH_CHECK( |
| output_size.size() == 1, |
| "It is expected that output_size has 1 element, but got size ", |
| output_size.size()); |
| |
| TORCH_CHECK( |
| input_size.size() == 3, |
| "It is expected that input_size has 3 elements, but got size ", |
| input_size.size()); |
| |
| int64_t output_width = output_size[0]; |
| |
| int64_t nbatch = input_size[0]; |
| int64_t channels = input_size[1]; |
| int64_t input_width = input_size[2]; |
| |
| TORCH_CHECK( |
| input_width > 0 && output_width > 0, |
| "Input and output sizes should be greater than 0, but got input (W: ", |
| input_width, |
| ") and output (W: ", |
| output_width, |
| ")"); |
| |
| return {nbatch, channels, output_width}; |
| } |
| |
| static C10_UNUSED std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) { |
| TORCH_CHECK( |
| output_size.size() == 2, |
| "It is expected that output_size has 2 elements, but got size ", |
| output_size.size()); |
| |
| TORCH_CHECK( |
| input_size.size() == 4, |
| "It is expected that input_size has 4 elements, but got size ", |
| input_size.size()); |
| |
| int64_t output_height = output_size[0]; |
| int64_t output_width = output_size[1]; |
| |
| int64_t nbatch = input_size[0]; |
| int64_t channels = input_size[1]; |
| int64_t input_height = input_size[2]; |
| int64_t input_width = input_size[3]; |
| |
| TORCH_CHECK( |
| input_height > 0 && input_width > 0 && output_height > 0 && |
| output_width > 0, |
| "Input and output sizes should be greater than 0," |
| " but got input (H: ", |
| input_height, |
| ", W: ", |
| input_width, |
| ") output (H: ", |
| output_height, |
| ", W: ", |
| output_width, |
| ")"); |
| |
| return {nbatch, channels, output_height, output_width}; |
| } |
| |
| static C10_UNUSED std::array<int64_t, 5> upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) { |
| TORCH_CHECK( |
| output_size.size() == 3, |
| "It is expected that output_size has 3 elements, but got size ", |
| output_size.size()); |
| |
| TORCH_CHECK( |
| input_size.size() == 5, |
| "It is expected that input_size has 5 elements, but got size ", |
| input_size.size()); |
| |
| int64_t output_depth = output_size[0]; |
| int64_t output_height = output_size[1]; |
| int64_t output_width = output_size[2]; |
| |
| int64_t nbatch = input_size[0]; |
| int64_t channels = input_size[1]; |
| int64_t input_depth = input_size[2]; |
| int64_t input_height = input_size[3]; |
| int64_t input_width = input_size[4]; |
| |
| TORCH_CHECK( |
| input_depth > 0 && input_height > 0 && input_width > 0 && |
| output_depth > 0 && output_height > 0 && output_width > 0, |
| "Input and output sizes should be greater than 0, but got input (D: ", |
| input_depth, |
| ", H: ", |
| input_height, |
| ", W: ", |
| input_width, |
| ") output (D: ", |
| output_depth, |
| ", H: ", |
| output_height, |
| ", W: ", |
| output_width, |
| ")"); |
| |
| return {nbatch, channels, output_depth, output_height, output_width}; |
| } |
| |
| static inline void upsample_2d_shape_check( |
| const Tensor& input, |
| const Tensor& grad_output, |
| int64_t nbatch, |
| int64_t nchannels, |
| int64_t input_height, |
| int64_t input_width, |
| int64_t output_height, |
| int64_t output_width) { |
| TORCH_CHECK( |
| input_height > 0 && input_width > 0 && output_height > 0 && |
| output_width > 0, |
| "Input and output sizes should be greater than 0," |
| " but got input (H: ", |
| input_height, |
| ", W: ", |
| input_width, |
| ") output (H: ", |
| output_height, |
| ", W: ", |
| output_width, |
| ")"); |
| |
| if (input.defined()) { |
| // Allow for empty batch size but not other dimensions |
| TORCH_CHECK( |
| (input.numel() != 0 || |
| (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0) |
| ) && |
| input.dim() == 4, |
| "Non-empty 4D data tensor expected but got a tensor with sizes ", |
| input.sizes()); |
| } else if (grad_output.defined()) { |
| check_dim_size(grad_output, 4, 0, nbatch); |
| check_dim_size(grad_output, 4, 1, nchannels); |
| check_dim_size(grad_output, 4, 2, output_height); |
| check_dim_size(grad_output, 4, 3, output_width); |
| } |
| } |
| |
| template <typename scalar_t> |
| static inline scalar_t compute_scales_value( |
| const c10::optional<double> scale, |
| int64_t input_size, |
| int64_t output_size) { |
| // see Note [compute_scales_value] |
| // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. |
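| // For example (illustrative only): a user-provided scale_factor of 2.0 |
| // (output twice as large) is used here as 1.0 / 2.0 = 0.5, i.e. the step in |
| // input coordinates per output pixel; when no usable scale is given, the |
| // same step is recomputed as input_size / output_size. |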
| return (scale.has_value() && scale.value() > 0.) |
| ? static_cast<scalar_t>(1.0 / scale.value()) |
| : (static_cast<scalar_t>(input_size) / output_size); |
| } |
| |
| template <typename scalar_t> |
| static inline scalar_t area_pixel_compute_scale( |
| int64_t input_size, |
| int64_t output_size, |
| bool align_corners, |
| const c10::optional<double> scale) { |
| // see Note [area_pixel_compute_scale] |
| if (align_corners) { |
| if (output_size > 1) { |
| return static_cast<scalar_t>(input_size - 1) / (output_size - 1); |
| } else { |
| return static_cast<scalar_t>(0); |
| } |
| } else { |
| return compute_scales_value<scalar_t>(scale, input_size, output_size); |
| } |
| } |
| |
| template <typename scalar_t> |
| static inline scalar_t area_pixel_compute_source_index( |
| scalar_t scale, |
| int64_t dst_index, |
| bool align_corners, |
| bool cubic) { |
| if (align_corners) { |
| return scale * dst_index; |
| } else { |
| scalar_t src_idx = scale * (dst_index + static_cast<scalar_t>(0.5)) - |
| static_cast<scalar_t>(0.5); |
| // [Note] Follow OpenCV resize logic: |
| // We allow a negative src_idx here and later use |
| //   dx = src_idx - floorf(src_idx) |
| // to compute the "distance" (which affects the weights). |
| // For linear modes, the weight distribution doesn't matter for negative |
| // indices, since they use 2 pixels to interpolate. For example, for the |
| // pair [-1, 0] both taps read the pixel 0 value, so it makes no difference |
| // whether we clamp src_idx to 0 or not. |
| // TODO: make our linear mode impls use unbound indices where appropriate, |
| // and then remove this cubic flag. |
| // This matters in cubic mode, as we might need [-1, 0, 1, 2] |
| // to interpolate and the weights can be affected. |
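| // A small worked example (illustrative values only): with input_size = 2, |
| // output_size = 4 and align_corners = false, scale = 2 / 4 = 0.5, so |
| //   dst_index = 0 -> src_idx = 0.5 * 0.5 - 0.5 = -0.25 |
| //   dst_index = 3 -> src_idx = 0.5 * 3.5 - 0.5 = 1.25 |
| // For linear modes the -0.25 is clamped to 0 below; for cubic it is kept so |
| // that dx = src_idx - floorf(src_idx) = 0.75 can weight the 4 cubic taps. |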
| return (!cubic && src_idx < static_cast<scalar_t>(0)) ? scalar_t(0) |
| : src_idx; |
| } |
| } |
| |
| static inline int64_t nearest_neighbor_compute_source_index( |
| const float scale, |
| int64_t dst_index, |
| int64_t input_size) { |
| // Index computation matching OpenCV INTER_NEAREST, |
| // which is buggy and kept for backward compatibility (BC) |
| const int64_t src_index = |
| std::min(static_cast<int64_t>(floorf(dst_index * scale)), input_size - 1); |
| return src_index; |
| } |
| |
| static inline int64_t nearest_neighbor_exact_compute_source_index( |
| const float scale, |
| int64_t dst_index, |
| int64_t input_size) { |
| // index_f32 = (dst_index + 0.5) * scale - 0.5 |
| // input_index = round(index_f32) |
| // which the code below implements as floorf((dst_index + 0.5) * scale), |
| // i.e. round-half-up applied to index_f32. |
| // Same as Pillow and Scikit-Image/Scipy ndi.zoom |
| const int64_t src_index = |
| std::min(static_cast<int64_t>(floorf((dst_index + 0.5) * scale)), input_size - 1); |
| return src_index; |
| } |
| |
| static inline int64_t nearest_idx( |
| int64_t output_index, |
| int64_t input_size, |
| int64_t output_size, |
| c10::optional<double> scales) { |
| // This method specifically handles the cases output_size == input_size and |
| // output_size == 2 * input_size, special cases that we would like to get rid of. |
| // We keep this method for BC and consider it deprecated; |
| // see nearest_exact_idx as the replacement. |
| if (output_size == input_size) { |
| // scale_factor = 1, simply copy |
| return output_index; |
| } else if (output_size == 2 * input_size) { |
| // scale_factor = 2, shift input index |
| return output_index >> 1; |
| } else { |
| float scale = compute_scales_value<float>(scales, input_size, output_size); |
| return nearest_neighbor_compute_source_index(scale, output_index, input_size); |
| } |
| } |
| |
| static inline int64_t nearest_exact_idx( |
| int64_t output_index, |
| int64_t input_size, |
| int64_t output_size, |
| c10::optional<double> scales) { |
| float scale = compute_scales_value<float>(scales, input_size, output_size); |
| return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); |
| } |
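| |
| // Illustrative comparison of the two indexing modes (example sizes only): |
| // with input_size = 2, output_size = 5 and no user scales, scale = 2 / 5 = 0.4. |
| //   nearest_idx:       floorf(dst * 0.4)         -> 0, 0, 0, 1, 1 for dst = 0..4 |
| //   nearest_exact_idx: floorf((dst + 0.5) * 0.4) -> 0, 0, 1, 1, 1 for dst = 0..4 |
| // The exact variant centers sampling on pixel areas and matches Pillow and |
| // scipy.ndimage.zoom; the legacy variant is kept for backward compatibility. |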
| |
| // Define a typedef to dispatch to nearest_idx or nearest_exact_idx |
| typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional<double>); |
| |
| template <typename scalar_t> |
| static scalar_t upsample_get_value_bounded( |
| scalar_t* data, |
| int64_t width, |
| int64_t height, |
| int64_t x, |
| int64_t y) { |
| int64_t access_x = std::max(std::min(x, width - 1), static_cast<int64_t>(0)); |
| int64_t access_y = std::max(std::min(y, height - 1), static_cast<int64_t>(0)); |
| return data[access_y * width + access_x]; |
| } |
| |
| template <typename scalar_t> |
| static void upsample_increment_value_bounded( |
| scalar_t* data, |
| int64_t width, |
| int64_t height, |
| int64_t x, |
| int64_t y, |
| scalar_t value) { |
| int64_t access_x = std::max(std::min(x, width - 1), static_cast<int64_t>(0)); |
| int64_t access_y = std::max(std::min(y, height - 1), static_cast<int64_t>(0)); |
| data[access_y * width + access_x] += value; |
| } |
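| |
| // Both helpers above clamp x and y to [0, width - 1] and [0, height - 1], so |
| // accesses just outside the image fall back to the nearest border pixel |
| // (replicate padding), e.g. when a 4-tap cubic support such as [-1, 0, 1, 2] |
| // extends past an edge. |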
| |
| // Based on |
| // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm |
| template <typename scalar_t> |
| static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) { |
| return ((A + 2) * x - (A + 3)) * x * x + 1; |
| } |
| |
| template <typename scalar_t> |
| static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) { |
| return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; |
| } |
| |
| template <typename scalar_t> |
| static inline void get_cubic_upsample_coefficients( |
| scalar_t coeffs[4], |
| scalar_t t) { |
| scalar_t A = -0.75; |
| |
| scalar_t x1 = t; |
| coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A); |
| coeffs[1] = cubic_convolution1<scalar_t>(x1, A); |
| |
| // opposite coefficients |
| scalar_t x2 = 1.0 - t; |
| coeffs[2] = cubic_convolution1<scalar_t>(x2, A); |
| coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A); |
| } |
| |
| template <typename scalar_t> |
| static inline scalar_t cubic_interp1d( |
| scalar_t x0, |
| scalar_t x1, |
| scalar_t x2, |
| scalar_t x3, |
| scalar_t t) { |
| scalar_t coeffs[4]; |
| get_cubic_upsample_coefficients<scalar_t>(coeffs, t); |
| |
| return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; |
| } |
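| |
| // A quick sanity check of the cubic kernel (values worked out by hand, for |
| // illustration): with A = -0.75, |
| //   t = 0   -> coeffs = [0, 1, 0, 0], so cubic_interp1d returns exactly x1; |
| //   t = 0.5 -> coeffs = [-0.09375, 0.59375, 0.59375, -0.09375]. |
| // In both cases the coefficients sum to 1, as expected for an interpolating |
| // kernel. |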
| |
| // When `real_input_index` becomes larger than the range the floating point |
| // type can accurately represent, the cast to `int64_t` might exceed |
| // `input_size - 1`, i.e. run past the last valid index, so we guard it with |
| // `std::min` below. |
| template<typename scalar_t, typename opmath_t> |
| static inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) { |
| input_index = std::min(static_cast<int64_t>(floorf(real_input_index)), input_size - 1); |
| lambda = std::min( |
| std::max(real_input_index - input_index, static_cast<opmath_t>(0)), |
| static_cast<opmath_t>(1) |
| ); |
| } |
| |
| template<typename scalar_t, typename opmath_t> |
| static inline void compute_source_index_and_lambda( |
| int64_t& input_index0, |
| int64_t& input_index1, |
| scalar_t& lambda0, |
| scalar_t& lambda1, |
| opmath_t ratio, |
| int64_t output_index, |
| int64_t input_size, |
| int64_t output_size, |
| bool align_corners) { |
| if (output_size == input_size) { |
| // scale_factor = 1, simply copy |
| input_index0 = output_index; |
| input_index1 = output_index; |
| lambda0 = static_cast<scalar_t>(1); |
| lambda1 = static_cast<scalar_t>(0); |
| } else { |
| const auto real_input_index = |
| area_pixel_compute_source_index<opmath_t>( |
| ratio, output_index, align_corners, /*cubic=*/false); |
| guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1); |
| int64_t offset = (input_index0 < input_size - 1) ? 1 : 0; |
| input_index1 = input_index0 + offset; |
| lambda0 = static_cast<scalar_t>(1.) - lambda1; |
| } |
| } |
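| |
| // A worked example (values chosen purely for illustration): linearly resizing |
| // a 1D row {10, 20} (input_size = 2) to output_size = 4 with |
| // align_corners = false gives ratio = 0.5. For output_index = 1, |
| // real_input_index = 0.5 * 1.5 - 0.5 = 0.25, so input_index0 = 0, |
| // input_index1 = 1, lambda1 = 0.25, lambda0 = 0.75, and a linear kernel using |
| // these weights would produce 0.75 * 10 + 0.25 * 20 = 12.5. |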
| |
| // This path is only used for BFloat16 grad input; the generic overload below |
| // only reports an error for any other type combination. |
| template <typename scalar_in, typename scalar_out> |
| void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { |
| TORCH_CHECK((std::is_same<scalar_out, BFloat16>::value), |
| "Upsample backward only supports BFloat16 in the lower precision data types on CPU."); |
| TORCH_CHECK((std::is_same<scalar_in, float>::value), |
| "Upsample backward should use float as the acc buffer for BFloat16 grad input on CPU."); |
| return; |
| } |
| |
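| // BFloat16 specialization: accumulate the float accumulation buffer into the |
| // BFloat16 grad input and reset the buffer to zero, using vectorized loads |
| // for the main body and a scalar tail loop for the remainder. |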
| template <> |
| void inline apply_grad_input(float* buffer_ptr, BFloat16* gin, int64_t size) { |
| using bVec = vec::Vectorized<BFloat16>; |
| using fVec = vec::Vectorized<float>; |
| int64_t d = 0; |
| for (; d < size - (size % bVec::size()); d += bVec::size()) { |
| bVec gin_bvec = bVec::loadu(gin + d); |
| fVec gin_fvec0, gin_fvec1; |
| std::tie(gin_fvec0, gin_fvec1) = convert_bfloat16_float(gin_bvec); |
| gin_fvec0 += fVec::loadu(buffer_ptr + d); |
| gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); |
| fVec(0).store(buffer_ptr + d); |
| fVec(0).store(buffer_ptr + d + fVec::size()); |
| convert_float_bfloat16(gin_fvec0, gin_fvec1).store(gin + d); |
| } |
| for (; d < size; d++) { |
| gin[d] += buffer_ptr[d]; |
| buffer_ptr[d] = 0; |
| } |
| } |
| |
| } // namespace at::native |