#define TORCH_ASSERT_NO_OPERATORS
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <ATen/native/Activation.h>
#include <cmath>
#include <functional>
#include <ATen/Dispatch.h>
#include <ATen/OpMathType.h>
#include <ATen/core/TensorBase.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/Parallel.h>
#include <c10/core/Scalar.h>
namespace at::native {
namespace {
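// log_sigmoid forward: output = min(x, 0) - log1p(exp(-|x|)).
// The intermediate exp(-|x|) is stored in `buffer` for reuse in the backward pass.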
static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
if (at::isReducedFloatingType(input.scalar_type())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
scalar_t* output_data = output.data_ptr<scalar_t>();
scalar_t* buffer_data = buffer.data_ptr<scalar_t>();
scalar_t* input_data = input.data_ptr<scalar_t>();
parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) {
int64_t size = end - begin;
int64_t d = 0;
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(input_data + begin + d);
Vectorized<float> data_vec0, data_vec1;
std::tie(data_vec0, data_vec1) = convert_to_float<scalar_t>(data_vec);
Vectorized<float> min_vec = minimum(data_vec0, Vectorized<float>(float(0)));
Vectorized<float> buffer_vec0 = data_vec0.abs().neg().exp();
Vectorized<float> output_vec0 = min_vec - buffer_vec0.log1p();
min_vec = minimum(data_vec1, Vectorized<float>(float(0)));
Vectorized<float> buffer_vec1 = data_vec1.abs().neg().exp();
Vectorized<float> output_vec1 = min_vec - buffer_vec1.log1p();
convert_from_float<scalar_t>(buffer_vec0, buffer_vec1).store(buffer_data + begin + d);
convert_from_float<scalar_t>(output_vec0, output_vec1).store(output_data + begin + d);
}
if (size - d > 0) {
Vec data_vec = Vec::loadu(input_data + begin + d, size - d);
Vectorized<float> data_vec0, data_vec1;
std::tie(data_vec0, data_vec1) = convert_to_float<scalar_t>(data_vec);
Vectorized<float> min_vec = minimum(data_vec0, Vectorized<float>(float(0)));
Vectorized<float> buffer_vec0 = data_vec0.abs().neg().exp();
Vectorized<float> output_vec0 = min_vec - buffer_vec0.log1p();
min_vec = minimum(data_vec1, Vectorized<float>(float(0)));
Vectorized<float> buffer_vec1 = data_vec1.abs().neg().exp();
Vectorized<float> output_vec1 = min_vec - buffer_vec1.log1p();
convert_from_float<scalar_t>(buffer_vec0, buffer_vec1).store(buffer_data + begin + d, size - d);
convert_from_float<scalar_t>(output_vec0, output_vec1).store(output_data + begin + d, size - d);
}
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
using Vec = Vectorized<scalar_t>;
scalar_t* output_data = output.data_ptr<scalar_t>();
scalar_t* buffer_data = buffer.data_ptr<scalar_t>();
scalar_t* input_data = input.data_ptr<scalar_t>();
parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) {
int64_t size = end - begin;
int64_t d = 0;
for (; d < size - (size % Vec::size()); d += Vec::size()) {
Vec data_vec = Vec::loadu(input_data + begin + d);
Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0)));
Vec buffer_vec = data_vec.abs().neg().exp();
Vec output_vec = min_vec - buffer_vec.log1p();
buffer_vec.store(buffer_data + begin + d);
output_vec.store(output_data + begin + d);
}
if (size - d > 0) {
Vec data_vec = Vec::loadu(input_data + begin + d, size - d);
Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0)));
Vec buffer_vec = data_vec.abs().neg().exp();
Vec output_vec = min_vec - buffer_vec.log1p();
buffer_vec.store(buffer_data + begin + d, size - d);
output_vec.store(output_data + begin + d, size - d);
}
});
});
}
}
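// log_sigmoid backward: with b = exp(-|x|) from the forward buffer and c = grad_output,
// grad_input = (max_deriv - sign * b / (1 + b)) * c,
// where max_deriv = 1, sign = 1 for x < 0 and max_deriv = 0, sign = -1 otherwise.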
static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto zero_val = float(0);
auto zero_vec = Vectorized<float>(zero_val);
auto one_val = float(1);
auto one_vec = Vectorized<float>(one_val);
cpu_kernel_vec(iter,
[=](scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
auto in_negative = float(a) < float(0);
auto max_deriv = in_negative ? float(1) : float(0);
auto sign = in_negative ? float(1) : -float(1);
return (max_deriv - sign * (float(b) / (float(1) + b))) * float(c);
},
[=](Vec a, Vec b, Vec c) -> Vec {
Vectorized<float> a0, a1, b0, b1, c0, c1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
std::tie(c0, c1) = convert_to_float<scalar_t>(c);
auto mask = a0 < zero_vec;
auto max_deriv_vec = Vectorized<float>::blendv(zero_vec, one_vec, mask);
auto sign_vec = Vectorized<float>::blendv(one_vec.neg(), one_vec, mask);
a0 = (max_deriv_vec - sign_vec * (b0 / (one_vec + b0))) * c0;
mask = a1 < zero_vec;
max_deriv_vec = Vectorized<float>::blendv(zero_vec, one_vec, mask);
sign_vec = Vectorized<float>::blendv(one_vec.neg(), one_vec, mask);
a1 = (max_deriv_vec - sign_vec * (b1 / (one_vec + b1))) * c1;
return convert_from_float<scalar_t>(a0, a1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto zero_val = scalar_t(0);
auto zero_vec = Vec(zero_val);
auto one_val = scalar_t(1);
auto one_vec = Vec(one_val);
cpu_kernel_vec(iter,
[=](scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
auto in_negative = a < scalar_t(0);
auto max_deriv = in_negative ? scalar_t(1) : scalar_t(0);
auto sign = in_negative ? scalar_t(1) : -scalar_t(1);
return (max_deriv - sign * (b / (scalar_t(1) + b))) * c;
},
[=](Vec a, Vec b, Vec c) -> Vec {
auto mask = a < zero_vec;
auto max_deriv_vec = Vec::blendv(zero_vec, one_vec, mask);
auto sign_vec = Vec::blendv(one_vec.neg(), one_vec, mask);
return (max_deriv_vec - sign_vec * (b / (one_vec + b))) * c;
});
});
}
}
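// threshold: out = value if x <= threshold, else the second operand.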
static void threshold_kernel(
TensorIteratorBase& iter,
const Scalar& threshold_scalar,
const Scalar& value_scalar) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "threshold_cpu", [&]() {
using Vec = Vectorized<float>;
float threshold = threshold_scalar.to<float>();
Vec threshold_v = Vec(threshold);
scalar_t value = value_scalar.to<scalar_t>();
Vec value_v = Vec(float(value));
cpu_kernel_vec(
iter,
[&](scalar_t x, scalar_t other) -> scalar_t {
return float(x) <= threshold ? value : other;
},
[&](Vectorized<scalar_t> x, Vectorized<scalar_t> other) -> Vectorized<scalar_t> {
Vec x0, x1, other0, other1;
std::tie(x0, x1) = convert_to_float<scalar_t>(x);
std::tie(other0, other1) = convert_to_float<scalar_t>(other);
return convert_from_float<scalar_t>(Vec::blendv(other0, value_v, x0 <= threshold_v),
Vec::blendv(other1, value_v, x1 <= threshold_v));
});
});
} else {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "threshold_cpu", [&] {
using Vec = Vectorized<scalar_t>;
scalar_t threshold = threshold_scalar.to<scalar_t>();
Vec threshold_v = Vec(threshold);
scalar_t value = value_scalar.to<scalar_t>();
Vec value_v = Vec(value);
cpu_kernel_vec(
iter,
[&](scalar_t x, scalar_t other) -> scalar_t {
return x <= threshold ? value : other;
},
[&](Vec x, Vec other) -> Vec {
return Vec::blendv(other, value_v, x <= threshold_v);
});
});
}
}
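// ELU: out = negcoef * (exp(x * negiptcoef) - 1) for x <= 0, poscoef * x for x > 0,
// with negcoef = alpha * scale, poscoef = scale and negiptcoef = input_scale.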
void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "elu_cpu", [&]() {
auto negcoef = alpha.to<float>() * scale.to<float>();
auto poscoef = scale.to<float>();
auto negiptcoef = input_scale.to<float>();
const Vectorized<float> negcoef_vec(negcoef);
const Vectorized<float> negiptcoef_vec(negiptcoef);
const Vectorized<float> poscoef_vec(poscoef);
const Vectorized<float> one_vec(static_cast<float>(1));
const Vectorized<float> zero_vec(static_cast<float>(0));
cpu_kernel_vec(
it,
[negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t {
return float(a) <= float(0) ? (std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef;
},
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized<scalar_t> a) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, res0, res1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
auto cmp0 = (a0 > zero_vec);
auto cmp1 = (a1 > zero_vec);
auto get_res_masked = [&](Vectorized<float>& cmp, Vectorized<float>& a) {
return !cmp.zero_mask() ? a * poscoef_vec :
Vectorized<float>::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp);
};
res0 = get_res_masked(cmp0, a0);
res1 = get_res_masked(cmp1, a1);
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(it.common_dtype(), "elu_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto negcoef = alpha.to<scalar_t>() * scale.to<scalar_t>();
auto poscoef = scale.to<scalar_t>();
auto negiptcoef = input_scale.to<scalar_t>();
const Vec negcoef_vec(negcoef);
const Vec negiptcoef_vec(negiptcoef);
const Vec poscoef_vec(poscoef);
const Vec one_vec(static_cast<scalar_t>(1));
const Vec zero_vec(static_cast<scalar_t>(0));
cpu_kernel_vec(
it,
[negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t {
return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef;
},
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec {
auto cmp = (a > zero_vec);
if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed
return a * poscoef_vec;
} else {
return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp);
}
});
});
}
}
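// ELU backward: a is grad_output, b is the saved forward result (is_result) or the input.
// grad_input = a * negiptcoef * (b + negcoef) if is_result and b <= 0,
// a * negiptcoef * negcoef * exp(b * negiptcoef) if !is_result and b <= 0,
// a * poscoef otherwise.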
void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, bool is_result) {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "elu_backward_cpu", [&]() {
auto negcoef = alpha.to<float>() * scale.to<float>();
auto poscoef = scale.to<float>();
auto negiptcoef = input_scale.to<float>();
const Vectorized<float> negcoef_vec(negcoef);
const Vectorized<float> negiptcoef_vec(negiptcoef);
const Vectorized<float> poscoef_vec(poscoef);
const Vectorized<float> zero_vec(static_cast<float>(0));
cpu_kernel_vec(
it,
[negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t {
if (is_result) {
return float(b) <= float(0) ? float(a) * negiptcoef * (float(b) + negcoef) : float(a) * poscoef;
} else {
return float(b) <= float(0) ? float(a) * negiptcoef * negcoef * std::exp(float(b) * negiptcoef): float(a) * poscoef;
}
},
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, res0, res1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
Vectorized<float> b0, b1;
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
auto cmp0 = (b0 > zero_vec);
auto cmp1 = (b1 > zero_vec);
auto get_res_masked = [&](Vectorized<float>& cmp, Vectorized<float>& a, Vectorized<float>& b) {
if (is_result) {
return !cmp.zero_mask() ? a * poscoef_vec :
Vectorized<float>::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp);
} else {
return Vectorized<float>::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp);
}
};
res0 = get_res_masked(cmp0, a0, b0);
res1 = get_res_masked(cmp1, a1, b1);
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_backward_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto negcoef = alpha.to<scalar_t>() * scale.to<scalar_t>();
auto poscoef = scale.to<scalar_t>();
auto negiptcoef = input_scale.to<scalar_t>();
const Vec negcoef_vec(negcoef);
const Vec negiptcoef_vec(negiptcoef);
const Vec poscoef_vec(poscoef);
const Vec zero_vec(static_cast<scalar_t>(0));
cpu_kernel_vec(
it,
[negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t {
if (is_result) {
return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef;
} else {
return b <= scalar_t(0) ? a * negiptcoef * negcoef * std::exp(b * negiptcoef): a * poscoef;
}
},
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vec a, Vec b) -> Vec {
auto cmp = (b > zero_vec);
if (is_result) {
if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed
return a * poscoef_vec;
} else {
return Vec::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp);
}
} else {
return Vec::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp);
}
}
);
});
}
}
// TODO(yangxm): Add another fast kernel using formula
// y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
// and the fast tanh impl from Eigen.
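// GELU forward. GeluType::Tanh uses the approximation above, with
// kBeta = sqrt(2 / pi) and kKappa = 0.044715; otherwise the exact form
// x * 0.5 * (1 + erf(x / sqrt(2))) is used.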
void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) {
auto grain_size = at::internal::GRAIN_SIZE;
// Numbers based on benchmarking.
// Benchmark: benchmarks/operator_benchmarks/pt/gelu_test.py
#ifdef C10_MOBILE
// Benchmarked on S8 US phone.
// Internal benchmarking that converts operator benchmark into
// a torchscript module and run that on mobile.
// Same benchmark as server side.
constexpr int64_t GELU_MIN_ELEMENTS_FOR_MULTI_THREADING{6144};
#else
// Benchmarked on i9 8 core 16 thread machine.
// 1 thread: cd benchmark/operator_benchmarks;
// python -m pt.gelu_test --tag_filter long --omp_num_threads 1
// 2 threads: cd benchmark/operator_benchmarks;
// python -m pt.gelu_test --tag_filter long --omp_num_threads 2
constexpr int64_t GELU_MIN_ELEMENTS_FOR_MULTI_THREADING{16384};
#endif
if (it.numel() > GELU_MIN_ELEMENTS_FOR_MULTI_THREADING) {
grain_size = it.numel() / at::get_num_threads();
}
if (approximate == GeluType::Tanh) {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "GeluKernelImpl", [&]() {
auto kBetaVec = Vectorized<float>((float)(M_SQRT2 * M_2_SQRTPI * 0.5));
auto kKappaVec = Vectorized<float>((float)(0.044715));
auto kOneVec = Vectorized<float>((float)(1));
auto kPointFiveVec = Vectorized<float>((float)(0.5));
cpu_kernel_vec(
it,
[](scalar_t x) -> scalar_t {
const float kBeta = float(M_SQRT2 * M_2_SQRTPI * 0.5);
const float kKappa = float(0.044715);
float x_cube = float(x) * float(x) * float(x);
float inner = kBeta * (float(x) + kKappa * x_cube);
return float(0.5) * float(x) * (float(1) + std::tanh(inner));
},
[&](Vectorized<scalar_t> x) -> Vectorized<scalar_t> {
Vectorized<float> x0, x1;
std::tie(x0, x1) = convert_to_float<scalar_t>(x);
auto x0_cube = x0 * x0 * x0;
auto x1_cube = x1 * x1 * x1;
auto inner_vec0 = kBetaVec * (x0 + kKappaVec * x0_cube);
auto inner_vec1 = kBetaVec * (x1 + kKappaVec * x1_cube);
auto res0 = kPointFiveVec * x0 * (kOneVec + inner_vec0.tanh());
auto res1 = kPointFiveVec * x1 * (kOneVec + inner_vec1.tanh());
return convert_from_float<scalar_t>(res0, res1);
},
grain_size);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
it.dtype(), "GeluKernelImpl", [&]() {
using Vec = vec::Vectorized<scalar_t>;
const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5));
const Vec kKappaVec(scalar_t(0.044715));
const Vec kOneVec(scalar_t(1));
const Vec kPointFiveVec(scalar_t(0.5));
cpu_kernel_vec(
it,
[](scalar_t x) {
const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
const scalar_t kKappa = 0.044715;
auto x_cube = x * x * x;
auto inner = kBeta * (x + kKappa * x_cube);
return scalar_t(0.5) * x * (scalar_t(1) + std::tanh(inner));
},
[&](Vec x_vec) {
auto x_cube = x_vec * x_vec * x_vec;
auto inner_vec = kBetaVec * (x_vec + kKappaVec * x_cube);
return kPointFiveVec * x_vec * (kOneVec + inner_vec.tanh());
},
grain_size);
});
}
} else {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.dtype(), "GeluKernelImpl", [&]() {
auto kAlphaVec = Vectorized<float>((float)(M_SQRT1_2));
auto kOneVec = Vectorized<float>((float)(1));
auto kPointFiveVec = Vectorized<float>((float)(0.5));
cpu_kernel_vec(
it,
[](scalar_t x) -> scalar_t {
const float kAlpha = float(M_SQRT1_2);
return float(x) * float(0.5) * (float(1) + std::erf(float(x) * kAlpha));
},
[&](Vectorized<scalar_t> x) -> Vectorized<scalar_t> {
Vectorized<float> x0, x1;
std::tie(x0, x1) = convert_to_float<scalar_t>(x);
auto res0 = x0 * kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf());
auto res1 = x1 * kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf());
return convert_from_float<scalar_t>(res0, res1);
},
grain_size);
});
} else {
AT_DISPATCH_FLOATING_TYPES(
it.dtype(), "GeluKernelImpl", [&]() {
using Vec = vec::Vectorized<scalar_t>;
const Vec kAlphaVec(scalar_t(M_SQRT1_2));
const Vec kOneVec(scalar_t(1));
const Vec kPointFiveVec(scalar_t(0.5));
cpu_kernel_vec(
it,
[](scalar_t x) {
const scalar_t kAlpha = scalar_t(M_SQRT1_2);
return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha));
},
[&](Vec x_vec) {
return x_vec * kPointFiveVec *
(kOneVec + (x_vec * kAlphaVec).erf());
},
grain_size);
});
}
}
}
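// GELU backward. For the erf form: grad_input = dy * (cdf + x * pdf), with
// cdf = 0.5 * (1 + erf(x / sqrt(2))) and pdf = exp(-0.5 * x^2) / sqrt(2 * pi).
// For the tanh approximation the derivative is expanded below via the product and chain rules.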
void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) {
if (approximate == GeluType::Tanh) {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "GeluBackwardKernelImpl", [&]() {
auto kBetaVec = Vectorized<float>((float)(M_SQRT2 * M_2_SQRTPI * 0.5));
auto kKappaVec = Vectorized<float>((float)(0.044715));
auto kOneVec = Vectorized<float>((float)(1));
auto kThreeVec = Vectorized<float>((float)(3));
auto kPointFiveVec = Vectorized<float>((float)(0.5));
cpu_kernel_vec(
it,
[](scalar_t dy, scalar_t x) -> scalar_t {
const float kBeta = float(M_SQRT2 * M_2_SQRTPI * 0.5);
const float kKappa = float(0.044715);
float x_sq = float(x) * float(x);
float x_cube = x_sq * float(x);
float inner = kBeta * (float(x) + kKappa * x_cube);
float tanh_inner = float(std::tanh(inner));
float left = float(0.5) * float(x);
float right = float(1) + tanh_inner;
float left_derivative = float(0.5) * right;
float tanh_derivative = float(1) - tanh_inner * tanh_inner;
float inner_derivative =
kBeta * (float(1) + float(3) * kKappa * x_sq);
float right_derivative = left * tanh_derivative * inner_derivative;
return float(dy) * (left_derivative + right_derivative);
},
[&](Vectorized<scalar_t> dy_vec, Vectorized<scalar_t> x_vec) -> Vectorized<scalar_t> {
Vectorized<float> x0_vec, x1_vec;
std::tie(x0_vec, x1_vec) = convert_to_float<scalar_t>(x_vec);
Vectorized<float> dy0_vec, dy1_vec;
std::tie(dy0_vec, dy1_vec) = convert_to_float<scalar_t>(dy_vec);
auto x0_sq = x0_vec * x0_vec;
auto x1_sq = x1_vec * x1_vec;
auto x0_cube = x0_vec * x0_vec * x0_vec;
auto x1_cube = x1_vec * x1_vec * x1_vec;
auto inner_vec0 = kBetaVec * (x0_vec + kKappaVec * x0_cube);
auto inner_vec1 = kBetaVec * (x1_vec + kKappaVec * x1_cube);
auto tanh_inner_vec0 = inner_vec0.tanh();
auto tanh_inner_vec1 = inner_vec1.tanh();
auto left_vec0 = kPointFiveVec * x0_vec;
auto left_vec1 = kPointFiveVec * x1_vec;
auto right_vec0 = kOneVec + tanh_inner_vec0;
auto right_vec1 = kOneVec + tanh_inner_vec1;
auto left_derivative_vec0 = kPointFiveVec * right_vec0;
auto left_derivative_vec1 = kPointFiveVec * right_vec1;
auto tanh_derivative_vec0 = kOneVec - tanh_inner_vec0 * tanh_inner_vec0;
auto tanh_derivative_vec1 = kOneVec - tanh_inner_vec1 * tanh_inner_vec1;
auto inner_derivative_vec0 = kBetaVec * (kOneVec + kThreeVec * kKappaVec * x0_sq);
auto inner_derivative_vec1 = kBetaVec * (kOneVec + kThreeVec * kKappaVec * x1_sq);
auto right_derivative_vec0 = left_vec0 * tanh_derivative_vec0 * inner_derivative_vec0;
auto right_derivative_vec1 = left_vec1 * tanh_derivative_vec1 * inner_derivative_vec1;
auto res0 = dy0_vec * (left_derivative_vec0 + right_derivative_vec0);
auto res1 = dy1_vec * (left_derivative_vec1 + right_derivative_vec1);
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(
it.dtype(), "GeluBackwardKernelImpl", [&]() {
using Vec = vec::Vectorized<scalar_t>;
const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5));
const Vec kKappaVec(scalar_t(0.044715));
const Vec kOneVec(scalar_t(1));
const Vec kThreeVec(scalar_t(3));
const Vec kPointFiveVec(scalar_t(0.5));
cpu_kernel_vec(
it,
[](scalar_t dy, scalar_t x) {
const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
const scalar_t kKappa = 0.044715;
auto x_sq = x * x;
auto x_cube = x_sq * x;
auto inner = kBeta * (x + kKappa * x_cube);
auto tanh_inner = std::tanh(inner);
auto left = scalar_t(0.5) * x;
auto right = scalar_t(1) + tanh_inner;
auto left_derivative = scalar_t(0.5) * right;
auto tanh_derivative = scalar_t(1) - tanh_inner * tanh_inner;
auto inner_derivative =
kBeta * (scalar_t(1) + scalar_t(3) * kKappa * x_sq);
auto right_derivative = left * tanh_derivative * inner_derivative;
return dy * (left_derivative + right_derivative);
},
[&](Vec dy_vec, Vec x_vec) {
auto x_sq = x_vec * x_vec;
auto x_cube = x_vec * x_vec * x_vec;
auto inner_vec =
kBetaVec * (x_vec + kKappaVec * x_cube);
auto tanh_inner_vec = inner_vec.tanh();
auto left_vec = kPointFiveVec * x_vec;
auto right_vec = kOneVec + tanh_inner_vec;
auto left_derivative_vec = kPointFiveVec * right_vec;
auto tanh_derivative_vec =
kOneVec - tanh_inner_vec * tanh_inner_vec;
auto inner_derivative_vec =
kBetaVec * (kOneVec + kThreeVec * kKappaVec * x_sq);
auto right_derivative_vec =
left_vec * tanh_derivative_vec * inner_derivative_vec;
return dy_vec * (left_derivative_vec + right_derivative_vec);
});
});
}
} else {
if (at::isReducedFloatingType(it.common_dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "GeluBackwardKernelImpl", [&]() {
auto kAlphaVec = Vectorized<float>((float)(M_SQRT1_2));
auto kBetaVec = Vectorized<float>((float)(M_2_SQRTPI * M_SQRT1_2 * 0.5));
auto kOneVec = Vectorized<float>((float)(1));
auto kPointFiveVec = Vectorized<float>((float)(0.5));
auto kMinusPointFiveVec = Vectorized<float>((float)(-0.5));
cpu_kernel_vec(
it,
[](scalar_t dy, scalar_t x) -> scalar_t {
const float kAlpha = float(M_SQRT1_2);
const float kBeta = float(M_2_SQRTPI) * float(M_SQRT1_2) * float(0.5);
const float cdf =
float(0.5) * (float(1) + std::erf(float(x) * kAlpha));
const float pdf = kBeta * std::exp(float(x) * float(x) * float(-0.5));
return float(dy) * (cdf + float(x) * pdf);
},
[&](Vectorized<scalar_t> dy, Vectorized<scalar_t> x) -> Vectorized<scalar_t> {
Vectorized<float> x0, x1;
std::tie(x0, x1) = convert_to_float<scalar_t>(x);
Vectorized<float> dy0, dy1;
std::tie(dy0, dy1) = convert_to_float<scalar_t>(dy);
auto cdf_vec0 = kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf());
auto cdf_vec1 = kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf());
auto pdf_vec0 = kBetaVec * (x0 * x0 * kMinusPointFiveVec).exp();
auto pdf_vec1 = kBetaVec * (x1 * x1 * kMinusPointFiveVec).exp();
auto res0 = dy0 * (cdf_vec0 + x0 * pdf_vec0);
auto res1 = dy1 * (cdf_vec1 + x1 * pdf_vec1);
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(
it.dtype(), "GeluBackwardKernelImpl", [&]() {
using Vec = vec::Vectorized<scalar_t>;
const Vec kAlphaVec(scalar_t(M_SQRT1_2));
const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5));
const Vec kOneVec(scalar_t(1));
const Vec kPointFiveVec(scalar_t(0.5));
const Vec kMinusPointFiveVec(scalar_t(-0.5));
cpu_kernel_vec(
it,
[](scalar_t dy, scalar_t x) {
const scalar_t kAlpha = scalar_t(M_SQRT1_2);
const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5);
const scalar_t cdf =
scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha));
const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5));
return dy * (cdf + x * pdf);
},
[&](Vec dy_vec, Vec x_vec) {
const Vec cdf_vec =
kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf());
const Vec pdf_vec =
kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp();
return dy_vec * (cdf_vec + x_vec * pdf_vec);
});
});
}
}
}
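// hardsigmoid: out = clamp(x + 3, 0, 6) / 6.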
void hardsigmoid_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&]() {
const float zero(0.0f);
const float three(3.0f);
const float six(6.0f);
using Vec = vec::Vectorized<float>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kSixVec(six);
cpu_kernel_vec(
iter,
[&](scalar_t self_val) -> scalar_t {
return std::min(std::max(float(self_val) + three, zero), six) / six;
},
[&](vec::Vectorized<scalar_t> self_val) -> vec::Vectorized<scalar_t> {
Vectorized<float> self_val0, self_val1;
std::tie(self_val0, self_val1) = convert_to_float<scalar_t>(self_val);
self_val0 = minimum(
maximum(self_val0 + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
self_val1 = minimum(
maximum(self_val1 + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
return convert_from_float<scalar_t>(self_val0, self_val1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&] {
const scalar_t zero(0.0f);
const scalar_t three(3.0f);
const scalar_t six(6.0f);
using Vec = vec::Vectorized<scalar_t>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kSixVec(six);
cpu_kernel_vec(
iter,
[&](scalar_t self_val) {
return std::min(std::max(self_val + three, zero), six) / six;
},
[&](Vec self_val) {
return vec::minimum(
vec::maximum(self_val + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
});
});
}
}
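// hardsigmoid backward: grad_input = grad_output / 6 for -3 < x < 3, 0 otherwise.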
void hardsigmoid_backward_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.common_dtype(), "hardsigmoid_backward", [&]() {
const float zero(0.0f);
const float three(3.0f);
const float neg_three(-3.0f);
const float one_sixth(1.0f / 6.0f);
using Vec = Vectorized<float>;
Vec kZeroVec(0.0f);
Vec kOneSixthVec(1.0f / 6.0f);
cpu_kernel_vec(
iter,
[=](scalar_t grad_val, scalar_t self_val) -> scalar_t {
return (float(self_val) > neg_three && float(self_val) < three)
? float(grad_val) * one_sixth
: zero;
},
[=](Vectorized<scalar_t> grad_val, Vectorized<scalar_t> self_val) -> Vectorized<scalar_t> {
Vec self_val0, self_val1, grad_val0, grad_val1;
std::tie(self_val0, self_val1) = convert_to_float<scalar_t>(self_val);
std::tie(grad_val0, grad_val1) = convert_to_float<scalar_t>(grad_val);
Vec gradNonZeroMask = (self_val0 > neg_three) & (self_val0 < three);
self_val0 = Vec::blendv(kZeroVec, grad_val0 * kOneSixthVec, gradNonZeroMask);
gradNonZeroMask = (self_val1 > neg_three) & (self_val1 < three);
self_val1 = Vec::blendv(kZeroVec, grad_val1 * kOneSixthVec, gradNonZeroMask);
return convert_from_float<scalar_t>(self_val0, self_val1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_backward", [&] {
const scalar_t zero(0.0f);
const scalar_t three(3.0f);
const scalar_t neg_three(-3.0f);
const scalar_t one_sixth(1.0f / 6.0f);
using Vec = Vectorized<scalar_t>;
Vec kZeroVec(0.0f);
Vec kOneSixthVec(1.0f / 6.0f);
cpu_kernel_vec(
iter,
[=](scalar_t grad_val, scalar_t self_val) {
return (self_val > neg_three && self_val < three)
? grad_val * one_sixth
: zero;
},
[=](Vec grad_val, Vec self_val) {
Vec gradNonZeroMask = (self_val > neg_three) & (self_val < three);
return Vec::blendv(kZeroVec, grad_val * kOneSixthVec, gradNonZeroMask);
});
});
}
}
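// hardshrink: out = 0 for |x| <= lambd, x otherwise.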
void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "hardshrink_cpu", [&] {
auto lambd_val = lambd.to<scalar_t>();
using Vec = Vectorized<scalar_t>;
cpu_kernel_vec(
iter,
[=](scalar_t self_val) {
return (self_val >= -lambd_val && self_val <= lambd_val) ? scalar_t(0)
: self_val;
},
[=](Vec self_val) {
return Vec::blendv(self_val, Vec(0), (self_val >= -lambd_val) & (self_val <= lambd_val));
});
});
}
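// softshrink: out = x - lambd for x > lambd, x + lambd for x < -lambd, 0 otherwise.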
void softshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.common_dtype(), "softshrink_cpu", [&]() {
auto lambd_val = lambd.to<float>();
auto lambdVec = Vectorized<float>(lambd_val);
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t {
return float(a) > lambd_val ? a - lambd_val : (float(a) < -lambd_val ? a + lambd_val : float(0));
},
[=](Vectorized<scalar_t> self_val) -> Vectorized<scalar_t> {
Vectorized<float> self_val0, self_val1;
Vectorized<scalar_t> self_val_t0, self_val_t1;
std::tie(self_val0, self_val1) = convert_to_float<scalar_t>(self_val);
self_val_t0 = convert_from_float<scalar_t>((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec));
self_val_t1 = convert_from_float<scalar_t>((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec));
return (self_val_t0 | self_val_t1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softshrink_cpu", [&]() {
auto lambd_val = lambd.to<scalar_t>();
auto lambdVec = Vectorized<scalar_t>(lambd_val);
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t {
return a > lambd_val ? a - lambd_val : (a < -lambd_val ? a + lambd_val : scalar_t(0));
},
[=](Vectorized<scalar_t> self_val) -> Vectorized<scalar_t> {
Vectorized<scalar_t> self_val_t0, self_val_t1;
self_val_t0 = (self_val > lambdVec) & (self_val - lambdVec);
self_val_t1 = (self_val < -lambd_val) & (self_val + lambdVec);
return (self_val_t0 | self_val_t1);
});
});
}
}
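// shrink backward (shared by hardshrink/softshrink): grad_input = grad_output
// where |x| > lambd, 0 otherwise.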
void shrink_backward_kernel(TensorIteratorBase& iter, const Scalar& lambd) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "shrink_backward_cpu", [&] {
auto lambd_val = lambd.to<scalar_t>();
cpu_kernel_vec(
iter,
[=](scalar_t grad_val, scalar_t self_val) {
return (self_val >= -lambd_val && self_val <= lambd_val) ? scalar_t(0)
: grad_val;
},
[=](Vectorized<scalar_t> grad_val, Vectorized<scalar_t> self_val) {
return ((self_val < -lambd_val) | (self_val > lambd_val)) & grad_val;
});
});
}
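// hardtanh backward: grad_input = grad_output where min_val < x < max_val, 0 otherwise.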
void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Scalar& max) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "hardshrink_backward_cpu", [&]() {
auto min_val = min.to<float>();
auto max_val = max.to<float>();
cpu_kernel_vec(
iter,
[=](scalar_t grad_val, scalar_t self_val) -> scalar_t {
return (float(self_val) <= min_val || float(self_val) >= max_val) ? scalar_t(0) : grad_val;
},
[=](Vectorized<scalar_t> grad_val, Vectorized<scalar_t> self_val) -> Vectorized<scalar_t> {
Vectorized<float> grad_val0, grad_val1, self_val0, self_val1;
std::tie(grad_val0, grad_val1) = convert_to_float<scalar_t>(grad_val);
std::tie(self_val0, self_val1) = convert_to_float<scalar_t>(self_val);
return convert_from_float<scalar_t>(
((self_val0 > min_val) & (self_val0 < max_val)) & grad_val0,
((self_val1 > min_val) & (self_val1 < max_val)) & grad_val1
);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardshrink_backward_cpu", [&] {
auto min_val = min.to<scalar_t>();
auto max_val = max.to<scalar_t>();
cpu_kernel_vec(
iter,
[=](scalar_t grad_val, scalar_t self_val) {
return (self_val <= min_val || self_val >= max_val) ? scalar_t(0) : grad_val;
},
[=](Vectorized<scalar_t> grad_val, Vectorized<scalar_t> self_val) {
return ((self_val > min_val) & (self_val < max_val)) & grad_val;
});
});
}
}
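// hardswish: out = x * clamp(x + 3, 0, 6) / 6.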
void hardswish_kernel(TensorIterator& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() {
const float zero(0.0f);
const float three(3.0f);
const float six(6.0f);
using Vec = vec::Vectorized<float>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kSixVec(six);
cpu_kernel_vec(
iter,
[&](scalar_t x) -> scalar_t {
return float(x) * std::min(std::max(float(x) + three, zero), six) / six;
},
[&](vec::Vectorized<scalar_t> x_vec) {
Vectorized<float> x_vec0, x_vec1;
std::tie(x_vec0, x_vec1) = convert_to_float<scalar_t>(x_vec);
x_vec0 = x_vec0 * minimum(
maximum(x_vec0 + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
x_vec1 = x_vec1 * minimum(
maximum(x_vec1 + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
return convert_from_float<scalar_t>(x_vec0, x_vec1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() {
const scalar_t zero(0.0f);
const scalar_t three(3.0f);
const scalar_t six(6.0f);
using Vec = vec::Vectorized<scalar_t>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kSixVec(six);
cpu_kernel_vec(
iter,
[&](scalar_t x) {
return x * std::min(std::max(x + three, zero), six) / six;
},
[&](Vec x_vec) {
return x_vec * vec::minimum(
vec::maximum(x_vec + kThreeVec, kZeroVec),
kSixVec
) / kSixVec;
}
);
});
}
}
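// hardswish backward: grad_input = 0 for x < -3,
// grad_output * (x / 3 + 0.5) for -3 <= x <= 3, grad_output for x > 3.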
void hardswish_backward_kernel(TensorIterator& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() {
const float zero(0.0f);
const float three(3.0f);
const float neg_three(-3.0f);
const float one_half(0.5f);
using Vec = vec::Vectorized<float>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kNegThreeVec(neg_three);
const Vec kOneHalfVec(one_half);
cpu_kernel_vec(
iter,
[&](scalar_t grad_val, scalar_t self_val) -> scalar_t {
if (float(self_val) < neg_three) {
return zero;
} else if (float(self_val) <= three) {
return float(grad_val) * ((float(self_val) / three) + one_half);
} else {
return grad_val;
}
},
[&](vec::Vectorized<scalar_t> grad_val, vec::Vectorized<scalar_t> self_val) {
Vectorized<float> self_val0, self_val1, grad_val0, grad_val1;
std::tie(self_val0, self_val1) = convert_to_float<scalar_t>(self_val);
std::tie(grad_val0, grad_val1) = convert_to_float<scalar_t>(grad_val);
self_val0 = Vec::blendv(
Vec::blendv(
grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec),
grad_val0,
self_val0 >= kThreeVec
),
kZeroVec,
self_val0 < kNegThreeVec
);
self_val1 = Vec::blendv(
Vec::blendv(
grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec),
grad_val1,
self_val1 >= kThreeVec
),
kZeroVec,
self_val1 < kNegThreeVec
);
return convert_from_float<scalar_t>(self_val0, self_val1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() {
const scalar_t zero(0.0f);
const scalar_t three(3.0f);
const scalar_t neg_three(-3.0f);
const scalar_t one_half(0.5f);
using Vec = vec::Vectorized<scalar_t>;
const Vec kZeroVec(zero);
const Vec kThreeVec(three);
const Vec kNegThreeVec(neg_three);
const Vec kOneHalfVec(one_half);
cpu_kernel_vec(
iter,
[&](scalar_t grad_val, scalar_t self_val) {
if (self_val < neg_three) {
return zero;
} else if (self_val <= three) {
return grad_val * ((self_val / three) + one_half);
} else {
return grad_val;
}
},
[&](Vec grad_val, Vec self_val) {
return Vec::blendv(
Vec::blendv(
grad_val * ((self_val / kThreeVec) + kOneHalfVec),
grad_val,
self_val >= kThreeVec
),
kZeroVec,
self_val < kNegThreeVec
);
}
);
});
}
}
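// leaky_relu: out = x for x > 0, x * negval otherwise.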
static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&]() {
auto zero_vec = Vectorized<float>((float)(0));
auto one_vec = Vectorized<float>((float)(1));
float negval = negval_.to<float>();
Vectorized<float> negval_v = Vectorized<float>(negval);
cpu_kernel_vec(
iter,
[&](scalar_t a) -> scalar_t {
return float(a) > float(0) ? float(a) : float(a) * negval;
},
[&](Vectorized<scalar_t> a) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
auto res0 = a0 * (Vectorized<float>::blendv(negval_v, one_vec, a0 > zero_vec));
auto res1 = a1 * (Vectorized<float>::blendv(negval_v, one_vec, a1 > zero_vec));
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&] {
using Vec = Vectorized<scalar_t>;
auto zero_vec = Vec((scalar_t)(0));
auto one_vec = Vec((scalar_t)(1));
scalar_t negval = negval_.to<scalar_t>();
Vec negval_v = Vec(negval);
cpu_kernel_vec(
iter,
[&](scalar_t a) -> scalar_t {
return a > scalar_t(0) ? a : a * negval;
},
[&](Vec a) -> Vec {
auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
return a * r;
});
});
}
}
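// leaky_relu backward: with b = grad_output, grad_input = b if a > 0, b * negval otherwise.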
static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&]() {
auto zero_vec = Vectorized<float>((float)(0));
auto one_vec = Vectorized<float>((float)(1));
float negval = negval_.to<float>();
Vectorized<float> negval_v = Vectorized<float>(negval);
cpu_kernel_vec(
iter,
[&](scalar_t a, scalar_t b) -> scalar_t {
return float(a) > float(0) ? float(b) : float(b) * negval;
},
[&](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, b0, b1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
auto res0 = b0 * (Vectorized<float>::blendv(negval_v, one_vec, a0 > zero_vec));
auto res1 = b1 * (Vectorized<float>::blendv(negval_v, one_vec, a1 > zero_vec));
return convert_from_float<scalar_t>(res0, res1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", [&] {
using Vec = Vectorized<scalar_t>;
auto zero_vec = Vec((scalar_t)(0));
auto one_vec = Vec((scalar_t)(1));
scalar_t negval = negval_.to<scalar_t>();
Vec negval_v = Vec(negval);
cpu_kernel_vec(
iter,
[&](scalar_t a, scalar_t b) -> scalar_t {
return a > scalar_t(0) ? b : b * negval;
},
[&](Vec a, Vec b) -> Vec {
auto r = Vec::blendv(negval_v, one_vec, a > zero_vec);
return b * r;
});
});
}
}
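// softplus: out = log1p(exp(beta * x)) / beta, falling back to the identity (out = x)
// once beta * x > threshold to avoid overflow.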
void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() {
using Vec = Vectorized<float>;
auto beta = beta_.to<float>();
auto threshold = threshold_.to<float>();
const Vec beta_vec(beta);
const Vec threshold_vec(threshold);
cpu_kernel_vec(
iter,
[beta, threshold](scalar_t a) -> scalar_t {
return (float(a) * beta) > threshold ? a
: static_cast<scalar_t>((std::log1p(std::exp(float(a) * beta))) / beta);
},
[beta_vec, threshold_vec](Vectorized<scalar_t> a) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
a0 = Vec::blendv((a0 * beta_vec).exp().log1p() / beta_vec, a0, (a0 * beta_vec) > threshold_vec);
a1 = Vec::blendv((a1 * beta_vec).exp().log1p() / beta_vec, a1, (a1 * beta_vec) > threshold_vec);
return convert_from_float<scalar_t>(a0, a1);
}
);
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto beta = beta_.to<scalar_t>();
auto threshold = threshold_.to<scalar_t>();
const Vec beta_vec(beta);
const Vec threshold_vec(threshold);
cpu_kernel_vec(
iter,
[beta, threshold](scalar_t a) -> scalar_t {
return (a * beta) > threshold ? a
: static_cast<scalar_t>(std::log1p(std::exp(a * beta))) / beta;
},
[beta_vec, threshold_vec](Vec a) -> Vec {
return Vec::blendv((a * beta_vec).exp().log1p() / beta_vec, a, (a * beta_vec) > threshold_vec);
}
);
});
}
}
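// softplus backward: with z = exp(beta * x), grad_input = grad_output * z / (z + 1),
// or grad_output unchanged once beta * x > threshold.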
void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() {
using Vec = Vectorized<float>;
auto beta = beta_.to<float>();
auto threshold = threshold_.to<float>();
const Vec beta_vec(beta);
const Vec threshold_vec(threshold);
const Vec one_vec(static_cast<float>(1.0));
cpu_kernel_vec(
iter,
[beta, threshold](scalar_t a, scalar_t b) -> scalar_t {
float z = std::exp(float(b) * beta);
return (float(b) * beta) > threshold ? a : static_cast<scalar_t>(float(a) * z / (z + float(1.)));
},
[beta_vec, one_vec, threshold_vec](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, b0, b1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
Vec z = (b0 * beta_vec).exp();
a0 = Vec::blendv(a0 * z / (z + one_vec), a0, (b0 * beta_vec) > threshold_vec);
z = (b1 * beta_vec).exp();
a1 = Vec::blendv(a1 * z / (z + one_vec), a1, (b1 * beta_vec) > threshold_vec);
return convert_from_float<scalar_t>(a0, a1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
auto beta = beta_.to<scalar_t>();
auto threshold = threshold_.to<scalar_t>();
const Vec beta_vec(beta);
const Vec threshold_vec(threshold);
const Vec one_vec(static_cast<scalar_t>(1.0));
cpu_kernel_vec(
iter,
[beta, threshold](scalar_t a, scalar_t b) -> scalar_t {
scalar_t z = std::exp(b * beta);
return (b * beta) > threshold ? a : a * z / (z + scalar_t(1.));
},
[beta_vec, one_vec, threshold_vec](Vec a, Vec b) -> Vec {
const Vec z = (b * beta_vec).exp();
return Vec::blendv(a * z / (z + one_vec), a, (b * beta_vec) > threshold_vec);
}
);
});
}
}
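// GLU: out = a * sigmoid(b), where a and b are the two halves of the split input.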
void glu_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&]() {
const float float_one_val(1);
const Vectorized<float> float_one_vec(float_one_val);
cpu_kernel_vec(
iter,
[float_one_val](scalar_t a, scalar_t b) -> scalar_t {
return float(a) * (float_one_val / (float_one_val + std::exp(- float(b))));
},
[float_one_vec](Vectorized<scalar_t> a, Vectorized<scalar_t> b) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, b0, b1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
return convert_from_float<scalar_t>(a0 * (float_one_vec / (float_one_vec + b0.neg().exp())),
a1 * (float_one_vec / (float_one_vec + b1.neg().exp())));
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] {
using Vec = Vectorized<scalar_t>;
const scalar_t one_val(1);
const Vec one_vec(one_val);
cpu_kernel_vec(
iter,
[one_val](scalar_t a, scalar_t b) -> scalar_t {
return a * (one_val / (one_val + std::exp(-b)));
},
[one_vec](Vec a, Vec b) -> Vec {
return a * (one_vec / (one_vec + b.neg().exp()));
}
);
});
}
}
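// GLU JVP (forward-mode derivative): with s = sigmoid(b) and res = a * s,
// out = da * s + res * db * (1 - s).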
void glu_jvp_kernel(TensorIteratorBase& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_jvp_cpu", [&] {
using Vec = Vectorized<scalar_t>;
const scalar_t one(1);
const Vec ones(one);
cpu_kernel_vec(
iter,
[one](scalar_t res, scalar_t b, scalar_t da, scalar_t db) -> scalar_t {
const auto sig_b = one / (one + std::exp(-b));
return da * sig_b + res * (db - sig_b * db);
},
[ones](Vec res, Vec b, Vec da, Vec db) -> Vec {
const auto sig_b = ones / (ones + b.neg().exp());
return da * sig_b + res * (db - sig_b * db);
}
);
});
}
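// GLU backward helper: computes (1 - a) * a * b * c elementwise,
// i.e. the sigmoid derivative of the gate times the remaining factors.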
void glu_backward_kernel(TensorIterator& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&]() {
const float float_one_val(1);
const Vectorized<float> float_one_vec(float_one_val);
cpu_kernel_vec(
iter,
[float_one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
return (float_one_val - float(a)) * float(a) * float(b) * float(c);
},
[float_one_vec](Vectorized<scalar_t> a, Vectorized<scalar_t> b, Vectorized<scalar_t> c) -> Vectorized<scalar_t> {
Vectorized<float> a0, a1, b0, b1, c0, c1;
std::tie(a0, a1) = convert_to_float<scalar_t>(a);
std::tie(b0, b1) = convert_to_float<scalar_t>(b);
std::tie(c0, c1) = convert_to_float<scalar_t>(c);
a0 = (float_one_vec - a0) * a0 * b0 * c0;
a1 = (float_one_vec - a1) * a1 * b1 * c1;
return convert_from_float<scalar_t>(a0, a1);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&] {
using Vec = Vectorized<scalar_t>;
const scalar_t one_val(1);
const Vec one_vec(one_val);
cpu_kernel_vec(
iter,
[one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
return (one_val - a) * a * b * c;
},
[one_vec](Vec a, Vec b, Vec c) -> Vec {
return (one_vec - a) * a * b * c;
}
);
});
}
}
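// SiLU (swish): out = x * sigmoid(x) = x / (1 + exp(-x)).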
void silu_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "silu_cpu", [&]() {
const Vectorized<float> kOneVec(1.0f);
cpu_kernel_vec(
iter,
[](scalar_t x) -> scalar_t {
return float(x) / (1.0f + std::exp(-float(x)));
},
[kOneVec](Vectorized<scalar_t> x_vec) -> Vectorized<scalar_t> {
Vectorized<float> x_vec0, x_vec1;
std::tie(x_vec0, x_vec1) = convert_to_float<scalar_t>(x_vec);
return convert_from_float<scalar_t>(
x_vec0 / (kOneVec + x_vec0.neg().exp()),
x_vec1 / (kOneVec + x_vec1.neg().exp()));
});
});
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
iter.dtype(), "silu_cpu", [&]() {
const Vectorized<scalar_t> kOneVec(scalar_t(1));
cpu_kernel_vec(
iter,
[](scalar_t x) {
return x / (scalar_t(1) + std::exp(-x));
},
[kOneVec](Vectorized<scalar_t> x_vec) {
return x_vec / (kOneVec + x_vec.neg().exp());
});
});
}
}
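// SiLU backward: with s = sigmoid(x), grad_input = dy * s * (1 + x * (1 - s)).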
void silu_backward_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "silu_backward_cpu", [&]() {
const Vectorized<float> kOneVec(1.0f);
cpu_kernel_vec(
iter,
[](scalar_t dy, scalar_t x) -> scalar_t {
const float sigmoid =
1.0f / (1.0f + std::exp(-float(x)));
return dy * sigmoid * (1.0f + x * (1.0f - sigmoid));
},
[kOneVec](Vectorized<scalar_t> dy_vec, Vectorized<scalar_t> x_vec) -> Vectorized<scalar_t> {
Vectorized<float> x_vec0, x_vec1, dy_vec0, dy_vec1;
std::tie(x_vec0, x_vec1) = convert_to_float<scalar_t>(x_vec);
std::tie(dy_vec0, dy_vec1) = convert_to_float<scalar_t>(dy_vec);
const Vectorized<float> sigmoid0 =
kOneVec / (kOneVec + x_vec0.neg().exp());
const Vectorized<float> sigmoid1 =
kOneVec / (kOneVec + x_vec1.neg().exp());
return convert_from_float<scalar_t>(
dy_vec0 * sigmoid0 * (kOneVec + x_vec0 * (kOneVec - sigmoid0)),
dy_vec1 * sigmoid1 * (kOneVec + x_vec1 * (kOneVec - sigmoid1)));
});
});
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
iter.dtype(), "silu_backward_cpu", [&]() {
const Vectorized<scalar_t> kOneVec(scalar_t(1));
cpu_kernel_vec(
iter,
[](scalar_t dy, scalar_t x) {
const scalar_t sigmoid =
scalar_t(1) / (scalar_t(1) + std::exp(-x));
return dy * sigmoid * (scalar_t(1) + x * (scalar_t(1) - sigmoid));
},
[kOneVec](Vectorized<scalar_t> dy_vec, Vectorized<scalar_t> x_vec) {
const Vectorized<scalar_t> sigmoid =
kOneVec / (kOneVec + x_vec.neg().exp());
return dy_vec * sigmoid * (kOneVec + x_vec * (kOneVec - sigmoid));
});
});
}
}
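// Mish: out = x * tanh(softplus(x)) = x * tanh(log1p(exp(x))).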
void mish_kernel(TensorIteratorBase& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "mish_cpu", [&]() {
cpu_kernel_vec(
iter,
[](scalar_t x) -> scalar_t{
return static_cast<scalar_t>(float(x) * std::tanh(std::log1p(std::exp(float(x)))));
},
[](Vectorized<scalar_t> x_vec) -> Vectorized<scalar_t> {
Vectorized<float> x_vec0, x_vec1;
std::tie(x_vec0, x_vec1) = convert_to_float<scalar_t>(x_vec);
return convert_from_float<scalar_t>(
x_vec0 * x_vec0.exp().log1p().tanh(),
x_vec1 * x_vec1.exp().log1p().tanh()
);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
cpu_kernel_vec(
iter,
[](scalar_t x) -> scalar_t{
return static_cast<scalar_t>(x * std::tanh(std::log1p(std::exp(x))));
},
[](Vec x_vec) -> Vec {
return x_vec * x_vec.exp().log1p().tanh();
});
});
}
}
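// Mish backward: with s = sigmoid(x) and t = tanh(softplus(x)),
// grad_input = dy * (t + x * s * (1 - t^2)).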
void mish_backward_kernel(TensorIterator& iter) {
if (at::isReducedFloatingType(iter.dtype())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "mish_backward_cpu", [&]() {
using Vec = Vectorized<float>;
const Vec kOneVec(1.0f);
cpu_kernel_vec(
iter,
[](scalar_t dy, scalar_t x) -> scalar_t {
const float sigmoid =
1.0f / (1.0f + std::exp(-float(x)));
const float tanh_softplus = std::tanh(std::log1p(std::exp(float(x))));
return dy * (tanh_softplus + x * sigmoid * (1.0f - tanh_softplus * tanh_softplus));
},
[kOneVec](Vectorized<scalar_t> dy_vec, Vectorized<scalar_t> x_vec) -> Vectorized<scalar_t> {
Vectorized<float> x_vec0, x_vec1, dy_vec0, dy_vec1;
std::tie(x_vec0, x_vec1) = convert_to_float<scalar_t>(x_vec);
std::tie(dy_vec0, dy_vec1) = convert_to_float<scalar_t>(dy_vec);
const Vec sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp());
const Vec sigmoid1 = kOneVec / (kOneVec + x_vec1.neg().exp());
const Vec tanh_softplus0 = x_vec0.exp().log1p().tanh();
const Vec tanh_softplus1 = x_vec1.exp().log1p().tanh();
return convert_from_float<scalar_t>(
dy_vec0 * (tanh_softplus0 + x_vec0 * sigmoid0 * (kOneVec - tanh_softplus0 * tanh_softplus0)),
dy_vec1 * (tanh_softplus1 + x_vec1 * sigmoid1 * (kOneVec - tanh_softplus1 * tanh_softplus1))
);
});
});
} else {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_backward_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
const Vec kOneVec(scalar_t(1));
cpu_kernel_vec(
iter,
[](scalar_t dy, scalar_t x) -> scalar_t {
const scalar_t sigmoid =
scalar_t(1) / (scalar_t(1) + std::exp(-x));
const scalar_t tanh_softplus = std::tanh(std::log1p(std::exp(x)));
return dy * (tanh_softplus + x * sigmoid * (scalar_t(1) - tanh_softplus * tanh_softplus));
},
[kOneVec](Vec dy_vec, Vec x_vec) -> Vec {
const Vec sigmoid = kOneVec / (kOneVec + x_vec.neg().exp());
const Vec tanh_softplus = x_vec.exp().log1p().tanh();
return dy_vec * (tanh_softplus + x_vec * sigmoid * (kOneVec - tanh_softplus * tanh_softplus));
});
});
}
}
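// PReLU: out = input for input > 0, weight * input otherwise.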
void prelu_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "prelu_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
cpu_kernel_vec(
iter,
[](scalar_t input, scalar_t weight) {
return (input > scalar_t(0)) ? input : weight * input;
},
[](Vec input, Vec weight) {
return Vec::blendv(weight * input, input, input > Vec(0));
});
});
}
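// PReLU backward: grad_input = grad for input > 0, weight * grad otherwise;
// grad_weight = 0 for input > 0, input * grad otherwise.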
void prelu_backward_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "prelu_backward_cpu", [&]() {
cpu_kernel_multiple_outputs(iter,
[](scalar_t input, scalar_t weight, scalar_t grad) -> std::tuple<scalar_t, scalar_t> {
auto mask = input > scalar_t{0};
auto grad_input = mask ? grad : weight * grad;
auto grad_weight = mask ? scalar_t{0} : input * grad;
return {grad_input, grad_weight};
});
});
}
} // namespace
REGISTER_DISPATCH(hardsigmoid_stub, &hardsigmoid_kernel);
REGISTER_DISPATCH(hardsigmoid_backward_stub, &hardsigmoid_backward_kernel);
REGISTER_DISPATCH(threshold_stub, &threshold_kernel);
REGISTER_DISPATCH(leaky_relu_stub, &leaky_relu_kernel);
REGISTER_DISPATCH(leaky_relu_backward_stub, &leaky_relu_backward_kernel);
REGISTER_DISPATCH(prelu_stub, &prelu_kernel);
REGISTER_DISPATCH(prelu_backward_stub, &prelu_backward_kernel);
REGISTER_DISPATCH(hardtanh_backward_stub, &hardtanh_backward_kernel);
REGISTER_DISPATCH(hardshrink_stub, &hardshrink_kernel);
REGISTER_DISPATCH(softshrink_stub, &softshrink_kernel);
REGISTER_DISPATCH(shrink_backward_stub, &shrink_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(log_sigmoid_cpu_stub, &log_sigmoid_cpu_kernel);
ALSO_REGISTER_AVX512_DISPATCH(log_sigmoid_backward_stub, &log_sigmoid_backward_cpu_kernel);
ALSO_REGISTER_AVX512_DISPATCH(glu_stub, &glu_kernel);
ALSO_REGISTER_AVX512_DISPATCH(glu_backward_stub, &glu_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(glu_jvp_stub, &glu_jvp_kernel);
ALSO_REGISTER_AVX512_DISPATCH(elu_stub, &elu_kernel);
ALSO_REGISTER_AVX512_DISPATCH(elu_backward_stub, &elu_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(GeluKernel, &GeluKernelImpl);
ALSO_REGISTER_AVX512_DISPATCH(GeluBackwardKernel, &GeluBackwardKernelImpl);
ALSO_REGISTER_AVX512_DISPATCH(hardswish_stub, &hardswish_kernel);
ALSO_REGISTER_AVX512_DISPATCH(hardswish_backward_stub, &hardswish_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(softplus_stub, &softplus_kernel);
ALSO_REGISTER_AVX512_DISPATCH(softplus_backward_stub, &softplus_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(silu_stub, &silu_kernel);
ALSO_REGISTER_AVX512_DISPATCH(silu_backward_stub, &silu_backward_kernel);
ALSO_REGISTER_AVX512_DISPATCH(mish_stub, &mish_kernel);
ALSO_REGISTER_AVX512_DISPATCH(mish_backward_stub, &mish_backward_kernel);
} // namespace at::native