[Inductor] Use sleef implementation for CPP backend acosh codegen (#118350)
**Summary**
Fix https://github.com/pytorch/pytorch/issues/118267. The CPP backend currently generates `f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"` to compute `acosh`, which breaks when the input is a large negative value such as `-910685.8125`: due to floating-point cancellation, `(x*x - 1).sqrt()` rounds to `|x|`, so `x + (x*x - 1).sqrt()` evaluates to `0` and `0.log()` returns `-inf`. Per the documentation (https://pytorch.org/docs/stable/generated/torch.acosh.html), negative inputs should return `NaN`. This PR switches the codegen to the sleef `acosh` implementation to fix the issue.
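A minimal sketch of the cancellation, using the value from the linked issue:
```python
import torch

# Large negative input from the linked issue.
x = torch.tensor(-910685.8125)

# In float32, (x*x - 1).sqrt() rounds to |x|, so the sum with x cancels
# to 0 and the log-based formula produces -inf.
naive = (x + (x * x - 1).sqrt()).log()
print(naive)           # tensor(-inf)

# Eager acosh returns NaN for negative inputs, as documented.
print(torch.acosh(x))  # tensor(nan)
```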
**Test Plan**
```
python -u -m pytest -s -v test_cpu_repro.py -k test_acosh_with_negative_large_input
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118350
Approved by: https://github.com/jgong5, https://github.com/lezcano
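For reference, a minimal end-to-end repro sketch that mirrors the new test added below; the printed result assumes this fix is applied:
```python
import torch

@torch.compile
def fn(x):
    return torch.acosh(x)

x = torch.tensor([-910685.8125])
# Before this change the compiled CPP kernel returned -inf here; with the
# sleef implementation it matches eager and returns nan.
print(fn(x))
```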
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
index a54ec6f..3e26213 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -313,6 +313,9 @@
Vectorized<T> acos() const {
return map(Sleef_acosf8_u10);
}
+ Vectorized<T> acosh() const {
+ return map(Sleef_acoshf8_u10);
+ }
Vectorized<T> asin() const {
return map(Sleef_asinf8_u10);
}
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
index a28ef21..bc82d07 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
@@ -140,6 +140,9 @@
Vectorized<double> acos() const {
return Vectorized<double>(Sleef_acosd4_u10(values));
}
+ Vectorized<double> acosh() const {
+ return Vectorized<double>(Sleef_acoshd4_u10(values));
+ }
Vectorized<double> asin() const {
return Vectorized<double>(Sleef_asind4_u10(values));
}
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
index 672f9d7..886809a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
@@ -148,6 +148,9 @@
Vectorized<float> acos() const {
return Vectorized<float>(Sleef_acosf8_u10(values));
}
+ Vectorized<float> acosh() const {
+ return Vectorized<float>(Sleef_acoshf8_u10(values));
+ }
Vectorized<float> asin() const {
return Vectorized<float>(Sleef_asinf8_u10(values));
}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
index 89be526..fa90816 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
@@ -400,6 +400,9 @@
Vectorized<T> acos() const {
return map(Sleef_acosf16_u10);
}
+ Vectorized<T> acosh() const {
+ return map(Sleef_acoshf16_u10);
+ }
Vectorized<T> asin() const {
return map(Sleef_asinf16_u10);
}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
index 3a2d62f..27b2753 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
@@ -149,6 +149,9 @@
Vectorized<double> acos() const {
return Vectorized<double>(Sleef_acosd8_u10(values));
}
+ Vectorized<double> acosh() const {
+ return Vectorized<double>(Sleef_acoshd8_u10(values));
+ }
Vectorized<double> asin() const {
return Vectorized<double>(Sleef_asind8_u10(values));
}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
index 2ff2002..ba57386 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
@@ -168,6 +168,9 @@
Vectorized<float> acos() const {
return Vectorized<float>(Sleef_acosf16_u10(values));
}
+ Vectorized<float> acosh() const {
+ return Vectorized<float>(Sleef_acoshf16_u10(values));
+ }
Vectorized<float> asin() const {
return Vectorized<float>(Sleef_asinf16_u10(values));
}
diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index fca2b2c..5c7f7a1 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -365,6 +365,9 @@
Vectorized<T> acos() const {
return map(std::acos);
}
+ Vectorized<T> acosh() const {
+ return map(std::acosh);
+ }
Vectorized<T> asin() const {
return map(std::asin);
}
diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
index c60753a..1d024d3 100644
--- a/test/inductor/test_cpu_repro.py
+++ b/test/inductor/test_cpu_repro.py
@@ -597,6 +597,39 @@
(x,),
)
+ def test_acosh_with_negative_large_input(self):
+ # https://github.com/pytorch/pytorch/issues/118267.
+
+ def fn(input):
+ out = torch.acosh(input)
+ return out
+
+ x = torch.Tensor(
+ [
+ [
+ -8493.9854,
+ 431654.1250,
+ 71741.5859,
+ 608234.5000,
+ -103814.7500,
+ -699397.0000,
+ -910685.8125,
+ -832737.1875,
+ 875343.5000,
+ ]
+ ]
+ ).repeat(3, 9)
+
+ for dtype in [torch.float32, torch.bfloat16, torch.double]:
+ with torch.no_grad():
+ torch._dynamo.reset()
+ metrics.reset()
+ _x = x.to(dtype)
+ self.common(
+ fn,
+ (_x,),
+ )
+
@config.patch(implicit_fallbacks=True)
def test_repeat_interleave(self):
def fn(y):
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 9d27d2b..e968ef2 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1254,9 +1254,7 @@
@staticmethod
def acosh(x):
- # For real x, acosh(x) = log(x + sqrt(x**2 -1))
- vec_one = f"decltype({x})(1)"
- return f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"
+ return f"{x}.acosh()"
@staticmethod
def relu(x):