[Inductor] Use sleef implementation for CPP backend acosh codegen (#118350)

**Summary**
Fixes https://github.com/pytorch/pytorch/issues/118267. The CPP backend currently uses `f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"` to compute `acosh`. This breaks when the input is a large negative value such as `-910685.8125`: in that case `(x*x - 1).sqrt() + x` cancels to 0, and `0.log()` returns `-inf`. However, per the documentation (https://pytorch.org/docs/stable/generated/torch.acosh.html), negative inputs should return `NaN`. Switching to the SLEEF `acosh` implementation fixes this issue.
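
The cancellation can be reproduced directly in eager mode with the value from the issue; a minimal float32 sketch of the old formula versus `torch.acosh`:

```
import torch

x = torch.tensor([-910685.8125], dtype=torch.float32)

# Old codegen formula: acosh(x) = log(x + sqrt(x*x - 1)).
# In float32, sqrt(x*x - 1) rounds back to |x|, so the sum
# cancels to 0.0 and log(0.0) yields -inf.
print((x + (x * x - 1).sqrt()).log())  # tensor([-inf])

# torch.acosh follows the documented contract: values outside
# the domain [1, inf) map to NaN.
print(torch.acosh(x))  # tensor([nan])
```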

**Test Plan**
```
python -u -m pytest -s -v test_cpu_repro.py -k test_acosh_with_negative_large_input
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/118350
Approved by: https://github.com/jgong5, https://github.com/lezcano
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
index a54ec6f..3e26213 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -313,6 +313,9 @@
   Vectorized<T> acos() const {
     return map(Sleef_acosf8_u10);
   }
+  Vectorized<T> acosh() const {
+    return map(Sleef_acoshf8_u10);
+  }
   Vectorized<T> asin() const {
     return map(Sleef_asinf8_u10);
   }
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
index a28ef21..bc82d07 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
@@ -140,6 +140,9 @@
   Vectorized<double> acos() const {
     return Vectorized<double>(Sleef_acosd4_u10(values));
   }
+  Vectorized<double> acosh() const {
+    return Vectorized<double>(Sleef_acoshd4_u10(values));
+  }
   Vectorized<double> asin() const {
     return Vectorized<double>(Sleef_asind4_u10(values));
   }
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
index 672f9d7..886809a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
@@ -148,6 +148,9 @@
   Vectorized<float> acos() const {
     return Vectorized<float>(Sleef_acosf8_u10(values));
   }
+  Vectorized<float> acosh() const {
+    return Vectorized<float>(Sleef_acoshf8_u10(values));
+  }
   Vectorized<float> asin() const {
     return Vectorized<float>(Sleef_asinf8_u10(values));
   }
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
index 89be526..fa90816 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
@@ -400,6 +400,9 @@
   Vectorized<T> acos() const {
     return map(Sleef_acosf16_u10);
   }
+  Vectorized<T> acosh() const {
+    return map(Sleef_acoshf16_u10);
+  }
   Vectorized<T> asin() const {
     return map(Sleef_asinf16_u10);
   }
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
index 3a2d62f..27b2753 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
@@ -149,6 +149,9 @@
   Vectorized<double> acos() const {
     return Vectorized<double>(Sleef_acosd8_u10(values));
   }
+  Vectorized<double> acosh() const {
+    return Vectorized<double>(Sleef_acoshd8_u10(values));
+  }
   Vectorized<double> asin() const {
     return Vectorized<double>(Sleef_asind8_u10(values));
   }
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
index 2ff2002..ba57386 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
@@ -168,6 +168,9 @@
   Vectorized<float> acos() const {
     return Vectorized<float>(Sleef_acosf16_u10(values));
   }
+  Vectorized<float> acosh() const {
+    return Vectorized<float>(Sleef_acoshf16_u10(values));
+  }
   Vectorized<float> asin() const {
     return Vectorized<float>(Sleef_asinf16_u10(values));
   }
diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index fca2b2c..5c7f7a1 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -365,6 +365,9 @@
   Vectorized<T> acos() const {
     return map(std::acos);
   }
+  Vectorized<T> acosh() const {
+    return map(std::acosh);
+  }
   Vectorized<T> asin() const {
     return map(std::asin);
   }
diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py
index c60753a..1d024d3 100644
--- a/test/inductor/test_cpu_repro.py
+++ b/test/inductor/test_cpu_repro.py
@@ -597,6 +597,39 @@
                 (x,),
             )
 
+    def test_acosh_with_negative_large_input(self):
+        # https://github.com/pytorch/pytorch/issues/118267.
+
+        def fn(input):
+            out = torch.acosh(input)
+            return out
+
+        x = torch.Tensor(
+            [
+                [
+                    -8493.9854,
+                    431654.1250,
+                    71741.5859,
+                    608234.5000,
+                    -103814.7500,
+                    -699397.0000,
+                    -910685.8125,
+                    -832737.1875,
+                    875343.5000,
+                ]
+            ]
+        ).repeat(3, 9)
+
+        for dtype in [torch.float32, torch.bfloat16, torch.double]:
+            with torch.no_grad():
+                torch._dynamo.reset()
+                metrics.reset()
+                _x = x.to(dtype)
+                self.common(
+                    fn,
+                    (_x,),
+                )
+
     @config.patch(implicit_fallbacks=True)
     def test_repeat_interleave(self):
         def fn(y):
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 9d27d2b..e968ef2 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1254,9 +1254,7 @@
 
     @staticmethod
     def acosh(x):
-        # For real x, acosh(x) = log(x + sqrt(x**2 -1))
-        vec_one = f"decltype({x})(1)"
-        return f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"
+        return f"{x}.acosh()"
 
     @staticmethod
     def relu(x):