[nnc] Allow 1 ulp tolerance in log approximation (#52165)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/52165

Apparently bit-for-bit identical results are too high a bar: I'm seeing
differences at that level depending on the HW platform (e.g., Broadwell
is bitwise accurate but Skylake is 1 ulp off). In any case, VML is
accurate to 1 ulp, so let's allow that much.
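
For context, a 1 ulp tolerance means accepting any result that is within one
representable float of the reference value; the test below approximates this
with torch::allclose and rtol = FLT_EPSILON. A minimal standalone sketch of
that kind of check (within_one_ulp is a hypothetical helper, not part of this
patch):

    // Sketch only: treat two floats as equal if they differ by at most 1 ulp,
    // which for normal values is roughly |a - b| <= FLT_EPSILON * |b|.
    #include <cassert>
    #include <cmath>
    #include <limits>

    // Hypothetical helper: true if a is within one ulp of the reference b.
    static bool within_one_ulp(float a, float b) {
      if (std::isnan(a) || std::isnan(b)) {
        return std::isnan(a) && std::isnan(b); // mirrors equal_nan=true
      }
      float lo = std::nextafter(b, -std::numeric_limits<float>::infinity());
      float hi = std::nextafter(b, std::numeric_limits<float>::infinity());
      return a >= lo && a <= hi;
    }

    int main() {
      float ref = std::log(1.5f);
      float one_ulp_off =
          std::nextafter(ref, std::numeric_limits<float>::infinity());
      assert(within_one_ulp(one_ulp_off, ref));  // accepted under the new tolerance
      assert(!within_one_ulp(ref + 1e-5f, ref)); // far more than 1 ulp: rejected
      return 0;
    }
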
ghstack-source-id: 121815001

Test Plan: test_approx

Reviewed By: asuhan

Differential Revision: D26408079

fbshipit-source-id: 46cbd1487c72ae7bc40567f2f72ed2b919707d0d
diff --git a/test/cpp/tensorexpr/test_approx.cpp b/test/cpp/tensorexpr/test_approx.cpp
index 09d0cf6..5a56771 100644
--- a/test/cpp/tensorexpr/test_approx.cpp
+++ b/test/cpp/tensorexpr/test_approx.cpp
@@ -8,6 +8,7 @@
 #include <torch/torch.h>
 #include <cstring>
 
+using namespace torch::indexing;
 namespace te = torch::jit::tensorexpr;
 
 static void vectorize(te::LoopNest* ln, te::Tensor* target, int width) {
@@ -17,6 +18,17 @@
   ln->vectorize(inner);
 }
 
+std::string diffs(const at::Tensor& a, const at::Tensor& b) {
+  auto diff = torch::abs(a.flatten() - b.flatten());
+  auto count_diffs = torch::sum(diff > 0.f);
+  auto greatest_diff_index = torch::argmax(diff);
+  std::stringstream ss;
+  ss << "Found " << count_diffs << " unequal element(s). "
+     << "The greatest difference was " << diff.index({greatest_diff_index})
+     << " at index " << greatest_diff_index;
+  return ss.str();
+}
+
 TEST(Approx, log_vml) {
   te::KernelScope ks;
   te::VarHandle N("N", te::kInt);
@@ -31,19 +43,25 @@
   s = te::IRSimplifier::simplify(s);
   te::LLVMCodeGen cg(s, {A, B, N});
 
+  auto eps = std::numeric_limits<float>::epsilon();
   auto test = [&](const at::Tensor& A_t) {
     at::Tensor B_ref = at::log(A_t);
     at::Tensor B_t = at::empty_like(A_t);
-    cg.call({A_t.data_ptr<float>(), B_t.data_ptr<float>(), A_t.numel()});
+    auto ap = A_t.data_ptr<float>();
+    auto bp = B_t.data_ptr<float>();
+    cg.call({ap, bp, A_t.numel()});
-    // Results should be bit-identical.
+    // Results should match the reference to within 1 ulp (rtol = eps).
-    ASSERT_TRUE(
-        memcmp(
-            B_ref.data_ptr<float>(), B_t.data_ptr<float>(), B_ref.nbytes()) ==
-        0);
+    ASSERT_TRUE(torch::allclose(
+        B_t, B_ref, /*rtol=*/eps, /*atol=*/0.0f, /*equal_nan=*/true))
+        << "Input[:8]\n"
+        << A_t.index({Slice(0, 8)}) << "\n"
+        << "Test[:8]\n"
+        << B_t.index({Slice(0, 8)}) << "\n"
+        << "Ref[:8]\n"
+        << B_ref.index({Slice(0, 8)}) << diffs(B_t, B_ref);
   };
 
   // Generate every single-precision FP value in [1.0, 2.0).
-  auto eps = std::numeric_limits<float>::epsilon();
   at::Tensor A_t = torch::arange(1.0f, 2.0f, eps);
   ASSERT_EQ(A_t.numel(), 1 << 23);