minimize header file includes from _avx2.cc (#14950)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14950
Minimize the number of headers included from _avx2.cc files. Functions defined in header files get compiled with the AVX2-specific compiler options of those translation units; if the same functions are reused by other translation units, the AVX2-compiled copies can be picked up and trigger illegal-instruction errors on CPUs without AVX2 (a minimal sketch of the hazard follows the commit metadata below).
Reviewed By: dskhudia
Differential Revision: D13394483
fbshipit-source-id: 67149a6fb51f7f047e745bfe395cb6dd4ae7c1ae
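For context, a minimal, hypothetical sketch of the hazard described in the summary; the file and function names (common_math.h, scale_sum, sum_avx2, sum_base) are illustrative only and do not appear in this PR. An inline function defined in a header is emitted in every translation unit that includes it, and the linker keeps only one copy; if the surviving copy came from a translation unit built with -mavx2 and was auto-vectorized, every caller ends up executing AVX2 instructions.

// common_math.h (hypothetical shared header)
// An inline function defined in a header is emitted in every translation
// unit that includes it; at link time only one copy survives, chosen
// arbitrarily from the object files.
inline float scale_sum(const float* x, int n, float s) {
  float acc = 0.0f;
  for (int i = 0; i < n; ++i) {
    acc += x[i];  // with -mavx2 this loop may be auto-vectorized using AVX2
  }
  return acc * s;
}

// kernel_avx2.cc -- compiled with -mavx2; its copy of scale_sum() may
// contain AVX2 instructions.
float sum_avx2(const float* x, int n) { return scale_sum(x, n, 1.0f); }

// kernel_base.cc -- compiled without -mavx2, so it must be safe on any CPU.
// If the linker happens to keep the AVX2 copy of scale_sum(), this call runs
// AVX2 code and fails with an illegal-instruction error on pre-AVX2 hardware.
float sum_base(const float* x, int n) { return scale_sum(x, n, 1.0f); }

Minimizing the headers transitively included from the _avx*.cc files, as this patch does, shrinks the set of header-defined functions that can be compiled with AVX2 flags in the first place.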
diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
index 7b5070a..e7cb974 100644
--- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc
+++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
-#include "c10/util/Registry.h"
+#include <c10/util/Registry.h>
#include "caffe2/utils/math.h"
namespace caffe2 {
@@ -48,26 +48,36 @@
memset(output_data, 0, output->numel());
if (random_) {
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
random_buffer_.resize(input_columns);
-#endif
}
for (size_t row = 0; row < input_rows; ++row) {
+ if (random_) {
+#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
+ int status = vsRngUniform(
+ VSL_RNG_METHOD_UNIFORM_STD,
+ vslStream_,
+ input_columns,
+ random_buffer_.data(),
+ 0.0f,
+ 1.0f);
+ if (status != VSL_ERROR_OK) {
+ LOG(WARNING) << "vsRngUniform returns " << status;
+ }
+#else
+ for (int i = 0; i < input_columns; ++i) {
+ random_buffer_[i] = (*dis_)(gen_);
+ }
+#endif
+ }
+
math::quantize_and_compress(
input_data + row * input_columns,
output_data + row * output_columns,
input_columns,
bitwidth_,
random_,
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- vslStream_,
- random_buffer_
-#else
- dis_,
- gen_
-#endif
- );
+ random_buffer_.data());
}
return true;
diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.h b/caffe2/operators/fused_rowwise_random_quantization_ops.h
index 1cbb69a..e1c5cb6 100644
--- a/caffe2/operators/fused_rowwise_random_quantization_ops.h
+++ b/caffe2/operators/fused_rowwise_random_quantization_ops.h
@@ -10,6 +10,11 @@
#include "caffe2/perfkernels/math.h"
#include "caffe2/utils/math.h"
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#define FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
+#endif
+
namespace caffe2 {
template <class Context>
@@ -61,9 +66,10 @@
protected:
size_t bitwidth_{8};
bool random_{true};
+ std::vector<float> random_buffer_;
+
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
VSLStreamStatePtr vslStream_;
- std::vector<float> random_buffer_;
#else
std::unique_ptr<std::uniform_real_distribution<float>> dis_;
std::minstd_rand gen_;
diff --git a/caffe2/perfkernels/adagrad.cc b/caffe2/perfkernels/adagrad.cc
index c629cb6..0d6e25e 100644
--- a/caffe2/perfkernels/adagrad.cc
+++ b/caffe2/perfkernels/adagrad.cc
@@ -195,21 +195,6 @@
BASE_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
}
-template <typename SIndex>
-void sparse_adagrad(
- int num_rows,
- int block_size,
- size_t param_size,
- const float* w,
- const float* g,
- const float* h,
- const SIndex* indices,
- float* nw,
- float* nh,
- float epsilon,
- float lr,
- const std::string& param_name);
-
SPARSE_ADAGRAD_SPECIALIZATION(int32_t, base);
template <>
diff --git a/caffe2/perfkernels/adagrad.h b/caffe2/perfkernels/adagrad.h
index efd4eff..6ce1965 100644
--- a/caffe2/perfkernels/adagrad.h
+++ b/caffe2/perfkernels/adagrad.h
@@ -5,7 +5,8 @@
#define CAFFE2_PERFKERNELS_ADAGRAD_H_USE_INTRINSIC
#include <immintrin.h>
#endif
-#include "caffe2/core/types.h"
+#include <ATen/core/Half.h>
+#include <c10/util/Logging.h>
namespace caffe2 {
diff --git a/caffe2/perfkernels/common.h b/caffe2/perfkernels/common.h
index 91e9e86..b128c76 100644
--- a/caffe2/perfkernels/common.h
+++ b/caffe2/perfkernels/common.h
@@ -1,3 +1,9 @@
+// !!!! PLEASE READ !!!!
+// Minimize (transitively) included headers from _avx*.cc because some of the
+// functions defined in the headers compiled with platform dependent compiler
+// options can be reused by other translation units generating illegal
+// instruction run-time error.
+
// Common utilities for writing performance kernels and easy dispatching of
// different backends.
/*
diff --git a/caffe2/perfkernels/embedding_lookup_avx2.cc b/caffe2/perfkernels/embedding_lookup_avx2.cc
index cd5cb73..e470779 100644
--- a/caffe2/perfkernels/embedding_lookup_avx2.cc
+++ b/caffe2/perfkernels/embedding_lookup_avx2.cc
@@ -5,9 +5,10 @@
//// DO NOT MODIFY!!!
//// --------------------------
-#include <caffe2/core/common.h>
-#include <caffe2/core/types.h>
+#include <ATen/core/Half.h>
+#include <c10/util/Logging.h>
#include <immintrin.h>
+#include <cassert>
namespace caffe2 {
@@ -1309,7 +1310,7 @@
_mm256_loadu_ps(&op[j])));
_mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);
}
- at::Half vtmp1[8] CAFFE2_ALIGNED(64);
+ alignas(64) at::Half vtmp1[8];
for (; j < block_size; j++) {
vtmp1[0] = ip[j];
__m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));
@@ -1850,7 +1851,7 @@
_mm256_loadu_ps(&op[j])));
_mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);
}
- at::Half vtmp1[8] CAFFE2_ALIGNED(64);
+ alignas(64) at::Half vtmp1[8];
for (; j < block_size; j++) {
vtmp1[0] = ip[j];
__m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));
diff --git a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc
index 5eeb4ef..650a2dc 100644
--- a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc
+++ b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc
@@ -5,9 +5,10 @@
//// DO NOT MODIFY!!!
//// --------------------------
-#include <caffe2/core/common.h>
-#include <caffe2/core/types.h>
+#include <ATen/core/Half.h>
+#include <c10/util/Logging.h>
#include <immintrin.h>
+#include <cassert>
namespace caffe2 {
@@ -1295,7 +1296,7 @@
_mm256_loadu_ps(&op[j])));
_mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);
}
- at::Half vtmp1[8] CAFFE2_ALIGNED(64);
+ alignas(64) at::Half vtmp1[8];
for (; j < block_size; j++) {
vtmp1[0] = ip[j];
__m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));
@@ -1830,7 +1831,7 @@
_mm256_loadu_ps(&op[j])));
_mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);
}
- at::Half vtmp1[8] CAFFE2_ALIGNED(64);
+ alignas(64) at::Half vtmp1[8];
for (; j < block_size; j++) {
vtmp1[0] = ip[j];
__m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));
diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py
index 2db1cee..887a975 100644
--- a/caffe2/perfkernels/hp_emblookup_codegen.py
+++ b/caffe2/perfkernels/hp_emblookup_codegen.py
@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+from __future__ import absolute_import, division, print_function, unicode_literals
+
import argparse
import sys
-sizeof = {'float': 4, 'at::Half': 2, 'uint8_t': 1}
+
+sizeof = {"float": 4, "at::Half": 2, "uint8_t": 1}
def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused):
@@ -14,90 +13,105 @@
if InType == "float":
code.append(
- "vop%d = _mm256_fmadd_ps(vwgt, \
- _mm256_loadu_ps(ip + (%d)), vop%d);"
- % (regid, regid, regid)
+ " vop%d = _mm256_fmadd_ps(vwgt, _mm256_loadu_ps(ip + (%d)), vop%d);" # noqa
+ % (regid, regid, regid)
)
elif InType == "at::Half":
code.append(
- "vop%d = _mm256_fmadd_ps(vwgt, \
- _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ip + (%d)))), \
- vop%d);"
- % (regid, regid, regid)
+ " vop%d = _mm256_fmadd_ps(\n"
+ " vwgt,\n"
+ " _mm256_cvtph_ps(\n"
+ " _mm_loadu_si128(reinterpret_cast<const __m128i*>(ip + (%d)))),\n" # noqa
+ " vop%d);" % (regid, regid, regid)
)
elif InType == "uint8_t":
code.append(
- "vop%d = _mm256_fmadd_ps(vwgt, \
- _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (%d))))), \
- _mm256_add_ps(vop%d, vbio));"
- % (regid, regid, regid)
+ " vop%d = _mm256_fmadd_ps(\n"
+ " vwgt,\n"
+ " _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(\n"
+ " _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (%d))))),\n" # noqa
+ " _mm256_add_ps(vop%d, vbio));" % (regid, regid, regid)
)
else:
assert False
if prefetch:
- code.append("_mm_prefetch((&ip_next_T0[%d]), _MM_HINT_T0);" % (regid))
+ code.append(
+ " _mm_prefetch((&ip_next_T0[%d]), _MM_HINT_T0);" % (regid)
+ )
else:
- code.append("// skip unnecessary prefetch of (&ip_next_T0[%d])" % (regid))
+ code.append(
+ " // skip unnecessary prefetch of (&ip_next_T0[%d])" % (regid)
+ )
return code
code = []
- code.append("// unrolling " + str(uf) + " times")
- code.append(IndexType + " dataInd = 0;")
- code.append("for (" + IndexType +
- " rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {")
- code.append(OutType + " *op = &out[rangeIndex * block_size];")
+ code.append(" // unrolling " + str(uf) + " times")
+ code.append(" " + IndexType + " dataInd = 0;")
+ code.append(
+ " for ("
+ + IndexType
+ + " rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {"
+ )
+ code.append(" " + OutType + "* op = &out[rangeIndex * block_size];")
for i in range(0, uf):
j = 8 * i
- code.append("__m256 vop" + str(j) + " = _mm256_setzero_ps();")
+ code.append(" __m256 vop" + str(j) + " = _mm256_setzero_ps();")
# inner loop
- code.append("for (" + IndexType +
- " start = dataInd; dataInd < start + lengths[rangeIndex]; ++dataInd) {")
- code.append("const " + IndexType + " idx = indices[dataInd];")
code.append(
- 'CAFFE_ENFORCE(idx >=0 && idx < data_size, "Index ", dataInd, "'
- ' is out of bounds: ", idx, ", range 0 to ", data_size);')
+ " for ("
+ + IndexType
+ + " start = dataInd; dataInd < start + lengths[rangeIndex];\n ++dataInd) {" # noqa
+ )
+ code.append(" const " + IndexType + " idx = indices[dataInd];")
+ code.append(
+ ' CAFFE_ENFORCE(\n idx >= 0 && idx < data_size,\n "Index ",\n dataInd,\n' # noqa
+ ' " is out of bounds: ",\n idx,\n ", range 0 to ",\n data_size);' # noqa
+ )
if InType == "uint8_t":
- code.append(OutType + " wgt = 1.f;")
- code.append(OutType + " bio;")
- code.append("if (weights) {")
+ code.append(" " + OutType + " wgt = 1.f;")
+ code.append(" " + OutType + " bio;")
+ code.append(" if (weights) {")
code.append(
- "wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];")
- code.append("}")
+ " wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];" # noqa
+ )
+ code.append(" }")
if fused:
code.append(
- 'const float* scale_bias = reinterpret_cast<'
- 'const float*>(&input[idx * fused_block_size + block_size]);'
+ " const float* scale_bias = reinterpret_cast<const float*>(\n"
+ " &input[idx * fused_block_size + block_size]);"
)
- code.append("bio = wgt * scale_bias[1];")
- code.append("wgt = wgt * scale_bias[0];")
+ code.append(" bio = wgt * scale_bias[1];")
+ code.append(" wgt = wgt * scale_bias[0];")
else:
- code.append("bio = wgt * scale_bias[2 * idx + 1];")
- code.append("wgt = wgt * scale_bias[2 * idx];")
- code.append("__m256 vbio = _mm256_set1_ps(bio);")
+ code.append(" bio = wgt * scale_bias[2 * idx + 1];")
+ code.append(" wgt = wgt * scale_bias[2 * idx];")
+ code.append(" __m256 vbio = _mm256_set1_ps(bio);")
else:
- code.append(OutType + " wgt = 1.f;")
- code.append("if (weights) {")
+ code.append(" " + OutType + " wgt = 1.f;")
+ code.append(" if (weights) {")
code.append(
- "wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];")
- code.append("}")
- code.append("__m256 vwgt = _mm256_set1_ps(wgt);")
+ " wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];" # noqa
+ )
+ code.append(" }")
+ code.append(" __m256 vwgt = _mm256_set1_ps(wgt);")
- code.append("const {} *ip = &input[idx * fused_block_size];".format(InType))
+ code.append(" const {}* ip = &input[idx * fused_block_size];".format(InType))
code.append(
- 'const {} next_T0 = (dataInd < index_size - prefdist_T0)'
- ' ? (dataInd + prefdist_T0) : dataInd;'.format(IndexType)
+ " const {} next_T0 = (dataInd < index_size - prefdist_T0)\n"
+ " ? (dataInd + prefdist_T0)\n : dataInd;".format(
+ IndexType
+ )
)
- code.append("const " + IndexType + " idx_pref_T0 = indices[next_T0];")
- code.append(
- "CAFFE_ENFORCE(idx_pref_T0 >= 0 && idx_pref_T0 < data_size);")
+ code.append(" const " + IndexType + " idx_pref_T0 = indices[next_T0];")
+ code.append(" CAFFE_ENFORCE(idx_pref_T0 >= 0 && idx_pref_T0 < data_size);")
code.append(
- 'const {} *ip_next_T0 = &input[idx_pref_T0'
- ' * fused_block_size];'.format(InType)
+ " const {}* ip_next_T0 = &input[idx_pref_T0"
+ " * fused_block_size];".format(InType)
)
for i in range(0, uf):
@@ -106,168 +120,190 @@
byteoffset = sizeof[InType] * j
prefetch = (byteoffset % cachelinesize) == 0
code.extend(compute(j, InType, use_weights, isa, prefetch))
- code.append("}")
+ code.append(" }")
- code.append("if (normalize_by_lengths == false) {")
+ code.append(" if (normalize_by_lengths == false) {")
for i in range(0, uf):
j = 8 * i
- code.append(
- "_mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");")
- code.append("} else if (lengths[rangeIndex]) {")
+ code.append(" _mm256_storeu_ps(&op[" + str(j) + "], vop" + str(j) + ");")
+ code.append(" } else if (lengths[rangeIndex]) {")
# inv of length
- code.append(
- "__m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);")
+ code.append(" __m256 vlen_inv = _mm256_set1_ps(1.0f / lengths[rangeIndex]);")
for i in range(0, uf):
j = 8 * i
code.append(
- "_mm256_storeu_ps(&op[" + str(j) + "], _mm256_mul_ps(" + "vop" + str(j) + ", vlen_inv));")
- code.append("}")
+ " _mm256_storeu_ps(&op["
+ + str(j)
+ + "], _mm256_mul_ps("
+ + "vop"
+ + str(j)
+ + ", vlen_inv));"
+ )
+ code.append(" }")
- code.append("}")
+ code.append(" }")
return code
def generic(IndexType, InType, OutType, use_weights, isa, fused):
-
def compute(InType, use_weights, isa):
code = []
if InType == "float":
code.append(
- "_mm256_storeu_ps(&op[j], \
- _mm256_fmadd_ps(vwgt,_mm256_loadu_ps(&ip[j]), _mm256_loadu_ps(&op[j])) \
- );"
+ " _mm256_storeu_ps(\n"
+ " &op[j],\n"
+ " _mm256_fmadd_ps(\n"
+ " vwgt, _mm256_loadu_ps(&ip[j]), _mm256_loadu_ps(&op[j])));" # noqa
)
elif InType == "at::Half":
code.append(
- "_mm256_storeu_ps(&op[j], \
- _mm256_fmadd_ps(vwgt, \
- _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(&ip[j]))), _mm256_loadu_ps(&op[j])) \
- );"
+ " _mm256_storeu_ps(\n"
+ " &op[j],\n"
+ " _mm256_fmadd_ps(\n"
+ " vwgt,\n"
+ " _mm256_cvtph_ps(_mm_loadu_si128(\n"
+ " reinterpret_cast<const __m128i*>(&ip[j]))),\n"
+ " _mm256_loadu_ps(&op[j])));"
)
elif InType == "uint8_t":
code.append(
- "_mm256_storeu_ps(&op[j], \
- _mm256_fmadd_ps(vwgt, \
- _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&ip[j])))), \
- _mm256_add_ps(_mm256_loadu_ps(&op[j]), vbio) ) \
- );"
+ " _mm256_storeu_ps(\n"
+ " &op[j],\n"
+ " _mm256_fmadd_ps(\n"
+ " vwgt,\n"
+ " _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(\n" # noqa
+ " reinterpret_cast<const __m128i*>(&ip[j])))),\n"
+ " _mm256_add_ps(_mm256_loadu_ps(&op[j]), vbio)));"
)
else:
assert False
- code.append("_mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);")
+ code.append(" _mm_prefetch((&ip_next_T0[j]), _MM_HINT_T0);")
return code
code = []
- code.append(IndexType + " dataInd = 0;")
- code.append("for (" + IndexType +
- " rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {")
- code.append(OutType + " *op = &out[rangeIndex * block_size];")
+ code.append(" " + IndexType + " dataInd = 0;")
+ code.append(
+ " for ("
+ + IndexType
+ + " rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {"
+ )
+ code.append(" " + OutType + "* op = &out[rangeIndex * block_size];")
# initialize to 0
- code.append("int64_t j = 0;")
- code.append("for(; j + 8 <= block_size; j += 8) {")
- code.append("_mm256_storeu_ps(op + j, _mm256_setzero_ps());")
- code.append("}")
- code.append("for(; j < block_size; j++) {")
- code.append("op[j] = 0.0f;")
- code.append("}")
+ code.append(" int64_t j = 0;")
+ code.append(" for (; j + 8 <= block_size; j += 8) {")
+ code.append(" _mm256_storeu_ps(op + j, _mm256_setzero_ps());")
+ code.append(" }")
+ code.append(" for (; j < block_size; j++) {")
+ code.append(" op[j] = 0.0f;")
+ code.append(" }")
# inner loop
- code.append("for (" + IndexType +
- " start = dataInd; dataInd < start + lengths[rangeIndex]; ++dataInd) {")
- code.append("const " + IndexType + " idx = indices[dataInd];")
code.append(
- 'CAFFE_ENFORCE(idx >=0 && idx < data_size, "Index ", dataInd, "' +
- ' is out of bounds: ", idx, ", range 0 to ", data_size);')
+ " for ("
+ + IndexType
+ + " start = dataInd; dataInd < start + lengths[rangeIndex];\n ++dataInd) {" # noqa
+ )
+ code.append(" const " + IndexType + " idx = indices[dataInd];")
+ code.append(
+ ' CAFFE_ENFORCE(\n idx >= 0 && idx < data_size,\n "Index ",\n dataInd,\n' # noqa
+ + ' " is out of bounds: ",\n idx,\n ", range 0 to ",\n data_size);' # noqa
+ )
if InType == "uint8_t":
- code.append(OutType + " wgt = 1.f;")
- code.append(OutType + " bio;")
- code.append("if (weights) {")
+ code.append(" " + OutType + " wgt = 1.f;")
+ code.append(" " + OutType + " bio;")
+ code.append(" if (weights) {")
code.append(
- "wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];")
- code.append("}")
+ " wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];" # noqa
+ )
+ code.append(" }")
if fused:
code.append(
- 'const float* scale_bias = reinterpret_cast<'
- 'const float*>(&input[idx * fused_block_size + block_size]);'
+ " const float* scale_bias = reinterpret_cast<const float*>(\n"
+ " &input[idx * fused_block_size + block_size]);"
)
- code.append("bio = wgt * scale_bias[1];")
- code.append("wgt = wgt * scale_bias[0];")
+ code.append(" bio = wgt * scale_bias[1];")
+ code.append(" wgt = wgt * scale_bias[0];")
else:
- code.append("assert (scale_bias);")
- code.append("bio = wgt * scale_bias[2 * idx + 1];")
- code.append("wgt = wgt * scale_bias[2 * idx];")
- code.append("__m256 vbio = _mm256_set1_ps(bio);")
+ code.append(" assert(scale_bias);")
+ code.append(" bio = wgt * scale_bias[2 * idx + 1];")
+ code.append(" wgt = wgt * scale_bias[2 * idx];")
+ code.append(" __m256 vbio = _mm256_set1_ps(bio);")
else:
- code.append(OutType + " wgt = 1.f;")
- code.append("if (weights) {")
+ code.append(" " + OutType + " wgt = 1.f;")
+ code.append(" if (weights) {")
code.append(
- "wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];")
- code.append("}")
- code.append("__m256 vwgt = _mm256_set1_ps(wgt);")
+ " wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];" # noqa
+ )
+ code.append(" }")
+ code.append(" __m256 vwgt = _mm256_set1_ps(wgt);")
- code.append("const {} *ip = &input[idx * fused_block_size];".format(InType))
+ code.append(" const {}* ip = &input[idx * fused_block_size];".format(InType))
code.append(
- 'const {} next_T0 = (dataInd < index_size - prefdist_T0)'
- ' ? (dataInd + prefdist_T0) : dataInd;'.format(IndexType)
+ " const {} next_T0 = (dataInd < index_size - prefdist_T0)\n"
+ " ? (dataInd + prefdist_T0)\n : dataInd;".format(
+ IndexType
+ )
)
- code.append("const " + IndexType + " idx_pref_T0 = indices[next_T0];")
+ code.append(" const " + IndexType + " idx_pref_T0 = indices[next_T0];")
+ code.append(" CAFFE_ENFORCE(idx_pref_T0 >= 0 && idx_pref_T0 < data_size);")
code.append(
- "CAFFE_ENFORCE(idx_pref_T0 >= 0 && idx_pref_T0 < data_size);")
- code.append(
- "const {} *ip_next_T0 = &input[idx_pref_T0 * fused_block_size];".
- format(InType)
+ " const {}* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];".format(
+ InType
+ )
)
# compute and store main loop
- code.append("j = 0;")
- code.append("for(; j + 8 <= block_size; j += 8) {")
+ code.append(" j = 0;")
+ code.append(" for (; j + 8 <= block_size; j += 8) {")
code.extend(compute(InType, use_weights, isa))
- code.append("}")
+ code.append(" }")
# leftover
if InType == "at::Half":
- code.append("at::Half vtmp1[8] CAFFE2_ALIGNED(64);")
- code.append("for(; j < block_size; j++) {")
+ code.append(" alignas(64) at::Half vtmp1[8];")
+ code.append(" for (; j < block_size; j++) {")
if InType == "float":
- code.append("op[j] += wgt * ip[j];")
+ code.append(" op[j] += wgt * ip[j];")
elif InType == "at::Half":
- code.append("vtmp1[0] = ip[j];")
- code.append("__m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));")
- code.append("op[j] += wgt * ((float*)(&vtmp2))[0];")
+ code.append(" vtmp1[0] = ip[j];")
+ code.append(" __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));")
+ code.append(" op[j] += wgt * ((float*)(&vtmp2))[0];")
elif InType == "uint8_t":
- code.append("op[j] += wgt * ((float)ip[j]) + bio;")
+ code.append(" op[j] += wgt * ((float)ip[j]) + bio;")
else:
assert False
- code.append("}")
+ code.append(" }")
- code.append("}")
+ code.append(" }")
- code.append("if (normalize_by_lengths && lengths[rangeIndex]) {")
- code.append("float len_inv = 1.0f / lengths[rangeIndex];")
- code.append("__m256 vlen_inv = _mm256_set1_ps(len_inv);")
- code.append("j = 0;")
- code.append("for(; j + 8 <= block_size; j += 8) {")
+ code.append(" if (normalize_by_lengths && lengths[rangeIndex]) {")
+ code.append(" float len_inv = 1.0f / lengths[rangeIndex];")
+ code.append(" __m256 vlen_inv = _mm256_set1_ps(len_inv);")
+ code.append(" j = 0;")
+ code.append(" for (; j + 8 <= block_size; j += 8) {")
code.append(
- "_mm256_storeu_ps(&op[j], _mm256_mul_ps(_mm256_loadu_ps(&op[j]), vlen_inv));")
- code.append("}")
- code.append("for(; j < block_size; j++) {")
- code.append("op[j] = len_inv * op[j];")
- code.append("}")
+ " _mm256_storeu_ps(\n"
+ " &op[j], _mm256_mul_ps(_mm256_loadu_ps(&op[j]), vlen_inv));"
+ )
+ code.append(" }")
+ code.append(" for (; j < block_size; j++) {")
+ code.append(" op[j] = len_inv * op[j];")
+ code.append(" }")
- code.append("}")
+ code.append(" }")
- code.append("}")
+ code.append(" }")
return code
# start main code
parser = argparse.ArgumentParser()
-parser.add_argument('-f', '--filename', help="file name")
-parser.add_argument('--fused', action='store_true')
+parser.add_argument("-f", "--filename", help="file name")
+parser.add_argument("--fused", action="store_true")
opts = parser.parse_args()
if opts.filename:
filename = opts.filename
@@ -275,14 +311,16 @@
filename = "embedding_lookup_fused_8bit_rowwise_avx2.cc"
else:
filename = "embedding_lookup_avx2.cc"
-fout = open(filename, 'w')
+fout = open(filename, "w")
-options = [["int32_t", "int32_t", "float", "float", "float", "float"],
- ["int64_t", "int64_t", "float", "float", "float", "float"],
- ["int32_t", "int32_t", "half", "at::Half", "float", "float"],
- ["int64_t", "int64_t", "half", "at::Half", "float", "float"],
- ["int32_t", "int32_t", "uint8_t", "uint8_t", "float"],
- ["int64_t", "int64_t", "uint8_t", "uint8_t", "float"]]
+options = [
+ ["int32_t", "int32_t", "float", "float", "float", "float"],
+ ["int64_t", "int64_t", "float", "float", "float", "float"],
+ ["int32_t", "int32_t", "half", "at::Half", "float", "float"],
+ ["int64_t", "int64_t", "half", "at::Half", "float", "float"],
+ ["int32_t", "int32_t", "uint8_t", "uint8_t", "float", "float"],
+ ["int64_t", "int64_t", "uint8_t", "uint8_t", "float", "float"],
+]
code = []
# includes
@@ -291,105 +329,101 @@
code.append("//// THIS CODE IS AUTOGENERATED")
code.append("//// BY {}".format(sys.argv[0]))
code.append("//// DO NOT MODIFY!!!")
-code.append("//// --------------------------\n\n")
+code.append("//// --------------------------\n")
-code.append("#include <caffe2/core/types.h>")
-code.append("#include <caffe2/core/common.h>")
+code.append("#include <ATen/core/Half.h>")
+code.append("#include <c10/util/Logging.h>")
code.append("#include <immintrin.h>")
-code.append("\n")
+code.append("#include <cassert>\n")
code.append("namespace caffe2 {\n")
for o in options:
[IndexTypeName, IndexType, InTypeName, InType, OutTypeName, OutType] = o
- prefix = 'Fused8BitRowwise' if opts.fused else ''
- code.append('template <bool IS_WEIGHT_POSITIONAL>')
- fn_base = '{}EmbeddingLookup_{}_{}_{}'.format(
+ prefix = "Fused8BitRowwise" if opts.fused else ""
+ code.append("template <bool IS_WEIGHT_POSITIONAL>")
+ fn_base = "{}EmbeddingLookup_{}_{}_{}".format(
prefix, IndexTypeName, InTypeName, OutTypeName
)
- suffix = '__avx2_fma'
+ suffix = "__avx2_fma"
fn = "static void " + fn_base + suffix
code.append(fn + "(")
args = []
- args.append("const int64_t block_size,")
- args.append("const int64_t output_size,")
- args.append("const int64_t index_size,")
- args.append("const int64_t data_size,")
- args.append("const " + InType + "* input,")
- args.append("const " + IndexType + "* indices,")
- args.append("const int* lengths,")
- args.append("const float* weights,")
+ args.append(" const int64_t block_size,")
+ args.append(" const int64_t output_size,")
+ args.append(" const int64_t index_size,")
+ args.append(" const int64_t data_size,")
+ args.append(" const " + InType + "* input,")
+ args.append(" const " + IndexType + "* indices,")
+ args.append(" const int* lengths,")
+ args.append(" const float* weights,")
if not opts.fused:
- args.append("const float* scale_bias,")
- args.append("bool normalize_by_lengths,")
- args.append(OutType + "* out)")
+ args.append(" const float* scale_bias,")
+ args.append(" bool normalize_by_lengths,")
+ args.append(" " + OutType + "* out) {")
code += args
- code.append("{")
- code.append("const " + IndexType + " prefdist_T0 = 16;")
+ code.append(" const " + IndexType + " prefdist_T0 = 16;")
# block_size is the number of elements and fused_block_size is the size of
# an entire row, including scale and bias.
offset = (8 // sizeof[InType]) if opts.fused else 0
code.append(
- "const {} fused_block_size = block_size + {};".
- format(IndexType, offset)
+ " const {} fused_block_size = block_size + {};".format(IndexType, offset)
)
- #code.append("printf(\"calling " + fn + "\\n\");");
+ # code.append("printf(\"calling " + fn + "\\n\");");
if not opts.fused:
if InType != "uint8_t":
code.append(
- 'CAFFE_ENFORCE(scale_bias == nullptr,'
+ " CAFFE_ENFORCE(scale_bias == nullptr,"
' "scale_bias must be nullptr");'
)
else:
code.append(
- 'CAFFE_ENFORCE(scale_bias != nullptr,'
+ " CAFFE_ENFORCE(scale_bias != nullptr,"
' "scale_bias must not be nullptr");'
)
- code.append("if (block_size == 128) {")
+ code.append(" if (block_size == 128) {")
code += unroll(16, IndexType, InType, OutType, True, "AVX2", opts.fused)
- code.append("} else if (block_size == 64) {")
+ code.append(" } else if (block_size == 64) {")
code += unroll(8, IndexType, InType, OutType, True, "AVX2", opts.fused)
- code.append("} else if (block_size == 32) {")
+ code.append(" } else if (block_size == 32) {")
code += unroll(4, IndexType, InType, OutType, True, "AVX2", opts.fused)
- code.append("} else if (block_size == 16) {")
+ code.append(" } else if (block_size == 16) {")
code += unroll(2, IndexType, InType, OutType, True, "AVX2", opts.fused)
- code.append("} else {")
- code.append("// generic code")
+ code.append(" } else {")
+ code.append(" // generic code")
code += generic(IndexType, InType, OutType, True, "AVX2", opts.fused)
- code.append("}")
+ code.append(" }")
code.append("}")
- for is_weight_positional in ['false', 'true']:
- code.append(
- "void " + fn_base + "_" + is_weight_positional + suffix + "(")
+ for is_weight_positional in ["false", "true"]:
+ code.append("void " + fn_base + "_" + is_weight_positional + suffix + "(")
code += args
- code.append("{")
- code.append(fn_base + suffix + "<" + is_weight_positional + ">(")
- code.append("block_size,")
- code.append("output_size,")
- code.append("index_size,")
- code.append("data_size,")
- code.append("input,")
- code.append("indices,")
- code.append("lengths,")
- code.append("weights,")
+ code.append(" " + fn_base + suffix + "<" + is_weight_positional + ">(")
+ code.append(" block_size,")
+ code.append(" output_size,")
+ code.append(" index_size,")
+ code.append(" data_size,")
+ code.append(" input,")
+ code.append(" indices,")
+ code.append(" lengths,")
+ code.append(" weights,")
if not opts.fused:
- code.append("scale_bias,")
- code.append("normalize_by_lengths,")
- code.append("out);")
+ code.append(" scale_bias,")
+ code.append(" normalize_by_lengths,")
+ code.append(" out);")
code.append("}")
- code.append("\n")
+ code.append("")
code.append("} // namespace caffe2")
for c in code:
- #print(c, file = fout)
+ # print(c, file = fout)
fout.write(c + "\n")
fout.close()
diff --git a/caffe2/perfkernels/math.h b/caffe2/perfkernels/math.h
index abbe6ca..14265f9 100644
--- a/caffe2/perfkernels/math.h
+++ b/caffe2/perfkernels/math.h
@@ -1,10 +1,6 @@
#pragma once
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
-
-#include <random>
+#include <cstdint>
namespace caffe2 {
@@ -21,27 +17,19 @@
// | 1B | 1B | 4B | 4B | ...output_data....|
// In output_data: the b-th bucket of the i-th byte stores
// the i-th data of the b-th segment of input row
-#ifdef CAFFE2_USE_MKL
-#define FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
-#endif
+
void quantize_and_compress(
const float* input_data,
- uint8_t* output_data,
- size_t input_size,
- size_t bitwidth,
+ std::uint8_t* output_data,
+ std::size_t input_size,
+ std::size_t bitwidth,
bool random,
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- VSLStreamStatePtr& vslStream,
- std::vector<float>& random_buffer
-#else
- std::unique_ptr<std::uniform_real_distribution<float>>& dis,
- std::minstd_rand& gen
-#endif
-);
+ const float* random_buffer);
+
void decompress_and_dequantize(
- const uint8_t* input_data,
+ const std::uint8_t* input_data,
float* output_data,
- size_t input_size);
+ std::size_t input_size);
} // namespace math
} // namespace caffe2
diff --git a/caffe2/perfkernels/math_cpu_avx2.cc b/caffe2/perfkernels/math_cpu_avx2.cc
index c71c5ac..95292c3 100644
--- a/caffe2/perfkernels/math_cpu_avx2.cc
+++ b/caffe2/perfkernels/math_cpu_avx2.cc
@@ -2,46 +2,16 @@
// The implementation in this file allows us to route the underlying numerical
// computation library to different compiler options (-mno-avx2 or -mavx2).
-#include <algorithm>
-#include <array>
-#include <atomic>
-#include <chrono>
+#include <immintrin.h>
+#include <cfloat>
#include <cmath>
-#include <cstring>
-#include <functional>
-#include <limits>
-#include <numeric>
-#include <random>
-#include <tuple>
-#include <unordered_set>
-#include <vector>
-
-#include "caffe2/core/context.h"
-#include "caffe2/perfkernels/math.h"
-#include "caffe2/utils/cpu_neon.h"
-#include "caffe2/utils/cpuid.h"
-#include "caffe2/utils/eigen_utils.h"
-#include "caffe2/utils/math.h"
-
-#include "Eigen/Core"
-#include "Eigen/Dense"
-
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
-
-#ifdef CAFFE2_USE_HPTT
-#include <hptt.h>
-#endif // CAFFE2_USE_HPTT
-
-#if defined(_MSC_VER)
-#include <process.h>
-#endif
+#include <cstdint>
namespace caffe2 {
namespace math {
-#define QEPSILON 1e-8
+
+static constexpr double QEPSILON = 1e-8;
void quantize_and_compress__avx2(
const float* input_data,
@@ -49,19 +19,7 @@
size_t input_size,
size_t bitwidth,
bool random,
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- VSLStreamStatePtr& vslStream,
- std::vector<float>& random_buffer
-#else
- std::unique_ptr<std::uniform_real_distribution<float>>& dis,
- std::minstd_rand& gen
-#endif
-) {
- CAFFE_ENFORCE(
- bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
- "Unsupported bitwidth");
-
-#ifdef __AVX2__
+ const float* random_buffer) {
__m256i shuffle_mask_v = _mm256_set_epi8(
0xff,
0xff,
@@ -97,14 +55,6 @@
0x00);
__m256i permute_mask_v =
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
-#endif // __AVX2__
-
- // memory pointers
- ConstEigenVectorArrayMap<float> input_row(input_data, input_size);
- uint8_t* output_row = output_data;
- EigenVectorArrayMap<uint8_t> output_bitwidth_tail(output_row, 2);
- EigenVectorArrayMap<float> output_row_min_max(
- reinterpret_cast<float*>(output_row + 2), 2);
size_t data_per_byte = 8 / bitwidth;
size_t tail = input_size % data_per_byte;
@@ -112,46 +62,30 @@
size_t segment_size = (input_size + data_per_byte - 1) / data_per_byte;
// basic info
- const float minimum_element = input_row.minCoeff();
- const float maximum_element = input_row.maxCoeff();
- output_bitwidth_tail(0) = bitwidth;
- output_bitwidth_tail(1) = tail;
- output_row_min_max(0) = minimum_element;
- output_row_min_max(1) = maximum_element;
+ float minimum_element = INFINITY, maximum_element = -INFINITY;
+ for (auto i = 0; i < input_size; ++i) {
+ minimum_element =
+ (input_data[i] < minimum_element) ? input_data[i] : minimum_element;
+ maximum_element =
+ (input_data[i] > maximum_element) ? input_data[i] : maximum_element;
+ }
+ output_data[0] = bitwidth;
+ output_data[1] = tail;
+ reinterpret_cast<float*>(output_data + 2)[0] = minimum_element;
+ reinterpret_cast<float*>(output_data + 2)[1] = maximum_element;
float gap = (maximum_element - minimum_element) / ((1 << bitwidth) - 1.0f);
float gap_inverse = 1. / (gap + QEPSILON);
uint8_t max_q = (1 << bitwidth) - 1;
size_t bit_start = 0;
if (random) {
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- int status = vsRngUniform(
- VSL_RNG_METHOD_UNIFORM_STD,
- vslStream,
- input_size,
- random_buffer.data(),
- 0.0f,
- 1.0f);
- if (status != VSL_ERROR_OK) {
- LOG(WARNING) << "vsRngUniform returns " << status;
- }
-#endif
for (int start = 0; start < input_size; start += segment_size) {
size_t stride = start + segment_size <= input_size ? segment_size
: input_size - start;
int i = 0;
-#ifdef __AVX2__
constexpr int VLEN = 8;
for (; i < stride / VLEN * VLEN; i += VLEN) {
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
__m256 r_v = _mm256_loadu_ps(&random_buffer[start + i]);
-#else
- float random_buffer[VLEN];
- for (int j = 0; j < VLEN; ++j) {
- random_buffer[j] = (*dis)(gen);
- }
- __m256 r_v = _mm256_loadu_ps(random_buffer);
-#endif
__m256 fval_v = _mm256_loadu_ps(input_data + start + i);
__m256 thetimes_v = _mm256_mul_ps(
_mm256_sub_ps(fval_v, _mm256_set1_ps(minimum_element)),
@@ -162,37 +96,35 @@
_mm256_min_ps(_mm256_set1_ps(max_q), rounded_v));
__m256i qval_v = _mm256_cvtps_epi32(rounded_v);
__m256i orval_v = _mm256_cvtepu8_epi32(_mm_lddqu_si128(
- reinterpret_cast<const __m128i*>(output_row + 10 + i)));
+ reinterpret_cast<const __m128i*>(output_data + 10 + i)));
orval_v =
_mm256_or_si256(orval_v, _mm256_slli_epi32(qval_v, bit_start));
orval_v = _mm256_shuffle_epi8(orval_v, shuffle_mask_v);
orval_v = _mm256_permutevar8x32_epi32(orval_v, permute_mask_v);
- *reinterpret_cast<int64_t*>(output_row + 10 + i) =
+ *reinterpret_cast<int64_t*>(output_data + 10 + i) =
_mm256_extract_epi64(orval_v, 0);
}
-#endif // __AVX2__
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
float rounded = floor(thetimes + random_buffer[start + i]);
-#else
- float rounded = floor(thetimes + (*dis)(gen));
-#endif
- rounded = std::max(0.0f, std::min(static_cast<float>(max_q), rounded));
+ rounded = rounded < static_cast<float>(max_q)
+ ? rounded
+ : static_cast<float>(max_q);
+ rounded = rounded > 0.0f ? rounded : 0.0f;
uint8_t qval = rounded;
- uint8_t orval = output_row[10 + i];
- output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
+ uint8_t orval = output_data[10 + i];
+ output_data[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
} else {
+ // !random
for (int start = 0; start < input_size; start += segment_size) {
size_t stride = start + segment_size <= input_size ? segment_size
: input_size - start;
int i = 0;
-#ifdef __AVX2__
constexpr int VLEN = 8;
for (; i < stride / VLEN * VLEN; i += VLEN) {
__m256 fval_v = _mm256_loadu_ps(input_data + start + i);
@@ -205,52 +137,48 @@
__m256i qval_v = _mm256_cvtps_epi32(_mm256_round_ps(
thetimes_v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
__m256i orval_v = _mm256_cvtepu8_epi32(_mm_lddqu_si128(
- reinterpret_cast<const __m128i*>(output_row + 10 + i)));
+ reinterpret_cast<const __m128i*>(output_data + 10 + i)));
orval_v =
_mm256_or_si256(orval_v, _mm256_slli_epi32(qval_v, bit_start));
orval_v = _mm256_shuffle_epi8(orval_v, shuffle_mask_v);
orval_v = _mm256_permutevar8x32_epi32(orval_v, permute_mask_v);
- *reinterpret_cast<int64_t*>(output_row + 10 + i) =
+ *reinterpret_cast<int64_t*>(output_data + 10 + i) =
_mm256_extract_epi64(orval_v, 0);
}
-#endif // __AVX2__
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
- thetimes =
- std::max(0.0f, std::min(static_cast<float>(max_q), thetimes));
+ thetimes = thetimes < static_cast<float>(max_q)
+ ? thetimes
+ : static_cast<float>(max_q);
+ thetimes = thetimes > 0.0f ? thetimes : 0.0f;
uint8_t qval = nearbyint(thetimes);
- uint8_t orval = output_row[10 + i];
- output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
+ uint8_t orval = output_data[10 + i];
+ output_data[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
- }
+ } // !random
}
void decompress_and_dequantize__avx2(
const uint8_t* input_data,
float* output_data,
size_t input_size) {
- ConstEigenVectorArrayMap<float> input_row_min_max(
- reinterpret_cast<const float*>(input_data + 2), 2);
-
// basic info
- const float minimum_element = input_row_min_max(0);
- const float maximum_element = input_row_min_max(1);
+ const float minimum_element =
+ reinterpret_cast<const float*>(input_data + 2)[0];
+ const float maximum_element =
+ reinterpret_cast<const float*>(input_data + 2)[1];
const size_t bitwidth = input_data[0];
const float gap =
(maximum_element - minimum_element) / ((1 << bitwidth) - 1.f) +
QEPSILON; // for exact recovering
- CAFFE_ENFORCE(
- bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
- "Unsupported bitwidth");
const size_t tail = input_data[1];
const size_t output_size = (input_size - 10) * (8 / bitwidth) - tail;
- EigenVectorArrayMap<float> output_row(output_data, output_size);
// decoding
size_t bit_start = 0;
const size_t segment_size = input_size - 10;
@@ -259,7 +187,6 @@
: output_size - start;
uint8_t mask = (1 << bitwidth) - 1;
int i = 0;
-#ifdef __AVX2__
// Can process 8 elements at a time because we need to expand uint8_t
// to int32_t to use epi32 vector instructions.
constexpr int VLEN = 8;
@@ -269,19 +196,19 @@
__m256i out_epi32_v = _mm256_and_si256(
_mm256_srli_epi32(_mm256_cvtepu8_epi32(in_v), bit_start),
_mm256_set1_epi32(mask));
- _mm256_storeu_ps(
- output_data + start + i, _mm256_cvtepi32_ps(out_epi32_v));
+ __m256 out_v = _mm256_fmadd_ps(
+ _mm256_cvtepi32_ps(out_epi32_v),
+ _mm256_set1_ps(gap),
+ _mm256_set1_ps(minimum_element));
+ _mm256_storeu_ps(output_data + start + i, out_v);
}
-#endif
for (; i < stride; ++i) {
- output_data[start + i] = ((input_data[10 + i] >> bit_start) & mask);
+ output_data[start + i] =
+ ((input_data[10 + i] >> bit_start) & mask) * gap + minimum_element;
}
bit_start += bitwidth;
}
- // scaling and biasing
- output_row = output_row * gap + minimum_element;
}
-#undef QEPSILON
} // namespace math
} // namespace caffe2
diff --git a/caffe2/perfkernels/math_cpu_base.cc b/caffe2/perfkernels/math_cpu_base.cc
index 843cfa1..1837641 100644
--- a/caffe2/perfkernels/math_cpu_base.cc
+++ b/caffe2/perfkernels/math_cpu_base.cc
@@ -2,47 +2,16 @@
// The implementation in this file allows us to route the underlying numerical
// computation library to different compiler options (-mno-avx2 or -mavx2).
-#include <algorithm>
-#include <array>
-#include <atomic>
-#include <chrono>
-#include <cmath>
-#include <cstring>
-#include <functional>
-#include <limits>
-#include <numeric>
-#include <random>
-#include <tuple>
-#include <unordered_set>
-#include <vector>
+#include <cfloat>
-#include "caffe2/core/context.h"
-#include "caffe2/perfkernels/common.h"
-#include "caffe2/perfkernels/math.h"
-#include "caffe2/utils/cpu_neon.h"
-#include "caffe2/utils/cpuid.h"
-#include "caffe2/utils/eigen_utils.h"
-#include "caffe2/utils/math.h"
-
-#include "Eigen/Core"
-#include "Eigen/Dense"
-
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
-
-#ifdef CAFFE2_USE_HPTT
-#include <hptt.h>
-#endif // CAFFE2_USE_HPTT
-
-#if defined(_MSC_VER)
-#include <process.h>
-#endif
+#include "common.h"
+#include "math.h"
namespace caffe2 {
namespace math {
-#define QEPSILON 1e-8
+
+static constexpr double QEPSILON = 1e-8;
void quantize_and_compress__base(
const float* input_data,
@@ -50,55 +19,30 @@
size_t input_size,
size_t bitwidth,
bool random,
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- VSLStreamStatePtr& vslStream,
- std::vector<float>& random_buffer
-#else
- std::unique_ptr<std::uniform_real_distribution<float>>& dis,
- std::minstd_rand& gen
-#endif
-) {
- CAFFE_ENFORCE(
- bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
- "Unsupported bitwidth");
-
- // memory pointers
- ConstEigenVectorArrayMap<float> input_row(input_data, input_size);
- uint8_t* output_row = output_data;
- EigenVectorArrayMap<uint8_t> output_bitwidth_tail(output_row, 2);
- EigenVectorArrayMap<float> output_row_min_max(
- reinterpret_cast<float*>(output_row + 2), 2);
-
+ const float* random_buffer) {
size_t data_per_byte = 8 / bitwidth;
size_t tail = input_size % data_per_byte;
tail = tail ? data_per_byte - tail : 0;
size_t segment_size = (input_size + data_per_byte - 1) / data_per_byte;
// basic info
- const float minimum_element = input_row.minCoeff();
- const float maximum_element = input_row.maxCoeff();
- output_bitwidth_tail(0) = bitwidth;
- output_bitwidth_tail(1) = tail;
- output_row_min_max(0) = minimum_element;
- output_row_min_max(1) = maximum_element;
+ float minimum_element = INFINITY, maximum_element = -INFINITY;
+ for (auto i = 0; i < input_size; ++i) {
+ minimum_element =
+ input_data[i] < minimum_element ? input_data[i] : minimum_element;
+ maximum_element =
+ input_data[i] > maximum_element ? input_data[i] : maximum_element;
+ }
+ output_data[0] = bitwidth;
+ output_data[1] = tail;
+ reinterpret_cast<float*>(output_data + 2)[0] = minimum_element;
+ reinterpret_cast<float*>(output_data + 2)[1] = maximum_element;
float gap = (maximum_element - minimum_element) / ((1 << bitwidth) - 1.0f);
float gap_inverse = 1. / (gap + QEPSILON);
uint8_t max_q = (1 << bitwidth) - 1;
size_t bit_start = 0;
if (random) {
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- int status = vsRngUniform(
- VSL_RNG_METHOD_UNIFORM_STD,
- vslStream,
- input_size,
- random_buffer.data(),
- 0.0f,
- 1.0f);
- if (status != VSL_ERROR_OK) {
- LOG(WARNING) << "vsRngUniform returns " << status;
- }
-#endif
for (int start = 0; start < input_size; start += segment_size) {
size_t stride = start + segment_size <= input_size ? segment_size
: input_size - start;
@@ -106,16 +50,15 @@
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
float rounded = floor(thetimes + random_buffer[start + i]);
-#else
- float rounded = floor(thetimes + (*dis)(gen));
-#endif
- rounded = std::max(0.0f, std::min(static_cast<float>(max_q), rounded));
+ rounded = rounded < static_cast<float>(max_q)
+ ? rounded
+ : static_cast<float>(max_q);
+ rounded = rounded > 0.0f ? rounded : 0.0f;
uint8_t qval = rounded;
- uint8_t orval = output_row[10 + i];
- output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
+ uint8_t orval = output_data[10 + i];
+ output_data[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
@@ -127,12 +70,14 @@
for (; i < stride; ++i) {
float fval = input_data[start + i];
float thetimes = (fval - minimum_element) * gap_inverse;
- thetimes =
- std::max(0.0f, std::min(static_cast<float>(max_q), thetimes));
+ thetimes = thetimes < static_cast<float>(max_q)
+ ? thetimes
+ : static_cast<float>(max_q);
+ thetimes = thetimes > 0.0f ? thetimes : 0.0f;
uint8_t qval = nearbyint(thetimes);
- uint8_t orval = output_row[10 + i];
- output_row[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
+ uint8_t orval = output_data[10 + i];
+ output_data[10 + i] = orval | static_cast<uint8_t>(qval << bit_start);
}
bit_start += bitwidth;
}
@@ -145,15 +90,7 @@
size_t input_size,
size_t bitwidth,
bool random,
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
- VSLStreamStatePtr& vslStream,
- std::vector<float>& random_buffer
-#else
- std::unique_ptr<std::uniform_real_distribution<float>>& dis,
- std::minstd_rand& gen
-#endif
-) {
-#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
+ const float* random_buffer) {
AVX2_DO(
quantize_and_compress,
input_data,
@@ -161,7 +98,6 @@
input_size,
bitwidth,
random,
- vslStream,
random_buffer);
BASE_DO(
quantize_and_compress,
@@ -170,54 +106,26 @@
input_size,
bitwidth,
random,
- vslStream,
random_buffer);
-#else
- AVX2_DO(
- quantize_and_compress,
- input_data,
- output_data,
- input_size,
- bitwidth,
- random,
- dis,
- gen);
- BASE_DO(
- quantize_and_compress,
- input_data,
- output_data,
- input_size,
- bitwidth,
- random,
- dis,
- gen);
-#endif
}
void decompress_and_dequantize__base(
const uint8_t* input_data,
float* output_data,
size_t input_size) {
- // memory pointers ///
- ConstEigenVectorArrayMap<uint8_t> input_bitwidth_tail(input_data, 2);
- ConstEigenVectorArrayMap<float> input_row_min_max(
- reinterpret_cast<const float*>(input_data + 2), 2);
-
// basic info
- const float minimum_element = input_row_min_max(0);
- const float maximum_element = input_row_min_max(1);
+ const float minimum_element =
+ reinterpret_cast<const float*>(input_data + 2)[0];
+ const float maximum_element =
+ reinterpret_cast<const float*>(input_data + 2)[1];
const size_t bitwidth = input_data[0];
const float gap =
(maximum_element - minimum_element) / ((1 << bitwidth) - 1.f) +
QEPSILON; // for exact recovering
- CAFFE_ENFORCE(
- bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
- "Unsupported bitwidth");
const size_t tail = input_data[1];
const size_t output_size = (input_size - 10) * (8 / bitwidth) - tail;
- EigenVectorArrayMap<float> output_row(output_data, output_size);
// decoding
size_t bit_start = 0;
const size_t segment_size = input_size - 10;
@@ -227,12 +135,11 @@
uint8_t mask = (1 << bitwidth) - 1;
int i = 0;
for (; i < stride; ++i) {
- output_data[start + i] = ((input_data[10 + i] >> bit_start) & mask);
+ output_data[start + i] =
+ ((input_data[10 + i] >> bit_start) & mask) * gap + minimum_element;
}
bit_start += bitwidth;
}
- // scaling and biasing
- output_row = output_row * gap + minimum_element;
}
void decompress_and_dequantize(
@@ -243,6 +150,5 @@
BASE_DO(decompress_and_dequantize, input_data, output_data, input_size);
}
-#undef QEPSILON
} // namespace math
} // namespace caffe2
diff --git a/caffe2/perfkernels/typed_axpy_avx2.cc b/caffe2/perfkernels/typed_axpy_avx2.cc
index e4d4cbb..fa6c17e 100644
--- a/caffe2/perfkernels/typed_axpy_avx2.cc
+++ b/caffe2/perfkernels/typed_axpy_avx2.cc
@@ -1,8 +1,6 @@
-#include "caffe2/core/types.h"
#include "caffe2/perfkernels/cvtsh_ss_bugfix.h"
-#include "caffe2/perfkernels/typed_axpy.h"
-#include "caffe2/utils/math.h"
+#include <ATen/core/Half.h>
#include <emmintrin.h>
#include <immintrin.h>