qnnpack hardswish - LUTs (#36252)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/36252
Adds a baseline hardswish kernel to QNNPACK, implemented with a 256-entry lookup table (LUT).
Performance is 1.9 GB/s on a Nexus 6 and 2.2 GB/s on a Pixel 3 - the same as the other LUT-based ops.
The output scale and zero point are enforced to be equal to the input's, to match the server implementation.
Rewriting this as NEON kernels could yield a further speedup - saving that
for later, if we need it.
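For intuition, here is a minimal standalone sketch of the LUT approach (the helper names are illustrative only, not the QNNPACK API): a quint8 input has just 256 possible values, so hardswish is precomputed once per value at operator-creation time, and the per-element work reduces to a single table lookup.
```
#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Build a 256-entry table mapping every possible quint8 input to its
 * quantized hardswish output. The output scale/zero point equal the
 * input's, matching the constraint this operator enforces. */
static void build_hardswish_lut(
    uint8_t lut[256], float scale, uint8_t zero_point) {
  for (int32_t i = 0; i < 256; i++) {
    const float x = scale * (float)(i - (int32_t)zero_point); /* dequantize */
    const float y =
        x * fminf(fmaxf(x + 3.0f, 0.0f), 6.0f) / 6.0f; /* hardswish */
    float q = y / scale + (float)zero_point; /* requantize */
    q = fminf(fmaxf(q, 0.0f), 255.0f); /* clamp to quint8 range */
    lut[i] = (uint8_t)lrintf(q);
  }
}

/* Apply: one load per element, so throughput is memory-bandwidth bound. */
static void hardswish_lut_apply(
    const uint8_t lut[256], const uint8_t* in, uint8_t* out, size_t n) {
  for (size_t i = 0; i < n; i++) {
    out[i] = lut[in[i]];
  }
}
```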
Test Plan:
```
with-proxy ./scripts/build-local.sh
./build/local/hardswish-test
with-proxy scripts/build-android-armv7.sh
adb push ./build/android/armeabi-v7a/hardswish-* /data/qnnpack
adb shell
/data/qnnpack/hardswish-test
/data/qnnpack/hardswish-bench
with-proxy scripts/build-android-arm64.sh
adb push ./build/android/arm64-v8a/hardswish-* /data/qnnpack
adb shell
/data/qnnpack/hardswish-test
/data/qnnpack/hardswish-bench
```
Imported from OSS
Differential Revision: D20965044
fbshipit-source-id: 982938361971513cb15873438e12c23a38e819e3
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
index d6a6861..34a4ed8 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
@@ -143,6 +143,7 @@
src/fc-prepack.cc
src/fully-connected.c
src/global-average-pooling.c
+ src/hardswish.c
src/leaky-relu.c
src/max-pooling.c
src/sigmoid.c
@@ -461,6 +462,15 @@
target_link_libraries(tanh-test PRIVATE pytorch_qnnpack cpuinfo gtest gtest_main)
add_test(tanh-test tanh-test)
+ add_executable(hardswish-test test/hardswish.cc)
+ set_target_properties(hardswish-test PROPERTIES
+ CXX_STANDARD 14
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS NO)
+ target_include_directories(hardswish-test PRIVATE src test)
+ target_link_libraries(hardswish-test PRIVATE pytorch_qnnpack cpuinfo gtest gtest_main)
+ add_test(hardswish-test hardswish-test)
+
add_executable(max-pooling-test test/max-pooling.cc)
set_target_properties(max-pooling-test PROPERTIES
CXX_STANDARD 14
@@ -689,6 +699,13 @@
CXX_EXTENSIONS NO)
target_link_libraries(tanh-bench PRIVATE pytorch_qnnpack benchmark)
+ add_executable(hardswish-bench bench/hardswish.cc)
+ set_target_properties(hardswish-bench PROPERTIES
+ CXX_STANDARD 14
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS NO)
+ target_link_libraries(hardswish-bench PRIVATE pytorch_qnnpack benchmark)
+
add_executable(q8gemm-bench bench/q8gemm.cc)
set_target_properties(q8gemm-bench PROPERTIES
CXX_STANDARD 14
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/README.md b/aten/src/ATen/native/quantized/cpu/qnnpack/README.md
index c0129ba..cce6c4d 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/README.md
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/README.md
@@ -18,6 +18,7 @@
- [x] Sigmoid
- [x] TanH
- [x] Leaky ReLU
+- [x] Hardswish
- [x] Clamp (can be used for ReLU, ReLU6 if it is not fused in another operator)
- [x] SoftArgMax (aka SoftMax)
- [ ] Group Normalization
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/bench/hardswish.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/bench/hardswish.cc
new file mode 100644
index 0000000..e7549e3
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/bench/hardswish.cc
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <pytorch_qnnpack.h>
+
+#include <benchmark/benchmark.h>
+
+static void hardswish_q8(benchmark::State& state) {
+ const size_t batchSize = static_cast<size_t>(state.range(0));
+ const size_t channels = static_cast<size_t>(state.range(1));
+
+ std::random_device randomDevice;
+ auto rng = std::mt19937(randomDevice());
+ auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
+
+ std::vector<uint8_t> input(batchSize * channels);
+ std::vector<uint8_t> output(batchSize * channels);
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
+ std::fill(output.begin(), output.end(), 0xA5);
+
+ pytorch_qnnp_status status = pytorch_qnnp_initialize();
+ if (status != pytorch_qnnp_status_success) {
+ state.SkipWithError("failed to initialize QNNPACK");
+ }
+
+ pytorch_qnnp_operator_t hardswishOperator = nullptr;
+ status = pytorch_qnnp_create_hardswish_nc_q8(
+ channels,
+ 127 /* input zero point */,
+ 1.0f /* input scale */,
+ 127 /* output zero point */,
+ 1.0f /* output scale */,
+ 0 /* output min */,
+ 255 /* output max */,
+ 0 /* flags */,
+ &hardswishOperator);
+ if (status != pytorch_qnnp_status_success || hardswishOperator == nullptr) {
+ state.SkipWithError("failed to create Hardswish operator");
+ }
+
+ status = pytorch_qnnp_setup_hardswish_nc_q8(
+ hardswishOperator,
+ batchSize,
+ input.data(),
+ channels /* input:stride */,
+ output.data(),
+ channels /* output:stride */);
+ if (status != pytorch_qnnp_status_success) {
+ state.SkipWithError("failed to setup Hardswish operator");
+ }
+
+ for (auto _ : state) {
+ status =
+ pytorch_qnnp_run_operator(hardswishOperator, nullptr /* thread pool */);
+ if (status != pytorch_qnnp_status_success) {
+ state.SkipWithError("failed to run Hardswish operator");
+ }
+ }
+
+ const size_t itemsPerIteration = batchSize * channels;
+ state.SetItemsProcessed(
+ int64_t(state.iterations()) * int64_t(itemsPerIteration));
+
+ const size_t bytesPerIteration = 2 * itemsPerIteration * sizeof(uint8_t);
+ state.SetBytesProcessed(
+ int64_t(state.iterations()) * int64_t(bytesPerIteration));
+
+ status = pytorch_qnnp_delete_operator(hardswishOperator);
+ if (status != pytorch_qnnp_status_success) {
+ state.SkipWithError("failed to delete Hardswish operator");
+ }
+}
+
+static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
+ b->ArgNames({"N", "C"});
+
+ int32_t c = 16;
+ for (int32_t n = 224; n >= 7; n /= 2) {
+ b->Args({n * n, c});
+ c *= 2;
+ }
+}
+
+BENCHMARK(hardswish_q8)->Apply(CharacteristicArguments);
+
+#ifndef PYTORCH_QNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
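(Reading the headline numbers against this benchmark: `SetBytesProcessed` counts 2 bytes per element, one read and one write, so 2.2 GB/s on Pixel 3 is roughly 1.1 G elements/s.)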
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/configure.py b/aten/src/ATen/native/quantized/cpu/qnnpack/configure.py
index 743b658..a4d5d07 100755
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/configure.py
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/configure.py
@@ -87,6 +87,7 @@
build.cc("deconvolution.c"),
build.cc("fully-connected.c"),
build.cc("global-average-pooling.c"),
+ build.cc("hardswish.c"),
build.cc("leaky-relu.c"),
build.cc("max-pooling.c"),
build.cc("sigmoid.c"),
@@ -220,6 +221,7 @@
build.unittest("sigmoid-test", build.cxx("sigmoid.cc"))
build.unittest("softargmax-test", build.cxx("softargmax.cc"))
build.unittest("tanh-test", build.cxx("tanh.cc"))
+ build.unittest("hardswish-test", build.cxx("hardswish.cc"))
build.unittest(
"requantization-test",
[build.cxx("requantization.cc")] + requantization_objects,
@@ -258,6 +260,7 @@
build.benchmark("sigmoid-bench", build.cxx("sigmoid.cc"))
build.benchmark("softargmax-bench", build.cxx("softargmax.cc"))
build.benchmark("tanh-bench", build.cxx("tanh.cc"))
+ build.benchmark("hardswish-bench", build.cxx("hardswish.cc"))
build.benchmark("q8gemm-bench", build.cxx("q8gemm.cc"))
build.benchmark("hgemm-bench", build.cxx("hgemm.cc"))
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pytorch_qnnpack.h b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pytorch_qnnpack.h
index a1459bb..a728962 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/include/pytorch_qnnpack.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/include/pytorch_qnnpack.h
@@ -343,6 +343,25 @@
uint8_t* output,
size_t output_stride);
+enum pytorch_qnnp_status pytorch_qnnp_create_hardswish_nc_q8(
+ size_t channels,
+ uint8_t input_zero_point,
+ float input_scale,
+ uint8_t output_zero_point,
+ float output_scale,
+ uint8_t output_min,
+ uint8_t output_max,
+ uint32_t flags,
+ pytorch_qnnp_operator_t* hardswish);
+
+enum pytorch_qnnp_status pytorch_qnnp_setup_hardswish_nc_q8(
+ pytorch_qnnp_operator_t hardswish,
+ size_t batch_size,
+ const uint8_t* input,
+ size_t input_stride,
+ uint8_t* output,
+ size_t output_stride);
+
enum pytorch_qnnp_status pytorch_qnnp_run_operator(
pytorch_qnnp_operator_t op,
pthreadpool_t threadpool);
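For reference, a minimal sketch of driving the new operator end to end through this API (status checks elided; the create call rejects an output scale or zero point that differs from the input's):
```
#include <stddef.h>
#include <stdint.h>

#include <pytorch_qnnpack.h>

/* Sketch only: real callers should check each returned pytorch_qnnp_status. */
void hardswish_q8_example(
    const uint8_t* input, uint8_t* output, size_t batch_size, size_t channels) {
  pytorch_qnnp_initialize();
  pytorch_qnnp_operator_t op = NULL;
  pytorch_qnnp_create_hardswish_nc_q8(
      channels,
      127 /* input zero point */,
      1.0f /* input scale */,
      127 /* output zero point, must equal input's */,
      1.0f /* output scale, must equal input's */,
      0 /* output min */,
      255 /* output max */,
      0 /* flags */,
      &op);
  pytorch_qnnp_setup_hardswish_nc_q8(
      op, batch_size, input, channels, output, channels);
  pytorch_qnnp_run_operator(op, NULL /* thread pool */);
  pytorch_qnnp_delete_operator(op);
}
```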
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/hardswish.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/hardswish.c
new file mode 100644
index 0000000..fba280c
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/hardswish.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <pytorch_qnnpack.h>
+#include <qnnpack/log.h>
+#include <qnnpack/operator.h>
+
+enum pytorch_qnnp_status pytorch_qnnp_create_hardswish_nc_q8(
+ size_t channels,
+ uint8_t input_zero_point,
+ float input_scale,
+ uint8_t output_zero_point,
+ float output_scale,
+ uint8_t output_min,
+ uint8_t output_max,
+ uint32_t flags,
+ pytorch_qnnp_operator_t* hardswish_out) {
+ pytorch_qnnp_operator_t hardswish_op = NULL;
+ enum pytorch_qnnp_status status = pytorch_qnnp_status_uninitialized;
+
+ if (!pytorch_qnnp_params.initialized) {
+ pytorch_qnnp_log_error(
+ "pytorch_qnnp_create_hardswish_nc_q8 failed because QNNPACK is not properly initialized");
+ goto error;
+ }
+
+ status = pytorch_qnnp_status_invalid_parameter;
+
+ if (channels == 0) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with %zu channels: number of channels must be non-zero",
+ channels);
+ goto error;
+ }
+
+ if (input_scale <= 0.0f || !isnormal(input_scale)) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with %.7g input scale: scale must be finite and positive",
+ input_scale);
+ goto error;
+ }
+
+ if (output_scale <= 0.0f || !isnormal(output_scale)) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with %.7g output scale: scale must be finite and positive",
+ output_scale);
+ goto error;
+ }
+
+ if (output_min >= output_max) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with [%" PRIu8 ", %" PRIu8
+ "] output range: range min must be below range max",
+ output_min,
+ output_max);
+ goto error;
+ }
+
+ status = pytorch_qnnp_status_unsupported_parameter;
+
+ if (output_scale != input_scale) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with %.7g output scale: only output scale equal to input scale is supported",
+ output_scale);
+ goto error;
+ }
+
+ if (output_zero_point != input_zero_point) {
+ pytorch_qnnp_log_error(
+ "failed to create Hardswish operator with %" PRIu8
+ " output zero point: only output zero point equal to input zero point is supported",
+ output_zero_point);
+ goto error;
+ }
+
+ status = pytorch_qnnp_status_out_of_memory;
+
+ hardswish_op = calloc(1, sizeof(struct pytorch_qnnp_operator));
+ if (hardswish_op == NULL) {
+ pytorch_qnnp_log_error(
+ "failed to allocate %zu bytes for pytorch_qnnp_operator structure",
+ sizeof(struct pytorch_qnnp_operator));
+ goto error;
+ }
+
+ hardswish_op->lookup_table = malloc(256 * sizeof(uint8_t));
+ if (hardswish_op->lookup_table == NULL) {
+ pytorch_qnnp_log_error(
+ "failed to allocate 256 bytes for Hardswish lookup table");
+ goto error;
+ }
+
+ uint8_t* lookup_table = hardswish_op->lookup_table;
+ const float scaled_min = (float)(int32_t)output_min;
+ const float scaled_max = (float)(int32_t)output_max;
+ const float inv_output_scale = 1.0f / output_scale;
+ for (int32_t i = 0; i < 256; i++) {
+ float x =
+ input_scale * (float)(i - (int32_t)(uint32_t)input_zero_point);
+ // hardswish(x) = x * min(max(x + 3, 0), 6) / 6, with the clamp written out as ternaries
+ float x2 = x + 3.0f;
+ x2 = x2 > 0.0f ? x2 : 0.0f;
+ x2 = x2 < 6.0f ? x2 : 6.0f;
+ x2 = x * x2 / 6.0f;
+ float scaled_hardswish_x = inv_output_scale * x2 + output_zero_point;
+ if (scaled_hardswish_x < scaled_min) {
+ scaled_hardswish_x = scaled_min;
+ }
+ if (scaled_hardswish_x > scaled_max) {
+ scaled_hardswish_x = scaled_max;
+ }
+ lookup_table[(uint32_t)i] = (uint8_t)lrintf(scaled_hardswish_x);
+ }
+
+ hardswish_op->channels = channels;
+
+ hardswish_op->ukernel_type = pytorch_qnnp_ukernel_type_lut;
+ hardswish_op->format = pytorch_qnnp_format_quint8;
+
+ *hardswish_out = hardswish_op;
+ return pytorch_qnnp_status_success;
+
+error:
+ pytorch_qnnp_delete_operator(hardswish_op);
+ return status;
+}
+
+enum pytorch_qnnp_status pytorch_qnnp_setup_hardswish_nc_q8(
+ pytorch_qnnp_operator_t hardswish,
+ size_t batch_size,
+ const uint8_t* input,
+ size_t input_stride,
+ uint8_t* output,
+ size_t output_stride) {
+ if (!pytorch_qnnp_params.initialized) {
+ pytorch_qnnp_log_error(
+ "pytorch_qnnp_setup_hardswish_nc_q8 failed because QNNPACK is not properly initialized");
+ return pytorch_qnnp_status_uninitialized;
+ }
+
+ if (batch_size == 0) {
+ hardswish->batch_size = 0;
+ return pytorch_qnnp_status_success;
+ }
+
+ hardswish->batch_size = batch_size;
+ hardswish->input = input;
+ hardswish->input_pixel_stride = input_stride;
+ hardswish->output = output;
+ hardswish->output_pixel_stride = output_stride;
+
+ return pytorch_qnnp_status_success;
+}
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish-operator-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish-operator-tester.h
new file mode 100644
index 0000000..0b95029
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish-operator-tester.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <pytorch_qnnpack.h>
+
+class HardswishOperatorTester {
+ public:
+ inline HardswishOperatorTester& channels(size_t channels) {
+ assert(channels != 0);
+ this->channels_ = channels;
+ return *this;
+ }
+
+ inline size_t channels() const {
+ return this->channels_;
+ }
+
+ inline HardswishOperatorTester& inputStride(size_t inputStride) {
+ assert(inputStride != 0);
+ this->inputStride_ = inputStride;
+ return *this;
+ }
+
+ inline size_t inputStride() const {
+ if (this->inputStride_ == 0) {
+ return this->channels_;
+ } else {
+ assert(this->inputStride_ >= this->channels_);
+ return this->inputStride_;
+ }
+ }
+
+ inline HardswishOperatorTester& outputStride(size_t outputStride) {
+ assert(outputStride != 0);
+ this->outputStride_ = outputStride;
+ return *this;
+ }
+
+ inline size_t outputStride() const {
+ if (this->outputStride_ == 0) {
+ return this->channels_;
+ } else {
+ assert(this->outputStride_ >= this->channels_);
+ return this->outputStride_;
+ }
+ }
+
+ inline HardswishOperatorTester& batchSize(size_t batchSize) {
+ this->batchSize_ = batchSize;
+ return *this;
+ }
+
+ inline size_t batchSize() const {
+ return this->batchSize_;
+ }
+
+ inline HardswishOperatorTester& inputScale(float inputScale) {
+ assert(inputScale > 0.0f);
+ assert(std::isnormal(inputScale));
+ this->inputScale_ = inputScale;
+ return *this;
+ }
+
+ inline float inputScale() const {
+ return this->inputScale_;
+ }
+
+ inline HardswishOperatorTester& inputZeroPoint(uint8_t inputZeroPoint) {
+ this->inputZeroPoint_ = inputZeroPoint;
+ return *this;
+ }
+
+ inline uint8_t inputZeroPoint() const {
+ return this->inputZeroPoint_;
+ }
+
+ inline float outputScale() const {
+ return this->inputScale_;
+ }
+
+ inline uint8_t outputZeroPoint() const {
+ return this->inputZeroPoint_;
+ }
+
+ inline HardswishOperatorTester& qmin(uint8_t qmin) {
+ this->qmin_ = qmin;
+ return *this;
+ }
+
+ inline uint8_t qmin() const {
+ return this->qmin_;
+ }
+
+ inline HardswishOperatorTester& qmax(uint8_t qmax) {
+ this->qmax_ = qmax;
+ return *this;
+ }
+
+ inline uint8_t qmax() const {
+ return this->qmax_;
+ }
+
+ inline HardswishOperatorTester& iterations(size_t iterations) {
+ this->iterations_ = iterations;
+ return *this;
+ }
+
+ inline size_t iterations() const {
+ return this->iterations_;
+ }
+
+ void testQ8() const {
+ std::random_device randomDevice;
+ auto rng = std::mt19937(randomDevice());
+ auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
+
+ std::vector<uint8_t> input((batchSize() - 1) * inputStride() + channels());
+ std::vector<uint8_t> output(
+ (batchSize() - 1) * outputStride() + channels());
+ std::vector<float> outputRef(batchSize() * channels());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
+ std::fill(output.begin(), output.end(), 0xA5);
+
+ /* Compute reference results */
+ for (size_t i = 0; i < batchSize(); i++) {
+ for (size_t c = 0; c < channels(); c++) {
+ const float x = inputScale() *
+ (int32_t(input[i * inputStride() + c]) -
+ int32_t(inputZeroPoint()));
+ const float hardswishX =
+ x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
+ const float scaledHardswishX = hardswishX / outputScale();
+ float y = scaledHardswishX;
+ y = std::min<float>(y, int32_t(qmax()) - int32_t(outputZeroPoint()));
+ y = std::max<float>(y, int32_t(qmin()) - int32_t(outputZeroPoint()));
+ outputRef[i * channels() + c] = y + int32_t(outputZeroPoint());
+ }
+ }
+
+ /* Create, setup, run, and destroy Hardswish operator */
+ ASSERT_EQ(pytorch_qnnp_status_success, pytorch_qnnp_initialize());
+ pytorch_qnnp_operator_t hardswishOp = nullptr;
+
+ ASSERT_EQ(
+ pytorch_qnnp_status_success,
+ pytorch_qnnp_create_hardswish_nc_q8(
+ channels(),
+ inputZeroPoint(),
+ inputScale(),
+ outputZeroPoint(),
+ outputScale(),
+ qmin(),
+ qmax(),
+ 0,
+ &hardswishOp));
+ ASSERT_NE(nullptr, hardswishOp);
+
+ ASSERT_EQ(
+ pytorch_qnnp_status_success,
+ pytorch_qnnp_setup_hardswish_nc_q8(
+ hardswishOp,
+ batchSize(),
+ input.data(),
+ inputStride(),
+ output.data(),
+ outputStride()));
+
+ ASSERT_EQ(
+ pytorch_qnnp_status_success,
+ pytorch_qnnp_run_operator(hardswishOp, nullptr /* thread pool */));
+
+ ASSERT_EQ(
+ pytorch_qnnp_status_success, pytorch_qnnp_delete_operator(hardswishOp));
+ hardswishOp = nullptr;
+
+ /* Verify results */
+ for (size_t i = 0; i < batchSize(); i++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_NEAR(
+ float(int32_t(output[i * outputStride() + c])),
+ outputRef[i * channels() + c],
+ 0.6f);
+ }
+ }
+ }
+ }
+
+ private:
+ size_t batchSize_{1};
+ size_t channels_{1};
+ size_t inputStride_{0};
+ size_t outputStride_{0};
+ float inputScale_{0.75f};
+ uint8_t inputZeroPoint_{121};
+ uint8_t qmin_{0};
+ uint8_t qmax_{255};
+ size_t iterations_{15};
+};
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish.cc
new file mode 100644
index 0000000..98170d6
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/hardswish.cc
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include "hardswish-operator-tester.h"
+
+#include <qnnpack/params.h>
+
+TEST(HARDSWISH_OP, zero_batch) {
+ HardswishOperatorTester().batchSize(0).channels(8).iterations(1).testQ8();
+}
+
+TEST(HARDSWISH_OP, unit_batch) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(1)
+ .channels(channels)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, unit_batch_with_qmin) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(1)
+ .channels(channels)
+ .qmin(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, unit_batch_with_qmax) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(1)
+ .channels(channels)
+ .qmax(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, unit_batch_with_input_scale) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (float inputScale = 1.0e-2f; inputScale < 1.0e+2f;
+ inputScale *= 10.0f) {
+ HardswishOperatorTester()
+ .batchSize(1)
+ .channels(channels)
+ .inputScale(inputScale)
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}
+
+TEST(HARDSWISH_OP, unit_batch_with_input_zero_point) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (int32_t inputZeroPoint = 0; inputZeroPoint <= 255;
+ inputZeroPoint += 51) {
+ HardswishOperatorTester()
+ .batchSize(1)
+ .channels(channels)
+ .inputZeroPoint(uint8_t(inputZeroPoint))
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_input_stride) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_output_stride) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .outputStride(117)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_qmin) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .qmin(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_qmax) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .qmax(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_input_scale) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (float inputScale = 1.0e-2f; inputScale < 1.0e+2f;
+ inputScale *= 10.0f) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputScale(inputScale)
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}
+
+TEST(HARDSWISH_OP, small_batch_with_input_zero_point) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (int32_t inputZeroPoint = 0; inputZeroPoint <= 255;
+ inputZeroPoint += 51) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputZeroPoint(uint8_t(inputZeroPoint))
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}
+
+TEST(HARDSWISH_OP, strided_batch) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .outputStride(117)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, strided_batch_with_qmin) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .outputStride(117)
+ .qmin(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, strided_batch_with_qmax) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .outputStride(117)
+ .qmax(128)
+ .iterations(3)
+ .testQ8();
+ }
+}
+
+TEST(HARDSWISH_OP, strided_batch_with_input_scale) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (float inputScale = 1.0e-2f; inputScale < 1.0e+2f;
+ inputScale *= 10.0f) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .outputStride(117)
+ .inputScale(inputScale)
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}
+
+TEST(HARDSWISH_OP, strided_batch_with_input_zero_point) {
+ for (size_t channels = 1; channels < 100; channels += 15) {
+ for (int32_t inputZeroPoint = 0; inputZeroPoint <= 255;
+ inputZeroPoint += 51) {
+ HardswishOperatorTester()
+ .batchSize(3)
+ .channels(channels)
+ .inputStride(129)
+ .outputStride(117)
+ .inputZeroPoint(uint8_t(inputZeroPoint))
+ .iterations(1)
+ .testQ8();
+ }
+ }
+}