Remove SSE4.2 and VNNI placeholder code for now.
PiperOrigin-RevId: 317157906
diff --git a/ruy/BUILD b/ruy/BUILD
index bf5c7e5..00af835 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -1,8 +1,8 @@
# Ruy is not BLAS
load("@bazel_skylib//lib:selects.bzl", "selects")
-load(":build_defs.bzl", "ruy_copts", "ruy_copts_avx2", "ruy_copts_avx512", "ruy_copts_sse42")
-load(":build_defs.oss.bzl", "ruy_copts_avxvnni", "ruy_linkopts_thread_standard_library")
+load(":build_defs.bzl", "ruy_copts", "ruy_copts_avx2", "ruy_copts_avx512")
+load(":build_defs.oss.bzl", "ruy_linkopts_thread_standard_library")
load(":ruy_test_ext.oss.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
load(":ruy_test.bzl", "ruy_benchmark", "ruy_test")
@@ -621,106 +621,6 @@
],
)
-# TODO(b/147376783): SSE 4.2 support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-cc_library(
- name = "kernel_sse42",
- srcs = [
- "kernel_sse42.cc",
- ],
- copts = ruy_copts() + ruy_copts_sse42(),
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_sse42",
- srcs = [
- "pack_sse42.cc",
- ],
- copts = ruy_copts() + ruy_copts_sse42(),
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_sse42",
- srcs = [
- "have_built_path_for_sse42.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = ruy_copts() + ruy_copts_sse42(),
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-
-# TODO(b/147376783): AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-cc_library(
- name = "kernel_avxvnni",
- srcs = [
- "kernel_avxvnni.cc",
- ],
- copts = ruy_copts() + ruy_copts_avxvnni(),
- deps = [
- ":check_macros",
- ":kernel_common",
- ":opt_set",
- ":platform",
- "//ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "pack_avxvnni",
- srcs = [
- "pack_avxvnni.cc",
- ],
- copts = ruy_copts() + ruy_copts_avxvnni(),
- deps = [
- ":check_macros",
- ":matrix",
- ":opt_set",
- ":pack_common",
- ":path",
- ":platform",
- "//ruy/profiler:instrumentation",
- ],
-)
-
-cc_library(
- name = "have_built_path_for_avxvnni",
- srcs = [
- "have_built_path_for_avxvnni.cc",
- ],
- hdrs = [
- "have_built_path_for.h",
- ],
- copts = ruy_copts() + ruy_copts_avxvnni(),
- deps = [
- ":opt_set",
- ":platform",
- ],
-)
-
cc_library(
name = "kernel",
hdrs = [
@@ -735,9 +635,7 @@
":kernel_arm", # fixdeps: keep
":kernel_avx2", # fixdeps: keep
":kernel_avx512", # fixdeps: keep
- ":kernel_avxvnni", # fixdeps: keep
":kernel_common",
- ":kernel_sse42", # fixdeps: keep
":mat",
":matrix",
":mul_params",
@@ -767,9 +665,7 @@
":pack_arm", # fixdeps: keep
":pack_avx2", # fixdeps: keep
":pack_avx512", # fixdeps: keep
- ":pack_avxvnni", # fixdeps: keep
":pack_common",
- ":pack_sse42", # fixdeps: keep
":path",
":platform",
":tune",
@@ -785,8 +681,6 @@
deps = [
":have_built_path_for_avx2",
":have_built_path_for_avx512",
- ":have_built_path_for_avxvnni",
- ":have_built_path_for_sse42",
":platform",
],
)
diff --git a/ruy/build_defs.bzl b/ruy/build_defs.bzl
index 1da3b1b..594c4d9 100644
--- a/ruy/build_defs.bzl
+++ b/ruy/build_defs.bzl
@@ -68,12 +68,3 @@
"//ruy:x86_64": ["-mavx2", "-mfma"],
"//conditions:default": [],
})
-
-# TODO(b/147376783): SSE 4.2 support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-def ruy_copts_sse42():
- return select({
- "//ruy:x86_64": ["-msse4.2"],
- "//conditions:default": [],
- })
diff --git a/ruy/build_defs.oss.bzl b/ruy/build_defs.oss.bzl
index 92eac57..e405b41 100644
--- a/ruy/build_defs.oss.bzl
+++ b/ruy/build_defs.oss.bzl
@@ -1,13 +1,5 @@
"""Build definitions for Ruy that are specific to the open-source build."""
-# TODO(b/147376783): VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# At the moment this does nothing because current toolchains don't support VNNI.
-def ruy_copts_avxvnni():
- return []
-
# Used for targets that #include <thread>
def ruy_linkopts_thread_standard_library():
# In open source builds, GCC is a common occurence. It requires "-pthread"
diff --git a/ruy/ctx.cc b/ruy/ctx.cc
index 87faec3..9924351 100644
--- a/ruy/ctx.cc
+++ b/ruy/ctx.cc
@@ -100,14 +100,10 @@
#elif RUY_PLATFORM_X86
// x86 SIMD paths currently require both runtime detection, and detection of
// whether we're building the path at all.
- maybe_add(Path::kSse42,
- [=]() { return HaveBuiltPathForSse42() && cpuinfo->Sse42(); });
maybe_add(Path::kAvx2,
[=]() { return HaveBuiltPathForAvx2() && cpuinfo->Avx2(); });
maybe_add(Path::kAvx512,
[=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
- maybe_add(Path::kAvxVnni,
- [=]() { return HaveBuiltPathForAvxVnni() && cpuinfo->AvxVnni(); });
#else
(void)maybe_add;
(void)cpuinfo;
diff --git a/ruy/have_built_path_for.h b/ruy/have_built_path_for.h
index f0b1cd6..94761a7 100644
--- a/ruy/have_built_path_for.h
+++ b/ruy/have_built_path_for.h
@@ -21,10 +21,8 @@
namespace ruy {
#if RUY_PLATFORM_X86
-bool HaveBuiltPathForSse42();
bool HaveBuiltPathForAvx2();
bool HaveBuiltPathForAvx512();
-bool HaveBuiltPathForAvxVnni();
#endif // RUY_PLATFORM_X86
} // namespace ruy
diff --git a/ruy/have_built_path_for_avxvnni.cc b/ruy/have_built_path_for_avxvnni.cc
deleted file mode 100644
index f6e719f..0000000
--- a/ruy/have_built_path_for_avxvnni.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/have_built_path_for.h"
-#include "ruy/opt_set.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-// IMPORTANT:
-// These patterns must match those in the pack and kernel cc files.
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-bool HaveBuiltPathForAvxVnni() { return false; }
-
-#else // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-bool HaveBuiltPathForAvxVnni() { return true; }
-
-#endif // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-#endif // RUY_PLATFORM_X86
-
-} // namespace ruy
diff --git a/ruy/have_built_path_for_sse42.cc b/ruy/have_built_path_for_sse42.cc
deleted file mode 100644
index 182f0d9..0000000
--- a/ruy/have_built_path_for_sse42.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/have_built_path_for.h"
-#include "ruy/opt_set.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-// IMPORTANT:
-// These patterns must match those in the pack and kernel cc files.
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-bool HaveBuiltPathForSse42() { return false; }
-
-#else // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-bool HaveBuiltPathForSse42() { return true; }
-
-#endif // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-#endif // RUY_PLATFORM_X86
-
-} // namespace ruy
diff --git a/ruy/kernel_avxvnni.cc b/ruy/kernel_avxvnni.cc
deleted file mode 100644
index 0045185..0000000
--- a/ruy/kernel_avxvnni.cc
+++ /dev/null
@@ -1,435 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cstdint>
-
-#include "ruy/check_macros.h"
-#include "ruy/kernel.h"
-#include "ruy/opt_set.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-#include <immintrin.h> // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>&) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>&) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-#else // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-static constexpr int kAvxFloatBlockSize = 16;
-static constexpr int kAvx8bitBlockSize = 16;
-static constexpr int kAvx8bitInnerSize = 4;
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>& params) {
- profiler::ScopeLabel label("Kernel kAvxVnni 8-bit (UNFINISHED)");
-
- std::int32_t accum_data[kAvx8bitBlockSize][kAvx8bitBlockSize];
-
- int bias_ptr_block_increment =
- params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvx8bitBlockSize : 0;
-
- const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
- void* dst_col_ptr = params.dst_base_ptr;
- const std::int32_t* bias_col_ptr = params.bias;
- if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
- bias_col_ptr += params.start_row;
- }
-
- for (int col = params.start_col; col <= params.last_col;
- col += kAvx8bitBlockSize) {
- const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
- void* dst_ptr = dst_col_ptr;
- const std::int32_t* bias_ptr = bias_col_ptr;
-
- for (int row = params.start_row; row <= params.last_row;
- row += kAvx8bitBlockSize) {
- const int residual_rows =
- std::min(params.dst_rows - row, kAvx8bitBlockSize);
- const int residual_cols =
- std::min(params.dst_cols - col, kAvx8bitBlockSize);
-
- // Initialize with bias.
- std::int32_t initial_accum_data[kAvx8bitBlockSize];
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- initial_accum_data[i] = 0;
- }
- for (int i = 0; i < residual_rows; ++i) {
- initial_accum_data[i] = bias_ptr[i];
- }
-
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] = initial_accum_data[i];
- }
- }
- bias_ptr += bias_ptr_block_increment;
-
- std::int8_t lhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
- std::int8_t rhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
- const std::int8_t* lhs_ptr = lhs_col_ptr;
- const std::int8_t* rhs_ptr = rhs_col_ptr;
- for (int d = 0; d < params.depth; d += kAvx8bitInnerSize) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- for (int x = 0; x < kAvx8bitInnerSize; ++x) {
- lhs_data[i][x] = lhs_ptr[i * kAvx8bitInnerSize + x];
- rhs_data[i][x] = rhs_ptr[i * kAvx8bitInnerSize + x];
- }
- }
-
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- for (int x = 0; x < kAvx8bitInnerSize; ++x) {
- accum_data[j][i] += lhs_data[i][x] * rhs_data[j][x];
- }
- }
- }
-
- lhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
- rhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
- }
-
- if ((params.flags & RUY_ASM_FLAG_HAS_LHS_SUMS) && params.rhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] -=
- params.rhs_zero_point * params.lhs_sums[row + i];
- }
- }
- }
- if ((params.flags & RUY_ASM_FLAG_HAS_RHS_SUMS) && params.lhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] -=
- params.lhs_zero_point * params.rhs_sums[col + j];
- }
- }
- }
- if (params.lhs_zero_point && params.rhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] += params.prod_zp_depth;
- }
- }
- }
-
- if (params.dst_type_id != DstTypeId<std::int32_t>::kValue) {
- std::int32_t m_vector[kAvx8bitBlockSize];
- std::int32_t e_vector[kAvx8bitBlockSize];
- // Does not make use of RUY_ASM_FLAG_NEEDS_LEFT_SHIFT.
- if (params.flags & RUY_ASM_FLAG_HAS_PERCHANNEL) {
- int i = 0;
- for (; i < residual_rows; ++i) {
- m_vector[i] = params.multiplier_fixedpoint[row + i];
- e_vector[i] = params.multiplier_exponent[row + i];
- }
- for (; i < kAvx8bitBlockSize; ++i) {
- m_vector[i] = m_vector[0];
- e_vector[i] = e_vector[0];
- }
- } else {
- // These arrays have size LhsCols, and are pre-filled.
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- m_vector[i] = params.multiplier_fixedpoint[i];
- e_vector[i] = params.multiplier_exponent[i];
- }
- }
-
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] = MultiplyByQuantizedMultiplier(
- accum_data[j][i], m_vector[i], e_vector[i]);
- }
- }
-
- if (params.dst_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] += params.dst_zero_point;
- }
- }
- }
-
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] =
- std::min<std::int32_t>(accum_data[j][i], params.clamp_max);
- accum_data[j][i] =
- std::max<std::int32_t>(accum_data[j][i], params.clamp_min);
- }
- }
- }
-
- const bool store_full_block = (residual_rows == kAvx8bitBlockSize) &&
- (residual_cols == kAvx8bitBlockSize);
-
- if (params.dst_type_id == DstTypeId<std::int8_t>::kValue) {
- std::int8_t* tmp_ptr =
- store_full_block
- ? static_cast<std::int8_t*>(dst_ptr)
- : const_cast<std::int8_t*>(
- reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf));
- const int block_col_offset =
- store_full_block ? params.dst_stride / sizeof(std::int8_t)
- : kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
-
- if (!store_full_block) {
- const std::int8_t* block_ptr =
- reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- static_cast<std::int8_t*>(
- dst_ptr)[j * params.dst_stride / sizeof(std::int8_t) + i] =
- block_ptr[i];
- }
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int8_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::uint8_t>::kValue) {
- std::uint8_t* tmp_ptr = store_full_block
- ? static_cast<std::uint8_t*>(dst_ptr)
- : const_cast<std::uint8_t*>(
- reinterpret_cast<const std::uint8_t*>(
- params.dst_tmp_buf));
- const int block_col_offset =
- store_full_block ? params.dst_stride : kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
-
- if (!store_full_block) {
- const std::uint8_t* block_ptr =
- reinterpret_cast<const std::uint8_t*>(params.dst_tmp_buf);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- static_cast<std::uint8_t*>(
- dst_ptr)[j * params.dst_stride / sizeof(std::uint8_t) + i] =
- block_ptr[i];
- }
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::uint8_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::int16_t>::kValue) {
- if (store_full_block) {
- std::int16_t* tmp_ptr = static_cast<std::int16_t*>(dst_ptr);
- const int block_col_offset = params.dst_stride / sizeof(std::int16_t);
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- } else {
- std::int16_t* tmp_ptr = const_cast<std::int16_t*>(
- reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf));
- const int block_col_offset = kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- const std::int16_t* block_ptr =
- reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf);
- std::int16_t* dst_block_ptr = static_cast<std::int16_t*>(dst_ptr);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_block_ptr[i] = block_ptr[i];
- }
- dst_block_ptr += params.dst_stride / sizeof(std::int16_t);
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int16_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::int32_t>::kValue) {
- if (store_full_block) {
- std::int32_t* tmp_ptr = static_cast<std::int32_t*>(dst_ptr);
- const int block_col_offset = params.dst_stride / sizeof(std::int32_t);
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- } else {
- std::int32_t* dst_block_ptr = static_cast<std::int32_t*>(dst_ptr);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_block_ptr[i] = accum_data[j][i];
- }
- dst_block_ptr += params.dst_stride / sizeof(std::int32_t);
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int32_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else {
- RUY_DCHECK(false);
- }
-
- lhs_col_ptr += kAvx8bitBlockSize * params.lhs_stride;
- } // End row-block loop.
-
- dst_col_ptr = static_cast<void*>(static_cast<char*>(dst_col_ptr) +
- kAvx8bitBlockSize * params.dst_stride);
- rhs_col_ptr += kAvx8bitBlockSize * params.rhs_stride;
- } // End col-block loop.
-} // NOLINT(readability/fn_size)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>& params) {
- profiler::ScopeLabel label("Kernel kAvxVnni float (UNFINISHED)");
-
- float lhs_data[kAvxFloatBlockSize];
- float rhs_data[kAvxFloatBlockSize];
- float accum_data[kAvxFloatBlockSize][kAvxFloatBlockSize];
- int bias_ptr_block_increment =
- params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvxFloatBlockSize : 0;
-
- const float* rhs_col_ptr = params.rhs_base_ptr;
- float* dst_col_ptr = params.dst_base_ptr;
- const float* bias_col_ptr = params.bias;
- if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
- bias_col_ptr += params.start_row;
- }
-
- for (int col = params.start_col; col <= params.last_col;
- col += kAvxFloatBlockSize) {
- const float* lhs_col_ptr = params.lhs_base_ptr;
- float* dst_ptr = dst_col_ptr;
- const float* bias_ptr = bias_col_ptr;
-
- for (int row = params.start_row; row <= params.last_row;
- row += kAvxFloatBlockSize) {
- const int residual_rows =
- std::min(params.dst_rows - row, kAvxFloatBlockSize);
- const int residual_cols =
- std::min(params.dst_cols - col, kAvxFloatBlockSize);
-
- // Initialize with bias.
- float initial_accum_data[kAvxFloatBlockSize];
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- initial_accum_data[i] = 0.0f;
- }
- for (int i = 0; i < residual_rows; ++i) {
- initial_accum_data[i] = bias_ptr[i];
- }
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] = initial_accum_data[i];
- }
- }
- bias_ptr += bias_ptr_block_increment;
-
- const float* lhs_ptr = lhs_col_ptr;
- const float* rhs_ptr = rhs_col_ptr;
- for (int d = 0; d < params.depth; ++d) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- lhs_data[i] = lhs_ptr[i];
- rhs_data[i] = rhs_ptr[i];
- }
-
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] += lhs_data[i] * rhs_data[j];
- }
- }
-
- lhs_ptr += kAvxFloatBlockSize;
- rhs_ptr += kAvxFloatBlockSize;
- }
-
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] =
- std::min<float>(accum_data[j][i], params.clamp_max);
- accum_data[j][i] =
- std::max<float>(accum_data[j][i], params.clamp_min);
- }
- }
-
- const bool store_full_block = (residual_rows == kAvxFloatBlockSize) &&
- (residual_cols == kAvxFloatBlockSize);
-
- {
- float* block_ptr =
- store_full_block ? dst_ptr : const_cast<float*>(params.dst_tmp_buf);
- const int block_col_offset = store_full_block
- ? params.dst_stride / sizeof(float)
- : kAvxFloatBlockSize;
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- block_ptr[i] = accum_data[j][i];
- }
- block_ptr += block_col_offset;
- }
- }
- if (!store_full_block) {
- const float* block_ptr = params.dst_tmp_buf;
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_ptr[j * params.dst_stride / sizeof(float) + i] = block_ptr[i];
- }
- block_ptr += kAvxFloatBlockSize;
- }
- }
-
- lhs_col_ptr += kAvxFloatBlockSize * params.lhs_stride / sizeof(float);
- dst_ptr += kAvxFloatBlockSize;
- } // End row-block loop.
-
- dst_col_ptr += kAvxFloatBlockSize * params.dst_stride / sizeof(float);
- rhs_col_ptr += kAvxFloatBlockSize * params.rhs_stride / sizeof(float);
- } // End col-block loop.
-}
-
-#endif // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-} // namespace ruy
diff --git a/ruy/kernel_sse42.cc b/ruy/kernel_sse42.cc
deleted file mode 100644
index 962b6e9..0000000
--- a/ruy/kernel_sse42.cc
+++ /dev/null
@@ -1,428 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cstdint>
-
-#include "ruy/check_macros.h"
-#include "ruy/kernel.h"
-#include "ruy/opt_set.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-#include <immintrin.h> // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-void Kernel8bitSse42(const KernelParams8bit<8, 8>&) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-void KernelFloatSse42(const KernelParamsFloat<8, 8>&) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-#else // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-static constexpr int kAvxFloatBlockSize = 8;
-static constexpr int kAvx8bitBlockSize = 8;
-static constexpr int kAvx8bitInnerSize = 4;
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Kernel8bitSse42(const KernelParams8bit<8, 8>& params) {
- profiler::ScopeLabel label("Kernel kSse42 8-bit (UNFINISHED)");
- std::int32_t accum_data[kAvx8bitBlockSize][kAvx8bitBlockSize];
-
- int bias_ptr_block_increment =
- params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvx8bitBlockSize : 0;
-
- const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
- void* dst_col_ptr = params.dst_base_ptr;
- const std::int32_t* bias_col_ptr = params.bias;
- if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
- bias_col_ptr += params.start_row;
- }
-
- for (int col = params.start_col; col <= params.last_col;
- col += kAvx8bitBlockSize) {
- const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
- void* dst_ptr = dst_col_ptr;
- const std::int32_t* bias_ptr = bias_col_ptr;
-
- for (int row = params.start_row; row <= params.last_row;
- row += kAvx8bitBlockSize) {
- const int residual_rows =
- std::min(params.dst_rows - row, kAvx8bitBlockSize);
- const int residual_cols =
- std::min(params.dst_cols - col, kAvx8bitBlockSize);
-
- // Initialize with bias.
- std::int32_t initial_accum_data[kAvx8bitBlockSize];
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- initial_accum_data[i] = 0;
- }
- for (int i = 0; i < residual_rows; ++i) {
- initial_accum_data[i] = bias_ptr[i];
- }
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] = initial_accum_data[i];
- }
- }
- bias_ptr += bias_ptr_block_increment;
-
- std::int8_t lhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
- std::int8_t rhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
- const std::int8_t* lhs_ptr = lhs_col_ptr;
- const std::int8_t* rhs_ptr = rhs_col_ptr;
- for (int d = 0; d < params.depth; d += kAvx8bitInnerSize) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- for (int x = 0; x < kAvx8bitInnerSize; ++x) {
- lhs_data[i][x] = lhs_ptr[i * kAvx8bitInnerSize + x];
- rhs_data[i][x] = rhs_ptr[i * kAvx8bitInnerSize + x];
- }
- }
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- for (int x = 0; x < kAvx8bitInnerSize; ++x) {
- accum_data[j][i] += lhs_data[i][x] * rhs_data[j][x];
- }
- }
- }
- lhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
- rhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
- }
-
- if ((params.flags & RUY_ASM_FLAG_HAS_LHS_SUMS) && params.rhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] -=
- params.rhs_zero_point * params.lhs_sums[row + i];
- }
- }
- }
- if ((params.flags & RUY_ASM_FLAG_HAS_RHS_SUMS) && params.lhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] -=
- params.lhs_zero_point * params.rhs_sums[col + j];
- }
- }
- }
- if (params.lhs_zero_point && params.rhs_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] += params.prod_zp_depth;
- }
- }
- }
-
- if (params.dst_type_id != DstTypeId<std::int32_t>::kValue) {
- std::int32_t m_vector[kAvx8bitBlockSize];
- std::int32_t e_vector[kAvx8bitBlockSize];
- // Does not make use of RUY_ASM_FLAG_NEEDS_LEFT_SHIFT.
- if (params.flags & RUY_ASM_FLAG_HAS_PERCHANNEL) {
- int i = 0;
- for (; i < residual_rows; ++i) {
- m_vector[i] = params.multiplier_fixedpoint[row + i];
- e_vector[i] = params.multiplier_exponent[row + i];
- }
- for (; i < kAvx8bitBlockSize; ++i) {
- m_vector[i] = m_vector[0];
- e_vector[i] = e_vector[0];
- }
- } else {
- // These arrays have size LhsCols, and are pre-filled.
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- m_vector[i] = params.multiplier_fixedpoint[i];
- e_vector[i] = params.multiplier_exponent[i];
- }
- }
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] = MultiplyByQuantizedMultiplier(
- accum_data[j][i], m_vector[i], e_vector[i]);
- }
- }
-
- if (params.dst_zero_point) {
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] += params.dst_zero_point;
- }
- }
- }
-
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- accum_data[j][i] =
- std::min<std::int32_t>(accum_data[j][i], params.clamp_max);
- accum_data[j][i] =
- std::max<std::int32_t>(accum_data[j][i], params.clamp_min);
- }
- }
- }
-
- const bool store_full_block = (residual_rows == kAvx8bitBlockSize) &&
- (residual_cols == kAvx8bitBlockSize);
-
- if (params.dst_type_id == DstTypeId<std::int8_t>::kValue) {
- std::int8_t* tmp_ptr =
- store_full_block
- ? static_cast<std::int8_t*>(dst_ptr)
- : const_cast<std::int8_t*>(
- reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf));
- const int block_col_offset =
- store_full_block ? params.dst_stride / sizeof(std::int8_t)
- : kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
-
- if (!store_full_block) {
- const std::int8_t* block_ptr =
- reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- static_cast<std::int8_t*>(
- dst_ptr)[j * params.dst_stride / sizeof(std::int8_t) + i] =
- block_ptr[i];
- }
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int8_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::uint8_t>::kValue) {
- std::uint8_t* tmp_ptr = store_full_block
- ? static_cast<std::uint8_t*>(dst_ptr)
- : const_cast<std::uint8_t*>(
- reinterpret_cast<const std::uint8_t*>(
- params.dst_tmp_buf));
- const int block_col_offset =
- store_full_block ? params.dst_stride : kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
-
- if (!store_full_block) {
- const std::uint8_t* block_ptr =
- reinterpret_cast<const std::uint8_t*>(params.dst_tmp_buf);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- static_cast<std::uint8_t*>(
- dst_ptr)[j * params.dst_stride / sizeof(std::uint8_t) + i] =
- block_ptr[i];
- }
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::uint8_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::int16_t>::kValue) {
- if (store_full_block) {
- std::int16_t* tmp_ptr = static_cast<std::int16_t*>(dst_ptr);
- const int block_col_offset = params.dst_stride / sizeof(std::int16_t);
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- } else {
- std::int16_t* tmp_ptr = const_cast<std::int16_t*>(
- reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf));
- const int block_col_offset = kAvx8bitBlockSize;
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- const std::int16_t* block_ptr =
- reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf);
- std::int16_t* dst_block_ptr = static_cast<std::int16_t*>(dst_ptr);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_block_ptr[i] = block_ptr[i];
- }
- dst_block_ptr += params.dst_stride / sizeof(std::int16_t);
- block_ptr += kAvx8bitBlockSize;
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int16_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else if (params.dst_type_id == DstTypeId<std::int32_t>::kValue) {
- if (store_full_block) {
- std::int32_t* tmp_ptr = static_cast<std::int32_t*>(dst_ptr);
- const int block_col_offset = params.dst_stride / sizeof(std::int32_t);
- for (int j = 0; j < kAvx8bitBlockSize; ++j) {
- for (int i = 0; i < kAvx8bitBlockSize; ++i) {
- tmp_ptr[i] = accum_data[j][i];
- }
- tmp_ptr += block_col_offset;
- }
- } else {
- std::int32_t* dst_block_ptr = static_cast<std::int32_t*>(dst_ptr);
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_block_ptr[i] = accum_data[j][i];
- }
- dst_block_ptr += params.dst_stride / sizeof(std::int32_t);
- }
- }
- dst_ptr = static_cast<void*>(static_cast<std::int32_t*>(dst_ptr) +
- kAvx8bitBlockSize);
- } else {
- RUY_DCHECK(false);
- }
-
- lhs_col_ptr += kAvx8bitBlockSize * params.lhs_stride;
- } // End row-block loop.
-
- dst_col_ptr = static_cast<void*>(static_cast<char*>(dst_col_ptr) +
- kAvx8bitBlockSize * params.dst_stride);
- rhs_col_ptr += kAvx8bitBlockSize * params.rhs_stride;
- } // End col-block loop.
-} // NOLINT(readability/fn_size)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void KernelFloatSse42(const KernelParamsFloat<8, 8>& params) {
- profiler::ScopeLabel label("Kernel kSse42 float (UNFINISHED)");
-
- float lhs_data[kAvxFloatBlockSize];
- float rhs_data[kAvxFloatBlockSize];
- float accum_data[kAvxFloatBlockSize][kAvxFloatBlockSize];
- int bias_ptr_block_increment =
- params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvxFloatBlockSize : 0;
-
- const float* rhs_col_ptr = params.rhs_base_ptr;
- float* dst_col_ptr = params.dst_base_ptr;
- const float* bias_col_ptr = params.bias;
- if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
- bias_col_ptr += params.start_row;
- }
-
- for (int col = params.start_col; col <= params.last_col;
- col += kAvxFloatBlockSize) {
- const float* lhs_col_ptr = params.lhs_base_ptr;
- float* dst_ptr = dst_col_ptr;
- const float* bias_ptr = bias_col_ptr;
-
- for (int row = params.start_row; row <= params.last_row;
- row += kAvxFloatBlockSize) {
- const int residual_rows =
- std::min(params.dst_rows - row, kAvxFloatBlockSize);
- const int residual_cols =
- std::min(params.dst_cols - col, kAvxFloatBlockSize);
-
- // Initialize with bias.
- float initial_accum_data[kAvxFloatBlockSize];
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- initial_accum_data[i] = 0.0f;
- }
- for (int i = 0; i < residual_rows; ++i) {
- initial_accum_data[i] = bias_ptr[i];
- }
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] = initial_accum_data[i];
- }
- }
- bias_ptr += bias_ptr_block_increment;
-
- const float* lhs_ptr = lhs_col_ptr;
- const float* rhs_ptr = rhs_col_ptr;
- for (int d = 0; d < params.depth; ++d) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- lhs_data[i] = lhs_ptr[i];
- rhs_data[i] = rhs_ptr[i];
- }
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] += lhs_data[i] * rhs_data[j];
- }
- }
- lhs_ptr += kAvxFloatBlockSize;
- rhs_ptr += kAvxFloatBlockSize;
- }
-
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- accum_data[j][i] =
- std::min<float>(accum_data[j][i], params.clamp_max);
- accum_data[j][i] =
- std::max<float>(accum_data[j][i], params.clamp_min);
- }
- }
-
- const bool store_full_block = (residual_rows == kAvxFloatBlockSize) &&
- (residual_cols == kAvxFloatBlockSize);
-
- {
- float* block_ptr =
- store_full_block ? dst_ptr : const_cast<float*>(params.dst_tmp_buf);
- const int block_col_offset = store_full_block
- ? params.dst_stride / sizeof(float)
- : kAvxFloatBlockSize;
- for (int j = 0; j < kAvxFloatBlockSize; ++j) {
- for (int i = 0; i < kAvxFloatBlockSize; ++i) {
- block_ptr[i] = accum_data[j][i];
- }
- block_ptr += block_col_offset;
- }
- }
- if (!store_full_block) {
- const float* block_ptr = params.dst_tmp_buf;
- for (int j = 0; j < residual_cols; ++j) {
- for (int i = 0; i < residual_rows; ++i) {
- dst_ptr[j * params.dst_stride / sizeof(float) + i] = block_ptr[i];
- }
- block_ptr += kAvxFloatBlockSize;
- }
- }
-
- lhs_col_ptr += kAvxFloatBlockSize * params.lhs_stride / sizeof(float);
- dst_ptr += kAvxFloatBlockSize;
- } // End row-block loop.
-
- dst_col_ptr += kAvxFloatBlockSize * params.dst_stride / sizeof(float);
- rhs_col_ptr += kAvxFloatBlockSize * params.rhs_stride / sizeof(float);
- } // End col-block loop.
-}
-
-#endif // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-} // namespace ruy
diff --git a/ruy/kernel_x86.h b/ruy/kernel_x86.h
index cada2a6..d20c1a4 100644
--- a/ruy/kernel_x86.h
+++ b/ruy/kernel_x86.h
@@ -32,53 +32,8 @@
#if RUY_PLATFORM_X86
-RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kSse42)
-RUY_INHERIT_KERNEL(Path::kSse42, Path::kAvx2)
+RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kAvx2)
RUY_INHERIT_KERNEL(Path::kAvx2, Path::kAvx512)
-RUY_INHERIT_KERNEL(Path::kAvx512, Path::kAvxVnni)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void Kernel8bitSse42(const KernelParams8bit<8, 8>& params);
-
-template <typename DstScalar>
-struct Kernel<Path::kSse42, std::int8_t, std::int8_t, DstScalar,
- MulParams<std::int32_t, DstScalar>> {
- static constexpr Path kPath = Path::kSse42;
- Tuning tuning = Tuning::kAuto;
- using LhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
- using RhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
- explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
- void Run(const PMat<std::int8_t>& lhs, const PMat<std::int8_t>& rhs,
- const MulParams<std::int32_t, DstScalar>& mul_params, int start_row,
- int start_col, int end_row, int end_col, Mat<DstScalar>* dst) const {
- KernelParams8bit<LhsLayout::kCols, RhsLayout::kCols> params;
- MakeKernelParams8bit(lhs, rhs, mul_params, start_row, start_col, end_row,
- end_col, dst, ¶ms);
- Kernel8bitSse42(params);
- }
-};
-
-void KernelFloatSse42(const KernelParamsFloat<8, 8>& params);
-
-template <>
-struct Kernel<Path::kSse42, float, float, float, MulParams<float, float>> {
- Tuning tuning = Tuning::kAuto;
- static constexpr Path kPath = Path::kSse42;
- using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
- using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
- explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
- void Run(const PMat<float>& lhs, const PMat<float>& rhs,
- const MulParams<float, float>& mul_params, int start_row,
- int start_col, int end_row, int end_col, Mat<float>* dst) const {
- KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
- MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
- end_col, dst, ¶ms);
- KernelFloatSse42(params);
- }
-};
void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params);
void Kernel8bitAvx512SingleCol(const KernelParams8bit<16, 16>& params);
@@ -178,49 +133,6 @@
}
};
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>& params);
-
-template <typename DstScalar>
-struct Kernel<Path::kAvxVnni, std::int8_t, std::int8_t, DstScalar,
- MulParams<std::int32_t, DstScalar>> {
- static constexpr Path kPath = Path::kAvxVnni;
- Tuning tuning = Tuning::kAuto;
- using LhsLayout = FixedKernelLayout<Order::kColMajor, 4, 16>;
- using RhsLayout = FixedKernelLayout<Order::kColMajor, 4, 16>;
- explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
- void Run(const PMat<std::int8_t>& lhs, const PMat<std::int8_t>& rhs,
- const MulParams<std::int32_t, DstScalar>& mul_params, int start_row,
- int start_col, int end_row, int end_col, Mat<DstScalar>* dst) const {
- KernelParams8bit<LhsLayout::kCols, RhsLayout::kCols> params;
- MakeKernelParams8bit(lhs, rhs, mul_params, start_row, start_col, end_row,
- end_col, dst, ¶ms);
- Kernel8bitAvxVnni(params);
- }
-};
-
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>& params);
-
-template <>
-struct Kernel<Path::kAvxVnni, float, float, float, MulParams<float, float>> {
- static constexpr Path kPath = Path::kAvxVnni;
- Tuning tuning = Tuning::kAuto;
- using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
- using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
- explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
- void Run(const PMat<float>& lhs, const PMat<float>& rhs,
- const MulParams<float, float>& mul_params, int start_row,
- int start_col, int end_row, int end_col, Mat<float>* dst) const {
- KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
- MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
- end_col, dst, ¶ms);
- KernelFloatAvxVnni(params);
- }
-};
-
#endif // RUY_PLATFORM_X86
} // namespace ruy
diff --git a/ruy/pack_avxvnni.cc b/ruy/pack_avxvnni.cc
deleted file mode 100644
index a467bbc..0000000
--- a/ruy/pack_avxvnni.cc
+++ /dev/null
@@ -1,474 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <cstring>
-
-#include "ruy/check_macros.h"
-#include "ruy/matrix.h"
-#include "ruy/opt_set.h"
-#include "ruy/pack.h"
-#include "ruy/path.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_AVX_VNNI && RUY_OPT(INTRINSICS)
-#include <immintrin.h> // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-void Pack8bitAvxVnni(const std::int8_t*, std::int8_t, const std::int8_t*, int,
- int, int, std::int8_t*, std::int32_t*) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-void PackFloatAvxVnni(const float*, const float*, int, int, int, float*) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-#else // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-// The first int8_t template parameter is arbitrary: this routine is common to
-// all 8-bit source matrix types.
-using PackImpl8bitAvxVnni =
- PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kColMajor, 4, 16>,
- std::int8_t, std::int8_t, std::int32_t>;
-
-namespace {
-
-inline void ZeroHalf8bitAvxVnni(int src_rows, std::int8_t packed_zero_point,
- std::int8_t* packed_ptr) {
- const int non_trailing_blocks = (src_rows & ~31) >> 2;
- // This routine fills half blocks, and typically fills the second halves. Thus
- // packed_ptr is already offset by 8*4.
- for (int k = 0; k < non_trailing_blocks; ++k) {
- for (int j = 0; j < (8 * 4); ++j) {
- packed_ptr[16 * 4 * k + j] = packed_zero_point;
- }
- }
-}
-
-inline void HalfPack8bitAvxVnni(const std::int8_t* src_ptr,
- std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr,
- std::int8_t* trailing_buf) {
- std::int8_t in_data[8][8][4];
-
- const std::int8_t* src_ptr0 = src_ptr;
- const std::int8_t* src_ptr1 = src_ptr0 + src_stride;
- const std::int8_t* src_ptr2 = src_ptr1 + src_stride;
- const std::int8_t* src_ptr3 = src_ptr2 + src_stride;
- const std::int8_t* src_ptr4 = src_ptr3 + src_stride;
- const std::int8_t* src_ptr5 = src_ptr4 + src_stride;
- const std::int8_t* src_ptr6 = src_ptr5 + src_stride;
- const std::int8_t* src_ptr7 = src_ptr6 + src_stride;
- std::int64_t src_inc0 = 8 * 4;
- std::int64_t src_inc1 = 8 * 4;
- std::int64_t src_inc2 = 8 * 4;
- std::int64_t src_inc3 = 8 * 4;
- std::int64_t src_inc4 = 8 * 4;
- std::int64_t src_inc5 = 8 * 4;
- std::int64_t src_inc6 = 8 * 4;
- std::int64_t src_inc7 = 8 * 4;
- if (remaining_src_cols < 8) {
- if (remaining_src_cols <= 0) {
- src_ptr0 = zerobuf;
- src_inc0 = 0;
- }
- if (remaining_src_cols <= 1) {
- src_ptr1 = zerobuf;
- src_inc1 = 0;
- }
- if (remaining_src_cols <= 2) {
- src_ptr2 = zerobuf;
- src_inc2 = 0;
- }
- if (remaining_src_cols <= 3) {
- src_ptr3 = zerobuf;
- src_inc3 = 0;
- }
- if (remaining_src_cols <= 4) {
- src_ptr4 = zerobuf;
- src_inc4 = 0;
- }
- if (remaining_src_cols <= 5) {
- src_ptr5 = zerobuf;
- src_inc5 = 0;
- }
- if (remaining_src_cols <= 6) {
- src_ptr6 = zerobuf;
- src_inc6 = 0;
- }
- src_ptr7 = zerobuf;
- src_inc7 = 0;
- }
-
- const std::int8_t zero_point = zerobuf[0];
-
- if (sums_ptr) {
- for (int i = 0; i < 8; ++i) {
- sums_ptr[i] = 0;
- }
- }
-
- // The overall packing effectively pads the source rows to
- // (src_rows + 63) & ~63. The iteration over k may skip when m=1, and then we
- // only pack for (src_rows + 31) & ~31. When there is an incomplete
- // destination block, this is stored into trailing_buf instead of packed_ptr.
- for (int k = 0; k < src_rows; k += 16 * 4) {
- for (int m = 0; m < 2; ++m) {
- // Available source rows.
- // If this is less than 0 (for m=1), we skip, having filled trailing
- // buffer for m=0. Also, if source rows is zero on m=1, then we filled
- // exactly to the end of the column in the packed buffer.
- const int packed_rows = src_rows - k - 8 * m * 4;
- // Effectively,
- // packed_rows = std::max(0, std::min(8, src_rows - k - 8 * m));
- // but treat each case separately.
- if (packed_rows >= (8 * 4)) {
- for (int i = 0; i < 8; ++i) {
- for (int s = 0; s < 4; ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- }
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- packed_ptr[(16 * i + j) * 4 + s] =
- static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
- }
- if (sums_ptr) {
- for (int s = 0; s < 4; ++s) {
- sums_ptr[j] += in_data[j][i][s] ^ input_xor;
- }
- }
- }
- }
- } else if (packed_rows > 0) {
- RUY_DCHECK_LT(packed_rows >> 2, 8);
- int i = 0;
- for (; i < (packed_rows >> 2); ++i) {
- for (int s = 0; s < 4; ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- }
- if (i < ((packed_rows + 3) >> 2)) {
- int s = 0;
- for (; s < (packed_rows & 3); ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- RUY_DCHECK_LE(s, 4);
- for (; s < 4; ++s) {
- for (int j = 0; j < 8; ++j) {
- in_data[j][i][s] = zero_point;
- }
- }
- ++i;
- }
- // We do not care what goes into the trailing buffer, but we want
- // in_data[...] ^ input_xor == 0 for irrelevant values in the summation.
- //
- // It might prove better in optimized code to pad uniformly with
- // zero_point, and compensate by initializing the summations with the
- // compensating offset, effectively
- // ((input_xor - zero_point) ^ input_xor) *
- // 4 * (8 - ((packed_rows + 3) >> 2)).
- for (; i < 8; ++i) {
- for (int s = 0; s < 4; ++s) {
- for (int j = 0; j < 8; ++j) {
- in_data[j][i][s] = input_xor;
- }
- }
- }
- // We loop through [0, 8) rather than [0, (packed_rows + 3) >> 2), since
- // that emulates what we might do in fully-optimized code.
- if (sums_ptr) {
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- trailing_buf[(16 * i + j) * 4 + s] =
- static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
- sums_ptr[j] += in_data[j][i][s] ^ input_xor;
- }
- }
- }
- } else {
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- trailing_buf[(16 * i + j) * 4 + s] =
- static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
- }
- }
- }
- }
- }
-
- packed_ptr += 16 * 8 * 4;
- src_ptr0 += src_inc0;
- src_ptr1 += src_inc1;
- src_ptr2 += src_inc2;
- src_ptr3 += src_inc3;
- src_ptr4 += src_inc4;
- src_ptr5 += src_inc5;
- src_ptr6 += src_inc6;
- src_ptr7 += src_inc7;
- }
- }
-}
-
-inline void HalfPackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
- int src_stride, int remaining_src_cols,
- int src_rows, float* packed_ptr,
- float* trailing_buf) {
- float in_data[8][8];
-
- const float* src_ptr0 = src_ptr;
- const float* src_ptr1 = src_ptr0 + src_stride;
- const float* src_ptr2 = src_ptr1 + src_stride;
- const float* src_ptr3 = src_ptr2 + src_stride;
- const float* src_ptr4 = src_ptr3 + src_stride;
- const float* src_ptr5 = src_ptr4 + src_stride;
- const float* src_ptr6 = src_ptr5 + src_stride;
- const float* src_ptr7 = src_ptr6 + src_stride;
- std::int64_t src_inc0 = 8;
- std::int64_t src_inc1 = 8;
- std::int64_t src_inc2 = 8;
- std::int64_t src_inc3 = 8;
- std::int64_t src_inc4 = 8;
- std::int64_t src_inc5 = 8;
- std::int64_t src_inc6 = 8;
- std::int64_t src_inc7 = 8;
- if (remaining_src_cols < 8) {
- if (remaining_src_cols <= 0) {
- src_ptr0 = zerobuf;
- src_inc0 = 0;
- }
- if (remaining_src_cols <= 1) {
- src_ptr1 = zerobuf;
- src_inc1 = 0;
- }
- if (remaining_src_cols <= 2) {
- src_ptr2 = zerobuf;
- src_inc2 = 0;
- }
- if (remaining_src_cols <= 3) {
- src_ptr3 = zerobuf;
- src_inc3 = 0;
- }
- if (remaining_src_cols <= 4) {
- src_ptr4 = zerobuf;
- src_inc4 = 0;
- }
- if (remaining_src_cols <= 5) {
- src_ptr5 = zerobuf;
- src_inc5 = 0;
- }
- if (remaining_src_cols <= 6) {
- src_ptr6 = zerobuf;
- src_inc6 = 0;
- }
- src_ptr7 = zerobuf;
- src_inc7 = 0;
- }
-
- for (int k = 0; k < src_rows; k += 16) {
- for (int m = 0; m < 2; ++m) {
- const int packed_rows = src_rows - k - 8 * m;
- // Effectively,
- // packed_rows = std::max(0, std::min(8, src_rows - k - 8 * m));
- // but treat each case separately.
- if (packed_rows > 7) {
- for (int i = 0; i < 8; ++i) {
- in_data[0][i] = src_ptr0[i];
- in_data[1][i] = src_ptr1[i];
- in_data[2][i] = src_ptr2[i];
- in_data[3][i] = src_ptr3[i];
- in_data[4][i] = src_ptr4[i];
- in_data[5][i] = src_ptr5[i];
- in_data[6][i] = src_ptr6[i];
- in_data[7][i] = src_ptr7[i];
- }
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- packed_ptr[16 * i + j] = in_data[j][i];
- }
- }
- } else if (packed_rows > 0) {
- for (int i = 0; i < packed_rows; ++i) {
- in_data[0][i] = src_ptr0[i];
- in_data[1][i] = src_ptr1[i];
- in_data[2][i] = src_ptr2[i];
- in_data[3][i] = src_ptr3[i];
- in_data[4][i] = src_ptr4[i];
- in_data[5][i] = src_ptr5[i];
- in_data[6][i] = src_ptr6[i];
- in_data[7][i] = src_ptr7[i];
- }
- for (int i = packed_rows; i < 8; ++i) {
- in_data[0][i] = 0.0f;
- in_data[1][i] = 0.0f;
- in_data[2][i] = 0.0f;
- in_data[3][i] = 0.0f;
- in_data[4][i] = 0.0f;
- in_data[5][i] = 0.0f;
- in_data[6][i] = 0.0f;
- in_data[7][i] = 0.0f;
- }
- // We loop through [0, 7) rather than [0, packed_rows), since that
- // emulates what we might do in fully-optimized code.
- for (int i = 0; i < 7; ++i) {
- for (int j = 0; j < 8; ++j) {
- trailing_buf[16 * i + j] = in_data[j][i];
- }
- }
- }
-
- packed_ptr += 16 * 8;
- src_ptr0 += src_inc0;
- src_ptr1 += src_inc1;
- src_ptr2 += src_inc2;
- src_ptr3 += src_inc3;
- src_ptr4 += src_inc4;
- src_ptr5 += src_inc5;
- src_ptr6 += src_inc6;
- src_ptr7 += src_inc7;
- }
- }
-}
-
-inline void ZeroHalfFloatAvxVnni(int src_rows, float* packed_ptr) {
- const int non_trailing_rows = src_rows & ~7;
- for (int k = 0; k < non_trailing_rows; ++k) {
- for (int j = 0; j < 8; ++j) {
- packed_ptr[j] = 0.0f;
- }
- packed_ptr += 16;
- }
-}
-
-} // namespace.
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Pack8bitAvxVnni(const std::int8_t* src_ptr, std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
- profiler::ScopeLabel label("Pack kAvxVnni 8bit (UNFINISHED)");
-
- // Each packed block is 4*16, and there are normally 8. The trailing block is
- // only slightly shorter.
- std::int8_t trailing_buf[8 * 16 * 4];
- memset(trailing_buf, 0, 8 * 16 * 4 * sizeof(std::int8_t));
-
- std::int32_t* second_sums_ptr = sums_ptr ? sums_ptr + 8 : nullptr;
- if (remaining_src_cols > 8) {
- HalfPack8bitAvxVnni(src_ptr, input_xor, zerobuf, src_stride,
- remaining_src_cols, src_rows, packed_ptr, sums_ptr,
- trailing_buf);
- HalfPack8bitAvxVnni(src_ptr + src_stride * 8, input_xor, zerobuf,
- src_stride, remaining_src_cols - 8, src_rows,
- packed_ptr + 8 * 4, second_sums_ptr,
- trailing_buf + 8 * 4);
- } else {
- HalfPack8bitAvxVnni(src_ptr, input_xor, zerobuf, src_stride,
- remaining_src_cols, src_rows, packed_ptr, sums_ptr,
- trailing_buf);
- ZeroHalf8bitAvxVnni(src_rows, zerobuf[0] ^ input_xor, packed_ptr + 8 * 4);
- // The kernel may not need the second half-blocks sums to be set.
- if (second_sums_ptr) {
- for (int i = 0; i < 8; ++i) {
- second_sums_ptr[i] = (zerobuf[0] ^ input_xor) * ((src_rows + 3) & ~3);
- }
- }
- }
- const bool trailing_data = (src_rows & 31) > 0;
- // If the number of source rows is not a multiple of 32, there will be data in
- // the trailing buffer,
- if (trailing_data > 0) {
- const int non_trailing_rows = src_rows & ~31;
- // Destination "rows" are padded to next highest multiple of 4.
- const int dst_rows = (src_rows + 3) & ~3;
- const int trailing_rows = dst_rows - non_trailing_rows;
- memcpy(packed_ptr + 16 * non_trailing_rows, trailing_buf,
- 16 * trailing_rows * sizeof(std::int8_t));
- }
-}
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void PackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
- int src_stride, int remaining_src_cols, int src_rows,
- float* packed_ptr) {
- profiler::ScopeLabel label("Pack kAvxVnni float (UNFINISHED)");
- float trailing_buf[7 * 16];
- if (remaining_src_cols > 8) {
- HalfPackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
- src_rows, packed_ptr, trailing_buf);
- HalfPackFloatAvxVnni(src_ptr + src_stride * 8, zerobuf, src_stride,
- remaining_src_cols - 8, src_rows, packed_ptr + 8,
- trailing_buf + 8);
- } else {
- memset(trailing_buf, 0, sizeof(trailing_buf));
- HalfPackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
- src_rows, packed_ptr, trailing_buf);
- ZeroHalfFloatAvxVnni(src_rows, packed_ptr + 8);
- }
- const int trailing_rows = src_rows & 7;
- if (trailing_rows > 0) {
- const int non_trailing_rows = src_rows & ~7;
- memcpy(packed_ptr + 16 * non_trailing_rows, trailing_buf,
- 16 * trailing_rows * sizeof(float));
- }
-}
-
-#endif // RUY_PLATFORM_AVX_VNNI && RUY_OPT(INTRINSICS)
-
-} // namespace ruy
diff --git a/ruy/pack_sse42.cc b/ruy/pack_sse42.cc
deleted file mode 100644
index 89d2ecc..0000000
--- a/ruy/pack_sse42.cc
+++ /dev/null
@@ -1,468 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <cstring>
-
-#include "ruy/check_macros.h"
-#include "ruy/matrix.h"
-#include "ruy/opt_set.h"
-#include "ruy/pack.h"
-#include "ruy/path.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_SSE42 && RUY_OPT(INTRINSICS)
-#include <immintrin.h> // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-void Pack8bitSse42(const std::int8_t*, std::int8_t, const std::int8_t*, int,
- int, int, std::int8_t*, std::int32_t*) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-void PackFloatSse42(const float*, const float*, int, int, int, float*) {
- // CPU-ID-based checks should disable the path that would reach this point.
- RUY_DCHECK(false);
-}
-
-#else // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-// The first int8_t template parameter is arbitrary: this routine is common to
-// all 8-bit source matrix types.
-using PackImpl8bitSse42 =
- PackImpl<Path::kSse42, FixedKernelLayout<Order::kColMajor, 4, 8>,
- std::int8_t, std::int8_t, std::int32_t>;
-
-using PackImplFloatSse42 =
- PackImpl<Path::kSse42, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
- float, float>;
-
-namespace {
-
-inline void Pack8bitSse42Packer(const std::int8_t* src_ptr,
- std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr,
- std::int8_t* trailing_buf) {
- using Layout = PackImpl8bitSse42::Layout;
- RUY_DCHECK_EQ(Layout::kCols, 8);
- RUY_DCHECK_EQ(Layout::kRows, 4);
- // Each Layout::Rows is 4 contiguous input, contiguous packed elements.
- // We process 8 of these chunks at a time, padding short input chunks.
- constexpr int kNumRowChunks = 8;
- constexpr int kNumChunkedSrcRows = kNumRowChunks * Layout::kRows;
-
- std::int8_t in_data[Layout::kCols][kNumRowChunks][Layout::kRows];
-
- const std::int8_t* src_ptr0 = src_ptr;
- const std::int8_t* src_ptr1 = src_ptr0 + src_stride;
- const std::int8_t* src_ptr2 = src_ptr1 + src_stride;
- const std::int8_t* src_ptr3 = src_ptr2 + src_stride;
- const std::int8_t* src_ptr4 = src_ptr3 + src_stride;
- const std::int8_t* src_ptr5 = src_ptr4 + src_stride;
- const std::int8_t* src_ptr6 = src_ptr5 + src_stride;
- const std::int8_t* src_ptr7 = src_ptr6 + src_stride;
- std::int64_t src_inc0 = kNumChunkedSrcRows;
- std::int64_t src_inc1 = kNumChunkedSrcRows;
- std::int64_t src_inc2 = kNumChunkedSrcRows;
- std::int64_t src_inc3 = kNumChunkedSrcRows;
- std::int64_t src_inc4 = kNumChunkedSrcRows;
- std::int64_t src_inc5 = kNumChunkedSrcRows;
- std::int64_t src_inc6 = kNumChunkedSrcRows;
- std::int64_t src_inc7 = kNumChunkedSrcRows;
- // Handle cases where source does not have Layout::kCols (8) columns.
- if (remaining_src_cols < 8) {
- if (remaining_src_cols <= 0) {
- src_ptr0 = zerobuf;
- src_inc0 = 0;
- }
- if (remaining_src_cols <= 1) {
- src_ptr1 = zerobuf;
- src_inc1 = 0;
- }
- if (remaining_src_cols <= 2) {
- src_ptr2 = zerobuf;
- src_inc2 = 0;
- }
- if (remaining_src_cols <= 3) {
- src_ptr3 = zerobuf;
- src_inc3 = 0;
- }
- if (remaining_src_cols <= 4) {
- src_ptr4 = zerobuf;
- src_inc4 = 0;
- }
- if (remaining_src_cols <= 5) {
- src_ptr5 = zerobuf;
- src_inc5 = 0;
- }
- if (remaining_src_cols <= 6) {
- src_ptr6 = zerobuf;
- src_inc6 = 0;
- }
- src_ptr7 = zerobuf;
- src_inc7 = 0;
- }
-
- const std::int8_t zero_point = zerobuf[0];
-
- if (sums_ptr) {
- // i: Layout::kCols.
- for (int i = 0; i < 8; ++i) {
- sums_ptr[i] = 0;
- }
- }
-
- // The overall packing effectively pads the source rows to
- // (src_rows + 63) & ~63. The iteration over k may skip when m=1, and then we
- // only pack for (src_rows + 31) & ~31. When there is an incomplete
- // destination block, this is stored into trailing_buf instead of packed_ptr.
- for (int k = 0; k < src_rows; k += kNumChunkedSrcRows) {
- // Available source rows.
- // If this is less than 0 (for m=1), we skip, having filled trailing
- // buffer for m=0. Also, if source rows is zero on m=1, then we filled
- // exactly to the end of the column in the packed buffer.
- const int available_src_rows = src_rows - k;
- // Effectively,
- // available rows = std::max(0, std::min(8, src_rows - k));
- // treat each case separately.
- if (available_src_rows >= kNumChunkedSrcRows) {
- // i: chunks, s: Layout::Rows.
- for (int i = 0; i < 8; ++i) {
- for (int s = 0; s < 4; ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- }
- // i: chunks, j: Layout::kCols, s: Layout::Rows.
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- // 8 * 4 * i is offset for each block, that is
- // (Layout::kCols * Layout::kRows * i)
- packed_ptr[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
- }
- if (sums_ptr) {
- for (int s = 0; s < 4; ++s) {
- sums_ptr[j] += in_data[j][i][s] ^ input_xor;
- }
- }
- }
- }
- } else if (available_src_rows > 0) {
- RUY_DCHECK_LT(available_src_rows, kNumChunkedSrcRows);
- int i = 0;
- // Consume chunks of 4 rows that are complete.
- for (; i < (available_src_rows >> 2); ++i) {
- for (int s = 0; s < 4; ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- }
- // Consume any incomplete chunk.
- if (i < ((available_src_rows + 3) >> 2)) {
- int s = 0;
- for (; s < (available_src_rows & 3); ++s) {
- in_data[0][i][s] = src_ptr0[i * 4 + s];
- in_data[1][i][s] = src_ptr1[i * 4 + s];
- in_data[2][i][s] = src_ptr2[i * 4 + s];
- in_data[3][i][s] = src_ptr3[i * 4 + s];
- in_data[4][i][s] = src_ptr4[i * 4 + s];
- in_data[5][i][s] = src_ptr5[i * 4 + s];
- in_data[6][i][s] = src_ptr6[i * 4 + s];
- in_data[7][i][s] = src_ptr7[i * 4 + s];
- }
- RUY_DCHECK_LE(s, 4);
- for (; s < 4; ++s) {
- // j: Layout::kCols.
- for (int j = 0; j < 8; ++j) {
- in_data[j][i][s] = zero_point;
- }
- }
- ++i;
- }
- // We do not care what goes into the trailing buffer, but we want
- // in_data[...] ^ input_xor == 0 for irrelevant values in the summation.
- //
- // It might prove better in optimized code to pad uniformly with
- // zero_point, and compensate by initializing the summations with the
- // compensating offset, effectively
- // ((input_xor - zero_point) ^ input_xor) *
- // 4 * (8 - ((available_src_rows + 3) >> 2)).
- for (; i < 8; ++i) {
- for (int s = 0; s < 4; ++s) {
- for (int j = 0; j < 8; ++j) {
- in_data[j][i][s] = input_xor;
- }
- }
- }
- // We loop through [0, 8) rather than
- // [0, (available_src_rows + 3) >> 2), since that emulates what we might
- // do in fully-optimized code.
- //
- // i: chunks, j: Layout::kCols, s: Layout::Rows.
- if (sums_ptr) {
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- trailing_buf[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
- sums_ptr[j] = sums_ptr[j] + (in_data[j][i][s] ^ input_xor);
- }
- }
- }
- } else {
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- for (int s = 0; s < 4; ++s) {
- trailing_buf[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
- }
- }
- }
- }
- }
-
- packed_ptr += 8 * kNumChunkedSrcRows;
- src_ptr0 += src_inc0;
- src_ptr1 += src_inc1;
- src_ptr2 += src_inc2;
- src_ptr3 += src_inc3;
- src_ptr4 += src_inc4;
- src_ptr5 += src_inc5;
- src_ptr6 += src_inc6;
- src_ptr7 += src_inc7;
- }
-}
-
-inline void PackFloatSse42Packer(const float* src_ptr, const float* zerobuf,
- int src_stride, int remaining_src_cols,
- int src_rows, float* packed_ptr,
- float* trailing_buf) {
- using Layout = PackImplFloatSse42::Layout;
- RUY_DCHECK_EQ(Layout::kCols, 8);
- RUY_DCHECK_EQ(Layout::kRows, 1);
-
- // This packing amounts to tranposition of 8x8 blocks.
- static constexpr int kPackCols = 8; // Source cols packed together.
- static constexpr int kPackRows = 8; // Short input is padded.
-
- float in_data[kPackCols][kPackRows];
-
- const float* src_ptr0 = src_ptr;
- const float* src_ptr1 = src_ptr0 + src_stride;
- const float* src_ptr2 = src_ptr1 + src_stride;
- const float* src_ptr3 = src_ptr2 + src_stride;
- const float* src_ptr4 = src_ptr3 + src_stride;
- const float* src_ptr5 = src_ptr4 + src_stride;
- const float* src_ptr6 = src_ptr5 + src_stride;
- const float* src_ptr7 = src_ptr6 + src_stride;
- std::int64_t src_inc0 = 8;
- std::int64_t src_inc1 = 8;
- std::int64_t src_inc2 = 8;
- std::int64_t src_inc3 = 8;
- std::int64_t src_inc4 = 8;
- std::int64_t src_inc5 = 8;
- std::int64_t src_inc6 = 8;
- std::int64_t src_inc7 = 8;
- // Handle cases where source does not have kPackDim (8) columns.
- if (remaining_src_cols < kPackCols) {
- if (remaining_src_cols <= 0) {
- src_ptr0 = zerobuf;
- src_inc0 = 0;
- }
- if (remaining_src_cols <= 1) {
- src_ptr1 = zerobuf;
- src_inc1 = 0;
- }
- if (remaining_src_cols <= 2) {
- src_ptr2 = zerobuf;
- src_inc2 = 0;
- }
- if (remaining_src_cols <= 3) {
- src_ptr3 = zerobuf;
- src_inc3 = 0;
- }
- if (remaining_src_cols <= 4) {
- src_ptr4 = zerobuf;
- src_inc4 = 0;
- }
- if (remaining_src_cols <= 5) {
- src_ptr5 = zerobuf;
- src_inc5 = 0;
- }
- if (remaining_src_cols <= 6) {
- src_ptr6 = zerobuf;
- src_inc6 = 0;
- }
- src_ptr7 = zerobuf;
- src_inc7 = 0;
- }
-
- for (int k = 0; k < src_rows; k += kPackRows) {
- const int available_src_rows = src_rows - k;
- // Effectively,
- // available_src_rows = std::max(0, std::min(kPackDim, src_rows - k));
- // but treat each case separately.
- if (available_src_rows >= kPackRows) {
- for (int i = 0; i < 8; ++i) {
- in_data[0][i] = src_ptr0[i];
- in_data[1][i] = src_ptr1[i];
- in_data[2][i] = src_ptr2[i];
- in_data[3][i] = src_ptr3[i];
- in_data[4][i] = src_ptr4[i];
- in_data[5][i] = src_ptr5[i];
- in_data[6][i] = src_ptr6[i];
- in_data[7][i] = src_ptr7[i];
- }
- for (int i = 0; i < 8; ++i) {
- for (int j = 0; j < 8; ++j) {
- packed_ptr[8 * i + j] = in_data[j][i];
- }
- }
- } else if (available_src_rows > 0) {
- for (int i = 0; i < available_src_rows; ++i) {
- in_data[0][i] = src_ptr0[i];
- in_data[1][i] = src_ptr1[i];
- in_data[2][i] = src_ptr2[i];
- in_data[3][i] = src_ptr3[i];
- in_data[4][i] = src_ptr4[i];
- in_data[5][i] = src_ptr5[i];
- in_data[6][i] = src_ptr6[i];
- in_data[7][i] = src_ptr7[i];
- }
- for (int i = available_src_rows; i < kPackRows; ++i) {
- in_data[0][i] = 0.0f;
- in_data[1][i] = 0.0f;
- in_data[2][i] = 0.0f;
- in_data[3][i] = 0.0f;
- in_data[4][i] = 0.0f;
- in_data[5][i] = 0.0f;
- in_data[6][i] = 0.0f;
- in_data[7][i] = 0.0f;
- }
- // We loop through [0, 7) rather than [0, packed_rows), since that
- // emulates what we might do in fully-optimized code.
- // i: (kPackRows - 1), j: kPackCols.
- for (int i = 0; i < 7; ++i) {
- for (int j = 0; j < 8; ++j) {
- trailing_buf[kPackRows * i + j] = in_data[j][i];
- }
- }
- }
-
- packed_ptr += kPackRows * kPackCols;
- src_ptr0 += src_inc0;
- src_ptr1 += src_inc1;
- src_ptr2 += src_inc2;
- src_ptr3 += src_inc3;
- src_ptr4 += src_inc4;
- src_ptr5 += src_inc5;
- src_ptr6 += src_inc6;
- src_ptr7 += src_inc7;
- }
-}
-
-} // namespace.
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Pack8bitSse42(const std::int8_t* src_ptr, std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
- profiler::ScopeLabel label("Pack kSse42 8bit (UNFINISHED)");
-
- using Layout = PackImpl8bitSse42::Layout;
- RUY_DCHECK_EQ(Layout::kCols, 8);
- RUY_DCHECK_EQ(Layout::kRows, 4);
-
- // Each Layout::Rows is 4 contiguous input, contiguous packed elements.
- // We process 8 of these chunks at a time, padding short input chunks.
- static constexpr int kNumRowChunks = 8; // Short input is padded.
-
- // Each packed block is 4*8, and there are normally 8. The trailing block is
- // only slightly shorter.
- constexpr int kTrailingBufSize =
- kNumRowChunks * Layout::kCols * Layout::kRows;
- std::int8_t trailing_buf[kTrailingBufSize];
- memset(trailing_buf, 0, kTrailingBufSize * sizeof(std::int8_t));
-
- Pack8bitSse42Packer(src_ptr, input_xor, zerobuf, src_stride,
- remaining_src_cols, src_rows, packed_ptr, sums_ptr,
- trailing_buf);
-
- constexpr int kChunkedRowMask = kNumRowChunks * Layout::kRows - 1;
- const bool trailing_data = (src_rows & kChunkedRowMask) > 0;
- // If the number of source rows is not a multiple of kChunkedRowMask, there
- // will be data in the trailing buffer,
- if (trailing_data > 0) {
- const int non_trailing_rows = src_rows & ~kChunkedRowMask;
- // Destination "rows" are padded to next highest multiple of Layout::kRows.
- const int dst_rows = (src_rows + 3) & ~3;
- const int trailing_rows = dst_rows - non_trailing_rows;
- memcpy(packed_ptr + Layout::kCols * non_trailing_rows, trailing_buf,
- Layout::kCols * trailing_rows * sizeof(std::int8_t));
- }
-}
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void PackFloatSse42(const float* src_ptr, const float* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows, float* packed_ptr) {
- profiler::ScopeLabel label("Pack kSse42 float (UNFINISHED)");
- static constexpr int kPackCols = 8; // Source cols packed together.
- static constexpr int kPackRows = 8; // Short input is padded.
- float trailing_buf[(kPackRows - 1) * kPackCols];
- if (remaining_src_cols < 8) {
- memset(trailing_buf, 0, sizeof(trailing_buf));
- }
- PackFloatSse42Packer(src_ptr, zerobuf, src_stride, remaining_src_cols,
- src_rows, packed_ptr, trailing_buf);
-
- const int trailing_rows = src_rows & (kPackRows - 1);
- if (trailing_rows > 0) {
- const int non_trailing_rows = src_rows & ~(kPackRows - 1);
- memcpy(packed_ptr + kPackCols * non_trailing_rows, trailing_buf,
- kPackCols * trailing_rows * sizeof(float));
- }
-}
-
-#endif // RUY_PLATFORM_SSE42 && RUY_OPT(INTRINSICS)
-
-} // namespace ruy
diff --git a/ruy/pack_x86.h b/ruy/pack_x86.h
index 076f2c2..a9e0d2a 100644
--- a/ruy/pack_x86.h
+++ b/ruy/pack_x86.h
@@ -35,16 +35,10 @@
#if RUY_PLATFORM_X86
-RUY_INHERIT_PACK(Path::kStandardCpp, Path::kSse42)
-RUY_INHERIT_PACK(Path::kSse42, Path::kAvx2)
+RUY_INHERIT_PACK(Path::kStandardCpp, Path::kAvx2)
RUY_INHERIT_PACK(Path::kAvx2, Path::kAvx512)
-RUY_INHERIT_PACK(Path::kAvx512, Path::kAvxVnni)
template <>
-struct PackedTypeImpl<Path::kSse42, std::uint8_t> {
- using Type = std::int8_t;
-};
-template <>
struct PackedTypeImpl<Path::kAvx2, std::uint8_t> {
using Type = std::int8_t;
};
@@ -52,100 +46,6 @@
struct PackedTypeImpl<Path::kAvx512, std::uint8_t> {
using Type = std::int8_t;
};
-template <>
-struct PackedTypeImpl<Path::kAvxVnni, std::uint8_t> {
- using Type = std::int8_t;
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// Note that source and zero buffers can be uint8 type, but in the packing
-// function are reinterpreted as int8, and are XOR-ed with input_xor.
-void Pack8bitSse42(const std::int8_t* src_ptr, std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr);
-
-template <typename Scalar>
-struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kColMajor, 4, 8>, Scalar,
- std::int8_t, std::int32_t> {
- static_assert(std::is_same<Scalar, std::int8_t>::value ||
- std::is_same<Scalar, std::uint8_t>::value,
- "");
- using Layout = FixedKernelLayout<Order::kColMajor, 4, 8>;
- static constexpr std::int8_t kInputXor =
- std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
-
- static void Run(Tuning, const Mat<Scalar>& src_matrix,
- PMat<std::int8_t>* packed_matrix, int start_col,
- int end_col) {
- profiler::ScopeLabel label("Pack (SSE 4.2 8-bit)");
-
- RUY_DCHECK(IsColMajor(src_matrix.layout));
- RUY_DCHECK(IsColMajor(packed_matrix->layout));
- RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
- RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
- std::int32_t* sums = packed_matrix->sums;
- Scalar zerobuf[Layout::kCols * Layout::kRows];
- memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
- Layout::kCols * Layout::kRows * sizeof(Scalar));
- for (int block_col = start_col; block_col < end_col;
- block_col += Layout::kCols) {
- std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
- int src_stride = src_matrix.layout.stride;
- const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
- int remaining_src_cols = src_matrix.layout.cols - block_col;
-
- static constexpr int block_col_mask = ~(Layout::kCols - 1); // High bits.
- std::int8_t* packed_ptr =
- packed_matrix->data +
- packed_matrix->layout.stride * (block_col & block_col_mask);
- Pack8bitSse42(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
- reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
- remaining_src_cols, src_matrix.layout.rows, packed_ptr,
- sums_ptr);
- }
- }
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void PackFloatSse42(const float* src_ptr, const float* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows, float* packed_ptr);
-
-template <>
-struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
- float, float> {
- using Layout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
- static void Run(Tuning, const Mat<float>& src_matrix,
- PMat<float>* packed_matrix, int start_col, int end_col) {
- profiler::ScopeLabel label("Pack (SSE 4.2 float)");
-
- RUY_DCHECK(IsColMajor(src_matrix.layout));
- RUY_DCHECK(IsColMajor(packed_matrix->layout));
- RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
- RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
- const float zerobuf[Layout::kCols] = {
- 0.0f}; // Remainder default inits to 0.0f.
- for (int block_col = start_col; block_col < end_col;
- block_col += Layout::kCols) {
- int src_stride = src_matrix.layout.stride;
- const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
- int remaining_src_cols = src_matrix.layout.cols - block_col;
-
- static constexpr int block_col_mask = ~(Layout::kCols - 1); // High bits.
- float* packed_ptr =
- packed_matrix->data +
- packed_matrix->layout.stride * (block_col & block_col_mask);
- PackFloatSse42(src_ptr, zerobuf, src_stride, remaining_src_cols,
- src_matrix.layout.rows, packed_ptr);
- }
- }
-};
// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
@@ -312,100 +212,6 @@
}
}
};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// Note that source and zero buffers can be uint8 type, but in the packing
-// function are reinterpreted as int8, and are XOR-ed with input_xor.
-void Pack8bitAvxVnni(const std::int8_t* src_ptr, std::int8_t input_xor,
- const std::int8_t* zerobuf, int src_stride,
- int remaining_src_cols, int src_rows,
- std::int8_t* packed_ptr, std::int32_t* sums_ptr);
-
-template <typename Scalar>
-struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kColMajor, 4, 16>,
- Scalar, std::int8_t, std::int32_t> {
- static_assert(std::is_same<Scalar, std::int8_t>::value ||
- std::is_same<Scalar, std::uint8_t>::value,
- "");
- using Layout = FixedKernelLayout<Order::kColMajor, 4, 16>;
- static constexpr int kHalfLayoutCols =
- 8; // Half the number of cols in a block.
- static constexpr std::int8_t kInputXor =
- std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
-
- static void Run(Tuning, const Mat<Scalar>& src_matrix,
- PMat<std::int8_t>* packed_matrix, int start_col,
- int end_col) {
- profiler::ScopeLabel label("Pack (AVX-512 8-bit)");
-
- RUY_DCHECK(IsColMajor(src_matrix.layout));
- RUY_DCHECK(IsColMajor(packed_matrix->layout));
- RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
- RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
- RUY_DCHECK_EQ(kHalfLayoutCols * 2, Layout::kCols);
- std::int32_t* sums = packed_matrix->sums;
- Scalar zerobuf[kHalfLayoutCols * Layout::kRows];
- memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
- kHalfLayoutCols * Layout::kRows * sizeof(Scalar));
- for (int block_col = start_col; block_col < end_col;
- block_col += Layout::kCols) {
- std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
- int src_stride = src_matrix.layout.stride;
- const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
- int remaining_src_cols = src_matrix.layout.cols - block_col;
-
- static constexpr int block_col_mask = ~(Layout::kCols - 1); // High bits.
- std::int8_t* packed_ptr =
- packed_matrix->data +
- packed_matrix->layout.stride * (block_col & block_col_mask);
- Pack8bitAvxVnni(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
- reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
- remaining_src_cols, src_matrix.layout.rows, packed_ptr,
- sums_ptr);
- }
- }
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void PackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
- int src_stride, int remaining_src_cols, int src_rows,
- float* packed_ptr);
-
-template <>
-struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kRowMajor, 1, 16>,
- float, float, float> {
- static void Run(Tuning, const Mat<float>& src_matrix,
- PMat<float>* packed_matrix, int start_col, int end_col) {
- profiler::ScopeLabel label("Pack (AVX-512 float)");
-
- using Layout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
- RUY_DCHECK(IsColMajor(src_matrix.layout));
- RUY_DCHECK(IsColMajor(packed_matrix->layout));
- RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
- RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
- const float zerobuf[Layout::kCols] = {
- 0.0f}; // Remainder default inits to 0.0f.
- for (int block_col = start_col; block_col < end_col;
- block_col += Layout::kCols) {
- int src_stride = src_matrix.layout.stride;
- const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
- int remaining_src_cols = src_matrix.layout.cols - block_col;
-
- static constexpr int block_col_mask = ~(Layout::kCols - 1); // High bits.
- float* packed_ptr =
- packed_matrix->data +
- packed_matrix->layout.stride * (block_col & block_col_mask);
- PackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
- src_matrix.layout.rows, packed_ptr);
- }
- }
-};
#endif // RUY_PLATFORM_X86
} // namespace ruy
diff --git a/ruy/path.h b/ruy/path.h
index 2a715d7..b2909c7 100644
--- a/ruy/path.h
+++ b/ruy/path.h
@@ -74,24 +74,10 @@
#if RUY_PLATFORM_X86
// x86 architectures.
//
- // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
- // placeholder.
- // Optimization is not finished. In particular the dimensions of the kernel
- // blocks can be changed as desired.
- //
- // Optimized for SSE 4.2.
- kSse42 = 0x4,
// Optimized for AVX2.
kAvx2 = 0x8,
// Optimized for AVX-512.
kAvx512 = 0x10,
- // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
- // placeholder.
- // Optimization is not finished. In particular the dimensions of the kernel
- // blocks can be changed as desired.
- //
- // Optimized for AVX-VNNI.
- kAvxVnni = 0x20,
#endif // RUY_PLATFORM_X86
};
@@ -154,7 +140,7 @@
constexpr Path kExtraArchPaths = Path::kNone;
#elif RUY_PLATFORM_X86
constexpr Path kDefaultArchPaths = Path::kAvx2 | Path::kAvx512;
-constexpr Path kExtraArchPaths = Path::kSse42 | Path::kAvxVnni;
+constexpr Path kExtraArchPaths = Path::kNone;
#else
constexpr Path kDefaultArchPaths = Path::kNone;
constexpr Path kExtraArchPaths = Path::kNone;
diff --git a/ruy/test.h b/ruy/test.h
index 902fbfd..1ae4801 100644
--- a/ruy/test.h
+++ b/ruy/test.h
@@ -93,10 +93,8 @@
RUY_PATHNAME_CASE(kNeon)
RUY_PATHNAME_CASE(kNeonDotprod)
#elif RUY_PLATFORM_X86
- RUY_PATHNAME_CASE(kSse42)
RUY_PATHNAME_CASE(kAvx2)
RUY_PATHNAME_CASE(kAvx512)
- RUY_PATHNAME_CASE(kAvxVnni)
#endif
default:
RUY_CHECK(false);