Remove SSE4.2 and VNNI placeholder code for now. PiperOrigin-RevId: 317157906

commit: 8dd913669fbd4707367ca6c2c656711df5b992f8 [log] [tgz]
author: Benoit Jacob <benoitjacob@google.com> Thu Jun 18 12:33:52 2020 -0700
committer: Copybara-Service <copybara-worker@google.com> Thu Jun 18 12:34:15 2020 -0700
tree: ed5f527d732ffbee74d691eae98c5faa5acdc442
parent: 4d8ad9f776fe58ba93d300b6b69c29c5ee11cd3c [diff]
diff --git a/ruy/BUILD b/ruy/BUILD
index bf5c7e5..00af835 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD

@@ -1,8 +1,8 @@
 # Ruy is not BLAS
 
 load("@bazel_skylib//lib:selects.bzl", "selects")
-load(":build_defs.bzl", "ruy_copts", "ruy_copts_avx2", "ruy_copts_avx512", "ruy_copts_sse42")
-load(":build_defs.oss.bzl", "ruy_copts_avxvnni", "ruy_linkopts_thread_standard_library")
+load(":build_defs.bzl", "ruy_copts", "ruy_copts_avx2", "ruy_copts_avx512")
+load(":build_defs.oss.bzl", "ruy_linkopts_thread_standard_library")
 load(":ruy_test_ext.oss.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
 load(":ruy_test.bzl", "ruy_benchmark", "ruy_test")
 
@@ -621,106 +621,6 @@
     ],
 )
 
-# TODO(b/147376783): SSE 4.2 support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-cc_library(
-    name = "kernel_sse42",
-    srcs = [
-        "kernel_sse42.cc",
-    ],
-    copts = ruy_copts() + ruy_copts_sse42(),
-    deps = [
-        ":check_macros",
-        ":kernel_common",
-        ":opt_set",
-        ":platform",
-        "//ruy/profiler:instrumentation",
-    ],
-)
-
-cc_library(
-    name = "pack_sse42",
-    srcs = [
-        "pack_sse42.cc",
-    ],
-    copts = ruy_copts() + ruy_copts_sse42(),
-    deps = [
-        ":check_macros",
-        ":matrix",
-        ":opt_set",
-        ":pack_common",
-        ":path",
-        ":platform",
-        "//ruy/profiler:instrumentation",
-    ],
-)
-
-cc_library(
-    name = "have_built_path_for_sse42",
-    srcs = [
-        "have_built_path_for_sse42.cc",
-    ],
-    hdrs = [
-        "have_built_path_for.h",
-    ],
-    copts = ruy_copts() + ruy_copts_sse42(),
-    deps = [
-        ":opt_set",
-        ":platform",
-    ],
-)
-
-# TODO(b/147376783): AVX-VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-cc_library(
-    name = "kernel_avxvnni",
-    srcs = [
-        "kernel_avxvnni.cc",
-    ],
-    copts = ruy_copts() + ruy_copts_avxvnni(),
-    deps = [
-        ":check_macros",
-        ":kernel_common",
-        ":opt_set",
-        ":platform",
-        "//ruy/profiler:instrumentation",
-    ],
-)
-
-cc_library(
-    name = "pack_avxvnni",
-    srcs = [
-        "pack_avxvnni.cc",
-    ],
-    copts = ruy_copts() + ruy_copts_avxvnni(),
-    deps = [
-        ":check_macros",
-        ":matrix",
-        ":opt_set",
-        ":pack_common",
-        ":path",
-        ":platform",
-        "//ruy/profiler:instrumentation",
-    ],
-)
-
-cc_library(
-    name = "have_built_path_for_avxvnni",
-    srcs = [
-        "have_built_path_for_avxvnni.cc",
-    ],
-    hdrs = [
-        "have_built_path_for.h",
-    ],
-    copts = ruy_copts() + ruy_copts_avxvnni(),
-    deps = [
-        ":opt_set",
-        ":platform",
-    ],
-)
-
 cc_library(
     name = "kernel",
     hdrs = [
@@ -735,9 +635,7 @@
         ":kernel_arm",  # fixdeps: keep
         ":kernel_avx2",  # fixdeps: keep
         ":kernel_avx512",  # fixdeps: keep
-        ":kernel_avxvnni",  # fixdeps: keep
         ":kernel_common",
-        ":kernel_sse42",  # fixdeps: keep
         ":mat",
         ":matrix",
         ":mul_params",
@@ -767,9 +665,7 @@
         ":pack_arm",  # fixdeps: keep
         ":pack_avx2",  # fixdeps: keep
         ":pack_avx512",  # fixdeps: keep
-        ":pack_avxvnni",  # fixdeps: keep
         ":pack_common",
-        ":pack_sse42",  # fixdeps: keep
         ":path",
         ":platform",
         ":tune",
@@ -785,8 +681,6 @@
     deps = [
         ":have_built_path_for_avx2",
         ":have_built_path_for_avx512",
-        ":have_built_path_for_avxvnni",
-        ":have_built_path_for_sse42",
         ":platform",
     ],
 )

diff --git a/ruy/build_defs.bzl b/ruy/build_defs.bzl
index 1da3b1b..594c4d9 100644
--- a/ruy/build_defs.bzl
+++ b/ruy/build_defs.bzl

@@ -68,12 +68,3 @@
         "//ruy:x86_64": ["-mavx2", "-mfma"],
         "//conditions:default": [],
     })
-
-# TODO(b/147376783): SSE 4.2 support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-def ruy_copts_sse42():
-    return select({
-        "//ruy:x86_64": ["-msse4.2"],
-        "//conditions:default": [],
-    })

diff --git a/ruy/build_defs.oss.bzl b/ruy/build_defs.oss.bzl
index 92eac57..e405b41 100644
--- a/ruy/build_defs.oss.bzl
+++ b/ruy/build_defs.oss.bzl

@@ -1,13 +1,5 @@
 """Build definitions for Ruy that are specific to the open-source build."""
 
-# TODO(b/147376783): VNNI support is incomplete / placeholder.
-# Optimization is not finished. In particular the dimensions of the kernel
-# blocks can be changed as desired.
-#
-# At the moment this does nothing because current toolchains don't support VNNI.
-def ruy_copts_avxvnni():
-    return []
-
 # Used for targets that #include <thread>
 def ruy_linkopts_thread_standard_library():
     # In open source builds, GCC is a common occurence. It requires "-pthread"

diff --git a/ruy/ctx.cc b/ruy/ctx.cc
index 87faec3..9924351 100644
--- a/ruy/ctx.cc
+++ b/ruy/ctx.cc

@@ -100,14 +100,10 @@
 #elif RUY_PLATFORM_X86
   // x86 SIMD paths currently require both runtime detection, and detection of
   // whether we're building the path at all.
-  maybe_add(Path::kSse42,
-            [=]() { return HaveBuiltPathForSse42() && cpuinfo->Sse42(); });
   maybe_add(Path::kAvx2,
             [=]() { return HaveBuiltPathForAvx2() && cpuinfo->Avx2(); });
   maybe_add(Path::kAvx512,
             [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
-  maybe_add(Path::kAvxVnni,
-            [=]() { return HaveBuiltPathForAvxVnni() && cpuinfo->AvxVnni(); });
 #else
   (void)maybe_add;
   (void)cpuinfo;

diff --git a/ruy/have_built_path_for.h b/ruy/have_built_path_for.h
index f0b1cd6..94761a7 100644
--- a/ruy/have_built_path_for.h
+++ b/ruy/have_built_path_for.h

@@ -21,10 +21,8 @@
 namespace ruy {
 
 #if RUY_PLATFORM_X86
-bool HaveBuiltPathForSse42();
 bool HaveBuiltPathForAvx2();
 bool HaveBuiltPathForAvx512();
-bool HaveBuiltPathForAvxVnni();
 #endif  // RUY_PLATFORM_X86
 
 }  // namespace ruy

diff --git a/ruy/have_built_path_for_avxvnni.cc b/ruy/have_built_path_for_avxvnni.cc
deleted file mode 100644
index f6e719f..0000000
--- a/ruy/have_built_path_for_avxvnni.cc
+++ /dev/null

@@ -1,39 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/have_built_path_for.h"
-#include "ruy/opt_set.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-// IMPORTANT:
-// These patterns must match those in the pack and kernel cc files.
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-bool HaveBuiltPathForAvxVnni() { return false; }
-
-#else  // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-bool HaveBuiltPathForAvxVnni() { return true; }
-
-#endif  // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-#endif  // RUY_PLATFORM_X86
-
-}  // namespace ruy

diff --git a/ruy/have_built_path_for_sse42.cc b/ruy/have_built_path_for_sse42.cc
deleted file mode 100644
index 182f0d9..0000000
--- a/ruy/have_built_path_for_sse42.cc
+++ /dev/null

@@ -1,39 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/have_built_path_for.h"
-#include "ruy/opt_set.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-// IMPORTANT:
-// These patterns must match those in the pack and kernel cc files.
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-bool HaveBuiltPathForSse42() { return false; }
-
-#else  // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-bool HaveBuiltPathForSse42() { return true; }
-
-#endif  // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-#endif  // RUY_PLATFORM_X86
-
-}  // namespace ruy

diff --git a/ruy/kernel_avxvnni.cc b/ruy/kernel_avxvnni.cc
deleted file mode 100644
index 0045185..0000000
--- a/ruy/kernel_avxvnni.cc
+++ /dev/null

@@ -1,435 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cstdint>
-
-#include "ruy/check_macros.h"
-#include "ruy/kernel.h"
-#include "ruy/opt_set.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-#include <immintrin.h>  // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>&) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>&) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-#else  // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-static constexpr int kAvxFloatBlockSize = 16;
-static constexpr int kAvx8bitBlockSize = 16;
-static constexpr int kAvx8bitInnerSize = 4;
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>& params) {
-  profiler::ScopeLabel label("Kernel kAvxVnni 8-bit (UNFINISHED)");
-
-  std::int32_t accum_data[kAvx8bitBlockSize][kAvx8bitBlockSize];
-
-  int bias_ptr_block_increment =
-      params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvx8bitBlockSize : 0;
-
-  const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
-  void* dst_col_ptr = params.dst_base_ptr;
-  const std::int32_t* bias_col_ptr = params.bias;
-  if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
-    bias_col_ptr += params.start_row;
-  }
-
-  for (int col = params.start_col; col <= params.last_col;
-       col += kAvx8bitBlockSize) {
-    const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
-    void* dst_ptr = dst_col_ptr;
-    const std::int32_t* bias_ptr = bias_col_ptr;
-
-    for (int row = params.start_row; row <= params.last_row;
-         row += kAvx8bitBlockSize) {
-      const int residual_rows =
-          std::min(params.dst_rows - row, kAvx8bitBlockSize);
-      const int residual_cols =
-          std::min(params.dst_cols - col, kAvx8bitBlockSize);
-
-      // Initialize with bias.
-      std::int32_t initial_accum_data[kAvx8bitBlockSize];
-      for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-        initial_accum_data[i] = 0;
-      }
-      for (int i = 0; i < residual_rows; ++i) {
-        initial_accum_data[i] = bias_ptr[i];
-      }
-
-      for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-        for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-          accum_data[j][i] = initial_accum_data[i];
-        }
-      }
-      bias_ptr += bias_ptr_block_increment;
-
-      std::int8_t lhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
-      std::int8_t rhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
-      const std::int8_t* lhs_ptr = lhs_col_ptr;
-      const std::int8_t* rhs_ptr = rhs_col_ptr;
-      for (int d = 0; d < params.depth; d += kAvx8bitInnerSize) {
-        for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-          for (int x = 0; x < kAvx8bitInnerSize; ++x) {
-            lhs_data[i][x] = lhs_ptr[i * kAvx8bitInnerSize + x];
-            rhs_data[i][x] = rhs_ptr[i * kAvx8bitInnerSize + x];
-          }
-        }
-
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            for (int x = 0; x < kAvx8bitInnerSize; ++x) {
-              accum_data[j][i] += lhs_data[i][x] * rhs_data[j][x];
-            }
-          }
-        }
-
-        lhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
-        rhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
-      }
-
-      if ((params.flags & RUY_ASM_FLAG_HAS_LHS_SUMS) && params.rhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] -=
-                params.rhs_zero_point * params.lhs_sums[row + i];
-          }
-        }
-      }
-      if ((params.flags & RUY_ASM_FLAG_HAS_RHS_SUMS) && params.lhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] -=
-                params.lhs_zero_point * params.rhs_sums[col + j];
-          }
-        }
-      }
-      if (params.lhs_zero_point && params.rhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] += params.prod_zp_depth;
-          }
-        }
-      }
-
-      if (params.dst_type_id != DstTypeId<std::int32_t>::kValue) {
-        std::int32_t m_vector[kAvx8bitBlockSize];
-        std::int32_t e_vector[kAvx8bitBlockSize];
-        // Does not make use of RUY_ASM_FLAG_NEEDS_LEFT_SHIFT.
-        if (params.flags & RUY_ASM_FLAG_HAS_PERCHANNEL) {
-          int i = 0;
-          for (; i < residual_rows; ++i) {
-            m_vector[i] = params.multiplier_fixedpoint[row + i];
-            e_vector[i] = params.multiplier_exponent[row + i];
-          }
-          for (; i < kAvx8bitBlockSize; ++i) {
-            m_vector[i] = m_vector[0];
-            e_vector[i] = e_vector[0];
-          }
-        } else {
-          // These arrays have size LhsCols, and are pre-filled.
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            m_vector[i] = params.multiplier_fixedpoint[i];
-            e_vector[i] = params.multiplier_exponent[i];
-          }
-        }
-
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] = MultiplyByQuantizedMultiplier(
-                accum_data[j][i], m_vector[i], e_vector[i]);
-          }
-        }
-
-        if (params.dst_zero_point) {
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              accum_data[j][i] += params.dst_zero_point;
-            }
-          }
-        }
-
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] =
-                std::min<std::int32_t>(accum_data[j][i], params.clamp_max);
-            accum_data[j][i] =
-                std::max<std::int32_t>(accum_data[j][i], params.clamp_min);
-          }
-        }
-      }
-
-      const bool store_full_block = (residual_rows == kAvx8bitBlockSize) &&
-                                    (residual_cols == kAvx8bitBlockSize);
-
-      if (params.dst_type_id == DstTypeId<std::int8_t>::kValue) {
-        std::int8_t* tmp_ptr =
-            store_full_block
-                ? static_cast<std::int8_t*>(dst_ptr)
-                : const_cast<std::int8_t*>(
-                      reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf));
-        const int block_col_offset =
-            store_full_block ? params.dst_stride / sizeof(std::int8_t)
-                             : kAvx8bitBlockSize;
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            tmp_ptr[i] = accum_data[j][i];
-          }
-          tmp_ptr += block_col_offset;
-        }
-
-        if (!store_full_block) {
-          const std::int8_t* block_ptr =
-              reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              static_cast<std::int8_t*>(
-                  dst_ptr)[j * params.dst_stride / sizeof(std::int8_t) + i] =
-                  block_ptr[i];
-            }
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int8_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::uint8_t>::kValue) {
-        std::uint8_t* tmp_ptr = store_full_block
-                                    ? static_cast<std::uint8_t*>(dst_ptr)
-                                    : const_cast<std::uint8_t*>(
-                                          reinterpret_cast<const std::uint8_t*>(
-                                              params.dst_tmp_buf));
-        const int block_col_offset =
-            store_full_block ? params.dst_stride : kAvx8bitBlockSize;
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            tmp_ptr[i] = accum_data[j][i];
-          }
-          tmp_ptr += block_col_offset;
-        }
-
-        if (!store_full_block) {
-          const std::uint8_t* block_ptr =
-              reinterpret_cast<const std::uint8_t*>(params.dst_tmp_buf);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              static_cast<std::uint8_t*>(
-                  dst_ptr)[j * params.dst_stride / sizeof(std::uint8_t) + i] =
-                  block_ptr[i];
-            }
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::uint8_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::int16_t>::kValue) {
-        if (store_full_block) {
-          std::int16_t* tmp_ptr = static_cast<std::int16_t*>(dst_ptr);
-          const int block_col_offset = params.dst_stride / sizeof(std::int16_t);
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-        } else {
-          std::int16_t* tmp_ptr = const_cast<std::int16_t*>(
-              reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf));
-          const int block_col_offset = kAvx8bitBlockSize;
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-          const std::int16_t* block_ptr =
-              reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf);
-          std::int16_t* dst_block_ptr = static_cast<std::int16_t*>(dst_ptr);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              dst_block_ptr[i] = block_ptr[i];
-            }
-            dst_block_ptr += params.dst_stride / sizeof(std::int16_t);
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int16_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::int32_t>::kValue) {
-        if (store_full_block) {
-          std::int32_t* tmp_ptr = static_cast<std::int32_t*>(dst_ptr);
-          const int block_col_offset = params.dst_stride / sizeof(std::int32_t);
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-        } else {
-          std::int32_t* dst_block_ptr = static_cast<std::int32_t*>(dst_ptr);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              dst_block_ptr[i] = accum_data[j][i];
-            }
-            dst_block_ptr += params.dst_stride / sizeof(std::int32_t);
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int32_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else {
-        RUY_DCHECK(false);
-      }
-
-      lhs_col_ptr += kAvx8bitBlockSize * params.lhs_stride;
-    }  // End row-block loop.
-
-    dst_col_ptr = static_cast<void*>(static_cast<char*>(dst_col_ptr) +
-                                     kAvx8bitBlockSize * params.dst_stride);
-    rhs_col_ptr += kAvx8bitBlockSize * params.rhs_stride;
-  }  // End col-block loop.
-}  // NOLINT(readability/fn_size)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>& params) {
-  profiler::ScopeLabel label("Kernel kAvxVnni float (UNFINISHED)");
-
-  float lhs_data[kAvxFloatBlockSize];
-  float rhs_data[kAvxFloatBlockSize];
-  float accum_data[kAvxFloatBlockSize][kAvxFloatBlockSize];
-  int bias_ptr_block_increment =
-      params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvxFloatBlockSize : 0;
-
-  const float* rhs_col_ptr = params.rhs_base_ptr;
-  float* dst_col_ptr = params.dst_base_ptr;
-  const float* bias_col_ptr = params.bias;
-  if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
-    bias_col_ptr += params.start_row;
-  }
-
-  for (int col = params.start_col; col <= params.last_col;
-       col += kAvxFloatBlockSize) {
-    const float* lhs_col_ptr = params.lhs_base_ptr;
-    float* dst_ptr = dst_col_ptr;
-    const float* bias_ptr = bias_col_ptr;
-
-    for (int row = params.start_row; row <= params.last_row;
-         row += kAvxFloatBlockSize) {
-      const int residual_rows =
-          std::min(params.dst_rows - row, kAvxFloatBlockSize);
-      const int residual_cols =
-          std::min(params.dst_cols - col, kAvxFloatBlockSize);
-
-      // Initialize with bias.
-      float initial_accum_data[kAvxFloatBlockSize];
-      for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-        initial_accum_data[i] = 0.0f;
-      }
-      for (int i = 0; i < residual_rows; ++i) {
-        initial_accum_data[i] = bias_ptr[i];
-      }
-      for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          accum_data[j][i] = initial_accum_data[i];
-        }
-      }
-      bias_ptr += bias_ptr_block_increment;
-
-      const float* lhs_ptr = lhs_col_ptr;
-      const float* rhs_ptr = rhs_col_ptr;
-      for (int d = 0; d < params.depth; ++d) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          lhs_data[i] = lhs_ptr[i];
-          rhs_data[i] = rhs_ptr[i];
-        }
-
-        for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-          for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-            accum_data[j][i] += lhs_data[i] * rhs_data[j];
-          }
-        }
-
-        lhs_ptr += kAvxFloatBlockSize;
-        rhs_ptr += kAvxFloatBlockSize;
-      }
-
-      for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          accum_data[j][i] =
-              std::min<float>(accum_data[j][i], params.clamp_max);
-          accum_data[j][i] =
-              std::max<float>(accum_data[j][i], params.clamp_min);
-        }
-      }
-
-      const bool store_full_block = (residual_rows == kAvxFloatBlockSize) &&
-                                    (residual_cols == kAvxFloatBlockSize);
-
-      {
-        float* block_ptr =
-            store_full_block ? dst_ptr : const_cast<float*>(params.dst_tmp_buf);
-        const int block_col_offset = store_full_block
-                                         ? params.dst_stride / sizeof(float)
-                                         : kAvxFloatBlockSize;
-        for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-          for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-            block_ptr[i] = accum_data[j][i];
-          }
-          block_ptr += block_col_offset;
-        }
-      }
-      if (!store_full_block) {
-        const float* block_ptr = params.dst_tmp_buf;
-        for (int j = 0; j < residual_cols; ++j) {
-          for (int i = 0; i < residual_rows; ++i) {
-            dst_ptr[j * params.dst_stride / sizeof(float) + i] = block_ptr[i];
-          }
-          block_ptr += kAvxFloatBlockSize;
-        }
-      }
-
-      lhs_col_ptr += kAvxFloatBlockSize * params.lhs_stride / sizeof(float);
-      dst_ptr += kAvxFloatBlockSize;
-    }  // End row-block loop.
-
-    dst_col_ptr += kAvxFloatBlockSize * params.dst_stride / sizeof(float);
-    rhs_col_ptr += kAvxFloatBlockSize * params.rhs_stride / sizeof(float);
-  }  // End col-block loop.
-}
-
-#endif  //  RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-}  // namespace ruy

diff --git a/ruy/kernel_sse42.cc b/ruy/kernel_sse42.cc
deleted file mode 100644
index 962b6e9..0000000
--- a/ruy/kernel_sse42.cc
+++ /dev/null

@@ -1,428 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cstdint>
-
-#include "ruy/check_macros.h"
-#include "ruy/kernel.h"
-#include "ruy/opt_set.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-#include <immintrin.h>  // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-void Kernel8bitSse42(const KernelParams8bit<8, 8>&) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-void KernelFloatSse42(const KernelParamsFloat<8, 8>&) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-#else  // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-static constexpr int kAvxFloatBlockSize = 8;
-static constexpr int kAvx8bitBlockSize = 8;
-static constexpr int kAvx8bitInnerSize = 4;
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Kernel8bitSse42(const KernelParams8bit<8, 8>& params) {
-  profiler::ScopeLabel label("Kernel kSse42 8-bit (UNFINISHED)");
-  std::int32_t accum_data[kAvx8bitBlockSize][kAvx8bitBlockSize];
-
-  int bias_ptr_block_increment =
-      params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvx8bitBlockSize : 0;
-
-  const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
-  void* dst_col_ptr = params.dst_base_ptr;
-  const std::int32_t* bias_col_ptr = params.bias;
-  if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
-    bias_col_ptr += params.start_row;
-  }
-
-  for (int col = params.start_col; col <= params.last_col;
-       col += kAvx8bitBlockSize) {
-    const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
-    void* dst_ptr = dst_col_ptr;
-    const std::int32_t* bias_ptr = bias_col_ptr;
-
-    for (int row = params.start_row; row <= params.last_row;
-         row += kAvx8bitBlockSize) {
-      const int residual_rows =
-          std::min(params.dst_rows - row, kAvx8bitBlockSize);
-      const int residual_cols =
-          std::min(params.dst_cols - col, kAvx8bitBlockSize);
-
-      // Initialize with bias.
-      std::int32_t initial_accum_data[kAvx8bitBlockSize];
-      for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-        initial_accum_data[i] = 0;
-      }
-      for (int i = 0; i < residual_rows; ++i) {
-        initial_accum_data[i] = bias_ptr[i];
-      }
-      for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-        for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-          accum_data[j][i] = initial_accum_data[i];
-        }
-      }
-      bias_ptr += bias_ptr_block_increment;
-
-      std::int8_t lhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
-      std::int8_t rhs_data[kAvx8bitBlockSize][kAvx8bitInnerSize];
-      const std::int8_t* lhs_ptr = lhs_col_ptr;
-      const std::int8_t* rhs_ptr = rhs_col_ptr;
-      for (int d = 0; d < params.depth; d += kAvx8bitInnerSize) {
-        for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-          for (int x = 0; x < kAvx8bitInnerSize; ++x) {
-            lhs_data[i][x] = lhs_ptr[i * kAvx8bitInnerSize + x];
-            rhs_data[i][x] = rhs_ptr[i * kAvx8bitInnerSize + x];
-          }
-        }
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            for (int x = 0; x < kAvx8bitInnerSize; ++x) {
-              accum_data[j][i] += lhs_data[i][x] * rhs_data[j][x];
-            }
-          }
-        }
-        lhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
-        rhs_ptr += kAvx8bitBlockSize * kAvx8bitInnerSize;
-      }
-
-      if ((params.flags & RUY_ASM_FLAG_HAS_LHS_SUMS) && params.rhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] -=
-                params.rhs_zero_point * params.lhs_sums[row + i];
-          }
-        }
-      }
-      if ((params.flags & RUY_ASM_FLAG_HAS_RHS_SUMS) && params.lhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] -=
-                params.lhs_zero_point * params.rhs_sums[col + j];
-          }
-        }
-      }
-      if (params.lhs_zero_point && params.rhs_zero_point) {
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] += params.prod_zp_depth;
-          }
-        }
-      }
-
-      if (params.dst_type_id != DstTypeId<std::int32_t>::kValue) {
-        std::int32_t m_vector[kAvx8bitBlockSize];
-        std::int32_t e_vector[kAvx8bitBlockSize];
-        // Does not make use of RUY_ASM_FLAG_NEEDS_LEFT_SHIFT.
-        if (params.flags & RUY_ASM_FLAG_HAS_PERCHANNEL) {
-          int i = 0;
-          for (; i < residual_rows; ++i) {
-            m_vector[i] = params.multiplier_fixedpoint[row + i];
-            e_vector[i] = params.multiplier_exponent[row + i];
-          }
-          for (; i < kAvx8bitBlockSize; ++i) {
-            m_vector[i] = m_vector[0];
-            e_vector[i] = e_vector[0];
-          }
-        } else {
-          // These arrays have size LhsCols, and are pre-filled.
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            m_vector[i] = params.multiplier_fixedpoint[i];
-            e_vector[i] = params.multiplier_exponent[i];
-          }
-        }
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] = MultiplyByQuantizedMultiplier(
-                accum_data[j][i], m_vector[i], e_vector[i]);
-          }
-        }
-
-        if (params.dst_zero_point) {
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              accum_data[j][i] += params.dst_zero_point;
-            }
-          }
-        }
-
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            accum_data[j][i] =
-                std::min<std::int32_t>(accum_data[j][i], params.clamp_max);
-            accum_data[j][i] =
-                std::max<std::int32_t>(accum_data[j][i], params.clamp_min);
-          }
-        }
-      }
-
-      const bool store_full_block = (residual_rows == kAvx8bitBlockSize) &&
-                                    (residual_cols == kAvx8bitBlockSize);
-
-      if (params.dst_type_id == DstTypeId<std::int8_t>::kValue) {
-        std::int8_t* tmp_ptr =
-            store_full_block
-                ? static_cast<std::int8_t*>(dst_ptr)
-                : const_cast<std::int8_t*>(
-                      reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf));
-        const int block_col_offset =
-            store_full_block ? params.dst_stride / sizeof(std::int8_t)
-                             : kAvx8bitBlockSize;
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            tmp_ptr[i] = accum_data[j][i];
-          }
-          tmp_ptr += block_col_offset;
-        }
-
-        if (!store_full_block) {
-          const std::int8_t* block_ptr =
-              reinterpret_cast<const std::int8_t*>(params.dst_tmp_buf);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              static_cast<std::int8_t*>(
-                  dst_ptr)[j * params.dst_stride / sizeof(std::int8_t) + i] =
-                  block_ptr[i];
-            }
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int8_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::uint8_t>::kValue) {
-        std::uint8_t* tmp_ptr = store_full_block
-                                    ? static_cast<std::uint8_t*>(dst_ptr)
-                                    : const_cast<std::uint8_t*>(
-                                          reinterpret_cast<const std::uint8_t*>(
-                                              params.dst_tmp_buf));
-        const int block_col_offset =
-            store_full_block ? params.dst_stride : kAvx8bitBlockSize;
-        for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-          for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-            tmp_ptr[i] = accum_data[j][i];
-          }
-          tmp_ptr += block_col_offset;
-        }
-
-        if (!store_full_block) {
-          const std::uint8_t* block_ptr =
-              reinterpret_cast<const std::uint8_t*>(params.dst_tmp_buf);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              static_cast<std::uint8_t*>(
-                  dst_ptr)[j * params.dst_stride / sizeof(std::uint8_t) + i] =
-                  block_ptr[i];
-            }
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::uint8_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::int16_t>::kValue) {
-        if (store_full_block) {
-          std::int16_t* tmp_ptr = static_cast<std::int16_t*>(dst_ptr);
-          const int block_col_offset = params.dst_stride / sizeof(std::int16_t);
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-        } else {
-          std::int16_t* tmp_ptr = const_cast<std::int16_t*>(
-              reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf));
-          const int block_col_offset = kAvx8bitBlockSize;
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-          const std::int16_t* block_ptr =
-              reinterpret_cast<const std::int16_t*>(params.dst_tmp_buf);
-          std::int16_t* dst_block_ptr = static_cast<std::int16_t*>(dst_ptr);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              dst_block_ptr[i] = block_ptr[i];
-            }
-            dst_block_ptr += params.dst_stride / sizeof(std::int16_t);
-            block_ptr += kAvx8bitBlockSize;
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int16_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else if (params.dst_type_id == DstTypeId<std::int32_t>::kValue) {
-        if (store_full_block) {
-          std::int32_t* tmp_ptr = static_cast<std::int32_t*>(dst_ptr);
-          const int block_col_offset = params.dst_stride / sizeof(std::int32_t);
-          for (int j = 0; j < kAvx8bitBlockSize; ++j) {
-            for (int i = 0; i < kAvx8bitBlockSize; ++i) {
-              tmp_ptr[i] = accum_data[j][i];
-            }
-            tmp_ptr += block_col_offset;
-          }
-        } else {
-          std::int32_t* dst_block_ptr = static_cast<std::int32_t*>(dst_ptr);
-          for (int j = 0; j < residual_cols; ++j) {
-            for (int i = 0; i < residual_rows; ++i) {
-              dst_block_ptr[i] = accum_data[j][i];
-            }
-            dst_block_ptr += params.dst_stride / sizeof(std::int32_t);
-          }
-        }
-        dst_ptr = static_cast<void*>(static_cast<std::int32_t*>(dst_ptr) +
-                                     kAvx8bitBlockSize);
-      } else {
-        RUY_DCHECK(false);
-      }
-
-      lhs_col_ptr += kAvx8bitBlockSize * params.lhs_stride;
-    }  // End row-block loop.
-
-    dst_col_ptr = static_cast<void*>(static_cast<char*>(dst_col_ptr) +
-                                     kAvx8bitBlockSize * params.dst_stride);
-    rhs_col_ptr += kAvx8bitBlockSize * params.rhs_stride;
-  }  // End col-block loop.
-}  // NOLINT(readability/fn_size)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void KernelFloatSse42(const KernelParamsFloat<8, 8>& params) {
-  profiler::ScopeLabel label("Kernel kSse42 float (UNFINISHED)");
-
-  float lhs_data[kAvxFloatBlockSize];
-  float rhs_data[kAvxFloatBlockSize];
-  float accum_data[kAvxFloatBlockSize][kAvxFloatBlockSize];
-  int bias_ptr_block_increment =
-      params.flags & RUY_ASM_FLAG_HAS_BIAS ? kAvxFloatBlockSize : 0;
-
-  const float* rhs_col_ptr = params.rhs_base_ptr;
-  float* dst_col_ptr = params.dst_base_ptr;
-  const float* bias_col_ptr = params.bias;
-  if (params.flags & RUY_ASM_FLAG_HAS_BIAS) {
-    bias_col_ptr += params.start_row;
-  }
-
-  for (int col = params.start_col; col <= params.last_col;
-       col += kAvxFloatBlockSize) {
-    const float* lhs_col_ptr = params.lhs_base_ptr;
-    float* dst_ptr = dst_col_ptr;
-    const float* bias_ptr = bias_col_ptr;
-
-    for (int row = params.start_row; row <= params.last_row;
-         row += kAvxFloatBlockSize) {
-      const int residual_rows =
-          std::min(params.dst_rows - row, kAvxFloatBlockSize);
-      const int residual_cols =
-          std::min(params.dst_cols - col, kAvxFloatBlockSize);
-
-      // Initialize with bias.
-      float initial_accum_data[kAvxFloatBlockSize];
-      for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-        initial_accum_data[i] = 0.0f;
-      }
-      for (int i = 0; i < residual_rows; ++i) {
-        initial_accum_data[i] = bias_ptr[i];
-      }
-      for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          accum_data[j][i] = initial_accum_data[i];
-        }
-      }
-      bias_ptr += bias_ptr_block_increment;
-
-      const float* lhs_ptr = lhs_col_ptr;
-      const float* rhs_ptr = rhs_col_ptr;
-      for (int d = 0; d < params.depth; ++d) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          lhs_data[i] = lhs_ptr[i];
-          rhs_data[i] = rhs_ptr[i];
-        }
-        for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-          for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-            accum_data[j][i] += lhs_data[i] * rhs_data[j];
-          }
-        }
-        lhs_ptr += kAvxFloatBlockSize;
-        rhs_ptr += kAvxFloatBlockSize;
-      }
-
-      for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-        for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-          accum_data[j][i] =
-              std::min<float>(accum_data[j][i], params.clamp_max);
-          accum_data[j][i] =
-              std::max<float>(accum_data[j][i], params.clamp_min);
-        }
-      }
-
-      const bool store_full_block = (residual_rows == kAvxFloatBlockSize) &&
-                                    (residual_cols == kAvxFloatBlockSize);
-
-      {
-        float* block_ptr =
-            store_full_block ? dst_ptr : const_cast<float*>(params.dst_tmp_buf);
-        const int block_col_offset = store_full_block
-                                         ? params.dst_stride / sizeof(float)
-                                         : kAvxFloatBlockSize;
-        for (int j = 0; j < kAvxFloatBlockSize; ++j) {
-          for (int i = 0; i < kAvxFloatBlockSize; ++i) {
-            block_ptr[i] = accum_data[j][i];
-          }
-          block_ptr += block_col_offset;
-        }
-      }
-      if (!store_full_block) {
-        const float* block_ptr = params.dst_tmp_buf;
-        for (int j = 0; j < residual_cols; ++j) {
-          for (int i = 0; i < residual_rows; ++i) {
-            dst_ptr[j * params.dst_stride / sizeof(float) + i] = block_ptr[i];
-          }
-          block_ptr += kAvxFloatBlockSize;
-        }
-      }
-
-      lhs_col_ptr += kAvxFloatBlockSize * params.lhs_stride / sizeof(float);
-      dst_ptr += kAvxFloatBlockSize;
-    }  // End row-block loop.
-
-    dst_col_ptr += kAvxFloatBlockSize * params.dst_stride / sizeof(float);
-    rhs_col_ptr += kAvxFloatBlockSize * params.rhs_stride / sizeof(float);
-  }  // End col-block loop.
-}
-
-#endif  //  RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-}  // namespace ruy

diff --git a/ruy/kernel_x86.h b/ruy/kernel_x86.h
index cada2a6..d20c1a4 100644
--- a/ruy/kernel_x86.h
+++ b/ruy/kernel_x86.h

@@ -32,53 +32,8 @@
 
 #if RUY_PLATFORM_X86
 
-RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kSse42)
-RUY_INHERIT_KERNEL(Path::kSse42, Path::kAvx2)
+RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kAvx2)
 RUY_INHERIT_KERNEL(Path::kAvx2, Path::kAvx512)
-RUY_INHERIT_KERNEL(Path::kAvx512, Path::kAvxVnni)
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void Kernel8bitSse42(const KernelParams8bit<8, 8>& params);
-
-template <typename DstScalar>
-struct Kernel<Path::kSse42, std::int8_t, std::int8_t, DstScalar,
-              MulParams<std::int32_t, DstScalar>> {
-  static constexpr Path kPath = Path::kSse42;
-  Tuning tuning = Tuning::kAuto;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 4, 8>;
-  explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const PMat<std::int8_t>& lhs, const PMat<std::int8_t>& rhs,
-           const MulParams<std::int32_t, DstScalar>& mul_params, int start_row,
-           int start_col, int end_row, int end_col, Mat<DstScalar>* dst) const {
-    KernelParams8bit<LhsLayout::kCols, RhsLayout::kCols> params;
-    MakeKernelParams8bit(lhs, rhs, mul_params, start_row, start_col, end_row,
-                         end_col, dst, &params);
-    Kernel8bitSse42(params);
-  }
-};
-
-void KernelFloatSse42(const KernelParamsFloat<8, 8>& params);
-
-template <>
-struct Kernel<Path::kSse42, float, float, float, MulParams<float, float>> {
-  Tuning tuning = Tuning::kAuto;
-  static constexpr Path kPath = Path::kSse42;
-  using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
-  using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
-  explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const PMat<float>& lhs, const PMat<float>& rhs,
-           const MulParams<float, float>& mul_params, int start_row,
-           int start_col, int end_row, int end_col, Mat<float>* dst) const {
-    KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
-    MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
-                          end_col, dst, &params);
-    KernelFloatSse42(params);
-  }
-};
 
 void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params);
 void Kernel8bitAvx512SingleCol(const KernelParams8bit<16, 16>& params);
@@ -178,49 +133,6 @@
   }
 };
 
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void Kernel8bitAvxVnni(const KernelParams8bit<16, 16>& params);
-
-template <typename DstScalar>
-struct Kernel<Path::kAvxVnni, std::int8_t, std::int8_t, DstScalar,
-              MulParams<std::int32_t, DstScalar>> {
-  static constexpr Path kPath = Path::kAvxVnni;
-  Tuning tuning = Tuning::kAuto;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 4, 16>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 4, 16>;
-  explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const PMat<std::int8_t>& lhs, const PMat<std::int8_t>& rhs,
-           const MulParams<std::int32_t, DstScalar>& mul_params, int start_row,
-           int start_col, int end_row, int end_col, Mat<DstScalar>* dst) const {
-    KernelParams8bit<LhsLayout::kCols, RhsLayout::kCols> params;
-    MakeKernelParams8bit(lhs, rhs, mul_params, start_row, start_col, end_row,
-                         end_col, dst, &params);
-    Kernel8bitAvxVnni(params);
-  }
-};
-
-void KernelFloatAvxVnni(const KernelParamsFloat<16, 16>& params);
-
-template <>
-struct Kernel<Path::kAvxVnni, float, float, float, MulParams<float, float>> {
-  static constexpr Path kPath = Path::kAvxVnni;
-  Tuning tuning = Tuning::kAuto;
-  using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
-  using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
-  explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
-  void Run(const PMat<float>& lhs, const PMat<float>& rhs,
-           const MulParams<float, float>& mul_params, int start_row,
-           int start_col, int end_row, int end_col, Mat<float>* dst) const {
-    KernelParamsFloat<LhsLayout::kCols, RhsLayout::kCols> params;
-    MakeKernelParamsFloat(lhs, rhs, mul_params, start_row, start_col, end_row,
-                          end_col, dst, &params);
-    KernelFloatAvxVnni(params);
-  }
-};
-
 #endif  // RUY_PLATFORM_X86
 
 }  // namespace ruy

diff --git a/ruy/pack_avxvnni.cc b/ruy/pack_avxvnni.cc
deleted file mode 100644
index a467bbc..0000000
--- a/ruy/pack_avxvnni.cc
+++ /dev/null

@@ -1,474 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <cstring>
-
-#include "ruy/check_macros.h"
-#include "ruy/matrix.h"
-#include "ruy/opt_set.h"
-#include "ruy/pack.h"
-#include "ruy/path.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_AVX_VNNI && RUY_OPT(INTRINSICS)
-#include <immintrin.h>  // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM))
-
-void Pack8bitAvxVnni(const std::int8_t*, std::int8_t, const std::int8_t*, int,
-                     int, int, std::int8_t*, std::int32_t*) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-void PackFloatAvxVnni(const float*, const float*, int, int, int, float*) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-#else  // RUY_PLATFORM_AVX_VNNI && RUY_OPT(ASM)
-
-// The first int8_t template parameter is arbitrary: this routine is common to
-// all 8-bit source matrix types.
-using PackImpl8bitAvxVnni =
-    PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kColMajor, 4, 16>,
-             std::int8_t, std::int8_t, std::int32_t>;
-
-namespace {
-
-inline void ZeroHalf8bitAvxVnni(int src_rows, std::int8_t packed_zero_point,
-                                std::int8_t* packed_ptr) {
-  const int non_trailing_blocks = (src_rows & ~31) >> 2;
-  // This routine fills half blocks, and typically fills the second halves. Thus
-  // packed_ptr is already offset by 8*4.
-  for (int k = 0; k < non_trailing_blocks; ++k) {
-    for (int j = 0; j < (8 * 4); ++j) {
-      packed_ptr[16 * 4 * k + j] = packed_zero_point;
-    }
-  }
-}
-
-inline void HalfPack8bitAvxVnni(const std::int8_t* src_ptr,
-                                std::int8_t input_xor,
-                                const std::int8_t* zerobuf, int src_stride,
-                                int remaining_src_cols, int src_rows,
-                                std::int8_t* packed_ptr, std::int32_t* sums_ptr,
-                                std::int8_t* trailing_buf) {
-  std::int8_t in_data[8][8][4];
-
-  const std::int8_t* src_ptr0 = src_ptr;
-  const std::int8_t* src_ptr1 = src_ptr0 + src_stride;
-  const std::int8_t* src_ptr2 = src_ptr1 + src_stride;
-  const std::int8_t* src_ptr3 = src_ptr2 + src_stride;
-  const std::int8_t* src_ptr4 = src_ptr3 + src_stride;
-  const std::int8_t* src_ptr5 = src_ptr4 + src_stride;
-  const std::int8_t* src_ptr6 = src_ptr5 + src_stride;
-  const std::int8_t* src_ptr7 = src_ptr6 + src_stride;
-  std::int64_t src_inc0 = 8 * 4;
-  std::int64_t src_inc1 = 8 * 4;
-  std::int64_t src_inc2 = 8 * 4;
-  std::int64_t src_inc3 = 8 * 4;
-  std::int64_t src_inc4 = 8 * 4;
-  std::int64_t src_inc5 = 8 * 4;
-  std::int64_t src_inc6 = 8 * 4;
-  std::int64_t src_inc7 = 8 * 4;
-  if (remaining_src_cols < 8) {
-    if (remaining_src_cols <= 0) {
-      src_ptr0 = zerobuf;
-      src_inc0 = 0;
-    }
-    if (remaining_src_cols <= 1) {
-      src_ptr1 = zerobuf;
-      src_inc1 = 0;
-    }
-    if (remaining_src_cols <= 2) {
-      src_ptr2 = zerobuf;
-      src_inc2 = 0;
-    }
-    if (remaining_src_cols <= 3) {
-      src_ptr3 = zerobuf;
-      src_inc3 = 0;
-    }
-    if (remaining_src_cols <= 4) {
-      src_ptr4 = zerobuf;
-      src_inc4 = 0;
-    }
-    if (remaining_src_cols <= 5) {
-      src_ptr5 = zerobuf;
-      src_inc5 = 0;
-    }
-    if (remaining_src_cols <= 6) {
-      src_ptr6 = zerobuf;
-      src_inc6 = 0;
-    }
-    src_ptr7 = zerobuf;
-    src_inc7 = 0;
-  }
-
-  const std::int8_t zero_point = zerobuf[0];
-
-  if (sums_ptr) {
-    for (int i = 0; i < 8; ++i) {
-      sums_ptr[i] = 0;
-    }
-  }
-
-  // The overall packing effectively pads the source rows to
-  // (src_rows + 63) & ~63. The iteration over k may skip when m=1, and then we
-  // only pack for (src_rows + 31) & ~31. When there is an incomplete
-  // destination block, this is stored into trailing_buf instead of packed_ptr.
-  for (int k = 0; k < src_rows; k += 16 * 4) {
-    for (int m = 0; m < 2; ++m) {
-      // Available source rows.
-      // If this is less than 0 (for m=1), we skip, having filled trailing
-      // buffer for m=0. Also, if source rows is zero on m=1, then we filled
-      // exactly to the end of the column in the packed buffer.
-      const int packed_rows = src_rows - k - 8 * m * 4;
-      // Effectively,
-      // packed_rows = std::max(0, std::min(8, src_rows - k - 8 * m));
-      // but treat each case separately.
-      if (packed_rows >= (8 * 4)) {
-        for (int i = 0; i < 8; ++i) {
-          for (int s = 0; s < 4; ++s) {
-            in_data[0][i][s] = src_ptr0[i * 4 + s];
-            in_data[1][i][s] = src_ptr1[i * 4 + s];
-            in_data[2][i][s] = src_ptr2[i * 4 + s];
-            in_data[3][i][s] = src_ptr3[i * 4 + s];
-            in_data[4][i][s] = src_ptr4[i * 4 + s];
-            in_data[5][i][s] = src_ptr5[i * 4 + s];
-            in_data[6][i][s] = src_ptr6[i * 4 + s];
-            in_data[7][i][s] = src_ptr7[i * 4 + s];
-          }
-        }
-        for (int i = 0; i < 8; ++i) {
-          for (int j = 0; j < 8; ++j) {
-            for (int s = 0; s < 4; ++s) {
-              packed_ptr[(16 * i + j) * 4 + s] =
-                  static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
-            }
-            if (sums_ptr) {
-              for (int s = 0; s < 4; ++s) {
-                sums_ptr[j] += in_data[j][i][s] ^ input_xor;
-              }
-            }
-          }
-        }
-      } else if (packed_rows > 0) {
-        RUY_DCHECK_LT(packed_rows >> 2, 8);
-        int i = 0;
-        for (; i < (packed_rows >> 2); ++i) {
-          for (int s = 0; s < 4; ++s) {
-            in_data[0][i][s] = src_ptr0[i * 4 + s];
-            in_data[1][i][s] = src_ptr1[i * 4 + s];
-            in_data[2][i][s] = src_ptr2[i * 4 + s];
-            in_data[3][i][s] = src_ptr3[i * 4 + s];
-            in_data[4][i][s] = src_ptr4[i * 4 + s];
-            in_data[5][i][s] = src_ptr5[i * 4 + s];
-            in_data[6][i][s] = src_ptr6[i * 4 + s];
-            in_data[7][i][s] = src_ptr7[i * 4 + s];
-          }
-        }
-        if (i < ((packed_rows + 3) >> 2)) {
-          int s = 0;
-          for (; s < (packed_rows & 3); ++s) {
-            in_data[0][i][s] = src_ptr0[i * 4 + s];
-            in_data[1][i][s] = src_ptr1[i * 4 + s];
-            in_data[2][i][s] = src_ptr2[i * 4 + s];
-            in_data[3][i][s] = src_ptr3[i * 4 + s];
-            in_data[4][i][s] = src_ptr4[i * 4 + s];
-            in_data[5][i][s] = src_ptr5[i * 4 + s];
-            in_data[6][i][s] = src_ptr6[i * 4 + s];
-            in_data[7][i][s] = src_ptr7[i * 4 + s];
-          }
-          RUY_DCHECK_LE(s, 4);
-          for (; s < 4; ++s) {
-            for (int j = 0; j < 8; ++j) {
-              in_data[j][i][s] = zero_point;
-            }
-          }
-          ++i;
-        }
-        // We do not care what goes into the trailing buffer, but we want
-        // in_data[...] ^ input_xor == 0 for irrelevant values in the summation.
-        //
-        // It might prove better in optimized code to pad uniformly with
-        // zero_point, and compensate by initializing the summations with the
-        // compensating offset, effectively
-        // ((input_xor - zero_point) ^ input_xor) *
-        //                         4 * (8 - ((packed_rows + 3) >> 2)).
-        for (; i < 8; ++i) {
-          for (int s = 0; s < 4; ++s) {
-            for (int j = 0; j < 8; ++j) {
-              in_data[j][i][s] = input_xor;
-            }
-          }
-        }
-        // We loop through [0, 8) rather than [0, (packed_rows + 3) >> 2), since
-        // that emulates what we might do in fully-optimized code.
-        if (sums_ptr) {
-          for (int i = 0; i < 8; ++i) {
-            for (int j = 0; j < 8; ++j) {
-              for (int s = 0; s < 4; ++s) {
-                trailing_buf[(16 * i + j) * 4 + s] =
-                    static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
-                sums_ptr[j] += in_data[j][i][s] ^ input_xor;
-              }
-            }
-          }
-        } else {
-          for (int i = 0; i < 8; ++i) {
-            for (int j = 0; j < 8; ++j) {
-              for (int s = 0; s < 4; ++s) {
-                trailing_buf[(16 * i + j) * 4 + s] =
-                    static_cast<std::int8_t>(in_data[j][i][s] ^ input_xor);
-              }
-            }
-          }
-        }
-      }
-
-      packed_ptr += 16 * 8 * 4;
-      src_ptr0 += src_inc0;
-      src_ptr1 += src_inc1;
-      src_ptr2 += src_inc2;
-      src_ptr3 += src_inc3;
-      src_ptr4 += src_inc4;
-      src_ptr5 += src_inc5;
-      src_ptr6 += src_inc6;
-      src_ptr7 += src_inc7;
-    }
-  }
-}
-
-inline void HalfPackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
-                                 int src_stride, int remaining_src_cols,
-                                 int src_rows, float* packed_ptr,
-                                 float* trailing_buf) {
-  float in_data[8][8];
-
-  const float* src_ptr0 = src_ptr;
-  const float* src_ptr1 = src_ptr0 + src_stride;
-  const float* src_ptr2 = src_ptr1 + src_stride;
-  const float* src_ptr3 = src_ptr2 + src_stride;
-  const float* src_ptr4 = src_ptr3 + src_stride;
-  const float* src_ptr5 = src_ptr4 + src_stride;
-  const float* src_ptr6 = src_ptr5 + src_stride;
-  const float* src_ptr7 = src_ptr6 + src_stride;
-  std::int64_t src_inc0 = 8;
-  std::int64_t src_inc1 = 8;
-  std::int64_t src_inc2 = 8;
-  std::int64_t src_inc3 = 8;
-  std::int64_t src_inc4 = 8;
-  std::int64_t src_inc5 = 8;
-  std::int64_t src_inc6 = 8;
-  std::int64_t src_inc7 = 8;
-  if (remaining_src_cols < 8) {
-    if (remaining_src_cols <= 0) {
-      src_ptr0 = zerobuf;
-      src_inc0 = 0;
-    }
-    if (remaining_src_cols <= 1) {
-      src_ptr1 = zerobuf;
-      src_inc1 = 0;
-    }
-    if (remaining_src_cols <= 2) {
-      src_ptr2 = zerobuf;
-      src_inc2 = 0;
-    }
-    if (remaining_src_cols <= 3) {
-      src_ptr3 = zerobuf;
-      src_inc3 = 0;
-    }
-    if (remaining_src_cols <= 4) {
-      src_ptr4 = zerobuf;
-      src_inc4 = 0;
-    }
-    if (remaining_src_cols <= 5) {
-      src_ptr5 = zerobuf;
-      src_inc5 = 0;
-    }
-    if (remaining_src_cols <= 6) {
-      src_ptr6 = zerobuf;
-      src_inc6 = 0;
-    }
-    src_ptr7 = zerobuf;
-    src_inc7 = 0;
-  }
-
-  for (int k = 0; k < src_rows; k += 16) {
-    for (int m = 0; m < 2; ++m) {
-      const int packed_rows = src_rows - k - 8 * m;
-      // Effectively,
-      // packed_rows = std::max(0, std::min(8, src_rows - k - 8 * m));
-      // but treat each case separately.
-      if (packed_rows > 7) {
-        for (int i = 0; i < 8; ++i) {
-          in_data[0][i] = src_ptr0[i];
-          in_data[1][i] = src_ptr1[i];
-          in_data[2][i] = src_ptr2[i];
-          in_data[3][i] = src_ptr3[i];
-          in_data[4][i] = src_ptr4[i];
-          in_data[5][i] = src_ptr5[i];
-          in_data[6][i] = src_ptr6[i];
-          in_data[7][i] = src_ptr7[i];
-        }
-        for (int i = 0; i < 8; ++i) {
-          for (int j = 0; j < 8; ++j) {
-            packed_ptr[16 * i + j] = in_data[j][i];
-          }
-        }
-      } else if (packed_rows > 0) {
-        for (int i = 0; i < packed_rows; ++i) {
-          in_data[0][i] = src_ptr0[i];
-          in_data[1][i] = src_ptr1[i];
-          in_data[2][i] = src_ptr2[i];
-          in_data[3][i] = src_ptr3[i];
-          in_data[4][i] = src_ptr4[i];
-          in_data[5][i] = src_ptr5[i];
-          in_data[6][i] = src_ptr6[i];
-          in_data[7][i] = src_ptr7[i];
-        }
-        for (int i = packed_rows; i < 8; ++i) {
-          in_data[0][i] = 0.0f;
-          in_data[1][i] = 0.0f;
-          in_data[2][i] = 0.0f;
-          in_data[3][i] = 0.0f;
-          in_data[4][i] = 0.0f;
-          in_data[5][i] = 0.0f;
-          in_data[6][i] = 0.0f;
-          in_data[7][i] = 0.0f;
-        }
-        // We loop through [0, 7) rather than [0, packed_rows), since that
-        // emulates what we might do in fully-optimized code.
-        for (int i = 0; i < 7; ++i) {
-          for (int j = 0; j < 8; ++j) {
-            trailing_buf[16 * i + j] = in_data[j][i];
-          }
-        }
-      }
-
-      packed_ptr += 16 * 8;
-      src_ptr0 += src_inc0;
-      src_ptr1 += src_inc1;
-      src_ptr2 += src_inc2;
-      src_ptr3 += src_inc3;
-      src_ptr4 += src_inc4;
-      src_ptr5 += src_inc5;
-      src_ptr6 += src_inc6;
-      src_ptr7 += src_inc7;
-    }
-  }
-}
-
-inline void ZeroHalfFloatAvxVnni(int src_rows, float* packed_ptr) {
-  const int non_trailing_rows = src_rows & ~7;
-  for (int k = 0; k < non_trailing_rows; ++k) {
-    for (int j = 0; j < 8; ++j) {
-      packed_ptr[j] = 0.0f;
-    }
-    packed_ptr += 16;
-  }
-}
-
-}  // namespace.
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Pack8bitAvxVnni(const std::int8_t* src_ptr, std::int8_t input_xor,
-                     const std::int8_t* zerobuf, int src_stride,
-                     int remaining_src_cols, int src_rows,
-                     std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
-  profiler::ScopeLabel label("Pack kAvxVnni 8bit (UNFINISHED)");
-
-  // Each packed block is 4*16, and there are normally 8. The trailing block is
-  // only slightly shorter.
-  std::int8_t trailing_buf[8 * 16 * 4];
-  memset(trailing_buf, 0, 8 * 16 * 4 * sizeof(std::int8_t));
-
-  std::int32_t* second_sums_ptr = sums_ptr ? sums_ptr + 8 : nullptr;
-  if (remaining_src_cols > 8) {
-    HalfPack8bitAvxVnni(src_ptr, input_xor, zerobuf, src_stride,
-                        remaining_src_cols, src_rows, packed_ptr, sums_ptr,
-                        trailing_buf);
-    HalfPack8bitAvxVnni(src_ptr + src_stride * 8, input_xor, zerobuf,
-                        src_stride, remaining_src_cols - 8, src_rows,
-                        packed_ptr + 8 * 4, second_sums_ptr,
-                        trailing_buf + 8 * 4);
-  } else {
-    HalfPack8bitAvxVnni(src_ptr, input_xor, zerobuf, src_stride,
-                        remaining_src_cols, src_rows, packed_ptr, sums_ptr,
-                        trailing_buf);
-    ZeroHalf8bitAvxVnni(src_rows, zerobuf[0] ^ input_xor, packed_ptr + 8 * 4);
-    // The kernel may not need the second half-blocks sums to be set.
-    if (second_sums_ptr) {
-      for (int i = 0; i < 8; ++i) {
-        second_sums_ptr[i] = (zerobuf[0] ^ input_xor) * ((src_rows + 3) & ~3);
-      }
-    }
-  }
-  const bool trailing_data = (src_rows & 31) > 0;
-  // If the number of source rows is not a multiple of 32, there will be data in
-  // the trailing buffer,
-  if (trailing_data > 0) {
-    const int non_trailing_rows = src_rows & ~31;
-    // Destination "rows" are padded to next highest multiple of 4.
-    const int dst_rows = (src_rows + 3) & ~3;
-    const int trailing_rows = dst_rows - non_trailing_rows;
-    memcpy(packed_ptr + 16 * non_trailing_rows, trailing_buf,
-           16 * trailing_rows * sizeof(std::int8_t));
-  }
-}
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void PackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
-                      int src_stride, int remaining_src_cols, int src_rows,
-                      float* packed_ptr) {
-  profiler::ScopeLabel label("Pack kAvxVnni float (UNFINISHED)");
-  float trailing_buf[7 * 16];
-  if (remaining_src_cols > 8) {
-    HalfPackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
-                         src_rows, packed_ptr, trailing_buf);
-    HalfPackFloatAvxVnni(src_ptr + src_stride * 8, zerobuf, src_stride,
-                         remaining_src_cols - 8, src_rows, packed_ptr + 8,
-                         trailing_buf + 8);
-  } else {
-    memset(trailing_buf, 0, sizeof(trailing_buf));
-    HalfPackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
-                         src_rows, packed_ptr, trailing_buf);
-    ZeroHalfFloatAvxVnni(src_rows, packed_ptr + 8);
-  }
-  const int trailing_rows = src_rows & 7;
-  if (trailing_rows > 0) {
-    const int non_trailing_rows = src_rows & ~7;
-    memcpy(packed_ptr + 16 * non_trailing_rows, trailing_buf,
-           16 * trailing_rows * sizeof(float));
-  }
-}
-
-#endif  // RUY_PLATFORM_AVX_VNNI && RUY_OPT(INTRINSICS)
-
-}  // namespace ruy

diff --git a/ruy/pack_sse42.cc b/ruy/pack_sse42.cc
deleted file mode 100644
index 89d2ecc..0000000
--- a/ruy/pack_sse42.cc
+++ /dev/null

@@ -1,468 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <cstring>
-
-#include "ruy/check_macros.h"
-#include "ruy/matrix.h"
-#include "ruy/opt_set.h"
-#include "ruy/pack.h"
-#include "ruy/path.h"
-#include "ruy/platform.h"
-#include "ruy/profiler/instrumentation.h"
-
-#if RUY_PLATFORM_SSE42 && RUY_OPT(INTRINSICS)
-#include <immintrin.h>  // IWYU pragma: keep
-#endif
-
-namespace ruy {
-
-#if !(RUY_PLATFORM_SSE42 && RUY_OPT(ASM))
-
-void Pack8bitSse42(const std::int8_t*, std::int8_t, const std::int8_t*, int,
-                   int, int, std::int8_t*, std::int32_t*) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-void PackFloatSse42(const float*, const float*, int, int, int, float*) {
-  // CPU-ID-based checks should disable the path that would reach this point.
-  RUY_DCHECK(false);
-}
-
-#else  // RUY_PLATFORM_SSE42 && RUY_OPT(ASM)
-
-// The first int8_t template parameter is arbitrary: this routine is common to
-// all 8-bit source matrix types.
-using PackImpl8bitSse42 =
-    PackImpl<Path::kSse42, FixedKernelLayout<Order::kColMajor, 4, 8>,
-             std::int8_t, std::int8_t, std::int32_t>;
-
-using PackImplFloatSse42 =
-    PackImpl<Path::kSse42, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
-             float, float>;
-
-namespace {
-
-inline void Pack8bitSse42Packer(const std::int8_t* src_ptr,
-                                std::int8_t input_xor,
-                                const std::int8_t* zerobuf, int src_stride,
-                                int remaining_src_cols, int src_rows,
-                                std::int8_t* packed_ptr, std::int32_t* sums_ptr,
-                                std::int8_t* trailing_buf) {
-  using Layout = PackImpl8bitSse42::Layout;
-  RUY_DCHECK_EQ(Layout::kCols, 8);
-  RUY_DCHECK_EQ(Layout::kRows, 4);
-  // Each Layout::Rows is 4 contiguous input, contiguous packed elements.
-  // We process 8 of these chunks at a time, padding short input chunks.
-  constexpr int kNumRowChunks = 8;
-  constexpr int kNumChunkedSrcRows = kNumRowChunks * Layout::kRows;
-
-  std::int8_t in_data[Layout::kCols][kNumRowChunks][Layout::kRows];
-
-  const std::int8_t* src_ptr0 = src_ptr;
-  const std::int8_t* src_ptr1 = src_ptr0 + src_stride;
-  const std::int8_t* src_ptr2 = src_ptr1 + src_stride;
-  const std::int8_t* src_ptr3 = src_ptr2 + src_stride;
-  const std::int8_t* src_ptr4 = src_ptr3 + src_stride;
-  const std::int8_t* src_ptr5 = src_ptr4 + src_stride;
-  const std::int8_t* src_ptr6 = src_ptr5 + src_stride;
-  const std::int8_t* src_ptr7 = src_ptr6 + src_stride;
-  std::int64_t src_inc0 = kNumChunkedSrcRows;
-  std::int64_t src_inc1 = kNumChunkedSrcRows;
-  std::int64_t src_inc2 = kNumChunkedSrcRows;
-  std::int64_t src_inc3 = kNumChunkedSrcRows;
-  std::int64_t src_inc4 = kNumChunkedSrcRows;
-  std::int64_t src_inc5 = kNumChunkedSrcRows;
-  std::int64_t src_inc6 = kNumChunkedSrcRows;
-  std::int64_t src_inc7 = kNumChunkedSrcRows;
-  // Handle cases where source does not have Layout::kCols (8) columns.
-  if (remaining_src_cols < 8) {
-    if (remaining_src_cols <= 0) {
-      src_ptr0 = zerobuf;
-      src_inc0 = 0;
-    }
-    if (remaining_src_cols <= 1) {
-      src_ptr1 = zerobuf;
-      src_inc1 = 0;
-    }
-    if (remaining_src_cols <= 2) {
-      src_ptr2 = zerobuf;
-      src_inc2 = 0;
-    }
-    if (remaining_src_cols <= 3) {
-      src_ptr3 = zerobuf;
-      src_inc3 = 0;
-    }
-    if (remaining_src_cols <= 4) {
-      src_ptr4 = zerobuf;
-      src_inc4 = 0;
-    }
-    if (remaining_src_cols <= 5) {
-      src_ptr5 = zerobuf;
-      src_inc5 = 0;
-    }
-    if (remaining_src_cols <= 6) {
-      src_ptr6 = zerobuf;
-      src_inc6 = 0;
-    }
-    src_ptr7 = zerobuf;
-    src_inc7 = 0;
-  }
-
-  const std::int8_t zero_point = zerobuf[0];
-
-  if (sums_ptr) {
-    // i: Layout::kCols.
-    for (int i = 0; i < 8; ++i) {
-      sums_ptr[i] = 0;
-    }
-  }
-
-  // The overall packing effectively pads the source rows to
-  // (src_rows + 63) & ~63. The iteration over k may skip when m=1, and then we
-  // only pack for (src_rows + 31) & ~31. When there is an incomplete
-  // destination block, this is stored into trailing_buf instead of packed_ptr.
-  for (int k = 0; k < src_rows; k += kNumChunkedSrcRows) {
-    // Available source rows.
-    // If this is less than 0 (for m=1), we skip, having filled trailing
-    // buffer for m=0. Also, if source rows is zero on m=1, then we filled
-    // exactly to the end of the column in the packed buffer.
-    const int available_src_rows = src_rows - k;
-    // Effectively,
-    // available rows = std::max(0, std::min(8, src_rows - k));
-    // treat each case separately.
-    if (available_src_rows >= kNumChunkedSrcRows) {
-      // i: chunks, s: Layout::Rows.
-      for (int i = 0; i < 8; ++i) {
-        for (int s = 0; s < 4; ++s) {
-          in_data[0][i][s] = src_ptr0[i * 4 + s];
-          in_data[1][i][s] = src_ptr1[i * 4 + s];
-          in_data[2][i][s] = src_ptr2[i * 4 + s];
-          in_data[3][i][s] = src_ptr3[i * 4 + s];
-          in_data[4][i][s] = src_ptr4[i * 4 + s];
-          in_data[5][i][s] = src_ptr5[i * 4 + s];
-          in_data[6][i][s] = src_ptr6[i * 4 + s];
-          in_data[7][i][s] = src_ptr7[i * 4 + s];
-        }
-      }
-      // i: chunks, j: Layout::kCols, s: Layout::Rows.
-      for (int i = 0; i < 8; ++i) {
-        for (int j = 0; j < 8; ++j) {
-          for (int s = 0; s < 4; ++s) {
-            // 8 * 4 * i is offset for each block, that is
-            // (Layout::kCols * Layout::kRows * i)
-            packed_ptr[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
-          }
-          if (sums_ptr) {
-            for (int s = 0; s < 4; ++s) {
-              sums_ptr[j] += in_data[j][i][s] ^ input_xor;
-            }
-          }
-        }
-      }
-    } else if (available_src_rows > 0) {
-      RUY_DCHECK_LT(available_src_rows, kNumChunkedSrcRows);
-      int i = 0;
-      // Consume chunks of 4 rows that are complete.
-      for (; i < (available_src_rows >> 2); ++i) {
-        for (int s = 0; s < 4; ++s) {
-          in_data[0][i][s] = src_ptr0[i * 4 + s];
-          in_data[1][i][s] = src_ptr1[i * 4 + s];
-          in_data[2][i][s] = src_ptr2[i * 4 + s];
-          in_data[3][i][s] = src_ptr3[i * 4 + s];
-          in_data[4][i][s] = src_ptr4[i * 4 + s];
-          in_data[5][i][s] = src_ptr5[i * 4 + s];
-          in_data[6][i][s] = src_ptr6[i * 4 + s];
-          in_data[7][i][s] = src_ptr7[i * 4 + s];
-        }
-      }
-      // Consume any incomplete chunk.
-      if (i < ((available_src_rows + 3) >> 2)) {
-        int s = 0;
-        for (; s < (available_src_rows & 3); ++s) {
-          in_data[0][i][s] = src_ptr0[i * 4 + s];
-          in_data[1][i][s] = src_ptr1[i * 4 + s];
-          in_data[2][i][s] = src_ptr2[i * 4 + s];
-          in_data[3][i][s] = src_ptr3[i * 4 + s];
-          in_data[4][i][s] = src_ptr4[i * 4 + s];
-          in_data[5][i][s] = src_ptr5[i * 4 + s];
-          in_data[6][i][s] = src_ptr6[i * 4 + s];
-          in_data[7][i][s] = src_ptr7[i * 4 + s];
-        }
-        RUY_DCHECK_LE(s, 4);
-        for (; s < 4; ++s) {
-          // j: Layout::kCols.
-          for (int j = 0; j < 8; ++j) {
-            in_data[j][i][s] = zero_point;
-          }
-        }
-        ++i;
-      }
-      // We do not care what goes into the trailing buffer, but we want
-      // in_data[...] ^ input_xor == 0 for irrelevant values in the summation.
-      //
-      // It might prove better in optimized code to pad uniformly with
-      // zero_point, and compensate by initializing the summations with the
-      // compensating offset, effectively
-      // ((input_xor - zero_point) ^ input_xor) *
-      //                         4 * (8 - ((available_src_rows + 3) >> 2)).
-      for (; i < 8; ++i) {
-        for (int s = 0; s < 4; ++s) {
-          for (int j = 0; j < 8; ++j) {
-            in_data[j][i][s] = input_xor;
-          }
-        }
-      }
-      // We loop through [0, 8) rather than
-      // [0, (available_src_rows + 3) >> 2), since that emulates what we might
-      // do in fully-optimized code.
-      //
-      // i: chunks, j: Layout::kCols, s: Layout::Rows.
-      if (sums_ptr) {
-        for (int i = 0; i < 8; ++i) {
-          for (int j = 0; j < 8; ++j) {
-            for (int s = 0; s < 4; ++s) {
-              trailing_buf[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
-              sums_ptr[j] = sums_ptr[j] + (in_data[j][i][s] ^ input_xor);
-            }
-          }
-        }
-      } else {
-        for (int i = 0; i < 8; ++i) {
-          for (int j = 0; j < 8; ++j) {
-            for (int s = 0; s < 4; ++s) {
-              trailing_buf[(8 * i + j) * 4 + s] = in_data[j][i][s] ^ input_xor;
-            }
-          }
-        }
-      }
-    }
-
-    packed_ptr += 8 * kNumChunkedSrcRows;
-    src_ptr0 += src_inc0;
-    src_ptr1 += src_inc1;
-    src_ptr2 += src_inc2;
-    src_ptr3 += src_inc3;
-    src_ptr4 += src_inc4;
-    src_ptr5 += src_inc5;
-    src_ptr6 += src_inc6;
-    src_ptr7 += src_inc7;
-  }
-}
-
-inline void PackFloatSse42Packer(const float* src_ptr, const float* zerobuf,
-                                 int src_stride, int remaining_src_cols,
-                                 int src_rows, float* packed_ptr,
-                                 float* trailing_buf) {
-  using Layout = PackImplFloatSse42::Layout;
-  RUY_DCHECK_EQ(Layout::kCols, 8);
-  RUY_DCHECK_EQ(Layout::kRows, 1);
-
-  // This packing amounts to tranposition of 8x8 blocks.
-  static constexpr int kPackCols = 8;  // Source cols packed together.
-  static constexpr int kPackRows = 8;  // Short input is padded.
-
-  float in_data[kPackCols][kPackRows];
-
-  const float* src_ptr0 = src_ptr;
-  const float* src_ptr1 = src_ptr0 + src_stride;
-  const float* src_ptr2 = src_ptr1 + src_stride;
-  const float* src_ptr3 = src_ptr2 + src_stride;
-  const float* src_ptr4 = src_ptr3 + src_stride;
-  const float* src_ptr5 = src_ptr4 + src_stride;
-  const float* src_ptr6 = src_ptr5 + src_stride;
-  const float* src_ptr7 = src_ptr6 + src_stride;
-  std::int64_t src_inc0 = 8;
-  std::int64_t src_inc1 = 8;
-  std::int64_t src_inc2 = 8;
-  std::int64_t src_inc3 = 8;
-  std::int64_t src_inc4 = 8;
-  std::int64_t src_inc5 = 8;
-  std::int64_t src_inc6 = 8;
-  std::int64_t src_inc7 = 8;
-  // Handle cases where source does not have kPackDim (8) columns.
-  if (remaining_src_cols < kPackCols) {
-    if (remaining_src_cols <= 0) {
-      src_ptr0 = zerobuf;
-      src_inc0 = 0;
-    }
-    if (remaining_src_cols <= 1) {
-      src_ptr1 = zerobuf;
-      src_inc1 = 0;
-    }
-    if (remaining_src_cols <= 2) {
-      src_ptr2 = zerobuf;
-      src_inc2 = 0;
-    }
-    if (remaining_src_cols <= 3) {
-      src_ptr3 = zerobuf;
-      src_inc3 = 0;
-    }
-    if (remaining_src_cols <= 4) {
-      src_ptr4 = zerobuf;
-      src_inc4 = 0;
-    }
-    if (remaining_src_cols <= 5) {
-      src_ptr5 = zerobuf;
-      src_inc5 = 0;
-    }
-    if (remaining_src_cols <= 6) {
-      src_ptr6 = zerobuf;
-      src_inc6 = 0;
-    }
-    src_ptr7 = zerobuf;
-    src_inc7 = 0;
-  }
-
-  for (int k = 0; k < src_rows; k += kPackRows) {
-    const int available_src_rows = src_rows - k;
-    // Effectively,
-    // available_src_rows = std::max(0, std::min(kPackDim, src_rows - k));
-    // but treat each case separately.
-    if (available_src_rows >= kPackRows) {
-      for (int i = 0; i < 8; ++i) {
-        in_data[0][i] = src_ptr0[i];
-        in_data[1][i] = src_ptr1[i];
-        in_data[2][i] = src_ptr2[i];
-        in_data[3][i] = src_ptr3[i];
-        in_data[4][i] = src_ptr4[i];
-        in_data[5][i] = src_ptr5[i];
-        in_data[6][i] = src_ptr6[i];
-        in_data[7][i] = src_ptr7[i];
-      }
-      for (int i = 0; i < 8; ++i) {
-        for (int j = 0; j < 8; ++j) {
-          packed_ptr[8 * i + j] = in_data[j][i];
-        }
-      }
-    } else if (available_src_rows > 0) {
-      for (int i = 0; i < available_src_rows; ++i) {
-        in_data[0][i] = src_ptr0[i];
-        in_data[1][i] = src_ptr1[i];
-        in_data[2][i] = src_ptr2[i];
-        in_data[3][i] = src_ptr3[i];
-        in_data[4][i] = src_ptr4[i];
-        in_data[5][i] = src_ptr5[i];
-        in_data[6][i] = src_ptr6[i];
-        in_data[7][i] = src_ptr7[i];
-      }
-      for (int i = available_src_rows; i < kPackRows; ++i) {
-        in_data[0][i] = 0.0f;
-        in_data[1][i] = 0.0f;
-        in_data[2][i] = 0.0f;
-        in_data[3][i] = 0.0f;
-        in_data[4][i] = 0.0f;
-        in_data[5][i] = 0.0f;
-        in_data[6][i] = 0.0f;
-        in_data[7][i] = 0.0f;
-      }
-      // We loop through [0, 7) rather than [0, packed_rows), since that
-      // emulates what we might do in fully-optimized code.
-      // i: (kPackRows - 1), j: kPackCols.
-      for (int i = 0; i < 7; ++i) {
-        for (int j = 0; j < 8; ++j) {
-          trailing_buf[kPackRows * i + j] = in_data[j][i];
-        }
-      }
-    }
-
-    packed_ptr += kPackRows * kPackCols;
-    src_ptr0 += src_inc0;
-    src_ptr1 += src_inc1;
-    src_ptr2 += src_inc2;
-    src_ptr3 += src_inc3;
-    src_ptr4 += src_inc4;
-    src_ptr5 += src_inc5;
-    src_ptr6 += src_inc6;
-    src_ptr7 += src_inc7;
-  }
-}
-
-}  // namespace.
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void Pack8bitSse42(const std::int8_t* src_ptr, std::int8_t input_xor,
-                   const std::int8_t* zerobuf, int src_stride,
-                   int remaining_src_cols, int src_rows,
-                   std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
-  profiler::ScopeLabel label("Pack kSse42 8bit (UNFINISHED)");
-
-  using Layout = PackImpl8bitSse42::Layout;
-  RUY_DCHECK_EQ(Layout::kCols, 8);
-  RUY_DCHECK_EQ(Layout::kRows, 4);
-
-  // Each Layout::Rows is 4 contiguous input, contiguous packed elements.
-  // We process 8 of these chunks at a time, padding short input chunks.
-  static constexpr int kNumRowChunks = 8;  // Short input is padded.
-
-  // Each packed block is 4*8, and there are normally 8. The trailing block is
-  // only slightly shorter.
-  constexpr int kTrailingBufSize =
-      kNumRowChunks * Layout::kCols * Layout::kRows;
-  std::int8_t trailing_buf[kTrailingBufSize];
-  memset(trailing_buf, 0, kTrailingBufSize * sizeof(std::int8_t));
-
-  Pack8bitSse42Packer(src_ptr, input_xor, zerobuf, src_stride,
-                      remaining_src_cols, src_rows, packed_ptr, sums_ptr,
-                      trailing_buf);
-
-  constexpr int kChunkedRowMask = kNumRowChunks * Layout::kRows - 1;
-  const bool trailing_data = (src_rows & kChunkedRowMask) > 0;
-  // If the number of source rows is not a multiple of kChunkedRowMask, there
-  // will be data in the trailing buffer,
-  if (trailing_data > 0) {
-    const int non_trailing_rows = src_rows & ~kChunkedRowMask;
-    // Destination "rows" are padded to next highest multiple of Layout::kRows.
-    const int dst_rows = (src_rows + 3) & ~3;
-    const int trailing_rows = dst_rows - non_trailing_rows;
-    memcpy(packed_ptr + Layout::kCols * non_trailing_rows, trailing_buf,
-           Layout::kCols * trailing_rows * sizeof(std::int8_t));
-  }
-}
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// When removing this comment, update profiling label below.
-void PackFloatSse42(const float* src_ptr, const float* zerobuf, int src_stride,
-                    int remaining_src_cols, int src_rows, float* packed_ptr) {
-  profiler::ScopeLabel label("Pack kSse42 float (UNFINISHED)");
-  static constexpr int kPackCols = 8;  // Source cols packed together.
-  static constexpr int kPackRows = 8;  // Short input is padded.
-  float trailing_buf[(kPackRows - 1) * kPackCols];
-  if (remaining_src_cols < 8) {
-    memset(trailing_buf, 0, sizeof(trailing_buf));
-  }
-  PackFloatSse42Packer(src_ptr, zerobuf, src_stride, remaining_src_cols,
-                       src_rows, packed_ptr, trailing_buf);
-
-  const int trailing_rows = src_rows & (kPackRows - 1);
-  if (trailing_rows > 0) {
-    const int non_trailing_rows = src_rows & ~(kPackRows - 1);
-    memcpy(packed_ptr + kPackCols * non_trailing_rows, trailing_buf,
-           kPackCols * trailing_rows * sizeof(float));
-  }
-}
-
-#endif  // RUY_PLATFORM_SSE42 && RUY_OPT(INTRINSICS)
-
-}  // namespace ruy

diff --git a/ruy/pack_x86.h b/ruy/pack_x86.h
index 076f2c2..a9e0d2a 100644
--- a/ruy/pack_x86.h
+++ b/ruy/pack_x86.h

@@ -35,16 +35,10 @@
 
 #if RUY_PLATFORM_X86
 
-RUY_INHERIT_PACK(Path::kStandardCpp, Path::kSse42)
-RUY_INHERIT_PACK(Path::kSse42, Path::kAvx2)
+RUY_INHERIT_PACK(Path::kStandardCpp, Path::kAvx2)
 RUY_INHERIT_PACK(Path::kAvx2, Path::kAvx512)
-RUY_INHERIT_PACK(Path::kAvx512, Path::kAvxVnni)
 
 template <>
-struct PackedTypeImpl<Path::kSse42, std::uint8_t> {
-  using Type = std::int8_t;
-};
-template <>
 struct PackedTypeImpl<Path::kAvx2, std::uint8_t> {
   using Type = std::int8_t;
 };
@@ -52,100 +46,6 @@
 struct PackedTypeImpl<Path::kAvx512, std::uint8_t> {
   using Type = std::int8_t;
 };
-template <>
-struct PackedTypeImpl<Path::kAvxVnni, std::uint8_t> {
-  using Type = std::int8_t;
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// Note that source and zero buffers can be uint8 type, but in the packing
-// function are reinterpreted as int8, and are XOR-ed with input_xor.
-void Pack8bitSse42(const std::int8_t* src_ptr, std::int8_t input_xor,
-                   const std::int8_t* zerobuf, int src_stride,
-                   int remaining_src_cols, int src_rows,
-                   std::int8_t* packed_ptr, std::int32_t* sums_ptr);
-
-template <typename Scalar>
-struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kColMajor, 4, 8>, Scalar,
-                std::int8_t, std::int32_t> {
-  static_assert(std::is_same<Scalar, std::int8_t>::value ||
-                    std::is_same<Scalar, std::uint8_t>::value,
-                "");
-  using Layout = FixedKernelLayout<Order::kColMajor, 4, 8>;
-  static constexpr std::int8_t kInputXor =
-      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
-
-  static void Run(Tuning, const Mat<Scalar>& src_matrix,
-                  PMat<std::int8_t>* packed_matrix, int start_col,
-                  int end_col) {
-    profiler::ScopeLabel label("Pack (SSE 4.2 8-bit)");
-
-    RUY_DCHECK(IsColMajor(src_matrix.layout));
-    RUY_DCHECK(IsColMajor(packed_matrix->layout));
-    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
-    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
-    std::int32_t* sums = packed_matrix->sums;
-    Scalar zerobuf[Layout::kCols * Layout::kRows];
-    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
-           Layout::kCols * Layout::kRows * sizeof(Scalar));
-    for (int block_col = start_col; block_col < end_col;
-         block_col += Layout::kCols) {
-      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
-      int src_stride = src_matrix.layout.stride;
-      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
-      int remaining_src_cols = src_matrix.layout.cols - block_col;
-
-      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
-      std::int8_t* packed_ptr =
-          packed_matrix->data +
-          packed_matrix->layout.stride * (block_col & block_col_mask);
-      Pack8bitSse42(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
-                    reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
-                    remaining_src_cols, src_matrix.layout.rows, packed_ptr,
-                    sums_ptr);
-    }
-  }
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void PackFloatSse42(const float* src_ptr, const float* zerobuf, int src_stride,
-                    int remaining_src_cols, int src_rows, float* packed_ptr);
-
-template <>
-struct PackImpl<Path::kSse42, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
-                float, float> {
-  using Layout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
-  static void Run(Tuning, const Mat<float>& src_matrix,
-                  PMat<float>* packed_matrix, int start_col, int end_col) {
-    profiler::ScopeLabel label("Pack (SSE 4.2 float)");
-
-    RUY_DCHECK(IsColMajor(src_matrix.layout));
-    RUY_DCHECK(IsColMajor(packed_matrix->layout));
-    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
-    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
-    const float zerobuf[Layout::kCols] = {
-        0.0f};  // Remainder default inits to 0.0f.
-    for (int block_col = start_col; block_col < end_col;
-         block_col += Layout::kCols) {
-      int src_stride = src_matrix.layout.stride;
-      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
-      int remaining_src_cols = src_matrix.layout.cols - block_col;
-
-      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
-      float* packed_ptr =
-          packed_matrix->data +
-          packed_matrix->layout.stride * (block_col & block_col_mask);
-      PackFloatSse42(src_ptr, zerobuf, src_stride, remaining_src_cols,
-                     src_matrix.layout.rows, packed_ptr);
-    }
-  }
-};
 
 // Note that source and zero buffers can be uint8 type, but in the packing
 // function are reinterpreted as int8, and are XOR-ed with input_xor.
@@ -312,100 +212,6 @@
     }
   }
 };
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// Note that source and zero buffers can be uint8 type, but in the packing
-// function are reinterpreted as int8, and are XOR-ed with input_xor.
-void Pack8bitAvxVnni(const std::int8_t* src_ptr, std::int8_t input_xor,
-                     const std::int8_t* zerobuf, int src_stride,
-                     int remaining_src_cols, int src_rows,
-                     std::int8_t* packed_ptr, std::int32_t* sums_ptr);
-
-template <typename Scalar>
-struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kColMajor, 4, 16>,
-                Scalar, std::int8_t, std::int32_t> {
-  static_assert(std::is_same<Scalar, std::int8_t>::value ||
-                    std::is_same<Scalar, std::uint8_t>::value,
-                "");
-  using Layout = FixedKernelLayout<Order::kColMajor, 4, 16>;
-  static constexpr int kHalfLayoutCols =
-      8;  // Half the number of cols in a block.
-  static constexpr std::int8_t kInputXor =
-      std::is_same<Scalar, std::int8_t>::value ? 0 : 0x80;
-
-  static void Run(Tuning, const Mat<Scalar>& src_matrix,
-                  PMat<std::int8_t>* packed_matrix, int start_col,
-                  int end_col) {
-    profiler::ScopeLabel label("Pack (AVX-512 8-bit)");
-
-    RUY_DCHECK(IsColMajor(src_matrix.layout));
-    RUY_DCHECK(IsColMajor(packed_matrix->layout));
-    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
-    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
-    RUY_DCHECK_EQ(kHalfLayoutCols * 2, Layout::kCols);
-    std::int32_t* sums = packed_matrix->sums;
-    Scalar zerobuf[kHalfLayoutCols * Layout::kRows];
-    memset(zerobuf, packed_matrix->zero_point ^ kInputXor,
-           kHalfLayoutCols * Layout::kRows * sizeof(Scalar));
-    for (int block_col = start_col; block_col < end_col;
-         block_col += Layout::kCols) {
-      std::int32_t* sums_ptr = sums ? sums + block_col : nullptr;
-      int src_stride = src_matrix.layout.stride;
-      const Scalar* src_ptr = src_matrix.data.get() + src_stride * block_col;
-      int remaining_src_cols = src_matrix.layout.cols - block_col;
-
-      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
-      std::int8_t* packed_ptr =
-          packed_matrix->data +
-          packed_matrix->layout.stride * (block_col & block_col_mask);
-      Pack8bitAvxVnni(reinterpret_cast<const std::int8_t*>(src_ptr), kInputXor,
-                      reinterpret_cast<const std::int8_t*>(zerobuf), src_stride,
-                      remaining_src_cols, src_matrix.layout.rows, packed_ptr,
-                      sums_ptr);
-    }
-  }
-};
-
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-void PackFloatAvxVnni(const float* src_ptr, const float* zerobuf,
-                      int src_stride, int remaining_src_cols, int src_rows,
-                      float* packed_ptr);
-
-template <>
-struct PackImpl<Path::kAvxVnni, FixedKernelLayout<Order::kRowMajor, 1, 16>,
-                float, float, float> {
-  static void Run(Tuning, const Mat<float>& src_matrix,
-                  PMat<float>* packed_matrix, int start_col, int end_col) {
-    profiler::ScopeLabel label("Pack (AVX-512 float)");
-
-    using Layout = FixedKernelLayout<Order::kRowMajor, 1, 16>;
-    RUY_DCHECK(IsColMajor(src_matrix.layout));
-    RUY_DCHECK(IsColMajor(packed_matrix->layout));
-    RUY_DCHECK_EQ((end_col - start_col) % Layout::kCols, 0);
-    RUY_DCHECK_EQ(start_col % Layout::kCols, 0);
-    const float zerobuf[Layout::kCols] = {
-        0.0f};  // Remainder default inits to 0.0f.
-    for (int block_col = start_col; block_col < end_col;
-         block_col += Layout::kCols) {
-      int src_stride = src_matrix.layout.stride;
-      const float* src_ptr = src_matrix.data.get() + src_stride * block_col;
-      int remaining_src_cols = src_matrix.layout.cols - block_col;
-
-      static constexpr int block_col_mask = ~(Layout::kCols - 1);  // High bits.
-      float* packed_ptr =
-          packed_matrix->data +
-          packed_matrix->layout.stride * (block_col & block_col_mask);
-      PackFloatAvxVnni(src_ptr, zerobuf, src_stride, remaining_src_cols,
-                       src_matrix.layout.rows, packed_ptr);
-    }
-  }
-};
 #endif  // RUY_PLATFORM_X86
 
 }  // namespace ruy

diff --git a/ruy/path.h b/ruy/path.h
index 2a715d7..b2909c7 100644
--- a/ruy/path.h
+++ b/ruy/path.h

@@ -74,24 +74,10 @@
 #if RUY_PLATFORM_X86
   // x86 architectures.
   //
-  // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
-  // placeholder.
-  // Optimization is not finished. In particular the dimensions of the kernel
-  // blocks can be changed as desired.
-  //
-  // Optimized for SSE 4.2.
-  kSse42 = 0x4,
   // Optimized for AVX2.
   kAvx2 = 0x8,
   // Optimized for AVX-512.
   kAvx512 = 0x10,
-  // TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete /
-  // placeholder.
-  // Optimization is not finished. In particular the dimensions of the kernel
-  // blocks can be changed as desired.
-  //
-  // Optimized for AVX-VNNI.
-  kAvxVnni = 0x20,
 #endif  // RUY_PLATFORM_X86
 };
 
@@ -154,7 +140,7 @@
 constexpr Path kExtraArchPaths = Path::kNone;
 #elif RUY_PLATFORM_X86
 constexpr Path kDefaultArchPaths = Path::kAvx2 | Path::kAvx512;
-constexpr Path kExtraArchPaths = Path::kSse42 | Path::kAvxVnni;
+constexpr Path kExtraArchPaths = Path::kNone;
 #else
 constexpr Path kDefaultArchPaths = Path::kNone;
 constexpr Path kExtraArchPaths = Path::kNone;

diff --git a/ruy/test.h b/ruy/test.h
index 902fbfd..1ae4801 100644
--- a/ruy/test.h
+++ b/ruy/test.h

@@ -93,10 +93,8 @@
     RUY_PATHNAME_CASE(kNeon)
     RUY_PATHNAME_CASE(kNeonDotprod)
 #elif RUY_PLATFORM_X86
-    RUY_PATHNAME_CASE(kSse42)
     RUY_PATHNAME_CASE(kAvx2)
     RUY_PATHNAME_CASE(kAvx512)
-    RUY_PATHNAME_CASE(kAvxVnni)
 #endif
     default:
       RUY_CHECK(false);
commit	8dd913669fbd4707367ca6c2c656711df5b992f8	[log] [tgz]
author	Benoit Jacob <benoitjacob@google.com>	Thu Jun 18 12:33:52 2020 -0700
committer	Copybara-Service <copybara-worker@google.com>	Thu Jun 18 12:34:15 2020 -0700
tree	ed5f527d732ffbee74d691eae98c5faa5acdc442
parent	4d8ad9f776fe58ba93d300b6b69c29c5ee11cd3c [diff]