Automated rollback of commit b0c2468b4b84ce3ae443390ea5d84cb6e950aee7

PiperOrigin-RevId: 265455751
diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD
index dd01acf..5608727 100644
--- a/tensorflow/lite/experimental/ruy/BUILD
+++ b/tensorflow/lite/experimental/ruy/BUILD
@@ -2,31 +2,11 @@
 
 # TODO(b/123403203) actually make TFLite use ruy.
 
-load(":build_defs.bzl", "ruy_copts_avx2", "ruy_copts_skylake")
-load(":ruy_visibility.bzl", "ruy_visibility")
+load(":build_defs.bzl", "ruy_copts_avx2", "ruy_copts_base", "ruy_copts_skylake", "ruy_visibility")
 load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
 load(":ruy_test.bzl", "ruy_benchmark", "ruy_benchmark_opt_sets", "ruy_test")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
-# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
-#    ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
-# 2. Explicitly pass -O3 on mobile configs where just "-c opt" means "optimize for code size".
-#    We would want to only do that when compilation_mode is "opt", but limitations of
-#    the "select" keyword (no nested selects, no AND boolean) seem to make that difficult
-#    at the moment. For debugging purposes, one needs to manually edit this to remove these
-#    -O3. Otherwise, not even `bazel build --copt=-O0` will override that.
-RUY_COPTS = select({
-    "//tensorflow:android_arm64": [
-        "-O3",
-    ],
-    "//tensorflow:android_arm": [
-        "-O3",
-        "-mfpu=neon",
-    ],
-    "//conditions:default": [
-    ],
-})
-
 package(
     default_visibility = ["//visibility:private"],
     licenses = ["notice"],  # Apache 2.0
@@ -35,33 +15,33 @@
 cc_library(
     name = "platform",
     hdrs = ["platform.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
 )
 
 cc_library(
     name = "check_macros",
     hdrs = ["check_macros.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = ["//tensorflow/lite/kernels/internal:compatibility"],
 )
 
 cc_library(
     name = "opt_set",
     hdrs = ["opt_set.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
 )
 
 cc_library(
     name = "time",
     hdrs = ["time.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
 )
 
 cc_library(
     name = "wait",
     srcs = ["wait.cc"],
     hdrs = ["wait.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [":time"],
 )
 
@@ -78,7 +58,7 @@
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [":check_macros"],
 )
 
@@ -99,7 +79,7 @@
     hdrs = [
         "tune.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":opt_set",
         ":platform",
@@ -132,7 +112,7 @@
     hdrs = [
         "allocator.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":size_util",
@@ -151,7 +131,7 @@
 cc_library(
     name = "side_pair",
     hdrs = ["side_pair.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [":check_macros"],
 )
 
@@ -163,7 +143,7 @@
     hdrs = [
         "block_map.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":opt_set",
@@ -181,7 +161,7 @@
     hdrs = [
         "blocking_counter.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":wait",
@@ -196,7 +176,7 @@
     hdrs = [
         "thread_pool.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [
         ":blocking_counter",
@@ -213,7 +193,7 @@
     hdrs = [
         "detect_arm.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
 )
 
@@ -225,7 +205,7 @@
     hdrs = [
         "detect_x86.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [
         ":platform",
@@ -235,7 +215,7 @@
 cc_library(
     name = "path",
     hdrs = ["path.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [
         ":platform",
@@ -251,7 +231,7 @@
     hdrs = [
         "trace.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":block_map",
         ":check_macros",
@@ -263,7 +243,7 @@
 cc_library(
     name = "matrix",
     hdrs = ["matrix.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [":check_macros"],
 )
@@ -271,7 +251,7 @@
 cc_library(
     name = "spec",
     hdrs = ["spec.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [":matrix"],
 )
@@ -279,7 +259,7 @@
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":common",
@@ -293,7 +273,7 @@
     hdrs = [
         "common.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":matrix",
@@ -311,7 +291,7 @@
         "kernel_common.h",
         "kernel_x86.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":common",
@@ -337,7 +317,7 @@
         "pack_common.h",
         "pack_x86.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":common",
@@ -357,7 +337,7 @@
         "kernel_arm32.cc",
         "kernel_arm64.cc",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":common",
         ":kernel_common",
@@ -372,7 +352,7 @@
     srcs = [
         "pack_arm.cc",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":common",
         ":opt_set",
@@ -382,12 +362,17 @@
     ],
 )
 
+# AVX-512 compilation units.
+#
+# These must use the same compiler options.
+RUY_COPTS_BUILT_FOR_AVX512 = ruy_copts_base() + ruy_copts_skylake()
+
 cc_library(
     name = "kernel_avx512",
     srcs = [
         "kernel_avx512.cc",
     ],
-    copts = RUY_COPTS + ruy_copts_skylake(),
+    copts = RUY_COPTS_BUILT_FOR_AVX512,
     deps = [
         ":check_macros",
         ":kernel_common",
@@ -402,7 +387,7 @@
     srcs = [
         "pack_avx512.cc",
     ],
-    copts = RUY_COPTS + ruy_copts_skylake(),
+    copts = RUY_COPTS_BUILT_FOR_AVX512,
     deps = [
         ":check_macros",
         ":matrix",
@@ -415,11 +400,32 @@
 )
 
 cc_library(
+    name = "have_built_path_for_avx512",
+    srcs = [
+        "have_built_path_for_avx512.cc",
+    ],
+    hdrs = [
+        "have_built_path_for.h",
+    ],
+    copts = RUY_COPTS_BUILT_FOR_AVX512,
+    deps = [
+        ":opt_set",
+        ":platform",
+    ],
+)
+# End: AVX-512 compilation units.
+
+# AVX2 compilation units.
+#
+# These must use the same compiler options.
+RUY_COPTS_BUILT_FOR_AVX2 = ruy_copts_base() + ruy_copts_avx2()
+
+cc_library(
     name = "kernel_avx2",
     srcs = [
         "kernel_avx2.cc",
     ],
-    copts = RUY_COPTS + ruy_copts_avx2(),
+    copts = RUY_COPTS_BUILT_FOR_AVX2,
     deps = [
         ":check_macros",
         ":kernel_common",
@@ -434,7 +440,7 @@
     srcs = [
         "pack_avx2.cc",
     ],
-    copts = RUY_COPTS + ruy_copts_avx2(),
+    copts = RUY_COPTS_BUILT_FOR_AVX2,
     deps = [
         ":check_macros",
         ":matrix",
@@ -447,12 +453,28 @@
 )
 
 cc_library(
+    name = "have_built_path_for_avx2",
+    srcs = [
+        "have_built_path_for_avx2.cc",
+    ],
+    hdrs = [
+        "have_built_path_for.h",
+    ],
+    copts = RUY_COPTS_BUILT_FOR_AVX2,
+    deps = [
+        ":opt_set",
+        ":platform",
+    ],
+)
+# End: AVX2 compilation units.
+
+cc_library(
     name = "kernel",
     hdrs = [
         "kernel.h",
         "kernel_common.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":common",
@@ -480,7 +502,7 @@
         "pack.h",
         "pack_common.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":check_macros",
         ":common",
@@ -499,6 +521,18 @@
 )
 
 cc_library(
+    name = "have_built_path_for",
+    hdrs = [
+        "have_built_path_for.h",
+    ],
+    deps = [
+        ":have_built_path_for_avx2",
+        ":have_built_path_for_avx512",
+        ":platform",
+    ],
+)
+
+cc_library(
     name = "context",
     srcs = [
         "context.cc",
@@ -506,13 +540,14 @@
     hdrs = [
         "context.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [
         ":allocator",
         ":check_macros",
         ":detect_arm",
         ":detect_x86",
+        ":have_built_path_for",
         ":path",
         ":platform",
         ":thread_pool",
@@ -521,10 +556,21 @@
     ],
 )
 
+cc_test(
+    name = "context_test",
+    srcs = ["context_test.cc"],
+    deps = [
+        ":context",
+        ":path",
+        ":platform",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "trmul_params",
     hdrs = ["trmul_params.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":internal_matrix",
         ":side_pair",
@@ -536,7 +582,7 @@
     name = "trmul",
     srcs = ["trmul.cc"],
     hdrs = ["trmul.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [
         ":allocator",
         ":block_map",
@@ -568,7 +614,7 @@
         "ruy.h",
         "ruy_advanced.h",
     ],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     visibility = ruy_visibility(),
     deps = [
         ":check_macros",
@@ -610,7 +656,7 @@
     testonly = True,
     srcs = ["pmu.cc"],
     hdrs = ["pmu.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     deps = [":check_macros"],
 )
 
@@ -619,7 +665,7 @@
     name = "test_lib",
     testonly = True,
     hdrs = ["test.h"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     # need defines, not copts, because it's controlling a header, test.h
     defines = ruy_test_ext_defines(),
     linkopts = select({
@@ -640,7 +686,7 @@
 ruy_benchmark(
     name = "benchmark",
     srcs = ["benchmark.cc"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     lhs_rhs_accum_dst = [
         ("f32", "f32", "f32", "f32"),
         ("u8", "u8", "i32", "u8"),
@@ -654,7 +700,7 @@
 ruy_test(
     name = "test_fast",
     srcs = ["test_fast.cc"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     lhs_rhs_accum_dst = [
         ("f32", "f32", "f32", "f32"),
         ("f64", "f32", "f64", "f32"),
@@ -670,7 +716,7 @@
 ruy_test(
     name = "test_slow",
     srcs = ["test_slow.cc"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     lhs_rhs_accum_dst = [
         ("f32", "f32", "f32", "f32"),
         ("u8", "u8", "i32", "u8"),
@@ -684,7 +730,7 @@
 ruy_test(
     name = "test_special_specs",
     srcs = ["test_special_specs.cc"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     lhs_rhs_accum_dst = [
         ("f32", "f32", "f32", "f32"),
         ("u8", "u8", "i32", "u8"),
@@ -695,7 +741,7 @@
 ruy_benchmark_opt_sets(
     name = "benchmark_opt_set",
     srcs = ["benchmark.cc"],
-    copts = RUY_COPTS,
+    copts = ruy_copts_base(),
     lhs_rhs_accum_dst = [
         ("f32", "f32", "f32", "f32"),
         ("u8", "u8", "i32", "u8"),
diff --git a/tensorflow/lite/experimental/ruy/blocking_counter.h b/tensorflow/lite/experimental/ruy/blocking_counter.h
index 40f903b..e8c76d5 100644
--- a/tensorflow/lite/experimental/ruy/blocking_counter.h
+++ b/tensorflow/lite/experimental/ruy/blocking_counter.h
@@ -17,8 +17,8 @@
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_BLOCKING_COUNTER_H_
 
 #include <atomic>
-#include <condition_variable>  // NOLINT(build/c++11)
-#include <mutex>               // NOLINT(build/c++11)
+#include <condition_variable>  // NOLINT(build/c++11) // IWYU pragma: keep
+#include <mutex>               // NOLINT(build/c++11) // IWYU pragma: keep
 
 namespace ruy {
 
diff --git a/tensorflow/lite/experimental/ruy/build_defs.bzl b/tensorflow/lite/experimental/ruy/build_defs.bzl
index e40ed6d..d375b4f 100644
--- a/tensorflow/lite/experimental/ruy/build_defs.bzl
+++ b/tensorflow/lite/experimental/ruy/build_defs.bzl
@@ -1,7 +1,32 @@
 """Build definitions for Ruy."""
 
+def ruy_visibility():
+    return [
+        "//tensorflow/lite/kernels:__subpackages__",
+    ]
+
+# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
+#    ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
+# 2. Explicitly pass -O3 on mobile configs where just "-c opt" means "optimize for code size".
+#    We would want to only do that when compilation_mode is "opt", but limitations of
+#    the "select" keyword (no nested selects, no AND boolean) seem to make that difficult
+#    at the moment. For debugging purposes, this can be overridded on the command line, e.g.
+#      bazel build -c dbg --copt=-O0 ...
+
+def ruy_copts_base():
+    return select({
+        "//tensorflow:android_arm64": ["-O3"],
+        "//tensorflow:android_arm": [
+            "-O3",
+            "-mfpu=neon",
+        ],
+        "//conditions:default": [],
+    })
+
+# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
 def ruy_copts_skylake():
     return []
 
+# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
 def ruy_copts_avx2():
     return []
diff --git a/tensorflow/lite/experimental/ruy/context.cc b/tensorflow/lite/experimental/ruy/context.cc
index 32f222c..aea42cd 100644
--- a/tensorflow/lite/experimental/ruy/context.cc
+++ b/tensorflow/lite/experimental/ruy/context.cc
@@ -18,6 +18,8 @@
 #include "tensorflow/lite/experimental/ruy/check_macros.h"
 #include "tensorflow/lite/experimental/ruy/detect_arm.h"
 #include "tensorflow/lite/experimental/ruy/detect_x86.h"
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/platform.h"
 
 namespace ruy {
 
@@ -47,11 +49,11 @@
       RUY_DCHECK((runtime_enabled_paths_ & Path::kNeonDotprod) == Path::kNone);
     }
   }
-#endif
+#endif  // RUY_PLATFORM(ARM)
 
 #if RUY_PLATFORM(X86)
   if ((runtime_enabled_paths_ & Path::kAvx2) != Path::kNone) {
-    if (!DetectCpuAvx2()) {
+    if (!(HaveBuiltPathForAvx2() && DetectCpuAvx2())) {
       runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx2;
       // Sanity check.
       RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx2) == Path::kNone);
@@ -59,13 +61,13 @@
   }
 
   if ((runtime_enabled_paths_ & Path::kAvx512) != Path::kNone) {
-    if (!DetectCpuAvx512()) {
+    if (!(HaveBuiltPathForAvx512() && DetectCpuAvx512())) {
       runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx512;
       // Sanity check.
       RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx512) == Path::kNone);
     }
   }
-#endif
+#endif  // RUY_PLATFORM(X86)
 
   // Sanity check. We can't possibly have disabled all paths, as some paths
   // are universally available (kReference, kStandardCpp).
diff --git a/tensorflow/lite/experimental/ruy/context_test.cc b/tensorflow/lite/experimental/ruy/context_test.cc
new file mode 100644
index 0000000..1a184b8
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/context_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/context.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/ruy/path.h"
+#include "tensorflow/lite/experimental/ruy/platform.h"
+
+namespace ruy {
+namespace {
+
+TEST(ContextTest, EnabledPathsGeneral) {
+  ruy::Context ruy_context;
+  const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+  const auto ruy_paths_repeat = ruy_context.GetRuntimeEnabledPaths();
+  ASSERT_EQ(ruy_paths, ruy_paths_repeat);
+  EXPECT_NE(ruy_paths, Path::kNone);
+  EXPECT_EQ(ruy_paths & Path::kReference, Path::kReference);
+  EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kStandardCpp);
+}
+
+#if RUY_PLATFORM(X86)
+TEST(ContextTest, EnabledPathsX86) {
+  ruy::Context ruy_context;
+  ruy_context.SetRuntimeEnabledPaths(Path::kAvx2 | Path::kAvx512);
+  const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+  EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
+  EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
+}
+#endif  // RUY_PLATFORM(X86)
+
+#if RUY_PLATFORM(ARM)
+TEST(ContextTest, EnabledPathsArm) {
+  ruy::Context ruy_context;
+  ruy_context.SetRuntimeEnabledPaths(Path::kNeon | Path::kNeonDotprod);
+  const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+  EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
+  EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
+  EXPECT_EQ(ruy_paths & Path::kNeon, Path::kNeon);
+}
+#endif  // RUY_PLATFORM(ARM)
+
+}  // namespace
+}  // namespace ruy
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for.h b/tensorflow/lite/experimental/ruy/have_built_path_for.h
new file mode 100644
index 0000000..4e340f5
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for.h
@@ -0,0 +1,30 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
+
+#include "tensorflow/lite/experimental/ruy/platform.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+bool HaveBuiltPathForAvx2();
+bool HaveBuiltPathForAvx512();
+#endif  // RUY_PLATFORM(X86)
+
+}  // namespace ruy
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc b/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc
new file mode 100644
index 0000000..be694ce
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/opt_set.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+// IMPORTANT:
+// These patterns must match those in the pack and kernel cc files.
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+bool HaveBuiltPathForAvx2() { return false; }
+
+#else  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+bool HaveBuiltPathForAvx2() { return true; }
+
+#endif  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif  // RUY_PLATFORM(X86)
+
+}  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc b/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc
new file mode 100644
index 0000000..ccfea77
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/opt_set.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+// IMPORTANT:
+// These patterns must match those in the pack and kernel cc files.
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+bool HaveBuiltPathForAvx512() { return false; }
+
+#else  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+bool HaveBuiltPathForAvx512() { return true; }
+
+#endif  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif  // RUY_PLATFORM(X86)
+
+}  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/kernel_avx2.cc b/tensorflow/lite/experimental/ruy/kernel_avx2.cc
index d1a3150..eb38add 100644
--- a/tensorflow/lite/experimental/ruy/kernel_avx2.cc
+++ b/tensorflow/lite/experimental/ruy/kernel_avx2.cc
@@ -28,7 +28,19 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Kernel8bitAvx2(const KernelParams8bit<8, 8>& params) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+void KernelFloatAvx2(const KernelParamsFloat<8, 8>& params) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+#else  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
 static constexpr int kAvxFloatBlockSize = 8;
 static constexpr int kAvx8bitBlockSize = 8;
diff --git a/tensorflow/lite/experimental/ruy/kernel_avx512.cc b/tensorflow/lite/experimental/ruy/kernel_avx512.cc
index bdcd7eb..98bff00 100644
--- a/tensorflow/lite/experimental/ruy/kernel_avx512.cc
+++ b/tensorflow/lite/experimental/ruy/kernel_avx512.cc
@@ -28,7 +28,19 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+void KernelFloatAvx512(const KernelParamsFloat<16, 16>& params) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+#else  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
 inline std::int32_t mm512_get1_epi32(const __m512i v, int i) {
   __m256i a =
diff --git a/tensorflow/lite/experimental/ruy/kernel_x86.h b/tensorflow/lite/experimental/ruy/kernel_x86.h
index 31269ea..78dcffb 100644
--- a/tensorflow/lite/experimental/ruy/kernel_x86.h
+++ b/tensorflow/lite/experimental/ruy/kernel_x86.h
@@ -30,7 +30,7 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if RUY_PLATFORM(X86)
 void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params);
 
 template <typename DstScalar>
@@ -69,9 +69,7 @@
     KernelFloatAvx512(params);
   }
 };
-#endif  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 void Kernel8bitAvx2(const KernelParams8bit<8, 8>& params);
 
 template <typename DstScalar>
@@ -110,7 +108,7 @@
     KernelFloatAvx2(params);
   }
 };
-#endif  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif  // RUY_PLATFORM(X86)
 
 }  // namespace ruy
 
diff --git a/tensorflow/lite/experimental/ruy/pack_avx2.cc b/tensorflow/lite/experimental/ruy/pack_avx2.cc
index f89bf62..7483419 100644
--- a/tensorflow/lite/experimental/ruy/pack_avx2.cc
+++ b/tensorflow/lite/experimental/ruy/pack_avx2.cc
@@ -30,7 +30,23 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_INTRINSICS)
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Pack8bitAvx2(const std::int8_t* src_ptr, std::int8_t input_xor,
+                  const std::int8_t* zerobuf, int src_stride,
+                  int remaining_src_cols, int src_rows, std::int8_t* packed_ptr,
+                  std::int32_t* sums_ptr) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+void PackFloatAvx2(const float* src_ptr, const float* zerobuf, int src_stride,
+                   int remaining_src_cols, int src_rows, float* packed_ptr) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+#else  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
 static constexpr int kAvxFloatBlockSize = 8;
 static constexpr int kAvx8bitBlockSize = 8;
diff --git a/tensorflow/lite/experimental/ruy/pack_avx512.cc b/tensorflow/lite/experimental/ruy/pack_avx512.cc
index f795423..0c14660 100644
--- a/tensorflow/lite/experimental/ruy/pack_avx512.cc
+++ b/tensorflow/lite/experimental/ruy/pack_avx512.cc
@@ -30,7 +30,23 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_INTRINSICS)
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Pack8bitAvx512(const std::int8_t* src_ptr, std::int8_t input_xor,
+                    const std::int8_t* zerobuf, int src_stride,
+                    int remaining_src_cols, int src_rows,
+                    std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+void PackFloatAvx512(const float* src_ptr, const float* zerobuf, int src_stride,
+                     int remaining_src_cols, int src_rows, float* packed_ptr) {
+  // CPU-ID-based checks should disable the path that would reach this point.
+  RUY_DCHECK(false);
+}
+
+#else  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
 // The first int8_t template parameter is arbitrary: this routine is common to
 // all 8-bit source matrix types.
diff --git a/tensorflow/lite/experimental/ruy/pack_x86.h b/tensorflow/lite/experimental/ruy/pack_x86.h
index 96c5a97..cf8b097 100644
--- a/tensorflow/lite/experimental/ruy/pack_x86.h
+++ b/tensorflow/lite/experimental/ruy/pack_x86.h
@@ -84,6 +84,7 @@
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_X86_H_
 
 #include <cstdint>
+#include <cstring>
 #include <type_traits>
 
 #include "profiling/instrumentation.h"
@@ -99,7 +100,7 @@
 
 namespace ruy {
 
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if RUY_PLATFORM(X86)
 // Note that source and zero buffers can be uint8 type, but in the packing
 // function are reinterpreted as int8, and are XOR-ed with input_xor.
 void Pack8bitAvx2(const std::int8_t* src_ptr, std::int8_t input_xor,
@@ -180,9 +181,7 @@
     }
   }
 };
-#endif  // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
 // Note that source and zero buffers can be uint8 type, but in the packing
 // function are reinterpreted as int8, and are XOR-ed with input_xor.
 void Pack8bitAvx512(const std::int8_t* src_ptr, std::int8_t input_xor,
@@ -266,7 +265,7 @@
     }
   }
 };
-#endif  // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif  // RUY_PLATFORM(X86)
 
 }  // namespace ruy
 
diff --git a/tensorflow/lite/experimental/ruy/path.h b/tensorflow/lite/experimental/ruy/path.h
index 43e8ae4..8d861a0 100644
--- a/tensorflow/lite/experimental/ruy/path.h
+++ b/tensorflow/lite/experimental/ruy/path.h
@@ -80,7 +80,7 @@
   // Optimized path making use of ARM NEON dot product instructions that are
   // available on newer ARM cores.
   kNeonDotprod = 0x8,
-#endif
+#endif  // RUY_PLATFORM(ARM)
 
 #if RUY_PLATFORM(X86)
   // x86 architectures.
@@ -89,7 +89,7 @@
   kAvx2 = 0x4,
   // Optimized for AVX-512.
   kAvx512 = 0x8,
-#endif
+#endif  // RUY_PLATFORM(X86)
 };
 
 inline constexpr Path operator|(Path p, Path q) {
diff --git a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl b/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
deleted file mode 100644
index 3668ada..0000000
--- a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Control of ruy visibility"""
-
-def ruy_visibility():
-    return [
-        "//tensorflow/lite/kernels:__subpackages__",
-    ]
diff --git a/tensorflow/lite/experimental/ruy/time.h b/tensorflow/lite/experimental/ruy/time.h
index 07d6caa..d96ed34 100644
--- a/tensorflow/lite/experimental/ruy/time.h
+++ b/tensorflow/lite/experimental/ruy/time.h
@@ -21,7 +21,8 @@
 #include <ratio>    // NOLINT(build/c++11)
 
 #ifdef __linux__
-#include <sys/time.h>  // for CLOCK_MONOTONIC_COARSE
+#include <sys/time.h>
+// IWYU pragma: no_include <type_traits>
 
 #include <ctime>
 #endif
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
index 427c6ab..d545b80 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
@@ -207,8 +207,8 @@
     // compromise between something that works and something that's simple
     // enough code that doesn't feel too ad-hoc. As above in the float path,
     // abs_mean_diff is subject to a stricter requirement as it is a bias.
-    tolerated_relative_mean_abs_diff = std::sqrt(inverse_size);
-    tolerated_relative_abs_mean_diff = inverse_size;
+    tolerated_relative_mean_abs_diff = std::sqrt(inverse_size) * 0.5;
+    tolerated_relative_abs_mean_diff = inverse_size * 2.;
   }
 
   double tolerated_max_abs_diff =