Automated rollback of commit b0c2468b4b84ce3ae443390ea5d84cb6e950aee7
PiperOrigin-RevId: 265455751
diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD
index dd01acf..5608727 100644
--- a/tensorflow/lite/experimental/ruy/BUILD
+++ b/tensorflow/lite/experimental/ruy/BUILD
@@ -2,31 +2,11 @@
# TODO(b/123403203) actually make TFLite use ruy.
-load(":build_defs.bzl", "ruy_copts_avx2", "ruy_copts_skylake")
-load(":ruy_visibility.bzl", "ruy_visibility")
+load(":build_defs.bzl", "ruy_copts_avx2", "ruy_copts_base", "ruy_copts_skylake", "ruy_visibility")
load(":ruy_test_ext.bzl", "ruy_test_ext_defines", "ruy_test_ext_deps")
load(":ruy_test.bzl", "ruy_benchmark", "ruy_benchmark_opt_sets", "ruy_test")
load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
-# ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
-# 2. Explicitly pass -O3 on mobile configs where just "-c opt" means "optimize for code size".
-# We would want to only do that when compilation_mode is "opt", but limitations of
-# the "select" keyword (no nested selects, no AND boolean) seem to make that difficult
-# at the moment. For debugging purposes, one needs to manually edit this to remove these
-# -O3. Otherwise, not even `bazel build --copt=-O0` will override that.
-RUY_COPTS = select({
- "//tensorflow:android_arm64": [
- "-O3",
- ],
- "//tensorflow:android_arm": [
- "-O3",
- "-mfpu=neon",
- ],
- "//conditions:default": [
- ],
-})
-
package(
default_visibility = ["//visibility:private"],
licenses = ["notice"], # Apache 2.0
@@ -35,33 +15,33 @@
cc_library(
name = "platform",
hdrs = ["platform.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
)
cc_library(
name = "check_macros",
hdrs = ["check_macros.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = ["//tensorflow/lite/kernels/internal:compatibility"],
)
cc_library(
name = "opt_set",
hdrs = ["opt_set.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
)
cc_library(
name = "time",
hdrs = ["time.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
)
cc_library(
name = "wait",
srcs = ["wait.cc"],
hdrs = ["wait.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [":time"],
)
@@ -78,7 +58,7 @@
cc_library(
name = "size_util",
hdrs = ["size_util.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [":check_macros"],
)
@@ -99,7 +79,7 @@
hdrs = [
"tune.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":opt_set",
":platform",
@@ -132,7 +112,7 @@
hdrs = [
"allocator.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":size_util",
@@ -151,7 +131,7 @@
cc_library(
name = "side_pair",
hdrs = ["side_pair.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [":check_macros"],
)
@@ -163,7 +143,7 @@
hdrs = [
"block_map.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":opt_set",
@@ -181,7 +161,7 @@
hdrs = [
"blocking_counter.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":wait",
@@ -196,7 +176,7 @@
hdrs = [
"thread_pool.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [
":blocking_counter",
@@ -213,7 +193,7 @@
hdrs = [
"detect_arm.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
)
@@ -225,7 +205,7 @@
hdrs = [
"detect_x86.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [
":platform",
@@ -235,7 +215,7 @@
cc_library(
name = "path",
hdrs = ["path.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [
":platform",
@@ -251,7 +231,7 @@
hdrs = [
"trace.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":block_map",
":check_macros",
@@ -263,7 +243,7 @@
cc_library(
name = "matrix",
hdrs = ["matrix.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [":check_macros"],
)
@@ -271,7 +251,7 @@
cc_library(
name = "spec",
hdrs = ["spec.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [":matrix"],
)
@@ -279,7 +259,7 @@
cc_library(
name = "internal_matrix",
hdrs = ["internal_matrix.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":common",
@@ -293,7 +273,7 @@
hdrs = [
"common.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":matrix",
@@ -311,7 +291,7 @@
"kernel_common.h",
"kernel_x86.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":common",
@@ -337,7 +317,7 @@
"pack_common.h",
"pack_x86.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":common",
@@ -357,7 +337,7 @@
"kernel_arm32.cc",
"kernel_arm64.cc",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":common",
":kernel_common",
@@ -372,7 +352,7 @@
srcs = [
"pack_arm.cc",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":common",
":opt_set",
@@ -382,12 +362,17 @@
],
)
+# AVX-512 compilation units.
+#
+# These must use the same compiler options.
+RUY_COPTS_BUILT_FOR_AVX512 = ruy_copts_base() + ruy_copts_skylake()
+
cc_library(
name = "kernel_avx512",
srcs = [
"kernel_avx512.cc",
],
- copts = RUY_COPTS + ruy_copts_skylake(),
+ copts = RUY_COPTS_BUILT_FOR_AVX512,
deps = [
":check_macros",
":kernel_common",
@@ -402,7 +387,7 @@
srcs = [
"pack_avx512.cc",
],
- copts = RUY_COPTS + ruy_copts_skylake(),
+ copts = RUY_COPTS_BUILT_FOR_AVX512,
deps = [
":check_macros",
":matrix",
@@ -415,11 +400,32 @@
)
cc_library(
+ name = "have_built_path_for_avx512",
+ srcs = [
+ "have_built_path_for_avx512.cc",
+ ],
+ hdrs = [
+ "have_built_path_for.h",
+ ],
+ copts = RUY_COPTS_BUILT_FOR_AVX512,
+ deps = [
+ ":opt_set",
+ ":platform",
+ ],
+)
+# End: AVX-512 compilation units.
+
+# AVX2 compilation units.
+#
+# These must use the same compiler options.
+RUY_COPTS_BUILT_FOR_AVX2 = ruy_copts_base() + ruy_copts_avx2()
+
+cc_library(
name = "kernel_avx2",
srcs = [
"kernel_avx2.cc",
],
- copts = RUY_COPTS + ruy_copts_avx2(),
+ copts = RUY_COPTS_BUILT_FOR_AVX2,
deps = [
":check_macros",
":kernel_common",
@@ -434,7 +440,7 @@
srcs = [
"pack_avx2.cc",
],
- copts = RUY_COPTS + ruy_copts_avx2(),
+ copts = RUY_COPTS_BUILT_FOR_AVX2,
deps = [
":check_macros",
":matrix",
@@ -447,12 +453,28 @@
)
cc_library(
+ name = "have_built_path_for_avx2",
+ srcs = [
+ "have_built_path_for_avx2.cc",
+ ],
+ hdrs = [
+ "have_built_path_for.h",
+ ],
+ copts = RUY_COPTS_BUILT_FOR_AVX2,
+ deps = [
+ ":opt_set",
+ ":platform",
+ ],
+)
+# End: AVX2 compilation units.
+
+cc_library(
name = "kernel",
hdrs = [
"kernel.h",
"kernel_common.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":common",
@@ -480,7 +502,7 @@
"pack.h",
"pack_common.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":check_macros",
":common",
@@ -499,6 +521,18 @@
)
cc_library(
+ name = "have_built_path_for",
+ hdrs = [
+ "have_built_path_for.h",
+ ],
+ deps = [
+ ":have_built_path_for_avx2",
+ ":have_built_path_for_avx512",
+ ":platform",
+ ],
+)
+
+cc_library(
name = "context",
srcs = [
"context.cc",
@@ -506,13 +540,14 @@
hdrs = [
"context.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [
":allocator",
":check_macros",
":detect_arm",
":detect_x86",
+ ":have_built_path_for",
":path",
":platform",
":thread_pool",
@@ -521,10 +556,21 @@
],
)
+cc_test(
+ name = "context_test",
+ srcs = ["context_test.cc"],
+ deps = [
+ ":context",
+ ":path",
+ ":platform",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
cc_library(
name = "trmul_params",
hdrs = ["trmul_params.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":internal_matrix",
":side_pair",
@@ -536,7 +582,7 @@
name = "trmul",
srcs = ["trmul.cc"],
hdrs = ["trmul.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [
":allocator",
":block_map",
@@ -568,7 +614,7 @@
"ruy.h",
"ruy_advanced.h",
],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
visibility = ruy_visibility(),
deps = [
":check_macros",
@@ -610,7 +656,7 @@
testonly = True,
srcs = ["pmu.cc"],
hdrs = ["pmu.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
deps = [":check_macros"],
)
@@ -619,7 +665,7 @@
name = "test_lib",
testonly = True,
hdrs = ["test.h"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
# need defines, not copts, because it's controlling a header, test.h
defines = ruy_test_ext_defines(),
linkopts = select({
@@ -640,7 +686,7 @@
ruy_benchmark(
name = "benchmark",
srcs = ["benchmark.cc"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@@ -654,7 +700,7 @@
ruy_test(
name = "test_fast",
srcs = ["test_fast.cc"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("f64", "f32", "f64", "f32"),
@@ -670,7 +716,7 @@
ruy_test(
name = "test_slow",
srcs = ["test_slow.cc"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@@ -684,7 +730,7 @@
ruy_test(
name = "test_special_specs",
srcs = ["test_special_specs.cc"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
@@ -695,7 +741,7 @@
ruy_benchmark_opt_sets(
name = "benchmark_opt_set",
srcs = ["benchmark.cc"],
- copts = RUY_COPTS,
+ copts = ruy_copts_base(),
lhs_rhs_accum_dst = [
("f32", "f32", "f32", "f32"),
("u8", "u8", "i32", "u8"),
diff --git a/tensorflow/lite/experimental/ruy/blocking_counter.h b/tensorflow/lite/experimental/ruy/blocking_counter.h
index 40f903b..e8c76d5 100644
--- a/tensorflow/lite/experimental/ruy/blocking_counter.h
+++ b/tensorflow/lite/experimental/ruy/blocking_counter.h
@@ -17,8 +17,8 @@
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_BLOCKING_COUNTER_H_
#include <atomic>
-#include <condition_variable> // NOLINT(build/c++11)
-#include <mutex> // NOLINT(build/c++11)
+#include <condition_variable> // NOLINT(build/c++11) // IWYU pragma: keep
+#include <mutex> // NOLINT(build/c++11) // IWYU pragma: keep
namespace ruy {
diff --git a/tensorflow/lite/experimental/ruy/build_defs.bzl b/tensorflow/lite/experimental/ruy/build_defs.bzl
index e40ed6d..d375b4f 100644
--- a/tensorflow/lite/experimental/ruy/build_defs.bzl
+++ b/tensorflow/lite/experimental/ruy/build_defs.bzl
@@ -1,7 +1,32 @@
"""Build definitions for Ruy."""
+def ruy_visibility():
+ return [
+ "//tensorflow/lite/kernels:__subpackages__",
+ ]
+
+# 1. Enable -mfpu=neon unconditionally on ARM32. If it turns out that we need to support
+# ARM32 without NEON then we'll implement runtime detection and dispatch at that point.
+# 2. Explicitly pass -O3 on mobile configs where just "-c opt" means "optimize for code size".
+# We would want to only do that when compilation_mode is "opt", but limitations of
+# the "select" keyword (no nested selects, no AND boolean) seem to make that difficult
+# at the moment. For debugging purposes, this can be overridded on the command line, e.g.
+# bazel build -c dbg --copt=-O0 ...
+
+def ruy_copts_base():
+ return select({
+ "//tensorflow:android_arm64": ["-O3"],
+ "//tensorflow:android_arm": [
+ "-O3",
+ "-mfpu=neon",
+ ],
+ "//conditions:default": [],
+ })
+
+# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
def ruy_copts_skylake():
return []
+# Used for targets that are compiled with extra features that are skipped at runtime if unavailable.
def ruy_copts_avx2():
return []
diff --git a/tensorflow/lite/experimental/ruy/context.cc b/tensorflow/lite/experimental/ruy/context.cc
index 32f222c..aea42cd 100644
--- a/tensorflow/lite/experimental/ruy/context.cc
+++ b/tensorflow/lite/experimental/ruy/context.cc
@@ -18,6 +18,8 @@
#include "tensorflow/lite/experimental/ruy/check_macros.h"
#include "tensorflow/lite/experimental/ruy/detect_arm.h"
#include "tensorflow/lite/experimental/ruy/detect_x86.h"
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/platform.h"
namespace ruy {
@@ -47,11 +49,11 @@
RUY_DCHECK((runtime_enabled_paths_ & Path::kNeonDotprod) == Path::kNone);
}
}
-#endif
+#endif // RUY_PLATFORM(ARM)
#if RUY_PLATFORM(X86)
if ((runtime_enabled_paths_ & Path::kAvx2) != Path::kNone) {
- if (!DetectCpuAvx2()) {
+ if (!(HaveBuiltPathForAvx2() && DetectCpuAvx2())) {
runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx2;
// Sanity check.
RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx2) == Path::kNone);
@@ -59,13 +61,13 @@
}
if ((runtime_enabled_paths_ & Path::kAvx512) != Path::kNone) {
- if (!DetectCpuAvx512()) {
+ if (!(HaveBuiltPathForAvx512() && DetectCpuAvx512())) {
runtime_enabled_paths_ = runtime_enabled_paths_ & ~Path::kAvx512;
// Sanity check.
RUY_DCHECK((runtime_enabled_paths_ & Path::kAvx512) == Path::kNone);
}
}
-#endif
+#endif // RUY_PLATFORM(X86)
// Sanity check. We can't possibly have disabled all paths, as some paths
// are universally available (kReference, kStandardCpp).
diff --git a/tensorflow/lite/experimental/ruy/context_test.cc b/tensorflow/lite/experimental/ruy/context_test.cc
new file mode 100644
index 0000000..1a184b8
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/context_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/context.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/ruy/path.h"
+#include "tensorflow/lite/experimental/ruy/platform.h"
+
+namespace ruy {
+namespace {
+
+TEST(ContextTest, EnabledPathsGeneral) {
+ ruy::Context ruy_context;
+ const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+ const auto ruy_paths_repeat = ruy_context.GetRuntimeEnabledPaths();
+ ASSERT_EQ(ruy_paths, ruy_paths_repeat);
+ EXPECT_NE(ruy_paths, Path::kNone);
+ EXPECT_EQ(ruy_paths & Path::kReference, Path::kReference);
+ EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kStandardCpp);
+}
+
+#if RUY_PLATFORM(X86)
+TEST(ContextTest, EnabledPathsX86) {
+ ruy::Context ruy_context;
+ ruy_context.SetRuntimeEnabledPaths(Path::kAvx2 | Path::kAvx512);
+ const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+ EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
+ EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
+}
+#endif // RUY_PLATFORM(X86)
+
+#if RUY_PLATFORM(ARM)
+TEST(ContextTest, EnabledPathsArm) {
+ ruy::Context ruy_context;
+ ruy_context.SetRuntimeEnabledPaths(Path::kNeon | Path::kNeonDotprod);
+ const auto ruy_paths = ruy_context.GetRuntimeEnabledPaths();
+ EXPECT_EQ(ruy_paths & Path::kReference, Path::kNone);
+ EXPECT_EQ(ruy_paths & Path::kStandardCpp, Path::kNone);
+ EXPECT_EQ(ruy_paths & Path::kNeon, Path::kNeon);
+}
+#endif // RUY_PLATFORM(ARM)
+
+} // namespace
+} // namespace ruy
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for.h b/tensorflow/lite/experimental/ruy/have_built_path_for.h
new file mode 100644
index 0000000..4e340f5
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for.h
@@ -0,0 +1,30 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
+
+#include "tensorflow/lite/experimental/ruy/platform.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+bool HaveBuiltPathForAvx2();
+bool HaveBuiltPathForAvx512();
+#endif // RUY_PLATFORM(X86)
+
+} // namespace ruy
+
+#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_HAVE_BUILT_PATH_FOR_H_
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc b/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc
new file mode 100644
index 0000000..be694ce
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for_avx2.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/opt_set.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+// IMPORTANT:
+// These patterns must match those in the pack and kernel cc files.
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+bool HaveBuiltPathForAvx2() { return false; }
+
+#else // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+bool HaveBuiltPathForAvx2() { return true; }
+
+#endif // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif // RUY_PLATFORM(X86)
+
+} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc b/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc
new file mode 100644
index 0000000..ccfea77
--- /dev/null
+++ b/tensorflow/lite/experimental/ruy/have_built_path_for_avx512.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/have_built_path_for.h"
+#include "tensorflow/lite/experimental/ruy/opt_set.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(X86)
+// IMPORTANT:
+// These patterns must match those in the pack and kernel cc files.
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+bool HaveBuiltPathForAvx512() { return false; }
+
+#else // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+bool HaveBuiltPathForAvx512() { return true; }
+
+#endif // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif // RUY_PLATFORM(X86)
+
+} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/kernel_avx2.cc b/tensorflow/lite/experimental/ruy/kernel_avx2.cc
index d1a3150..eb38add 100644
--- a/tensorflow/lite/experimental/ruy/kernel_avx2.cc
+++ b/tensorflow/lite/experimental/ruy/kernel_avx2.cc
@@ -28,7 +28,19 @@
namespace ruy {
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Kernel8bitAvx2(const KernelParams8bit<8, 8>& params) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+void KernelFloatAvx2(const KernelParamsFloat<8, 8>& params) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+#else // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
static constexpr int kAvxFloatBlockSize = 8;
static constexpr int kAvx8bitBlockSize = 8;
diff --git a/tensorflow/lite/experimental/ruy/kernel_avx512.cc b/tensorflow/lite/experimental/ruy/kernel_avx512.cc
index bdcd7eb..98bff00 100644
--- a/tensorflow/lite/experimental/ruy/kernel_avx512.cc
+++ b/tensorflow/lite/experimental/ruy/kernel_avx512.cc
@@ -28,7 +28,19 @@
namespace ruy {
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+void KernelFloatAvx512(const KernelParamsFloat<16, 16>& params) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+#else // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
inline std::int32_t mm512_get1_epi32(const __m512i v, int i) {
__m256i a =
diff --git a/tensorflow/lite/experimental/ruy/kernel_x86.h b/tensorflow/lite/experimental/ruy/kernel_x86.h
index 31269ea..78dcffb 100644
--- a/tensorflow/lite/experimental/ruy/kernel_x86.h
+++ b/tensorflow/lite/experimental/ruy/kernel_x86.h
@@ -30,7 +30,7 @@
namespace ruy {
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if RUY_PLATFORM(X86)
void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params);
template <typename DstScalar>
@@ -69,9 +69,7 @@
KernelFloatAvx512(params);
}
};
-#endif // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
void Kernel8bitAvx2(const KernelParams8bit<8, 8>& params);
template <typename DstScalar>
@@ -110,7 +108,7 @@
KernelFloatAvx2(params);
}
};
-#endif // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif // RUY_PLATFORM(X86)
} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/pack_avx2.cc b/tensorflow/lite/experimental/ruy/pack_avx2.cc
index f89bf62..7483419 100644
--- a/tensorflow/lite/experimental/ruy/pack_avx2.cc
+++ b/tensorflow/lite/experimental/ruy/pack_avx2.cc
@@ -30,7 +30,23 @@
namespace ruy {
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_INTRINSICS)
+#if !(RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Pack8bitAvx2(const std::int8_t* src_ptr, std::int8_t input_xor,
+ const std::int8_t* zerobuf, int src_stride,
+ int remaining_src_cols, int src_rows, std::int8_t* packed_ptr,
+ std::int32_t* sums_ptr) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+void PackFloatAvx2(const float* src_ptr, const float* zerobuf, int src_stride,
+ int remaining_src_cols, int src_rows, float* packed_ptr) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+#else // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
static constexpr int kAvxFloatBlockSize = 8;
static constexpr int kAvx8bitBlockSize = 8;
diff --git a/tensorflow/lite/experimental/ruy/pack_avx512.cc b/tensorflow/lite/experimental/ruy/pack_avx512.cc
index f795423..0c14660 100644
--- a/tensorflow/lite/experimental/ruy/pack_avx512.cc
+++ b/tensorflow/lite/experimental/ruy/pack_avx512.cc
@@ -30,7 +30,23 @@
namespace ruy {
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_INTRINSICS)
+#if !(RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM))
+
+void Pack8bitAvx512(const std::int8_t* src_ptr, std::int8_t input_xor,
+ const std::int8_t* zerobuf, int src_stride,
+ int remaining_src_cols, int src_rows,
+ std::int8_t* packed_ptr, std::int32_t* sums_ptr) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+void PackFloatAvx512(const float* src_ptr, const float* zerobuf, int src_stride,
+ int remaining_src_cols, int src_rows, float* packed_ptr) {
+ // CPU-ID-based checks should disable the path that would reach this point.
+ RUY_DCHECK(false);
+}
+
+#else // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
// The first int8_t template parameter is arbitrary: this routine is common to
// all 8-bit source matrix types.
diff --git a/tensorflow/lite/experimental/ruy/pack_x86.h b/tensorflow/lite/experimental/ruy/pack_x86.h
index 96c5a97..cf8b097 100644
--- a/tensorflow/lite/experimental/ruy/pack_x86.h
+++ b/tensorflow/lite/experimental/ruy/pack_x86.h
@@ -84,6 +84,7 @@
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_X86_H_
#include <cstdint>
+#include <cstring>
#include <type_traits>
#include "profiling/instrumentation.h"
@@ -99,7 +100,7 @@
namespace ruy {
-#if RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#if RUY_PLATFORM(X86)
// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitAvx2(const std::int8_t* src_ptr, std::int8_t input_xor,
@@ -180,9 +181,7 @@
}
}
};
-#endif // RUY_PLATFORM(AVX2) && RUY_OPT_ENABLED(RUY_OPT_ASM)
-#if RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
// Note that source and zero buffers can be uint8 type, but in the packing
// function are reinterpreted as int8, and are XOR-ed with input_xor.
void Pack8bitAvx512(const std::int8_t* src_ptr, std::int8_t input_xor,
@@ -266,7 +265,7 @@
}
}
};
-#endif // RUY_PLATFORM(AVX512) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+#endif // RUY_PLATFORM(X86)
} // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/path.h b/tensorflow/lite/experimental/ruy/path.h
index 43e8ae4..8d861a0 100644
--- a/tensorflow/lite/experimental/ruy/path.h
+++ b/tensorflow/lite/experimental/ruy/path.h
@@ -80,7 +80,7 @@
// Optimized path making use of ARM NEON dot product instructions that are
// available on newer ARM cores.
kNeonDotprod = 0x8,
-#endif
+#endif // RUY_PLATFORM(ARM)
#if RUY_PLATFORM(X86)
// x86 architectures.
@@ -89,7 +89,7 @@
kAvx2 = 0x4,
// Optimized for AVX-512.
kAvx512 = 0x8,
-#endif
+#endif // RUY_PLATFORM(X86)
};
inline constexpr Path operator|(Path p, Path q) {
diff --git a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl b/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
deleted file mode 100644
index 3668ada..0000000
--- a/tensorflow/lite/experimental/ruy/ruy_visibility.bzl
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Control of ruy visibility"""
-
-def ruy_visibility():
- return [
- "//tensorflow/lite/kernels:__subpackages__",
- ]
diff --git a/tensorflow/lite/experimental/ruy/time.h b/tensorflow/lite/experimental/ruy/time.h
index 07d6caa..d96ed34 100644
--- a/tensorflow/lite/experimental/ruy/time.h
+++ b/tensorflow/lite/experimental/ruy/time.h
@@ -21,7 +21,8 @@
#include <ratio> // NOLINT(build/c++11)
#ifdef __linux__
-#include <sys/time.h> // for CLOCK_MONOTONIC_COARSE
+#include <sys/time.h>
+// IWYU pragma: no_include <type_traits>
#include <ctime>
#endif
diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
index 427c6ab..d545b80 100644
--- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
+++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc
@@ -207,8 +207,8 @@
// compromise between something that works and something that's simple
// enough code that doesn't feel too ad-hoc. As above in the float path,
// abs_mean_diff is subject to a stricter requirement as it is a bias.
- tolerated_relative_mean_abs_diff = std::sqrt(inverse_size);
- tolerated_relative_abs_mean_diff = inverse_size;
+ tolerated_relative_mean_abs_diff = std::sqrt(inverse_size) * 0.5;
+ tolerated_relative_abs_mean_diff = inverse_size * 2.;
}
double tolerated_max_abs_diff =