Use the cpuinfo library instead of our own code for CPU feature detection.
PiperOrigin-RevId: 313469646
diff --git a/WORKSPACE b/WORKSPACE
index abdd661..587ed6e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -27,3 +27,26 @@
"https://github.com/google/googletest/archive/release-1.8.1.tar.gz",
],
)
+
+# clog library, used by cpuinfo for logging
+http_archive(
+ name = "clog",
+ strip_prefix = "cpuinfo-d5e37adf1406cf899d7d9ec1d317c47506ccb970",
+ sha256 = "3f2dc1970f397a0e59db72f9fca6ff144b216895c1d606f6c94a507c1e53a025",
+ urls = [
+ "https://github.com/pytorch/cpuinfo/archive/d5e37adf1406cf899d7d9ec1d317c47506ccb970.tar.gz",
+ ],
+ build_file = "@//third_party:clog.BUILD",
+)
+
+# cpuinfo library, used for detecting processor characteristics
+http_archive(
+ name = "cpuinfo",
+ strip_prefix = "cpuinfo-c2092219e7c874783a00a62edb94ddc672f57ab3",
+ sha256 = "ea56c399a4f6ca5f749e71acb6a7bfdc653eb65d8f658cb2e414a2fcdca1fe8b",
+ urls = [
+ "https://github.com/pytorch/cpuinfo/archive/c2092219e7c874783a00a62edb94ddc672f57ab3.zip",
+ ],
+ build_file = "@//third_party:cpuinfo.BUILD",
+ patches = ["@//third_party:cpuinfo.patch"],
+)
diff --git a/ruy/BUILD b/ruy/BUILD
index 589fb6f..10ea17b 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -20,6 +20,18 @@
)
config_setting(
+ name = "ppc",
+ values = {
+ "cpu": "ppc",
+ },
+)
+
+config_setting(
+ name = "fuchsia",
+ values = {"cpu": "fuchsia"},
+)
+
+config_setting(
name = "optimized",
values = {
"compilation_mode": "opt",
@@ -278,31 +290,23 @@
)
cc_library(
- name = "detect_arm",
+ name = "cpuinfo",
srcs = [
- "detect_arm.cc",
+ "cpuinfo.cc",
],
hdrs = [
- "detect_arm.h",
+ "cpuinfo.h",
],
- copts = ruy_copts(),
- deps = [
- ":platform",
+ copts = ruy_copts() + [
+ # ruy_copts contains -Wundef, but cpuinfo's header warns with that.
+ "-Wno-undef",
],
-)
-
-cc_library(
- name = "detect_x86",
- srcs = [
- "detect_x86.cc",
- ],
- hdrs = [
- "detect_x86.h",
- ],
- copts = ruy_copts(),
- deps = [
- ":platform",
- ],
+ deps = [":platform"] + select({
+ # cpuinfo does not build on ppc.
+ ":ppc": [],
+ ":fuchsia": [],
+ "//conditions:default": ["@cpuinfo"],
+ }),
)
cc_library(
@@ -752,8 +756,6 @@
":allocator",
":check_macros",
":ctx",
- ":detect_arm",
- ":detect_x86",
":have_built_path_for",
":path",
":platform",
@@ -799,8 +801,7 @@
deps = [
":allocator",
":check_macros",
- ":detect_arm",
- ":detect_x86",
+ ":cpuinfo",
":have_built_path_for",
":path",
":platform",
@@ -959,7 +960,7 @@
":context",
":ctx",
":context_get_ctx",
- "//ruy/profiler:profiler",
+ "//ruy/profiler",
] + ruy_test_ext_deps(),
)
@@ -997,8 +998,8 @@
("i8", "u8", "i32", "i32"),
],
deps = [
- "@com_google_googletest//:gtest_main",
"//ruy:test_lib",
+ "@com_google_googletest//:gtest_main",
],
)
@@ -1015,8 +1016,8 @@
],
tags = ["slow"],
deps = [
- "@com_google_googletest//:gtest_main",
"//ruy:test_lib",
+ "@com_google_googletest//:gtest_main",
],
)
@@ -1030,7 +1031,7 @@
("u8", "u8", "i32", "i16"),
],
deps = [
- "@com_google_googletest//:gtest_main",
"//ruy:test_lib",
+ "@com_google_googletest//:gtest_main",
],
)
diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc
new file mode 100644
index 0000000..793ba7b
--- /dev/null
+++ b/ruy/cpuinfo.cc
@@ -0,0 +1,61 @@
+#include "ruy/cpuinfo.h"
+
+#include "ruy/platform.h"
+
+#define RUY_HAVE_CPUINFO (!(RUY_PLATFORM_PPC || RUY_PLATFORM_FUCHSIA))
+
+#if RUY_HAVE_CPUINFO
+
+#include <cpuinfo.h>
+
+namespace ruy {
+
+CpuInfo::~CpuInfo() {
+ if (init_status_ == InitStatus::kInitialized) {
+ cpuinfo_deinitialize();
+ }
+}
+
+bool CpuInfo::EnsureInitialized() {
+ if (init_status_ == InitStatus::kNotYetAttempted) {
+ init_status_ =
+ cpuinfo_initialize() ? InitStatus::kInitialized : InitStatus::kFailed;
+ }
+ return init_status_ == InitStatus::kInitialized;
+}
+
+bool CpuInfo::NeonDotprod() {
+ return EnsureInitialized() && cpuinfo_has_arm_neon_dot();
+}
+
+bool CpuInfo::Sse42() {
+ return EnsureInitialized() && cpuinfo_has_x86_sse4_2();
+}
+
+bool CpuInfo::Avx2() { return EnsureInitialized() && cpuinfo_has_x86_avx2(); }
+
+bool CpuInfo::Avx512() {
+ return EnsureInitialized() && cpuinfo_has_x86_avx512f() &&
+ cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512cd() &&
+ cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512vl();
+}
+
+bool CpuInfo::AvxVnni() {
+ return EnsureInitialized() && cpuinfo_has_x86_avx512vnni();
+}
+
+} // namespace ruy
+
+#else // not RUY_HAVE_CPUINFO
+
+namespace ruy {
+CpuInfo::~CpuInfo() {}
+bool CpuInfo::EnsureInitialized() { return false; }
+bool CpuInfo::NeonDotprod() { return false; }
+bool CpuInfo::Sse42() { return false; }
+bool CpuInfo::Avx2() { return false; }
+bool CpuInfo::Avx512() { return false; }
+bool CpuInfo::AvxVnni() { return false; }
+} // namespace ruy
+
+#endif
diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h
new file mode 100644
index 0000000..0a3de28
--- /dev/null
+++ b/ruy/cpuinfo.h
@@ -0,0 +1,49 @@
+/* Copyright 2020 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RUY_RUY_CPUINFO_H_
+#define RUY_RUY_CPUINFO_H_
+
+namespace ruy {
+
+// Wraps the functionality that ruy needs from the cpuinfo library.
+class CpuInfo final {
+ public:
+ CpuInfo() {}
+ ~CpuInfo();
+
+ // ARM features
+ bool NeonDotprod();
+
+ // X86 features
+ bool Sse42();
+ bool Avx2();
+ bool Avx512();
+ bool AvxVnni();
+
+ private:
+ enum class InitStatus {
+ kNotYetAttempted,
+ kInitialized,
+ kFailed,
+ };
+ InitStatus init_status_ = InitStatus::kNotYetAttempted;
+ bool EnsureInitialized();
+ CpuInfo(const CpuInfo&) = delete;
+};
+
+} // namespace ruy
+
+#endif // RUY_RUY_CPUINFO_H_
diff --git a/ruy/ctx.cc b/ruy/ctx.cc
index cbb855d..5a01bd2 100644
--- a/ruy/ctx.cc
+++ b/ruy/ctx.cc
@@ -18,9 +18,8 @@
#include <functional>
#include "ruy/check_macros.h"
+#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
-#include "ruy/detect_arm.h"
-#include "ruy/detect_x86.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/platform.h"
@@ -47,13 +46,15 @@
mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
}
+CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }
+
namespace {
// For each Path bit set in `paths_to_test`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input
// `paths_to_detect` value are also left not set in the return value.
-Path DetectRuntimeSupportedPaths(Path paths_to_detect) {
+Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
// Paths in kNonArchPaths are always implicitly supported.
// Further logic below may add more bits to `results`.
Path result = kNonArchPaths;
@@ -84,20 +85,21 @@
// build it at the moment. That is largely because we have had to machine
// encode dotprod instructions, so we don't actually rely on toolchain support
// for them.
- maybe_add(Path::kNeonDotprod, []() { return DetectDotprod(); });
+ maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
// x86 SIMD paths currently require both runtime detection, and detection of
// whether we're building the path at all.
maybe_add(Path::kSse42,
- []() { return HaveBuiltPathForSse42() && DetectCpuSse42(); });
+ [=]() { return HaveBuiltPathForSse42() && cpuinfo->Sse42(); });
maybe_add(Path::kAvx2,
- []() { return HaveBuiltPathForAvx2() && DetectCpuAvx2(); });
+ [=]() { return HaveBuiltPathForAvx2() && cpuinfo->Avx2(); });
maybe_add(Path::kAvx512,
- []() { return HaveBuiltPathForAvx512() && DetectCpuAvx512(); });
+ [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
maybe_add(Path::kAvxVnni,
- []() { return HaveBuiltPathForAvxVnni() && DetectCpuAvxVnni(); });
+ [=]() { return HaveBuiltPathForAvxVnni() && cpuinfo->AvxVnni(); });
#else
(void)maybe_add;
+ (void)cpuinfo;
#endif
// Sanity checks
@@ -116,7 +118,7 @@
// The value Path::kNone indicates the initial state before detection has been
// performed.
if (*paths == Path::kNone) {
- *paths = DetectRuntimeSupportedPaths(kAllPaths);
+ *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
}
return *paths;
diff --git a/ruy/ctx.h b/ruy/ctx.h
index 1fb8fda..0d731e4 100644
--- a/ruy/ctx.h
+++ b/ruy/ctx.h
@@ -28,6 +28,7 @@
class Allocator;
class TuningResolver;
class PrepackedCache;
+class CpuInfo;
enum class Path : std::uint8_t;
enum class Tuning;
@@ -47,6 +48,7 @@
ThreadPool* mutable_thread_pool();
int max_num_threads() const;
void set_max_num_threads(int value);
+ CpuInfo* mutable_cpuinfo();
// Returns the set of Path's that are available. By default, this is based on
// runtime detection of CPU features, as well as on which code paths were
diff --git a/ruy/ctx_impl.h b/ruy/ctx_impl.h
index d182518..e297ca7 100644
--- a/ruy/ctx_impl.h
+++ b/ruy/ctx_impl.h
@@ -24,6 +24,7 @@
#include <vector>
#include "ruy/allocator.h"
+#include "ruy/cpuinfo.h"
#include "ruy/ctx.h"
#include "ruy/path.h"
#include "ruy/prepacked_cache.h"
@@ -66,9 +67,11 @@
// this allocator, and its per-thread allocator.
std::unique_ptr<Allocator> main_allocator_;
std::unique_ptr<PrepackedCache> prepacked_cache_;
- // Set of Paths detected at runtime to be supported. The initial value kNone
+ // Set of Paths enabled at runtime. By default, that is based on runtime
+ // detection, but may be overridden. The initial value kNone
// means that detection has not yet been performed.
Path runtime_enabled_paths_ = Path::kNone;
+ CpuInfo cpuinfo_;
// State for each thread in the thread pool. Entry 0 is the main thread.
std::vector<std::unique_ptr<ThreadSpecificResource>>
thread_specific_resources_;
diff --git a/ruy/detect_arm.cc b/ruy/detect_arm.cc
deleted file mode 100644
index 37f30df..0000000
--- a/ruy/detect_arm.cc
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/* Temporary dotprod-detection until we can rely on proper feature-detection
-such as getauxval on Linux (requires a newer Linux kernel than we can
-currently rely on on Android).
-
-There are two main ways that this could be implemented: using a signal
-handler or a fork. The current implementation uses a signal handler.
-This is because on current Android, an uncaught signal gives a latency
-of over 100 ms. In order for the fork approach to be worthwhile, it would
-have to save us the hassle of handling signals, and such an approach thus
-has an unavoidable 100ms latency. By contrast, the present signal-handling
-approach has low latency.
-
-Downsides of the current signal-handling approach include:
- 1. Setting and restoring signal handlers is not thread-safe: we can't
- prevent another thread from interfering with us. We at least prevent
- other threads from calling our present code concurrently by using a lock,
- but we can't do anything about other threads using their own code to
- set signal handlers.
- 2. Signal handlers are not entirely portable, e.g. b/132973173 showed that
- on Apple platform the EXC_BAD_INSTRUCTION signal is not always caught
- by a SIGILL handler (difference between Release and Debug builds).
- 3. The signal handler approach looks confusing in a debugger (has to
- tell the debugger to 'continue' past the signal every time). Fix:
- ```
- (gdb) handle SIGILL nostop noprint pass
- ```
-
-Here is what the nicer fork-based alternative would look like.
-Its only downside, as discussed above, is high latency, 100 ms on Android.
-
-```
-bool TryAsmSnippet(bool (*asm_snippet)()) {
- int child_pid = fork();
- if (child_pid == -1) {
- // Fork failed.
- return false;
- }
- if (child_pid == 0) {
- // Child process code path. Pass the raw boolean return value of
- // asm_snippet as exit code (unconventional: 1 means true == success).
- _exit(asm_snippet());
- }
-
- int child_status;
- waitpid(child_pid, &child_status, 0);
- if (WIFSIGNALED(child_status)) {
- // Child process terminated by signal, meaning the instruction was
- // not supported.
- return false;
- }
- // Return the exit code of the child, which per child code above was
- // the return value of asm_snippet().
- return WEXITSTATUS(child_status);
-}
-```
-*/
-
-#include "ruy/detect_arm.h"
-
-#if (defined __linux__) && (defined __aarch64__)
-#define RUY_DETECT_DOTPROD
-#endif
-
-#ifdef RUY_DETECT_DOTPROD
-
-#include <setjmp.h>
-#include <signal.h>
-
-#include <cstdlib>
-#include <cstring>
-#include <mutex>
-
-#ifdef __linux__
-#include <sys/auxv.h>
-#endif
-
-#endif
-
-namespace ruy {
-
-#ifdef RUY_DETECT_DOTPROD
-
-namespace {
-
-// long-jump buffer used to continue execution after a caught SIGILL.
-sigjmp_buf global_sigjmp_buf;
-
-// Signal handler. Long-jumps to just before
-// we ran the snippet that we know is the only thing that could have generated
-// the SIGILL.
-void SignalHandler(int) { siglongjmp(global_sigjmp_buf, 1); }
-
-// RAII helper for calling sigprocmask to unblock all signals temporarily.
-class ScopeUnblockSignals final {
- public:
- ScopeUnblockSignals() {
- sigset_t procmask;
- sigemptyset(&procmask);
- success_ = !sigprocmask(SIG_SETMASK, &procmask, &old_procmask_);
- }
- ~ScopeUnblockSignals() {
- if (success_) {
- sigprocmask(SIG_SETMASK, &old_procmask_, nullptr);
- }
- }
- bool success() const { return success_; }
-
- private:
- sigset_t old_procmask_;
- bool success_ = false;
-};
-
-// RAII helper to install and uninstall a signal handler.
-class ScopeSigaction final {
- public:
- ScopeSigaction(int signal_number, void (*handler_function)(int))
- : signal_number_(signal_number) {
- struct sigaction action;
- memset(&action, 0, sizeof(action));
- sigemptyset(&action.sa_mask);
- action.sa_handler = handler_function;
- success_ = !sigaction(signal_number_, &action, &old_action_);
- }
- ~ScopeSigaction() {
- if (success_) {
- sigaction(signal_number_, &old_action_, nullptr);
- }
- }
- bool success() const { return success_; }
-
- private:
- const int signal_number_;
- struct sigaction old_action_;
- bool success_ = false;
-};
-
-// Try an asm snippet. Returns true if it passed i.e. ran without generating
-// an illegal-instruction signal and returned true. Returns false otherwise.
-bool TryAsmSnippet(bool (*asm_snippet)()) {
- // This function installs and restores signal handlers and the signal-blocking
- // mask. We can't prevent another thread from interfering, but we can at least
- // put a big lock here so that it works if, for whatever reason, another
- // thread calls this function concurrently.
- static std::mutex mutex;
- std::lock_guard<std::mutex> lock(mutex);
-
- ScopeUnblockSignals unblock_signals;
- if (!unblock_signals.success()) {
- return false;
- }
- ScopeSigaction handle_sigill(SIGILL, SignalHandler);
- if (!handle_sigill.success()) {
- return false;
- }
-
- // Set the long jump buffer to this point in the code. This normally returns
- // 0 so we don't take this branch...
- if (sigsetjmp(global_sigjmp_buf, false)) {
- // ... except in the fake return from sigsetjmp that is produced when
- // the long-jump back to here actually happened, that is, in the signal
- // handler. In this case, we know that the asm_snippet triggered an illegal
- // instruction signal, so we return false.
- return false;
- }
-
- return asm_snippet();
-}
-
-bool DotprodAsmSnippet() {
- // maratek@ mentioned that for some other ISA extensions (fp16)
- // there have been implementations that failed to generate SIGILL even
- // though they did not correctly implement the instruction. Just in case
- // a similar situation might exist here, we do a simple correctness test.
- int result = 0;
- asm volatile(
- "mov w0, #100\n"
- "dup v0.16b, w0\n"
- "dup v1.4s, w0\n"
- ".word 0x6e809401 // udot v1.4s, v0.16b, v0.16b\n"
- "mov %w[result], v1.s[0]\n"
- : [result] "=r"(result)
- :
- : "x0", "v0", "v1");
- // Expecting 100 (input accumulator value) + 100 * 100 + ... (repeat 4 times)
- return result == 40100;
-}
-
-bool DetectDotprodBySignalMethod() { return TryAsmSnippet(DotprodAsmSnippet); }
-
-#ifdef __linux__
-bool DetectDotprodByLinuxAuxvMethod() {
- // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
- // however we need to support building against older headers for the time
- // being.
- const int kLocalHwcapAsimddp = 1 << 20;
- return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
-}
-#endif
-
-} // namespace
-
-bool DetectDotprod() {
-#ifdef __linux__
- // We always try the auxv method and don't try to check the linux version
- // before. It's only in the mainline linux tree from 4.14.151, but it's been
- // backported to earlier linux versions in Android vendor device trees.
- // The cost of just trying this is near zero, and the benefit is large
- // as the signal method has higher latency and a substantial crash potential.
- if (DetectDotprodByLinuxAuxvMethod()) {
- return true;
- }
-#endif
-
- return DetectDotprodBySignalMethod();
-}
-
-#else // not defined RUY_DETECT_DOTPROD
-bool DetectDotprod() { return false; }
-#endif // defined RUY_DETECT_DOTPROD
-
-} // namespace ruy
diff --git a/ruy/detect_arm.h b/ruy/detect_arm.h
deleted file mode 100644
index 937a067..0000000
--- a/ruy/detect_arm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Temporary dotprod-detection code until we can rely on getauxval.
-
-#ifndef RUY_RUY_DETECT_ARM_H_
-#define RUY_RUY_DETECT_ARM_H_
-
-namespace ruy {
-
-// On A64, returns true if the dotprod extension is present.
-// On other architectures, returns false unconditionally.
-bool DetectDotprod();
-
-} // namespace ruy
-
-#endif // RUY_RUY_DETECT_ARM_H_
diff --git a/ruy/detect_x86.cc b/ruy/detect_x86.cc
deleted file mode 100644
index 9d0a3b6..0000000
--- a/ruy/detect_x86.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/detect_x86.h"
-
-#include <cstdint>
-
-#if RUY_PLATFORM_X86 && RUY_PLATFORM_X86_ENHANCEMENTS
-#include <immintrin.h> // IWYU pragma: keep
-
-#endif
-
-namespace ruy {
-#if RUY_PLATFORM_X86 && RUY_PLATFORM_X86_ENHANCEMENTS
-
-namespace {
-
-// See Intel docs, such as http://goo.gl/c6IkGX.
-inline void RunCpuid(std::uint32_t eax, std::uint32_t ecx,
- std::uint32_t abcd[4]) {
- std::uint32_t ebx = 0, edx;
-#if defined(__i386__) && defined(__PIC__)
- /* in case of PIC under 32-bit EBX cannot be clobbered */
- asm volatile("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
- : "=D"(ebx),
-#else
- asm volatile("cpuid"
- : "+b"(ebx),
-#endif
- "+a"(eax), "+c"(ecx), "=d"(edx));
- abcd[0] = eax;
- abcd[1] = ebx;
- abcd[2] = ecx;
- abcd[3] = edx;
-}
-
-} // namespace
-
-bool DetectCpuSse42() {
- std::uint32_t abcd[4];
-
- constexpr std::uint32_t kEcxSse42 = 1u << 20;
- RunCpuid(1, 0, abcd);
- const bool has_sse4_2_base = (abcd[2] & kEcxSse42) == kEcxSse42;
-
-#ifdef RUY_ENABLE_AMD_CPUID_CHECKS
- constexpr std::uint32_t kEcxAbm = 1u << 5;
- RunCpuid(0x80000001, 0, abcd);
- const bool has_extras = (abcd[2] & kEcxAbm) == kEcxAbm;
-#else
- constexpr std::uint32_t kEcxPopcnt = 1u << 23;
- RunCpuid(1, 0, abcd);
- const bool has_extras = (abcd[2] & kEcxPopcnt) == kEcxPopcnt;
-#endif
-
- return has_sse4_2_base && has_extras;
-}
-
-bool DetectCpuAvx2() {
- constexpr std::uint32_t kEbxAvx2 = 1u << 5;
- constexpr std::uint32_t kEcxFma = 1u << 12;
-
- std::uint32_t abcd[4];
-
- RunCpuid(7, 0, abcd);
- const bool has_avx2 = (abcd[1] & kEbxAvx2) == kEbxAvx2;
- RunCpuid(1, 0, abcd);
- const bool has_fma = (abcd[2] & kEcxFma) == kEcxFma;
-
- return has_avx2 && has_fma;
-}
-
-bool DetectCpuAvx512() {
- constexpr std::uint32_t kEbxAvx512F = 1u << 16;
- constexpr std::uint32_t kEbxAvx512Dq = 1u << 17;
- constexpr std::uint32_t kEbxAvx512Cd = 1u << 28;
- constexpr std::uint32_t kEbxAvx512Bw = 1u << 30;
- constexpr std::uint32_t kEbxAvx512Vl = 1u << 31;
-
- constexpr std::uint32_t kEbxAvx512Mask =
- kEbxAvx512F | kEbxAvx512Dq | kEbxAvx512Cd | kEbxAvx512Bw | kEbxAvx512Vl;
- std::uint32_t abcd[4];
- RunCpuid(7, 0, abcd);
-
- return (abcd[1] & kEbxAvx512Mask) == kEbxAvx512Mask;
-}
-
-#endif
-} // namespace ruy
diff --git a/ruy/detect_x86.h b/ruy/detect_x86.h
deleted file mode 100644
index 68b6202..0000000
--- a/ruy/detect_x86.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef RUY_RUY_DETECT_X86_H_
-#define RUY_RUY_DETECT_X86_H_
-
-#include "ruy/platform.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-#if RUY_PLATFORM_X86_ENHANCEMENTS
-
-// This also checks ABM support, which implies LZCNT and POPCNT.
-bool DetectCpuSse42();
-bool DetectCpuAvx2();
-bool DetectCpuAvx512();
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// TODO(b/146646451): Introduce and activate.
-inline bool DetectCpuAvxVnni() { return false; }
-
-#else // RUY_PLATFORM_X86_ENHANCEMENTS
-
-inline bool DetectCpuSse42() { return false; }
-inline bool DetectCpuAvx2() { return false; }
-inline bool DetectCpuAvx512() { return false; }
-inline bool DetectCpuAvxVnni() { return false; }
-
-#endif // !RUY_PLATFORM_X86_ENHANCEMENTS
-#endif // RUY_PLATFORM_X86
-
-} // namespace ruy
-
-#endif // RUY_RUY_DETECT_X86_H_
diff --git a/ruy/platform.h b/ruy/platform.h
index 9977b04..2f9cbb3 100644
--- a/ruy/platform.h
+++ b/ruy/platform.h
@@ -32,6 +32,20 @@
#define RUY_PLATFORM_APPLE 0
#endif
+// Detect APPLE.
+#ifdef __ppc__
+#define RUY_PLATFORM_PPC 1
+#else
+#define RUY_PLATFORM_PPC 0
+#endif
+
+// Detect Fuchsia
+#ifdef __Fuchsia__
+#define RUY_PLATFORM_FUCHSIA 1
+#else
+#define RUY_PLATFORM_FUCHSIA 0
+#endif
+
// Architecture-level platform detection.
//
// Ruy requires these to be mutually exclusive.
diff --git a/third_party/BUILD b/third_party/BUILD
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/third_party/BUILD
diff --git a/third_party/clog.BUILD b/third_party/clog.BUILD
new file mode 100644
index 0000000..84cc3a2
--- /dev/null
+++ b/third_party/clog.BUILD
@@ -0,0 +1,38 @@
+# Description:
+# C-style (a-la printf) logging library
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+ name = "clog",
+ srcs = [
+ "deps/clog/src/clog.c",
+ ],
+ copts = select({
+ ":windows": [],
+ "//conditions:default": ["-Wno-unused-result"],
+ }),
+ hdrs = [
+ "deps/clog/include/clog.h",
+ ],
+ linkopts = select({
+ ":android": ["-llog"],
+ "//conditions:default": [],
+ }),
+ linkstatic = True,
+ strip_include_prefix = "deps/clog/include",
+)
+
+config_setting(
+ name = "android",
+ values = {"crosstool_top": "//external:android/crosstool"},
+)
+
+config_setting(
+ name = "windows",
+ values = {"cpu": "x64_windows"},
+)
diff --git a/third_party/cpuinfo.BUILD b/third_party/cpuinfo.BUILD
new file mode 100644
index 0000000..ad120de
--- /dev/null
+++ b/third_party/cpuinfo.BUILD
@@ -0,0 +1,326 @@
+# cpuinfo, a library to detect information about the host CPU
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+C99OPTS = [
+ "-std=gnu99", # gnu99, not c99, because dprintf is used
+ "-Wno-vla",
+ "-D_GNU_SOURCE=1", # to use CPU_SETSIZE
+ "-DCPUINFO_INTERNAL=",
+ "-DCPUINFO_PRIVATE=",
+]
+
+# Source code common to all platforms.
+COMMON_SRCS = [
+ "src/api.c",
+ "src/init.c",
+ "src/cache.c",
+]
+
+# Architecture-specific sources and headers.
+X86_SRCS = [
+ "src/x86/cache/descriptor.c",
+ "src/x86/cache/deterministic.c",
+ "src/x86/cache/init.c",
+ "src/x86/info.c",
+ "src/x86/init.c",
+ "src/x86/isa.c",
+ "src/x86/name.c",
+ "src/x86/topology.c",
+ "src/x86/uarch.c",
+ "src/x86/vendor.c",
+]
+
+ARM_SRCS = [
+ "src/arm/cache.c",
+ "src/arm/uarch.c",
+]
+
+# Platform-specific sources and headers
+LINUX_SRCS = [
+ "src/linux/cpulist.c",
+ "src/linux/multiline.c",
+ "src/linux/processors.c",
+ "src/linux/smallfile.c",
+]
+
+MOCK_LINUX_SRCS = [
+ "src/linux/mockfile.c",
+]
+
+MACH_SRCS = [
+ "src/mach/topology.c",
+]
+
+EMSCRIPTEN_SRCS = [
+ "src/emscripten/init.c",
+]
+
+LINUX_X86_SRCS = [
+ "src/x86/linux/cpuinfo.c",
+ "src/x86/linux/init.c",
+]
+
+LINUX_ARM_SRCS = [
+ "src/arm/linux/chipset.c",
+ "src/arm/linux/clusters.c",
+ "src/arm/linux/cpuinfo.c",
+ "src/arm/linux/hwcap.c",
+ "src/arm/linux/init.c",
+ "src/arm/linux/midr.c",
+]
+
+LINUX_ARM32_SRCS = LINUX_ARM_SRCS + ["src/arm/linux/aarch32-isa.c"]
+
+LINUX_ARM64_SRCS = LINUX_ARM_SRCS + ["src/arm/linux/aarch64-isa.c"]
+
+ANDROID_ARM_SRCS = [
+ "src/arm/android/properties.c",
+]
+
+WINDOWS_X86_SRCS = [
+ "src/x86/windows/init.c",
+]
+
+MACH_X86_SRCS = [
+ "src/x86/mach/init.c",
+]
+
+MACH_ARM_SRCS = [
+ "src/arm/mach/init.c",
+]
+
+cc_library(
+ name = "cpuinfo_impl",
+ srcs = select({
+ ":linux_x86_64": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+ ":linux_arm": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS,
+ ":linux_armhf": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS,
+ ":linux_aarch64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS,
+ ":macos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":windows_x86_64": COMMON_SRCS + X86_SRCS + WINDOWS_X86_SRCS,
+ ":android_armv7": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS + ANDROID_ARM_SRCS,
+ ":android_arm64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS + ANDROID_ARM_SRCS,
+ ":android_x86": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+ ":android_x86_64": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+ ":ios_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":ios_x86": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":ios_armv7": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":ios_arm64": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":ios_arm64e": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":watchos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":watchos_x86": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":watchos_armv7k": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":watchos_arm64_32": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":tvos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+ ":tvos_arm64": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+ ":emscripten": COMMON_SRCS + EMSCRIPTEN_SRCS,
+ }),
+ copts = select({
+ ":windows_x86_64": [],
+ "//conditions:default": C99OPTS,
+ }) + [
+ "-Iexternal/cpuinfo/include",
+ "-Iexternal/cpuinfo/src",
+ ],
+ linkstatic = True,
+ # Headers must be in textual_hdrs to allow us to set the standard to C99
+ textual_hdrs = [
+ "include/cpuinfo.h",
+ "src/linux/api.h",
+ "src/mach/api.h",
+ "src/cpuinfo/common.h",
+ "src/cpuinfo/internal-api.h",
+ "src/cpuinfo/log.h",
+ "src/cpuinfo/utils.h",
+ "src/x86/api.h",
+ "src/x86/cpuid.h",
+ "src/x86/linux/api.h",
+ "src/arm/android/api.h",
+ "src/arm/linux/api.h",
+ "src/arm/linux/cp.h",
+ "src/arm/api.h",
+ "src/arm/midr.h",
+ ],
+ deps = [
+ "@clog",
+ ],
+)
+
+cc_library(
+ name = "cpuinfo",
+ hdrs = [
+ "include/cpuinfo.h",
+ ],
+ strip_include_prefix = "include",
+ deps = [
+ ":cpuinfo_impl",
+ ],
+)
+
+############################# Build configurations #############################
+
+config_setting(
+ name = "linux_x86_64",
+ values = {"cpu": "k8"},
+)
+
+config_setting(
+ name = "linux_arm",
+ values = {"cpu": "arm"},
+)
+
+config_setting(
+ name = "linux_armhf",
+ values = {"cpu": "armhf"},
+)
+
+config_setting(
+ name = "linux_aarch64",
+ values = {"cpu": "aarch64"},
+)
+
+config_setting(
+ name = "macos_x86_64",
+ values = {
+ "apple_platform_type": "macos",
+ "cpu": "darwin",
+ },
+)
+
+config_setting(
+ name = "windows_x86_64",
+ values = {"cpu": "x64_windows"},
+)
+
+config_setting(
+ name = "android_armv7",
+ values = {
+ "crosstool_top": "//external:android/crosstool",
+ "cpu": "armeabi-v7a",
+ },
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
+ name = "android_arm64",
+ values = {
+ "crosstool_top": "//external:android/crosstool",
+ "cpu": "arm64-v8a",
+ },
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
+ name = "android_x86",
+ values = {
+ "crosstool_top": "//external:android/crosstool",
+ "cpu": "x86",
+ },
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
+ name = "android_x86_64",
+ values = {
+ "crosstool_top": "//external:android/crosstool",
+ "cpu": "x86_64",
+ },
+ visibility = ["//visibility:public"],
+)
+
+config_setting(
+ name = "ios_armv7",
+ values = {
+ "apple_platform_type": "ios",
+ "cpu": "ios_armv7",
+ },
+)
+
+config_setting(
+ name = "ios_arm64",
+ values = {
+ "apple_platform_type": "ios",
+ "cpu": "ios_arm64",
+ },
+)
+
+config_setting(
+ name = "ios_arm64e",
+ values = {
+ "apple_platform_type": "ios",
+ "cpu": "ios_arm64e",
+ },
+)
+
+config_setting(
+ name = "ios_x86",
+ values = {
+ "apple_platform_type": "ios",
+ "cpu": "ios_i386",
+ },
+)
+
+config_setting(
+ name = "ios_x86_64",
+ values = {
+ "apple_platform_type": "ios",
+ "cpu": "ios_x86_64",
+ },
+)
+
+config_setting(
+ name = "watchos_armv7k",
+ values = {
+ "apple_platform_type": "watchos",
+ "cpu": "watchos_armv7k",
+ },
+)
+
+config_setting(
+ name = "watchos_arm64_32",
+ values = {
+ "apple_platform_type": "watchos",
+ "cpu": "watchos_arm64_32",
+ },
+)
+
+config_setting(
+ name = "watchos_x86",
+ values = {
+ "apple_platform_type": "watchos",
+ "cpu": "watchos_i386",
+ },
+)
+
+config_setting(
+ name = "watchos_x86_64",
+ values = {
+ "apple_platform_type": "watchos",
+ "cpu": "watchos_x86_64",
+ },
+)
+
+config_setting(
+ name = "tvos_arm64",
+ values = {
+ "apple_platform_type": "tvos",
+ "cpu": "tvos_arm64",
+ },
+)
+
+config_setting(
+ name = "tvos_x86_64",
+ values = {
+ "apple_platform_type": "tvos",
+ "cpu": "tvos_x86_64",
+ },
+)
+
+config_setting(
+ name = "emscripten",
+ values = {"crosstool_top": "//toolchain:emscripten"},
+)
diff --git a/third_party/cpuinfo.patch b/third_party/cpuinfo.patch
new file mode 100644
index 0000000..eb57412
--- /dev/null
+++ b/third_party/cpuinfo.patch
@@ -0,0 +1,503 @@
+diff --git src/arm/cache.c src/arm/cache.c
+index 1a8bf91..666ad78 100644
+--- src/arm/cache.c
++++ src/arm/cache.c
+@@ -635,6 +635,13 @@ void cpuinfo_arm_decode_cache(
+ break;
+ }
+ break;
++ case cpuinfo_arm_chipset_series_broadcom_bcm:
++ switch (chipset->model) {
++ case 2837: /* BCM2837 */
++ l2_size = 512 * 1024;
++ break;
++ }
++ break;
+ case cpuinfo_arm_chipset_series_samsung_exynos:
+ l1_size = 32 * 1024;
+ break;
+@@ -922,11 +929,13 @@ void cpuinfo_arm_decode_cache(
+ * | MediaTek Helio X23 | 2(+4+4) | ? | ? | ? | |
+ * | MediaTek Helio X25 | 2(+4+4) | ? | ? | ? | |
+ * | MediaTek Helio X27 | 2(+4+4) | ? | ? | ? | |
++ * | Broadcom BCM2711 | 4 | 32K | 48K | 1M | [4] |
+ * +---------------------+---------+-----------+-----------+------------+-----------+
+ *
+ * [1] http://pdadb.net/index.php?m=processor&id=578&c=qualcomm_snapdragon_618_msm8956__snapdragon_650
+ * [2] http://pdadb.net/index.php?m=processor&id=667&c=qualcomm_snapdragon_620_apq8076__snapdragon_652
+ * [3] http://pdadb.net/index.php?m=processor&id=692&c=qualcomm_snapdragon_653_msm8976sg__msm8976_pro
++ * [4] https://www.raspberrypi.org/documentation/hardware/raspberrypi/bcm2711/README.md
+ */
+ uint32_t l2_size;
+ switch (chipset->series) {
+diff --git src/arm/linux/aarch32-isa.c src/arm/linux/aarch32-isa.c
+index 6aedda3..64dd168 100644
+--- src/arm/linux/aarch32-isa.c
++++ src/arm/linux/aarch32-isa.c
+@@ -77,18 +77,24 @@ void cpuinfo_arm_linux_decode_isa_from_proc_cpuinfo(
+
+ /*
+ * NEON VDOT instructions are not indicated in /proc/cpuinfo.
+- * Use a MIDR-based heuristic to whitelist processors known to support it:
+- * - Processors with Qualcomm-modified Cortex-A76 cores
+- * - Kirin 980 processor
++ * Use a MIDR-based heuristic to whitelist processors known to support it.
+ */
+ switch (midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) {
++ case UINT32_C(0x4100D0B0): /* Cortex-A76 */
++ case UINT32_C(0x4100D0D0): /* Cortex-A77 */
++ case UINT32_C(0x4100D0E0): /* Cortex-A76AE */
++ case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */
+ case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */
++ case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */
++ case UINT32_C(0x53000030): /* Exynos-M4 */
++ case UINT32_C(0x53000040): /* Exynos-M5 */
+ isa->dot = true;
+ break;
+- default:
+- if (chipset->series == cpuinfo_arm_chipset_series_hisilicon_kirin && chipset->model == 980) {
+- isa->dot = true;
+- }
++ case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */
++ isa->dot = !!(midr_get_variant(midr) >= 1);
++ break;
++ case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */
++ isa->dot = !!(midr_get_variant(midr) >= 2);
+ break;
+ }
+ } else {
+diff --git src/arm/linux/aarch64-isa.c src/arm/linux/aarch64-isa.c
+index f193e81..619cda5 100644
+--- src/arm/linux/aarch64-isa.c
++++ src/arm/linux/aarch64-isa.c
+@@ -67,21 +67,32 @@ void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo(
+ }
+ /*
+ * Many phones ship with an old kernel configuration that doesn't report UDOT/SDOT instructions.
+- * Use a MIDR-based heuristic to whitelist processors known to support it:
+- * - Processors with Qualcomm-modified Cortex-A76 cores
+- * - Kirin 980 processor
++ * Use a MIDR-based heuristic to whitelist processors known to support it.
+ */
+ switch (midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) {
++ case UINT32_C(0x4100D060): /* Cortex-A65 */
++ case UINT32_C(0x4100D0B0): /* Cortex-A76 */
++ case UINT32_C(0x4100D0C0): /* Neoverse N1 */
++ case UINT32_C(0x4100D0D0): /* Cortex-A77 */
++ case UINT32_C(0x4100D0E0): /* Cortex-A76AE */
++ case UINT32_C(0x4100D4A0): /* Neoverse E1 */
++ case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */
+ case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */
++ case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */
++ case UINT32_C(0x53000030): /* Exynos-M4 */
++ case UINT32_C(0x53000040): /* Exynos-M5 */
+ isa->dot = true;
+ break;
++ case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */
++ isa->dot = !!(midr_get_variant(midr) >= 1);
++ break;
++ case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */
++ isa->dot = !!(midr_get_variant(midr) >= 2);
++ break;
+ default:
+ if (features & CPUINFO_ARM_LINUX_FEATURE_ASIMDDP) {
+ isa->dot = true;
+ }
+- if (chipset->series == cpuinfo_arm_chipset_series_hisilicon_kirin && chipset->model == 980) {
+- isa->dot = true;
+- }
+ break;
+ }
+ if (features & CPUINFO_ARM_LINUX_FEATURE_JSCVT) {
+diff --git src/arm/linux/api.h src/arm/linux/api.h
+index f99da66..2597e49 100644
+--- src/arm/linux/api.h
++++ src/arm/linux/api.h
+@@ -11,6 +11,8 @@
+
+ /* No hard limit in the kernel, maximum length observed on non-rogue kernels is 64 */
+ #define CPUINFO_HARDWARE_VALUE_MAX 64
++/* No hard limit in the kernel, maximum length on Raspberry Pi is 8. Add 1 symbol to detect overly large revision strings */
++#define CPUINFO_REVISION_VALUE_MAX 9
+
+ #ifdef __ANDROID__
+ /* As per include/sys/system_properties.h in Android NDK */
+@@ -259,6 +261,7 @@ static inline bool cpuinfo_arm_linux_processor_not_equals(
+
+ CPUINFO_INTERNAL bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++ char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ uint32_t max_processors_count,
+ struct cpuinfo_arm_linux_processor processors[restrict static max_processors_count]);
+
+@@ -297,6 +300,7 @@ CPUINFO_INTERNAL bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ cpuinfo_arm_linux_decode_chipset(
+ const char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++ const char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ uint32_t cores,
+ uint32_t max_cpu_freq_max);
+ #endif
+@@ -327,6 +331,10 @@ CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ cpuinfo_arm_android_decode_chipset_from_ro_hardware_chipname(
+ const char ro_hardware_chipname[restrict static CPUINFO_BUILD_PROP_VALUE_MAX]);
++#else
++ CPUINFO_INTERNAL struct cpuinfo_arm_chipset
++ cpuinfo_arm_linux_decode_chipset_from_proc_cpuinfo_revision(
++ const char proc_cpuinfo_revision[restrict static CPUINFO_REVISION_VALUE_MAX]);
+ #endif
+
+ CPUINFO_INTERNAL bool cpuinfo_arm_linux_detect_core_clusters_by_heuristic(
+diff --git src/arm/linux/chipset.c src/arm/linux/chipset.c
+index 35058d9..e36283c 100644
+--- src/arm/linux/chipset.c
++++ src/arm/linux/chipset.c
+@@ -1011,12 +1011,59 @@ write_chipset:
+ return true;
+ }
+
++/**
++ * Tries to match /BCM\d{4}$/ signature for Broadcom BCM chipsets.
++ * If match successful, extracts model information into \p chipset argument.
++ *
++ * @param start - start of the /proc/cpuinfo Hardware string to match.
++ * @param end - end of the /proc/cpuinfo Hardware string to match.
++ * @param[out] chipset - location where chipset information will be stored upon a successful match.
++ *
++ * @returns true if signature matched, false otherwise.
++ */
++static bool match_bcm(
++ const char* start, const char* end,
++ struct cpuinfo_arm_chipset chipset[restrict static 1])
++{
++ /* Expect exactly 7 symbols: "BCM" (3 symbols) + 4-digit model number */
++ if (start + 7 != end) {
++ return false;
++ }
++
++ /* Check that the string starts with "BCM".
++ * The first three characters are loaded and compared as a 24-bit little endian word.
++ */
++ const uint32_t expected_bcm = load_u24le(start);
++ if (expected_bcm != UINT32_C(0x004D4342) /* "MCB" = reverse("BCM") */) {
++ return false;
++ }
++
++ /* Validate and parse 4-digit model number */
++ uint32_t model = 0;
++ for (uint32_t i = 3; i < 7; i++) {
++ const uint32_t digit = (uint32_t) (uint8_t) start[i] - '0';
++ if (digit >= 10) {
++ /* Not really a digit */
++ return false;
++ }
++ model = model * 10 + digit;
++ }
++
++ /* Return parsed chipset. */
++ *chipset = (struct cpuinfo_arm_chipset) {
++ .vendor = cpuinfo_arm_chipset_vendor_broadcom,
++ .series = cpuinfo_arm_chipset_series_broadcom_bcm,
++ .model = model,
++ };
++ return true;
++}
++
+ /**
+ * Tries to match /OMAP\d{4}$/ signature for Texas Instruments OMAP chipsets.
+ * If match successful, extracts model information into \p chipset argument.
+ *
+ * @param start - start of the /proc/cpuinfo Hardware string to match.
+- * @param end - end of the /proc/cpuinfo Hardaware string to match.
++ * @param end - end of the /proc/cpuinfo Hardware string to match.
+ * @param[out] chipset - location where chipset information will be stored upon a successful match.
+ *
+ * @returns true if signature matched, false otherwise.
+@@ -2328,6 +2375,14 @@ struct cpuinfo_arm_chipset cpuinfo_arm_linux_decode_chipset_from_proc_cpuinfo_ha
+ return chipset;
+ }
+
++ /* Check Broadcom BCM signature */
++ if (match_bcm(hardware, hardware_end, &chipset)) {
++ cpuinfo_log_debug(
++ "matched Broadcom BCM signature in /proc/cpuinfo Hardware string \"%.*s\"",
++ (int) hardware_length, hardware);
++ return chipset;
++ }
++
+ #if CPUINFO_ARCH_ARM
+ /* Check Texas Instruments OMAP signature */
+ if (match_omap(hardware, hardware_end, &chipset)) {
+@@ -3713,6 +3768,62 @@ void cpuinfo_arm_chipset_to_string(
+ return chipset;
+ }
+ #else /* !defined(__ANDROID__) */
++ /*
++ * Fix commonly misreported Broadcom BCM models on Raspberry Pi boards.
++ *
++ * @param[in,out] chipset - chipset name to fix.
++ * @param[in] revision - /proc/cpuinfo Revision string.
++ */
++ void cpuinfo_arm_fixup_raspberry_pi_chipset(
++ struct cpuinfo_arm_chipset chipset[restrict static 1],
++ const char revision[restrict static CPUINFO_HARDWARE_VALUE_MAX])
++ {
++ const size_t revision_length = strnlen(revision, CPUINFO_REVISION_VALUE_MAX);
++
++ /* Parse revision codes according to https://www.raspberrypi.org/documentation/hardware/raspberrypi/revision-codes/README.md */
++ #if CPUINFO_ARCH_ARM
++ if (revision_length == 4) {
++ /*
++ * Old-style revision codes.
++ * All Raspberry Pi models with old-style revision code use Broadcom BCM2835.
++ */
++
++ /* BCM2835 often misreported as BCM2708 */
++ if (chipset->model == 2708) {
++ chipset->model = 2835;
++ }
++ return;
++ }
++ #endif
++ if ((size_t) (revision_length - 5) <= (size_t) (8 - 5) /* 5 <= length(revision) <= 8 */) {
++ /* New-style revision codes */
++
++ uint32_t model = 0;
++ switch (revision[revision_length - 4]) {
++ case '0':
++ /* BCM2835 */
++ model = 2835;
++ break;
++ case '1':
++ /* BCM2836 */
++ model = 2836;
++ break;
++ case '2':
++ /* BCM2837 */
++ model = 2837;
++ break;
++ case '3':
++ /* BCM2711 */
++ model = 2711;
++ break;
++ }
++
++ if (model != 0) {
++ chipset->model = model;
++ chipset->suffix[0] = 0;
++ }
++ }
++ }
+
+ /*
+ * Decodes chipset name from /proc/cpuinfo Hardware string.
+@@ -3727,6 +3838,7 @@ void cpuinfo_arm_chipset_to_string(
+ */
+ struct cpuinfo_arm_chipset cpuinfo_arm_linux_decode_chipset(
+ const char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++ const char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ uint32_t cores,
+ uint32_t max_cpu_freq_max)
+ {
+@@ -3736,6 +3848,9 @@ void cpuinfo_arm_chipset_to_string(
+ if (chipset.vendor == cpuinfo_arm_chipset_vendor_unknown) {
+ cpuinfo_log_warning(
+ "chipset detection failed: /proc/cpuinfo Hardware string did not match known signatures");
++ } else if (chipset.vendor == cpuinfo_arm_chipset_vendor_broadcom) {
++ /* Raspberry Pi kernel reports bogus chipset models; detect chipset from RPi revision */
++ cpuinfo_arm_fixup_raspberry_pi_chipset(&chipset, revision);
+ } else {
+ cpuinfo_arm_fixup_chipset(&chipset, cores, max_cpu_freq_max);
+ }
+diff --git src/arm/linux/cpuinfo.c src/arm/linux/cpuinfo.c
+index c70055f..90e1631 100644
+--- src/arm/linux/cpuinfo.c
++++ src/arm/linux/cpuinfo.c
+@@ -631,6 +631,7 @@ static void parse_cache_number(
+
+ struct proc_cpuinfo_parser_state {
+ char* hardware;
++ char* revision;
+ uint32_t processor_index;
+ uint32_t max_processors_count;
+ struct cpuinfo_arm_linux_processor* processors;
+@@ -791,7 +792,17 @@ static bool parse_line(
+ memcpy(state->hardware, value_start, value_length);
+ cpuinfo_log_debug("parsed /proc/cpuinfo Hardware = \"%.*s\"", (int) value_length, value_start);
+ } else if (memcmp(line_start, "Revision", key_length) == 0) {
+- /* Board revision, no use for now */
++ size_t value_length = value_end - value_start;
++ if (value_length > CPUINFO_REVISION_VALUE_MAX) {
++ cpuinfo_log_info(
++ "length of Revision value \"%.*s\" in /proc/cpuinfo exceeds limit (%d): truncating to the limit",
++ (int) value_length, value_start, CPUINFO_REVISION_VALUE_MAX);
++ value_length = CPUINFO_REVISION_VALUE_MAX;
++ } else {
++ state->revision[value_length] = '\0';
++ }
++ memcpy(state->revision, value_start, value_length);
++ cpuinfo_log_debug("parsed /proc/cpuinfo Revision = \"%.*s\"", (int) value_length, value_start);
+ } else {
+ goto unknown;
+ }
+@@ -881,11 +892,13 @@ static bool parse_line(
+
+ bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++ char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ uint32_t max_processors_count,
+ struct cpuinfo_arm_linux_processor processors[restrict static max_processors_count])
+ {
+ struct proc_cpuinfo_parser_state state = {
+ .hardware = hardware,
++ .revision = revision,
+ .processor_index = 0,
+ .max_processors_count = max_processors_count,
+ .processors = processors,
+diff --git src/arm/linux/init.c src/arm/linux/init.c
+index 6272abf..89d957e 100644
+--- src/arm/linux/init.c
++++ src/arm/linux/init.c
+@@ -167,8 +167,9 @@ void cpuinfo_arm_linux_init(void) {
+ struct cpuinfo_android_properties android_properties;
+ cpuinfo_arm_android_parse_properties(&android_properties);
+ #else
+- char proc_cpuinfo_hardware[CPUINFO_HARDWARE_VALUE_MAX] = { 0 };
++ char proc_cpuinfo_hardware[CPUINFO_HARDWARE_VALUE_MAX];
+ #endif
++ char proc_cpuinfo_revision[CPUINFO_REVISION_VALUE_MAX];
+
+ if (!cpuinfo_arm_linux_parse_proc_cpuinfo(
+ #if defined(__ANDROID__)
+@@ -176,6 +177,7 @@ void cpuinfo_arm_linux_init(void) {
+ #else
+ proc_cpuinfo_hardware,
+ #endif
++ proc_cpuinfo_revision,
+ arm_linux_processors_count,
+ arm_linux_processors)) {
+ cpuinfo_log_error("failed to parse processor information from /proc/cpuinfo");
+@@ -228,10 +230,8 @@ void cpuinfo_arm_linux_init(void) {
+ const struct cpuinfo_arm_chipset chipset =
+ cpuinfo_arm_android_decode_chipset(&android_properties, valid_processors, 0);
+ #else
+- const struct cpuinfo_arm_chipset chipset = {
+- .vendor = cpuinfo_arm_chipset_vendor_unknown,
+- .series = cpuinfo_arm_chipset_series_unknown,
+- };
++ const struct cpuinfo_arm_chipset chipset =
++ cpuinfo_arm_linux_decode_chipset(proc_cpuinfo_hardware, proc_cpuinfo_revision, valid_processors, 0);
+ #endif
+
+ #if CPUINFO_ARCH_ARM
+diff --git test/arm-cache.cc test/arm-cache.cc
+index 7d2e4a4..4d6218b 100644
+--- test/arm-cache.cc
++++ test/arm-cache.cc
+@@ -1664,3 +1664,91 @@ TEST(ROCKCHIP, rk3368) {
+ EXPECT_EQ(256 * 1024, little_l2.size);
+ EXPECT_EQ(0, little_l3.size);
+ }
++
++TEST(BROADCOM, bcm2835) {
++ const struct cpuinfo_arm_chipset chipset = {
++ .vendor = cpuinfo_arm_chipset_vendor_broadcom,
++ .series = cpuinfo_arm_chipset_series_broadcom_bcm,
++ .model = 2835,
++ };
++
++ struct cpuinfo_cache l1i = { 0 };
++ struct cpuinfo_cache l1d = { 0 };
++ struct cpuinfo_cache l2 = { 0 };
++ struct cpuinfo_cache l3 = { 0 };
++ cpuinfo_arm_decode_cache(
++ cpuinfo_uarch_arm11, 4, UINT32_C(0x410FB767),
++ &chipset, 0, 4,
++ &l1i, &l1d, &l2, &l3);
++
++ EXPECT_EQ(16 * 1024, l1i.size);
++ EXPECT_EQ(16 * 1024, l1d.size);
++ EXPECT_EQ(0, l2.size);
++ EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2836) {
++ const struct cpuinfo_arm_chipset chipset = {
++ .vendor = cpuinfo_arm_chipset_vendor_broadcom,
++ .series = cpuinfo_arm_chipset_series_broadcom_bcm,
++ .model = 2836,
++ };
++
++ struct cpuinfo_cache l1i = { 0 };
++ struct cpuinfo_cache l1d = { 0 };
++ struct cpuinfo_cache l2 = { 0 };
++ struct cpuinfo_cache l3 = { 0 };
++ cpuinfo_arm_decode_cache(
++ cpuinfo_uarch_cortex_a7, 4, UINT32_C(0x410FC075),
++ &chipset, 0, 4,
++ &l1i, &l1d, &l2, &l3);
++
++ EXPECT_EQ(32 * 1024, l1i.size);
++ EXPECT_EQ(32 * 1024, l1d.size);
++ EXPECT_EQ(512 * 1024, l2.size);
++ EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2837) {
++ const struct cpuinfo_arm_chipset chipset = {
++ .vendor = cpuinfo_arm_chipset_vendor_broadcom,
++ .series = cpuinfo_arm_chipset_series_broadcom_bcm,
++ .model = 2837,
++ };
++
++ struct cpuinfo_cache l1i = { 0 };
++ struct cpuinfo_cache l1d = { 0 };
++ struct cpuinfo_cache l2 = { 0 };
++ struct cpuinfo_cache l3 = { 0 };
++ cpuinfo_arm_decode_cache(
++ cpuinfo_uarch_cortex_a53, 4, UINT32_C(0x410FD034),
++ &chipset, 0, 4,
++ &l1i, &l1d, &l2, &l3);
++
++ EXPECT_EQ(16 * 1024, l1i.size);
++ EXPECT_EQ(16 * 1024, l1d.size);
++ EXPECT_EQ(512 * 1024, l2.size);
++ EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2711) {
++ const struct cpuinfo_arm_chipset chipset = {
++ .vendor = cpuinfo_arm_chipset_vendor_broadcom,
++ .series = cpuinfo_arm_chipset_series_broadcom_bcm,
++ .model = 2711,
++ };
++
++ struct cpuinfo_cache l1i = { 0 };
++ struct cpuinfo_cache l1d = { 0 };
++ struct cpuinfo_cache l2 = { 0 };
++ struct cpuinfo_cache l3 = { 0 };
++ cpuinfo_arm_decode_cache(
++ cpuinfo_uarch_cortex_a72, 4, UINT32_C(0x410FD083),
++ &chipset, 0, 4,
++ &l1i, &l1d, &l2, &l3);
++
++ EXPECT_EQ(48 * 1024, l1i.size);
++ EXPECT_EQ(32 * 1024, l1d.size);
++ EXPECT_EQ(1024 * 1024, l2.size);
++ EXPECT_EQ(0, big_l3.size);
++}
+diff --git test/get-current.cc test/get-current.cc
+index f410b12..96b11dc 100644
+--- test/get-current.cc
++++ test/get-current.cc
+@@ -36,3 +36,9 @@ TEST(CURRENT_UARCH_INDEX, within_bounds) {
+
+ ASSERT_LT(cpuinfo_get_current_uarch_index(), cpuinfo_get_uarchs_count());
+ }
++
++TEST(CURRENT_UARCH_INDEX_WITH_DEFAULT, within_bounds) {
++ ASSERT_TRUE(cpuinfo_initialize());
++
++ ASSERT_LE(cpuinfo_get_current_uarch_index_with_default(cpuinfo_get_uarchs_count()), cpuinfo_get_uarchs_count());
++}