Use the cpuinfo library instead of our own code for CPU feature detection.

PiperOrigin-RevId: 313469646
diff --git a/WORKSPACE b/WORKSPACE
index abdd661..587ed6e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -27,3 +27,26 @@
         "https://github.com/google/googletest/archive/release-1.8.1.tar.gz",
     ],
 )
+
+# clog library, used by cpuinfo for logging
+http_archive(
+    name = "clog",
+    strip_prefix = "cpuinfo-d5e37adf1406cf899d7d9ec1d317c47506ccb970",
+    sha256 = "3f2dc1970f397a0e59db72f9fca6ff144b216895c1d606f6c94a507c1e53a025",
+    urls = [
+        "https://github.com/pytorch/cpuinfo/archive/d5e37adf1406cf899d7d9ec1d317c47506ccb970.tar.gz",
+    ],
+    build_file = "@//third_party:clog.BUILD",
+)
+
+# cpuinfo library, used for detecting processor characteristics
+http_archive(
+    name = "cpuinfo",
+    strip_prefix = "cpuinfo-c2092219e7c874783a00a62edb94ddc672f57ab3",
+    sha256 = "ea56c399a4f6ca5f749e71acb6a7bfdc653eb65d8f658cb2e414a2fcdca1fe8b",
+    urls = [
+        "https://github.com/pytorch/cpuinfo/archive/c2092219e7c874783a00a62edb94ddc672f57ab3.zip",
+    ],
+    build_file = "@//third_party:cpuinfo.BUILD",
+    patches = ["@//third_party:cpuinfo.patch"],
+)
diff --git a/ruy/BUILD b/ruy/BUILD
index 589fb6f..10ea17b 100644
--- a/ruy/BUILD
+++ b/ruy/BUILD
@@ -20,6 +20,18 @@
 )
 
 config_setting(
+    name = "ppc",
+    values = {
+        "cpu": "ppc",
+    },
+)
+
+config_setting(
+    name = "fuchsia",
+    values = {"cpu": "fuchsia"},
+)
+
+config_setting(
     name = "optimized",
     values = {
         "compilation_mode": "opt",
@@ -278,31 +290,23 @@
 )
 
 cc_library(
-    name = "detect_arm",
+    name = "cpuinfo",
     srcs = [
-        "detect_arm.cc",
+        "cpuinfo.cc",
     ],
     hdrs = [
-        "detect_arm.h",
+        "cpuinfo.h",
     ],
-    copts = ruy_copts(),
-    deps = [
-        ":platform",
+    copts = ruy_copts() + [
+        # ruy_copts contains -Wundef, but cpuinfo's header warns with that.
+        "-Wno-undef",
     ],
-)
-
-cc_library(
-    name = "detect_x86",
-    srcs = [
-        "detect_x86.cc",
-    ],
-    hdrs = [
-        "detect_x86.h",
-    ],
-    copts = ruy_copts(),
-    deps = [
-        ":platform",
-    ],
+    deps = [":platform"] + select({
+        # cpuinfo does not build on ppc.
+        ":ppc": [],
+        ":fuchsia": [],
+        "//conditions:default": ["@cpuinfo"],
+    }),
 )
 
 cc_library(
@@ -752,8 +756,6 @@
         ":allocator",
         ":check_macros",
         ":ctx",
-        ":detect_arm",
-        ":detect_x86",
         ":have_built_path_for",
         ":path",
         ":platform",
@@ -799,8 +801,7 @@
     deps = [
         ":allocator",
         ":check_macros",
-        ":detect_arm",
-        ":detect_x86",
+        ":cpuinfo",
         ":have_built_path_for",
         ":path",
         ":platform",
@@ -959,7 +960,7 @@
         ":context",
         ":ctx",
         ":context_get_ctx",
-        "//ruy/profiler:profiler",
+        "//ruy/profiler",
     ] + ruy_test_ext_deps(),
 )
 
@@ -997,8 +998,8 @@
         ("i8", "u8", "i32", "i32"),
     ],
     deps = [
-        "@com_google_googletest//:gtest_main",
         "//ruy:test_lib",
+        "@com_google_googletest//:gtest_main",
     ],
 )
 
@@ -1015,8 +1016,8 @@
     ],
     tags = ["slow"],
     deps = [
-        "@com_google_googletest//:gtest_main",
         "//ruy:test_lib",
+        "@com_google_googletest//:gtest_main",
     ],
 )
 
@@ -1030,7 +1031,7 @@
         ("u8", "u8", "i32", "i16"),
     ],
     deps = [
-        "@com_google_googletest//:gtest_main",
         "//ruy:test_lib",
+        "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/ruy/cpuinfo.cc b/ruy/cpuinfo.cc
new file mode 100644
index 0000000..793ba7b
--- /dev/null
+++ b/ruy/cpuinfo.cc
@@ -0,0 +1,61 @@
+#include "ruy/cpuinfo.h"
+
+#include "ruy/platform.h"
+
+#define RUY_HAVE_CPUINFO (!(RUY_PLATFORM_PPC || RUY_PLATFORM_FUCHSIA))
+
+#if RUY_HAVE_CPUINFO
+
+#include <cpuinfo.h>
+
+namespace ruy {
+
+CpuInfo::~CpuInfo() {
+  if (init_status_ == InitStatus::kInitialized) {
+    cpuinfo_deinitialize();
+  }
+}
+
+bool CpuInfo::EnsureInitialized() {
+  if (init_status_ == InitStatus::kNotYetAttempted) {
+    init_status_ =
+        cpuinfo_initialize() ? InitStatus::kInitialized : InitStatus::kFailed;
+  }
+  return init_status_ == InitStatus::kInitialized;
+}
+
+bool CpuInfo::NeonDotprod() {
+  return EnsureInitialized() && cpuinfo_has_arm_neon_dot();
+}
+
+bool CpuInfo::Sse42() {
+  return EnsureInitialized() && cpuinfo_has_x86_sse4_2();
+}
+
+bool CpuInfo::Avx2() { return EnsureInitialized() && cpuinfo_has_x86_avx2(); }
+
+bool CpuInfo::Avx512() {
+  return EnsureInitialized() && cpuinfo_has_x86_avx512f() &&
+         cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512cd() &&
+         cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512vl();
+}
+
+bool CpuInfo::AvxVnni() {
+  return EnsureInitialized() && cpuinfo_has_x86_avx512vnni();
+}
+
+}  // namespace ruy
+
+#else  // not RUY_HAVE_CPUINFO
+
+namespace ruy {
+CpuInfo::~CpuInfo() {}
+bool CpuInfo::EnsureInitialized() { return false; }
+bool CpuInfo::NeonDotprod() { return false; }
+bool CpuInfo::Sse42() { return false; }
+bool CpuInfo::Avx2() { return false; }
+bool CpuInfo::Avx512() { return false; }
+bool CpuInfo::AvxVnni() { return false; }
+}  // namespace ruy
+
+#endif
diff --git a/ruy/cpuinfo.h b/ruy/cpuinfo.h
new file mode 100644
index 0000000..0a3de28
--- /dev/null
+++ b/ruy/cpuinfo.h
@@ -0,0 +1,49 @@
+/* Copyright 2020 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RUY_RUY_CPUINFO_H_
+#define RUY_RUY_CPUINFO_H_
+
+namespace ruy {
+
+// Wraps the functionality that ruy needs from the cpuinfo library.
+class CpuInfo final {
+ public:
+  CpuInfo() {}
+  ~CpuInfo();
+
+  // ARM features
+  bool NeonDotprod();
+
+  // X86 features
+  bool Sse42();
+  bool Avx2();
+  bool Avx512();
+  bool AvxVnni();
+
+ private:
+  enum class InitStatus {
+    kNotYetAttempted,
+    kInitialized,
+    kFailed,
+  };
+  InitStatus init_status_ = InitStatus::kNotYetAttempted;
+  bool EnsureInitialized();
+  CpuInfo(const CpuInfo&) = delete;
+};
+
+}  // namespace ruy
+
+#endif  // RUY_RUY_CPUINFO_H_
diff --git a/ruy/ctx.cc b/ruy/ctx.cc
index cbb855d..5a01bd2 100644
--- a/ruy/ctx.cc
+++ b/ruy/ctx.cc
@@ -18,9 +18,8 @@
 #include <functional>
 
 #include "ruy/check_macros.h"
+#include "ruy/cpuinfo.h"
 #include "ruy/ctx_impl.h"
-#include "ruy/detect_arm.h"
-#include "ruy/detect_x86.h"
 #include "ruy/have_built_path_for.h"
 #include "ruy/path.h"
 #include "ruy/platform.h"
@@ -47,13 +46,15 @@
   mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
 }
 
+CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }
+
 namespace {
 
 // For each Path bit set in `paths_to_test`, performs runtime detection and
 // sets the corresponding bit in the return value if and only if it is
 // supported. Path bits that are not set in the input
 // `paths_to_detect` value are also left not set in the return value.
-Path DetectRuntimeSupportedPaths(Path paths_to_detect) {
+Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
   // Paths in kNonArchPaths are always implicitly supported.
   // Further logic below may add more bits to `results`.
   Path result = kNonArchPaths;
@@ -84,20 +85,21 @@
   // build it at the moment. That is largely because we have had to machine
   // encode dotprod instructions, so we don't actually rely on toolchain support
   // for them.
-  maybe_add(Path::kNeonDotprod, []() { return DetectDotprod(); });
+  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
 #elif RUY_PLATFORM_X86
   // x86 SIMD paths currently require both runtime detection, and detection of
   // whether we're building the path at all.
   maybe_add(Path::kSse42,
-            []() { return HaveBuiltPathForSse42() && DetectCpuSse42(); });
+            [=]() { return HaveBuiltPathForSse42() && cpuinfo->Sse42(); });
   maybe_add(Path::kAvx2,
-            []() { return HaveBuiltPathForAvx2() && DetectCpuAvx2(); });
+            [=]() { return HaveBuiltPathForAvx2() && cpuinfo->Avx2(); });
   maybe_add(Path::kAvx512,
-            []() { return HaveBuiltPathForAvx512() && DetectCpuAvx512(); });
+            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
   maybe_add(Path::kAvxVnni,
-            []() { return HaveBuiltPathForAvxVnni() && DetectCpuAvxVnni(); });
+            [=]() { return HaveBuiltPathForAvxVnni() && cpuinfo->AvxVnni(); });
 #else
   (void)maybe_add;
+  (void)cpuinfo;
 #endif
 
   // Sanity checks
@@ -116,7 +118,7 @@
   // The value Path::kNone indicates the initial state before detection has been
   // performed.
   if (*paths == Path::kNone) {
-    *paths = DetectRuntimeSupportedPaths(kAllPaths);
+    *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
   }
 
   return *paths;
diff --git a/ruy/ctx.h b/ruy/ctx.h
index 1fb8fda..0d731e4 100644
--- a/ruy/ctx.h
+++ b/ruy/ctx.h
@@ -28,6 +28,7 @@
 class Allocator;
 class TuningResolver;
 class PrepackedCache;
+class CpuInfo;
 enum class Path : std::uint8_t;
 enum class Tuning;
 
@@ -47,6 +48,7 @@
   ThreadPool* mutable_thread_pool();
   int max_num_threads() const;
   void set_max_num_threads(int value);
+  CpuInfo* mutable_cpuinfo();
 
   // Returns the set of Path's that are available. By default, this is based on
   // runtime detection of CPU features, as well as on which code paths were
diff --git a/ruy/ctx_impl.h b/ruy/ctx_impl.h
index d182518..e297ca7 100644
--- a/ruy/ctx_impl.h
+++ b/ruy/ctx_impl.h
@@ -24,6 +24,7 @@
 #include <vector>
 
 #include "ruy/allocator.h"
+#include "ruy/cpuinfo.h"
 #include "ruy/ctx.h"
 #include "ruy/path.h"
 #include "ruy/prepacked_cache.h"
@@ -66,9 +67,11 @@
   // this allocator, and its per-thread allocator.
   std::unique_ptr<Allocator> main_allocator_;
   std::unique_ptr<PrepackedCache> prepacked_cache_;
-  // Set of Paths detected at runtime to be supported. The initial value kNone
+  // Set of Paths enabled at runtime. By default, that is based on runtime
+  // detection, but may be overridden. The initial value kNone
   // means that detection has not yet been performed.
   Path runtime_enabled_paths_ = Path::kNone;
+  CpuInfo cpuinfo_;
   // State for each thread in the thread pool. Entry 0 is the main thread.
   std::vector<std::unique_ptr<ThreadSpecificResource>>
       thread_specific_resources_;
diff --git a/ruy/detect_arm.cc b/ruy/detect_arm.cc
deleted file mode 100644
index 37f30df..0000000
--- a/ruy/detect_arm.cc
+++ /dev/null
@@ -1,236 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/* Temporary dotprod-detection until we can rely on proper feature-detection
-such as getauxval on Linux (requires a newer Linux kernel than we can
-currently rely on on Android).
-
-There are two main ways that this could be implemented: using a signal
-handler or a fork. The current implementation uses a signal handler.
-This is because on current Android, an uncaught signal gives a latency
-of over 100 ms. In order for the fork approach to be worthwhile, it would
-have to save us the hassle of handling signals, and such an approach thus
-has an unavoidable 100ms latency. By contrast, the present signal-handling
-approach has low latency.
-
-Downsides of the current signal-handling approach include:
- 1. Setting and restoring signal handlers is not thread-safe: we can't
-    prevent another thread from interfering with us. We at least prevent
-    other threads from calling our present code concurrently by using a lock,
-    but we can't do anything about other threads using their own code to
-    set signal handlers.
- 2. Signal handlers are not entirely portable, e.g. b/132973173 showed that
-    on Apple platform the EXC_BAD_INSTRUCTION signal is not always caught
-    by a SIGILL handler (difference between Release and Debug builds).
- 3. The signal handler approach looks confusing in a debugger (has to
-    tell the debugger to 'continue' past the signal every time). Fix:
-    ```
-    (gdb) handle SIGILL nostop noprint pass
-    ```
-
-Here is what the nicer fork-based alternative would look like.
-Its only downside, as discussed above, is high latency, 100 ms on Android.
-
-```
-bool TryAsmSnippet(bool (*asm_snippet)()) {
-  int child_pid = fork();
-  if (child_pid == -1) {
-    // Fork failed.
-    return false;
-  }
-  if (child_pid == 0) {
-    // Child process code path. Pass the raw boolean return value of
-    // asm_snippet as exit code (unconventional: 1 means true == success).
-    _exit(asm_snippet());
-  }
-
-  int child_status;
-  waitpid(child_pid, &child_status, 0);
-  if (WIFSIGNALED(child_status)) {
-    // Child process terminated by signal, meaning the instruction was
-    // not supported.
-    return false;
-  }
-  // Return the exit code of the child, which per child code above was
-  // the return value of asm_snippet().
-  return WEXITSTATUS(child_status);
-}
-```
-*/
-
-#include "ruy/detect_arm.h"
-
-#if (defined __linux__) && (defined __aarch64__)
-#define RUY_DETECT_DOTPROD
-#endif
-
-#ifdef RUY_DETECT_DOTPROD
-
-#include <setjmp.h>
-#include <signal.h>
-
-#include <cstdlib>
-#include <cstring>
-#include <mutex>
-
-#ifdef __linux__
-#include <sys/auxv.h>
-#endif
-
-#endif
-
-namespace ruy {
-
-#ifdef RUY_DETECT_DOTPROD
-
-namespace {
-
-// long-jump buffer used to continue execution after a caught SIGILL.
-sigjmp_buf global_sigjmp_buf;
-
-// Signal handler. Long-jumps to just before
-// we ran the snippet that we know is the only thing that could have generated
-// the SIGILL.
-void SignalHandler(int) { siglongjmp(global_sigjmp_buf, 1); }
-
-// RAII helper for calling sigprocmask to unblock all signals temporarily.
-class ScopeUnblockSignals final {
- public:
-  ScopeUnblockSignals() {
-    sigset_t procmask;
-    sigemptyset(&procmask);
-    success_ = !sigprocmask(SIG_SETMASK, &procmask, &old_procmask_);
-  }
-  ~ScopeUnblockSignals() {
-    if (success_) {
-      sigprocmask(SIG_SETMASK, &old_procmask_, nullptr);
-    }
-  }
-  bool success() const { return success_; }
-
- private:
-  sigset_t old_procmask_;
-  bool success_ = false;
-};
-
-// RAII helper to install and uninstall a signal handler.
-class ScopeSigaction final {
- public:
-  ScopeSigaction(int signal_number, void (*handler_function)(int))
-      : signal_number_(signal_number) {
-    struct sigaction action;
-    memset(&action, 0, sizeof(action));
-    sigemptyset(&action.sa_mask);
-    action.sa_handler = handler_function;
-    success_ = !sigaction(signal_number_, &action, &old_action_);
-  }
-  ~ScopeSigaction() {
-    if (success_) {
-      sigaction(signal_number_, &old_action_, nullptr);
-    }
-  }
-  bool success() const { return success_; }
-
- private:
-  const int signal_number_;
-  struct sigaction old_action_;
-  bool success_ = false;
-};
-
-// Try an asm snippet. Returns true if it passed i.e. ran without generating
-// an illegal-instruction signal and returned true. Returns false otherwise.
-bool TryAsmSnippet(bool (*asm_snippet)()) {
-  // This function installs and restores signal handlers and the signal-blocking
-  // mask. We can't prevent another thread from interfering, but we can at least
-  // put a big lock here so that it works if, for whatever reason, another
-  // thread calls this function concurrently.
-  static std::mutex mutex;
-  std::lock_guard<std::mutex> lock(mutex);
-
-  ScopeUnblockSignals unblock_signals;
-  if (!unblock_signals.success()) {
-    return false;
-  }
-  ScopeSigaction handle_sigill(SIGILL, SignalHandler);
-  if (!handle_sigill.success()) {
-    return false;
-  }
-
-  // Set the long jump buffer to this point in the code. This normally returns
-  // 0 so we don't take this branch...
-  if (sigsetjmp(global_sigjmp_buf, false)) {
-    // ... except in the fake return from sigsetjmp that is produced when
-    // the long-jump back to here actually happened, that is, in the signal
-    // handler. In this case, we know that the asm_snippet triggered an illegal
-    // instruction signal, so we return false.
-    return false;
-  }
-
-  return asm_snippet();
-}
-
-bool DotprodAsmSnippet() {
-  // maratek@ mentioned that for some other ISA extensions (fp16)
-  // there have been implementations that failed to generate SIGILL even
-  // though they did not correctly implement the instruction. Just in case
-  // a similar situation might exist here, we do a simple correctness test.
-  int result = 0;
-  asm volatile(
-      "mov w0, #100\n"
-      "dup v0.16b, w0\n"
-      "dup v1.4s, w0\n"
-      ".word 0x6e809401  // udot v1.4s, v0.16b, v0.16b\n"
-      "mov %w[result], v1.s[0]\n"
-      : [result] "=r"(result)
-      :
-      : "x0", "v0", "v1");
-  // Expecting 100 (input accumulator value) + 100 * 100 + ... (repeat 4 times)
-  return result == 40100;
-}
-
-bool DetectDotprodBySignalMethod() { return TryAsmSnippet(DotprodAsmSnippet); }
-
-#ifdef __linux__
-bool DetectDotprodByLinuxAuxvMethod() {
-  // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
-  // however we need to support building against older headers for the time
-  // being.
-  const int kLocalHwcapAsimddp = 1 << 20;
-  return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
-}
-#endif
-
-}  // namespace
-
-bool DetectDotprod() {
-#ifdef __linux__
-  // We always try the auxv method and don't try to check the linux version
-  // before. It's only in the mainline linux tree from 4.14.151, but it's been
-  // backported to earlier linux versions in Android vendor device trees.
-  // The cost of just trying this is near zero, and the benefit is large
-  // as the signal method has higher latency and a substantial crash potential.
-  if (DetectDotprodByLinuxAuxvMethod()) {
-    return true;
-  }
-#endif
-
-  return DetectDotprodBySignalMethod();
-}
-
-#else   // not defined RUY_DETECT_DOTPROD
-bool DetectDotprod() { return false; }
-#endif  // defined RUY_DETECT_DOTPROD
-
-}  // namespace ruy
diff --git a/ruy/detect_arm.h b/ruy/detect_arm.h
deleted file mode 100644
index 937a067..0000000
--- a/ruy/detect_arm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Temporary dotprod-detection code until we can rely on getauxval.
-
-#ifndef RUY_RUY_DETECT_ARM_H_
-#define RUY_RUY_DETECT_ARM_H_
-
-namespace ruy {
-
-// On A64, returns true if the dotprod extension is present.
-// On other architectures, returns false unconditionally.
-bool DetectDotprod();
-
-}  // namespace ruy
-
-#endif  // RUY_RUY_DETECT_ARM_H_
diff --git a/ruy/detect_x86.cc b/ruy/detect_x86.cc
deleted file mode 100644
index 9d0a3b6..0000000
--- a/ruy/detect_x86.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ruy/detect_x86.h"
-
-#include <cstdint>
-
-#if RUY_PLATFORM_X86 && RUY_PLATFORM_X86_ENHANCEMENTS
-#include <immintrin.h>  // IWYU pragma: keep
-
-#endif
-
-namespace ruy {
-#if RUY_PLATFORM_X86 && RUY_PLATFORM_X86_ENHANCEMENTS
-
-namespace {
-
-// See Intel docs, such as http://goo.gl/c6IkGX.
-inline void RunCpuid(std::uint32_t eax, std::uint32_t ecx,
-                     std::uint32_t abcd[4]) {
-  std::uint32_t ebx = 0, edx;
-#if defined(__i386__) && defined(__PIC__)
-  /* in case of PIC under 32-bit EBX cannot be clobbered */
-  asm volatile("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi"
-               : "=D"(ebx),
-#else
-  asm volatile("cpuid"
-               : "+b"(ebx),
-#endif
-                 "+a"(eax), "+c"(ecx), "=d"(edx));
-  abcd[0] = eax;
-  abcd[1] = ebx;
-  abcd[2] = ecx;
-  abcd[3] = edx;
-}
-
-}  // namespace
-
-bool DetectCpuSse42() {
-  std::uint32_t abcd[4];
-
-  constexpr std::uint32_t kEcxSse42 = 1u << 20;
-  RunCpuid(1, 0, abcd);
-  const bool has_sse4_2_base = (abcd[2] & kEcxSse42) == kEcxSse42;
-
-#ifdef RUY_ENABLE_AMD_CPUID_CHECKS
-  constexpr std::uint32_t kEcxAbm = 1u << 5;
-  RunCpuid(0x80000001, 0, abcd);
-  const bool has_extras = (abcd[2] & kEcxAbm) == kEcxAbm;
-#else
-  constexpr std::uint32_t kEcxPopcnt = 1u << 23;
-  RunCpuid(1, 0, abcd);
-  const bool has_extras = (abcd[2] & kEcxPopcnt) == kEcxPopcnt;
-#endif
-
-  return has_sse4_2_base && has_extras;
-}
-
-bool DetectCpuAvx2() {
-  constexpr std::uint32_t kEbxAvx2 = 1u << 5;
-  constexpr std::uint32_t kEcxFma = 1u << 12;
-
-  std::uint32_t abcd[4];
-
-  RunCpuid(7, 0, abcd);
-  const bool has_avx2 = (abcd[1] & kEbxAvx2) == kEbxAvx2;
-  RunCpuid(1, 0, abcd);
-  const bool has_fma = (abcd[2] & kEcxFma) == kEcxFma;
-
-  return has_avx2 && has_fma;
-}
-
-bool DetectCpuAvx512() {
-  constexpr std::uint32_t kEbxAvx512F = 1u << 16;
-  constexpr std::uint32_t kEbxAvx512Dq = 1u << 17;
-  constexpr std::uint32_t kEbxAvx512Cd = 1u << 28;
-  constexpr std::uint32_t kEbxAvx512Bw = 1u << 30;
-  constexpr std::uint32_t kEbxAvx512Vl = 1u << 31;
-
-  constexpr std::uint32_t kEbxAvx512Mask =
-      kEbxAvx512F | kEbxAvx512Dq | kEbxAvx512Cd | kEbxAvx512Bw | kEbxAvx512Vl;
-  std::uint32_t abcd[4];
-  RunCpuid(7, 0, abcd);
-
-  return (abcd[1] & kEbxAvx512Mask) == kEbxAvx512Mask;
-}
-
-#endif
-}  // namespace ruy
diff --git a/ruy/detect_x86.h b/ruy/detect_x86.h
deleted file mode 100644
index 68b6202..0000000
--- a/ruy/detect_x86.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2019 Google LLC. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef RUY_RUY_DETECT_X86_H_
-#define RUY_RUY_DETECT_X86_H_
-
-#include "ruy/platform.h"
-
-namespace ruy {
-
-#if RUY_PLATFORM_X86
-#if RUY_PLATFORM_X86_ENHANCEMENTS
-
-// This also checks ABM support, which implies LZCNT and POPCNT.
-bool DetectCpuSse42();
-bool DetectCpuAvx2();
-bool DetectCpuAvx512();
-// TODO(b/147376783): SSE 4.2 and AVX-VNNI support is incomplete / placeholder.
-// Optimization is not finished. In particular the dimensions of the kernel
-// blocks can be changed as desired.
-//
-// TODO(b/146646451): Introduce and activate.
-inline bool DetectCpuAvxVnni() { return false; }
-
-#else  // RUY_PLATFORM_X86_ENHANCEMENTS
-
-inline bool DetectCpuSse42() { return false; }
-inline bool DetectCpuAvx2() { return false; }
-inline bool DetectCpuAvx512() { return false; }
-inline bool DetectCpuAvxVnni() { return false; }
-
-#endif  // !RUY_PLATFORM_X86_ENHANCEMENTS
-#endif  // RUY_PLATFORM_X86
-
-}  // namespace ruy
-
-#endif  // RUY_RUY_DETECT_X86_H_
diff --git a/ruy/platform.h b/ruy/platform.h
index 9977b04..2f9cbb3 100644
--- a/ruy/platform.h
+++ b/ruy/platform.h
@@ -32,6 +32,20 @@
 #define RUY_PLATFORM_APPLE 0
 #endif
 
+// Detect APPLE.
+#ifdef __ppc__
+#define RUY_PLATFORM_PPC 1
+#else
+#define RUY_PLATFORM_PPC 0
+#endif
+
+// Detect Fuchsia
+#ifdef __Fuchsia__
+#define RUY_PLATFORM_FUCHSIA 1
+#else
+#define RUY_PLATFORM_FUCHSIA 0
+#endif
+
 // Architecture-level platform detection.
 //
 // Ruy requires these to be mutually exclusive.
diff --git a/third_party/BUILD b/third_party/BUILD
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/third_party/BUILD
diff --git a/third_party/clog.BUILD b/third_party/clog.BUILD
new file mode 100644
index 0000000..84cc3a2
--- /dev/null
+++ b/third_party/clog.BUILD
@@ -0,0 +1,38 @@
+# Description:
+#   C-style (a-la printf) logging library
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "clog",
+    srcs = [
+        "deps/clog/src/clog.c",
+    ],
+    copts = select({
+        ":windows": [],
+        "//conditions:default": ["-Wno-unused-result"],
+    }),
+    hdrs = [
+        "deps/clog/include/clog.h",
+    ],
+    linkopts = select({
+        ":android": ["-llog"],
+        "//conditions:default": [],
+    }),
+    linkstatic = True,
+    strip_include_prefix = "deps/clog/include",
+)
+
+config_setting(
+    name = "android",
+    values = {"crosstool_top": "//external:android/crosstool"},
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
diff --git a/third_party/cpuinfo.BUILD b/third_party/cpuinfo.BUILD
new file mode 100644
index 0000000..ad120de
--- /dev/null
+++ b/third_party/cpuinfo.BUILD
@@ -0,0 +1,326 @@
+# cpuinfo, a library to detect information about the host CPU
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+C99OPTS = [
+    "-std=gnu99",  # gnu99, not c99, because dprintf is used
+    "-Wno-vla",
+    "-D_GNU_SOURCE=1",  # to use CPU_SETSIZE
+    "-DCPUINFO_INTERNAL=",
+    "-DCPUINFO_PRIVATE=",
+]
+
+# Source code common to all platforms.
+COMMON_SRCS = [
+    "src/api.c",
+    "src/init.c",
+    "src/cache.c",
+]
+
+# Architecture-specific sources and headers.
+X86_SRCS = [
+    "src/x86/cache/descriptor.c",
+    "src/x86/cache/deterministic.c",
+    "src/x86/cache/init.c",
+    "src/x86/info.c",
+    "src/x86/init.c",
+    "src/x86/isa.c",
+    "src/x86/name.c",
+    "src/x86/topology.c",
+    "src/x86/uarch.c",
+    "src/x86/vendor.c",
+]
+
+ARM_SRCS = [
+    "src/arm/cache.c",
+    "src/arm/uarch.c",
+]
+
+# Platform-specific sources and headers
+LINUX_SRCS = [
+    "src/linux/cpulist.c",
+    "src/linux/multiline.c",
+    "src/linux/processors.c",
+    "src/linux/smallfile.c",
+]
+
+MOCK_LINUX_SRCS = [
+    "src/linux/mockfile.c",
+]
+
+MACH_SRCS = [
+    "src/mach/topology.c",
+]
+
+EMSCRIPTEN_SRCS = [
+    "src/emscripten/init.c",
+]
+
+LINUX_X86_SRCS = [
+    "src/x86/linux/cpuinfo.c",
+    "src/x86/linux/init.c",
+]
+
+LINUX_ARM_SRCS = [
+    "src/arm/linux/chipset.c",
+    "src/arm/linux/clusters.c",
+    "src/arm/linux/cpuinfo.c",
+    "src/arm/linux/hwcap.c",
+    "src/arm/linux/init.c",
+    "src/arm/linux/midr.c",
+]
+
+LINUX_ARM32_SRCS = LINUX_ARM_SRCS + ["src/arm/linux/aarch32-isa.c"]
+
+LINUX_ARM64_SRCS = LINUX_ARM_SRCS + ["src/arm/linux/aarch64-isa.c"]
+
+ANDROID_ARM_SRCS = [
+    "src/arm/android/properties.c",
+]
+
+WINDOWS_X86_SRCS = [
+    "src/x86/windows/init.c",
+]
+
+MACH_X86_SRCS = [
+    "src/x86/mach/init.c",
+]
+
+MACH_ARM_SRCS = [
+    "src/arm/mach/init.c",
+]
+
+cc_library(
+    name = "cpuinfo_impl",
+    srcs = select({
+        ":linux_x86_64": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+        ":linux_arm": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS,
+        ":linux_armhf": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS,
+        ":linux_aarch64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS,
+        ":macos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":windows_x86_64": COMMON_SRCS + X86_SRCS + WINDOWS_X86_SRCS,
+        ":android_armv7": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS + ANDROID_ARM_SRCS,
+        ":android_arm64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS + ANDROID_ARM_SRCS,
+        ":android_x86": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+        ":android_x86_64": COMMON_SRCS + X86_SRCS + LINUX_SRCS + LINUX_X86_SRCS,
+        ":ios_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":ios_x86": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":ios_armv7": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":ios_arm64": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":ios_arm64e": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":watchos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":watchos_x86": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":watchos_armv7k": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":watchos_arm64_32": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":tvos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS,
+        ":tvos_arm64": COMMON_SRCS + MACH_SRCS + MACH_ARM_SRCS,
+        ":emscripten": COMMON_SRCS + EMSCRIPTEN_SRCS,
+    }),
+    copts = select({
+        ":windows_x86_64": [],
+        "//conditions:default": C99OPTS,
+    }) + [
+        "-Iexternal/cpuinfo/include",
+        "-Iexternal/cpuinfo/src",
+    ],
+    linkstatic = True,
+    # Headers must be in textual_hdrs to allow us to set the standard to C99
+    textual_hdrs = [
+        "include/cpuinfo.h",
+        "src/linux/api.h",
+        "src/mach/api.h",
+        "src/cpuinfo/common.h",
+        "src/cpuinfo/internal-api.h",
+        "src/cpuinfo/log.h",
+        "src/cpuinfo/utils.h",
+        "src/x86/api.h",
+        "src/x86/cpuid.h",
+        "src/x86/linux/api.h",
+        "src/arm/android/api.h",
+        "src/arm/linux/api.h",
+        "src/arm/linux/cp.h",
+        "src/arm/api.h",
+        "src/arm/midr.h",
+    ],
+    deps = [
+        "@clog",
+    ],
+)
+
+cc_library(
+    name = "cpuinfo",
+    hdrs = [
+        "include/cpuinfo.h",
+    ],
+    strip_include_prefix = "include",
+    deps = [
+        ":cpuinfo_impl",
+    ],
+)
+
+############################# Build configurations #############################
+
+config_setting(
+    name = "linux_x86_64",
+    values = {"cpu": "k8"},
+)
+
+config_setting(
+    name = "linux_arm",
+    values = {"cpu": "arm"},
+)
+
+config_setting(
+    name = "linux_armhf",
+    values = {"cpu": "armhf"},
+)
+
+config_setting(
+    name = "linux_aarch64",
+    values = {"cpu": "aarch64"},
+)
+
+config_setting(
+    name = "macos_x86_64",
+    values = {
+        "apple_platform_type": "macos",
+        "cpu": "darwin",
+    },
+)
+
+config_setting(
+    name = "windows_x86_64",
+    values = {"cpu": "x64_windows"},
+)
+
+config_setting(
+    name = "android_armv7",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "armeabi-v7a",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "android_arm64",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "arm64-v8a",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "android_x86",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "x86",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "android_x86_64",
+    values = {
+        "crosstool_top": "//external:android/crosstool",
+        "cpu": "x86_64",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "ios_armv7",
+    values = {
+        "apple_platform_type": "ios",
+        "cpu": "ios_armv7",
+    },
+)
+
+config_setting(
+    name = "ios_arm64",
+    values = {
+        "apple_platform_type": "ios",
+        "cpu": "ios_arm64",
+    },
+)
+
+config_setting(
+    name = "ios_arm64e",
+    values = {
+        "apple_platform_type": "ios",
+        "cpu": "ios_arm64e",
+    },
+)
+
+config_setting(
+    name = "ios_x86",
+    values = {
+        "apple_platform_type": "ios",
+        "cpu": "ios_i386",
+    },
+)
+
+config_setting(
+    name = "ios_x86_64",
+    values = {
+        "apple_platform_type": "ios",
+        "cpu": "ios_x86_64",
+    },
+)
+
+config_setting(
+    name = "watchos_armv7k",
+    values = {
+        "apple_platform_type": "watchos",
+        "cpu": "watchos_armv7k",
+    },
+)
+
+config_setting(
+    name = "watchos_arm64_32",
+    values = {
+        "apple_platform_type": "watchos",
+        "cpu": "watchos_arm64_32",
+    },
+)
+
+config_setting(
+    name = "watchos_x86",
+    values = {
+        "apple_platform_type": "watchos",
+        "cpu": "watchos_i386",
+    },
+)
+
+config_setting(
+    name = "watchos_x86_64",
+    values = {
+        "apple_platform_type": "watchos",
+        "cpu": "watchos_x86_64",
+    },
+)
+
+config_setting(
+    name = "tvos_arm64",
+    values = {
+        "apple_platform_type": "tvos",
+        "cpu": "tvos_arm64",
+    },
+)
+
+config_setting(
+    name = "tvos_x86_64",
+    values = {
+        "apple_platform_type": "tvos",
+        "cpu": "tvos_x86_64",
+    },
+)
+
+config_setting(
+    name = "emscripten",
+    values = {"crosstool_top": "//toolchain:emscripten"},
+)
diff --git a/third_party/cpuinfo.patch b/third_party/cpuinfo.patch
new file mode 100644
index 0000000..eb57412
--- /dev/null
+++ b/third_party/cpuinfo.patch
@@ -0,0 +1,503 @@
+diff --git src/arm/cache.c src/arm/cache.c
+index 1a8bf91..666ad78 100644
+--- src/arm/cache.c
++++ src/arm/cache.c
+@@ -635,6 +635,13 @@ void cpuinfo_arm_decode_cache(
+ 								break;
+ 						}
+ 						break;
++					case cpuinfo_arm_chipset_series_broadcom_bcm:
++						switch (chipset->model) {
++							case 2837: /* BCM2837 */
++								l2_size = 512 * 1024;
++								break;
++						}
++						break;
+ 					case cpuinfo_arm_chipset_series_samsung_exynos:
+ 						l1_size = 32 * 1024;
+ 						break;
+@@ -922,11 +929,13 @@ void cpuinfo_arm_decode_cache(
+ 			 *  | MediaTek Helio X23  | 2(+4+4) |     ?     |     ?     |     ?      |           |
+ 			 *  | MediaTek Helio X25  | 2(+4+4) |     ?     |     ?     |     ?      |           |
+ 			 *  | MediaTek Helio X27  | 2(+4+4) |     ?     |     ?     |     ?      |           |
++			 *  | Broadcom BCM2711    |    4    |    32K    |    48K    |     1M     |    [4]    |
+ 			 *  +---------------------+---------+-----------+-----------+------------+-----------+
+ 			 *
+ 			 * [1] http://pdadb.net/index.php?m=processor&id=578&c=qualcomm_snapdragon_618_msm8956__snapdragon_650
+ 			 * [2] http://pdadb.net/index.php?m=processor&id=667&c=qualcomm_snapdragon_620_apq8076__snapdragon_652
+ 			 * [3] http://pdadb.net/index.php?m=processor&id=692&c=qualcomm_snapdragon_653_msm8976sg__msm8976_pro
++			 * [4] https://www.raspberrypi.org/documentation/hardware/raspberrypi/bcm2711/README.md
+ 			 */
+ 			uint32_t l2_size;
+ 			switch (chipset->series) {
+diff --git src/arm/linux/aarch32-isa.c src/arm/linux/aarch32-isa.c
+index 6aedda3..64dd168 100644
+--- src/arm/linux/aarch32-isa.c
++++ src/arm/linux/aarch32-isa.c
+@@ -77,18 +77,24 @@ void cpuinfo_arm_linux_decode_isa_from_proc_cpuinfo(
+ 
+ 		/*
+ 		 * NEON VDOT instructions are not indicated in /proc/cpuinfo.
+-		 * Use a MIDR-based heuristic to whitelist processors known to support it:
+-		 * - Processors with Qualcomm-modified Cortex-A76 cores
+-		 * - Kirin 980 processor
++		 * Use a MIDR-based heuristic to whitelist processors known to support it.
+ 		 */
+ 		switch (midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) {
++			case UINT32_C(0x4100D0B0): /* Cortex-A76 */
++			case UINT32_C(0x4100D0D0): /* Cortex-A77 */
++			case UINT32_C(0x4100D0E0): /* Cortex-A76AE */
++			case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */
+ 			case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */
++			case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */
++			case UINT32_C(0x53000030): /* Exynos-M4 */
++			case UINT32_C(0x53000040): /* Exynos-M5 */
+ 				isa->dot = true;
+ 				break;
+-			default:
+-				if (chipset->series == cpuinfo_arm_chipset_series_hisilicon_kirin && chipset->model == 980) {
+-					isa->dot = true;
+-				}
++			case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */
++				isa->dot = !!(midr_get_variant(midr) >= 1);
++				break;
++			case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */
++				isa->dot = !!(midr_get_variant(midr) >= 2);
+ 				break;
+ 		}
+ 	} else {
+diff --git src/arm/linux/aarch64-isa.c src/arm/linux/aarch64-isa.c
+index f193e81..619cda5 100644
+--- src/arm/linux/aarch64-isa.c
++++ src/arm/linux/aarch64-isa.c
+@@ -67,21 +67,32 @@ void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo(
+ 	}
+ 	/*
+ 	 * Many phones ship with an old kernel configuration that doesn't report UDOT/SDOT instructions.
+-	 * Use a MIDR-based heuristic to whitelist processors known to support it:
+-	 * - Processors with Qualcomm-modified Cortex-A76 cores
+-	 * - Kirin 980 processor
++	 * Use a MIDR-based heuristic to whitelist processors known to support it.
+ 	 */
+ 	switch (midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) {
++		case UINT32_C(0x4100D060): /* Cortex-A65 */
++		case UINT32_C(0x4100D0B0): /* Cortex-A76 */
++		case UINT32_C(0x4100D0C0): /* Neoverse N1 */
++		case UINT32_C(0x4100D0D0): /* Cortex-A77 */
++		case UINT32_C(0x4100D0E0): /* Cortex-A76AE */
++		case UINT32_C(0x4100D4A0): /* Neoverse E1 */
++		case UINT32_C(0x4800D400): /* Cortex-A76 (HiSilicon) */
+ 		case UINT32_C(0x51008040): /* Kryo 485 Gold (Cortex-A76) */
++		case UINT32_C(0x51008050): /* Kryo 485 Silver (Cortex-A55) */
++		case UINT32_C(0x53000030): /* Exynos-M4 */
++		case UINT32_C(0x53000040): /* Exynos-M5 */
+ 			isa->dot = true;
+ 			break;
++		case UINT32_C(0x4100D050): /* Cortex A55: revision 1 or later only */
++			isa->dot = !!(midr_get_variant(midr) >= 1);
++			break;
++		case UINT32_C(0x4100D0A0): /* Cortex A75: revision 2 or later only */
++			isa->dot = !!(midr_get_variant(midr) >= 2);
++			break;
+ 		default:
+ 			if (features & CPUINFO_ARM_LINUX_FEATURE_ASIMDDP) {
+ 				isa->dot = true;
+ 			}
+-			if (chipset->series == cpuinfo_arm_chipset_series_hisilicon_kirin && chipset->model == 980) {
+-				isa->dot = true;
+-			}
+ 			break;
+ 	}
+ 	if (features & CPUINFO_ARM_LINUX_FEATURE_JSCVT) {
+diff --git src/arm/linux/api.h src/arm/linux/api.h
+index f99da66..2597e49 100644
+--- src/arm/linux/api.h
++++ src/arm/linux/api.h
+@@ -11,6 +11,8 @@
+ 
+ /* No hard limit in the kernel, maximum length observed on non-rogue kernels is 64 */
+ #define CPUINFO_HARDWARE_VALUE_MAX 64
++/* No hard limit in the kernel, maximum length on Raspberry Pi is 8. Add 1 symbol to detect overly large revision strings */
++#define CPUINFO_REVISION_VALUE_MAX 9
+ 
+ #ifdef __ANDROID__
+ 	/* As per include/sys/system_properties.h in Android NDK */
+@@ -259,6 +261,7 @@ static inline bool cpuinfo_arm_linux_processor_not_equals(
+ 
+ CPUINFO_INTERNAL bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ 	char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++	char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ 	uint32_t max_processors_count,
+ 	struct cpuinfo_arm_linux_processor processors[restrict static max_processors_count]);
+ 
+@@ -297,6 +300,7 @@ CPUINFO_INTERNAL bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ 	CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ 		cpuinfo_arm_linux_decode_chipset(
+ 			const char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++			const char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ 			uint32_t cores,
+ 			uint32_t max_cpu_freq_max);
+ #endif
+@@ -327,6 +331,10 @@ CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ 	CPUINFO_INTERNAL struct cpuinfo_arm_chipset
+ 		cpuinfo_arm_android_decode_chipset_from_ro_hardware_chipname(
+ 			const char ro_hardware_chipname[restrict static CPUINFO_BUILD_PROP_VALUE_MAX]);
++#else
++	CPUINFO_INTERNAL struct cpuinfo_arm_chipset
++		cpuinfo_arm_linux_decode_chipset_from_proc_cpuinfo_revision(
++			const char proc_cpuinfo_revision[restrict static CPUINFO_REVISION_VALUE_MAX]);
+ #endif
+ 
+ CPUINFO_INTERNAL bool cpuinfo_arm_linux_detect_core_clusters_by_heuristic(
+diff --git src/arm/linux/chipset.c src/arm/linux/chipset.c
+index 35058d9..e36283c 100644
+--- src/arm/linux/chipset.c
++++ src/arm/linux/chipset.c
+@@ -1011,12 +1011,59 @@ write_chipset:
+ 	return true;
+ }
+ 
++/**
++ * Tries to match /BCM\d{4}$/ signature for Broadcom BCM chipsets.
++ * If match successful, extracts model information into \p chipset argument.
++ *
++ * @param start - start of the /proc/cpuinfo Hardware string to match.
++ * @param end - end of the /proc/cpuinfo Hardware string to match.
++ * @param[out] chipset - location where chipset information will be stored upon a successful match.
++ *
++ * @returns true if signature matched, false otherwise.
++ */
++static bool match_bcm(
++	const char* start, const char* end,
++	struct cpuinfo_arm_chipset chipset[restrict static 1])
++{
++	/* Expect exactly 7 symbols: "BCM" (3 symbols) + 4-digit model number */
++	if (start + 7 != end) {
++		return false;
++	}
++
++	/* Check that the string starts with "BCM".
++	 * The first three characters are loaded and compared as a 24-bit little endian word.
++	 */
++	const uint32_t expected_bcm = load_u24le(start);
++	if (expected_bcm != UINT32_C(0x004D4342) /* "MCB" = reverse("BCM") */) {
++		return false;
++	}
++
++	/* Validate and parse 4-digit model number */
++	uint32_t model = 0;
++	for (uint32_t i = 3; i < 7; i++) {
++		const uint32_t digit = (uint32_t) (uint8_t) start[i] - '0';
++		if (digit >= 10) {
++			/* Not really a digit */
++			return false;
++		}
++		model = model * 10 + digit;
++	}
++
++	/* Return parsed chipset. */
++	*chipset = (struct cpuinfo_arm_chipset) {
++		.vendor = cpuinfo_arm_chipset_vendor_broadcom,
++		.series = cpuinfo_arm_chipset_series_broadcom_bcm,
++		.model = model,
++	};
++	return true;
++}
++
+ /**
+  * Tries to match /OMAP\d{4}$/ signature for Texas Instruments OMAP chipsets.
+  * If match successful, extracts model information into \p chipset argument.
+  *
+  * @param start - start of the /proc/cpuinfo Hardware string to match.
+- * @param end - end of the /proc/cpuinfo Hardaware string to match.
++ * @param end - end of the /proc/cpuinfo Hardware string to match.
+  * @param[out] chipset - location where chipset information will be stored upon a successful match.
+  *
+  * @returns true if signature matched, false otherwise.
+@@ -2328,6 +2375,14 @@ struct cpuinfo_arm_chipset cpuinfo_arm_linux_decode_chipset_from_proc_cpuinfo_ha
+ 			return chipset;
+ 		}
+ 
++		/* Check Broadcom BCM signature */
++		if (match_bcm(hardware, hardware_end, &chipset)) {
++			cpuinfo_log_debug(
++				"matched Broadcom BCM signature in /proc/cpuinfo Hardware string \"%.*s\"",
++				(int) hardware_length, hardware);
++			return chipset;
++		}
++
+ 		#if CPUINFO_ARCH_ARM
+ 			/* Check Texas Instruments OMAP signature */
+ 			if (match_omap(hardware, hardware_end, &chipset)) {
+@@ -3713,6 +3768,62 @@ void cpuinfo_arm_chipset_to_string(
+ 		return chipset;
+ 	}
+ #else /* !defined(__ANDROID__) */
++	/*
++	 * Fix commonly misreported Broadcom BCM models on Raspberry Pi boards.
++	 *
++	 * @param[in,out] chipset - chipset name to fix.
++	 * @param[in] revision - /proc/cpuinfo Revision string.
++	 */
++	void cpuinfo_arm_fixup_raspberry_pi_chipset(
++		struct cpuinfo_arm_chipset chipset[restrict static 1],
++		const char revision[restrict static CPUINFO_HARDWARE_VALUE_MAX])
++	{
++		const size_t revision_length = strnlen(revision, CPUINFO_REVISION_VALUE_MAX);
++
++		/* Parse revision codes according to https://www.raspberrypi.org/documentation/hardware/raspberrypi/revision-codes/README.md */
++		#if CPUINFO_ARCH_ARM
++			if (revision_length == 4) {
++				/*
++				 * Old-style revision codes.
++				 * All Raspberry Pi models with old-style revision code use Broadcom BCM2835.
++				 */
++
++				/* BCM2835 often misreported as BCM2708 */
++				if (chipset->model == 2708) {
++					chipset->model = 2835;
++				}
++				return;
++			}
++		#endif
++		if ((size_t) (revision_length - 5) <= (size_t) (8 - 5) /* 5 <= length(revision) <= 8 */) {
++			/* New-style revision codes */
++
++			uint32_t model = 0;
++			switch (revision[revision_length - 4]) {
++				case '0':
++					/* BCM2835 */
++					model = 2835;
++					break;
++				case '1':
++					/* BCM2836 */
++					model = 2836;
++					break;
++				case '2':
++					/* BCM2837 */
++					model = 2837;
++					break;
++				case '3':
++					/* BCM2711 */
++					model = 2711;
++					break;
++			}
++
++			if (model != 0) {
++				chipset->model = model;
++				chipset->suffix[0] = 0;
++			}
++		}
++	}
+ 
+ 	/*
+ 	 * Decodes chipset name from /proc/cpuinfo Hardware string.
+@@ -3727,6 +3838,7 @@ void cpuinfo_arm_chipset_to_string(
+ 	 */
+ 	struct cpuinfo_arm_chipset cpuinfo_arm_linux_decode_chipset(
+ 		const char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++		const char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ 		uint32_t cores,
+ 		uint32_t max_cpu_freq_max)
+ 	{
+@@ -3736,6 +3848,9 @@ void cpuinfo_arm_chipset_to_string(
+ 		if (chipset.vendor == cpuinfo_arm_chipset_vendor_unknown) {
+ 			cpuinfo_log_warning(
+ 				"chipset detection failed: /proc/cpuinfo Hardware string did not match known signatures");
++		} else if (chipset.vendor == cpuinfo_arm_chipset_vendor_broadcom) {
++			/* Raspberry Pi kernel reports bogus chipset models; detect chipset from RPi revision */
++			cpuinfo_arm_fixup_raspberry_pi_chipset(&chipset, revision);
+ 		} else {
+ 			cpuinfo_arm_fixup_chipset(&chipset, cores, max_cpu_freq_max);
+ 		}
+diff --git src/arm/linux/cpuinfo.c src/arm/linux/cpuinfo.c
+index c70055f..90e1631 100644
+--- src/arm/linux/cpuinfo.c
++++ src/arm/linux/cpuinfo.c
+@@ -631,6 +631,7 @@ static void parse_cache_number(
+ 
+ struct proc_cpuinfo_parser_state {
+ 	char* hardware;
++	char* revision;
+ 	uint32_t processor_index;
+ 	uint32_t max_processors_count;
+ 	struct cpuinfo_arm_linux_processor* processors;
+@@ -791,7 +792,17 @@ static bool parse_line(
+ 				memcpy(state->hardware, value_start, value_length);
+ 				cpuinfo_log_debug("parsed /proc/cpuinfo Hardware = \"%.*s\"", (int) value_length, value_start);
+ 			} else if (memcmp(line_start, "Revision", key_length) == 0) {
+-				/* Board revision, no use for now */
++				size_t value_length = value_end - value_start;
++				if (value_length > CPUINFO_REVISION_VALUE_MAX) {
++					cpuinfo_log_info(
++						"length of Revision value \"%.*s\" in /proc/cpuinfo exceeds limit (%d): truncating to the limit",
++						(int) value_length, value_start, CPUINFO_REVISION_VALUE_MAX);
++					value_length = CPUINFO_REVISION_VALUE_MAX;
++				} else {
++					state->revision[value_length] = '\0';
++				}
++				memcpy(state->revision, value_start, value_length);
++				cpuinfo_log_debug("parsed /proc/cpuinfo Revision = \"%.*s\"", (int) value_length, value_start);
+ 			} else {
+ 				goto unknown;
+ 			}
+@@ -881,11 +892,13 @@ static bool parse_line(
+ 
+ bool cpuinfo_arm_linux_parse_proc_cpuinfo(
+ 	char hardware[restrict static CPUINFO_HARDWARE_VALUE_MAX],
++	char revision[restrict static CPUINFO_REVISION_VALUE_MAX],
+ 	uint32_t max_processors_count,
+ 	struct cpuinfo_arm_linux_processor processors[restrict static max_processors_count])
+ {
+ 	struct proc_cpuinfo_parser_state state = {
+ 		.hardware = hardware,
++		.revision = revision,
+ 		.processor_index = 0,
+ 		.max_processors_count = max_processors_count,
+ 		.processors = processors,
+diff --git src/arm/linux/init.c src/arm/linux/init.c
+index 6272abf..89d957e 100644
+--- src/arm/linux/init.c
++++ src/arm/linux/init.c
+@@ -167,8 +167,9 @@ void cpuinfo_arm_linux_init(void) {
+ 	struct cpuinfo_android_properties android_properties;
+ 	cpuinfo_arm_android_parse_properties(&android_properties);
+ #else
+-	char proc_cpuinfo_hardware[CPUINFO_HARDWARE_VALUE_MAX] = { 0 };
++	char proc_cpuinfo_hardware[CPUINFO_HARDWARE_VALUE_MAX];
+ #endif
++	char proc_cpuinfo_revision[CPUINFO_REVISION_VALUE_MAX];
+ 
+ 	if (!cpuinfo_arm_linux_parse_proc_cpuinfo(
+ #if defined(__ANDROID__)
+@@ -176,6 +177,7 @@ void cpuinfo_arm_linux_init(void) {
+ #else
+ 			proc_cpuinfo_hardware,
+ #endif
++			proc_cpuinfo_revision,
+ 			arm_linux_processors_count,
+ 			arm_linux_processors)) {
+ 		cpuinfo_log_error("failed to parse processor information from /proc/cpuinfo");
+@@ -228,10 +230,8 @@ void cpuinfo_arm_linux_init(void) {
+ 	const struct cpuinfo_arm_chipset chipset =
+ 		cpuinfo_arm_android_decode_chipset(&android_properties, valid_processors, 0);
+ #else
+-	const struct cpuinfo_arm_chipset chipset = {
+-		.vendor = cpuinfo_arm_chipset_vendor_unknown,
+-		.series = cpuinfo_arm_chipset_series_unknown,
+-	};
++	const struct cpuinfo_arm_chipset chipset =
++		cpuinfo_arm_linux_decode_chipset(proc_cpuinfo_hardware, proc_cpuinfo_revision, valid_processors, 0);
+ #endif
+ 
+ 	#if CPUINFO_ARCH_ARM
+diff --git test/arm-cache.cc test/arm-cache.cc
+index 7d2e4a4..4d6218b 100644
+--- test/arm-cache.cc
++++ test/arm-cache.cc
+@@ -1664,3 +1664,91 @@ TEST(ROCKCHIP, rk3368) {
+ 	EXPECT_EQ(256 * 1024, little_l2.size);
+ 	EXPECT_EQ(0, little_l3.size);
+ }
++
++TEST(BROADCOM, bcm2835) {
++	const struct cpuinfo_arm_chipset chipset = {
++		.vendor = cpuinfo_arm_chipset_vendor_broadcom,
++		.series = cpuinfo_arm_chipset_series_broadcom_bcm,
++		.model = 2835,
++	};
++
++	struct cpuinfo_cache l1i = { 0 };
++	struct cpuinfo_cache l1d = { 0 };
++	struct cpuinfo_cache l2 = { 0 };
++	struct cpuinfo_cache l3 = { 0 };
++	cpuinfo_arm_decode_cache(
++		cpuinfo_uarch_arm11, 4, UINT32_C(0x410FB767),
++		&chipset, 0, 4,
++		&l1i, &l1d, &l2, &l3);
++
++	EXPECT_EQ(16 * 1024, l1i.size);
++	EXPECT_EQ(16 * 1024, l1d.size);
++	EXPECT_EQ(0, l2.size);
++	EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2836) {
++	const struct cpuinfo_arm_chipset chipset = {
++		.vendor = cpuinfo_arm_chipset_vendor_broadcom,
++		.series = cpuinfo_arm_chipset_series_broadcom_bcm,
++		.model = 2836,
++	};
++
++	struct cpuinfo_cache l1i = { 0 };
++	struct cpuinfo_cache l1d = { 0 };
++	struct cpuinfo_cache l2 = { 0 };
++	struct cpuinfo_cache l3 = { 0 };
++	cpuinfo_arm_decode_cache(
++		cpuinfo_uarch_cortex_a7, 4, UINT32_C(0x410FC075),
++		&chipset, 0, 4,
++		&l1i, &l1d, &l2, &l3);
++
++	EXPECT_EQ(32 * 1024, l1i.size);
++	EXPECT_EQ(32 * 1024, l1d.size);
++	EXPECT_EQ(512 * 1024, l2.size);
++	EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2837) {
++	const struct cpuinfo_arm_chipset chipset = {
++		.vendor = cpuinfo_arm_chipset_vendor_broadcom,
++		.series = cpuinfo_arm_chipset_series_broadcom_bcm,
++		.model = 2837,
++	};
++
++	struct cpuinfo_cache l1i = { 0 };
++	struct cpuinfo_cache l1d = { 0 };
++	struct cpuinfo_cache l2 = { 0 };
++	struct cpuinfo_cache l3 = { 0 };
++	cpuinfo_arm_decode_cache(
++		cpuinfo_uarch_cortex_a53, 4, UINT32_C(0x410FD034),
++		&chipset, 0, 4,
++		&l1i, &l1d, &l2, &l3);
++
++	EXPECT_EQ(16 * 1024, l1i.size);
++	EXPECT_EQ(16 * 1024, l1d.size);
++	EXPECT_EQ(512 * 1024, l2.size);
++	EXPECT_EQ(0, big_l3.size);
++}
++
++TEST(BROADCOM, bcm2711) {
++	const struct cpuinfo_arm_chipset chipset = {
++		.vendor = cpuinfo_arm_chipset_vendor_broadcom,
++		.series = cpuinfo_arm_chipset_series_broadcom_bcm,
++		.model = 2711,
++	};
++
++	struct cpuinfo_cache l1i = { 0 };
++	struct cpuinfo_cache l1d = { 0 };
++	struct cpuinfo_cache l2 = { 0 };
++	struct cpuinfo_cache l3 = { 0 };
++	cpuinfo_arm_decode_cache(
++		cpuinfo_uarch_cortex_a72, 4, UINT32_C(0x410FD083),
++		&chipset, 0, 4,
++		&l1i, &l1d, &l2, &l3);
++
++	EXPECT_EQ(48 * 1024, l1i.size);
++	EXPECT_EQ(32 * 1024, l1d.size);
++	EXPECT_EQ(1024 * 1024, l2.size);
++	EXPECT_EQ(0, big_l3.size);
++}
+diff --git test/get-current.cc test/get-current.cc
+index f410b12..96b11dc 100644
+--- test/get-current.cc
++++ test/get-current.cc
+@@ -36,3 +36,9 @@ TEST(CURRENT_UARCH_INDEX, within_bounds) {
+ 
+ 	ASSERT_LT(cpuinfo_get_current_uarch_index(), cpuinfo_get_uarchs_count());
+ }
++
++TEST(CURRENT_UARCH_INDEX_WITH_DEFAULT, within_bounds) {
++	ASSERT_TRUE(cpuinfo_initialize());
++
++	ASSERT_LE(cpuinfo_get_current_uarch_index_with_default(cpuinfo_get_uarchs_count()), cpuinfo_get_uarchs_count());
++}