Snap for 8562061 from a4ec35f609b902eee833a485306c5a7beefac4da to mainline-media-release

Change-Id: I9896e3ff7f89dc8a15855b13a87186410ddbe586
diff --git a/Android.bp b/Android.bp
index 5e0d28f..02e2d11 100644
--- a/Android.bp
+++ b/Android.bp
@@ -128,7 +128,7 @@
                 "simd/arm/jidctred-neon.c",
                 "simd/arm/jquanti-neon.c",
             ],
-            cflags: ["-DNEON_INTRINSICS"],
+            cflags: ["-DNEON_INTRINSICS", "-mfpu=neon"],
             local_include_dirs: ["simd/arm"],
         },
         arm64: {
diff --git a/BUILD.gn b/BUILD.gn
index d980299..d566340 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -137,22 +137,15 @@
 
 static_library("simd") {
   include_dirs = [ "." ]
-  deps = [
-    ":libjpeg_headers",
-  ]
+  deps = [ ":libjpeg_headers" ]
 
   if (current_cpu == "x86") {
     deps += [ ":simd_asm" ]
-    sources = [
-      "simd/i386/jsimd.c",
-    ]
+    sources = [ "simd/i386/jsimd.c" ]
   } else if (current_cpu == "x64") {
     deps += [ ":simd_asm" ]
-    sources = [
-      "simd/x86_64/jsimd.c",
-    ]
-  } else if ((current_cpu == "arm" || current_cpu == "arm64") &&
-             arm_use_neon) {
+    sources = [ "simd/x86_64/jsimd.c" ]
+  } else if ((current_cpu == "arm" || current_cpu == "arm64") && arm_use_neon) {
     include_dirs += [ "simd/arm/" ]
 
     sources = [
@@ -175,23 +168,19 @@
         "simd/arm/aarch32/jchuff-neon.c",
         "simd/arm/aarch32/jsimd.c",
       ]
-    } else if (current_cpu == "arm64"){
+    } else if (current_cpu == "arm64") {
       sources += [
         "simd/arm/aarch64/jchuff-neon.c",
         "simd/arm/aarch64/jsimd.c",
       ]
     }
 
-    defines = [
-      "NEON_INTRINSICS"
-    ]
+    defines = [ "NEON_INTRINSICS" ]
 
     configs -= [ "//build/config/compiler:default_optimization" ]
     configs += [ "//build/config/compiler:optimize_speed" ]
   } else {
-    sources = [
-      "jsimd_none.c",
-    ]
+    sources = [ "jsimd_none.c" ]
   }
 
   if (is_win) {
@@ -264,9 +253,7 @@
   configs += [ ":libjpeg_config" ]
 
   public_configs = [ ":libjpeg_config" ]
-  public_deps = [
-    ":libjpeg_headers",
-  ]
+  public_deps = [ ":libjpeg_headers" ]
 
   # MemorySanitizer doesn't support assembly code, so keep it disabled in x86
   # and x64 MSan builds for now.
@@ -276,19 +263,19 @@
     public_deps += [ ":simd" ]
 
     if ((current_cpu == "arm" || current_cpu == "arm64") && arm_use_neon) {
-      defines += [ "NEON_INTRINSICS", ]
+      defines += [ "NEON_INTRINSICS" ]
     }
   }
 }
 
 static_library("turbojpeg") {
   sources = [
-    "turbojpeg.c",
-    "transupp.c",
     "jdatadst-tj.c",
     "jdatasrc-tj.c",
     "rdbmp.c",
     "rdppm.c",
+    "transupp.c",
+    "turbojpeg.c",
     "wrbmp.c",
     "wrppm.c",
   ]
@@ -302,9 +289,7 @@
   configs += [ ":libjpeg_config" ]
 
   public_configs = [ ":libjpeg_config" ]
-  public_deps = [
-    ":libjpeg",
-  ]
+  public_deps = [ ":libjpeg" ]
 }
 
 if (build_with_chromium) {
@@ -326,12 +311,12 @@
       "jpegtran.c",
       "md5/md5.c",
       "md5/md5hl.c",
-      "tjbench.c",
-      "tjunittest.c",
-      "tjutil.c",
       "rdcolmap.c",
       "rdgif.c",
       "rdswitch.c",
+      "tjbench.c",
+      "tjunittest.c",
+      "tjutil.c",
     ]
 
     deps = [
@@ -341,9 +326,7 @@
       "//testing/gtest:gtest_main",
     ]
 
-    data = [
-      "testimages/"
-    ]
+    data = [ "testimages/" ]
 
     defines = [
       "GTEST",
diff --git a/ChangeLog.md b/ChangeLog.md
index 498b8f2..ca5208b 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,21 @@
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+
 2.1.0
 =====
 
@@ -284,15 +302,15 @@
 decompress some such images using `tjDecompressToYUV2()` or
 `tjDecompressToYUVPlanes()`.
 
-5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
-a specially-crafted malformed JPEG image containing an extremely-high-frequency
-coefficient block (junk image data that could never be generated by a
-legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
-be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that the buffer
-overrun was fully contained within the stack and did not cause a segfault or
-other user-visible errant behavior, and given that the lossless transformer
-(unlike the decompressor) is not generally exposed to arbitrary data exploits,
-this issue did not likely pose a security risk.
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
 
 6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
 separate read-only data section rather than in the text section, to support
diff --git a/METADATA b/METADATA
index f8f0c4c..afa6cd7 100644
--- a/METADATA
+++ b/METADATA
@@ -5,11 +5,14 @@
     type: GIT
     value: "https://chromium.googlesource.com/chromium/deps/libjpeg_turbo"
   }
-  version: "764c5fca09cb558f88f3145d67b705f92ffee2d9"
+  version: "ad8b3b0f84baf155f3bde5626c3bf9d20535bcae"
   license_type: NOTICE
   last_upgrade_date {
     year: 2021
-    month: 2
-    day: 9
+    month: 7
+    day: 15
+  }
+  security {
+      tag: "NVD-CPE2.3:cpe:/a:libjpeg-turbo:libjpeg-turbo:2.1.0"
   }
 }
diff --git a/README.chromium b/README.chromium
index d260cb2..de1fe85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libjpeg-turbo
 URL: https://github.com/libjpeg-turbo/libjpeg-turbo/
-Version: 2.1.0
+Version: b201838d8b5f2f80c9f86ec8405a62a002232b2c (post 2.1.0)
 License: Custom license
 License File: LICENSE.md
 Security Critical: yes
@@ -8,7 +8,7 @@
 
 Description:
 This consists of the components:
-* libjpeg-turbo 2.1.0
+* libjpeg-turbo b201838d8b5f2f80c9f86ec8405a62a002232b2c (post 2.1.0)
 * This file (README.chromium)
 * A build file (BUILD.gn)
 * An OWNERS file
diff --git a/jchuff.c b/jchuff.c
index 2bce767..8ff817b 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -44,8 +44,9 @@
  * flags (this defines __thumb__).
  */
 
-#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
-    defined(_M_ARM64)
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
diff --git a/jcmaster.c b/jcmaster.c
index 998dc40..c2b2600 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -493,7 +493,7 @@
     master->pass_type = output_pass;
     master->pass_number++;
 #endif
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
diff --git a/jconfigint.h b/jconfigint.h
index 4f23dff..cb9915c 100644
--- a/jconfigint.h
+++ b/jconfigint.h
@@ -52,3 +52,13 @@
 #define HAVEBITSCANFORWARD
 #endif
 #endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/jconfigint.h.in b/jconfigint.h.in
index 68cbc2a..d087d7b 100644
--- a/jconfigint.h.in
+++ b/jconfigint.h.in
@@ -32,3 +32,13 @@
 #define HAVE_BITSCANFORWARD
 #endif
 #endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH  __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/jcphuff.c b/jcphuff.c
index bd14fc2..9bf9612 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -52,8 +52,9 @@
  * flags (this defines __thumb__).
  */
 
-#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
-    defined(_M_ARM64)
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
diff --git a/jdapimin.c b/jdapimin.c
index 21a41d2..4609b13 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -23,6 +23,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdmaster.h"
+#include "jconfigint.h"
 
 
 /*
@@ -308,7 +309,7 @@
     /* Initialize application's data source module */
     (*cinfo->src->init_source) (cinfo);
     cinfo->global_state = DSTATE_INHEADER;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case DSTATE_INHEADER:
     retcode = (*cinfo->inputctl->consume_input) (cinfo);
     if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */
diff --git a/jdmainct.c b/jdmainct.c
index 50301d6..f466b25 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -18,6 +18,7 @@
 
 #include "jinclude.h"
 #include "jdmainct.h"
+#include "jconfigint.h"
 
 
 /*
@@ -360,7 +361,7 @@
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
       return;                   /* Postprocessor exactly filled output buf */
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
     main_ptr->rowgroup_ctr = 0;
@@ -371,7 +372,7 @@
     if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
       set_bottom_pointers(cinfo);
     main_ptr->context_state = CTX_PROCESS_IMCU;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
     (*cinfo->post->post_process_data) (cinfo,
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index d4edd5e..2fbd252 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2018, D. R. Commander.
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
  * Copyright (C) 2018, Matthias Räncker.
  * Copyright (C) 2020-2021, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -74,6 +74,21 @@
 
 #else
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
 #define FLUSH() { \
   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
     EMIT_BYTE(put_buffer >> 24) \
@@ -81,8 +96,7 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    *((uint32_t *)buffer) = BUILTIN_BSWAP32(put_buffer); \
-    buffer += 4; \
+    SPLAT(); \
   } \
 }
 
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index 86a263f..b91c5db 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
@@ -105,18 +106,25 @@
     switch (remaining_coefs) {
     case 15:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -149,20 +157,28 @@
     switch (remaining_coefs) {
     case 8:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 7:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -337,18 +353,25 @@
     switch (remaining_coefs) {
     case 15:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -389,20 +412,28 @@
     switch (remaining_coefs) {
     case 8:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 7:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
diff --git a/simd/arm/jdcolext-neon.c b/simd/arm/jdcolext-neon.c
index ae440f4..c3c07a1 100644
--- a/simd/arm/jdcolext-neon.c
+++ b/simd/arm/jdcolext-neon.c
@@ -283,18 +283,25 @@
       switch (cols_remaining) {
       case 7:
         vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
@@ -308,18 +315,25 @@
       switch (cols_remaining) {
       case 7:
         vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
@@ -332,18 +346,25 @@
       switch (cols_remaining) {
       case 7:
         vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index 28dbc57..ea4668f 100644
--- a/simd/arm/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
index 18fb9d8..e4f91fd 100644
--- a/simd/arm/jdmerge-neon.c
+++ b/simd/arm/jdmerge-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jdmrgext-neon.c b/simd/arm/jdmrgext-neon.c
index fa2ec05..5b89bdb 100644
--- a/simd/arm/jdmrgext-neon.c
+++ b/simd/arm/jdmrgext-neon.c
@@ -226,35 +226,49 @@
     switch (cols_remaining) {
     case 15:
       vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst4_u8(outptr, rgba_l);
       break;
     case 7:
       vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -271,35 +285,49 @@
     switch (cols_remaining) {
     case 15:
       vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst3_u8(outptr, rgb_l);
       break;
     case 7:
       vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -549,24 +577,31 @@
     case 15:
       vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
       vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
       vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
       vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
       vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
       vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
       vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
       vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst4_u8(outptr0, rgba0_l);
       vst4_u8(outptr1, rgba1_l);
@@ -574,24 +609,31 @@
     case 7:
       vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
       vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
       vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
       vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
       vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
       vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
       vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst4_lane_u8(outptr0, rgba0_l, 0);
       vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -616,24 +658,31 @@
     case 15:
       vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
       vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
       vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
       vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
       vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
       vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
       vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
       vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst3_u8(outptr0, rgb0_l);
       vst3_u8(outptr1, rgb1_l);
@@ -641,24 +690,31 @@
     case 7:
       vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
       vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
       vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
       vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
       vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
       vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
       vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst3_lane_u8(outptr0, rgb0_l, 0);
       vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
diff --git a/simd/arm/neon-compat.h b/simd/arm/neon-compat.h
index 3d77527..73c57ae 100644
--- a/simd/arm/neon-compat.h
+++ b/simd/arm/neon-compat.h
@@ -29,12 +29,10 @@
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
 #define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
-#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
 #define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
 #define BUILTIN_CLZLL(x)  __builtin_clzll(x)
-#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
 #define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
 #else
 #error "Unknown compiler"
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 436c402..d403f22 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -27,12 +27,10 @@
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
 #define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
-#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
 #define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
 #define BUILTIN_CLZLL(x)  __builtin_clzll(x)
-#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
 #define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
 #else
 #error "Unknown compiler"