Upgrade libjpeg-turbo to ad8b3b0f84baf155f3bde5626c3bf9d20535bcae am: 3457af89e3 am: bbd5cf7276

Original change: https://android-review.googlesource.com/c/platform/external/libjpeg-turbo/+/1768605

Change-Id: I5064d98d2dfd5049768c9a62df93077db03a467b
diff --git a/ChangeLog.md b/ChangeLog.md
index 498b8f2..ca5208b 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,21 @@
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+
 2.1.0
 =====
 
@@ -284,15 +302,15 @@
 decompress some such images using `tjDecompressToYUV2()` or
 `tjDecompressToYUVPlanes()`.
 
-5. Fixed an issue, detected by ASan, whereby attempting to losslessly transform
-a specially-crafted malformed JPEG image containing an extremely-high-frequency
-coefficient block (junk image data that could never be generated by a
-legitimate JPEG compressor) could cause the Huffman encoder's local buffer to
-be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that the buffer
-overrun was fully contained within the stack and did not cause a segfault or
-other user-visible errant behavior, and given that the lossless transformer
-(unlike the decompressor) is not generally exposed to arbitrary data exploits,
-this issue did not likely pose a security risk.
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
 
 6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
 separate read-only data section rather than in the text section, to support
diff --git a/METADATA b/METADATA
index 9623a96..8d4fd50 100644
--- a/METADATA
+++ b/METADATA
@@ -5,11 +5,11 @@
     type: GIT
     value: "https://chromium.googlesource.com/chromium/deps/libjpeg_turbo"
   }
-  version: "b7bef8c05b7cdb1a038ae271a2c2b6647af4c879"
+  version: "ad8b3b0f84baf155f3bde5626c3bf9d20535bcae"
   license_type: NOTICE
   last_upgrade_date {
     year: 2021
     month: 7
-    day: 8
+    day: 15
   }
 }
diff --git a/README.chromium b/README.chromium
index d260cb2..de1fe85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libjpeg-turbo
 URL: https://github.com/libjpeg-turbo/libjpeg-turbo/
-Version: 2.1.0
+Version: b201838d8b5f2f80c9f86ec8405a62a002232b2c (post 2.1.0)
 License: Custom license
 License File: LICENSE.md
 Security Critical: yes
@@ -8,7 +8,7 @@
 
 Description:
 This consists of the components:
-* libjpeg-turbo 2.1.0
+* libjpeg-turbo b201838d8b5f2f80c9f86ec8405a62a002232b2c (post 2.1.0)
 * This file (README.chromium)
 * A build file (BUILD.gn)
 * An OWNERS file
diff --git a/jchuff.c b/jchuff.c
index 2bce767..8ff817b 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -44,8 +44,9 @@
  * flags (this defines __thumb__).
  */
 
-#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
-    defined(_M_ARM64)
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
diff --git a/jcmaster.c b/jcmaster.c
index 998dc40..c2b2600 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -493,7 +493,7 @@
     master->pass_type = output_pass;
     master->pass_number++;
 #endif
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
diff --git a/jconfigint.h b/jconfigint.h
index 4f23dff..cb9915c 100644
--- a/jconfigint.h
+++ b/jconfigint.h
@@ -52,3 +52,13 @@
 #define HAVEBITSCANFORWARD
 #endif
 #endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/jconfigint.h.in b/jconfigint.h.in
index 68cbc2a..d087d7b 100644
--- a/jconfigint.h.in
+++ b/jconfigint.h.in
@@ -32,3 +32,13 @@
 #define HAVE_BITSCANFORWARD
 #endif
 #endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH  __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/jcphuff.c b/jcphuff.c
index bd14fc2..9bf9612 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -52,8 +52,9 @@
  * flags (this defines __thumb__).
  */
 
-#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
-    defined(_M_ARM64)
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
diff --git a/jdapimin.c b/jdapimin.c
index 21a41d2..4609b13 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -23,6 +23,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdmaster.h"
+#include "jconfigint.h"
 
 
 /*
@@ -308,7 +309,7 @@
     /* Initialize application's data source module */
     (*cinfo->src->init_source) (cinfo);
     cinfo->global_state = DSTATE_INHEADER;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case DSTATE_INHEADER:
     retcode = (*cinfo->inputctl->consume_input) (cinfo);
     if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */
diff --git a/jdmainct.c b/jdmainct.c
index 50301d6..f466b25 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -18,6 +18,7 @@
 
 #include "jinclude.h"
 #include "jdmainct.h"
+#include "jconfigint.h"
 
 
 /*
@@ -360,7 +361,7 @@
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
       return;                   /* Postprocessor exactly filled output buf */
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
     main_ptr->rowgroup_ctr = 0;
@@ -371,7 +372,7 @@
     if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
       set_bottom_pointers(cinfo);
     main_ptr->context_state = CTX_PROCESS_IMCU;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
     (*cinfo->post->post_process_data) (cinfo,
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
index d4edd5e..2fbd252 100644
--- a/simd/arm/jchuff.h
+++ b/simd/arm/jchuff.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2018, D. R. Commander.
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
  * Copyright (C) 2018, Matthias Räncker.
  * Copyright (C) 2020-2021, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -74,6 +74,21 @@
 
 #else
 
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
 #define FLUSH() { \
   if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
     EMIT_BYTE(put_buffer >> 24) \
@@ -81,8 +96,7 @@
     EMIT_BYTE(put_buffer >>  8) \
     EMIT_BYTE(put_buffer      ) \
   } else { \
-    *((uint32_t *)buffer) = BUILTIN_BSWAP32(put_buffer); \
-    buffer += 4; \
+    SPLAT(); \
   } \
 }
 
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index 86a263f..b91c5db 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
@@ -105,18 +106,25 @@
     switch (remaining_coefs) {
     case 15:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -149,20 +157,28 @@
     switch (remaining_coefs) {
     case 8:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 7:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -337,18 +353,25 @@
     switch (remaining_coefs) {
     case 15:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -389,20 +412,28 @@
     switch (remaining_coefs) {
     case 8:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 7:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
diff --git a/simd/arm/jdcolext-neon.c b/simd/arm/jdcolext-neon.c
index ae440f4..c3c07a1 100644
--- a/simd/arm/jdcolext-neon.c
+++ b/simd/arm/jdcolext-neon.c
@@ -283,18 +283,25 @@
       switch (cols_remaining) {
       case 7:
         vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
@@ -308,18 +315,25 @@
       switch (cols_remaining) {
       case 7:
         vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
@@ -332,18 +346,25 @@
       switch (cols_remaining) {
       case 7:
         vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 6:
         vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 5:
         vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 4:
         vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 3:
         vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 2:
         vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
       case 1:
         vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
       default:
         break;
       }
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index 28dbc57..ea4668f 100644
--- a/simd/arm/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
index 18fb9d8..e4f91fd 100644
--- a/simd/arm/jdmerge-neon.c
+++ b/simd/arm/jdmerge-neon.c
@@ -21,6 +21,7 @@
  */
 
 #define JPEG_INTERNALS
+#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jdmrgext-neon.c b/simd/arm/jdmrgext-neon.c
index fa2ec05..5b89bdb 100644
--- a/simd/arm/jdmrgext-neon.c
+++ b/simd/arm/jdmrgext-neon.c
@@ -226,35 +226,49 @@
     switch (cols_remaining) {
     case 15:
       vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst4_u8(outptr, rgba_l);
       break;
     case 7:
       vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -271,35 +285,49 @@
     switch (cols_remaining) {
     case 15:
       vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst3_u8(outptr, rgb_l);
       break;
     case 7:
       vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -549,24 +577,31 @@
     case 15:
       vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
       vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
       vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
       vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
       vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
       vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
       vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
       vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst4_u8(outptr0, rgba0_l);
       vst4_u8(outptr1, rgba1_l);
@@ -574,24 +609,31 @@
     case 7:
       vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
       vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
       vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
       vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
       vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
       vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
       vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst4_lane_u8(outptr0, rgba0_l, 0);
       vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
@@ -616,24 +658,31 @@
     case 15:
       vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
       vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 14:
       vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
       vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 13:
       vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
       vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 12:
       vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
       vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 11:
       vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
       vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 10:
       vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
       vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 9:
       vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
       vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 8:
       vst3_u8(outptr0, rgb0_l);
       vst3_u8(outptr1, rgb1_l);
@@ -641,24 +690,31 @@
     case 7:
       vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
       vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 6:
       vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
       vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 5:
       vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
       vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 4:
       vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
       vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 3:
       vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
       vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 2:
       vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
       vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
     case 1:
       vst3_lane_u8(outptr0, rgb0_l, 0);
       vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
     default:
       break;
     }
diff --git a/simd/arm/neon-compat.h b/simd/arm/neon-compat.h
index 3d77527..73c57ae 100644
--- a/simd/arm/neon-compat.h
+++ b/simd/arm/neon-compat.h
@@ -29,12 +29,10 @@
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
 #define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
-#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
 #define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
 #define BUILTIN_CLZLL(x)  __builtin_clzll(x)
-#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
 #define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
 #else
 #error "Unknown compiler"
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
index 436c402..d403f22 100644
--- a/simd/arm/neon-compat.h.in
+++ b/simd/arm/neon-compat.h.in
@@ -27,12 +27,10 @@
 #if defined(_MSC_VER) && !defined(__clang__)
 #define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
 #define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
-#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
 #define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
 #elif defined(__clang__) || defined(__GNUC__)
 #define BUILTIN_CLZ(x)  __builtin_clz(x)
 #define BUILTIN_CLZLL(x)  __builtin_clzll(x)
-#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
 #define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
 #else
 #error "Unknown compiler"