avoid rgb-premultiply if there are only trivial alpha values

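With this, MODE_rgbA can safely be used without a speed penalty,
even in the case of pure-lossy, alpha-less input.
It's also an optimization when cropping a fully-opaque region from
an image with alpha: the premultiply is then skipped.
Alpha is considered trivial when every emitted value is fully opaque
(0xff, or 0x0f for the 4444 modes).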

Change-Id: Ibee28c75744f193dacdfccd5a2e7cd1e44604db6
diff --git a/src/dec/io.c b/src/dec/io.c
index 2c75f78..b90f6c5 100644
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -196,6 +196,7 @@
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     int start_y = io->mb_y;
     int num_rows = mb_h;
+    uint32_t alpha_mask = 0xff;
 
     // We compensate for the 1-line delay of fancy upscaler.
     // This is similar to EmitFancyRGB().
@@ -219,11 +220,16 @@
       uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
       uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
       for (j = 0; j < num_rows; ++j) {
-        for (i = 0; i < mb_w; ++i) dst[4 * i] = alpha[i];
+        for (i = 0; i < mb_w; ++i) {
+          const uint32_t alpha_value = alpha[i];
+          dst[4 * i] = alpha_value;
+          alpha_mask &= alpha_value;
+        }
         alpha += io->width;
         dst += buf->stride;
       }
-      if (WebPIsPremultipliedMode(colorspace)) {
+      // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+      if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
         WebPApplyAlphaMultiply(base_rgba, alpha_first,
                                mb_w, num_rows, buf->stride);
       }
@@ -241,16 +247,18 @@
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
     uint8_t* const base_rgba = buf->rgba + io->mb_y * buf->stride;
     uint8_t* alpha_dst = base_rgba + 1;
+    uint32_t alpha_mask = 0x0f;
     for (j = 0; j < mb_h; ++j) {
       for (i = 0; i < mb_w; ++i) {
         // Fill in the alpha value (converted to 4 bits).
-        const uint32_t alpha_val = VP8Clip4Bits(alpha[i]);
-        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+        const uint32_t alpha_value = VP8Clip4Bits(alpha[i]);
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
       }
       alpha += io->width;
       alpha_dst += buf->stride;
     }
-    if (p->output->colorspace == MODE_rgbA_4444) {
+    if (alpha_mask != 0x0f && p->output->colorspace == MODE_rgbA_4444) {
       WebPApplyAlphaMultiply4444(base_rgba, mb_w, mb_h, buf->stride);
     }
   }
@@ -396,17 +404,22 @@
   uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
   int num_lines_out = 0;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
   const int width = p->scaler_a.dst_width;
 
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
     assert(p->last_y + y_pos + num_lines_out < p->output->height);
     WebPRescalerExportRow(&p->scaler_a);
-    for (i = 0; i < width; ++i) dst[4 * i] = p->scaler_a.dst[i];
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
     dst += buf->stride;
     ++num_lines_out;
   }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0xff) {
     WebPApplyAlphaMultiply(base_rgba, alpha_first,
                            width, num_lines_out, buf->stride);
   }
@@ -421,6 +434,7 @@
   const WEBP_CSP_MODE colorspace = p->output->colorspace;
   const int width = p->scaler_a.dst_width;
   const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
 
   while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
     int i;
@@ -428,13 +442,14 @@
     WebPRescalerExportRow(&p->scaler_a);
     for (i = 0; i < width; ++i) {
       // Fill in the alpha value (converted to 4 bits).
-      const uint32_t alpha_val = VP8Clip4Bits(p->scaler_a.dst[i]);
-      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_val;
+      const uint32_t alpha_value = VP8Clip4Bits(p->scaler_a.dst[i]);
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
     }
     alpha_dst += buf->stride;
     ++num_lines_out;
   }
-  if (is_premult_alpha) {
+  if (is_premult_alpha && alpha_mask != 0x0f) {
     WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
   }
   return num_lines_out;