Speedups for empty histograms.

When histograms are empty, it is easy to add them.
They should also not be considered when merging histograms
(it is a waste of CPU).
This does not change the compression performance,
just the speed.

Change-Id: I42c721ca0f9c5ea067e73b792aa3db6d5e71d01f
diff --git a/src/dsp/lossless_enc.c b/src/dsp/lossless_enc.c
index 6ec9e46..1408fbf 100644
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@@ -643,25 +643,56 @@
   for (i = 0; i < size; ++i) out[i] += a[i];
 }
 
+#define ADD(X, ARG, LEN) do {                                                  \
+  if (a->is_used_[X]) {                                                        \
+    if (b->is_used_[X]) {                                                      \
+      VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN));                          \
+    } else {                                                                   \
+      memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0]));           \
+    }                                                                          \
+  } else if (b->is_used_[X]) {                                                 \
+    memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0]));             \
+  } else {                                                                     \
+    memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0]));                      \
+  }                                                                            \
+} while (0)
+
+#define ADD_EQ(X, ARG, LEN) do {                                               \
+  if (a->is_used_[X]) {                                                        \
+    if (out->is_used_[X]) {                                                    \
+      VP8LAddVectorEq(a->ARG, out->ARG, (LEN));                                \
+    } else {                                                                   \
+      memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0]));           \
+    }                                                                          \
+  }                                                                            \
+} while (0)
+
 void VP8LHistogramAdd(const VP8LHistogram* const a,
                       const VP8LHistogram* const b, VP8LHistogram* const out) {
+  int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
+
   if (b != out) {
-    VP8LAddVector(a->literal_, b->literal_, out->literal_, literal_size);
-    VP8LAddVector(a->distance_, b->distance_, out->distance_,
-                  NUM_DISTANCE_CODES);
-    VP8LAddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
-    VP8LAddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
-    VP8LAddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    ADD(0, literal_, literal_size);
+    ADD(1, red_, NUM_LITERAL_CODES);
+    ADD(2, blue_, NUM_LITERAL_CODES);
+    ADD(3, alpha_, NUM_LITERAL_CODES);
+    ADD(4, distance_, NUM_DISTANCE_CODES);
+    for (i = 0; i < 5; ++i) {
+      out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
+    }
   } else {
-    VP8LAddVectorEq(a->literal_, out->literal_, literal_size);
-    VP8LAddVectorEq(a->distance_, out->distance_, NUM_DISTANCE_CODES);
-    VP8LAddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
-    VP8LAddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
-    VP8LAddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    ADD_EQ(0, literal_, literal_size);
+    ADD_EQ(1, red_, NUM_LITERAL_CODES);
+    ADD_EQ(2, blue_, NUM_LITERAL_CODES);
+    ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
+    ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
+    for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
   }
 }
+#undef ADD
+#undef ADD_EQ
 
 //------------------------------------------------------------------------------
 // Image transforms.
diff --git a/src/enc/histogram_enc.c b/src/enc/histogram_enc.c
index 9fdbc62..c988b4d 100644
--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@@ -51,10 +51,12 @@
                           VP8LHistogram* const dst) {
   uint32_t* const dst_literal = dst->literal_;
   const int dst_cache_bits = dst->palette_code_bits_;
+  const int literal_size = VP8LHistogramNumCodes(dst_cache_bits);
   const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
   assert(src->palette_code_bits_ == dst_cache_bits);
   memcpy(dst, src, histo_size);
   dst->literal_ = dst_literal;
+  memcpy(dst->literal_, src->literal_, literal_size * sizeof(*dst->literal_));
 }
 
 int VP8LGetHistogramSize(int cache_bits) {
@@ -237,7 +239,8 @@
 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
 static double PopulationCost(const uint32_t* const population, int length,
-                             uint32_t* const trivial_sym) {
+                             uint32_t* const trivial_sym,
+                             uint8_t* const is_used) {
   VP8LBitEntropy bit_entropy;
   VP8LStreaks stats;
   VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
@@ -245,6 +248,8 @@
     *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
                                                : VP8L_NON_TRIVIAL_SYM;
   }
+  // The histogram is used if there is at least one non-zero streak.
+  *is_used = (stats.streaks[1][0] != 0 || stats.streaks[1][1] != 0);
 
   return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
@@ -253,7 +258,9 @@
 // non-zero: both the zero-th one, or both the last one.
 static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
                                              const uint32_t* const Y,
-                                             int length, int trivial_at_end) {
+                                             int length, int is_X_used,
+                                             int is_Y_used,
+                                             int trivial_at_end) {
   VP8LStreaks stats;
   if (trivial_at_end) {
     // This configuration is due to palettization that transforms an indexed
@@ -262,28 +269,43 @@
     // Only FinalHuffmanCost needs to be evaluated.
     memset(&stats, 0, sizeof(stats));
     // Deal with the non-zero value at index 0 or length-1.
-    stats.streaks[1][0] += 1;
+    stats.streaks[1][0] = 1;
     // Deal with the following/previous zero streak.
-    stats.counts[0] += 1;
-    stats.streaks[0][1] += length - 1;
+    stats.counts[0] = 1;
+    stats.streaks[0][1] = length - 1;
     return FinalHuffmanCost(&stats);
   } else {
     VP8LBitEntropy bit_entropy;
-    VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+    if (is_X_used) {
+      if (is_Y_used) {
+        VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+      } else {
+        VP8LGetEntropyUnrefined(X, length, &bit_entropy, &stats);
+      }
+    } else {
+      if (is_Y_used) {
+        VP8LGetEntropyUnrefined(Y, length, &bit_entropy, &stats);
+      } else {
+        memset(&stats, 0, sizeof(stats));
+        stats.counts[0] = 1;
+        stats.streaks[0][length > 3] = length;
+        VP8LBitEntropyInit(&bit_entropy);
+      }
+    }
 
     return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
   }
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
+double VP8LHistogramEstimateBits(VP8LHistogram* const p) {
   return
-      PopulationCost(
-          p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL)
-      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL)
-      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL)
-      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL)
-      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL)
+      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_),
+                     NULL, &p->is_used_[0])
+      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL, &p->is_used_[1])
+      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL, &p->is_used_[2])
+      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL, &p->is_used_[3])
+      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL, &p->is_used_[4])
       + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
       + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
@@ -299,7 +321,8 @@
   int trivial_at_end = 0;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   *cost += GetCombinedEntropy(a->literal_, b->literal_,
-                              VP8LHistogramNumCodes(palette_code_bits), 0);
+                              VP8LHistogramNumCodes(palette_code_bits),
+                              a->is_used_[0], b->is_used_[0], 0);
   *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
                                  b->literal_ + NUM_LITERAL_CODES,
                                  NUM_LENGTH_CODES);
@@ -319,19 +342,23 @@
   }
 
   *cost +=
-      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end);
+      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, a->is_used_[1],
+                         b->is_used_[1], trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
   *cost +=
-      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, trivial_at_end);
-  if (*cost > cost_threshold) return 0;
-
-  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
-                              trivial_at_end);
+      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, a->is_used_[2],
+                         b->is_used_[2], trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
   *cost +=
-      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0);
+      GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+                         a->is_used_[3], b->is_used_[3], trivial_at_end);
+  if (*cost > cost_threshold) return 0;
+
+  *cost +=
+      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES,
+                         a->is_used_[4], b->is_used_[4], 0);
   *cost +=
       VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
@@ -419,16 +446,19 @@
 static void UpdateHistogramCost(VP8LHistogram* const h) {
   uint32_t alpha_sym, red_sym, blue_sym;
   const double alpha_cost =
-      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym);
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym,
+                     &h->is_used_[3]);
   const double distance_cost =
-      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) +
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL, &h->is_used_[4]) +
       VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
-  h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) +
-                     VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
-                                   NUM_LENGTH_CODES);
-  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym);
-  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym);
+  h->literal_cost_ =
+      PopulationCost(h->literal_, num_codes, NULL, &h->is_used_[0]) +
+          VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES);
+  h->red_cost_ =
+      PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym, &h->is_used_[1]);
+  h->blue_cost_ =
+      PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym, &h->is_used_[2]);
   h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
                  alpha_cost + distance_cost;
   if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {
@@ -493,11 +523,19 @@
   const int histo_size = orig_histo->size;
   VP8LHistogram** const orig_histograms = orig_histo->histograms;
   VP8LHistogram** const histograms = image_histo->histograms;
+  image_histo->size = 0;
   for (i = 0; i < histo_size; ++i) {
     VP8LHistogram* const histo = orig_histograms[i];
     UpdateHistogramCost(histo);
+
+    // Skip the histogram if it is completely empty, which can happen for tiles
+    // with no information (when they are skipped because of LZ77).
+    if (!histo->is_used_[0] && !histo->is_used_[1] && !histo->is_used_[2]
+        && !histo->is_used_[3] && !histo->is_used_[4]) {
+      continue;
+    }
     // Copy histograms from orig_histo[] to image_histo[].
-    HistogramCopy(histo, histograms[i]);
+    HistogramCopy(histo, histograms[image_histo->size++]);
   }
 }
 
@@ -987,8 +1025,7 @@
   // histograms of small sizes (as bin_map will be very sparse) and
   // maximum quality q==100 (to preserve the compression gains at that level).
   const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
-  const int entropy_combine =
-      (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
+  int entropy_combine;
 
   if (orig_histo == NULL) goto Error;
 
@@ -996,15 +1033,16 @@
   HistogramBuild(xsize, histo_bits, refs, orig_histo);
   // Copies the histograms and computes its bit_cost.
   HistogramCopyAndAnalyze(orig_histo, image_histo);
-
+  entropy_combine =
+      (image_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
   if (entropy_combine) {
-    const int bin_map_size = orig_histo->size;
+    const int bin_map_size = image_histo->size;
     // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
     uint16_t* const bin_map = histogram_symbols;
     const double combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
 
-    HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
+    HistogramAnalyzeEntropyBin(image_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
     HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size,
                                entropy_combine_num_bins, combine_cost_factor,
diff --git a/src/enc/histogram_enc.h b/src/enc/histogram_enc.h
index e8c4c83..67d8253 100644
--- a/src/enc/histogram_enc.h
+++ b/src/enc/histogram_enc.h
@@ -44,6 +44,7 @@
   double literal_cost_;      // Cached values of dominant entropy costs:
   double red_cost_;          // literal, red & blue.
   double blue_cost_;
+  uint8_t is_used_[5];       // 5 for literal, red, blue, alpha, distance
 } VP8LHistogram;
 
 // Collection of histograms with fixed capacity, allocated as one
@@ -113,7 +114,7 @@
 
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+double VP8LHistogramEstimateBits(VP8LHistogram* const p);
 
 #ifdef __cplusplus
 }