Added SSE2 variants for Average2/3/4

The predictors based on Average2 are tad slower.

Following is the performance data for these predictors normalized to
number of instruction cycles (as per valgrind) per operation:
- Predictor6 & Predictor7 now takes 15 instruction cycles compared to 11
  instruction cycles for the C version.
- Predictor8 & Predictor9 now takes 15 instruction cycles compared to 12
  instruction cycles for the C version.

The predictors based on Average4 is faster and Average3 is tad slower:
- Predictor10 (Average4) now takes 23 instruction cycles compared to 25
  instruction cycles for the C version.
- Predictor5 (Average3) now takes 20 instruction cycles compared to 18
  instruction cycles for the C version.

Maybe SSE2 version of Average2 can be improved further. Otherwise, we can
remove the SSE2 version and always fallback to the C version.

Change-Id: I388b2871919985bc28faaad37c1d4beeb20ba029
diff --git a/src/dsp/lossless_sse2.c b/src/dsp/lossless_sse2.c
index 0ac9beb..7130909 100644
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -76,8 +76,44 @@
   return (pa_minus_pb <= 0) ? a : b;
 }
 
-#if 0
-// TODO(skal): optimize these to SSE2 too.
+static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(A1, A0);
+  const __m128i avg = _mm_srli_epi16(sum, 1);
+  return avg;
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+  const __m128i avg = Average2_128i(a0, a1);
+  const __m128i A2 = _mm_packus_epi16(avg, avg);
+  const uint32_t output = _mm_cvtsi128_si32(A2);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i avg1 = Average2_128i(a0, a2);
+  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
+  const __m128i sum = _mm_add_epi16(avg1, A1);
+  const __m128i avg2 = _mm_srli_epi16(sum, 1);
+  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
+  const uint32_t output = _mm_cvtsi128_si32(A2);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  const __m128i avg1 = Average2_128i(a0, a1);
+  const __m128i avg2 = Average2_128i(a2, a3);
+  const __m128i sum = _mm_add_epi16(avg2, avg1);
+  const __m128i avg3 = _mm_srli_epi16(sum, 1);
+  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
+  const uint32_t output = _mm_cvtsi128_si32(A0);
+  return output;
+}
+
 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average3(left, top[0], top[1]);
   return pred;
@@ -104,8 +140,6 @@
   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
   return pred;
 }
-#endif
-
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
@@ -473,6 +507,12 @@
 
 void VP8LDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
+  VP8LPredictors[5] = Predictor5;
+  VP8LPredictors[6] = Predictor6;
+  VP8LPredictors[7] = Predictor7;
+  VP8LPredictors[8] = Predictor8;
+  VP8LPredictors[9] = Predictor9;
+  VP8LPredictors[10] = Predictor10;
   VP8LPredictors[11] = Predictor11;
   VP8LPredictors[12] = Predictor12;
   VP8LPredictors[13] = Predictor13;