blob: be4cd8685c5a13901e64406245c3b291f298ddda [file] [log] [blame]
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <smmintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
// Division using multiplication and shifting. The C implementation does:
// modifier *= 3;
// modifier /= index;
// where 'modifier' is a set of summed values and 'index' is the number of
// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
// which may be bound by the edges of the block being filtered.
//
// This equation works out to (m * 3) / i which reduces to:
// m * 3/4
// m * 1/2
// m * 1/3
//
// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
// m * C / 65536
// we can create a constant C which replicates the division.
//
// m * 49152 / 65536 = m * 3/4
// m * 32768 / 65536 = m * 1/2
// m * 21846 / 65536 = m * 0.3333
//
// These are loaded using an instruction expecting int16_t values but are used
// with _mm_mulhi_epu16(), which treats them as unsigned.
#define NEIGHBOR_CONSTANT_4 (int16_t)49152
#define NEIGHBOR_CONSTANT_6 (int16_t)32768
#define NEIGHBOR_CONSTANT_9 (int16_t)21846
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The results are *unsigned* 16 bit values, returned in 'sum' for sum_8()
// and in 'sum_0'/'sum_1' for sum_16().
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  // Widen the 8 input bytes of each row to 16-bit lanes.
  const __m128i a_bytes = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_bytes = _mm_loadl_epi64((const __m128i *)b);
  const __m128i a_words = _mm_cvtepu8_epi16(a_bytes);
  const __m128i b_words = _mm_cvtepu8_epi16(b_bytes);
  // Per-lane squared difference. 255^2 fits in uint16_t but not int16_t, so
  // everything from here on must be treated as unsigned.
  const __m128i diff = _mm_sub_epi16(a_words, b_words);
  const __m128i diff_sq = _mm_mullo_epi16(diff, diff);
  // Lane i also collects lanes i-1 and i+1; the byte shifts pull zeros in at
  // the row edges. Saturating adds are used: the filter only matters for
  // small pixel changes, so a value that saturates uint16_t is already far
  // outside the useful range.
  const __m128i from_left = _mm_slli_si128(diff_sq, 2);
  const __m128i from_right = _mm_srli_si128(diff_sq, 2);
  __m128i total = _mm_adds_epu16(diff_sq, from_left);
  total = _mm_adds_epu16(total, from_right);
  *sum = total;
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_bytes = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_bytes = _mm_loadu_si128((const __m128i *)b);
  // Widen each 16-byte row into a low half (lanes 0-7) and a high half
  // (lanes 8-15) of 16-bit values.
  const __m128i a_lo = _mm_cvtepu8_epi16(a_bytes);
  const __m128i a_hi = _mm_unpackhi_epi8(a_bytes, zero);
  const __m128i b_lo = _mm_cvtepu8_epi16(b_bytes);
  const __m128i b_hi = _mm_unpackhi_epi8(b_bytes, zero);
  // Squared differences; 255^2 exceeds int16_t so all subsequent adds are
  // unsigned and saturating.
  const __m128i diff_lo = _mm_sub_epi16(a_lo, b_lo);
  const __m128i diff_hi = _mm_sub_epi16(a_hi, b_hi);
  const __m128i sq_lo = _mm_mullo_epi16(diff_lo, diff_lo);
  const __m128i sq_hi = _mm_mullo_epi16(diff_hi, diff_hi);
  {
    // Low half: the right neighbor of lane 7 lives in sq_hi lane 0, so
    // "shift it in" with _mm_alignr_epi8().
    const __m128i from_left = _mm_slli_si128(sq_lo, 2);
    const __m128i from_right = _mm_alignr_epi8(sq_hi, sq_lo, 2);
    __m128i total = _mm_adds_epu16(sq_lo, from_left);
    *sum_0 = _mm_adds_epu16(total, from_right);
  }
  {
    // High half: the left neighbor of lane 8 is sq_lo lane 7.
    const __m128i from_left = _mm_alignr_epi8(sq_hi, sq_lo, 14);
    const __m128i from_right = _mm_srli_si128(sq_hi, 2);
    __m128i total = _mm_adds_epu16(sq_hi, from_left);
    *sum_1 = _mm_adds_epu16(total, from_right);
  }
}
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
const int strength, const int rounding,
const int weight) {
// _mm_srl_epi16 uses the lower 64 bit value for the shift.
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
// modifier * 3 / index;
sum = _mm_mulhi_epu16(sum, mul_constants);
sum = _mm_adds_epu16(sum, rounding_u16);
sum = _mm_srl_epi16(sum, strength_u128);
// The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
// >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
// So this needs to use the epu16 version which did not come until SSE4.
sum = _mm_min_epu16(sum, sixteen);
sum = _mm_sub_epi16(sixteen, sum);
return _mm_mullo_epi16(sum, weight_u16);
}
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
const __m128i mul_constants_0,
const __m128i mul_constants_1, const int strength,
const int rounding, const int weight) {
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
__m128i input_0, input_1;
input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
input_0 = _mm_adds_epu16(input_0, rounding_u16);
input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
input_1 = _mm_adds_epu16(input_1, rounding_u16);
input_0 = _mm_srl_epi16(input_0, strength_u128);
input_1 = _mm_srl_epi16(input_1, strength_u128);
input_0 = _mm_min_epu16(input_0, sixteen);
input_1 = _mm_min_epu16(input_1, sixteen);
input_0 = _mm_sub_epi16(sixteen, input_0);
input_1 = _mm_sub_epi16(sixteen, input_1);
*sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
*sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pred_bytes = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i pred_words = _mm_cvtepu8_epi16(pred_bytes);
  // count += sum, saturating.
  __m128i cnt = _mm_loadu_si128((const __m128i *)count);
  cnt = _mm_adds_epu16(cnt, sum_u16);
  _mm_storeu_si128((__m128i *)count, cnt);
  // accumulator += sum * pred, widened to 32 bits before the add.
  {
    const __m128i prod = _mm_mullo_epi16(sum_u16, pred_words);
    const __m128i prod_lo = _mm_cvtepu16_epi32(prod);
    const __m128i prod_hi = _mm_unpackhi_epi16(prod, zero);
    __m128i acc_lo = _mm_loadu_si128((const __m128i *)accumulator);
    __m128i acc_hi = _mm_loadu_si128((const __m128i *)(accumulator + 4));
    acc_lo = _mm_add_epi32(acc_lo, prod_lo);
    acc_hi = _mm_add_epi32(acc_hi, prod_hi);
    _mm_storeu_si128((__m128i *)accumulator, acc_lo);
    _mm_storeu_si128((__m128i *)(accumulator + 4), acc_hi);
  }
}
static void accumulate_and_store_16(const __m128i sum_0_u16,
                                    const __m128i sum_1_u16,
                                    const uint8_t *pred, uint16_t *count,
                                    uint32_t *accumulator) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pred_bytes = _mm_loadu_si128((const __m128i *)pred);
  // Widen the 16 prediction bytes to two vectors of 16-bit lanes.
  __m128i pred_lo = _mm_cvtepu8_epi16(pred_bytes);
  __m128i pred_hi = _mm_unpackhi_epi8(pred_bytes, zero);
  // count += sum, saturating, for both halves.
  __m128i cnt_lo = _mm_loadu_si128((const __m128i *)count);
  __m128i cnt_hi = _mm_loadu_si128((const __m128i *)(count + 8));
  cnt_lo = _mm_adds_epu16(cnt_lo, sum_0_u16);
  cnt_hi = _mm_adds_epu16(cnt_hi, sum_1_u16);
  _mm_storeu_si128((__m128i *)count, cnt_lo);
  _mm_storeu_si128((__m128i *)(count + 8), cnt_hi);
  // accumulator += sum * pred, widened to 32 bits before the add.
  pred_lo = _mm_mullo_epi16(sum_0_u16, pred_lo);
  pred_hi = _mm_mullo_epi16(sum_1_u16, pred_hi);
  {
    const __m128i p0 = _mm_cvtepu16_epi32(pred_lo);
    const __m128i p1 = _mm_unpackhi_epi16(pred_lo, zero);
    const __m128i p2 = _mm_cvtepu16_epi32(pred_hi);
    const __m128i p3 = _mm_unpackhi_epi16(pred_hi, zero);
    __m128i a0 = _mm_loadu_si128((const __m128i *)accumulator);
    __m128i a1 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
    __m128i a2 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
    __m128i a3 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
    a0 = _mm_add_epi32(a0, p0);
    a1 = _mm_add_epi32(a1, p1);
    a2 = _mm_add_epi32(a2, p2);
    a3 = _mm_add_epi32(a3, p3);
    _mm_storeu_si128((__m128i *)accumulator, a0);
    _mm_storeu_si128((__m128i *)(accumulator + 4), a1);
    _mm_storeu_si128((__m128i *)(accumulator + 8), a2);
    _mm_storeu_si128((__m128i *)(accumulator + 12), a3);
  }
}
// Temporal filter kernel. 'a' is the frame being compared (row pitch
// 'stride'); 'b' is the co-located block used as the prediction (row pitch
// 'width') — it is both differenced against 'a' and accumulated into
// 'accumulator'/'count' (both laid out with pitch 'width').
//
// The vertical pass keeps a rolling window of three per-row horizontal sums
// (sum_row_a = row above, sum_row_b = current, sum_row_c = row below). The
// mul_constants vectors encode 3/index per column: corners use
// NEIGHBOR_CONSTANT_4 (4 contributing values), other edges
// NEIGHBOR_CONSTANT_6, interior pixels NEIGHBOR_CONSTANT_9.
//
// NOTE(review): 'height' is unsigned and the main loop runs to height - 2,
// so this assumes height >= 2 — presumably guaranteed by the caller; confirm.
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  // Rounding term for the >> strength in average_8()/average_16().
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  assert(strength >= 0);
  assert(strength <= 6);
  assert(weight >= 0);
  assert(weight <= 2);
  assert(width == 8 || width == 16);
  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    // Top row: only rows 0 and 1 contribute, so corners sum 4 values and the
    // remaining columns sum 6.
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);
    // Advance 'a' past the two rows already summed; 'b' outputs one row.
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;
    // Interior rows: full 3x3 neighborhoods (9 values), 6 at the left/right
    // edges.
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      // Pull in the row below; rows a and b are already summed.
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a =
          average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);
      a += stride;
      b += width;
      count += width;
      accumulator += width;
      // Rotate the three-row window down one row.
      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }
    // Bottom row: like the top row, only two rows contribute.
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);
  } else {  // width == 16
    // Same structure as the width == 8 path, but each row is handled as two
    // 8-lane halves (suffixes _0 and _1).
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    // Top row: corners at lane 0 of half 0 and lane 7 of half 1.
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;
    // Interior rows: 9-value neighborhoods, 6 at the outer columns.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
      a += stride;
      b += width;
      count += width;
      accumulator += width;
      // Rotate the three-row window down one row.
      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }
    // Bottom row: two contributing rows again.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}