blob: be4cd8685c5a13901e64406245c3b291f298ddda [file] [log] [blame]
/*
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <smmintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
// Division using multiplication and shifting. The C implementation does:
// modifier *= 3;
// modifier /= index;
// where 'modifier' is a set of summed values and 'index' is the number of
// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
// which may be bound by the edges of the block being filtered.
//
// This equation works out to (m * 3) / i which reduces to:
// m * 3/4
// m * 1/2
// m * 1/3
//
// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
// m * C / 65536
// we can create a constant C which replicates the division.
//
// m * 49152 / 65536 = m * 3/4
// m * 32768 / 65536 = m * 1/2
// m * 21846 / 65536 = m * 0.3333
//
// These are loaded using an instruction expecting int16_t values but are used
// with _mm_mulhi_epu16(), which treats them as unsigned.
#define NEIGHBOR_CONSTANT_4 (int16_t)49152
#define NEIGHBOR_CONSTANT_6 (int16_t)32768
#define NEIGHBOR_CONSTANT_9 (int16_t)21846
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The results are *unsigned* 16 bit values, returned in 'sum' for sum_8()
// and in 'sum_0'/'sum_1' for sum_16().
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  // Widen the 8 input bytes of each row to 16-bit lanes.
  const __m128i a_bytes = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_bytes = _mm_loadl_epi64((const __m128i *)b);
  const __m128i a_words = _mm_cvtepu8_epi16(a_bytes);
  const __m128i b_words = _mm_cvtepu8_epi16(b_bytes);
  // Per-lane squared difference. 255^2 fits in uint16_t but not int16_t, so
  // everything from here on must be treated as unsigned.
  const __m128i diff = _mm_sub_epi16(a_words, b_words);
  const __m128i diff_sq = _mm_mullo_epi16(diff, diff);
  // Lane i also collects lanes i-1 and i+1; the byte shifts pull zeros in at
  // the row edges. Saturating adds are used: the filter only matters for
  // small pixel changes, so a value that saturates uint16_t is already far
  // outside the useful range.
  const __m128i from_left = _mm_slli_si128(diff_sq, 2);
  const __m128i from_right = _mm_srli_si128(diff_sq, 2);
  __m128i total = _mm_adds_epu16(diff_sq, from_left);
  total = _mm_adds_epu16(total, from_right);
  *sum = total;
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_bytes = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_bytes = _mm_loadu_si128((const __m128i *)b);
  // Widen each 16-byte row into a low half (lanes 0-7) and a high half
  // (lanes 8-15) of 16-bit values.
  const __m128i a_lo = _mm_cvtepu8_epi16(a_bytes);
  const __m128i a_hi = _mm_unpackhi_epi8(a_bytes, zero);
  const __m128i b_lo = _mm_cvtepu8_epi16(b_bytes);
  const __m128i b_hi = _mm_unpackhi_epi8(b_bytes, zero);
  // Squared differences; 255^2 exceeds int16_t so all subsequent adds are
  // unsigned and saturating.
  const __m128i diff_lo = _mm_sub_epi16(a_lo, b_lo);
  const __m128i diff_hi = _mm_sub_epi16(a_hi, b_hi);
  const __m128i sq_lo = _mm_mullo_epi16(diff_lo, diff_lo);
  const __m128i sq_hi = _mm_mullo_epi16(diff_hi, diff_hi);
  {
    // Low half: the right neighbor of lane 7 lives in sq_hi lane 0, so
    // "shift it in" with _mm_alignr_epi8().
    const __m128i from_left = _mm_slli_si128(sq_lo, 2);
    const __m128i from_right = _mm_alignr_epi8(sq_hi, sq_lo, 2);
    __m128i total = _mm_adds_epu16(sq_lo, from_left);
    *sum_0 = _mm_adds_epu16(total, from_right);
  }
  {
    // High half: the left neighbor of lane 8 is sq_lo lane 7.
    const __m128i from_left = _mm_alignr_epi8(sq_hi, sq_lo, 14);
    const __m128i from_right = _mm_srli_si128(sq_hi, 2);
    __m128i total = _mm_adds_epu16(sq_hi, from_left);
    *sum_1 = _mm_adds_epu16(total, from_right);
  }
}
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
const int strength, const int rounding,
const int weight) {
// _mm_srl_epi16 uses the lower 64 bit value for the shift.
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
// modifier * 3 / index;
sum = _mm_mulhi_epu16(sum, mul_constants);
sum = _mm_adds_epu16(sum, rounding_u16);
sum = _mm_srl_epi16(sum, strength_u128);
// The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
// >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
// So this needs to use the epu16 version which did not come until SSE4.
sum = _mm_min_epu16(sum, sixteen);
sum = _mm_sub_epi16(sixteen, sum);
return _mm_mullo_epi16(sum, weight_u16);
}
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
const __m128i mul_constants_0,
const __m128i mul_constants_1, const int strength,
const int rounding, const int weight) {
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
__m128i input_0, input_1;
input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
input_0 = _mm_adds_epu16(input_0, rounding_u16);
input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
input_1 = _mm_adds_epu16(input_1, rounding_u16);
input_0 = _mm_srl_epi16(input_0, strength_u128);
input_1 = _mm_srl_epi16(input_1, strength_u128);
input_0 = _mm_min_epu16(input_0, sixteen);
input_1 = _mm_min_epu16(input_1, sixteen);
input_0 = _mm_sub_epi16(sixteen, input_0);
input_1 = _mm_sub_epi16(sixteen, input_1);
*sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
*sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pred_bytes = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i pred_words = _mm_cvtepu8_epi16(pred_bytes);
  // count += sum, saturating.
  __m128i cnt = _mm_loadu_si128((const __m128i *)count);
  cnt = _mm_adds_epu16(cnt, sum_u16);
  _mm_storeu_si128((__m128i *)count, cnt);
  // accumulator += sum * pred, widened to 32 bits before the add.
  {
    const __m128i prod = _mm_mullo_epi16(sum_u16, pred_words);
    const __m128i prod_lo = _mm_cvtepu16_epi32(prod);
    const __m128i prod_hi = _mm_unpackhi_epi16(prod, zero);
    __m128i acc_lo = _mm_loadu_si128((const __m128i *)accumulator);
    __m128i acc_hi = _mm_loadu_si128((const __m128i *)(accumulator + 4));
    acc_lo = _mm_add_epi32(acc_lo, prod_lo);
    acc_hi = _mm_add_epi32(acc_hi, prod_hi);
    _mm_storeu_si128((__m128i *)accumulator, acc_lo);
    _mm_storeu_si128((__m128i *)(accumulator + 4), acc_hi);
  }
}
static void accumulate_and_store_16(const __m128i sum_0_u16,
                                    const __m128i sum_1_u16,
                                    const uint8_t *pred, uint16_t *count,
                                    uint32_t *accumulator) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pred_bytes = _mm_loadu_si128((const __m128i *)pred);
  // Widen the 16 prediction bytes to two vectors of 16-bit lanes.
  __m128i pred_lo = _mm_cvtepu8_epi16(pred_bytes);
  __m128i pred_hi = _mm_unpackhi_epi8(pred_bytes, zero);
  // count += sum, saturating, for both halves.
  __m128i cnt_lo = _mm_loadu_si128((const __m128i *)count);
  __m128i cnt_hi = _mm_loadu_si128((const __m128i *)(count + 8));
  cnt_lo = _mm_adds_epu16(cnt_lo, sum_0_u16);
  cnt_hi = _mm_adds_epu16(cnt_hi, sum_1_u16);
  _mm_storeu_si128((__m128i *)count, cnt_lo);
  _mm_storeu_si128((__m128i *)(count + 8), cnt_hi);
  // accumulator += sum * pred, widened to 32 bits before the add.
  pred_lo = _mm_mullo_epi16(sum_0_u16, pred_lo);
  pred_hi = _mm_mullo_epi16(sum_1_u16, pred_hi);
  {
    const __m128i p0 = _mm_cvtepu16_epi32(pred_lo);
    const __m128i p1 = _mm_unpackhi_epi16(pred_lo, zero);
    const __m128i p2 = _mm_cvtepu16_epi32(pred_hi);
    const __m128i p3 = _mm_unpackhi_epi16(pred_hi, zero);
    __m128i a0 = _mm_loadu_si128((const __m128i *)accumulator);
    __m128i a1 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
    __m128i a2 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
    __m128i a3 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
    a0 = _mm_add_epi32(a0, p0);
    a1 = _mm_add_epi32(a1, p1);
    a2 = _mm_add_epi32(a2, p2);
    a3 = _mm_add_epi32(a3, p3);
    _mm_storeu_si128((__m128i *)accumulator, a0);
    _mm_storeu_si128((__m128i *)(accumulator + 4), a1);
    _mm_storeu_si128((__m128i *)(accumulator + 8), a2);
    _mm_storeu_si128((__m128i *)(accumulator + 12), a3);
  }
}
// Temporal filter kernel. 'a' is the frame being compared (row pitch
// 'stride'); 'b' is the co-located block used as the prediction (row pitch
// 'width') — it is both differenced against 'a' and accumulated into
// 'accumulator'/'count' (both laid out with pitch 'width').
//
// The vertical pass keeps a rolling window of three per-row horizontal sums
// (sum_row_a = row above, sum_row_b = current, sum_row_c = row below). The
// mul_constants vectors encode 3/index per column: corners use
// NEIGHBOR_CONSTANT_4 (4 contributing values), other edges
// NEIGHBOR_CONSTANT_6, interior pixels NEIGHBOR_CONSTANT_9.
//
// NOTE(review): 'height' is unsigned and the main loop runs to height - 2,
// so this assumes height >= 2 — presumably guaranteed by the caller; confirm.
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  // Rounding term for the >> strength in average_8()/average_16().
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
  assert(strength >= 0);
  assert(strength <= 6);
  assert(weight >= 0);
  assert(weight <= 2);
  assert(width == 8 || width == 16);
  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    // Top row: only rows 0 and 1 contribute, so corners sum 4 values and the
    // remaining columns sum 6.
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);
    // Advance 'a' past the two rows already summed; 'b' outputs one row.
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;
    // Interior rows: full 3x3 neighborhoods (9 values), 6 at the left/right
    // edges.
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      // Pull in the row below; rows a and b are already summed.
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a =
          average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);
      a += stride;
      b += width;
      count += width;
      accumulator += width;
      // Rotate the three-row window down one row.
      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }
    // Bottom row: like the top row, only two rows contribute.
    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);
  } else {  // width == 16
    // Same structure as the width == 8 path, but each row is handled as two
    // 8-lane halves (suffixes _0 and _1).
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    // Top row: corners at lane 0 of half 0 and lane 7 of half 1.
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;
    // Interior rows: 9-value neighborhoods, 6 at the outer columns.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
      a += stride;
      b += width;
      count += width;
      accumulator += width;
      // Rotate the three-row window down one row.
      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }
    // Bottom row: two contributing rows again.
    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}