Add SSE2 optimizations for the lpf 16_dual implementations.
This covers the horizontal and vertical variants,
for both low and high bitdepth types.
The corresponding tests are enabled.
Performance changes, SSE2 over C:
Horizontal methods: up to 3x
Vertical methods: up to 2x
Change-Id: If430a916394c7befa743e4fbaa9913fd37c535ed
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ea05415..3133b4e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -431,6 +431,7 @@
specialize qw/aom_lpf_vertical_16 sse2/;
add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_16_dual sse2/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_6 sse2/;
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index 2f2d0ce..9072a5e 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -835,7 +835,7 @@
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
}
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
index ecbee3f..623b0f0 100644
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -434,6 +434,7 @@
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
+ highbd_lpf_horz_edge_8_4p(s + 4, p, _blimit, _limit, _thresh, bd);
}
static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
diff --git a/aom_dsp/x86/loopfilter_sse2.c b/aom_dsp/x86/loopfilter_sse2.c
index a0e7ded..47f62ba 100644
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@@ -626,350 +626,6 @@
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
-
-static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
- int p, int offset, uint8_t *s) {
- int i;
- if (pixel_num == FOUR_PIXELS) {
- for (i = 13; i >= 0; i--) {
- xx_storel_32(s - (i - offset) * p, x[i]);
- }
- }
- if (pixel_num == EIGHT_PIXELS) {
- for (i = 13; i >= 0; i--) {
- xx_storel_64(s - (i - offset) * p, x[i]);
- }
- }
- if (pixel_num == SIXTEEN_PIXELS) {
- for (i = 13; i >= 0; i--) {
- xx_storeu_128(s - (i - offset) * p, x[i]);
- }
- }
-}
-
-static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
- unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi8(1);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
- __m128i mask, hev, flat, flat2;
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
-
- __m128i op2, op1, op0, oq0, oq1, oq2;
-
- __m128i max_abs_p1p0q1q0;
-
- p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
- p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
- p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
- p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
- q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
- q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-
- {
- const __m128i abs_p1p0 = abs_diff(p1, p0);
- const __m128i abs_q1q0 = abs_diff(q1, q0);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(zero, zero);
- __m128i abs_p0q0 = abs_diff(p0, q0);
- __m128i abs_p1q1 = abs_diff(p1, q1);
- __m128i work;
- max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- {
- __m128i work;
- work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
- flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
- work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // filter4
- {
- const __m128i t4 = _mm_set1_epi8(4);
- const __m128i t3 = _mm_set1_epi8(3);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i ff = _mm_cmpeq_epi8(t4, t4);
-
- __m128i filt;
- __m128i work_a;
- __m128i filter1, filter2;
-
- op1 = _mm_xor_si128(p1, t80);
- op0 = _mm_xor_si128(p0, t80);
- oq0 = _mm_xor_si128(q0, t80);
- oq1 = _mm_xor_si128(q1, t80);
-
- hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
-
- work_a = _mm_subs_epi8(oq0, op0);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- filt = _mm_adds_epi8(filt, work_a);
- // (aom_filter + 3 * (qs0 - ps0)) & mask
- filt = _mm_and_si128(filt, mask);
- filter1 = _mm_adds_epi8(filt, t4);
- filter2 = _mm_adds_epi8(filt, t3);
-
- // Filter1 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
-
- // Filter2 >> 3
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
-
- // filt >> 1
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
- oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
- // loopfilter done
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // filter8
- {
- const __m128i four = _mm_set1_epi16(4);
- const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
- const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
- const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
- const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
- const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
- const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
- const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
- const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-
- const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
- const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
- const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
- const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
- const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
- const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
- const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
- const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
- __m128i f8_lo, f8_hi;
-
- f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
- _mm_add_epi16(p3_lo, p2_lo));
- f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
- _mm_add_epi16(p2_lo, p1_lo));
- f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
-
- f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
- _mm_add_epi16(p3_hi, p2_hi));
- f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
- _mm_add_epi16(p2_hi, p1_hi));
- f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
-
- op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
- op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
- op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
- oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
- oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
-
- f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
- f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
- oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
- }
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // wide flat calculations
- {
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
- const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
- const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
- const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
- const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
- const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
- const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
- const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
- const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
- const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
- const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
- const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
- const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
- const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
- const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
- const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
-
- const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
- const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
- const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
- const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
- const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
- const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
- const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
- const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
- const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
- const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
- const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
- const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
- const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
- const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
- const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
- const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
-
- __m128i f_lo;
- __m128i f_hi;
-
- f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
- f_lo =
- _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
- f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
- _mm_add_epi16(p2_lo, p1_lo));
- f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
- f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
-
- f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
- f_hi =
- _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
- f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
- _mm_add_epi16(p2_hi, p1_hi));
- f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
- f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
-
- __m128i x[14];
- x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
- x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
- x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
- x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
- x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
- x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
- x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
- x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
- x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
- x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
- x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
- x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
- x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-
- f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
- f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
- x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-
- store_buffer_horz_16(pixel_num, x, p, 6, s);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- }
-}
-
void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
@@ -1316,7 +972,8 @@
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
+ aom_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh);
+ aom_lpf_horizontal_16_sse2(s + 4, p, _blimit, _limit, _thresh);
}
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index f3d0aa1..afde34f 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -450,6 +450,8 @@
8),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+ &aom_highbd_lpf_horizontal_16_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
8),
@@ -460,6 +462,8 @@
10),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+ &aom_highbd_lpf_horizontal_16_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
10),
@@ -470,6 +474,16 @@
12),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
+ &aom_highbd_lpf_horizontal_16_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+ &aom_highbd_lpf_vertical_16_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+ &aom_highbd_lpf_vertical_16_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
+ &aom_highbd_lpf_vertical_16_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
};
@@ -482,9 +496,12 @@
make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
make_tuple(&aom_lpf_horizontal_16_sse2, &aom_lpf_horizontal_16_c, 8),
+ make_tuple(&aom_lpf_horizontal_16_dual_sse2, &aom_lpf_horizontal_16_dual_c,
+ 8),
make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
+ make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
};
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,