/*
* Copyright (c) 2023, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>

#include "av1/common/convolve.h"
#include "av1/common/enums.h"
#include "av1/common/filter.h"
static INLINE void compute_dist_wtd_avg_4x4(
    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
    const uint16_t fwd_offset, const uint16_t bck_offset,
    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
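  // Weighted blend of the two predictions: dd * fwd_offset + d * bck_offset.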
  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
  uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
  blend1 = vmlal_n_u16(blend1, d1, bck_offset);
  uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
  blend2 = vmlal_n_u16(blend2, d2, bck_offset);
  uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
  blend3 = vmlal_n_u16(blend3, d3, bck_offset);
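  // Scale back down by DIST_PRECISION_BITS and narrow to 16 bits.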
  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
  uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
  uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
  uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
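  // Remove the compound round offset, then narrow to 8 bits with a rounding
  // saturating shift.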
  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
  dst_01 = vsubq_s16(dst_01, round_offset);
  dst_23 = vsubq_s16(dst_23, round_offset);
  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
}
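
// Basic (equal-weight) compound averaging of four rows of four pixels: a
// halving add of the two predictions replaces the weighted blend; the offset
// removal and narrowing match the distance-weighted path above.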
static INLINE void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1,
                                         uint16x4_t dd2, uint16x4_t dd3,
                                         uint16x4_t d0, uint16x4_t d1,
                                         uint16x4_t d2, uint16x4_t d3,
                                         const int16x8_t round_offset,
                                         uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
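  // vhadd computes (dd + d) >> 1 without intermediate overflow.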
  uint16x4_t avg0 = vhadd_u16(dd0, d0);
  uint16x4_t avg1 = vhadd_u16(dd1, d1);
  uint16x4_t avg2 = vhadd_u16(dd2, d2);
  uint16x4_t avg3 = vhadd_u16(dd3, d3);
  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
  dst_01 = vsubq_s16(dst_01, round_offset);
  dst_23 = vsubq_s16(dst_23, round_offset);
  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
}
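
// Distance-weighted compound averaging of four rows of eight pixels. Same
// computation as compute_dist_wtd_avg_4x4, with each widening multiply split
// across the low and high halves of the 8-wide vectors.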
static INLINE void compute_dist_wtd_avg_8x4(
    uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3,
    uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
    const uint16_t fwd_offset, const uint16_t bck_offset,
    const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8,
    uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
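  // Weighted blend of the two predictions, low and high halves separately.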
  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
  uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset);
  blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset);
  uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset);
  blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset);
  uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset);
  blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset);
  uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset);
  blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset);
  uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset);
  blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset);
  uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset);
  blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset);
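  // Scale back down by DIST_PRECISION_BITS and reassemble the 8-wide rows.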
  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
  uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS));
  uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS));
  uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS),
                                 vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS));
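  // Remove the compound round offset, then narrow to 8 bits with a rounding
  // saturating shift.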
  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
}
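
// Basic (equal-weight) compound averaging of four rows of eight pixels.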
static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1,
                                         uint16x8_t dd2, uint16x8_t dd3,
                                         uint16x8_t d0, uint16x8_t d1,
                                         uint16x8_t d2, uint16x8_t d3,
                                         const int16x8_t round_offset,
                                         uint8x8_t *d0_u8, uint8x8_t *d1_u8,
                                         uint8x8_t *d2_u8, uint8x8_t *d3_u8) {
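  // Halving add gives (dd + d) >> 1 for the equal-weight average.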
  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
  uint16x8_t avg1 = vhaddq_u16(dd1, d1);
  uint16x8_t avg2 = vhaddq_u16(dd2, d2);
  uint16x8_t avg3 = vhaddq_u16(dd3, d3);
  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
  int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset);
  int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset);
  int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset);
  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
  *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS);
  *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS);
  *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS);
}