// libgav1/src/dsp/arm/average_blend_neon.cc - platform/external/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/average_blend.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_NEON

 #include <arm_neon.h>

 #include <cassert>
 #include <cstddef>
 #include <cstdint>

 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 // Difference of the rounding constants |kInterRoundBitsVertical| and
 // |kInterRoundBitsCompoundVertical|. The blend kernels in this file shift the
 // sum of two predictions right by this amount plus one.
 constexpr int kInterPostRoundBit =
     kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;

 }  // namespace

 namespace low_bitdepth {
 namespace {

 // Averages eight pairs of 16-bit predictions into eight 8-bit pixels. The
 // lane-wise sum is narrowed with a rounding, unsigned-saturating right shift
 // of |kInterPostRoundBit| + 1; the extra bit of shift halves the sum.
 inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
                                   const int16_t* prediction_1) {
   const int16x8_t sum =
       vaddq_s16(vld1q_s16(prediction_0), vld1q_s16(prediction_1));
   return vqrshrun_n_s16(sum, kInterPostRoundBit + 1);
 }

 // Blends one row of |width| pixels, sixteen per iteration, writing 8-bit
 // results to |dest|. The counter starts at |width|, drops by 16 each pass and
 // the loop stops only at exactly zero, so |width| has to be a positive
 // multiple of 16.
 inline void AverageBlendLargeRow(const int16_t* prediction_0,
                                  const int16_t* prediction_1, const int width,
                                  uint8_t* dest) {
   int remaining = width;
   do {
     // Sum of the first eight lanes.
     const int16x8_t sum_lo =
         vaddq_s16(vld1q_s16(prediction_0), vld1q_s16(prediction_1));
     const uint8x8_t pixels_lo =
         vqrshrun_n_s16(sum_lo, kInterPostRoundBit + 1);
     // Sum of the following eight lanes.
     const int16x8_t sum_hi =
         vaddq_s16(vld1q_s16(prediction_0 + 8), vld1q_s16(prediction_1 + 8));
     const uint8x8_t pixels_hi =
         vqrshrun_n_s16(sum_hi, kInterPostRoundBit + 1);
     // Emit both halves with a single 16-byte store.
     vst1q_u8(dest, vcombine_u8(pixels_lo, pixels_hi));
     prediction_0 += 16;
     prediction_1 += 16;
     dest += 16;
     remaining -= 16;
   } while (remaining != 0);
 }

 // 8-bit average blend. |prediction_0| and |prediction_1| are read as int16_t
 // samples laid out as tightly packed rows (the read pointers advance by
 // exactly |width| samples per row); the blended 8-bit pixels are written to
 // |dest|, whose rows are |dest_stride| bytes apart.
 //
 // Every path handles two rows per iteration and loops until the row counter
 // hits exactly zero, so |height| has to be a positive even number. Widths
 // other than 4 and 8 go through AverageBlendLargeRow().
 void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
                        const int width, const int height, void* const dest,
                        const ptrdiff_t dest_stride) {
   const auto* src_0 = static_cast<const int16_t*>(prediction_0);
   const auto* src_1 = static_cast<const int16_t*>(prediction_1);
   auto* out = static_cast<uint8_t*>(dest);
   int rows_left = height;

   if (width == 4) {
     // One 8-lane blend yields two rows of four pixels.
     do {
       uint8_t* const second_row = out + dest_stride;
       const uint8x8_t two_rows = AverageBlend8Row(src_0, src_1);
       StoreLo4(out, two_rows);
       StoreHi4(second_row, two_rows);
       out = second_row + dest_stride;
       src_0 += 8;
       src_1 += 8;
       rows_left -= 2;
     } while (rows_left != 0);
   } else if (width == 8) {
     // One 8-lane blend per row.
     do {
       uint8_t* const second_row = out + dest_stride;
       vst1_u8(out, AverageBlend8Row(src_0, src_1));
       vst1_u8(second_row, AverageBlend8Row(src_0 + 8, src_1 + 8));
       out = second_row + dest_stride;
       src_0 += 16;
       src_1 += 16;
       rows_left -= 2;
     } while (rows_left != 0);
   } else {
     do {
       uint8_t* const second_row = out + dest_stride;
       const int16_t* const next_0 = src_0 + width;
       const int16_t* const next_1 = src_1 + width;
       AverageBlendLargeRow(src_0, src_1, width, out);
       AverageBlendLargeRow(next_0, next_1, width, second_row);
       out = second_row + dest_stride;
       src_0 = next_0 + width;
       src_1 = next_1 + width;
       rows_left -= 2;
     } while (rows_left != 0);
   }
 }

 // Installs the 8-bit AverageBlend_NEON() as the |average_blend| entry of the
 // writable Dsp table for |kBitdepth8|.
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   // The table for this bitdepth is expected to exist.
   assert(dsp != nullptr);
   dsp->average_blend = AverageBlend_NEON;
 }

 }  // namespace
 }  // namespace low_bitdepth

 #if LIBGAV1_MAX_BITDEPTH >= 10
 namespace high_bitdepth {
 namespace {

 // Blends eight pairs of unsigned 16-bit predictions. Each pair is added with
 // widening to 32 bits, |compound_offset| is subtracted, and the result is
 // narrowed back to 16 bits with a rounding, unsigned-saturating right shift
 // of |kInterPostRoundBit| + 1. The output is finally clamped from above to
 // |v_bitdepth|.
 inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
                                    const uint16_t* prediction_1,
                                    const int32x4_t compound_offset,
                                    const uint16x8_t v_bitdepth) {
   const uint16x8_t a = vld1q_u16(prediction_0);
   const uint16x8_t b = vld1q_u16(prediction_1);
   // Widening adds; the sums are reinterpreted as signed for the subtraction.
   const int32x4_t sum_lo =
       vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(a), vget_low_u16(b)));
   const int32x4_t sum_hi =
       vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(a), vget_high_u16(b)));
   const uint16x8_t narrowed = vcombine_u16(
       vqrshrun_n_s32(vsubq_s32(sum_lo, compound_offset),
                      kInterPostRoundBit + 1),
       vqrshrun_n_s32(vsubq_s32(sum_hi, compound_offset),
                      kInterPostRoundBit + 1));
   return vminq_u16(narrowed, v_bitdepth);
 }

 // Blends one row of |width| pixels, sixteen per iteration (two calls to
 // AverageBlend8Row()), writing 16-bit results to |dest|. The counter starts
 // at |width|, drops by 16 each pass and the loop stops only at exactly zero,
 // so |width| has to be a positive multiple of 16.
 inline void AverageBlendLargeRow(const uint16_t* prediction_0,
                                  const uint16_t* prediction_1, const int width,
                                  uint16_t* dest,
                                  const int32x4_t compound_offset,
                                  const uint16x8_t v_bitdepth) {
   int remaining = width;
   do {
     const uint16x8_t first = AverageBlend8Row(prediction_0, prediction_1,
                                               compound_offset, v_bitdepth);
     vst1q_u16(dest, first);

     const uint16x8_t second = AverageBlend8Row(
         prediction_0 + 8, prediction_1 + 8, compound_offset, v_bitdepth);
     vst1q_u16(dest + 8, second);

     prediction_0 += 16;
     prediction_1 += 16;
     dest += 16;
     remaining -= 16;
   } while (remaining != 0);
 }

 // High bitdepth average blend. |prediction_0| and |prediction_1| are read as
 // uint16_t samples laid out as tightly packed rows (the read pointers advance
 // by exactly |width| samples per row); results are written to |dest| as
 // uint16_t pixels clamped to (1 << kBitdepth10) - 1.
 //
 // |dest_stride| is halved to step through |dest| in uint16_t units
 // (presumably it is given in bytes). Every path handles two rows per
 // iteration and loops until the row counter hits exactly zero, so |height|
 // has to be a positive even number. Widths other than 4 and 8 go through
 // AverageBlendLargeRow().
 void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
                        const int width, const int height, void* const dest,
                        const ptrdiff_t dest_stride) {
   const auto* src_0 = static_cast<const uint16_t*>(prediction_0);
   const auto* src_1 = static_cast<const uint16_t*>(prediction_1);
   auto* out = static_cast<uint16_t*>(dest);
   const ptrdiff_t pixel_stride = dest_stride >> 1;
   // Two predictions are summed, so twice |kCompoundOffset| is subtracted.
   const int32x4_t offset =
       vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
   const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
   int rows_left = height;

   if (width == 4) {
     // One 8-lane blend yields two rows of four pixels.
     do {
       uint16_t* const second_row = out + pixel_stride;
       const uint16x8_t two_rows =
           AverageBlend8Row(src_0, src_1, offset, max_pixel);
       vst1_u16(out, vget_low_u16(two_rows));
       vst1_u16(second_row, vget_high_u16(two_rows));
       out = second_row + pixel_stride;
       src_0 += 8;
       src_1 += 8;
       rows_left -= 2;
     } while (rows_left != 0);
   } else if (width == 8) {
     // One 8-lane blend per row.
     do {
       uint16_t* const second_row = out + pixel_stride;
       vst1q_u16(out, AverageBlend8Row(src_0, src_1, offset, max_pixel));
       vst1q_u16(second_row,
                 AverageBlend8Row(src_0 + 8, src_1 + 8, offset, max_pixel));
       out = second_row + pixel_stride;
       src_0 += 16;
       src_1 += 16;
       rows_left -= 2;
     } while (rows_left != 0);
   } else {
     do {
       uint16_t* const second_row = out + pixel_stride;
       const uint16_t* const next_0 = src_0 + width;
       const uint16_t* const next_1 = src_1 + width;
       AverageBlendLargeRow(src_0, src_1, width, out, offset, max_pixel);
       AverageBlendLargeRow(next_0, next_1, width, second_row, offset,
                            max_pixel);
       out = second_row + pixel_stride;
       src_0 = next_0 + width;
       src_1 = next_1 + width;
       rows_left -= 2;
     } while (rows_left != 0);
   }
 }

 // Installs the high bitdepth AverageBlend_NEON() as the |average_blend| entry
 // of the writable Dsp table for |kBitdepth10|.
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   // The table for this bitdepth is expected to exist.
   assert(dsp != nullptr);
   dsp->average_blend = AverageBlend_NEON;
 }

 }  // namespace
 }  // namespace high_bitdepth
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10

 // Registers the NEON average blend functions: always the 8-bit version, and
 // the 10-bit version only when LIBGAV1_MAX_BITDEPTH is at least 10.
 void AverageBlendInit_NEON() {
   low_bitdepth::Init8bpp();
 #if LIBGAV1_MAX_BITDEPTH >= 10
   high_bitdepth::Init10bpp();
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 }

 }  // namespace dsp
 }  // namespace libgav1

 #else   // !LIBGAV1_ENABLE_NEON

 namespace libgav1 {
 namespace dsp {

 // NEON is disabled in this build: nothing to register.
 void AverageBlendInit_NEON() {}

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_NEON
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/average_blend.h"
	#include "src/utils/cpu.h"

	#if LIBGAV1_ENABLE_NEON

	#include <arm_neon.h>

	#include <cassert>
	#include <cstddef>
	#include <cstdint>

	#include "src/dsp/arm/common_neon.h"
	#include "src/dsp/constants.h"
	#include "src/dsp/dsp.h"
	#include "src/utils/common.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	// Difference of the rounding constants |kInterRoundBitsVertical| and
	// |kInterRoundBitsCompoundVertical|. The blend kernels in this file shift the
	// sum of two predictions right by this amount plus one.
	constexpr int kInterPostRoundBit =
	kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;

	} // namespace

	namespace low_bitdepth {
	namespace {

	// Averages eight pairs of 16-bit predictions into eight 8-bit pixels. The
	// lane-wise sum is narrowed with a rounding, unsigned-saturating right shift
	// of |kInterPostRoundBit| + 1; the extra bit of shift halves the sum.
	inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
	                                  const int16_t* prediction_1) {
	  const int16x8_t sum =
	      vaddq_s16(vld1q_s16(prediction_0), vld1q_s16(prediction_1));
	  return vqrshrun_n_s16(sum, kInterPostRoundBit + 1);
	}

	// Blends one row of |width| pixels, sixteen per iteration, writing 8-bit
	// results to |dest|. The counter starts at |width|, drops by 16 each pass and
	// the loop stops only at exactly zero, so |width| has to be a positive
	// multiple of 16.
	inline void AverageBlendLargeRow(const int16_t* prediction_0,
	                                 const int16_t* prediction_1, const int width,
	                                 uint8_t* dest) {
	  int remaining = width;
	  do {
	    // Sum of the first eight lanes.
	    const int16x8_t sum_lo =
	        vaddq_s16(vld1q_s16(prediction_0), vld1q_s16(prediction_1));
	    const uint8x8_t pixels_lo =
	        vqrshrun_n_s16(sum_lo, kInterPostRoundBit + 1);
	    // Sum of the following eight lanes.
	    const int16x8_t sum_hi =
	        vaddq_s16(vld1q_s16(prediction_0 + 8), vld1q_s16(prediction_1 + 8));
	    const uint8x8_t pixels_hi =
	        vqrshrun_n_s16(sum_hi, kInterPostRoundBit + 1);
	    // Emit both halves with a single 16-byte store.
	    vst1q_u8(dest, vcombine_u8(pixels_lo, pixels_hi));
	    prediction_0 += 16;
	    prediction_1 += 16;
	    dest += 16;
	    remaining -= 16;
	  } while (remaining != 0);
	}

	// 8-bit average blend. |prediction_0| and |prediction_1| are read as int16_t
	// samples laid out as tightly packed rows (the read pointers advance by
	// exactly |width| samples per row); the blended 8-bit pixels are written to
	// |dest|, whose rows are |dest_stride| bytes apart.
	//
	// Every path handles two rows per iteration and loops until the row counter
	// hits exactly zero, so |height| has to be a positive even number. Widths
	// other than 4 and 8 go through AverageBlendLargeRow().
	void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
	                       const int width, const int height, void* const dest,
	                       const ptrdiff_t dest_stride) {
	  const auto* src_0 = static_cast<const int16_t*>(prediction_0);
	  const auto* src_1 = static_cast<const int16_t*>(prediction_1);
	  auto* out = static_cast<uint8_t*>(dest);
	  int rows_left = height;

	  if (width == 4) {
	    // One 8-lane blend yields two rows of four pixels.
	    do {
	      uint8_t* const second_row = out + dest_stride;
	      const uint8x8_t two_rows = AverageBlend8Row(src_0, src_1);
	      StoreLo4(out, two_rows);
	      StoreHi4(second_row, two_rows);
	      out = second_row + dest_stride;
	      src_0 += 8;
	      src_1 += 8;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  } else if (width == 8) {
	    // One 8-lane blend per row.
	    do {
	      uint8_t* const second_row = out + dest_stride;
	      vst1_u8(out, AverageBlend8Row(src_0, src_1));
	      vst1_u8(second_row, AverageBlend8Row(src_0 + 8, src_1 + 8));
	      out = second_row + dest_stride;
	      src_0 += 16;
	      src_1 += 16;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  } else {
	    do {
	      uint8_t* const second_row = out + dest_stride;
	      const int16_t* const next_0 = src_0 + width;
	      const int16_t* const next_1 = src_1 + width;
	      AverageBlendLargeRow(src_0, src_1, width, out);
	      AverageBlendLargeRow(next_0, next_1, width, second_row);
	      out = second_row + dest_stride;
	      src_0 = next_0 + width;
	      src_1 = next_1 + width;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  }
	}

	// Installs the 8-bit AverageBlend_NEON() as the |average_blend| entry of the
	// writable Dsp table for |kBitdepth8|.
	void Init8bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
	// The table for this bitdepth is expected to exist.
	assert(dsp != nullptr);
	dsp->average_blend = AverageBlend_NEON;
	}

	} // namespace
	} // namespace low_bitdepth

	#if LIBGAV1_MAX_BITDEPTH >= 10
	namespace high_bitdepth {
	namespace {

	// Blends eight pairs of unsigned 16-bit predictions. Each pair is added with
	// widening to 32 bits, |compound_offset| is subtracted, and the result is
	// narrowed back to 16 bits with a rounding, unsigned-saturating right shift
	// of |kInterPostRoundBit| + 1. The output is finally clamped from above to
	// |v_bitdepth|.
	inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
	                                   const uint16_t* prediction_1,
	                                   const int32x4_t compound_offset,
	                                   const uint16x8_t v_bitdepth) {
	  const uint16x8_t a = vld1q_u16(prediction_0);
	  const uint16x8_t b = vld1q_u16(prediction_1);
	  // Widening adds; the sums are reinterpreted as signed for the subtraction.
	  const int32x4_t sum_lo =
	      vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(a), vget_low_u16(b)));
	  const int32x4_t sum_hi =
	      vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(a), vget_high_u16(b)));
	  const uint16x8_t narrowed = vcombine_u16(
	      vqrshrun_n_s32(vsubq_s32(sum_lo, compound_offset),
	                     kInterPostRoundBit + 1),
	      vqrshrun_n_s32(vsubq_s32(sum_hi, compound_offset),
	                     kInterPostRoundBit + 1));
	  return vminq_u16(narrowed, v_bitdepth);
	}

	// Blends one row of |width| pixels, sixteen per iteration (two calls to
	// AverageBlend8Row()), writing 16-bit results to |dest|. The counter starts
	// at |width|, drops by 16 each pass and the loop stops only at exactly zero,
	// so |width| has to be a positive multiple of 16.
	inline void AverageBlendLargeRow(const uint16_t* prediction_0,
	                                 const uint16_t* prediction_1, const int width,
	                                 uint16_t* dest,
	                                 const int32x4_t compound_offset,
	                                 const uint16x8_t v_bitdepth) {
	  int remaining = width;
	  do {
	    const uint16x8_t first = AverageBlend8Row(prediction_0, prediction_1,
	                                              compound_offset, v_bitdepth);
	    vst1q_u16(dest, first);

	    const uint16x8_t second = AverageBlend8Row(
	        prediction_0 + 8, prediction_1 + 8, compound_offset, v_bitdepth);
	    vst1q_u16(dest + 8, second);

	    prediction_0 += 16;
	    prediction_1 += 16;
	    dest += 16;
	    remaining -= 16;
	  } while (remaining != 0);
	}

	// High bitdepth average blend. |prediction_0| and |prediction_1| are read as
	// uint16_t samples laid out as tightly packed rows (the read pointers advance
	// by exactly |width| samples per row); results are written to |dest| as
	// uint16_t pixels clamped to (1 << kBitdepth10) - 1.
	//
	// |dest_stride| is halved to step through |dest| in uint16_t units
	// (presumably it is given in bytes). Every path handles two rows per
	// iteration and loops until the row counter hits exactly zero, so |height|
	// has to be a positive even number. Widths other than 4 and 8 go through
	// AverageBlendLargeRow().
	void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
	                       const int width, const int height, void* const dest,
	                       const ptrdiff_t dest_stride) {
	  const auto* src_0 = static_cast<const uint16_t*>(prediction_0);
	  const auto* src_1 = static_cast<const uint16_t*>(prediction_1);
	  auto* out = static_cast<uint16_t*>(dest);
	  const ptrdiff_t pixel_stride = dest_stride >> 1;
	  // Two predictions are summed, so twice |kCompoundOffset| is subtracted.
	  const int32x4_t offset =
	      vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
	  const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
	  int rows_left = height;

	  if (width == 4) {
	    // One 8-lane blend yields two rows of four pixels.
	    do {
	      uint16_t* const second_row = out + pixel_stride;
	      const uint16x8_t two_rows =
	          AverageBlend8Row(src_0, src_1, offset, max_pixel);
	      vst1_u16(out, vget_low_u16(two_rows));
	      vst1_u16(second_row, vget_high_u16(two_rows));
	      out = second_row + pixel_stride;
	      src_0 += 8;
	      src_1 += 8;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  } else if (width == 8) {
	    // One 8-lane blend per row.
	    do {
	      uint16_t* const second_row = out + pixel_stride;
	      vst1q_u16(out, AverageBlend8Row(src_0, src_1, offset, max_pixel));
	      vst1q_u16(second_row,
	                AverageBlend8Row(src_0 + 8, src_1 + 8, offset, max_pixel));
	      out = second_row + pixel_stride;
	      src_0 += 16;
	      src_1 += 16;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  } else {
	    do {
	      uint16_t* const second_row = out + pixel_stride;
	      const uint16_t* const next_0 = src_0 + width;
	      const uint16_t* const next_1 = src_1 + width;
	      AverageBlendLargeRow(src_0, src_1, width, out, offset, max_pixel);
	      AverageBlendLargeRow(next_0, next_1, width, second_row, offset,
	                           max_pixel);
	      out = second_row + pixel_stride;
	      src_0 = next_0 + width;
	      src_1 = next_1 + width;
	      rows_left -= 2;
	    } while (rows_left != 0);
	  }
	}

	// Installs the high bitdepth AverageBlend_NEON() as the |average_blend| entry
	// of the writable Dsp table for |kBitdepth10|.
	void Init10bpp() {
	Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
	// The table for this bitdepth is expected to exist.
	assert(dsp != nullptr);
	dsp->average_blend = AverageBlend_NEON;
	}

	} // namespace
	} // namespace high_bitdepth
	#endif // LIBGAV1_MAX_BITDEPTH >= 10

	// Registers the NEON average blend functions: always the 8-bit version, and
	// the 10-bit version only when LIBGAV1_MAX_BITDEPTH is at least 10.
	void AverageBlendInit_NEON() {
	low_bitdepth::Init8bpp();
	#if LIBGAV1_MAX_BITDEPTH >= 10
	high_bitdepth::Init10bpp();
	#endif // LIBGAV1_MAX_BITDEPTH >= 10
	}

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_NEON

	namespace libgav1 {
	namespace dsp {

	// NEON is disabled in this build: nothing to register.
	void AverageBlendInit_NEON() {}

	} // namespace dsp
	} // namespace libgav1
	#endif // LIBGAV1_ENABLE_NEON