// blob: 33aa8e4ae6150af75a4e791127d70508c7c1627b [file] [log] [blame]
#include "src/dsp/dsp.h"
#include "src/dsp/obmc.h"

#include <xmmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
namespace {
#include "src/dsp/"
// Blends |obmc_prediction| into |prediction| (in place) for a block that is
// two pixels wide, using the horizontal ("from left") weights stored in the
// first two entries of kObmcMask.
//
// For every pixel the weighted sum
//   pred * mask + obmc_pred * (64 - mask)
// is formed with _mm_maddubs_epi16 and then reduced with
// RightShiftWithRounding_U16(..., 6).
//
// Two rows are produced per iteration and the loop only terminates when the
// row counter reaches exactly zero, so |height| must be positive and even.
inline void OverlapBlendFromLeft2xH_SSE4_1(
    uint8_t* const prediction, const ptrdiff_t prediction_stride,
    const int height, const uint8_t* const obmc_prediction,
    const ptrdiff_t obmc_prediction_stride) {
  uint8_t* pred = prediction;
  const uint8_t* obmc_pred = obmc_prediction;
  // Only the low four bytes hold 64; that is all that is consumed below.
  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
  // Broadcast the first 16-bit lane (two mask bytes) across the low half so
  // that both rows of a 2x2 group see the same pair of column weights.
  const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
  // 64 - mask
  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
  // Interleave as (mask, 64 - mask) byte pairs to match the (pred, obmc_pred)
  // interleave of |terms| below.
  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
  int y = height;
  do {
    // Two rows of two pixels each are gathered into one register.
    const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
    const __m128i obmc_pred_val =
        Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);

    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
    const __m128i result =
        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
    const __m128i packed_result = _mm_packus_epi16(result, result);

    // Bytes 0-1 are the first row, bytes 2-3 the second row.
    Store2(pred, packed_result);
    pred += prediction_stride;
    const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
    memcpy(pred, &second_row_result, sizeof(second_row_result));
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride << 1;
    y -= 2;
  } while (y != 0);
}
// Blends |obmc_prediction| into |prediction| (in place) for a block that is
// four pixels wide, using the four horizontal weights that start at
// kObmcMask + 2.
//
// For every pixel the weighted sum
//   pred * mask + obmc_pred * (64 - mask)
// is formed with _mm_maddubs_epi16 and then reduced with
// RightShiftWithRounding_U16(..., 6).
//
// Two rows are produced per iteration and the loop only terminates when the
// row counter reaches exactly zero, so |height| must be positive and even.
inline void OverlapBlendFromLeft4xH_SSE4_1(
    uint8_t* const prediction, const ptrdiff_t prediction_stride,
    const int height, const uint8_t* const obmc_prediction,
    const ptrdiff_t obmc_prediction_stride) {
  uint8_t* pred = prediction;
  const uint8_t* obmc_pred = obmc_prediction;
  // Only the low four bytes hold 64; that is all that is consumed below.
  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
  const __m128i mask_val = Load4(kObmcMask + 2);
  // 64 - mask
  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
  // Duplicate first half of vector so both rows use the same column weights.
  const __m128i masks =
      _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
  int y = height;
  do {
    const __m128i pred_val0 = Load4(pred);
    const __m128i obmc_pred_val0 = Load4(obmc_pred);
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;

    // Place the second row of each source in the second four bytes.
    const __m128i pred_val =
        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
    const __m128i obmc_pred_val = _mm_alignr_epi8(
        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);

    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
    const __m128i result =
        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
    const __m128i packed_result = _mm_packus_epi16(result, result);

    // |pred| already points at the second row; step back for the first one.
    Store4(pred - prediction_stride, packed_result);
    const int second_row_result = _mm_extract_epi32(packed_result, 1);
    memcpy(pred, &second_row_result, sizeof(second_row_result));
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;
    y -= 2;
  } while (y != 0);
}
// Blends |obmc_prediction| into |prediction| (in place) for a block that is
// eight pixels wide, using the eight horizontal weights that start at
// kObmcMask + 6.
//
// For every pixel the weighted sum
//   pred * mask + obmc_pred * (64 - mask)
// is formed with _mm_maddubs_epi16 and then reduced with
// RightShiftWithRounding_U16(..., 6).
//
// One row is produced per iteration; the do/while form requires |height| to
// be at least 1.
inline void OverlapBlendFromLeft8xH_SSE4_1(
    uint8_t* const prediction, const ptrdiff_t prediction_stride,
    const int height, const uint8_t* const obmc_prediction,
    const ptrdiff_t obmc_prediction_stride) {
  uint8_t* pred = prediction;
  const uint8_t* obmc_pred = obmc_prediction;
  const __m128i mask_inverter = _mm_set1_epi8(64);
  const __m128i mask_val = LoadLo8(kObmcMask + 6);
  // 64 - mask
  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
  // (mask, 64 - mask) byte pairs, matching the (pred, obmc_pred) interleave.
  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
  int y = height;
  do {
    const __m128i pred_val = LoadLo8(pred);
    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
    const __m128i result =
        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);

    StoreLo8(pred, _mm_packus_epi16(result, result));
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;
  } while (--y != 0);
}
void OverlapBlendFromLeft_SSE4_1(void* const prediction,
const ptrdiff_t prediction_stride,
const int width, const int height,
const void* const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint8_t*>(prediction);
const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
if (width == 2) {
OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
if (width == 4) {
OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
if (width == 8) {
OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
const __m128i mask_inverter = _mm_set1_epi8(64);
const uint8_t* mask = kObmcMask + width - 2;
int x = 0;
do {
pred = static_cast<uint8_t*>(prediction) + x;
obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
const __m128i mask_val = LoadUnaligned16(mask + x);
// 64 - mask
const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
int y = 0;
do {
const __m128i pred_val = LoadUnaligned16(pred);
const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result_lo =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
const __m128i result_hi =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
pred += prediction_stride;
obmc_pred += obmc_prediction_stride;
} while (++y < height);
x += 16;
} while (x < width);
// Blends |obmc_prediction| into |prediction| (in place) for a block that is
// four pixels wide, using the vertical ("from top") weights that start at
// kObmcMask + height - 2 (one weight per row).
//
// Only the first height - (height >> 2) rows are visited. Two rows are
// produced per iteration, so when that count is odd one extra row is
// processed as well.
//
// Unlike the other kernels, the operands are interleaved as
// (obmc_pred, pred) and the weights as (64 - mask, mask); the resulting sum
//   obmc_pred * (64 - mask) + pred * mask
// is the same quantity.
inline void OverlapBlendFromTop4xH_SSE4_1(
    uint8_t* const prediction, const ptrdiff_t prediction_stride,
    const int height, const uint8_t* const obmc_prediction,
    const ptrdiff_t obmc_prediction_stride) {
  uint8_t* pred = prediction;
  const uint8_t* obmc_pred = obmc_prediction;
  // As 16-bit lanes: low byte 64, high byte 0.
  const __m128i mask_inverter = _mm_set1_epi16(64);
  // Byte 0 is replicated into the low 8 bytes, byte 1 into the high 8 bytes.
  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
  // As bytes: (+1, -1) pairs. _mm_sign_epi8 therefore keeps the even bytes
  // and negates the odd bytes, so the subtraction below yields
  // (64 - mask, mask) pairs.
  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);

  const uint8_t* mask = kObmcMask + height - 2;
  const int compute_height = height - (height >> 2);
  int y = 0;
  do {
    // Fetch the weights for rows y and y + 1. memcpy is used instead of
    // dereferencing a uint16_t* so the read is valid regardless of alignment
    // and aliasing rules.
    uint16_t mask_pair;
    memcpy(&mask_pair, mask + y, sizeof(mask_pair));
    // First mask in the first half, second mask in the second half.
    const __m128i mask_val =
        _mm_shuffle_epi8(_mm_cvtsi32_si128(mask_pair), mask_shuffler);
    const __m128i masks =
        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));

    const __m128i pred_val0 = Load4(pred);
    const __m128i obmc_pred_val0 = Load4(obmc_pred);
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;

    // Place the second row of each source in the second four bytes.
    const __m128i pred_val =
        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
    const __m128i obmc_pred_val = _mm_alignr_epi8(
        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);

    const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
    const __m128i result =
        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
    const __m128i packed_result = _mm_packus_epi16(result, result);

    // |pred| already points at the second row; step back for the first one.
    Store4(pred - prediction_stride, packed_result);
    Store4(pred, _mm_srli_si128(packed_result, 4));
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;
    y += 2;
  } while (y < compute_height);
}
// Blends |obmc_prediction| into |prediction| (in place) for a block that is
// eight pixels wide, using the vertical ("from top") weights that start at
// kObmcMask + height - 2 (one weight per row).
//
// Only the first height - (height >> 2) rows are visited, one per iteration.
// For every pixel the weighted sum
//   pred * mask + obmc_pred * (64 - mask)
// is formed with _mm_maddubs_epi16 and then reduced with
// RightShiftWithRounding_U16(..., 6).
inline void OverlapBlendFromTop8xH_SSE4_1(
    uint8_t* const prediction, const ptrdiff_t prediction_stride,
    const int height, const uint8_t* const obmc_prediction,
    const ptrdiff_t obmc_prediction_stride) {
  uint8_t* pred = prediction;
  const uint8_t* obmc_pred = obmc_prediction;
  const uint8_t* mask = kObmcMask + height - 2;
  const __m128i mask_inverter = _mm_set1_epi8(64);
  const int compute_height = height - (height >> 2);
  // |y| counts down, so the current row index is compute_height - y.
  int y = compute_height;
  do {
    const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
    // 64 - mask
    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
    // (mask, 64 - mask) byte pairs, matching the (pred, obmc_pred) interleave.
    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);

    const __m128i pred_val = LoadLo8(pred);
    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
    const __m128i result =
        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);

    StoreLo8(pred, _mm_packus_epi16(result, result));
    pred += prediction_stride;
    obmc_pred += obmc_prediction_stride;
  } while (--y != 0);
}
void OverlapBlendFromTop_SSE4_1(void* const prediction,
const ptrdiff_t prediction_stride,
const int width, const int height,
const void* const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
auto* pred = static_cast<uint8_t*>(prediction);
const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
if (width <= 4) {
OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
if (width == 8) {
OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
// Stop when mask value becomes 64.
const int compute_height = height - (height >> 2);
const __m128i mask_inverter = _mm_set1_epi8(64);
int y = 0;
const uint8_t* mask = kObmcMask + height - 2;
do {
const __m128i mask_val = _mm_set1_epi8(mask[y]);
// 64 - mask
const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
int x = 0;
do {
const __m128i pred_val = LoadUnaligned16(pred + x);
const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result_lo =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
const __m128i result_hi =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
x += 16;
} while (x < width);
pred += prediction_stride;
obmc_pred += obmc_prediction_stride;
} while (++y < compute_height);
// Registers the SSE4.1 OBMC blend functions in the writable 8bpp Dsp table.
// Each entry is installed only when its DSP_ENABLED_8BPP_SSE4_1() guard
// evaluates to true at preprocessing time.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
  assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
#endif
}
} // namespace
// Externally visible initializer: delegates to Init8bpp() to install the
// SSE4.1 OBMC functions.
void ObmcInit_SSE4_1() {
  Init8bpp();
}
} // namespace dsp
} // namespace libgav1
#else // !LIBGAV1_ENABLE_SSE4_1
namespace libgav1 {
namespace dsp {
// Fallback used when LIBGAV1_ENABLE_SSE4_1 is false: there is nothing to
// register, so initialization is a no-op.
void ObmcInit_SSE4_1() {
}
} // namespace dsp
} // namespace libgav1
#endif // LIBGAV1_ENABLE_SSE4_1