// libgav1/src/dsp/x86/obmc_sse4.cc - platform/external/libgav1 - Git at Google

 // Copyright 2019 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "src/dsp/obmc.h"
 #include "src/utils/cpu.h"

 #if LIBGAV1_ENABLE_SSE4_1

 #include <xmmintrin.h>

 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>

 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/x86/common_sse4.h"
 #include "src/utils/common.h"
 #include "src/utils/constants.h"

 namespace libgav1 {
 namespace dsp {
 namespace {

 #include "src/dsp/obmc.inc"

 // Overlap-blends a 2-pixel-wide block from the left, writing the result back
 // into |prediction|. Two rows are processed per iteration.
 //
 // Per pixel: mask * prediction + (64 - mask) * obmc_prediction, passed
 // through RightShiftWithRounding_U16(..., 6). The two column masks are the
 // first two bytes of kObmcMask.
 //
 // The loop body runs before the row count is checked and only stops when the
 // remaining row count is exactly zero, so |height| must be a positive even
 // value.
 inline void OverlapBlendFromLeft2xH_SSE4_1(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   // Repeat mask bytes {0, 1} so that the second row reuses the same weights.
   const __m128i pred_weights = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
   // Only weight bytes 0..3 contribute to stored pixels, so 64 is placed in
   // just the low four bytes.
   const __m128i sixty_four = _mm_cvtsi32_si128(0x40404040);
   const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
   // {mask, 64 - mask} pairs, matching the {pred, obmc} pixel pairs below.
   const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);

   uint8_t* dst = prediction;
   const uint8_t* src = obmc_prediction;
   int rows_left = height;
   do {
     uint8_t* const dst_next = dst + prediction_stride;
     const __m128i dst_rows = Load2x2(dst, dst_next);
     const __m128i src_rows = Load2x2(src, src + obmc_prediction_stride);

     const __m128i pixel_pairs = _mm_unpacklo_epi8(dst_rows, src_rows);
     const __m128i weighted = _mm_maddubs_epi16(pixel_pairs, weight_pairs);
     const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
     const __m128i blended = _mm_packus_epi16(rounded, rounded);

     // Bytes 0..1 hold the first row, bytes 2..3 (16-bit lane 1) the second.
     Store2(dst, blended);
     const int16_t second_row = _mm_extract_epi16(blended, 1);
     memcpy(dst_next, &second_row, sizeof(second_row));

     dst = dst_next + prediction_stride;
     src += obmc_prediction_stride << 1;
     rows_left -= 2;
   } while (rows_left != 0);
 }

 // Overlap-blends a 4-pixel-wide block from the left, in place, two rows per
 // iteration. The four column masks are read from kObmcMask + 2, and each
 // pixel becomes mask * prediction + (64 - mask) * obmc_prediction, rounded
 // and shifted right by 6 via RightShiftWithRounding_U16.
 //
 // The loop only terminates when the remaining row count hits exactly zero,
 // so |height| must be a positive even value.
 inline void OverlapBlendFromLeft4xH_SSE4_1(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   const __m128i pred_weights = Load4(kObmcMask + 2);
   // Only the low four weight bytes are consumed, so 64 is only needed there.
   const __m128i sixty_four = _mm_cvtsi32_si128(0x40404040);
   const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
   // Interleave to {mask, 64 - mask} pairs, then copy the low 8 bytes into the
   // high 8 bytes so the second row is weighted like the first.
   const __m128i one_row_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);
   const __m128i weight_pairs = _mm_shuffle_epi32(one_row_pairs, 0x44);

   uint8_t* dst = prediction;
   const uint8_t* src = obmc_prediction;
   int rows_left = height;
   do {
     uint8_t* const dst_next = dst + prediction_stride;
     const uint8_t* const src_next = src + obmc_prediction_stride;
     const __m128i dst_first = Load4(dst);
     const __m128i src_first = Load4(src);

     // Pack both rows into one register: first row in bytes 0..3, second row
     // in bytes 4..7.
     const __m128i dst_rows =
         _mm_alignr_epi8(Load4(dst_next), _mm_slli_si128(dst_first, 12), 12);
     const __m128i src_rows =
         _mm_alignr_epi8(Load4(src_next), _mm_slli_si128(src_first, 12), 12);

     const __m128i pixel_pairs = _mm_unpacklo_epi8(dst_rows, src_rows);
     const __m128i weighted = _mm_maddubs_epi16(pixel_pairs, weight_pairs);
     const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
     const __m128i blended = _mm_packus_epi16(rounded, rounded);

     Store4(dst, blended);
     const int second_row = _mm_extract_epi32(blended, 1);
     memcpy(dst_next, &second_row, sizeof(second_row));

     dst = dst_next + prediction_stride;
     src = src_next + obmc_prediction_stride;
     rows_left -= 2;
   } while (rows_left != 0);
 }

 // Overlap-blends an 8-pixel-wide block from the left, in place, one row per
 // iteration. The eight column masks are read from kObmcMask + 6, and each
 // pixel becomes mask * prediction + (64 - mask) * obmc_prediction, rounded
 // and shifted right by 6 via RightShiftWithRounding_U16.
 //
 // The body executes once before the row count is tested, so |height| must be
 // at least 1.
 inline void OverlapBlendFromLeft8xH_SSE4_1(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   const __m128i pred_weights = LoadLo8(kObmcMask + 6);
   const __m128i obmc_weights = _mm_sub_epi8(_mm_set1_epi8(64), pred_weights);
   // {mask, 64 - mask} pairs, matching the {pred, obmc} pixel pairs below.
   const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);

   uint8_t* dst = prediction;
   const uint8_t* src = obmc_prediction;
   int rows_left = height;
   do {
     const __m128i dst_px = LoadLo8(dst);
     const __m128i src_px = LoadLo8(src);
     const __m128i weighted =
         _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
     const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
     StoreLo8(dst, _mm_packus_epi16(rounded, rounded));

     dst += prediction_stride;
     src += obmc_prediction_stride;
   } while (--rows_left != 0);
 }

 // Overlap-blends |prediction| with |obmc_prediction| from the left, in place.
 //
 // Widths 2, 4 and 8 are forwarded to the dedicated helpers. Every other
 // width is handled in 16-pixel-wide column strips: the 16 column masks of a
 // strip are loaded once from kObmcMask + width - 2 + x and then applied to
 // all |height| rows. Each strip uses full 16-byte loads and stores.
 // NOTE(review): this presumably relies on callers only passing widths that
 // are multiples of 16 on this path - confirm against the callers.
 void OverlapBlendFromLeft_SSE4_1(void* const prediction,
                                  const ptrdiff_t prediction_stride,
                                  const int width, const int height,
                                  const void* const obmc_prediction,
                                  const ptrdiff_t obmc_prediction_stride) {
   auto* const dst_base = static_cast<uint8_t*>(prediction);
   const auto* const src_base = static_cast<const uint8_t*>(obmc_prediction);

   switch (width) {
     case 2:
       OverlapBlendFromLeft2xH_SSE4_1(dst_base, prediction_stride, height,
                                      src_base, obmc_prediction_stride);
       return;
     case 4:
       OverlapBlendFromLeft4xH_SSE4_1(dst_base, prediction_stride, height,
                                      src_base, obmc_prediction_stride);
       return;
     case 8:
       OverlapBlendFromLeft8xH_SSE4_1(dst_base, prediction_stride, height,
                                      src_base, obmc_prediction_stride);
       return;
     default:
       break;
   }

   const __m128i sixty_four = _mm_set1_epi8(64);
   const uint8_t* const column_masks = kObmcMask + width - 2;
   int x = 0;
   do {
     // Weights for the 16 columns of this strip, split into low/high halves of
     // interleaved {mask, 64 - mask} pairs.
     const __m128i pred_weights = LoadUnaligned16(column_masks + x);
     const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
     const __m128i weight_pairs_lo =
         _mm_unpacklo_epi8(pred_weights, obmc_weights);
     const __m128i weight_pairs_hi =
         _mm_unpackhi_epi8(pred_weights, obmc_weights);

     uint8_t* dst = dst_base + x;
     const uint8_t* src = src_base + x;
     int y = 0;
     do {
       const __m128i dst_px = LoadUnaligned16(dst);
       const __m128i src_px = LoadUnaligned16(src);
       const __m128i weighted_lo = _mm_maddubs_epi16(
           _mm_unpacklo_epi8(dst_px, src_px), weight_pairs_lo);
       const __m128i blended_lo = RightShiftWithRounding_U16(weighted_lo, 6);
       const __m128i weighted_hi = _mm_maddubs_epi16(
           _mm_unpackhi_epi8(dst_px, src_px), weight_pairs_hi);
       const __m128i blended_hi = RightShiftWithRounding_U16(weighted_hi, 6);
       StoreUnaligned16(dst, _mm_packus_epi16(blended_lo, blended_hi));

       dst += prediction_stride;
       src += obmc_prediction_stride;
     } while (++y < height);
     x += 16;
   } while (x < width);
 }

 // Overlap-blends a narrow block from the top, in place, two rows per
 // iteration. Four bytes per row are always loaded and stored.
 //
 // Row y uses the single weight m = kObmcMask[height - 2 + y]; each pixel
 // becomes m * prediction + (64 - m) * obmc_prediction, rounded and shifted
 // right by 6 via RightShiftWithRounding_U16. Only the first
 // height - (height >> 2) rows are processed, rounded up to an even count
 // because rows are handled in pairs.
 inline void OverlapBlendFromTop4xH_SSE4_1(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   uint8_t* pred = prediction;
   const uint8_t* obmc_pred = obmc_prediction;
   // 64 in the even bytes, 0 in the odd bytes.
   const __m128i mask_inverter = _mm_set1_epi16(64);
   // Broadcasts source byte 0 to the low 8 bytes and source byte 1 to the high
   // 8 bytes.
   const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
   // +1 in the even bytes, -1 in the odd bytes: used with _mm_sign_epi8 it
   // keeps the even bytes and negates the odd bytes.
   const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);

   const uint8_t* mask = kObmcMask + height - 2;
   const int compute_height = height - (height >> 2);
   int y = 0;
   do {
     // Fetch the two row masks with memcpy rather than by dereferencing a
     // uint16_t*: |mask| points into a byte array, so the cast would violate
     // strict aliasing and assume 2-byte alignment.
     uint16_t mask_pair;
     memcpy(&mask_pair, mask + y, sizeof(mask_pair));
     // First mask in the first half, second mask in the second half.
     const __m128i mask_val =
         _mm_shuffle_epi8(_mm_cvtsi32_si128(mask_pair), mask_shuffler);
     // Even bytes: 64 - mask. Odd bytes: 0 - (-mask) = mask.
     const __m128i masks =
         _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
     const __m128i pred_val0 = Load4(pred);

     const __m128i obmc_pred_val0 = Load4(obmc_pred);
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
     // Place the second row of each source in the second four bytes.
     const __m128i pred_val =
         _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
     const __m128i obmc_pred_val = _mm_alignr_epi8(
         Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
     // Interleave as {obmc, pred} to line up with the {64 - mask, mask}
     // weights built above.
     const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
     const __m128i result =
         RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);

     const __m128i packed_result = _mm_packus_epi16(result, result);
     // |pred| already points at the second row of the pair.
     Store4(pred - prediction_stride, packed_result);
     Store4(pred, _mm_srli_si128(packed_result, 4));
     pred += prediction_stride;
     obmc_pred += obmc_prediction_stride;
     y += 2;
   } while (y < compute_height);
 }

 // Overlap-blends an 8-pixel-wide block from the top, in place, one row per
 // iteration. Row y uses the single weight m = kObmcMask[height - 2 + y];
 // each pixel becomes m * prediction + (64 - m) * obmc_prediction, rounded and
 // shifted right by 6 via RightShiftWithRounding_U16.
 //
 // Only the first height - (height >> 2) rows are processed. The body runs
 // once before the counter is tested, so that row count must be at least 1.
 inline void OverlapBlendFromTop8xH_SSE4_1(
     uint8_t* const prediction, const ptrdiff_t prediction_stride,
     const int height, const uint8_t* const obmc_prediction,
     const ptrdiff_t obmc_prediction_stride) {
   const __m128i sixty_four = _mm_set1_epi8(64);
   const uint8_t* row_mask = kObmcMask + height - 2;
   uint8_t* dst = prediction;
   const uint8_t* src = obmc_prediction;
   int rows_left = height - (height >> 2);
   do {
     // One weight per row, replicated across every column.
     const __m128i pred_weight = _mm_set1_epi8(*row_mask);
     const __m128i obmc_weight = _mm_sub_epi8(sixty_four, pred_weight);
     const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weight, obmc_weight);

     const __m128i dst_px = LoadLo8(dst);
     const __m128i src_px = LoadLo8(src);
     const __m128i weighted =
         _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
     const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
     StoreLo8(dst, _mm_packus_epi16(rounded, rounded));

     ++row_mask;
     dst += prediction_stride;
     src += obmc_prediction_stride;
   } while (--rows_left != 0);
 }

 // Overlap-blends |prediction| with |obmc_prediction| from the top, in place.
 //
 // Widths up to 4 and width 8 are forwarded to the dedicated helpers. Every
 // other width is processed row by row in 16-byte steps, using the single row
 // weight kObmcMask[height - 2 + y] for all columns of row y. Only the first
 // height - (height >> 2) rows are blended; the remaining rows are left
 // untouched.
 void OverlapBlendFromTop_SSE4_1(void* const prediction,
                                 const ptrdiff_t prediction_stride,
                                 const int width, const int height,
                                 const void* const obmc_prediction,
                                 const ptrdiff_t obmc_prediction_stride) {
   auto* dst_row = static_cast<uint8_t*>(prediction);
   const auto* src_row = static_cast<const uint8_t*>(obmc_prediction);

   if (width <= 4) {
     OverlapBlendFromTop4xH_SSE4_1(dst_row, prediction_stride, height, src_row,
                                   obmc_prediction_stride);
     return;
   }
   if (width == 8) {
     OverlapBlendFromTop8xH_SSE4_1(dst_row, prediction_stride, height, src_row,
                                   obmc_prediction_stride);
     return;
   }

   const uint8_t* const row_masks = kObmcMask + height - 2;
   // Stop when mask value becomes 64.
   const int blend_rows = height - (height >> 2);
   const __m128i sixty_four = _mm_set1_epi8(64);
   int row = 0;
   do {
     // All columns of a row share one weight, so the same {mask, 64 - mask}
     // pairs serve both the low and the high half of each 16-byte vector.
     const __m128i pred_weight = _mm_set1_epi8(row_masks[row]);
     const __m128i obmc_weight = _mm_sub_epi8(sixty_four, pred_weight);
     const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weight, obmc_weight);

     int x = 0;
     do {
       uint8_t* const dst = dst_row + x;
       const __m128i dst_px = LoadUnaligned16(dst);
       const __m128i src_px = LoadUnaligned16(src_row + x);
       const __m128i weighted_lo =
           _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
       const __m128i blended_lo = RightShiftWithRounding_U16(weighted_lo, 6);
       const __m128i weighted_hi =
           _mm_maddubs_epi16(_mm_unpackhi_epi8(dst_px, src_px), weight_pairs);
       const __m128i blended_hi = RightShiftWithRounding_U16(weighted_hi, 6);
       StoreUnaligned16(dst, _mm_packus_epi16(blended_lo, blended_hi));
       x += 16;
     } while (x < width);

     dst_row += prediction_stride;
     src_row += obmc_prediction_stride;
   } while (++row < blend_rows);
 }

 // Registers the SSE4.1 OBMC blend functions in the writable 8bpp Dsp table.
 // Each entry is only installed when its DSP_ENABLED_8BPP_SSE4_1() check
 // passes.
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   // Both assignments below can be preprocessed away and assert() expands to
   // nothing under NDEBUG; keep |dsp| "used" so such builds stay free of
   // unused-variable warnings.
   static_cast<void>(dsp);
 #if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
 #endif
 }

 }  // namespace

 // Externally visible initializer for the SSE4.1 OBMC functions; it simply
 // runs the 8bpp table setup.
 void ObmcInit_SSE4_1() {
   Init8bpp();
 }

 }  // namespace dsp
 }  // namespace libgav1

 #else  // !LIBGAV1_ENABLE_SSE4_1

 namespace libgav1 {
 namespace dsp {

 // Fallback used when LIBGAV1_ENABLE_SSE4_1 is false: there is nothing to
 // register, but the symbol is still defined, with an empty body.
 void ObmcInit_SSE4_1() {
 }

 }  // namespace dsp
 }  // namespace libgav1
 #endif  // LIBGAV1_ENABLE_SSE4_1
	// Copyright 2019 The libgav1 Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "src/dsp/obmc.h"
	#include "src/utils/cpu.h"

	#if LIBGAV1_ENABLE_SSE4_1

	#include <xmmintrin.h>

	#include <cassert>
	#include <cstddef>
	#include <cstdint>

	#include "src/dsp/constants.h"
	#include "src/dsp/dsp.h"
	#include "src/dsp/x86/common_sse4.h"
	#include "src/utils/common.h"
	#include "src/utils/constants.h"

	namespace libgav1 {
	namespace dsp {
	namespace {

	#include "src/dsp/obmc.inc"

	// Overlap-blends a 2-pixel-wide block from the left, writing the result back
	// into |prediction|. Two rows are processed per iteration.
	//
	// Per pixel: mask * prediction + (64 - mask) * obmc_prediction, passed
	// through RightShiftWithRounding_U16(..., 6). The two column masks are the
	// first two bytes of kObmcMask.
	//
	// The loop body runs before the row count is checked and only stops when the
	// remaining row count is exactly zero, so |height| must be a positive even
	// value.
	inline void OverlapBlendFromLeft2xH_SSE4_1(
	    uint8_t* const prediction, const ptrdiff_t prediction_stride,
	    const int height, const uint8_t* const obmc_prediction,
	    const ptrdiff_t obmc_prediction_stride) {
	  // Repeat mask bytes {0, 1} so that the second row reuses the same weights.
	  const __m128i pred_weights = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
	  // Only weight bytes 0..3 contribute to stored pixels, so 64 is placed in
	  // just the low four bytes.
	  const __m128i sixty_four = _mm_cvtsi32_si128(0x40404040);
	  const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
	  // {mask, 64 - mask} pairs, matching the {pred, obmc} pixel pairs below.
	  const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);

	  uint8_t* dst = prediction;
	  const uint8_t* src = obmc_prediction;
	  int rows_left = height;
	  do {
	    uint8_t* const dst_next = dst + prediction_stride;
	    const __m128i dst_rows = Load2x2(dst, dst_next);
	    const __m128i src_rows = Load2x2(src, src + obmc_prediction_stride);

	    const __m128i pixel_pairs = _mm_unpacklo_epi8(dst_rows, src_rows);
	    const __m128i weighted = _mm_maddubs_epi16(pixel_pairs, weight_pairs);
	    const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
	    const __m128i blended = _mm_packus_epi16(rounded, rounded);

	    // Bytes 0..1 hold the first row, bytes 2..3 (16-bit lane 1) the second.
	    Store2(dst, blended);
	    const int16_t second_row = _mm_extract_epi16(blended, 1);
	    memcpy(dst_next, &second_row, sizeof(second_row));

	    dst = dst_next + prediction_stride;
	    src += obmc_prediction_stride << 1;
	    rows_left -= 2;
	  } while (rows_left != 0);
	}

	// Overlap-blends a 4-pixel-wide block from the left, in place, two rows per
	// iteration. The four column masks are read from kObmcMask + 2, and each
	// pixel becomes mask * prediction + (64 - mask) * obmc_prediction, rounded
	// and shifted right by 6 via RightShiftWithRounding_U16.
	//
	// The loop only terminates when the remaining row count hits exactly zero,
	// so |height| must be a positive even value.
	inline void OverlapBlendFromLeft4xH_SSE4_1(
	    uint8_t* const prediction, const ptrdiff_t prediction_stride,
	    const int height, const uint8_t* const obmc_prediction,
	    const ptrdiff_t obmc_prediction_stride) {
	  const __m128i pred_weights = Load4(kObmcMask + 2);
	  // Only the low four weight bytes are consumed, so 64 is only needed there.
	  const __m128i sixty_four = _mm_cvtsi32_si128(0x40404040);
	  const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
	  // Interleave to {mask, 64 - mask} pairs, then copy the low 8 bytes into the
	  // high 8 bytes so the second row is weighted like the first.
	  const __m128i one_row_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);
	  const __m128i weight_pairs = _mm_shuffle_epi32(one_row_pairs, 0x44);

	  uint8_t* dst = prediction;
	  const uint8_t* src = obmc_prediction;
	  int rows_left = height;
	  do {
	    uint8_t* const dst_next = dst + prediction_stride;
	    const uint8_t* const src_next = src + obmc_prediction_stride;
	    const __m128i dst_first = Load4(dst);
	    const __m128i src_first = Load4(src);

	    // Pack both rows into one register: first row in bytes 0..3, second row
	    // in bytes 4..7.
	    const __m128i dst_rows =
	        _mm_alignr_epi8(Load4(dst_next), _mm_slli_si128(dst_first, 12), 12);
	    const __m128i src_rows =
	        _mm_alignr_epi8(Load4(src_next), _mm_slli_si128(src_first, 12), 12);

	    const __m128i pixel_pairs = _mm_unpacklo_epi8(dst_rows, src_rows);
	    const __m128i weighted = _mm_maddubs_epi16(pixel_pairs, weight_pairs);
	    const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
	    const __m128i blended = _mm_packus_epi16(rounded, rounded);

	    Store4(dst, blended);
	    const int second_row = _mm_extract_epi32(blended, 1);
	    memcpy(dst_next, &second_row, sizeof(second_row));

	    dst = dst_next + prediction_stride;
	    src = src_next + obmc_prediction_stride;
	    rows_left -= 2;
	  } while (rows_left != 0);
	}

	// Overlap-blends an 8-pixel-wide block from the left, in place, one row per
	// iteration. The eight column masks are read from kObmcMask + 6, and each
	// pixel becomes mask * prediction + (64 - mask) * obmc_prediction, rounded
	// and shifted right by 6 via RightShiftWithRounding_U16.
	//
	// The body executes once before the row count is tested, so |height| must be
	// at least 1.
	inline void OverlapBlendFromLeft8xH_SSE4_1(
	    uint8_t* const prediction, const ptrdiff_t prediction_stride,
	    const int height, const uint8_t* const obmc_prediction,
	    const ptrdiff_t obmc_prediction_stride) {
	  const __m128i pred_weights = LoadLo8(kObmcMask + 6);
	  const __m128i obmc_weights = _mm_sub_epi8(_mm_set1_epi8(64), pred_weights);
	  // {mask, 64 - mask} pairs, matching the {pred, obmc} pixel pairs below.
	  const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weights, obmc_weights);

	  uint8_t* dst = prediction;
	  const uint8_t* src = obmc_prediction;
	  int rows_left = height;
	  do {
	    const __m128i dst_px = LoadLo8(dst);
	    const __m128i src_px = LoadLo8(src);
	    const __m128i weighted =
	        _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
	    const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
	    StoreLo8(dst, _mm_packus_epi16(rounded, rounded));

	    dst += prediction_stride;
	    src += obmc_prediction_stride;
	  } while (--rows_left != 0);
	}

	// Overlap-blends |prediction| with |obmc_prediction| from the left, in place.
	//
	// Widths 2, 4 and 8 are forwarded to the dedicated helpers. Every other
	// width is handled in 16-pixel-wide column strips: the 16 column masks of a
	// strip are loaded once from kObmcMask + width - 2 + x and then applied to
	// all |height| rows. Each strip uses full 16-byte loads and stores.
	// NOTE(review): this presumably relies on callers only passing widths that
	// are multiples of 16 on this path - confirm against the callers.
	void OverlapBlendFromLeft_SSE4_1(void* const prediction,
	                                 const ptrdiff_t prediction_stride,
	                                 const int width, const int height,
	                                 const void* const obmc_prediction,
	                                 const ptrdiff_t obmc_prediction_stride) {
	  auto* const dst_base = static_cast<uint8_t*>(prediction);
	  const auto* const src_base = static_cast<const uint8_t*>(obmc_prediction);

	  switch (width) {
	    case 2:
	      OverlapBlendFromLeft2xH_SSE4_1(dst_base, prediction_stride, height,
	                                     src_base, obmc_prediction_stride);
	      return;
	    case 4:
	      OverlapBlendFromLeft4xH_SSE4_1(dst_base, prediction_stride, height,
	                                     src_base, obmc_prediction_stride);
	      return;
	    case 8:
	      OverlapBlendFromLeft8xH_SSE4_1(dst_base, prediction_stride, height,
	                                     src_base, obmc_prediction_stride);
	      return;
	    default:
	      break;
	  }

	  const __m128i sixty_four = _mm_set1_epi8(64);
	  const uint8_t* const column_masks = kObmcMask + width - 2;
	  int x = 0;
	  do {
	    // Weights for the 16 columns of this strip, split into low/high halves of
	    // interleaved {mask, 64 - mask} pairs.
	    const __m128i pred_weights = LoadUnaligned16(column_masks + x);
	    const __m128i obmc_weights = _mm_sub_epi8(sixty_four, pred_weights);
	    const __m128i weight_pairs_lo =
	        _mm_unpacklo_epi8(pred_weights, obmc_weights);
	    const __m128i weight_pairs_hi =
	        _mm_unpackhi_epi8(pred_weights, obmc_weights);

	    uint8_t* dst = dst_base + x;
	    const uint8_t* src = src_base + x;
	    int y = 0;
	    do {
	      const __m128i dst_px = LoadUnaligned16(dst);
	      const __m128i src_px = LoadUnaligned16(src);
	      const __m128i weighted_lo = _mm_maddubs_epi16(
	          _mm_unpacklo_epi8(dst_px, src_px), weight_pairs_lo);
	      const __m128i blended_lo = RightShiftWithRounding_U16(weighted_lo, 6);
	      const __m128i weighted_hi = _mm_maddubs_epi16(
	          _mm_unpackhi_epi8(dst_px, src_px), weight_pairs_hi);
	      const __m128i blended_hi = RightShiftWithRounding_U16(weighted_hi, 6);
	      StoreUnaligned16(dst, _mm_packus_epi16(blended_lo, blended_hi));

	      dst += prediction_stride;
	      src += obmc_prediction_stride;
	    } while (++y < height);
	    x += 16;
	  } while (x < width);
	}

	// Overlap-blends a narrow block from the top, in place, two rows per
	// iteration. Four bytes per row are always loaded and stored.
	//
	// Row y uses the single weight m = kObmcMask[height - 2 + y]; each pixel
	// becomes m * prediction + (64 - m) * obmc_prediction, rounded and shifted
	// right by 6 via RightShiftWithRounding_U16. Only the first
	// height - (height >> 2) rows are processed, rounded up to an even count
	// because rows are handled in pairs.
	inline void OverlapBlendFromTop4xH_SSE4_1(
	    uint8_t* const prediction, const ptrdiff_t prediction_stride,
	    const int height, const uint8_t* const obmc_prediction,
	    const ptrdiff_t obmc_prediction_stride) {
	  uint8_t* pred = prediction;
	  const uint8_t* obmc_pred = obmc_prediction;
	  // 64 in the even bytes, 0 in the odd bytes.
	  const __m128i mask_inverter = _mm_set1_epi16(64);
	  // Broadcasts source byte 0 to the low 8 bytes and source byte 1 to the high
	  // 8 bytes.
	  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
	  // +1 in the even bytes, -1 in the odd bytes: used with _mm_sign_epi8 it
	  // keeps the even bytes and negates the odd bytes.
	  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);

	  const uint8_t* mask = kObmcMask + height - 2;
	  const int compute_height = height - (height >> 2);
	  int y = 0;
	  do {
	    // Fetch the two row masks with memcpy rather than by dereferencing a
	    // uint16_t*: |mask| points into a byte array, so the cast would violate
	    // strict aliasing and assume 2-byte alignment.
	    uint16_t mask_pair;
	    memcpy(&mask_pair, mask + y, sizeof(mask_pair));
	    // First mask in the first half, second mask in the second half.
	    const __m128i mask_val =
	        _mm_shuffle_epi8(_mm_cvtsi32_si128(mask_pair), mask_shuffler);
	    // Even bytes: 64 - mask. Odd bytes: 0 - (-mask) = mask.
	    const __m128i masks =
	        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
	    const __m128i pred_val0 = Load4(pred);

	    const __m128i obmc_pred_val0 = Load4(obmc_pred);
	    pred += prediction_stride;
	    obmc_pred += obmc_prediction_stride;
	    // Place the second row of each source in the second four bytes.
	    const __m128i pred_val =
	        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
	    const __m128i obmc_pred_val = _mm_alignr_epi8(
	        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
	    // Interleave as {obmc, pred} to line up with the {64 - mask, mask}
	    // weights built above.
	    const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
	    const __m128i result =
	        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);

	    const __m128i packed_result = _mm_packus_epi16(result, result);
	    // |pred| already points at the second row of the pair.
	    Store4(pred - prediction_stride, packed_result);
	    Store4(pred, _mm_srli_si128(packed_result, 4));
	    pred += prediction_stride;
	    obmc_pred += obmc_prediction_stride;
	    y += 2;
	  } while (y < compute_height);
	}

	// Overlap-blends an 8-pixel-wide block from the top, in place, one row per
	// iteration. Row y uses the single weight m = kObmcMask[height - 2 + y];
	// each pixel becomes m * prediction + (64 - m) * obmc_prediction, rounded and
	// shifted right by 6 via RightShiftWithRounding_U16.
	//
	// Only the first height - (height >> 2) rows are processed. The body runs
	// once before the counter is tested, so that row count must be at least 1.
	inline void OverlapBlendFromTop8xH_SSE4_1(
	    uint8_t* const prediction, const ptrdiff_t prediction_stride,
	    const int height, const uint8_t* const obmc_prediction,
	    const ptrdiff_t obmc_prediction_stride) {
	  const __m128i sixty_four = _mm_set1_epi8(64);
	  const uint8_t* row_mask = kObmcMask + height - 2;
	  uint8_t* dst = prediction;
	  const uint8_t* src = obmc_prediction;
	  int rows_left = height - (height >> 2);
	  do {
	    // One weight per row, replicated across every column.
	    const __m128i pred_weight = _mm_set1_epi8(*row_mask);
	    const __m128i obmc_weight = _mm_sub_epi8(sixty_four, pred_weight);
	    const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weight, obmc_weight);

	    const __m128i dst_px = LoadLo8(dst);
	    const __m128i src_px = LoadLo8(src);
	    const __m128i weighted =
	        _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
	    const __m128i rounded = RightShiftWithRounding_U16(weighted, 6);
	    StoreLo8(dst, _mm_packus_epi16(rounded, rounded));

	    ++row_mask;
	    dst += prediction_stride;
	    src += obmc_prediction_stride;
	  } while (--rows_left != 0);
	}

	// Overlap-blends |prediction| with |obmc_prediction| from the top, in place.
	//
	// Widths up to 4 and width 8 are forwarded to the dedicated helpers. Every
	// other width is processed row by row in 16-byte steps, using the single row
	// weight kObmcMask[height - 2 + y] for all columns of row y. Only the first
	// height - (height >> 2) rows are blended; the remaining rows are left
	// untouched.
	void OverlapBlendFromTop_SSE4_1(void* const prediction,
	                                const ptrdiff_t prediction_stride,
	                                const int width, const int height,
	                                const void* const obmc_prediction,
	                                const ptrdiff_t obmc_prediction_stride) {
	  auto* dst_row = static_cast<uint8_t*>(prediction);
	  const auto* src_row = static_cast<const uint8_t*>(obmc_prediction);

	  if (width <= 4) {
	    OverlapBlendFromTop4xH_SSE4_1(dst_row, prediction_stride, height, src_row,
	                                  obmc_prediction_stride);
	    return;
	  }
	  if (width == 8) {
	    OverlapBlendFromTop8xH_SSE4_1(dst_row, prediction_stride, height, src_row,
	                                  obmc_prediction_stride);
	    return;
	  }

	  const uint8_t* const row_masks = kObmcMask + height - 2;
	  // Stop when mask value becomes 64.
	  const int blend_rows = height - (height >> 2);
	  const __m128i sixty_four = _mm_set1_epi8(64);
	  int row = 0;
	  do {
	    // All columns of a row share one weight, so the same {mask, 64 - mask}
	    // pairs serve both the low and the high half of each 16-byte vector.
	    const __m128i pred_weight = _mm_set1_epi8(row_masks[row]);
	    const __m128i obmc_weight = _mm_sub_epi8(sixty_four, pred_weight);
	    const __m128i weight_pairs = _mm_unpacklo_epi8(pred_weight, obmc_weight);

	    int x = 0;
	    do {
	      uint8_t* const dst = dst_row + x;
	      const __m128i dst_px = LoadUnaligned16(dst);
	      const __m128i src_px = LoadUnaligned16(src_row + x);
	      const __m128i weighted_lo =
	          _mm_maddubs_epi16(_mm_unpacklo_epi8(dst_px, src_px), weight_pairs);
	      const __m128i blended_lo = RightShiftWithRounding_U16(weighted_lo, 6);
	      const __m128i weighted_hi =
	          _mm_maddubs_epi16(_mm_unpackhi_epi8(dst_px, src_px), weight_pairs);
	      const __m128i blended_hi = RightShiftWithRounding_U16(weighted_hi, 6);
	      StoreUnaligned16(dst, _mm_packus_epi16(blended_lo, blended_hi));
	      x += 16;
	    } while (x < width);

	    dst_row += prediction_stride;
	    src_row += obmc_prediction_stride;
	  } while (++row < blend_rows);
	}

	// Registers the SSE4.1 OBMC blend functions in the writable 8bpp Dsp table.
	// Each entry is only installed when its DSP_ENABLED_8BPP_SSE4_1() check
	// passes.
	void Init8bpp() {
	  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
	  assert(dsp != nullptr);
	  // Both assignments below can be preprocessed away and assert() expands to
	  // nothing under NDEBUG; keep |dsp| "used" so such builds stay free of
	  // unused-variable warnings.
	  static_cast<void>(dsp);
	#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
	  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
	#endif
	#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
	  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
	#endif
	}

	} // namespace

	// Externally visible initializer for the SSE4.1 OBMC functions; it simply
	// runs the 8bpp table setup.
	void ObmcInit_SSE4_1() {
	  Init8bpp();
	}

	} // namespace dsp
	} // namespace libgav1

	#else // !LIBGAV1_ENABLE_SSE4_1

	namespace libgav1 {
	namespace dsp {

	// Fallback used when LIBGAV1_ENABLE_SSE4_1 is false: there is nothing to
	// register, but the symbol is still defined, with an empty body.
	void ObmcInit_SSE4_1() {
	}

	}  // namespace dsp
	}  // namespace libgav1
	#endif // LIBGAV1_ENABLE_SSE4_1