| /****************************************************************************** |
| * |
| * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ******************************************************************************/ |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevc_weighted_pred_x86_intr.c |
| * |
| * @brief |
| * Contains function definitions for weighted prediction used in inter |
| * prediction |
| * |
| * @author |
| * |
| * |
| * @par List of Functions: |
| * - ihevc_weighted_pred_uni_sse42() |
| * - ihevc_weighted_pred_bi_sse42() |
| * - ihevc_weighted_pred_bi_default_sse42() |
| * - ihevc_weighted_pred_chroma_uni_sse42() |
| * - ihevc_weighted_pred_chroma_bi_sse42() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| /*****************************************************************************/ |
| /* File Includes */ |
| /*****************************************************************************/ |
| #include <stdio.h> |
| #include <assert.h> |
| |
| #include "ihevc_debug.h" |
| #include "ihevc_typedefs.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_platform_macros.h" |
| #include "ihevc_func_selector.h" |
| #include "ihevc_defs.h" |
| #include "ihevc_weighted_pred.h" |
| #include "ihevc_inter_pred.h" |
| |
| #include <immintrin.h> |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
 *  Does uni-weighted prediction on the array pointed by pi2_src and stores
 *  it at the location pointed by pu1_dst
| * |
| * @par Description: |
 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
 *  offset
| * |
| * @param[in] pi2_src |
| * Pointer to the source |
| * |
| * @param[out] pu1_dst |
| * Pointer to the destination |
| * |
| * @param[in] src_strd |
| * Source stride |
| * |
| * @param[in] dst_strd |
| * Destination stride |
| * |
| * @param[in] wgt0 |
| * weight to be multiplied to the source |
| * |
 * @param[in] off0
 *  offset to be added after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
| * |
| * @param[in] lvl_shift |
| * added before shift and offset |
| * |
| * @param[in] ht |
| * height of the source |
| * |
| * @param[in] wd |
| * width of the source |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 dst_strd, |
| WORD32 wgt0, |
| WORD32 off0, |
| WORD32 shift, |
| WORD32 lvl_shift, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| WORD32 row, col, temp; |
| WORD32 dst0, dst1, dst2, dst3; |
| |
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit values packed in the register                      */
| __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b; |
| __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b; |
| |
| ASSERT(wd % 4 == 0); /* checking assumption*/ |
| ASSERT(ht % 4 == 0); /* checking assumption*/ |
| |
| temp = 1 << (shift - 1); |
| |
    // setting values in registers
| const_temp_4x32b = _mm_set1_epi32(temp); |
| lvl_shift_4x32b = _mm_set1_epi32(lvl_shift); |
| wgt0_4x32b = _mm_set1_epi32(wgt0); |
| off0_4x32b = _mm_set1_epi32(off0); |
| |
| if(0 == (wd & 7)) /* wd multiple of 8 case */ |
| { |
| __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; |
| |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 4) |
| { |
| for(col = 0; col < wd; col += 8) |
            { /* for row = 0, 1, 2, 3 */
| |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); |
| /* row = 2 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); |
| /* row = 3 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); |
| |
| /* row = 0 */ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); |
| /* row = 1 */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); |
| /* row = 2 */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4)); |
| /* row = 3 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4)); |
| |
                /* sign-extend the lower four 16-bit values to 32 bits */ /* First 4 pixels */
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); |
| |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */ |
| src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| |
                /* sign-extend the lower four 16-bit values to 32 bits */ /* Last 4 pixels */
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b); |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b); |
| |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); |
| src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ /* First 4 pixels */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b); |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b); |
| |
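                /* The two-stage pack below implements CLIP_U8: _mm_packs_epi32  */
                /* saturates the signed 32-bit results to 16 bits, and           */
                /* _mm_packus_epi16 then saturates those 16-bit values to        */
                /* unsigned 8 bits, i.e. clamps to the [0, 255] range.           */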
| src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b); |
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b); |
| src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b); |
| src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b); |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b); |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b); |
| |
                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/
| |
| /* To update pointer */ |
| pi2_src += 8; |
| pu1_dst += 8; |
| |
            } /* inner loop ends here (8 output values per row in each iteration) */
| |
| pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ |
| |
| } |
| } |
| else /* wd multiple of 4 case */ |
| { |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 4) |
| { |
| for(col = 0; col < wd; col += 4) |
            { /* for row = 0, 1, 2, 3 */
| |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); |
| /* row = 2 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); |
| /* row = 3 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); |
| |
                /* sign-extend the lower four 16-bit values to 32 bits */
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); |
| |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b); |
| src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp0_4x32b); |
| /* dst row = 1 to 3 */ |
| src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1); |
| src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2); |
| src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp1_4x32b); |
| dst2 = _mm_cvtsi128_si32(src_temp2_4x32b); |
| dst3 = _mm_cvtsi128_si32(src_temp3_4x32b); |
| |
| /* row = 1 to row = 3 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; |
| *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; |
| |
| /* To update pointer */ |
| pi2_src += 4; |
| pu1_dst += 4; |
| |
            } /* inner loop ends here (4 output values per row in each iteration) */
| |
| pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ |
| |
| } |
| } |
| } |
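
/* For reference, a minimal scalar sketch of the per-pixel operation that the */
/* SIMD function above implements, following the formula in its comment       */
/* block. The function name and the inline clamp are illustrative only and    */
/* are not part of this library's API; the block is kept disabled.            */
#if 0
static void weighted_pred_uni_scalar_sketch(WORD16 *pi2_src,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd,
                                            WORD32 dst_strd,
                                            WORD32 wgt0,
                                            WORD32 off0,
                                            WORD32 shift,
                                            WORD32 lvl_shift,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col, i4_tmp;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0 */
            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off0;
            /* clamp to the unsigned 8-bit output range */
            pu1_dst[col] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif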
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
 *  Does chroma uni-weighted prediction on the array pointed by pi2_src and
 *  stores it at the location pointed by pu1_dst
| * |
| * @par Description: |
 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
 *  offset
| * |
| * @param[in] pi2_src |
| * Pointer to the source |
| * |
| * @param[out] pu1_dst |
| * Pointer to the destination |
| * |
| * @param[in] src_strd |
| * Source stride |
| * |
| * @param[in] dst_strd |
| * Destination stride |
| * |
 * @param[in] wgt0_cb
 *  weight to be multiplied to the Cb source samples
 *
 * @param[in] wgt0_cr
 *  weight to be multiplied to the Cr source samples
 *
 * @param[in] off0_cb
 *  offset to be added to Cb after rounding and shifting
 *
 * @param[in] off0_cr
 *  offset to be added to Cr after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
| * |
| * @param[in] lvl_shift |
| * added before shift and offset |
| * |
| * @param[in] ht |
| * height of the source |
| * |
| * @param[in] wd |
| * width of the source (each colour component) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 dst_strd, |
| WORD32 wgt0_cb, |
| WORD32 wgt0_cr, |
| WORD32 off0_cb, |
| WORD32 off0_cr, |
| WORD32 shift, |
| WORD32 lvl_shift, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| WORD32 row, col, temp, wdx2; |
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit values packed in the register                      */
| |
| __m128i src_temp0_4x32b, src_temp1_4x32b; |
| __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b; |
| |
| ASSERT(wd % 2 == 0); /* checking assumption*/ |
| ASSERT(ht % 2 == 0); /* checking assumption*/ |
| |
| temp = 1 << (shift - 1); |
| wdx2 = 2 * wd; |
| |
    // setting values in registers
| const_temp_4x32b = _mm_set1_epi32(temp); |
| lvl_shift_4x32b = _mm_set1_epi32(lvl_shift); |
| wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); |
| off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb); |
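    /* _mm_set_epi32() fills lanes from its first (highest) argument down,  */
    /* so wgt0_4x32b and off0_4x32b hold {cb, cr, cb, cr} from lane 0       */
    /* upwards, matching the interleaved CbCr order of the chroma samples.  */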
| |
#if 0 /* Enable this for the ht % 4 == 0 case. It degraded performance for smaller sizes while improving it for larger ones. */
| if( 0 == (ht & 3)) /* ht multiple of 4 case */ |
| { |
        if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
| { |
| __m128i src_temp2_4x32b, src_temp3_4x32b; |
| __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; |
| __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b; |
| __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row +=4) |
| { |
| for(col = 0; col < wdx2; col +=16) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd)); |
| /* row = 0 */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4)); |
| /* row = 1 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4)); |
| /* row = 0 */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8)); |
| /* row = 1 */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8)); |
| /* row = 0 */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12)); |
| /* row = 1 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12)); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b); |
| src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b); |
| src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b); |
| src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b); |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b); |
| src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b); |
| |
| /* store 16 8-bit output values */ |
| _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/ |
| _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/ |
| |
| /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); |
| /* row = 3 */ |
| src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); |
| /* row = 2 */ /* Second 4 pixels */ |
| src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4)); |
| /* row = 3 */ |
| src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4)); |
| /* row = 2 */ /* Third 4 pixels */ |
| src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8)); |
| /* row = 3 */ |
| src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8)); |
| /* row = 2 */ /* Last 4 pixels */ |
| src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12)); |
| /* row = 3 */ |
| src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12)); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b); |
| src_temp9_4x32b = _mm_cvtepi16_epi32(src_temp9_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b); |
| src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp8_4x32b = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b); |
| src_temp9_4x32b = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */ |
| src_temp10_4x32b = _mm_cvtepi16_epi32(src_temp10_4x32b); |
| src_temp11_4x32b = _mm_cvtepi16_epi32(src_temp11_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b); |
| src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp10_4x32b = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b); |
| src_temp11_4x32b = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */ |
| src_temp12_4x32b = _mm_cvtepi16_epi32(src_temp12_4x32b); |
| src_temp13_4x32b = _mm_cvtepi16_epi32(src_temp13_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b); |
| src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp12_4x32b = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b); |
| src_temp13_4x32b = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ |
| src_temp14_4x32b = _mm_cvtepi16_epi32(src_temp14_4x32b); |
| src_temp15_4x32b = _mm_cvtepi16_epi32(src_temp15_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b); |
| src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp14_4x32b = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b); |
| src_temp15_4x32b = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b); |
| src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b, shift); |
| src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ |
| src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b); |
| src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b, shift); |
| src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ |
| src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b); |
| src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b, shift); |
| src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b); |
| src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b, shift); |
| src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b); |
| src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ |
| src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b); |
| src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ |
| src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b); |
| src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b); |
| src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b); |
| |
| src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b); |
| src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b); |
| src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b); |
| src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b); |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b); |
| src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b); |
| |
| /* store 16 8-bit output values */ |
| _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/ |
| _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/ |
| |
| pi2_src += 16; /* Pointer update */ |
| pu1_dst += 16; /* Pointer update */ |
| |
| } /* inner loop ends here(4-output values in single iteration) */ |
| pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ |
| } |
| } |
| else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ |
| { |
| __m128i src_temp2_4x32b,src_temp3_4x32b; |
| __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row +=4) |
| { |
| for(col = 0; col < wdx2; col +=8) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd)); |
| /* row = 2 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); |
| /* row = 3 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); |
| |
| /* row = 0 */ /* Last 4 pixels */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4)); |
| /* row = 1 */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4)); |
| /* row = 2 */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4)); |
| /* row = 3 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4)); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b); |
| src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b); |
| src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b); |
| src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b); |
| src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b); |
| src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b); |
| src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b); |
| src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b); |
| |
                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3*/
| |
| pi2_src += 8; /* Pointer update */ |
| pu1_dst += 8; /* Pointer update */ |
| |
| } /* inner loop ends here(4-output values in single iteration) */ |
| pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ |
| } |
| } |
| else /* 2*wd multiple of 4 case */ |
| { |
| WORD32 dst0, dst1, dst2, dst3; |
| __m128i src_temp2_4x32b,src_temp3_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row +=4) |
| { |
| for(col = 0; col < wdx2; col +=4) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd)); |
| /* row = 2 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); |
| /* row = 3 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); |
| |
| /* considering pix. 4:0 by converting 16-into 32 bit */ |
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b); |
| src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp0_4x32b); |
| /* dst row = 1 to 3 */ |
| src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1); |
| src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2); |
| src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp1_4x32b); |
| dst2 = _mm_cvtsi128_si32(src_temp2_4x32b); |
| dst3 = _mm_cvtsi128_si32(src_temp3_4x32b); |
| /* row = 1 */ |
| *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1; |
| /* row = 2 */ |
| *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2; |
| /* row = 3 */ |
| *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3; |
| |
| pi2_src += 4; /* Pointer update */ |
| pu1_dst += 4; /* Pointer update */ |
| |
| } /* inner loop ends here(4-output values in single iteration) */ |
| pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ |
| } |
| } |
| } |
| else /* ht multiple of 2 case */ |
| #endif |
| |
| { |
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
| { |
| __m128i src_temp2_4x32b, src_temp3_4x32b; |
| __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wdx2; col += 16) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); |
| |
| /* row = 0 */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); |
| /* row = 1 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); |
| /* row = 0 */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8)); |
| /* row = 1 */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8)); |
| /* row = 0 */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12)); |
| /* row = 1 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12)); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */ /* Second 4 pixels */
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */ /* Third 4 pixels */
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */ /* Last 4 pixels */
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b); |
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); |
| src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b); |
| src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b); |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b); |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b); |
| |
| /* store 16 8-bit output values */ |
| _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/ |
| _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/ |
| |
| pi2_src += 16; /* Pointer update */ |
| pu1_dst += 16; /* Pointer update */ |
| |
                } /* inner loop ends here (16 output values per row in each iteration) */
| pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ |
| } |
| } |
| else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ |
| { |
| __m128i src_temp2_4x32b, src_temp3_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wdx2; col += 8) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); |
| |
| /* row = 0 */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); |
| /* row = 1 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */ /* Last 4 pixels */
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| |
| /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b); |
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b); |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| |
                    /* store eight 8-bit output values */
| _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/ |
| _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/ |
| |
| pi2_src += 8; /* Pointer update */ |
| pu1_dst += 8; /* Pointer update */ |
| |
                } /* inner loop ends here (8 output values per row in each iteration) */
| pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ |
| } |
| } |
| else /* 2*wd multiple of 4 case */ |
| { |
| WORD32 dst0, dst1; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wdx2; col += 4) |
| { |
| /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); |
| /* row = 1 */ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); |
| |
                    /* sign-extend the lower four 16-bit values to 32 bits */
| src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| |
| /* (pi2_src[col] + lvl_shift)*/ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); |
| |
| /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ |
| src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| |
| /* i4_tmp += 1 << (shift - 1) */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| |
| /*i4_tmp = (i4_tmp >> shift) + off0; */ |
| src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); |
| |
| src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp); */ |
| src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp0_4x32b); |
                    /* dst row = 1 */
| src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp1_4x32b); |
| /* row = 1 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| |
| pi2_src += 4; /* Pointer update */ |
| pu1_dst += 4; /* Pointer update */ |
| |
                } /* inner loop ends here (4 output values per row in each iteration) */
| pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ |
| } |
| } |
| } |
| } |
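
/* A scalar sketch of the chroma path above, assuming NV12-style interleaved */
/* CbCr input where even positions carry Cb and odd positions carry Cr (wd   */
/* is the width of each colour component). The function name and the inline  */
/* clamps are illustrative only, not part of this library's API.             */
#if 0
static void weighted_pred_chroma_uni_scalar_sketch(WORD16 *pi2_src,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd,
                                                   WORD32 dst_strd,
                                                   WORD32 wgt0_cb,
                                                   WORD32 wgt0_cr,
                                                   WORD32 off0_cb,
                                                   WORD32 off0_cr,
                                                   WORD32 shift,
                                                   WORD32 lvl_shift,
                                                   WORD32 ht,
                                                   WORD32 wd)
{
    WORD32 row, col, i4_tmp;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col += 2)
        {
            /* Cb sample at even positions */
            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0_cb;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off0_cb;
            pu1_dst[col] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));

            /* Cr sample at odd positions */
            i4_tmp = (pi2_src[col + 1] + lvl_shift) * wgt0_cr;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off0_cr;
            pu1_dst[col + 1] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif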
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
 *  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
 *  pi2_src2 and stores it at the location pointed by pu1_dst
| * |
| * @par Description: |
 *  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
 *  ((off0 + off1 + 1) << (shift - 1)) ) >> shift
| * |
| * @param[in] pi2_src1 |
| * Pointer to source 1 |
| * |
| * @param[in] pi2_src2 |
| * Pointer to source 2 |
| * |
| * @param[out] pu1_dst |
| * Pointer to destination |
| * |
| * @param[in] src_strd1 |
| * Source stride 1 |
| * |
| * @param[in] src_strd2 |
| * Source stride 2 |
| * |
| * @param[in] dst_strd |
| * Destination stride |
| * |
| * @param[in] wgt0 |
| * weight to be multiplied to source 1 |
| * |
| * @param[in] off0 |
| * offset 0 |
| * |
| * @param[in] wgt1 |
| * weight to be multiplied to source 2 |
| * |
| * @param[in] off1 |
| * offset 1 |
| * |
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
| * |
| * @param[in] lvl_shift1 |
| * added before shift and offset |
| * |
| * @param[in] lvl_shift2 |
| * added before shift and offset |
| * |
| * @param[in] ht |
| * height of the source |
| * |
| * @param[in] wd |
| * width of the source |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1, |
| WORD16 *pi2_src2, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd1, |
| WORD32 src_strd2, |
| WORD32 dst_strd, |
| WORD32 wgt0, |
| WORD32 off0, |
| WORD32 wgt1, |
| WORD32 off1, |
| WORD32 shift, |
| WORD32 lvl_shift1, |
| WORD32 lvl_shift2, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| WORD32 row, col, temp; |
| |
| __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b; |
| __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b; |
| |
| |
| ASSERT(wd % 4 == 0); /* checking assumption*/ |
| ASSERT(ht % 2 == 0); /* checking assumption*/ |
| |
| temp = (off0 + off1 + 1) << (shift - 1); |
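    /* The constant computed above folds the rounding term and both offsets */
    /* together: (off0 + off1 + 1) << (shift - 1) equals (1 << (shift - 1)) */
    /* for rounding plus (off0 + off1) << (shift - 1), so (i4_tmp >> shift) */
    /* already carries the combined offset ((off0 + off1 + 1) >> 1).        */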
| |
    // setting values in registers
| const_temp_4x32b = _mm_set1_epi32(temp); |
| lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1); |
| lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2); |
| wgt0_4x32b = _mm_set1_epi32(wgt0); |
| wgt1_4x32b = _mm_set1_epi32(wgt1); |
| |
| if(0 == (wd & 7)) /* wd multiple of 8 case */ |
| { |
| __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wd; col += 8) |
| { |
| /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ |
| /* Next 4 pixels */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */ |
| src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */ |
| |
                /* sign-extend the lower four 16-bit values to 32 bits */
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| /* (pi2_src1[col] + lvl_shift1) */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b); |
| /* (pi2_src2[col] + lvl_shift2) */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b); |
| /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */ |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| /*(pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b); |
| |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b); |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b); |
| |
| /* Next 4 Pixels */ |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b); |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); |
| src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b); |
| src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); |
| src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b); |
| |
| /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b); |
| /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* Next 4 Pixels */ |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
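|                 /* signed 32->16 pack then unsigned 16->8 pack saturate the */ |
|                 /* results, giving the CLIP_U8() clamp to [0, 255] for free */ |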
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b); |
| src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b); |
| |
|                 /* store eight 8-bit output values per row */ |
| _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/ |
| _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/ |
| |
| pi2_src1 += 8; /* Pointer update */ |
| pi2_src2 += 8; /* Pointer update */ |
| pu1_dst += 8; /* Pointer update */ |
| |
|             } /* inner loop ends here (8 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ |
| |
| } /* outer loop ends */ |
| } |
| else /* wd multiple of 4 case */ |
| { |
| WORD32 dst0, dst1; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wd; col += 4) |
| { |
| /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ |
| |
|                 /* consider pixels 3:0 by converting 16-bit to 32-bit */ |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| /* (pi2_src1[col] + lvl_shift1) */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b); |
| /* (pi2_src2[col] + lvl_shift2) */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b); |
| /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */ |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| /*(pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b); |
| |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b); |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b); |
| |
| /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b); |
| |
| /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
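|                 /* after the pack, words 3:0 hold row 0 and words 7:4 hold row 1 */ |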
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp1_4x32b); |
| |
|                 /* dst row = 1 */ |
| src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp2_4x32b); |
| |
|                 /* row = 1 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| |
| pi2_src1 += 4; /* Pointer update */ |
| pi2_src2 += 4; /* Pointer update */ |
| pu1_dst += 4; /* Pointer update */ |
| |
|             } /* inner loop ends here (4 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ |
| |
| } /* outer loop ends */ |
| } |
| |
| } |
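| |
| /* |
|  * For reference, a minimal scalar sketch of the arithmetic the SIMD paths |
|  * above vectorise. The helper name is hypothetical and the block is kept |
|  * under #if 0 as documentation only; it assumes the CLIP_U8() macro |
|  * available to this file. |
|  */ |
| #if 0 |
| static void weighted_pred_bi_scalar_ref(WORD16 *pi2_src1, WORD16 *pi2_src2, |
|                                         UWORD8 *pu1_dst, |
|                                         WORD32 src_strd1, WORD32 src_strd2, |
|                                         WORD32 dst_strd, |
|                                         WORD32 wgt0, WORD32 off0, |
|                                         WORD32 wgt1, WORD32 off1, |
|                                         WORD32 shift, |
|                                         WORD32 lvl_shift1, WORD32 lvl_shift2, |
|                                         WORD32 ht, WORD32 wd) |
| { |
|     WORD32 row, col, i4_tmp; |
|     for(row = 0; row < ht; row++) |
|     { |
|         for(col = 0; col < wd; col++) |
|         { |
|             i4_tmp  = (pi2_src1[col] + lvl_shift1) * wgt0; |
|             i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1; |
|             i4_tmp += (off0 + off1 + 1) << (shift - 1); |
|             pu1_dst[col] = CLIP_U8(i4_tmp >> shift); |
|         } |
|         pi2_src1 += src_strd1; |
|         pi2_src2 += src_strd2; |
|         pu1_dst  += dst_strd; |
|     } |
| } |
| #endif |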
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and |
| * pi2_src2 and stores it at the location pointed by pu1_dst |
| * |
| * @par Description: |
| * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + |
| * off1 + 1) << (shift - 1) ) >> shift |
| * |
| * @param[in] pi2_src1 |
| * Pointer to source 1 |
| * |
| * @param[in] pi2_src2 |
| * Pointer to source 2 |
| * |
| * @param[out] pu1_dst |
| * Pointer to destination |
| * |
| * @param[in] src_strd1 |
| * Source stride 1 |
| * |
| * @param[in] src_strd2 |
| * Source stride 2 |
| * |
| * @param[in] dst_strd |
| * Destination stride |
| * |
| * @param[in] wgt0 |
| * weight to be multiplied to source 1 |
| * |
| * @param[in] off0 |
| * offset 0 |
| * |
| * @param[in] wgt1 |
| * weight to be multiplied to source 2 |
| * |
| * @param[in] off1 |
| * offset 1 |
| * |
| * @param[in] shift |
| * (14 Bit depth) + log2_weight_denominator |
| * |
| * @param[in] lvl_shift1 |
| * added before shift and offset |
| * |
| * @param[in] lvl_shift2 |
| * added before shift and offset |
| * |
| * @param[in] ht |
| * height of the source |
| * |
| * @param[in] wd |
| * width of the source (each colour component) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1, |
| WORD16 *pi2_src2, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd1, |
| WORD32 src_strd2, |
| WORD32 dst_strd, |
| WORD32 wgt0_cb, |
| WORD32 wgt0_cr, |
| WORD32 off0_cb, |
| WORD32 off0_cr, |
| WORD32 wgt1_cb, |
| WORD32 wgt1_cr, |
| WORD32 off1_cb, |
| WORD32 off1_cr, |
| WORD32 shift, |
| WORD32 lvl_shift1, |
| WORD32 lvl_shift2, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| WORD32 row, col, temp1, temp2; |
| WORD32 wdx2; |
| |
| __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b; |
| __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b; |
| |
| |
| ASSERT(wd % 2 == 0); /* checking assumption*/ |
| ASSERT(ht % 2 == 0); /* checking assumption*/ |
| |
| temp1 = (off0_cb + off1_cb + 1) << (shift - 1); |
| temp2 = (off0_cr + off1_cr + 1) << (shift - 1); |
| |
|     // setting values in registers |
| const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1); |
| lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1); |
| lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2); |
| wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); |
| wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb); |
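|     /* chroma samples are interleaved (Cb, Cr, Cb, Cr, ...), so the vectors */ |
|     /* above alternate Cb/Cr constants and each row spans 2 * wd samples */ |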
| |
| wdx2 = wd * 2; |
| |
| if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */ |
| { |
| __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wdx2; col += 8) |
| { |
| /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ |
| /* Next 4 pixels */ |
| src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */ |
| src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */ |
| src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */ |
| src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */ |
| |
|                 /* consider pixels 3:0 by converting 16-bit to 32-bit */ |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| /* (pi2_src1[col] + lvl_shift1) */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b); |
| /* (pi2_src2[col] + lvl_shift2) */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b); |
| /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */ |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| /*(pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b); |
| |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b); |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b); |
| |
| /* Next 4 Pixels */ |
| src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); |
| src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b); |
| src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b); |
| src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); |
| src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b); |
| src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); |
| src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b); |
| src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b); |
| src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); |
| src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b); |
| |
| /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b); |
| /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| /* (i4_tmp >> shift) */ |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| /* Next 4 Pixels */ |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b); |
| src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); |
| src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); |
| src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); |
| src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); |
| |
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b); |
| src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b); |
| |
|                 /* store eight 8-bit output values per row */ |
| _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/ |
| _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/ |
| |
| pi2_src1 += 8; /* Pointer update */ |
| pi2_src2 += 8; /* Pointer update */ |
| pu1_dst += 8; /* Pointer update */ |
| |
|             } /* inner loop ends here (8 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ |
| |
| } /* outer loop ends */ |
| } |
| else /* wdx2 multiple of 4 case */ |
| { |
| WORD32 dst0, dst1; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wdx2; col += 4) |
| { |
| /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ |
| src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ |
| src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ |
| src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ |
| src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ |
| |
|                 /* consider pixels 3:0 by converting 16-bit to 32-bit */ |
| src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); |
| src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); |
| /* (pi2_src1[col] + lvl_shift1) */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b); |
| /* (pi2_src2[col] + lvl_shift2) */ |
| src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b); |
| /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */ |
| src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); |
| /*(pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b); |
| |
| src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); |
| src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b); |
| src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b); |
| src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); |
| src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b); |
| |
| /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b); |
| |
| /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ |
| src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); |
| src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); |
| src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); |
| |
| src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp1_4x32b); |
| |
|                 /* dst row = 1 */ |
| src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp2_4x32b); |
| |
|                 /* row = 1 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| |
| pi2_src1 += 4; /* Pointer update */ |
| pi2_src2 += 4; /* Pointer update */ |
| pu1_dst += 4; /* Pointer update */ |
| |
|             } /* inner loop ends here (4 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ |
|         } /* outer loop ends */ |
| } |
| |
| } |
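| |
| /* |
|  * Minimal scalar sketch of the interleaved-chroma arithmetic above |
|  * (hypothetical helper, kept under #if 0 as documentation only): even |
|  * columns carry Cb and odd columns carry Cr, so the weights and offsets |
|  * alternate with col & 1. |
|  */ |
| #if 0 |
| static void weighted_pred_chroma_bi_scalar_ref(WORD16 *pi2_src1, |
|                                                WORD16 *pi2_src2, |
|                                                UWORD8 *pu1_dst, |
|                                                WORD32 src_strd1, |
|                                                WORD32 src_strd2, |
|                                                WORD32 dst_strd, |
|                                                WORD32 wgt0_cb, WORD32 wgt0_cr, |
|                                                WORD32 off0_cb, WORD32 off0_cr, |
|                                                WORD32 wgt1_cb, WORD32 wgt1_cr, |
|                                                WORD32 off1_cb, WORD32 off1_cr, |
|                                                WORD32 shift, |
|                                                WORD32 lvl_shift1, |
|                                                WORD32 lvl_shift2, |
|                                                WORD32 ht, WORD32 wd) |
| { |
|     WORD32 row, col, i4_tmp; |
|     for(row = 0; row < ht; row++) |
|     { |
|         for(col = 0; col < 2 * wd; col++) /* wd is per colour component */ |
|         { |
|             WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb; |
|             WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb; |
|             WORD32 off  = (col & 1) ? (off0_cr + off1_cr) : (off0_cb + off1_cb); |
| |
|             i4_tmp  = (pi2_src1[col] + lvl_shift1) * wgt0; |
|             i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1; |
|             i4_tmp += (off + 1) << (shift - 1); |
|             pu1_dst[col] = CLIP_U8(i4_tmp >> shift); |
|         } |
|         pi2_src1 += src_strd1; |
|         pi2_src2 += src_strd2; |
|         pu1_dst  += dst_strd; |
|     } |
| } |
| #endif |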
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * Does default bi-weighted prediction on the arrays pointed by pi2_src1 and |
| * pi2_src2 and stores it at the location pointed by pu1_dst |
| * |
| * @par Description: |
| * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) ) |
| * >> shift, where shift = 15 - BitDepth |
| * |
| * @param[in] pi2_src1 |
| * Pointer to source 1 |
| * |
| * @param[in] pi2_src2 |
| * Pointer to source 2 |
| * |
| * @param[out] pu1_dst |
| * Pointer to destination |
| * |
| * @param[in] src_strd1 |
| * Source stride 1 |
| * |
| * @param[in] src_strd2 |
| * Source stride 2 |
| * |
| * @param[in] dst_strd |
| * Destination stride |
| * |
| * @param[in] lvl_shift1 |
| * added before shift and offset |
| * |
| * @param[in] lvl_shift2 |
| * added before shift and offset |
| * |
| * @param[in] ht |
| * height of the source |
| * |
| * @param[in] wd |
| * width of the source |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| * Assumption : ht%2 == 0, wd%4 == 0 |
| * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, |
| * the final result will match even if the intermediate precision is 16-bit. |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1, |
| WORD16 *pi2_src2, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd1, |
| WORD32 src_strd2, |
| WORD32 dst_strd, |
| WORD32 lvl_shift1, |
| WORD32 lvl_shift2, |
| WORD32 ht, |
| WORD32 wd) |
| { |
| WORD32 row, col, temp; |
| WORD32 shift; |
| |
| __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; |
| __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b; |
| __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; |
| |
| ASSERT(wd % 4 == 0); /* checking assumption*/ |
| ASSERT(ht % 2 == 0); /* checking assumption*/ |
| |
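|     /* shift works out to 15 - BitDepth, as noted in the brief above */ |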
| shift = SHIFT_14_MINUS_BIT_DEPTH + 1; |
| temp = 1 << (shift - 1); |
| |
|     // setting values in registers |
| lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1); |
| lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2); |
| const_temp_8x16b = _mm_set1_epi16(temp); |
| |
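|     /* fold lvl_shift1 + lvl_shift2 + rounding bias into a single vector; */ |
|     /* per the remark above, 16-bit saturating adds are sufficient here */ |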
| lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b); |
| lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b); |
| |
| if(0 == (ht & 3)) /* ht multiple of 4*/ |
| { |
| if(0 == (wd & 15)) /* wd multiple of 16 case */ |
| { |
| __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; |
| __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 4) |
| { |
| for(col = 0; col < wd; col += 16) |
| { |
| /*load 8 pixel values */ /* First 8 Values */ |
| src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); |
| src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); |
| /* row = 1 */ |
| src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); |
| src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); |
| /* row = 2 */ |
| src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); |
| src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); |
| /* row = 3 */ |
| src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); |
| src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); |
| |
| /*load 8 pixel values */ /* Second 8 Values */ |
| src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); |
| src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); |
| /* row = 1 */ |
| src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); |
| src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); |
| /* row = 2 */ |
| src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); |
| src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8)); |
| |
| /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); |
| src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); |
| src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); |
| |
| /*load 8 pixel values */ /* Second 8 Values */ |
| /* row = 3 */ |
| src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); |
| src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); |
| |
| /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); |
| src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); |
| src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); |
| |
| /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ |
| src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); |
| src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); |
| src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); |
| src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); |
| |
| /* (i4_tmp >> shift) */ /* First 8 Values */ |
| src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); |
| src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); |
| src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); |
| src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); |
| |
| /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ |
| src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); |
| src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); |
| src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); |
| src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); |
| |
| /* (i4_tmp >> shift) */ /* Second 8 Values */ |
| src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); |
| src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); |
| src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); |
| src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); |
| |
|                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* sixteen 8-bit values */ |
| src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b); |
| src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b); |
| src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b); |
| src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b); |
| |
|                     /* store sixteen 8-bit output values */ |
| _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ |
|                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ |
|                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/ |
| _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ |
| |
| /* To update pointer */ |
| pi2_src1 += 16; |
| pi2_src2 += 16; |
| pu1_dst += 16; |
| |
|                 } /* inner loop ends here (16 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ |
| |
| } |
| } |
| else if(0 == (wd & 7)) /* multiple of 8 case */ |
| { |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 4) |
| { |
| for(col = 0; col < wd; col += 8) |
| { |
| /*load 8 pixel values */ |
| src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); |
| src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); |
| /* row = 1 */ |
| src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); |
| src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); |
| /* row = 2 */ |
| src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); |
| src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); |
| /* row = 3 */ |
| src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); |
| src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); |
| |
| /* (pi2_src1[col] + pi2_src2[col]) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); |
| src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); |
| src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); |
| |
| /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); |
| src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); |
| src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); |
| src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); |
| src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); |
| src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); |
| src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); |
| src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); |
| src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); |
| |
|                     /* store eight 8-bit output values per row */ |
|                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ |
|                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ |
|                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/ |
|                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ |
| |
| /* To update pointer */ |
| pi2_src1 += 8; |
| pi2_src2 += 8; |
| pu1_dst += 8; |
| |
|                 } /* inner loop ends here (8 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ |
| |
| } |
| } |
| else /* wd multiple of 4 case*/ |
| { |
| WORD32 dst0, dst1, dst2, dst3; |
| |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 4) |
| { |
| for(col = 0; col < wd; col += 4) |
| { |
|                     /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/ |
|                     src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); |
|                     src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); |
| |
| /* row = 1 */ |
| src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); |
| src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); |
| /* row = 2 */ |
| src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); |
| src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); |
| /* row = 3 */ |
| src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); |
| src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); |
| |
| /* Pack two rows together */ |
| src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); |
| src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); |
| src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); |
| src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); |
| |
| /* (pi2_src1[col] + pi2_src2[col]) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); |
| |
| /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); |
| src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); |
| src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); |
| src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); |
|                     /* dst rows 1 and 3 */ |
| src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); |
| src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); |
| dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); |
| dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); |
| |
| /* row = 1 to row = 3 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; |
| *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; |
| |
| /* To update pointer */ |
| pi2_src1 += 4; |
| pi2_src2 += 4; |
| pu1_dst += 4; |
| |
|                 } /* inner loop ends here (4 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ |
| |
| } |
| } |
| } |
| else /* ht multiple of 2 case and wd multiple of 4 case*/ |
| { |
| |
| WORD32 dst0, dst1; |
| |
| /* outer for loop starts from here */ |
| for(row = 0; row < ht; row += 2) |
| { |
| for(col = 0; col < wd; col += 4) |
| { |
|                 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/ |
|                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); |
|                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); |
| |
| /* row = 1 */ |
| src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); |
| src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); |
| |
| /* Pack two rows together */ |
| src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); |
| src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); |
| |
| /* (pi2_src1[col] + pi2_src2[col]) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); |
| |
| /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ |
| src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); |
| |
| /* (i4_tmp >> shift) */ |
| src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); |
| |
| /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ |
| src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); |
| |
| dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); |
|                 /* dst row = 1 */ |
| src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); |
| |
| /* store four 8-bit output values */ |
| *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; |
| |
| dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); |
| |
|                 /* row = 1 */ |
| *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; |
| |
| /* To update pointer */ |
| pi2_src1 += 4; |
| pi2_src2 += 4; |
| pu1_dst += 4; |
| |
|             } /* inner loop ends here (4 output values per row per iteration) */ |
| |
| pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ |
| pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ |
| pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ |
| |
| } |
| |
| } |
| |
| } |
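| |
| /* |
|  * Minimal scalar sketch of the default bi-prediction above (hypothetical |
|  * helper, kept under #if 0 as documentation only). Under the assumptions |
|  * listed in the remark, the 16-bit saturating SIMD paths above compute |
|  * the same result as this 32-bit scalar form. |
|  */ |
| #if 0 |
| static void weighted_pred_bi_default_scalar_ref(WORD16 *pi2_src1, |
|                                                 WORD16 *pi2_src2, |
|                                                 UWORD8 *pu1_dst, |
|                                                 WORD32 src_strd1, |
|                                                 WORD32 src_strd2, |
|                                                 WORD32 dst_strd, |
|                                                 WORD32 lvl_shift1, |
|                                                 WORD32 lvl_shift2, |
|                                                 WORD32 ht, WORD32 wd) |
| { |
|     WORD32 row, col, i4_tmp; |
|     WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1; /* 15 - BitDepth */ |
| |
|     for(row = 0; row < ht; row++) |
|     { |
|         for(col = 0; col < wd; col++) |
|         { |
|             i4_tmp = pi2_src1[col] + pi2_src2[col] |
|                    + lvl_shift1 + lvl_shift2 + (1 << (shift - 1)); |
|             pu1_dst[col] = CLIP_U8(i4_tmp >> shift); |
|         } |
|         pi2_src1 += src_strd1; |
|         pi2_src2 += src_strd2; |
|         pu1_dst  += dst_strd; |
|     } |
| } |
| #endif |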