| /****************************************************************************** |
| * |
| * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ******************************************************************************/ |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevc_32x32_itrans_recon_x86_intr.c |
| * |
| * @brief |
| * Contains function definitions for inverse quantization, inverse |
| * transform and reconstruction |
| * |
| * @author |
| * 100470 |
| * |
| * @par List of Functions: |
| * - ihevc_itrans_recon_32x32_sse42() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| #include <stdio.h> |
| #include <string.h> |
| #include "ihevc_typedefs.h" |
| #include "ihevc_platform_macros.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_defs.h" |
| #include "ihevc_trans_tables.h" |
| #include "ihevc_iquant_itrans_recon.h" |
| #include "ihevc_func_selector.h" |
| #include "ihevc_trans_macros.h" |
| |
| #include <emmintrin.h> |
| #include <smmintrin.h> |
| #include <tmmintrin.h> |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * This function performs inverse quantization, inverse transform and |
| * reconstruction for 16x16 input block |
| * |
| * @par Description: |
| * Performs inverse quantization , inverse transform and adds the |
| * prediction data and clips output to 8 bit |
| * |
| * @param[in] pi2_src |
| * Input 16x16 coefficients |
| * |
| * @param[in] pi2_tmp |
| * Temporary 16x16 buffer for storing inverse |
| * transform 1st stage output |
| * |
| * @param[in] pu1_pred |
| * Prediction 16x16 block |
| * |
| * @param[in] pi2_dequant_coeff |
| * Dequant Coeffs |
| * |
| * @param[out] pu1_dst |
| * Output 16x16 block |
| * |
| * @param[in] qp_div |
| * Quantization parameter / 6 |
| * |
| * @param[in] qp_rem |
| * Quantization parameter % 6 |
| * |
| * @param[in] src_strd |
| * Input stride |
| * |
| * @param[in] pred_strd |
| * Prediction stride |
| * |
| * @param[in] dst_strd |
| * Output Stride |
| * |
| * @param[in] zero_cols |
| * Zero columns in pi2_src |
| * |
| * @returns Void |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| /**/ |
| |
| void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src, |
| WORD16 *pi2_tmp, |
| UWORD8 *pu1_pred, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 pred_strd, |
| WORD32 dst_strd, |
| WORD32 zero_cols, |
| WORD32 zero_rows) |
| { |
| /* Inverse Transform */ |
| |
| WORD32 j; |
| |
| |
| WORD16 *pi2_tmp_orig; |
| |
| |
| WORD16 *o_temp_ptr; |
| WORD16 *temp_ptr; |
| |
| __m128i m_temp_reg_0; |
| __m128i m_temp_reg_1; |
| __m128i m_temp_reg_2; |
| __m128i m_temp_reg_3; |
| __m128i m_temp_reg_4; |
| __m128i m_temp_reg_5; |
| __m128i m_temp_reg_6; |
| __m128i m_temp_reg_7; |
| __m128i m_temp_reg_10; |
| __m128i m_temp_reg_11; |
| __m128i m_temp_reg_12; |
| __m128i m_temp_reg_13; |
| __m128i m_temp_reg_14; |
| __m128i m_temp_reg_15; |
| __m128i m_temp_reg_16; |
| __m128i m_temp_reg_17; |
| __m128i m_temp_reg_18; |
| __m128i m_temp_reg_19; |
| __m128i m_temp_reg_20; |
| __m128i m_temp_reg_21; |
| __m128i m_temp_reg_22; |
| __m128i m_temp_reg_23; |
| __m128i m_temp_reg_30; |
| __m128i m_temp_reg_31; |
| __m128i m_temp_reg_32; |
| __m128i m_temp_reg_33; |
| __m128i m_temp_reg_34; |
| __m128i m_temp_reg_35; |
| __m128i m_temp_reg_36; |
| __m128i m_temp_reg_37; |
| __m128i m_temp_reg_40; |
| __m128i m_temp_reg_41; |
| __m128i m_temp_reg_42; |
| __m128i m_temp_reg_43; |
| __m128i m_temp_reg_44; |
| __m128i m_temp_reg_45; |
| __m128i m_temp_reg_46; |
| __m128i m_temp_reg_47; |
| |
| __m128i m_temp_reg_70; |
| __m128i m_temp_reg_71; |
| __m128i m_temp_reg_72; |
| __m128i m_temp_reg_73; |
| __m128i m_temp_reg_74; |
| __m128i m_temp_reg_75; |
| __m128i m_temp_reg_76; |
| __m128i m_temp_reg_77; |
| |
| __m128i m_temp_reg_80; |
| __m128i m_temp_reg_81; |
| __m128i m_temp_reg_82; |
| __m128i m_temp_reg_83; |
| __m128i m_temp_reg_84; |
| __m128i m_temp_reg_85; |
| __m128i m_temp_reg_86; |
| __m128i m_temp_reg_87; |
| |
| __m128i m_temp_reg_90; |
| __m128i m_temp_reg_91; |
| __m128i m_temp_reg_92; |
| __m128i m_temp_reg_93; |
| __m128i m_temp_reg_94; |
| __m128i m_temp_reg_95; |
| __m128i m_temp_reg_96; |
| __m128i m_temp_reg_97; |
| |
| __m128i m_rdng_factor; |
| __m128i m_count; |
| __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; |
| __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; |
| |
| __m128i temp1, temp2, temp3, temp4; |
| __m128i temp5, temp6, temp7, temp8; |
| |
| __m128i all_zero_reg; |
| WORD32 i; |
| |
| /*Lokesh*/ |
| WORD32 zero_last24_cols_stg1; |
| WORD32 zero_last24_rows_stg1; |
| WORD32 zero_last28_rows_stg1; |
| |
| WORD32 zero_last28_rows_stg2; |
| WORD32 zero_last24_rows_stg2; |
| |
| WORD32 trans_size_stg1; |
| |
| WORD32 i4_shift = IT_SHIFT_STAGE_1; |
| WORD32 trans_size = TRANS_SIZE_32; |
| |
| |
| /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ |
| zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; |
| zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; |
| zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0; |
| |
| zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0; |
| zero_last24_rows_stg2 = zero_last24_cols_stg1; |
| |
| if((zero_last28_rows_stg2) || (zero_last24_cols_stg1)) |
| { |
| trans_size_stg1 = 8; |
| |
| } |
| else |
| { |
| trans_size_stg1 = 32; |
| } |
| |
| all_zero_reg = _mm_setzero_si128(); |
| |
| o_temp_ptr = pi2_tmp; |
| temp_ptr = (pi2_tmp + 1024); |
| |
| pi2_tmp += 2048; |
| pi2_tmp_orig = pi2_tmp; |
| |
| for(i = 0; i < trans_size_stg1; i += 8) |
| { |
| |
| { |
| WORD16 *pi2_tmp_src = pi2_src; |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| |
| m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| } |
| |
| if(zero_last28_rows_stg1) |
| { |
| /* eeo */ |
| /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| { |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); |
| |
| m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_40 = m_temp_reg_14; |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_43 = m_temp_reg_14; |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; |
| |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); |
| |
| m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_44 = m_temp_reg_14; |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_47 = m_temp_reg_14; |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; |
| |
| |
| } |
| /* eo */ |
| { |
| WORD16 *pi2_scratch = o_temp_ptr; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); |
| |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /**************************************************************************/ |
| |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo4[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| /* eo4[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /***********************************************************************/ |
| |
| /* eo5[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo5[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo6[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo6[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo7[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo7[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| } |
| } |
| else if(zero_last24_rows_stg1) |
| { |
| { |
| /* eeo */ |
| /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); |
| |
| m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_40 = m_temp_reg_14; |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_43 = m_temp_reg_14; |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; |
| |
| /* for row 4 to 7 */ |
| |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); |
| |
| m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_44 = m_temp_reg_14; |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_47 = m_temp_reg_14; |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; |
| |
| |
| // eeo[] |
| /* for(k = 0; k < 4; k++) */ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); |
| |
| m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); |
| |
| m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); |
| |
| m_temp_reg_33 = _mm_setzero_si128(); |
| |
| /* eeo */ |
| { |
| /* eeo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| m_temp_reg_90 = m_temp_reg_34; |
| m_temp_reg_97 = m_temp_reg_35; |
| } |
| /* eeo0[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| m_temp_reg_91 = m_temp_reg_34; |
| m_temp_reg_96 = m_temp_reg_35; |
| |
| } |
| |
| /* eeo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); |
| |
| /* e[1][0-3] stored in pi2_tmp[2][0-7] */ |
| /* e[6][0-3] stored in pi2_tmp[2][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); |
| |
| m_temp_reg_92 = m_temp_reg_34; |
| m_temp_reg_95 = m_temp_reg_35; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); |
| |
| /* e[1][4-7] stored in pi2_tmp[3][0-7] */ |
| /* e[6][4-7] stored in pi2_tmp[3][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); |
| |
| m_temp_reg_93 = m_temp_reg_34; |
| m_temp_reg_94 = m_temp_reg_35; |
| |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| temp1 = m_temp_reg_34; |
| temp7 = m_temp_reg_35; |
| |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| temp2 = m_temp_reg_34; |
| temp6 = m_temp_reg_35; |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); |
| |
| temp3 = m_temp_reg_34; |
| temp5 = m_temp_reg_35; |
| |
| } |
| |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); |
| |
| temp4 = m_temp_reg_34; |
| temp8 = m_temp_reg_35; |
| |
| |
| } |
| /* All values of ee[] array in pi2_temp */ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); |
| |
| } |
| } |
| /* eo */ |
| { |
| |
| WORD16 *pi2_scratch = o_temp_ptr; |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 |
| |
| /* eo2[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo2[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /**************************************************************************/ |
| |
| |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 |
| |
| /* eo3[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo3[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 |
| |
| /* eo4[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| /* eo4[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /***********************************************************************/ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 |
| |
| /* eo5[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo5[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 |
| |
| /* eo6[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo6[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 |
| |
| /* eo7[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo7[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| } |
| |
| } |
| else |
| { |
| |
| { |
| /* eeo */ |
| /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ |
| |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ |
| |
| /* for row 4 to 7 */ |
| |
| m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); |
| m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); |
| |
| /* Interleaving row 8 and row 24*/ |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); |
| |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); |
| |
| m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ |
| |
| |
| /* eeeo[0]= m_temp_reg_20 */ |
| /* eeeo[1]= m_temp_reg_21 */ |
| /* eeee[0]= m_temp_reg_22 */ |
| /* eeee[1]= m_temp_reg_23 */ |
| |
| /* eee[0] = eeee[0] + eeeo[0]; */ |
| m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ |
| |
| /* eee[3] = eeee[0] - eeeo[0]; */ |
| m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ |
| |
| /* eee[2] = eeee[1] - eeeo[1]; */ |
| m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ |
| |
| /* eee[1] = eeee[1] + eeeo[1];*/ |
| m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ |
| |
| |
| // eeo[] |
| /* for(k = 0; k < 4; k++) */ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 |
| |
| /* eeo */ |
| { |
| /* eeo0[0-3] */ |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| } |
| |
| m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); |
| m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); |
| m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); |
| m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); |
| |
| /* eeo0[4-7] */ |
| { |
| m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); |
| m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| } |
| |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 |
| |
| /* eeo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); |
| |
| m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); |
| m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| } |
| |
| /* eeo1[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); |
| |
| m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); |
| m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| |
| } |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 |
| |
| /* eeo2[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); |
| temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| } |
| |
| /* eeo2[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); |
| temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| } |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 |
| |
| /* eeo3[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); |
| |
| temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); |
| temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| |
| } |
| |
| /* eeo3[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); |
| temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); |
| temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); |
| |
| } |
| |
| |
| /* All values of ee[] array in pi2_temp */ |
| |
| /* for(k = 0; k < 8; k++) */ |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 |
| } |
| } |
| /* eo */ |
| { |
| |
| WORD16 *pi2_scratch = o_temp_ptr; |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); |
| m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); |
| |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); |
| m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); |
| m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); |
| |
| m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); |
| m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); |
| m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); |
| m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); |
| m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); |
| m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 |
| |
| /* eo1[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo2[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| /**************************************************************************/ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 |
| |
| /* eo4[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo4[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /***********************************************************************/ |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 |
| |
| /* eo5[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo5[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 |
| |
| /* eo6[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo6[4-7] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 |
| |
| /* eo7[0-3] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo7[4-7] */ |
| { |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); |
| |
| m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); |
| |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| } |
| |
| } |
| /* All e[] are done */ |
| /****************************/ |
| |
| { |
| |
| WORD16 *pi2_tmp_src = pi2_src + src_strd; |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| |
| m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| } |
| |
| if(zero_last28_rows_stg1) |
| { |
| /* o & stage 1 out */ |
| { |
| WORD32 j; |
| WORD16 *pi2_src_scratch = o_temp_ptr; |
| WORD16 *pi2_dst_scratch = temp_ptr; |
| WORD32 out_stride = (trans_size << 1); |
| WORD32 in_stride = trans_size; |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| } |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); |
| |
| /* o1[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); |
| |
| /* o2[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); |
| |
| /* o5[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); |
| |
| /* o7[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); |
| |
| /* o8[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); |
| |
| /* o9[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); |
| |
| /* o10[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); |
| |
| /* o11[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); |
| |
| /* o12[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); |
| |
| /* o13[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); |
| |
| /* o14[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); |
| |
| /* o15[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| |
| } |
| } |
| } |
| else if(zero_last24_rows_stg1) |
| { |
| /* o & stage 1 out */ |
| { |
| WORD32 j; |
| |
| WORD16 *pi2_src_scratch = o_temp_ptr; |
| WORD16 *pi2_dst_scratch = temp_ptr; |
| WORD32 out_stride = (trans_size << 1); |
| |
| WORD32 in_stride = trans_size; |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); |
| m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); |
| } |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); |
| |
| /* o0[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); |
| |
| /* o8[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); |
| |
| /* o9[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); |
| |
| /* o10[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); |
| |
| /* o11[0-3] */ |
| { |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); |
| |
| /* o12[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); |
| |
| /* o13[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); |
| |
| /* o14[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); |
| |
| /* o15[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| |
| } |
| } |
| } |
| else |
| { |
| /* o & stage 1 out */ |
| { |
| WORD32 j; |
| |
| WORD16 *pi2_src_scratch = o_temp_ptr; |
| WORD16 *pi2_dst_scratch = temp_ptr; |
| WORD32 out_stride = (trans_size << 1); |
| |
| WORD32 in_stride = trans_size; |
| |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); |
| m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); |
| m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); |
| m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); |
| m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); |
| m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); |
| m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); |
| |
| m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); |
| m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); |
| m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); |
| m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); |
| m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); |
| m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); |
| m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); |
| m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved |
| m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved |
| temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved |
| temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved |
| temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved |
| temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved |
| |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); |
| |
| m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); |
| m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); |
| m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); |
| m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); |
| |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); |
| |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); |
| |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); |
| |
| m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); |
| m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); |
| m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); |
| m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); |
| |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); |
| |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| |
| } |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); |
| m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); |
| |
| m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); |
| m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); |
| m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); |
| m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); |
| |
| m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); |
| |
| m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_s
|