| /****************************************************************************** |
| * |
| * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ******************************************************************************/ |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevc_16x16_itrans_recon_x86_intr.c |
| * |
| * @brief |
| * Contains function definitions for inverse |
| * transform and reconstruction for 16x16. |
| * |
| * @author |
| * 100470 |
| * 100592 (edited by) |
| * |
| * @par List of Functions: |
| * - ihevc_itrans_recon_16x16_sse42() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| #include <stdio.h> |
| #include <string.h> |
| #include "ihevc_typedefs.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_platform_macros.h" |
| #include "ihevc_defs.h" |
| #include "ihevc_trans_tables.h" |
| #include "ihevc_itrans_recon.h" |
| #include "ihevc_func_selector.h" |
| #include "ihevc_trans_macros.h" |
| |
| #include <immintrin.h> |
| #include <emmintrin.h> |
| #include <smmintrin.h> |
| #include <tmmintrin.h> |
| |
| /** |
| ******************************************************************************* |
| * |
| * @brief |
| * This function performs inverse quantization, inverse transform and |
| * reconstruction for 16x16 input block |
| * |
| * @par Description: |
| * Performs inverse quantization , inverse transform and adds the |
| * prediction data and clips output to 8 bit |
| * |
| * @param[in] pi2_src |
| * Input 16x16 coefficients |
| * |
| * @param[in] pi2_tmp |
| * Temporary 16x16 buffer for storing inverse |
| * transform 1st stage output |
| * |
| * @param[in] pu1_pred |
| * Prediction 16x16 block |
| * |
| * @param[in] pi2_dequant_coeff |
| * Dequant Coeffs |
| * |
| * @param[out] pu1_dst |
| * Output 16x16 block |
| * |
| * @param[in] qp_div |
| * Quantization parameter / 6 |
| * |
| * @param[in] qp_rem |
| * Quantization parameter % 6 |
| * |
| * @param[in] src_strd |
| * Input stride |
| * |
| * @param[in] pred_strd |
| * Prediction stride |
| * |
| * @param[in] dst_strd |
| * Output Stride |
| * |
| * @param[in] zero_cols |
| * Zero columns in pi2_src |
| * |
| * @returns Void |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src, |
| WORD16 *pi2_tmp, |
| UWORD8 *pu1_pred, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 pred_strd, |
| WORD32 dst_strd, |
| WORD32 zero_cols, |
| WORD32 zero_rows) |
| { |
| __m128i m_temp_reg_0; |
| __m128i m_temp_reg_1; |
| __m128i m_temp_reg_10; |
| __m128i m_temp_reg_11; |
| __m128i m_temp_reg_12; |
| __m128i m_temp_reg_13; |
| __m128i m_temp_reg_14; |
| __m128i m_temp_reg_20; |
| __m128i m_temp_reg_21; |
| __m128i m_temp_reg_22; |
| __m128i m_temp_reg_23; |
| __m128i m_temp_reg_24; |
| __m128i m_temp_reg_25; |
| __m128i m_temp_reg_26; |
| __m128i m_temp_reg_27; |
| __m128i m_temp_reg_30; |
| __m128i m_temp_reg_31; |
| __m128i m_temp_reg_32; |
| __m128i m_temp_reg_33; |
| __m128i m_temp_reg_34; |
| __m128i m_temp_reg_35; |
| __m128i m_temp_reg_36; |
| __m128i m_temp_reg_37; |
| __m128i m_temp_reg_40; |
| __m128i m_temp_reg_41; |
| __m128i m_temp_reg_42; |
| __m128i m_temp_reg_43; |
| __m128i m_temp_reg_44; |
| __m128i m_temp_reg_45; |
| __m128i m_temp_reg_46; |
| __m128i m_temp_reg_47; |
| |
| __m128i m_temp_reg_70; |
| __m128i m_temp_reg_71; |
| __m128i m_temp_reg_72; |
| __m128i m_temp_reg_73; |
| __m128i m_temp_reg_74; |
| __m128i m_temp_reg_75; |
| __m128i m_temp_reg_76; |
| __m128i m_temp_reg_77; |
| __m128i m_rdng_factor; |
| __m128i m_count; |
| __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; |
| __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; |
| |
| WORD32 i; |
| |
| WORD32 zero_last8_cols_stg1; |
| WORD32 zero_last8_rows_stg1; |
| WORD32 zero_last12_rows_stg1; |
| WORD32 zero_last12_rows_stg2; |
| WORD32 zero_last8_rows_stg2; |
| |
| WORD32 loop = 0; |
| |
| WORD32 i4_shift = IT_SHIFT_STAGE_1; |
| WORD32 trans_size = TRANS_SIZE_16; |
| |
| /* Following 3 instructions replicates the value in the */ |
| /* lower 16 bits of m_add_iq in the entire register */ |
| |
| /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ |
| |
| zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0; |
| zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0; |
| zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0; |
| |
| zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0; |
| zero_last8_rows_stg2 = zero_last8_cols_stg1; |
| |
| if(zero_last8_cols_stg1) |
| { |
| loop = 1; |
| } |
| else |
| loop = 2; |
| |
| /* i = 0 => lower 8 samples */ |
| /* i = 1 => higher 8 samples */ |
| for(i = 0; i < loop; i++) |
| { |
| { |
| WORD32 sample_half_index = i << 3; |
| WORD16 *pi2_tmp_src = pi2_src + sample_half_index; |
| WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| |
| |
| |
| |
| /* If last 12 rows are zero : Rishab */ |
| if(zero_last12_rows_stg1) |
| { |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| |
| m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 |
| |
| m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); |
| |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); |
| |
| m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| |
| m_temp_reg_26 = m_temp_reg_24; |
| m_temp_reg_27 = m_temp_reg_25; |
| } |
| |
| /* eo */ |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| |
| /* e[0][0-3] stored in pi2_tmp[0][0-7] */ |
| /* e[7][0-3] stored in pi2_tmp[0][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| |
| /* e[0][4-7] stored in pi2_tmp[1][0-7] */ |
| /* e[7][4-7] stored in pi2_tmp[1][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| |
| /* e[1][0-3] stored in pi2_tmp[2][0-7] */ |
| /* e[6][0-3] stored in pi2_tmp[2][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| |
| /* e[1][4-7] stored in pi2_tmp[3][0-7] */ |
| /* e[6][4-7] stored in pi2_tmp[3][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); |
| |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| } |
| /* If last 8 rows are zero : Rishab */ |
| else if(zero_last8_rows_stg1) |
| { |
| /* eeo */ |
| /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| { |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); |
| |
| } |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 |
| |
| m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); |
| |
| //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); |
| |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); |
| |
| m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); |
| |
| m_temp_reg_26 = m_temp_reg_24; |
| m_temp_reg_27 = m_temp_reg_25; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 |
| } |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); |
| m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); |
| |
| /* e[0][0-3] stored in pi2_tmp[0][0-7] */ |
| /* e[7][0-3] stored in pi2_tmp[0][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); |
| m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); |
| |
| /* e[0][4-7] stored in pi2_tmp[1][0-7] */ |
| /* e[7][4-7] stored in pi2_tmp[1][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 |
| |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); |
| m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); |
| |
| /* e[1][0-3] stored in pi2_tmp[2][0-7] */ |
| /* e[6][0-3] stored in pi2_tmp[2][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); |
| m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); |
| |
| /* e[1][4-7] stored in pi2_tmp[3][0-7] */ |
| /* e[6][4-7] stored in pi2_tmp[3][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| } /* If all the rows are non-zero : Rishab */ |
| else |
| { |
| /* eeo */ |
| /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| |
| { |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); |
| } |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's |
| |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); |
| |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 |
| |
| } |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); |
| m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); |
| |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); |
| m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); |
| |
| /* e[0][0-3] stored in pi2_tmp[0][0-7] */ |
| /* e[7][0-3] stored in pi2_tmp[0][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| |
| } |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); |
| m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); |
| |
| /* e[0][4-7] stored in pi2_tmp[1][0-7] */ |
| /* e[7][4-7] stored in pi2_tmp[1][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 |
| |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); |
| m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); |
| |
| /* e[1][0-3] stored in pi2_tmp[2][0-7] */ |
| /* e[6][0-3] stored in pi2_tmp[2][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); |
| m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); |
| |
| /* e[1][4-7] stored in pi2_tmp[3][0-7] */ |
| /* e[6][4-7] stored in pi2_tmp[3][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += 8; |
| } |
| |
| } |
| } |
| |
| { |
| WORD32 sample_half_index = i << 3; |
| WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd; |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); |
| pi2_tmp_src += (src_strd << 1); |
| } |
| |
| /* o & stage 1 out */ |
| { |
| WORD32 j; |
| WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; |
| WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; |
| WORD32 out_stride = (trans_size << 1); |
| WORD32 in_stride = trans_size << 1; |
| |
| if(zero_last12_rows_stg1) |
| { |
| for(j = 0; j < 2; j++) |
| { |
| if(j) //H8B= higher 8 bytes L8B lower 8 bytes |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 |
| |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| } |
| } |
| else if(zero_last8_rows_stg1) |
| { |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| |
| m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 |
| |
| m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| } |
| |
| } |
| else |
| { |
| |
| |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) //H8B= higher 8 bytes L8B lower 8 bytes |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B |
| m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B |
| m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B |
| m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 |
| |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); |
| m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); |
| m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 |
| |
| m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); |
| m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 |
| m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 |
| m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 |
| |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch -= out_stride; |
| } |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += 8; |
| } |
| } |
| } |
| } |
| |
| /* Transpose */ |
| { |
| WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; |
| WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp); |
| WORD32 out_stride = (trans_size << 1); |
| WORD32 in_stride = (trans_size << 1); |
| WORD32 j; |
| |
| for(j = 0; j < 2; j++) |
| { |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a |
| pi2_src_scratch += in_stride; |
| m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c |
| pi2_src_scratch += in_stride; |
| m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e |
| pi2_src_scratch += in_stride; |
| m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g |
| pi2_src_scratch += 8; |
| m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i |
| pi2_src_scratch -= in_stride; |
| m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k |
| pi2_src_scratch -= in_stride; |
| m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m |
| pi2_src_scratch -= in_stride; |
| m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 |
| m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 |
| |
| m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 |
| m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 |
| |
| m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 |
| m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 |
| |
| m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 |
| m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 |
| |
| |
| m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 |
| m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 |
| |
| m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 |
| m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 |
| |
| m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 |
| m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 |
| |
| m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 |
| m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 |
| |
| |
| m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 |
| m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 |
| |
| m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 |
| m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 |
| |
| m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 |
| m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1 |
| |
| m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2 |
| m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3 |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45); |
| pi2_dst_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47); |
| pi2_dst_scratch += 8; |
| } |
| } |
| } |
| |
| if(zero_last8_cols_stg1) |
| { |
| WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size); |
| WORD32 out_stride = (trans_size << 1); |
| WORD32 j; |
| |
| m_temp_reg_40 = _mm_setzero_si128(); |
| for(j = 0; j < 2; j++) |
| { |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += 8; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch -= out_stride; |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); |
| pi2_dst_scratch += 8; |
| } |
| } |
| |
| |
| |
| |
| /* Stage 2 */ |
| for(i = 0; i < 2; i++) |
| { |
| //__m128i m_temp_reg_15,m_temp_reg_16; |
| WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp); |
| WORD32 stride = (trans_size); |
| WORD16 temp_array[256]; |
| |
| i4_shift = IT_SHIFT_STAGE_2; |
| |
| if(zero_last12_rows_stg2) |
| { |
| /* eeo */ |
| /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| { |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 |
| |
| pi2_src_temp += (stride * 9); |
| |
| if(!i) |
| { |
| pi2_src_temp += (stride * 6 + 8); |
| } |
| else |
| { |
| pi2_src_temp += (stride * 2 + 8); |
| } |
| |
| pi2_src_temp -= (stride * 9); |
| |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 |
| |
| m_temp_reg_20 = _mm_setzero_si128(); |
| m_temp_reg_22 = _mm_setzero_si128(); |
| |
| m_temp_reg_21 = _mm_setzero_si128(); |
| m_temp_reg_23 = _mm_setzero_si128(); |
| } |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| |
| /* Loading coeff and src for use in next block */ |
| m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 |
| |
| m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); |
| |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); |
| m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 |
| |
| m_temp_reg_26 = m_temp_reg_24; |
| m_temp_reg_27 = m_temp_reg_25; |
| /* */ |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20); |
| m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20); |
| } |
| |
| /* eo */ |
| { |
| WORD16 *pi2_scratch = temp_array; |
| WORD32 out_stride = 8; |
| |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| |
| /* e[0][0-3] stored in pu1_dst[0] */ |
| /* e[7][0-3] stored in pu1_dst[1] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| |
| /* e[0][4-7] stored in pu1_dst[2] */ |
| /* e[7][4-7] stored in pu1_dst[3] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 |
| |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| |
| /* e[1][0-3] stored in pu1_dst[4] */ |
| /* e[6][0-3] stored in pu1_dst[5] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| |
| /* e[1][4-7] stored in pu1_dst[6]*/ |
| /* e[6][4-7] stored in pu1_dst[7] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* e[2][0-3] stored in pu1_dst[8]*/ |
| /* e[5][0-3] stored in pu1_dst[9] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| /* e[2][4-7] stored in pu1_dst[10]*/ |
| /* e[5][4-7] stored in pu1_dst[11] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* e[3][0-3] stored in pu1_dst[12]*/ |
| /* e[4][0-3] stored in pu1_dst[13] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| /* e[3][4-7] stored in pu1_dst[14]*/ |
| /* e[4][4-7] stored in pu1_dst[15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| } |
| } |
| else if(zero_last8_rows_stg2) |
| { |
| /* eeo */ |
| /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| { |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36 |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 |
| pi2_src_temp += (stride); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 |
| pi2_src_temp += (stride * 8); |
| |
| if(!i) |
| { |
| pi2_src_temp += (stride * 6 + 8); |
| } |
| else |
| { |
| pi2_src_temp += (stride * 2 + 8); |
| } |
| |
| pi2_src_temp -= (stride * 8); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 |
| pi2_src_temp -= (stride); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 |
| |
| |
| m_temp_reg_76 = _mm_setzero_si128(); |
| |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); |
| } |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| |
| |
| m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70); |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 |
| |
| m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); |
| m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| |
| m_temp_reg_26 = m_temp_reg_24; |
| m_temp_reg_27 = m_temp_reg_25; |
| |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); |
| } |
| |
| /* eo */ |
| { |
| WORD16 *pi2_scratch = temp_array; |
| WORD32 out_stride = 8; |
| |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); |
| m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); |
| |
| /* e[0][0-3] stored in pu1_dst[0] */ |
| /* e[7][0-3] stored in pu1_dst[1] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo0[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); |
| m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); |
| |
| /* e[0][4-7] stored in pu1_dst[2] */ |
| /* e[7][4-7] stored in pu1_dst[3] */ |
| |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 |
| |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); |
| m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); |
| |
| /* e[1][0-3] stored in pu1_dst[4] */ |
| /* e[6][0-3] stored in pu1_dst[5] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); |
| m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); |
| |
| /* e[1][4-7] stored in pu1_dst[6]*/ |
| /* e[6][4-7] stored in pu1_dst[7] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| /* e[2][0-3] stored in pu1_dst[8]*/ |
| /* e[5][0-3] stored in pu1_dst[9] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); |
| |
| /* e[2][4-7] stored in pu1_dst[10]*/ |
| /* e[5][4-7] stored in pu1_dst[11] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| |
| /* e[3][0-3] stored in pu1_dst[12]*/ |
| /* e[4][0-3] stored in pu1_dst[13] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); |
| |
| /* e[3][4-7] stored in pu1_dst[14]*/ |
| /* e[4][4-7] stored in pu1_dst[15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| } |
| } |
| |
| else |
| { |
| /* eeo */ |
| /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ |
| /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ |
| { |
| |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 |
| pi2_src_temp += (stride); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 |
| pi2_src_temp += (stride * 7); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8 |
| pi2_src_temp += (stride); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12 |
| if(!i) |
| { |
| pi2_src_temp += (stride * 6 + 8); |
| } |
| else |
| { |
| pi2_src_temp += (stride * 2 + 8); |
| } |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14 |
| pi2_src_temp -= (stride); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10 |
| pi2_src_temp -= (stride * 7); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 |
| pi2_src_temp -= (stride); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's |
| |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); |
| m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); |
| |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); |
| m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); |
| |
| |
| } |
| |
| /* eee */ |
| /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ |
| /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ |
| { |
| /* Loading coeff and src for use in next block */ |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 |
| |
| m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's |
| m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's |
| |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); |
| m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); |
| |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); |
| m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 |
| |
| } |
| |
| /* eo */ |
| { |
| WORD16 *pi2_scratch = temp_array; |
| WORD32 out_stride = 8; |
| |
| |
| |
| /* eo0[0-3] */ |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); |
| m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); |
| |
| |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); |
| |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); |
| m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); |
| |
| |
| /* e[0][0-3] stored in pi2_tmp[0][0-7] */ |
| /* e[7][0-3] stored in pi2_tmp[0][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| |
| } |
| |
| /* eo0[4-7] */ |
| { |
| |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); |
| |
| /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ |
| m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); |
| m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); |
| |
| /* e[0][4-7] stored in pi2_tmp[1][0-7] */ |
| /* e[7][4-7] stored in pi2_tmp[1][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 |
| |
| } |
| |
| /* eo1[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); |
| m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); |
| |
| /* e[1][0-3] stored in pi2_tmp[2][0-7] */ |
| /* e[6][0-3] stored in pi2_tmp[2][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); |
| m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| } |
| |
| /* eo1[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ |
| m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); |
| m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); |
| |
| /* e[1][4-7] stored in pi2_tmp[3][0-7] */ |
| /* e[6][4-7] stored in pi2_tmp[3][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); |
| m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 |
| } |
| |
| /* eo2[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); |
| |
| /* e[2][0-3] stored in pi2_tmp[4][0-7] */ |
| /* e[5][0-3] stored in pi2_tmp[4][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo2[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); |
| |
| /* e[2][4-7] stored in pi2_tmp[5][0-7] */ |
| /* e[5][4-7] stored in pi2_tmp[5][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 |
| |
| } |
| |
| /* eo3[0-3] */ |
| { |
| m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); |
| m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); |
| |
| /* e[3][0-3] stored in pi2_tmp[6][0-7] */ |
| /* e[4][0-3] stored in pi2_tmp[6][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); |
| |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| |
| /* eo3[4-7] */ |
| { |
| m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); |
| m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); |
| |
| /* e[3][4-7] stored in pi2_tmp[7][0-7] */ |
| /* e[4][4-7] stored in pi2_tmp[7][8-15] */ |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); |
| m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); |
| m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); |
| |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); |
| pi2_scratch += out_stride; |
| _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); |
| pi2_scratch += out_stride; |
| } |
| } |
| } |
| |
| if(zero_last12_rows_stg2) |
| { |
| /* o & stage 2 pre-transposed out */ |
| { |
| WORD32 j; |
| WORD16 *pi2_src_scratch = temp_array; |
| WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); |
| WORD32 out_stride = (trans_size); |
| WORD32 in_stride = (8) * 4; |
| |
| pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 |
| |
| pi2_src_temp += (stride * 9); |
| |
| if(0 == i) |
| { |
| pi2_src_temp -= (stride * 2 - 8); |
| } |
| else |
| { |
| pi2_src_temp -= (stride * 6 - 8); |
| } |
| pi2_src_temp -= (stride * 9); |
| |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 |
| |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += ((!i) * out_stride + 8); |
| } |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += (i * out_stride + 8); |
| } |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += ((!i) * out_stride + 8); |
| } |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += (i * out_stride + 8); |
| } |
| |
| |
| } |
| } |
| } |
| else if(zero_last8_rows_stg2) |
| { |
| /* o & stage 2 pre-transposed out */ |
| { |
| WORD32 j; |
| WORD16 *pi2_src_scratch = temp_array; |
| WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); |
| WORD32 out_stride = (trans_size); |
| WORD32 in_stride = (8) * 4; |
| |
| pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); |
| |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 |
| pi2_src_temp += (stride); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 |
| pi2_src_temp += (stride * 8); |
| |
| if(0 == i) |
| { |
| pi2_src_temp -= (stride * 2 - 8); |
| } |
| else |
| { |
| pi2_src_temp -= (stride * 6 - 8); |
| } |
| |
| pi2_src_temp -= (stride * 8); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 |
| pi2_src_temp -= (stride); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 |
| |
| |
| for(j = 0; j < 2; j++) |
| { |
| if(j) |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); |
| m_count = _mm_cvtsi32_si128(i4_shift); |
| |
| m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o1[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += ((!i) * out_stride + 8); |
| } |
| |
| /* o2[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o3[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 |
| |
| m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += (i * out_stride + 8); |
| } |
| |
| /* o4[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 |
| |
| m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o5[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += ((!i) * out_stride + 8); |
| } |
| |
| /* o6[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |
| m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch -= in_stride; |
| |
| m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 |
| m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 |
| |
| m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += out_stride; |
| } |
| |
| /* o7[0-3] */ |
| { |
| m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); |
| m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); |
| |
| m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); |
| pi2_src_scratch += 8; |
| |
| m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); |
| m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); |
| |
| m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); |
| m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); |
| m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); |
| m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); |
| |
| m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); |
| |
| _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); |
| pi2_dst_scratch += (i * out_stride + 8); |
| } |
| } |
| } |
| } |
| else |
| { |
| /* o & stage 2 pre-transposed out */ |
| { |
| WORD32 j; |
| WORD16 *pi2_src_scratch = temp_array; |
| WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); |
| WORD32 out_stride = (trans_size); |
| WORD32 in_stride = (8) * 4; |
| |
| pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); |
| |
| |
| m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 |
| pi2_src_temp += (stride); |
| m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 |
| pi2_src_temp += (stride * 7); |
| m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9 |
| pi2_src_temp += (stride); |
| m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13 |
| if(0 == i) |
| { |
| pi2_src_temp -= (stride * 2 - 8); |
| } |
| else |
| { |
| pi2_src_temp -= (stride * 6 - 8); |
| } |
| m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15 |
| pi2_src_temp -= (stride); |
| m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11 |
| pi2_src_temp -= (stride * 7); |
| m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 |
| pi2_src_temp -= (stride); |
| m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 |
| |
| |
| for(j = 0; j < 2; j++) |
| { |
| |
| if(j) //H8B= higher 8 bytes L8B lower 8 bytes |
| { |
| m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B |
| m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B |
| m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B |
| m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B |
| } |
| else |
| { |
| m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B |
| m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B |
| m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B |
| m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B |
| } |
| m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 |
| m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 |
| m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 |
| m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 |
| |
| |
| /* o0[0-3] */ |
| { |
| m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); |