| /****************************************************************************** |
| * |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| |
| /** |
| ******************************************************************************* |
| * @file |
| * ihevc_quant_iquant_ssd_neon_intr.c |
| * |
| * @brief |
| * Contains function definitions for quantization, followed by Inverse |
| * quantization to find transform domain SSD |
| * |
| * @author |
| * 100736 |
| * |
| * @par List of Functions: |
| * - ihevc_quant_iquant_ssd_flat_scale_mat_neon() |
| * - ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon() |
| * |
| * @remarks |
| * |
| * |
| ******************************************************************************* |
| */ |
| /* System include files */ |
| #include <stdio.h> |
| #include <string.h> |
| #include <stdlib.h> |
| |
| /* User include files */ |
| #include "ihevc_typedefs.h" |
| #include "ihevc_macros.h" |
| #include "ihevc_platform_macros.h" |
| #include "ihevc_defs.h" |
| #include "ihevc_debug.h" |
| #include "ihevc_trans_tables.h" |
| #include "ihevc_quant_iquant_ssd.h" |
| #include "ihevc_func_selector.h" |
| #include "ihevc_trans_macros.h" |
| #include "arm_neon.h" |
| |
| /*****************************************************************************/ |
| /* Function Definitions */ |
| /*****************************************************************************/ |
| |
/**
 *******************************************************************************
 *
 * @brief
 *  Quantizes a trans_size x trans_size block of transform coefficients using
 *  the flat scaling matrix, immediately inverse-quantizes the result, and
 *  accumulates the transform-domain SSD between the input coefficients and
 *  the inverse-quantized output.
 *
 * @par Description:
 *  The block is processed in 4x4 sub-blocks with NEON intrinsics. For each
 *  sub-block a coded-sub-block flag (csbf) is set when any quantized value is
 *  non-zero. Coded sub-blocks are inverse-quantized and contribute
 *  sum((src - iq)^2) to the SSD; all-zero sub-blocks store zeros to the
 *  inverse-quant destination and contribute sum(src^2) directly.
 *
 * @param[in] pi2_coeffs
 *  Input transform coefficients, stride src_strd
 *
 * @param[in] pi2_quant_coeff
 *  Quantization matrix pointer; only advanced per row here, never loaded -
 *  the flat scale g_ihevc_quant_scales[qp_rem] is used for all coefficients
 *
 * @param[out] pi2_q_dst
 *  Quantized coefficients, stride dst_q_strd
 *
 * @param[out] pi2_iq_dst
 *  Inverse-quantized coefficients, stride dst_iq_strd
 *
 * @param[in] trans_size
 *  Transform block size; loop structure assumes a multiple of 4
 *
 * @param[in] qp_div
 *  Quantization parameter divided by 6
 *
 * @param[in] qp_rem
 *  Quantization parameter modulo 6 (indexes the scale tables)
 *
 * @param[in] q_add
 *  Quantization rounding term in QUANT_ROUND_FACTOR_Q fixed-point
 *
 * @param[in] pi4_quant_round_factor_0_1
 *  Unused in this variant (see the _var_rnd_fact version)
 *
 * @param[in] pi4_quant_round_factor_1_2
 *  Unused in this variant (see the _var_rnd_fact version)
 *
 * @param[out] csbf
 *  Coded sub-block flags, one byte per 4x4 sub-block, stride csbf_strd
 *
 * @param[out] zero_col
 *  Bit mask, stored inverted: a cleared bit marks a column group containing
 *  at least one non-zero quantized coefficient
 *
 * @param[out] zero_row
 *  Bit mask, stored inverted, analogous for rows
 *
 * @param[in] pi2_dequant_coeff
 *  Unused in this variant
 *
 * @param[out] pi8_cost
 *  Accumulated SSD; accumulation is done in 32-bit lanes, so this assumes
 *  no 32-bit overflow for the supported block sizes - TODO confirm
 *
 * @returns cbf - non-zero if any sub-block in the transform block is coded
 *
 *******************************************************************************
 */
WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div,
    WORD32 qp_rem,
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    /* qm: Q-format shift of the scaling matrix; bd: presumably bit depth (8)
     * - both feed the q_bits/shift derivations below */
    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr, temp;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x4_t zero = vdup_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // q-iq
    int16x4_t q0, q1, q2, q3;
    int16x4_t iq0, iq1, iq2, iq3;
    // residue
    int32x4_t r0, r1, r2, r3;
    // sign
    uint16x4_t psgn0, psgn1, psgn2, psgn3;
    uint16x4_t nsgn0, nsgn1, nsgn2, nsgn3;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t qtmp_0, qtmp_1, qtmp_2, qtmp_3;
    int16x4_t pq0, pq1, pq2, pq3;
    int16x4_t nq0, nq1, nq2, nq3;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;

    int32x4_t add_q;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;

    /* Unused in the flat-scale variant; silenced explicitly */
    (void)pi4_quant_round_factor_0_1;
    (void)pi4_quant_round_factor_1_2;
    (void)pi2_dequant_coeff;

    /* log2_size = log2(trans_size): GETRANGE returns the bit range, hence -1 */
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    /* Derive the total quantization downshift and the pre-shifted rounding
     * term (q_add is given in QUANT_ROUND_FACTOR_Q fixed-point) */
    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;
    temp = (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));

    /* Negative shift count: vshlq_s32 with a negative value performs the
     * right shift by q_bits */
    q_v_bits = vdupq_n_s32(-q_bits);
    add_q = vdupq_n_s32(temp);

    /* Inverse-quant shift and its rounding term (1 << (sh - qp_div - 1)) */
    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    /* Iterate over 4x4 sub-blocks: i walks rows, j walks columns */
    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            /* Load one 4x4 tile of source coefficients (one d-register per row) */
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign: all-ones lane masks for non-negative / negative inputs */
            psgn0 = vcge_s16(s0, zero);
            psgn1 = vcge_s16(s1, zero);
            psgn2 = vcge_s16(s2, zero);
            psgn3 = vcge_s16(s3, zero);

            nsgn0 = vclt_s16(s0, zero);
            nsgn1 = vclt_s16(s1, zero);
            nsgn2 = vclt_s16(s2, zero);
            nsgn3 = vclt_s16(s3, zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            qtmp_0 = vmull_s16(abs_s0, sq);
            qtmp_1 = vmull_s16(abs_s1, sq);
            qtmp_2 = vmull_s16(abs_s2, sq);
            qtmp_3 = vmull_s16(abs_s3, sq);

            /* tmp += (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q)) */
            qtmp_0 = vaddq_s32(qtmp_0, add_q);
            qtmp_1 = vaddq_s32(qtmp_1, add_q);
            qtmp_2 = vaddq_s32(qtmp_2, add_q);
            qtmp_3 = vaddq_s32(qtmp_3, add_q);

            /* tmp >>= q_bits; */
            qtmp_0 = vshlq_s32(qtmp_0, q_v_bits);
            qtmp_1 = vshlq_s32(qtmp_1, q_v_bits);
            qtmp_2 = vshlq_s32(qtmp_2, q_v_bits);
            qtmp_3 = vshlq_s32(qtmp_3, q_v_bits);

            /* clip: saturating narrow 32 -> 16 bit */
            q0 = vqmovn_s32(qtmp_0);
            q1 = vqmovn_s32(qtmp_1);
            q2 = vqmovn_s32(qtmp_2);
            q3 = vqmovn_s32(qtmp_3);

            /* restore sign: keep magnitude where positive, negate (via
             * pq - nq) where the source was negative */
            pq0 = vand_s16(q0, vreinterpret_s16_u16(psgn0));
            pq1 = vand_s16(q1, vreinterpret_s16_u16(psgn1));
            pq2 = vand_s16(q2, vreinterpret_s16_u16(psgn2));
            pq3 = vand_s16(q3, vreinterpret_s16_u16(psgn3));

            nq0 = vand_s16(q0, vreinterpret_s16_u16(nsgn0));
            nq1 = vand_s16(q1, vreinterpret_s16_u16(nsgn1));
            nq2 = vand_s16(q2, vreinterpret_s16_u16(nsgn2));
            nq3 = vand_s16(q3, vreinterpret_s16_u16(nsgn3));

            q0 = vsub_s16(pq0, nq0);
            q1 = vsub_s16(pq1, nq1);
            q2 = vsub_s16(pq2, nq2);
            q3 = vsub_s16(pq3, nq3);

            /* store */
            vst1_s16((pi2_q_dst + j), q0);
            vst1_s16((pi2_q_dst + j + dst_q_strd), q1);
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), q2);
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), q3);

            /* csbf = 1 iff any of the 16 quantized values is non-zero;
             * each 64-bit view covers one row of four 16-bit lanes */
            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(q0), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q1), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q2), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(q3), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                /* mark the 4 columns / 4 rows of this sub-block as non-zero */
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(q0, siq);
                iqtmp_1 = vmull_s16(q1, siq);
                iqtmp_2 = vmull_s16(q2, siq);
                iqtmp_3 = vmull_s16(q3, siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* Sub-block quantized to all zeros: store zeros and use the
                 * source energy itself as the distortion */
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        /* Advance to the next row of 4x4 sub-blocks */
        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
    }

    /* SSD Computation: horizontal reduction of the four 32-bit lanes */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    /* Masks are stored inverted: a cleared bit means "non-zero present" */
    *zero_col = ~temp_zero_col; //final zero_col storing
    *zero_row = ~temp_zero_row; //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}
| |
/**
 *******************************************************************************
 *
 * @brief
 *  Variant of ihevc_quant_iquant_ssd_flat_scale_mat_neon that selects the
 *  quantization rounding factor per coefficient, based on where the
 *  truncated (zero-rounding) quantized magnitude falls.
 *
 * @par Description:
 *  For every 4x4 sub-block each coefficient is first quantized with no
 *  rounding term. Lanes whose truncated magnitude is >= 2 keep the result of
 *  quantizing with the standard half rounding term. When any lane's
 *  truncated magnitude is < 2, the tile is additionally re-quantized with
 *  the per-coefficient rounding factors: lanes with truncated magnitude < 1
 *  take the pi4_quant_round_factor_0_1 result, lanes in [1, 2) take the
 *  pi4_quant_round_factor_1_2 result (selected with vbslq_s16 masks).
 *  Inverse quantization, csbf derivation, zero_col/zero_row masks and SSD
 *  accumulation then proceed exactly as in the flat-scale function.
 *
 * @param[in] pi2_coeffs
 *  Input transform coefficients, stride src_strd
 *
 * @param[in] pi2_quant_coeff
 *  Quantization matrix pointer; only advanced per row here, never loaded -
 *  the flat scale g_ihevc_quant_scales[qp_rem] is used for all coefficients
 *
 * @param[out] pi2_q_dst
 *  Quantized coefficients, stride dst_q_strd
 *
 * @param[out] pi2_iq_dst
 *  Inverse-quantized coefficients, stride dst_iq_strd
 *
 * @param[in] trans_size
 *  Transform block size; loop structure assumes a multiple of 4
 *
 * @param[in] qp_div
 *  Quantization parameter divided by 6
 *
 * @param[in] qp_rem
 *  Quantization parameter modulo 6 (indexes the scale tables)
 *
 * @param[in] q_add
 *  Unused in this variant (rounding comes from the factor arrays)
 *
 * @param[in] pi4_quant_round_factor_0_1
 *  Per-coefficient rounding factors (QUANT_ROUND_FACTOR_Q fixed-point,
 *  row stride trans_size) applied when the truncated magnitude is < 1
 *
 * @param[in] pi4_quant_round_factor_1_2
 *  Per-coefficient rounding factors applied when the truncated magnitude
 *  is in [1, 2)
 *
 * @param[out] csbf
 *  Coded sub-block flags, one byte per 4x4 sub-block, stride csbf_strd
 *
 * @param[out] zero_col
 *  Bit mask, stored inverted: a cleared bit marks a column group containing
 *  at least one non-zero quantized coefficient
 *
 * @param[out] zero_row
 *  Bit mask, stored inverted, analogous for rows
 *
 * @param[in] pi2_dequant_coeff
 *  Unused in this variant
 *
 * @param[out] pi8_cost
 *  Accumulated SSD; accumulation is done in 32-bit lanes, so this assumes
 *  no 32-bit overflow for the supported block sizes - TODO confirm
 *
 * @returns cbf - non-zero if any sub-block in the transform block is coded
 *
 *******************************************************************************
 */
WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact_neon(
    WORD16 *pi2_coeffs,
    WORD16 *pi2_quant_coeff,
    WORD16 *pi2_q_dst,
    WORD16 *pi2_iq_dst,
    WORD32 trans_size,
    WORD32 qp_div, /* qpscaled / 6 */
    WORD32 qp_rem, /* qpscaled % 6 */
    WORD32 q_add,
    WORD32 *pi4_quant_round_factor_0_1,
    WORD32 *pi4_quant_round_factor_1_2,
    WORD32 src_strd,
    WORD32 dst_q_strd,
    WORD32 dst_iq_strd,
    UWORD8 *csbf,
    WORD32 csbf_strd,
    WORD32 *zero_col,
    WORD32 *zero_row,
    WORD16 *pi2_dequant_coeff,
    LWORD64 *pi8_cost)
{
    WORD32 i, j;
    WORD32 log2_size;
    WORD32 cbf = 0;

    /* qm: Q-format shift of the scaling matrix; bd: presumably bit depth (8) */
    WORD16 qm = 4;
    WORD16 bd = 8;
    WORD32 q_bits, tr;
    WORD32 block_col = 0;
    WORD32 block_row = 0;
    WORD32 temp_zero_col = 0;
    WORD32 temp_zero_row = 0;

    WORD32 sh;
    WORD32 s_iq;
    WORD32 sh_tmp;

    // ssd
    int32x4_t ssd0 = vdupq_n_s32(0);
    int32x2_t ssd1;
    WORD32 ssd;
    // const
    const int16x8_t zero = vdupq_n_s16(0);
    const int16x4_t zero_d = vdup_n_s16(0);
    const int16x8_t one = vdupq_n_s16(1);
    const int16x8_t two = vdupq_n_s16(2);
    const int16x4_t sq = vdup_n_s16(g_ihevc_quant_scales[qp_rem]);
    const int16x4_t siq = vdup_n_s16((g_ihevc_iquant_scales_flat_scale[qp_rem]));
    // src
    int16x4_t s0, s1, s2, s3;
    // sign
    uint16x8_t psgn0, psgn1;
    uint16x8_t nsgn0, nsgn1;
    int16x8_t pq0, pq1;
    int16x8_t nq0, nq1;
    // abs(src)
    int16x4_t abs_s0, abs_s1, abs_s2, abs_s3;
    // q-temp
    int32x4_t mul_0, mul_1, mul_2, mul_3;
    int32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_00, q_01;
    int16x8_t q_10, q_11;
    int16x8_t q_20, q_21;
    int16x8_t q_30, q_31;
    // cmp
    uint16x8_t cmp_00, cmp_01;
    uint16x8_t cmp_10, cmp_11;
    uint16x8_t cmp_20, cmp_21;
    // iq-temp
    int32x4_t iqtmp_0, iqtmp_1, iqtmp_2, iqtmp_3;
    int16x4_t iq0, iq1, iq2, iq3;
    //residue
    int32x4_t r0, r1, r2, r3;
    // add_q
    int32x4_t add_q;
    int32x4_t add_q0, add_q1, add_q2, add_q3;
    int32x4_t add_iq = vdupq_n_s32(1);
    int32x4_t sh_iq_1;
    int32x4_t sh_iq;
    int32x4_t q_v_bits;
    int32x4_t stmp;

    /* Unused in the variable-rounding-factor variant; silenced explicitly */
    (void)q_add;
    (void)pi2_dequant_coeff;
    /* log2_size = log2(trans_size): GETRANGE returns the bit range, hence -1 */
    GETRANGE(log2_size, trans_size);
    log2_size -= 1;

    /* Total quantization downshift, as in the flat-scale variant */
    tr = MAX_TR_DYNAMIC_RANGE - bd - log2_size;
    q_bits = QUANT_SHIFT + qp_div + tr + SCALING_Q_SHIFT - qm - FLAT_RESCALE_MAT_Q_SHIFT;

    /* stmp rescales QUANT_ROUND_FACTOR_Q fixed-point rounding terms up to
     * the q_bits domain; reused later for the per-coefficient factors */
    stmp = vdupq_n_s32(q_bits - QUANT_ROUND_FACTOR_Q);

    /* Standard half rounding term: ((1 << QUANT_ROUND_FACTOR_Q) / 2) << stmp */
    add_q = vdupq_n_s32((1 << QUANT_ROUND_FACTOR_Q) / 2);
    add_q = vshlq_s32(add_q, stmp);

    /* Negative shift count: vshlq_s32 performs the right shift by q_bits */
    q_v_bits = vdupq_n_s32(-q_bits);

    /* Inverse-quant shift and its rounding term (1 << (sh - qp_div - 1)) */
    sh = bd + log2_size - 5;

    sh_tmp = (sh - qp_div - 1);
    sh_iq_1 = vdupq_n_s32(sh_tmp);
    add_iq = vshlq_s32(add_iq, sh_iq_1);

    s_iq = (-(sh - qp_div));
    sh_iq = vdupq_n_s32(s_iq);

    /* Iterate over 4x4 sub-blocks: i walks rows, j walks columns */
    for(i = 0; i < trans_size; i += 4)
    {
        for(j = 0; j < trans_size; j += 4)
        {
            /* Load one 4x4 tile of source coefficients */
            s0 = vld1_s16(pi2_coeffs + j);
            s1 = vld1_s16(pi2_coeffs + j + (src_strd));
            s2 = vld1_s16(pi2_coeffs + j + (2 * src_strd));
            s3 = vld1_s16(pi2_coeffs + j + (3 * src_strd));

            /* quantization */
            /* sign: lane masks over 8 lanes (rows packed in pairs) */
            psgn0 = vcgeq_s16(vcombine_s16(s0, s1), zero);
            psgn1 = vcgeq_s16(vcombine_s16(s2, s3), zero);

            nsgn0 = vcltq_s16(vcombine_s16(s0, s1), zero);
            nsgn1 = vcltq_s16(vcombine_s16(s2, s3), zero);

            /* |src| */
            abs_s0 = vabs_s16(s0);
            abs_s1 = vabs_s16(s1);
            abs_s2 = vabs_s16(s2);
            abs_s3 = vabs_s16(s3);

            /* tmp = tmp * quant_coeff */
            mul_0 = vmull_s16(abs_s0, sq);
            mul_1 = vmull_s16(abs_s1, sq);
            mul_2 = vmull_s16(abs_s2, sq);
            mul_3 = vmull_s16(abs_s3, sq);

            /* qadd = 0 */
            /* tmp >>= q_bits; */
            q_tmp0 = vshlq_s32(mul_0, q_v_bits);
            q_tmp1 = vshlq_s32(mul_1, q_v_bits);
            q_tmp2 = vshlq_s32(mul_2, q_v_bits);
            q_tmp3 = vshlq_s32(mul_3, q_v_bits);

            /* clip: q_00/q_01 hold the truncated (zero-rounding) magnitudes */
            q_00 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_01 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

            /* compare qtmp_10, qtmp_20 with 2: lanes with truncated
             * magnitude < 2 need one of the special rounding factors */
            cmp_00 = vcltq_s16(q_00, two);
            cmp_01 = vcltq_s16(q_01, two);

            /* qadd = (1 << QUANT_ROUND_FACTOR_Q)/2) */
            /* tmp >>= q_bits; */
            q_tmp0 = vaddq_s32(mul_0, add_q);
            q_tmp1 = vaddq_s32(mul_1, add_q);
            q_tmp2 = vaddq_s32(mul_2, add_q);
            q_tmp3 = vaddq_s32(mul_3, add_q);

            q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
            q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
            q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
            q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

            /* clip: q_10/q_11 hold the default (half-rounded) results */
            q_10 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
            q_11 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

            /* Re-quantize with the per-coefficient factors only when some
             * lane has truncated magnitude < 2 */
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_00)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp_01)), 0) ||
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp_01)), 0))
            {
                /* qadd = *pi4_quant_round_factor_1_2 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_1_2 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_1_2 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip: q_20/q_21 = results with the [1,2) rounding factor */
                q_20 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_21 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* qadd = *pi4_quant_round_factor_0_1 */
                /* tmp >>= q_bits; */
                add_q0 = vld1q_s32(pi4_quant_round_factor_0_1 + j);
                add_q1 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (trans_size));
                add_q2 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (2 * trans_size));
                add_q3 = vld1q_s32(pi4_quant_round_factor_0_1 + j + (3 * trans_size));

                add_q0 = vshlq_s32(add_q0, stmp);
                add_q1 = vshlq_s32(add_q1, stmp);
                add_q2 = vshlq_s32(add_q2, stmp);
                add_q3 = vshlq_s32(add_q3, stmp);

                q_tmp0 = vaddq_s32(mul_0, add_q0);
                q_tmp1 = vaddq_s32(mul_1, add_q1);
                q_tmp2 = vaddq_s32(mul_2, add_q2);
                q_tmp3 = vaddq_s32(mul_3, add_q3);

                q_tmp0 = vshlq_s32(q_tmp0, q_v_bits);
                q_tmp1 = vshlq_s32(q_tmp1, q_v_bits);
                q_tmp2 = vshlq_s32(q_tmp2, q_v_bits);
                q_tmp3 = vshlq_s32(q_tmp3, q_v_bits);

                /* clip: q_30/q_31 = results with the [0,1) rounding factor */
                q_30 = vcombine_s16(vqmovn_s32(q_tmp0), vqmovn_s32(q_tmp1));
                q_31 = vcombine_s16(vqmovn_s32(q_tmp2), vqmovn_s32(q_tmp3));

                /* compare qtmp_10, qtmp_20 with 1*/
                cmp_10 = vcltq_s16(q_00, one);
                cmp_11 = vcltq_s16(q_01, one);

                /* cmp_2x: truncated magnitude in [1, 2) (cmp_0x and !cmp_1x) */
                cmp_20 = vbicq_u16(cmp_00, cmp_10);
                cmp_21 = vbicq_u16(cmp_01, cmp_11);

                /* Lane select: < 1 takes the 0_1 result, then [1,2) takes
                 * the 1_2 result; lanes >= 2 keep the half-rounded default */
                q_10 = vbslq_s16(cmp_10, q_30, q_10);
                q_11 = vbslq_s16(cmp_11, q_31, q_11);

                q_10 = vbslq_s16(cmp_20, q_20, q_10);
                q_11 = vbslq_s16(cmp_21, q_21, q_11);
            }

            /* restore sign: keep magnitude where positive, negate (via
             * pq - nq) where the source was negative */
            pq0 = vandq_s16(q_10, vreinterpretq_s16_u16(psgn0));
            pq1 = vandq_s16(q_11, vreinterpretq_s16_u16(psgn1));

            nq0 = vandq_s16(q_10, vreinterpretq_s16_u16(nsgn0));
            nq1 = vandq_s16(q_11, vreinterpretq_s16_u16(nsgn1));

            q_10 = vsubq_s16(pq0, nq0);
            q_11 = vsubq_s16(pq1, nq1);

            /* store */
            vst1_s16((pi2_q_dst + j), vget_low_s16(q_10));
            vst1_s16((pi2_q_dst + j + dst_q_strd), vget_high_s16(q_10));
            vst1_s16((pi2_q_dst + j + (2 * dst_q_strd)), vget_low_s16(q_11));
            vst1_s16((pi2_q_dst + j + (3 * dst_q_strd)), vget_high_s16(q_11));

            /* csbf = 1 iff any of the 16 quantized values is non-zero */
            *(csbf + block_col) = 0;
            if(vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_10)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_low_s16(q_11)), 0) ||
               vget_lane_s64(vreinterpret_s64_s16(vget_high_s16(q_11)), 0))
            {
                *(csbf + block_col) = 1;
            }

            if(*(csbf + block_col) == 1)
            {
                /* mark the 4 columns / 4 rows of this sub-block as non-zero */
                temp_zero_col |= (0xF << block_col * 4);
                temp_zero_row |= (0xF << block_row);

                /* inverse quantization */
                iqtmp_0 = vmull_s16(vget_low_s16(q_10), siq);
                iqtmp_1 = vmull_s16(vget_high_s16(q_10), siq);
                iqtmp_2 = vmull_s16(vget_low_s16(q_11), siq);
                iqtmp_3 = vmull_s16(vget_high_s16(q_11), siq);

                iqtmp_0 = vaddq_s32(iqtmp_0, add_iq);
                iqtmp_1 = vaddq_s32(iqtmp_1, add_iq);
                iqtmp_2 = vaddq_s32(iqtmp_2, add_iq);
                iqtmp_3 = vaddq_s32(iqtmp_3, add_iq);

                iqtmp_0 = vshlq_s32(iqtmp_0, sh_iq);
                iqtmp_1 = vshlq_s32(iqtmp_1, sh_iq);
                iqtmp_2 = vshlq_s32(iqtmp_2, sh_iq);
                iqtmp_3 = vshlq_s32(iqtmp_3, sh_iq);

                /* clip */
                iq0 = vqmovn_s32(iqtmp_0);
                iq1 = vqmovn_s32(iqtmp_1);
                iq2 = vqmovn_s32(iqtmp_2);
                iq3 = vqmovn_s32(iqtmp_3);

                /* store */
                vst1_s16((pi2_iq_dst + j), iq0);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), iq1);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), iq2);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), iq3);

                /* ssd */
                /* trans_coeff - inv.quant */
                r0 = vsubl_s16(s0, iq0);
                r1 = vsubl_s16(s1, iq1);
                r2 = vsubl_s16(s2, iq2);
                r3 = vsubl_s16(s3, iq3);

                /* SD */
                r0 = vmulq_s32(r0, r0);
                r1 = vmulq_s32(r1, r1);
                r2 = vmulq_s32(r2, r2);
                r3 = vmulq_s32(r3, r3);
            }
            else
            {
                /* Sub-block quantized to all zeros: store zeros and use the
                 * source energy itself as the distortion */
                /* store */
                vst1_s16((pi2_iq_dst + j), zero_d);
                vst1_s16((pi2_iq_dst + j + dst_iq_strd), zero_d);
                vst1_s16((pi2_iq_dst + j + (2 * dst_iq_strd)), zero_d);
                vst1_s16((pi2_iq_dst + j + (3 * dst_iq_strd)), zero_d);

                /* SD */
                r0 = vmull_s16(s0, s0);
                r1 = vmull_s16(s1, s1);
                r2 = vmull_s16(s2, s2);
                r3 = vmull_s16(s3, s3);
            }

            /* SSD */
            r0 = vaddq_s32(r0, r1);
            r2 = vaddq_s32(r2, r3);

            r0 = vaddq_s32(r0, r2);

            /* SSD Accumulation */
            ssd0 = vaddq_s32(ssd0, r0);

            cbf = cbf || (*(csbf + block_col));  // cbf update
            block_col++;
        }

        /* Advance to the next row of 4x4 sub-blocks */
        block_col = 0;
        block_row += 4;
        csbf += csbf_strd;

        pi2_coeffs += 4 * src_strd;
        pi2_q_dst += 4 * dst_q_strd;
        pi2_iq_dst += 4 * dst_iq_strd;
        pi2_quant_coeff += 4 * trans_size;
        pi4_quant_round_factor_1_2 += 4 * trans_size;
        pi4_quant_round_factor_0_1 += 4 * trans_size;
    }

    /* SSD Computation: horizontal reduction of the four 32-bit lanes */
    ssd1 = vpadd_s32(vget_low_s32(ssd0), vget_high_s32(ssd0));
    ssd1 = vpadd_s32(ssd1, ssd1);
    ssd = vget_lane_s32(ssd1, 0);

    /* Masks are stored inverted: a cleared bit means "non-zero present" */
    *zero_col = ~temp_zero_col; //final zero_col storing
    *zero_row = ~temp_zero_row; //final zero_row storing

    /* Store the cost */
    *pi8_cost = ssd;

    return cbf;
}