| //****************************************************************************** |
| //* |
| //* Copyright (C) 2015 The Android Open Source Project |
| //* |
| //* Licensed under the Apache License, Version 2.0 (the "License"); |
| //* you may not use this file except in compliance with the License. |
| //* You may obtain a copy of the License at: |
| //* |
| //* http://www.apache.org/licenses/LICENSE-2.0 |
| //* |
| //* Unless required by applicable law or agreed to in writing, software |
| //* distributed under the License is distributed on an "AS IS" BASIS, |
| //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| //* See the License for the specific language governing permissions and |
| //* limitations under the License. |
| //* |
| //***************************************************************************** |
| //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| //*/ |
| ///** |
| // ******************************************************************************* |
| // * @file |
| // * ih264_ihadamard_scaling_av8.s |
| // * |
| // * @brief |
| // * Contains function definitions for inverse hadamard transform on 4x4 DC outputs |
| // * of 16x16 intra-prediction |
| // * |
| // * @author |
| // * Mohit |
| // * |
| // * @par List of Functions: |
| // * - ih264_ihadamard_scaling_4x4_av8() |
| // * |
| // * @remarks |
| // * None |
| // * |
| .include "ih264_neon_macros.s" |
| |
| // ******************************************************************************* |
| // */ |
| // * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients |
| // * of a 16x16 intra prediction macroblock, and then performs scaling. |
| // * prediction buffer |
| // * |
| // * @par Description: |
| // * The DC coefficients pass through a 2-stage inverse hadamard transform. |
| // * The inverse-transformed content is then scaled based on the Qp value. |
| // * |
| // * @param[in] pi2_src |
| // * input 4x4 block of DC coefficients |
| // * |
| // * @param[out] pi2_out |
| // * output 4x4 block |
| // * |
| // * @param[in] pu2_iscal_mat |
| // * pointer to scaling list |
| // * |
| // * @param[in] pu2_weigh_mat |
| // * pointer to weight matrix |
| // * |
| // * @param[in] u4_qp_div_6 |
| // * Floor (qp/6) |
| // * |
| // * @param[in] pi4_tmp |
| // * temporary buffer of size 1*16 |
| // * |
| // * @returns none |
| // * |
| // * @remarks none |
| // * |
| // ******************************************************************************* |
| // */ |
| // * |
| // ******************************************************************************* |
| // */ |
| // void ih264_ihadamard_scaling_4x4(word16* pi2_src, |
| // word16* pi2_out, |
| // const uword16 *pu2_iscal_mat, |
| // const uword16 *pu2_weigh_mat, |
| // uword32 u4_qp_div_6, |
| // word32* pi4_tmp) |
| //**************variables vs registers***************************************** |
| //x0 => *pi2_src |
| //x1 => *pi2_out |
| //x2 => *pu2_iscal_mat |
| //x3 => *pu2_weigh_mat |
| //x4=> u4_qp_div_6 |
| |
| .text |
| .p2align 2 |
| |
| .global ih264_ihadamard_scaling_4x4_av8 |
| ih264_ihadamard_scaling_4x4_av8: |
| |
| // 4x4 inverse Hadamard transform of the DC coefficients, followed by scaling. |
| // Entry registers: |
| // x0 = pi2_src (16 signed halfwords are read) |
| // x1 = pi2_out (16 signed halfwords are written) |
| // x2 = pu2_iscal_mat (only element [0] is read) |
| // x3 = pu2_weigh_mat (only element [0] is read) |
| // w4 = u4_qp_div_6 |
| // x5 (pi4_tmp in the C prototype) is never accessed by this routine. |
| // Every output is sat16((((t * weigh[0] * iscal[0]) << qp_div_6) + 32) >> 6), |
| // where t is the transformed coefficient. |
| // v14/v15 are written below. push_v_regs/pop_v_regs are macros from |
| // ih264_neon_macros.s (not visible here); presumably they preserve the |
| // callee-saved vector registers - confirm against that file. |
| push_v_regs |
| |
| //----------------- load coefficients and build the common scale factor ----------- |
| |
| ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0] // de-interleave: v<j>.h[i] = pi2_src[4*i + j] |
| |
| ld1 {v16.h}[0], [x2] // lane 0 <- pu2_iscal_mat[0] |
| ld1 {v15.h}[0], [x3] // lane 0 <- pu2_weigh_mat[0] |
| umull v15.4s, v15.4h, v16.4h // only lane 0 of this product is used |
| dup v15.4s, v15.s[0] // broadcast pu2_weigh_mat[0]*pu2_iscal_mat[0] to all lanes |
| dup v14.4s, w4 // per-lane shift count for sshl = u4_qp_div_6 |
| |
| //----------------- pass 1: butterflies across the four columns of each row ------- |
| |
| ssubl v7.4s, v0.4h, v3.4h // d03 = c0 - c3 (widened to 32 bits) |
| ssubl v6.4s, v1.4h, v2.4h // d12 = c1 - c2 |
| saddl v4.4s, v0.4h, v3.4h // s03 = c0 + c3 |
| saddl v5.4s, v1.4h, v2.4h // s12 = c1 + c2 |
| |
| add v0.4s, v4.4s, v5.4s // out col 0 = s03 + s12 |
| sub v2.4s, v4.4s, v5.4s // out col 2 = s03 - s12 |
| add v1.4s, v7.4s, v6.4s // out col 1 = d03 + d12 |
| sub v3.4s, v7.4s, v6.4s // out col 3 = d03 - d12 |
| |
| //----------------- 4x4 transpose of the 32-bit intermediates --------------------- |
| |
| trn1 v4.4s, v0.4s, v1.4s |
| trn1 v6.4s, v2.4s, v3.4s |
| trn2 v5.4s, v0.4s, v1.4s |
| trn2 v7.4s, v2.4s, v3.4s |
| |
| trn1 v0.2d, v4.2d, v6.2d // v0 = row 0 |
| trn1 v1.2d, v5.2d, v7.2d // v1 = row 1 |
| trn2 v2.2d, v4.2d, v6.2d // v2 = row 2 |
| trn2 v3.2d, v5.2d, v7.2d // v3 = row 3 |
| |
| //----------------- pass 2: the same butterflies across the four rows ------------- |
| |
| add v4.4s, v0.4s, v3.4s // s03 = r0 + r3 |
| sub v7.4s, v0.4s, v3.4s // d03 = r0 - r3 |
| add v5.4s, v1.4s, v2.4s // s12 = r1 + r2 |
| sub v6.4s, v1.4s, v2.4s // d12 = r1 - r2 |
| |
| add v0.4s, v4.4s, v5.4s // out row 0 = s03 + s12 |
| sub v2.4s, v4.4s, v5.4s // out row 2 = s03 - s12 |
| add v1.4s, v7.4s, v6.4s // out row 1 = d03 + d12 |
| sub v3.4s, v7.4s, v6.4s // out row 3 = d03 - d12 |
| |
| //----------------- scaling: multiply, shift left by qp/6, rounded narrow by 6 ---- |
| |
| mul v0.4s, v0.4s, v15.4s // row 0 * (weigh[0]*iscal[0]) |
| mul v1.4s, v1.4s, v15.4s // row 1 |
| mul v2.4s, v2.4s, v15.4s // row 2 |
| mul v3.4s, v3.4s, v15.4s // row 3 |
| |
| sshl v0.4s, v0.4s, v14.4s // row 0 << u4_qp_div_6 |
| sshl v1.4s, v1.4s, v14.4s // row 1 |
| sshl v2.4s, v2.4s, v14.4s // row 2 |
| sshl v3.4s, v3.4s, v14.4s // row 3 |
| |
| sqrshrn v0.4h, v0.4s, #6 // row 0: (q + 32) >> 6, saturated to 16 bits |
| sqrshrn v1.4h, v1.4s, #6 // row 1 |
| sqrshrn v2.4h, v2.4s, #6 // row 2 |
| sqrshrn v3.4h, v3.4s, #6 // row 3 |
| |
| st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1] // rows 0..3 stored back to back (16 halfwords) |
| |
| pop_v_regs |
| ret |
| |
| |
| // ******************************************************************************* |
| // */ |
| // * @brief This function performs a 2x2 inverse hadamard transform for chroma block |
| // * |
| // * @par Description: |
| // * The DC coefficients pass through a 2-stage inverse hadamard transform. |
| // * The inverse-transformed content is then scaled based on the Qp value. |
| // * The DC blocks of both the U and V planes are processed. |
| // * |
| // * @param[in] pi2_src |
| // * input 1x8 block of coeffs. The first 4 are from U and the next 4 from V |
| // * |
| // * @param[out] pi2_out |
| // * output 1x8 block |
| // * |
| // * @param[in] pu2_iscal_mat |
| // * pointer to scaling list |
| // * |
| // * @param[in] pu2_weigh_mat |
| // * pointer to weight matrix |
| // * |
| // * @param[in] u4_qp_div_6 |
| // * Floor (qp/6) |
| // * |
| // * @returns none |
| // * |
| // * @remarks none |
| // * |
| // ******************************************************************************* |
| // */ |
| // * |
| // ******************************************************************************* |
| // */ |
| // void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src, |
| // WORD16* pi2_out, |
| // const UWORD16 *pu2_iscal_mat, |
| // const UWORD16 *pu2_weigh_mat, |
| // UWORD32 u4_qp_div_6, |
| |
| .global ih264_ihadamard_scaling_2x2_uv_av8 |
| ih264_ihadamard_scaling_2x2_uv_av8: |
| |
| // 2x2 inverse Hadamard transform + scaling of the chroma DC coefficients. |
| // The U block and the V block are handled together in one pass. |
| //Registers used |
| // x0 : *pi2_src - 8 signed halfwords are read |
| // x1 : *pi2_out - exactly 8 signed halfwords (16 bytes) are written |
| // x2 : *pu2_iscal_mat - only element [0] is read |
| // x3 : *pu2_weigh_mat - only element [0] is read |
| // x4 : u4_qp_div_6 - w4 is overwritten with (u4_qp_div_6 - 5) |
| // Every output is (t * iscal[0] * weigh[0]) shifted left by (qp/6 - 5), |
| // truncated to 16 bits; a negative count makes sshl shift right instead. |
| // push_v_regs/pop_v_regs are macros from ih264_neon_macros.s (not visible here). |
| // NOTE(review): no register in v8-v15 is touched below, so the save/restore |
| // looks redundant for this routine - confirm against the macro before removing. |
| push_v_regs |
| ld1 {v26.h}[0], [x2] //lane 0 <- pu2_iscal_mat[0] |
| ld1 {v27.h}[0], [x3] //lane 0 <- pu2_weigh_mat[0] |
| |
| sub w4, w4, #5 //shift count = qp/6 - 5 |
| dup v28.4s, w4 //broadcast the signed shift count for sshl |
| |
| ld2 {v0.4h, v1.4h}, [x0] //load 8 dc coeffs, de-interleaved |
| //src[0],src[2],src[4],src[6] -> v0 (i2_x4,i2_x6,i2_y4,i2_y6) |
| //src[1],src[3],src[5],src[7] -> v1 (i2_x5,i2_x7,i2_y5,i2_y7) |
| |
| saddl v2.4s, v0.4h, v1.4h //i4_x0 = i4_x4 + i4_x5; lanes: x0 x2 y0 y2 |
| ssubl v4.4s, v0.4h, v1.4h //i4_x1 = i4_x4 - i4_x5; lanes: x1 x3 y1 y3 |
| |
| umull v30.4s, v26.4h, v27.4h //pu2_iscal_mat[0]*pu2_weigh_mat[0] in lane 0 |
| dup v30.4s, v30.s[0] //broadcast the scale factor to all lanes |
| |
| trn1 v0.4s, v2.4s, v4.4s //lanes: x0 x1 y0 y1 |
| trn2 v1.4s, v2.4s, v4.4s //lanes: x2 x3 y2 y3 |
| |
| add v2.4s, v0.4s, v1.4s //i4_x4 = i4_x0+i4_x2; lanes: x4 x5 y4 y5 |
| sub v3.4s, v0.4s, v1.4s //i4_x6 = i4_x0-i4_x2; lanes: x6 x7 y6 y7 |
| |
| mul v2.4s, v2.4s, v30.4s //apply iscal[0]*weigh[0] |
| mul v3.4s, v3.4s, v30.4s |
| |
| sshl v2.4s, v2.4s, v28.4s //shift by (qp/6 - 5) |
| sshl v3.4s, v3.4s, v28.4s |
| |
| xtn v0.4h, v2.4s //i4_x4 i4_x5 i4_y4 i4_y5 (upper 64 bits of v0 cleared) |
| xtn v1.4h, v3.4s //i4_x6 i4_x7 i4_y6 i4_y7 (upper 64 bits of v1 cleared) |
| |
| //Interleave 32-bit pairs from the low halves only: x4 x5 x6 x7 y4 y5 y6 y7. |
| //A .4s store would additionally write the 16 zero bytes held in the upper |
| //halves of v0/v1, i.e. past the end of the 8-entry output block. |
| st2 {v0.2s, v1.2s}, [x1] |
| pop_v_regs |
| ret |
| |
| |
| |