common/x86/ih264_deblk_chroma_ssse3.c - platform/external/libavc - Git at Google

 /******************************************************************************
  *
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
 /*****************************************************************************/
 /*                                                                           */
 /*  File Name         : ih264_deblk_chroma_ssse3.c                           */
 /*                                                                           */
 /*  Description       : Contains function definitions for deblocking         */
 /*                                                                           */
 /*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
 /*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
 /*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
 /*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
 /*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
 /*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
 /*                                                                           */
 /*  Issues / Problems : None                                                 */
 /*                                                                           */
 /*  Revision History  :                                                      */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
 /*                                      intrinsics                           */
 /*                                                                           */
 /*****************************************************************************/

 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/

 /* System include files */
 #include <stdio.h>

 /* User include files */
 #include "ih264_typedefs.h"
 #include "ih264_platform_macros.h"
 #include "ih264_deblk_edge_filters.h"
 #include "ih264_macros.h"

 /*****************************************************************************/
 /* Function Definitions                                                      */
 /*****************************************************************************/

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  vertical edge when the boundary strength is set to 4 in  */
 /*                  high profile.                                            */
 /*                                                                           */
 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
 /*                  src_strd   - source stride                               */
 /*                  alpha_cb   - alpha value for the boundary in U           */
 /*                  beta_cb    - beta value for the boundary in U            */
 /*                  alpha_cr   - alpha value for the boundary in V           */
 /*                  beta_cr    - beta value for the boundary in V            */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
 /*                  title "Filtering process for edges for bS equal to 4" in */
 /*                  ITU T Rec H.264 with alpha and beta values different in  */
 /*                  U and V.                                                 */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        WORD32 alpha_cb,
                                        WORD32 beta_cb,
                                        WORD32 alpha_cr,
                                        WORD32 beta_cr)
 {
     /* Every row is accessed as 8 bytes starting 4 bytes before pu1_src,    */
     /* i.e. four 16-bit (U,V) pairs in the order p1, p0, q0, q1             */
     UWORD8 *pu1_row0 = pu1_src - 4;
     WORD32 row;

     __m128i rows[8];
     __m128i pair_01, pair_23, pair_45, pair_67;
     __m128i p_top, p_bot, q_top, q_bot;
     __m128i pp_lo, pp_hi, qq_lo, qq_hi;
     __m128i p1, p0, q0, q1;                 /* 8 rows x (U,V) bytes each    */
     __m128i p1_w, p0_w, q0_w, q1_w;         /* one half widened to 16 bits  */
     __m128i mask_lo, mask_hi, mask, keep;
     __m128i p0_lo, p0_hi, q0_lo, q0_hi;
     __m128i p0_new, q0_new;

     __m128i zero = _mm_setzero_si128();
     __m128i ones = _mm_set1_epi8(0xFF);
     __m128i two = _mm_set1_epi16(2);
     /* After widening, even 16-bit lanes hold U and odd lanes hold V, so    */
     /* the cb threshold sits in the low half-word and cr in the high one    */
     __m128i alpha_uv = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_uv = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     /* Fetch the 8 rows that straddle the vertical edge */
     for(row = 0; row < 8; row++)
     {
         rows[row] = _mm_loadl_epi64((__m128i *)(pu1_row0 + row * src_strd));
     }

     /* Transpose 8 rows x 4 pairs into 4 vectors of 8 pairs */
     pair_01 = _mm_unpacklo_epi16(rows[0], rows[1]);
     pair_23 = _mm_unpacklo_epi16(rows[2], rows[3]);
     pair_45 = _mm_unpacklo_epi16(rows[4], rows[5]);
     pair_67 = _mm_unpacklo_epi16(rows[6], rows[7]);

     p_top = _mm_unpacklo_epi32(pair_01, pair_23); /* p1|p0 of rows 0..3 */
     p_bot = _mm_unpacklo_epi32(pair_45, pair_67); /* p1|p0 of rows 4..7 */
     q_top = _mm_unpackhi_epi32(pair_01, pair_23); /* q0|q1 of rows 0..3 */
     q_bot = _mm_unpackhi_epi32(pair_45, pair_67); /* q0|q1 of rows 4..7 */

     p1 = _mm_unpacklo_epi64(p_top, p_bot);
     p0 = _mm_unpackhi_epi64(p_top, p_bot);
     q0 = _mm_unpacklo_epi64(q_top, q_bot);
     q1 = _mm_unpackhi_epi64(q_top, q_bot);

     /* ---------------- Lower 8 bytes (rows 0..3) ---------------- */
     p1_w = _mm_unpacklo_epi8(p1, zero);
     p0_w = _mm_unpacklo_epi8(p0, zero);
     q0_w = _mm_unpacklo_epi8(q0, zero);
     q1_w = _mm_unpacklo_epi8(q1, zero);

     /* |p0 - q0| < alpha  &&  |q1 - q0| < beta  &&  |p1 - p0| < beta */
     mask_lo = _mm_cmpgt_epi16(alpha_uv,
                               _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w)));
     mask_lo = _mm_and_si128(mask_lo,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w))));
     mask_lo = _mm_and_si128(mask_lo,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w))));

     /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
     p0_lo = _mm_add_epi16(_mm_slli_epi16(p1_w, 1), two);
     p0_lo = _mm_add_epi16(p0_lo, _mm_add_epi16(p0_w, q1_w));
     p0_lo = _mm_srai_epi16(p0_lo, 2);

     /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
     q0_lo = _mm_add_epi16(_mm_slli_epi16(q1_w, 1), two);
     q0_lo = _mm_add_epi16(q0_lo, _mm_add_epi16(p1_w, q0_w));
     q0_lo = _mm_srai_epi16(q0_lo, 2);

     /* ---------------- Upper 8 bytes (rows 4..7) ---------------- */
     p1_w = _mm_unpackhi_epi8(p1, zero);
     p0_w = _mm_unpackhi_epi8(p0, zero);
     q0_w = _mm_unpackhi_epi8(q0, zero);
     q1_w = _mm_unpackhi_epi8(q1, zero);

     mask_hi = _mm_cmpgt_epi16(alpha_uv,
                               _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w)));
     mask_hi = _mm_and_si128(mask_hi,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w))));
     mask_hi = _mm_and_si128(mask_hi,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w))));

     p0_hi = _mm_add_epi16(_mm_slli_epi16(p1_w, 1), two);
     p0_hi = _mm_add_epi16(p0_hi, _mm_add_epi16(p0_w, q1_w));
     p0_hi = _mm_srai_epi16(p0_hi, 2);

     q0_hi = _mm_add_epi16(_mm_slli_epi16(q1_w, 1), two);
     q0_hi = _mm_add_epi16(q0_hi, _mm_add_epi16(p1_w, q0_w));
     q0_hi = _mm_srai_epi16(q0_hi, 2);

     /* Narrow results and masks back to one byte per sample */
     p0_new = _mm_packus_epi16(p0_lo, p0_hi);
     q0_new = _mm_packus_epi16(q0_lo, q0_hi);
     mask = _mm_packs_epi16(mask_lo, mask_hi); /* 0xFFFF -> 0xFF, 0 -> 0 */
     keep = _mm_xor_si128(mask, ones);         /* bytes left unfiltered  */

     /* Select filtered value where mask is set, original otherwise; the two */
     /* operands never overlap, so the byte add acts as a merge              */
     p0 = _mm_add_epi8(_mm_and_si128(p0, keep), _mm_and_si128(p0_new, mask));
     q0 = _mm_add_epi8(_mm_and_si128(q0, keep), _mm_and_si128(q0_new, mask));

     /* Transpose back to row order: each 128-bit result holds two rows */
     pp_lo = _mm_unpacklo_epi16(p1, p0);
     pp_hi = _mm_unpackhi_epi16(p1, p0);
     qq_lo = _mm_unpacklo_epi16(q0, q1);
     qq_hi = _mm_unpackhi_epi16(q0, q1);

     rows[0] = _mm_unpacklo_epi32(pp_lo, qq_lo);
     rows[2] = _mm_unpackhi_epi32(pp_lo, qq_lo);
     rows[4] = _mm_unpacklo_epi32(pp_hi, qq_hi);
     rows[6] = _mm_unpackhi_epi32(pp_hi, qq_hi);

     /* Odd rows live in the upper 8 bytes of the preceding even row */
     for(row = 0; row < 8; row += 2)
     {
         rows[row + 1] = _mm_srli_si128(rows[row], 8);
     }

     for(row = 0; row < 8; row++)
     {
         _mm_storel_epi64((__m128i *)(pu1_row0 + row * src_strd), rows[row]);
     }

 }

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  horizontal edge when the boundary strength is set to 4   */
 /*                  in high profile.                                         */
 /*                                                                           */
 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
 /*                  src_strd   - source stride                               */
 /*                  alpha_cb   - alpha value for the boundary in U           */
 /*                  beta_cb    - beta value for the boundary in U            */
 /*                  alpha_cr   - alpha value for the boundary in V           */
 /*                  beta_cr    - beta value for the boundary in V            */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
 /*                  title "Filtering process for edges for bS equal to 4" in */
 /*                  ITU T Rec H.264 with alpha and beta values different in  */
 /*                  U and V.                                                 */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        WORD32 alpha_cb,
                                        WORD32 beta_cb,
                                        WORD32 alpha_cr,
                                        WORD32 beta_cr)
 {
     /* Row pointers around the horizontal edge. Offsets are formed directly */
     /* from the WORD32 stride (no 16-bit temporaries, no shift of a signed  */
     /* value) so that large or negative strides address the correct rows.   */
     UWORD8 *pu1_q0_row = pu1_src;
     UWORD8 *pu1_q1_row = pu1_src + src_strd;
     UWORD8 *pu1_p0_row = pu1_src - src_strd;
     UWORD8 *pu1_p1_row = pu1_src - 2 * src_strd;

     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
     __m128i flag1, flag2, flag_inv;
     __m128i diff;
     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
     __m128i temp1, temp2;

     __m128i zero = _mm_setzero_si128();
     __m128i round_2 = _mm_set1_epi16(2);
     /* After widening bytes to 16 bits, even lanes hold U and odd lanes     */
     /* hold V; pack the thresholds so each lane meets its own plane's value */
     __m128i alpha_cbcr_16x8 = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_cbcr_16x8 = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     /* 16 bytes (8 interleaved U,V pairs) from each of the four rows */
     q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q0_row));
     q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q1_row));
     p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p1_row));
     p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p0_row));

     /* ---------------- Lower 8 bytes ---------------- */
     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

     /* Condition 1: |p0 - q0| < alpha */
     diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

     /* Condition 2: |q1 - q0| < beta */
     diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     /* Condition 3: |p1 - p0| < beta */
     diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
     temp1 = _mm_add_epi16(_mm_slli_epi16(p1_uv_8x16, 1), round_2);
     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
     p0_uv_8x16_1 = _mm_srai_epi16(_mm_add_epi16(temp1, temp2), 2);

     /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
     temp1 = _mm_add_epi16(_mm_slli_epi16(q1_uv_8x16, 1), round_2);
     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
     q0_uv_8x16_1 = _mm_srai_epi16(_mm_add_epi16(temp1, temp2), 2);

     /* ---------------- Upper 8 bytes ---------------- */
     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

     diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

     diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     temp1 = _mm_add_epi16(_mm_slli_epi16(p1_uv_8x16, 1), round_2);
     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
     p0_uv_8x16_2 = _mm_srai_epi16(_mm_add_epi16(temp1, temp2), 2);

     temp1 = _mm_add_epi16(_mm_slli_epi16(q1_uv_8x16, 1), round_2);
     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
     q0_uv_8x16_2 = _mm_srai_epi16(_mm_add_epi16(temp1, temp2), 2);

     /* Narrow filtered samples and the per-lane masks back to bytes */
     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

     flag1 = _mm_packs_epi16(flag1, flag2);   /* 0xFFFF -> 0xFF, 0 -> 0 */
     flag_inv = _mm_xor_si128(flag1, _mm_set1_epi8(0xFF));

     /* Keep the original byte where the mask is clear, take the filtered    */
     /* byte where it is set; the operands are disjoint so add == merge.     */
     /* Only the p0 and q0 rows are written back; p1 and q1 are read-only.   */
     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, flag_inv);
     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
     p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
     _mm_storeu_si128((__m128i *)(pu1_p0_row), p0_uv_8x16_1);

     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, flag_inv);
     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
     q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
     _mm_storeu_si128((__m128i *)(pu1_q0_row), q0_uv_8x16_1);

 }

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  vertical edge when the boundary strength is less than 4  */
 /*                  in high profile.                                         */
 /*                                                                           */
 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
 /*                  src_strd         - source stride                         */
 /*                  alpha_cb         - alpha value for the boundary in U     */
 /*                  beta_cb          - beta value for the boundary in U      */
 /*                  alpha_cr         - alpha value for the boundary in V     */
 /*                  beta_cr          - beta value for the boundary in V      */
 /*                  u4_bs            - packed Boundary strength array        */
 /*                  pu1_cliptab_cb   - tc0_table for U                       */
 /*                  pu1_cliptab_cr   - tc0_table for V                       */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
 /*                  title "Filtering process for edges for bS less than 4"   */
 /*                  in ITU T Rec H.264 with alpha and beta values different  */
 /*                  in U and V.                                              */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                          WORD32 src_strd,
                                          WORD32 alpha_cb,
                                          WORD32 beta_cb,
                                          WORD32 alpha_cr,
                                          WORD32 beta_cr,
                                          UWORD32 u4_bs,
                                          const UWORD8 *pu1_cliptab_cb,
                                          const UWORD8 *pu1_cliptab_cr)
 {
     /* Every row is accessed as 8 bytes starting 4 bytes before pu1_src,    */
     /* i.e. four 16-bit (U,V) pairs in the order p1, p0, q0, q1             */
     UWORD8 *pu1_row0 = pu1_src - 4;
     WORD32 row, k;
     UWORD8 bs[4];   /* bs[0] is taken from bits 31..24 of u4_bs, bs[3] from bits 7..0 */

     __m128i rows[8];
     __m128i pair_01, pair_23, pair_45, pair_67;
     __m128i p_top, p_bot, q_top, q_bot;
     __m128i pp_lo, pp_hi, qq_lo, qq_hi;
     __m128i p1, p0, q0, q1;                 /* 8 rows x (U,V) bytes each    */
     __m128i p1_w, p0_w, q0_w, q1_w;         /* one half widened to 16 bits  */
     __m128i bs_mask, mask_lo, mask_hi, mask, keep;
     __m128i delta, tc_lo, tc_hi, tc_neg;
     __m128i p0_lo, p0_hi, q0_lo, q0_hi;
     __m128i p0_new, q0_new;

     __m128i zero = _mm_setzero_si128();
     __m128i ones = _mm_set1_epi8(0xFF);
     __m128i one_w = _mm_set1_epi16(1);
     __m128i four_w = _mm_set1_epi16(4);
     /* After widening, even 16-bit lanes hold U and odd lanes hold V, so    */
     /* the cb threshold sits in the low half-word and cr in the high one    */
     __m128i alpha_uv = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_uv = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     /* Unpack the four boundary strengths, most significant byte first */
     for(k = 0; k < 4; k++)
     {
         bs[k] = (UWORD8)((u4_bs >> (24 - 8 * k)) & 0xff);
     }

     /* Byte mask that is 0xFF wherever bS != 0; each bS value covers 4      */
     /* bytes (two rows of one U,V pair) of the transposed vectors           */
     bs_mask = _mm_set_epi8(bs[3], bs[3], bs[3], bs[3], bs[2], bs[2],
                            bs[2], bs[2], bs[1], bs[1], bs[1], bs[1],
                            bs[0], bs[0], bs[0], bs[0]);
     bs_mask = _mm_xor_si128(_mm_cmpeq_epi8(bs_mask, zero), ones);

     /* Clip limit = table value + 1, laid out per lane as U,V for each row; */
     /* rows 0..3 use bs[0]/bs[1], rows 4..7 use bs[2]/bs[3]                 */
     tc_lo = _mm_set_epi16(pu1_cliptab_cr[bs[1]], pu1_cliptab_cb[bs[1]],
                           pu1_cliptab_cr[bs[1]], pu1_cliptab_cb[bs[1]],
                           pu1_cliptab_cr[bs[0]], pu1_cliptab_cb[bs[0]],
                           pu1_cliptab_cr[bs[0]], pu1_cliptab_cb[bs[0]]);
     tc_lo = _mm_add_epi16(tc_lo, one_w);

     tc_hi = _mm_set_epi16(pu1_cliptab_cr[bs[3]], pu1_cliptab_cb[bs[3]],
                           pu1_cliptab_cr[bs[3]], pu1_cliptab_cb[bs[3]],
                           pu1_cliptab_cr[bs[2]], pu1_cliptab_cb[bs[2]],
                           pu1_cliptab_cr[bs[2]], pu1_cliptab_cb[bs[2]]);
     tc_hi = _mm_add_epi16(tc_hi, one_w);

     /* Fetch the 8 rows that straddle the vertical edge */
     for(row = 0; row < 8; row++)
     {
         rows[row] = _mm_loadl_epi64((__m128i *)(pu1_row0 + row * src_strd));
     }

     /* Transpose 8 rows x 4 pairs into 4 vectors of 8 pairs */
     pair_01 = _mm_unpacklo_epi16(rows[0], rows[1]);
     pair_23 = _mm_unpacklo_epi16(rows[2], rows[3]);
     pair_45 = _mm_unpacklo_epi16(rows[4], rows[5]);
     pair_67 = _mm_unpacklo_epi16(rows[6], rows[7]);

     p_top = _mm_unpacklo_epi32(pair_01, pair_23); /* p1|p0 of rows 0..3 */
     p_bot = _mm_unpacklo_epi32(pair_45, pair_67); /* p1|p0 of rows 4..7 */
     q_top = _mm_unpackhi_epi32(pair_01, pair_23); /* q0|q1 of rows 0..3 */
     q_bot = _mm_unpackhi_epi32(pair_45, pair_67); /* q0|q1 of rows 4..7 */

     p1 = _mm_unpacklo_epi64(p_top, p_bot);
     p0 = _mm_unpackhi_epi64(p_top, p_bot);
     q0 = _mm_unpacklo_epi64(q_top, q_bot);
     q1 = _mm_unpackhi_epi64(q_top, q_bot);

     /* ---------------- Lower 8 bytes (rows 0..3) ---------------- */
     p1_w = _mm_unpacklo_epi8(p1, zero);
     p0_w = _mm_unpacklo_epi8(p0, zero);
     q0_w = _mm_unpacklo_epi8(q0, zero);
     q1_w = _mm_unpacklo_epi8(q1, zero);

     /* |p0 - q0| < alpha  &&  |q1 - q0| < beta  &&  |p1 - p0| < beta */
     mask_lo = _mm_cmpgt_epi16(alpha_uv,
                               _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w)));
     mask_lo = _mm_and_si128(mask_lo,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w))));
     mask_lo = _mm_and_si128(mask_lo,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w))));

     /* delta = clip(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) */
     delta = _mm_slli_epi16(_mm_subs_epi16(q0_w, p0_w), 2);
     delta = _mm_add_epi16(delta, _mm_subs_epi16(p1_w, q1_w));
     delta = _mm_srai_epi16(_mm_add_epi16(delta, four_w), 3);

     tc_neg = _mm_subs_epi16(zero, tc_lo);
     delta = _mm_min_epi16(tc_lo, delta);
     delta = _mm_max_epi16(tc_neg, delta);

     p0_lo = _mm_add_epi16(p0_w, delta);
     q0_lo = _mm_sub_epi16(q0_w, delta);

     /* ---------------- Upper 8 bytes (rows 4..7) ---------------- */
     p1_w = _mm_unpackhi_epi8(p1, zero);
     p0_w = _mm_unpackhi_epi8(p0, zero);
     q0_w = _mm_unpackhi_epi8(q0, zero);
     q1_w = _mm_unpackhi_epi8(q1, zero);

     mask_hi = _mm_cmpgt_epi16(alpha_uv,
                               _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w)));
     mask_hi = _mm_and_si128(mask_hi,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w))));
     mask_hi = _mm_and_si128(mask_hi,
                             _mm_cmpgt_epi16(beta_uv,
                                             _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w))));

     delta = _mm_slli_epi16(_mm_subs_epi16(q0_w, p0_w), 2);
     delta = _mm_add_epi16(delta, _mm_subs_epi16(p1_w, q1_w));
     delta = _mm_srai_epi16(_mm_add_epi16(delta, four_w), 3);

     tc_neg = _mm_subs_epi16(zero, tc_hi);
     delta = _mm_min_epi16(tc_hi, delta);
     delta = _mm_max_epi16(tc_neg, delta);

     p0_hi = _mm_add_epi16(p0_w, delta);
     q0_hi = _mm_sub_epi16(q0_w, delta);

     /* Narrow with unsigned saturation, which also clamps to 0..255 */
     p0_new = _mm_packus_epi16(p0_lo, p0_hi);
     q0_new = _mm_packus_epi16(q0_lo, q0_hi);

     /* Final mask: threshold conditions AND non-zero boundary strength */
     mask = _mm_packs_epi16(mask_lo, mask_hi);
     mask = _mm_and_si128(mask, bs_mask);
     keep = _mm_xor_si128(mask, ones);

     /* Select filtered value where mask is set, original otherwise; the two */
     /* operands never overlap, so the byte add acts as a merge              */
     p0 = _mm_add_epi8(_mm_and_si128(p0, keep), _mm_and_si128(p0_new, mask));
     q0 = _mm_add_epi8(_mm_and_si128(q0, keep), _mm_and_si128(q0_new, mask));

     /* Transpose back to row order: each 128-bit result holds two rows */
     pp_lo = _mm_unpacklo_epi16(p1, p0);
     pp_hi = _mm_unpackhi_epi16(p1, p0);
     qq_lo = _mm_unpacklo_epi16(q0, q1);
     qq_hi = _mm_unpackhi_epi16(q0, q1);

     rows[0] = _mm_unpacklo_epi32(pp_lo, qq_lo);
     rows[2] = _mm_unpackhi_epi32(pp_lo, qq_lo);
     rows[4] = _mm_unpacklo_epi32(pp_hi, qq_hi);
     rows[6] = _mm_unpackhi_epi32(pp_hi, qq_hi);

     /* Odd rows live in the upper 8 bytes of the preceding even row */
     for(row = 0; row < 8; row += 2)
     {
         rows[row + 1] = _mm_srli_si128(rows[row], 8);
     }

     for(row = 0; row < 8; row++)
     {
         _mm_storel_epi64((__m128i *)(pu1_row0 + row * src_strd), rows[row]);
     }

 }

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  horizontal edge when the boundary strength is less than  */
 /*                  4 in high profile.                                       */
 /*                                                                           */
 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
 /*                  src_strd         - source stride                         */
 /*                  alpha_cb         - alpha value for the boundary in U     */
 /*                  beta_cb          - beta value for the boundary in U      */
 /*                  alpha_cr         - alpha value for the boundary in V     */
 /*                  beta_cr          - beta value for the boundary in V      */
 /*                  u4_bs            - packed Boundary strength array        */
 /*                  pu1_cliptab_cb   - tc0_table for U                       */
 /*                  pu1_cliptab_cr   - tc0_table for V                       */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
 /*                  title "Filtering process for edges for bS less than 4"   */
 /*                  in ITU T Rec H.264 with alpha and beta values different  */
 /*                  in U and V.                                              */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                          WORD32 src_strd,
                                          WORD32 alpha_cb,
                                          WORD32 beta_cb,
                                          WORD32 alpha_cr,
                                          WORD32 beta_cr,
                                          UWORD32 u4_bs,
                                          const UWORD8 *pu1_cliptab_cb,
                                          const UWORD8 *pu1_cliptab_cr)
 {
     /*
      * Filters 16 interleaved chroma bytes across a horizontal edge.
      * Even bytes are filtered with the Cb thresholds/clip table and odd
      * bytes with the Cr ones. Sixteen bytes are read from each of the rows
      * p1, p0, q0 and q1; only the p0 and q0 rows are written back.
      *
      * The row addresses are formed with full-width pointer arithmetic.
      * Routing the stride through 16-bit temporaries would truncate any
      * stride that does not fit in a WORD16 and address the wrong rows.
      */
     UWORD8 *pu1_q0_row = pu1_src;
     UWORD8 *pu1_q1_row = pu1_src + src_strd;
     UWORD8 *pu1_p0_row = pu1_src - src_strd;
     UWORD8 *pu1_p1_row = pu1_p0_row - src_strd;

     /* One strength byte per group of four consecutive bytes (two U/V   */
     /* pairs); the most significant byte of u4_bs covers the first group */
     UWORD8 u1_Bs0 = (u4_bs >> 24) & 0xff;
     UWORD8 u1_Bs1 = (u4_bs >> 16) & 0xff;
     UWORD8 u1_Bs2 = (u4_bs >> 8) & 0xff;
     UWORD8 u1_Bs3 = (u4_bs >> 0) & 0xff;

     /* Constants shared by the lower and the upper half of the row.      */
     /* set1_epi32 places the Cb value in even and the Cr value in odd    */
     /* 16-bit lanes, matching the U,V,U,V sample order after widening    */
     __m128i zero = _mm_setzero_si128();
     __m128i all_ones = _mm_set1_epi8(0xFF);
     __m128i round_4 = _mm_set1_epi16(4);
     __m128i one_8x16 = _mm_set1_epi16(1);
     __m128i alpha_cbcr_16x8 = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_cbcr_16x8 = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
     __m128i flag_bs, flag_lo, flag_hi, flag, flag_keep;
     __m128i diff, diff1, delta, tc_8x16;
     __m128i p0_lo, q0_lo, p0_hi, q0_hi, p0_filt, q0_filt;

     /* Byte mask: 0xFF where the boundary strength is non-zero */
     flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                            u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                            u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
     flag_bs = _mm_xor_si128(_mm_cmpeq_epi8(flag_bs, zero), all_ones);

     q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q0_row));
     q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q1_row));
     p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p1_row));
     p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p0_row));

     /* ---- Lower 8 bytes, widened to 16 bits (strengths Bs0, Bs1) ---- */
     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

     /* |p0 - q0| < alpha */
     diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
     flag_lo = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

     /* |q1 - q0| < beta */
     diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
     flag_lo = _mm_and_si128(flag_lo, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     /* |p1 - p0| < beta */
     diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
     flag_lo = _mm_and_si128(flag_lo, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
     diff = _mm_slli_epi16(_mm_subs_epi16(q0_uv_8x16, p0_uv_8x16), 2);
     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
     diff = _mm_add_epi16(_mm_add_epi16(diff, diff1), round_4);
     delta = _mm_srai_epi16(diff, 3);

     /* Clip limit per lane: clip-table entry for the strength, plus 1 */
     tc_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                             pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                             pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                             pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
     tc_8x16 = _mm_add_epi16(tc_8x16, one_8x16);

     /* delta = CLIP3(-tc, tc, delta) */
     delta = _mm_min_epi16(tc_8x16, delta);
     delta = _mm_max_epi16(_mm_subs_epi16(zero, tc_8x16), delta);

     p0_lo = _mm_add_epi16(p0_uv_8x16, delta);
     q0_lo = _mm_sub_epi16(q0_uv_8x16, delta);

     /* ---- Upper 8 bytes, widened to 16 bits (strengths Bs2, Bs3) ---- */
     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

     diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
     flag_hi = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

     diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
     flag_hi = _mm_and_si128(flag_hi, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
     flag_hi = _mm_and_si128(flag_hi, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

     diff = _mm_slli_epi16(_mm_subs_epi16(q0_uv_8x16, p0_uv_8x16), 2);
     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
     diff = _mm_add_epi16(_mm_add_epi16(diff, diff1), round_4);
     delta = _mm_srai_epi16(diff, 3);

     tc_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                             pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                             pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                             pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
     tc_8x16 = _mm_add_epi16(tc_8x16, one_8x16);

     delta = _mm_min_epi16(tc_8x16, delta);
     delta = _mm_max_epi16(_mm_subs_epi16(zero, tc_8x16), delta);

     p0_hi = _mm_add_epi16(p0_uv_8x16, delta);
     q0_hi = _mm_sub_epi16(q0_uv_8x16, delta);

     /* Narrow back to bytes with unsigned saturation */
     p0_filt = _mm_packus_epi16(p0_lo, p0_hi);
     q0_filt = _mm_packus_epi16(q0_lo, q0_hi);

     /* Final byte mask: threshold conditions AND non-zero strength. */
     /* The 16-bit flags are 0 or -1, so the signed pack keeps them  */
     /* as 0x00 or 0xFF                                              */
     flag = _mm_packs_epi16(flag_lo, flag_hi);
     flag = _mm_and_si128(flag, flag_bs);
     flag_keep = _mm_xor_si128(flag, all_ones);

     /* Select filtered bytes where flag is set, original bytes elsewhere */
     p0_filt = _mm_add_epi8(_mm_and_si128(p0_uv_16x8, flag_keep),
                            _mm_and_si128(p0_filt, flag));
     _mm_storeu_si128((__m128i *)(pu1_p0_row), p0_filt);

     q0_filt = _mm_add_epi8(_mm_and_si128(q0_uv_16x8, flag_keep),
                            _mm_and_si128(q0_filt, flag));
     _mm_storeu_si128((__m128i *)(pu1_q0_row), q0_filt);

 }

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  vertical edge when boundary strength is set to 4 in high */
 /*                  profile.                                                 */
 /*                                                                           */
 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
 /*                  src_strd         - source stride                         */
 /*                  alpha_cb         - alpha value for the boundary in U     */
 /*                  beta_cb          - beta value for the boundary in U      */
 /*                  alpha_cr         - alpha value for the boundary in V     */
 /*                  beta_cr          - beta value for the boundary in V      */
 /*                  (this function takes no boundary strength or clip table  */
 /*                  arguments; it is the filter used when the boundary       */
 /*                  strength is 4)                                           */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : When the function is called twice, this operation is as  */
 /*                  described in Sec. 8.7.2.4 under the title "Filtering     */
 /*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
 /*                  with alpha and beta values different in U and V.         */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                              WORD32 src_strd,
                                              WORD32 alpha_cb,
                                              WORD32 beta_cb,
                                              WORD32 alpha_cr,
                                              WORD32 beta_cr)
 {
     /*
      * Filters four rows across a vertical edge. Each row contributes the
      * 8 bytes starting 4 bytes before pu1_src: the byte pairs p1, p0, q0
      * and q1. Within a pair the first byte is filtered with the Cb
      * thresholds and the second with the Cr thresholds. All 8 bytes of
      * every row are stored back; p1 and q1 are rewritten unchanged.
      */
     UWORD8 *pu1_row0 = pu1_src - 4;
     UWORD8 *pu1_row1 = pu1_row0 + src_strd;
     UWORD8 *pu1_row2 = pu1_row0 + 2 * src_strd;
     UWORD8 *pu1_row3 = pu1_row0 + 3 * src_strd;

     __m128i zero = _mm_setzero_si128();
     __m128i all_ones = _mm_set1_epi8(0xFF);
     __m128i round_2 = _mm_set1_epi16(2);
     /* Cb threshold in even, Cr threshold in odd 16-bit lanes */
     __m128i alpha_cbcr_16x8 = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_cbcr_16x8 = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     __m128i rows_01, rows_23, left_cols, right_cols;
     __m128i p1_bytes, p0_bytes, q0_bytes, q1_bytes;
     __m128i p1_w, p0_w, q0_w, q1_w;
     __m128i filt_mask, keep_mask, abs_diff;
     __m128i p0_filt, q0_filt;
     __m128i out_01, out_23;

     /* Transpose: gather each column (p1, p0, q0, q1) of the four rows */
     rows_01 = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i *)pu1_row0),
                                  _mm_loadl_epi64((__m128i *)pu1_row1));
     rows_23 = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i *)pu1_row2),
                                  _mm_loadl_epi64((__m128i *)pu1_row3));

     p1_bytes = _mm_unpacklo_epi32(rows_01, rows_23);
     p0_bytes = _mm_srli_si128(p1_bytes, 8);
     q0_bytes = _mm_unpackhi_epi32(rows_01, rows_23);
     q1_bytes = _mm_srli_si128(q0_bytes, 8);

     /* Widen the low 8 bytes of every column to 16-bit lanes */
     p1_w = _mm_unpacklo_epi8(p1_bytes, zero);
     p0_w = _mm_unpacklo_epi8(p0_bytes, zero);
     q0_w = _mm_unpacklo_epi8(q0_bytes, zero);
     q1_w = _mm_unpacklo_epi8(q1_bytes, zero);

     /* Filter only where |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta */
     abs_diff = _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w));
     filt_mask = _mm_cmpgt_epi16(alpha_cbcr_16x8, abs_diff);

     abs_diff = _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w));
     filt_mask = _mm_and_si128(filt_mask,
                               _mm_cmpgt_epi16(beta_cbcr_16x8, abs_diff));

     abs_diff = _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w));
     filt_mask = _mm_and_si128(filt_mask,
                               _mm_cmpgt_epi16(beta_cbcr_16x8, abs_diff));

     /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
     p0_filt = _mm_add_epi16(_mm_slli_epi16(p1_w, 1), round_2);
     p0_filt = _mm_add_epi16(p0_filt, _mm_add_epi16(p0_w, q1_w));
     p0_filt = _mm_srai_epi16(p0_filt, 2);

     /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
     q0_filt = _mm_add_epi16(_mm_slli_epi16(q1_w, 1), round_2);
     q0_filt = _mm_add_epi16(q0_filt, _mm_add_epi16(p1_w, q0_w));
     q0_filt = _mm_srai_epi16(q0_filt, 2);

     /* Back to bytes; only the low 8 bytes are consumed further down */
     p0_filt = _mm_packus_epi16(p0_filt, p0_filt);
     q0_filt = _mm_packus_epi16(q0_filt, q0_filt);

     filt_mask = _mm_packs_epi16(filt_mask, filt_mask);
     keep_mask = _mm_xor_si128(filt_mask, all_ones);

     /* Filtered value where the mask is set, original value elsewhere */
     p0_bytes = _mm_add_epi8(_mm_and_si128(p0_bytes, keep_mask),
                             _mm_and_si128(p0_filt, filt_mask));
     q0_bytes = _mm_add_epi8(_mm_and_si128(q0_bytes, keep_mask),
                             _mm_and_si128(q0_filt, filt_mask));

     /* Inverse transpose: rebuild the four 8-byte rows */
     left_cols = _mm_unpacklo_epi16(p1_bytes, p0_bytes);
     right_cols = _mm_unpacklo_epi16(q0_bytes, q1_bytes);

     out_01 = _mm_unpacklo_epi32(left_cols, right_cols);
     out_23 = _mm_unpackhi_epi32(left_cols, right_cols);

     _mm_storel_epi64((__m128i *)pu1_row0, out_01);
     _mm_storel_epi64((__m128i *)pu1_row1, _mm_srli_si128(out_01, 8));
     _mm_storel_epi64((__m128i *)pu1_row2, out_23);
     _mm_storel_epi64((__m128i *)pu1_row3, _mm_srli_si128(out_23, 8));

 }

 /*****************************************************************************/
 /*                                                                           */
 /*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
 /*                                                                           */
 /*  Description   : This function performs filtering of a chroma block       */
 /*                  vertical edge when boundary strength is less than 4 in   */
 /*                  high profile.                                            */
 /*                                                                           */
 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
 /*                  src_strd         - source stride                         */
 /*                  alpha_cb         - alpha value for the boundary in U     */
 /*                  beta_cb          - beta value for the boundary in U      */
 /*                  alpha_cr         - alpha value for the boundary in V     */
 /*                  beta_cr          - beta value for the boundary in V      */
 /*                  u4_bs            - packed Boundary strength array        */
 /*                  pu1_cliptab_cb   - tc0_table for U                       */
 /*                  pu1_cliptab_cr   - tc0_table for V                       */
 /*                                                                           */
 /*  Globals       : None                                                     */
 /*                                                                           */
 /*  Processing    : When the function is called twice, this operation is as  */
 /*                  described in Sec. 8.7.2.3 under the title "Filtering     */
 /*                  process for edges for bS less than 4" in ITU T Rec H.264 */
 /*                  with alpha and beta values different in U and V.         */
 /*                                                                           */
 /*  Outputs       : None                                                     */
 /*                                                                           */
 /*  Returns       : None                                                     */
 /*                                                                           */
 /*  Issues        : None                                                     */
 /*                                                                           */
 /*  Revision History:                                                        */
 /*                                                                           */
 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
 /*         12 02 2015   Naveen Kumar P  Initial version                      */
 /*                                                                           */
 /*****************************************************************************/
 void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                                WORD32 src_strd,
                                                WORD32 alpha_cb,
                                                WORD32 beta_cb,
                                                WORD32 alpha_cr,
                                                WORD32 beta_cr,
                                                UWORD32 u4_bs,
                                                const UWORD8 *pu1_cliptab_cb,
                                                const UWORD8 *pu1_cliptab_cr)
 {
     /*
      * Filters four rows across a vertical edge, one boundary strength per
      * row. Each row contributes the 8 bytes starting 4 bytes before
      * pu1_src: the byte pairs p1, p0, q0 and q1. Within a pair the first
      * byte uses the Cb thresholds/clip table and the second the Cr ones.
      * All 8 bytes of every row are stored back; p1 and q1 are rewritten
      * unchanged.
      */
     UWORD8 *pu1_row0 = pu1_src - 4;
     UWORD8 *pu1_row1 = pu1_row0 + src_strd;
     UWORD8 *pu1_row2 = pu1_row0 + 2 * src_strd;
     UWORD8 *pu1_row3 = pu1_row0 + 3 * src_strd;

     /* Strength per row; the most significant byte belongs to row 0 */
     UWORD8 u1_bs_row0 = (u4_bs >> 24) & 0xff;
     UWORD8 u1_bs_row1 = (u4_bs >> 16) & 0xff;
     UWORD8 u1_bs_row2 = (u4_bs >> 8) & 0xff;
     UWORD8 u1_bs_row3 = (u4_bs >> 0) & 0xff;

     __m128i zero = _mm_setzero_si128();
     __m128i all_ones = _mm_set1_epi8(0xFF);
     /* Cb threshold in even, Cr threshold in odd 16-bit lanes */
     __m128i alpha_cbcr_16x8 = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
     __m128i beta_cbcr_16x8 = _mm_set1_epi32((beta_cr << 16) + beta_cb);

     __m128i rows_01, rows_23, left_cols, right_cols;
     __m128i p1_bytes, p0_bytes, q0_bytes, q1_bytes;
     __m128i p1_w, p0_w, q0_w, q1_w;
     __m128i bs_mask, filt_mask, keep_mask, abs_diff;
     __m128i delta, tc_8x16;
     __m128i p0_filt, q0_filt;
     __m128i out_01, out_23;

     /* Byte mask over the low 8 bytes: 0xFF where the row's strength is */
     /* non-zero. The upper 8 bytes compare equal to zero and end up 0   */
     bs_mask = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                            u1_bs_row3, u1_bs_row3, u1_bs_row2, u1_bs_row2,
                            u1_bs_row1, u1_bs_row1, u1_bs_row0, u1_bs_row0);
     bs_mask = _mm_xor_si128(_mm_cmpeq_epi8(bs_mask, zero), all_ones);

     /* Transpose: gather each column (p1, p0, q0, q1) of the four rows */
     rows_01 = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i *)pu1_row0),
                                  _mm_loadl_epi64((__m128i *)pu1_row1));
     rows_23 = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i *)pu1_row2),
                                  _mm_loadl_epi64((__m128i *)pu1_row3));

     p1_bytes = _mm_unpacklo_epi32(rows_01, rows_23);
     p0_bytes = _mm_srli_si128(p1_bytes, 8);
     q0_bytes = _mm_unpackhi_epi32(rows_01, rows_23);
     q1_bytes = _mm_srli_si128(q0_bytes, 8);

     /* Widen the low 8 bytes of every column to 16-bit lanes */
     p1_w = _mm_unpacklo_epi8(p1_bytes, zero);
     p0_w = _mm_unpacklo_epi8(p0_bytes, zero);
     q0_w = _mm_unpacklo_epi8(q0_bytes, zero);
     q1_w = _mm_unpacklo_epi8(q1_bytes, zero);

     /* Filter only where |p0-q0| < alpha, |q1-q0| < beta, |p1-p0| < beta */
     abs_diff = _mm_abs_epi16(_mm_subs_epi16(p0_w, q0_w));
     filt_mask = _mm_cmpgt_epi16(alpha_cbcr_16x8, abs_diff);

     abs_diff = _mm_abs_epi16(_mm_subs_epi16(q1_w, q0_w));
     filt_mask = _mm_and_si128(filt_mask,
                               _mm_cmpgt_epi16(beta_cbcr_16x8, abs_diff));

     abs_diff = _mm_abs_epi16(_mm_subs_epi16(p1_w, p0_w));
     filt_mask = _mm_and_si128(filt_mask,
                               _mm_cmpgt_epi16(beta_cbcr_16x8, abs_diff));

     /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
     delta = _mm_slli_epi16(_mm_subs_epi16(q0_w, p0_w), 2);
     delta = _mm_add_epi16(delta, _mm_subs_epi16(p1_w, q1_w));
     delta = _mm_add_epi16(delta, _mm_set1_epi16(4));
     delta = _mm_srai_epi16(delta, 3);

     /* Clip limit per lane: clip-table entry for the row's strength + 1 */
     tc_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_bs_row3],
                             pu1_cliptab_cb[u1_bs_row3],
                             pu1_cliptab_cr[u1_bs_row2],
                             pu1_cliptab_cb[u1_bs_row2],
                             pu1_cliptab_cr[u1_bs_row1],
                             pu1_cliptab_cb[u1_bs_row1],
                             pu1_cliptab_cr[u1_bs_row0],
                             pu1_cliptab_cb[u1_bs_row0]);
     tc_8x16 = _mm_add_epi16(tc_8x16, _mm_set1_epi16(1));

     /* delta = CLIP3(-tc, tc, delta) */
     delta = _mm_min_epi16(tc_8x16, delta);
     delta = _mm_max_epi16(_mm_subs_epi16(zero, tc_8x16), delta);

     p0_filt = _mm_add_epi16(p0_w, delta);
     q0_filt = _mm_sub_epi16(q0_w, delta);

     /* Back to bytes; only the low 8 bytes are consumed further down */
     p0_filt = _mm_packus_epi16(p0_filt, p0_filt);
     q0_filt = _mm_packus_epi16(q0_filt, q0_filt);

     /* Final mask: threshold conditions AND non-zero boundary strength */
     filt_mask = _mm_packs_epi16(filt_mask, filt_mask);
     filt_mask = _mm_and_si128(filt_mask, bs_mask);
     keep_mask = _mm_xor_si128(filt_mask, all_ones);

     /* Filtered value where the mask is set, original value elsewhere */
     p0_bytes = _mm_add_epi8(_mm_and_si128(p0_bytes, keep_mask),
                             _mm_and_si128(p0_filt, filt_mask));
     q0_bytes = _mm_add_epi8(_mm_and_si128(q0_bytes, keep_mask),
                             _mm_and_si128(q0_filt, filt_mask));

     /* Inverse transpose: rebuild the four 8-byte rows */
     left_cols = _mm_unpacklo_epi16(p1_bytes, p0_bytes);
     right_cols = _mm_unpacklo_epi16(q0_bytes, q1_bytes);

     out_01 = _mm_unpacklo_epi32(left_cols, right_cols);
     out_23 = _mm_unpackhi_epi32(left_cols, right_cols);

     _mm_storel_epi64((__m128i *)pu1_row0, out_01);
     _mm_storel_epi64((__m128i *)pu1_row1, _mm_srli_si128(out_01, 8));
     _mm_storel_epi64((__m128i *)pu1_row2, out_23);
     _mm_storel_epi64((__m128i *)pu1_row3, _mm_srli_si128(out_23, 8));

 }