| /****************************************************************************** |
| * |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| |
| /** |
| ******************************************************************************* |
| * @file |
| * impeg2_inter_pred_sse42_intr.c |
| * |
| * @brief |
| * Contains Motion compensation function definitions for MPEG2 decoder |
| * |
| * @author |
| * Mohit [100664] |
| * |
| * - impeg2_copy_mb_sse42() |
| * - impeg2_interpolate_sse42() |
| * - impeg2_mc_halfx_halfy_8x8_sse42() |
| * - impeg2_mc_halfx_fully_8x8_sse42() |
| * - impeg2_mc_fullx_halfy_8x8_sse42() |
| * - impeg2_mc_fullx_fully_8x8_sse42() |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| #include <stdio.h> |
| #include <string.h> |
| #include "iv_datatypedef.h" |
| #include "impeg2_macros.h" |
| #include "impeg2_defs.h" |
| #include "impeg2_inter_pred.h" |
| |
| #include <immintrin.h> |
| #include <emmintrin.h> |
| #include <smmintrin.h> |
| #include <tmmintrin.h> |
| |
| /******************************************************************************* |
| * Function Name : impeg2_copy_mb |
| * |
| * Description : copies 3 components to the frame from mc_buf |
| * |
| * Arguments : |
| * src_buf : Source Buffer |
| * dst_buf : Destination Buffer |
| * src_wd : Source Width |
| * dst_wd : destination Width |
| * |
| * Values Returned : None |
| *******************************************************************************/ |
| void impeg2_copy_mb_sse42(yuv_buf_t *src_buf, |
| yuv_buf_t *dst_buf, |
| UWORD32 src_wd, |
| UWORD32 dst_wd) |
| { |
| UWORD8 *src; |
| UWORD8 *dst; |
| __m128i src_r0, src_r1, src_r2, src_r3; |
| |
| /*******************************************************/ |
| /* copy Y */ |
| /*******************************************************/ |
| src = src_buf->pu1_y; |
| dst = dst_buf->pu1_y; |
| // Row 0-3 |
| src_r0 = _mm_loadu_si128((__m128i *) (src)); |
| src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); |
| src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); |
| src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); |
| |
| _mm_storeu_si128((__m128i *) dst, src_r0); |
| _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); |
| |
| // Row 4-7 |
| src += 4 * src_wd; |
| dst += 4 * dst_wd; |
| src_r0 = _mm_loadu_si128((__m128i *) (src)); |
| src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); |
| src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); |
| src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); |
| |
| _mm_storeu_si128((__m128i *) dst, src_r0); |
| _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); |
| |
| // Row 8-11 |
| src += 4 * src_wd; |
| dst += 4 * dst_wd; |
| src_r0 = _mm_loadu_si128((__m128i *) (src)); |
| src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); |
| src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); |
| src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); |
| |
| _mm_storeu_si128((__m128i *) dst, src_r0); |
| _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); |
| |
| // Row 12-15 |
| src += 4 * src_wd; |
| dst += 4 * dst_wd; |
| src_r0 = _mm_loadu_si128((__m128i *) (src)); |
| src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); |
| src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); |
| src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); |
| |
| _mm_storeu_si128((__m128i *) dst, src_r0); |
| _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); |
| |
| src_wd >>= 1; |
| dst_wd >>= 1; |
| |
| /*******************************************************/ |
| /* copy U */ |
| /*******************************************************/ |
| src = src_buf->pu1_u; |
| dst = dst_buf->pu1_u; |
| |
| // Row 0-3 |
| src_r0 = _mm_loadl_epi64((__m128i *)src); |
| src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); |
| |
| _mm_storel_epi64((__m128i *)dst, src_r0); |
| _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); |
| _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); |
| _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); |
| |
| // Row 4-7 |
| src += 4 * src_wd; |
| dst += 4 * dst_wd; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *)src); |
| src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); |
| |
| _mm_storel_epi64((__m128i *)dst, src_r0); |
| _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); |
| _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); |
| _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); |
| |
| /*******************************************************/ |
| /* copy V */ |
| /*******************************************************/ |
| src = src_buf->pu1_v; |
| dst = dst_buf->pu1_v; |
| // Row 0-3 |
| src_r0 = _mm_loadl_epi64((__m128i *)src); |
| src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); |
| |
| _mm_storel_epi64((__m128i *)dst, src_r0); |
| _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); |
| _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); |
| _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); |
| |
| // Row 4-7 |
| src += 4 * src_wd; |
| dst += 4 * dst_wd; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *)src); |
| src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); |
| |
| _mm_storel_epi64((__m128i *)dst, src_r0); |
| _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); |
| _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); |
| _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : impeg2_interpolate */ |
| /* */ |
| /* Description : averages the contents of buf_src1 and buf_src2 and stores*/ |
| /* result in buf_dst */ |
| /* */ |
| /* Inputs : buf_src1 - First Source */ |
| /* buf_src2 - Second Source */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : Avg the values from two sources and store the result in */ |
| /* destination buffer */ |
| /* */ |
| /* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : Assumes that all 3 buffers are of same size */ |
| /* */ |
| /*****************************************************************************/ |
| void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, |
| yuv_buf_t *buf_src2, |
| yuv_buf_t *buf_dst, |
| UWORD32 stride) |
| { |
| UWORD8 *src1, *src2; |
| UWORD8 *dst; |
| __m128i src1_r0, src1_r1, src1_r2, src1_r3; |
| __m128i src2_r0, src2_r1, src2_r2, src2_r3; |
| |
| /*******************************************************/ |
| /* interpolate Y */ |
| /*******************************************************/ |
| src1 = buf_src1->pu1_y; |
| src2 = buf_src2->pu1_y; |
| dst = buf_dst->pu1_y; |
| // Row 0-3 |
| src1_r0 = _mm_loadu_si128((__m128i *) (src1)); |
| src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); |
| src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); |
| src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); |
| |
| src2_r0 = _mm_loadu_si128((__m128i *) (src2)); |
| src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); |
| src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); |
| src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storeu_si128((__m128i *) dst, src1_r0); |
| _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| // Row 4-7 |
| src1 += 4 * 16; |
| src2 += 4 * 16; |
| dst += 4 * stride; |
| src1_r0 = _mm_loadu_si128((__m128i *) (src1)); |
| src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); |
| src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); |
| src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); |
| |
| src2_r0 = _mm_loadu_si128((__m128i *) (src2)); |
| src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); |
| src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); |
| src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storeu_si128((__m128i *) dst, src1_r0); |
| _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| // Row 8-11 |
| src1 += 4 * 16; |
| src2 += 4 * 16; |
| dst += 4 * stride; |
| src1_r0 = _mm_loadu_si128((__m128i *) (src1)); |
| src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); |
| src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); |
| src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); |
| |
| src2_r0 = _mm_loadu_si128((__m128i *) (src2)); |
| src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); |
| src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); |
| src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storeu_si128((__m128i *) dst, src1_r0); |
| _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| // Row 12-15 |
| src1 += 4 * 16; |
| src2 += 4 * 16; |
| dst += 4 * stride; |
| src1_r0 = _mm_loadu_si128((__m128i *) (src1)); |
| src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); |
| src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); |
| src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); |
| |
| src2_r0 = _mm_loadu_si128((__m128i *) (src2)); |
| src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); |
| src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); |
| src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storeu_si128((__m128i *) dst, src1_r0); |
| _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); |
| _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| stride >>= 1; |
| |
| /*******************************************************/ |
| /* interpolate U */ |
| /*******************************************************/ |
| src1 = buf_src1->pu1_u; |
| src2 = buf_src2->pu1_u; |
| dst = buf_dst->pu1_u; |
| // Row 0-3 |
| src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); |
| src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); |
| src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); |
| src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); |
| |
| src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); |
| src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); |
| src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); |
| src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storel_epi64((__m128i *) dst, src1_r0); |
| _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); |
| _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| // Row 4-7 |
| src1 += 4 * 8; |
| src2 += 4 * 8; |
| dst += 4 * stride; |
| |
| src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); |
| src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); |
| src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); |
| src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); |
| |
| src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); |
| src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); |
| src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); |
| src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storel_epi64((__m128i *) dst, src1_r0); |
| _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); |
| _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| /*******************************************************/ |
| /* interpolate V */ |
| /*******************************************************/ |
| src1 = buf_src1->pu1_v; |
| src2 = buf_src2->pu1_v; |
| dst = buf_dst->pu1_v; |
| |
| // Row 0-3 |
| src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); |
| src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); |
| src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); |
| src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); |
| |
| src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); |
| src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); |
| src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); |
| src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storel_epi64((__m128i *) dst, src1_r0); |
| _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); |
| _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); |
| |
| // Row 4-7 |
| src1 += 4 * 8; |
| src2 += 4 * 8; |
| dst += 4 * stride; |
| |
| src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); |
| src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); |
| src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); |
| src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); |
| |
| src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); |
| src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); |
| src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); |
| src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); |
| |
| src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); |
| src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); |
| src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); |
| src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); |
| |
| _mm_storel_epi64((__m128i *) dst, src1_r0); |
| _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); |
| _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); |
| _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ |
| /* */ |
| /* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ |
| /* and the above block of size 8 x 8 will be placed as a */ |
| /* block from the current position of out_buf */ |
| /* */ |
| /* Inputs : ref - Reference frame from which the block will be */ |
| /* block will be extracted. */ |
| /* ref_wid - WIdth of reference frame */ |
| /* out_wid - WIdth of the output frame */ |
| /* blk_width - width of the block */ |
| /* blk_width - height of the block */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ |
| /* the ref frame.Interpolate these four values to get the */ |
| /* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ |
| /* using 9 x 9 block from reference frame */ |
| /* */ |
| /* Outputs : out - Output containing the extracted block */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /*****************************************************************************/ |
| void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, |
| UWORD8 *ref, |
| UWORD32 ref_wid, |
| UWORD32 out_wid) |
| { |
| UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; |
| /* P0-P3 are the pixels in the reference frame and Q is the value being */ |
| /* estimated */ |
| /* |
| P0 P1 |
| Q |
| P2 P3 |
| */ |
| __m128i src_r0, src_r0_1, src_r1, src_r1_1; |
| __m128i tmp0, tmp1; |
| __m128i value_2 = _mm_set1_epi16(2); |
| |
| ref_p0 = ref; |
| ref_p1 = ref + 1; |
| ref_p2 = ref + ref_wid; |
| ref_p3 = ref + ref_wid + 1; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); |
| src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 |
| src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| src_r1 = _mm_cvtepu8_epi16(src_r1); |
| src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); |
| |
| tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation |
| tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation |
| tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation |
| tmp0 = _mm_add_epi16(tmp0, value_2); |
| tmp0 = _mm_srli_epi16(tmp0, 2); |
| tmp0 = _mm_packus_epi16(tmp0, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp0); |
| |
| //Row 1 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 2 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation |
| tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation |
| tmp1 = _mm_add_epi16(tmp1, value_2); |
| tmp1 = _mm_srli_epi16(tmp1, 2); |
| tmp1 = _mm_packus_epi16(tmp1, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp1); |
| |
| //Row 2 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation |
| |
| tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation |
| tmp0 = _mm_add_epi16(tmp0, value_2); |
| tmp0 = _mm_srli_epi16(tmp0, 2); |
| tmp0 = _mm_packus_epi16(tmp0, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp0); |
| |
| //Row 3 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation |
| |
| tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation |
| tmp1 = _mm_add_epi16(tmp1, value_2); |
| tmp1 = _mm_srli_epi16(tmp1, 2); |
| tmp1 = _mm_packus_epi16(tmp1, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp1); |
| |
| //Row 4 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation |
| |
| tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical interpolation |
| tmp0 = _mm_add_epi16(tmp0, value_2); |
| tmp0 = _mm_srli_epi16(tmp0, 2); |
| tmp0 = _mm_packus_epi16(tmp0, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp0); |
| |
| //Row 5 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation |
| |
| tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation |
| tmp1 = _mm_add_epi16(tmp1, value_2); |
| tmp1 = _mm_srli_epi16(tmp1, 2); |
| tmp1 = _mm_packus_epi16(tmp1, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp1); |
| |
| //Row 6 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation |
| |
| tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation |
| tmp0 = _mm_add_epi16(tmp0, value_2); |
| tmp0 = _mm_srli_epi16(tmp0, 2); |
| tmp0 = _mm_packus_epi16(tmp0, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp0); |
| |
| //Row 7 |
| ref_p2 += ref_wid; |
| ref_p3 += ref_wid; |
| out += out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); |
| |
| src_r0 = _mm_cvtepu8_epi16(src_r0); |
| src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); |
| |
| tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation |
| |
| tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation |
| tmp1 = _mm_add_epi16(tmp1, value_2); |
| tmp1 = _mm_srli_epi16(tmp1, 2); |
| tmp1 = _mm_packus_epi16(tmp1, value_2); |
| |
| _mm_storel_epi64((__m128i *)out, tmp1); |
| |
| return; |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ |
| /* */ |
| /* Description : Gets the buffer from (0.5,0) to (8.5,8) */ |
| /* and the above block of size 8 x 8 will be placed as a */ |
| /* block from the current position of out_buf */ |
| /* */ |
| /* Inputs : ref - Reference frame from which the block will be */ |
| /* block will be extracted. */ |
| /* ref_wid - WIdth of reference frame */ |
| /* out_wid - WIdth of the output frame */ |
| /* blk_width - width of the block */ |
| /* blk_width - height of the block */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : Point to the (0,0) and (1,0) position in the ref frame */ |
| /* Interpolate these two values to get the value at(0.5,0) */ |
| /* Repeat this to get an 8 x 8 block using 9 x 8 block from */ |
| /* reference frame */ |
| /* */ |
| /* Outputs : out - Output containing the extracted block */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /*****************************************************************************/ |
| void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, |
| UWORD8 *ref, |
| UWORD32 ref_wid, |
| UWORD32 out_wid) |
| { |
| UWORD8 *ref_p0,*ref_p1; |
| __m128i src_r0, src_r0_1, src_r1, src_r1_1; |
| /* P0-P3 are the pixels in the reference frame and Q is the value being */ |
| /* estimated */ |
| /* |
| P0 Q P1 |
| */ |
| |
| ref_p0 = ref; |
| ref_p1 = ref + 1; |
| |
| // Row 0 and 1 |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); |
| src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 |
| src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); |
| |
| src_r0 = _mm_avg_epu8(src_r0, src_r0_1); |
| src_r1 = _mm_avg_epu8(src_r1, src_r1_1); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| |
| // Row 2 and 3 |
| ref_p0 += 2*ref_wid; |
| ref_p1 += 2*ref_wid; |
| out += 2*out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); |
| src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 3 |
| src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); |
| |
| src_r0 = _mm_avg_epu8(src_r0, src_r0_1); |
| src_r1 = _mm_avg_epu8(src_r1, src_r1_1); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| |
| // Row 4 and 5 |
| ref_p0 += 2*ref_wid; |
| ref_p1 += 2*ref_wid; |
| out += 2*out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); |
| src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 |
| src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); |
| |
| src_r0 = _mm_avg_epu8(src_r0, src_r0_1); |
| src_r1 = _mm_avg_epu8(src_r1, src_r1_1); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| |
| // Row 6 and 7 |
| ref_p0 += 2*ref_wid; |
| ref_p1 += 2*ref_wid; |
| out += 2*out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 |
| src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); |
| src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 |
| src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); |
| |
| src_r0 = _mm_avg_epu8(src_r0, src_r0_1); |
| src_r1 = _mm_avg_epu8(src_r1, src_r1_1); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| |
| return; |
| } |
| |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ |
| /* */ |
| /* Description : Gets the buffer from (0,0.5) to (8,8.5) */ |
| /* and the above block of size 8 x 8 will be placed as a */ |
| /* block from the current position of out_buf */ |
| /* */ |
| /* Inputs : ref - Reference frame from which the block will be */ |
| /* block will be extracted. */ |
| /* ref_wid - WIdth of reference frame */ |
| /* out_wid - WIdth of the output frame */ |
| /* blk_width - width of the block */ |
| /* blk_width - height of the block */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : Point to the (0,0) and (0,1) position in the ref frame */ |
| /* Interpolate these two values to get the value at(0,0.5) */ |
| /* Repeat this to get an 8 x 8 block using 8 x 9 block from */ |
| /* reference frame */ |
| /* */ |
| /* Outputs : out - Output containing the extracted block */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /*****************************************************************************/ |
| void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, |
| UWORD8 *ref, |
| UWORD32 ref_wid, |
| UWORD32 out_wid) |
| { |
| __m128i src_r0, src_r1, src_r2, temp0, temp1; |
| /* P0-P3 are the pixels in the reference frame and Q is the value being */ |
| /* estimated */ |
| /* |
| P0 |
| x |
| P1 |
| */ |
| src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 |
| src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 |
| src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 |
| temp0 = _mm_avg_epu8(src_r0, src_r1); |
| temp1 = _mm_avg_epu8(src_r1, src_r2); |
| _mm_storel_epi64((__m128i *)out, temp0); //Row 0 |
| _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 |
| |
| ref+= 3*ref_wid; |
| out+= 2*out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 |
| src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 |
| temp0 = _mm_avg_epu8(src_r2, src_r0); |
| temp1 = _mm_avg_epu8(src_r0, src_r1); |
| _mm_storel_epi64((__m128i *)out, temp0); //Row 2 |
| _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 |
| |
| ref += 2*ref_wid; |
| out+= 2*out_wid; |
| |
| src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 |
| src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 |
| temp0 = _mm_avg_epu8(src_r1, src_r2); |
| temp1 = _mm_avg_epu8(src_r2, src_r0); |
| _mm_storel_epi64((__m128i *)out, temp0); //Row 4 |
| _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 |
| |
| ref += 2*ref_wid; |
| out+= 2*out_wid; |
| |
| src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 |
| src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 |
| temp0 = _mm_avg_epu8(src_r0, src_r1); |
| temp1 = _mm_avg_epu8(src_r1, src_r2); |
| _mm_storel_epi64((__m128i *)out, temp0); //Row 6 |
| _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 |
| |
| return; |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ |
| /* */ |
| /* Description : Gets the buffer from (x,y) to (x+8,y+8) */ |
| /* and the above block of size 8 x 8 will be placed as a */ |
| /* block from the current position of out_buf */ |
| /* */ |
| /* Inputs : ref - Reference frame from which the block will be */ |
| /* block will be extracted. */ |
| /* ref_wid - WIdth of reference frame */ |
| /* out_wid - WIdth of the output frame */ |
| /* blk_width - width of the block */ |
| /* blk_width - height of the block */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : Point to the (0,0) position in the ref frame */ |
| /* Get an 8 x 8 block from reference frame */ |
| /* */ |
| /* Outputs : out - Output containing the extracted block */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /*****************************************************************************/ |
| void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, |
| UWORD8 *ref, |
| UWORD32 ref_wid, |
| UWORD32 out_wid) |
| { |
| __m128i src_r0, src_r1, src_r2, src_r3; |
| // Row 0-3 |
| src_r0 = _mm_loadl_epi64((__m128i *)ref); |
| src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); |
| _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); |
| |
| // Row 4-7 |
| ref += 4 * ref_wid; |
| out += 4 * out_wid; |
| |
| src_r0 = _mm_loadl_epi64((__m128i *)ref); |
| src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); |
| src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); |
| src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); |
| |
| _mm_storel_epi64((__m128i *)out, src_r0); |
| _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); |
| _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); |
| _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); |
| return; |
| } |