| /* ------------------------------------------------------------------ |
| * Copyright (C) 2008 PacketVideo |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
| * express or implied. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * ------------------------------------------------------------------- |
| */ |
| #include "avcenc_lib.h" |
| #include "avcenc_int.h" |
| #include "oscl_mem.h" |
| |
| |
| #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ |
| x = 0xFF & (~(x>>31));} |
| |
| /* (blkwidth << 2) + (dy << 1) + dx */ |
| static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) = |
| { |
| &eChromaFullMC_SIMD, |
| &eChromaHorizontalMC_SIMD, |
| &eChromaVerticalMC_SIMD, |
| &eChromaDiagonalMC_SIMD, |
| &eChromaFullMC_SIMD, |
| &eChromaHorizontalMC2_SIMD, |
| &eChromaVerticalMC2_SIMD, |
| &eChromaDiagonalMC2_SIMD |
| }; |
| /* Perform motion prediction and compensation with residue if exist. */ |
| void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video) |
| { |
| (void)(encvid); |
| |
| AVCMacroblock *currMB = video->currMB; |
| AVCPictureData *currPic = video->currPic; |
| int mbPartIdx, subMbPartIdx; |
| int ref_idx; |
| int offset_MbPart_indx = 0; |
| int16 *mv; |
| uint32 x_pos, y_pos; |
| uint8 *curL, *curCb, *curCr; |
| uint8 *ref_l, *ref_Cb, *ref_Cr; |
| uint8 *predBlock, *predCb, *predCr; |
| int block_x, block_y, offset_x, offset_y, offsetP, offset; |
| int x_position = (video->mb_x << 4); |
| int y_position = (video->mb_y << 4); |
| int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx; |
| int picWidth = currPic->width; |
| int picPitch = currPic->pitch; |
| int picHeight = currPic->height; |
| uint32 tmp_word; |
| |
| tmp_word = y_position * picPitch; |
| curL = currPic->Sl + tmp_word + x_position; |
| offset = (tmp_word >> 2) + (x_position >> 1); |
| curCb = currPic->Scb + offset; |
| curCr = currPic->Scr + offset; |
| |
| predBlock = curL; |
| predCb = curCb; |
| predCr = curCr; |
| |
| GetMotionVectorPredictor(video, 1); |
| |
| for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++) |
| { |
| MbHeight = currMB->SubMbPartHeight[mbPartIdx]; |
| MbWidth = currMB->SubMbPartWidth[mbPartIdx]; |
| mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1); |
| mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1; |
| ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X]; |
| offset_indx = 0; |
| |
| ref_l = video->RefPicList0[ref_idx]->Sl; |
| ref_Cb = video->RefPicList0[ref_idx]->Scb; |
| ref_Cr = video->RefPicList0[ref_idx]->Scr; |
| |
| for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++) |
| { |
| block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this MC |
| block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1); |
| mv = &currMB->mvL0[0] + (block_x << 1) + (block_y << 3); |
| offset_x = x_position + (block_x << 2); |
| offset_y = y_position + (block_y << 2); |
| x_pos = (offset_x << 2) + *mv++; /*quarter pel */ |
| y_pos = (offset_y << 2) + *mv; /*quarter pel */ |
| |
| //offset = offset_y * currPic->width; |
| //offsetC = (offset >> 2) + (offset_x >> 1); |
| offsetP = (block_y << 2) * picPitch + (block_x << 2); |
| eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos, |
| /*comp_Sl + offset + offset_x,*/ |
| predBlock + offsetP, picPitch, MbWidth, MbHeight); |
| |
| offsetP = (block_y * picWidth) + (block_x << 1); |
| eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, |
| /*comp_Scb + offsetC,*/ |
| predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); |
| eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, |
| /*comp_Scr + offsetC,*/ |
| predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); |
| |
| offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3; |
| } |
| offset_MbPart_indx = currMB->MbPartWidth >> 4; |
| } |
| |
| return ; |
| } |
| |
| |
| /* preform the actual motion comp here */ |
| void eLumaMotionComp(uint8 *ref, int picpitch, int picheight, |
| int x_pos, int y_pos, |
| uint8 *pred, int pred_pitch, |
| int blkwidth, int blkheight) |
| { |
| (void)(picheight); |
| |
| int dx, dy; |
| int temp2[21][21]; /* for intermediate results */ |
| uint8 *ref2; |
| |
| dx = x_pos & 3; |
| dy = y_pos & 3; |
| x_pos = x_pos >> 2; /* round it to full-pel resolution */ |
| y_pos = y_pos >> 2; |
| |
| /* perform actual motion compensation */ |
| if (dx == 0 && dy == 0) |
| { /* fullpel position *//* G */ |
| |
| ref += y_pos * picpitch + x_pos; |
| |
| eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight); |
| |
| } /* other positions */ |
| else if (dy == 0) |
| { /* no vertical interpolation *//* a,b,c*/ |
| |
| ref += y_pos * picpitch + x_pos; |
| |
| eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx); |
| } |
| else if (dx == 0) |
| { /*no horizontal interpolation *//* d,h,n */ |
| |
| ref += y_pos * picpitch + x_pos; |
| |
| eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy); |
| } |
| else if (dy == 2) |
| { /* horizontal cross *//* i, j, k */ |
| |
| ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */ |
| |
| eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight); |
| |
| eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); |
| } |
| else if (dx == 2) |
| { /* vertical cross */ /* f,q */ |
| |
| ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */ |
| |
| eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5); |
| eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); |
| } |
| else |
| { /* diagonal *//* e,g,p,r */ |
| |
| ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos; |
| |
| ref += (y_pos * picpitch) + x_pos + (dx / 2); |
| |
| eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight); |
| } |
| |
| return ; |
| } |
| |
| void eCreateAlign(uint8 *ref, int picpitch, int y_pos, |
| uint8 *out, int blkwidth, int blkheight) |
| { |
| int i, j; |
| int offset, out_offset; |
| uint32 prev_pix, result, pix1, pix2, pix4; |
| |
| ref += y_pos * picpitch;// + x_pos; |
| out_offset = 24 - blkwidth; |
| |
| //switch(x_pos&0x3){ |
| switch (((uint32)ref)&0x3) |
| { |
| case 1: |
| offset = picpitch - blkwidth - 3; |
| for (j = 0; j < blkheight; j++) |
| { |
| pix1 = *ref++; |
| pix2 = *((uint16*)ref); |
| ref += 2; |
| result = (pix2 << 8) | pix1; |
| |
| for (i = 3; i < blkwidth; i += 4) |
| { |
| pix4 = *((uint32*)ref); |
| ref += 4; |
| prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */ |
| result |= prev_pix; |
| *((uint32*)out) = result; /* write 4 bytes */ |
| out += 4; |
| result = pix4 >> 8; /* for the next loop */ |
| } |
| ref += offset; |
| out += out_offset; |
| } |
| break; |
| case 2: |
| offset = picpitch - blkwidth - 2; |
| for (j = 0; j < blkheight; j++) |
| { |
| result = *((uint16*)ref); |
| ref += 2; |
| for (i = 2; i < blkwidth; i += 4) |
| { |
| pix4 = *((uint32*)ref); |
| ref += 4; |
| prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */ |
| result |= prev_pix; |
| *((uint32*)out) = result; /* write 4 bytes */ |
| out += 4; |
| result = pix4 >> 16; /* for the next loop */ |
| } |
| ref += offset; |
| out += out_offset; |
| } |
| break; |
| case 3: |
| offset = picpitch - blkwidth - 1; |
| for (j = 0; j < blkheight; j++) |
| { |
| result = *ref++; |
| for (i = 1; i < blkwidth; i += 4) |
| { |
| pix4 = *((uint32*)ref); |
| ref += 4; |
| prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */ |
| result |= prev_pix; |
| *((uint32*)out) = result; /* write 4 bytes */ |
| out += 4; |
| result = pix4 >> 24; /* for the next loop */ |
| } |
| ref += offset; |
| out += out_offset; |
| } |
| break; |
| } |
| } |
| |
| void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, |
| int blkwidth, int blkheight, int dx) |
| { |
| uint8 *p_ref; |
| uint32 *p_cur; |
| uint32 tmp, pkres; |
| int result, curr_offset, ref_offset; |
| int j; |
| int32 r0, r1, r2, r3, r4, r5; |
| int32 r13, r6; |
| |
| p_cur = (uint32*)out; /* assume it's word aligned */ |
| curr_offset = (outpitch - blkwidth) >> 2; |
| p_ref = in; |
| ref_offset = inpitch - blkwidth; |
| |
| if (dx&1) |
| { |
| dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ |
| p_ref -= 2; |
| r13 = 0; |
| for (j = blkheight; j > 0; j--) |
| { |
| tmp = (uint32)(p_ref + blkwidth); |
| r0 = p_ref[0]; |
| r1 = p_ref[2]; |
| r0 |= (r1 << 16); /* 0,c,0,a */ |
| r1 = p_ref[1]; |
| r2 = p_ref[3]; |
| r1 |= (r2 << 16); /* 0,d,0,b */ |
| while ((uint32)p_ref < tmp) |
| { |
| r2 = *(p_ref += 4); /* move pointer to e */ |
| r3 = p_ref[2]; |
| r2 |= (r3 << 16); /* 0,g,0,e */ |
| r3 = p_ref[1]; |
| r4 = p_ref[3]; |
| r3 |= (r4 << 16); /* 0,h,0,f */ |
| |
| r4 = r0 + r3; /* c+h, a+f */ |
| r5 = r0 + r1; /* c+d, a+b */ |
| r6 = r2 + r3; /* g+h, e+f */ |
| r5 >>= 16; |
| r5 |= (r6 << 16); /* e+f, c+d */ |
| r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ |
| r4 += 0x100010; /* +16, +16 */ |
| r5 = r1 + r2; /* d+g, b+e */ |
| r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ |
| r4 >>= 5; |
| r13 |= r4; /* check clipping */ |
| |
| r5 = p_ref[dx+2]; |
| r6 = p_ref[dx+4]; |
| r5 |= (r6 << 16); |
| r4 += r5; |
| r4 += 0x10001; |
| r4 = (r4 >> 1) & 0xFF00FF; |
| |
| r5 = p_ref[4]; /* i */ |
| r6 = (r5 << 16); |
| r5 = r6 | (r2 >> 16);/* 0,i,0,g */ |
| r5 += r1; /* d+i, b+g */ /* r5 not free */ |
| r1 >>= 16; |
| r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ |
| r1 += r2; /* f+g, d+e */ |
| r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ |
| r0 >>= 16; |
| r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ |
| r0 += r3; /* e+h, c+f */ |
| r5 += 0x100010; /* 16,16 */ |
| r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ |
| r5 >>= 5; |
| r13 |= r5; /* check clipping */ |
| |
| r0 = p_ref[dx+3]; |
| r1 = p_ref[dx+5]; |
| r0 |= (r1 << 16); |
| r5 += r0; |
| r5 += 0x10001; |
| r5 = (r5 >> 1) & 0xFF00FF; |
| |
| r4 |= (r5 << 8); /* pack them together */ |
| *p_cur++ = r4; |
| r1 = r3; |
| r0 = r2; |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ |
| |
| if (r13&0xFF000700) /* need clipping */ |
| { |
| /* move back to the beginning of the line */ |
| p_ref -= (ref_offset + blkwidth); /* input */ |
| p_cur -= (outpitch >> 2); |
| |
| tmp = (uint32)(p_ref + blkwidth); |
| for (; (uint32)p_ref < tmp;) |
| { |
| |
| r0 = *p_ref++; |
| r1 = *p_ref++; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dx] + 1); |
| pkres = (result >> 1) ; |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dx] + 1); |
| result = (result >> 1); |
| pkres |= (result << 8); |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dx] + 1); |
| result = (result >> 1); |
| pkres |= (result << 16); |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dx] + 1); |
| result = (result >> 1); |
| pkres |= (result << 24); |
| *p_cur++ = pkres; /* write 4 pixels */ |
| p_ref -= 5; /* offset back to the middle of filter */ |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; /* move to the next line */ |
| } |
| } |
| } |
| else |
| { |
| p_ref -= 2; |
| r13 = 0; |
| for (j = blkheight; j > 0; j--) |
| { |
| tmp = (uint32)(p_ref + blkwidth); |
| r0 = p_ref[0]; |
| r1 = p_ref[2]; |
| r0 |= (r1 << 16); /* 0,c,0,a */ |
| r1 = p_ref[1]; |
| r2 = p_ref[3]; |
| r1 |= (r2 << 16); /* 0,d,0,b */ |
| while ((uint32)p_ref < tmp) |
| { |
| r2 = *(p_ref += 4); /* move pointer to e */ |
| r3 = p_ref[2]; |
| r2 |= (r3 << 16); /* 0,g,0,e */ |
| r3 = p_ref[1]; |
| r4 = p_ref[3]; |
| r3 |= (r4 << 16); /* 0,h,0,f */ |
| |
| r4 = r0 + r3; /* c+h, a+f */ |
| r5 = r0 + r1; /* c+d, a+b */ |
| r6 = r2 + r3; /* g+h, e+f */ |
| r5 >>= 16; |
| r5 |= (r6 << 16); /* e+f, c+d */ |
| r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ |
| r4 += 0x100010; /* +16, +16 */ |
| r5 = r1 + r2; /* d+g, b+e */ |
| r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ |
| r4 >>= 5; |
| r13 |= r4; /* check clipping */ |
| r4 &= 0xFF00FF; /* mask */ |
| |
| r5 = p_ref[4]; /* i */ |
| r6 = (r5 << 16); |
| r5 = r6 | (r2 >> 16);/* 0,i,0,g */ |
| r5 += r1; /* d+i, b+g */ /* r5 not free */ |
| r1 >>= 16; |
| r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ |
| r1 += r2; /* f+g, d+e */ |
| r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ |
| r0 >>= 16; |
| r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ |
| r0 += r3; /* e+h, c+f */ |
| r5 += 0x100010; /* 16,16 */ |
| r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ |
| r5 >>= 5; |
| r13 |= r5; /* check clipping */ |
| r5 &= 0xFF00FF; /* mask */ |
| |
| r4 |= (r5 << 8); /* pack them together */ |
| *p_cur++ = r4; |
| r1 = r3; |
| r0 = r2; |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ |
| |
| if (r13&0xFF000700) /* need clipping */ |
| { |
| /* move back to the beginning of the line */ |
| p_ref -= (ref_offset + blkwidth); /* input */ |
| p_cur -= (outpitch >> 2); |
| |
| tmp = (uint32)(p_ref + blkwidth); |
| for (; (uint32)p_ref < tmp;) |
| { |
| |
| r0 = *p_ref++; |
| r1 = *p_ref++; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres = result; |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 8); |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 16); |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 24); |
| *p_cur++ = pkres; /* write 4 pixels */ |
| p_ref -= 5; |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; |
| } |
| } |
| } |
| |
| return ; |
| } |
| |
| void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch, |
| int blkwidth, int blkheight, int dx) |
| { |
| int *p_ref; |
| uint32 *p_cur; |
| uint32 tmp, pkres; |
| int result, result2, curr_offset, ref_offset; |
| int j, r0, r1, r2, r3, r4, r5; |
| |
| p_cur = (uint32*)out; /* assume it's word aligned */ |
| curr_offset = (outpitch - blkwidth) >> 2; |
| p_ref = in; |
| ref_offset = inpitch - blkwidth; |
| |
| if (dx&1) |
| { |
| dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ |
| |
| for (j = blkheight; j > 0 ; j--) |
| { |
| tmp = (uint32)(p_ref + blkwidth); |
| for (; (uint32)p_ref < tmp;) |
| { |
| |
| r0 = p_ref[-2]; |
| r1 = p_ref[-1]; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dx] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| pkres = (result >> 1); |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dx] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| pkres |= (result << 8); |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dx] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| pkres |= (result << 16); |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dx] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| pkres |= (result << 24); |
| *p_cur++ = pkres; /* write 4 pixels */ |
| p_ref -= 3; /* offset back to the middle of filter */ |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; /* move to the next line */ |
| } |
| } |
| else |
| { |
| for (j = blkheight; j > 0 ; j--) |
| { |
| tmp = (uint32)(p_ref + blkwidth); |
| for (; (uint32)p_ref < tmp;) |
| { |
| |
| r0 = p_ref[-2]; |
| r1 = p_ref[-1]; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| pkres = result; |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| pkres |= (result << 8); |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| pkres |= (result << 16); |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| pkres |= (result << 24); |
| *p_cur++ = pkres; /* write 4 pixels */ |
| p_ref -= 3; /* offset back to the middle of filter */ |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; /* move to the next line */ |
| } |
| } |
| |
| return ; |
| } |
| |
| void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch, |
| int blkwidth, int blkheight) |
| { |
| uint8 *p_ref; |
| int *p_cur; |
| uint32 tmp; |
| int result, curr_offset, ref_offset; |
| int j, r0, r1, r2, r3, r4, r5; |
| |
| p_cur = out; |
| curr_offset = (outpitch - blkwidth); |
| p_ref = in; |
| ref_offset = inpitch - blkwidth; |
| |
| for (j = blkheight; j > 0 ; j--) |
| { |
| tmp = (uint32)(p_ref + blkwidth); |
| for (; (uint32)p_ref < tmp;) |
| { |
| |
| r0 = p_ref[-2]; |
| r1 = p_ref[-1]; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| *p_cur++ = result; |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| *p_cur++ = result; |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| *p_cur++ = result; |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| *p_cur++ = result; |
| p_ref -= 3; /* move back to the middle of the filter */ |
| } |
| p_cur += curr_offset; /* move to the next line */ |
| p_ref += ref_offset; |
| } |
| |
| return ; |
| } |
| void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, |
| int blkwidth, int blkheight, int dy) |
| { |
| uint8 *p_cur, *p_ref; |
| uint32 tmp; |
| int result, curr_offset, ref_offset; |
| int j, i; |
| int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13; |
| uint8 tmp_in[24][24]; |
| |
| /* not word-aligned */ |
| if (((uint32)in)&0x3) |
| { |
| eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); |
| in = &tmp_in[2][0]; |
| inpitch = 24; |
| } |
| p_cur = out; |
| curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ |
| ref_offset = blkheight * inpitch; /* for limit */ |
| |
| curr_offset += 3; |
| |
| if (dy&1) |
| { |
| dy = (dy >> 1) ? 0 : -inpitch; |
| |
| for (j = 0; j < blkwidth; j += 4, in += 4) |
| { |
| r13 = 0; |
| p_ref = in; |
| p_cur -= outpitch; /* compensate for the first offset */ |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) /* the loop un-rolled */ |
| { |
| r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ |
| p_ref += inpitch; |
| r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ |
| r0 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| |
| r0 += r1; |
| r6 += r7; |
| |
| r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 += 20 * r1; |
| r6 += 20 * r7; |
| r0 += 0x100010; |
| r6 += 0x100010; |
| |
| r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 -= 5 * r1; |
| r6 -= 5 * r7; |
| |
| r0 >>= 5; |
| r6 >>= 5; |
| /* clip */ |
| r13 |= r6; |
| r13 |= r0; |
| //CLIPPACK(r6,result) |
| |
| r1 = *((uint32*)(p_ref + dy)); |
| r2 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r0 += r1; |
| r6 += r2; |
| r0 += 0x10001; |
| r6 += 0x10001; |
| r0 = (r0 >> 1) & 0xFF00FF; |
| r6 = (r6 >> 1) & 0xFF00FF; |
| |
| r0 |= (r6 << 8); /* pack it back */ |
| *((uint32*)(p_cur += outpitch)) = r0; |
| } |
| p_cur += curr_offset; /* offset to the next pixel */ |
| if (r13 & 0xFF000700) /* this column need clipping */ |
| { |
| p_cur -= 4; |
| for (i = 0; i < 4; i++) |
| { |
| p_ref = in + i; |
| p_cur -= outpitch; /* compensate for the first offset */ |
| |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) |
| { /* loop un-rolled */ |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dy-(inpitch<<1)] + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dy-(inpitch<<1)] + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dy-(inpitch<<1)] + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| /* 3/4 pel, no need to clip */ |
| result = (result + p_ref[dy-(inpitch<<1)] + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += (curr_offset - 3); |
| } |
| } |
| } |
| } |
| else |
| { |
| for (j = 0; j < blkwidth; j += 4, in += 4) |
| { |
| r13 = 0; |
| p_ref = in; |
| p_cur -= outpitch; /* compensate for the first offset */ |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) /* the loop un-rolled */ |
| { |
| r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ |
| p_ref += inpitch; |
| r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ |
| r0 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| |
| r0 += r1; |
| r6 += r7; |
| |
| r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 += 20 * r1; |
| r6 += 20 * r7; |
| r0 += 0x100010; |
| r6 += 0x100010; |
| |
| r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 -= 5 * r1; |
| r6 -= 5 * r7; |
| |
| r0 >>= 5; |
| r6 >>= 5; |
| /* clip */ |
| r13 |= r6; |
| r13 |= r0; |
| //CLIPPACK(r6,result) |
| r0 &= 0xFF00FF; |
| r6 &= 0xFF00FF; |
| r0 |= (r6 << 8); /* pack it back */ |
| *((uint32*)(p_cur += outpitch)) = r0; |
| } |
| p_cur += curr_offset; /* offset to the next pixel */ |
| if (r13 & 0xFF000700) /* this column need clipping */ |
| { |
| p_cur -= 4; |
| for (i = 0; i < 4; i++) |
| { |
| p_ref = in + i; |
| p_cur -= outpitch; /* compensate for the first offset */ |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) |
| { /* loop un-rolled */ |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += (curr_offset - 3); |
| } |
| } |
| } |
| } |
| |
| return ; |
| } |
| |
| void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch, |
| int blkwidth, int blkheight) |
| { |
| int *p_cur; |
| uint8 *p_ref; |
| uint32 tmp; |
| int result, curr_offset, ref_offset; |
| int j, r0, r1, r2, r3, r4, r5; |
| |
| p_cur = out; |
| curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ |
| ref_offset = blkheight * inpitch; /* for limit */ |
| |
| for (j = 0; j < blkwidth; j++) |
| { |
| p_cur -= outpitch; /* compensate for the first offset */ |
| p_ref = in++; |
| |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) |
| { /* loop un-rolled */ |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += curr_offset; |
| } |
| |
| return ; |
| } |
| |
| void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch, |
| int blkwidth, int blkheight, int dy) |
| { |
| uint8 *p_cur; |
| int *p_ref; |
| uint32 tmp; |
| int result, result2, curr_offset, ref_offset; |
| int j, r0, r1, r2, r3, r4, r5; |
| |
| p_cur = out; |
| curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ |
| ref_offset = blkheight * inpitch; /* for limit */ |
| |
| if (dy&1) |
| { |
| dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch; |
| |
| for (j = 0; j < blkwidth; j++) |
| { |
| p_cur -= outpitch; /* compensate for the first offset */ |
| p_ref = in++; |
| |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) |
| { /* loop un-rolled */ |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dy] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dy] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dy] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| result2 = ((p_ref[dy] + 16) >> 5); |
| CLIP_RESULT(result2) |
| /* 3/4 pel, no need to clip */ |
| result = (result + result2 + 1); |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += curr_offset; |
| } |
| } |
| else |
| { |
| for (j = 0; j < blkwidth; j++) |
| { |
| p_cur -= outpitch; /* compensate for the first offset */ |
| p_ref = in++; |
| |
| tmp = (uint32)(p_ref + ref_offset); /* limit */ |
| while ((uint32)p_ref < tmp) |
| { /* loop un-rolled */ |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 512) >> 10; |
| CLIP_RESULT(result) |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += curr_offset; |
| } |
| } |
| |
| return ; |
| } |
| |
| void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch, |
| uint8 *out, int outpitch, |
| int blkwidth, int blkheight) |
| { |
| int j, i; |
| int result; |
| uint8 *p_cur, *p_ref, *p_tmp8; |
| int curr_offset, ref_offset; |
| uint8 tmp_res[24][24], tmp_in[24][24]; |
| uint32 *p_tmp; |
| uint32 tmp, pkres, tmp_result; |
| int32 r0, r1, r2, r3, r4, r5; |
| int32 r6, r7, r8, r9, r10, r13; |
| |
| ref_offset = inpitch - blkwidth; |
| p_ref = in1 - 2; |
| /* perform horizontal interpolation */ |
| /* not word-aligned */ |
| /* It is faster to read 1 byte at time to avoid calling CreateAlign */ |
| /* if(((uint32)p_ref)&0x3) |
| { |
| CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight); |
| p_ref = &tmp_in[0][0]; |
| ref_offset = 24-blkwidth; |
| }*/ |
| |
| p_tmp = (uint32*) & (tmp_res[0][0]); |
| for (j = blkheight; j > 0; j--) |
| { |
| r13 = 0; |
| tmp = (uint32)(p_ref + blkwidth); |
| |
| //r0 = *((uint32*)p_ref); /* d,c,b,a */ |
| //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */ |
| //r0 &= 0xFF00FF; /* 0,c,0,a */ |
| /* It is faster to read 1 byte at a time */ |
| r0 = p_ref[0]; |
| r1 = p_ref[2]; |
| r0 |= (r1 << 16); /* 0,c,0,a */ |
| r1 = p_ref[1]; |
| r2 = p_ref[3]; |
| r1 |= (r2 << 16); /* 0,d,0,b */ |
| |
| while ((uint32)p_ref < tmp) |
| { |
| //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */ |
| //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */ |
| //r2 &= 0xFF00FF; /* 0,g,0,e */ |
| /* It is faster to read 1 byte at a time */ |
| r2 = *(p_ref += 4); |
| r3 = p_ref[2]; |
| r2 |= (r3 << 16); /* 0,g,0,e */ |
| r3 = p_ref[1]; |
| r4 = p_ref[3]; |
| r3 |= (r4 << 16); /* 0,h,0,f */ |
| |
| r4 = r0 + r3; /* c+h, a+f */ |
| r5 = r0 + r1; /* c+d, a+b */ |
| r6 = r2 + r3; /* g+h, e+f */ |
| r5 >>= 16; |
| r5 |= (r6 << 16); /* e+f, c+d */ |
| r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ |
| r4 += 0x100010; /* +16, +16 */ |
| r5 = r1 + r2; /* d+g, b+e */ |
| r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ |
| r4 >>= 5; |
| r13 |= r4; /* check clipping */ |
| r4 &= 0xFF00FF; /* mask */ |
| |
| r5 = p_ref[4]; /* i */ |
| r6 = (r5 << 16); |
| r5 = r6 | (r2 >> 16);/* 0,i,0,g */ |
| r5 += r1; /* d+i, b+g */ /* r5 not free */ |
| r1 >>= 16; |
| r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ |
| r1 += r2; /* f+g, d+e */ |
| r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ |
| r0 >>= 16; |
| r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ |
| r0 += r3; /* e+h, c+f */ |
| r5 += 0x100010; /* 16,16 */ |
| r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ |
| r5 >>= 5; |
| r13 |= r5; /* check clipping */ |
| r5 &= 0xFF00FF; /* mask */ |
| |
| r4 |= (r5 << 8); /* pack them together */ |
| *p_tmp++ = r4; |
| r1 = r3; |
| r0 = r2; |
| } |
| p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ |
| p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ |
| |
| if (r13&0xFF000700) /* need clipping */ |
| { |
| /* move back to the beginning of the line */ |
| p_ref -= (ref_offset + blkwidth); /* input */ |
| p_tmp -= 6; /* intermediate output */ |
| tmp = (uint32)(p_ref + blkwidth); |
| while ((uint32)p_ref < tmp) |
| { |
| r0 = *p_ref++; |
| r1 = *p_ref++; |
| r2 = *p_ref++; |
| r3 = *p_ref++; |
| r4 = *p_ref++; |
| /* first pixel */ |
| r5 = *p_ref++; |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres = result; |
| /* second pixel */ |
| r0 = *p_ref++; |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 8); |
| /* third pixel */ |
| r1 = *p_ref++; |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 16); |
| /* fourth pixel */ |
| r2 = *p_ref++; |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| pkres |= (result << 24); |
| |
| *p_tmp++ = pkres; /* write 4 pixel */ |
| p_ref -= 5; |
| } |
| p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ |
| p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ |
| } |
| } |
| |
| /* perform vertical interpolation */ |
| /* not word-aligned */ |
| if (((uint32)in2)&0x3) |
| { |
| eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); |
| in2 = &tmp_in[2][0]; |
| inpitch = 24; |
| } |
| |
| p_cur = out; |
| curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */ |
| pkres = blkheight * inpitch; /* reuse it for limit */ |
| |
| curr_offset += 3; |
| |
| for (j = 0; j < blkwidth; j += 4, in2 += 4) |
| { |
| r13 = 0; |
| p_ref = in2; |
| p_tmp8 = &(tmp_res[0][j]); /* intermediate result */ |
| p_tmp8 -= 24; /* compensate for the first offset */ |
| p_cur -= outpitch; /* compensate for the first offset */ |
| tmp = (uint32)(p_ref + pkres); /* limit */ |
| while ((uint32)p_ref < tmp) /* the loop un-rolled */ |
| { |
| /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */ |
| /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2]; |
| r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3]; |
| r6 |= (r1<<16); p_ref+=inpitch; */ |
| r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ |
| p_ref += inpitch; |
| r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ |
| r0 &= 0xFF00FF; |
| |
| /*p_ref8 = p_ref+(inpitch<<1); |
| r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16); |
| r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/ |
| r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| |
| r0 += r1; |
| r6 += r7; |
| |
| /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16); |
| r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/ |
| r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; |
| r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; |
| r2 = p_ref8[3]; r7 |= (r2<<16);*/ |
| r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 += 20 * r1; |
| r6 += 20 * r7; |
| r0 += 0x100010; |
| r6 += 0x100010; |
| |
| /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2]; |
| r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/ |
| r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ |
| r8 = (r2 >> 8) & 0xFF00FF; |
| r2 &= 0xFF00FF; |
| |
| /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; |
| r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; |
| r2 = p_ref8[3]; r7 |= (r2<<16);*/ |
| r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ |
| r7 = (r1 >> 8) & 0xFF00FF; |
| r1 &= 0xFF00FF; |
| r1 += r2; |
| |
| r7 += r8; |
| |
| r0 -= 5 * r1; |
| r6 -= 5 * r7; |
| |
| r0 >>= 5; |
| r6 >>= 5; |
| /* clip */ |
| r13 |= r6; |
| r13 |= r0; |
| //CLIPPACK(r6,result) |
| /* add with horizontal results */ |
| r10 = *((uint32*)(p_tmp8 += 24)); |
| r9 = (r10 >> 8) & 0xFF00FF; |
| r10 &= 0xFF00FF; |
| |
| r0 += r10; |
| r0 += 0x10001; |
| r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */ |
| |
| r6 += r9; |
| r6 += 0x10001; |
| r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */ |
| |
| r0 |= (r6 << 8); /* pack it back */ |
| *((uint32*)(p_cur += outpitch)) = r0; |
| } |
| p_cur += curr_offset; /* offset to the next pixel */ |
| if (r13 & 0xFF000700) /* this column need clipping */ |
| { |
| p_cur -= 4; |
| for (i = 0; i < 4; i++) |
| { |
| p_ref = in2 + i; |
| p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */ |
| p_tmp8 -= 24; /* compensate for the first offset */ |
| p_cur -= outpitch; /* compensate for the first offset */ |
| tmp = (uint32)(p_ref + pkres); /* limit */ |
| while ((uint32)p_ref < tmp) /* the loop un-rolled */ |
| { |
| r0 = *(p_ref - (inpitch << 1)); |
| r1 = *(p_ref - inpitch); |
| r2 = *p_ref; |
| r3 = *(p_ref += inpitch); /* modify pointer before loading */ |
| r4 = *(p_ref += inpitch); |
| /* first pixel */ |
| r5 = *(p_ref += inpitch); |
| result = (r0 + r5); |
| r0 = (r1 + r4); |
| result -= (r0 * 5);//result -= r0; result -= (r0<<2); |
| r0 = (r2 + r3); |
| result += (r0 * 20);//result += (r0<<4); result += (r0<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */ |
| result = (result + tmp_result + 1); /* no clip */ |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* second pixel */ |
| r0 = *(p_ref += inpitch); |
| result = (r1 + r0); |
| r1 = (r2 + r5); |
| result -= (r1 * 5);//result -= r1; result -= (r1<<2); |
| r1 = (r3 + r4); |
| result += (r1 * 20);//result += (r1<<4); result += (r1<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| tmp_result = *(p_tmp8 += 24); /* intermediate result */ |
| result = (result + tmp_result + 1); /* no clip */ |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* third pixel */ |
| r1 = *(p_ref += inpitch); |
| result = (r2 + r1); |
| r2 = (r3 + r0); |
| result -= (r2 * 5);//result -= r2; result -= (r2<<2); |
| r2 = (r4 + r5); |
| result += (r2 * 20);//result += (r2<<4); result += (r2<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| tmp_result = *(p_tmp8 += 24); /* intermediate result */ |
| result = (result + tmp_result + 1); /* no clip */ |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| /* fourth pixel */ |
| r2 = *(p_ref += inpitch); |
| result = (r3 + r2); |
| r3 = (r4 + r1); |
| result -= (r3 * 5);//result -= r3; result -= (r3<<2); |
| r3 = (r5 + r0); |
| result += (r3 * 20);//result += (r3<<4); result += (r3<<2); |
| result = (result + 16) >> 5; |
| CLIP_RESULT(result) |
| tmp_result = *(p_tmp8 += 24); /* intermediate result */ |
| result = (result + tmp_result + 1); /* no clip */ |
| result = (result >> 1); |
| *(p_cur += outpitch) = result; |
| p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ |
| } |
| p_cur += (curr_offset - 3); |
| } |
| } |
| } |
| |
| return ; |
| } |
| |
| /* position G */ |
| void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch, |
| int blkwidth, int blkheight) |
| { |
| int i, j; |
| int offset_in = inpitch - blkwidth; |
| int offset_out = outpitch - blkwidth; |
| uint32 temp; |
| uint8 byte; |
| |
| if (((uint32)in)&3) |
| { |
| for (j = blkheight; j > 0; j--) |
| { |
| for (i = blkwidth; i > 0; i -= 4) |
| { |
| temp = *in++; |
| byte = *in++; |
| temp |= (byte << 8); |
| byte = *in++; |
| temp |= (byte << 16); |
| byte = *in++; |
| temp |= (byte << 24); |
| |
| *((uint32*)out) = temp; /* write 4 bytes */ |
| out += 4; |
| } |
| out += offset_out; |
| in += offset_in; |
| } |
| } |
| else |
| { |
| for (j = blkheight; j > 0; j--) |
| { |
| for (i = blkwidth; i > 0; i -= 4) |
| { |
| temp = *((uint32*)in); |
| *((uint32*)out) = temp; |
| in += 4; |
| out += 4; |
| } |
| out += offset_out; |
| in += offset_in; |
| } |
| } |
| return ; |
| } |
| |
| void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos) |
| { |
| int pad_height; |
| int pad_width; |
| uint8 *start; |
| uint32 word1, word2, word3; |
| int offset, j; |
| |
| |
| pad_height = 8 + ((y_pos & 7) ? 1 : 0); |
| pad_width = 8 + ((x_pos & 7) ? 1 : 0); |
| |
| y_pos >>= 3; |
| x_pos >>= 3; |
| // pad vertical first |
| if (y_pos < 0) // need to pad up |
| { |
| if (x_pos < -8) start = ref - 8; |
| else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width; |
| else start = ref + x_pos; |
| |
| /* word-align start */ |
| offset = (uint32)start & 0x3; |
| if (offset) start -= offset; |
| |
| word1 = *((uint32*)start); |
| word2 = *((uint32*)(start + 4)); |
| word3 = *((uint32*)(start + 8)); |
| |
| /* pad up N rows */ |
| j = -y_pos; |
| if (j > 8) j = 8; |
| while (j--) |
| { |
| *((uint32*)(start -= picpitch)) = word1; |
| *((uint32*)(start + 4)) = word2; |
| *((uint32*)(start + 8)) = word3; |
| } |
| |
| } |
| else if (y_pos + pad_height >= picheight) /* pad down */ |
| { |
| if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8; |
| else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) + |
| picwidth + 7 - pad_width; |
| else start = ref + picpitch * (picheight - 1) + x_pos; |
| |
| /* word-align start */ |
| offset = (uint32)start & 0x3; |
| if (offset) start -= offset; |
| |
| word1 = *((uint32*)start); |
| word2 = *((uint32*)(start + 4)); |
| word3 = *((uint32*)(start + 8)); |
| |
| /* pad down N rows */ |
| j = y_pos + pad_height - picheight; |
| if (j > 8) j = 8; |
| while (j--) |
| { |
| *((uint32*)(start += picpitch)) = word1; |
| *((uint32*)(start + 4)) = word2; |
| *((uint32*)(start + 8)) = word3; |
| } |
| } |
| |
| /* now pad horizontal */ |
| if (x_pos < 0) // pad left |
| { |
| if (y_pos < -8) start = ref - (picpitch << 3); |
| else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch; |
| else start = ref + y_pos * picpitch; |
| |
| // now pad left 8 pixels for pad_height rows */ |
| j = pad_height; |
| start -= picpitch; |
| while (j--) |
| { |
| word1 = *(start += picpitch); |
| word1 |= (word1 << 8); |
| word1 |= (word1 << 16); |
| *((uint32*)(start - 8)) = word1; |
| *((uint32*)(start - 4)) = word1; |
| } |
| } |
| else if (x_pos + pad_width >= picwidth) /* pad right */ |
| { |
| if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1; |
| else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1; |
| else start = ref + y_pos * picpitch + picwidth - 1; |
| |
| // now pad right 8 pixels for pad_height rows */ |
| j = pad_height; |
| start -= picpitch; |
| while (j--) |
| { |
| word1 = *(start += picpitch); |
| word1 |= (word1 << 8); |
| word1 |= (word1 << 16); |
| *((uint32*)(start + 1)) = word1; |
| *((uint32*)(start + 5)) = word1; |
| } |
| } |
| |
| return ; |
| } |
| |
| |
| void eChromaMotionComp(uint8 *ref, int picwidth, int picheight, |
| int x_pos, int y_pos, |
| uint8 *pred, int picpitch, |
| int blkwidth, int blkheight) |
| { |
| int dx, dy; |
| int offset_dx, offset_dy; |
| int index; |
| |
| ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos); |
| |
| dx = x_pos & 7; |
| dy = y_pos & 7; |
| offset_dx = (dx + 7) >> 3; |
| offset_dy = (dy + 7) >> 3; |
| x_pos = x_pos >> 3; /* round it to full-pel resolution */ |
| y_pos = y_pos >> 3; |
| |
| ref += y_pos * picpitch + x_pos; |
| |
| index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7); |
| |
| (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight); |
| return ; |
| } |
| |
| |
| /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) MC */ |
| void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| int32 r0, r1, r2, r3, result0, result1; |
| uint8 temp[288]; |
| uint8 *ref, *out; |
| int i, j; |
| int dx_8 = 8 - dx; |
| int dy_8 = 8 - dy; |
| |
| /* horizontal first */ |
| out = temp; |
| for (i = 0; i < blkheight + 1; i++) |
| { |
| ref = pRef; |
| r0 = ref[0]; |
| for (j = 0; j < blkwidth; j += 4) |
| { |
| r0 |= (ref[2] << 16); |
| result0 = dx_8 * r0; |
| |
| r1 = ref[1] | (ref[3] << 16); |
| result0 += dx * r1; |
| *(int32 *)out = result0; |
| |
| result0 = dx_8 * r1; |
| |
| r2 = ref[4]; |
| r0 = r0 >> 16; |
| r1 = r0 | (r2 << 16); |
| result0 += dx * r1; |
| *(int32 *)(out + 16) = result0; |
| |
| ref += 4; |
| out += 4; |
| r0 = r2; |
| } |
| pRef += srcPitch; |
| out += (32 - blkwidth); |
| } |
| |
| // pRef -= srcPitch*(blkheight+1); |
| ref = temp; |
| |
| for (j = 0; j < blkwidth; j += 4) |
| { |
| r0 = *(int32 *)ref; |
| r1 = *(int32 *)(ref + 16); |
| ref += 32; |
| out = pOut; |
| for (i = 0; i < (blkheight >> 1); i++) |
| { |
| result0 = dy_8 * r0 + 0x00200020; |
| r2 = *(int32 *)ref; |
| result0 += dy * r2; |
| result0 >>= 6; |
| result0 &= 0x00FF00FF; |
| r0 = r2; |
| |
| result1 = dy_8 * r1 + 0x00200020; |
| r3 = *(int32 *)(ref + 16); |
| result1 += dy * r3; |
| result1 >>= 6; |
| result1 &= 0x00FF00FF; |
| r1 = r3; |
| *(int32 *)out = result0 | (result1 << 8); |
| out += predPitch; |
| ref += 32; |
| |
| result0 = dy_8 * r0 + 0x00200020; |
| r2 = *(int32 *)ref; |
| result0 += dy * r2; |
| result0 >>= 6; |
| result0 &= 0x00FF00FF; |
| r0 = r2; |
| |
| result1 = dy_8 * r1 + 0x00200020; |
| r3 = *(int32 *)(ref + 16); |
| result1 += dy * r3; |
| result1 >>= 6; |
| result1 &= 0x00FF00FF; |
| r1 = r3; |
| *(int32 *)out = result0 | (result1 << 8); |
| out += predPitch; |
| ref += 32; |
| } |
| pOut += 4; |
| ref = temp + 4; /* since it can only iterate twice max MC */ |
| } |
| return; |
| } |
| |
| void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dy); |
| |
| int32 r0, r1, r2, result0, result1; |
| uint8 *ref, *out; |
| int i, j; |
| int dx_8 = 8 - dx; |
| |
| /* horizontal first */ |
| for (i = 0; i < blkheight; i++) |
| { |
| ref = pRef; |
| out = pOut; |
| |
| r0 = ref[0]; |
| for (j = 0; j < blkwidth; j += 4) |
| { |
| r0 |= (ref[2] << 16); |
| result0 = dx_8 * r0 + 0x00040004; |
| |
| r1 = ref[1] | (ref[3] << 16); |
| result0 += dx * r1; |
| result0 >>= 3; |
| result0 &= 0x00FF00FF; |
| |
| result1 = dx_8 * r1 + 0x00040004; |
| |
| r2 = ref[4]; |
| r0 = r0 >> 16; |
| r1 = r0 | (r2 << 16); |
| result1 += dx * r1; |
| result1 >>= 3; |
| result1 &= 0x00FF00FF; |
| |
| *(int32 *)out = result0 | (result1 << 8); |
| |
| ref += 4; |
| out += 4; |
| r0 = r2; |
| } |
| |
| pRef += srcPitch; |
| pOut += predPitch; |
| } |
| return; |
| } |
| |
| void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dx); |
| |
| int32 r0, r1, r2, r3, result0, result1; |
| int i, j; |
| uint8 *ref, *out; |
| int dy_8 = 8 - dy; |
| /* vertical first */ |
| for (i = 0; i < blkwidth; i += 4) |
| { |
| ref = pRef; |
| out = pOut; |
| |
| r0 = ref[0] | (ref[2] << 16); |
| r1 = ref[1] | (ref[3] << 16); |
| ref += srcPitch; |
| for (j = 0; j < blkheight; j++) |
| { |
| result0 = dy_8 * r0 + 0x00040004; |
| r2 = ref[0] | (ref[2] << 16); |
| result0 += dy * r2; |
| result0 >>= 3; |
| result0 &= 0x00FF00FF; |
| r0 = r2; |
| |
| result1 = dy_8 * r1 + 0x00040004; |
| r3 = ref[1] | (ref[3] << 16); |
| result1 += dy * r3; |
| result1 >>= 3; |
| result1 &= 0x00FF00FF; |
| r1 = r3; |
| *(int32 *)out = result0 | (result1 << 8); |
| ref += srcPitch; |
| out += predPitch; |
| } |
| pOut += 4; |
| pRef += 4; |
| } |
| return; |
| } |
| |
| void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(blkwidth); |
| |
| int32 r0, r1, temp0, temp1, result; |
| int32 temp[9]; |
| int32 *out; |
| int i, r_temp; |
| int dy_8 = 8 - dy; |
| |
| /* horizontal first */ |
| out = temp; |
| for (i = 0; i < blkheight + 1; i++) |
| { |
| r_temp = pRef[1]; |
| temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]); |
| temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp); |
| r0 = temp0 | (temp1 << 16); |
| *out++ = r0; |
| pRef += srcPitch; |
| } |
| |
| pRef -= srcPitch * (blkheight + 1); |
| |
| out = temp; |
| |
| r0 = *out++; |
| |
| for (i = 0; i < blkheight; i++) |
| { |
| result = dy_8 * r0 + 0x00200020; |
| r1 = *out++; |
| result += dy * r1; |
| result >>= 6; |
| result &= 0x00FF00FF; |
| *(int16 *)pOut = (result >> 8) | (result & 0xFF); |
| r0 = r1; |
| pOut += predPitch; |
| } |
| return; |
| } |
| |
| void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dy); |
| (void)(blkwidth); |
| |
| int i, temp, temp0, temp1; |
| |
| /* horizontal first */ |
| for (i = 0; i < blkheight; i++) |
| { |
| temp = pRef[1]; |
| temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3; |
| temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3; |
| |
| *(int16 *)pOut = temp0 | (temp1 << 8); |
| pRef += srcPitch; |
| pOut += predPitch; |
| |
| } |
| return; |
| } |
| #if 1 |
| void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dx); |
| (void)(blkwidth); |
| |
| int32 r0, r1, result; |
| int i; |
| int dy_8 = 8 - dy; |
| r0 = pRef[0] | (pRef[1] << 16); |
| pRef += srcPitch; |
| for (i = 0; i < blkheight; i++) |
| { |
| result = dy_8 * r0 + 0x00040004; |
| r1 = pRef[0] | (pRef[1] << 16); |
| result += dy * r1; |
| result >>= 3; |
| result &= 0x00FF00FF; |
| *(int16 *)pOut = (result >> 8) | (result & 0xFF); |
| r0 = r1; |
| pRef += srcPitch; |
| pOut += predPitch; |
| } |
| return; |
| } |
| #else |
| void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dx); |
| (void)(dy); |
| |
| int32 r0, r1, result; |
| int i, temp0, temp1; |
| |
| temp0 = pRef[0]; |
| temp1 = pRef[1]; |
| pRef += srcPitch; |
| for (i = 0; i < blkheight; i++) |
| { |
| temp2 = pRef[0]; |
| temp3 = pRef[1]; |
| |
| result0 = ((temp0 << 3) + dy * (temp2 - temp0) + 4); |
| result0 >>= 3; |
| result1 = ((temp1 << 3) + dy * (temp3 - temp1) + 4); |
| result1 >>= 3; |
| *(int32 *)pOut = result0 | (result1 << 8); |
| temp0 = temp2; |
| temp1 = temp3; |
| pRef += srcPitch; |
| pOut += predPitch; |
| } |
| return; |
| } |
| #endif |
| |
| void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, |
| uint8 *pOut, int predPitch, int blkwidth, int blkheight) |
| { |
| (void)(dx); |
| (void)(dy); |
| |
| int i, j; |
| int offset_in = srcPitch - blkwidth; |
| int offset_out = predPitch - blkwidth; |
| uint16 temp; |
| uint8 byte; |
| |
| if (((uint32)pRef)&1) |
| { |
| for (j = blkheight; j > 0; j--) |
| { |
| for (i = blkwidth; i > 0; i -= 2) |
| { |
| temp = *pRef++; |
| byte = *pRef++; |
| temp |= (byte << 8); |
| *((uint16*)pOut) = temp; /* write 2 bytes */ |
| pOut += 2; |
| } |
| pOut += offset_out; |
| pRef += offset_in; |
| } |
| } |
| else |
| { |
| for (j = blkheight; j > 0; j--) |
| { |
| for (i = blkwidth; i > 0; i -= 2) |
| { |
| temp = *((uint16*)pRef); |
| *((uint16*)pOut) = temp; |
| pRef += 2; |
| pOut += 2; |
| } |
| pOut += offset_out; |
| pRef += offset_in; |
| } |
| } |
| return ; |
| } |