blob: 0aa327c2ba26206cd17f149761758f411b0b3329 [file] [log] [blame]
/*
* Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
#if !defined(JAVA2D_NO_MLIB) || defined(MLIB_ADD_SUFF)
#include <vis_proto.h>
#include "java2d_Mlib.h"
#include "vis_AlphaMacros.h"
/***************************************************************/
/*
 * 256-entry table of mlib_d64 values defined in another translation unit.
 * In this file it is indexed by (srcA + dstA) and multiplied by a scale
 * factor in place of a division -- presumably entry i holds 1.0/i;
 * TODO confirm against the table's definition.
 */
extern mlib_d64 vis_d64_div_tbl[256];
/***************************************************************/
/*
 * Scalar luminance: weights the three channel values with fixed integer
 * coefficients and discards the low 8 bits of the sum.  The coefficients add
 * up to 65793, so the input (255, 255, 255) yields 0xFFFF.
 */
#define RGB2GRAY(r, g, b)                                       \
    ((19672 * (r) + 38621 * (g) + 7500 * (b)) >> 8)
/***************************************************************/
/*
 * Packed constants for the VIS gray conversion.
 *   [0..2]  the red, green and blue coefficients of RGB2GRAY, halved and
 *           replicated into both 16-bit halves of a 32-bit word; RGB_VARS
 *           reinterprets these words as mlib_f32 operands.
 *   [3]     the bias -(1 << 22), which RGB_VARS duplicates into d_half.
 */
#define RGB_W2(w)   (((w) / 2) | (((w) / 2) << 16))
static const mlib_s32 RGB_weight[] = {
    RGB_W2(19672),
    RGB_W2(38621),
    RGB_W2(7500),
    -(1 << 22)
};
#undef RGB_W2
/***************************************************************/
/*
 * Declares the working variables shared by the gray kernels in this file and
 * then programs the GSR.  The expansion is a list of declarations followed by
 * a vis_write_gsr() call (the caller supplies the final ';'), so it is placed
 * after the caller's own declarations.
 *
 *   r, g, b, ar, gb, s02, s13  64-bit scratch values for channel data
 *   ff                         packed result written by GRAY_U16
 *   alpha, beta, gamma         RGB_weight[0], [1], [2] viewed as mlib_f32
 *   fzeros                     all-zero mlib_f32
 *   d_half                     RGB_weight[3] duplicated by vis_to_double_dup
 *   mask8000                   0x80008000, XORed onto GRAY_U16 results
 */
#define RGB_VARS                                                \
    mlib_d64 r, g, b;                                           \
    mlib_d64 ar, gb, s02, s13;                                  \
    mlib_f32 ff;                                                \
    mlib_f32 alpha    = ((mlib_f32*)RGB_weight)[0];             \
    mlib_f32 beta     = ((mlib_f32*)RGB_weight)[1];             \
    mlib_f32 gamma    = ((mlib_f32*)RGB_weight)[2];             \
    mlib_f32 fzeros   = vis_fzeros();                           \
    mlib_d64 d_half   = vis_to_double_dup(RGB_weight[3]);       \
    mlib_f32 mask8000 = vis_to_float(0x80008000);               \
                                                                \
    vis_write_gsr(((16 - 7) << 3) | 6)
/***************************************************************/
/*
 * Computes packed gray values from packed channel operands.
 *
 * r, g and b are multiplied by alpha, beta and gamma with vis_fmuld8ulx16,
 * the three products and d_half are summed with vis_fpadd32, the sum is
 * packed with vis_fpackfix and the result is XORed with mask8000 before
 * being left in ff.  alpha, beta, gamma, d_half and mask8000 are taken from
 * the enclosing scope (see RGB_VARS).  The pack step depends on the GSR
 * value in effect when the macro runs.
 */
#define GRAY_U16(ff, r, g, b)                                   \
    do {                                                        \
        mlib_d64 lum_r = vis_fmuld8ulx16(r, alpha);             \
        mlib_d64 lum_g = vis_fmuld8ulx16(g, beta);              \
        mlib_d64 lum_b = vis_fmuld8ulx16(b, gamma);             \
        lum_r = vis_fpadd32(lum_r, lum_g);                      \
        lum_b = vis_fpadd32(lum_b, d_half);                     \
        lum_r = vis_fpadd32(lum_r, lum_b);                      \
        ff = vis_fpackfix(lum_r);                               \
        ff = vis_fxors(ff, mask8000);                           \
    } while (0)
/***************************************************************/
/*
 * Feeds one B, G and R byte, read from src[ind], src[ind + 1] and
 * src[ind + 2], into the accumulators b, g and r through vis_faligndata.
 * How far each call shifts the accumulator depends on the GSR alignment
 * field (RGB_VARS writes 6 there -- presumably a 16-bit shift per call;
 * TODO confirm).  Uses the caller's src, b, g and r; expands to three
 * statements with no trailing semicolon.
 */
#define LOAD_BGR(ind)                                           \
    b = vis_faligndata(vis_ld_u8(src + (ind)),     b);          \
    g = vis_faligndata(vis_ld_u8(src + (ind) + 1), g);          \
    r = vis_faligndata(vis_ld_u8(src + (ind) + 2), r)
/***************************************************************/
/*
 * IntArgb -> UshortGray conversion blit.
 *
 * For every 4-byte source pixel the bytes at offsets 1, 2 and 3 are used as
 * red, green and blue (the byte at offset 0 is not read) and GRAY_U16 turns
 * them into one 16-bit destination value.
 *
 * The names srcBase, dstBase, width, height, pSrcInfo and pDstInfo are
 * expected to come from BLIT_PARAMS, which is declared elsewhere.
 */
void ADD_SUFF(IntArgbToUshortGrayConvert)(BLIT_PARAMS)
{
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 row;
    RGB_VARS;

    /* No padding between rows on either side: handle it as one long row. */
    if (dstScan == 2*width && srcScan == 4*width) {
        width *= height;
        height = 1;
    }

    for (row = 0; row < height; row++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end = dst + width;

        /* Head: one pixel at a time until dst is 4-byte aligned. */
        for (; ((mlib_s32)dst & 3) && dst < dst_end; dst++, src++) {
            r = vis_ld_u8((mlib_u8*)src + 1);
            g = vis_ld_u8((mlib_u8*)src + 2);
            b = vis_ld_u8((mlib_u8*)src + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
        }

        /* Body: two pixels per pass, written through an mlib_f32 store. */
#pragma pipeloop(0)
        for (; (dst_end - dst) >= 2; dst += 2, src += 2) {
            s02 = vis_fpmerge(src[0], src[1]);
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
            *(mlib_f32*)dst = ff;
        }

        /* Tail: whatever single pixel is left. */
        for (; dst < dst_end; dst++, src++) {
            r = vis_ld_u8((mlib_u8*)src + 1);
            g = vis_ld_u8((mlib_u8*)src + 2);
            b = vis_ld_u8((mlib_u8*)src + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
/***************************************************************/
/*
 * ThreeByteBgr -> UshortGray conversion blit.
 *
 * Each source pixel is three bytes read in the order blue, green, red;
 * GRAY_U16 turns them into one 16-bit destination value.
 *
 * The names srcBase, dstBase, width, height, pSrcInfo and pDstInfo are
 * expected to come from BLIT_PARAMS, which is declared elsewhere.
 */
void ADD_SUFF(ThreeByteBgrToUshortGrayConvert)(BLIT_PARAMS)
{
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 row;
    RGB_VARS;

    /* No padding between rows on either side: handle it as one long row. */
    if (dstScan == 2*width && srcScan == 3*width) {
        width *= height;
        height = 1;
    }

    for (row = 0; row < height; row++) {
        mlib_u8  *src = srcBase;    /* name is fixed: LOAD_BGR refers to it */
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end = dst + width;

        /* Head: one pixel at a time until dst is 4-byte aligned. */
        for (; ((mlib_s32)dst & 3) && dst < dst_end; dst++, src += 3) {
            b = vis_ld_u8(src);
            g = vis_ld_u8(src + 1);
            r = vis_ld_u8(src + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
        }

        /* Body: two pixels per pass; the second pixel is loaded first. */
#pragma pipeloop(0)
        for (; (dst_end - dst) >= 2; dst += 2, src += 3*2) {
            LOAD_BGR(3);
            LOAD_BGR(0);
            GRAY_U16(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
            *(mlib_f32*)dst = ff;
        }

        /* Tail: whatever single pixel is left. */
        for (; dst < dst_end; dst++, src += 3) {
            b = vis_ld_u8(src);
            g = vis_ld_u8(src + 1);
            r = vis_ld_u8(src + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
/***************************************************************/
void ADD_SUFF(IntArgbToUshortGrayScaleConvert)(SCALE_PARAMS)
{
mlib_s32 dstScan = pDstInfo->scanStride;
mlib_s32 srcScan = pSrcInfo->scanStride;
mlib_u16 *dst_end;
mlib_s32 i, j;
RGB_VARS;
for (j = 0; j < height; j++) {
mlib_f32 *src = srcBase;
mlib_u16 *dst = dstBase;
mlib_s32 tmpsxloc = sxloc;
PTR_ADD(src, (syloc >> shift) * srcScan);
dst_end = dst + width;
while (((mlib_s32)dst & 3) && dst < dst_end) {
i = tmpsxloc >> shift;
tmpsxloc += sxinc;
r = vis_ld_u8((mlib_u8*)(src + i) + 1);
g = vis_ld_u8((mlib_u8*)(src + i) + 2);
b = vis_ld_u8((mlib_u8*)(src + i) + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
dst++;
}
#pragma pipeloop(0)
for (; dst <= (dst_end - 2); dst += 2) {
s02 = vis_fpmerge(src[(tmpsxloc ) >> shift],
src[(tmpsxloc + sxinc) >> shift]);
tmpsxloc += 2*sxinc;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
*(mlib_f32*)dst = ff;
}
while (dst < dst_end) {
i = tmpsxloc >> shift;
tmpsxloc += sxinc;
r = vis_ld_u8((mlib_u8*)(src + i) + 1);
g = vis_ld_u8((mlib_u8*)(src + i) + 2);
b = vis_ld_u8((mlib_u8*)(src + i) + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
dst++;
}
PTR_ADD(dstBase, dstScan);
syloc += syinc;
}
}
/***************************************************************/
void ADD_SUFF(ThreeByteBgrToUshortGrayScaleConvert)(SCALE_PARAMS)
{
mlib_s32 dstScan = pDstInfo->scanStride;
mlib_s32 srcScan = pSrcInfo->scanStride;
mlib_u16 *dst_end;
mlib_s32 j, i0, i1;
RGB_VARS;
for (j = 0; j < height; j++) {
mlib_u8 *src = srcBase;
mlib_u16 *dst = dstBase;
mlib_s32 tmpsxloc = sxloc;
PTR_ADD(src, (syloc >> shift) * srcScan);
dst_end = dst + width;
while (((mlib_s32)dst & 3) && dst < dst_end) {
i0 = 3*(tmpsxloc >> shift);
tmpsxloc += sxinc;
b = vis_ld_u8(src + i0);
g = vis_ld_u8(src + i0 + 1);
r = vis_ld_u8(src + i0 + 2);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
dst++;
}
#pragma pipeloop(0)
for (; dst <= (dst_end - 2); dst += 2) {
i0 = 3*(tmpsxloc >> shift);
tmpsxloc += sxinc;
i1 = 3*(tmpsxloc >> shift);
tmpsxloc += sxinc;
LOAD_BGR(i1);
LOAD_BGR(i0);
GRAY_U16(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
*(mlib_f32*)dst = ff;
}
while (dst < dst_end) {
i0 = 3*(tmpsxloc >> shift);
tmpsxloc += sxinc;
b = vis_ld_u8(src + i0);
g = vis_ld_u8(src + i0 + 1);
r = vis_ld_u8(src + i0 + 2);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
dst++;
}
PTR_ADD(dstBase, dstScan);
syloc += syinc;
}
}
/***************************************************************/
/*
 * Disabled code: everything from this "#if 0" to its matching "#endif" is
 * removed by the preprocessor and is never compiled.
 */
#if 0
/*
 * IntArgbBm -> UshortGray "transparent over" blit (disabled).
 *
 * A destination pixel is written only when the first byte of the
 * corresponding source pixel is non-zero; other destination pixels are left
 * untouched.  Rows narrower than 8 pixels take a plain C path that uses
 * RGB2GRAY; wider rows take the VIS path below.
 */
void ADD_SUFF(IntArgbBmToUshortGrayXparOver)(BLIT_PARAMS)
{
mlib_s32 dstScan = pDstInfo->scanStride;
mlib_s32 srcScan = pSrcInfo->scanStride;
mlib_d64 dzero = vis_fzero();
mlib_f32 f0, f1;
mlib_s32 i, j, mask0, mask1;
RGB_VARS;
/* Narrow rows: scalar loop. */
if (width < 8) {
for (j = 0; j < height; j++) {
mlib_u8 *src = srcBase;
mlib_u16 *dst = dstBase;
for (i = 0; i < width; i++) {
/* byte 0 of the pixel decides whether the pixel is drawn */
if (src[4*i]) {
dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
}
}
PTR_ADD(dstBase, dstScan);
PTR_ADD(srcBase, srcScan);
}
return;
}
for (j = 0; j < height; j++) {
mlib_f32 *src = srcBase;
mlib_u16 *dst = dstBase;
mlib_u16 *dst_end;
dst_end = dst + width;
/* Head: single pixels until dst is 8-byte aligned. */
while (((mlib_s32)dst & 7) && dst < dst_end) {
if (*(mlib_u8*)src) {
r = vis_ld_u8((mlib_u8*)src + 1);
g = vis_ld_u8((mlib_u8*)src + 2);
b = vis_ld_u8((mlib_u8*)src + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
}
dst++;
src++;
}
/* Body: four pixels per pass, computed as two pairs (f0, f1). */
#pragma pipeloop(0)
for (; dst <= (dst_end - 4); dst += 4) {
s02 = vis_fpmerge(src[0], src[1]);
src += 2;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
/* keep the two mask bits that belong to the first pair */
mask0 = vis_fcmpne16(ar, dzero) & 0xC;
GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
s02 = vis_fpmerge(src[0], src[1]);
src += 2;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
/* move the second pair's two mask bits into the low positions */
mask1 = vis_fcmpne16(ar, dzero) >> 2;
GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
/* partial store: only the lanes selected by the mask are written */
vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
}
/* Tail: remaining single pixels. */
while (dst < dst_end) {
if (*(mlib_u8*)src) {
r = vis_ld_u8((mlib_u8*)src + 1);
g = vis_ld_u8((mlib_u8*)src + 2);
b = vis_ld_u8((mlib_u8*)src + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
}
dst++;
src++;
}
PTR_ADD(dstBase, dstScan);
PTR_ADD(srcBase, srcScan);
}
}
/***************************************************************/
/*
 * IntArgbBm -> UshortGray "transparent with background" copy (disabled:
 * this function sits inside an "#if 0" region and is never compiled).
 *
 * Where the first byte of a source pixel is non-zero the destination gets
 * the gray value of that pixel; everywhere else it gets bgpixel.  Rows
 * narrower than 8 pixels take a plain C path that uses RGB2GRAY; wider rows
 * take the VIS path below.
 */
void ADD_SUFF(IntArgbBmToUshortGrayXparBgCopy)(BCOPY_PARAMS)
{
mlib_s32 dstScan = pDstInfo->scanStride;
mlib_s32 srcScan = pSrcInfo->scanStride;
mlib_d64 dzero = vis_fzero(), d_bgpixel;
mlib_f32 f0, f1;
mlib_s32 i, j, mask0, mask1;
RGB_VARS;
/* Narrow rows: scalar loop. */
if (width < 8) {
for (j = 0; j < height; j++) {
mlib_u8 *src = srcBase;
mlib_u16 *dst = dstBase;
/* NOTE(review): srcpixel, r, g and b are declared but not used here. */
mlib_s32 srcpixel, r, g, b;
for (i = 0; i < width; i++) {
if (src[4*i]) {
dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
} else {
dst[i] = bgpixel;
}
}
PTR_ADD(dstBase, dstScan);
PTR_ADD(srcBase, srcScan);
}
return;
}
/* bgpixel replicated for the 64-bit background store in the vector loop */
D64_FROM_U16x4(d_bgpixel, bgpixel);
for (j = 0; j < height; j++) {
mlib_f32 *src = srcBase;
mlib_u16 *dst = dstBase;
mlib_u16 *dst_end;
dst_end = dst + width;
/* Head: single pixels until dst is 8-byte aligned. */
while (((mlib_s32)dst & 7) && dst < dst_end) {
if (*(mlib_u8*)src) {
r = vis_ld_u8((mlib_u8*)src + 1);
g = vis_ld_u8((mlib_u8*)src + 2);
b = vis_ld_u8((mlib_u8*)src + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
} else {
*dst = bgpixel;
}
dst++;
src++;
}
/* Body: four pixels per pass, computed as two pairs (f0, f1). */
#pragma pipeloop(0)
for (; dst <= (dst_end - 4); dst += 4) {
s02 = vis_fpmerge(src[0], src[1]);
src += 2;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
mask0 = vis_fcmpne16(ar, dzero) & 0xC;
GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
s02 = vis_fpmerge(src[0], src[1]);
src += 2;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
mask1 = vis_fcmpne16(ar, dzero) >> 2;
GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
/* fill all four lanes with the background first ... */
*(mlib_d64*)dst = d_bgpixel;
/* ... then overwrite only the lanes selected by the mask */
vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
}
/* Tail: remaining single pixels. */
while (dst < dst_end) {
if (*(mlib_u8*)src) {
r = vis_ld_u8((mlib_u8*)src + 1);
g = vis_ld_u8((mlib_u8*)src + 2);
b = vis_ld_u8((mlib_u8*)src + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
} else {
*dst = bgpixel;
}
dst++;
src++;
}
PTR_ADD(dstBase, dstScan);
PTR_ADD(srcBase, srcScan);
}
}
/* end of the region opened by "#if 0" */
#endif
/***************************************************************/
/*
 * IntArgb -> UshortGray XOR-mode blit.
 *
 * Source pixels whose top alpha bit is clear are skipped.  For every other
 * pixel a 16-bit gray value G is computed from the red, green and blue
 * bytes and the destination is updated as
 *
 *     dst ^= (G ^ xorpixel) & ~alphamask
 *
 * xorpixel and alphamask are read from pCompInfo.  Rows narrower than
 * 8 pixels take a plain C path; wider rows take the VIS path.
 *
 * srcBase, dstBase, width, height, pSrcInfo, pDstInfo and pCompInfo are
 * expected to come from BLIT_PARAMS, which is declared elsewhere.
 */
void ADD_SUFF(IntArgbToUshortGrayXorBlit)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, d_xorpixel, d_alphamask, dzero = vis_fzero();
    mlib_f32 f0, f1;
    mlib_s32 i, j, mask0, mask1;
    jint xorpixel = pCompInfo->details.xorPixel;
    juint alphamask = pCompInfo->alphaMask;
    RGB_VARS;

    /* Narrow rows: scalar loop. */
    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_s32 *src = srcBase;
            mlib_u16 *dst = dstBase;
            mlib_s32 srcpixel, r, g, b;

            for (i = 0; i < width; i++) {
                srcpixel = src[i];
                /* top alpha bit clear: leave the destination alone */
                if (srcpixel >= 0) continue;
                b = (srcpixel) & 0xff;
                g = (srcpixel >> 8) & 0xff;
                r = (srcpixel >> 16) & 0xff;
                /*
                 * The destination is 16 bits wide, so use the 16-bit
                 * luminance macro like the other scalar paths in this file
                 * rather than an 8-bit formula.
                 */
                srcpixel = RGB2GRAY(r, g, b);
                dst[i] ^= (((srcpixel) ^ (xorpixel)) & ~(alphamask));
            }
            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    /* Replicate the XOR pixel and the alpha mask into 64-bit operands. */
    D64_FROM_U16x4(d_xorpixel, xorpixel);
    D64_FROM_U16x4(d_alphamask, alphamask);

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;

        dst_end = dst + width;

        /* Head: single pixels until dst is 8-byte aligned. */
        while (((mlib_s32)dst & 7) && dst < dst_end) {
            if ((*(mlib_u8*)src) & 0x80) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
                dd = vis_fandnot(d_alphamask, dd);
                /* read the full 16-bit destination before XORing into it */
                vis_st_u16(vis_fxor(vis_ld_u16(dst), dd), dst);
            }
            dst++;
            src++;
        }

        /*
         * Body: four pixels per pass.  f0 and f1 each hold two 16-bit
         * results, one 64-bit partial store covers exactly four destination
         * pixels and src advances by four, so dst advances by four as well.
         */
#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            /*
             * Opacity test: put each alpha byte into the HIGH byte of its
             * 16-bit lane so that the signed "< 0" compare sees alpha bit 7
             * as the sign bit.  (In "ar" the alpha byte is in the low byte,
             * so those lanes can never be negative.)
             */
            mask0 = vis_fcmplt16(vis_fpmerge(vis_read_hi(s02), fzeros),
                                 dzero) & 0xC;
            GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask1 = vis_fcmplt16(vis_fpmerge(vis_read_hi(s02), fzeros),
                                 dzero) >> 2;
            GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
            dd = vis_freg_pair(f0, f1);
            dd = vis_fandnot(d_alphamask, vis_fxor(dd, d_xorpixel));
            /* partial store: only opaque pixels are written back */
            vis_pst_16(vis_fxor(*(mlib_d64*)dst, dd), dst, mask0 | mask1);
        }

        /* Tail: remaining single pixels. */
        while (dst < dst_end) {
            if ((*(mlib_u8*)src) & 0x80) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
                dd = vis_fandnot(d_alphamask, dd);
                /* read the full 16-bit destination before XORing into it */
                vis_st_u16(vis_fxor(vis_ld_u16(dst), dd), dst);
            }
            dst++;
            src++;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}
/***************************************************************/
void ADD_SUFF(IntArgbBmToUshortGrayScaleXparOver)(SCALE_PARAMS)
{
mlib_s32 dstScan = pDstInfo->scanStride;
mlib_s32 srcScan = pSrcInfo->scanStride;
mlib_d64 dzero = vis_fzero();
mlib_f32 f0, f1;
mlib_s32 i, j, mask0, mask1;
RGB_VARS;
for (j = 0; j < height; j++) {
mlib_f32 *src = srcBase;
mlib_u16 *dst = dstBase;
mlib_u16 *dst_end;
mlib_s32 tmpsxloc = sxloc;
PTR_ADD(src, (syloc >> shift) * srcScan);
dst_end = dst + width;
while (((mlib_s32)dst & 7) && dst < dst_end) {
i = tmpsxloc >> shift;
tmpsxloc += sxinc;
if (*(mlib_u8*)(src + i)) {
r = vis_ld_u8((mlib_u8*)(src + i) + 1);
g = vis_ld_u8((mlib_u8*)(src + i) + 2);
b = vis_ld_u8((mlib_u8*)(src + i) + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
}
dst++;
}
#pragma pipeloop(0)
for (; dst <= (dst_end - 4); dst += 4) {
s02 = vis_fpmerge(src[(tmpsxloc ) >> shift],
src[(tmpsxloc + sxinc) >> shift]);
tmpsxloc += 2*sxinc;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
mask0 = vis_fcmpne16(ar, dzero) & 0xC;
GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
s02 = vis_fpmerge(src[(tmpsxloc ) >> shift],
src[(tmpsxloc + sxinc) >> shift]);
tmpsxloc += 2*sxinc;
ar = vis_fpmerge(fzeros, vis_read_hi(s02));
gb = vis_fpmerge(fzeros, vis_read_lo(s02));
mask1 = vis_fcmpne16(ar, dzero) >> 2;
GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
}
while (dst < dst_end) {
i = tmpsxloc >> shift;
tmpsxloc += sxinc;
if (*(mlib_u8*)(src + i)) {
r = vis_ld_u8((mlib_u8*)(src + i) + 1);
g = vis_ld_u8((mlib_u8*)(src + i) + 2);
b = vis_ld_u8((mlib_u8*)(src + i) + 3);
GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
vis_st_u16(D64_FROM_F32x2(ff), dst);
}
dst++;
}
PTR_ADD(dstBase, dstScan);
syloc += syinc;
}
}
/***************************************************************/
/*
 * Entry points into the lookup tables vis_mul8s_tbl and vis_div8_tbl (both
 * defined elsewhere): TBL_MUL is the table viewed as mlib_s16 and offset by
 * one element, TBL_DIV is the table viewed as bytes and offset by two.
 * Callers in this file index them as TBL_MUL + 2*a and TBL_DIV + 8*a.
 */
#define TBL_MUL     (&((mlib_s16*)vis_mul8s_tbl)[1])
#define TBL_DIV     (&((mlib_u8*)vis_div8_tbl)[2])
/*
 * IntArgb -> UshortGray SrcOver blit with optional coverage mask.
 *
 * The effective source alpha of a pixel is looked up from mul8table using
 * the extra alpha (pCompInfo->details.extraAlpha scaled to 0..255), the
 * pixel's alpha byte and, when pMask is not NULL, the mask byte.  The gray
 * value from GRAY_U16 is then blended with the destination using that alpha
 * and (255 - alpha) -- mul8table[a][b] presumably approximates a*b/255;
 * TODO confirm against the table's definition.
 *
 * When source, destination (2 bytes per pixel) and mask rows are all
 * gap-free the whole rectangle is processed as a single long row.
 *
 * srcBase, dstBase, pMask, maskOff, maskScan, width, height, pSrcInfo,
 * pDstInfo and pCompInfo are expected to come from MASKBLIT_PARAMS, which is
 * declared elsewhere.
 *
 * NOTE(review): dst is an mlib_u16 pointer, yet the blend reads it with
 * vis_ld_u8(dst) in the scalar loops and through *(mlib_f32*)dst (4 bytes)
 * in the loops that advance dst by 4 elements.  The vector loop of the mask
 * path also merges only src[0] and src[1] while consuming four alphas.
 * This looks like code carried over from an 8-bit destination -- verify
 * before relying on these paths.
 */
void ADD_SUFF(IntArgbToUshortGraySrcOverMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u8 *mul8_extra;
    mlib_u16 *dst_end;
    mlib_d64 srcAx4, dd, d0, d1;
    mlib_d64 done = vis_to_double_dup(0x7fff7fff);
    mlib_s32 j, srcA0, srcA1, srcA2, srcA3;
    RGB_VARS;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);
    mul8_extra = mul8table[extraA];

    if (pMask != NULL) {
        pMask += maskOff;

        /* Gap-free rasters: collapse to one long row. */
        if (srcScan == 4*width && dstScan == 2*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        /* pMask is advanced per pixel, so only the row padding remains. */
        maskScan -= width;

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* Head: single pixels until dst is 4-byte aligned. */
            while (((mlib_s32)dst & 3) && dst < dst_end) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            /* Body: four alphas are gathered into srcAx4 per pass. */
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                srcA1 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 1)];
                srcA2 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 2)];
                srcA3 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 3)];
                /* srcAx4 acts as a shift register: last alpha goes in first */
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);
                s02 = vis_fpmerge(src[0], src[1]);
                ar = vis_fpmerge(fzeros, vis_read_hi(s02));
                gb = vis_fpmerge(fzeros, vis_read_lo(s02));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
                dd = vis_fpadd16(d0, d1);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            /* Tail: remaining single pixels. */
            while (dst < dst_end) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        /*
         * Gap-free rasters: collapse to one long row.  The destination has
         * two bytes per pixel, so a gap-free destination row is 2*width
         * bytes, the same test the mask path above uses.
         */
        if (dstScan == 2*width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* Head: single pixels until dst is 4-byte aligned. */
            while (((mlib_s32)dst & 3) && dst < dst_end) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            /* Body: four alphas are gathered into srcAx4 per pass. */
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                srcA1 = mul8_extra[*(mlib_u8*)(src + 1)];
                srcA2 = mul8_extra[*(mlib_u8*)(src + 2)];
                srcA3 = mul8_extra[*(mlib_u8*)(src + 3)];
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);
                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
                dd = vis_fpadd16(d0, d1);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            /* Tail: remaining single pixels. */
            while (dst < dst_end) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}
/***************************************************************/
/*
 * Computes the blend coefficients for the pixel at index i and shifts them
 * into the accumulators srcAx4 and divAx4.
 *
 * Reads pMask[i] and the first byte of src[i]; uses extraA, srcFbase,
 * DstOpAnd, DstOpXor and DstOpAdd from the enclosing scope; overwrites
 * pathA, srcA, dstF, srcF, dstA and resA.  The statements are order
 * dependent: srcA is first the raw alpha byte, then scaled by extraA, and
 * finally scaled by srcF.
 *
 * Expands to several statements with no trailing semicolon.  The argument
 * is used unparenthesized in "src + i", so pass a plain constant or name.
 */
#define GET_COEF(i) \
pathA = pMask[i]; \
srcA = *(mlib_u8*)(src + i); \
srcA = mul8table[extraA][srcA]; \
dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd); \
srcF = mul8table[pathA][srcFbase]; \
dstA = 0xff - pathA + mul8table[pathA][dstF]; \
srcA = mul8table[srcF][srcA]; \
resA = srcA + dstA; \
srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA), srcAx4); \
divAx4 = vis_faligndata(vis_ld_u16(TBL_DIV + 8*resA), divAx4)
/***************************************************************/
/*
 * IntArgb -> UshortGray blit for a general alpha compositing rule, with
 * optional coverage mask.
 *
 * The source and destination factor operands (and/xor/add values) are taken
 * from AlphaRules[pCompInfo->rule]; the extra alpha is
 * pCompInfo->details.extraAlpha scaled to 0..255.  Per pixel the source
 * alpha is scaled by the extra alpha, the source and destination factors
 * are derived from it (and from the mask byte when pMask is not NULL), and
 * the gray value is blended with the destination.
 *
 * When source, destination (2 bytes per pixel) and mask rows are all
 * gap-free the whole rectangle is processed as a single long row.
 *
 * srcBase, dstBase, pMask, maskOff, maskScan, width, height, pSrcInfo,
 * pDstInfo and pCompInfo are expected to come from MASKBLIT_PARAMS, which is
 * declared elsewhere.
 *
 * NOTE(review): several things here look carried over from an 8-bit
 * destination and should be verified before relying on this function:
 *  - dst is an mlib_u16 pointer but is read with vis_ld_u8(dst) and through
 *    *(mlib_f32*)dst while advancing four elements per vector pass;
 *  - the no-mask path indexes mul8table with *dst (a 16-bit value) and
 *    div8table with a sum of two table entries;
 *  - GRAY_U16 is invoked with dd (an mlib_d64) as its result operand;
 *  - the GSR is rewritten after RGB_VARS, which changes the scale GRAY_U16
 *    packs with.
 */
void ADD_SUFF(IntArgbToUshortGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_d64 srcAx4, dstAx4, divAx4, dd, ds;
    mlib_d64 done = vis_to_double_dup(0x01000100);
    mlib_f32 fscale = vis_to_float(0x02020202);
    mlib_s32 j;
    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
    mlib_s32 pathA, srcFbase, resA, resG, srcF, dstF, srcA, dstA;
    RGB_VARS;

    /* Factor operands of the selected compositing rule. */
    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
    SrcOpAdd =
        (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;
    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
    DstOpAdd =
        (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);

    /* Source factor evaluated for the constant 0xff. */
    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);

    /* Replaces the GSR value written by RGB_VARS. */
    vis_write_gsr((7 << 3) | 6);

    if (pMask != NULL) {
        pMask += maskOff;

        /*
         * Gap-free rasters: collapse to one long row.  The destination has
         * two bytes per pixel, so a gap-free destination row is 2*width
         * bytes.
         */
        if (dstScan == 2*width && srcScan == 4*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        /* pMask is advanced per pixel, so only the row padding remains. */
        maskScan -= width;

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* Head: single pixels until dst is 4-byte aligned. */
            while (((mlib_s32)dst & 3) && dst < dst_end) {
                pathA = *pMask++;
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcF = mul8table[pathA][srcFbase];
                dstA = 0xff - pathA + mul8table[pathA][dstF];
                srcA = mul8table[srcF][srcA];
                resA = srcA + dstA;
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);
                dd = vis_freg_pair(vis_fzeros(),
                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
                DIV_ALPHA(dd, resA);
                ds = vis_fpsub16(done, dd);
                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
                ds = vis_fmul8x16(ff, ds);
                dd = vis_fpadd16(dd, ds);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            /* Body: coefficients for four pixels are gathered per pass. */
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                /* last pixel first: the accumulators shift on each call */
                GET_COEF(3);
                GET_COEF(2);
                GET_COEF(1);
                GET_COEF(0);
                pMask += 4;
                srcAx4 = FMUL_16x16(srcAx4, divAx4);
                dstAx4 = vis_fpsub16(done, srcAx4);
                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(dd, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);
                dd = vis_fmul8x16(*(mlib_f32*)dst, dstAx4);
                ds = vis_fmul8x16(ff, srcAx4);
                dd = vis_fpadd16(dd, ds);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            /* Tail: remaining single pixels. */
            while (dst < dst_end) {
                pathA = *pMask++;
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcF = mul8table[pathA][srcFbase];
                dstA = 0xff - pathA + mul8table[pathA][dstF];
                srcA = mul8table[srcF][srcA];
                resA = srcA + dstA;
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);
                dd = vis_freg_pair(vis_fzeros(),
                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
                DIV_ALPHA(dd, resA);
                ds = vis_fpsub16(done, dd);
                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
                ds = vis_fmul8x16(ff, ds);
                dd = vis_fpadd16(dd, ds);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        /* Gap-free rasters (2 bytes per destination pixel): one long row. */
        if (dstScan == 2*width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* No mask: every pixel is handled by table lookups. */
            while (dst < dst_end) {
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstA = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcA = mul8table[srcFbase][srcA];
                resA = srcA + dstA;
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);
                resG = mul8table[dstA][*dst] +
                       mul8table[srcA][((mlib_u8*)&ff)[3]];
                *dst = div8table[resA][resG];
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}
/***************************************************************/
/*
 * IntRgb -> UshortGray blit for a general alpha compositing rule, with
 * optional coverage mask.
 *
 * The source alpha byte is never read: the factors depend only on the rule
 * (AlphaRules[pCompInfo->rule]), on the extra alpha
 * (pCompInfo->details.extraAlpha scaled to 0..255) and, when pMask is not
 * NULL, on the mask byte.  Rows narrower than 16 pixels take a plain C path
 * built on mul8table/div8table; wider rows take the VIS path, which
 * precomputes the source coefficient (per mask value, or once when there is
 * no mask) using vis_d64_div_tbl.
 *
 * In the VIS path, when source, destination (2 bytes per pixel) and mask
 * rows are all gap-free the whole rectangle is processed as one long row.
 *
 * srcBase, dstBase, pMask, maskOff, maskScan, width, height, pSrcInfo,
 * pDstInfo and pCompInfo are expected to come from MASKBLIT_PARAMS, which is
 * declared elsewhere.
 *
 * NOTE(review): several things here look carried over from an 8-bit
 * destination and should be verified before relying on this function:
 *  - the scalar path indexes mul8table with dst[i] and with the RGB2GRAY
 *    result, both of which can exceed 8 bits;
 *  - the VIS path reads dst with vis_ld_u8(dst) and through
 *    *(mlib_f32*)dst while advancing four elements per vector pass.
 */
void ADD_SUFF(IntRgbToUshortGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_d64 srcA_d, dstA_d, dd, d0, d1;
    mlib_s32 i, j, srcG;
    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
    mlib_s32 pathA, srcFbase, dstFbase, resA, resG, srcA, dstA;
    RGB_VARS;

    /* Factor operands of the selected compositing rule. */
    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
    SrcOpAdd =
        (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;
    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
    DstOpAdd =
        (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);

    /* Both factors are constant for the whole call. */
    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);
    dstFbase = (((extraA & DstOpAnd) ^ DstOpXor) + DstOpAdd);
    srcFbase = mul8table[srcFbase][extraA];

    /* Narrow rows: scalar loops. */
    if (width < 16) {
        if (pMask != NULL) {
            pMask += maskOff;
            for (j = 0; j < height; j++) {
                mlib_u16 *dst = dstBase;
                mlib_u8 *src = srcBase;

                for (i = 0; i < width; i++) {
                    pathA = pMask[i];
                    dstA = 0xff - pathA + mul8table[dstFbase][pathA];
                    srcA = mul8table[srcFbase][pathA];
                    resA = srcA + dstA;
                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
                    resG = div8table[resA][resG];
                    dst[i] = resG;
                }
                PTR_ADD(dstBase, dstScan);
                PTR_ADD(srcBase, srcScan);
                PTR_ADD(pMask, maskScan);
            }
        } else {
            dstA = dstFbase;
            srcA = srcFbase;
            resA = srcA + dstA;
            for (j = 0; j < height; j++) {
                mlib_u16 *dst = dstBase;
                mlib_u8 *src = srcBase;

                for (i = 0; i < width; i++) {
                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
                    resG = div8table[resA][resG];
                    dst[i] = resG;
                }
                PTR_ADD(dstBase, dstScan);
                PTR_ADD(srcBase, srcScan);
            }
        }
        return;
    }

    if (pMask != NULL) {
        mlib_s32 srcA_buff[256];
        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);

        /* Source coefficient for every possible mask byte. */
        srcA_buff[0] = 0;
#pragma pipeloop(0)
        for (pathA = 1; pathA < 256; pathA++) {
            dstA = 0xff - pathA + mul8table[dstFbase][pathA];
            srcA = mul8table[srcFbase][pathA];
            resA = dstA + srcA;
            ddiv = dscale*vis_d64_div_tbl[resA];
            srcA_buff[pathA] = srcA*ddiv + (1 << 15);
        }

        pMask += maskOff;

        /*
         * Gap-free rasters: collapse to one long row.  The test has to see
         * the original maskScan, so it runs before maskScan is reduced to
         * the per-row padding, and the destination row is 2*width bytes
         * because each destination pixel is two bytes.
         */
        if (dstScan == 2*width && srcScan == 4*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        /* pMask is advanced per pixel, so only the row padding remains. */
        maskScan -= width;

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* Head: single pixels until dst is 4-byte aligned. */
            while (((mlib_s32)dst & 3) && dst < dst_end) {
                pathA = *pMask++;
                srcA_d = vis_ld_u16(srcA_buff + pathA);
                dstA_d = vis_fpsub16(d_one, srcA_d);
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            /* Body: four coefficients are gathered into srcA_d per pass. */
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[3]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[2]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[1]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[0]);
                dstA_d = vis_fpsub16(d_one, srcA_d);
                pMask += 4;
                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            /* Tail: remaining single pixels. */
            while (dst < dst_end) {
                pathA = *pMask++;
                srcA_d = vis_ld_u16(srcA_buff + pathA);
                dstA_d = vis_fpsub16(d_one, srcA_d);
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);

        /* One source coefficient for the whole call. */
        dstA = dstFbase;
        srcA = srcFbase;
        resA = dstA + srcA;
        ddiv = dscale*vis_d64_div_tbl[resA];
        srcA = (mlib_s32)(srcA*ddiv + (1 << 15)) >> 16;
        srcA_d = vis_to_double_dup((srcA << 16) | srcA);
        dstA_d = vis_fpsub16(d_one, srcA_d);

        /* Gap-free rasters (2 bytes per destination pixel): one long row. */
        if (dstScan == 2*width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            /* Head: single pixels until dst is 4-byte aligned. */
            while (((mlib_s32)dst & 3) && dst < dst_end) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            /* Body: vector pass over four source pixels. */
#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            /* Tail: remaining single pixels. */
            while (dst < dst_end) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}
/***************************************************************/
#endif