blob: bf2db26d791922f0b26115b1cf6024b621151c8d [file] [log] [blame]
/*
**
** Copyright 2009, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
#include <emmintrin.h>
/* SSE2 version of S32_Blend_BlitRow32()
* portable version is in core/SkBlitRow_D32.cpp
*/
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(alpha <= 255);
if (count <= 0) {
return;
}
uint32_t src_scale = SkAlpha255To256(alpha);
uint32_t dst_scale = 256 - src_scale;
if (count >= 4) {
SkASSERT(((size_t)dst & 0x03) == 0);
while (((size_t)dst & 0x0F) != 0) {
*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
src++;
dst++;
count--;
}
const __m128i *s = reinterpret_cast<const __m128i*>(src);
__m128i *d = reinterpret_cast<__m128i*>(dst);
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i src_scale_wide = _mm_set1_epi16(src_scale);
__m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
while (count >= 4) {
// Load 4 pixels each of src and dest.
__m128i src_pixel = _mm_loadu_si128(s);
__m128i dst_pixel = _mm_load_si128(d);
// Get red and blue pixels into lower byte of each word.
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
// Get alpha and green into lower byte of each word.
__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
__m128i src_ag = _mm_srli_epi16(src_pixel, 8);
// Multiply by scale.
src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
// Divide by 256.
src_rb = _mm_srli_epi16(src_rb, 8);
dst_rb = _mm_srli_epi16(dst_rb, 8);
src_ag = _mm_andnot_si128(rb_mask, src_ag);
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
// Combine back into RGBA.
src_pixel = _mm_or_si128(src_rb, src_ag);
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
// Add result
__m128i result = _mm_add_epi8(src_pixel, dst_pixel);
_mm_store_si128(d, result);
s++;
d++;
count -= 4;
}
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<SkPMColor*>(d);
}
while (count > 0) {
*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
src++;
dst++;
count--;
}
}
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(alpha == 255);
if (count <= 0) {
return;
}
if (count >= 4) {
SkASSERT(((size_t)dst & 0x03) == 0);
while (((size_t)dst & 0x0F) != 0) {
*dst = SkPMSrcOver(*src, *dst);
src++;
dst++;
count--;
}
const __m128i *s = reinterpret_cast<const __m128i*>(src);
__m128i *d = reinterpret_cast<__m128i*>(dst);
#ifdef SK_USE_ACCURATE_BLENDING
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
__m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
while (count >= 4) {
// Load 4 pixels
__m128i src_pixel = _mm_loadu_si128(s);
__m128i dst_pixel = _mm_load_si128(d);
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
__m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel);
dst_ag = _mm_srli_epi16(dst_ag, 8);
// Shift alphas down to lower 8 bits of each quad.
__m128i alpha = _mm_srli_epi32(src_pixel, 24);
// Copy alpha to upper 3rd byte of each quad
alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
// Subtract alphas from 255, to get 0..255
alpha = _mm_sub_epi16(c_255, alpha);
// Multiply by red and blue by src alpha.
dst_rb = _mm_mullo_epi16(dst_rb, alpha);
// Multiply by alpha and green by src alpha.
dst_ag = _mm_mullo_epi16(dst_ag, alpha);
// dst_rb_low = (dst_rb >> 8)
__m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
__m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
// dst_rb = (dst_rb + dst_rb_low + 128) >> 8
dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
dst_rb = _mm_add_epi16(dst_rb, c_128);
dst_rb = _mm_srli_epi16(dst_rb, 8);
// dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
dst_ag = _mm_add_epi16(dst_ag, c_128);
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
// Combine back into RGBA.
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
// Add result
__m128i result = _mm_add_epi8(src_pixel, dst_pixel);
_mm_store_si128(d, result);
s++;
d++;
count -= 4;
}
#else
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
while (count >= 4) {
// Load 4 pixels
__m128i src_pixel = _mm_loadu_si128(s);
__m128i dst_pixel = _mm_load_si128(d);
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
__m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel);
dst_ag = _mm_srli_epi16(dst_ag, 8);
// Shift alphas down to lower 8 bits of each quad.
__m128i alpha = _mm_srli_epi32(src_pixel, 24);
// Copy alpha to upper 3rd byte of each quad
alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
// Subtract alphas from 256, to get 1..256
alpha = _mm_sub_epi16(c_256, alpha);
// Multiply by red and blue by src alpha.
dst_rb = _mm_mullo_epi16(dst_rb, alpha);
// Multiply by alpha and green by src alpha.
dst_ag = _mm_mullo_epi16(dst_ag, alpha);
// Divide by 256.
dst_rb = _mm_srli_epi16(dst_rb, 8);
// Mask out high bits (already in the right place)
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
// Combine back into RGBA.
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
// Add result
__m128i result = _mm_add_epi8(src_pixel, dst_pixel);
_mm_store_si128(d, result);
s++;
d++;
count -= 4;
}
#endif
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<SkPMColor*>(d);
}
while (count > 0) {
*dst = SkPMSrcOver(*src, *dst);
src++;
dst++;
count--;
}
}
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src,
int count, U8CPU alpha) {
SkASSERT(alpha <= 255);
if (count <= 0) {
return;
}
if (count >= 4) {
while (((size_t)dst & 0x0F) != 0) {
*dst = SkBlendARGB32(*src, *dst, alpha);
src++;
dst++;
count--;
}
uint32_t src_scale = SkAlpha255To256(alpha);
const __m128i *s = reinterpret_cast<const __m128i*>(src);
__m128i *d = reinterpret_cast<__m128i*>(dst);
__m128i src_scale_wide = _mm_set1_epi16(src_scale);
__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
__m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
while (count >= 4) {
// Load 4 pixels each of src and dest.
__m128i src_pixel = _mm_loadu_si128(s);
__m128i dst_pixel = _mm_load_si128(d);
// Get red and blue pixels into lower byte of each word.
__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
// Get alpha and green into lower byte of each word.
__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
__m128i src_ag = _mm_srli_epi16(src_pixel, 8);
// Put per-pixel alpha in low byte of each word.
__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
// dst_alpha = dst_alpha * src_scale
dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
// Divide by 256.
dst_alpha = _mm_srli_epi16(dst_alpha, 8);
// Subtract alphas from 256, to get 1..256
dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
// Multiply red and blue by dst pixel alpha.
dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
// Multiply alpha and green by dst pixel alpha.
dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
// Multiply red and blue by global alpha.
src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
// Multiply alpha and green by global alpha.
src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
// Divide by 256.
dst_rb = _mm_srli_epi16(dst_rb, 8);
src_rb = _mm_srli_epi16(src_rb, 8);
// Mask out low bits (goodies already in the right place; no need to divide)
dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
src_ag = _mm_andnot_si128(rb_mask, src_ag);
// Combine back into RGBA.
dst_pixel = _mm_or_si128(dst_rb, dst_ag);
src_pixel = _mm_or_si128(src_rb, src_ag);
// Add two pixels into result.
__m128i result = _mm_add_epi8(src_pixel, dst_pixel);
_mm_store_si128(d, result);
s++;
d++;
count -= 4;
}
src = reinterpret_cast<const SkPMColor*>(s);
dst = reinterpret_cast<SkPMColor*>(d);
}
while (count > 0) {
*dst = SkBlendARGB32(*src, *dst, alpha);
src++;
dst++;
count--;
}
}