X86: Fix IntrinsicBlend bugs and re-enable BLEND_DST_ATOP optimization

Two fixes in the SSE blend kernels:

1. Use logical right shifts (_mm_srli_epi16) instead of arithmetic
   right shifts (_mm_srai_epi16) when dividing 16-bit color products
   by 256. The products of _mm_mullo_epi16 on two 0..255 channels can
   set bit 15, so an arithmetic shift would incorrectly sign-extend
   them and corrupt the high byte of the result.

2. In the @@ -1205 kernel, select the result alpha channel from the
   source pixels (in0/in1) rather than the destination (out0/out1) in
   the blendv_epi8 merge, so the SIMD path matches the scalar path's
   "out->w = in->w" requirement noted in the removed comment.

With these fixed, drop the "#if false" guard that disabled the SSSE3
fast path for BLEND_DST_ATOP.
Bug: 22047392
Change-Id: Ife4cfc6bdb385f360b493fbb7212e1cf8d5d33c3
Signed-off-by: Yong Chen <yong.a.chen@intel.com>
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index ca40c5e..34bc82d 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -273,9 +273,6 @@
}
break;
case BLEND_DST_ATOP:
- // Bug: 22047392 - We need to make sure that "out->w = in->w;" in all
- // accelerated versions before re-enabling optimizations.
- #if false // Bug: 22047392
#if defined(ARCH_X86_HAVE_SSSE3)
if (gArchUseSIMD) {
if ((x1 + 8) < x2) {
@@ -287,7 +284,6 @@
}
}
#endif
- #endif // false for Bug: 22047392
for (;x1 < x2; x1++, out++, in++) {
short4 in_s = convert_short4(*in);
short4 out_s = convert_short4(*out);
diff --git a/cpu_ref/rsCpuIntrinsics_x86.cpp b/cpu_ref/rsCpuIntrinsics_x86.cpp
index cb502c6..d983075 100644
--- a/cpu_ref/rsCpuIntrinsics_x86.cpp
+++ b/cpu_ref/rsCpuIntrinsics_x86.cpp
@@ -776,7 +776,7 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
t0 = _mm_add_epi16(t0, ins);
ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
@@ -784,7 +784,7 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
t1 = _mm_add_epi16(t1, ins);
ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
@@ -792,7 +792,7 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
t2 = _mm_add_epi16(t2, ins);
ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
@@ -800,7 +800,7 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t3 = _mm_add_epi16(t3, ins);
t0 = _mm_packus_epi16(t0, t1);
@@ -833,7 +833,7 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
t0 = _mm_add_epi16(t0, outs);
outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
@@ -841,7 +841,7 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
t1 = _mm_add_epi16(t1, outs);
outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
@@ -849,7 +849,7 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
t2 = _mm_add_epi16(t2, outs);
outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
@@ -857,7 +857,7 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t3 = _mm_add_epi16(t3, outs);
t0 = _mm_packus_epi16(t0, t1);
@@ -887,28 +887,28 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, outa);
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, outa);
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, outa);
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, outa);
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t0 = _mm_packus_epi16(t0, t1);
t2 = _mm_packus_epi16(t2, t3);
@@ -937,28 +937,28 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, ina);
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, ina);
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, ina);
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, ina);
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t0 = _mm_packus_epi16(t0, t1);
t2 = _mm_packus_epi16(t2, t3);
@@ -989,28 +989,28 @@
outa = _mm_shufflehi_epi16(outa, 0xFF);
t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
outa = _mm_shufflelo_epi16(outa, 0xFF);
outa = _mm_shufflehi_epi16(outa, 0xFF);
t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t0 = _mm_packus_epi16(t0, t1);
t2 = _mm_packus_epi16(t2, t3);
@@ -1041,28 +1041,28 @@
ina = _mm_shufflehi_epi16(ina, 0xFF);
t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
- t0 = _mm_srai_epi16(t0, 8);
+ t0 = _mm_srli_epi16(t0, 8);
ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
- t1 = _mm_srai_epi16(t1, 8);
+ t1 = _mm_srli_epi16(t1, 8);
ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
- t2 = _mm_srai_epi16(t2, 8);
+ t2 = _mm_srli_epi16(t2, 8);
ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
ina = _mm_shufflelo_epi16(ina, 0xFF);
ina = _mm_shufflehi_epi16(ina, 0xFF);
t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
- t3 = _mm_srai_epi16(t3, 8);
+ t3 = _mm_srli_epi16(t3, 8);
t0 = _mm_packus_epi16(t0, t1);
t2 = _mm_packus_epi16(t2, t3);
@@ -1205,9 +1205,9 @@
t3 = _mm_srli_epi16(t3, 8);
t0 = _mm_packus_epi16(t0, t1);
- t0 = blendv_epi8(t0, out0, M0001);
+ t0 = blendv_epi8(t0, in0, M0001);
t2 = _mm_packus_epi16(t2, t3);
- t2 = blendv_epi8(t2, out1, M0001);
+ t2 = blendv_epi8(t2, in1, M0001);
_mm_storeu_si128((__m128i *)dst, t0);
_mm_storeu_si128((__m128i *)dst + 1, t2);