Remove alignment constraint for SSE2. Allows the optimized function to be used with unaligned memory, improving performance in that use case. Hurts performance on Core 2 and earlier CPUs, where aligned loads via the movdqa instruction were faster than unaligned movdqu loads.
BUG=365
TESTED=psnr, ssim and djb2 unittests pass.
R=tpsiaki@google.com
Review URL: https://webrtc-codereview.appspot.com/22859004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@1100 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/compare.cc b/source/compare.cc
index dc715e0..255e772 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -114,8 +114,7 @@
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index ac36119..64dfc35 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -25,9 +25,9 @@
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 9983165..50d4d34 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -29,9 +29,9 @@
align 4
wloop:
- movdqa xmm1, [eax]
+ movdqu xmm1, [eax]
lea eax, [eax + 16]
- movdqa xmm2, [edx]
+ movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 141445e..464e255 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -244,6 +244,32 @@
free_aligned_buffer_64(src_b);
}
+
+TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) {
+ align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
+ align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i + 1] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a + 1, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);
+
+ free_aligned_buffer_64(src_a);
+ free_aligned_buffer_64(src_b);
+}
+
TEST_F(libyuvTest, Psnr) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;