Remove alignment constraint for SSE2. This lets the optimized SumSquareError function run on unaligned memory, improving performance in that use case. It can hurt performance on Core 2 and earlier CPUs, where aligned loads (movdqa) were faster than unaligned loads (movdqu).
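
For background: movdqa requires its memory operand to be 16-byte aligned and
faults otherwise, while movdqu accepts any address; on Nehalem and later,
movdqu on data that happens to be aligned costs about the same as movdqa,
which is why dropping the constraint is a net win on modern CPUs. A minimal
intrinsics sketch of the two load flavors (illustrative only, not libyuv code):

  #include <emmintrin.h>  // SSE2 intrinsics
  #include <stdint.h>

  // _mm_load_si128 typically compiles to movdqa: p must be 16-byte aligned.
  __m128i LoadAligned(const uint8_t* p) {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
  }

  // _mm_loadu_si128 typically compiles to movdqu: any p is accepted.
  __m128i LoadUnaligned(const uint8_t* p) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
  }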
BUG=365
TESTED=psnr, ssim and djb2 unittests pass.
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22859004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1100 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/compare.cc b/source/compare.cc
index dc715e0..255e772 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -114,8 +114,7 @@
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
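
After this change the SSE2 kernel is selected on the CPU flag alone; the
portable C path remains the fallback. For reference, a sketch of what the
scalar version computes (assumed shape and hypothetical name; see libyuv's
SumSquareError_C for the actual implementation):

  #include <stdint.h>

  // Scalar reference: sum of squared byte differences over count bytes.
  // (Sketch only; the shipped SumSquareError_C may differ in details.)
  static uint32_t SumSquareErrorRef(const uint8_t* src_a,
                                    const uint8_t* src_b, int count) {
    uint32_t sse = 0u;
    for (int i = 0; i < count; ++i) {
      int diff = static_cast<int>(src_a[i]) - static_cast<int>(src_b[i]);
      sse += static_cast<uint32_t>(diff * diff);
    }
    return sse;
  }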
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index ac36119..64dfc35 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -25,9 +25,9 @@
     "pxor      %%xmm5,%%xmm5                   \n"
     LABELALIGN
   "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
     "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
     "lea       " MEMLEA(0x10, 1) ",%1          \n"
     "sub       $0x10,%2                        \n"
     "movdqa    %%xmm1,%%xmm3                   \n"
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 9983165..50d4d34 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -29,9 +29,9 @@
 
     align      4
   wloop:
-    movdqa     xmm1, [eax]
+    movdqu     xmm1, [eax]
     lea        eax,  [eax + 16]
-    movdqa     xmm2, [edx]
+    movdqu     xmm2, [edx]
     lea        edx,  [edx + 16]
     sub        ecx, 16
     movdqa     xmm3, xmm1  // abs trick
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 141445e..464e255 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -244,6 +244,32 @@
   free_aligned_buffer_64(src_b);
 }
 
+
+TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) {
+  align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
+  align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    src_a[i + 1] = i;
+    src_b[i] = i;
+  }
+
+  MaskCpuFlags(-1);
+
+  double opt_time = get_time();
+  for (int i = 0; i < benchmark_iterations_; ++i)
+    CalcFramePsnr(src_a + 1, benchmark_width_,
+                  src_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+  EXPECT_EQ(0, 0);
+
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(src_b);
+}
+
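
The new benchmark deliberately allocates src_a one byte larger and reads from
src_a + 1: since align_buffer_64 yields a 64-byte-aligned pointer, src_a + 1
can never be 16-byte aligned, so this exercises the movdqu path. The
EXPECT_EQ(0, 0) is a placeholder assertion; the test exists for timing, not
for a functional check.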
 TEST_F(libyuvTest, Psnr) {
   const int kSrcWidth = benchmark_width_;
   const int kSrcHeight = benchmark_height_;