Reduced usage of xmm6 and xmm7, which are callee-saved on Win64 and would otherwise have to be saved and restored.
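
For context: the Win64 ABI treats xmm0-xmm5 as volatile and xmm6-xmm15 as
callee-saved, so code that touches xmm6 or xmm7 forces a save/restore in the
function prologue and epilogue; keeping scratch data in xmm0-xmm5 avoids that
cost. Independently, gcc extended asm assumes a register is preserved unless
it is named in the clobber list, hence the "xmm0".."xmm15" and "cc" clobbers
added in the diff below. A minimal sketch of the resulting pattern, using a
hypothetical CopyRow_SSE2 helper (not part of this patch), assuming
16-byte-aligned buffers and a width that is a positive multiple of 16:

#include <stdint.h>

static void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
"1:\n"                       // loop label, \n-terminated like each statement
  "movdqa (%0),%%xmm0\n"     // scratch lives in xmm0: volatile on Win64,
  "movdqa %%xmm0,(%1)\n"     // unlike callee-saved xmm6-xmm15
  "lea 0x10(%0),%0\n"
  "lea 0x10(%1),%1\n"
  "sub $0x10,%2\n"           // updates the flags, hence the "cc" clobber
  "ja 1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
  , "xmm0"        // tell the compiler xmm0 no longer holds its old value
#endif
  );
}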
BUG=none
TEST=tested with talk unittests
Review URL: http://webrtc-codereview.appspot.com/261003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@55 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/rotate.cc b/source/rotate.cc
index 7d3a332..78b2fee 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -287,9 +287,9 @@
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
-"1:"
// Read in the data from the source pointer.
// First round of bit swap.
+"1:\n"
"movq (%0),%%xmm0\n"
"movq (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
@@ -363,7 +363,10 @@
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
);
}
@@ -398,7 +401,7 @@
"mov %ecx,0x10(%esp)\n"
"mov 0x2c(%ecx),%ecx\n"
-"1:"
+"1:\n"
"movdqa (%eax),%xmm0\n"
"movdqa (%eax,%edi,1),%xmm1\n"
"lea (%eax,%edi,2),%eax\n"
@@ -502,9 +505,9 @@
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile(
-"1:"
// Read in the data from the source pointer.
// First round of bit swap.
+"1:\n"
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%3),%%xmm1\n"
"lea (%0,%3,2),%0\n"
@@ -630,7 +633,9 @@
"+r"(width) // %2
: "r"(static_cast<intptr_t>(src_stride)), // %3
"r"(static_cast<intptr_t>(dst_stride)) // %4
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
@@ -640,9 +645,9 @@
uint8* dst_b, int dst_stride_b,
int w) {
asm volatile(
-"1:"
// Read in the data from the source pointer.
// First round of bit swap.
+"1:\n"
"movdqa (%0),%%xmm0\n"
"movdqa (%0,%4),%%xmm1\n"
"lea (%0,%4,2),%0\n"
@@ -738,7 +743,9 @@
: "r"(static_cast<intptr_t>(src_stride)), // %4
"r"(static_cast<intptr_t>(dst_stride_a)), // %5
"r"(static_cast<intptr_t>(dst_stride_b)) // %6
- : "memory"
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
);
}
#endif
@@ -863,12 +870,12 @@
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- movdqa xmm7, _kShuffleReverse
+ movdqa xmm5, _kShuffleReverse
lea eax, [eax + ecx - 16]
convertloop :
movdqa xmm0, [eax]
lea eax, [eax - 16]
- pshufb xmm0, xmm7
+ pshufb xmm0, xmm5
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -883,21 +890,24 @@
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile(
- "movdqa (%3),%%xmm7\n"
+ "movdqa (%3),%%xmm5\n"
"lea -0x10(%0,%2,1),%0\n"
-"1:"
+"1:\n"
"movdqa (%0),%%xmm0\n"
"lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
+ "pshufb %%xmm5,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x10,%2\n"
"ja 1b\n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "r"(kShuffleReverse) // %3
- : "memory"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "r"(kShuffleReverse) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
);
}
#endif
@@ -1073,13 +1083,13 @@
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
- movdqa xmm7, _kShuffleReverseUV
+ movdqa xmm5, _kShuffleReverseUV
lea eax, [eax + ecx * 2 - 16]
convertloop :
movdqa xmm0, [eax]
lea eax, [eax - 16]
- pshufb xmm0, xmm7
+ pshufb xmm0, xmm5
movlpd qword ptr [edx], xmm0
lea edx, [edx + 8]
movhpd qword ptr [edi], xmm0
@@ -1099,12 +1109,12 @@
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile(
- "movdqa (%4),%%xmm7\n"
+ "movdqa (%4),%%xmm5\n"
"lea -0x10(%0,%3,2),%0\n"
-"1:"
+"1:\n"
"movdqa (%0),%%xmm0\n"
"lea -0x10(%0),%0\n"
- "pshufb %%xmm7,%%xmm0\n"
+ "pshufb %%xmm5,%%xmm0\n"
"movlpd %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
"movhpd %%xmm0,(%2)\n"
@@ -1114,9 +1124,12 @@
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
- "+r"(temp_width) // %3
+ "+r"(temp_width) // %3
: "r"(kShuffleReverseUV) // %4
- : "memory"
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
);
}
#endif