Optimize x86 long V*V by skipping imul
The code generated for long multiplication can take advantage of the
fact that we are multiplying a value by itself: the cross-term sum
1L*2H + 2L*1H is then equal to (2H*1L) + (2H*1L), so the second
cross-term multiply can be replaced by an addition that doubles the
first product.
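
As an illustration of the identity, a standalone C++ sketch (not part
of this change; MulLongLow and SquareLongLow are invented names). Split
each 64-bit value into 32-bit halves; the low 64 bits of the product
need three 32-bit multiplies in general, but only two when squaring:

  #include <cassert>
  #include <cstdint>

  // General case: three 32-bit multiplies.
  uint64_t MulLongLow(uint32_t a_lo, uint32_t a_hi,
                      uint32_t b_lo, uint32_t b_hi) {
    uint64_t low = static_cast<uint64_t>(a_lo) * b_lo;  // 1L*2L (EDX:EAX)
    uint32_t cross = a_hi * b_lo + a_lo * b_hi;         // 1H*2L + 1L*2H, mod 2^32
    return low + (static_cast<uint64_t>(cross) << 32);
  }

  // Squaring: 1H*2L == 2H*1L, so double the one cross term instead of
  // computing both.
  uint64_t SquareLongLow(uint32_t lo, uint32_t hi) {
    uint64_t low = static_cast<uint64_t>(lo) * lo;      // 1L*1L
    uint32_t cross = hi * lo;                           // the one cross-term imul
    cross += cross;                                     // an add replaces the other imul
    return low + (static_cast<uint64_t>(cross) << 32);
  }

  int main() {
    uint64_t v = 0x123456789ABCDEF0ULL;
    uint32_t lo = static_cast<uint32_t>(v);
    uint32_t hi = static_cast<uint32_t>(v >> 32);
    // Unsigned wraparound mod 2^64 matches the low 64 bits of the
    // signed product, which is why imul is usable for Dalvik longs.
    assert(MulLongLow(lo, hi, lo, hi) == v * v);
    assert(SquareLongLow(lo, hi) == v * v);
    return 0;
  }

The is_square path below emits the SquareLongLow shape: a single
kX86Imul32RR/kX86Imul32RM for the cross term in ECX, then kX86Add32RR
doubles it.
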
Change-Id: I259a25699a8787badd943318e99bafdd06587ec6
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 1df9254..4edc3bc 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -978,6 +978,10 @@
   }
 
   // Nope. Do it the hard way
+  // Check for V*V. We can eliminate a multiply in that case, as 2L*1H == 2H*1L.
+  bool is_square = mir_graph_->SRegToVReg(rl_src1.s_reg_low) ==
+                   mir_graph_->SRegToVReg(rl_src2.s_reg_low);
+
   FlushAllRegs();
   LockCallTemps();  // Prepare for explicit register usage.
   rl_src1 = UpdateLocWide(rl_src1);
@@ -995,36 +999,52 @@
                  kWord, GetSRegHi(rl_src1.s_reg_low));
   }
 
-  // EAX <- 2H
-  if (src2_in_reg) {
-    NewLIR2(kX86Mov32RR, r0, rl_src2.high_reg);
-  } else {
-    LoadBaseDisp(rX86_SP, SRegOffset(rl_src2.s_reg_low) + HIWORD_OFFSET, r0,
-                 kWord, GetSRegHi(rl_src2.s_reg_low));
-  }
+  if (is_square) {
+    // Take advantage of the fact that the values are the same.
+    // ECX <- ECX * 2L (1H * 2L)
+    if (src2_in_reg) {
+      NewLIR2(kX86Imul32RR, r1, rl_src2.low_reg);
+    } else {
+      int displacement = SRegOffset(rl_src2.s_reg_low);
+      LIR *m = NewLIR3(kX86Imul32RM, r1, rX86_SP, displacement + LOWORD_OFFSET);
+      AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                              true /* is_load */, true /* is_64bit */);
+    }
 
-  // EAX <- EAX * 1L (2H * 1L)
-  if (src1_in_reg) {
-    NewLIR2(kX86Imul32RR, r0, rl_src1.low_reg);
+    // ECX <- 2*ECX (2H * 1L) + (1H * 2L)
+    NewLIR2(kX86Add32RR, r1, r1);
   } else {
-    int displacement = SRegOffset(rl_src1.s_reg_low);
-    LIR *m = NewLIR3(kX86Imul32RM, r0, rX86_SP, displacement + LOWORD_OFFSET);
-    AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
-                            true /* is_load */, true /* is_64bit */);
-  }
+    // EAX <- 2H
+    if (src2_in_reg) {
+      NewLIR2(kX86Mov32RR, r0, rl_src2.high_reg);
+    } else {
+      LoadBaseDisp(rX86_SP, SRegOffset(rl_src2.s_reg_low) + HIWORD_OFFSET, r0,
+                   kWord, GetSRegHi(rl_src2.s_reg_low));
+    }
 
-  // ECX <- ECX * 2L (1H * 2L)
-  if (src2_in_reg) {
-    NewLIR2(kX86Imul32RR, r1, rl_src2.low_reg);
-  } else {
-    int displacement = SRegOffset(rl_src2.s_reg_low);
-    LIR *m = NewLIR3(kX86Imul32RM, r1, rX86_SP, displacement + LOWORD_OFFSET);
-    AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
-                            true /* is_load */, true /* is_64bit */);
-  }
+    // EAX <- EAX * 1L (2H * 1L)
+    if (src1_in_reg) {
+      NewLIR2(kX86Imul32RR, r0, rl_src1.low_reg);
+    } else {
+      int displacement = SRegOffset(rl_src1.s_reg_low);
+      LIR *m = NewLIR3(kX86Imul32RM, r0, rX86_SP, displacement + LOWORD_OFFSET);
+      AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                              true /* is_load */, true /* is_64bit */);
+    }
 
-  // ECX <- ECX + EAX (2H * 1L) + (1H * 2L)
-  NewLIR2(kX86Add32RR, r1, r0);
+    // ECX <- ECX * 2L (1H * 2L)
+    if (src2_in_reg) {
+      NewLIR2(kX86Imul32RR, r1, rl_src2.low_reg);
+    } else {
+      int displacement = SRegOffset(rl_src2.s_reg_low);
+      LIR *m = NewLIR3(kX86Imul32RM, r1, rX86_SP, displacement + LOWORD_OFFSET);
+      AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                              true /* is_load */, true /* is_64bit */);
+    }
+
+    // ECX <- ECX + EAX (2H * 1L) + (1H * 2L)
+    NewLIR2(kX86Add32RR, r1, r0);
+  }
 
   // EAX <- 2L
   if (src2_in_reg) {