Add multiplication for integral types

This also fixes an issue where we could allocate a pair register even if
one of its parts was already blocked.

Change-Id: I4869175933409add2a56f1ccfb369c3d3dd3cb01
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 5bcc65b..2648d4d 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -713,6 +713,16 @@
       break;
     }
 
+    case Instruction::MUL_INT: {
+      Binop_23x<HMul>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::MUL_LONG: {
+      Binop_23x<HMul>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
     case Instruction::ADD_LONG_2ADDR: {
       Binop_12x<HAdd>(instruction, Primitive::kPrimLong);
       break;
@@ -738,6 +748,16 @@
       break;
     }
 
+    case Instruction::MUL_INT_2ADDR: {
+      Binop_12x<HMul>(instruction, Primitive::kPrimInt);
+      break;
+    }
+
+    case Instruction::MUL_LONG_2ADDR: {
+      Binop_12x<HMul>(instruction, Primitive::kPrimLong);
+      break;
+    }
+
     case Instruction::ADD_INT_LIT16: {
       Binop_22s<HAdd>(instruction, false);
       break;
@@ -748,6 +768,11 @@
       break;
     }
 
+    case Instruction::MUL_INT_LIT16: {
+      Binop_22s<HMul>(instruction, false);
+      break;
+    }
+
     case Instruction::ADD_INT_LIT8: {
       Binop_22b<HAdd>(instruction, false);
       break;
@@ -758,6 +783,11 @@
       break;
     }
 
+    case Instruction::MUL_INT_LIT8: {
+      Binop_22b<HMul>(instruction, false);
+      break;
+    }
+
     case Instruction::NEW_INSTANCE: {
       current_block_->AddInstruction(
           new (arena_) HNewInstance(dex_offset, instruction.VRegB_21c()));
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cdee845..a2cf670 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -236,19 +236,12 @@
       size_t reg = FindFreeEntry(blocked_register_pairs_, kNumberOfRegisterPairs);
       ArmManagedRegister pair =
           ArmManagedRegister::FromRegisterPair(static_cast<RegisterPair>(reg));
+      DCHECK(!blocked_core_registers_[pair.AsRegisterPairLow()]);
+      DCHECK(!blocked_core_registers_[pair.AsRegisterPairHigh()]);
+
       blocked_core_registers_[pair.AsRegisterPairLow()] = true;
       blocked_core_registers_[pair.AsRegisterPairHigh()] = true;
-       // Block all other register pairs that share a register with `pair`.
-      for (int i = 0; i < kNumberOfRegisterPairs; i++) {
-        ArmManagedRegister current =
-            ArmManagedRegister::FromRegisterPair(static_cast<RegisterPair>(i));
-        if (current.AsRegisterPairLow() == pair.AsRegisterPairLow()
-            || current.AsRegisterPairLow() == pair.AsRegisterPairHigh()
-            || current.AsRegisterPairHigh() == pair.AsRegisterPairLow()
-            || current.AsRegisterPairHigh() == pair.AsRegisterPairHigh()) {
-          blocked_register_pairs_[i] = true;
-        }
-      }
+      UpdateBlockedPairRegisters();
       return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
     }
 
@@ -294,7 +287,6 @@
 
   // Reserve R4 for suspend check.
   blocked_core_registers_[R4] = true;
-  blocked_register_pairs_[R4_R5] = true;
 
   // Reserve thread register.
   blocked_core_registers_[TR] = true;
@@ -318,6 +310,19 @@
   blocked_fpu_registers_[D13] = true;
   blocked_fpu_registers_[D14] = true;
   blocked_fpu_registers_[D15] = true;
+
+  UpdateBlockedPairRegisters();
+}
+
+void CodeGeneratorARM::UpdateBlockedPairRegisters() const {
+  for (int i = 0; i < kNumberOfRegisterPairs; i++) {
+    ArmManagedRegister current =
+        ArmManagedRegister::FromRegisterPair(static_cast<RegisterPair>(i));
+    if (blocked_core_registers_[current.AsRegisterPairLow()]
+        || blocked_core_registers_[current.AsRegisterPairHigh()]) {
+      blocked_register_pairs_[i] = true;
+    }
+  }
 }
 
 InstructionCodeGeneratorARM::InstructionCodeGeneratorARM(HGraph* graph, CodeGeneratorARM* codegen)
@@ -1139,6 +1144,82 @@
   }
 }
 
+void LocationsBuilderARM::VisitMul(HMul* mul) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(mul, LocationSummary::kNoCall);
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:  {
+      locations->SetInAt(0, Location::RequiresRegister(), Location::kDiesAtEntry);
+      locations->SetInAt(1, Location::RequiresRegister(), Location::kDiesAtEntry);
+      locations->SetOut(Location::RequiresRegister());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorARM::VisitMul(HMul* mul) {
+  LocationSummary* locations = mul->GetLocations();
+  Location out = locations->Out();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt: {
+      __ mul(out.As<Register>(), first.As<Register>(), second.As<Register>());
+      break;
+    }
+    case Primitive::kPrimLong: {
+      Register out_hi = out.AsRegisterPairHigh<Register>();
+      Register out_lo = out.AsRegisterPairLow<Register>();
+      Register in1_hi = first.AsRegisterPairHigh<Register>();
+      Register in1_lo = first.AsRegisterPairLow<Register>();
+      Register in2_hi = second.AsRegisterPairHigh<Register>();
+      Register in2_lo = second.AsRegisterPairLow<Register>();
+
+      // Extra checks to protect against problems caused by the existence of R1_R2.
+      // The algorithm is wrong if out.hi is either in1.lo or in2.lo:
+      // (e.g. in1=r0_r1, in2=r2_r3 and out=r1_r2);
+      DCHECK_NE(out_hi, in1_lo);
+      DCHECK_NE(out_hi, in2_lo);
+
+      // input: in1 - 64 bits, in2 - 64 bits
+      // output: out
+      // formula: out.hi : out.lo = (in1.lo * in2.hi + in1.hi * in2.lo)* 2^32 + in1.lo * in2.lo
+      // parts: out.hi = in1.lo * in2.hi + in1.hi * in2.lo + (in1.lo * in2.lo)[63:32]
+      // parts: out.lo = (in1.lo * in2.lo)[31:0]
+
+      // IP <- in1.lo * in2.hi
+      __ mul(IP, in1_lo, in2_hi);
+      // out.hi <- in1.lo * in2.hi + in1.hi * in2.lo
+      __ mla(out_hi, in1_hi, in2_lo, IP);
+      // out.lo <- (in1.lo * in2.lo)[31:0];
+      __ umull(out_lo, IP, in1_lo, in2_lo);
+      // out.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
+      __ add(out_hi, out_hi, ShifterOperand(IP));
+      break;
+    }
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
 void LocationsBuilderARM::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 7c063f1..57b289c 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -164,6 +164,7 @@
   }
 
   virtual void SetupBlockedRegisters() const OVERRIDE;
+
   virtual Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
   virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
@@ -171,6 +172,9 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
 
+  // Blocks all register pairs made out of blocked core registers.
+  void UpdateBlockedPairRegisters() const;
+
   ParallelMoveResolverARM* GetMoveResolver() {
     return &move_resolver_;
   }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 98d3ad4..041acdf 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -207,19 +207,11 @@
       size_t reg = FindFreeEntry(blocked_register_pairs_, kNumberOfRegisterPairs);
       X86ManagedRegister pair =
           X86ManagedRegister::FromRegisterPair(static_cast<RegisterPair>(reg));
+      DCHECK(!blocked_core_registers_[pair.AsRegisterPairLow()]);
+      DCHECK(!blocked_core_registers_[pair.AsRegisterPairHigh()]);
       blocked_core_registers_[pair.AsRegisterPairLow()] = true;
       blocked_core_registers_[pair.AsRegisterPairHigh()] = true;
-      // Block all other register pairs that share a register with `pair`.
-      for (int i = 0; i < kNumberOfRegisterPairs; i++) {
-        X86ManagedRegister current =
-            X86ManagedRegister::FromRegisterPair(static_cast<RegisterPair>(i));
-        if (current.AsRegisterPairLow() == pair.AsRegisterPairLow()
-            || current.AsRegisterPairLow() == pair.AsRegisterPairHigh()
-            || current.AsRegisterPairHigh() == pair.AsRegisterPairLow()
-            || current.AsRegisterPairHigh() == pair.AsRegisterPairHigh()) {
-          blocked_register_pairs_[i] = true;
-        }
-      }
+      UpdateBlockedPairRegisters();
       return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
     }
 
@@ -266,10 +258,19 @@
   blocked_core_registers_[EBP] = true;
   blocked_core_registers_[ESI] = true;
   blocked_core_registers_[EDI] = true;
-  blocked_register_pairs_[EAX_EDI] = true;
-  blocked_register_pairs_[EDX_EDI] = true;
-  blocked_register_pairs_[ECX_EDI] = true;
-  blocked_register_pairs_[EBX_EDI] = true;
+
+  UpdateBlockedPairRegisters();
+}
+
+void CodeGeneratorX86::UpdateBlockedPairRegisters() const {
+  for (int i = 0; i < kNumberOfRegisterPairs; i++) {
+    X86ManagedRegister current =
+        X86ManagedRegister::FromRegisterPair(static_cast<RegisterPair>(i));
+    if (blocked_core_registers_[current.AsRegisterPairLow()]
+        || blocked_core_registers_[current.AsRegisterPairHigh()]) {
+      blocked_register_pairs_[i] = true;
+    }
+  }
 }
 
 InstructionCodeGeneratorX86::InstructionCodeGeneratorX86(HGraph* graph, CodeGeneratorX86* codegen)
@@ -1118,6 +1119,113 @@
   }
 }
 
+void LocationsBuilderX86::VisitMul(HMul* mul) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(mul, LocationSummary::kNoCall);
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt:
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      // TODO: Currently this handles only stack operands:
+      // - we don't have enough registers because we currently use Quick ABI.
+      // - by the time we have a working register allocator we will probably change the ABI
+      // and fix the above.
+      // - we don't have a way yet to request operands on the stack but the baseline compiler
+      // will leave the operands on the stack with Any().
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::SameAsFirstInput());
+      // Needed for imul on 32bits with 64bits output.
+      locations->AddTemp(Location::RegisterLocation(EAX));
+      locations->AddTemp(Location::RegisterLocation(EDX));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorX86::VisitMul(HMul* mul) {
+  LocationSummary* locations = mul->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  DCHECK(first.Equals(locations->Out()));
+
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt: {
+      if (second.IsRegister()) {
+        __ imull(first.As<Register>(), second.As<Register>());
+      } else if (second.IsConstant()) {
+        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
+        __ imull(first.As<Register>(), imm);
+      } else {
+        DCHECK(second.IsStackSlot());
+        __ imull(first.As<Register>(), Address(ESP, second.GetStackIndex()));
+      }
+      break;
+    }
+
+    case Primitive::kPrimLong: {
+      DCHECK(second.IsDoubleStackSlot());
+
+      Register in1_hi = first.AsRegisterPairHigh<Register>();
+      Register in1_lo = first.AsRegisterPairLow<Register>();
+      Address in2_hi(ESP, second.GetHighStackIndex(kX86WordSize));
+      Address in2_lo(ESP, second.GetStackIndex());
+      Register eax = locations->GetTemp(0).As<Register>();
+      Register edx = locations->GetTemp(1).As<Register>();
+
+      DCHECK_EQ(EAX, eax);
+      DCHECK_EQ(EDX, edx);
+
+      // input: in1 - 64 bits, in2 - 64 bits
+      // output: in1
+      // formula: in1.hi : in1.lo = (in1.lo * in2.hi + in1.hi * in2.lo)* 2^32 + in1.lo * in2.lo
+      // parts: in1.hi = in1.lo * in2.hi + in1.hi * in2.lo + (in1.lo * in2.lo)[63:32]
+      // parts: in1.lo = (in1.lo * in2.lo)[31:0]
+
+      __ movl(eax, in2_hi);
+      // eax <- in1.lo * in2.hi
+      __ imull(eax, in1_lo);
+      // in1.hi <- in1.hi * in2.lo
+      __ imull(in1_hi, in2_lo);
+      // in1.hi <- in1.lo * in2.hi + in1.hi * in2.lo
+      __ addl(in1_hi, eax);
+      // move in1_lo to eax to prepare for double precision
+      __ movl(eax, in1_lo);
+      // edx:eax <- in1.lo * in2.lo
+      __ mull(in2_lo);
+      // in1.hi <- in2.hi * in1.lo +  in2.lo * in1.hi + (in1.lo * in2.lo)[63:32]
+      __ addl(in1_hi, edx);
+      // in1.lo <- (in1.lo * in2.lo)[31:0];
+      __ movl(in1_lo, eax);
+
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
 void LocationsBuilderX86::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index aa5fee0..db8b9ab 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -166,6 +166,7 @@
   }
 
   virtual void SetupBlockedRegisters() const OVERRIDE;
+
   virtual Location AllocateFreeRegister(Primitive::Type type) const OVERRIDE;
 
   virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
@@ -173,6 +174,9 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
 
+  // Blocks all register pairs made out of blocked core registers.
+  void UpdateBlockedPairRegisters() const;
+
   ParallelMoveResolverX86* GetMoveResolver() {
     return &move_resolver_;
   }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 059ff3f..5fa9305 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1080,6 +1080,70 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitMul(HMul* mul) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(mul, LocationSummary::kNoCall);
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::Any());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    }
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, Location::RequiresRegister());
+      locations->SetInAt(1, Location::RequiresRegister());
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
+void InstructionCodeGeneratorX86_64::VisitMul(HMul* mul) {
+  LocationSummary* locations = mul->GetLocations();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+  DCHECK(first.Equals(locations->Out()));
+  switch (mul->GetResultType()) {
+    case Primitive::kPrimInt: {
+      if (second.IsRegister()) {
+        __ imull(first.As<CpuRegister>(), second.As<CpuRegister>());
+      } else if (second.IsConstant()) {
+        Immediate imm(second.GetConstant()->AsIntConstant()->GetValue());
+        __ imull(first.As<CpuRegister>(), imm);
+      } else {
+        DCHECK(second.IsStackSlot());
+        __ imull(first.As<CpuRegister>(), Address(CpuRegister(RSP), second.GetStackIndex()));
+      }
+      break;
+    }
+    case Primitive::kPrimLong: {
+      __ imulq(first.As<CpuRegister>(), second.As<CpuRegister>());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected mul type " << mul->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented mul type " << mul->GetResultType();
+  }
+}
+
 void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 3037f1c..8bb12de 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -349,4 +349,49 @@
   RunCodeOptimized(graph, hook_before_codegen, true, 0);
 }
 
+#define MUL_TEST(TYPE, TEST_NAME)                     \
+  TEST(CodegenTest, Return ## TEST_NAME) {            \
+    const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(  \
+      Instruction::CONST_4 | 3 << 12 | 0,             \
+      Instruction::CONST_4 | 4 << 12 | 1 << 8,        \
+      Instruction::MUL_ ## TYPE, 1 << 8 | 0,          \
+      Instruction::RETURN);                           \
+                                                      \
+    TestCode(data, true, 12);                         \
+  }                                                   \
+                                                      \
+  TEST(CodegenTest, Return ## TEST_NAME ## 2addr) {   \
+    const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(  \
+      Instruction::CONST_4 | 3 << 12 | 0,             \
+      Instruction::CONST_4 | 4 << 12 | 1 << 8,        \
+      Instruction::MUL_ ## TYPE ## _2ADDR | 1 << 12,  \
+      Instruction::RETURN);                           \
+                                                      \
+    TestCode(data, true, 12);                         \
+  }
+
+MUL_TEST(INT, MulInt);
+MUL_TEST(LONG, MulLong);
+// MUL_TEST(FLOAT, Float);
+// MUL_TEST(DOUBLE, Double);
+
+TEST(CodegenTest, ReturnMulIntLit8) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 4 << 12 | 0 << 8,
+    Instruction::MUL_INT_LIT8, 3 << 8 | 0,
+    Instruction::RETURN);
+
+  TestCode(data, true, 12);
+}
+
+TEST(CodegenTest, ReturnMulIntLit16) {
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 4 << 12 | 0 << 8,
+    Instruction::MUL_INT_LIT16, 3,
+    Instruction::RETURN);
+
+  TestCode(data, true, 12);
+}
+
+
 }  // namespace art
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e60a7e6..ec26c4a 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -502,11 +502,12 @@
   M(NullCheck, Instruction)                                             \
   M(Temporary, Instruction)                                             \
   M(SuspendCheck, Instruction)                                          \
+  M(Mul, BinaryOperation)                                               \
 
 #define FOR_EACH_INSTRUCTION(M)                                         \
   FOR_EACH_CONCRETE_INSTRUCTION(M)                                      \
   M(Constant, Instruction)                                              \
-  M(BinaryOperation, Instruction) \
+  M(BinaryOperation, Instruction)                                       \
   M(Invoke, Instruction)
 
 #define FORWARD_DECLARATION(type, super) class H##type;
@@ -1556,6 +1557,22 @@
   DISALLOW_COPY_AND_ASSIGN(HSub);
 };
 
+class HMul : public HBinaryOperation {
+ public:
+  HMul(Primitive::Type result_type, HInstruction* left, HInstruction* right)
+      : HBinaryOperation(result_type, left, right) {}
+
+  virtual bool IsCommutative() { return true; }
+
+  virtual int32_t Evaluate(int32_t x, int32_t y) const { return x * y; }
+  virtual int64_t Evaluate(int64_t x, int64_t y) const { return x * y; }
+
+  DECLARE_INSTRUCTION(Mul);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HMul);
+};
+
 // The value of a parameter in this method. Its location depends on
 // the calling convention.
 class HParameterValue : public HExpression<0> {
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 75823e3..db7151c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1238,6 +1238,34 @@
 }
 
 
+void X86_64Assembler::imulq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(dst, src);
+  EmitUint8(0x0F);
+  EmitUint8(0xAF);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+
+void X86_64Assembler::imulq(CpuRegister reg, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  CHECK(imm.is_int32());  // imulq only supports 32b immediate.
+  EmitRex64(reg);
+  EmitUint8(0x69);
+  EmitOperand(reg.LowBits(), Operand(reg));
+  EmitImmediate(imm);
+}
+
+
+void X86_64Assembler::imulq(CpuRegister reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(reg, address);
+  EmitUint8(0x0F);
+  EmitUint8(0xAF);
+  EmitOperand(reg.LowBits(), address);
+}
+
+
 void X86_64Assembler::imull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg);
@@ -1270,7 +1298,6 @@
 }
 
 
-
 void X86_64Assembler::shll(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(false, 4, reg, imm);
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 1d9eba4..4ffb6b5 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -436,6 +436,10 @@
   void imull(CpuRegister reg, const Immediate& imm);
   void imull(CpuRegister reg, const Address& address);
 
+  void imulq(CpuRegister dst, CpuRegister src);
+  void imulq(CpuRegister reg, const Immediate& imm);
+  void imulq(CpuRegister reg, const Address& address);
+
   void imull(CpuRegister reg);
   void imull(const Address& address);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 7a48b63..69a5fa0 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -112,6 +112,9 @@
   DriverStr(RepeatRI(&x86_64::X86_64Assembler::addq, 4U, "addq ${imm}, %{reg}"), "addqi");
 }
 
+TEST_F(AssemblerX86_64Test, ImulqRegs) {
+  DriverStr(RepeatRR(&x86_64::X86_64Assembler::imulq, "imulq %{reg2}, %{reg1}"), "imulq");
+}
 
 TEST_F(AssemblerX86_64Test, SubqRegs) {
   DriverStr(RepeatRR(&x86_64::X86_64Assembler::subq, "subq %{reg2}, %{reg1}"), "subq");
diff --git a/test/411-optimizing-arith/expected.txt b/test/411-optimizing-arith/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/411-optimizing-arith/expected.txt
diff --git a/test/411-optimizing-arith/info.txt b/test/411-optimizing-arith/info.txt
new file mode 100644
index 0000000..1015551
--- /dev/null
+++ b/test/411-optimizing-arith/info.txt
@@ -0,0 +1 @@
+Tests for basic arithmetic operations.
diff --git a/test/411-optimizing-arith/src/Main.java b/test/411-optimizing-arith/src/Main.java
new file mode 100644
index 0000000..74c47a6
--- /dev/null
+++ b/test/411-optimizing-arith/src/Main.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Note that $opt$ is a marker for the optimizing compiler to ensure
+// it does compile the method.
+
+public class Main {
+
+  public static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void main(String[] args) {
+    mul();
+  }
+
+  public static void mul() {
+    expectEquals(15, $opt$Mul(5, 3));
+    expectEquals(0, $opt$Mul(0, 3));
+    expectEquals(0, $opt$Mul(3, 0));
+    expectEquals(-3, $opt$Mul(1, -3));
+    expectEquals(36, $opt$Mul(-12, -3));
+    expectEquals(33, $opt$Mul(1, 3) * 11);
+    expectEquals(671088645, $opt$Mul(134217729, 5)); // (2^27 + 1) * 5
+
+    expectEquals(15L, $opt$Mul(5L, 3L));
+    expectEquals(0L, $opt$Mul(0L, 3L));
+    expectEquals(0L, $opt$Mul(3L, 0L));
+    expectEquals(-3L, $opt$Mul(1L, -3L));
+    expectEquals(36L, $opt$Mul(-12L, -3L));
+    expectEquals(33L, $opt$Mul(1L, 3L) * 11);
+    expectEquals(240518168583L, $opt$Mul(34359738369L, 7L)); // (2^35 + 1) * 7
+  }
+
+  static int $opt$Mul(int a, int b) {
+    return a * b;
+  }
+
+  static long $opt$Mul(long a, long b) {
+    return a * b;
+  }
+
+}