Optimize more easy multiplications by constants.

Rather than make these changes in the libraries (*10 being a common case),
let's do them once and for all in the JIT.
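
As a sketch of the strength reduction this enables (illustrative C with
made-up names, not the code the JIT emits): 10 is binary 1010, so only two
bits are set and the multiply becomes a shift-add plus a final shift:

    /* x * 10 == x * (8 + 2) == (x + (x << 2)) << 1 */
    static int timesTen(int x) { return (x + (x << 2)) << 1; }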

The 2^n-1 case could be better if we generated RSB instructions, but the
current "fake" RSB is still better than a full multiply.
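
For example (again a C sketch rather than the emitted code), a multiply by 7
becomes a shift followed by a subtract:

    /* x * 7 == (x << 3) - x; a real RSB would fold this into one
       instruction: rsb rDest, rSrc, rSrc, lsl #3 (register names made up). */
    static int timesSeven(int x) { return (x << 3) - x; }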

Thumb doesn't support reg/reg/reg/shift instructions, so we can't optimize
the "population count <= 2" cases (such as *10) there.
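
Roughly, the two genMultiplyByTwoBitMultiplier helpers added below behave
like this (a C sketch using the helpers' parameter names; 'src' and 'result'
are made up):

    /* Thumb2: one add with a shifted operand, then an optional shift. */
    result = (src + (src << (secondBit - firstBit))) << firstBit;

    /* Thumb: no shifted-register add, so fall back to a real multiply. */
    result = src * lit;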

Tested on sholes, passion, and passion-running-sapphire (and visually
inspected the generated code to check we weren't trying to generate Thumb2
instructions there). Also tested with the self-verifier.
diff --git a/vm/Android.mk b/vm/Android.mk
index 8e41da7..04e0910 100644
--- a/vm/Android.mk
+++ b/vm/Android.mk
@@ -56,7 +56,7 @@
     # Enable assert and JIT tuning
     include $(LOCAL_PATH)/ReconfigureDvm.mk
 
-    # Enable asserion and JIT-tuning
+    # Enable assertions and JIT-tuning
     LOCAL_CFLAGS += -UNDEBUG -DDEBUG=1 -DLOG_NDEBUG=1 -DWITH_DALVIK_ASSERT \
 				    -DWITH_JIT_TUNING -DEXIT_STATS
     LOCAL_MODULE := libdvm_assert
@@ -66,7 +66,7 @@
     # Enable assert and self-verification
     include $(LOCAL_PATH)/ReconfigureDvm.mk
 
-    # Enable asserion and JIT self-verification
+    # Enable assertions and JIT self-verification
     LOCAL_CFLAGS += -UNDEBUG -DDEBUG=1 -DLOG_NDEBUG=1 -DWITH_DALVIK_ASSERT \
 					-DWITH_SELF_VERIFICATION
     LOCAL_MODULE := libdvm_sv
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index b329aed..7d26221 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -706,7 +706,7 @@
 
 /*
  * Each instance of this struct holds a pseudo or real LIR instruction:
- * - pesudo ones (eg labels and marks) and will be discarded by the assembler.
+ * - pseudo ones (eg labels and marks) and will be discarded by the assembler.
  * - real ones will be assembled into Thumb instructions.
  *
  * Machine resources are encoded into a 64-bit vector, where the encodings are
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 09783f2..7f8ad9e 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -36,7 +36,7 @@
  * s2e: src2 end bit position
  * operands: number of operands (for sanity check purposes)
  * name: mnemonic name
- * fmt: for pretty-prining
+ * fmt: for pretty-printing
  */
 #define ENCODING_MAP(opcode, skeleton, k0, ds, de, k1, s1s, s1e, k2, s2s, s2e, \
                      k3, k3s, k3e, flags, name, fmt, size) \
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 4048c0a..f117de8 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -1844,20 +1844,75 @@
     return false;
 }
 
-/* Positive power of 2?  Return shiftCount if so, -1 if not */
-static int mulToShift(int val) {
-    int shiftCount;
-    if (val > 0 && ((val & (val - 1)) == 0)) {
-        shiftCount = 0;
-        val = ~val & (val - 1);
-        while (val) {
-            shiftCount++;
-            val >>= 1;
-        }
-    } else {
-        shiftCount = -1;
+static bool isPowerOfTwo(int x)
+{
+    return (x & (x - 1)) == 0;
+}
+
+// Returns true if no more than two bits are set in 'x'.
+static bool isPopCountLE2(unsigned int x)
+{
+    x &= x - 1;
+    return (x & (x - 1)) == 0;
+}
+
+// Returns the index of the lowest set bit in 'x'.
+static int lowestSetBit(unsigned int x) {
+    int bit_posn = 0;
+    while ((x & 0xf) == 0) {
+        bit_posn += 4;
+        x >>= 4;
     }
-    return shiftCount;
+    while ((x & 1) == 0) {
+        bit_posn++;
+        x >>= 1;
+    }
+    return bit_posn;
+}
+
+// Returns true if it added instructions to 'cUnit' to multiply 'rlSrc' by 'lit'
+// and store the result in 'rlDest'.
+static bool handleEasyMultiply(CompilationUnit *cUnit,
+                               RegLocation rlSrc, RegLocation rlDest, int lit)
+{
+    // Can we simplify this multiplication?
+    bool powerOfTwo = false;
+    bool popCountLE2 = false;
+    bool powerOfTwoMinusOne = false;
+    if (lit < 2) {
+        // Avoid special cases.
+        return false;
+    } else if (isPowerOfTwo(lit)) {
+        powerOfTwo = true;
+    } else if (isPopCountLE2(lit)) {
+        popCountLE2 = true;
+    } else if (isPowerOfTwo(lit + 1)) {
+        powerOfTwoMinusOne = true;
+    } else {
+        return false;
+    }
+    rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+    RegLocation rlResult = dvmCompilerEvalLoc(cUnit, rlDest, kCoreReg, true);
+    if (powerOfTwo) {
+        // Shift.
+        opRegRegImm(cUnit, kOpLsl, rlResult.lowReg, rlSrc.lowReg,
+                    lowestSetBit(lit));
+    } else if (popCountLE2) {
+        // Shift and add and shift.
+        int firstBit = lowestSetBit(lit);
+        int secondBit = lowestSetBit(lit ^ (1 << firstBit));
+        genMultiplyByTwoBitMultiplier(cUnit, rlSrc, rlResult, lit,
+                                      firstBit, secondBit);
+    } else {
+        // Reverse subtract: (src << (shift + 1)) - src.
+        assert(powerOfTwoMinusOne);
+        // TODO: rsb dst, src, src lsl#lowestSetBit(lit + 1)
+        int tReg = dvmCompilerAllocTemp(cUnit);
+        opRegRegImm(cUnit, kOpLsl, tReg, rlSrc.lowReg, lowestSetBit(lit + 1));
+        opRegRegReg(cUnit, kOpSub, rlResult.lowReg, tReg, rlSrc.lowReg);
+    }
+    storeValue(cUnit, rlDest, rlResult);
+    return true;
 }
 
 static bool handleFmt22b_Fmt22s(CompilationUnit *cUnit, MIR *mir)
@@ -1896,15 +1951,10 @@
             break;
         case OP_MUL_INT_LIT8:
         case OP_MUL_INT_LIT16: {
-            // TUNING: General shift & add for small constants
-            int shiftCount = mulToShift(lit);
-            if (shiftCount >= 0) {
-                op = kOpLsl;
-                lit = shiftCount;
-                shiftOp = true;
-            } else {
-                op = kOpMul;
+            if (handleEasyMultiply(cUnit, rlSrc, rlDest, lit)) {
+                return false;
             }
+            op = kOpMul;
             break;
         }
         case OP_AND_INT_LIT8:
diff --git a/vm/compiler/codegen/arm/Thumb/Gen.c b/vm/compiler/codegen/arm/Thumb/Gen.c
index a274e19..35c4451 100644
--- a/vm/compiler/codegen/arm/Thumb/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb/Gen.c
@@ -289,3 +289,12 @@
     dvmCompilerClobber(cUnit, ophi);
     return false;
 }
+
+static void genMultiplyByTwoBitMultiplier(CompilationUnit *cUnit,
+        RegLocation rlSrc, RegLocation rlResult, int lit,
+        int firstBit, int secondBit)
+{
+    // We can't implement "add src, src, src, lsl#shift" on Thumb, so we have
+    // to do a regular multiply.
+    opRegRegImm(cUnit, kOpMul, rlResult.lowReg, rlSrc.lowReg, lit);
+}
diff --git a/vm/compiler/codegen/arm/Thumb2/Gen.c b/vm/compiler/codegen/arm/Thumb2/Gen.c
index 1390d64..dbce452 100644
--- a/vm/compiler/codegen/arm/Thumb2/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb2/Gen.c
@@ -442,3 +442,14 @@
     storeValueWide(cUnit, rlDest, rlResult);
     return false;
 }
+
+static void genMultiplyByTwoBitMultiplier(CompilationUnit *cUnit,
+        RegLocation rlSrc, RegLocation rlResult, int lit,
+        int firstBit, int secondBit)
+{
+    opRegRegRegShift(cUnit, kOpAdd, rlResult.lowReg, rlSrc.lowReg, rlSrc.lowReg,
+                     encodeShift(kArmLsl, secondBit - firstBit));
+    if (firstBit != 0) {
+        opRegRegImm(cUnit, kOpLsl, rlResult.lowReg, rlResult.lowReg, firstBit);
+    }
+}