Tiny optimizations for compiler templates for ARM.

1. Remove a possible load-use pipeline bubble in TEMPLATE_STRING_INDEXOF.S
2. Remove 1 instruction and reorder the opcodes in
   TEMPLATE_MUL_LONG.S
3. Hoist the ldr r2 instruction earlier in TEMPLATE_RETURN.S

(cherry-pick of a2dc68acd954827cdc67929a859354e5ed9b5713.)

Change-Id: I78b9797aff3c2255c5d34a8391b1a94a1b09b613
diff --git a/vm/compiler/template/armv5te/TEMPLATE_MUL_LONG.S b/vm/compiler/template/armv5te/TEMPLATE_MUL_LONG.S
index 8a9b115..6652b71 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_MUL_LONG.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_MUL_LONG.S
@@ -22,7 +22,6 @@
     mul     ip, r2, r1                  @  ip<- ZxW
     umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
     mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
-    mov     r0,r9
-    mov     r1,r10
+    mov     r0, r9
+    add     r1, r2, r10                 @  r1<- r10 + low(ZxW + (YxX))
     bx      lr
diff --git a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
index b10afcf..e8e2d52 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
@@ -17,12 +17,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_STRING_INDEXOF.S b/vm/compiler/template/armv5te/TEMPLATE_STRING_INDEXOF.S
index bdfdf28..d970372 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_STRING_INDEXOF.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_STRING_INDEXOF.S
@@ -15,22 +15,23 @@
      *    r2:   Starting offset in string data
      */
 
+    ldr    r3, [r0, #STRING_FIELDOFF_VALUE]
     ldr    r7, [r0, #STRING_FIELDOFF_OFFSET]
     ldr    r8, [r0, #STRING_FIELDOFF_COUNT]
-    ldr    r0, [r0, #STRING_FIELDOFF_VALUE]
+
 
     /*
      * At this point, we have:
-     *    r0: object pointer
      *    r1: char to match
      *    r2: starting offset
+     *    r3: object pointer (final result -> r0)
      *    r7: offset
      *    r8: string length
      */
 
      /* Build pointer to start of string data */
-     add   r0, #16
-     add   r0, r0, r7, lsl #1
+     add   r3, #16
+     add   r0, r3, r7, lsl #1
 
      /* Save a copy of starting data in r7 */
      mov   r7, r0
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index 27319e7..7ba1596 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -178,12 +178,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
@@ -520,9 +520,8 @@
     mul     ip, r2, r1                  @  ip<- ZxW
     umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
     mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
-    mov     r0,r9
-    mov     r1,r10
+    mov     r0, r9
+    add     r1, r2, r10                 @  r1<- r10 + low(ZxW + (YxX))
     bx      lr
 
 /* ------------------------------ */
@@ -1265,22 +1264,23 @@
      *    r2:   Starting offset in string data
      */
 
+    ldr    r3, [r0, #STRING_FIELDOFF_VALUE]
     ldr    r7, [r0, #STRING_FIELDOFF_OFFSET]
     ldr    r8, [r0, #STRING_FIELDOFF_COUNT]
-    ldr    r0, [r0, #STRING_FIELDOFF_VALUE]
+
 
     /*
      * At this point, we have:
-     *    r0: object pointer
      *    r1: char to match
      *    r2: starting offset
+     *    r3: object pointer (final result -> r0)
      *    r7: offset
      *    r8: string length
      */
 
      /* Build pointer to start of string data */
-     add   r0, #16
-     add   r0, r0, r7, lsl #1
+     add   r3, #16
+     add   r0, r3, r7, lsl #1
 
      /* Save a copy of starting data in r7 */
      mov   r7, r0
@@ -1516,12 +1516,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index 68f6441..7d67595 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -178,12 +178,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
@@ -732,9 +732,8 @@
     mul     ip, r2, r1                  @  ip<- ZxW
     umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
     mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
-    mov     r0,r9
-    mov     r1,r10
+    mov     r0, r9
+    add     r1, r2, r10                 @  r1<- r10 + low(ZxW + (YxX))
     bx      lr
 
 /* ------------------------------ */
@@ -996,22 +995,23 @@
      *    r2:   Starting offset in string data
      */
 
+    ldr    r3, [r0, #STRING_FIELDOFF_VALUE]
     ldr    r7, [r0, #STRING_FIELDOFF_OFFSET]
     ldr    r8, [r0, #STRING_FIELDOFF_COUNT]
-    ldr    r0, [r0, #STRING_FIELDOFF_VALUE]
+
 
     /*
      * At this point, we have:
-     *    r0: object pointer
      *    r1: char to match
      *    r2: starting offset
+     *    r3: object pointer (final result -> r0)
      *    r7: offset
      *    r8: string length
      */
 
      /* Build pointer to start of string data */
-     add   r0, #16
-     add   r0, r0, r7, lsl #1
+     add   r3, #16
+     add   r0, r3, r7, lsl #1
 
      /* Save a copy of starting data in r7 */
      mov   r7, r0
@@ -1247,12 +1247,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index 7573bd8..0dfdd87 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -178,12 +178,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
@@ -520,9 +520,8 @@
     mul     ip, r2, r1                  @  ip<- ZxW
     umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
     mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
-    mov     r0,r9
-    mov     r1,r10
+    mov     r0, r9
+    add     r1, r2, r10                 @  r1<- r10 + low(ZxW + (YxX))
     bx      lr
 
 /* ------------------------------ */
@@ -1265,22 +1264,23 @@
      *    r2:   Starting offset in string data
      */
 
+    ldr    r3, [r0, #STRING_FIELDOFF_VALUE]
     ldr    r7, [r0, #STRING_FIELDOFF_OFFSET]
     ldr    r8, [r0, #STRING_FIELDOFF_COUNT]
-    ldr    r0, [r0, #STRING_FIELDOFF_VALUE]
+
 
     /*
      * At this point, we have:
-     *    r0: object pointer
      *    r1: char to match
      *    r2: starting offset
+     *    r3: object pointer (final result -> r0)
      *    r7: offset
      *    r8: string length
      */
 
      /* Build pointer to start of string data */
-     add   r0, #16
-     add   r0, r0, r7, lsl #1
+     add   r3, #16
+     add   r0, r3, r7, lsl #1
 
      /* Save a copy of starting data in r7 */
      mov   r7, r0
@@ -1516,12 +1516,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index fd21a0e..7a4fa2c 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -178,12 +178,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)
@@ -520,9 +520,8 @@
     mul     ip, r2, r1                  @  ip<- ZxW
     umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
     mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
-    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
-    mov     r0,r9
-    mov     r1,r10
+    mov     r0, r9
+    add     r1, r2, r10                 @  r1<- r10 + low(ZxW + (YxX))
     bx      lr
 
 /* ------------------------------ */
@@ -1265,22 +1264,23 @@
      *    r2:   Starting offset in string data
      */
 
+    ldr    r3, [r0, #STRING_FIELDOFF_VALUE]
     ldr    r7, [r0, #STRING_FIELDOFF_OFFSET]
     ldr    r8, [r0, #STRING_FIELDOFF_COUNT]
-    ldr    r0, [r0, #STRING_FIELDOFF_VALUE]
+
 
     /*
      * At this point, we have:
-     *    r0: object pointer
      *    r1: char to match
      *    r2: starting offset
+     *    r3: object pointer (final result -> r0)
      *    r7: offset
      *    r8: string length
      */
 
      /* Build pointer to start of string data */
-     add   r0, #16
-     add   r0, r0, r7, lsl #1
+     add   r3, #16
+     add   r0, r3, r7, lsl #1
 
      /* Save a copy of starting data in r7 */
      mov   r7, r0
@@ -1516,12 +1516,12 @@
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldrb    r8, [rSELF, #offThread_breakFlags] @ r8<- breakFlags
     ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
 #if !defined(WITH_SELF_VERIFICATION)
     ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
 #else
     mov     r9, #0                      @ disable chaining
 #endif
-    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
                                         @ r2<- method we're returning to
     cmp     r2, #0                      @ break frame?
 #if !defined(WITH_SELF_VERIFICATION)