Reorder the taken and fallthrough blocks and remove dead branches.

In the following example the order of the two chaining cells is reversed and
the branch to 0x42862ce4 is eliminated.

Before:

D/dalvikvm( 1550): 0x42862cd6 (000a): bge     0x42862cdc
D/dalvikvm( 1550): 0x42862cd8 (000c): b       0x42862ce4
D/dalvikvm( 1550): Exception_Handling:
D/dalvikvm( 1550): 0x42862cda (000e): .align4
D/dalvikvm( 1550): -------- chaining cell (normal): 0x0049
D/dalvikvm( 1550): 0x42862cdc (0010): ldr     r0, [r6, #76]
D/dalvikvm( 1550): 0x42862cde (0012): blx     r0
D/dalvikvm( 1550): 0x42862ce0 (0014): data    0x1512(5394)
D/dalvikvm( 1550): 0x42862ce2 (0016): data    0x42a7(17063)
D/dalvikvm( 1550): 0x42862ce4 (0018): .align4
D/dalvikvm( 1550): -------- chaining cell (normal): 0x001d
D/dalvikvm( 1550): 0x42862ce4 (0018): ldr     r0, [r6, #76]
D/dalvikvm( 1550): 0x42862ce6 (001a): blx     r0
D/dalvikvm( 1550): 0x42862ce8 (001c): data    0x14ba(5306)
D/dalvikvm( 1550): 0x42862cea (001e): data    0x42a7(17063)

After:

D/dalvikvm(  367): 0x42865c92 (000a): bge     0x42865c9c
D/dalvikvm(  367): Exception_Handling:
D/dalvikvm(  367): 0x42865c94 (000c): .align4
D/dalvikvm(  367): -------- chaining cell (normal): 0x001d
D/dalvikvm(  367): 0x42865c94 (000c): ldr     r0, [r6, #76]
D/dalvikvm(  367): 0x42865c96 (000e): blx     r0
D/dalvikvm(  367): 0x42865c98 (0010): data    0x44ba(17594)
D/dalvikvm(  367): 0x42865c9a (0012): data    0x42a7(17063)
D/dalvikvm(  367): 0x42865c9c (0014): .align4
D/dalvikvm(  367): -------- chaining cell (normal): 0x0049
D/dalvikvm(  367): 0x42865c9c (0014): ldr     r0, [r6, #76]
D/dalvikvm(  367): 0x42865c9e (0016): blx     r0
D/dalvikvm(  367): 0x42865ca0 (0018): data    0x4512(17682)
D/dalvikvm(  367): 0x42865ca2 (001a): data    0x42a7(17063)
diff --git a/vm/compiler/Frontend.c b/vm/compiler/Frontend.c
index ec3a98d..c4105f3 100644
--- a/vm/compiler/Frontend.c
+++ b/vm/compiler/Frontend.c
@@ -476,6 +476,24 @@
             cUnit.hasLoop = true;
         }
 
+        /* Fallthrough block not included in the trace */
+        if (!isUnconditionalBranch(lastInsn) && curBB->fallThrough == NULL) {
+            /*
+             * If the chaining cell is after an invoke or
+             * instruction that cannot change the control flow, request a hot
+             * chaining cell.
+             */
+            if (isInvoke || curBB->needFallThroughBranch) {
+                lastBB->next = dvmCompilerNewBB(CHAINING_CELL_HOT);
+            } else {
+                lastBB->next = dvmCompilerNewBB(CHAINING_CELL_NORMAL);
+            }
+            lastBB = lastBB->next;
+            lastBB->id = numBlocks++;
+            lastBB->startOffset = fallThroughOffset;
+            curBB->fallThrough = lastBB;
+        }
+
         /* Target block not included in the trace */
         if (curBB->taken == NULL &&
             (isInvoke || (targetOffset != curOffset))) {
@@ -516,24 +534,6 @@
             lastBB->next = newBB;
             lastBB = newBB;
         }
-
-        /* Fallthrough block not included in the trace */
-        if (!isUnconditionalBranch(lastInsn) && curBB->fallThrough == NULL) {
-            /*
-             * If the chaining cell is after an invoke or
-             * instruction that cannot change the control flow, request a hot
-             * chaining cell.
-             */
-            if (isInvoke || curBB->needFallThroughBranch) {
-                lastBB->next = dvmCompilerNewBB(CHAINING_CELL_HOT);
-            } else {
-                lastBB->next = dvmCompilerNewBB(CHAINING_CELL_NORMAL);
-            }
-            lastBB = lastBB->next;
-            lastBB->id = numBlocks++;
-            lastBB->startOffset = fallThroughOffset;
-            curBB->fallThrough = lastBB;
-        }
     }
 
     /* Now create a special block to host PC reconstruction code */
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index f391288..b140457 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -742,7 +742,12 @@
 };
 
 
-#define PADDING_MOV_R0_R0               0x1C00
+/*
+ * The fake NOP of moving r0 to r0 actually will incur data stalls if r0 is
+ * not ready. Since r5 (rFP) is not updated often, it is less likely to
+ * generate unnecessary stall cycles.
+ */
+#define PADDING_MOV_R5_R5               0x1C2D
 
 /* Write the numbers in the literal pool to the codegen stream */
 static void installDataContent(CompilationUnit *cUnit)
@@ -777,7 +782,7 @@
             if ((lir->opCode == ARM_PSEUDO_ALIGN4) &&
                 /* 1 means padding is needed */
                 (lir->operands[0] == 1)) {
-                *bufferAddr++ = PADDING_MOV_R0_R0;
+                *bufferAddr++ = PADDING_MOV_R5_R5;
             }
             continue;
         }
diff --git a/vm/compiler/codegen/arm/GlobalOptimizations.c b/vm/compiler/codegen/arm/GlobalOptimizations.c
index 40e1f07..687b7d2 100644
--- a/vm/compiler/codegen/arm/GlobalOptimizations.c
+++ b/vm/compiler/codegen/arm/GlobalOptimizations.c
@@ -48,8 +48,7 @@
                 /*
                  * Found real useful stuff between the branch and the target
                  */
-                if (!isPseudoOpCode(nextLIR->opCode) ||
-                    nextLIR->opCode == ARM_PSEUDO_ALIGN4)
+                if (!isPseudoOpCode(nextLIR->opCode))
                     break;
             }
         }