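Improve interleave on VFP compare.

In the cmpl/cmpg float and double handlers, compute both operand
addresses (&vBB, &vCC) before issuing the two VFP loads instead of
alternating address computation and load.  Also branch to each handler's
_finish label with "b" rather than "bl".

Improves FloatOps/FloatOpsD from 229/230ns to 228/229ns, an exciting
half-percentage-point gain.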
diff --git a/vm/mterp/arm-vfp/OP_CMPG_DOUBLE.S b/vm/mterp/arm-vfp/OP_CMPG_DOUBLE.S
index 1a377cf..a8c3ea4 100644
--- a/vm/mterp/arm-vfp/OP_CMPG_DOUBLE.S
+++ b/vm/mterp/arm-vfp/OP_CMPG_DOUBLE.S
@@ -20,13 +20,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    fldd    d0, [r2]                    @ d0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    fldd    d1, [r3]                    @ d1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    fldd    d0, [r2]                    @ d0<- vBB
+    fldd    d1, [r3]                    @ d1<- vCC
     fcmped  d0, d1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mov     r0, #1                      @ r0<- 1 (default)
@@ -34,7 +34,7 @@
     fmstat                              @ export status flags
     mvnmi   r0, #0                      @ (less than) r1<- -1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .L${opcode}_finish          @ argh
+    b       .L${opcode}_finish          @ argh
 
 %break
 .L${opcode}_finish:
diff --git a/vm/mterp/arm-vfp/OP_CMPG_FLOAT.S b/vm/mterp/arm-vfp/OP_CMPG_FLOAT.S
index 4dc142d..4c14fbb 100644
--- a/vm/mterp/arm-vfp/OP_CMPG_FLOAT.S
+++ b/vm/mterp/arm-vfp/OP_CMPG_FLOAT.S
@@ -20,13 +20,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    flds    s0, [r2]                    @ s0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    flds    s1, [r3]                    @ s1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    flds    s0, [r2]                    @ s0<- vBB
+    flds    s1, [r3]                    @ s1<- vCC
     fcmpes  s0, s1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mov     r0, #1                      @ r0<- 1 (default)
@@ -34,7 +34,7 @@
     fmstat                              @ export status flags
     mvnmi   r0, #0                      @ (less than) r1<- -1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .L${opcode}_finish          @ argh
+    b       .L${opcode}_finish          @ argh
 
 %break
 .L${opcode}_finish:
diff --git a/vm/mterp/arm-vfp/OP_CMPL_DOUBLE.S b/vm/mterp/arm-vfp/OP_CMPL_DOUBLE.S
index 94d9102..999faee 100644
--- a/vm/mterp/arm-vfp/OP_CMPL_DOUBLE.S
+++ b/vm/mterp/arm-vfp/OP_CMPL_DOUBLE.S
@@ -20,13 +20,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    fldd    d0, [r2]                    @ d0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    fldd    d1, [r3]                    @ d1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    fldd    d0, [r2]                    @ d0<- vBB
+    fldd    d1, [r3]                    @ d1<- vCC
     fcmped  d0, d1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mvn     r0, #0                      @ r0<- -1 (default)
@@ -34,7 +34,7 @@
     fmstat                              @ export status flags
     movgt   r0, #1                      @ (greater than) r1<- 1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .L${opcode}_finish          @ argh
+    b       .L${opcode}_finish          @ argh
 
 %break
 .L${opcode}_finish:
diff --git a/vm/mterp/arm-vfp/OP_CMPL_FLOAT.S b/vm/mterp/arm-vfp/OP_CMPL_FLOAT.S
index 4e72e13..9b2133c 100644
--- a/vm/mterp/arm-vfp/OP_CMPL_FLOAT.S
+++ b/vm/mterp/arm-vfp/OP_CMPL_FLOAT.S
@@ -20,13 +20,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    flds    s0, [r2]                    @ s0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    flds    s1, [r3]                    @ s1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    flds    s0, [r2]                    @ s0<- vBB
+    flds    s1, [r3]                    @ s1<- vCC
     fcmpes  s0, s1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mvn     r0, #0                      @ r0<- -1 (default)
@@ -34,7 +34,7 @@
     fmstat                              @ export status flags
     movgt   r0, #1                      @ (greater than) r1<- 1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .L${opcode}_finish          @ argh
+    b       .L${opcode}_finish          @ argh
 
 %break
 .L${opcode}_finish:
diff --git a/vm/mterp/arm-vfp/README.txt b/vm/mterp/arm-vfp/README.txt
new file mode 100644
index 0000000..5201bbe
--- /dev/null
+++ b/vm/mterp/arm-vfp/README.txt
@@ -0,0 +1,5 @@
+Instruction handlers that take advantage of ARM VFP.  These work with VFP
+v2 and v3 (VFPLite).
+
+The ARM code driving the floating-point calculations will run on ARMv5TE
+and later.
diff --git a/vm/mterp/out/InterpAsm-armv5te-vfp.S b/vm/mterp/out/InterpAsm-armv5te-vfp.S
index 7412f29..aa7e958 100644
--- a/vm/mterp/out/InterpAsm-armv5te-vfp.S
+++ b/vm/mterp/out/InterpAsm-armv5te-vfp.S
@@ -1321,13 +1321,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    flds    s0, [r2]                    @ s0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    flds    s1, [r3]                    @ s1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    flds    s0, [r2]                    @ s0<- vBB
+    flds    s1, [r3]                    @ s1<- vCC
     fcmpes  s0, s1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mvn     r0, #0                      @ r0<- -1 (default)
@@ -1335,7 +1335,7 @@
     fmstat                              @ export status flags
     movgt   r0, #1                      @ (greater than) r1<- 1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .LOP_CMPL_FLOAT_finish          @ argh
+    b       .LOP_CMPL_FLOAT_finish          @ argh
 
 
 /* ------------------------------ */
@@ -1360,13 +1360,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    flds    s0, [r2]                    @ s0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    flds    s1, [r3]                    @ s1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    flds    s0, [r2]                    @ s0<- vBB
+    flds    s1, [r3]                    @ s1<- vCC
     fcmpes  s0, s1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mov     r0, #1                      @ r0<- 1 (default)
@@ -1374,7 +1374,7 @@
     fmstat                              @ export status flags
     mvnmi   r0, #0                      @ (less than) r1<- -1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .LOP_CMPG_FLOAT_finish          @ argh
+    b       .LOP_CMPG_FLOAT_finish          @ argh
 
 
 /* ------------------------------ */
@@ -1399,13 +1399,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    fldd    d0, [r2]                    @ d0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    fldd    d1, [r3]                    @ d1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    fldd    d0, [r2]                    @ d0<- vBB
+    fldd    d1, [r3]                    @ d1<- vCC
     fcmped  d0, d1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mvn     r0, #0                      @ r0<- -1 (default)
@@ -1413,7 +1413,7 @@
     fmstat                              @ export status flags
     movgt   r0, #1                      @ (greater than) r1<- 1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .LOP_CMPL_DOUBLE_finish          @ argh
+    b       .LOP_CMPL_DOUBLE_finish          @ argh
 
 
 /* ------------------------------ */
@@ -1438,13 +1438,13 @@
      */
     /* op vAA, vBB, vCC */
     FETCH(r0, 1)                        @ r0<- CCBB
-    and     r2, r0, #255                @ r2<- BB
-    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
-    fldd    d0, [r2]                    @ d0<- vBB
-    mov     r3, r0, lsr #8              @ r3<- CC
-    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
-    fldd    d1, [r3]                    @ d1<- vCC
     mov     r9, rINST, lsr #8           @ r9<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR(r2, r2)          @ r2<- &vBB
+    VREG_INDEX_TO_ADDR(r3, r3)          @ r3<- &vCC
+    fldd    d0, [r2]                    @ d0<- vBB
+    fldd    d1, [r3]                    @ d1<- vCC
     fcmped  d0, d1                      @ compare (vBB, vCC)
     FETCH_ADVANCE_INST(2)               @ advance rPC, load rINST
     mov     r0, #1                      @ r0<- 1 (default)
@@ -1452,7 +1452,7 @@
     fmstat                              @ export status flags
     mvnmi   r0, #0                      @ (less than) r1<- -1
     moveq   r0, #0                      @ (equal) r1<- 0
-    bl      .LOP_CMPG_DOUBLE_finish          @ argh
+    b       .LOP_CMPG_DOUBLE_finish          @ argh
 
 
 /* ------------------------------ */