Get rid of a bunch of loads in the ARM dispatcher inner loops, and
make some attempt to schedule for the Cortex-A8.  Improves overall
IPC for the "none" tool running perf/bz2.c at "-O" from 0.879 to 0.925.
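
In outline: each literal-pool load of a constant becomes a movw/movt
pair (or a single movw when the constant fits in 16 bits), trading a
d-cache access for one or two ALU insns, and the two single-word
loads of a tt_fast entry become one ldrd.  A minimal sketch of both
idioms, using the same operands as the diff below:

    // before: address fetched from the literal pool (a load)
    ldr  r3, =VG_(dispatch_ctr)
    // after: address materialised in two halves (no load)
    movw r3, #:lower16:VG_(dispatch_ctr)
    movt r3, #:upper16:VG_(dispatch_ctr)

    // before: .guest and .host fields fetched separately
    ldr  r3, [r1, #0]
    ldr  r1, [r1, #4]
    // after: both 4-byte fields in one 8-byte access
    ldrd r4, r5, [r1, #0]

(movw/movt requires ARMv6T2 or later; the Cortex-A8 qualifies.)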

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11780 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/m_dispatch/dispatch-arm-linux.S b/coregrind/m_dispatch/dispatch-arm-linux.S
index f67aeef..20b13fa 100644
--- a/coregrind/m_dispatch/dispatch-arm-linux.S
+++ b/coregrind/m_dispatch/dispatch-arm-linux.S
@@ -75,6 +75,9 @@
 /*--- NO-PROFILING (standard) dispatcher           ---*/
 /*----------------------------------------------------*/
 
+/* Pairing of insns below is my guesstimate of how dual dispatch would
+   work on an A8.  JRS, 2011-May-28 */
+
 .global	VG_(run_innerloop__dispatch_unprofiled)
 VG_(run_innerloop__dispatch_unprofiled):
 
@@ -83,35 +86,47 @@
 
         /* Has the guest state pointer been messed with?  If yes, exit. */
 	ldr  r1, [sp, #0]
+        movw r3, #:lower16:VG_(dispatch_ctr)
+
 	cmp  r8, r1
+        movt r3, #:upper16:VG_(dispatch_ctr)
+
 	bne  gsp_changed
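+        /* (the movw/movt halves are interleaved with the cmp/bne on
+           the theory that each pair above dual-issues; r3 then holds
+           &VG_(dispatch_ctr), ready for the counter load below) */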
 
 	/* save the jump address in the guest state */
         str  r0, [r8, #OFFSET_arm_R15T]
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
-        ldr  r1, =VG_(dispatch_ctr)
-        ldr  r2, [r1]
+        ldr  r2, [r3]
+
         subs r2, r2, #1
-        str  r2, [r1]
+
+        str  r2, [r3]
+
         beq  counter_is_zero
 
         /* try a fast lookup in the translation cache */
-        // r0 = next guest, r1,r2,r3 scratch
-	ldr  r1, =VG_TT_FAST_MASK       // r1 = VG_TT_FAST_MASK
+        // r0 = next guest, r1,r2,r3,r4,r5 scratch
+        movw r1, #VG_TT_FAST_MASK       // r1 = VG_TT_FAST_MASK
+        movw r4, #:lower16:VG_(tt_fast)
+
 	and  r2, r1, r0, LSR #1         // r2 = entry #
-	ldr  r1, =VG_(tt_fast)          // r1 = &tt_fast[0]
-	add  r1, r1, r2, LSL #3         // r1 = &tt_fast[entry#]
-	ldr  r3, [r1, #0]               /* .guest */
-	ldr  r1, [r1, #4]               /* .host  */
-	cmp  r0, r3
+        movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast)
+
+	add  r1, r4, r2, LSL #3         // r1 = &tt_fast[entry#]
+
+        ldrd r4, r5, [r1, #0]           // r4 = .guest, r5 = .host
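+        // (a single 8-byte ldrd replaces the two single-word loads
+        //  of the {.guest,.host} entry used previously)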
+
+	cmp  r4, r0
+
 	bne  fast_lookup_failed
-        // r1: live, next-host    r8: live, gsp
-        // r2: entry # (but not live)
-        // r0, r3: dead
+        // r5: next-host    r8: gsp
+        // r4: next-guest   r2: entry #
+        // LIVE: r5, r8; all others dead
         
         /* Found a match.  Jump to .host. */
-	blx  r1
+	blx  r5
 	b    VG_(run_innerloop__dispatch_unprofiled)
 .ltorg
 	/*NOTREACHED*/
@@ -128,42 +143,55 @@
 
         /* Has the guest state pointer been messed with?  If yes, exit. */
 	ldr  r1, [sp, #0]
+        movw r3, #:lower16:VG_(dispatch_ctr)
+
 	cmp  r8, r1
+        movt r3, #:upper16:VG_(dispatch_ctr)
+
 	bne  gsp_changed
 
 	/* save the jump address in the guest state */
         str  r0, [r8, #OFFSET_arm_R15T]
 
         /* Are we out of timeslice?  If yes, defer to scheduler. */
-        ldr  r1, =VG_(dispatch_ctr)
-        ldr  r2, [r1]
+        ldr  r2, [r3]
+
         subs r2, r2, #1
-        str  r2, [r1]
+
+        str  r2, [r3]
+
         beq  counter_is_zero
 
         /* try a fast lookup in the translation cache */
-        // r0 = next guest, r1,r2,r3 scratch
-	ldr  r1, =VG_TT_FAST_MASK       // r1 = VG_TT_FAST_MASK
+        // r0 = next guest, r1,r2,r3,r4,r5 scratch
+        movw r1, #VG_TT_FAST_MASK       // r1 = VG_TT_FAST_MASK
+        movw r4, #:lower16:VG_(tt_fast)
+
 	and  r2, r1, r0, LSR #1         // r2 = entry #
-	ldr  r1, =VG_(tt_fast)          // r1 = &tt_fast[0]
-	add  r1, r1, r2, LSL #3         // r1 = &tt_fast[entry#]
-	ldr  r3, [r1, #0]               /* .guest */
-	ldr  r1, [r1, #4]               /* .host  */
-	cmp  r0, r3
+        movt r4, #:upper16:VG_(tt_fast) // r4 = &VG_(tt_fast)
+
+	add  r1, r4, r2, LSL #3         // r1 = &tt_fast[entry#]
+
+        ldrd r4, r5, [r1, #0]           // r4 = .guest, r5 = .host
+
+	cmp  r4, r0
+
 	bne  fast_lookup_failed
-        // r1: live, next-host    r8: live, gsp
-        // r2: entry # (but not live)
-        // r0, r3: dead
+        // r5: next-host    r8: gsp
+        // r4: next-guest   r2: entry #
+        // LIVE: r2, r5, r8; all others dead
+        //       (r2 is still needed below to index tt_fastN)
         
         /* increment bb profile counter */
-        ldr  r0, =VG_(tt_fastN)         // r0 = &tt_fastN[0]
-        ldr  r0, [r0, r2, LSL #2]       // r0 = tt_fast[entry #]
-        ldr  r3, [r0]                   // *r0 ++
+        movw r0, #:lower16:VG_(tt_fastN)
+        movt r0, #:upper16:VG_(tt_fastN) // r0 = &tt_fastN[0]
+        ldr  r0, [r0, r2, LSL #2]        // r0 = tt_fastN[entry #]
+        ldr  r3, [r0]                    // (*r0)++, via ldr/add/str
         add  r3, r3, #1
         str  r3, [r0]
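+        // (tt_fastN holds 4-byte counter pointers, hence the LSL #2
+        //  above, vs the LSL #3 used for 8-byte tt_fast entries)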
 
         /* Found a match.  Jump to .host. */
-	blx  r1
+	blx  r5
 	b    VG_(run_innerloop__dispatch_profiled)
 	/*NOTREACHED*/