ARM: Improve String.getCharsNoCheck intrinsic.

- Optimize for short retrievals.
- Improve performance of longer retrievals with unrolled LDR/STR.

Change-Id: I24ec52067104b2f68799c04aebb61d20fff94020
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 29f7672..7d1c2eb 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -2031,7 +2031,7 @@
   locations->SetInAt(3, Location::RequiresRegister());
   locations->SetInAt(4, Location::RequiresRegister());
 
-  locations->AddTemp(Location::RequiresRegister());
+  // Temporary registers to store lengths of strings and for calculations.
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
@@ -2059,28 +2059,55 @@
   Register dstObj = locations->InAt(3).AsRegister<Register>();
   Register dstBegin = locations->InAt(4).AsRegister<Register>();
 
-  Register src_ptr = locations->GetTemp(0).AsRegister<Register>();
-  Register src_ptr_end = locations->GetTemp(1).AsRegister<Register>();
+  Register num_chr = locations->GetTemp(0).AsRegister<Register>();
+  Register src_ptr = locations->GetTemp(1).AsRegister<Register>();
   Register dst_ptr = locations->GetTemp(2).AsRegister<Register>();
-  Register tmp = locations->GetTemp(3).AsRegister<Register>();
 
   // src range to copy.
   __ add(src_ptr, srcObj, ShifterOperand(value_offset));
-  __ add(src_ptr_end, src_ptr, ShifterOperand(srcEnd, LSL, 1));
   __ add(src_ptr, src_ptr, ShifterOperand(srcBegin, LSL, 1));
 
   // dst to be copied.
   __ add(dst_ptr, dstObj, ShifterOperand(data_offset));
   __ add(dst_ptr, dst_ptr, ShifterOperand(dstBegin, LSL, 1));
 
+  __ subs(num_chr, srcEnd, ShifterOperand(srcBegin));
+
   // Do the copy.
-  Label loop, done;
-  __ Bind(&loop);
-  __ cmp(src_ptr, ShifterOperand(src_ptr_end));
+  Label loop, remainder, done;
+
+  // Early out for valid zero-length retrievals.
   __ b(&done, EQ);
-  __ ldrh(tmp, Address(src_ptr, char_size, Address::PostIndex));
-  __ strh(tmp, Address(dst_ptr, char_size, Address::PostIndex));
-  __ b(&loop);
+
+  // Subtract into IP so that num_chr needs no repair on the < 4 character path.
+  __ subs(IP, num_chr, ShifterOperand(4));
+  __ b(&remainder, LT);
+
+  // Keep the result of the earlier subs; we are going to fetch at least 4 characters.
+  __ mov(num_chr, ShifterOperand(IP));
+
+  // The main loop, used for longer fetches, loads and stores 4x16-bit characters at a time.
+  // (LDRD/STRD fault on unaligned addresses and it's not worth inlining extra code
+  // to rectify these everywhere this intrinsic applies.)
+  __ Bind(&loop);
+  __ ldr(IP, Address(src_ptr, char_size * 2));
+  __ subs(num_chr, num_chr, ShifterOperand(4));
+  __ str(IP, Address(dst_ptr, char_size * 2));
+  __ ldr(IP, Address(src_ptr, char_size * 4, Address::PostIndex));
+  __ str(IP, Address(dst_ptr, char_size * 4, Address::PostIndex));
+  __ b(&loop, GE);
+
+  __ adds(num_chr, num_chr, ShifterOperand(4));
+  __ b(&done, EQ);
+
+  // Main loop for < 4 character case and remainder handling. Loads and stores one
+  // 16-bit Java character at a time.
+  __ Bind(&remainder);
+  __ ldrh(IP, Address(src_ptr, char_size, Address::PostIndex));
+  __ subs(num_chr, num_chr, ShifterOperand(1));
+  __ strh(IP, Address(dst_ptr, char_size, Address::PostIndex));
+  __ b(&remainder, GT);
+
   __ Bind(&done);
 }