X86: Use short forward jumps if possible

The optimizing compiler uses 32-bit relative jumps for all forward
jumps, just in case the offset is too large to fit in one byte.  Some
code generation sites, however, know that their jumps will in fact fit.

Use the 'NearLabel' class for the code generator and intrinsics.

Use the jecxz/jrcxz instructions in the string intrinsics, replacing a
testl + conditional jump pair.

Unfortunately, conditional jumps to basic blocks cannot use short
jumps, because we don't know how much code will be generated between
the jump and its target.

This saves a whopping 0.24% for core.oat and boot.oat sizes, but
every little bit helps, and it reduces icache footprint slightly.

Change-Id: I633fe3b2e0e810b4ce12fdad8c02135644b63506
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 72c690d..8cddc67 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1310,7 +1310,7 @@
   }
 
   // Convert the jumps into the result.
-  Label done_label;
+  NearLabel done_label;
 
   // False case: result = 0.
   __ Bind(&false_label);
@@ -1978,7 +1978,7 @@
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           Register output = out.AsRegister<Register>();
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
           // temp = int-to-float(output)
@@ -2003,7 +2003,7 @@
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           Register output = out.AsRegister<Register>();
           XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
           // temp = int-to-double(output)
@@ -2592,7 +2592,7 @@
   PushOntoFPStack(first, 0, 2 * elem_size, /* is_fp */ true, is_wide);
 
   // Loop doing FPREM until we stabilize.
-  Label retry;
+  NearLabel retry;
   __ Bind(&retry);
   __ fprem();
 
@@ -2706,8 +2706,8 @@
   int shift;
   CalculateMagicAndShiftForDivRem(imm, false /* is_long */, &magic, &shift);
 
-  Label ndiv;
-  Label end;
+  NearLabel ndiv;
+  NearLabel end;
   // If numerator is 0, the result is 0, no computation needed.
   __ testl(eax, eax);
   __ j(kNotEqual, &ndiv);
@@ -3160,7 +3160,7 @@
 }
 
 void InstructionCodeGeneratorX86::GenerateShlLong(const Location& loc, Register shifter) {
-  Label done;
+  NearLabel done;
   __ shld(loc.AsRegisterPairHigh<Register>(), loc.AsRegisterPairLow<Register>(), shifter);
   __ shll(loc.AsRegisterPairLow<Register>(), shifter);
   __ testl(shifter, Immediate(32));
@@ -3192,7 +3192,7 @@
 }
 
 void InstructionCodeGeneratorX86::GenerateShrLong(const Location& loc, Register shifter) {
-  Label done;
+  NearLabel done;
   __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
   __ sarl(loc.AsRegisterPairHigh<Register>(), shifter);
   __ testl(shifter, Immediate(32));
@@ -3227,7 +3227,7 @@
 }
 
 void InstructionCodeGeneratorX86::GenerateUShrLong(const Location& loc, Register shifter) {
-  Label done;
+  NearLabel done;
   __ shrd(loc.AsRegisterPairLow<Register>(), loc.AsRegisterPairHigh<Register>(), shifter);
   __ shrl(loc.AsRegisterPairHigh<Register>(), shifter);
   __ testl(shifter, Immediate(32));
@@ -3402,7 +3402,7 @@
   Location left = locations->InAt(0);
   Location right = locations->InAt(1);
 
-  Label less, greater, done;
+  NearLabel less, greater, done;
   switch (compare->InputAt(0)->GetType()) {
     case Primitive::kPrimLong: {
       Register left_low = left.AsRegisterPairLow<Register>();
@@ -3606,7 +3606,7 @@
                                   Register object,
                                   Register value,
                                   bool value_can_be_null) {
-  Label is_null;
+  NearLabel is_null;
   if (value_can_be_null) {
     __ testl(value, value);
     __ j(kEqual, &is_null);
@@ -4839,7 +4839,7 @@
   Location cls = locations->InAt(1);
   Register out = locations->Out().AsRegister<Register>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  Label done, zero;
+  NearLabel done, zero;
   SlowPathCodeX86* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 820ec78..4aae037 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1303,7 +1303,7 @@
   }
 
   // Convert the jumps into the result.
-  Label done_label;
+  NearLabel done_label;
 
   // False case: result = 0.
   __ Bind(&false_label);
@@ -1392,7 +1392,7 @@
   Location left = locations->InAt(0);
   Location right = locations->InAt(1);
 
-  Label less, greater, done;
+  NearLabel less, greater, done;
   Primitive::Type type = compare->InputAt(0)->GetType();
   switch (type) {
     case Primitive::kPrimLong: {
@@ -2117,7 +2117,7 @@
           // Processing a Dex `float-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
           // if input >= (float)INT_MAX goto done
@@ -2139,7 +2139,7 @@
           // Processing a Dex `double-to-int' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           __ movl(output, Immediate(kPrimIntMax));
           // if input >= (double)INT_MAX goto done
@@ -2181,7 +2181,7 @@
           // Processing a Dex `float-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
           // if input >= (float)LONG_MAX goto done
@@ -2203,7 +2203,7 @@
           // Processing a Dex `double-to-long' instruction.
           XmmRegister input = in.AsFpuRegister<XmmRegister>();
           CpuRegister output = out.AsRegister<CpuRegister>();
-          Label done, nan;
+          NearLabel done, nan;
 
           codegen_->Load64BitValue(output, kPrimLongMax);
           // if input >= (double)LONG_MAX goto done
@@ -2766,7 +2766,7 @@
   PushOntoFPStack(first, 0, 2 * elem_size, is_float);
 
   // Loop doing FPREM until we stabilize.
-  Label retry;
+  NearLabel retry;
   __ Bind(&retry);
   __ fprem();
 
@@ -2920,8 +2920,8 @@
 
     __ movl(numerator, eax);
 
-    Label no_div;
-    Label end;
+    NearLabel no_div;
+    NearLabel end;
     __ testl(eax, eax);
     __ j(kNotEqual, &no_div);
 
@@ -4235,7 +4235,7 @@
                                      CpuRegister object,
                                      CpuRegister value,
                                      bool value_can_be_null) {
-  Label is_null;
+  NearLabel is_null;
   if (value_can_be_null) {
     __ testl(value, value);
     __ j(kEqual, &is_null);
@@ -4662,7 +4662,7 @@
   Location cls = locations->InAt(1);
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
-  Label done, zero;
+  NearLabel done, zero;
   SlowPathCodeX86_64* slow_path = nullptr;
 
   // Return 0 if `obj` is null.
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index b7126b2..cc4e705 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -508,7 +508,7 @@
 
   XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
 
-  Label nan, done, op2_label;
+  NearLabel nan, done, op2_label;
   if (is_double) {
     __ ucomisd(out, op2);
   } else {
@@ -842,7 +842,7 @@
   Register out = locations->Out().AsRegister<Register>();
   XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
   XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
-  Label done, nan;
+  NearLabel done, nan;
   X86Assembler* assembler = GetAssembler();
 
   // Generate 0.5 into inPlusPointFive.
@@ -971,9 +971,7 @@
   Register edi = locations->GetTemp(1).AsRegister<Register>();
   Register esi = locations->Out().AsRegister<Register>();
 
-  Label end;
-  Label return_true;
-  Label return_false;
+  NearLabel end, return_true, return_false;
 
   // Get offsets of count, value, and class fields within a string object.
   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
@@ -1005,8 +1003,7 @@
   __ cmpl(ecx, Address(arg, count_offset));
   __ j(kNotEqual, &return_false);
   // Return true if both strings are empty.
-  __ testl(ecx, ecx);
-  __ j(kEqual, &return_true);
+  __ jecxz(&return_true);
 
   // Load starting addresses of string values into ESI/EDI as required for repe_cmpsl instruction.
   __ leal(esi, Address(str, value_offset));
@@ -1116,7 +1113,7 @@
 
   // Do a zero-length check.
   // TODO: Support jecxz.
-  Label not_found_label;
+  NearLabel not_found_label;
   __ testl(string_length, string_length);
   __ j(kEqual, &not_found_label);
 
@@ -1159,7 +1156,7 @@
   __ subl(string_length, counter);
   __ leal(out, Address(string_length, -1));
 
-  Label done;
+  NearLabel done;
   __ jmp(&done);
 
   // Failed to match; return -1.
@@ -1879,7 +1876,7 @@
     }
 
     // BSR sets ZF if the input was zero, and the output is undefined.
-    Label all_zeroes, done;
+    NearLabel all_zeroes, done;
     __ j(kEqual, &all_zeroes);
 
     // Correct the result from BSR to get the final CLZ result.
@@ -1898,7 +1895,7 @@
   DCHECK(src.IsRegisterPair());
   Register src_lo = src.AsRegisterPairLow<Register>();
   Register src_hi = src.AsRegisterPairHigh<Register>();
-  Label handle_low, done, all_zeroes;
+  NearLabel handle_low, done, all_zeroes;
 
   // Is the high word zero?
   __ testl(src_hi, src_hi);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 15fbac1..a9af9ec 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -406,7 +406,7 @@
 
   XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
 
-  Label nan, done, op2_label;
+  NearLabel nan, done, op2_label;
   if (is_double) {
     __ ucomisd(out, op2);
   } else {
@@ -703,7 +703,7 @@
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-  Label done, nan;
+  NearLabel done, nan;
   X86_64Assembler* assembler = GetAssembler();
 
   // Load 0.5 into inPlusPointFive.
@@ -751,7 +751,7 @@
   XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
   CpuRegister out = locations->Out().AsRegister<CpuRegister>();
   XmmRegister inPlusPointFive = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
-  Label done, nan;
+  NearLabel done, nan;
   X86_64Assembler* assembler = GetAssembler();
 
   // Load 0.5 into inPlusPointFive.
@@ -880,9 +880,7 @@
   CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
   CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
 
-  Label end;
-  Label return_true;
-  Label return_false;
+  NearLabel end, return_true, return_false;
 
   // Get offsets of count, value, and class fields within a string object.
   const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
@@ -914,8 +912,7 @@
   __ cmpl(rcx, Address(arg, count_offset));
   __ j(kNotEqual, &return_false);
   // Return true if both strings are empty.
-  __ testl(rcx, rcx);
-  __ j(kEqual, &return_true);
+  __ jrcxz(&return_true);
 
   // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
   __ leal(rsi, Address(str, value_offset));
@@ -1025,7 +1022,7 @@
 
   // Do a length check.
   // TODO: Support jecxz.
-  Label not_found_label;
+  NearLabel not_found_label;
   __ testl(string_length, string_length);
   __ j(kEqual, &not_found_label);
 
@@ -1067,7 +1064,7 @@
   __ subl(string_length, counter);
   __ leal(out, Address(string_length, -1));
 
-  Label done;
+  NearLabel done;
   __ jmp(&done);
 
   // Failed to match; return -1.
@@ -1732,7 +1729,7 @@
   }
 
   // BSR sets ZF if the input was zero, and the output is undefined.
-  Label is_zero, done;
+  NearLabel is_zero, done;
   __ j(kEqual, &is_zero);
 
   // Correct the result from BSR to get the CLZ result.