Change deoptimize entrypoint to save everything.

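Deoptimization slow paths no longer need to save live registers
themselves because the entrypoint now saves all registers.

Also implement FPU register retrieval from the stack on x86.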

On Nexus 9, AOSP ToT, the boot.oat size changes are:
  prebuilt multi-part boot image:
    - 32-bit boot.oat: -20KiB (-0.03%)
    - 64-bit boot.oat: -45KiB (-0.06%)
  on-device built single boot image:
    - 32-bit boot.oat: -24KiB (-0.04%)
    - 64-bit boot.oat: -36KiB (-0.05%)

Test: Run ART test suite on host and Nexus 9.
Bug: 30212852
Change-Id: I5d98e2a24363136d73dfec6100ab02f8eb101911
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 5d7b491..2ef1802 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -344,7 +344,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -1532,6 +1531,7 @@
 void LocationsBuilderARM::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 3923f52..ceceedd 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -494,7 +494,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm64_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -3060,6 +3059,7 @@
 void LocationsBuilderARM64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 36bb55a..92e9cd9 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -418,7 +418,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     mips_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -3470,6 +3469,7 @@
 void LocationsBuilderMIPS::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 18d928d..664d498 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -376,7 +376,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     mips64_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -2631,6 +2630,7 @@
 void LocationsBuilderMIPS64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::RequiresRegister());
   }
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4b3eddd..8858def 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -369,7 +369,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     x86_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -1499,6 +1498,7 @@
 void LocationsBuilderX86::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::Any());
   }
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index a60c270..52302697 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -388,7 +388,6 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     x86_64_codegen->InvokeRuntime(kQuickDeoptimize, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickDeoptimize, void, void>();
   }
@@ -1563,6 +1562,7 @@
 void LocationsBuilderX86_64::VisitDeoptimize(HDeoptimize* deoptimize) {
   LocationSummary* locations = new (GetGraph()->GetArena())
       LocationSummary(deoptimize, LocationSummary::kCallOnSlowPath);
+  locations->SetCustomSlowPathCallerSaves(RegisterSet());  // No caller-save registers.
   if (IsBooleanValueOrMaterializedCondition(deoptimize->InputAt(0))) {
     locations->SetInAt(0, Location::Any());
   }
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index c51c336..a3f053b 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1544,7 +1544,7 @@
      */
     .extern artDeoptimizeFromCompiledCode
 ENTRY art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0
+    SETUP_SAVE_EVERYTHING_FRAME r0
     mov    r0, r9                         @ Set up args.
     blx    artDeoptimizeFromCompiledCode  @ artDeoptimizeFromCompiledCode(Thread*)
 END art_quick_deoptimize_from_compiled_code
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 03768af..b476762 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2529,7 +2529,7 @@
      */
     .extern artDeoptimizeFromCompiledCode
 ENTRY art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+    SETUP_SAVE_EVERYTHING_FRAME
     mov    x0, xSELF                      // Pass thread.
     bl     artDeoptimizeFromCompiledCode  // artDeoptimizeFromCompiledCode(Thread*)
     brk 0
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 3d393f6..4bd1314 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -2094,7 +2094,7 @@
      */
     .extern artDeoptimizeFromCompiledCode
 ENTRY art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+    SETUP_SAVE_EVERYTHING_FRAME
     la       $t9, artDeoptimizeFromCompiledCode
     jalr     $t9                            # artDeoptimizeFromCompiledCode(Thread*)
                                             # Returns caller method's frame size.
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index 9774eb9..0bf2a35 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -2138,8 +2138,8 @@
      * will long jump to the upcall with a special exception of -1.
      */
     .extern artDeoptimizeFromCompiledCode
-ENTRY art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+ENTRY_NO_GP art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_EVERYTHING_FRAME
     jal      artDeoptimizeFromCompiledCode    # artDeoptimizeFromCompiledCode(Thread*, SP)
                                               # Returns caller method's frame size.
     move     $a0, rSELF                       # pass Thread::current
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 67ebf50..646a80c 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1930,7 +1930,7 @@
      * will long jump to the interpreter bridge.
      */
 DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME ebx, ebx
+    SETUP_SAVE_EVERYTHING_FRAME ebx, ebx
     subl LITERAL(12), %esp                      // Align stack.
     CFI_ADJUST_CFA_OFFSET(12)
     pushl %fs:THREAD_SELF_OFFSET                // Pass Thread::Current().
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index b805703..5ea58af 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2053,7 +2053,7 @@
      * will long jump to the interpreter bridge.
      */
 DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME
+    SETUP_SAVE_EVERYTHING_FRAME
                                                 // Stack should be aligned now.
     movq %gs:THREAD_SELF_OFFSET, %rdi           // Pass Thread.
     call SYMBOL(artDeoptimizeFromCompiledCode)  // artDeoptimizeFromCompiledCode(Thread*)
diff --git a/runtime/oat.h b/runtime/oat.h
index 7c84fe9..35d0c92 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '8', '6', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '8', '7', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/stack.cc b/runtime/stack.cc
index ec492ed..4678ac6 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -319,8 +319,11 @@
 bool StackVisitor::GetRegisterIfAccessible(uint32_t reg, VRegKind kind, uint32_t* val) const {
   const bool is_float = (kind == kFloatVReg) || (kind == kDoubleLoVReg) || (kind == kDoubleHiVReg);
 
-  // X86 float registers are 64-bit and the logic below does not apply.
-  DCHECK(!is_float || kRuntimeISA != InstructionSet::kX86);
+  if (kRuntimeISA == InstructionSet::kX86 && is_float) {
+    // X86 float registers are 64-bit and each XMM register is provided as two separate
+    // 32-bit registers by the context.
+    reg = (kind == kDoubleHiVReg) ? (2 * reg + 1) : (2 * reg);
+  }
 
   if (!IsAccessibleRegister(reg, is_float)) {
     return false;
diff --git a/test/534-checker-bce-deoptimization/expected.txt b/test/534-checker-bce-deoptimization/expected.txt
index 3823a29..b9a1e27 100644
--- a/test/534-checker-bce-deoptimization/expected.txt
+++ b/test/534-checker-bce-deoptimization/expected.txt
@@ -1 +1,5 @@
+array[0]=2.5f
+array[1]=2.625f
+array[0]=3.5
+array[1]=3.625
 finish
diff --git a/test/534-checker-bce-deoptimization/src/Main.java b/test/534-checker-bce-deoptimization/src/Main.java
index 8cd20f6..c4e4cbf 100644
--- a/test/534-checker-bce-deoptimization/src/Main.java
+++ b/test/534-checker-bce-deoptimization/src/Main.java
@@ -17,6 +17,8 @@
 public class Main {
     public static void main(String[] args) {
         new Main().run();
+        testPreserveFloat();
+        testPreserveDouble();
         System.out.println("finish");
     }
 
@@ -53,5 +55,77 @@
             b[i + 1] += c * b[i + 1];
         }
     }
+
+    /*
+     * Test that we correctly preserve floating point registers when we deoptimize.
+     *
+     * Note: These tests rely on the deoptimization happening before the loop,
+     * so that the loop is interpreted and fills the provided arrays. However,
+     * the BCE transformation can be modified to execute the loop as many times
+     * as the compiler can guarantee no AIOOBE and only deoptimize thereafter,
+     * just before the throwing iteration. Then the floating point registers
+     * would no longer be used after the deoptimization and another approach
+     * would be needed to test this.
+     */
+
+    static public void testPreserveFloat() {
+        float[] array = new float[2];
+        try {
+            $noinline$FloatFill(1.125f, 2.5f, array, 3);
+            throw new Error();
+        } catch (ArrayIndexOutOfBoundsException expected) {
+            System.out.println("array[0]=" + array[0] + "f");
+            System.out.println("array[1]=" + array[1] + "f");
+        }
+    }
+
+    /// CHECK-START: void Main.$noinline$FloatFill(float, float, float[], int) BCE (after)
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-NOT:          Deoptimize
+
+    /// CHECK-START: void Main.$noinline$FloatFill(float, float, float[], int) BCE (after)
+    /// CHECK-NOT:          BoundsCheck
+
+    public static void $noinline$FloatFill(float f1, float f2, float[] array, int n) {
+        if (doThrow) { throw new Error(); }
+        for (int i = 0; i < n; ++i) {
+            array[i] = ((i & 1) == 1) ? f1 : f2;
+            f1 += 1.5f;
+            f2 += 2.25f;
+        }
+    }
+
+    static public void testPreserveDouble() {
+        double[] array = new double[2];
+        try {
+            $noinline$DoubleFill(2.125, 3.5, array, 3);
+            throw new Error();
+        } catch (ArrayIndexOutOfBoundsException expected) {
+            System.out.println("array[0]=" + array[0]);
+            System.out.println("array[1]=" + array[1]);
+        }
+    }
+
+    /// CHECK-START: void Main.$noinline$DoubleFill(double, double, double[], int) BCE (after)
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-DAG:          Deoptimize
+    /// CHECK-NOT:          Deoptimize
+
+    /// CHECK-START: void Main.$noinline$DoubleFill(double, double, double[], int) BCE (after)
+    /// CHECK-NOT:          BoundsCheck
+
+    public static void $noinline$DoubleFill(double d1, double d2, double[] array, int n) {
+        if (doThrow) { throw new Error(); }
+        for (int i = 0; i < n; ++i) {
+            array[i] = ((i & 1) == 1) ? d1 : d2;
+            d1 += 1.5;
+            d2 += 2.25;
+        }
+    }
+
+    public static boolean doThrow = false;
 }