Revert "Revert "Fix deoptimization with pending exception""

This reverts commit 6e2d5747d00697a25251d25dd33b953e54709507.

Fixes the deoptimization path from compiled code (generated by the
Optimizing compiler) by adding wrapper artDeoptimizeFromCompiledCode.
This wrapper, called through the matching assembler stub
art_quick_deoptimize_from_compiled_code, pushes the deoptimization
context just before deoptimizing the stack.

Bug: 23371176
Bug: 19944235
Change-Id: Ia7082656998aebdd0157438f7e6504c120e10d3e
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index be9af98..1599025 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -167,7 +167,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f6d954f..b9f8cb1 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1141,6 +1141,17 @@
 END art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME r0, r1
+    mov    r0, r9                         @ Set up args.
+    blx    artDeoptimizeFromCompiledCode  @ artDeoptimizeFromCompiledCode(Thread*)
+END art_quick_deoptimize_from_compiled_code
+
+    /*
      * Signed 64-bit integer multiply.
      *
      * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 0f06727..e9c816f 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -150,8 +150,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 8ba3d43..07b91a1 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1739,6 +1739,18 @@
     brk 0
 END art_quick_deoptimize
 
+    /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    mov    x0, xSELF                      // Pass thread.
+    bl     artDeoptimizeFromCompiledCode  // artDeoptimizeFromCompiledCode(Thread*)
+    brk 0
+END art_quick_deoptimize_from_compiled_code
+
 
     /*
      * String's indexOf.
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 4e4b91f..6721e54 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -267,8 +267,8 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
   static_assert(!IsDirectEntrypoint(kQuickThrowStackOverflow), "Non-direct C stub marked direct.");
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
   static_assert(!IsDirectEntrypoint(kQuickDeoptimize), "Non-direct C stub marked direct.");
 
   // Atomic 64-bit load/store
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 8bc75e5..0147230 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -1542,6 +1542,18 @@
 END art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    jal      artDeoptimizeFromCompiledCode  # artDeoptimizeFromCompiledCode(Thread*)
+                                            # Never returns: long jumps to the upcall.
+    move     $a0, rSELF                     # pass Thread::current
+END art_quick_deoptimize_from_compiled_code
+
+    /*
      * Long integer shift.  This is different from the generic 32/64-bit
      * binary operations because vAA/vBB are 64-bit but vCC (the shift
      * distance) is 32-bit.  Also, Dalvik requires us to ignore all but the low
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index ec02d5a..9f1f0e0 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -176,8 +176,8 @@
   qpoints->pThrowNullPointer = art_quick_throw_null_pointer_exception;
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
-  // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize;
+  // Deoptimization from compiled code.
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // TODO - use lld/scd instructions for Mips64
   // Atomic 64-bit load/store
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index c30e6ca..08717a4 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -1603,5 +1603,17 @@
     move     $a0, rSELF        # pass Thread::current
 END art_quick_deoptimize
 
+    /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+    .extern artDeoptimizeFromCompiledCode
+ENTRY art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+    jal      artDeoptimizeFromCompiledCode    # artDeoptimizeFromCompiledCode(Thread*)
+                                              # Never returns: long jumps to the upcall.
+    move     $a0, rSELF                       # pass Thread::current
+END art_quick_deoptimize_from_compiled_code
+
 UNIMPLEMENTED art_quick_indexof
 UNIMPLEMENTED art_quick_string_compareto
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index e2632c1..10fc281 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -140,7 +140,7 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
   // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_slow_path;
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 1da5a2f..7fbcf8d 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1677,9 +1677,6 @@
      */
 DEFINE_FUNCTION art_quick_deoptimize
     PUSH ebx                      // Entry point for a jump. Fake that we were called.
-.globl SYMBOL(art_quick_deoptimize_from_compiled_slow_path)  // Entry point for real calls
-                                                             // from compiled slow paths.
-SYMBOL(art_quick_deoptimize_from_compiled_slow_path):
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME ebx, ebx
     subl LITERAL(12), %esp        // Align stack.
     CFI_ADJUST_CFA_OFFSET(12)
@@ -1690,6 +1687,20 @@
 END_FUNCTION art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME ebx, ebx
+    subl LITERAL(12), %esp                      // Align stack.
+    CFI_ADJUST_CFA_OFFSET(12)
+    pushl %fs:THREAD_SELF_OFFSET                // Pass Thread::Current().
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artDeoptimizeFromCompiledCode)  // artDeoptimizeFromCompiledCode(Thread*)
+    UNREACHABLE
+END_FUNCTION art_quick_deoptimize_from_compiled_code
+
+    /*
      * String's compareTo.
      *
      * On entry:
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index ef1bb5f..5cc72e3 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -144,7 +144,7 @@
   qpoints->pThrowStackOverflow = art_quick_throw_stack_overflow;
 
   // Deoptimize
-  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_slow_path;
+  qpoints->pDeoptimize = art_quick_deoptimize_from_compiled_code;
 
   // Read barrier
   qpoints->pReadBarrierJni = ReadBarrierJni;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index f4c9488..5f3f175 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1721,9 +1721,6 @@
 DEFINE_FUNCTION art_quick_deoptimize
     pushq %rsi                     // Entry point for a jump. Fake that we were called.
                                    // Use hidden arg.
-.globl SYMBOL(art_quick_deoptimize_from_compiled_slow_path)  // Entry point for real calls
-                                                             // from compiled slow paths.
-SYMBOL(art_quick_deoptimize_from_compiled_slow_path):
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
                                    // Stack should be aligned now.
     movq %gs:THREAD_SELF_OFFSET, %rdi         // Pass Thread.
@@ -1732,6 +1729,18 @@
 END_FUNCTION art_quick_deoptimize
 
     /*
+     * Compiled code has requested that we deoptimize into the interpreter. The deoptimization
+     * will long jump to the upcall with a special exception of -1.
+     */
+DEFINE_FUNCTION art_quick_deoptimize_from_compiled_code
+    SETUP_SAVE_ALL_CALLEE_SAVE_FRAME
+                                                // Stack should be aligned now.
+    movq %gs:THREAD_SELF_OFFSET, %rdi           // Pass Thread.
+    call SYMBOL(artDeoptimizeFromCompiledCode)  // artDeoptimizeFromCompiledCode(Thread*)
+    UNREACHABLE
+END_FUNCTION art_quick_deoptimize_from_compiled_code
+
+    /*
      * String's compareTo.
      *
      * On entry:
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index 56f7b35..e46402d 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -427,9 +427,16 @@
         self->ClearException();
         ShadowFrame* shadow_frame =
             self->PopStackedShadowFrame(StackedShadowFrameType::kDeoptimizationShadowFrame);
-        result->SetJ(self->PopDeoptimizationReturnValue().GetJ());
+        mirror::Throwable* pending_exception = nullptr;
+        self->PopDeoptimizationContext(result, &pending_exception);
         self->SetTopOfStack(nullptr);
         self->SetTopOfShadowStack(shadow_frame);
+
+        // Restore the exception that was pending before deoptimization then interpret the
+        // deoptimized frames.
+        if (pending_exception != nullptr) {
+          self->SetException(pending_exception);
+        }
         interpreter::EnterInterpreterFromDeoptimize(self, shadow_frame, result);
       }
       if (kLogInvocationStartAndReturn) {
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 084c88e..5c1922e 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -89,7 +89,7 @@
             art::Thread::ThinLockIdOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offset of field Thread::tlsPtr_.card_table.
-#define THREAD_CARD_TABLE_OFFSET 136
+#define THREAD_CARD_TABLE_OFFSET 128
 ADD_TEST_EQ(THREAD_CARD_TABLE_OFFSET,
             art::Thread::CardTableOffset<__SIZEOF_POINTER__>().Int32Value())
 
diff --git a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
index a4feac1..d749664 100644
--- a/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_deoptimization_entrypoints.cc
@@ -28,17 +28,30 @@
 
 namespace art {
 
-extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
-  ScopedQuickEntrypointChecks sqec(self);
-
+NO_RETURN static void artDeoptimizeImpl(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
   if (VLOG_IS_ON(deopt)) {
     LOG(INFO) << "Deopting:";
     self->Dump(LOG(INFO));
   }
 
-  self->PushAndClearDeoptimizationReturnValue();
+  self->AssertHasDeoptimizationContext();
   self->SetException(Thread::GetDeoptimizationException());
   self->QuickDeliverException();
 }
 
+extern "C" NO_RETURN void artDeoptimize(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_) {
+  ScopedQuickEntrypointChecks sqec(self);
+  artDeoptimizeImpl(self);
+}
+
+extern "C" NO_RETURN void artDeoptimizeFromCompiledCode(Thread* self)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  ScopedQuickEntrypointChecks sqec(self);
+  // Before deoptimizing to interpreter, we must push the deoptimization context.
+  JValue return_value;
+  return_value.SetJ(0);  // we never deoptimize from compiled code with an invoke result.
+  self->PushDeoptimizationContext(return_value, false, self->GetException());
+  artDeoptimizeImpl(self);
+}
+
 }  // namespace art
diff --git a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
index ad5ee84..8e660a2 100644
--- a/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_instrumentation_entrypoints.cc
@@ -51,6 +51,9 @@
                                                               uint64_t gpr_result,
                                                               uint64_t fpr_result)
     SHARED_REQUIRES(Locks::mutator_lock_) {
+  // Instrumentation exit stub must not be entered with a pending exception.
+  CHECK(!self->IsExceptionPending()) << "Enter instrumentation exit stub with pending exception "
+                                     << self->GetException()->Dump();
   // Compute address of return PC and sanity check that it currently holds 0.
   size_t return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA, Runtime::kRefsOnly);
   uintptr_t* return_pc = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(sp) +
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index aa35ec1..0c7caf3 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -688,8 +688,12 @@
     // Request a stack deoptimization if needed
     ArtMethod* caller = QuickArgumentVisitor::GetCallingMethod(sp);
     if (UNLIKELY(Dbg::IsForcedInterpreterNeededForUpcall(self, caller))) {
+      // Push the context of the deoptimization stack so we can restore the return value and the
+      // exception before executing the deoptimized frames.
+      self->PushDeoptimizationContext(result, shorty[0] == 'L', self->GetException());
+
+      // Set special exception to cause deoptimization.
       self->SetException(Thread::GetDeoptimizationException());
-      self->SetDeoptimizationReturnValue(result, shorty[0] == 'L');
     }
 
     // No need to restore the args since the method has already been run by the interpreter.
diff --git a/runtime/entrypoints/runtime_asm_entrypoints.h b/runtime/entrypoints/runtime_asm_entrypoints.h
index 8209dc8..2842c5a 100644
--- a/runtime/entrypoints/runtime_asm_entrypoints.h
+++ b/runtime/entrypoints/runtime_asm_entrypoints.h
@@ -70,7 +70,8 @@
   return reinterpret_cast<const void*>(art_quick_instrumentation_entry);
 }
 
-extern "C" void art_quick_deoptimize_from_compiled_slow_path();
+// Stub to deoptimize from compiled code.
+extern "C" void art_quick_deoptimize_from_compiled_code();
 
 // The return_pc of instrumentation exit stub.
 extern "C" void art_quick_instrumentation_exit();
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index f7a3cd5..7db8888 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -72,15 +72,12 @@
     EXPECT_OFFSET_DIFFP(Thread, tls32_, throwing_OutOfMemoryError, no_thread_suspension, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, no_thread_suspension, thread_exit_check_count, 4);
     EXPECT_OFFSET_DIFFP(Thread, tls32_, thread_exit_check_count, handling_signal_, 4);
-    EXPECT_OFFSET_DIFFP(Thread, tls32_, handling_signal_,
-                        deoptimization_return_value_is_reference, 4);
 
     // TODO: Better connection. Take alignment into account.
     EXPECT_OFFSET_DIFF_GT3(Thread, tls32_.thread_exit_check_count, tls64_.trace_clock_base, 4,
                            thread_tls32_to_tls64);
 
-    EXPECT_OFFSET_DIFFP(Thread, tls64_, trace_clock_base, deoptimization_return_value, 8);
-    EXPECT_OFFSET_DIFFP(Thread, tls64_, deoptimization_return_value, stats, 8);
+    EXPECT_OFFSET_DIFFP(Thread, tls64_, trace_clock_base, stats, 8);
 
     // TODO: Better connection. Take alignment into account.
     EXPECT_OFFSET_DIFF_GT3(Thread, tls64_.stats, tlsPtr_.card_table, 8, thread_tls64_to_tlsptr);
@@ -108,8 +105,8 @@
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, single_step_control, stacked_shadow_frame_record,
                         sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, stacked_shadow_frame_record,
-                        deoptimization_return_value_stack, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_return_value_stack, name, sizeof(void*));
+                        deoptimization_context_stack, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, deoptimization_context_stack, name, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, name, pthread_self, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, pthread_self, last_no_thread_suspension_cause,
                         sizeof(void*));
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index e28d578..63c02ed 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -1016,7 +1016,8 @@
                                 PrettyMethod(method).c_str(),
                                 return_value.GetJ()) << *self;
     }
-    self->SetDeoptimizationReturnValue(return_value, return_shorty == 'L');
+    self->PushDeoptimizationContext(return_value, return_shorty == 'L',
+                                    nullptr /* no pending exception */);
     return GetTwoWordSuccessValue(*return_pc,
                                   reinterpret_cast<uintptr_t>(GetQuickDeoptimizationEntryPoint()));
   } else {
diff --git a/runtime/oat.h b/runtime/oat.h
index 29dd76c..1520a9b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '6', '8', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '6', '9', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread.cc b/runtime/thread.cc
index a33e150..63534b1 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -162,27 +162,41 @@
   ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
 }
 
-class DeoptimizationReturnValueRecord {
+class DeoptimizationContextRecord {
  public:
-  DeoptimizationReturnValueRecord(const JValue& ret_val,
-                                  bool is_reference,
-                                  DeoptimizationReturnValueRecord* link)
-      : ret_val_(ret_val), is_reference_(is_reference), link_(link) {}
+  DeoptimizationContextRecord(const JValue& ret_val, bool is_reference,
+                              mirror::Throwable* pending_exception,
+                              DeoptimizationContextRecord* link)
+      : ret_val_(ret_val), is_reference_(is_reference), pending_exception_(pending_exception),
+        link_(link) {}
 
   JValue GetReturnValue() const { return ret_val_; }
   bool IsReference() const { return is_reference_; }
-  DeoptimizationReturnValueRecord* GetLink() const { return link_; }
-  mirror::Object** GetGCRoot() {
+  mirror::Throwable* GetPendingException() const { return pending_exception_; }
+  DeoptimizationContextRecord* GetLink() const { return link_; }
+  mirror::Object** GetReturnValueAsGCRoot() {
     DCHECK(is_reference_);
     return ret_val_.GetGCRoot();
   }
+  mirror::Object** GetPendingExceptionAsGCRoot() {
+    return reinterpret_cast<mirror::Object**>(&pending_exception_);
+  }
 
  private:
+  // The value returned by the method at the top of the stack before deoptimization.
   JValue ret_val_;
-  const bool is_reference_;
-  DeoptimizationReturnValueRecord* const link_;
 
-  DISALLOW_COPY_AND_ASSIGN(DeoptimizationReturnValueRecord);
+  // Indicates whether the returned value is a reference. If so, the GC will visit it.
+  const bool is_reference_;
+
+  // The exception that was pending before deoptimization (or null if there was no pending
+  // exception).
+  mirror::Throwable* pending_exception_;
+
+  // A link to the previous DeoptimizationContextRecord.
+  DeoptimizationContextRecord* const link_;
+
+  DISALLOW_COPY_AND_ASSIGN(DeoptimizationContextRecord);
 };
 
 class StackedShadowFrameRecord {
@@ -206,22 +220,28 @@
   DISALLOW_COPY_AND_ASSIGN(StackedShadowFrameRecord);
 };
 
-void Thread::PushAndClearDeoptimizationReturnValue() {
-  DeoptimizationReturnValueRecord* record = new DeoptimizationReturnValueRecord(
-      tls64_.deoptimization_return_value,
-      tls32_.deoptimization_return_value_is_reference,
-      tlsPtr_.deoptimization_return_value_stack);
-  tlsPtr_.deoptimization_return_value_stack = record;
-  ClearDeoptimizationReturnValue();
+void Thread::PushDeoptimizationContext(const JValue& return_value, bool is_reference,
+                                       mirror::Throwable* exception) {
+  DeoptimizationContextRecord* record = new DeoptimizationContextRecord(
+      return_value,
+      is_reference,
+      exception,
+      tlsPtr_.deoptimization_context_stack);
+  tlsPtr_.deoptimization_context_stack = record;
 }
 
-JValue Thread::PopDeoptimizationReturnValue() {
-  DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
-  DCHECK(record != nullptr);
-  tlsPtr_.deoptimization_return_value_stack = record->GetLink();
-  JValue ret_val(record->GetReturnValue());
+void Thread::PopDeoptimizationContext(JValue* result, mirror::Throwable** exception) {
+  AssertHasDeoptimizationContext();
+  DeoptimizationContextRecord* record = tlsPtr_.deoptimization_context_stack;
+  tlsPtr_.deoptimization_context_stack = record->GetLink();
+  result->SetJ(record->GetReturnValue().GetJ());
+  *exception = record->GetPendingException();
   delete record;
-  return ret_val;
+}
+
+void Thread::AssertHasDeoptimizationContext() {
+  CHECK(tlsPtr_.deoptimization_context_stack != nullptr)
+      << "No deoptimization context for thread " << *this;
 }
 
 void Thread::PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type) {
@@ -1575,6 +1595,9 @@
   CHECK(tlsPtr_.flip_function == nullptr);
   CHECK_EQ(tls32_.suspended_at_suspend_check, false);
 
+  // Make sure we processed all deoptimization requests.
+  CHECK(tlsPtr_.deoptimization_context_stack == nullptr) << "Missed deoptimization";
+
   // We may be deleting a still born thread.
   SetStateUnsafe(kTerminated);
 
@@ -2593,7 +2616,7 @@
   visitor->VisitRootIfNonNull(&tlsPtr_.opeer, RootInfo(kRootThreadObject, thread_id));
   if (tlsPtr_.exception != nullptr && tlsPtr_.exception != GetDeoptimizationException()) {
     visitor->VisitRoot(reinterpret_cast<mirror::Object**>(&tlsPtr_.exception),
-                   RootInfo(kRootNativeStack, thread_id));
+                       RootInfo(kRootNativeStack, thread_id));
   }
   visitor->VisitRootIfNonNull(&tlsPtr_.monitor_enter_object, RootInfo(kRootNativeStack, thread_id));
   tlsPtr_.jni_env->locals.VisitRoots(visitor, RootInfo(kRootJNILocal, thread_id));
@@ -2602,6 +2625,7 @@
   if (tlsPtr_.debug_invoke_req != nullptr) {
     tlsPtr_.debug_invoke_req->VisitRoots(visitor, RootInfo(kRootDebugger, thread_id));
   }
+  // Visit roots for deoptimization.
   if (tlsPtr_.stacked_shadow_frame_record != nullptr) {
     RootCallbackVisitor visitor_to_callback(visitor, thread_id);
     ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
@@ -2615,14 +2639,16 @@
       }
     }
   }
-  if (tlsPtr_.deoptimization_return_value_stack != nullptr) {
-    for (DeoptimizationReturnValueRecord* record = tlsPtr_.deoptimization_return_value_stack;
+  if (tlsPtr_.deoptimization_context_stack != nullptr) {
+    for (DeoptimizationContextRecord* record = tlsPtr_.deoptimization_context_stack;
          record != nullptr;
          record = record->GetLink()) {
       if (record->IsReference()) {
-        visitor->VisitRootIfNonNull(record->GetGCRoot(),
-            RootInfo(kRootThreadObject, thread_id));
+        visitor->VisitRootIfNonNull(record->GetReturnValueAsGCRoot(),
+                                    RootInfo(kRootThreadObject, thread_id));
       }
+      visitor->VisitRootIfNonNull(record->GetPendingExceptionAsGCRoot(),
+                                  RootInfo(kRootThreadObject, thread_id));
     }
   }
   for (auto* verifier = tlsPtr_.method_verifier; verifier != nullptr; verifier = verifier->link_) {
diff --git a/runtime/thread.h b/runtime/thread.h
index 9bb57bf..2d450f5 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -77,7 +77,7 @@
 class Closure;
 class Context;
 struct DebugInvokeReq;
-class DeoptimizationReturnValueRecord;
+class DeoptimizationContextRecord;
 class DexFile;
 class JavaVMExt;
 struct JNIEnvExt;
@@ -830,19 +830,13 @@
   // and execute Java code, so there might be nested deoptimizations happening.
   // We need to save the ongoing deoptimization shadow frames and return
   // values on stacks.
-  void SetDeoptimizationReturnValue(const JValue& ret_val, bool is_reference) {
-    tls64_.deoptimization_return_value.SetJ(ret_val.GetJ());
-    tls32_.deoptimization_return_value_is_reference = is_reference;
-  }
-  bool IsDeoptimizationReturnValueReference() {
-    return tls32_.deoptimization_return_value_is_reference;
-  }
-  void ClearDeoptimizationReturnValue() {
-    tls64_.deoptimization_return_value.SetJ(0);
-    tls32_.deoptimization_return_value_is_reference = false;
-  }
-  void PushAndClearDeoptimizationReturnValue();
-  JValue PopDeoptimizationReturnValue();
+  void PushDeoptimizationContext(const JValue& return_value, bool is_reference,
+                                 mirror::Throwable* exception)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  void PopDeoptimizationContext(JValue* result, mirror::Throwable** exception)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+  void AssertHasDeoptimizationContext()
+      SHARED_REQUIRES(Locks::mutator_lock_);
   void PushStackedShadowFrame(ShadowFrame* sf, StackedShadowFrameType type);
   ShadowFrame* PopStackedShadowFrame(StackedShadowFrameType type);
 
@@ -1102,9 +1096,8 @@
       suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
-      deoptimization_return_value_is_reference(false), suspended_at_suspend_check(false),
-      ready_for_debug_invoke(false), debug_method_entry_(false), is_gc_marking(false),
-      weak_ref_access_enabled(true) {
+      suspended_at_suspend_check(false), ready_for_debug_invoke(false),
+      debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true) {
     }
 
     union StateAndFlags state_and_flags;
@@ -1144,10 +1137,6 @@
     // True if signal is being handled by this thread.
     bool32_t handling_signal_;
 
-    // True if the return value for interpreter after deoptimization is a reference.
-    // For gc purpose.
-    bool32_t deoptimization_return_value_is_reference;
-
     // True if the thread is suspended in FullSuspendCheck(). This is
     // used to distinguish runnable threads that are suspended due to
     // a normal suspend check from other threads.
@@ -1178,15 +1167,12 @@
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {
-    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    tls_64bit_sized_values() : trace_clock_base(0) {
     }
 
     // The clock base used for tracing.
     uint64_t trace_clock_base;
 
-    // Return value used by deoptimization.
-    JValue deoptimization_return_value;
-
     RuntimeStats stats;
   } tls64_;
 
@@ -1197,7 +1183,7 @@
       stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
       top_handle_scope(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
       instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
-      stacked_shadow_frame_record(nullptr), deoptimization_return_value_stack(nullptr),
+      stacked_shadow_frame_record(nullptr), deoptimization_context_stack(nullptr),
       name(nullptr), pthread_self(0),
       last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
       thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
@@ -1282,7 +1268,7 @@
     StackedShadowFrameRecord* stacked_shadow_frame_record;
 
     // Deoptimization return value record stack.
-    DeoptimizationReturnValueRecord* deoptimization_return_value_stack;
+    DeoptimizationContextRecord* deoptimization_context_stack;
 
     // A cached copy of the java.lang.Thread's name.
     std::string* name;