Merge "Fix tests to pass with VerifyObject."
diff --git a/Android.mk b/Android.mk
index 92f5c70..a179a97 100644
--- a/Android.mk
+++ b/Android.mk
@@ -368,20 +368,59 @@
 .PHONY: use-art
 use-art:
 	adb root && sleep 3
+	adb shell stop
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
-	adb reboot
+	adb shell start
 
 .PHONY: use-artd
 use-artd:
 	adb root && sleep 3
+	adb shell stop
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libartd.so
-	adb reboot
+	adb shell start
 
 .PHONY: use-dalvik
 use-dalvik:
 	adb root && sleep 3
+	adb shell stop
 	adb shell setprop persist.sys.dalvik.vm.lib.1 libdvm.so
-	adb reboot
+	adb shell start
+
+.PHONY: use-art-full
+use-art-full:
+	adb root && sleep 3
+	adb shell stop
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell setprop dalvik.vm.dex2oat-flags ""
+	adb shell setprop dalvik.vm.image-dex2oat-flags ""
+	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
+	adb shell start
+
+.PHONY: use-art-smart
+use-art-smart:
+	adb root && sleep 3
+	adb shell stop
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=interpret-only"
+	adb shell setprop dalvik.vm.image-dex2oat-flags ""
+	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
+	adb shell start
+
+.PHONY: use-art-interpret-only
+use-art-interpret-only:
+	adb root && sleep 3
+	adb shell stop
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.dex
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.oat
+	adb shell rm $(ART_DALVIK_CACHE_DIR)/*.art
+	adb shell setprop dalvik.vm.dex2oat-flags "--compiler-filter=interpret-only"
+	adb shell setprop dalvik.vm.image-dex2oat-flags "--compiler-filter=interpret-only"
+	adb shell setprop persist.sys.dalvik.vm.lib.1 libart.so
+	adb shell start
 
 ########################################################################
 
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index 1c2d16f..243395a 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -17,6 +17,7 @@
 #include "compiler_backend.h"
 #include "compiler_internals.h"
 #include "driver/compiler_driver.h"
+#include "driver/compiler_options.h"
 #include "dataflow_iterator-inl.h"
 #include "leb128.h"
 #include "mirror/object.h"
@@ -25,7 +26,7 @@
 #include "backend.h"
 #include "base/logging.h"
 #include "base/timing_logger.h"
-
+
 #include "dex/quick/dex_file_to_method_inliner_map.h"
 
 namespace art {
@@ -209,13 +210,26 @@
     cu.mir_graph->EnableOpcodeCounting();
   }
 
+  const CompilerOptions& compiler_options = cu.compiler_driver->GetCompilerOptions();
+  CompilerOptions::CompilerFilter compiler_filter = compiler_options.GetCompilerFilter();
+
+  // Check early if we should skip this compilation if using the profiled filter.
+  if (cu.compiler_driver->ProfilePresent()) {
+    std::string methodname = PrettyMethod(method_idx, dex_file);
+    if (cu.mir_graph->SkipCompilation(methodname)) {
+      return NULL;
+    }
+  }
+
   /* Build the raw MIR graph */
   cu.mir_graph->InlineMethod(code_item, access_flags, invoke_type, class_def_idx, method_idx,
                               class_loader, dex_file);
 
   cu.NewTimingSplit("MIROpt:CheckFilters");
-  if (cu.mir_graph->SkipCompilation()) {
-    return NULL;
+  if (compiler_filter != CompilerOptions::kInterpretOnly) {
+    if (cu.mir_graph->SkipCompilation()) {
+      return NULL;
+    }
   }
 
   /* Create the pass driver and launch it */
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index 667ee26..5314bb7 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -999,7 +999,6 @@
 
  /*
   * Will eventually want this to be a bit more sophisticated and happen at verification time.
-  * Ultimate goal is to drive with profile data.
   */
 bool MIRGraph::SkipCompilation() {
   const CompilerOptions& compiler_options = cu_->compiler_driver->GetCompilerOptions();
@@ -1013,8 +1012,7 @@
     return true;
   }
 
-  if (compiler_filter == CompilerOptions::kInterpretOnly) {
-    LOG(WARNING) << "InterpretOnly should ideally be filtered out prior to parsing.";
+  if (compiler_filter == CompilerOptions::kInterpretOnly || compiler_filter == CompilerOptions::kProfiled) {
     return true;
   }
 
@@ -1170,4 +1168,8 @@
   }
 }
 
+bool MIRGraph::SkipCompilation(const std::string& methodname) {
+  return cu_->compiler_driver->SkipCompilation(methodname);
+}
+
 }  // namespace art
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 85d6d89..94b3816 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -383,6 +383,11 @@
   bool SkipCompilation();
 
   /*
+   * Should we skip the compilation of this method based on its name?
+   */
+  bool SkipCompilation(const std::string& methodname);
+
+  /*
    * Parse dex method and add MIR at current insert point.  Returns id (which is
    * actually the index of the method in the m_units_ array).
    */
diff --git a/compiler/dex/verification_results.cc b/compiler/dex/verification_results.cc
index 947c22d..6b0875c 100644
--- a/compiler/dex/verification_results.cc
+++ b/compiler/dex/verification_results.cc
@@ -110,7 +110,7 @@
   if (((access_flags & kAccConstructor) != 0) && ((access_flags & kAccStatic) != 0)) {
     return false;
   }
-  return (compiler_options_->GetCompilerFilter() != CompilerOptions::kInterpretOnly);
+  return true;
 }
 
 }  // namespace art
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index d3d58c9..a46015d 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -21,6 +21,7 @@
 
 #include <vector>
 #include <unistd.h>
+#include <fstream>
 
 #include "base/stl_util.h"
 #include "base/timing_logger.h"
@@ -303,8 +304,9 @@
                                InstructionSet instruction_set,
                                InstructionSetFeatures instruction_set_features,
                                bool image, DescriptorSet* image_classes, size_t thread_count,
-                               bool dump_stats, bool dump_passes, CumulativeLogger* timer)
-    : compiler_options_(compiler_options),
+                               bool dump_stats, bool dump_passes, CumulativeLogger* timer,
+                               std::string profile_file)
+    : profile_ok_(false), compiler_options_(compiler_options),
       verification_results_(verification_results),
       method_inliner_map_(method_inliner_map),
       compiler_backend_(CompilerBackend::Create(compiler_backend_kind)),
@@ -338,6 +340,11 @@
 
   CHECK_PTHREAD_CALL(pthread_key_create, (&tls_key_, NULL), "compiler tls key");
 
+  // Read the profile file if one is provided.
+  if (profile_file != "") {
+    profile_ok_ = ReadProfile(profile_file);
+  }
+
   dex_to_dex_compiler_ = reinterpret_cast<DexToDexCompilerFn>(ArtCompileDEX);
 
   compiler_backend_->Init(*this);
@@ -1936,7 +1943,6 @@
   } else {
     MethodReference method_ref(&dex_file, method_idx);
     bool compile = verification_results_->IsCandidateForCompilation(method_ref, access_flags);
-
     if (compile) {
       // NOTE: if compiler declines to compile this method, it will return NULL.
       compiled_method = compiler_backend_->Compile(
@@ -2073,4 +2079,86 @@
       LOG(FATAL) << "Unknown instruction set: " << instruction_set;
     }
   }
+
+bool CompilerDriver::ReadProfile(const std::string& filename) {
+  VLOG(compiler) << "reading profile file " << filename;
+  struct stat st;
+  int err = stat(filename.c_str(), &st);
+  if (err == -1) {
+    VLOG(compiler) << "not found";
+    return false;
+  }
+  std::ifstream in(filename.c_str());
+  if (!in) {
+    VLOG(compiler) << "profile file " << filename << " exists but can't be opened";
+    VLOG(compiler) << "file owner: " << st.st_uid << ":" << st.st_gid;
+    VLOG(compiler) << "me: " << getuid() << ":" << getgid();
+    VLOG(compiler) << "file permissions: " << std::oct << st.st_mode;
+    VLOG(compiler) << "errno: " << errno;
+    return false;
+  }
+  // The first line contains summary information.
+  std::string line;
+  std::getline(in, line);
+  if (in.eof()) {
+    return false;
+  }
+  std::vector<std::string> summary_info;
+  Split(line, '/', summary_info);
+  if (summary_info.size() != 3) {
+    // Bad summary info.  It should be count/total/bootpath
+    return false;
+  }
+  // This is the number of hits in all methods.
+  uint32_t total_count = 0;
+  for (int i = 0 ; i < 3; ++i) {
+    total_count += atoi(summary_info[i].c_str());
+  }
+  if (total_count == 0) return false;  // Empty profile; also avoids division by zero below.
+  // Now read each line until the end of file.  Each line consists of 3 fields separated by /
+  while (!in.eof()) {
+    std::getline(in, line);
+    if (in.eof()) {
+      break;
+    }
+    std::vector<std::string> info;
+    Split(line, '/', info);
+    if (info.size() != 3) {
+      // Malformed.
+      break;
+    }
+    const std::string& methodname = info[0];
+    uint32_t count = atoi(info[1].c_str());
+    uint32_t size = atoi(info[2].c_str());
+    double percent = (count * 100.0) / total_count;
+    // Add it to the profile map
+    profile_map_[methodname] = ProfileData(methodname, count, size, percent);
+  }
+  return true;
+}
+
+bool CompilerDriver::SkipCompilation(const std::string& method_name) {
+  if (!profile_ok_) {
+    return true;
+  }
+  constexpr double kThresholdPercent = 2.0;      // Anything above this threshold will be compiled.
+
+  // First find the method in the profile map.
+  ProfileMap::iterator i = profile_map_.find(method_name);
+  if (i == profile_map_.end()) {
+    // Not in profile, no information can be determined.
+    VLOG(compiler) << "not compiling " << method_name << " because it's not in the profile";
+    return true;
+  }
+  const ProfileData& data = i->second;
+  bool compile = data.IsAbove(kThresholdPercent);
+  if (compile) {
+    LOG(INFO) << "compiling method " << method_name << " because its usage is " <<
+        data.GetPercent() << "%";
+  } else {
+    VLOG(compiler) << "not compiling method " << method_name << " because usage is too low ("
+        << data.GetPercent() << "%)";
+  }
+  return !compile;
+}
 }  // namespace art
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index ac70e5a..12463a9 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -105,7 +105,8 @@
                           InstructionSetFeatures instruction_set_features,
                           bool image, DescriptorSet* image_classes,
                           size_t thread_count, bool dump_stats, bool dump_passes,
-                          CumulativeLogger* timer);
+                          CumulativeLogger* timer,
+                          std::string profile_file = "");
 
   ~CompilerDriver();
 
@@ -141,6 +142,10 @@
     return compiler_backend_.get();
   }
 
+  bool ProfilePresent() const {
+    return profile_ok_;
+  }
+
   // Are we compiling and creating an image file?
   bool IsImage() const {
     return image_;
@@ -554,6 +559,37 @@
     return cfi_info_.get();
   }
 
+  // Profile data.  This is generated from previous runs of the program and stored
+  // in a file.  It is used to determine whether to compile a particular method or not.
+  class ProfileData {
+   public:
+    ProfileData() : count_(0), method_size_(0), percent_(0) {}
+    ProfileData(std::string method_name, uint32_t count, uint32_t method_size, double percent) :
+      method_name_(method_name), count_(count), method_size_(method_size), percent_(percent) {
+    }
+
+    bool IsAbove(double v) const { return percent_ >= v; }
+    double GetPercent() const { return percent_; }
+
+   private:
+    std::string method_name_;   // Method name.
+    uint32_t count_;            // Number of times the method has been called.
+    uint32_t method_size_;      // Size of the method in dex instructions.
+    double percent_;            // Percentage of time spent in this method.
+  };
+
+  // Profile data is stored in a map, indexed by the full method name.
+  typedef std::map<const std::string, ProfileData> ProfileMap;
+  ProfileMap profile_map_;
+  bool profile_ok_;
+
+  // Read the profile data from the given file.  Calculates the percentage for each method.
+  // Returns false if there was no profile file or it was malformed.
+  bool ReadProfile(const std::string& filename);
+
+  // Returns true if compilation of this method should be skipped, based on the profile data.
+  bool SkipCompilation(const std::string& method_name);
+
  private:
   // Compute constant code and method pointers when possible
   void GetCodeAndMethodForDirectCall(InvokeType* type, InvokeType sharp_type,
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 39738ab..0cca1e9 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -23,6 +23,7 @@
  public:
   enum CompilerFilter {
     kInterpretOnly,       // Compile nothing.
+    kProfiled,            // Compile based on profile.
     kSpace,               // Maximize space savings.
     kBalanced,            // Try to get the best performance return on compilation investment.
     kSpeed,               // Maximize runtime performance.
@@ -30,7 +31,11 @@
   };
 
   // Guide heuristics to determine whether to compile method if profile data not available.
+#if ART_SMALL_MODE
+  static const CompilerFilter kDefaultCompilerFilter = kProfiled;
+#else
   static const CompilerFilter kDefaultCompilerFilter = kSpeed;
+#endif
   static const size_t kDefaultHugeMethodThreshold = 10000;
   static const size_t kDefaultLargeMethodThreshold = 600;
   static const size_t kDefaultSmallMethodThreshold = 60;
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 7c81ffb..cc78816 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -200,6 +200,8 @@
   UsageError("      such as initial heap size, maximum heap size, and verbose output.");
   UsageError("      Use a separate --runtime-arg switch for each argument.");
   UsageError("      Example: --runtime-arg -Xms256m");
+  UsageError("");
+  UsageError("  --profile-file=<filename>: specify profiler output file to use for compilation.");
   UsageError("");
   std::cerr << "See log for usage error information\n";
   exit(EXIT_FAILURE);
@@ -310,7 +312,8 @@
                                       bool dump_stats,
                                       bool dump_passes,
                                       TimingLogger& timings,
-                                      CumulativeLogger& compiler_phases_timings) {
+                                      CumulativeLogger& compiler_phases_timings,
+                                      std::string profile_file) {
     // SirtRef and ClassLoader creation needs to come after Runtime::Create
     jobject class_loader = NULL;
     Thread* self = Thread::Current();
@@ -340,7 +343,8 @@
                                                         thread_count_,
                                                         dump_stats,
                                                         dump_passes,
-                                                        &compiler_phases_timings));
+                                                        &compiler_phases_timings,
+                                                        profile_file));
 
     driver->GetCompilerBackend()->SetBitcodeFileName(*driver.get(), bitcode_filename);
 
@@ -742,6 +746,8 @@
   InstructionSet instruction_set = kNone;
 #endif
 
+  // Profile file to use
+  std::string profile_file;
 
   bool is_host = false;
   bool dump_stats = false;
@@ -896,6 +902,12 @@
       dump_passes = true;
     } else if (option == "--dump-stats") {
       dump_stats = true;
+    } else if (option.starts_with("--profile-file=")) {
+      profile_file = option.substr(strlen("--profile-file=")).data();
+      VLOG(compiler) << "dex2oat: profile file is " << profile_file;
+    } else if (option == "--no-profile-file") {
+      LOG(INFO) << "dex2oat: no profile file supplied (explicitly)";
+      // No profile
     } else {
       Usage("Unknown argument %s", option.data());
     }
@@ -1204,7 +1216,8 @@
                                                                   dump_stats,
                                                                   dump_passes,
                                                                   timings,
-                                                                  compiler_phases_timings));
+                                                                  compiler_phases_timings,
+                                                                  profile_file));
 
   if (compiler.get() == NULL) {
     LOG(ERROR) << "Failed to create oat file: " << oat_location;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index da09861..a78a1e5 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -766,7 +766,11 @@
     // 16-byte aligned: 4336
     // Note: 14x8 = 7*16, so the stack stays aligned for the native call...
     //       Also means: the padding is somewhere in the middle
-    subq LITERAL(4336), %rsp
+    //
+    //
+    // New test: use 5K and release
+    // 5k = 5120
+    subq LITERAL(5120), %rsp
     // prepare for artQuickGenericJniTrampoline call
     // (Thread*,  SP)
     //    rdi    rsi      <= C calling convention
@@ -774,9 +778,13 @@
     movq %gs:THREAD_SELF_OFFSET, %rdi
     movq %rbp, %rsi
     call PLT_SYMBOL(artQuickGenericJniTrampoline)  // (Thread*, sp)
-    test %rax, %rax                 // check whether code pointer is NULL, also indicates exception
-    jz 1f
-    // pop from the register-passing alloca
+    test %rax, %rax                 // check whether error (negative value)
+    js 1f
+    // release part of the alloca
+    addq %rax, %rsp
+    // get the code pointer
+    popq %rax
+    // pop from the register-passing alloca region
     // what's the right layout?
     popq %rdi
     popq %rsi
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index bf8b8ba..1bbaa6a 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -820,84 +820,492 @@
   return code;
 }
 
-// Visits arguments on the stack placing them into a region lower down the stack for the benefit
-// of transitioning into native code.
-class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor {
+
+
+/*
+ * This class uses a couple of observations to unite the different calling conventions through
+ * a few constants.
+ *
+ * 1) Number of registers used for passing is normally even, so counting down has no penalty for
+ *    possible alignment.
+ * 2) Known 64b architectures store 8B units on the stack, both for integral and floating point
+ *    types, so using uintptr_t is OK. Also means that we can use kRegistersNeededX to denote
+ *    when we have to split things
+ * 3) The only soft-float, Arm, is 32b, so no widening needs to be taken into account for floats
+ *    and we can use Int handling directly.
+ * 4) Only 64b architectures widen, and their stack is aligned 8B anyways, so no padding code
+ *    necessary when widening. Also, widening of Ints will take place implicitly, and the
+ *    extension should be compatible with Aarch64, which mandates copying the available bits
+ *    into LSB and leaving the rest unspecified.
+ * 5) Aligning longs and doubles is necessary on arm only, and it's the same in registers and on
+ *    the stack.
+ * 6) There is only little endian.
+ *
+ *
+ * Actual work is supposed to be done in a delegate of the template type. The interface is as
+ * follows:
+ *
+ * void PushGpr(uintptr_t):   Add a value for the next GPR
+ *
+ * void PushFpr4(float):      Add a value for the next FPR of size 32b. Is only called if we need
+ *                            padding, that is, think the architecture is 32b and aligns 64b.
+ *
+ * void PushFpr8(uint64_t):   Push a double. We _will_ call this on 32b, it's the callee's job to
+ *                            split this if necessary. The current state will have aligned, if
+ *                            necessary.
+ *
+ * void PushStack(uintptr_t): Push a value to the stack.
+ *
+ * uintptr_t PushSirt(mirror::Object* ref): Add a reference to the Sirt. Is guaranteed != nullptr.
+ *                                          Must return the jobject, that is, the reference to the
+ *                                          entry in the Sirt.
+ *
+ */
+template <class T> class BuildGenericJniFrameStateMachine {
+ public:
 #if defined(__arm__)
   // TODO: These are all dummy values!
-  static constexpr bool kNativeSoftFloatAbi = false;  // This is a hard float ABI.
-  static constexpr size_t kNumNativeGprArgs = 3;  // 3 arguments passed in GPRs.
+  static constexpr bool kNativeSoftFloatAbi = true;
+  static constexpr size_t kNumNativeGprArgs = 4;  // 4 arguments passed in GPRs, r0-r3
   static constexpr size_t kNumNativeFprArgs = 0;  // 0 arguments passed in FPRs.
 
-  static constexpr size_t kGprStackOffset = 4336;
-  static constexpr size_t kFprStackOffset = 4336 - 6*8;
-  static constexpr size_t kCallStackStackOffset = 4336 - 112;
-
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
+  static constexpr bool kMultiRegistersAligned = true;
+  static constexpr bool kMultiRegistersWidened = false;
+  static constexpr bool kAlignLongOnStack = true;
+  static constexpr bool kAlignDoubleOnStack = true;
 #elif defined(__mips__)
   // TODO: These are all dummy values!
   static constexpr bool kNativeSoftFloatAbi = true;  // This is a hard float ABI.
   static constexpr size_t kNumNativeGprArgs = 0;  // 6 arguments passed in GPRs.
   static constexpr size_t kNumNativeFprArgs = 0;  // 8 arguments passed in FPRs.
 
-  // update these
-  static constexpr size_t kGprStackOffset = 4336;
-  static constexpr size_t kFprStackOffset = 4336 - 6*8;
-  static constexpr size_t kCallStackStackOffset = 4336 - 112;
-
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
+  static constexpr bool kMultiRegistersAligned = true;
+  static constexpr bool kMultiRegistersWidened = true;
+  static constexpr bool kAlignLongOnStack = false;
+  static constexpr bool kAlignDoubleOnStack = false;
 #elif defined(__i386__)
   // TODO: Check these!
-  static constexpr bool kNativeSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr bool kNativeSoftFloatAbi = false;  // Not using int registers for fp
   static constexpr size_t kNumNativeGprArgs = 0;  // 6 arguments passed in GPRs.
   static constexpr size_t kNumNativeFprArgs = 0;  // 8 arguments passed in FPRs.
 
-  // update these
-  static constexpr size_t kGprStackOffset = 4336;
-  static constexpr size_t kFprStackOffset = 4336 - 6*8;
-  static constexpr size_t kCallStackStackOffset = 4336 - 112;
-
   static constexpr size_t kRegistersNeededForLong = 2;
   static constexpr size_t kRegistersNeededForDouble = 2;
+  static constexpr bool kMultiRegistersAligned = false;       // x86 not using regs, anyways
+  static constexpr bool kMultiRegistersWidened = false;
+  static constexpr bool kAlignLongOnStack = false;
+  static constexpr bool kAlignDoubleOnStack = false;
 #elif defined(__x86_64__)
   static constexpr bool kNativeSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr size_t kNumNativeGprArgs = 6;  // 6 arguments passed in GPRs.
   static constexpr size_t kNumNativeFprArgs = 8;  // 8 arguments passed in FPRs.
 
-  static constexpr size_t kGprStackOffset = 4336;
-  static constexpr size_t kFprStackOffset = 4336 - 6*8;
-  static constexpr size_t kCallStackStackOffset = 4336 - 112;
-
   static constexpr size_t kRegistersNeededForLong = 1;
   static constexpr size_t kRegistersNeededForDouble = 1;
+  static constexpr bool kMultiRegistersAligned = false;
+  static constexpr bool kMultiRegistersWidened = true;
+  static constexpr bool kAlignLongOnStack = false;
+  static constexpr bool kAlignDoubleOnStack = false;
 #else
 #error "Unsupported architecture"
 #endif
 
+ public:
+  explicit BuildGenericJniFrameStateMachine(T* delegate) : gpr_index_(kNumNativeGprArgs),
+                                                           fpr_index_(kNumNativeFprArgs),
+                                                           stack_entries_(0),
+                                                           delegate_(delegate) {
+    // For register alignment, we want to assume that counters (gpr_index_, fpr_index_) are even iff
+    // the next register is even; counting down is just to make the compiler happy...
+    CHECK_EQ(kNumNativeGprArgs % 2, 0U);
+    CHECK_EQ(kNumNativeFprArgs % 2, 0U);
+  }
 
+  virtual ~BuildGenericJniFrameStateMachine() {}
+
+  bool HavePointerGpr() {
+    return gpr_index_ > 0;
+  }
+
+  void AdvancePointer(void* val) {
+    if (HavePointerGpr()) {
+      gpr_index_--;
+      PushGpr(reinterpret_cast<uintptr_t>(val));
+    } else {
+      stack_entries_++;         // TODO: have a field for pointer length as multiple of 32b
+      PushStack(reinterpret_cast<uintptr_t>(val));
+      gpr_index_ = 0;
+    }
+  }
+
+
+  bool HaveSirtGpr() {
+    return gpr_index_ > 0;
+  }
+
+  void AdvanceSirt(mirror::Object* ptr) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    uintptr_t sirtRef;
+    if (ptr != nullptr) {
+      sirtRef = PushSirt(ptr);
+    } else {
+      sirtRef = reinterpret_cast<uintptr_t>(nullptr);
+    }
+    if (HaveSirtGpr()) {
+      gpr_index_--;
+      PushGpr(sirtRef);
+    } else {
+      stack_entries_++;
+      PushStack(sirtRef);
+      gpr_index_ = 0;
+    }
+  }
+
+
+  bool HaveIntGpr() {
+    return gpr_index_ > 0;
+  }
+
+  void AdvanceInt(uint32_t val) {
+    if (HaveIntGpr()) {
+      gpr_index_--;
+      PushGpr(val);
+    } else {
+      stack_entries_++;
+      PushStack(val);
+      gpr_index_ = 0;
+    }
+  }
+
+
+  bool HaveLongGpr() {
+    return gpr_index_ >= kRegistersNeededForLong + (LongGprNeedsPadding() ? 1 : 0);
+  }
+
+  bool LongGprNeedsPadding() {
+    return kRegistersNeededForLong > 1 &&     // only pad when using multiple registers
+        kAlignLongOnStack &&                  // and when it needs alignment
+        (gpr_index_ & 1) == 1;                // counter is odd, see constructor
+  }
+
+  bool LongStackNeedsPadding() {
+    return kRegistersNeededForLong > 1 &&     // only pad when using multiple registers
+        kAlignLongOnStack &&                  // and when it needs 8B alignment
+        (stack_entries_ & 1) == 1;            // counter is odd
+  }
+
+  void AdvanceLong(uint64_t val) {
+    if (HaveLongGpr()) {
+      if (LongGprNeedsPadding()) {
+        PushGpr(0);
+        gpr_index_--;
+      }
+      if (kRegistersNeededForLong == 1) {
+        PushGpr(static_cast<uintptr_t>(val));
+      } else {
+        PushGpr(static_cast<uintptr_t>(val & 0xFFFFFFFF));
+        PushGpr(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF));
+      }
+      gpr_index_ -= kRegistersNeededForLong;
+    } else {
+      if (LongStackNeedsPadding()) {
+        PushStack(0);
+        stack_entries_++;
+      }
+      if (kRegistersNeededForLong == 1) {
+        PushStack(static_cast<uintptr_t>(val));
+        stack_entries_++;
+      } else {
+        PushStack(static_cast<uintptr_t>(val & 0xFFFFFFFF));
+        PushStack(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF));
+        stack_entries_ += 2;
+      }
+      gpr_index_ = 0;
+    }
+  }
+
+
+  bool HaveFloatFpr() {
+    return fpr_index_ > 0;
+  }
+
+  // Bit-cast |in| to a V; zero-fills the high bits when U is narrower than V.
+  template <typename U, typename V> V convert(U in) {
+    CHECK_LE(sizeof(U), sizeof(V));
+    union { U u; V v; } tmp;
+    tmp.v = 0; tmp.u = in;  // Zero v first: bytes beyond sizeof(U) would otherwise be undefined.
+    return tmp.v;
+  }
+
+  void AdvanceFloat(float val) {
+    if (kNativeSoftFloatAbi) {
+      AdvanceInt(convert<float, uint32_t>(val));
+    } else {
+      if (HaveFloatFpr()) {
+        fpr_index_--;
+        if (kRegistersNeededForDouble == 1) {
+          if (kMultiRegistersWidened) {
+            PushFpr8(convert<double, uint64_t>(val));
+          } else {
+            // No widening, just use the bits.
+            PushFpr8(convert<float, uint64_t>(val));
+          }
+        } else {
+          PushFpr4(val);
+        }
+      } else {
+        stack_entries_++;
+        if (kRegistersNeededForDouble == 1 && kMultiRegistersWidened) {
+          // Need to widen before storing: Note the "double" in the template instantiation.
+          PushStack(convert<double, uintptr_t>(val));
+        } else {
+          PushStack(convert<float, uintptr_t>(val));
+        }
+        fpr_index_ = 0;
+      }
+    }
+  }
+
+
+  bool HaveDoubleFpr() {
+    return fpr_index_ >= kRegistersNeededForDouble + (DoubleFprNeedsPadding() ? 1 : 0);
+  }
+
+  bool DoubleFprNeedsPadding() {
+    return kRegistersNeededForDouble > 1 &&     // only pad when using multiple registers
+        kAlignDoubleOnStack &&                  // and when it needs alignment
+        (fpr_index_ & 1) == 1;                  // counter is odd, see constructor
+  }
+
+  bool DoubleStackNeedsPadding() {
+    return kRegistersNeededForDouble > 1 &&     // only pad when using multiple registers
+        kAlignDoubleOnStack &&                  // and when it needs 8B alignment
+        (stack_entries_ & 1) == 1;              // counter is odd
+  }
+
+  void AdvanceDouble(uint64_t val) {
+    if (kNativeSoftFloatAbi) {
+      AdvanceLong(val);
+    } else {
+      if (HaveDoubleFpr()) {
+        if (DoubleFprNeedsPadding()) {
+          PushFpr4(0);
+          fpr_index_--;
+        }
+        PushFpr8(val);
+        fpr_index_ -= kRegistersNeededForDouble;
+      } else {
+        if (DoubleStackNeedsPadding()) {
+          PushStack(0);
+          stack_entries_++;
+        }
+        if (kRegistersNeededForDouble == 1) {
+          PushStack(static_cast<uintptr_t>(val));
+          stack_entries_++;
+        } else {
+          PushStack(static_cast<uintptr_t>(val & 0xFFFFFFFF));
+          PushStack(static_cast<uintptr_t>((val >> 32) & 0xFFFFFFFF));
+          stack_entries_ += 2;
+        }
+        fpr_index_ = 0;
+      }
+    }
+  }
+
+  uint32_t getStackEntries() {
+    return stack_entries_;
+  }
+
+  uint32_t getNumberOfUsedGprs() {
+    return kNumNativeGprArgs - gpr_index_;
+  }
+
+  uint32_t getNumberOfUsedFprs() {
+    return kNumNativeFprArgs - fpr_index_;
+  }
+
+ private:
+  void PushGpr(uintptr_t val) {
+    delegate_->PushGpr(val);
+  }
+  void PushFpr4(float val) {
+    delegate_->PushFpr4(val);
+  }
+  void PushFpr8(uint64_t val) {
+    delegate_->PushFpr8(val);
+  }
+  void PushStack(uintptr_t val) {
+    delegate_->PushStack(val);
+  }
+  uintptr_t PushSirt(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return delegate_->PushSirt(ref);
+  }
+
+  uint32_t gpr_index_;      // Number of free GPRs
+  uint32_t fpr_index_;      // Number of free FPRs
+  uint32_t stack_entries_;  // Stack entries are in multiples of 32b, as floats are usually not
+                            // extended
+  T* delegate_;             // What Push implementation gets called
+};
+
+class ComputeGenericJniFrameSize FINAL {
+ public:
+  ComputeGenericJniFrameSize() : num_sirt_references_(0), num_stack_entries_(0) {}
+
+  // (negative) offset from SP to top of Sirt.
+  uint32_t GetSirtOffset() {
+    return 8;
+  }
+
+  uint32_t GetFirstSirtEntryOffset() {
+    return GetSirtOffset() + sizeof(StackReference<mirror::Object>);
+  }
+
+  uint32_t GetNumSirtReferences() {
+    return num_sirt_references_;
+  }
+
+  uint32_t GetStackSize() {
+    return num_stack_entries_ * sizeof(uintptr_t);
+  }
+
+  void ComputeLayout(bool is_static, const char* shorty, uint32_t shorty_len, void* sp,
+                     StackReference<mirror::Object>** start_sirt, StackIndirectReferenceTable** table,
+                     uint32_t* sirt_entries, uintptr_t** start_stack, uintptr_t** start_gpr,
+                     uint32_t** start_fpr, void** code_return, size_t* overall_size)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    ComputeAll(is_static, shorty, shorty_len);
+
+    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
+    *start_sirt = reinterpret_cast<StackReference<mirror::Object>*>(sp8-GetFirstSirtEntryOffset());
+
+    // Add padding entries if necessary for alignment.
+    if (sizeof(uintptr_t) < sizeof(uint64_t)) {
+      uint32_t size = sizeof(uintptr_t) * num_sirt_references_;
+      uint32_t rem = size % 8;
+      if (rem != 0) {
+        DCHECK_EQ(rem, 4U);
+        num_sirt_references_++;
+      }
+    }
+    *sirt_entries = num_sirt_references_;
+    size_t sirt_size = StackIndirectReferenceTable::SizeOf(num_sirt_references_);
+    sp8 -= GetSirtOffset() + sirt_size;
+    *table = reinterpret_cast<StackIndirectReferenceTable*>(sp8);
+
+    sp8 -= GetStackSize();
+    // Now align the call stack under the Sirt. This aligns by 16.
+    uintptr_t mask = ~0x0F;
+    sp8 = reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(sp8) & mask);
+    *start_stack = reinterpret_cast<uintptr_t*>(sp8);
+
+    // put fprs and gprs below
+    // Assumption is OK right now, as we have soft-float arm
+    size_t fregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeFprArgs;
+    sp8 -= fregs * sizeof(uintptr_t);
+    *start_fpr = reinterpret_cast<uint32_t*>(sp8);
+    size_t iregs = BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize>::kNumNativeGprArgs;
+    sp8 -= iregs * sizeof(uintptr_t);
+    *start_gpr = reinterpret_cast<uintptr_t*>(sp8);
+
+    // reserve space for the code pointer
+    sp8 -= sizeof(void*);
+    *code_return = reinterpret_cast<void*>(sp8);
+
+    *overall_size = reinterpret_cast<uint8_t*>(sp) - sp8;
+  }
+
+  void ComputeSirtOffset() { }  // nothing to do, static right now
+
+  void ComputeAll(bool is_static, const char* shorty, uint32_t shorty_len)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    BuildGenericJniFrameStateMachine<ComputeGenericJniFrameSize> sm(this);
+
+    // JNIEnv
+    sm.AdvancePointer(nullptr);
+
+    // Class object or this as first argument
+    sm.AdvanceSirt(reinterpret_cast<mirror::Object*>(0x12345678));
+
+    for (uint32_t i = 1; i < shorty_len; ++i) {
+      Primitive::Type cur_type_ = Primitive::GetType(shorty[i]);
+      switch (cur_type_) {
+        case Primitive::kPrimNot:
+          sm.AdvanceSirt(reinterpret_cast<mirror::Object*>(0x12345678));
+          break;
+
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte:
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:
+        case Primitive::kPrimInt:
+          sm.AdvanceInt(0);
+          break;
+        case Primitive::kPrimFloat:
+          sm.AdvanceFloat(0);
+          break;
+        case Primitive::kPrimDouble:
+          sm.AdvanceDouble(0);
+          break;
+        case Primitive::kPrimLong:
+          sm.AdvanceLong(0);
+          break;
+        default:
+          LOG(FATAL) << "Unexpected type: " << cur_type_ << " in " << shorty;
+      }
+    }
+
+    num_stack_entries_ = sm.getStackEntries();
+  }
+
+  void PushGpr(uintptr_t /* val */) {
+    // not optimizing registers, yet
+  }
+
+  void PushFpr4(float /* val */) {
+    // not optimizing registers, yet
+  }
+
+  void PushFpr8(uint64_t /* val */) {
+    // not optimizing registers, yet
+  }
+
+  void PushStack(uintptr_t /* val */) {
+    // counting is already done in the superclass
+  }
+
+  uintptr_t PushSirt(mirror::Object* /* ptr */) {
+    num_sirt_references_++;
+    return reinterpret_cast<uintptr_t>(nullptr);
+  }
+
+ private:
+  uint32_t num_sirt_references_;
+  uint32_t num_stack_entries_;
+};
+
+// Visits arguments on the stack placing them into a region lower down the stack for the benefit
+// of transitioning into native code.
+class BuildGenericJniFrameVisitor FINAL : public QuickArgumentVisitor {
  public:
   BuildGenericJniFrameVisitor(mirror::ArtMethod** sp, bool is_static, const char* shorty,
                               uint32_t shorty_len, Thread* self) :
-      QuickArgumentVisitor(sp, is_static, shorty, shorty_len) {
-    // size of cookie plus padding
-    uint8_t* sp8 = reinterpret_cast<uint8_t*>(sp);
-    top_of_sirt_ =  sp8 - 8;
-    cur_sirt_entry_ = reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1;
+      QuickArgumentVisitor(sp, is_static, shorty, shorty_len), sm_(this) {
+    ComputeGenericJniFrameSize fsc;
+    fsc.ComputeLayout(is_static, shorty, shorty_len, sp, &cur_sirt_entry_, &sirt_,
+                      &sirt_expected_refs_, &cur_stack_arg_, &cur_gpr_reg_, &cur_fpr_reg_,
+                      &code_return_, &alloca_used_size_);
     sirt_number_of_references_ = 0;
-    gpr_index_ = kNumNativeGprArgs;
-    fpr_index_ = kNumNativeFprArgs;
-
-    cur_gpr_reg_ = reinterpret_cast<uintptr_t*>(sp8 - kGprStackOffset);
-    cur_fpr_reg_ = reinterpret_cast<uint32_t*>(sp8 - kFprStackOffset);
-    cur_stack_arg_ = reinterpret_cast<uintptr_t*>(sp8 - kCallStackStackOffset);
+    top_of_sirt_ = cur_sirt_entry_;
 
     // jni environment is always first argument
-    PushPointer(self->GetJniEnv());
+    sm_.AdvancePointer(self->GetJniEnv());
 
     if (is_static) {
-      PushArgumentInSirt((*sp)->GetDeclaringClass());
+      sm_.AdvanceSirt((*sp)->GetDeclaringClass());
     }
   }
 
@@ -911,7 +1319,7 @@
         } else {
           long_arg = *reinterpret_cast<jlong*>(GetParamAddress());
         }
-        PushLongArgument(long_arg);
+        sm_.AdvanceLong(long_arg);
         break;
       }
       case Primitive::kPrimDouble: {
@@ -922,24 +1330,24 @@
         } else {
           double_arg = *reinterpret_cast<uint64_t*>(GetParamAddress());
         }
-        PushDoubleArgument(double_arg);
+        sm_.AdvanceDouble(double_arg);
         break;
       }
       case Primitive::kPrimNot: {
         StackReference<mirror::Object>* stack_ref =
             reinterpret_cast<StackReference<mirror::Object>*>(GetParamAddress());
-        PushArgumentInSirt(stack_ref->AsMirrorPtr());
+        sm_.AdvanceSirt(stack_ref->AsMirrorPtr());
         break;
       }
       case Primitive::kPrimFloat:
-        PushFloatArgument(*reinterpret_cast<int32_t*>(GetParamAddress()));
+        sm_.AdvanceFloat(*reinterpret_cast<float*>(GetParamAddress()));
         break;
       case Primitive::kPrimBoolean:  // Fall-through.
       case Primitive::kPrimByte:     // Fall-through.
       case Primitive::kPrimChar:     // Fall-through.
       case Primitive::kPrimShort:    // Fall-through.
       case Primitive::kPrimInt:      // Fall-through.
-        PushIntArgument(*reinterpret_cast<jint*>(GetParamAddress()));
+        sm_.AdvanceInt(*reinterpret_cast<jint*>(GetParamAddress()));
         break;
       case Primitive::kPrimVoid:
         LOG(FATAL) << "UNREACHABLE";
@@ -948,149 +1356,87 @@
   }
 
   void FinalizeSirt(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (!IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_))) {
-      sirt_number_of_references_++;
+    // Initialize padding entries.
+    while (sirt_number_of_references_ < sirt_expected_refs_) {
       *cur_sirt_entry_ = StackReference<mirror::Object>();
       cur_sirt_entry_--;
+      sirt_number_of_references_++;
     }
-    CHECK(IsAligned<8>(StackIndirectReferenceTable::SizeOf(sirt_number_of_references_)));
-    StackIndirectReferenceTable* sirt = reinterpret_cast<StackIndirectReferenceTable*>(
-        top_of_sirt_ - StackIndirectReferenceTable::SizeOf(sirt_number_of_references_));
+    sirt_->SetNumberOfReferences(sirt_expected_refs_);
 
-    sirt->SetNumberOfReferences(sirt_number_of_references_);
-    self->PushSirt(sirt);
+    // Install Sirt.
+    self->PushSirt(sirt_);
   }
 
   jobject GetFirstSirtEntry() {
-    return reinterpret_cast<jobject>(reinterpret_cast<StackReference<mirror::Object>*>(top_of_sirt_) - 1);
+    return reinterpret_cast<jobject>(top_of_sirt_);
+  }
+
+  void PushGpr(uintptr_t val) {
+    *cur_gpr_reg_ = val;
+    cur_gpr_reg_++;
+  }
+
+  void PushFpr4(float val) {
+    *cur_fpr_reg_ = val;
+    cur_fpr_reg_++;
+  }
+
+  void PushFpr8(uint64_t val) {
+    uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_fpr_reg_);
+    *tmp = val;
+    cur_fpr_reg_ += 2;
+  }
+
+  void PushStack(uintptr_t val) {
+    *cur_stack_arg_ = val;
+    cur_stack_arg_++;
+  }
+
+  uintptr_t PushSirt(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    *cur_sirt_entry_ = StackReference<mirror::Object>::FromMirrorPtr(ref);
+    uintptr_t tmp = reinterpret_cast<uintptr_t>(cur_sirt_entry_);
+    cur_sirt_entry_--;
+    sirt_number_of_references_++;
+    return tmp;
+  }
+
+  // Size of the part of the alloca that we actually need.
+  size_t GetAllocaUsedSize() {
+    return alloca_used_size_;
+  }
+
+  void* GetCodeReturn() {
+    return code_return_;
   }
 
  private:
-  void PushArgumentInSirt(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    // Do something to push into the SIRT.
-    uintptr_t sirt_or_null;
-    if (obj != nullptr) {
-      sirt_number_of_references_++;
-      *cur_sirt_entry_ = StackReference<mirror::Object>::FromMirrorPtr(obj);
-      sirt_or_null = reinterpret_cast<uintptr_t>(cur_sirt_entry_);
-      cur_sirt_entry_--;
-    } else {
-      sirt_or_null = reinterpret_cast<uintptr_t>(nullptr);
-    }
-    // Push the GPR or stack arg.
-    if (gpr_index_ > 0) {
-      *cur_gpr_reg_ = sirt_or_null;
-      cur_gpr_reg_++;
-      gpr_index_--;
-    } else {
-      *cur_stack_arg_ = sirt_or_null;
-      cur_stack_arg_++;
-    }
-  }
-
-  void PushPointer(void* val) {
-    if (gpr_index_ > 0) {
-      *cur_gpr_reg_ = reinterpret_cast<uintptr_t>(val);
-      cur_gpr_reg_++;
-      gpr_index_--;
-    } else {
-      *cur_stack_arg_ = reinterpret_cast<uintptr_t>(val);
-      cur_stack_arg_++;
-    }
-  }
-
-  void PushIntArgument(jint val) {
-    if (gpr_index_ > 0) {
-      *cur_gpr_reg_ = val;
-      cur_gpr_reg_++;
-      gpr_index_--;
-    } else {
-      *cur_stack_arg_ = val;
-      cur_stack_arg_++;
-    }
-  }
-
-  void PushLongArgument(jlong val) {
-    // This is an ugly hack for the following problem:
-    //  Assume odd number of 32b registers. Then having exactly kRegsNeeded left needs to spill!
-    if (gpr_index_ >= kRegistersNeededForLong + (kNumNativeGprArgs % kRegistersNeededForLong)) {
-      if (kRegistersNeededForLong > 1 && ((kNumNativeGprArgs - gpr_index_) & 1) == 1) {
-        // Pad.
-        gpr_index_--;
-        cur_gpr_reg_++;
-      }
-      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_gpr_reg_);
-      *tmp = val;
-      cur_gpr_reg_ += kRegistersNeededForLong;
-      gpr_index_ -= kRegistersNeededForLong;
-    } else {
-      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_);
-      *tmp = val;
-      cur_stack_arg_ += kRegistersNeededForLong;
-
-      gpr_index_ = 0;                   // can't use GPRs anymore
-    }
-  }
-
-  void PushFloatArgument(int32_t val) {
-    if (kNativeSoftFloatAbi) {
-      PushIntArgument(val);
-    } else {
-      if (fpr_index_ > 0) {
-        *cur_fpr_reg_ = val;
-        cur_fpr_reg_++;
-        if (kRegistersNeededForDouble == 1) {
-          // will pop 64 bits from the stack
-          // TODO: extend/clear bits???
-          cur_fpr_reg_++;
-        }
-        fpr_index_--;
-      } else {
-        // TODO: Check ABI for floats.
-        *cur_stack_arg_ = val;
-        cur_stack_arg_++;
-      }
-    }
-  }
-
-  void PushDoubleArgument(uint64_t val) {
-    // See PushLongArgument for explanation
-    if (fpr_index_ >= kRegistersNeededForDouble + (kNumNativeFprArgs % kRegistersNeededForDouble)) {
-      if (kRegistersNeededForDouble > 1 && ((kNumNativeFprArgs - fpr_index_) & 1) == 1) {
-        // Pad.
-        fpr_index_--;
-        cur_fpr_reg_++;
-      }
-      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_fpr_reg_);
-      *tmp = val;
-      // TODO: the whole thing doesn't make sense if we take uint32_t*...
-      cur_fpr_reg_ += 2;        // kRegistersNeededForDouble;
-      fpr_index_ -= kRegistersNeededForDouble;
-    } else {
-      if (!IsAligned<8>(cur_stack_arg_)) {
-        cur_stack_arg_++;  // Pad.
-      }
-      uint64_t* tmp = reinterpret_cast<uint64_t*>(cur_stack_arg_);
-      *tmp = val;
-      cur_stack_arg_ += kRegistersNeededForDouble;
-
-      fpr_index_ = 0;                   // can't use FPRs anymore
-    }
-  }
-
   uint32_t sirt_number_of_references_;
   StackReference<mirror::Object>* cur_sirt_entry_;
-  uint32_t gpr_index_;           // should be uint, but gives error because on some archs no regs
+  StackIndirectReferenceTable* sirt_;
+  uint32_t sirt_expected_refs_;
   uintptr_t* cur_gpr_reg_;
-  uint32_t fpr_index_;           //                      ----- # -----
   uint32_t* cur_fpr_reg_;
   uintptr_t* cur_stack_arg_;
-  uint8_t* top_of_sirt_;
+  StackReference<mirror::Object>* top_of_sirt_;
+  void* code_return_;
+  size_t alloca_used_size_;
+
+  BuildGenericJniFrameStateMachine<BuildGenericJniFrameVisitor> sm_;
 
   DISALLOW_COPY_AND_ASSIGN(BuildGenericJniFrameVisitor);
 };
 
-extern "C" const void* artQuickGenericJniTrampoline(Thread* self, mirror::ArtMethod** sp)
+/*
+ * Initializes an alloca region assumed to be directly below sp for a native call:
+ * Create a Sirt and call stack and fill a mini stack with values to be pushed to registers.
+ * The final element on the stack is a pointer to the native code.
+ *
+ * The return of this function denotes:
+ * 1) How many bytes of the alloca can be released, if the value is non-negative.
+ * 2) An error, if the value is negative.
+ */
+extern "C" ssize_t artQuickGenericJniTrampoline(Thread* self, mirror::ArtMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
   mirror::ArtMethod* called = *sp;
@@ -1098,6 +1444,7 @@
 
   // run the visitor
   MethodHelper mh(called);
+
   BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(), mh.GetShortyLength(),
                                       self);
   visitor.VisitArguments();
@@ -1110,10 +1457,10 @@
   uint32_t cookie;
   if (called->IsSynchronized()) {
     cookie = JniMethodStartSynchronized(visitor.GetFirstSirtEntry(), self);
-    // TODO: error checking.
     if (self->IsExceptionPending()) {
       self->PopSirt();
-      return nullptr;
+      // A negative value denotes an error.
+      return -1;
     }
   } else {
     cookie = JniMethodStart(self);
@@ -1127,7 +1474,12 @@
     LOG(FATAL) << "Finding native code not implemented yet.";
   }
 
-  return nativeCode;
+  uintptr_t* code_pointer = reinterpret_cast<uintptr_t*>(visitor.GetCodeReturn());
+  size_t window_size = visitor.GetAllocaUsedSize();
+  *code_pointer = reinterpret_cast<uintptr_t>(nativeCode);
+
+  // 5K reserved, window_size used.
+  return 5*1024 - window_size;
 }
 
 /*
@@ -1141,27 +1493,30 @@
   mirror::ArtMethod* called = *sp;
   uint32_t cookie = *(sp32-1);
 
-  // TODO: synchronized.
   MethodHelper mh(called);
   char return_shorty_char = mh.GetShorty()[0];
 
   if (return_shorty_char == 'L') {
     // the only special ending call
     if (called->IsSynchronized()) {
-      BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(),
-                                          mh.GetShortyLength(), self);
-      return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie,
-                                                                              visitor.GetFirstSirtEntry(),
+      ComputeGenericJniFrameSize fsc;
+      fsc.ComputeSirtOffset();
+      uint32_t offset = fsc.GetFirstSirtEntryOffset();
+      jobject tmp = reinterpret_cast<jobject>(reinterpret_cast<uint8_t*>(sp)-offset);
+
+      return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceSynchronized(result.l, cookie, tmp,
                                                                               self));
     } else {
       return reinterpret_cast<uint64_t>(JniMethodEndWithReference(result.l, cookie, self));
     }
   } else {
     if (called->IsSynchronized()) {
-      // run the visitor
-      BuildGenericJniFrameVisitor visitor(sp, called->IsStatic(), mh.GetShorty(),
-                                          mh.GetShortyLength(), self);
-      JniMethodEndSynchronized(cookie, visitor.GetFirstSirtEntry(), self);
+      ComputeGenericJniFrameSize fsc;
+      fsc.ComputeSirtOffset();
+      uint32_t offset = fsc.GetFirstSirtEntryOffset();
+      jobject tmp = reinterpret_cast<jobject>(reinterpret_cast<uint8_t*>(sp)-offset);
+
+      JniMethodEndSynchronized(cookie, tmp, self);
     } else {
       JniMethodEnd(cookie, self);
     }
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index 8a96d79..bab0604 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -15,6 +15,7 @@
  */
 
 #include <unistd.h>
+#include <fcntl.h>
 
 #include "base/logging.h"
 #include "class_linker.h"
@@ -36,6 +37,10 @@
 #include "toStringArray.h"
 #include "zip_archive.h"
 
+#ifdef HAVE_ANDROID_OS
+#include "cutils/properties.h"
+#endif
+
 namespace art {
 
 // A smart pointer that provides read-only access to a Java string's UTF chars.
@@ -193,7 +198,40 @@
   return toStringArray(env, class_names);
 }
 
-static jboolean DexFile_isDexOptNeeded(JNIEnv* env, jclass, jstring javaFilename) {
+// Copy a profile file
+static void CopyProfileFile(const char* oldfile, const char* newfile) {
+  int fd = open(oldfile, O_RDONLY);
+  if (fd < 0) {
+    // If we can't open the file, show the uid:gid of this process to allow
+    // diagnosis of the problem.
+    LOG(ERROR) << "Failed to open profile file " << oldfile<< ".  My uid:gid is "
+      << getuid() << ":" << getgid();
+    return;
+  }
+
+  // Create the copy with rw------- (only accessible by system)
+  int fd2 = open(newfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
+  if (fd2 < 0) {
+    // If we can't create the file, show the uid:gid of this process to allow
+    // diagnosis of the problem.
+    LOG(ERROR) << "Failed to create/write prev profile file " << newfile << ".  My uid:gid is "
+      << getuid() << ":" << getgid();
+    return;
+  }
+  char buf[4096];
+  while (true) {
+    int n = read(fd, buf, sizeof(buf));
+    if (n <= 0) {
+      break;
+    }
+    write(fd2, buf, n);
+  }
+  close(fd);
+  close(fd2);
+}
+
+static jboolean DexFile_isDexOptNeededInternal(JNIEnv* env, jclass, jstring javaFilename,
+    jstring javaPkgname, jboolean defer) {
   const bool kVerboseLogging = false;  // Spammy logging.
   const bool kDebugLogging = true;  // Logging useful for debugging.
 
@@ -221,6 +259,97 @@
     }
   }
 
+  // Check the profile file.  We need to rerun dex2oat if the profile has changed significantly
+  // since the last time, or it's new.
+  // If the 'defer' argument is true then this will be retried later.  In this case we
+  // need to make sure that the profile file copy is not made so that we will get the
+  // same result second time.
+  if (javaPkgname != NULL) {
+    ScopedUtfChars pkgname(env, javaPkgname);
+    std::string profile_file = GetDalvikCacheOrDie(GetAndroidData()) + std::string("/profiles/") +
+    pkgname.c_str();
+
+    std::string profile_cache_dir = GetDalvikCacheOrDie(GetAndroidData()) + "/profile-cache";
+
+    // Make the profile cache if it doesn't exist.
+    mkdir(profile_cache_dir.c_str(), 0700);
+
+    // The previous profile file (a copy of the profile the last time this was run) is
+    // in the dalvik-cache directory because this is owned by system.  The profiles
+    // directory is owned by install so system cannot write files in there.
+    std::string prev_profile_file = profile_cache_dir + std::string("/") + pkgname.c_str();
+
+    struct stat profstat, prevstat;
+    int e1 = stat(profile_file.c_str(), &profstat);
+    int e2 = stat(prev_profile_file.c_str(), &prevstat);
+
+    if (e1 < 0) {
+      // No profile file, need to run dex2oat
+      if (kDebugLogging) {
+        LOG(INFO) << "DexFile_isDexOptNeeded profile file " << profile_file << " doesn't exist";
+      }
+      return JNI_TRUE;
+    }
+    if (e2 == 0) {
+      // There is a previous profile file.  Check if the profile has changed significantly.
+      // Let's use the file size as a proxy for significance.  If the new profile is 10%
+      // different in size than the old profile then we run dex2oat.
+      double newsize = profstat.st_size;
+      double oldsize = prevstat.st_size;
+      bool need_profile = false;
+
+      double ratio = 0;     // Stays 0 unless both sizes are non-zero (handled below).
+      if (oldsize > 0 && newsize > 0) {
+        ratio = newsize / oldsize;
+      } else if (oldsize == 0 && newsize > 0) {
+        need_profile = true;
+      } else if (oldsize > 0 && newsize == 0) {
+        // Unlikely to happen, but cover all the bases.
+        need_profile = true;
+      }
+
+      double significant_difference = 10.0;
+#ifdef HAVE_ANDROID_OS
+      // Read the significance threshold (percent) from the dalvik.vm.profiler.dex2oat.threshold property.
+      char buf[PROP_VALUE_MAX];
+      property_get("dalvik.vm.profiler.dex2oat.threshold", buf, "10.0");
+      significant_difference = strtod(buf, nullptr);
+
+      // Something reasonable?
+      if (significant_difference < 1.0 || significant_difference > 90.0) {
+        significant_difference = 10.0;
+      }
+#endif  // HAVE_ANDROID_OS
+      double diff_hwm = 1.0 + significant_difference/10.0;
+      double diff_lwm = 1.0 - significant_difference/10.0;
+
+      if (ratio > diff_hwm || ratio < diff_lwm) {
+        need_profile = true;
+      }
+
+      if (need_profile) {
+        if (kDebugLogging) {
+          LOG(INFO) << "DexFile_isDexOptNeeded size of new profile file " << profile_file <<
+          " is significantly different from old profile file " << prev_profile_file << " (new: " <<
+          newsize << ", old: " << oldsize << ", ratio: " << ratio << ")";
+        }
+        if (!defer) {
+          CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
+        }
+        return JNI_TRUE;
+      }
+    } else {
+      // Previous profile does not exist.  Make a copy of the current one.
+      if (kDebugLogging) {
+        LOG(INFO) << "DexFile_isDexOptNeeded previous profile doesn't exist: " << prev_profile_file;
+      }
+      if (!defer) {
+        CopyProfileFile(profile_file.c_str(), prev_profile_file.c_str());
+      }
+      return JNI_TRUE;
+    }
+  }
+
   // Check if we have an odex file next to the dex file.
   std::string odex_filename(OatFile::DexFilenameToOdexFilename(filename.c_str()));
   std::string error_msg;
@@ -329,11 +458,18 @@
   return JNI_FALSE;
 }
 
+// public API, NULL pkgname
+static jboolean DexFile_isDexOptNeeded(JNIEnv* env, jclass c, jstring javaFilename) {
+  return DexFile_isDexOptNeededInternal(env, c, javaFilename, NULL, false);
+}
+
+
 static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(DexFile, closeDexFile, "(J)V"),
   NATIVE_METHOD(DexFile, defineClassNative, "(Ljava/lang/String;Ljava/lang/ClassLoader;J)Ljava/lang/Class;"),
   NATIVE_METHOD(DexFile, getClassNameList, "(J)[Ljava/lang/String;"),
   NATIVE_METHOD(DexFile, isDexOptNeeded, "(Ljava/lang/String;)Z"),
+  NATIVE_METHOD(DexFile, isDexOptNeededInternal, "(Ljava/lang/String;Ljava/lang/String;Z)Z"),
   NATIVE_METHOD(DexFile, openDexFileNative, "(Ljava/lang/String;Ljava/lang/String;I)J"),
 };
 
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index 4aa1d10..0e2d921 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -203,6 +203,7 @@
 
 static void VMRuntime_updateProcessState(JNIEnv* env, jobject, jint process_state) {
   Runtime::Current()->GetHeap()->UpdateProcessState(static_cast<gc::ProcessState>(process_state));
+  Runtime::Current()->UpdateProfilerState(process_state);
 }
 
 static void VMRuntime_trimHeap(JNIEnv*, jobject) {
@@ -511,13 +512,16 @@
  * process name.  We use this information to start up the sampling profiler for
  * for ART.
  */
-static void VMRuntime_registerAppInfo(JNIEnv* env, jclass, jstring appDir, jstring procName) {
+static void VMRuntime_registerAppInfo(JNIEnv* env, jclass, jstring pkgName, jstring appDir, jstring procName) {
+  const char *pkgNameChars = env->GetStringUTFChars(pkgName, NULL);
   const char *appDirChars = env->GetStringUTFChars(appDir, NULL);
   const char *procNameChars = env->GetStringUTFChars(procName, NULL);
-  std::string profileFile = std::string(appDirChars) + "/art-profile-" + std::string(procNameChars);
-  Runtime::Current()->StartProfiler(profileFile.c_str());
+
+  std::string profileFile = StringPrintf("/data/dalvik-cache/profiles/%s", pkgNameChars);
+  Runtime::Current()->StartProfiler(profileFile.c_str(), procNameChars);
   env->ReleaseStringUTFChars(appDir, appDirChars);
   env->ReleaseStringUTFChars(procName, procNameChars);
+  env->ReleaseStringUTFChars(pkgName, pkgNameChars);
 }
 
 static JNINativeMethod gMethods[] = {
@@ -542,7 +546,7 @@
   NATIVE_METHOD(VMRuntime, vmVersion, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, vmLibrary, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, preloadDexCaches, "()V"),
-  NATIVE_METHOD(VMRuntime, registerAppInfo, "(Ljava/lang/String;Ljava/lang/String;)V"),
+  NATIVE_METHOD(VMRuntime, registerAppInfo, "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)V"),
 };
 
 void register_dalvik_system_VMRuntime(JNIEnv* env) {
diff --git a/runtime/oat.cc b/runtime/oat.cc
index c8eb3e2..d4eea85 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '1', '6', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '1', '7', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 20e08b8..da98938 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -17,6 +17,7 @@
 #include "profiler.h"
 
 #include <sys/uio.h>
+#include <sys/file.h>
 
 #include "base/stl_util.h"
 #include "base/unix_file/fd_file.h"
@@ -170,6 +171,7 @@
 
     SampleCheckpoint check_point(profiler);
 
+    size_t valid_samples = 0;
     while (now_us < end_us) {
       if (ShuttingDown(self)) {
         break;
@@ -180,7 +182,15 @@
       ThreadList* thread_list = runtime->GetThreadList();
 
       profiler->profiler_barrier_->Init(self, 0);
-      size_t barrier_count = thread_list->RunCheckpoint(&check_point);
+      size_t barrier_count = thread_list->RunCheckpointOnRunnableThreads(&check_point);
+
+      // All threads are suspended, nothing to do.
+      if (barrier_count == 0) {
+        now_us = MicroTime();
+        continue;
+      }
+
+      valid_samples += barrier_count;
 
       ThreadState old_state = self->SetState(kWaitingForCheckPointsToRun);
 
@@ -206,7 +216,7 @@
       now_us = MicroTime();
     }
 
-    if (!ShuttingDown(self)) {
+    if (valid_samples > 0 && !ShuttingDown(self)) {
       // After the profile has been taken, write it out.
       ScopedObjectAccess soa(self);   // Acquire the mutator lock.
       uint32_t size = profiler->WriteProfile();
@@ -221,39 +231,65 @@
 
 // Write out the profile file if we are generating a profile.
 uint32_t BackgroundMethodSamplingProfiler::WriteProfile() {
-  UniquePtr<File> profile_file;
-  Runtime* runtime = Runtime::Current();
-  std::string classpath = runtime->GetClassPathString();
-  size_t colon = classpath.find(':');
-  if (colon != std::string::npos) {
-    // More than one file in the classpath.  Possible?
-    classpath = classpath.substr(0, colon);
-  }
-
-  std::replace(classpath.begin(), classpath.end(), '/', '@');
   std::string full_name = profile_file_name_;
-  if (classpath != "") {
-    full_name = StringPrintf("%s-%s", profile_file_name_.c_str(), classpath.c_str());
-  }
   LOG(DEBUG) << "Saving profile to " << full_name;
 
-  profile_file.reset(OS::CreateEmptyFile(full_name.c_str()));
-  if (profile_file.get() == nullptr) {
-    // Failed to open the profile file, ignore.
-    LOG(INFO) << "Failed to op file";
+  int fd = open(full_name.c_str(), O_RDWR);
+  if (fd < 0) {
+    // Open failed.
+    LOG(ERROR) << "Failed to open profile file " << full_name;
     return 0;
   }
+
+  // Lock the file for exclusive access.  This will block if another process is using
+  // the file.
+  int err = flock(fd, LOCK_EX);
+  if (err < 0) {
+    LOG(ERROR) << "Failed to lock profile file " << full_name;
+    return 0;
+  }
+
+  // Read the previous profile.
+  profile_table_.ReadPrevious(fd);
+
+  // Move back to the start of the file.
+  lseek(fd, 0, SEEK_SET);
+
+  // Format the profile output and write to the file.
   std::ostringstream os;
   uint32_t num_methods = DumpProfile(os);
   std::string data(os.str());
-  profile_file->WriteFully(data.c_str(), data.length());
-  profile_file->Close();
+  const char *p = data.c_str();
+  size_t length = data.length();
+  size_t full_length = length;
+  do {
+    int n = ::write(fd, p, length);
+    p += n;
+    length -= n;
+  } while (length > 0);
+
+  // Truncate the file to the new length.
+  ftruncate(fd, full_length);
+
+  // Now unlock the file, allowing another process in.
+  err = flock(fd, LOCK_UN);
+  if (err < 0) {
+    LOG(ERROR) << "Failed to unlock profile file " << full_name;
+  }
+
+  // Done, close the file.
+  ::close(fd);
+
+  // Clean the profile for the next time.
+  CleanProfile();
+
   return num_methods;
 }
 
 // Start a profile thread with the user-supplied arguments.
 void BackgroundMethodSamplingProfiler::Start(int period, int duration,
-                  std::string profile_file_name, int interval_us,
+                  const std::string& profile_file_name, const std::string& procName,
+                  int interval_us,
                   double backoff_coefficient, bool startImmediately) {
   Thread* self = Thread::Current();
   {
@@ -266,12 +302,14 @@
 
   // Only on target...
 #ifdef HAVE_ANDROID_OS
-  // Switch off profiler if the dalvik.vm.profiler property has value 0.
-  char buf[PROP_VALUE_MAX];
-  property_get("dalvik.vm.profiler", buf, "0");
-  if (strcmp(buf, "0") == 0) {
-    LOG(INFO) << "Profiler disabled.  To enable setprop dalvik.vm.profiler 1";
-    return;
+  if (!startImmediately) {
+    // Switch off profiler if the dalvik.vm.profiler property has value 0.
+    char buf[PROP_VALUE_MAX];
+    property_get("dalvik.vm.profiler", buf, "0");
+    if (strcmp(buf, "0") == 0) {
+      LOG(INFO) << "Profiler disabled.  To enable setprop dalvik.vm.profiler 1";
+      return;
+    }
   }
 #endif
 
@@ -281,6 +319,7 @@
   {
     MutexLock mu(self, *Locks::profiler_lock_);
     profiler_ = new BackgroundMethodSamplingProfiler(period, duration, profile_file_name,
+                                      procName,
                                       backoff_coefficient,
                                       interval_us, startImmediately);
 
@@ -323,9 +362,10 @@
 }
 
 BackgroundMethodSamplingProfiler::BackgroundMethodSamplingProfiler(int period, int duration,
-                   std::string profile_file_name,
+                   const std::string& profile_file_name,
+                   const std::string& process_name,
                    double backoff_coefficient, int interval_us, bool startImmediately)
-    : profile_file_name_(profile_file_name),
+    : profile_file_name_(profile_file_name), process_name_(process_name),
       period_s_(period), start_immediately_(startImmediately),
       interval_us_(interval_us), backoff_factor_(1.0),
       backoff_coefficient_(backoff_coefficient), duration_s_(duration),
@@ -423,9 +463,13 @@
   lock_.Unlock(Thread::Current());
 }
 
-// Write the profile table to the output stream.
+// Write the profile table to the output stream.  Also merge with the previous profile.
 uint32_t ProfileSampleResults::Write(std::ostream &os) {
   ScopedObjectAccess soa(Thread::Current());
+  num_samples_ += previous_num_samples_;
+  num_null_methods_ += previous_num_null_methods_;
+  num_boot_methods_ += previous_num_boot_methods_;
+
   LOG(DEBUG) << "Profile: " << num_samples_ << "/" << num_null_methods_ << "/" << num_boot_methods_;
   os << num_samples_ << "/" << num_null_methods_ << "/" << num_boot_methods_ << "\n";
   uint32_t num_methods = 0;
@@ -433,14 +477,35 @@
     Map *map = table[i];
     if (map != nullptr) {
       for (const auto &meth_iter : *map) {
-         mirror::ArtMethod *method = meth_iter.first;
-         std::string method_name = PrettyMethod(method);
-         uint32_t method_size = method->GetCodeSize();
-         os << StringPrintf("%s/%u/%u\n",  method_name.c_str(), meth_iter.second, method_size);
-         ++num_methods;
-       }
+        mirror::ArtMethod *method = meth_iter.first;
+        std::string method_name = PrettyMethod(method);
+
+        MethodHelper mh(method);
+        const DexFile::CodeItem* codeitem = mh.GetCodeItem();
+        uint32_t method_size = 0;
+        if (codeitem != nullptr) {
+          method_size = codeitem->insns_size_in_code_units_;
+        }
+        uint32_t count = meth_iter.second;
+
+        // Merge this profile entry with one from a previous run (if present).  Also
+        // remove the previous entry.
+        PreviousProfile::iterator pi = previous_.find(method_name);
+        if (pi != previous_.end()) {
+          count += pi->second.count_;
+          previous_.erase(pi);
+        }
+        os << StringPrintf("%s/%u/%u\n",  method_name.c_str(), count, method_size);
+        ++num_methods;
+      }
     }
   }
+
+  // Now we write out the remaining previous methods.
+  for (PreviousProfile::iterator pi = previous_.begin(); pi != previous_.end(); ++pi) {
+    os << StringPrintf("%s/%u/%u\n",  pi->first.c_str(), pi->second.count_, pi->second.method_size_);
+    ++num_methods;
+  }
   return num_methods;
 }
 
@@ -452,11 +517,67 @@
      delete table[i];
      table[i] = nullptr;
   }
+  previous_.clear();
 }
 
 uint32_t ProfileSampleResults::Hash(mirror::ArtMethod* method) {
   return (PointerToLowMemUInt32(method) >> 3) % kHashSize;
 }
 
+// Read a single line into the given string.  Returns true if everything OK, false
+// on EOF or error.
+static bool ReadProfileLine(int fd, std::string& line) {
+  char buf[4];
+  line.clear();
+  while (true) {
+    int n = read(fd, buf, 1);     // TODO: could speed this up but is it worth it?
+    if (n != 1) {
+      return false;
+    }
+    if (buf[0] == '\n') {
+      break;
+    }
+    line += buf[0];
+  }
+  return true;
+}
+
+void ProfileSampleResults::ReadPrevious(int fd) {
+  // Reset counters.
+  previous_num_samples_ = previous_num_null_methods_ = previous_num_boot_methods_ = 0;
+
+  std::string line;
+
+  // The first line contains summary information.
+  if (!ReadProfileLine(fd, line)) {
+    return;
+  }
+  std::vector<std::string> summary_info;
+  Split(line, '/', summary_info);
+  if (summary_info.size() != 3) {
+    // Bad summary info.  It should be count/nullcount/bootcount
+    return;
+  }
+  previous_num_samples_ = atoi(summary_info[0].c_str());
+  previous_num_null_methods_ = atoi(summary_info[1].c_str());
+  previous_num_boot_methods_ = atoi(summary_info[2].c_str());
+
+  // Now read each line until the end of file.  Each line consists of 3 fields separated by /
+  while (true) {
+    if (!ReadProfileLine(fd, line)) {
+      break;
+    }
+    std::vector<std::string> info;
+    Split(line, '/', info);
+    if (info.size() != 3) {
+      // Malformed.
+      break;
+    }
+    std::string methodname = info[0];
+    uint32_t count = atoi(info[1].c_str());
+    uint32_t size = atoi(info[2].c_str());
+    previous_[methodname] = PreviousValue(count, size);
+  }
+}
 }  // namespace art
 
diff --git a/runtime/profiler.h b/runtime/profiler.h
index 6ea6c84..b03b170 100644
--- a/runtime/profiler.h
+++ b/runtime/profiler.h
@@ -54,10 +54,12 @@
 
   void Put(mirror::ArtMethod* method);
   uint32_t Write(std::ostream &os);
+  void ReadPrevious(int fd);
   void Clear();
   uint32_t GetNumSamples() { return num_samples_; }
   void NullMethod() { ++num_null_methods_; }
   void BootMethod() { ++num_boot_methods_; }
+
  private:
   uint32_t Hash(mirror::ArtMethod* method);
   static constexpr int kHashSize = 17;
@@ -68,6 +70,19 @@
 
   typedef std::map<mirror::ArtMethod*, uint32_t> Map;   // Map of method vs its count.
   Map *table[kHashSize];
+
+  struct PreviousValue {
+    PreviousValue() : count_(0), method_size_(0) {}
+    PreviousValue(uint32_t count, uint32_t method_size) : count_(count), method_size_(method_size) {}
+    uint32_t count_;
+    uint32_t method_size_;
+  };
+
+  typedef std::map<std::string, PreviousValue> PreviousProfile;
+  PreviousProfile previous_;
+  uint32_t previous_num_samples_;
+  uint32_t previous_num_null_methods_;     // Number of samples where we don't know the method.
+  uint32_t previous_num_boot_methods_;     // Number of samples in the boot path.
 };
 
 //
@@ -87,7 +102,8 @@
 
 class BackgroundMethodSamplingProfiler {
  public:
-  static void Start(int period, int duration, std::string profile_filename, int interval_us,
+  static void Start(int period, int duration, const std::string& profile_filename,
+                    const std::string& procName, int interval_us,
                     double backoff_coefficient, bool startImmediately)
   LOCKS_EXCLUDED(Locks::mutator_lock_,
                  Locks::thread_list_lock_,
@@ -104,8 +120,10 @@
   }
 
  private:
-  explicit BackgroundMethodSamplingProfiler(int period, int duration, std::string profile_filename,
-                 double backoff_coefficient, int interval_us, bool startImmediately);
+  explicit BackgroundMethodSamplingProfiler(int period, int duration,
+                                            const std::string& profile_filename,
+                                            const std::string& process_name,
+                                            double backoff_coefficient, int interval_us, bool startImmediately);
 
   // The sampling interval in microseconds is passed as an argument.
   static void* RunProfilerThread(void* arg) LOCKS_EXCLUDED(Locks::profiler_lock_);
@@ -130,6 +148,9 @@
   // File to write profile data out to.  Cannot be empty if we are profiling.
   std::string profile_file_name_;
 
+  // Process name.
+  std::string process_name_;
+
   // Number of seconds between profile runs.
   uint32_t period_s_;
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index fdbf245..d1c8370 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -27,6 +27,7 @@
 #include <cstdlib>
 #include <limits>
 #include <vector>
+#include <fcntl.h>
 
 #include "arch/arm/registers_arm.h"
 #include "arch/mips/registers_mips.h"
@@ -69,6 +70,10 @@
 
 #include "JniConstants.h"  // Last to avoid LOG redefinition in ics-mr1-plus-art.
 
+#ifdef HAVE_ANDROID_OS
+#include "cutils/properties.h"
+#endif
+
 namespace art {
 
 Runtime* Runtime::instance_ = NULL;
@@ -370,7 +375,12 @@
 
   if (profile_) {
     // User has asked for a profile using -Xprofile
-    StartProfiler(profile_output_filename_.c_str(), true);
+    // Create the profile file if it doesn't exist.
+    int fd = open(profile_output_filename_.c_str(), O_RDWR|O_CREAT|O_EXCL, 0660);
+    if (fd >= 0) {
+      close(fd);
+    }
+    StartProfiler(profile_output_filename_.c_str(), "", true);
   }
 
   return true;
@@ -1055,10 +1065,10 @@
   method_verifiers_.erase(it);
 }
 
-void Runtime::StartProfiler(const char *appDir, bool startImmediately) {
+void Runtime::StartProfiler(const char* appDir, const char* procName, bool startImmediately) {
   BackgroundMethodSamplingProfiler::Start(profile_period_s_, profile_duration_s_, appDir,
-                                          profile_interval_us_, profile_backoff_coefficient_,
-                                          startImmediately);
+      procName, profile_interval_us_,
+      profile_backoff_coefficient_, startImmediately);
 }
 
 // Transaction support.
@@ -1136,4 +1146,7 @@
   fault_message_ = message;
 }
 
+void Runtime::UpdateProfilerState(int state) {
+  LOG(DEBUG) << "Profiler state updated to " << state;
+}
 }  // namespace art
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 65d296a..109f031 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -373,7 +373,8 @@
   const std::vector<const DexFile*>& GetCompileTimeClassPath(jobject class_loader);
   void SetCompileTimeClassPath(jobject class_loader, std::vector<const DexFile*>& class_path);
 
-  void StartProfiler(const char *appDir, bool startImmediately = false);
+  void StartProfiler(const char* appDir, const char* procName, bool startImmediately = false);
+  void UpdateProfilerState(int state);
 
   // Transaction support.
   bool IsActiveTransaction() const;
@@ -419,6 +420,12 @@
   void StartDaemonThreads();
   void StartSignalCatcher();
 
+  // NOTE: these must match the gc::ProcessState values as they come directly
+  // from the framework.
+  static constexpr int kProfileForeground = 0;
+  static constexpr int kProfileBackground = 1;
+
+
   // A pointer to the active runtime or NULL.
   static Runtime* instance_;
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index bddebbd..ac5750b 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -269,6 +269,36 @@
   return count + suspended_count_modified_threads.size() + 1;
 }
 
+// Request that a checkpoint function be run on all active (non-suspended)
+// threads.  Returns the number of successful requests.
+size_t ThreadList::RunCheckpointOnRunnableThreads(Closure* checkpoint_function) {
+  Thread* self = Thread::Current();
+  if (kIsDebugBuild) {
+    Locks::mutator_lock_->AssertNotExclusiveHeld(self);
+    Locks::thread_list_lock_->AssertNotHeld(self);
+    Locks::thread_suspend_count_lock_->AssertNotHeld(self);
+    CHECK_NE(self->GetState(), kRunnable);
+  }
+
+  size_t count = 0;
+  {
+    // Call a checkpoint function for each non-suspended thread.
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+    for (const auto& thread : list_) {
+      if (thread != self) {
+        if (thread->RequestCheckpoint(checkpoint_function)) {
+          // This thread will run its checkpoint some time in the near future.
+          count++;
+        }
+      }
+    }
+  }
+
+  // Return the number of threads that will run the checkpoint function.
+  return count;
+}
+
 void ThreadList::SuspendAll() {
   Thread* self = Thread::Current();
   DCHECK(self != nullptr);
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 1a76705..58bd92a 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -90,6 +90,10 @@
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_);
 
+  size_t RunCheckpointOnRunnableThreads(Closure* checkpoint_function)
+      LOCKS_EXCLUDED(Locks::thread_list_lock_,
+                     Locks::thread_suspend_count_lock_);
+
   // Suspends all threads
   void SuspendAllForDebugger()
       LOCKS_EXCLUDED(Locks::mutator_lock_,
diff --git a/runtime/utils.cc b/runtime/utils.cc
index d2d23e8..2b57778 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -468,7 +468,8 @@
       break;
     }
   }
-  return StringPrintf("%s%" PRId64 "%s", negative_str, byte_count / kBytesPerUnit[i], kUnitStrings[i]);
+  return StringPrintf("%s%" PRId64 "%s",
+                      negative_str, byte_count / kBytesPerUnit[i], kUnitStrings[i]);
 }
 
 std::string PrettyDuration(uint64_t nano_duration) {
@@ -1080,7 +1081,9 @@
       os << it->func_name;
     } else {
       if (current_method != nullptr && current_method->IsWithinQuickCode(it->pc)) {
-        os << JniLongName(current_method) << "+" << (it->pc - current_method->GetQuickOatCodeOffset());
+        const void* start_of_code = current_method->GetEntryPointFromQuickCompiledCode();
+        os << JniLongName(current_method) << "+"
+           << (it->pc - reinterpret_cast<uintptr_t>(start_of_code));
       } else {
         os << "???";
       }
@@ -1119,7 +1122,8 @@
   // which looking at the source appears to be the kernel's way of saying "that's all, folks!".
   kernel_stack_frames.pop_back();
   for (size_t i = 0; i < kernel_stack_frames.size(); ++i) {
-    // Turn "[<ffffffff8109156d>] futex_wait_queue_me+0xcd/0x110" into "futex_wait_queue_me+0xcd/0x110".
+    // Turn "[<ffffffff8109156d>] futex_wait_queue_me+0xcd/0x110"
+    // into "futex_wait_queue_me+0xcd/0x110".
     const char* text = kernel_stack_frames[i].c_str();
     const char* close_bracket = strchr(text, ']');
     if (close_bracket != NULL) {