Merge "simpleperf: support tracing offcpu time."
diff --git a/simpleperf/cmd_dumprecord.cpp b/simpleperf/cmd_dumprecord.cpp
index 2ce5295..dcd7bf4 100644
--- a/simpleperf/cmd_dumprecord.cpp
+++ b/simpleperf/cmd_dumprecord.cpp
@@ -151,6 +151,7 @@
       {FEAT_PMU_MAPPINGS, "pmu_mappings"},
       {FEAT_GROUP_DESC, "group_desc"},
       {FEAT_FILE, "file"},
+      {FEAT_META_INFO, "meta_info"},
   };
   auto it = feature_name_map.find(feature);
   if (it != feature_name_map.end()) {
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index 8bd7962..390535c 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <inttypes.h>
 #include <libgen.h>
 #include <signal.h>
 #include <sys/prctl.h>
@@ -150,6 +151,7 @@
 "                 This option is used to provide files with symbol table and\n"
 "                 debug information, which are used for unwinding and dumping symbols.\n"
 "-t tid1,tid2,... Record events on existing threads. Mutually exclusive with -a.\n"
+"--trace-offcpu   Generate samples when threads are scheduled off cpu.\n"
 #if 0
 // Below options are only used internally and shouldn't be visible to the public.
 "--in-app         We are already running in the app's context.\n"
@@ -179,7 +181,8 @@
         sample_record_count_(0),
         lost_record_count_(0),
         start_profiling_fd_(-1),
-        in_app_context_(false) {
+        in_app_context_(false),
+        trace_offcpu_(false) {
     // Stop profiling if parent exits.
     prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0);
     app_package_name_ = GetDefaultAppPackageName();
@@ -190,6 +193,7 @@
  private:
   bool ParseOptions(const std::vector<std::string>& args,
                     std::vector<std::string>* non_option_args);
+  bool TraceOffCpu();
   bool SetEventSelectionFlags();
   bool CreateAndInitRecordFile();
   std::unique_ptr<RecordFileWriter> CreateRecordFile(
@@ -205,6 +209,7 @@
   bool DumpAdditionalFeatures(const std::vector<std::string>& args);
   bool DumpBuildIdFeature();
   bool DumpFileFeature();
+  bool DumpMetaInfoFeature();
   void CollectHitFileInfo(const SampleRecord& r);
 
   bool use_sample_freq_;
@@ -239,6 +244,7 @@
   int start_profiling_fd_;
   std::string app_package_name_;
   bool in_app_context_;
+  bool trace_offcpu_;
 };
 
 bool RecordCommand::Run(const std::vector<std::string>& args) {
@@ -257,7 +263,7 @@
     // root.
     if (!IsRoot()) {
       return RunInAppContext(app_package_name_, "record", args, workload_args.size(),
-                             record_filename_, !event_selection_set_.GetTracepointEvents().empty());
+                             record_filename_, true);
     }
   }
   if (event_selection_set_.empty()) {
@@ -265,6 +271,9 @@
       return false;
     }
   }
+  if (trace_offcpu_ && !TraceOffCpu()) {
+    return false;
+  }
   if (!SetEventSelectionFlags()) {
     return false;
   }
@@ -586,6 +595,8 @@
         return false;
       }
       event_selection_set_.AddMonitoredThreads(tids);
+    } else if (args[i] == "--trace-offcpu") {
+      trace_offcpu_ = true;
     } else if (args[i] == "--tracepoint-events") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
@@ -654,6 +665,24 @@
   return true;
 }
 
+bool RecordCommand::TraceOffCpu() {
+  if (FindEventTypeByName("sched:sched_switch") == nullptr) {
+    LOG(ERROR) << "Can't trace off cpu because sched:sched_switch event is not available";
+    return false;
+  }
+  for (auto& event_type : event_selection_set_.GetTracepointEvents()) {
+    if (event_type->name == "sched:sched_switch") {
+      LOG(ERROR) << "Trace offcpu can't be used together with sched:sched_switch event";
+      return false;
+    }
+  }
+  if (!IsDumpingRegsForTracepointEventsSupported()) {
+    LOG(ERROR) << "Dumping regs for tracepoint events is not supported by the kernel";
+    return false;
+  }
+  return event_selection_set_.AddEventType("sched:sched_switch");
+}
+
 bool RecordCommand::SetEventSelectionFlags() {
   if (use_sample_freq_) {
     event_selection_set_.SetSampleFreq(sample_freq_);
@@ -1033,10 +1062,7 @@
       !record_file_writer_->WriteBranchStackFeature()) {
     return false;
   }
-
-  std::unordered_map<std::string, std::string> info_map;
-  info_map["simpleperf_version"] = GetSimpleperfVersion();
-  if (!record_file_writer_->WriteMetaInfoFeature(info_map)) {
+  if (!DumpMetaInfoFeature()) {
     return false;
   }
 
@@ -1130,6 +1156,17 @@
   return true;
 }
 
+bool RecordCommand::DumpMetaInfoFeature() {
+  std::unordered_map<std::string, std::string> info_map;
+  info_map["simpleperf_version"] = GetSimpleperfVersion();
+  info_map["system_wide_collection"] = system_wide_collection_ ? "true" : "false";
+  info_map["trace_offcpu"] = trace_offcpu_ ? "true" : "false";
+  // By storing event types information in perf.data, the readers of perf.data have the same
+  // understanding of event types, even if they are on another machine.
+  info_map["event_type_info"] = ScopedEventTypes::BuildString(event_selection_set_.GetEvents());
+  return record_file_writer_->WriteMetaInfoFeature(info_map);
+}
+
 void RecordCommand::CollectHitFileInfo(const SampleRecord& r) {
   const ThreadEntry* thread =
       thread_tree_.FindThreadOrNew(r.tid_data.pid, r.tid_data.tid);
diff --git a/simpleperf/cmd_record_test.cpp b/simpleperf/cmd_record_test.cpp
index 8b70fcd..d888a58 100644
--- a/simpleperf/cmd_record_test.cpp
+++ b/simpleperf/cmd_record_test.cpp
@@ -431,32 +431,21 @@
 }
 
 TEST(record_cmd, dump_regs_for_tracepoint_events) {
+  TEST_REQUIRE_HOST_ROOT();
   // Check if the kernel can dump registers for tracepoint events.
   // If not, probably a kernel patch below is missing:
   // "5b09a094f2 arm64: perf: Fix callchain parse error with kernel tracepoint events"
-  std::vector<std::unique_ptr<Workload>> workloads;
-  CreateProcesses(1, &workloads);
-  std::string pid = std::to_string(workloads[0]->GetPid());
-  TemporaryFile tmpfile;
-  ASSERT_TRUE(RecordCmd()->Run({"-o", tmpfile.path, "-p", pid, "-e", "sched:sched_switch",
-                                "-g", "--no-unwind", "--duration", "1"}));
+  ASSERT_TRUE(IsDumpingRegsForTracepointEventsSupported());
+}
 
-  // If the kernel patch is missing, all regs dumped in sample records are zero.
+TEST(record_cmd, trace_offcpu_option) {
+  // On linux host, we need root privilege to read tracepoint events.
+  TEST_REQUIRE_HOST_ROOT();
+  TemporaryFile tmpfile;
+  ASSERT_TRUE(RunRecordCmd({"--trace-offcpu"}, tmpfile.path));
   std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance(tmpfile.path);
-  CHECK(reader != nullptr);
-  std::unique_ptr<Record> r;
-  bool regs_all_zero = true;
-  while (reader->ReadRecord(r) && r && regs_all_zero) {
-    if (r->type() != PERF_RECORD_SAMPLE) {
-      continue;
-    }
-    SampleRecord* s = static_cast<SampleRecord*>(r.get());
-    for (size_t i = 0; i < s->regs_user_data.reg_nr; ++i) {
-      if (s->regs_user_data.regs[i] != 0u) {
-        regs_all_zero = false;
-        break;
-      }
-    }
-  }
-  ASSERT_FALSE(regs_all_zero);
+  ASSERT_TRUE(reader != nullptr);
+  std::unordered_map<std::string, std::string> info_map;
+  ASSERT_TRUE(reader->ReadMetaInfoFeature(&info_map));
+  ASSERT_EQ(info_map["trace_offcpu"], "true");
 }
diff --git a/simpleperf/cmd_report.cpp b/simpleperf/cmd_report.cpp
index 29cf743..5965e63 100644
--- a/simpleperf/cmd_report.cpp
+++ b/simpleperf/cmd_report.cpp
@@ -105,10 +105,9 @@
 BUILD_COMPARE_VALUE_FUNCTION(CompareVaddrInFile, vaddr_in_file);
 BUILD_DISPLAY_HEX64_FUNCTION(DisplayVaddrInFile, vaddr_in_file);
 
-class ReportCmdSampleTreeBuilder
-    : public SampleTreeBuilder<SampleEntry, uint64_t> {
+class ReportCmdSampleTreeBuilder : public SampleTreeBuilder<SampleEntry, uint64_t> {
  public:
-  ReportCmdSampleTreeBuilder(SampleComparator<SampleEntry> sample_comparator,
+  ReportCmdSampleTreeBuilder(const SampleComparator<SampleEntry>& sample_comparator,
                              ThreadTree* thread_tree)
       : SampleTreeBuilder(sample_comparator),
         thread_tree_(thread_tree),
@@ -138,7 +137,17 @@
     return sample_tree;
   }
 
+  virtual void ReportCmdProcessSampleRecord(std::shared_ptr<SampleRecord>& r) {
+    return ProcessSampleRecord(*r);
+  }
+
+  virtual void ReportCmdProcessSampleRecord(const SampleRecord& r) {
+    return ProcessSampleRecord(r);
+  }
+
  protected:
+  virtual uint64_t GetPeriod(const SampleRecord& r) = 0;
+
   SampleEntry* CreateSample(const SampleRecord& r, bool in_kernel,
                             uint64_t* acc_info) override {
     const ThreadEntry* thread =
@@ -148,10 +157,10 @@
     uint64_t vaddr_in_file;
     const Symbol* symbol =
         thread_tree_->FindSymbol(map, r.ip_data.ip, &vaddr_in_file);
-    *acc_info = r.period_data.period;
+    uint64_t period = GetPeriod(r);
+    *acc_info = period;
     return InsertSample(std::unique_ptr<SampleEntry>(
-        new SampleEntry(r.time_data.time, r.period_data.period, 0, 1, thread,
-                        map, symbol, vaddr_in_file)));
+        new SampleEntry(r.time_data.time, period, 0, 1, thread, map, symbol, vaddr_in_file)));
   }
 
   SampleEntry* CreateBranchSample(const SampleRecord& r,
@@ -253,6 +262,53 @@
   uint64_t total_error_callchains_;
 };
 
+// Build sample tree based on event count in each sample.
+class EventCountSampleTreeBuilder : public ReportCmdSampleTreeBuilder {
+ public:
+  EventCountSampleTreeBuilder(const SampleComparator<SampleEntry>& sample_comparator,
+                              ThreadTree* thread_tree)
+      : ReportCmdSampleTreeBuilder(sample_comparator, thread_tree) { }
+
+ protected:
+  uint64_t GetPeriod(const SampleRecord& r) override {
+    return r.period_data.period;
+  }
+};
+
+// Build sample tree based on the time difference between current sample and next sample.
+class TimestampSampleTreeBuilder : public ReportCmdSampleTreeBuilder {
+ public:
+  TimestampSampleTreeBuilder(const SampleComparator<SampleEntry>& sample_comparator,
+                             ThreadTree* thread_tree)
+      : ReportCmdSampleTreeBuilder(sample_comparator, thread_tree) { }
+
+  void ReportCmdProcessSampleRecord(std::shared_ptr<SampleRecord>& r) override {
+    pid_t tid = static_cast<pid_t>(r->tid_data.tid);
+    auto it = next_sample_cache_.find(tid);
+    if (it == next_sample_cache_.end()) {
+      next_sample_cache_[tid] = r;
+    } else {
+      std::shared_ptr<SampleRecord> cur = it->second;
+      it->second = r;
+      ProcessSampleRecord(*cur);
+    }
+  }
+
+ protected:
+  uint64_t GetPeriod(const SampleRecord& r) override {
+    auto it = next_sample_cache_.find(r.tid_data.tid);
+    CHECK(it != next_sample_cache_.end());
+    // Normally the samples are sorted by time, but check here for safety.
+    if (it->second->time_data.time > r.time_data.time) {
+      return it->second->time_data.time - r.time_data.time;
+    }
+    return 1u;
+  }
+
+ private:
+  std::unordered_map<pid_t, std::shared_ptr<SampleRecord>> next_sample_cache_;
+};
+
 struct SampleTreeBuilderOptions {
   SampleComparator<SampleEntry> comparator;
   ThreadTree* thread_tree;
@@ -266,10 +322,15 @@
   bool build_callchain;
   bool use_caller_as_callchain_root;
   bool strict_unwind_arch_check;
+  bool trace_offcpu;
 
   std::unique_ptr<ReportCmdSampleTreeBuilder> CreateSampleTreeBuilder() {
-    std::unique_ptr<ReportCmdSampleTreeBuilder> builder(
-        new ReportCmdSampleTreeBuilder(comparator, thread_tree));
+    std::unique_ptr<ReportCmdSampleTreeBuilder> builder;
+    if (trace_offcpu) {
+      builder.reset(new TimestampSampleTreeBuilder(comparator, thread_tree));
+    } else {
+      builder.reset(new EventCountSampleTreeBuilder(comparator, thread_tree));
+    }
     builder->SetFilters(pid_filter, tid_filter, comm_filter, dso_filter, symbol_filter);
     builder->SetBranchSampleOption(use_branch_address);
     builder->SetCallChainSampleOptions(accumulate_callchain, build_callchain,
@@ -365,16 +426,20 @@
         callgraph_max_stack_(UINT32_MAX),
         callgraph_percent_limit_(0),
         raw_period_(false),
-        brief_callgraph_(true) {}
+        brief_callgraph_(true),
+        trace_offcpu_(false),
+        sched_switch_attr_id_(0u) {}
 
   bool Run(const std::vector<std::string>& args);
 
  private:
   bool ParseOptions(const std::vector<std::string>& args);
+  bool ReadMetaInfoFromRecordFile();
   bool ReadEventAttrFromRecordFile();
   bool ReadFeaturesFromRecordFile();
   bool ReadSampleTreeFromRecordFile();
   bool ProcessRecord(std::unique_ptr<Record> record);
+  void ProcessSampleRecordInTraceOffCpuMode(std::unique_ptr<Record> record, size_t attr_id);
   bool ProcessTracingData(const std::vector<char>& data);
   bool PrintReport();
   void PrintReportContext(FILE* fp);
@@ -401,8 +466,12 @@
   double callgraph_percent_limit_;
   bool raw_period_;
   bool brief_callgraph_;
+  bool trace_offcpu_;
+  size_t sched_switch_attr_id_;
 
   std::string report_filename_;
+  std::unordered_map<std::string, std::string> meta_info_;
+  std::unique_ptr<ScopedEventTypes> scoped_event_types_;
 };
 
 bool ReportCommand::Run(const std::vector<std::string>& args) {
@@ -416,6 +485,9 @@
   if (record_file_reader_ == nullptr) {
     return false;
   }
+  if (!ReadMetaInfoFromRecordFile()) {
+    return false;
+  }
   if (!ReadEventAttrFromRecordFile()) {
     return false;
   }
@@ -676,8 +748,30 @@
   return true;
 }
 
+bool ReportCommand::ReadMetaInfoFromRecordFile() {
+  if (record_file_reader_->HasFeature(PerfFileFormat::FEAT_META_INFO)) {
+    if (!record_file_reader_->ReadMetaInfoFeature(&meta_info_)) {
+      return false;
+    }
+    auto it = meta_info_.find("system_wide_collection");
+    if (it != meta_info_.end()) {
+      system_wide_collection_ = it->second == "true";
+    }
+    it = meta_info_.find("trace_offcpu");
+    if (it != meta_info_.end()) {
+      trace_offcpu_ = it->second == "true";
+    }
+    it = meta_info_.find("event_type_info");
+    if (it != meta_info_.end()) {
+      scoped_event_types_.reset(new ScopedEventTypes(it->second));
+    }
+  }
+  return true;
+}
+
 bool ReportCommand::ReadEventAttrFromRecordFile() {
   std::vector<EventAttrWithId> attrs = record_file_reader_->AttrSection();
+  LOG(ERROR) << "attrs.size() = " << attrs.size();
   for (const auto& attr_with_id : attrs) {
     EventAttrWithName attr;
     attr.attr = *attr_with_id.attr;
@@ -698,6 +792,17 @@
       return false;
     }
   }
+  if (trace_offcpu_) {
+    size_t i;
+    for (i = 0; i < event_attrs_.size(); ++i) {
+      LOG(ERROR) << "event_attrs[" << i << "].name = " << event_attrs_[i].name;
+      if (event_attrs_[i].name == "sched:sched_switch") {
+        break;
+      }
+    }
+    CHECK_NE(i, event_attrs_.size());
+    sched_switch_attr_id_ = i;
+  }
   return true;
 }
 
@@ -716,19 +821,21 @@
   std::vector<std::string> cmdline = record_file_reader_->ReadCmdlineFeature();
   if (!cmdline.empty()) {
     record_cmdline_ = android::base::Join(cmdline, ' ');
-    // TODO: the code to detect system wide collection option is fragile, remove
-    // it once we can do cross unwinding.
-    for (size_t i = 0; i < cmdline.size(); i++) {
-      std::string& s = cmdline[i];
-      if (s == "-a") {
-        system_wide_collection_ = true;
-        break;
-      } else if (s == "--call-graph" || s == "--cpu" || s == "-e" ||
-                 s == "-f" || s == "-F" || s == "-j" || s == "-m" ||
-                 s == "-o" || s == "-p" || s == "-t") {
-        i++;
-      } else if (!s.empty() && s[0] != '-') {
-        break;
+    if (meta_info_.find("system_wide_collection") == meta_info_.end()) {
+      // TODO: the code to detect system wide collection option is fragile, remove
+      // it once we can do cross unwinding.
+      for (size_t i = 0; i < cmdline.size(); i++) {
+        std::string& s = cmdline[i];
+        if (s == "-a") {
+          system_wide_collection_ = true;
+          break;
+        } else if (s == "--call-graph" || s == "--cpu" || s == "-e" ||
+                   s == "-f" || s == "-F" || s == "-j" || s == "-m" ||
+                   s == "-o" || s == "-p" || s == "-t") {
+          i++;
+        } else if (!s.empty() && s[0] != '-') {
+          break;
+        }
       }
     }
   }
@@ -753,6 +860,7 @@
   sample_tree_builder_options_.accumulate_callchain = accumulate_callchain_;
   sample_tree_builder_options_.build_callchain = print_callgraph_;
   sample_tree_builder_options_.use_caller_as_callchain_root = !callgraph_show_callee_;
+  sample_tree_builder_options_.trace_offcpu = trace_offcpu_;
 
   for (size_t i = 0; i < event_attrs_.size(); ++i) {
     sample_tree_builder_.push_back(sample_tree_builder_options_.CreateSampleTreeBuilder());
@@ -775,8 +883,12 @@
   thread_tree_.Update(*record);
   if (record->type() == PERF_RECORD_SAMPLE) {
     size_t attr_id = record_file_reader_->GetAttrIndexOfRecord(record.get());
-    sample_tree_builder_[attr_id]->ProcessSampleRecord(
-        *static_cast<const SampleRecord*>(record.get()));
+    if (!trace_offcpu_) {
+      sample_tree_builder_[attr_id]->ReportCmdProcessSampleRecord(
+          *static_cast<SampleRecord*>(record.get()));
+    } else {
+      ProcessSampleRecordInTraceOffCpuMode(std::move(record), attr_id);
+    }
   } else if (record->type() == PERF_RECORD_TRACING_DATA) {
     const auto& r = *static_cast<TracingDataRecord*>(record.get());
     if (!ProcessTracingData(std::vector<char>(r.data, r.data + r.data_size))) {
@@ -786,6 +898,24 @@
   return true;
 }
 
+
+void ReportCommand::ProcessSampleRecordInTraceOffCpuMode(std::unique_ptr<Record> record,
+                                                         size_t attr_id) {
+  std::shared_ptr<SampleRecord> r(static_cast<SampleRecord*>(record.release()));
+  if (attr_id == sched_switch_attr_id_) {
+    // If this sample belongs to sched_switch event, we should broadcast the offcpu info
+    // to other event types.
+    for (size_t i = 0; i < event_attrs_.size(); ++i) {
+      if (i == sched_switch_attr_id_) {
+        continue;
+      }
+      sample_tree_builder_[i]->ReportCmdProcessSampleRecord(r);
+    }
+  } else {
+    sample_tree_builder_[attr_id]->ReportCmdProcessSampleRecord(r);
+  }
+}
+
 bool ReportCommand::ProcessTracingData(const std::vector<char>& data) {
   Tracing tracing(data);
   for (auto& attr : event_attrs_) {
@@ -810,6 +940,9 @@
   }
   PrintReportContext(report_fp);
   for (size_t i = 0; i < event_attrs_.size(); ++i) {
+    if (trace_offcpu_ && i == sched_switch_attr_id_) {
+      continue;
+    }
     if (i != 0) {
       fprintf(report_fp, "\n");
     }
@@ -823,7 +956,8 @@
               sample_tree.total_error_callchains,
               sample_tree.total_error_callchains * 100.0 / sample_tree.total_samples);
     }
-    fprintf(report_fp, "Event count: %" PRIu64 "\n\n", sample_tree.total_period);
+    const char* period_prefix = trace_offcpu_ ? "Time in ns" : "Event count";
+    fprintf(report_fp, "%s: %" PRIu64 "\n\n", period_prefix, sample_tree.total_period);
     sample_tree_displayer_->DisplaySamples(report_fp, sample_tree.samples, &sample_tree);
   }
   fflush(report_fp);
diff --git a/simpleperf/cmd_report_test.cpp b/simpleperf/cmd_report_test.cpp
index f1f5b1f..f1a9eb6 100644
--- a/simpleperf/cmd_report_test.cpp
+++ b/simpleperf/cmd_report_test.cpp
@@ -471,6 +471,21 @@
   ASSERT_EQ(content.find("skipped in brief callgraph mode"), std::string::npos);
 }
 
+TEST_F(ReportCommandTest, report_offcpu_time) {
+  Report(PERF_DATA_WITH_TRACE_OFFCPU, {"--children"});
+  ASSERT_TRUE(success);
+  ASSERT_NE(content.find("Time in ns"), std::string::npos);
+  bool found = false;
+  for (auto& line : lines) {
+    if (line.find("SleepFunction") != std::string::npos) {
+      ASSERT_NE(line.find("46.29%"), std::string::npos);
+      found = true;
+      break;
+    }
+  }
+  ASSERT_TRUE(found);
+}
+
 #if defined(__linux__)
 #include "event_selection_set.h"
 
diff --git a/simpleperf/event_fd.cpp b/simpleperf/event_fd.cpp
index c6edb24..26f542f 100644
--- a/simpleperf/event_fd.cpp
+++ b/simpleperf/event_fd.cpp
@@ -56,7 +56,11 @@
   if (attr.freq) {
     uint64_t max_sample_freq;
     if (GetMaxSampleFrequency(&max_sample_freq) && max_sample_freq < attr.sample_freq) {
-      PLOG(INFO) << "Adjust sample freq to max allowed sample freq " << max_sample_freq;
+      static bool warned = false;
+      if (!warned) {
+        warned = true;
+        PLOG(INFO) << "Adjust sample freq to max allowed sample freq " << max_sample_freq;
+      }
       real_attr.sample_freq = max_sample_freq;
     }
   }
diff --git a/simpleperf/event_selection_set.cpp b/simpleperf/event_selection_set.cpp
index 86f57b9..5f8cf5c 100644
--- a/simpleperf/event_selection_set.cpp
+++ b/simpleperf/event_selection_set.cpp
@@ -16,6 +16,9 @@
 
 #include "event_selection_set.h"
 
+#include <atomic>
+#include <thread>
+
 #include <android-base/logging.h>
 
 #include "environment.h"
@@ -53,6 +56,53 @@
   return IsEventAttrSupported(attr);
 }
 
+bool IsDumpingRegsForTracepointEventsSupported() {
+  const EventType* event_type = FindEventTypeByName("sched:sched_switch");
+  if (event_type == nullptr) {
+    return false;
+  }
+  std::atomic<bool> done(false);
+  std::atomic<pid_t> thread_id(0);
+  std::thread thread([&]() {
+    thread_id = gettid();
+    while (!done) {
+      usleep(1);
+    }
+    usleep(1);  // Make a sched out to generate one sample.
+  });
+  while (thread_id == 0) {
+    usleep(1);
+  }
+  perf_event_attr attr = CreateDefaultPerfEventAttr(*event_type);
+  attr.freq = 0;
+  attr.sample_period = 1;
+  std::unique_ptr<EventFd> event_fd =
+      EventFd::OpenEventFile(attr, thread_id, -1, nullptr);
+  if (event_fd == nullptr) {
+    return false;
+  }
+  if (!event_fd->CreateMappedBuffer(4, true)) {
+    return false;
+  }
+  done = true;
+  thread.join();
+
+  std::vector<char> buffer;
+  size_t buffer_pos = 0;
+  size_t size = event_fd->GetAvailableMmapData(buffer, buffer_pos);
+  std::vector<std::unique_ptr<Record>> records =
+      ReadRecordsFromBuffer(attr, buffer.data(), size);
+  for (auto& r : records) {
+    if (r->type() == PERF_RECORD_SAMPLE) {
+      auto& record = *static_cast<SampleRecord*>(r.get());
+      if (record.ip_data.ip != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool EventSelectionSet::BuildAndCheckEventSelection(
     const std::string& event_name, EventSelection* selection) {
   std::unique_ptr<EventTypeAndModifier> event_type = ParseEventType(event_name);
@@ -116,6 +166,16 @@
   return true;
 }
 
+std::vector<const EventType*> EventSelectionSet::GetEvents() const {
+  std::vector<const EventType*> result;
+  for (const auto& group : groups_) {
+    for (const auto& selection : group) {
+      result.push_back(&selection.event_type_modifier.event_type);
+    }
+  }
+  return result;
+}
+
 std::vector<const EventType*> EventSelectionSet::GetTracepointEvents() const {
   std::vector<const EventType*> result;
   for (const auto& group : groups_) {
diff --git a/simpleperf/event_selection_set.h b/simpleperf/event_selection_set.h
index 97ad7e5..f5cb53a 100644
--- a/simpleperf/event_selection_set.h
+++ b/simpleperf/event_selection_set.h
@@ -73,6 +73,7 @@
 
   bool AddEventType(const std::string& event_name);
   bool AddEventGroup(const std::vector<std::string>& event_names);
+  std::vector<const EventType*> GetEvents() const;
   std::vector<const EventType*> GetTracepointEvents() const;
   bool HasInplaceSampler() const;
   std::vector<EventAttrWithId> GetEventAttrWithId() const;
@@ -186,5 +187,6 @@
 
 bool IsBranchSamplingSupported();
 bool IsDwarfCallChainSamplingSupported();
+bool IsDumpingRegsForTracepointEventsSupported();
 
 #endif  // SIMPLE_PERF_EVENT_SELECTION_SET_H_
diff --git a/simpleperf/event_type.cpp b/simpleperf/event_type.cpp
index bc639d1..21da17c 100644
--- a/simpleperf/event_type.cpp
+++ b/simpleperf/event_type.cpp
@@ -51,6 +51,9 @@
 std::string GetTracepointEvents() {
   std::string result;
   for (const EventType& event : GetAllEventTypes()) {
+    if (event.type != PERF_TYPE_TRACEPOINT) {
+      continue;
+    }
     if (!result.empty()) {
       result.push_back('\n');
     }
@@ -107,8 +110,37 @@
   return result;
 }
 
+static std::vector<EventType> event_type_array;
+
+std::string ScopedEventTypes::BuildString(const std::vector<const EventType*>& event_types) {
+  std::string result;
+  for (auto type : event_types) {
+    if (!result.empty()) {
+      result.push_back('\n');
+    }
+    result += android::base::StringPrintf("%s,%u,%" PRIu64, type->name.c_str(), type->type,
+                                          type->config);
+  }
+  return result;
+}
+
+ScopedEventTypes::ScopedEventTypes(const std::string& event_type_str) {
+  saved_event_types_ = std::move(event_type_array);
+  event_type_array.clear();
+  for (auto& s : android::base::Split(event_type_str, "\n")) {
+    std::string name = s.substr(0, s.find(','));
+    uint32_t type;
+    uint64_t config;
+    sscanf(s.c_str() + name.size(), ",%u,%" PRIu64, &type, &config);
+    event_type_array.emplace_back(name, type, config, "", "");
+  }
+}
+
+ScopedEventTypes::~ScopedEventTypes() {
+  event_type_array = std::move(saved_event_types_);
+}
+
 const std::vector<EventType>& GetAllEventTypes() {
-  static std::vector<EventType> event_type_array;
   if (event_type_array.empty()) {
     event_type_array.insert(event_type_array.end(), static_event_type_array.begin(),
                             static_event_type_array.end());
diff --git a/simpleperf/event_type.h b/simpleperf/event_type.h
index 6ec544d..1dd7f39 100644
--- a/simpleperf/event_type.h
+++ b/simpleperf/event_type.h
@@ -54,6 +54,19 @@
 
 bool SetTracepointEventsFilePath(const std::string& filepath);
 std::string GetTracepointEvents();
+
+// Used to temporarily change event types returned by GetAllEventTypes().
+class ScopedEventTypes {
+ public:
+  static std::string BuildString(const std::vector<const EventType*>& event_types);
+
+  ScopedEventTypes(const std::string& event_type_str);
+  ~ScopedEventTypes();
+
+ private:
+  std::vector<EventType> saved_event_types_;
+};
+
 const std::vector<EventType>& GetAllEventTypes();
 const EventType* FindEventTypeByName(const std::string& name);
 
diff --git a/simpleperf/get_test_data.h b/simpleperf/get_test_data.h
index 963fe00..3b9121c 100644
--- a/simpleperf/get_test_data.h
+++ b/simpleperf/get_test_data.h
@@ -108,4 +108,7 @@
 // generated by recording an app.
 static const std::string PERF_DATA_WITH_WRONG_IP_IN_CALLCHAIN = "wrong_ip_callchain_perf.data";
 
+// generated by `simpleperf record --trace-offcpu --duration 2 -g ./simpleperf_runtest_run_and_sleep64`.
+static const std::string PERF_DATA_WITH_TRACE_OFFCPU = "perf_with_trace_offcpu.data";
+
 #endif  // SIMPLE_PERF_GET_TEST_DATA_H_
diff --git a/simpleperf/runtest/Android.build.mk b/simpleperf/runtest/Android.build.mk
index 5a8f92d..65f5fdb 100644
--- a/simpleperf/runtest/Android.build.mk
+++ b/simpleperf/runtest/Android.build.mk
@@ -31,16 +31,16 @@
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.build.mk
 include $(BUILD_EXECUTABLE)
 
-ifeq ($(HOST_OS),linux)
 include $(CLEAR_VARS)
 LOCAL_CLANG := true
 LOCAL_CPPFLAGS := $(simpleperf_runtest_cppflags)
 LOCAL_SRC_FILES := $(module_src_files)
 LOCAL_SHARED_LIBRARIES := libsimpleperf_inplace_sampler
 LOCAL_MODULE := $(module)
+LOCAL_MODULE_HOST_OS := linux
 LOCAL_MULTILIB := both
 LOCAL_MODULE_STEM_32 := $(module)32
 LOCAL_MODULE_STEM_64 := $(module)64
+LOCAL_LDLIBS_linux := -lrt
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.build.mk
-include $(BUILD_HOST_EXECUTABLE)
-endif
\ No newline at end of file
+include $(BUILD_HOST_EXECUTABLE)
\ No newline at end of file
diff --git a/simpleperf/runtest/Android.mk b/simpleperf/runtest/Android.mk
index 55bf3b7..ad3f5c7 100644
--- a/simpleperf/runtest/Android.mk
+++ b/simpleperf/runtest/Android.mk
@@ -42,4 +42,8 @@
 
 module := simpleperf_runtest_function_indirect_recursive
 module_src_files := function_indirect_recursive.cpp
+include $(LOCAL_PATH)/Android.build.mk
+
+module := simpleperf_runtest_run_and_sleep
+module_src_files := run_and_sleep.cpp
 include $(LOCAL_PATH)/Android.build.mk
\ No newline at end of file
diff --git a/simpleperf/runtest/run_and_sleep.cpp b/simpleperf/runtest/run_and_sleep.cpp
new file mode 100644
index 0000000..5ed09ac
--- /dev/null
+++ b/simpleperf/runtest/run_and_sleep.cpp
@@ -0,0 +1,46 @@
+#include <inttypes.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#define noinline __attribute__((__noinline__))
+
+static inline uint64_t GetSystemClock() {
+  timespec ts;
+  // Assume clock_gettime() doesn't fail.
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+constexpr int LOOP_COUNT = 100000000;
+uint64_t noinline RunFunction() {
+  uint64_t start_time_in_ns = GetSystemClock();
+  for (volatile int i = 0; i < LOOP_COUNT; ++i) {
+  }
+  return GetSystemClock() - start_time_in_ns;
+}
+
+uint64_t noinline SleepFunction(unsigned long long sleep_time_in_ns) {
+  uint64_t start_time_in_ns = GetSystemClock();
+  struct timespec req;
+  req.tv_sec = sleep_time_in_ns / 1000000000;
+  req.tv_nsec = sleep_time_in_ns % 1000000000;
+  nanosleep(&req, nullptr);
+  return GetSystemClock() - start_time_in_ns;
+}
+
+void noinline GlobalFunction() {
+  uint64_t total_sleep_time_in_ns = 0;
+  uint64_t total_run_time_in_ns = 0;
+  while (true) {
+    total_run_time_in_ns += RunFunction();
+    if (total_sleep_time_in_ns < total_run_time_in_ns) {
+      total_sleep_time_in_ns += SleepFunction(total_run_time_in_ns - total_sleep_time_in_ns);
+    }
+  }
+}
+
+int main() {
+  GlobalFunction();
+  return 0;
+}
diff --git a/simpleperf/runtest/runtest.conf b/simpleperf/runtest/runtest.conf
index 2fbbd98..22babf9 100644
--- a/simpleperf/runtest/runtest.conf
+++ b/simpleperf/runtest/runtest.conf
@@ -203,4 +203,24 @@
   </symbol_overhead>
 </test>
 
+<test name="run_and_sleep">
+  <executable name="simpleperf_runtest_run_and_sleep"/>
+
+  <symbol_overhead>
+    <symbol name="RunFunction()" min="80"/>
+  </symbol_overhead>
+
+</test>
+
+<test name="run_and_sleep_trace_offcpu">
+  <executable name="simpleperf_runtest_run_and_sleep"/>
+  <disable_host/>
+  <record option="--trace-offcpu"/>
+
+  <symbol_children_overhead>
+    <symbol name="RunFunction()" min="20" max="80"/>
+    <symbol name="SleepFunction(unsigned long long)" min="20" max="80"/>
+  </symbol_children_overhead>
+</test>
+
 </runtests>
diff --git a/simpleperf/runtest/runtest.py b/simpleperf/runtest/runtest.py
index a1b4520..e4bfe16 100644
--- a/simpleperf/runtest/runtest.py
+++ b/simpleperf/runtest/runtest.py
@@ -168,12 +168,16 @@
           self,
           test_name,
           executable_name,
+          disable_host,
+          record_options,
           report_options,
           symbol_overhead_requirements,
           symbol_children_overhead_requirements,
           symbol_relation_requirements):
     self.test_name = test_name
     self.executable_name = executable_name
+    self.disable_host = disable_host
+    self.record_options = record_options
     self.report_options = report_options
     self.symbol_overhead_requirements = symbol_overhead_requirements
     self.symbol_children_overhead_requirements = (
@@ -184,6 +188,8 @@
     strs = []
     strs.append('Test test_name=%s' % self.test_name)
     strs.append('\texecutable_name=%s' % self.executable_name)
+    strs.append('\tdisable_host=%s' % self.disable_host)
+    strs.append('\trecord_options=%s' % (' '.join(self.record_options)))
     strs.append('\treport_options=%s' % (' '.join(self.report_options)))
     strs.append('\tsymbol_overhead_requirements:')
     for req in self.symbol_overhead_requirements:
@@ -206,6 +212,8 @@
     assert test.tag == 'test'
     test_name = test.attrib['name']
     executable_name = None
+    disable_host = False
+    record_options = []
     report_options = []
     symbol_overhead_requirements = []
     symbol_children_overhead_requirements = []
@@ -213,6 +221,10 @@
     for test_item in test:
       if test_item.tag == 'executable':
         executable_name = test_item.attrib['name']
+      elif test_item.tag == 'disable_host':
+        disable_host = True
+      elif test_item.tag == 'record':
+        record_options = test_item.attrib['option'].split()
       elif test_item.tag == 'report':
         report_options = test_item.attrib['option'].split()
       elif (test_item.tag == 'symbol_overhead' or
@@ -256,6 +268,8 @@
         Test(
             test_name,
             executable_name,
+            disable_host,
+            record_options,
             report_options,
             symbol_overhead_requirements,
             symbol_children_overhead_requirements,
@@ -403,18 +417,20 @@
         continue
       if not line[0].isspace():
         if has_callgraph:
-          m = re.search(r'^([\d\.]+)%\s+([\d\.]+)%\s+(\S+).*\s+(\S+)$', line)
-          children_overhead = float(m.group(1))
-          overhead = float(m.group(2))
-          comm = m.group(3)
-          symbol_name = m.group(4)
+          items = line.split(None, 6)
+          assert len(items) == 7
+          children_overhead = float(items[0][:-1])
+          overhead = float(items[1][:-1])
+          comm = items[2]
+          symbol_name = items[6]
           cur_symbol = Symbol(symbol_name, comm, overhead, children_overhead)
           symbols.append(cur_symbol)
         else:
-          m = re.search(r'^([\d\.]+)%\s+(\S+).*\s+(\S+)$', line)
-          overhead = float(m.group(1))
-          comm = m.group(2)
-          symbol_name = m.group(3)
+          items = line.split(None, 5)
+          assert len(items) == 6
+          overhead = float(items[0][:-1])
+          comm = items[1]
+          symbol_name = items[5]
           cur_symbol = Symbol(symbol_name, comm, overhead, 0)
           symbols.append(cur_symbol)
         # Each report item can have different column depths.
@@ -551,7 +567,10 @@
 def test_with_runner(runner, tests):
   report_analyzer = ReportAnalyzer()
   for test in tests:
-    runner.record(test.executable_name, 'perf.data')
+    if test.disable_host and runner.target.startswith('host'):
+      print('Skip test %s on %s' % (test.test_name, runner.target))
+      continue
+    runner.record(test.executable_name, 'perf.data', additional_options = test.record_options)
     if runner.sampler == 'inplace-sampler':
       # TODO: fix this when inplace-sampler actually works.
       runner.report('perf.data', 'perf.report')
diff --git a/simpleperf/sample_tree.h b/simpleperf/sample_tree.h
index c7a0ac8..67f28da 100644
--- a/simpleperf/sample_tree.h
+++ b/simpleperf/sample_tree.h
@@ -53,7 +53,7 @@
 template <typename EntryT, typename AccumulateInfoT>
 class SampleTreeBuilder {
  public:
-  explicit SampleTreeBuilder(SampleComparator<EntryT> comparator)
+  explicit SampleTreeBuilder(const SampleComparator<EntryT>& comparator)
       : sample_set_(comparator),
         accumulate_callchain_(false),
         sample_comparator_(comparator),
diff --git a/simpleperf/test_util.h b/simpleperf/test_util.h
index f242c89..bb061ba 100644
--- a/simpleperf/test_util.h
+++ b/simpleperf/test_util.h
@@ -40,3 +40,9 @@
       GTEST_LOG_(INFO) << "Didn't test \"" << #TestStatement << "\" requires root privileges";     \
     }                                                                                              \
   } while (0)
+
+#if defined(__ANDROID__)
+#define TEST_REQUIRE_HOST_ROOT()
+#else
+#define TEST_REQUIRE_HOST_ROOT()  if (!IsRoot()) return
+#endif
diff --git a/simpleperf/testdata/perf_with_trace_offcpu.data b/simpleperf/testdata/perf_with_trace_offcpu.data
new file mode 100644
index 0000000..6d0c5e0
--- /dev/null
+++ b/simpleperf/testdata/perf_with_trace_offcpu.data
Binary files differ