simpleperf: Switch to use post-unwind by default in record cmd.

When recording google.sample.tunnel app for 30s:
It took 3s to unwind samples and write unwound samples to file.
It took 0.3s to write samples containing stack/reg data to file.

The result shows recording with post unwinding consumes much
less time than unwinding samples immediately. This means we can
record with higher freq and get smaller lose rate when using
post unwinding. So make below changes:
1. Make post unwinding by default.
2. Replace --post-unwind with --no-post-unwind option.
3. Make --trace-offcpu and callchain joiner work with post unwinding.
4. Remove special operations in --log debug mode. Those will be
   supported in a new command.

Bug: http://b/72556486
Test: run simpleperf_unit_test.
Test: run python test.py.

Change-Id: I9a5a5defda9d040985e674c43db19ee68e7aa305
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index 1e2bbc1..2a6bf33 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -161,12 +161,13 @@
 "-m mmap_pages   Set the size of the buffer used to receiving sample data from\n"
 "                the kernel. It should be a power of 2. If not set, the max\n"
 "                possible value <= 1024 will be used.\n"
-"--no-dump-kernel-symbols  Don't dump kernel symbols in perf.data. By default\n"
-"                          kernel symbols will be dumped when needed.\n"
-"--no-dump-symbols       Don't dump symbols in perf.data. By default symbols are\n"
-"                        dumped in perf.data, to support reporting in another\n"
-"                        environment.\n"
 "--no-inherit  Don't record created child threads/processes.\n"
+"\n"
+"Dwarf unwinding options:\n"
+"--no-post-unwind   If `--call-graph dwarf` option is used, then the user's stack\n"
+"                   will be recorded in perf.data and unwound after recording.\n"
+"                   However, this takes a lot of disk space. Use this option to\n"
+"                   unwind while recording.\n"
 "--no-unwind   If `--call-graph dwarf` option is used, then the user's stack\n"
 "              will be unwound by default. Use this option to disable the\n"
 "              unwinding of the user's stack.\n"
@@ -177,11 +178,14 @@
 "--callchain-joiner-min-matching-nodes count\n"
 "               When callchain joiner is used, set the matched nodes needed to join\n"
 "               callchains. The count should be >= 1. By default it is 1.\n"
+"\n"
+"Recording file options:\n"
+"--no-dump-kernel-symbols  Don't dump kernel symbols in perf.data. By default\n"
+"                          kernel symbols will be dumped when needed.\n"
+"--no-dump-symbols       Don't dump symbols in perf.data. By default symbols are\n"
+"                        dumped in perf.data, to support reporting in another\n"
+"                        environment.\n"
 "-o record_file_name    Set record file name, default is perf.data.\n"
-"--post-unwind  If `--call-graph dwarf` option is used, then the user's stack\n"
-"               will be unwound while recording by default. But it may lose\n"
-"               records as stacking unwinding can be time consuming. Use this\n"
-"               option to unwind the user's stack after recording.\n"
 "--exit-with-parent            Stop recording when the process starting\n"
 "                              simpleperf dies.\n"
 "--start_profiling_fd fd_no    After starting profiling, write \"STARTED\" to\n"
@@ -202,7 +206,7 @@
         dwarf_callchain_sampling_(false),
         dump_stack_size_in_dwarf_sampling_(MAX_DUMP_STACK_SIZE),
         unwind_dwarf_callchain_(true),
-        post_unwind_(false),
+        post_unwind_(true),
         child_inherit_(true),
         duration_in_sec_(0),
         can_dump_kernel_symbols_(true),
@@ -246,9 +250,13 @@
   bool DumpKernelAndModuleMmaps(const perf_event_attr& attr, uint64_t event_id);
   bool DumpThreadCommAndMmaps(const perf_event_attr& attr, uint64_t event_id);
   bool ProcessRecord(Record* record);
+  bool SaveRecordForPostUnwinding(Record* record);
+  bool SaveRecordAfterUnwinding(Record* record);
+  bool SaveRecordWithoutUnwinding(Record* record);
+
   void UpdateRecordForEmbeddedElfPath(Record* record);
   bool UnwindRecord(SampleRecord& r);
-  bool PostUnwind(const std::vector<std::string>& args);
+  bool PostUnwindRecords();
   bool JoinCallChains();
   bool DumpAdditionalFeatures(const std::vector<std::string>& args);
   bool DumpBuildIdFeature();
@@ -361,14 +369,12 @@
     // Normally do strict arch check when unwinding stack. But allow unwinding
     // 32-bit processes on 64-bit devices for system wide profiling.
     bool strict_arch_check = !system_wide_collection_;
-    bool collect_stat = WOULD_LOG(DEBUG);
-    offline_unwinder_.reset(new OfflineUnwinder(strict_arch_check, collect_stat));
+    offline_unwinder_.reset(new OfflineUnwinder(strict_arch_check, false));
   }
-  if (unwind_dwarf_callchain_ && !post_unwind_ && allow_callchain_joiner_) {
-    bool keep_original_callchains = WOULD_LOG(DEBUG);
+  if (unwind_dwarf_callchain_ && allow_callchain_joiner_) {
     callchain_joiner_.reset(new CallChainJoiner(DEFAULT_CALL_CHAIN_JOINER_CACHE_SIZE,
                                                 callchain_joiner_min_matching_nodes_,
-                                                keep_original_callchains));
+                                                false));
   }
 
   // 4. Add monitored targets.
@@ -469,12 +475,19 @@
 }
 
 bool RecordCommand::PostProcessRecording(const std::vector<std::string>& args) {
-  // 1. Optionally join Callchains.
+  // 1. Post unwind dwarf callchain.
+  if (unwind_dwarf_callchain_ && post_unwind_) {
+    if (!PostUnwindRecords()) {
+      return false;
+    }
+  }
+
+  // 2. Optionally join Callchains.
   if (callchain_joiner_) {
     JoinCallChains();
   }
 
-  // 2. Dump additional features, and close record file.
+  // 3. Dump additional features, and close record file.
   if (!DumpAdditionalFeatures(args)) {
     return false;
   }
@@ -482,13 +495,6 @@
     return false;
   }
 
-  // 3. Post unwind dwarf callchain.
-  if (post_unwind_) {
-    if (!PostUnwind(args)) {
-      return false;
-    }
-  }
-
   // 4. Show brief record result.
   LOG(INFO) << "Samples recorded: " << sample_record_count_
             << ". Samples lost: " << lost_record_count_ << ".";
@@ -704,8 +710,8 @@
         return false;
       }
       event_selection_set_.AddMonitoredProcesses(pids);
-    } else if (args[i] == "--post-unwind") {
-      post_unwind_ = true;
+    } else if (args[i] == "--no-post-unwind") {
+      post_unwind_ = false;
     } else if (args[i] == "--start_profiling_fd") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
@@ -754,13 +760,17 @@
     unwind_dwarf_callchain_ = false;
   }
   if (post_unwind_) {
+    if (!dwarf_callchain_sampling_ || !unwind_dwarf_callchain_) {
+      post_unwind_ = false;
+    }
+  } else {
     if (!dwarf_callchain_sampling_) {
       LOG(ERROR)
-          << "--post-unwind is only used with `--call-graph dwarf` option.";
+          << "--no-post-unwind is only used with `--call-graph dwarf` option.";
       return false;
     }
     if (!unwind_dwarf_callchain_) {
-      LOG(ERROR) << "--post-unwind can't be used with `--no-unwind` option.";
+      LOG(ERROR) << "--no-post-unwind can't be used with `--no-unwind` option.";
       return false;
     }
   }
@@ -1005,26 +1015,35 @@
 }
 
 bool RecordCommand::ProcessRecord(Record* record) {
-  if (system_wide_collection_ && record->type() == PERF_RECORD_SAMPLE) {
-    auto& r = *static_cast<SampleRecord*>(record);
-    // Omit samples get before start sampling time.
-    if (r.time_data.time < start_sampling_time_in_ns_) {
-      return true;
+  if (unwind_dwarf_callchain_) {
+    if (post_unwind_) {
+      return SaveRecordForPostUnwinding(record);
     }
+    return SaveRecordAfterUnwinding(record);
   }
-  UpdateRecordForEmbeddedElfPath(record);
+  return SaveRecordWithoutUnwinding(record);
+}
 
-  bool unwind_callchain = unwind_dwarf_callchain_ && !post_unwind_;
+bool RecordCommand::SaveRecordForPostUnwinding(Record* record) {
+  if (record->type() == PERF_RECORD_SAMPLE) {
+    static_cast<SampleRecord*>(record)->RemoveInvalidStackData();
+  }
+  if (!record_file_writer_->WriteRecord(*record)) {
+    LOG(ERROR) << "If there isn't enough space for storing profiling data, consider using "
+               << "--no-post-unwind option.";
+    return false;
+  }
+  return true;
+}
 
+bool RecordCommand::SaveRecordAfterUnwinding(Record* record) {
   if (record->type() == PERF_RECORD_SAMPLE) {
     auto& r = *static_cast<SampleRecord*>(record);
     // AdjustCallChainGeneratedByKernel() should go before UnwindRecord(). Because we don't want
     // to adjust callchains generated by dwarf unwinder.
-    if (fp_callchain_sampling_ || dwarf_callchain_sampling_) {
-      r.AdjustCallChainGeneratedByKernel();
-    }
-    if (unwind_callchain) {
-      UnwindRecord(r);
+    r.AdjustCallChainGeneratedByKernel();
+    if (!UnwindRecord(r)) {
+      return false;
     }
     // ExcludeKernelCallChain() should go after UnwindRecord() to notice the generated user call
     // chain.
@@ -1035,11 +1054,28 @@
     sample_record_count_++;
   } else if (record->type() == PERF_RECORD_LOST) {
     lost_record_count_ += static_cast<LostRecord*>(record)->lost;
-  } else if (unwind_callchain) {
+  } else {
+    UpdateRecordForEmbeddedElfPath(record);
     thread_tree_.Update(*record);
   }
-  bool result = record_file_writer_->WriteRecord(*record);
-  return result;
+  return record_file_writer_->WriteRecord(*record);
+}
+
+bool RecordCommand::SaveRecordWithoutUnwinding(Record* record) {
+  if (record->type() == PERF_RECORD_SAMPLE) {
+    auto& r = *static_cast<SampleRecord*>(record);
+    if (fp_callchain_sampling_ || dwarf_callchain_sampling_) {
+      r.AdjustCallChainGeneratedByKernel();
+    }
+    if (r.InKernel() && exclude_kernel_callchain_ && r.ExcludeKernelCallChain() == 0u) {
+      // If current record contains no user callchain, skip it.
+      return true;
+    }
+    sample_record_count_++;
+  } else if (record->type() == PERF_RECORD_LOST) {
+    lost_record_count_ += static_cast<LostRecord*>(record)->lost;
+  }
+  return record_file_writer_->WriteRecord(*record);
 }
 
 template <class RecordType>
@@ -1093,13 +1129,6 @@
                                             &ips, &sps)) {
       return false;
     }
-    if (offline_unwinder_->HasStat()) {
-      const UnwindingResult& unwinding_result = offline_unwinder_->GetUnwindingResult();
-      UnwindingResultRecord record(r.time_data.time, unwinding_result);
-      if (!record_file_writer_->WriteRecord(record)) {
-        return false;
-      }
-    }
     r.ReplaceRegAndStackWithCallChain(ips);
     if (callchain_joiner_) {
       return callchain_joiner_->AddCallChain(r.tid_data.pid, r.tid_data.tid,
@@ -1109,49 +1138,32 @@
   return true;
 }
 
-bool RecordCommand::PostUnwind(const std::vector<std::string>& args) {
-  thread_tree_.ClearThreadAndMap();
-  std::unique_ptr<RecordFileReader> reader =
-      RecordFileReader::CreateInstance(record_filename_);
-  if (reader == nullptr) {
-    return false;
-  }
-  std::string tmp_filename = record_filename_ + ".tmp";
-  record_file_writer_ = CreateRecordFile(tmp_filename);
-  if (record_file_writer_ == nullptr) {
-    return false;
-  }
-  bool result = reader->ReadDataSection(
-      [this](std::unique_ptr<Record> record) {
-        thread_tree_.Update(*record);
-        if (record->type() == PERF_RECORD_SAMPLE) {
-          if (!UnwindRecord(*static_cast<SampleRecord*>(record.get()))) {
-            return false;
-          }
-        }
-        return record_file_writer_->WriteRecord(*record);
-      },
-      false);
-  if (!result) {
-    return false;
-  }
-  if (!DumpAdditionalFeatures(args)) {
-    return false;
-  }
+bool RecordCommand::PostUnwindRecords() {
+  // 1. Move records from record_filename_ to a temporary file.
   if (!record_file_writer_->Close()) {
     return false;
   }
+  record_file_writer_.reset();
+  std::unique_ptr<TemporaryFile> tmpfile = CreateTempFileUsedInRecording();
+  if (!Workload::RunCmd({"mv", record_filename_, tmpfile->path})) {
+    return false;
+  }
+  std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance(tmpfile->path);
+  if (!reader) {
+    return false;
+  }
 
-  if (unlink(record_filename_.c_str()) != 0) {
-    PLOG(ERROR) << "failed to remove " << record_filename_;
+  // 2. Read records from the temporary file, and write unwound records back to record_filename_.
+  record_file_writer_ = CreateRecordFile(record_filename_);
+  if (!record_file_writer_) {
     return false;
   }
-  if (rename(tmp_filename.c_str(), record_filename_.c_str()) != 0) {
-    PLOG(ERROR) << "failed to rename " << tmp_filename << " to "
-                << record_filename_;
-    return false;
-  }
-  return true;
+  sample_record_count_ = 0;
+  lost_record_count_ = 0;
+  auto callback = [this](std::unique_ptr<Record> record) {
+    return SaveRecordAfterUnwinding(record.get());
+  };
+  return reader->ReadDataSection(callback, false);
 }
 
 bool RecordCommand::JoinCallChains() {
@@ -1176,7 +1188,6 @@
   if (!reader || !record_file_writer_) {
     return false;
   }
-  bool store_callchains = WOULD_LOG(DEBUG);
 
   auto record_callback = [&](std::unique_ptr<Record> r) {
     if (r->type() != PERF_RECORD_SAMPLE) {
@@ -1191,17 +1202,10 @@
     CallChainJoiner::ChainType type;
     std::vector<uint64_t> ips;
     std::vector<uint64_t> sps;
-    do {
-      if (!callchain_joiner_->GetNextCallChain(pid, tid, type, ips, sps)) {
-        return false;
-      }
-      if (store_callchains) {
-        CallChainRecord record(pid, tid, type, sr.Timestamp(), ips, sps);
-        if (!record_file_writer_->WriteRecord(record)) {
-          return false;
-        }
-      }
-    } while (type != CallChainJoiner::JOINED_OFFLINE);
+    if (!callchain_joiner_->GetNextCallChain(pid, tid, type, ips, sps)) {
+      return false;
+    }
+    CHECK_EQ(type, CallChainJoiner::JOINED_OFFLINE);
     CHECK_EQ(pid, static_cast<pid_t>(sr.tid_data.pid));
     CHECK_EQ(tid, static_cast<pid_t>(sr.tid_data.tid));
     sr.UpdateUserCallChain(ips);
diff --git a/simpleperf/cmd_record_test.cpp b/simpleperf/cmd_record_test.cpp
index 660e679..7869037 100644
--- a/simpleperf/cmd_record_test.cpp
+++ b/simpleperf/cmd_record_test.cpp
@@ -241,16 +241,16 @@
   ASSERT_FALSE(RunRecordCmd({"--no-unwind"}));
 }
 
-TEST(record_cmd, post_unwind_option) {
+TEST(record_cmd, no_post_unwind_option) {
   OMIT_TEST_ON_NON_NATIVE_ABIS();
   ASSERT_TRUE(IsDwarfCallChainSamplingSupported());
   std::vector<std::unique_ptr<Workload>> workloads;
   CreateProcesses(1, &workloads);
   std::string pid = std::to_string(workloads[0]->GetPid());
-  ASSERT_TRUE(RunRecordCmd({"-p", pid, "--call-graph", "dwarf", "--post-unwind"}));
-  ASSERT_FALSE(RunRecordCmd({"--post-unwind"}));
+  ASSERT_TRUE(RunRecordCmd({"-p", pid, "--call-graph", "dwarf", "--no-post-unwind"}));
+  ASSERT_FALSE(RunRecordCmd({"--no-post-unwind"}));
   ASSERT_FALSE(
-      RunRecordCmd({"--call-graph", "dwarf", "--no-unwind", "--post-unwind"}));
+      RunRecordCmd({"--call-graph", "dwarf", "--no-unwind", "--no-post-unwind"}));
 }
 
 TEST(record_cmd, existing_processes) {
diff --git a/simpleperf/record.cpp b/simpleperf/record.cpp
index 44e3d55..a03f377 100644
--- a/simpleperf/record.cpp
+++ b/simpleperf/record.cpp
@@ -712,6 +712,31 @@
   UpdateBinary(new_binary);
 }
 
+// When simpleperf requests the kernel to dump 64K stack per sample, it will allocate 64K space in
+// each sample to store stack data. However, a thread may use less stack than 64K. So not all the
+// 64K stack data in a sample is valid. And this function is used to remove invalid stack data in
+// a sample, which can save time and disk space when storing samples in file.
+void SampleRecord::RemoveInvalidStackData() {
+  if (sample_type & PERF_SAMPLE_STACK_USER) {
+    uint64_t valid_stack_size = GetValidStackSize();
+    if (stack_user_data.size > valid_stack_size) {
+      // Shrink stack size to valid_stack_size, and update it in binary.
+      stack_user_data.size = valid_stack_size;
+      char* p = stack_user_data.data - sizeof(stack_user_data.size);
+      MoveToBinaryFormat(stack_user_data.size, p);
+      p += valid_stack_size;
+      // Update dyn_size in binary.
+      if (valid_stack_size != 0u) {
+        MoveToBinaryFormat(stack_user_data.dyn_size, p);
+      }
+      // Update sample size.
+      header.size = p - binary_;
+      p = binary_;
+      header.MoveToBinaryFormat(p);
+    }
+  }
+}
+
 void SampleRecord::DumpData(size_t indent) const {
   PrintIndented(indent, "sample_type: 0x%" PRIx64 "\n", sample_type);
   if (sample_type & PERF_SAMPLE_IP) {
diff --git a/simpleperf/record.h b/simpleperf/record.h
index 6e1cb37..b862e40 100644
--- a/simpleperf/record.h
+++ b/simpleperf/record.h
@@ -395,6 +395,7 @@
   size_t ExcludeKernelCallChain();
   bool HasUserCallChain() const;
   void UpdateUserCallChain(const std::vector<uint64_t>& user_ips);
+  void RemoveInvalidStackData();
 
   uint64_t Timestamp() const override;
   uint32_t Cpu() const override;
diff --git a/simpleperf/record_file.h b/simpleperf/record_file.h
index 32c91fa..4e09d48 100644
--- a/simpleperf/record_file.h
+++ b/simpleperf/record_file.h
@@ -36,6 +36,8 @@
 #include "thread_tree.h"
 
 // RecordFileWriter writes to a perf record file, like perf.data.
+// User should call RecordFileWriter::Close() to finish writing the file, otherwise the file will
+// be removed in RecordFileWriter::~RecordFileWriter().
 class RecordFileWriter {
  public:
   static std::unique_ptr<RecordFileWriter> CreateInstance(const std::string& filename);
@@ -59,9 +61,6 @@
   bool WriteMetaInfoFeature(const std::unordered_map<std::string, std::string>& info_map);
   bool EndWriteFeatures();
 
-  // Normally, Close() should be called after writing. But if something
-  // wrong happens and we need to finish in advance, the destructor
-  // will take care of calling Close().
   bool Close();
 
  private:
diff --git a/simpleperf/record_file_writer.cpp b/simpleperf/record_file_writer.cpp
index 083a746..6d0fdd9 100644
--- a/simpleperf/record_file_writer.cpp
+++ b/simpleperf/record_file_writer.cpp
@@ -64,7 +64,8 @@
 
 RecordFileWriter::~RecordFileWriter() {
   if (record_fp_ != nullptr) {
-    Close();
+    fclose(record_fp_);
+    unlink(filename_.c_str());
   }
 }