Merge "Add switches for compressor"
diff --git a/ext4_utils/wipe.h b/ext4_utils/wipe.h
index bd119e3..c7a86fa 100644
--- a/ext4_utils/wipe.h
+++ b/ext4_utils/wipe.h
@@ -17,6 +17,10 @@
 #ifndef _WIPE_H_
 #define _WIPE_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include "ext4_utils.h"
 
 /* Set WIPE_IS_SUPPORTED to 1 if the current platform supports
@@ -30,4 +34,8 @@
 
 int wipe_block_device(int fd, s64 len);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/simpleperf/Android.mk b/simpleperf/Android.mk
index b923738..35bec54 100644
--- a/simpleperf/Android.mk
+++ b/simpleperf/Android.mk
@@ -17,8 +17,9 @@
 LOCAL_PATH := $(call my-dir)
 
 simpleperf_common_cppflags := -std=c++11 -Wall -Wextra -Werror -Wunused
+
 simpleperf_host_common_cppflags := $(simpleperf_common_cppflags) \
-                                   -DUSE_BIONIC_PERF_EVENT_H -I bionic \
+                                   -DUSE_BIONIC_UAPI_HEADERS -I bionic/libc/kernel \
 
 simpleperf_host_darwin_cppflags := $(simpleperf_host_common_cppflags) \
                                    -I $(LOCAL_PATH)/darwin_support \
@@ -40,10 +41,12 @@
   dso.cpp \
   event_attr.cpp \
   event_type.cpp \
+  perf_regs.cpp \
   read_elf.cpp \
   record.cpp \
   record_file_reader.cpp \
   sample_tree.cpp \
+  thread_tree.cpp \
   utils.cpp \
 
 libsimpleperf_src_files := \
diff --git a/simpleperf/build_id.h b/simpleperf/build_id.h
index 5a4b12c..a244e7f 100644
--- a/simpleperf/build_id.h
+++ b/simpleperf/build_id.h
@@ -17,10 +17,44 @@
 #ifndef SIMPLE_PERF_BUILD_ID_H_
 #define SIMPLE_PERF_BUILD_ID_H_
 
-#include <array>
+#include <string.h>
+#include <algorithm>
+#include <base/stringprintf.h>
 
-static constexpr int BUILD_ID_SIZE = 20;
+constexpr size_t BUILD_ID_SIZE = 20;
 
-typedef std::array<unsigned char, BUILD_ID_SIZE> BuildId;
+// Fixed-size build id of BUILD_ID_SIZE bytes; shorter inputs are padded with zero bytes.
+class BuildId {
+ public:
+  // Number of bytes held by every BuildId.
+  static size_t Size() { return BUILD_ID_SIZE; }
+
+  // Creates an all-zero build id.
+  BuildId() { memset(data_, '\0', sizeof(data_)); }
+
+  // Copies at most BUILD_ID_SIZE bytes from data; any remaining bytes stay zero.
+  BuildId(const void* data, size_t len = BUILD_ID_SIZE) : BuildId() {
+    memcpy(data_, data, std::min(len, BUILD_ID_SIZE));
+  }
+
+  // Read-only access to the BUILD_ID_SIZE stored bytes.
+  const unsigned char* Data() const { return data_; }
+
+  // Formats the id as "0x" followed by two lowercase hex digits per byte.
+  std::string ToString() const {
+    std::string hex = "0x";
+    for (unsigned char byte : data_) {
+      hex += android::base::StringPrintf("%02x", byte);
+    }
+    return hex;
+  }
+
+  bool operator==(const BuildId& build_id) const {
+    return !memcmp(data_, build_id.data_, sizeof(data_));
+  }
+
+ private:
+  unsigned char data_[BUILD_ID_SIZE];
+};
 
 #endif  // SIMPLE_PERF_BUILD_ID_H_
diff --git a/simpleperf/cmd_dumprecord.cpp b/simpleperf/cmd_dumprecord.cpp
index 28175d9..8f73732 100644
--- a/simpleperf/cmd_dumprecord.cpp
+++ b/simpleperf/cmd_dumprecord.cpp
@@ -26,6 +26,7 @@
 
 #include "command.h"
 #include "event_attr.h"
+#include "perf_regs.h"
 #include "record.h"
 #include "record_file.h"
 #include "utils.h"
@@ -62,6 +63,12 @@
   if (record_file_reader_ == nullptr) {
     return false;
   }
+  std::string arch = record_file_reader_->ReadFeatureString(FEAT_ARCH);
+  if (!arch.empty()) {
+    if (!SetCurrentArch(arch)) {
+      return false;
+    }
+  }
   DumpFileHeader();
   DumpAttrSection();
   DumpDataSection();
@@ -165,7 +172,7 @@
 }
 
 void DumpRecordCommand::DumpDataSection() {
-  std::vector<std::unique_ptr<const Record>> records = record_file_reader_->DataSection();
+  std::vector<std::unique_ptr<Record>> records = record_file_reader_->DataSection();
   for (auto& record : records) {
     record->Dump();
   }
@@ -179,16 +186,16 @@
     printf("feature section for %s: offset %" PRId64 ", size %" PRId64 "\n",
            GetFeatureName(feature).c_str(), section.offset, section.size);
     if (feature == FEAT_BUILD_ID) {
-      const char* p = record_file_reader_->DataAtOffset(section.offset);
-      const char* end = p + section.size;
-      while (p < end) {
-        const perf_event_header* header = reinterpret_cast<const perf_event_header*>(p);
-        CHECK_LE(p + header->size, end);
-        BuildIdRecord record(header);
-        record.header.type = PERF_RECORD_BUILD_ID;  // Set type explicitly as perf doesn't set it.
-        record.Dump(1);
-        p += header->size;
+      std::vector<BuildIdRecord> records = record_file_reader_->ReadBuildIdFeature();
+      for (auto& r : records) {
+        r.Dump(1);
       }
+    } else if (feature == FEAT_OSRELEASE) {
+      std::string s = record_file_reader_->ReadFeatureString(feature);
+      PrintIndented(1, "osrelease: %s\n", s.c_str());
+    } else if (feature == FEAT_ARCH) {
+      std::string s = record_file_reader_->ReadFeatureString(feature);
+      PrintIndented(1, "arch: %s\n", s.c_str());
     } else if (feature == FEAT_CMDLINE) {
       std::vector<std::string> cmdline = record_file_reader_->ReadCmdlineFeature();
       PrintIndented(1, "cmdline: %s\n", android::base::Join(cmdline, ' ').c_str());
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index f91100a..e4f1d41 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -17,6 +17,8 @@
 #include <libgen.h>
 #include <poll.h>
 #include <signal.h>
+#include <sys/utsname.h>
+#include <unistd.h>
 #include <set>
 #include <string>
 #include <unordered_map>
@@ -32,6 +34,7 @@
 #include "read_elf.h"
 #include "record.h"
 #include "record_file.h"
+#include "thread_tree.h"
 #include "utils.h"
 #include "workload.h"
 
@@ -61,6 +64,9 @@
             "    -a           System-wide collection.\n"
             "    -b           Enable take branch stack sampling. Same as '-j any'\n"
             "    -c count     Set event sample period.\n"
+            "    --call-graph fp | dwarf[,<dump_stack_size>]\n"
+            "                 Enable call graph recording. Use frame pointer or dwarf as the\n"
+            "                 method to parse call graph in stack. Default is dwarf,8192.\n"
             "    -e event[:modifier]\n"
             "                 Select the event to sample. Use `simpleperf list` to find\n"
             "                 all possible event names. Modifiers can be added to define\n"
@@ -69,7 +75,7 @@
             "                   k - monitor kernel space events only\n"
             "    -f freq      Set event sample frequency.\n"
             "    -F freq      Same as '-f freq'.\n"
-            "    -g           Enables call-graph recording.\n"
+            "    -g           Same as '--call-graph dwarf'.\n"
             "    -j branch_filter1,branch_filter2,...\n"
             "                 Enable taken branch stack sampling. Each sample\n"
             "                 captures a series of consecutive taken branches.\n"
@@ -93,7 +99,9 @@
         sample_freq_(4000),
         system_wide_collection_(false),
         branch_sampling_(0),
-        callchain_sampling_(false),
+        fp_callchain_sampling_(false),
+        dwarf_callchain_sampling_(false),
+        dump_stack_size_in_dwarf_sampling_(8192),
         child_inherit_(true),
         perf_mmap_pages_(256),
         record_filename_("perf.data") {
@@ -112,9 +120,10 @@
   bool SetEventSelection();
   bool WriteData(const char* data, size_t size);
   bool DumpKernelAndModuleMmaps();
-  bool DumpThreadCommAndMmaps();
+  bool DumpThreadCommAndMmaps(bool all_threads, const std::vector<pid_t>& selected_threads);
   bool DumpAdditionalFeatures(const std::vector<std::string>& args);
   bool DumpBuildIdFeature();
+  bool GetHitFiles(std::set<std::string>* kernel_modules, std::set<std::string>* user_files);
 
   bool use_sample_freq_;    // Use sample_freq_ when true, otherwise using sample_period_.
   uint64_t sample_freq_;    // Sample 'sample_freq_' times per second.
@@ -122,7 +131,9 @@
 
   bool system_wide_collection_;
   uint64_t branch_sampling_;
-  bool callchain_sampling_;
+  bool fp_callchain_sampling_;
+  bool dwarf_callchain_sampling_;
+  uint32_t dump_stack_size_in_dwarf_sampling_;
   bool child_inherit_;
   std::vector<pid_t> monitored_threads_;
   std::unique_ptr<EventTypeAndModifier> measured_event_type_modifier_;
@@ -198,7 +209,7 @@
   if (!DumpKernelAndModuleMmaps()) {
     return false;
   }
-  if (system_wide_collection_ && !DumpThreadCommAndMmaps()) {
+  if (!DumpThreadCommAndMmaps(system_wide_collection_, monitored_threads_)) {
     return false;
   }
 
@@ -253,6 +264,34 @@
         return false;
       }
       use_sample_freq_ = false;
+    } else if (args[i] == "--call-graph") {
+      if (!NextArgumentOrError(args, &i)) {
+        return false;
+      }
+      std::vector<std::string> strs = android::base::Split(args[i], ",");
+      if (strs[0] == "fp") {
+        fp_callchain_sampling_ = true;
+        dwarf_callchain_sampling_ = false;
+      } else if (strs[0] == "dwarf") {
+        fp_callchain_sampling_ = false;
+        dwarf_callchain_sampling_ = true;
+        if (strs.size() > 1) {
+          char* endptr;
+          uint64_t size = strtoull(strs[1].c_str(), &endptr, 0);
+          if (*endptr != '\0' || size > UINT_MAX) {
+            LOG(ERROR) << "invalid dump stack size in --call-graph option: " << strs[1];
+            return false;
+          }
+          if ((size & 7) != 0) {
+            LOG(ERROR) << "dump stack size " << size << " is not 8-byte aligned.";
+            return false;
+          }
+          dump_stack_size_in_dwarf_sampling_ = static_cast<uint32_t>(size);
+        }
+      } else {
+        LOG(ERROR) << "unexpected argument for --call-graph option: " << args[i];
+        return false;
+      }
     } else if (args[i] == "-e") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
@@ -272,7 +311,8 @@
       }
       use_sample_freq_ = true;
     } else if (args[i] == "-g") {
-      callchain_sampling_ = true;
+      fp_callchain_sampling_ = false;
+      dwarf_callchain_sampling_ = true;
     } else if (args[i] == "-j") {
       if (!NextArgumentOrError(args, &i)) {
         return false;
@@ -351,8 +391,12 @@
   if (!event_selection_set_.SetBranchSampling(branch_sampling_)) {
     return false;
   }
-  if (callchain_sampling_) {
-    event_selection_set_.EnableCallChainSampling();
+  if (fp_callchain_sampling_) {
+    event_selection_set_.EnableFpCallChainSampling();
+  } else if (dwarf_callchain_sampling_) {
+    if (!event_selection_set_.EnableDwarfCallChainSampling(dump_stack_size_in_dwarf_sampling_)) {
+      return false;
+    }
   }
   event_selection_set_.SetInherit(child_inherit_);
   return true;
@@ -389,11 +433,24 @@
   return true;
 }
 
-bool RecordCommand::DumpThreadCommAndMmaps() {
+bool RecordCommand::DumpThreadCommAndMmaps(bool all_threads,
+                                           const std::vector<pid_t>& selected_threads) {
   std::vector<ThreadComm> thread_comms;
   if (!GetThreadComms(&thread_comms)) {
     return false;
   }
+  // Decide which processes and threads to dump.
+  std::set<pid_t> dump_processes;
+  std::set<pid_t> dump_threads;
+  for (auto& tid : selected_threads) {
+    dump_threads.insert(tid);
+  }
+  for (auto& thread : thread_comms) {
+    if (dump_threads.find(thread.tid) != dump_threads.end()) {
+      dump_processes.insert(thread.pid);
+    }
+  }
+
   const perf_event_attr& attr =
       event_selection_set_.FindEventAttrByType(measured_event_type_modifier_->event_type);
 
@@ -402,6 +459,9 @@
     if (thread.pid != thread.tid) {
       continue;
     }
+    if (!all_threads && dump_processes.find(thread.pid) == dump_processes.end()) {
+      continue;
+    }
     CommRecord record = CreateCommRecord(attr, thread.pid, thread.tid, thread.comm);
     if (!record_file_writer_->WriteData(record.BinaryFormat())) {
       return false;
@@ -429,6 +489,9 @@
     if (thread.pid == thread.tid) {
       continue;
     }
+    if (!all_threads && dump_threads.find(thread.tid) == dump_threads.end()) {
+      continue;
+    }
     ForkRecord fork_record = CreateForkRecord(attr, thread.pid, thread.tid, thread.pid, thread.pid);
     if (!record_file_writer_->WriteData(fork_record.BinaryFormat())) {
       return false;
@@ -442,13 +505,25 @@
 }
 
 bool RecordCommand::DumpAdditionalFeatures(const std::vector<std::string>& args) {
-  size_t feature_count = (branch_sampling_ != 0 ? 3 : 2);
+  size_t feature_count = (branch_sampling_ != 0 ? 5 : 4);
   if (!record_file_writer_->WriteFeatureHeader(feature_count)) {
     return false;
   }
   if (!DumpBuildIdFeature()) {
     return false;
   }
+  utsname uname_buf;
+  if (TEMP_FAILURE_RETRY(uname(&uname_buf)) != 0) {
+    PLOG(ERROR) << "uname() failed";
+    return false;
+  }
+  if (!record_file_writer_->WriteFeatureString(PerfFileFormat::FEAT_OSRELEASE, uname_buf.release)) {
+    return false;
+  }
+  if (!record_file_writer_->WriteFeatureString(PerfFileFormat::FEAT_ARCH, uname_buf.machine)) {
+    return false;
+  }
+
   std::string exec_path = "simpleperf";
   GetExecPath(&exec_path);
   std::vector<std::string> cmdline;
@@ -465,15 +540,15 @@
 }
 
 bool RecordCommand::DumpBuildIdFeature() {
-  std::vector<std::string> hit_kernel_modules;
-  std::vector<std::string> hit_user_files;
-  if (!record_file_writer_->GetHitModules(&hit_kernel_modules, &hit_user_files)) {
+  std::set<std::string> kernel_modules;
+  std::set<std::string> user_files;
+  if (!GetHitFiles(&kernel_modules, &user_files)) {
     return false;
   }
   std::vector<BuildIdRecord> build_id_records;
   BuildId build_id;
   // Add build_ids for kernel/modules.
-  for (auto& filename : hit_kernel_modules) {
+  for (auto& filename : kernel_modules) {
     if (filename == DEFAULT_KERNEL_MMAP_NAME) {
       if (!GetKernelBuildId(&build_id)) {
         LOG(DEBUG) << "can't read build_id for kernel";
@@ -482,7 +557,8 @@
       build_id_records.push_back(
           CreateBuildIdRecord(true, UINT_MAX, build_id, DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID));
     } else {
-      std::string module_name = basename(&filename[0]);
+      std::string path = filename;
+      std::string module_name = basename(&path[0]);
       if (android::base::EndsWith(module_name, ".ko")) {
         module_name = module_name.substr(0, module_name.size() - 3);
       }
@@ -494,7 +570,7 @@
     }
   }
   // Add build_ids for user elf files.
-  for (auto& filename : hit_user_files) {
+  for (auto& filename : user_files) {
     if (filename == DEFAULT_EXECNAME_FOR_THREAD_MMAP) {
       continue;
     }
@@ -510,6 +586,30 @@
   return true;
 }
 
+// Collects the paths of the kernel modules and user files hit by the recorded samples.
+// Returns false if the data section of the record file can't be read back.
+bool RecordCommand::GetHitFiles(std::set<std::string>* kernel_modules,
+                                std::set<std::string>* user_files) {
+  std::vector<std::unique_ptr<Record>> records;
+  if (!record_file_writer_->ReadDataSection(&records)) {
+    return false;
+  }
+  ThreadTree thread_tree;
+  BuildThreadTree(records, &thread_tree);
+  for (auto& record : records) {
+    if (record->header.type != PERF_RECORD_SAMPLE) {
+      continue;
+    }
+    // Bind by reference: copying every SampleRecord just to read a few fields is wasteful.
+    const SampleRecord& r = *static_cast<const SampleRecord*>(record.get());
+    bool in_kernel = ((r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL);
+    const ThreadEntry* thread = thread_tree.FindThreadOrNew(r.tid_data.pid, r.tid_data.tid);
+    const MapEntry* map = thread_tree.FindMap(thread, r.ip_data.ip, in_kernel);
+    (in_kernel ? kernel_modules : user_files)->insert(map->dso->path);
+  }
+  return true;
+}
+
 __attribute__((constructor)) static void RegisterRecordCommand() {
   RegisterCommand("record", [] { return std::unique_ptr<Command>(new RecordCommand()); });
 }
diff --git a/simpleperf/cmd_record_test.cpp b/simpleperf/cmd_record_test.cpp
index a4e2be6..b9c3b9f 100644
--- a/simpleperf/cmd_record_test.cpp
+++ b/simpleperf/cmd_record_test.cpp
@@ -20,6 +20,7 @@
 
 #include "command.h"
 #include "environment.h"
+#include "event_selection_set.h"
 #include "record.h"
 #include "record_file.h"
 #include "test_util.h"
@@ -59,7 +60,7 @@
   ASSERT_TRUE(RecordCmd()->Run({"sleep", "1"}));
   std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance("perf.data");
   ASSERT_TRUE(reader != nullptr);
-  std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
+  std::vector<std::unique_ptr<Record>> records = reader->DataSection();
   ASSERT_GT(records.size(), 0U);
   bool have_kernel_mmap = false;
   for (auto& record : records) {
@@ -88,8 +89,6 @@
   ASSERT_TRUE(RecordCmd()->Run({"-a", "-e", "sched:sched_switch", "sleep", "1"}));
 }
 
-extern bool IsBranchSamplingSupported();
-
 TEST(record_cmd, branch_sampling) {
   if (IsBranchSamplingSupported()) {
     ASSERT_TRUE(RecordCmd()->Run({"-a", "-b", "sleep", "1"}));
@@ -107,8 +106,19 @@
   ASSERT_TRUE(RecordCmd()->Run({"-e", "cpu-cycles:u", "sleep", "1"}));
 }
 
-TEST(record_cmd, callchain_sampling) {
-  ASSERT_TRUE(RecordCmd()->Run({"-g", "sleep", "1"}));
+TEST(record_cmd, fp_callchain_sampling) {
+  ASSERT_TRUE(RecordCmd()->Run({"--call-graph", "fp", "sleep", "1"}));
+}
+
+TEST(record_cmd, dwarf_callchain_sampling) {
+  if (IsDwarfCallChainSamplingSupported()) {
+    ASSERT_TRUE(RecordCmd()->Run({"--call-graph", "dwarf", "sleep", "1"}));
+    ASSERT_TRUE(RecordCmd()->Run({"--call-graph", "dwarf,16384", "sleep", "1"}));
+    ASSERT_TRUE(RecordCmd()->Run({"-g", "sleep", "1"}));
+  } else {
+    GTEST_LOG_(INFO)
+        << "This test does nothing as dwarf callchain sampling is not supported on this device.";
+  }
 }
 
 TEST(record_cmd, existing_processes) {
diff --git a/simpleperf/cmd_report.cpp b/simpleperf/cmd_report.cpp
index 0e8f8e6..c92aa65 100644
--- a/simpleperf/cmd_report.cpp
+++ b/simpleperf/cmd_report.cpp
@@ -30,9 +30,11 @@
 #include "environment.h"
 #include "event_attr.h"
 #include "event_type.h"
+#include "perf_regs.h"
 #include "record.h"
 #include "record_file.h"
 #include "sample_tree.h"
+#include "thread_tree.h"
 
 class Displayable {
  public:
@@ -247,14 +249,17 @@
             "                  include pid, tid, comm, dso, symbol, dso_from, dso_to, symbol_from\n"
             "                  symbol_to. dso_from, dso_to, symbol_from, symbol_to can only be\n"
             "                  used with -b option. Default keys are \"comm,pid,tid,dso,symbol\"\n"
-            "    --symfs <dir>  Look for files with symbols relative to this directory.\n"),
+            "    --symfs <dir> Look for files with symbols relative to this directory.\n"
+            "    --vmlinux <file>\n"
+            "                  Parse kernel symbols from <file>.\n"),
         record_filename_("perf.data"),
         use_branch_address_(false),
         accumulate_callchain_(false),
         print_callgraph_(false) {
     compare_sample_func_t compare_sample_callback = std::bind(
         &ReportCommand::CompareSampleEntry, this, std::placeholders::_1, std::placeholders::_2);
-    sample_tree_ = std::unique_ptr<SampleTree>(new SampleTree(compare_sample_callback));
+    sample_tree_ =
+        std::unique_ptr<SampleTree>(new SampleTree(&thread_tree_, compare_sample_callback));
   }
 
   bool Run(const std::vector<std::string>& args);
@@ -264,7 +269,7 @@
   bool ReadEventAttrFromRecordFile();
   void ReadSampleTreeFromRecordFile();
   void ProcessSampleRecord(const SampleRecord& r);
-  void ReadFeaturesFromRecordFile();
+  bool ReadFeaturesFromRecordFile();
   int CompareSampleEntry(const SampleEntry& sample1, const SampleEntry& sample2);
   void PrintReport();
   void PrintReportContext();
@@ -279,6 +284,7 @@
   perf_event_attr event_attr_;
   std::vector<std::unique_ptr<Displayable>> displayable_items_;
   std::vector<Comparable*> comparable_items_;
+  ThreadTree thread_tree_;
   std::unique_ptr<SampleTree> sample_tree_;
   bool use_branch_address_;
   std::string record_cmdline_;
@@ -300,8 +306,11 @@
   if (!ReadEventAttrFromRecordFile()) {
     return false;
   }
+  // Read features first to prepare build ids used when building SampleTree.
+  if (!ReadFeaturesFromRecordFile()) {
+    return false;
+  }
   ReadSampleTreeFromRecordFile();
-  ReadFeaturesFromRecordFile();
 
   // 3. Show collected information.
   PrintReport();
@@ -310,6 +319,9 @@
 }
 
 bool ReportCommand::ParseOptions(const std::vector<std::string>& args) {
+  bool demangle = true;
+  std::string symfs_dir;
+  std::string vmlinux;
   bool print_sample_count = false;
   std::vector<std::string> sort_keys = {"comm", "pid", "tid", "dso", "symbol"};
   for (size_t i = 0; i < args.size(); ++i) {
@@ -330,7 +342,7 @@
       print_sample_count = true;
 
     } else if (args[i] == "--no-demangle") {
-      DsoFactory::SetDemangle(false);
+      demangle = false;
 
     } else if (args[i] == "--sort") {
       if (!NextArgumentOrError(args, &i)) {
@@ -341,15 +353,26 @@
       if (!NextArgumentOrError(args, &i)) {
         return false;
       }
-      if (!DsoFactory::SetSymFsDir(args[i])) {
+      symfs_dir = args[i];
+    } else if (args[i] == "--vmlinux") {
+      if (!NextArgumentOrError(args, &i)) {
         return false;
       }
+      vmlinux = args[i];
     } else {
       ReportUnknownOption(args, i);
       return false;
     }
   }
 
+  DsoFactory::GetInstance()->SetDemangle(demangle);
+  if (!DsoFactory::GetInstance()->SetSymFsDir(symfs_dir)) {
+    return false;
+  }
+  if (!vmlinux.empty()) {
+    DsoFactory::GetInstance()->SetVmlinux(vmlinux);
+  }
+
   if (!accumulate_callchain_) {
     displayable_items_.push_back(
         std::unique_ptr<Displayable>(new SelfOverheadItem(*sample_tree_, "Overhead")));
@@ -425,38 +448,12 @@
 }
 
 void ReportCommand::ReadSampleTreeFromRecordFile() {
-  sample_tree_->AddThread(0, 0, "swapper");
-
-  std::vector<std::unique_ptr<const Record>> records = record_file_reader_->DataSection();
+  std::vector<std::unique_ptr<Record>> records = record_file_reader_->DataSection();
+  thread_tree_.AddThread(0, 0, "swapper");
+  BuildThreadTree(records, &thread_tree_);
   for (auto& record : records) {
-    if (record->header.type == PERF_RECORD_MMAP) {
-      const MmapRecord& r = *static_cast<const MmapRecord*>(record.get());
-      if ((r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL) {
-        sample_tree_->AddKernelMap(r.data.addr, r.data.len, r.data.pgoff,
-                                   r.sample_id.time_data.time, r.filename);
-      } else {
-        sample_tree_->AddThreadMap(r.data.pid, r.data.tid, r.data.addr, r.data.len, r.data.pgoff,
-                                   r.sample_id.time_data.time, r.filename);
-      }
-    } else if (record->header.type == PERF_RECORD_MMAP2) {
-      const Mmap2Record& r = *static_cast<const Mmap2Record*>(record.get());
-      if ((r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL) {
-        sample_tree_->AddKernelMap(r.data.addr, r.data.len, r.data.pgoff,
-                                   r.sample_id.time_data.time, r.filename);
-      } else {
-        std::string filename =
-            (r.filename == DEFAULT_EXECNAME_FOR_THREAD_MMAP) ? "[unknown]" : r.filename;
-        sample_tree_->AddThreadMap(r.data.pid, r.data.tid, r.data.addr, r.data.len, r.data.pgoff,
-                                   r.sample_id.time_data.time, filename);
-      }
-    } else if (record->header.type == PERF_RECORD_SAMPLE) {
+    if (record->header.type == PERF_RECORD_SAMPLE) {
       ProcessSampleRecord(*static_cast<const SampleRecord*>(record.get()));
-    } else if (record->header.type == PERF_RECORD_COMM) {
-      const CommRecord& r = *static_cast<const CommRecord*>(record.get());
-      sample_tree_->AddThread(r.data.pid, r.data.tid, r.comm);
-    } else if (record->header.type == PERF_RECORD_FORK) {
-      const ForkRecord& r = *static_cast<const ForkRecord*>(record.get());
-      sample_tree_->ForkThread(r.data.pid, r.data.tid, r.data.ppid, r.data.ptid);
     }
   }
 }
@@ -522,11 +519,26 @@
   }
 }
 
-void ReportCommand::ReadFeaturesFromRecordFile() {
+bool ReportCommand::ReadFeaturesFromRecordFile() {
+  std::vector<BuildIdRecord> records = record_file_reader_->ReadBuildIdFeature();
+  std::vector<std::pair<std::string, BuildId>> build_ids;
+  for (auto& r : records) {
+    build_ids.push_back(std::make_pair(r.filename, r.build_id));
+  }
+  DsoFactory::GetInstance()->SetBuildIds(build_ids);
+
+  std::string arch = record_file_reader_->ReadFeatureString(PerfFileFormat::FEAT_ARCH);
+  if (!arch.empty()) {
+    if (!SetCurrentArch(arch)) {
+      return false;
+    }
+  }
+
   std::vector<std::string> cmdline = record_file_reader_->ReadCmdlineFeature();
   if (!cmdline.empty()) {
     record_cmdline_ = android::base::Join(cmdline, ' ');
   }
+  return true;
 }
 
 int ReportCommand::CompareSampleEntry(const SampleEntry& sample1, const SampleEntry& sample2) {
diff --git a/simpleperf/cmd_report_test.cpp b/simpleperf/cmd_report_test.cpp
index a0dc596..0a376e9 100644
--- a/simpleperf/cmd_report_test.cpp
+++ b/simpleperf/cmd_report_test.cpp
@@ -31,7 +31,7 @@
   static void SetUpTestCase() {
     ASSERT_TRUE(RecordCmd()->Run({"-a", "sleep", "1"}));
     ASSERT_TRUE(RecordCmd()->Run({"-a", "-o", "perf2.data", "sleep", "1"}));
-    ASSERT_TRUE(RecordCmd()->Run({"-g", "-o", "perf_g.data", "sleep", "1"}));
+    ASSERT_TRUE(RecordCmd()->Run({"--call-graph", "fp", "-o", "perf_g.data", "sleep", "1"}));
   }
 };
 
diff --git a/simpleperf/dso.cpp b/simpleperf/dso.cpp
index 562727b..2b63641 100644
--- a/simpleperf/dso.cpp
+++ b/simpleperf/dso.cpp
@@ -27,7 +27,15 @@
   return symbol1->addr < symbol2->addr;
 }
 
+// Symbols are not read here; FindSymbol() loads them on first use.
+DsoEntry::DsoEntry(DsoType type, const std::string& path)
+    : type(type), path(path), is_loaded(false) {}
+
 const SymbolEntry* DsoEntry::FindSymbol(uint64_t offset_in_dso) {
+  if (!is_loaded) {
+    DsoFactory::GetInstance()->LoadDso(this);
+    is_loaded = true;
+  }
   std::unique_ptr<SymbolEntry> symbol(new SymbolEntry{
       "",             // name
       offset_in_dso,  // addr
@@ -44,30 +52,70 @@
   return nullptr;
 }
 
-bool DsoFactory::demangle = true;
-
-void DsoFactory::SetDemangle(bool demangle) {
-  DsoFactory::demangle = demangle;
+DsoFactory* DsoFactory::GetInstance() {
+  static DsoFactory dso_factory;
+  return &dso_factory;
 }
 
-std::string DsoFactory::symfs_dir;
+DsoFactory::DsoFactory() : demangle_(true) {
+}
+
+void DsoFactory::SetDemangle(bool demangle) {
+  demangle_ = demangle;
+}
 
 bool DsoFactory::SetSymFsDir(const std::string& symfs_dir) {
   std::string dirname = symfs_dir;
-  if (!dirname.empty() && dirname.back() != '/') {
-    dirname.push_back('/');
+  if (!dirname.empty()) {
+    if (dirname.back() != '/') {
+      dirname.push_back('/');
+    }
+    std::vector<std::string> files;
+    std::vector<std::string> subdirs;
+    GetEntriesInDir(symfs_dir, &files, &subdirs);
+    if (files.empty() && subdirs.empty()) {
+      LOG(ERROR) << "Invalid symfs_dir '" << symfs_dir << "'";
+      return false;
+    }
   }
-  std::vector<std::string> files;
-  std::vector<std::string> subdirs;
-  GetEntriesInDir(symfs_dir, &files, &subdirs);
-  if (files.empty() && subdirs.empty()) {
-    LOG(ERROR) << "Invalid symfs_dir '" << symfs_dir << "'";
-    return false;
-  }
-  DsoFactory::symfs_dir = dirname;
+  symfs_dir_ = dirname;
   return true;
 }
 
+void DsoFactory::SetVmlinux(const std::string& vmlinux) {
+  vmlinux_ = vmlinux;
+}
+
+// Rebuilds the filename -> build id table; for a repeated filename the first entry is kept.
+void DsoFactory::SetBuildIds(const std::vector<std::pair<std::string, BuildId>>& build_ids) {
+  build_id_map_.clear();
+  for (const auto& entry : build_ids) {
+    LOG(DEBUG) << "build_id_map: " << entry.first << ", " << entry.second.ToString();
+    build_id_map_.insert(entry);
+  }
+}
+
+// Creates a DsoEntry whose symbols are not loaded yet.
+// A DSO_KERNEL entry is always named "[kernel.kallsyms]", whatever dso_path says;
+// every other type keeps dso_path as its path.
+std::unique_ptr<DsoEntry> DsoFactory::CreateDso(DsoType dso_type, const std::string& dso_path) {
+  const std::string path = (dso_type == DSO_KERNEL) ? "[kernel.kallsyms]" : dso_path;
+  return std::unique_ptr<DsoEntry>(new DsoEntry(dso_type, path));
+}
+
+// Dispatches to the loader matching dso->type; an unrecognized type yields false.
+bool DsoFactory::LoadDso(DsoEntry* dso) {
+  const DsoType type = dso->type;
+  if (type == DSO_KERNEL) {
+    return LoadKernel(dso);
+  } else if (type == DSO_KERNEL_MODULE) {
+    return LoadKernelModule(dso);
+  } else if (type == DSO_ELF_FILE) {
+    return LoadElfFile(dso);
+  }
+  return false;
+}
+
 static bool IsKernelFunctionSymbol(const KernelSymbol& symbol) {
   return (symbol.type == 'T' || symbol.type == 't' || symbol.type == 'W' || symbol.type == 'w');
 }
@@ -84,6 +132,17 @@
   return false;
 }
 
+// Callback for symbols parsed from the vmlinux file: stores each function symbol in
+// dso->symbols and ignores everything else.
+static void VmlinuxSymbolCallback(const ElfFileSymbol& elf_symbol, DsoEntry* dso) {
+  if (!elf_symbol.is_func) {
+    return;
+  }
+  std::unique_ptr<SymbolEntry> symbol(
+      new SymbolEntry{elf_symbol.name, elf_symbol.vaddr, elf_symbol.len});  // name, addr, len
+  dso->symbols.insert(std::move(symbol));
+}
+
 static void FixupSymbolLength(DsoEntry* dso) {
   SymbolEntry* prev_symbol = nullptr;
   for (auto& symbol : dso->symbols) {
@@ -97,14 +156,23 @@
   }
 }
 
-// TODO: Fix the way to get kernel symbols. See b/22179177.
-std::unique_ptr<DsoEntry> DsoFactory::LoadKernel() {
-  std::unique_ptr<DsoEntry> dso(new DsoEntry);
-  dso->path = "[kernel.kallsyms]";
-
-  ProcessKernelSymbols("/proc/kallsyms",
-                       std::bind(&KernelSymbolCallback, std::placeholders::_1, dso.get()));
-  FixupSymbolLength(dso.get());
+bool DsoFactory::LoadKernel(DsoEntry* dso) {  // Reads kernel symbols from vmlinux_ or kallsyms.
+  BuildId build_id = GetExpectedBuildId(DEFAULT_KERNEL_FILENAME_FOR_BUILD_ID);
+  if (!vmlinux_.empty()) {
+    ParseSymbolsFromElfFile(vmlinux_, build_id,
+                            std::bind(VmlinuxSymbolCallback, std::placeholders::_1, dso));
+  } else {
+    BuildId real_build_id;
+    GetKernelBuildId(&real_build_id);
+    bool match = (build_id == real_build_id);
+    LOG(DEBUG) << "check kernel build id (" << (match ? "match" : "mismatch") << "): expected "
+               << build_id.ToString() << ", real " << real_build_id.ToString();
+    if (match) {  // Only trust /proc/kallsyms when the build ids agree.
+      ProcessKernelSymbols("/proc/kallsyms",
+                           std::bind(&KernelSymbolCallback, std::placeholders::_1, dso));
+    }
+  }
+  FixupSymbolLength(dso);
-  return dso;
+  return true;  // Always succeeds from the caller's point of view.
 }
 
@@ -125,12 +193,12 @@
   return (elf_symbol.is_func && elf_symbol.is_in_text_section);
 }
 
-std::unique_ptr<DsoEntry> DsoFactory::LoadKernelModule(const std::string& dso_path) {
-  std::unique_ptr<DsoEntry> dso(new DsoEntry);
-  dso->path = dso_path;
-  ParseSymbolsFromElfFile(symfs_dir + dso_path, std::bind(ParseSymbolCallback, std::placeholders::_1,
-                                                          dso.get(), SymbolFilterForKernelModule));
-  FixupSymbolLength(dso.get());
+bool DsoFactory::LoadKernelModule(DsoEntry* dso) {  // Parses symbols from the module's ELF file.
+  BuildId build_id = GetExpectedBuildId(dso->path);
+  ParseSymbolsFromElfFile(
+      symfs_dir_ + dso->path, build_id,
+      std::bind(ParseSymbolCallback, std::placeholders::_1, dso, SymbolFilterForKernelModule));
+  FixupSymbolLength(dso);
-  return dso;
+  return true;  // Always succeeds from the caller's point of view.
 }
 
@@ -161,16 +229,24 @@
   }
 }
 
-std::unique_ptr<DsoEntry> DsoFactory::LoadDso(const std::string& dso_path) {
-  std::unique_ptr<DsoEntry> dso(new DsoEntry);
-  dso->path = dso_path;
-  ParseSymbolsFromElfFile(symfs_dir + dso_path, std::bind(ParseSymbolCallback, std::placeholders::_1,
-                                                          dso.get(), SymbolFilterForDso));
-  if (demangle) {
+bool DsoFactory::LoadElfFile(DsoEntry* dso) {  // Parses (and optionally demangles) ELF symbols.
+  BuildId build_id = GetExpectedBuildId(dso->path);
+  ParseSymbolsFromElfFile(
+      symfs_dir_ + dso->path, build_id,
+      std::bind(ParseSymbolCallback, std::placeholders::_1, dso, SymbolFilterForDso));
+  if (demangle_) {
     for (auto& symbol : dso->symbols) {
       DemangleInPlace(&symbol->name);
     }
   }
-  FixupSymbolLength(dso.get());
-  return dso;
+  FixupSymbolLength(dso);
+  return true;  // Always succeeds from the caller's point of view.
 }
+
+// Looks up the build id registered for filename through SetBuildIds().
+// When filename has no entry, a default-constructed (all-zero) BuildId is
+// returned instead.
+BuildId DsoFactory::GetExpectedBuildId(const std::string& filename) {
+  const auto found = build_id_map_.find(filename);
+  return (found == build_id_map_.end()) ? BuildId() : found->second;
+}
diff --git a/simpleperf/dso.h b/simpleperf/dso.h
index 2d79c92..8b83000 100644
--- a/simpleperf/dso.h
+++ b/simpleperf/dso.h
@@ -20,6 +20,10 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "build_id.h"
 
 struct SymbolEntry {
   std::string name;
@@ -32,24 +36,45 @@
                   const std::unique_ptr<SymbolEntry>& symbol2);
 };
 
+enum DsoType {
+  DSO_KERNEL,
+  DSO_KERNEL_MODULE,
+  DSO_ELF_FILE,
+};
+
 struct DsoEntry {
+  DsoType type;
   std::string path;
   std::set<std::unique_ptr<SymbolEntry>, SymbolComparator> symbols;
 
+  DsoEntry(DsoType type, const std::string& path);
   const SymbolEntry* FindSymbol(uint64_t offset_in_dso);
+
+ private:
+  bool is_loaded;
 };
 
 class DsoFactory {
  public:
-  static void SetDemangle(bool demangle);
-  static bool SetSymFsDir(const std::string& symfs_dir);
-  static std::unique_ptr<DsoEntry> LoadKernel();
-  static std::unique_ptr<DsoEntry> LoadKernelModule(const std::string& dso_path);
-  static std::unique_ptr<DsoEntry> LoadDso(const std::string& dso_path);
+  static DsoFactory* GetInstance();
+  void SetDemangle(bool demangle);
+  bool SetSymFsDir(const std::string& symfs_dir);
+  void SetVmlinux(const std::string& vmlinux);
+  void SetBuildIds(const std::vector<std::pair<std::string, BuildId>>& build_ids);
+  std::unique_ptr<DsoEntry> CreateDso(DsoType dso_type, const std::string& dso_path = "");
+  bool LoadDso(DsoEntry* dso);
 
  private:
-  static bool demangle;
-  static std::string symfs_dir;
+  DsoFactory();
+  bool LoadKernel(DsoEntry* dso);
+  bool LoadKernelModule(DsoEntry* dso);
+  bool LoadElfFile(DsoEntry* dso);
+  BuildId GetExpectedBuildId(const std::string& filename);
+
+  bool demangle_;
+  std::string symfs_dir_;
+  std::string vmlinux_;
+  std::unordered_map<std::string, BuildId> build_id_map_;
 };
 
 #endif  // SIMPLE_PERF_DSO_H_
diff --git a/simpleperf/environment_fake.cpp b/simpleperf/environment_fake.cpp
index e8c9dd8..fdcf814 100644
--- a/simpleperf/environment_fake.cpp
+++ b/simpleperf/environment_fake.cpp
@@ -20,3 +20,7 @@
 bool ProcessKernelSymbols(const std::string&, std::function<bool(const KernelSymbol&)>) {
   return false;
 }
+
+bool GetKernelBuildId(BuildId*) {  // Fake implementation: never provides a build id.
+  return false;
+}
diff --git a/simpleperf/event_attr.cpp b/simpleperf/event_attr.cpp
index c7ee182..ddfb37f 100644
--- a/simpleperf/event_attr.cpp
+++ b/simpleperf/event_attr.cpp
@@ -47,6 +47,7 @@
 static std::string SampleTypeToString(uint64_t sample_type) {
   static std::vector<std::pair<int, std::string>> sample_type_names = {
       {PERF_SAMPLE_ADDR, "addr"},
+      {PERF_SAMPLE_BRANCH_STACK, "branch_stack"},
       {PERF_SAMPLE_CALLCHAIN, "callchain"},
       {PERF_SAMPLE_CPU, "cpu"},
       {PERF_SAMPLE_ID, "id"},
@@ -54,6 +55,8 @@
       {PERF_SAMPLE_PERIOD, "period"},
       {PERF_SAMPLE_RAW, "raw"},
       {PERF_SAMPLE_READ, "read"},
+      {PERF_SAMPLE_REGS_USER, "regs_user"},
+      {PERF_SAMPLE_STACK_USER, "stack_user"},
       {PERF_SAMPLE_STREAM_ID, "stream_id"},
       {PERF_SAMPLE_TID, "tid"},
       {PERF_SAMPLE_TIME, "time"},
@@ -133,4 +136,7 @@
 
   PrintIndented(indent + 1, "sample_id_all %u, exclude_host %u, exclude_guest %u\n",
                 attr.sample_id_all, attr.exclude_host, attr.exclude_guest);
+  PrintIndented(indent + 1, "branch_sample_type 0x%" PRIx64 "\n", attr.branch_sample_type);
+  PrintIndented(indent + 1, "sample_regs_user 0x%" PRIx64 "\n", attr.sample_regs_user);
+  PrintIndented(indent + 1, "sample_stack_user 0x%" PRIx64 "\n", attr.sample_stack_user);
 }
diff --git a/simpleperf/event_selection_set.cpp b/simpleperf/event_selection_set.cpp
index a9a0f96..9d09b06 100644
--- a/simpleperf/event_selection_set.cpp
+++ b/simpleperf/event_selection_set.cpp
@@ -22,6 +22,7 @@
 #include "environment.h"
 #include "event_attr.h"
 #include "event_type.h"
+#include "perf_regs.h"
 
 bool IsBranchSamplingSupported() {
   const EventType* type = FindEventTypeByName("cpu-cycles");
@@ -34,6 +35,19 @@
   return IsEventAttrSupportedByKernel(attr);
 }
 
+bool IsDwarfCallChainSamplingSupported() {  // Probes the kernel with a "cpu-cycles" event.
+  const EventType* type = FindEventTypeByName("cpu-cycles");
+  if (type == nullptr) {
+    return false;
+  }
+  perf_event_attr attr = CreateDefaultPerfEventAttr(*type);
+  attr.sample_type |= PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER;
+  attr.exclude_callchain_user = 1;
+  attr.sample_regs_user = GetSupportedRegMask();
+  attr.sample_stack_user = 8192;  // Stack dump size used only for this support probe.
+  return IsEventAttrSupportedByKernel(attr);  // Result of the kernel support check on attr.
+}
+
 bool EventSelectionSet::AddEventType(const EventTypeAndModifier& event_type_modifier) {
   EventSelection selection;
   selection.event_type = event_type_modifier.event_type;
@@ -112,12 +126,27 @@
   return true;
 }
 
-void EventSelectionSet::EnableCallChainSampling() {
+void EventSelectionSet::EnableFpCallChainSampling() {
   for (auto& selection : selections_) {
     selection.event_attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
   }
 }
 
+bool EventSelectionSet::EnableDwarfCallChainSampling(uint32_t dump_stack_size) {
+  if (!IsDwarfCallChainSamplingSupported()) {  // Leaves every selection untouched on failure.
+    LOG(ERROR) << "dwarf callchain sampling is not supported on this device.";
+    return false;
+  }
+  for (auto& selection : selections_) {  // Apply the same settings to every selected event.
+    selection.event_attr.sample_type |=
+        PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER;
+    selection.event_attr.exclude_callchain_user = 1;
+    selection.event_attr.sample_regs_user = GetSupportedRegMask();
+    selection.event_attr.sample_stack_user = dump_stack_size;  // User stack size to dump.
+  }
+  return true;
+}
+
 void EventSelectionSet::SetInherit(bool enable) {
   for (auto& selection : selections_) {
     selection.event_attr.inherit = (enable ? 1 : 0);
diff --git a/simpleperf/event_selection_set.h b/simpleperf/event_selection_set.h
index e52ec5f..229b424 100644
--- a/simpleperf/event_selection_set.h
+++ b/simpleperf/event_selection_set.h
@@ -54,7 +54,8 @@
   void SetSampleFreq(uint64_t sample_freq);
   void SetSamplePeriod(uint64_t sample_period);
   bool SetBranchSampling(uint64_t branch_sample_type);
-  void EnableCallChainSampling();
+  void EnableFpCallChainSampling();
+  bool EnableDwarfCallChainSampling(uint32_t dump_stack_size);
   void SetInherit(bool enable);
 
   bool OpenEventFilesForAllCpus();
@@ -85,4 +86,7 @@
   DISALLOW_COPY_AND_ASSIGN(EventSelectionSet);
 };
 
+bool IsBranchSamplingSupported();
+bool IsDwarfCallChainSamplingSupported();
+
 #endif  // SIMPLE_PERF_EVENT_SELECTION_SET_H_
diff --git a/simpleperf/perf_event.h b/simpleperf/perf_event.h
index 1688dc9..7e7e48d 100644
--- a/simpleperf/perf_event.h
+++ b/simpleperf/perf_event.h
@@ -17,14 +17,10 @@
 #ifndef SIMPLE_PERF_PERF_EVENT_H_
 #define SIMPLE_PERF_PERF_EVENT_H_
 
-#if defined(USE_BIONIC_PERF_EVENT_H)
-
-#include <libc/kernel/uapi/linux/perf_event.h>
-
+#if defined(USE_BIONIC_UAPI_HEADERS)
+#include <uapi/linux/perf_event.h>
 #else
-
 #include <linux/perf_event.h>
-
 #endif
 
 #endif  // SIMPLE_PERF_PERF_EVENT_H_
diff --git a/simpleperf/perf_regs.cpp b/simpleperf/perf_regs.cpp
new file mode 100644
index 0000000..6a63876
--- /dev/null
+++ b/simpleperf/perf_regs.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "perf_regs.h"
+
+#include <unordered_map>
+#include <base/logging.h>
+#include <base/stringprintf.h>
+#include <base/strings.h>
+
+constexpr ArchType GetBuildArch() {  // Arch this code is compiled for, from compiler macros.
+#if defined(__i386__)
+  return ARCH_X86_32;
+#elif defined(__x86_64__)
+  return ARCH_X86_64;
+#elif defined(__aarch64__)
+  return ARCH_ARM64;
+#elif defined(__arm__)
+  return ARCH_ARM;
+#else
+  return ARCH_UNSUPPORTED;  // None of the recognized target macros is defined.
+#endif
+}
+
+static ArchType current_arch = GetBuildArch();
+
+ArchType GetCurrentArch() {  // The build arch unless SetCurrentArch() has replaced it.
+  return current_arch;
+}
+
+// Selects the arch reported by GetCurrentArch() from its name. An unrecognized name is logged
+// and rejected with false, leaving the current arch unchanged.
+bool SetCurrentArch(const std::string& arch) {
+  const ArchType parsed = (arch == "x86")       ? ARCH_X86_32
+                          : (arch == "x86_64")  ? ARCH_X86_64
+                          : (arch == "aarch64") ? ARCH_ARM64
+                          : android::base::StartsWith(arch, "arm") ? ARCH_ARM
+                                                                   : ARCH_UNSUPPORTED;
+  if (parsed == ARCH_UNSUPPORTED) {
+    LOG(ERROR) << "unsupported arch: " << arch;
+    return false;
+  }
+  current_arch = parsed;
+  return true;
+}
+
+// Returns the perf register bit mask for the current arch, or 0 when the arch is unsupported.
+uint64_t GetSupportedRegMask() {
+  switch (GetCurrentArch()) {
+    case ARCH_X86_32:
+      return (1ULL << PERF_REG_X86_32_MAX) - 1;
+    case ARCH_X86_64:  // Every bit below the max index except the DS/ES/FS/GS bits.
+      return ((1ULL << PERF_REG_X86_64_MAX) - 1) & ~((1ULL << PERF_REG_X86_DS) |
+          (1ULL << PERF_REG_X86_ES) | (1ULL << PERF_REG_X86_FS) | (1ULL << PERF_REG_X86_GS));
+    case ARCH_ARM:
+      return (1ULL << PERF_REG_ARM_MAX) - 1;
+    case ARCH_ARM64:
+      return (1ULL << PERF_REG_ARM64_MAX) - 1;
+    default:
+      return 0;
+  }
+}
+
+static std::unordered_map<size_t, std::string> x86_reg_map = {
+    {PERF_REG_X86_AX, "ax"},       {PERF_REG_X86_BX, "bx"}, {PERF_REG_X86_CX, "cx"},
+    {PERF_REG_X86_DX, "dx"},       {PERF_REG_X86_SI, "si"}, {PERF_REG_X86_DI, "di"},
+    {PERF_REG_X86_BP, "bp"},       {PERF_REG_X86_SP, "sp"}, {PERF_REG_X86_IP, "ip"},
+    {PERF_REG_X86_FLAGS, "flags"}, {PERF_REG_X86_CS, "cs"}, {PERF_REG_X86_SS, "ss"},
+    {PERF_REG_X86_DS, "ds"},       {PERF_REG_X86_ES, "es"}, {PERF_REG_X86_FS, "fs"},
+    {PERF_REG_X86_GS, "gs"},
+};
+
+static std::unordered_map<size_t, std::string> arm_reg_map = {
+    {PERF_REG_ARM_FP, "fp"}, {PERF_REG_ARM_IP, "ip"}, {PERF_REG_ARM_SP, "sp"},
+    {PERF_REG_ARM_LR, "lr"}, {PERF_REG_ARM_PC, "pc"},
+};
+
+static std::unordered_map<size_t, std::string> arm64_reg_map = {
+    {PERF_REG_ARM64_LR, "lr"}, {PERF_REG_ARM64_SP, "sp"}, {PERF_REG_ARM64_PC, "pc"},
+};
+
+// Returns the name of perf register |reg| on the current arch, or "unknown" when the arch is
+// unsupported. CHECK-fails if |reg| is not a known register of the current arch.
+std::string GetRegName(size_t reg) {
+  auto lookup = [reg](const std::unordered_map<size_t, std::string>& names) -> std::string {
+    auto it = names.find(reg);
+    CHECK(it != names.end()) << "unknown reg " << reg;
+    return it->second;
+  };
+  switch (GetCurrentArch()) {
+    case ARCH_X86_64:
+      if (reg >= PERF_REG_X86_R8 && reg <= PERF_REG_X86_R15) {
+        return android::base::StringPrintf("r%zu", reg - PERF_REG_X86_R8 + 8);
+      }
+      // Fall through - the remaining x86_64 registers are named as on x86_32.
+    case ARCH_X86_32:
+      return lookup(x86_reg_map);
+    case ARCH_ARM:
+      if (reg >= PERF_REG_ARM_R0 && reg <= PERF_REG_ARM_R10) {
+        return android::base::StringPrintf("r%zu", reg - PERF_REG_ARM_R0);
+      }
+      return lookup(arm_reg_map);
+    case ARCH_ARM64:
+      if (reg >= PERF_REG_ARM64_X0 && reg <= PERF_REG_ARM64_X29) {
+        return android::base::StringPrintf("r%zu", reg - PERF_REG_ARM64_X0);
+      }
+      return lookup(arm64_reg_map);
+    case ARCH_UNSUPPORTED:
+      break;
+  }
+  // Also reached for an out-of-range ArchType, so no path falls off the end without a return.
+  return "unknown";
+}
diff --git a/simpleperf/perf_regs.h b/simpleperf/perf_regs.h
new file mode 100644
index 0000000..97e230f
--- /dev/null
+++ b/simpleperf/perf_regs.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_PERF_REGS_H_
+#define SIMPLE_PERF_PERF_REGS_H_
+
+#if defined(USE_BIONIC_UAPI_HEADERS)
+#include <uapi/asm-x86/asm/perf_regs.h>
+#include <uapi/asm-arm/asm/perf_regs.h>
+#define perf_event_arm_regs perf_event_arm64_regs
+#include <uapi/asm-arm64/asm/perf_regs.h>
+#else
+#include <asm-x86/asm/perf_regs.h>
+#include <asm-arm/asm/perf_regs.h>
+#define perf_event_arm_regs perf_event_arm64_regs
+#include <asm-arm64/asm/perf_regs.h>
+#endif
+
+#include <stdint.h>
+#include <string>
+
+enum ArchType {
+  ARCH_X86_32,
+  ARCH_X86_64,
+  ARCH_ARM,
+  ARCH_ARM64,
+  ARCH_UNSUPPORTED,
+};
+
+ArchType GetCurrentArch();
+bool SetCurrentArch(const std::string& arch);
+
+uint64_t GetSupportedRegMask();
+
+std::string GetRegName(size_t reg);
+
+#endif  // SIMPLE_PERF_PERF_REGS_H_
diff --git a/simpleperf/read_elf.cpp b/simpleperf/read_elf.cpp
index 4d41165..41120fe 100644
--- a/simpleperf/read_elf.cpp
+++ b/simpleperf/read_elf.cpp
@@ -52,8 +52,7 @@
     descsz = ALIGN(descsz, 4);
     CHECK_LE(p + namesz + descsz, end);
     if ((type == NT_GNU_BUILD_ID) && (strcmp(p, ELF_NOTE_GNU) == 0)) {
-      std::fill(build_id->begin(), build_id->end(), 0);
-      memcpy(build_id->data(), p + namesz, std::min(build_id->size(), descsz));
+      *build_id = BuildId(p + namesz, descsz);
       return true;
     }
     p += namesz + descsz;
@@ -93,27 +92,35 @@
   return false;
 }
 
+static bool GetBuildIdFromObjectFile(llvm::object::ObjectFile* obj, BuildId* build_id) {
+  bool result = false;  // Holds the outcome of GetBuildIdFromELFFile() for |obj|.
+  if (auto elf = llvm::dyn_cast<llvm::object::ELF32LEObjectFile>(obj)) {
+    result = GetBuildIdFromELFFile(elf->getELFFile(), build_id);
+  } else if (auto elf = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj)) {
+    result = GetBuildIdFromELFFile(elf->getELFFile(), build_id);
+  } else {
+    LOG(ERROR) << "unknown elf format in file " << obj->getFileName().data();
+    return false;  // Only 32-bit and 64-bit little-endian ELF objects are handled.
+  }
+  if (!result) {
+    LOG(DEBUG) << "no build id present in file " << obj->getFileName().data();
+  }
+  return result;
+}
+
 bool GetBuildIdFromElfFile(const std::string& filename, BuildId* build_id) {
   auto owning_binary = llvm::object::createBinary(llvm::StringRef(filename));
   if (owning_binary.getError()) {
     PLOG(DEBUG) << "can't open file " << filename;
     return false;
   }
-  bool result = false;
   llvm::object::Binary* binary = owning_binary.get().getBinary();
-  if (auto obj = llvm::dyn_cast<llvm::object::ObjectFile>(binary)) {
-    if (auto elf = llvm::dyn_cast<llvm::object::ELF32LEObjectFile>(obj)) {
-      result = GetBuildIdFromELFFile(elf->getELFFile(), build_id);
-    } else if (auto elf = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj)) {
-      result = GetBuildIdFromELFFile(elf->getELFFile(), build_id);
-    } else {
-      PLOG(DEBUG) << "unknown elf format in file " << filename;
-    }
+  auto obj = llvm::dyn_cast<llvm::object::ObjectFile>(binary);
+  if (obj == nullptr) {
+    LOG(DEBUG) << filename << " is not an object file";
+    return false;
   }
-  if (!result) {
-    PLOG(DEBUG) << "can't read build_id from file " << filename;
-  }
-  return result;
+  return GetBuildIdFromObjectFile(obj, build_id);
 }
 
 bool IsArmMappingSymbol(const char* name) {
@@ -124,7 +131,7 @@
 }
 
 template <class ELFT>
-bool ParseSymbolsFromELFFile(const llvm::object::ELFFile<ELFT>* elf,
+void ParseSymbolsFromELFFile(const llvm::object::ELFFile<ELFT>* elf,
                              std::function<void(const ElfFileSymbol&)> callback) {
   bool is_arm = (elf->getHeader()->e_machine == llvm::ELF::EM_ARM ||
                  elf->getHeader()->e_machine == llvm::ELF::EM_AARCH64);
@@ -160,7 +167,7 @@
     if (symbol.name.empty()) {
       continue;
     }
-
+    symbol.vaddr = elf_symbol.st_value;
     symbol.start_in_file = elf_symbol.st_value - shdr->sh_addr + shdr->sh_offset;
     if ((symbol.start_in_file & 1) != 0 && is_arm) {
       // Arm sets bit 0 to mark it as thumb code, remove the flag.
@@ -187,29 +194,38 @@
 
     callback(symbol);
   }
-  return true;
 }
 
-bool ParseSymbolsFromElfFile(const std::string& filename,
+bool ParseSymbolsFromElfFile(const std::string& filename, const BuildId& expected_build_id,
                              std::function<void(const ElfFileSymbol&)> callback) {
   auto owning_binary = llvm::object::createBinary(llvm::StringRef(filename));
   if (owning_binary.getError()) {
     PLOG(DEBUG) << "can't open file '" << filename << "'";
     return false;
   }
-  bool result = false;
   llvm::object::Binary* binary = owning_binary.get().getBinary();
-  if (auto obj = llvm::dyn_cast<llvm::object::ObjectFile>(binary)) {
-    if (auto elf = llvm::dyn_cast<llvm::object::ELF32LEObjectFile>(obj)) {
-      result = ParseSymbolsFromELFFile(elf->getELFFile(), callback);
-    } else if (auto elf = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj)) {
-      result = ParseSymbolsFromELFFile(elf->getELFFile(), callback);
-    } else {
-      PLOG(DEBUG) << "unknown elf format in file" << filename;
-    }
+  auto obj = llvm::dyn_cast<llvm::object::ObjectFile>(binary);
+  if (obj == nullptr) {
+    LOG(DEBUG) << filename << " is not an object file";
+    return false;
   }
+  BuildId real_build_id;
+  GetBuildIdFromObjectFile(obj, &real_build_id);
+  bool result = (expected_build_id == real_build_id);
+  LOG(DEBUG) << "check build id for \"" << filename << "\" (" << (result ? "match" : "mismatch")
+             << "): expected " << expected_build_id.ToString() << ", real "
+             << real_build_id.ToString();
   if (!result) {
-    PLOG(DEBUG) << "can't parse symbols from file " << filename;
+    return result;
   }
-  return result;
+
+  if (auto elf = llvm::dyn_cast<llvm::object::ELF32LEObjectFile>(obj)) {
+    ParseSymbolsFromELFFile(elf->getELFFile(), callback);
+  } else if (auto elf = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj)) {
+    ParseSymbolsFromELFFile(elf->getELFFile(), callback);
+  } else {
+    LOG(ERROR) << "unknown elf format in file" << filename;
+    return false;
+  }
+  return true;
 }
diff --git a/simpleperf/read_elf.h b/simpleperf/read_elf.h
index 96eb2f3..439d6bc 100644
--- a/simpleperf/read_elf.h
+++ b/simpleperf/read_elf.h
@@ -28,6 +28,7 @@
 static const std::string linker_prefix = "__dl_";
 
 struct ElfFileSymbol {
+  uint64_t vaddr;
   uint64_t start_in_file;
   uint64_t len;
   bool is_func;
@@ -36,7 +37,7 @@
   std::string name;
 };
 
-bool ParseSymbolsFromElfFile(const std::string& filename,
+bool ParseSymbolsFromElfFile(const std::string& filename, const BuildId& expected_build_id,
                              std::function<void(const ElfFileSymbol&)> callback);
 
 // Expose the following functions for unit tests.
diff --git a/simpleperf/read_elf_test.cpp b/simpleperf/read_elf_test.cpp
index c0ff660..924af97 100644
--- a/simpleperf/read_elf_test.cpp
+++ b/simpleperf/read_elf_test.cpp
@@ -31,9 +31,11 @@
   ASSERT_LT(static_cast<size_t>(elf_file_len), sizeof(elf_file));
   elf_file[elf_file_len] = '\0';
 
+  BuildId build_id;
+  GetBuildIdFromElfFile(elf_file, &build_id);
   bool result = false;
-  ASSERT_TRUE(
-      ParseSymbolsFromElfFile(elf_file, std::bind(ParseSymbol, std::placeholders::_1, &result)));
+  ASSERT_TRUE(ParseSymbolsFromElfFile(elf_file, build_id,
+                                      std::bind(ParseSymbol, std::placeholders::_1, &result)));
   ASSERT_TRUE(result);
 }
 
diff --git a/simpleperf/record.cpp b/simpleperf/record.cpp
index f6b2560..90e0977 100644
--- a/simpleperf/record.cpp
+++ b/simpleperf/record.cpp
@@ -17,12 +17,14 @@
 #include "record.h"
 
 #include <inttypes.h>
+#include <algorithm>
 #include <unordered_map>
 
 #include <base/logging.h>
 #include <base/stringprintf.h>
 
 #include "environment.h"
+#include "perf_regs.h"
 #include "utils.h"
 
 static std::string RecordTypeToString(int record_type) {
@@ -307,6 +309,33 @@
     branch_stack_data.stack.resize(nr);
     MoveFromBinaryFormat(branch_stack_data.stack.data(), nr, p);
   }
+  if (sample_type & PERF_SAMPLE_REGS_USER) {
+    MoveFromBinaryFormat(regs_user_data.abi, p);
+    if (regs_user_data.abi == 0) {
+      regs_user_data.reg_mask = 0;
+    } else {
+      regs_user_data.reg_mask = attr.sample_regs_user;
+      size_t bit_nr = 0;
+      for (size_t i = 0; i < 64; ++i) {
+        if ((regs_user_data.reg_mask >> i) & 1) {
+          bit_nr++;
+        }
+      }
+      regs_user_data.regs.resize(bit_nr);
+      MoveFromBinaryFormat(regs_user_data.regs.data(), bit_nr, p);
+    }
+  }
+  if (sample_type & PERF_SAMPLE_STACK_USER) {
+    uint64_t size;
+    MoveFromBinaryFormat(size, p);
+    if (size == 0) {
+      stack_user_data.dyn_size = 0;
+    } else {
+      stack_user_data.data.resize(size);
+      MoveFromBinaryFormat(stack_user_data.data.data(), size, p);
+      MoveFromBinaryFormat(stack_user_data.dyn_size, p);
+    }
+  }
   // TODO: Add parsing of other PERF_SAMPLE_*.
   CHECK_LE(p, end);
   if (p < end) {
@@ -353,14 +382,37 @@
                     item.from, item.to, item.flags);
     }
   }
+  if (sample_type & PERF_SAMPLE_REGS_USER) {
+    PrintIndented(indent, "user regs: abi=%" PRId64 "\n", regs_user_data.abi);
+    for (size_t i = 0, pos = 0; i < 64; ++i) {
+      if ((regs_user_data.reg_mask >> i) & 1) {
+        PrintIndented(indent + 1, "reg (%s) 0x%016" PRIx64 "\n", GetRegName(i).c_str(),
+                      regs_user_data.regs[pos++]);
+      }
+    }
+  }
+  if (sample_type & PERF_SAMPLE_STACK_USER) {
+    PrintIndented(indent, "user stack: size %zu dyn_size %" PRIu64 "\n",
+                  stack_user_data.data.size(), stack_user_data.dyn_size);
+    const uint64_t* p = reinterpret_cast<const uint64_t*>(stack_user_data.data.data());
+    const uint64_t* end = p + (stack_user_data.data.size() / sizeof(uint64_t));
+    while (p < end) {
+      PrintIndented(indent + 1, "");
+      for (size_t i = 0; i < 4 && p < end; ++i, ++p) {
+        printf(" %016" PRIx64, *p);
+      }
+      printf("\n");
+    }
+    printf("\n");
+  }
 }
 
 BuildIdRecord::BuildIdRecord(const perf_event_header* pheader) : Record(pheader) {
   const char* p = reinterpret_cast<const char*>(pheader + 1);
   const char* end = reinterpret_cast<const char*>(pheader) + pheader->size;
   MoveFromBinaryFormat(pid, p);
-  std::copy_n(p, build_id.size(), build_id.begin());
-  p += ALIGN(build_id.size(), 8);
+  build_id = BuildId(p);
+  p += ALIGN(build_id.Size(), 8);
   filename = p;
   p += ALIGN(filename.size() + 1, 64);
   CHECK_EQ(p, end);
@@ -368,11 +420,7 @@
 
 void BuildIdRecord::DumpData(size_t indent) const {
   PrintIndented(indent, "pid %u\n", pid);
-  PrintIndented(indent, "build_id 0x");
-  for (auto& c : build_id) {
-    printf("%02x", c);
-  }
-  printf("\n");
+  PrintIndented(indent, "build_id %s\n", build_id.ToString().c_str());
   PrintIndented(indent, "filename %s\n", filename.c_str());
 }
 
@@ -381,33 +429,72 @@
   char* p = buf.data();
   MoveToBinaryFormat(header, p);
   MoveToBinaryFormat(pid, p);
-  memcpy(p, build_id.data(), build_id.size());
-  p += ALIGN(build_id.size(), 8);
+  memcpy(p, build_id.Data(), build_id.Size());
+  p += ALIGN(build_id.Size(), 8);
   strcpy(p, filename.c_str());
   p += ALIGN(filename.size() + 1, 64);
   return buf;
 }
 
-std::unique_ptr<const Record> ReadRecordFromBuffer(const perf_event_attr& attr,
-                                                   const perf_event_header* pheader) {
+static std::unique_ptr<Record> ReadRecordFromBuffer(const perf_event_attr& attr,
+                                                    const perf_event_header* pheader) {
   switch (pheader->type) {
     case PERF_RECORD_MMAP:
-      return std::unique_ptr<const Record>(new MmapRecord(attr, pheader));
+      return std::unique_ptr<Record>(new MmapRecord(attr, pheader));
     case PERF_RECORD_MMAP2:
-      return std::unique_ptr<const Record>(new Mmap2Record(attr, pheader));
+      return std::unique_ptr<Record>(new Mmap2Record(attr, pheader));
     case PERF_RECORD_COMM:
-      return std::unique_ptr<const Record>(new CommRecord(attr, pheader));
+      return std::unique_ptr<Record>(new CommRecord(attr, pheader));
     case PERF_RECORD_EXIT:
-      return std::unique_ptr<const Record>(new ExitRecord(attr, pheader));
+      return std::unique_ptr<Record>(new ExitRecord(attr, pheader));
     case PERF_RECORD_FORK:
-      return std::unique_ptr<const Record>(new ForkRecord(attr, pheader));
+      return std::unique_ptr<Record>(new ForkRecord(attr, pheader));
     case PERF_RECORD_SAMPLE:
-      return std::unique_ptr<const Record>(new SampleRecord(attr, pheader));
+      return std::unique_ptr<Record>(new SampleRecord(attr, pheader));
     default:
-      return std::unique_ptr<const Record>(new Record(pheader));
+      return std::unique_ptr<Record>(new Record(pheader));
   }
 }
 
+// Sort predicate: records with smaller timestamps go first. For equal timestamps a non-sample
+// record goes before a sample record, because it may hold information needed to parse samples.
+static bool IsRecordHappensBefore(const std::unique_ptr<Record>& r1,
+                                  const std::unique_ptr<Record>& r2) {
+  const bool r1_is_sample = (r1->header.type == PERF_RECORD_SAMPLE);
+  const bool r2_is_sample = (r2->header.type == PERF_RECORD_SAMPLE);
+  // A sample record keeps its timestamp in time_data, every other record in sample_id.
+  auto time_of = [](const Record* record, bool is_sample) -> uint64_t {
+    return is_sample ? static_cast<const SampleRecord*>(record)->time_data.time
+                     : record->sample_id.time_data.time;
+  };
+  const uint64_t r1_time = time_of(r1.get(), r1_is_sample);
+  const uint64_t r2_time = time_of(r2.get(), r2_is_sample);
+  if (r1_time != r2_time) {
+    return r1_time < r2_time;
+  }
+  // Same timestamp: only a non-sample record ahead of a sample record counts as "before";
+  // any other pairing is left unordered.
+  return !r1_is_sample && r2_is_sample;
+}
+
+// Parses the records in [buf, buf + buf_size), time-sorted if attr puts a time on all of them.
+std::vector<std::unique_ptr<Record>> ReadRecordsFromBuffer(const perf_event_attr& attr,
+                                                           const char* buf, size_t buf_size) {
+  std::vector<std::unique_ptr<Record>> result;
+  for (size_t left = buf_size; left >= sizeof(perf_event_header);) {
+    auto header = reinterpret_cast<const perf_event_header*>(buf + (buf_size - left));
+    if (header->size < sizeof(perf_event_header) || header->size > left) {
+      break;  // Corrupt or truncated record: stop rather than loop forever or overrun buf.
+    }
+    result.push_back(ReadRecordFromBuffer(attr, header));
+    left -= header->size;
+  }
+  if ((attr.sample_type & PERF_SAMPLE_TIME) && attr.sample_id_all) {
+    std::sort(result.begin(), result.end(), IsRecordHappensBefore);
+  }
+  return result;
+}
+
 MmapRecord CreateMmapRecord(const perf_event_attr& attr, bool in_kernel, uint32_t pid, uint32_t tid,
                             uint64_t addr, uint64_t len, uint64_t pgoff,
                             const std::string& filename) {
@@ -464,6 +551,6 @@
   record.build_id = build_id;
   record.filename = filename;
   record.header.size = sizeof(record.header) + sizeof(record.pid) +
-                       ALIGN(record.build_id.size(), 8) + ALIGN(filename.size() + 1, 64);
+                       ALIGN(record.build_id.Size(), 8) + ALIGN(filename.size() + 1, 64);
   return record;
 }
diff --git a/simpleperf/record.h b/simpleperf/record.h
index 6f41234..c63a38e 100644
--- a/simpleperf/record.h
+++ b/simpleperf/record.h
@@ -81,6 +81,17 @@
   std::vector<BranchStackItemType> stack;
 };
 
+struct PerfSampleRegsUserType {
+  uint64_t abi;                // Read from the sample; 0 means no register values follow.
+  uint64_t reg_mask;           // 0 when abi is 0, otherwise the attr's sample_regs_user.
+  std::vector<uint64_t> regs;  // One value per set bit of reg_mask, lowest bit first.
+};
+
+struct PerfSampleStackUserType {
+  std::vector<char> data;  // Dumped stack bytes; left empty when the sample's size field is 0.
+  uint64_t dyn_size;       // Read after data; set to 0 when the sample's size field is 0.
+};
+
 // SampleId is optional at the end of a record in binary format. Its content is determined by
 // sample_id_all and sample_type in perf_event_attr. To avoid the complexity of referring to
 // perf_event_attr each time, we copy sample_id_all and sample_type inside the SampleId structure.
@@ -232,6 +243,8 @@
 
   PerfSampleCallChainType callchain_data;       // Valid if PERF_SAMPLE_CALLCHAIN.
   PerfSampleBranchStackType branch_stack_data;  // Valid if PERF_SAMPLE_BRANCH_STACK.
+  PerfSampleRegsUserType regs_user_data;        // Valid if PERF_SAMPLE_REGS_USER.
+  PerfSampleStackUserType stack_user_data;      // Valid if PERF_SAMPLE_STACK_USER.
 
   SampleRecord(const perf_event_attr& attr, const perf_event_header* pheader);
 
@@ -255,8 +268,8 @@
   void DumpData(size_t indent) const override;
 };
 
-std::unique_ptr<const Record> ReadRecordFromBuffer(const perf_event_attr& attr,
-                                                   const perf_event_header* pheader);
+std::vector<std::unique_ptr<Record>> ReadRecordsFromBuffer(const perf_event_attr& attr,
+                                                           const char* buf, size_t buf_size);
 MmapRecord CreateMmapRecord(const perf_event_attr& attr, bool in_kernel, uint32_t pid, uint32_t tid,
                             uint64_t addr, uint64_t len, uint64_t pgoff,
                             const std::string& filename);
@@ -266,4 +279,5 @@
                             uint32_t ptid);
 BuildIdRecord CreateBuildIdRecord(bool in_kernel, pid_t pid, const BuildId& build_id,
                                   const std::string& filename);
+
 #endif  // SIMPLE_PERF_RECORD_H_
diff --git a/simpleperf/record_file.h b/simpleperf/record_file.h
index d8b4413..d94e083 100644
--- a/simpleperf/record_file.h
+++ b/simpleperf/record_file.h
@@ -46,13 +46,12 @@
     return WriteData(data.data(), data.size());
   }
 
-  // Use MmapRecords and SampleRecords in record file to conclude which modules/files were executing
-  // at sample times.
-  bool GetHitModules(std::vector<std::string>* hit_kernel_modules,
-                     std::vector<std::string>* hit_user_files);
+  // Read data section that has been written, for further processing.
+  bool ReadDataSection(std::vector<std::unique_ptr<Record>>* records);
 
   bool WriteFeatureHeader(size_t feature_count);
   bool WriteBuildIdFeature(const std::vector<BuildIdRecord>& build_id_records);
+  bool WriteFeatureString(int feature, const std::string& s);
   bool WriteCmdlineFeature(const std::vector<std::string>& cmdline);
   bool WriteBranchStackFeature();
 
@@ -100,17 +99,20 @@
   const PerfFileFormat::FileHeader* FileHeader();
   std::vector<const PerfFileFormat::FileAttr*> AttrSection();
   std::vector<uint64_t> IdsForAttr(const PerfFileFormat::FileAttr* attr);
-  std::vector<std::unique_ptr<const Record>> DataSection();
+  std::vector<std::unique_ptr<Record>> DataSection();
   const std::map<int, PerfFileFormat::SectionDesc>& FeatureSectionDescriptors();
   const char* DataAtOffset(uint64_t offset) {
     return mmap_addr_ + offset;
   }
   std::vector<std::string> ReadCmdlineFeature();
+  std::vector<BuildIdRecord> ReadBuildIdFeature();
+  std::string ReadFeatureString(int feature);
   bool Close();
 
  private:
   RecordFileReader(const std::string& filename, int fd);
   bool MmapFile();
+  bool GetFeatureSection(int feature, const char** pstart, const char** pend);
 
   const std::string filename_;
   int record_fd_;
diff --git a/simpleperf/record_file_reader.cpp b/simpleperf/record_file_reader.cpp
index 8407d32..8e61aa0 100644
--- a/simpleperf/record_file_reader.cpp
+++ b/simpleperf/record_file_reader.cpp
@@ -110,47 +110,12 @@
   return result;
 }
 
-static bool IsRecordHappensBefore(const std::unique_ptr<const Record>& r1,
-                                  const std::unique_ptr<const Record>& r2) {
-  bool is_r1_sample = (r1->header.type == PERF_RECORD_SAMPLE);
-  bool is_r2_sample = (r2->header.type == PERF_RECORD_SAMPLE);
-  uint64_t time1 = (is_r1_sample ? static_cast<const SampleRecord*>(r1.get())->time_data.time
-                                 : r1->sample_id.time_data.time);
-  uint64_t time2 = (is_r2_sample ? static_cast<const SampleRecord*>(r2.get())->time_data.time
-                                 : r2->sample_id.time_data.time);
-  // The record with smaller time happens first.
-  if (time1 != time2) {
-    return time1 < time2;
-  }
-  // If happening at the same time, make non-sample records before sample records,
-  // because non-sample records may contain useful information to parse sample records.
-  if (is_r1_sample != is_r2_sample) {
-    return is_r1_sample ? false : true;
-  }
-  // Otherwise, don't care of the order.
-  return false;
-}
-
-std::vector<std::unique_ptr<const Record>> RecordFileReader::DataSection() {
-  std::vector<std::unique_ptr<const Record>> result;
+std::vector<std::unique_ptr<Record>> RecordFileReader::DataSection() {
   const struct FileHeader* header = FileHeader();
   auto file_attrs = AttrSection();
   CHECK(file_attrs.size() > 0);
-  perf_event_attr attr = file_attrs[0]->attr;
-
-  const char* end = mmap_addr_ + header->data.offset + header->data.size;
-  const char* p = mmap_addr_ + header->data.offset;
-  while (p < end) {
-    const perf_event_header* header = reinterpret_cast<const perf_event_header*>(p);
-    if (p + header->size <= end) {
-      result.push_back(std::move(ReadRecordFromBuffer(attr, header)));
-    }
-    p += header->size;
-  }
-  if ((attr.sample_type & PERF_SAMPLE_TIME) && attr.sample_id_all) {
-    std::sort(result.begin(), result.end(), IsRecordHappensBefore);
-  }
-  return result;
+  return ReadRecordsFromBuffer(file_attrs[0]->attr, mmap_addr_ + header->data.offset,
+                               header->data.size);
 }
 
 const std::map<int, SectionDesc>& RecordFileReader::FeatureSectionDescriptors() {
@@ -174,15 +139,24 @@
   return feature_sections_;
 }
 
-std::vector<std::string> RecordFileReader::ReadCmdlineFeature() {
+bool RecordFileReader::GetFeatureSection(int feature, const char** pstart, const char** pend) {
   const std::map<int, SectionDesc>& section_map = FeatureSectionDescriptors();
-  auto it = section_map.find(FEAT_CMDLINE);
+  auto it = section_map.find(feature);
   if (it == section_map.end()) {
-    return std::vector<std::string>();
+    return false;
   }
   SectionDesc section = it->second;
-  const char* p = DataAtOffset(section.offset);
-  const char* end = DataAtOffset(section.offset + section.size);
+  *pstart = DataAtOffset(section.offset);
+  *pend = DataAtOffset(section.offset + section.size);
+  return true;
+}
+
+std::vector<std::string> RecordFileReader::ReadCmdlineFeature() {
+  const char* p;
+  const char* end;
+  if (!GetFeatureSection(FEAT_CMDLINE, &p, &end)) {
+    return std::vector<std::string>();
+  }
   std::vector<std::string> cmdline;
   uint32_t arg_count;
   MoveFromBinaryFormat(arg_count, p);
@@ -196,3 +170,34 @@
   }
   return cmdline;
 }
+
+std::vector<BuildIdRecord> RecordFileReader::ReadBuildIdFeature() {
+  const char* p;
+  const char* end;
+  if (!GetFeatureSection(FEAT_BUILD_ID, &p, &end)) {
+    return std::vector<BuildIdRecord>();  // No build id section in this file.
+  }
+  std::vector<BuildIdRecord> result;
+  while (p < end) {  // Every record must advance p; undersized or overrunning records abort.
+    const perf_event_header* header = reinterpret_cast<const perf_event_header*>(p);
+    CHECK(header->size >= sizeof(*header) && p + header->size <= end);
+    BuildIdRecord record(header);
+    // Set type explicitly as the perf.data produced by perf doesn't set it.
+    record.header.type = PERF_RECORD_BUILD_ID;
+    result.push_back(record);
+    p += header->size;  // header->size is the stride to the next record.
+  }
+  return result;  // One BuildIdRecord per record found, in section order.
+}
+
+std::string RecordFileReader::ReadFeatureString(int feature) {
+  const char* p;
+  const char* end;
+  if (!GetFeatureSection(feature, &p, &end)) {
+    return std::string();  // Feature section absent: report an empty string.
+  }
+  uint32_t len;
+  MoveFromBinaryFormat(len, p);  // Presumably reads the length prefix and advances p - confirm.
+  CHECK_LE(p + len, end);
+  return std::string(p, len).c_str();  // Bounded copy: cut at first NUL, never read past len.
+}
diff --git a/simpleperf/record_file_test.cpp b/simpleperf/record_file_test.cpp
index 6e6bc13..3cefb83 100644
--- a/simpleperf/record_file_test.cpp
+++ b/simpleperf/record_file_test.cpp
@@ -58,12 +58,19 @@
       CreateMmapRecord(event_attr, true, 1, 1, 0x1000, 0x2000, 0x3000, "mmap_record_example");
   ASSERT_TRUE(writer->WriteData(mmap_record.BinaryFormat()));
 
+  // Check data section that has been written.
+  std::vector<std::unique_ptr<Record>> records;
+  ASSERT_TRUE(writer->ReadDataSection(&records));
+  ASSERT_EQ(1u, records.size());
+  CheckRecordEqual(mmap_record, *records[0]);
+
   // Write feature section.
   ASSERT_TRUE(writer->WriteFeatureHeader(1));
-  BuildId build_id;
-  for (size_t i = 0; i < build_id.size(); ++i) {
-    build_id[i] = i;
+  char p[BuildId::Size()];
+  for (size_t i = 0; i < BuildId::Size(); ++i) {
+    p[i] = i;
   }
+  BuildId build_id(p);
   BuildIdRecord build_id_record = CreateBuildIdRecord(false, getpid(), build_id, "init");
   ASSERT_TRUE(writer->WriteBuildIdFeature({build_id_record}));
   ASSERT_TRUE(writer->Close());
@@ -80,7 +87,7 @@
   ASSERT_EQ(1u, ids.size());
 
   // Read and check data section.
-  std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
+  records = reader->DataSection();
   ASSERT_EQ(1u, records.size());
   CheckRecordEqual(mmap_record, *records[0]);
 
@@ -119,7 +126,7 @@
   // Read from a record file.
   std::unique_ptr<RecordFileReader> reader = RecordFileReader::CreateInstance(filename);
   ASSERT_TRUE(reader != nullptr);
-  std::vector<std::unique_ptr<const Record>> records = reader->DataSection();
+  std::vector<std::unique_ptr<Record>> records = reader->DataSection();
   ASSERT_EQ(3u, records.size());
   CheckRecordEqual(r2, *records[0]);
   CheckRecordEqual(r1, *records[1]);
diff --git a/simpleperf/record_file_writer.cpp b/simpleperf/record_file_writer.cpp
index deb0ada..79bb7cf 100644
--- a/simpleperf/record_file_writer.cpp
+++ b/simpleperf/record_file_writer.cpp
@@ -133,64 +133,7 @@
   return true;
 }
 
-void RecordFileWriter::GetHitModulesInBuffer(const char* p, const char* end,
-                                             std::vector<std::string>* hit_kernel_modules,
-                                             std::vector<std::string>* hit_user_files) {
-  std::vector<std::unique_ptr<const Record>> kernel_mmaps;
-  std::vector<std::unique_ptr<const Record>> user_mmaps;
-  std::set<std::string> hit_kernel_set;
-  std::set<std::string> hit_user_set;
-
-  while (p < end) {
-    auto header = reinterpret_cast<const perf_event_header*>(p);
-    CHECK_LE(p + header->size, end);
-    p += header->size;
-    std::unique_ptr<const Record> record = ReadRecordFromBuffer(event_attr_, header);
-    CHECK(record != nullptr);
-    if (record->header.type == PERF_RECORD_MMAP) {
-      if (record->header.misc & PERF_RECORD_MISC_KERNEL) {
-        kernel_mmaps.push_back(std::move(record));
-      } else {
-        user_mmaps.push_back(std::move(record));
-      }
-    } else if (record->header.type == PERF_RECORD_SAMPLE) {
-      auto& r = *static_cast<const SampleRecord*>(record.get());
-      if (!(r.sample_type & PERF_SAMPLE_IP) || !(r.sample_type & PERF_SAMPLE_TID)) {
-        continue;
-      }
-      uint32_t pid = r.tid_data.pid;
-      uint64_t ip = r.ip_data.ip;
-      if (r.header.misc & PERF_RECORD_MISC_KERNEL) {
-        // Loop from back to front, because new MmapRecords are inserted at the end of the mmaps,
-        // and we want to match the newest one.
-        for (auto it = kernel_mmaps.rbegin(); it != kernel_mmaps.rend(); ++it) {
-          auto& m_record = *reinterpret_cast<const MmapRecord*>(it->get());
-          if (ip >= m_record.data.addr && ip < m_record.data.addr + m_record.data.len) {
-            hit_kernel_set.insert(m_record.filename);
-            break;
-          }
-        }
-      } else {
-        for (auto it = user_mmaps.rbegin(); it != user_mmaps.rend(); ++it) {
-          auto& m_record = *reinterpret_cast<const MmapRecord*>(it->get());
-          if (pid == m_record.data.pid && ip >= m_record.data.addr &&
-              ip < m_record.data.addr + m_record.data.len) {
-            hit_user_set.insert(m_record.filename);
-            break;
-          }
-        }
-      }
-    }
-  }
-  hit_kernel_modules->clear();
-  hit_kernel_modules->insert(hit_kernel_modules->begin(), hit_kernel_set.begin(),
-                             hit_kernel_set.end());
-  hit_user_files->clear();
-  hit_user_files->insert(hit_user_files->begin(), hit_user_set.begin(), hit_user_set.end());
-}
-
-bool RecordFileWriter::GetHitModules(std::vector<std::string>* hit_kernel_modules,
-                                     std::vector<std::string>* hit_user_files) {
+bool RecordFileWriter::ReadDataSection(std::vector<std::unique_ptr<Record>>* records) {
   if (fflush(record_fp_) != 0) {
     PLOG(ERROR) << "fflush() failed";
     return false;
@@ -205,14 +148,14 @@
     PLOG(ERROR) << "mmap() failed";
     return false;
   }
-  const char* data_section_p = reinterpret_cast<const char*>(mmap_addr) + data_section_offset_;
-  const char* data_section_end = data_section_p + data_section_size_;
-  GetHitModulesInBuffer(data_section_p, data_section_end, hit_kernel_modules, hit_user_files);
-
+  const char* data_section = reinterpret_cast<char*>(mmap_addr) + data_section_offset_;
+  std::vector<std::unique_ptr<Record>> result =
+      ReadRecordsFromBuffer(event_attr_, data_section, data_section_size_);
   if (munmap(mmap_addr, mmap_len) == -1) {
     PLOG(ERROR) << "munmap() failed";
     return false;
   }
+  *records = std::move(result);
   return true;
 }
 
@@ -258,6 +201,23 @@
   return WriteFeatureEnd(FEAT_BUILD_ID, start_offset);
 }
 
+bool RecordFileWriter::WriteFeatureString(int feature, const std::string& s) {
+  uint64_t start_offset;
+  if (!WriteFeatureBegin(&start_offset)) {
+    return false;
+  }
+  uint32_t len = static_cast<uint32_t>(ALIGN(s.size() + 1, 64));  // +1 leaves room for a NUL.
+  if (!Write(&len, sizeof(len))) {  // Layout: uint32_t length first, then len bytes of data.
+    return false;
+  }
+  std::vector<char> v(len, '\0');  // Zero-filled, so every byte after s is '\0'.
+  std::copy(s.begin(), s.end(), v.begin());
+  if (!Write(v.data(), v.size())) {
+    return false;
+  }
+  return WriteFeatureEnd(feature, start_offset);  // False on any failed step, else this result.
+}
+
 bool RecordFileWriter::WriteCmdlineFeature(const std::vector<std::string>& cmdline) {
   uint64_t start_offset;
   if (!WriteFeatureBegin(&start_offset)) {
diff --git a/simpleperf/record_test.cpp b/simpleperf/record_test.cpp
index 96262a8..27edc52 100644
--- a/simpleperf/record_test.cpp
+++ b/simpleperf/record_test.cpp
@@ -38,10 +38,10 @@
 template <class RecordType>
 void RecordTest::CheckRecordMatchBinary(const RecordType& record) {
   std::vector<char> binary = record.BinaryFormat();
-  std::unique_ptr<const Record> record_p =
-      ReadRecordFromBuffer(event_attr, reinterpret_cast<const perf_event_header*>(binary.data()));
-  ASSERT_TRUE(record_p != nullptr);
-  CheckRecordEqual(record, *record_p);
+  std::vector<std::unique_ptr<Record>> records =
+      ReadRecordsFromBuffer(event_attr, binary.data(), binary.size());
+  ASSERT_EQ(1u, records.size());
+  CheckRecordEqual(record, *records[0]);
 }
 
 TEST_F(RecordTest, MmapRecordMatchBinary) {
diff --git a/simpleperf/sample_tree.cpp b/simpleperf/sample_tree.cpp
index 3f0e5b3..f8f27b1 100644
--- a/simpleperf/sample_tree.cpp
+++ b/simpleperf/sample_tree.cpp
@@ -20,151 +20,11 @@
 
 #include "environment.h"
 
-bool MapComparator::operator()(const MapEntry* map1, const MapEntry* map2) const {
-  if (map1->start_addr != map2->start_addr) {
-    return map1->start_addr < map2->start_addr;
-  }
-  if (map1->len != map2->len) {
-    return map1->len < map2->len;
-  }
-  if (map1->time != map2->time) {
-    return map1->time < map2->time;
-  }
-  return false;
-}
-
-void SampleTree::AddThread(int pid, int tid, const std::string& comm) {
-  auto it = thread_tree_.find(tid);
-  if (it == thread_tree_.end()) {
-    ThreadEntry* thread = new ThreadEntry{
-        pid, tid,
-        "unknown",                             // comm
-        std::set<MapEntry*, MapComparator>(),  // maps
-    };
-    auto pair = thread_tree_.insert(std::make_pair(tid, std::unique_ptr<ThreadEntry>(thread)));
-    CHECK(pair.second);
-    it = pair.first;
-  }
-  thread_comm_storage_.push_back(std::unique_ptr<std::string>(new std::string(comm)));
-  it->second->comm = thread_comm_storage_.back()->c_str();
-}
-
-void SampleTree::ForkThread(int pid, int tid, int ppid, int ptid) {
-  ThreadEntry* parent = FindThreadOrNew(ppid, ptid);
-  ThreadEntry* child = FindThreadOrNew(pid, tid);
-  child->comm = parent->comm;
-  child->maps = parent->maps;
-}
-
-static void RemoveOverlappedMap(std::set<MapEntry*, MapComparator>* map_set, const MapEntry* map) {
-  for (auto it = map_set->begin(); it != map_set->end();) {
-    if ((*it)->start_addr >= map->start_addr + map->len) {
-      break;
-    }
-    if ((*it)->start_addr + (*it)->len <= map->start_addr) {
-      ++it;
-    } else {
-      it = map_set->erase(it);
-    }
-  }
-}
-
-void SampleTree::AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
-                              const std::string& filename) {
-  // kernel map len can be 0 when record command is not run in supervisor mode.
-  if (len == 0) {
-    return;
-  }
-  DsoEntry* dso = FindKernelDsoOrNew(filename);
-  MapEntry* map = new MapEntry{
-      start_addr, len, pgoff, time, dso,
-  };
-  map_storage_.push_back(std::unique_ptr<MapEntry>(map));
-  RemoveOverlappedMap(&kernel_map_tree_, map);
-  auto pair = kernel_map_tree_.insert(map);
-  CHECK(pair.second);
-}
-
-DsoEntry* SampleTree::FindKernelDsoOrNew(const std::string& filename) {
-  if (filename == DEFAULT_KERNEL_MMAP_NAME) {
-    if (kernel_dso_ == nullptr) {
-      kernel_dso_ = DsoFactory::LoadKernel();
-    }
-    return kernel_dso_.get();
-  }
-  auto it = module_dso_tree_.find(filename);
-  if (it == module_dso_tree_.end()) {
-    module_dso_tree_[filename] = DsoFactory::LoadKernelModule(filename);
-    it = module_dso_tree_.find(filename);
-  }
-  return it->second.get();
-}
-
-void SampleTree::AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
-                              uint64_t time, const std::string& filename) {
-  ThreadEntry* thread = FindThreadOrNew(pid, tid);
-  DsoEntry* dso = FindUserDsoOrNew(filename);
-  MapEntry* map = new MapEntry{
-      start_addr, len, pgoff, time, dso,
-  };
-  map_storage_.push_back(std::unique_ptr<MapEntry>(map));
-  RemoveOverlappedMap(&thread->maps, map);
-  auto pair = thread->maps.insert(map);
-  CHECK(pair.second);
-}
-
-ThreadEntry* SampleTree::FindThreadOrNew(int pid, int tid) {
-  auto it = thread_tree_.find(tid);
-  if (it == thread_tree_.end()) {
-    AddThread(pid, tid, "unknown");
-    it = thread_tree_.find(tid);
-  } else {
-    CHECK_EQ(pid, it->second.get()->pid) << "tid = " << tid;
-  }
-  return it->second.get();
-}
-
-DsoEntry* SampleTree::FindUserDsoOrNew(const std::string& filename) {
-  auto it = user_dso_tree_.find(filename);
-  if (it == user_dso_tree_.end()) {
-    user_dso_tree_[filename] = DsoFactory::LoadDso(filename);
-    it = user_dso_tree_.find(filename);
-  }
-  return it->second.get();
-}
-
-static bool IsIpInMap(uint64_t ip, const MapEntry* map) {
-  return (map->start_addr <= ip && map->start_addr + map->len > ip);
-}
-
-const MapEntry* SampleTree::FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel) {
-  // Construct a map_entry which is strictly after the searched map_entry, based on MapComparator.
-  MapEntry find_map = {
-      ip,          // start_addr
-      ULLONG_MAX,  // len
-      0,           // pgoff
-      ULLONG_MAX,  // time
-      nullptr,     // dso
-  };
-  if (!in_kernel) {
-    auto it = thread->maps.upper_bound(&find_map);
-    if (it != thread->maps.begin() && IsIpInMap(ip, *--it)) {
-      return *it;
-    }
-  } else {
-    auto it = kernel_map_tree_.upper_bound(&find_map);
-    if (it != kernel_map_tree_.begin() && IsIpInMap(ip, *--it)) {
-      return *it;
-    }
-  }
-  return &unknown_map_;
-}
-
 SampleEntry* SampleTree::AddSample(int pid, int tid, uint64_t ip, uint64_t time, uint64_t period,
                                    bool in_kernel) {
-  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
-  const MapEntry* map = FindMap(thread, ip, in_kernel);
-  const SymbolEntry* symbol = FindSymbol(map, ip);
+  const ThreadEntry* thread = thread_tree_->FindThreadOrNew(pid, tid);
+  const MapEntry* map = thread_tree_->FindMap(thread, ip, in_kernel);
+  const SymbolEntry* symbol = thread_tree_->FindSymbol(map, ip);
 
   SampleEntry value(ip, time, period, 0, 1, thread, map, symbol);
 
@@ -173,17 +33,17 @@
 
 void SampleTree::AddBranchSample(int pid, int tid, uint64_t from_ip, uint64_t to_ip,
                                  uint64_t branch_flags, uint64_t time, uint64_t period) {
-  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
-  const MapEntry* from_map = FindMap(thread, from_ip, false);
-  if (from_map == &unknown_map_) {
-    from_map = FindMap(thread, from_ip, true);
+  const ThreadEntry* thread = thread_tree_->FindThreadOrNew(pid, tid);
+  const MapEntry* from_map = thread_tree_->FindMap(thread, from_ip, false);
+  if (from_map == thread_tree_->UnknownMap()) {
+    from_map = thread_tree_->FindMap(thread, from_ip, true);
   }
-  const SymbolEntry* from_symbol = FindSymbol(from_map, from_ip);
-  const MapEntry* to_map = FindMap(thread, to_ip, false);
-  if (to_map == &unknown_map_) {
-    to_map = FindMap(thread, to_ip, true);
+  const SymbolEntry* from_symbol = thread_tree_->FindSymbol(from_map, from_ip);
+  const MapEntry* to_map = thread_tree_->FindMap(thread, to_ip, false);
+  if (to_map == thread_tree_->UnknownMap()) {
+    to_map = thread_tree_->FindMap(thread, to_ip, true);
   }
-  const SymbolEntry* to_symbol = FindSymbol(to_map, to_ip);
+  const SymbolEntry* to_symbol = thread_tree_->FindSymbol(to_map, to_ip);
 
   SampleEntry value(to_ip, time, period, 0, 1, thread, to_map, to_symbol);
   value.branch_from.ip = from_ip;
@@ -197,9 +57,9 @@
 SampleEntry* SampleTree::AddCallChainSample(int pid, int tid, uint64_t ip, uint64_t time,
                                             uint64_t period, bool in_kernel,
                                             const std::vector<SampleEntry*>& callchain) {
-  const ThreadEntry* thread = FindThreadOrNew(pid, tid);
-  const MapEntry* map = FindMap(thread, ip, in_kernel);
-  const SymbolEntry* symbol = FindSymbol(map, ip);
+  const ThreadEntry* thread = thread_tree_->FindThreadOrNew(pid, tid);
+  const MapEntry* map = thread_tree_->FindMap(thread, ip, in_kernel);
+  const SymbolEntry* symbol = thread_tree_->FindSymbol(map, ip);
 
   SampleEntry value(ip, time, 0, period, 0, thread, map, symbol);
 
@@ -238,20 +98,6 @@
   return sample;
 }
 
-const SymbolEntry* SampleTree::FindSymbol(const MapEntry* map, uint64_t ip) {
-  uint64_t offset_in_file;
-  if (map->dso == kernel_dso_.get()) {
-    offset_in_file = ip;
-  } else {
-    offset_in_file = ip - map->start_addr + map->pgoff;
-  }
-  const SymbolEntry* symbol = map->dso->FindSymbol(offset_in_file);
-  if (symbol == nullptr) {
-    symbol = &unknown_symbol_;
-  }
-  return symbol;
-}
-
 void SampleTree::InsertCallChainForSample(SampleEntry* sample,
                                           const std::vector<SampleEntry*>& callchain,
                                           uint64_t period) {
diff --git a/simpleperf/sample_tree.h b/simpleperf/sample_tree.h
index 2e97ceb..b79f164 100644
--- a/simpleperf/sample_tree.h
+++ b/simpleperf/sample_tree.h
@@ -25,26 +25,7 @@
 #include <vector>
 
 #include "callchain.h"
-#include "dso.h"
-
-struct MapEntry {
-  uint64_t start_addr;
-  uint64_t len;
-  uint64_t pgoff;
-  uint64_t time;  // Map creation time.
-  DsoEntry* dso;
-};
-
-struct MapComparator {
-  bool operator()(const MapEntry* map1, const MapEntry* map2) const;
-};
-
-struct ThreadEntry {
-  int pid;
-  int tid;
-  const char* comm;  // It always refers to the latest comm.
-  std::set<MapEntry*, MapComparator> maps;
-};
+#include "thread_tree.h"
 
 struct BranchFromEntry {
   uint64_t ip;
@@ -92,34 +73,16 @@
 
 class SampleTree {
  public:
-  SampleTree(compare_sample_func_t sample_compare_function)
-      : sample_comparator_(sample_compare_function),
+  SampleTree(ThreadTree* thread_tree, compare_sample_func_t sample_compare_function)
+      : thread_tree_(thread_tree),
+        sample_comparator_(sample_compare_function),
         sample_tree_(sample_comparator_),
         sorted_sample_comparator_(sample_compare_function),
         sorted_sample_tree_(sorted_sample_comparator_),
         total_samples_(0),
         total_period_(0) {
-    unknown_map_ = MapEntry{
-        0,              // start_addr
-        ULLONG_MAX,     // len
-        0,              // pgoff
-        0,              // time
-        &unknown_dso_,  // dso
-    };
-    unknown_dso_.path = "unknown";
-    unknown_symbol_ = SymbolEntry{
-        "unknown",   // name
-        0,           // addr
-        ULLONG_MAX,  // len
-    };
   }
 
-  void AddThread(int pid, int tid, const std::string& comm);
-  void ForkThread(int pid, int tid, int ppid, int ptid);
-  void AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
-                    const std::string& filename);
-  void AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
-                    uint64_t time, const std::string& filename);
   SampleEntry* AddSample(int pid, int tid, uint64_t ip, uint64_t time, uint64_t period,
                          bool in_kernel);
   void AddBranchSample(int pid, int tid, uint64_t from_ip, uint64_t to_ip, uint64_t branch_flags,
@@ -139,11 +102,6 @@
   }
 
  private:
-  ThreadEntry* FindThreadOrNew(int pid, int tid);
-  const MapEntry* FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel);
-  DsoEntry* FindKernelDsoOrNew(const std::string& filename);
-  DsoEntry* FindUserDsoOrNew(const std::string& filename);
-  const SymbolEntry* FindSymbol(const MapEntry* map, uint64_t ip);
   SampleEntry* InsertSample(SampleEntry& value);
   SampleEntry* AllocateSample(SampleEntry& value);
 
@@ -173,19 +131,7 @@
     compare_sample_func_t compare_function;
   };
 
-  std::unordered_map<int, std::unique_ptr<ThreadEntry>> thread_tree_;
-  std::vector<std::unique_ptr<std::string>> thread_comm_storage_;
-
-  std::set<MapEntry*, MapComparator> kernel_map_tree_;
-  std::vector<std::unique_ptr<MapEntry>> map_storage_;
-  MapEntry unknown_map_;
-
-  std::unique_ptr<DsoEntry> kernel_dso_;
-  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> module_dso_tree_;
-  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> user_dso_tree_;
-  DsoEntry unknown_dso_;
-  SymbolEntry unknown_symbol_;
-
+  ThreadTree* thread_tree_;
   SampleComparator sample_comparator_;
   std::set<SampleEntry*, SampleComparator> sample_tree_;
   SortedSampleComparator sorted_sample_comparator_;
diff --git a/simpleperf/sample_tree_test.cpp b/simpleperf/sample_tree_test.cpp
index 9b37f47..cb59083 100644
--- a/simpleperf/sample_tree_test.cpp
+++ b/simpleperf/sample_tree_test.cpp
@@ -80,21 +80,22 @@
 class SampleTreeTest : public testing::Test {
  protected:
   virtual void SetUp() {
-    sample_tree = std::unique_ptr<SampleTree>(new SampleTree(CompareSampleFunction));
-    sample_tree->AddThread(1, 1, "p1t1");
-    sample_tree->AddThread(1, 11, "p1t11");
-    sample_tree->AddThread(2, 2, "p2t2");
-    sample_tree->AddThreadMap(1, 1, 1, 5, 0, 0, "process1_thread1");
-    sample_tree->AddThreadMap(1, 1, 6, 5, 0, 0, "process1_thread1_map2");
-    sample_tree->AddThreadMap(1, 11, 1, 10, 0, 0, "process1_thread11");
-    sample_tree->AddThreadMap(2, 2, 1, 20, 0, 0, "process2_thread2");
-    sample_tree->AddKernelMap(10, 20, 0, 0, "kernel");
+    thread_tree.AddThread(1, 1, "p1t1");
+    thread_tree.AddThread(1, 11, "p1t11");
+    thread_tree.AddThread(2, 2, "p2t2");
+    thread_tree.AddThreadMap(1, 1, 1, 5, 0, 0, "process1_thread1");
+    thread_tree.AddThreadMap(1, 1, 6, 5, 0, 0, "process1_thread1_map2");
+    thread_tree.AddThreadMap(1, 11, 1, 10, 0, 0, "process1_thread11");
+    thread_tree.AddThreadMap(2, 2, 1, 20, 0, 0, "process2_thread2");
+    thread_tree.AddKernelMap(10, 20, 0, 0, "kernel");
+    sample_tree = std::unique_ptr<SampleTree>(new SampleTree(&thread_tree, CompareSampleFunction));
   }
 
   void VisitSampleTree(const std::vector<ExpectedSampleInMap>& expected_samples) {
     ::VisitSampleTree(sample_tree.get(), expected_samples);
   }
 
+  ThreadTree thread_tree;
   std::unique_ptr<SampleTree> sample_tree;
 };
 
@@ -128,7 +129,7 @@
 
 TEST_F(SampleTreeTest, different_comm) {
   sample_tree->AddSample(1, 1, 1, 0, 0, false);
-  sample_tree->AddThread(1, 1, "p1t1_comm2");
+  thread_tree.AddThread(1, 1, "p1t1_comm2");
   sample_tree->AddSample(1, 1, 1, 0, 0, false);
   std::vector<ExpectedSampleInMap> expected_samples = {
       {1, 1, "p1t1", "process1_thread1", 1, 1}, {1, 1, "p1t1_comm2", "process1_thread1", 1, 1},
@@ -166,16 +167,17 @@
 }
 
 TEST(sample_tree, overlapped_map) {
-  auto sample_tree = std::unique_ptr<SampleTree>(new SampleTree(CompareSampleFunction));
-  sample_tree->AddThread(1, 1, "thread1");
-  sample_tree->AddThreadMap(1, 1, 1, 10, 0, 0, "map1");  // Add map 1.
-  sample_tree->AddSample(1, 1, 5, 0, 0, false);          // Hit map 1.
-  sample_tree->AddThreadMap(1, 1, 5, 20, 0, 0, "map2");  // Add map 2.
-  sample_tree->AddSample(1, 1, 6, 0, 0, false);          // Hit map 2.
-  sample_tree->AddSample(1, 1, 4, 0, 0, false);          // Hit unknown map.
-  sample_tree->AddThreadMap(1, 1, 2, 7, 0, 0, "map3");   // Add map 3.
-  sample_tree->AddSample(1, 1, 7, 0, 0, false);          // Hit map 3.
-  sample_tree->AddSample(1, 1, 10, 0, 0, false);         // Hit unknown map.
+  ThreadTree thread_tree;
+  SampleTree sample_tree(&thread_tree, CompareSampleFunction);
+  thread_tree.AddThread(1, 1, "thread1");
+  thread_tree.AddThreadMap(1, 1, 1, 10, 0, 0, "map1");  // Add map 1.
+  sample_tree.AddSample(1, 1, 5, 0, 0, false);          // Hit map 1.
+  thread_tree.AddThreadMap(1, 1, 5, 20, 0, 0, "map2");  // Add map 2.
+  sample_tree.AddSample(1, 1, 6, 0, 0, false);          // Hit map 2.
+  sample_tree.AddSample(1, 1, 4, 0, 0, false);          // Hit unknown map.
+  thread_tree.AddThreadMap(1, 1, 2, 7, 0, 0, "map3");   // Add map 3.
+  sample_tree.AddSample(1, 1, 7, 0, 0, false);          // Hit map 3.
+  sample_tree.AddSample(1, 1, 10, 0, 0, false);         // Hit unknown map.
 
   std::vector<ExpectedSampleInMap> expected_samples = {
       {1, 1, "thread1", "map1", 1, 1},
@@ -183,5 +185,5 @@
       {1, 1, "thread1", "map3", 2, 1},
       {1, 1, "thread1", "unknown", 0, 2},
   };
-  VisitSampleTree(sample_tree.get(), expected_samples);
+  VisitSampleTree(&sample_tree, expected_samples);
 }
diff --git a/simpleperf/thread_tree.cpp b/simpleperf/thread_tree.cpp
new file mode 100644
index 0000000..937dfcf
--- /dev/null
+++ b/simpleperf/thread_tree.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "thread_tree.h"
+
+#include <base/logging.h>
+#include "environment.h"
+#include "perf_event.h"
+#include "record.h"
+
+bool MapComparator::operator()(const MapEntry* map1, const MapEntry* map2) const {
+  if (map1->start_addr != map2->start_addr) {  // Primary key: start address.
+    return map1->start_addr < map2->start_addr;
+  }
+  if (map1->len != map2->len) {  // First tie-breaker: length.
+    return map1->len < map2->len;
+  }
+  if (map1->time != map2->time) {  // Second tie-breaker: time.
+    return map1->time < map2->time;
+  }
+  return false;  // Equal on all three keys: neither entry orders before the other.
+}
+
+void ThreadTree::AddThread(int pid, int tid, const std::string& comm) {
+  auto it = thread_tree_.find(tid);  // Threads are keyed by tid alone.
+  if (it == thread_tree_.end()) {
+    ThreadEntry* thread = new ThreadEntry{
+        pid, tid,
+        "unknown",                             // comm (overwritten at the end of this function)
+        std::set<MapEntry*, MapComparator>(),  // maps
+    };
+    auto pair = thread_tree_.insert(std::make_pair(tid, std::unique_ptr<ThreadEntry>(thread)));
+    CHECK(pair.second);  // find() just missed this tid, so the insert is expected to succeed.
+    it = pair.first;
+  }
+  thread_comm_storage_.push_back(std::unique_ptr<std::string>(new std::string(comm)));
+  it->second->comm = thread_comm_storage_.back()->c_str();  // Points into the stored copy.
+}
+
+void ThreadTree::ForkThread(int pid, int tid, int ppid, int ptid) {
+  ThreadEntry* parent = FindThreadOrNew(ppid, ptid);  // Created on demand if not seen before.
+  ThreadEntry* child = FindThreadOrNew(pid, tid);
+  child->comm = parent->comm;  // Shares the parent's current comm pointer.
+  child->maps = parent->maps;  // Copies the set of MapEntry pointers, not the entries themselves.
+}
+
+ThreadEntry* ThreadTree::FindThreadOrNew(int pid, int tid) {
+  auto it = thread_tree_.find(tid);
+  if (it == thread_tree_.end()) {
+    AddThread(pid, tid, "unknown");  // First sighting: register it with a placeholder comm.
+    it = thread_tree_.find(tid);
+  } else {
+    if (pid != it->second.get()->pid) {  // Only logged; the existing entry is kept as is.
+      // TODO: b/22185053.
+      LOG(DEBUG) << "unexpected (pid, tid) pair: expected (" << it->second.get()->pid << ", " << tid
+                 << "), actual (" << pid << ", " << tid << ")";
+    }
+  }
+  return it->second.get();  // The entry stays owned by thread_tree_.
+}
+
+static void RemoveOverlappedMap(std::set<MapEntry*, MapComparator>* map_set, const MapEntry* map) {
+  for (auto it = map_set->begin(); it != map_set->end();) {
+    if ((*it)->start_addr >= map->start_addr + map->len) {
+      break;  // The set is ordered by start_addr, so no later entry can overlap map.
+    }
+    if ((*it)->start_addr + (*it)->len <= map->start_addr) {
+      ++it;  // Ends at or before map begins: keep it.
+    } else {
+      it = map_set->erase(it);  // Overlaps map: drop the pointer (the entry is not freed here).
+    }
+  }
+}
+
+void ThreadTree::AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
+                              const std::string& filename) {
+  // Kernel map len can be 0 when the record command is not run in supervisor mode; ignore those.
+  if (len == 0) {
+    return;
+  }
+  DsoEntry* dso = FindKernelDsoOrNew(filename);
+  MapEntry* map = new MapEntry{
+      start_addr, len, pgoff, time, dso,
+  };
+  map_storage_.push_back(std::unique_ptr<MapEntry>(map));  // map_storage_ owns the entry.
+  RemoveOverlappedMap(&kernel_map_tree_, map);  // Evict kernel maps that the new one overlaps.
+  auto pair = kernel_map_tree_.insert(map);
+  CHECK(pair.second);  // Aborts if an equivalent entry is still present.
+}
+
+DsoEntry* ThreadTree::FindKernelDsoOrNew(const std::string& filename) {
+  if (filename == DEFAULT_KERNEL_MMAP_NAME) {  // This name maps to the single DSO_KERNEL dso.
+    if (kernel_dso_ == nullptr) {  // Created on the first request, then reused.
+      kernel_dso_ = DsoFactory::GetInstance()->CreateDso(DSO_KERNEL);
+    }
+    return kernel_dso_.get();
+  }
+  auto it = module_dso_tree_.find(filename);  // Any other name gets a DSO_KERNEL_MODULE dso.
+  if (it == module_dso_tree_.end()) {  // First use of this name: create and cache its dso.
+    module_dso_tree_[filename] = DsoFactory::GetInstance()->CreateDso(DSO_KERNEL_MODULE, filename);
+    it = module_dso_tree_.find(filename);
+  }
+  return it->second.get();  // Still owned by module_dso_tree_.
+}
+
+void ThreadTree::AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
+                              uint64_t time, const std::string& filename) {
+  ThreadEntry* thread = FindThreadOrNew(pid, tid);  // Creates the thread entry if needed.
+  DsoEntry* dso = FindUserDsoOrNew(filename);
+  MapEntry* map = new MapEntry{
+      start_addr, len, pgoff, time, dso,
+  };
+  map_storage_.push_back(std::unique_ptr<MapEntry>(map));  // map_storage_ owns the entry.
+  RemoveOverlappedMap(&thread->maps, map);  // Evict this thread's maps that the new one overlaps.
+  auto pair = thread->maps.insert(map);
+  CHECK(pair.second);  // Aborts if an equivalent entry is still present.
+}
+
+DsoEntry* ThreadTree::FindUserDsoOrNew(const std::string& filename) {
+  auto it = user_dso_tree_.find(filename);  // One dso per distinct filename.
+  if (it == user_dso_tree_.end()) {  // First use of this file: create and cache its dso.
+    user_dso_tree_[filename] = DsoFactory::GetInstance()->CreateDso(DSO_ELF_FILE, filename);
+    it = user_dso_tree_.find(filename);
+  }
+  return it->second.get();  // Still owned by user_dso_tree_.
+}
+
+static bool IsAddrInMap(uint64_t addr, const MapEntry* map) {
+  return (addr >= map->start_addr && addr < map->start_addr + map->len);  // [start, start + len)
+}
+
+static MapEntry* FindMapByAddr(const std::set<MapEntry*, MapComparator>& maps, uint64_t addr) {
+  // Construct a map_entry which is strictly after the searched map_entry, based on MapComparator.
+  MapEntry find_map = {
+      addr,        // start_addr
+      ULLONG_MAX,  // len
+      0,           // pgoff
+      ULLONG_MAX,  // time
+      nullptr,     // dso
+  };
+  auto it = maps.upper_bound(&find_map);  // First entry ordered after the probe.
+  if (it != maps.begin() && IsAddrInMap(addr, *--it)) {  // Test the entry just before it.
+    return *it;
+  }
+  return nullptr;  // The candidate does not cover addr, or there was no candidate.
+}
+
+const MapEntry* ThreadTree::FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel) {
+  MapEntry* result = nullptr;
+  if (!in_kernel) {
+    result = FindMapByAddr(thread->maps, ip);  // User-space ip: search this thread's maps.
+  } else {
+    result = FindMapByAddr(kernel_map_tree_, ip);  // Kernel ip: search the tree-wide kernel maps.
+  }
+  return result != nullptr ? result : &unknown_map_;  // Never nullptr: misses yield unknown_map_.
+}
+
+const SymbolEntry* ThreadTree::FindSymbol(const MapEntry* map, uint64_t ip) {
+  uint64_t offset_in_file;
+  if (map->dso == kernel_dso_.get()) {
+    offset_in_file = ip;  // For the kernel dso the ip is looked up unchanged.
+  } else {
+    offset_in_file = ip - map->start_addr + map->pgoff;  // Offset within the map, plus pgoff.
+  }
+  const SymbolEntry* symbol = map->dso->FindSymbol(offset_in_file);
+  if (symbol == nullptr) {
+    symbol = &unknown_symbol_;  // No match: fall back to the placeholder symbol.
+  }
+  return symbol;  // Never nullptr.
+}
+
+void BuildThreadTree(const std::vector<std::unique_ptr<Record>>& records, ThreadTree* thread_tree) {
+  for (auto& record : records) {  // Record types not handled below are skipped.
+    if (record->header.type == PERF_RECORD_MMAP) {
+      const MmapRecord& r = *static_cast<const MmapRecord*>(record.get());
+      if ((r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL) {
+        thread_tree->AddKernelMap(r.data.addr, r.data.len, r.data.pgoff, r.sample_id.time_data.time,
+                                  r.filename);
+      } else {  // Any other cpu mode: record the map against the (pid, tid) in the record.
+        thread_tree->AddThreadMap(r.data.pid, r.data.tid, r.data.addr, r.data.len, r.data.pgoff,
+                                  r.sample_id.time_data.time, r.filename);
+      }
+    } else if (record->header.type == PERF_RECORD_MMAP2) {
+      const Mmap2Record& r = *static_cast<const Mmap2Record*>(record.get());
+      if ((r.header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_KERNEL) {
+        thread_tree->AddKernelMap(r.data.addr, r.data.len, r.data.pgoff, r.sample_id.time_data.time,
+                                  r.filename);
+      } else {
+        std::string filename =  // Unlike MMAP, the default exec name becomes "[unknown]".
+            (r.filename == DEFAULT_EXECNAME_FOR_THREAD_MMAP) ? "[unknown]" : r.filename;
+        thread_tree->AddThreadMap(r.data.pid, r.data.tid, r.data.addr, r.data.len, r.data.pgoff,
+                                  r.sample_id.time_data.time, filename);
+      }
+    } else if (record->header.type == PERF_RECORD_COMM) {  // Adds the thread or updates its comm.
+      const CommRecord& r = *static_cast<const CommRecord*>(record.get());
+      thread_tree->AddThread(r.data.pid, r.data.tid, r.comm);
+    } else if (record->header.type == PERF_RECORD_FORK) {  // Child inherits comm and maps.
+      const ForkRecord& r = *static_cast<const ForkRecord*>(record.get());
+      thread_tree->ForkThread(r.data.pid, r.data.tid, r.data.ppid, r.data.ptid);
+    }
+  }
+}
diff --git a/simpleperf/thread_tree.h b/simpleperf/thread_tree.h
new file mode 100644
index 0000000..9388c8d
--- /dev/null
+++ b/simpleperf/thread_tree.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_THREAD_TREE_H_
+#define SIMPLE_PERF_THREAD_TREE_H_
+
+#include <limits.h>
+#include <stdint.h>
+#include <set>
+#include "dso.h"
+
+struct MapEntry {  // One mapped address range together with the DsoEntry it refers to.
+  uint64_t start_addr;  // Start of the range (0 for ThreadTree's unknown map).
+  uint64_t len;  // Length of the range (ULLONG_MAX for ThreadTree's unknown map).
+  uint64_t pgoff;  // Presumably the pgoff given to AddKernelMap()/AddThreadMap() - confirm.
+  uint64_t time;  // Map creation time.
+  DsoEntry* dso;  // Raw pointer; the owner is presumably ThreadTree - confirm.
+};
+
+struct MapComparator {  // Ordering functor for the std::set<MapEntry*, MapComparator> members.
+  bool operator()(const MapEntry* map1, const MapEntry* map2) const;  // Defined out of line.
+};
+
+struct ThreadEntry {  // Per-thread record held by ThreadTree.
+  int pid;  // Process id.
+  int tid;  // Thread id.
+  const char* comm;  // It always refers to the latest comm.
+  std::set<MapEntry*, MapComparator> maps;  // Raw pointers, ordered by MapComparator.
+};
+
+class ThreadTree {  // Holds threads, their maps and DSOs; populated by BuildThreadTree().
+ public:
+  ThreadTree() : unknown_dso_(DSO_ELF_FILE, "unknown") {
+    unknown_map_ = MapEntry{  // Starts at 0, len ULLONG_MAX, points at unknown_dso_.
+        0,              // start_addr
+        ULLONG_MAX,     // len
+        0,              // pgoff
+        0,              // time
+        &unknown_dso_,  // dso
+    };
+    unknown_symbol_ = SymbolEntry{  // Symbol named "unknown" at addr 0, len ULLONG_MAX.
+        "unknown",   // name
+        0,           // addr
+        ULLONG_MAX,  // len
+    };
+  }
+
+  void AddThread(int pid, int tid, const std::string& comm);  // Used for COMM records.
+  void ForkThread(int pid, int tid, int ppid, int ptid);  // Used for FORK records.
+  ThreadEntry* FindThreadOrNew(int pid, int tid);  // Presumably creates on a miss - confirm.
+  void AddKernelMap(uint64_t start_addr, uint64_t len, uint64_t pgoff, uint64_t time,
+                    const std::string& filename);  // Used for kernel-mode MMAP/MMAP2 records.
+  void AddThreadMap(int pid, int tid, uint64_t start_addr, uint64_t len, uint64_t pgoff,
+                    uint64_t time, const std::string& filename);  // Used for other MMAP/MMAP2.
+  const MapEntry* FindMap(const ThreadEntry* thread, uint64_t ip, bool in_kernel);
+  const SymbolEntry* FindSymbol(const MapEntry* map, uint64_t ip);
+  const MapEntry* UnknownMap() const {  // Address of the map set up in the constructor.
+    return &unknown_map_;
+  }
+
+ private:  // NOTE(review): <memory>, <string>, <vector>, <unordered_map> are not included here.
+  DsoEntry* FindKernelDsoOrNew(const std::string& filename);
+  DsoEntry* FindUserDsoOrNew(const std::string& filename);
+
+  std::unordered_map<int, std::unique_ptr<ThreadEntry>> thread_tree_;  // Key presumably tid.
+  std::vector<std::unique_ptr<std::string>> thread_comm_storage_;  // Owns the stored strings.
+
+  std::set<MapEntry*, MapComparator> kernel_map_tree_;  // Raw pointers, not owning.
+  std::vector<std::unique_ptr<MapEntry>> map_storage_;  // Owns the stored MapEntry objects.
+  MapEntry unknown_map_;  // Returned by UnknownMap().
+
+  std::unique_ptr<DsoEntry> kernel_dso_;  // Null until assigned; the constructor leaves it empty.
+  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> module_dso_tree_;
+  std::unordered_map<std::string, std::unique_ptr<DsoEntry>> user_dso_tree_;
+  DsoEntry unknown_dso_;  // Constructed as (DSO_ELF_FILE, "unknown").
+  SymbolEntry unknown_symbol_;  // Assigned in the constructor.
+};
+
+struct Record;  // Forward declaration; only named inside std::unique_ptr below.
+// Feeds each MMAP, MMAP2, COMM and FORK record in `records` to the matching ThreadTree method.
+void BuildThreadTree(const std::vector<std::unique_ptr<Record>>& records, ThreadTree* thread_tree);
+
+#endif  // SIMPLE_PERF_THREAD_TREE_H_
diff --git a/tests/net_test/csocket.py b/tests/net_test/csocket.py
index 4b268e9..5dc495c 100644
--- a/tests/net_test/csocket.py
+++ b/tests/net_test/csocket.py
@@ -24,6 +24,7 @@
 
 
 # Data structures.
+# These aren't constants, they're classes. So, pylint: disable=invalid-name
 CMsgHdr = cstruct.Struct("cmsghdr", "@Lii", "len level type")
 Iovec = cstruct.Struct("iovec", "@LL", "base len")
 MsgHdr = cstruct.Struct("msghdr", "@LLLLLLi",
@@ -112,6 +113,7 @@
   MaybeRaiseSocketError(ret)
   return ret
 
+
 def Connect(s, to):
   """Python wrapper for connect."""
   ret = libc.connect(s.fileno(), to.CPointer(), len(to))
diff --git a/tests/net_test/neighbour_test.py b/tests/net_test/neighbour_test.py
index 828a86b..2a21304 100755
--- a/tests/net_test/neighbour_test.py
+++ b/tests/net_test/neighbour_test.py
@@ -208,7 +208,7 @@
       self.ReceiveUnicastAdvertisement(addr, mac)
       self.assertNeighbourState(NUD_REACHABLE, addr)
 
-    for i in xrange(5):
+    for _ in xrange(5):
       ForceProbe(router6, routermac)