Merge "simpleperf: build libsimpleperf_inplace_sampler library."
diff --git a/simpleperf/Android.mk b/simpleperf/Android.mk
index 2a39b59..f47e08b 100644
--- a/simpleperf/Android.mk
+++ b/simpleperf/Android.mk
@@ -226,7 +226,6 @@
 LOCAL_MODULE := libsimpleperf_report
 LOCAL_MODULE_HOST_OS := darwin linux windows
 LOCAL_CPPFLAGS := $(simpleperf_cppflags_host)
-LOCAL_CPPFLAGS := $(simpleperf_cppflags_host)
 LOCAL_CPPFLAGS_darwin := $(simpleperf_cppflags_host_darwin)
 LOCAL_CPPFLAGS_linux := $(simpleperf_cppflags_host_linux)
 LOCAL_CPPFLAGS_windows := $(simpleperf_cppflags_host_windows)
@@ -248,6 +247,45 @@
 $(call dist-for-goals,win_sdk,$(ALL_MODULES.host_cross_libsimpleperf_report$(HOST_CROSS_2ND_ARCH_MODULE_SUFFIX).BUILT))
 endif
 
+
+# libsimpleperf_inplace_sampler.so
+# It is the shared library linked into the user's app; it gets samples from
+# signal handlers in each thread.
+# =========================================================
+
+libsimpleperf_inplace_sampler_static_libraries_target := \
+	$(filter-out libc,$(simpleperf_static_libraries_target)) \
+
+# libsimpleperf_inplace_sampler.so on target
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_MODULE := libsimpleperf_inplace_sampler
+LOCAL_CPPFLAGS := $(simpleperf_cppflags_target)
+LOCAL_SRC_FILES := inplace_sampler_lib.cpp
+LOCAL_STATIC_LIBRARIES := libsimpleperf $(libsimpleperf_inplace_sampler_static_libraries_target)
+LOCAL_MULTILIB := both
+LOCAL_CXX_STL := libc++_static
+LOCAL_LDLIBS := -Wl,--exclude-libs,ALL
+include $(LLVM_DEVICE_BUILD_MK)
+include $(BUILD_SHARED_LIBRARY)
+
+# libsimpleperf_inplace_sampler.so on host
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_MODULE := libsimpleperf_inplace_sampler
+LOCAL_MODULE_HOST_OS := linux
+LOCAL_CPPFLAGS := $(simpleperf_cppflags_host)
+LOCAL_CPPFLAGS_linux := $(simpleperf_cppflags_host_linux)
+LOCAL_SRC_FILES := inplace_sampler_lib.cpp
+LOCAL_STATIC_LIBRARIES := libsimpleperf $(simpleperf_static_libraries_host)
+LOCAL_STATIC_LIBRARIES_linux := $(simpleperf_static_libraries_host_linux)
+LOCAL_LDLIBS_linux := $(simpleperf_ldlibs_host_linux) -Wl,--exclude-libs,ALL
+LOCAL_MULTILIB := both
+LOCAL_CXX_STL := libc++_static
+include $(LLVM_HOST_BUILD_MK)
+include $(BUILD_HOST_SHARED_LIBRARY)
+
+
 # simpleperf_unit_test
 # =========================================================
 simpleperf_unit_test_src_files := \
diff --git a/simpleperf/InplaceSamplerClient.cpp b/simpleperf/InplaceSamplerClient.cpp
index 5a78861..13e8408 100644
--- a/simpleperf/InplaceSamplerClient.cpp
+++ b/simpleperf/InplaceSamplerClient.cpp
@@ -16,14 +16,10 @@
 
 #include "InplaceSamplerClient.h"
 
-#include <sys/time.h>
-#include <sys/types.h>
-#include <stdint.h>
-
-#include <memory>
-#include <vector>
+#include <algorithm>
 
 #include "environment.h"
+#include "inplace_sampler_lib.h"
 #include "utils.h"
 
 static constexpr uint64_t EVENT_ID_FOR_INPLACE_SAMPLER = ULONG_MAX;
@@ -39,15 +35,17 @@
   if (!sampler->ConnectServer()) {
     return nullptr;
   }
-  if (!sampler->StartProfiling()) {
-    return nullptr;
-  }
   return sampler;
 }
 
 InplaceSamplerClient::InplaceSamplerClient(const perf_event_attr& attr, pid_t pid,
                                            const std::set<pid_t>& tids)
-    : attr_(attr), pid_(pid), tids_(tids), closed_(false) {
+    : attr_(attr), pid_(pid), tids_(tids), got_start_profiling_reply_msg_(false) {
+  // sample_freq_ is what gets sent as "freq=" to the sampler. A sample period is converted
+  // to a frequency of at least 1; a zero period is clamped to 1 to avoid dividing by zero.
+  const uint64_t period = std::max<uint64_t>(1, attr_.sample_period);
+  sample_freq_ = attr_.freq ? attr_.sample_freq
+                            : std::max(1u, static_cast<uint32_t>(1000000000 / period));
 }
 
 uint64_t InplaceSamplerClient::Id() const {
@@ -55,39 +53,139 @@
 }
 
 bool InplaceSamplerClient::ConnectServer() {
-  return true;
-}
-
-bool InplaceSamplerClient::StartProfiling() {
-  return true;
+  std::string server_path = "inplace_sampler_server_" + std::to_string(pid_);
+  // Retry connecting for up to 10s (10000000000 ns), sleeping 10us between attempts.
+  uint64_t timeout = GetSystemClock() + 10000000000ull;
+  while (GetSystemClock() < timeout) {
+    conn_ = UnixSocketConnection::Connect(server_path, true);
+    if (conn_ != nullptr) {
+      return true;
+    }
+    usleep(10);
+  }
+  LOG(ERROR) << "Can't find inplace_sampler_server for process " << pid_;
+  return false;
 }
 
 bool InplaceSamplerClient::StartPolling(IOEventLoop& loop,
                                         const std::function<bool(Record*)>& record_callback,
                                         const std::function<bool()>& close_callback) {
   record_callback_ = record_callback;
-  close_callback_ = close_callback;
-  auto callback = [this]() {
-    // Fake records for testing.
-    uint64_t time = GetSystemClock();
-    CommRecord comm_r(attr_, pid_, pid_, "fake_comm", Id(), time);
-    if (!record_callback_(&comm_r)) {
+  CHECK(conn_ != nullptr);  // ConnectServer() must have succeeded before polling starts.
+  auto read_callback = [&](const UnixSocketMessage& msg) {
+    return HandleMessage(msg);  // Converts sampler messages into records.
+  };
+  if (!conn_->PrepareForIO(loop, read_callback, close_callback)) {
+    return false;
+  }
+  if (!SendStartProfilingMessage()) {
+    return false;
+  }
+  // Check on a 3 second timer that START_PROFILING_REPLY has arrived; report an error if not.
+  timeval tv;
+  tv.tv_sec = 3;
+  tv.tv_usec = 0;
+  auto check_reply_callback = [this]() {
+    if (!got_start_profiling_reply_msg_) {  // Set by HandleMessage() when the reply arrives.
+      LOG(ERROR) << "can't receive START_PROFILING_REPLY from process " << pid_;
       return false;
     }
-    MmapRecord mmap_r(attr_, false, pid_, pid_, 0x1000, 0x1000, 0x0, "fake_elf", Id(), time);
-    if (!record_callback_(&mmap_r)) {
-      return false;
+    return true;
+  };
+  return loop.AddPeriodicEvent(tv, check_reply_callback);
+}
+
+bool InplaceSamplerClient::SendStartProfilingMessage() {  // Builds and sends START_PROFILING.
+  std::string options;  // Space separated "key=value" pairs.
+  options += "freq=" + std::to_string(sample_freq_);
+  if (attr_.sample_type & PERF_SAMPLE_CALLCHAIN) {
+    options += " dump_callchain=1";
+  }
+  if (!tids_.empty()) {  // No "tids=" option is sent when tids_ is empty.
+    options += " tids=";
+    bool first = true;
+    for (auto& tid : tids_) {
+      if (first) {
+        first = false;
+      } else {
+        options.push_back(',');  // Comma between tids, none before the first.
+      }
+      options += std::to_string(tid);
     }
-    std::vector<uint64_t> ips(1, 0x1000);
-    SampleRecord r(attr_, Id(), ips[0], pid_, pid_, time, 0, 1, ips);
+  }
+  size_t size = sizeof(UnixSocketMessage) + options.size() + 1;  // +1 for the trailing '\0'.
+  std::unique_ptr<char[]> data(new char[size]);
+  UnixSocketMessage* msg = reinterpret_cast<UnixSocketMessage*>(data.get());
+  msg->len = size;
+  msg->type = START_PROFILING;
+  strcpy(msg->data, options.c_str());  // Copies the options including the terminator.
+  return conn_->SendMessage(*msg, true);
+}
+
+// Registers the connection on |loop| and sends END_PROFILING, which notifies the inplace
+// sampler to send buffered data and close the connection. Returns false if a step fails.
+bool InplaceSamplerClient::StopProfiling(IOEventLoop& loop,
+                                         const std::function<bool()>& close_callback) {
+  auto on_message = [&](const UnixSocketMessage& incoming) { return HandleMessage(incoming); };
+  if (!conn_->PrepareForIO(loop, on_message, close_callback)) {
+    return false;
+  }
+  // END_PROFILING carries no data, so the message consists of the header only.
+  UnixSocketMessage end_msg;
+  end_msg.type = END_PROFILING;
+  end_msg.len = sizeof(UnixSocketMessage);
+  return conn_->SendMessage(end_msg, true);
+}
+
+// Turns one message from the inplace sampler into a perf record handed to record_callback_.
+// Returns false on a reply other than "ok", a failed callback or an unexpected message type.
+bool InplaceSamplerClient::HandleMessage(const UnixSocketMessage& msg) {
+  const char* p = msg.data;
+  if (msg.type == START_PROFILING_REPLY) {
+    got_start_profiling_reply_msg_ = true;
+    if (strcmp(p, "ok") != 0) {
+      LOG(ERROR) << "receive reply from inplace_sampler_server of " << pid_ << ": " << p;
+      return false;
+    }
+  } else if (msg.type == THREAD_INFO) {
+    uint64_t time;
+    uint32_t tid;
+    MoveFromBinaryFormat(time, p);
+    MoveFromBinaryFormat(tid, p);
+    CommRecord r(attr_, pid_, tid, p, Id(), time);
     if (!record_callback_(&r)) {
       return false;
     }
-    closed_ = true;
-    return close_callback_();
-  };
-  timeval duration;
-  duration.tv_sec = 0;
-  duration.tv_usec = 1000;
-  return loop.AddPeriodicEvent(duration, callback);
+  } else if (msg.type == MAP_INFO) {
+    uint64_t time, start, len, pgoff;
+    MoveFromBinaryFormat(time, p);
+    MoveFromBinaryFormat(start, p);
+    MoveFromBinaryFormat(len, p);
+    MoveFromBinaryFormat(pgoff, p);
+    MmapRecord r(attr_, false, pid_, pid_, start, len, pgoff, p, Id(), time);
+    if (!record_callback_(&r)) {
+      return false;
+    }
+  } else if (msg.type == SAMPLE_INFO) {
+    uint64_t time;
+    uint32_t tid;
+    uint32_t period;
+    uint32_t ip_nr;
+    MoveFromBinaryFormat(time, p);
+    MoveFromBinaryFormat(tid, p);
+    MoveFromBinaryFormat(period, p);
+    MoveFromBinaryFormat(ip_nr, p);
+    std::vector<uint64_t> ips(ip_nr);
+    MoveFromBinaryFormat(ips.data(), ip_nr, p);
+    // Don't know which cpu tid is running on, use cpu 0. A sample without any ip must not
+    // read ips[0] (out of bounds on an empty vector), so it is reported with ip 0.
+    SampleRecord r(attr_, Id(), ips.empty() ? 0 : ips[0], pid_, tid, time, 0, period, ips);
+    if (!record_callback_(&r)) {
+      return false;
+    }
+  } else {
+    LOG(ERROR) << "Unexpected msg type: " << msg.type;
+    return false;
+  }
+  return true;
 }
diff --git a/simpleperf/InplaceSamplerClient.h b/simpleperf/InplaceSamplerClient.h
index 0c606bb..cf9819c 100644
--- a/simpleperf/InplaceSamplerClient.h
+++ b/simpleperf/InplaceSamplerClient.h
@@ -31,23 +31,25 @@
                                                       const std::set<pid_t>& tids);
   uint64_t Id() const;
   bool IsClosed() {
-    return closed_;
+    return conn_->IsClosed();
   }
   bool StartPolling(IOEventLoop& loop, const std::function<bool(Record*)>& record_callback,
                     const std::function<bool()>& close_callback);
-  bool StopProfiling();
+  bool StopProfiling(IOEventLoop& loop, const std::function<bool()>& close_callback);
 
  private:
   InplaceSamplerClient(const perf_event_attr& attr, pid_t pid, const std::set<pid_t>& tids);
   bool ConnectServer();
-  bool StartProfiling();
+  bool SendStartProfilingMessage();
+  bool HandleMessage(const UnixSocketMessage& msg);
 
   const perf_event_attr attr_;
   const pid_t pid_;
   const std::set<pid_t> tids_;
+  uint32_t sample_freq_;
+  std::unique_ptr<UnixSocketConnection> conn_;
   std::function<bool(Record*)> record_callback_;
-  std::function<bool()> close_callback_;
-  bool closed_;
+  bool got_start_profiling_reply_msg_;
 };
 
 #endif  // SIMPLE_PERF_INPLACE_SAMPLER_CLIENT_H_
diff --git a/simpleperf/UnixSocket.h b/simpleperf/UnixSocket.h
index 87a045b..fd2796a 100644
--- a/simpleperf/UnixSocket.h
+++ b/simpleperf/UnixSocket.h
@@ -150,6 +150,10 @@
 
   ~UnixSocketConnection();
 
+  bool IsClosed() const {  // True when fd_ is -1; const because it only reads fd_.
+    return fd_ == -1;
+  }
+
   bool PrepareForIO(IOEventLoop& loop,
                     const std::function<bool(const UnixSocketMessage&)>&
                         receive_message_callback,
diff --git a/simpleperf/cmd_record_test.cpp b/simpleperf/cmd_record_test.cpp
index c3fb8f2..3820c2c 100644
--- a/simpleperf/cmd_record_test.cpp
+++ b/simpleperf/cmd_record_test.cpp
@@ -18,7 +18,6 @@
 
 #include <android-base/stringprintf.h>
 #include <android-base/test_utils.h>
-#include <sys/syscall.h>
 
 #include <map>
 #include <memory>
@@ -378,7 +377,7 @@
   TemporaryFile tmpfile;
   std::atomic<int> tid(0);
   std::thread thread([&]() {
-    tid = syscall(__NR_gettid);
+    tid = gettid();
     sleep(1);
   });
   thread.detach();
diff --git a/simpleperf/cmd_report.cpp b/simpleperf/cmd_report.cpp
index 2c00f73..3b727a5 100644
--- a/simpleperf/cmd_report.cpp
+++ b/simpleperf/cmd_report.cpp
@@ -182,6 +182,7 @@
     const Symbol* symbol = thread_tree_->FindSymbol(map, ip, &vaddr_in_file);
     std::unique_ptr<SampleEntry> callchain_sample(new SampleEntry(
         sample->time, 0, acc_info, 0, thread, map, symbol, vaddr_in_file));
+    callchain_sample->thread_comm = sample->thread_comm;
     return InsertCallChainSample(std::move(callchain_sample), callchain);
   }
 
diff --git a/simpleperf/cmd_stat_test.cpp b/simpleperf/cmd_stat_test.cpp
index 5aa1e3b..89e75f8 100644
--- a/simpleperf/cmd_stat_test.cpp
+++ b/simpleperf/cmd_stat_test.cpp
@@ -19,11 +19,11 @@
 #include <android-base/file.h>
 #include <android-base/stringprintf.h>
 #include <android-base/test_utils.h>
-#include <sys/syscall.h>
 
 #include <thread>
 
 #include "command.h"
+#include "environment.h"
 #include "get_test_data.h"
 #include "test_util.h"
 
@@ -159,7 +159,7 @@
 TEST(stat_cmd, stop_when_no_more_targets) {
   std::atomic<int> tid(0);
   std::thread thread([&]() {
-    tid = syscall(__NR_gettid);
+    tid = gettid();
     sleep(1);
   });
   thread.detach();
diff --git a/simpleperf/cpu_hotplug_test.cpp b/simpleperf/cpu_hotplug_test.cpp
index c30ca67..16e0e5c 100644
--- a/simpleperf/cpu_hotplug_test.cpp
+++ b/simpleperf/cpu_hotplug_test.cpp
@@ -17,7 +17,6 @@
 #include <gtest/gtest.h>
 
 #include <sys/stat.h>
-#include <sys/syscall.h>
 #include <unistd.h>
 #if defined(__BIONIC__)
 #include <sys/system_properties.h>
@@ -32,6 +31,7 @@
 #include <android-base/logging.h>
 #include <android-base/stringprintf.h>
 
+#include "environment.h"
 #include "event_attr.h"
 #include "event_fd.h"
 #include "event_type.h"
@@ -330,7 +330,7 @@
 };
 
 static void CpuSpinThread(CpuSpinThreadArg* arg) {
-  arg->tid = syscall(__NR_gettid);
+  arg->tid = gettid();
   while (!arg->end_flag) {
     cpu_set_t mask;
     CPU_ZERO(&mask);
diff --git a/simpleperf/environment.h b/simpleperf/environment.h
index 5d9cee8..16df690 100644
--- a/simpleperf/environment.h
+++ b/simpleperf/environment.h
@@ -20,6 +20,11 @@
 #include <sys/types.h>
 #include <time.h>
 
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
 #include <functional>
 #include <set>
 #include <string>
@@ -74,6 +79,12 @@
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 }
+
+#if !defined(__ANDROID__)  // This wrapper is not compiled for Android builds.
+static inline int gettid() {  // Thread id of the caller, via the raw gettid syscall.
+  return syscall(__NR_gettid);
+}
+#endif
 #endif
 
 ArchType GetMachineArch();
diff --git a/simpleperf/event_selection_set.cpp b/simpleperf/event_selection_set.cpp
index bc1f5a8..b966273 100644
--- a/simpleperf/event_selection_set.cpp
+++ b/simpleperf/event_selection_set.cpp
@@ -629,7 +629,46 @@
 }
 
 bool EventSelectionSet::FinishReadMmapEventData() {
-  return ReadMmapEventData();
+  if (!ReadMmapEventData()) {
+    return false;
+  }
+  if (!HasInplaceSampler()) {
+    return true;
+  }
+  // Inplace sampler server uses a buffer to cache samples before sending them, so we need to
+  // explicitly ask it to send the cached samples, then wait for its connections to close.
+  loop_.reset(new IOEventLoop);  // Replaces loop_ with a new loop for the steps below.
+  size_t inplace_sampler_count = 0;  // Samplers asked to stop that have not closed yet.
+  auto close_callback = [&]() {
+    if (--inplace_sampler_count == 0) {  // Last open connection closed: stop waiting.
+      return loop_->ExitLoop();
+    }
+    return true;
+  };
+  for (auto& group : groups_) {
+    for (auto& sel : group) {
+      for (auto& sampler : sel.inplace_samplers) {
+        if (!sampler->IsClosed()) {  // Already closed samplers are skipped.
+          if (!sampler->StopProfiling(*loop_, close_callback)) {
+            return false;
+          }
+          inplace_sampler_count++;
+        }
+      }
+    }
+  }
+  if (inplace_sampler_count == 0) {  // Nothing to wait for.
+    return true;
+  }
+
+  // Exit the loop from a 1 second timer even if some connections have not closed by then.
+  timeval tv;
+  tv.tv_sec = 1;
+  tv.tv_usec = 0;
+  if (!loop_->AddPeriodicEvent(tv, [&]() { return loop_->ExitLoop(); })) {
+    return false;
+  }
+  return loop_->RunLoop();
 }
 
 bool EventSelectionSet::HandleCpuHotplugEvents(const std::vector<int>& monitored_cpus,
diff --git a/simpleperf/inplace_sampler_lib.cpp b/simpleperf/inplace_sampler_lib.cpp
new file mode 100644
index 0000000..b335e37
--- /dev/null
+++ b/simpleperf/inplace_sampler_lib.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "inplace_sampler_lib.h"
+
+#include <inttypes.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+
+#include <map>
+#include <memory>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+
+#include <android-base/logging.h>
+#include <android-base/macros.h>
+#include <backtrace/Backtrace.h>
+#define LOG_TAG "InplaceSampler"
+#include <log/log.h>
+
+#include "environment.h"
+#include "UnixSocket.h"
+#include "utils.h"
+
+#define DEFAULT_SIGNO  SIGRTMAX
+static constexpr int DEFAULT_SAMPLE_FREQ = 4000;
+static constexpr int CHECK_THREADS_INTERVAL_IN_MS = 200;
+
+namespace {
+
+struct ThreadInfo {
+  std::string name;
+};
+
+// SampleManager controls the whole sampling process:
+//   Read commands from simpleperf
+//   Set up timers to send signals for each profiled thread regularly.
+//   Send thread info and map info to simpleperf.
+class SampleManager {
+ public:
+  explicit SampleManager(std::unique_ptr<UnixSocketConnection> conn) : conn_(std::move(conn)),
+      tid_(gettid()), signo_(DEFAULT_SIGNO), sample_freq_(DEFAULT_SAMPLE_FREQ),
+      sample_period_in_ns_(0), dump_callchain_(false), monitor_all_threads_(true) {
+  }
+  void Run();  // Serves conn_ on loop_.
+
+ private:
+  bool HandleMessage(const UnixSocketMessage& msg);
+  bool ParseStartProfilingMessage(const UnixSocketMessage& msg);
+  bool SendStartProfilingReplyMessage(bool ok);
+  bool StartProfiling();
+  bool InstallSignalHandler();
+  bool CheckThreads();
+  bool CheckThreadNameChange(uint64_t timestamp);
+  bool CheckMapChange(uint64_t timestamp);
+  void SendThreadMapInfo();
+  void SendFakeSampleRecord();
+
+  std::unique_ptr<UnixSocketConnection> conn_;  // Connection to simpleperf.
+  // Sampling settings; START_PROFILING options can override the defaults set above.
+  int tid_;  // Tid of the constructing thread; CheckThreadNameChange() skips it.
+  int signo_;
+  uint32_t sample_freq_;
+  uint32_t sample_period_in_ns_;  // Derived from sample_freq_ when options are parsed.
+  bool dump_callchain_;
+  bool monitor_all_threads_;  // Cleared when a "tids" option is received.
+  std::set<int> monitor_tid_filter_;  // Tids to monitor when monitor_all_threads_ is false.
+  std::map<int, ThreadInfo> threads_;  // Last thread names seen, keyed by tid.
+  std::map<uint64_t, ThreadMmap> maps_;  // Last executable maps seen, keyed by start address.
+  std::queue<std::unique_ptr<char[]>> thread_map_info_q_;  // Messages waiting to be sent.
+
+  IOEventLoop loop_;
+};
+
+// Serves one connection: registers HandleMessage() for incoming messages and
+// loop_.ExitLoop() for connection close, then runs loop_. Returns early if the connection
+// can't be prepared for IO.
+void SampleManager::Run() {
+  auto on_message = [&](const UnixSocketMessage& incoming) { return HandleMessage(incoming); };
+  auto on_close = [&]() { return loop_.ExitLoop(); };
+  const bool prepared = conn_->PrepareForIO(loop_, on_message, on_close);
+  if (prepared) {
+    // RunLoop()'s result is not inspected; Run() has nothing to return.
+    loop_.RunLoop();
+  }
+}
+
+// Handles one command from simpleperf. START_PROFILING is always answered with the parse
+// result before acting on it. Returns false when a send fails, when profiling can't be
+// started, or when the message type is unexpected.
+bool SampleManager::HandleMessage(const UnixSocketMessage& msg) {
+  if (msg.type == END_PROFILING) {
+    // Close connection after clearing send buffer.
+    return conn_->NoMoreMessage();
+  }
+  // Anything other than the two commands handled here is a protocol error.
+  if (msg.type != START_PROFILING) {
+    LOG(ERROR) << "Unexpected msg type: " << msg.type;
+    return false;
+  }
+  const bool options_ok = ParseStartProfilingMessage(msg);
+  if (!SendStartProfilingReplyMessage(options_ok)) {
+    return false;
+  }
+  // Rejected options end the session instead of starting the sampler.
+  return options_ok ? StartProfiling() : conn_->NoMoreMessage();
+}
+
+// Parses the space separated "key=value" options of a START_PROFILING message (freq, signal,
+// tids, dump_callchain). Returns false for a malformed tid list or an out-of-range freq.
+bool SampleManager::ParseStartProfilingMessage(const UnixSocketMessage& msg) {
+  char* option = const_cast<char*>(msg.data);
+  while (option != nullptr && *option != '\0') {
+    char* next_option = strchr(option, ' ');
+    if (next_option != nullptr) {
+      *next_option++ = '\0';
+    }
+    char* equal_op = strchr(option, '=');
+    if (equal_op != nullptr) {
+      char* key = option;
+      *equal_op = '\0';
+      char* value = equal_op + 1;
+      if (strcmp(key, "freq") == 0) {
+        sample_freq_ = atoi(value);
+      } else if (strcmp(key, "signal") == 0) {
+        signo_ = atoi(value);
+      } else if (strcmp(key, "tids") == 0) {
+        monitor_all_threads_ = false;
+        while (*value != '\0') {
+          char* end = nullptr;
+          int tid = static_cast<int>(strtol(value, &end, 10));
+          if (end == value) {  // No digits consumed: fail instead of looping forever.
+            return false;
+          }
+          monitor_tid_filter_.insert(tid);
+          value = (*end == ',') ? end + 1 : end;
+        }
+      } else if (strcmp(key, "dump_callchain") == 0) {
+        dump_callchain_ = (strcmp(value, "1") == 0);
+      }
+    }
+    option = next_option;
+  }
+  if (sample_freq_ == 0 || sample_freq_ > 1000000000) {
+    LOG(ERROR) << "Unexpected sample_freq: " << sample_freq_;
+    return false;
+  }
+  sample_period_in_ns_ = (sample_freq_ == 1) ? 999999999 : 1000000000 / sample_freq_;
+  return true;
+}
+
+bool SampleManager::SendStartProfilingReplyMessage(bool ok) {
+  const char* reply = ok ? "ok" : "error";  // Payload: a '\0' terminated status string.
+  const size_t total_size = sizeof(UnixSocketMessage) + strlen(reply) + 1;
+  std::unique_ptr<char[]> buffer(new char[total_size]);
+  auto* reply_msg = reinterpret_cast<UnixSocketMessage*>(buffer.get());
+  reply_msg->type = START_PROFILING_REPLY;
+  reply_msg->len = total_size;
+  strcpy(reply_msg->data, reply);  // Copies the terminator too, hence the "+ 1" above.
+  return conn_->SendMessage(*reply_msg, true);
+}
+
+// Installs the signal handler, runs one CheckThreads() pass immediately, then schedules
+// CheckThreads() on loop_ with an interval of CHECK_THREADS_INTERVAL_IN_MS milliseconds.
+bool SampleManager::StartProfiling() {
+  if (!InstallSignalHandler() || !CheckThreads()) {
+    return false;
+  }
+  constexpr int kMsPerSec = 1000;
+  constexpr int kUsPerMs = 1000;
+  timeval interval;
+  interval.tv_sec = CHECK_THREADS_INTERVAL_IN_MS / kMsPerSec;
+  interval.tv_usec = CHECK_THREADS_INTERVAL_IN_MS % kMsPerSec * kUsPerMs;
+  auto periodic_check = [&]() { return CheckThreads(); };
+  return loop_.AddPeriodicEvent(interval, periodic_check);
+}
+
+bool SampleManager::InstallSignalHandler() {  // Stub: installs nothing and reports success.
+  return true;
+}
+
+// One monitoring pass: queues map and thread-name changes stamped with the same time, then
+// sends the queued messages. Returns false as soon as either check fails.
+bool SampleManager::CheckThreads() {
+  const uint64_t now = GetSystemClock();
+  const bool checked = CheckMapChange(now) && CheckThreadNameChange(now);
+  if (!checked) {
+    return false;
+  }
+  SendThreadMapInfo();
+  // For testing.
+  SendFakeSampleRecord();
+  return true;
+}
+
+bool SampleManager::CheckThreadNameChange(uint64_t timestamp) {  // Always returns true.
+  std::vector<pid_t> tids = GetThreadsInProcess(getpid());
+  std::map<pid_t, std::string> current;  // Monitored threads whose name could be read.
+  for (auto& tid : tids) {
+    if (tid == tid_) {
+      // Skip sample thread (the thread that constructed this manager).
+      continue;
+    }
+    if (monitor_all_threads_ || monitor_tid_filter_.find(tid) != monitor_tid_filter_.end()) {
+      std::string name;
+      if (GetThreadName(tid, &name)) {
+        current[tid] = name;
+      }
+    }
+  }
+  // Check new threads or threads with new names; queue one THREAD_INFO message for each.
+  for (auto& pair : current) {
+    pid_t tid = pair.first;
+    auto it = threads_.find(tid);
+    if (it == threads_.end() || it->second.name != pair.second) {
+      threads_[tid].name = pair.second;
+      size_t size = sizeof(UnixSocketMessage) + sizeof(uint64_t) + sizeof(uint32_t) +
+          pair.second.size() + 1;  // Header + time + tid + name with its '\0'.
+      std::unique_ptr<char[]> data(new char[size]);
+      UnixSocketMessage* msg = reinterpret_cast<UnixSocketMessage*>(data.get());
+      msg->len = size;
+      msg->type = THREAD_INFO;
+      char* p = msg->data;
+      MoveToBinaryFormat(timestamp, p);  // Written in the order: time, tid, thread name.
+      MoveToBinaryFormat(static_cast<uint32_t>(tid), p);
+      MoveToBinaryFormat(pair.second.c_str(), pair.second.size() + 1, p);
+      thread_map_info_q_.push(std::move(data));  // Sent later by SendThreadMapInfo().
+    }
+  }
+  // Check deleted threads: forget every thread that is not in the current set.
+  for (auto it = threads_.begin(); it != threads_.end();) {
+    int tid = it->first;
+    if (current.find(tid) == current.end()) {
+      it = threads_.erase(it);  // erase() returns the next iterator, so no ++it here.
+    } else {
+      ++it;
+    }
+  }
+  return true;
+}
+
+bool SampleManager::CheckMapChange(uint64_t timestamp) {  // False if maps can't be read.
+  std::vector<ThreadMmap> maps;
+  if (!GetThreadMmapsInProcess(getpid(), &maps)) {
+    return false;
+  }
+  // Check new maps or changed maps; queue one MAP_INFO message for each.
+  for (auto& map : maps) {
+    if (!map.executable) {  // Only executable maps are tracked and reported.
+      continue;
+    }
+    auto it = maps_.find(map.start_addr);
+    if (it == maps_.end() || it->second.len != map.len || it->second.pgoff != map.pgoff ||
+        it->second.name != map.name) {
+      maps_[map.start_addr] = map;
+      size_t size = sizeof(UnixSocketMessage) + sizeof(uint64_t) * 4 + map.name.size() + 1;
+      std::unique_ptr<char[]> data(new char[size]);
+      UnixSocketMessage* msg = reinterpret_cast<UnixSocketMessage*>(data.get());
+      msg->len = size;
+      msg->type = MAP_INFO;
+      char* p = msg->data;
+      MoveToBinaryFormat(timestamp, p);  // Written in the order: time, start, len, pgoff, name.
+      MoveToBinaryFormat(map.start_addr, p);
+      MoveToBinaryFormat(map.len, p);
+      MoveToBinaryFormat(map.pgoff, p);
+      MoveToBinaryFormat(map.name.c_str(), map.name.size() + 1, p);
+      thread_map_info_q_.push(std::move(data));  // Sent later by SendThreadMapInfo().
+    }
+  }
+  return true;
+}
+
+// Sends the queued thread/map messages in order; a failed send leaves the rest queued.
+void SampleManager::SendThreadMapInfo() {
+  for (; !thread_map_info_q_.empty(); thread_map_info_q_.pop()) {
+    auto& buffer = thread_map_info_q_.front();
+    auto* pending = reinterpret_cast<UnixSocketMessage*>(buffer.get());
+    if (!conn_->SendMessage(*pending, false)) {
+      return;
+    }
+  }
+}
+
+static void FakeFunction() {  // Empty; only its address is used, as the fake sample's ip.
+}
+
+void SampleManager::SendFakeSampleRecord() {  // Sends one SAMPLE_INFO with a single fake ip.
+  size_t size = sizeof(UnixSocketMessage) + sizeof(uint64_t) * 2 + sizeof(uint32_t) *  3;
+  std::unique_ptr<char[]> data(new char[size]);
+  UnixSocketMessage* msg = reinterpret_cast<UnixSocketMessage*>(data.get());
+  uint64_t ip = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(&FakeFunction));
+  msg->len = size;
+  msg->type = SAMPLE_INFO;
+  char* p = msg->data;
+  MoveToBinaryFormat(GetSystemClock(), p);  // time
+  MoveToBinaryFormat(static_cast<uint32_t>(tid_), p);  // tid
+  MoveToBinaryFormat(1u, p);  // period
+  MoveToBinaryFormat(1u, p);  // ip_nr
+  MoveToBinaryFormat(ip, p);  // ip[0]
+  conn_->SendMessage(*msg, false);  // The send result is ignored here.
+}
+
+// Thread entry point: names the thread "inplace_sampler", creates the per-process server
+// socket, then serves simpleperf connections one at a time until accepting one fails.
+// Always returns nullptr.
+static void* CommunicationThread(void*) {
+  pthread_setname_np(pthread_self(), "inplace_sampler");
+  const std::string server_path = "inplace_sampler_server_" + std::to_string(getpid());
+  std::unique_ptr<UnixSocketServer> server = UnixSocketServer::Create(server_path, true);
+  if (server == nullptr) {
+    LOG(ERROR) << "failed to create server at path " << server_path;
+    return nullptr;
+  }
+  LOG(INFO) << "Create inplace_sampler_server at " << server_path;
+  // Each accepted connection gets a fresh SampleManager that lives for one session.
+  while (std::unique_ptr<UnixSocketConnection> conn = server->AcceptConnection()) {
+    SampleManager manager(std::move(conn));
+    manager.Run();
+  }
+  return nullptr;
+}
+
+// Constructor function (__attribute__((constructor))): starts CommunicationThread as a
+// detached thread. Failures are logged and otherwise ignored.
+__attribute__((constructor)) void InitSampler() {
+  pthread_attr_t attr;
+  if (pthread_attr_init(&attr) != 0) {
+    LOG(ERROR) << "pthread_attr_init failed";
+    return;
+  }
+  pthread_t thread;
+  if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) {
+    LOG(ERROR) << "pthread_attr_setdetachstate failed";
+  } else if (pthread_create(&thread, &attr, CommunicationThread, nullptr) != 0) {
+    LOG(ERROR) << "pthread_create failed";
+  }
+  // Destroy the attribute object on every path, including the two failure paths above.
+  pthread_attr_destroy(&attr);
+}
+
+}  // namespace
diff --git a/simpleperf/inplace_sampler_lib.h b/simpleperf/inplace_sampler_lib.h
new file mode 100644
index 0000000..f1590ae
--- /dev/null
+++ b/simpleperf/inplace_sampler_lib.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SIMPLE_PERF_INPLACE_SAMPLER_LIB_H_
+#define SIMPLE_PERF_INPLACE_SAMPLER_LIB_H_
+
+enum InplaceSamplerMessageType {  // Value stored in UnixSocketMessage::type.
+  START_PROFILING,        // simpleperf -> inplace_sampler: options string.
+  START_PROFILING_REPLY,  // inplace_sampler -> simpleperf: "ok" or "error".
+  THREAD_INFO,            // inplace_sampler -> simpleperf: time, tid, thread name.
+  MAP_INFO,               // inplace_sampler -> simpleperf: time, start, len, pgoff, dso.
+  SAMPLE_INFO,            // inplace_sampler -> simpleperf: time, tid, period, ips.
+  END_PROFILING,          // simpleperf -> inplace_sampler: no data.
+  END_PROFILING_REPLY,    // inplace_sampler -> simpleperf: used_cpu_time, lost_samples.
+};
+
+// Type: START_PROFILING
+// Direction: simpleperf to inplace_sampler
+// Data:
+//   char options[];  // ended by '\0'
+//
+// options[] contains space separated options like below:
+//   freq=4000 # sample at 4000/s.
+//   signal=14  # use signal 14 to raise sample recording.
+//   tids=1432,1433  # take samples of threads 1432 and 1433.
+//   dump_callchain=1  # sent when callchains are requested (PERF_SAMPLE_CALLCHAIN).
+
+// Type: START_PROFILING_REPLY
+// Direction: inplace_sampler to simpleperf
+// Data:
+//   char reply[]; // ended by '\0'
+// reply[] contains a string, which is either 'ok' or 'error'
+
+// Type: THREAD_INFO
+// Direction: inplace_sampler to simpleperf
+// Data:
+//  uint64_t time;
+//  uint32_t tid;
+//  char thread_name[];  // ended by '\0'
+
+// Type: MAP_INFO
+// Direction: inplace_sampler to simpleperf
+// Data:
+//  uint64_t time;
+//  uint64_t start;
+//  uint64_t len;
+//  uint64_t pgoff;
+//  char dso[];  // ended by '\0'
+
+// Type: SAMPLE_INFO
+// Direction: inplace_sampler to simpleperf
+// Data:
+//  uint64_t time;
+//  uint32_t tid;
+//  uint32_t period;
+//  uint32_t ip_nr;
+//  uint64_t ip[ip_nr];
+
+// Type: END_PROFILING
+// Direction: simpleperf to inplace_sampler
+// Data:
+//   None.
+
+// Type: END_PROFILING_REPLY
+// Direction: inplace_sampler to simpleperf
+// Data:
+//   uint64_t used_cpu_time;
+//   uint64_t lost_samples;
+
+
+#endif  // SIMPLE_PERF_INPLACE_SAMPLER_LIB_H_
diff --git a/simpleperf/runtest/Android.build.mk b/simpleperf/runtest/Android.build.mk
index 8520765..5a8f92d 100644
--- a/simpleperf/runtest/Android.build.mk
+++ b/simpleperf/runtest/Android.build.mk
@@ -22,7 +22,11 @@
 LOCAL_CLANG := true
 LOCAL_CPPFLAGS := $(simpleperf_runtest_cppflags)
 LOCAL_SRC_FILES := $(module_src_files)
+LOCAL_SHARED_LIBRARIES := libsimpleperf_inplace_sampler
 LOCAL_MODULE := $(module)
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := $(module)32
+LOCAL_MODULE_STEM_64 := $(module)64
 LOCAL_STRIP_MODULE := false
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.build.mk
 include $(BUILD_EXECUTABLE)
@@ -32,7 +36,11 @@
 LOCAL_CLANG := true
 LOCAL_CPPFLAGS := $(simpleperf_runtest_cppflags)
 LOCAL_SRC_FILES := $(module_src_files)
+LOCAL_SHARED_LIBRARIES := libsimpleperf_inplace_sampler
 LOCAL_MODULE := $(module)
+LOCAL_MULTILIB := both
+LOCAL_MODULE_STEM_32 := $(module)32
+LOCAL_MODULE_STEM_64 := $(module)64
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.build.mk
 include $(BUILD_HOST_EXECUTABLE)
 endif
\ No newline at end of file
diff --git a/simpleperf/runtest/runtest.conf b/simpleperf/runtest/runtest.conf
index 863ea99..2fbbd98 100644
--- a/simpleperf/runtest/runtest.conf
+++ b/simpleperf/runtest/runtest.conf
@@ -41,6 +41,7 @@
 
 <test name="function_fork">
   <executable name="simpleperf_runtest_function_fork"/>
+  <report option="--sort comm,symbol"/>
 
   <symbol_overhead>
     <symbol name="ParentFunction()" min="10" max="90"/>
@@ -48,7 +49,7 @@
   </symbol_overhead>
 
   <symbol_children_overhead>
-    <symbol name="main" min="10" max="90"/>
+    <symbol name="main" min="10"/>
   </symbol_children_overhead>
 
   <symbol_callgraph_relation>
@@ -156,8 +157,8 @@
   <executable name="simpleperf_runtest_function_indirect_recursive"/>
 
   <symbol_overhead>
-    <symbol name="FunctionRecursiveOne(int)" min="30" max="70"/>
-    <symbol name="FunctionRecursiveTwo(int)" min="30" max="70"/>
+    <symbol name="FunctionRecursiveOne(int)" min="20"/>
+    <symbol name="FunctionRecursiveTwo(int)" min="20"/>
   </symbol_overhead>
 
   <symbol_children_overhead>
diff --git a/simpleperf/runtest/runtest.py b/simpleperf/runtest/runtest.py
index 77fc566..a1b4520 100644
--- a/simpleperf/runtest/runtest.py
+++ b/simpleperf/runtest/runtest.py
@@ -279,18 +279,20 @@
 
   def __init__(self, target, perf_path):
     self.target = target
+    self.is32 = target.endswith('32')
     self.perf_path = perf_path
     self.use_callgraph = False
     self.sampler = 'cpu-cycles'
 
   def record(self, test_executable_name, record_file, additional_options=[]):
     call_args = [self.perf_path, 'record']
-    call_args += ['--duration', '1']
+    call_args += ['--duration', '2']
     call_args += ['-e', '%s:u' % self.sampler]
     if self.use_callgraph:
       call_args += ['-f', '1000', '-g']
     call_args += ['-o', record_file]
     call_args += additional_options
+    test_executable_name += '32' if self.is32 else '64'
     call_args += [test_executable_name]
     self._call(call_args)
 
@@ -310,8 +312,9 @@
 
   """Run perf test on host."""
 
-  def __init__(self, perf_path):
-    super(HostRunner, self).__init__('host', perf_path)
+  def __init__(self, target):
+    perf_path = 'simpleperf32' if target.endswith('32') else 'simpleperf'
+    super(HostRunner, self).__init__(target, perf_path)
 
   def _call(self, args, output_file=None):
     output_fh = None
@@ -326,17 +329,21 @@
 
   """Run perf test on device."""
 
-  def __init__(self, perf_path):
+  def __init__(self, target):
     self.tmpdir = '/data/local/tmp/'
-    super(DeviceRunner, self).__init__('device', self.tmpdir + perf_path)
+    perf_path = 'simpleperf32' if target.endswith('32') else 'simpleperf'
+    super(DeviceRunner, self).__init__(target, self.tmpdir + perf_path)
     self._download(os.environ['OUT'] + '/system/xbin/' + perf_path, self.tmpdir)
+    lib = 'lib' if self.is32 else 'lib64'
+    self._download(os.environ['OUT'] + '/system/' + lib + '/libsimpleperf_inplace_sampler.so',
+                   self.tmpdir)
 
   def _call(self, args, output_file=None):
     output_fh = None
     if output_file is not None:
       output_fh = open(output_file, 'w')
     args_with_adb = ['adb', 'shell']
-    args_with_adb.extend(args)
+    args_with_adb.append('export LD_LIBRARY_PATH=' + self.tmpdir + ' && ' + ' '.join(args))
     subprocess.check_call(args_with_adb, stdout=output_fh)
     if output_fh is not None:
       output_fh.close()
@@ -346,8 +353,8 @@
     subprocess.check_call(args)
 
   def record(self, test_executable_name, record_file, additional_options=[]):
-    self._download(os.environ['OUT'] + '/system/bin/' + test_executable_name,
-                   self.tmpdir)
+    self._download(os.environ['OUT'] + '/system/bin/' + test_executable_name +
+                   ('32' if self.is32 else '64'), self.tmpdir)
     super(DeviceRunner, self).record(self.tmpdir + test_executable_name,
                                      self.tmpdir + record_file,
                                      additional_options)
@@ -528,75 +535,14 @@
     return result
 
 
-def runtest(host, device, normal, callgraph, use_inplace_sampler, selected_tests):
-  tests = load_config_file(os.path.dirname(os.path.realpath(__file__)) + \
-                           '/runtest.conf')
-  host_runner = HostRunner('simpleperf')
-  device_runner = DeviceRunner('simpleperf')
-  report_analyzer = ReportAnalyzer()
-  for test in tests:
-    if selected_tests is not None:
-      if test.test_name not in selected_tests:
-        continue
-    if host and normal:
-      host_runner.record(test.executable_name, 'perf.data')
-      host_runner.report('perf.data', 'perf.report',
-                         additional_options = test.report_options)
-      result = report_analyzer.check_report_file(
-          test, 'perf.report', False)
-      print 'test %s on host %s' % (
-          test.test_name, 'Succeeded' if result else 'Failed')
-      if not result:
-        exit(1)
-
-    if device and normal:
-      device_runner.record(test.executable_name, 'perf.data')
-      device_runner.report('perf.data', 'perf.report',
-                           additional_options = test.report_options)
-      result = report_analyzer.check_report_file(test, 'perf.report', False)
-      print 'test %s on device %s' % (
-          test.test_name, 'Succeeded' if result else 'Failed')
-      if not result:
-        exit(1)
-
-    if host and callgraph:
-      host_runner.record(
-          test.executable_name,
-          'perf_g.data',
-          additional_options=['-g', '-f', '1000'])
-      host_runner.report(
-          'perf_g.data',
-          'perf_g.report',
-          additional_options=['-g', 'callee'] + test.report_options)
-      result = report_analyzer.check_report_file(test, 'perf_g.report', True)
-      print 'call-graph test %s on host %s' % (
-          test.test_name, 'Succeeded' if result else 'Failed')
-      if not result:
-        exit(1)
-
-    if device and callgraph:
-      # Decrease sampling frequency by -f 1000 to avoid losing records
-      # while recording call-graph.
-      device_runner.record(
-          test.executable_name,
-          'perf_g.data',
-          additional_options=['-g', '-f', '1000'])
-      device_runner.report(
-          'perf_g.data',
-          'perf_g.report',
-          additional_options=['-g', 'callee'] + test.report_options)
-      result = report_analyzer.check_report_file(test, 'perf_g.report', True)
-      print 'call-graph test %s on device %s' % (
-          test.test_name, 'Succeeded' if result else 'Failed')
-      if not result:
-        exit(1)
-
-
 def build_runner(target, use_callgraph, sampler):
-  if target == 'host':
-    runner = HostRunner('simpleperf')
+  if target == 'host32' and use_callgraph:
+    print "Current 64bit linux host doesn't support `simpleperf32 record -g`"
+    return None
+  if target.startswith('host'):
+    runner = HostRunner(target)
   else:
-    runner = DeviceRunner('simpleperf')
+    runner = DeviceRunner(target)
   runner.use_callgraph = use_callgraph
   runner.sampler = sampler
   return runner
@@ -611,7 +557,7 @@
       runner.report('perf.data', 'perf.report')
       symbols = report_analyzer._read_report_file('perf.report', runner.use_callgraph)
       result = False
-      if len(symbols) == 1 and symbols[0].name == 'fake_elf[+0]':
+      if len(symbols) == 1 and symbols[0].name.find('FakeFunction()') != -1:
         result = True
     else:
       runner.report('perf.data', 'perf.report', additional_options = test.report_options)
@@ -639,20 +585,21 @@
     for use_callgraph in use_callgraph_options:
       for sampler in sampler_options:
         runner = build_runner(target, use_callgraph, sampler)
-        test_with_runner(runner, tests)
+        if runner is not None:
+          test_with_runner(runner, tests)
 
 
 def main():
-  target_options = ['host', 'target']
+  target_options = ['host64', 'host32', 'device64', 'device32']
   use_callgraph_options = [False, True]
   sampler_options = ['cpu-cycles', 'inplace-sampler']
   selected_tests = None
   i = 1
   while i < len(sys.argv):
     if sys.argv[i] == '--host':
-      use_callgraph_options = ['host']
+      target_options = ['host64', 'host32']
     elif sys.argv[i] == '--device':
-      use_callgraph_options = ['device']
+      target_options = ['device64', 'device32']
     elif sys.argv[i] == '--normal':
       use_callgraph_options = [False]
     elif sys.argv[i] == '--callgraph':
diff --git a/simpleperf/thread_tree.h b/simpleperf/thread_tree.h
index b112fa5..e6536bb 100644
--- a/simpleperf/thread_tree.h
+++ b/simpleperf/thread_tree.h
@@ -24,7 +24,6 @@
 #include <set>
 
 #include "dso.h"
-//#include "environment.h"
 
 struct Record;