// blob: 6f484b4980eeeb353389a2c714d44cd6eb928762 [file] [log] [blame]
#include <unordered_map>
#include <unordered_set>
#include <torch/csrc/profiler/perf-inl.h>
#include <torch/csrc/profiler/perf.h>
namespace torch {
namespace profiler {
namespace impl {
namespace linux_perf {
#if defined(__ANDROID__) || defined(__linux__)
/*
* PerfEvent
* ---------
*/
/*
 * Thin wrapper around the perf_event_open(2) system call.
 *
 * Every argument is forwarded untouched to syscall() together with
 * __NR_perf_event_open, and the value syscall() produces is returned as-is.
 */
inline long perf_event_open(
    struct perf_event_attr* hw_event,
    pid_t pid,
    int cpu,
    int group_fd,
    unsigned long flags) {
  const long rc =
      syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
  return rc;
}
// TODO: sync with the Kineto-level abstract events in profiler/events.h
//
// Lookup table from a profiler event name to the
// (perf event type, perf event config) pair used when opening that event.
static const std::unordered_map<
    std::string,
    std::pair<perf_type_id, /* perf event type */ uint32_t>>
    EventTable{
        {"cycles", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}},
        {"instructions", {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}},
        // Non-standard events, for testing
        {"pagefaults", {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS}},
        {"backend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND}},
        {"frontend-stall-cycles",
         {PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND}}};
/*
 * Closes the event's file descriptor when one is held (fd_ is non-negative)
 * and then overwrites fd_ with -1.
 */
PerfEvent::~PerfEvent() {
  const bool holds_fd = fd_ >= 0;
  if (holds_fd) {
    close(fd_);
  }
  // Poison the member so a stale descriptor value is never left behind.
  fd_ = -1;
}
/*
 * Opens the perf event named by name_ and stores its descriptor in fd_.
 *
 * Steps:
 *   1. name_ must be non-empty and present in EventTable; otherwise
 *      TORCH_CHECK fails.
 *   2. A perf_event_attr is filled in from the table entry and passed to
 *      perf_event_open() for the current process (getpid()) with cpu == -1,
 *      no group leader and no flags.
 *   3. A return value of -1 from perf_event_open() fails a TORCH_CHECK that
 *      reports std::strerror(errno).
 *   4. On success Reset() is called.
 */
void PerfEvent::Init() {
  TORCH_CHECK(!name_.empty(), "Invalid profiler event name");

  const auto entry = EventTable.find(name_);
  TORCH_CHECK(
      entry != EventTable.end(), "Unsupported profiler event name: ", name_);
  const auto& type_and_config = entry->second;

  struct perf_event_attr attr {};
  memset(&attr, 0, sizeof(attr));
  attr.size = sizeof(perf_event_attr);
  attr.type = type_and_config.first;
  attr.config = type_and_config.second;
  attr.disabled = 1;
  attr.inherit = 1;
  attr.exclude_kernel = 1; // TBD
  attr.exclude_hv = 1;

  /*
   * Request the enabled/running times alongside the counter value. They can
   * be used to calculate estimated totals if the PMU is overcommitted and
   * multiplexing is happening.
   */
  attr.read_format =
      PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

  const pid_t self_pid = getpid(); // this process
  const int any_cpu = -1; // all cpus
  const int no_group_fd = -1;
  const unsigned long no_flags = 0;

  fd_ = static_cast<int>(
      perf_event_open(&attr, self_pid, any_cpu, no_group_fd, no_flags));
  TORCH_CHECK(
      fd_ != -1, "perf_event_open() failed, error: ", std::strerror(errno));

  Reset();
}
uint64_t PerfEvent::ReadCounter() const {
PerfCounter counter{};
long n = read(fd_, &counter, sizeof(PerfCounter));
TORCH_CHECK(
n == sizeof(counter),
"Read failed for Perf event fd, event : ",
name_,
", error: ",
std::strerror(errno));
TORCH_CHECK(
counter.time_enabled == counter.time_running,
"Hardware performance counter time multiplexing is not handled yet",
", name: ",
name_,
", enabled: ",
counter.time_enabled,
", running: ",
counter.time_running);
return counter.value;
}
#else /* __ANDROID__ || __linux__ */
/*
* Shim implementation for unsupported platforms - ReadCounter() always returns
* a counter value of 0.
*/
// Shim destructor: nothing to release on this code path.
PerfEvent::~PerfEvent() {}
// Shim Init(): intentionally a no-op on this code path.
void PerfEvent::Init() {}
// Shim ReadCounter(): always reports a counter value of 0.
uint64_t PerfEvent::ReadCounter() const {
  return 0;
}
#endif /* __ANDROID__ || __linux__ */
/*
* PerfProfiler
* ------------
*/
void PerfProfiler::Configure(std::vector<std::string>& event_names) {
TORCH_CHECK(
event_names.size() <= MAX_EVENTS,
"Too many events to configure, configured: ",
event_names.size(),
", max allowed:",
MAX_EVENTS);
std::unordered_set<std::string> s(event_names.begin(), event_names.end());
TORCH_CHECK(
s.size() == event_names.size(), "Duplicate event names are not allowed!")
for (auto name : event_names) {
events_.emplace_back(name);
events_.back().Init();
}
// TODO
// Reset pthreadpool here to make sure we can attach to new children
// threads
}
/*
 * Begins a (possibly nested) measurement.
 *
 * If the start-value stack already holds an entry, counting is stopped first.
 * A new entry with one slot per configured event is then pushed, each slot is
 * filled with that event's current ReadCounter() value, and counting is
 * started.
 */
void PerfProfiler::Enable() {
  const bool has_outstanding_enable = !start_values_.empty();
  if (has_outstanding_enable) {
    StopCounting();
  }

  // Push a zero-filled baseline sized to the number of events.
  start_values_.emplace(events_.size(), 0);
  auto& baseline = start_values_.top();

  unsigned idx = 0;
  while (idx < events_.size()) {
    baseline[idx] = events_[idx].ReadCounter();
    ++idx;
  }

  StartCounting();
}
/*
 * Ends the most recent Enable() and reports the counter deltas.
 *
 * Counting is stopped first. vals must have exactly one slot per configured
 * event and the start-value stack must be non-empty; otherwise a TORCH_CHECK
 * fails. For every event, vals[i] receives CalcDelta(start value, current
 * ReadCounter() value). The top start-value entry is then popped, and if an
 * enclosing Enable() is still outstanding, counting is started again.
 */
void PerfProfiler::Disable(perf_counters_t& vals) {
  StopCounting();

  TORCH_CHECK(
      vals.size() == events_.size(),
      "Can not fit all perf counters in the supplied container");
  TORCH_CHECK(
      !start_values_.empty(), "PerfProfiler must be enabled before disabling");

  /* This disable is always paired with the latest enable, i.e. with whatever
   * sits on top of the start counter value stack. */
  perf_counters_t& baseline = start_values_.top();
  unsigned idx = 0;
  while (idx < events_.size()) {
    vals[idx] = CalcDelta(baseline[idx], events_[idx].ReadCounter());
    ++idx;
  }
  start_values_.pop();

  // Nothing left on the stack: no parent measurement to resume.
  if (start_values_.empty()) {
    return;
  }
  // Restore counting for the parent measurement.
  StartCounting();
}
} // namespace linux_perf
} // namespace impl
} // namespace profiler
} // namespace torch