|  | #include <unordered_map> | 
|  | #include <unordered_set> | 
|  |  | 
|  | #include <torch/csrc/profiler/perf-inl.h> | 
|  | #include <torch/csrc/profiler/perf.h> | 
|  |  | 
|  | namespace torch { | 
|  | namespace profiler { | 
|  | namespace impl { | 
|  |  | 
|  | namespace linux_perf { | 
|  |  | 
|  | #if defined(__ANDROID__) || defined(__linux__) | 
|  |  | 
|  | /* | 
|  | * PerfEvent | 
|  | * --------- | 
|  | */ | 
|  |  | 
|  | /* | 
|  | * Syscall wrapper for perf_event_open(2) | 
|  | */ | 
|  | inline long perf_event_open( | 
|  | struct perf_event_attr* hw_event, | 
|  | pid_t pid, | 
|  | int cpu, | 
|  | int group_fd, | 
|  | unsigned long flags) { | 
|  | return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); | 
|  | } | 
|  |  | 
|  | // TODO sync with Kineto level abstract events in profiler/events.h | 
|  | static const std::unordered_map< | 
|  | std::string, | 
|  | std::pair<perf_type_id, /* perf event type */ uint32_t>> | 
|  | EventTable{ | 
|  | {"cycles", | 
|  | std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)}, | 
|  | {"instructions", | 
|  | std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)}, | 
|  |  | 
|  | // Non Standard events for testing | 
|  | {"pagefaults", | 
|  | std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)}, | 
|  | {"backend-stall-cycles", | 
|  | std::make_pair( | 
|  | PERF_TYPE_HARDWARE, | 
|  | PERF_COUNT_HW_STALLED_CYCLES_BACKEND)}, | 
|  | {"frontend-stall-cycles", | 
|  | std::make_pair( | 
|  | PERF_TYPE_HARDWARE, | 
|  | PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}}; | 
|  |  | 
|  | PerfEvent::~PerfEvent() { | 
|  | if (fd_ > -1) { | 
|  | close(fd_); | 
|  | } | 
|  | fd_ = -1; // poison | 
|  | } | 
|  |  | 
|  | void PerfEvent::Init() { | 
|  | TORCH_CHECK(!name_.empty(), "Invalid profiler event name"); | 
|  |  | 
|  | auto const it = EventTable.find(name_); | 
|  | if (it == EventTable.end()) { | 
|  | TORCH_CHECK(false, "Unsupported profiler event name: ", name_); | 
|  | } | 
|  |  | 
|  | struct perf_event_attr attr {}; | 
|  | memset(&attr, 0, sizeof(attr)); | 
|  |  | 
|  | attr.size = sizeof(perf_event_attr); | 
|  | attr.type = it->second.first; | 
|  | attr.config = it->second.second; | 
|  | attr.disabled = 1; | 
|  | attr.inherit = 1; | 
|  | attr.exclude_kernel = 1; // TBD | 
|  | attr.exclude_hv = 1; | 
|  | /* | 
|  | * These can be used to calculate estimated totals if the PMU is overcommitted | 
|  | * and multiplexing is happening | 
|  | */ | 
|  | attr.read_format = | 
|  | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; | 
|  |  | 
|  | pid_t pid = getpid(); // this pid | 
|  | int cpu = -1; // all cpus | 
|  | int group_fd = -1; | 
|  | unsigned long flags = 0; | 
|  |  | 
|  | fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags)); | 
|  | if (fd_ == -1) { | 
|  | TORCH_CHECK( | 
|  | false, "perf_event_open() failed, error: ", std::strerror(errno)); | 
|  | } | 
|  | Reset(); | 
|  | } | 
|  |  | 
|  | uint64_t PerfEvent::ReadCounter() const { | 
|  | PerfCounter counter{}; | 
|  | long n = read(fd_, &counter, sizeof(PerfCounter)); | 
|  | TORCH_CHECK( | 
|  | n == sizeof(counter), | 
|  | "Read failed for Perf event fd, event : ", | 
|  | name_, | 
|  | ", error: ", | 
|  | std::strerror(errno)); | 
|  | TORCH_CHECK( | 
|  | counter.time_enabled == counter.time_running, | 
|  | "Hardware performance counter time multiplexing is not handled yet", | 
|  | ", name: ", | 
|  | name_, | 
|  | ", enabled: ", | 
|  | counter.time_enabled, | 
|  | ", running: ", | 
|  | counter.time_running); | 
|  | return counter.value; | 
|  | } | 
|  |  | 
|  | #else /* __ANDROID__ || __linux__ */ | 
|  | /* | 
|  | * Shim class for unsupported platforms - this will always return 0 counter | 
|  | * value | 
|  | */ | 
|  |  | 
|  | PerfEvent::~PerfEvent(){}; | 
|  |  | 
|  | void PerfEvent::Init(){}; | 
|  |  | 
|  | uint64_t PerfEvent::ReadCounter() const { | 
|  | return 0; | 
|  | }; | 
|  |  | 
|  | #endif /* __ANDROID__ || __linux__ */ | 
|  |  | 
|  | /* | 
|  | * PerfProfiler | 
|  | * ------------ | 
|  | */ | 
|  |  | 
|  | void PerfProfiler::Configure(std::vector<std::string>& event_names) { | 
|  | TORCH_CHECK( | 
|  | event_names.size() <= MAX_EVENTS, | 
|  | "Too many events to configure, configured: ", | 
|  | event_names.size(), | 
|  | ", max allowed:", | 
|  | MAX_EVENTS); | 
|  | std::unordered_set<std::string> s(event_names.begin(), event_names.end()); | 
|  | TORCH_CHECK( | 
|  | s.size() == event_names.size(), "Duplicate event names are not allowed!") | 
|  | for (auto name : event_names) { | 
|  | events_.emplace_back(name); | 
|  | events_.back().Init(); | 
|  | } | 
|  |  | 
|  | // TODO | 
|  | // Reset pthreadpool here to make sure we can attach to new children | 
|  | // threads | 
|  | } | 
|  |  | 
|  | void PerfProfiler::Enable() { | 
|  | if (!start_values_.empty()) { | 
|  | StopCounting(); | 
|  | } | 
|  |  | 
|  | start_values_.emplace(events_.size(), 0); | 
|  |  | 
|  | auto& sv = start_values_.top(); | 
|  | for (unsigned i = 0; i < events_.size(); ++i) { | 
|  | sv[i] = events_[i].ReadCounter(); | 
|  | } | 
|  | StartCounting(); | 
|  | } | 
|  |  | 
|  | void PerfProfiler::Disable(perf_counters_t& vals) { | 
|  | StopCounting(); | 
|  | TORCH_CHECK( | 
|  | vals.size() == events_.size(), | 
|  | "Can not fit all perf counters in the supplied container"); | 
|  | TORCH_CHECK( | 
|  | !start_values_.empty(), "PerfProfiler must be enabled before disabling"); | 
|  |  | 
|  | /* Always connecting this disable event to the last enable event i.e. using | 
|  | * whatever is on the top of the start counter value stack. */ | 
|  | perf_counters_t& sv = start_values_.top(); | 
|  | for (unsigned i = 0; i < events_.size(); ++i) { | 
|  | vals[i] = CalcDelta(sv[i], events_[i].ReadCounter()); | 
|  | } | 
|  | start_values_.pop(); | 
|  |  | 
|  | // Restore it for a parent | 
|  | if (!start_values_.empty()) { | 
|  | StartCounting(); | 
|  | } | 
|  | } | 
|  | } // namespace linux_perf | 
|  | } // namespace impl | 
|  | } // namespace profiler | 
|  | } // namespace torch |