| #include <torch/csrc/autograd/profiler.h> |
| #include <torch/csrc/autograd/function.h> |
| #include <torch/csrc/jit/frontend/code_template.h> |
| |
| #include <torch/csrc/jit/frontend/tracer.h> |
| #include <torch/csrc/jit/runtime/operator.h> |
| |
| #include <ATen/core/op_registration/op_registration.h> |
| #include <torch/library.h> |
| |
#include <cstring>
#include <fstream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
| |
| #include <ATen/record_function.h> |
| #include <c10/core/Allocator.h> |
| #include <c10/util/ThreadLocalDebugInfo.h> |
| |
| #include <iostream> |
| |
| namespace torch { namespace autograd { namespace profiler { |
| |
// Converts a TorchScript callstack into (filename, line, function name) entries,
// skipping frames that carry no source information.
std::vector<FileLineFunc> prepareCallstack(const std::vector<jit::StackEntry>& cs) {
| std::vector<FileLineFunc> entries; |
| entries.reserve(cs.size()); |
| for (const auto& entry : cs) { |
| auto& range = entry.range; |
| if (range.source()) { |
| auto& src = range.source(); |
| if (src && src->filename()) { |
| auto line = src->starting_line_no() + |
| src->lineno_for_offset(range.start()); |
| entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); |
| } |
| } |
| } |
| return entries; |
| } |
| |
// Formats callstack entries as "filename(line): funcname" strings.
std::vector<std::string> callstackStr(const std::vector<FileLineFunc>& cs) {
| std::vector<std::string> cs_str; |
| cs_str.reserve(cs.size()); |
| for (const auto& entry : cs) { |
| std::stringstream loc; |
| loc << entry.filename << "(" << entry.line << "): " << entry.funcname; |
| cs_str.push_back(loc.str()); |
| } |
| return cs_str; |
| } |
| |
| // We decompose the profiler logic into the following components: |
| // |
| // ThreadLocalDebugInfo: |
| // |
// ThreadLocalDebugInfo is a thread local mapping from slots to
// debug information structs.
| // ThreadLocalDebugInfo is automatically propagated across thread |
| // boundaries, including the cases of: |
| // - launching async jobs with at::launch |
| // - executing JIT continuations |
| // - moving from the forward threads into autograd (backward) threads |
| // |
// Entries in ThreadLocalDebugInfo are managed by DebugInfoGuard,
// which can be used to add or overwrite an entry in the thread local
| // mapping. A corresponding entry is removed when the guard is destroyed, |
| // potentially revealing the previously set value for the same slot. |
| // |
// For async tasks, slots set in the main thread before launching an async
// task are shared with and visible in the async task.
| // |
// On the other hand, any addition to or overwrite of the mapping by the
// async task is not visible to the main thread, and any modification
// (including removal of entries) made in the main thread is not visible
// to the async task if it happens after the task was launched.
| // |
| // We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store profiler config, |
| // as well as a list of events that happen during profiling. |
// An instance of ThreadLocalDebugInfo is created each time we enter the
// profiler (i.e. enter the profiling context manager / call enableProfiler)
// and uniquely identifies a profiling run.
| // |
// We automatically propagate ThreadLocalDebugInfo into async tasks,
// as well as across JIT continuations and autograd threads, so all
// the operations that happen between profiling start and end
// (not necessarily within the same thread) are recorded, unless the
// profiling slot is overwritten, as in the case of nested profiling
// ranges (in which case events for the subrange are handled by the
// nested profiler).
//
// When we exit a profiling range (either by exiting the profiling context
// manager or by calling disableProfiler), we remove the previously set
// profiling entry from the given thread local mapping and consolidate
// the recorded events into the profiling result.
| // |
| // |
| // ThreadLocalState: |
| // |
// ThreadLocalState takes a 'snapshot' of thread local variables
// using provided getters. It is used together with ThreadLocalStateGuard
// to transfer the snapshot across a thread boundary and set the thread
// local values as they were in the parent task.
//
// The profiler uses ThreadLocalState to propagate its thread local state.
// ThreadLocalState also automatically propagates the profiler callbacks.
| // |
| // |
// at::RecordFunction and observers:
| // |
// The profiler uses the observer mechanism to add a pair of thread local
// callbacks that are executed around a number of predetermined ranges,
// including:
| // - c10/ATen ops |
| // - TorchScript functions/methods |
| // - user defined named ranges (see `record_function` python context manager) |
| // |
// The profiler sets up a pair of callbacks that record profiling events and
// save them into the thread local profiler struct (ThreadLocalDebugInfo,
// PROFILER_STATE slot); see the registration sketch below.
| // |
| // |
| // Thus, the overall logic is: |
| // |
| // enableProfiler: |
| // - checks that profiler is not enabled (otherwise throws) |
| // - pushes new ThreadLocalDebugInfo (slot PROFILER_STATE) as the profiler |
| // config for the current thread |
| // - pushes profiling callbacks for the current thread |
| // |
| // disableProfiler: |
| // - pops PROFILER_STATE slot from the current ThreadLocalDebugInfo and |
| // consolidates events |
| // - removes profiling callbacks |
| // |
| // ThreadLocalState: |
| // - propagates ThreadLocalDebugInfo across threads |
| // - propagates profiler callbacks across threads |
| // |
| // Profiler callbacks: |
// - get the current profiling state (PROFILER_STATE slot in ThreadLocalDebugInfo)
| // - save profiling events into the profiling state |
| // |
| |
| namespace { |
| const CUDAStubs default_stubs; |
| constexpr const CUDAStubs* default_stubs_addr = &default_stubs; |
// Constant initialization guarantees that default_stubs_addr is initialized
// before any static initialization code that may invoke registerCUDAMethods.
| inline const CUDAStubs*& cuda_stubs() { |
| static const CUDAStubs* stubs_ = default_stubs_addr; |
| return stubs_; |
| } |
| } |
| |
| // Profiler state |
| const ProfilerConfig& ProfilerThreadLocalState::config() const { |
| return config_; |
| } |
| |
| thread_event_lists ProfilerThreadLocalState::consolidate() { |
| std::lock_guard<std::mutex> g(state_mutex_); |
| thread_event_lists result; |
| for (auto& kv : event_lists_map_) { |
| auto& list = kv.second; |
| result.emplace_back(list->consolidate()); |
| } |
| // Consolidate remote events if applicable as well. |
| if (remoteProfiledEvents_) { |
| result.insert( |
| result.end(), |
| std::make_move_iterator(remoteProfiledEvents_->begin()), |
| std::make_move_iterator(remoteProfiledEvents_->end())); |
| } |
| return result; |
| } |
| |
| void ProfilerThreadLocalState::mark(std::string name, bool include_cuda) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxMarkA(name.c_str()); |
| } else { |
| LegacyEvent evt( |
| EventKind::Mark, |
| at::StringView(std::move(name)), |
| at::RecordFunction::currentThreadId(), |
| include_cuda && config_.state == ProfilerState::CUDA); |
| evt.setNodeId(at::RecordFunction::getDefaultNodeId()); |
| getEventList().record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::setOrAddRemoteProfiledEvents( |
| std::vector<LegacyEvent>&& remoteProfiledEvents) { |
| // Lock to serialize access from multiple callback threads. |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| if (remoteProfiledEvents_) { |
(*remoteProfiledEvents_).emplace_back(std::move(remoteProfiledEvents));
| } else { |
| remoteProfiledEvents_ = {std::move(remoteProfiledEvents)}; |
| } |
| } |
| |
| void ProfilerThreadLocalState::pushRange( |
| const at::RecordFunction& fn, |
| const bool record_cuda, |
| const char* msg, |
| std::vector<std::vector<int64_t>>&& shapes) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxRangePushA(getNvtxStr( |
| fn.name(), msg, fn.seqNr(), shapes).c_str()); |
| } else { |
| LegacyEvent evt( |
| EventKind::PushRange, |
| fn.name(), |
| at::RecordFunction::currentThreadId(), |
| record_cuda, |
| fn.handle(), |
| std::move(shapes), |
| at::RecordFunction::getDefaultNodeId()); |
| evt.setSequenceNr(fn.seqNr()); |
| evt.setFwdThreadId(fn.forwardThreadId()); |
| evt.setScope((uint8_t)fn.scope()); |
| #ifndef C10_MOBILE |
// A backward node's source range corresponds to its forward node.
| // TODO: consider using C++ stack trace |
| if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { |
| auto cs = prepareCallstack(jit::currentCallstack()); |
| if (cs.empty()) { |
| cs = prepareCallstack(jit::tracer::pythonCallstack()); |
| } |
| evt.setStack(callstackStr(cs)); |
| } |
| #endif |
| getEventList().record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool record_cuda) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxRangePop(); |
| } else { |
// In some cases RecordFunction (and popRange) may be
// called on a different thread than pushRange.
// As a convention, we put the async pop on the original
// thread and save the current thread id in the pop event.
| LegacyEvent evt( |
| EventKind::PopRange, |
| at::StringView(""), |
| at::RecordFunction::currentThreadId(), |
| record_cuda, |
| fn.handle()); |
| evt.setNodeId(at::RecordFunction::getDefaultNodeId()); |
| getEventList(fn.threadId()).record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::reportMemoryUsage( |
| void* /* unused */, |
| int64_t alloc_size, |
| c10::Device device) { |
| if (config_.profile_memory && config_.state != ProfilerState::Disabled) { |
| uint64_t thread_id = at::RecordFunction::currentThreadId(); |
| LegacyEvent evt( |
| EventKind::MemoryAlloc, |
| at::StringView(""), |
| thread_id, |
| config_.state == ProfilerState::CUDA); |
| evt.updateMemoryStats(alloc_size, device); |
| getEventList(thread_id).record(std::move(evt)); |
| } |
| } |
| |
| bool ProfilerThreadLocalState::memoryProfilingEnabled() const { |
| return config_.profile_memory; |
| } |
| |
// Builds the NVTX range name, e.g. "aten::add, seq = 42, sizes = [[2, 3], []]".
std::string ProfilerThreadLocalState::getNvtxStr(
| const at::StringView& name, |
| const char* msg, |
| int64_t sequence_nr, |
| const std::vector<std::vector<int64_t>>& shapes) const { |
| if (sequence_nr >= 0 || shapes.size() > 0) { |
| std::stringstream s; |
| #ifdef __HIP_PLATFORM_HCC__ |
| s << name.str(); |
| #endif |
| if (sequence_nr >= 0) { |
| #ifdef __HIP_PLATFORM_HCC__ |
| s << msg << sequence_nr; |
| #else |
| s << name.str() << msg << sequence_nr; |
| #endif |
| } |
| if (shapes.size() > 0) { |
| s << ", sizes = ["; |
| for (size_t idx = 0; idx < shapes.size(); ++idx) { |
| if (shapes[idx].size() > 0) { |
| s << "["; |
| for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { |
| s << shapes[idx][dim]; |
| if (dim < shapes[idx].size() - 1) { |
| s << ", "; |
| } |
| } |
| s << "]"; |
| } else { |
| s << "[]"; |
| } |
| if (idx < shapes.size() - 1) { |
| s << ", "; |
| } |
| } |
| s << "]"; |
| } |
| return s.str(); |
| } else { |
| return name.str(); |
| } |
| } |
| |
// Returns the event list for the given thread id, creating it if needed;
// a negative thread_id means the current thread.
RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) {
| if (thread_id < 0) { |
| thread_id = at::RecordFunction::currentThreadId(); |
| } |
| RangeEventList* list_ptr = nullptr; |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| auto it = event_lists_map_.find(thread_id); |
| if (it != event_lists_map_.end()) { |
| list_ptr = it->second.get(); |
| } else { |
| auto event_list = std::make_shared<RangeEventList>(); |
| event_lists_map_[thread_id] = event_list; |
| list_ptr = event_list.get(); |
| } |
| return *list_ptr; |
| } |
| |
// Collects per-input tensor sizes; non-tensor and undefined tensor inputs
// produce empty size entries.
std::vector<std::vector<int64_t>> inputSizes(const at::RecordFunction& fn) {
| std::vector<std::vector<int64_t>> sizes; |
| sizes.reserve(fn.inputs().size()); |
| for (const c10::IValue& input : fn.inputs()) { |
| if (!input.isTensor()) { |
| sizes.emplace_back(); |
| continue; |
| } |
| const at::Tensor& tensor = input.toTensor(); |
| if (tensor.defined()) { |
sizes.push_back(tensor.sizes().vec());
| } else { |
| sizes.emplace_back(); |
| } |
| } |
| return sizes; |
| } |
| |
| namespace { |
| |
| enum EventIValueIdx { |
| KIND = 0, |
| NAME, |
| THREAD_ID, |
| HANDLE, |
| NODE_ID, |
| CPU_MEM_USAGE, |
| CPU_NS, |
| CUDA_RECORDED, |
| CUDA_MEM_USAGE, |
| CUDA_DEVICE, |
| CUDA_US, |
| SHAPES, |
| NUM_EVENT_IVALUE_IDX // must be last in list |
| }; |
| |
| enum ProfilerIValueIdx { |
| STATE = 0, |
| REPORT_INPUT_SHAPES, |
| PROFILE_MEMORY, |
| NUM_PROFILER_CFG_IVALUE_IDX // must be last in list |
| }; |
| |
| const std::unordered_set<std::string> disable_cuda_profiling = { |
| "aten::view", |
| "aten::t", |
| "aten::transpose", |
| "aten::stride", |
| "aten::empty", |
| "aten::empty_like", |
| "aten::empty_strided", |
| "aten::as_strided", |
| "aten::expand", |
| "aten::resize_", |
| "aten::squeeze", |
| "aten::unsqueeze", |
| "aten::slice", |
| "aten::_unsafe_view", |
| "aten::size" |
| }; |
| |
| ProfilerThreadLocalState* getProfilerTLSState() { |
| return static_cast<ProfilerThreadLocalState*>( |
| c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE)); |
| } |
| |
| void pushProfilingCallbacksLegacy() { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); |
| auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( |
| [](const at::RecordFunction& fn) { |
| auto state_ptr = getProfilerTLSState(); |
| if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { |
| return nullptr; |
| } |
| bool record_cuda = |
| state_ptr->config().state == ProfilerState::CUDA; |
| if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { |
| record_cuda = false; |
| } |
| |
| auto* msg = (fn.seqNr() >= 0) ? ", seq = " : ""; |
| if (state_ptr->config().report_input_shapes) { |
| auto sizes = inputSizes(fn); |
| state_ptr->pushRange(fn, record_cuda, msg, std::move(sizes)); |
| } else { |
| state_ptr->pushRange(fn, record_cuda, msg); |
| } |
| |
| return nullptr; |
| }, |
| [](const at::RecordFunction& fn, at::ObserverContext*) { |
| auto state_ptr = getProfilerTLSState(); |
| if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { |
| return; |
| } |
| bool record_cuda = |
| state_ptr->config().state == ProfilerState::CUDA; |
| if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { |
| record_cuda = false; |
| } |
| state_ptr->popRange(fn, record_cuda); |
| }) |
| .needsInputs(state_ptr->config().report_input_shapes) |
| .needsIds(true)); |
| state_ptr->setCallbackHandle(handle); |
| } |
| |
| const int kCUDAWarmupStart = 5; |
| |
| } // namespace |
| |
| void registerCUDAMethods(CUDAStubs* stubs) { |
| cuda_stubs() = stubs; |
| } |
| |
| at::IValue ProfilerConfig::toIValue() const { |
| c10::impl::GenericList eventIValueList(at::AnyType::get()); |
| eventIValueList.reserve(NUM_PROFILER_CFG_IVALUE_IDX); |
| eventIValueList.emplace_back(static_cast<int64_t>(state)); |
| eventIValueList.emplace_back(report_input_shapes); |
| eventIValueList.emplace_back(profile_memory); |
| return eventIValueList; |
| } |
| |
| ProfilerConfig ProfilerConfig::fromIValue( |
| const at::IValue& profilerConfigIValue) { |
| TORCH_INTERNAL_ASSERT( |
| profilerConfigIValue.isList(), |
| "Expected IValue to contain type c10::impl::GenericList"); |
| auto ivalues = profilerConfigIValue.toList(); |
| TORCH_INTERNAL_ASSERT( |
| ivalues.size() == NUM_PROFILER_CFG_IVALUE_IDX, |
| c10::str( |
| "Expected exactly ", |
| NUM_PROFILER_CFG_IVALUE_IDX, |
| " ivalues to resconstruct ProfilerConfig.")); |
| return ProfilerConfig( |
| static_cast<ProfilerState>(ivalues.get(ProfilerIValueIdx::STATE).toInt()), |
| ivalues.get(ProfilerIValueIdx::REPORT_INPUT_SHAPES).toBool(), |
| ivalues.get(ProfilerIValueIdx::PROFILE_MEMORY).toBool()); |
| } |
| |
| ProfilerConfig getProfilerConfig() { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK( |
| state_ptr, |
| "Tried to access profiler config, but profiler is not enabled!"); |
| return state_ptr->config(); |
| } |
| |
| bool profilerEnabled() { |
| auto state_ptr = getProfilerTLSState(); |
| return state_ptr && state_ptr->config().state != ProfilerState::Disabled; |
| } |
| |
| void enableProfilerLegacy(const ProfilerConfig& new_config) { |
| TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs()->enabled(), |
| "Can't use NVTX profiler - PyTorch was compiled without CUDA"); |
| |
TORCH_CHECK(
new_config.state != ProfilerState::KINETO,
"ProfilerState::KINETO is not supported by the legacy profiler");
| |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); |
| auto state = std::make_shared<ProfilerThreadLocalState>(new_config); |
| c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); |
| |
| pushProfilingCallbacksLegacy(); |
| |
| if (new_config.state == ProfilerState::CUDA) { |
// event recording appears to have some startup overhead, so we generate
// some dummy events first before recording synchronization events
| for (int idx = 0; idx < kCUDAWarmupStart; ++idx) { |
| cuda_stubs()->onEachDevice([state](int /* unused */) { |
| state->mark("__cuda_startup"); |
| cuda_stubs()->synchronize(); |
| }); |
| } |
| |
// CUDA events must be on the same device, so we need a start event recorded
// for each GPU. We then use this event to synchronize time on the GPU
// with the CPU clock.
cuda_stubs()->onEachDevice([state](int /* unused */) {
| state->mark("__cuda_start_event"); |
| }); |
| } |
| state->mark("__start_profile", false); |
| } |
| |
| thread_event_lists disableProfilerLegacy(c10::optional<ProfilerDisableOptions> profilerDisableOptions) { |
| auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; |
| auto consolidate = profilerDisableOptions ? profilerDisableOptions->consolidate : true; |
// all DebugInfoBase objects are scope-based and are supposed to be managed via DebugInfoGuard
| std::shared_ptr<c10::DebugInfoBase> state; |
| if (cleanupTLSState) { |
| state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); |
| } else { |
| state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); |
| } |
| |
| auto state_ptr = static_cast<ProfilerThreadLocalState*>(state.get()); |
| TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, |
| "Can't disable profiler when it's not running"); |
| |
| if (cleanupTLSState) { |
| at::removeCallback(state_ptr->callbackHandle()); |
| } |
| |
| if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { |
| return thread_event_lists(); |
| } |
| |
| state_ptr->mark("__stop_profile"); |
| // Note that this will erase the underlying events. |
| return state_ptr->consolidate(); |
| } |
| |
| void addEventList(std::vector<LegacyEvent>&& profiledEvents) { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK(state_ptr, "Profiler must be enabled."); |
| state_ptr->setOrAddRemoteProfiledEvents(std::move(profiledEvents)); |
| } |
| |
| void LegacyEvent::record(bool record_cuda) { |
| if (record_cuda) { |
| cuda_stubs()->record(&device_, &cuda_event, &cpu_ns_); |
| return; |
| } |
| cpu_ns_ = getTime(); |
| } |
| |
| /* static */ LegacyEvent LegacyEvent::fromIValue(const at::IValue& eventIValue) { |
| TORCH_INTERNAL_ASSERT( |
| eventIValue.isList(), |
| "Expected IValue to contain type c10::impl::GenericList"); |
| auto ivalues = eventIValue.toList(); |
| TORCH_INTERNAL_ASSERT( |
| ivalues.size() >= NUM_EVENT_IVALUE_IDX, |
| "Expected at least ", |
| NUM_EVENT_IVALUE_IDX, |
| " elements to reconstruct LegacyEvent."); |
| |
| // Reconstruct input shapes from ivalues. |
| auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); |
| TORCH_INTERNAL_ASSERT( |
| shapeListIValue.isList(), |
| "Expected profiler shapes IValue to contain type c10::impl::GenericList." |
| ); |
| |
| auto shapeList = shapeListIValue.toList(); |
| std::vector<std::vector<int64_t>> shapes; |
| shapes.reserve(shapeList.size()); |
for (size_t i = 0; i < shapeList.size(); ++i) {
std::vector<int64_t> s;
auto shapeIValue = shapeList.get(i);
TORCH_INTERNAL_ASSERT(
shapeIValue.isList(),
"Expected each profiler shape element to contain shapes of type c10::impl::GenericList.");
| auto curShapesList = shapeIValue.toList(); |
| s.reserve(curShapesList.size()); |
| for (size_t j = 0; j < curShapesList.size(); ++j) { |
| s.emplace_back(curShapesList.get(j).toInt()); |
| } |
| shapes.emplace_back(s); |
| } |
| |
| LegacyEvent evt( |
| static_cast<EventKind>( |
| ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind |
| at::StringView(ivalues.get(EventIValueIdx::NAME).toStringRef()), // name |
| ivalues.get(EventIValueIdx::THREAD_ID).toInt(), // thread_id |
| static_cast<at::RecordFunctionHandle>( |
| ivalues.get(EventIValueIdx::HANDLE).toDouble()), // handle |
| std::move(shapes), // input shapes |
| ivalues.get(EventIValueIdx::NODE_ID).toInt(), // node id |
| true, // is remote |
| ivalues.get(EventIValueIdx::CPU_MEM_USAGE).toInt(), // cpu_mem_usage |
| ivalues.get(EventIValueIdx::CPU_NS).toInt(), // cpu_ns |
| ivalues.get(EventIValueIdx::CUDA_RECORDED).toBool(), // was cuda recorded |
| ivalues.get(EventIValueIdx::CUDA_MEM_USAGE).toInt(), // cuda memory usage |
| ivalues.get(EventIValueIdx::CUDA_DEVICE).toInt(), // device |
| ivalues.get(EventIValueIdx::CUDA_US).toInt() // cuda_us |
| ); |
| return evt; |
| } |
| |
| at::IValue LegacyEvent::toIValue() const { |
| c10::impl::GenericList eventIValueList(at::AnyType::get()); |
| eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); |
| eventIValueList.emplace_back(static_cast<int64_t>(kind_)); |
| eventIValueList.emplace_back(std::string(name_.str())); |
| eventIValueList.emplace_back(static_cast<int64_t>(thread_id_)); |
| eventIValueList.emplace_back(static_cast<double>(handle_)); |
| eventIValueList.emplace_back(node_id_); |
| eventIValueList.emplace_back(cpu_memory_usage_); |
| eventIValueList.emplace_back(cpu_ns_); |
| // CUDA event information |
| bool cuda_profiling_enabled = hasCuda(); |
| eventIValueList.emplace_back(cuda_profiling_enabled); |
| eventIValueList.emplace_back(static_cast<int64_t>(cuda_memory_usage_)); |
| eventIValueList.emplace_back(device_); |
| eventIValueList.emplace_back(cuda_us_); |
| // Shapes |
| c10::impl::GenericList shapesList = |
| c10::impl::GenericList(at::ListType::create(at::IntType::get())); |
| shapesList.reserve(shapes_.size()); |
| for (const auto& shape : shapes_) { |
| c10::impl::GenericList s = c10::impl::GenericList(at::IntType::get()); |
| s.reserve(shape.size()); |
| for (const auto& k : shape) { |
| s.emplace_back(k); |
| } |
| shapesList.emplace_back(s); |
| } |
| eventIValueList.emplace_back(shapesList); |
| return at::IValue(eventIValueList); |
| } |
| |
| double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { |
| TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); |
| TORCH_CHECK( |
| e.device() == device(), |
| c10::str( |
| "Events are not on the same device: ", e.device(), " vs ", device())); |
| if (isRemote() && e.isRemote()) { |
| // validate that cuda_us_ has been set properly. |
| TORCH_INTERNAL_ASSERT(cuda_us_ >= 0 && e.cuda_us_ >= 0); |
| return static_cast<double>(e.cuda_us_ - cuda_us_); |
| } |
| return cuda_stubs()->elapsed(&cuda_event, &e.cuda_event); |
| } |
| |
| CUDAStubs::~CUDAStubs() = default; |
| |
// Chrome trace "complete event" (ph = "X") template used for each recorded range.
static const jit::CodeTemplate event_template(R"(
| { |
| "name": "${name}", |
| "ph": "X", |
| "ts": ${ts}, |
| "dur": ${dur}, |
| "tid": ${tid}, |
| "pid": "CPU Functions", |
| "args": {} |
| })"); |
| |
// Writes matched push/pop event pairs as a JSON array of Chrome trace events.
void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events) {
| TORCH_CHECK(out, "Could not open file"); |
| LegacyEvent* profiler_start = nullptr; |
| for (LegacyEvent* e : events) { |
| if (0 == strcmp(e->name(), "__start_profile")) { |
| profiler_start = e; |
| break; |
| } |
| } |
| TORCH_CHECK(profiler_start, "Could not find __start_profile mark"); |
| |
| struct PairHash { |
size_t operator()(std::pair<at::RecordFunctionHandle, int64_t> p) const
| noexcept { |
| return std::hash<at::RecordFunctionHandle>()(p.first) ^ std::hash<int64_t>()(p.second); |
| } |
| }; |
| std::unordered_map<std::pair<at::RecordFunctionHandle, int64_t>, LegacyEvent*, PairHash> events_map; |
| out << "[\n"; |
| bool first = true; |
| for (LegacyEvent* evt : events) { |
| if (evt->kindStr() == "push") { |
| events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; |
| } else if (evt->kindStr() == "pop") { |
| if (!first) { |
| out << ",\n"; |
| } |
| first = false; |
| auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); |
| TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); |
| LegacyEvent* evt_start = it->second; |
| events_map.erase(it); |
| |
| jit::TemplateEnv env; |
| env.s("name", evt_start->name()); |
| env.d("ts", profiler_start->cpuElapsedUs(*evt_start)); |
| env.d("dur", evt_start->cpuElapsedUs(*evt)); |
| env.d("tid", evt_start->threadId()); |
| out << event_template.format(env); |
| } |
| } |
| out << "]\n"; |
| } |
| |
| RecordProfile::RecordProfile(std::ostream& out) |
| : out_(out) { |
| init(); |
| } |
| |
| RecordProfile::RecordProfile(const std::string& filename) |
| : file_(new std::ofstream(filename)), out_(*file_) { |
| init(); |
| } |
| |
| void RecordProfile::init() { |
| enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU)); |
| } |
| |
| RecordProfile::~RecordProfile() { |
| try { |
| thread_event_lists event_lists = disableProfilerLegacy(); |
| std::vector<LegacyEvent*> events; |
| for (auto& l : event_lists) { |
| for (auto& e : l) { |
| events.push_back(&e); |
| } |
| } |
| processEvents(events); |
| } catch (const std::exception& e) { |
| LOG(ERROR) << e.what() << std::endl; |
| } catch (...) { |
| LOG(ERROR) << "Unknown error" << std::endl; |
| } |
| } |
| |
| void RecordProfile::processEvents(const std::vector<LegacyEvent*>& events) { |
| writeProfilerEventsToStream(out_, events); |
| } |
| |
| }}} |