| #define TORCH_ASSERT_ONLY_METHOD_OPERATORS |
| #include <torch/csrc/autograd/profiler_kineto.h> |
| |
| #include <c10/macros/Export.h> |
| #include <c10/util/flat_hash_map.h> |
| #include <c10/util/irange.h> |
| |
| #include <torch/csrc/jit/frontend/tracer.h> |
| #include <torch/csrc/jit/runtime/interpreter.h> |
| #include <torch/csrc/jit/runtime/operator.h> |
| #include <torch/csrc/profiler/api.h> |
| #include <torch/csrc/profiler/kineto_shim.h> |
| #include <torch/csrc/profiler/nvtx_observer.h> |
| |
| #include <ATen/Context.h> |
| |
| #include <deque> |
| #include <limits> |
| #include <sstream> |
| #include <stdexcept> |
| |
| #ifdef USE_KINETO |
| #include <libkineto.h> |
| #include <time_since_epoch.h> |
| |
| #ifndef _MSC_VER |
| // TODO: To be removed once this properly works from libkineto. |
| // Literal copy-n-paste from third_party/kineto/libkineto/src/WeakSymbols.cpp |
| extern "C" { |
| // This function is needed to avoid a superfluous dependency on the GNU OpenMP |
| // library when cuPTI is linked statically. For more details, see |
| // https://github.com/pytorch/pytorch/issues/51026 |
| __attribute__((weak)) int acc_get_device_type() { |
| throw std::runtime_error( |
| "Dummy implementation of acc_get_device_type is not supposed to be called!"); |
| } |
| } // extern "C" |
| #endif // _MSC_VER |
| #endif // USE_KINETO |
| |
| namespace torch { |
| namespace autograd { |
| namespace profiler { |
| |
| namespace { |
| const std::string kMemoryEventName = "[memory]"; |
| // TODO: consider TLS (tid + tls counter) |
| uint64_t next_correlation_id() { |
| static std::atomic<uint64_t> corr_id_{1}; |
| return corr_id_++; |
| } |
| |
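| // Wall-clock time in microseconds. When Kineto is available we use its |
| // timeSinceEpoch helper so that timestamps are consistent with the events |
| // Kineto records; otherwise we fall back to the profiler's nanosecond clock |
| // and convert to microseconds. |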
| inline int64_t getTimeUs() { |
| #ifdef USE_KINETO |
| return libkineto::timeSinceEpoch(std::chrono::system_clock::now()); |
| #else |
| return torch::profiler::impl::getTime() / 1000; |
| #endif // USE_KINETO |
| } |
| } // namespace |
| |
| namespace python_tracer { |
| namespace { |
| CallFn call_fn; |
| TraceEventsFn get_events_fn; |
| } // namespace |
| |
| void registerFunctions(CallFn call, TraceEventsFn get_events) { |
| call_fn = call; |
| get_events_fn = get_events; |
| } |
| |
| void call(Command c) { |
| if (call_fn != nullptr) { |
| call_fn(c); |
| } |
| } |
| |
| std::vector<std::unique_ptr<PyTraceEvent>> get_events() { |
| return get_events_fn != nullptr |
| ? get_events_fn() |
| : std::vector<std::unique_ptr<PyTraceEvent>>(); |
| } |
| |
| // We do not want `getTimeUs` to be directly visible, but we need a way for |
| // the python tracer to use the same timing convention as the profiler. |
| int64_t now() { |
| return getTimeUs(); |
| } |
| |
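| // A Replay entry represents either the entry into or the exit from a traced |
| // Python frame. Each PyTraceEvent contributes two Replay entries (enter and |
| // exit); sorting them by call/return index lets us reconstruct the Python |
| // call stack at any point in time. |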
| struct Replay { |
| PyTraceEvent* frame_; |
| bool enter_; |
| |
| C10_NODISCARD int64_t t() const { |
| return enter_ ? frame_->startTime_ : frame_->endTime_; |
| } |
| |
| C10_NODISCARD size_t idx() const { |
| return enter_ ? frame_->call_idx_ : frame_->return_idx_; |
| } |
| |
| bool operator<(const Replay& other) const { |
| return idx() < other.idx(); |
| } |
| }; |
| |
| void _push_reverse_order(PyTraceEvent* e, std::vector<std::string>& names) { |
| if (e != nullptr) { |
| _push_reverse_order(e->parent_, names); |
| names.push_back(e->name_); |
| } |
| } |
| } // namespace python_tracer |
| |
| namespace { |
| using torch::profiler::impl::ProfilerThreadLocalStateBase; |
| using torch::profiler::impl::ActiveProfilerType; |
| |
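| // Raw record captured by the profiling callbacks for a single op. These are |
| // buffered in KinetoThreadLocalState::op_events_ and converted into |
| // KinetoEvents and Kineto CPU activities in materializeOpEvents(). |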
| // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) |
| struct OpEventData { |
| // POD members |
| int64_t start_us_; |
| int64_t end_us_; |
| uint64_t correlation_id_; |
| uint64_t start_thread_id_; |
| uint64_t end_thread_id_; |
| int64_t sequence_number_; |
| uint64_t forward_thread_id_; |
| uint8_t record_function_scope_; |
| bool is_async_; |
| int64_t debug_handle_; |
| torch::profiler::impl::kineto::DeviceAndResource kineto_info_; |
| |
| std::string name_; |
| |
| // report_input_shapes |
| std::vector<std::vector<int64_t>> shapes_; |
| std::vector<std::string> dtypes_; |
| |
| // with_stack |
| std::vector<std::string> stack_; |
| |
| // with_modules |
| c10::optional<std::vector<std::string>> module_hierarchy_; |
| |
| // with_flops |
| std::unordered_map<std::string, c10::IValue> extra_args_; |
| |
| // reportBackendEventToActiveKinetoProfiler |
| c10::optional<std::string> backend_; |
| |
| // ProfilerState::KINETO_GPU_FALLBACK |
| torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr; |
| torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr; |
| }; |
| |
| // Assumption: the total number of threads will not exceed 2^16 - 1, and the |
| // total number of ops will not exceed 2^48 - 1. |
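| // For example, tid = 0x12 and seqNr = 0x345 produce the key |
| // 0x0012000000000345 (tid in the upper 16 bits, seqNr in the lower 48). |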
| static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { |
| return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1))); |
| } |
| |
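| // Carries a pointer to the OpEventData for an in-flight op between the |
| // enter and exit RecordFunction callbacks. |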
| struct KinetoObserverContext : public at::ObserverContext { |
| explicit KinetoObserverContext(OpEventData* data) : data_(data) {} |
| OpEventData* data_; |
| }; |
| |
| struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { |
| explicit KinetoThreadLocalState( |
| const ProfilerConfig& config, |
| std::set<torch::profiler::impl::ActivityType> activities) |
| : ProfilerThreadLocalStateBase(config), |
| start_time_(getTimeUs()), |
| activities_(std::move(activities)), |
| cpu_trace_(start_time_, "PyTorch Profiler") {} |
| ~KinetoThreadLocalState() override = default; |
| |
| static KinetoThreadLocalState* getTLS() { |
| auto tls = ProfilerThreadLocalStateBase::getTLS(); |
| TORCH_INTERNAL_ASSERT_DEBUG_ONLY( |
| tls == nullptr || tls->profilerType() == ActiveProfilerType::KINETO); |
| return static_cast<KinetoThreadLocalState*>(tls); |
| } |
| |
| ActiveProfilerType profilerType() override { |
| return ActiveProfilerType::KINETO; |
| } |
| |
| bool tracePython() { |
| return config().with_stack && activities_.count(ActivityType::CPU); |
| } |
| |
| std::unique_ptr<KinetoObserverContext> newOpEvent() { |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| op_events_.emplace_back(); |
| return std::make_unique<KinetoObserverContext>(&op_events_.back()); |
| } |
| |
| void reportMemoryUsage( |
| void* ptr, |
| int64_t alloc_size, |
| int64_t total_allocated, |
| int64_t total_reserved, |
| c10::Device device) override { |
| if (config_.profile_memory && config_.state != ProfilerState::Disabled) { |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| auto start_time = getTimeUs(); |
| if (cpu_trace_) { |
| torch::profiler::impl::kineto::recordThreadInfo(); |
| cpu_trace_.addMemoryUsageActivity( |
| kMemoryEventName, |
| torch::profiler::impl::kineto::kineto_ids(), |
| start_time, |
| device, |
| ptr, |
| alloc_size, |
| total_allocated, |
| total_reserved); |
| } |
| |
| kineto_events_.emplace_back(); |
| auto& evt = kineto_events_.back(); |
| evt.name(kMemoryEventName) |
| .startUs(start_time) |
| .deviceIndex(device.index()) |
| .deviceType(device.type()) |
| .nBytes(alloc_size) |
| .startThreadId(at::RecordFunction::currentThreadId()); |
| } |
| } |
| |
| const std::function<void(std::vector<KinetoEvent>&)>& |
| getEventPostProcessingCallback() const { |
| return event_post_process_cb_; |
| } |
| |
| void setEventPostProcessingCallback( |
| std::function<void(std::vector<KinetoEvent>&)>&& cb) { |
| event_post_process_cb_ = std::move(cb); |
| } |
| |
| torch::profiler::impl::kineto::ActivityTraceWrapper finalizeTrace() { |
| auto end_time = getTimeUs(); |
| materializeOpEvents(); |
| |
| // Call the event post-processing callback, if one is set, before finalizing |
| // the trace. |
| if (getEventPostProcessingCallback()) { |
| getEventPostProcessingCallback()(kineto_events_); |
| } |
| |
| finalizeCPUTrace(cpu_trace_.get()); |
| { |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| cpu_trace_.transferCpuTrace(end_time); |
| } |
| |
| auto trace = torch::profiler::impl::kineto::stopTrace(); |
| TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable); |
| addTraceEvents(trace); |
| return trace; |
| } |
| |
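| // Converts the buffered OpEventData records into Kineto CPU activities and |
| // KinetoEvents. Ops whose end callback never ran (end_us_ still at its |
| // sentinel minimum) are dropped. |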
| void materializeOpEvents() { |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| for (const auto& e : op_events_) { |
| if (e.end_us_ < e.start_us_) { |
| // We initialize end_us_ to the smallest int64_t, so this means that |
| // the op did not finish before we stopped profiling. |
| continue; |
| } |
| |
| cpu_trace_.addCPUActivity( |
| e.name_, |
| e.kineto_info_, |
| e.correlation_id_, |
| e.start_us_, |
| e.end_us_); |
| |
| kineto_events_.emplace_back(); |
| kineto_events_.back() |
| .name(e.name_) |
| .startUs(e.start_us_) |
| .durationUs(e.end_us_ - e.start_us_) |
| .correlationId(e.correlation_id_) |
| .deviceType(c10::DeviceType::CPU) |
| .startThreadId(e.start_thread_id_) |
| .endThreadId(e.end_thread_id_) |
| .sequenceNr(e.sequence_number_) |
| .fwdThreadId(e.forward_thread_id_) |
| .scope(e.record_function_scope_) |
| .setAsync(e.is_async_) |
| .debugHandle(e.debug_handle_); |
| |
| if (!e.shapes_.empty()) { |
| kineto_events_.back().shapes(e.shapes_); |
| } |
| |
| if (!e.dtypes_.empty()) { |
| kineto_events_.back().dtypes(e.dtypes_); |
| } |
| |
| if (!e.stack_.empty()) { |
| kineto_events_.back().stack(e.stack_); |
| } |
| |
| if (e.module_hierarchy_) { |
| kineto_events_.back().moduleHierarchy(*e.module_hierarchy_); |
| } |
| |
| if (!e.extra_args_.empty()) { |
| kineto_events_.back().flops( |
| computeFlops(std::string(e.name_), e.extra_args_)); |
| } |
| if (e.backend_) { |
| kineto_events_.back().backend(*e.backend_); |
| } |
| kineto_events_.back().cuda_event_start_ = e.cuda_event_start_; |
| kineto_events_.back().cuda_event_end_ = e.cuda_event_end_; |
| } |
| op_events_.clear(); |
| } |
| |
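| // Attaches per-op metadata (input shapes, call stacks, module hierarchy, |
| // dtypes, backend) to the corresponding Kineto activities, generates |
| // forward/backward flow links, and merges in Python tracer events. |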
| void finalizeCPUTrace(std::unique_ptr<torch::profiler::impl::kineto::trace_t>& cpu_trace) { |
| #ifndef USE_KINETO |
| } |
| #else // USE_KINETO |
| TORCH_INTERNAL_ASSERT( |
| cpu_trace->activities.size() == kineto_events_.size()); |
| // Maps a (startThreadId, seqNum) pair to a pointer to the corresponding |
| // activity. The low 16 bits of startThreadId and the low 48 bits of seqNum |
| // are concatenated into a single uint64_t key. |
| std::unordered_map<uint64_t, libkineto::GenericTraceActivity*> |
| tidSeq2activity; |
| uint64_t fwd_bwd_link_id = 1; |
| |
| for (const auto idx : c10::irange(cpu_trace->activities.size())) { |
| auto& kineto_event = kineto_events_[idx]; |
| auto& activity = cpu_trace->activities[idx]; |
| |
| if (kineto_event.hasShapes()) { |
| activity.addMetadata("Input Dims", torch::profiler::impl::shapesToStr(kineto_event.shapes())); |
| } |
| if (kineto_event.hasStack()) { |
| // NB: This is only for the JIT stack. The python stack (if applicable) |
| // is constructed later. |
| activity.addMetadata( |
| "Call stack", torch::profiler::impl::stacksToStr(kineto_event.stack(), ";")); |
| } |
| if (kineto_event.hasModuleHierarchy()) { |
| activity.addMetadata( |
| "Module Hierarchy", |
| torch::profiler::impl::stacksToStr(kineto_event.moduleHierarchy(), ".")); |
| } |
| if (kineto_event.hasTypes()) { |
| activity.addMetadata("Input type", torch::profiler::impl::dtypesToStr(kineto_event.dtypes())); |
| } |
| if (!kineto_event.backend().empty()) { |
| activity.addMetadata("Backend", "\"" + kineto_event.backend() + "\""); |
| } |
| |
| // Add information about an associated forward op if a sequence number is |
| // available (e.g. during training). |
| if (kineto_event.sequenceNr() >= 0) { |
| activity.addMetadata( |
| "Fwd thread id", std::to_string(kineto_event.fwdThreadId())); |
| activity.addMetadata( |
| "Sequence number", std::to_string(kineto_event.sequenceNr())); |
| generateForwardBackwardLink( |
| kineto_event, fwd_bwd_link_id, activity, tidSeq2activity); |
| } |
| } |
| |
| addPythonEvents(cpu_trace); |
| } |
| |
| void addPythonEvents(std::unique_ptr<torch::profiler::impl::kineto::trace_t>& cpu_trace) { |
| if (!tracePython()) { |
| return; |
| } |
| |
| auto py_events = python_tracer::get_events(); |
| for (const auto& e : py_events) { |
| TORCH_INTERNAL_ASSERT( |
| !e->thread_id_, |
| "Profiler expects only single threaded Python tracing."); |
| } |
| |
| // The remainder of this function merges the Python and Kineto event |
| // streams into a single stream. If Python tracing is not enabled, we want |
| // to avoid this process altogether to cut down on processing time. |
| if (!py_events.size()) { |
| return; |
| } |
| |
| // Kineto event times |
| std::vector<int64_t> op_start_times; |
| for (const auto& a : cpu_trace->activities) { |
| op_start_times.push_back(a.startTime); |
| } |
| std::sort(op_start_times.begin(), op_start_times.end()); |
| |
| // Map PyTraceEvent* to sequential integers for JSON export. |
| ska::flat_hash_map<python_tracer::PyTraceEvent*, std::string> py_event_indices_{ |
| {nullptr, std::string("null")}}; |
| for (size_t i = 0; i < py_events.size(); i++) { |
| py_event_indices_.insert({py_events[i].get(), std::to_string(i)}); |
| } |
| |
| ska::flat_hash_map<std::string, size_t> module_counter_; |
| ska::flat_hash_map<size_t, std::string> module_id_map_; |
| auto record_module_id = [&](python_tracer::PyTraceEvent* e) { |
| if (e->call_type_ == python_tracer::CallType::kPyModuleCall && |
| module_id_map_.find(e->module_id_) == module_id_map_.end()) { |
| // We use the fact that operator[] will default initialize new keys. |
| module_id_map_[e->module_id_] = |
| std::to_string(module_counter_[e->name_]++); |
| } |
| }; |
| |
| // Python events |
| std::vector<python_tracer::Replay> py_replay; |
| for (const auto& e : py_events) { |
| py_replay.push_back({e.get(), true}); |
| py_replay.push_back({e.get(), false}); |
| } |
| std::sort(py_replay.begin(), py_replay.end()); |
| |
| // To determine the state of the Python interpreter when a particular op is |
| // called, we replay the Python events and record, for each op start time, |
| // the Python frame that is on top of the stack at that moment. |
| std::vector<python_tracer::PyTraceEvent*> py_stack; |
| ska::flat_hash_map<int64_t, python_tracer::PyTraceEvent*> op_py_map; |
| auto replay_it = py_replay.begin(); |
| for (auto t : op_start_times) { |
| while (replay_it != py_replay.end() && replay_it->t() <= t) { |
| if (replay_it->enter_) { |
| py_stack.push_back(replay_it->frame_); |
| record_module_id(replay_it->frame_); |
| } else { |
| TORCH_INTERNAL_ASSERT(py_stack.size()); |
| TORCH_INTERNAL_ASSERT(py_stack.back() == replay_it->frame_); |
| py_stack.pop_back(); |
| } |
| replay_it++; |
| } |
| op_py_map.insert({t, py_stack.size() ? py_stack.back() : nullptr}); |
| } |
| |
| auto activities = std::move(cpu_trace->activities); |
| auto py_events_it = py_events.begin(); |
| auto py_device = libkineto::processId(); |
| auto main_thread = libkineto::systemThreadId(); |
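| // Python events are attributed to the process id and the main thread's |
| // system thread id, so they are all placed on the same device/resource in |
| // the exported trace (only single-threaded Python tracing is supported). |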
| auto push_py_event = [&]() { |
| auto e = (*py_events_it).get(); |
| libkineto::GenericTraceActivity op( |
| cpu_trace->span, libkineto::ActivityType::PYTHON_FUNCTION, e->name_); |
| |
| op.device = py_device; |
| op.resource = main_thread; |
| op.startTime = e->startTime_; |
| op.endTime = e->endTime_; |
| |
| op.addMetadata("Python id", py_event_indices_.at(e)); |
| op.addMetadata("Python parent id", py_event_indices_.at(e->parent_)); |
| op.addMetadata("Python thread", std::to_string(e->thread_id_)); |
| if (e->call_type_ == python_tracer::CallType::kPyModuleCall) { |
| op.addMetadata("Python module id", module_id_map_.at(e->module_id_)); |
| } |
| |
| cpu_trace->activities.push_back(op); |
| py_events_it++; |
| }; |
| |
| TORCH_INTERNAL_ASSERT(activities.size() == kineto_events_.size()); |
| for (const auto idx : c10::irange(activities.size())) { |
| auto& activity = activities[idx]; |
| |
| // Add any Python events that finished before the end of this Kineto event |
| // (i.e. those that fall between the previous Kineto event and this one). |
| while (py_events_it != py_events.end() && |
| (*py_events_it)->endTime_ <= activity.endTime) { |
| push_py_event(); |
| } |
| |
| auto python_caller = op_py_map.at(activity.startTime); |
| activity.addMetadata( |
| "python_caller_id", py_event_indices_.at(python_caller)); |
| |
| // If the Kineto event already has a stack, the JIT model provided one and |
| // we need to respect it; only fall back to the Python stack otherwise. |
| if (!kineto_events_[idx].hasStack()) { |
| std::vector<std::string> py_names; |
| _push_reverse_order(python_caller, py_names); |
| kineto_events_[idx].stack(py_names); |
| activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(py_names, ";")); |
| } |
| |
| cpu_trace->activities.push_back(activity); |
| } |
| |
| // Add any Python events which finish after the last Kineto event. |
| while (py_events_it != py_events.end()) { |
| push_py_event(); |
| } |
| } |
| |
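| // Links a backward-pass activity to the forward-pass activity that spawned |
| // it, keyed by (forward thread id, sequence number). Forward candidates are |
| // recorded in tidSeq2activity; when a backward op with a matching key is |
| // seen, both sides are tagged with the same fwd_bwd_link_id. |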
| void generateForwardBackwardLink( |
| const KinetoEvent& kineto_event, |
| uint64_t& fwd_bwd_link_id, |
| libkineto::GenericTraceActivity& activity, |
| std::unordered_map<uint64_t, libkineto::GenericTraceActivity*>& |
| tidSeq2activity) { |
| if (kineto_event.fwdThreadId() > 0) { |
| // This activity is a backward op. |
| uint64_t key = getForwardThreadKey( |
| kineto_event.fwdThreadId(), kineto_event.sequenceNr()); |
| auto iter = tidSeq2activity.find(key); |
| if (iter != tidSeq2activity.end()) { |
| libkineto::GenericTraceActivity* fwd = iter->second; |
| #ifdef USE_KINETO_UPDATED |
| fwd->flow.start = true; |
| #else |
| // Only the destination side sets this, to distinguish it from the |
| // start side. |
| activity.flow.linkedActivity = fwd; |
| #endif |
| activity.flow.id = fwd->flow.id = fwd_bwd_link_id; |
| activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd; |
| ++fwd_bwd_link_id; |
| } |
| } else if (kineto_event.startThreadId() != 0) { |
| // This activity is a forward op. |
| uint64_t key = getForwardThreadKey( |
| kineto_event.startThreadId(), kineto_event.sequenceNr()); |
| // Assumption: among all ops with the same sequence number, the one with |
| // the largest start time is most likely the one launching the backward op. |
| auto iter = tidSeq2activity.find(key); |
| if (iter == tidSeq2activity.end()) { |
| tidSeq2activity[key] = &activity; |
| } else { |
| // Currently the sequence number is only incremented when a "Node" object |
| // is created for the backward pass, by calling |
| // "at::sequence_number::get_and_increment()". Among all ops with the same |
| // sequence number, the one with the largest startTime is the one launching |
| // the backward op. |
| if (activity.startTime >= iter->second->startTime) { |
| tidSeq2activity[key] = &activity; |
| } |
| } |
| } |
| } |
| #endif // USE_KINETO |
| |
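| // Imports activities collected by Kineto itself (e.g. device-side kernels |
| // and memcpys) into kineto_events_, skipping the CPU-side types that were |
| // already emitted by this class. |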
| void addTraceEvents(torch::profiler::impl::kineto::ActivityTraceWrapper& trace) { |
| #ifdef USE_KINETO |
| const auto& events = *(trace.get()->activities()); |
| for (const auto& ev_ptr : events) { |
| const auto& activity = *ev_ptr; |
| // CPU ops, instant events, user annotations, and Python functions have |
| // already been processed; import only the remaining activities. |
| if (activity.type() != libkineto::ActivityType::CPU_OP && |
| activity.type() != libkineto::ActivityType::CPU_INSTANT_EVENT && |
| activity.type() != libkineto::ActivityType::USER_ANNOTATION && |
| activity.type() != libkineto::ActivityType::PYTHON_FUNCTION) { |
| kineto_events_.emplace_back(); |
| auto& kineto_event = kineto_events_.back(); |
| kineto_event.name(activity.name()) |
| .deviceIndex(activity.deviceId()) |
| .deviceResourceId(activity.resourceId()) |
| .startUs(activity.timestamp()) |
| .durationUs(activity.duration()) |
| .activityType((uint8_t)activity.type()); |
| if (activity.linkedActivity()) { |
| kineto_event.linkedCorrelationId( |
| activity.linkedActivity()->correlationId()); |
| } |
| kineto_event.deviceType(deviceTypeFromActivity(activity.type())); |
| } |
| } |
| #endif // USE_KINETO |
| } |
| |
| uint64_t start_time_; |
| std::set<torch::profiler::impl::ActivityType> activities_; |
| std::deque<OpEventData> op_events_; |
| torch::profiler::impl::kineto::TraceWrapper cpu_trace_; |
| std::vector<KinetoEvent> kineto_events_; |
| // Optional, if event post-processing is enabled. |
| std::function<void(std::vector<KinetoEvent>&)> event_post_process_cb_; |
| }; |
| |
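| // Registers thread-local RecordFunction callbacks: the enter callback |
| // allocates an OpEventData, assigns a correlation id, and records start |
| // metadata; the exit callback fills in the end timestamp and thread id. |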
| void pushProfilingCallbacks(const std::unordered_set<at::RecordScope>& scopes) { |
| auto registration_state_ptr = KinetoThreadLocalState::getTLS(); |
| TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set"); |
| auto handle = at::addThreadLocalCallback( |
| at::RecordFunctionCallback( |
| [](const at::RecordFunction& fn) |
| -> std::unique_ptr<at::ObserverContext> { |
| auto state_ptr = KinetoThreadLocalState::getTLS(); |
| if (!state_ptr) { |
| return nullptr; |
| } |
| const auto& config = state_ptr->config(); |
| auto corr_id = next_correlation_id(); |
| torch::profiler::impl::kineto::pushCorrelationId(corr_id); |
| |
| auto ctx_ptr = state_ptr->newOpEvent(); |
| auto data_ptr = ctx_ptr->data_; |
| |
| data_ptr->end_us_ = std::numeric_limits<int64_t>::min(); |
| data_ptr->correlation_id_ = corr_id; |
| data_ptr->start_thread_id_ = fn.threadId(); |
| data_ptr->sequence_number_ = fn.seqNr(); |
| data_ptr->forward_thread_id_ = fn.forwardThreadId(); |
| data_ptr->record_function_scope_ = (uint8_t)fn.scope(); |
| data_ptr->is_async_ = fn.isAsync(); |
| data_ptr->debug_handle_ = fn.debugHandle(); |
| data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); |
| data_ptr->name_ = fn.name(); |
| if (config.report_input_shapes) { |
| data_ptr->shapes_ = torch::profiler::impl::inputSizes(fn); |
| data_ptr->dtypes_ = torch::profiler::impl::inputTypes(fn); |
| } |
| #if !defined BUILD_LITE_INTERPRETER && !defined C10_MOBILE |
| // A backward node's source range corresponds to its forward node. |
| // TODO: consider using a C++ stack trace. |
| if (config.with_stack && |
| fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { |
| auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack()); |
| data_ptr->stack_ = callstackStr(cs); |
| } |
| if (config.with_modules && |
| fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { |
| data_ptr->module_hierarchy_ = jit::currentModuleHierarchy(); |
| } |
| #endif |
| if (config.with_flops) { |
| data_ptr->extra_args_ = torch::profiler::impl::saveExtraArgs(fn); |
| } |
| data_ptr->start_us_ = getTimeUs(); |
| |
| if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { |
| try { |
| torch::profiler::impl::cudaStubs()->record( |
| nullptr, &data_ptr->cuda_event_start_, nullptr); |
| } catch (const std::exception& e) { |
| LOG(WARNING) << "Failed to record CUDA event. " << e.what(); |
| } |
| } |
| return ctx_ptr; |
| }, |
| [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { |
| auto state_ptr = KinetoThreadLocalState::getTLS(); |
| if (!state_ptr) { |
| return; |
| } |
| const auto& config = state_ptr->config(); |
| auto* kineto_ctx_ptr = |
| static_cast<KinetoObserverContext*>(ctx_ptr); |
| TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); |
| auto data_ptr = kineto_ctx_ptr->data_; |
| data_ptr->end_us_ = getTimeUs(); |
| data_ptr->end_thread_id_ = at::RecordFunction::currentThreadId(); |
| |
| if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { |
| try { |
| torch::profiler::impl::cudaStubs()->record( |
| nullptr, &data_ptr->cuda_event_end_, nullptr); |
| } catch (const std::exception& e) { |
| LOG(WARNING) << "Failed to record CUDA event. " << e.what(); |
| } |
| } |
| |
| torch::profiler::impl::kineto::popCorrelationId(); |
| torch::profiler::impl::kineto::recordThreadInfo(); |
| }) |
| .needsInputs(registration_state_ptr->config().report_input_shapes) |
| .scopes(scopes)); |
| registration_state_ptr->setCallbackHandle(handle); |
| } |
| |
| } // namespace |
| |
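| // Allows a backend to inject an externally timed event into the active |
| // Kineto profile. The correlation id is set to the maximum uint64_t value |
| // since there is no Kineto correlation for such events. |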
| void reportBackendEventToActiveKinetoProfiler( |
| const int64_t start_time_us, |
| const int64_t end_time_us, |
| const int64_t debug_handle, |
| const at::RecordScope scope, |
| const std::string& event_name, |
| const std::string& backend_name) { |
| auto state_ptr = KinetoThreadLocalState::getTLS(); |
| if (!state_ptr) { |
| return; |
| } |
| |
| auto ctx_ptr = state_ptr->newOpEvent(); |
| auto data_ptr = ctx_ptr->data_; |
| data_ptr->start_us_ = start_time_us; |
| data_ptr->end_us_ = end_time_us; |
| data_ptr->correlation_id_ = std::numeric_limits<uint64_t>::max(); |
| data_ptr->start_thread_id_ = at::RecordFunction::currentThreadId(); |
| data_ptr->end_thread_id_ = data_ptr->start_thread_id_; |
| data_ptr->sequence_number_ = -1; |
| data_ptr->forward_thread_id_ = data_ptr->start_thread_id_; |
| data_ptr->record_function_scope_ = (uint8_t)scope; |
| data_ptr->is_async_ = false; |
| data_ptr->debug_handle_ = debug_handle; |
| data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); |
| data_ptr->name_ = event_name; |
| data_ptr->backend_ = backend_name; |
| |
| /* TODO: input shapes are not supported for backend events yet. |
| if (config.report_input_shapes) { |
| ctx_ptr->shapes = inputSizes(fn); |
| ctx_ptr->dtypes = inputTypes(fn); |
| } |
| */ |
| |
| torch::profiler::impl::kineto::recordThreadInfo(); |
| } |
| |
| void prepareProfiler( |
| const torch::profiler::impl::ProfilerConfig& config, |
| const std::set<torch::profiler::impl::ActivityType>& activities) { |
| if (config.state == ProfilerState::NVTX) { |
| return; |
| } |
| TORCH_CHECK( |
| config.state == ProfilerState::KINETO || |
| config.state == ProfilerState::KINETO_GPU_FALLBACK, |
| "Supported only in Kineto profiler"); |
| torch::profiler::impl::kineto::prepareTrace( |
| /*cpuOnly=*/!at::hasCUDA(), activities); |
| } |
| |
| void enableProfilerWithEventPostProcess( |
| const torch::profiler::impl::ProfilerConfig& config, |
| const std::set<torch::profiler::impl::ActivityType>& activities, |
| std::function<void(std::vector<KinetoEvent>&)>&& cb, |
| const std::unordered_set<at::RecordScope>& scopes) { |
| TORCH_CHECK( |
| config.state != ProfilerState::NVTX, |
| "NVTX does not support post processing callback."); |
| enableProfiler(config, activities, scopes); |
| auto state_ptr = KinetoThreadLocalState::getTLS(); |
| state_ptr->setEventPostProcessingCallback(std::move(cb)); |
| } |
| |
| void enableProfiler( |
| const torch::profiler::impl::ProfilerConfig& config, |
| const std::set<torch::profiler::impl::ActivityType>& activities, |
| const std::unordered_set<at::RecordScope>& scopes) { |
| TORCH_CHECK(!profilerEnabled(), "Profiler is already enabled on this thread"); |
| if (config.state == ProfilerState::NVTX) { |
| torch::profiler::impl::pushNVTXCallbacks(config, scopes); |
| return; |
| } |
| |
| TORCH_CHECK( |
| config.state == ProfilerState::KINETO || |
| config.state == ProfilerState::KINETO_GPU_FALLBACK); |
| TORCH_CHECK( |
| !activities.empty(), "No activities specified for Kineto profiler"); |
| |
| auto state = std::make_shared<KinetoThreadLocalState>(config, activities); |
| c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); |
| |
| if (state->tracePython()) { |
| python_tracer::call(python_tracer::Command::kStartOne); |
| } |
| |
| if (activities.count(ActivityType::CPU)) { |
| pushProfilingCallbacks(scopes); |
| } |
| |
| torch::profiler::impl::kineto::startTrace(); |
| } |
| |
| std::unique_ptr<ProfilerResult> disableProfiler() { |
| // All DebugInfoBase objects are scope-based and are supposed to use |
| // DebugInfoGuard. |
| auto state = |
| c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); |
| |
| auto state_ptr = static_cast<ProfilerThreadLocalStateBase*>(state.get()); |
| TORCH_CHECK( |
| state_ptr && |
| (state_ptr->config().state == ProfilerState::KINETO || |
| state_ptr->config().state == ProfilerState::KINETO_GPU_FALLBACK || |
| state_ptr->config().state == ProfilerState::NVTX), |
| "Can't disable Kineto profiler when it's not running"); |
| |
| if (state_ptr->hasCallbackHandle()) { |
| at::removeCallback(state_ptr->callbackHandle()); |
| } |
| |
| if (state_ptr->config().state == ProfilerState::NVTX) { |
| return std::make_unique<ProfilerResult>(); |
| } |
| |
| auto kineto_state_ptr = static_cast<KinetoThreadLocalState*>(state_ptr); |
| if (kineto_state_ptr->tracePython()) { |
| python_tracer::call(python_tracer::Command::kStop); |
| } |
| |
| auto trace = kineto_state_ptr->finalizeTrace(); |
| if (kineto_state_ptr->tracePython()) { |
| python_tracer::call(python_tracer::Command::kClear); |
| } |
| |
| return std::make_unique<ProfilerResult>( |
| kineto_state_ptr->start_time_, |
| std::move(kineto_state_ptr->kineto_events_), |
| std::move(trace)); |
| } |
| |
| int64_t KinetoEvent::cudaElapsedUs() const { |
| if (!cuda_event_start_ || !cuda_event_end_) { |
| return -1; |
| } |
| try { |
| return (int64_t)torch::profiler::impl::cudaStubs()->elapsed(&cuda_event_start_, &cuda_event_end_); |
| } catch (const std::exception& e) { |
| LOG(WARNING) << "Failed to measure time between two CUDA events. " |
| << e.what(); |
| } |
| return -1; |
| } |
| |
| ProfilerResult::ProfilerResult( |
| uint64_t start_time, |
| std::vector<KinetoEvent> events, |
| torch::profiler::impl::kineto::ActivityTraceWrapper trace) |
| : trace_start_us_(start_time), |
| events_(std::move(events)), |
| trace_(std::move(trace)) {} |
| ProfilerResult::ProfilerResult() = default; |
| ProfilerResult::~ProfilerResult() = default; |
| |
| void ProfilerResult::save(const std::string& path) { |
| trace_.save(path); |
| } |
| |
| } // namespace profiler |
| } // namespace autograd |
| } // namespace torch |