#include <torch/csrc/profiler/collection.h>

#include <algorithm>
#include <queue>

#include <fmt/format.h>

#include <ATen/record_function.h>
#include <c10/core/ScalarTypeToTypeMeta.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/overloaded.h>
#include <torch/csrc/jit/runtime/interpreter.h>

namespace torch {
namespace profiler {
namespace impl {

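// InputOutputEncoder packs the inputs of each op into three parallel flat
// buffers: `tags_` records what kind of value each input was, while
// `tensor_metadata_` and `tensor_sizes_` hold the per-Tensor details. Each
// op's inputs are closed off with a trailing `Tag::TERMINATOR`. For example,
// pushing (Tensor t, Scalar s) appends:
//   tags_:            [Tensor, Scalar, TERMINATOR]
//   tensor_metadata_: [{t.unsafeGetTensorImpl(), t.scalar_type(), t.dim()}]
//   tensor_sizes_:    [t.size(0), ..., t.size(t.dim() - 1)]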
void InputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
  for (const auto& value : values) {
    if (value.isTensor()) {
      push(value.toTensor());
    } else if (value.isScalar()) {
      tags_.emplace_back(Tag::Scalar);
    } else if (value.isTensorList()) {
      tags_.emplace_back(Tag::TensorListBegin);
      // TODO: Skip TensorList for now.
      tags_.emplace_back(Tag::TERMINATOR);
    } else {
      tags_.emplace_back(Tag::Other);
    }
  }
  tags_.emplace_back(Tag::TERMINATOR);
}

void InputOutputEncoder::push(const at::Tensor& t) {
  if (t.defined()) {
    tags_.emplace_back(Tag::Tensor);
    const auto& sizes = t.sizes();
    const auto dim = sizes.size();
    TORCH_CHECK(
        dim <= std::numeric_limits<uint32_t>::max(),
| "Cannot profile Tensors of size > uint32 max. Got dim: ", |
        dim);

    tensor_metadata_.emplace_back(
        /*ptr_=*/(void*)t.unsafeGetTensorImpl(),
        /*dtype_=*/t.scalar_type(),
        /*dim_=*/(uint32_t)dim);

    for (const auto i : sizes) {
      tensor_sizes_.emplace_back(i);
    }
  } else {
    tags_.emplace_back(Tag::UndefinedTensor);
  }
}

// Returns a closure that acts as a custom iterator: each call decodes and
// returns the input shapes and dtypes for the next op that was pushed.
auto InputOutputEncoder::getNextShapesAndDtypes() {
  return [this,
          tag_it = tags_.begin(),
          tensor_metadata_it = tensor_metadata_.begin(),
          tensor_size_it = tensor_sizes_.begin()]() mutable {
    struct Inputs out;
    bool terminate = false;
    while (!terminate && tag_it != tags_.end()) {
      out.shapes_.emplace_back();
      switch (*tag_it) {
        case Tag::Tensor: {
          const auto& md = *tensor_metadata_it++;
          for (const auto _ : c10::irange(md.dim_)) {
            (void)_; // Suppress unused variable warning
            out.shapes_.back().push_back(*tensor_size_it++);
          }
          out.dtypes_.emplace_back(scalarTypeToTypeMeta(md.dtype_).name());
        } break;

        case Tag::TensorListBegin:
          while (*(++tag_it) != Tag::TERMINATOR) {
            // TODO: Skip TensorLists for now.
          }
          out.dtypes_.emplace_back("TensorList");
          break;

        case Tag::Scalar:
          out.dtypes_.emplace_back("Scalar");
          break;

        case Tag::UndefinedTensor:
        case Tag::Other:
          out.dtypes_.emplace_back();
          break;

        case Tag::TERMINATOR:
          // End of this op's inputs: discard the shape slot that was
          // speculatively added at the top of the loop.
          out.shapes_.pop_back();
          terminate = true;
          break;

        default:
          break;
      }
      ++tag_it;
    }
    return out;
  };
}

void InputOutputEncoder::clear() {
  tags_.clear();
  tensor_metadata_.clear();
  tensor_sizes_.clear();
}

namespace {
// See `RecordQueue::getSubqueue()` for an overview of this cache.
struct SubQueueThreadCache {
  uint32_t key_;
  ThreadLocalSubqueue* ref_;
};

// The astute observer will note that this leaves a dangling reference; nothing
// in the teardown of `RecordQueue` or `ThreadLocalSubqueue` clears this value.
// (And the raw pointer in `SubQueueThreadCache` will not extend the lifetime
// of `*ref_`.) This is safe, however, because `getSubqueue` checks
// `sub_queue_cache_.key_` before dereferencing `ref_`, and if `key_` does not
// match the RecordQueue's *unique* `id_` it evicts `sub_queue_cache_` and
// falls back to the lock-protected lookup in `sub_queues_`.
std::atomic<uint32_t> queue_id_{0};
thread_local SubQueueThreadCache sub_queue_cache_{0, nullptr};
} // namespace

namespace python_tracer {
namespace {
GetFn get_fn;

struct NoOpPythonTracer : public PythonTracerBase {
  static NoOpPythonTracer& singleton() {
    static NoOpPythonTracer singleton_;
    return singleton_;
  }
  void start(RecordQueue*) override {}
  void stop() override {}
  void clear() override {}
  std::vector<std::shared_ptr<Result>> getEvents(
      std::function<time_t(approx_time_t)>,
      std::vector<CompressedEvent>&) override {
    return {};
  }
  ~NoOpPythonTracer() = default;
};
} // namespace

void registerTracer(GetFn get_tracer) {
  get_fn = get_tracer;
}

PythonTracerBase& PythonTracerBase::get() {
  if (get_fn == nullptr) {
    return NoOpPythonTracer::singleton();
  }
  return get_fn();
}
} // namespace python_tracer

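// DEFINE_VISITOR stamps out one accessor on `Result` (e.g. `Result::name()`)
// per call. The accessor visits the `extra_fields_` variant and evaluates the
// expression supplied for the matching event type; within each expression `e`
// is bound to that event type's ExtraFields, and the `(void)e` casts silence
// unused-variable warnings for expressions that ignore it.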
#define OUT_T(method_name) decltype(std::declval<Result>().method_name())
#define DEFINE_VISITOR( \
    method_name, \
    torch_op_field, \
    backend_field, \
    allocation_field, \
    py_field, \
    py_c_field) \
  OUT_T(method_name) Result::method_name() const { \
    using out_t = OUT_T(method_name); \
    return c10::visit( \
        c10::overloaded( \
            [&](const ExtraFields<EventType::TorchOp>& e) -> out_t { \
              (void)e; \
              return torch_op_field; \
            }, \
            [&](const ExtraFields<EventType::Backend>& e) -> out_t { \
              (void)e; \
              return backend_field; \
            }, \
            [&](const ExtraFields<EventType::Allocation>& e) -> out_t { \
              (void)e; \
              return allocation_field; \
            }, \
            [&](const ExtraFields<EventType::PyCall>& e) -> out_t { \
              (void)e; \
              return py_field; \
            }, \
            [&](const ExtraFields<EventType::PyCCall>& e) -> out_t { \
              (void)e; \
              return py_c_field; \
            }), \
        extra_fields_); \
  }

std::string toString(const ExtraFields<EventType::PyCall>& e) {
  if (e.module_.has_value()) {
    return fmt::format(
        "nn.Module: {}_{}", e.module_->cls_name_.str(), e.module_->id_);
  }
  return fmt::format(
      "{}({}): {}",
      e.callsite_.filename_.str(),
      e.callsite_.line_no_,
      e.callsite_.funcname_.str());
}

namespace {
auto scopeToType(at::RecordScope scope) {
  return scope == at::RecordScope::USER_SCOPE
      ? libkineto::ActivityType::USER_ANNOTATION
      : libkineto::ActivityType::CPU_OP;
}
} // namespace

DEFINE_VISITOR(
    name,
    e.name_,
    e.name_,
    "[memory]",
    toString(e),
    e.function_name_.str());
DEFINE_VISITOR(
    kinetoType,
    scopeToType(e.scope_),
    scopeToType(e.scope_),
    libkineto::ActivityType::CPU_INSTANT_EVENT,
    libkineto::ActivityType::PYTHON_FUNCTION,
    libkineto::ActivityType::PYTHON_FUNCTION);
DEFINE_VISITOR(correlationID, e.correlation_id_, 0, 0, 0, 0);
DEFINE_VISITOR(
    endTimeNS,
    e.end_time_ns_,
    e.end_time_us_ * 1000,
    start_time_ns_,
    e.end_time_ns_,
    e.end_time_ns_);
DEFINE_VISITOR(
    endTID,
    e.end_tid_,
    start_tid_,
    start_tid_,
    start_tid_,
    start_tid_);
DEFINE_VISITOR(
    deviceType,
    c10::DeviceType::CPU,
    c10::DeviceType::CPU,
    e.device_type_,
    c10::DeviceType::CPU,
    c10::DeviceType::CPU);
#undef DEFINE_VISITOR
#undef OUT_T

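// Each EventBlock claims a disjoint range of ChunkSize correlation ids by
// bumping a shared atomic counter, so ids are unique across threads and
// blocks; the block's `correlation_id()` then maps a stored event back to an
// id in that range without any further synchronization.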
template <typename T, size_t ChunkSize>
ThreadLocalSubqueue::EventBlock<T, ChunkSize>::EventBlock() {
  static std::atomic<uint64_t> counter_{0};
  id_start_ = 1 + ChunkSize * counter_++;
}

template <class... Args>
std::pair<KinetoObserverContext::Event*, uint64_t> ThreadLocalSubqueue::OpList::
    emplace_back(Args&&... args) {
  maybe_grow();
  *next_ = {std::forward<Args>(args)...};
  auto corr_id = buffer_last_->correlation_id(next_);
  return {next_++, corr_id};
}

uint64_t ThreadLocalSubqueue::OpList::correlationID(const OpList::Iterator& e) {
  return e.address().first->correlation_id(&*e);
}

ThreadLocalSubqueue::ThreadLocalSubqueue(
    const uint64_t tid,
    const ProfilerConfig& config)
    : tid_{tid}, config_{config}, kineto_info_{kineto::kineto_ids()} {
  torch::profiler::impl::kineto::recordThreadInfo();
}

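// begin_op records everything that is cheap to capture when an op starts: the
// basic RecordFunction fields go into `op_events_`, while input shapes, JIT
// call stacks, module hierarchies, and extra args are appended to side buffers
// only when the corresponding config flag is set. The start timestamp is taken
// last so that as little of this bookkeeping as possible lands inside the
// measured interval.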
std::unique_ptr<KinetoObserverContext> ThreadLocalSubqueue::begin_op(
    const at::RecordFunction& fn) {
  KinetoObserverContext::Event* event;
  uint64_t corr_id;
  std::tie(event, corr_id) = op_events_.emplace_back(
      fn.seqNr(),
      fn.forwardThreadId(),
      fn.scope(),
      fn.isAsync(),
      fn.debugHandle(),
      fn.name());
  if (config_.report_input_shapes) {
    inputs_outputs_.push(fn.inputs());
  }
  if (fn.scope() == at::RecordScope::USER_SCOPE) {
    torch::profiler::impl::kineto::pushUserCorrelationId(corr_id);
  } else {
    torch::profiler::impl::kineto::pushCorrelationId(corr_id);
  }

#if !defined BUILD_LITE_INTERPRETER && !defined C10_MOBILE
  // A backward node's source range corresponds to its forward node, so we skip
  // collecting the JIT stack for backward ops.
  // TODO: consider using C++ stack trace
  if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
    auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack());
    jit_stack_.emplace_back(callstackStr(cs));
  }
  if (config_.with_modules &&
      fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
    jit_modules_.emplace_back(jit::currentModuleHierarchy());
  }
#endif
  if (config_.with_flops) {
    extra_args_.emplace_back(torch::profiler::impl::saveExtraArgs(fn));
  }

  auto out = std::make_unique<KinetoObserverContext>(event);

  if (config_.state == ProfilerState::KINETO_GPU_FALLBACK) {
    try {
      out->fallback_ = gpu_fallback_.emplace_back();
      torch::profiler::impl::cudaStubs()->record(
          nullptr, &out->fallback_->cuda_event_start_, nullptr);
    } catch (const std::exception& e) {
      LOG(WARNING) << "Failed to record CUDA event. " << e.what();
    }
  }

  event->start_time_ = torch::profiler::impl::getApproximateTime();
  return out;
}

RecordQueue::RecordQueue(
    const ProfilerConfig& config,
    std::set<ActivityType> activities)
    : id_(++queue_id_), config_{config}, activities_{activities} {
  if (tracePython()) {
    python_tracer::PythonTracerBase::get().start(this);
  }
}

bool RecordQueue::tracePython() const {
  return config_.with_stack && activities_.count(ActivityType::CPU);
}

ThreadLocalSubqueue* RecordQueue::getSubqueue() {
  // In the most common case, a thread will want to write to the same sub-queue
  // that it wrote to last call. The only time that isn't true is if:
  //   A) The profiler context has ended and we are in a new one.
  //   B) Two profilers are active in different TLS contexts, and this thread
  //      is a worker helping with intra-op parallelism.
  // Since we expect this to be the OVERWHELMINGLY common case (>99%), we add a
  // special thread_local cache so that we can skip the overall `flat_hash_map`
  // (and corresponding lock).
  if (id_ == sub_queue_cache_.key_) {
    return sub_queue_cache_.ref_;
  }

  const auto tid = at::RecordFunction::currentThreadId();
  std::lock_guard<std::mutex> guard(sub_queue_mutex_);
  auto it = sub_queues_.find(tid);
  if (it == sub_queues_.end()) {
    it = sub_queues_
             .emplace(tid, std::make_unique<ThreadLocalSubqueue>(tid, config_))
             .first;
  }

  sub_queue_cache_ = SubQueueThreadCache{id_, it->second.get()};
  return it->second.get();
}

void RecordQueue::stop() {
  if (tracePython()) {
    python_tracer::PythonTracerBase::get().stop();
  }
}

namespace {
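// Moves the next value out of a side-buffer iterator if one is available,
// otherwise returns a default-constructed value. The side buffers (JIT stacks,
// modules, extra args, GPU fallback events) are only populated when their
// config flags are enabled, so these iterators may be exhausted even while op
// events remain.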
template <typename T>
auto steal_or_default(T& it) {
  if (it.exhausted()) {
    return typename T::value_type();
  } else {
    auto result = std::move(*it);
    ++it;
    return result;
  }
}

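// When an `autograd::engine::evaluate_function: ...` wrapper event (FUNCTION
// scope) is immediately followed on the same thread by a BACKWARD_FUNCTION
// event, copy the backward op's sequence number and forward thread id onto the
// wrapper so that build_tree can also attach it under the forward thread's
// stack.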
struct EvaluateFunctionVisitor {
  void operator()(
      ExtraFields<EventType::TorchOp>& first,
      ExtraFields<EventType::TorchOp>& second) {
    if (first.scope_ == at::RecordScope::FUNCTION &&
        second.scope_ == at::RecordScope::BACKWARD_FUNCTION &&
        first.name_.rfind("autograd::engine::evaluate_function: ", 0) == 0) {
      first.sequence_number_ = second.sequence_number_;
      first.forward_tid_ = second.forward_tid_;
    }
  }

  template <typename T0, typename T1>
  void operator()(T0&, T1&) {}
};

void set_autograd_evaluate(std::vector<std::shared_ptr<Result>>& results) {
  auto end = results.size() > 2 ? results.end() - 1 : results.begin();
  for (auto it = results.begin(); it < end; ++it) {
    if ((*it)->start_tid_ == (*(it + 1))->start_tid_) {
      c10::visit(
          EvaluateFunctionVisitor(),
          (*it)->extra_fields_,
          (*(it + 1))->extra_fields_);
    }
  }
}

using result_ptr_t = std::shared_ptr<Result>;
struct ResultGreater {
  bool operator()(const result_ptr_t& a, const result_ptr_t& b) const {
    return a->endTimeNS() > b->endTimeNS();
  }
};

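// Reconstructs the caller/callee tree from the flat event list. Events are
// replayed in start-time order; `stacks` tracks the currently open frame for
// each thread, and `end_events_` (a min-heap ordered by end time) tells us
// which frames must be closed before the next event begins.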
void build_tree(std::vector<std::shared_ptr<Result>>& events) {
  set_autograd_evaluate(events);
  std::stable_sort(
      events.begin(), events.end(), [](const auto& a, const auto& b) {
        return a->start_time_ns_ < b->start_time_ns_;
      });

  using op_fields = ExtraFields<EventType::TorchOp>;
  ska::flat_hash_map<uint64_t, std::shared_ptr<Result>> stacks;
  std::priority_queue<result_ptr_t, std::vector<result_ptr_t>, ResultGreater>
      end_events_;

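  // push_event attaches `event` to the open frame on its own thread if there
  // is one, falling back to the frame on the forward thread (via
  // `forward_tid_`) for torch ops that have no local parent. Events with a
  // real end time become the new top of their thread's stack and are queued in
  // `end_events_`; events whose end time is the sentinel `time_t` min stay on
  // the stack without being queued, and instantaneous events (end <= start)
  // are marked finished immediately.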
  auto push_event = [&stacks, &end_events_](std::shared_ptr<Result>& event) {
    TORCH_INTERNAL_ASSERT(event->parent_.expired());
    TORCH_INTERNAL_ASSERT(event->children_.empty());
    TORCH_INTERNAL_ASSERT(!event->finished_);

    auto parent_it = stacks.find(event->start_tid_);
    if (parent_it == stacks.end()) {
      auto fwd_tid = c10::visit(
          c10::overloaded(
              [](const op_fields& i) { return i.forward_tid_; },
              [](const auto&) -> uint64_t { return 0; }),
          event->extra_fields_);
      if (fwd_tid) {
        parent_it = stacks.find(fwd_tid);
      }
    }

    if (parent_it != stacks.end()) {
      event->parent_ = parent_it->second;
      parent_it->second->children_.push_back(event);
    }

    if (event->endTimeNS() > event->start_time_ns_) {
      stacks[event->start_tid_] = event;
      end_events_.push(event);
    } else if (event->endTimeNS() == std::numeric_limits<time_t>::min()) {
      // We use min time to indicate the lack of a termination event, so if we
      // encounter such a case we don't push to `end_events_`.
      stacks[event->start_tid_] = event;
    } else {
      event->finished_ = true;
    }
  };

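  // pop_event closes `event`: any frames still open above it on the same
  // thread's stack are finished as well, the event itself is marked finished,
  // and its parent (if any) becomes that thread's open frame again.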
  auto pop_event = [&stacks](const std::shared_ptr<Result>& event) {
    if (event->finished_) {
      // This event was marked finished by a previous `pop_event` call.
      return;
    }

    auto start_tid = event->start_tid_;
    auto frame = stacks.at(start_tid);

    while (frame.get() != event.get()) {
      TORCH_INTERNAL_ASSERT(frame != nullptr);
      frame->finished_ = true;
      TORCH_INTERNAL_ASSERT(!frame->parent_.expired());
      frame = frame->parent_.lock();
    }

    event->finished_ = true;
    stacks.erase(start_tid);
    auto new_frame = event->parent_.lock();
    if (new_frame != nullptr) {
      stacks[start_tid] = new_frame;
    }
  };

  // Stack replay loop.
  for (auto& event : events) {
    while (!end_events_.empty() &&
           end_events_.top()->endTimeNS() < event->start_time_ns_) {
      pop_event(end_events_.top());
      end_events_.pop();
    }
    push_event(event);
  }

  // Cleanup remaining exit events.
  while (!end_events_.empty()) {
    pop_event(end_events_.top());
    end_events_.pop();
  }
}
} // namespace

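// Drains every ThreadLocalSubqueue into a flat list of Results: backend
// events, torch ops (joined with their shape/stack/module/extra-arg side
// buffers), and allocations are converted to nanosecond timestamps and moved
// out, python enter events are collected for the python tracer, and finally
// build_tree() links everything into a call hierarchy.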
std::vector<std::shared_ptr<Result>> RecordQueue::getRecords(
    std::function<time_t(approx_time_t)> time_converter) {
  auto converter = [&](approx_time_t t) {
    return t == std::numeric_limits<approx_time_t>::min()
        ? std::numeric_limits<time_t>::min()
        : time_converter(t);
  };
  std::vector<std::shared_ptr<Result>> out;
  std::vector<python_tracer::CompressedEvent> python_enters;
  for (auto& subqueue_it : sub_queues_) {
    auto& queue = *subqueue_it.second;
    for (auto& i : queue.backend_events_) {
      auto start_time = i.start_time_us_;
      out.emplace_back(Result::create(
          /*start_time_ns_=*/start_time * 1000,
          /*start_tid_=*/queue.tid(),
          /*kineto_info_=*/queue.kineto_info(),
          /*extra_fields_=*/std::move(i)));
    }
    queue.backend_events_.clear();

    auto input_getter = queue.inputs_outputs_.getNextShapesAndDtypes();
    auto jit_stack_it = queue.jit_stack_.begin();
    auto jit_module_it = queue.jit_modules_.begin();
    auto extra_args_it = queue.extra_args_.begin();
    auto gpu_fallback_it = queue.gpu_fallback_.begin();
    for (auto event = queue.op_events_.begin(); event != queue.op_events_.end();
         ++event) {
      auto& i = *event;
      auto start_time = converter(i.start_time_);
      out.emplace_back(Result::create(
          start_time,
          /*start_tid_=*/queue.tid(),
          /*kineto_info_=*/queue.kineto_info(),
          /*extra_fields_=*/
          ExtraFields<EventType::TorchOp>(
              std::move(i.basic_fields_),
              ThreadLocalSubqueue::OpList::correlationID(event),
              converter(i.end_time_),
              input_getter(),
              steal_or_default(jit_stack_it),
              steal_or_default(jit_module_it),
              steal_or_default(extra_args_it),
              steal_or_default(gpu_fallback_it))));
    }
    queue.op_events_.clear();
    queue.inputs_outputs_.clear();
    queue.jit_stack_.clear();
    queue.jit_modules_.clear();
    queue.extra_args_.clear();
    queue.gpu_fallback_.clear();

    for (auto& i : queue.allocations_) {
      auto start_time = converter(i.start_time_);
      out.emplace_back(Result::create(
          start_time,
          /*start_tid_=*/queue.tid(),
          /*kineto_info_=*/queue.kineto_info(),
          /*extra_fields_=*/std::move(i)));
    }
    queue.allocations_.clear();

    for (auto& i : queue.py_calls_) {
      python_enters.push_back(
          {i.first, queue.tid(), queue.kineto_info(), converter(i.second)});
    }
  }

  if (tracePython()) {
    auto& tracer = python_tracer::PythonTracerBase::get();
    for (auto i : tracer.getEvents(converter, python_enters)) {
      out.push_back(i);
    }
    tracer.clear();
  }

  build_tree(out);
  return out;
}

} // namespace impl
} // namespace profiler
} // namespace torch