#include <torch/csrc/profiler/collection.h>

#include <algorithm>
#include <atomic>
#include <limits>
#include <memory>
#include <mutex>

#include <ATen/record_function.h>
#include <c10/core/ScalarTypeToTypeMeta.h>
#include <c10/util/irange.h>
#include <c10/util/overloaded.h>
#include <torch/csrc/jit/runtime/interpreter.h>
namespace torch {
namespace profiler {
namespace impl {
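// Tags, tensor metadata, and tensor sizes are kept as three parallel
// append-only streams. `push(values)` writes one Tag per IValue and a
// trailing TERMINATOR per op; Tensor values additionally append an entry to
// `tensor_metadata_` and one entry per dimension to `tensor_sizes_`.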
void InputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
for (const auto& value : values) {
if (value.isTensor()) {
push(value.toTensor());
} else if (value.isScalar()) {
tags_.emplace_back(Tag::Scalar);
} else if (value.isTensorList()) {
tags_.emplace_back(Tag::TensorListBegin);
// TODO: Skip TensorList for now.
tags_.emplace_back(Tag::TERMINATOR);
} else {
tags_.emplace_back(Tag::Other);
}
}
tags_.emplace_back(Tag::TERMINATOR);
}
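// Defined tensors record a Tensor tag plus the TensorImpl pointer, dtype, and
// per-dimension sizes; undefined tensors record only an UndefinedTensor tag.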
void InputOutputEncoder::push(const at::Tensor& t) {
if (t.defined()) {
tags_.emplace_back(Tag::Tensor);
const auto& sizes = t.sizes();
const auto dim = sizes.size();
    TORCH_CHECK(
        dim <= std::numeric_limits<uint32_t>::max(),
        "Cannot profile Tensors with more than uint32 max dimensions. Got dim: ",
        dim);
tensor_metadata_.emplace_back(
/*ptr_=*/(void*)t.unsafeGetTensorImpl(),
/*dtype_=*/t.scalar_type(),
/*dim_=*/(uint32_t)dim
);
for (const auto i : sizes) {
tensor_sizes_.emplace_back(i);
}
} else {
tags_.emplace_back(Tag::UndefinedTensor);
}
}
// This is a custom-iterator-like getter to obtain input shapes and dtypes.
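// Each call to the returned closure consumes tags up to (and including) the
// next TERMINATOR and yields the shapes/dtypes for one recorded op, in the
// order the ops were pushed. A minimal consumption sketch (mirroring
// `RecordQueue::getRecords`, with `encoder` as a stand-in name):
//   auto getter = encoder.getNextShapesAndDtypes();
//   auto first_op_inputs = getter();   // Inputs for the first op pushed.
//   auto second_op_inputs = getter();  // Inputs for the second op, etc.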
auto InputOutputEncoder::getNextShapesAndDtypes() {
return [this, tag_it = tags_.begin(),
tensor_metadata_it = tensor_metadata_.begin(),
tensor_size_it = tensor_sizes_.begin()]() mutable {
struct Inputs out;
bool terminate = false;
while (!terminate && tag_it != tags_.end()) {
out.shapes_.emplace_back();
switch (*tag_it) {
case Tag::Tensor:
{
const auto& md = *tensor_metadata_it++;
for (const auto _ : c10::irange(md.dim_)) {
(void)_; // Suppress unused variable warning
out.shapes_.back().push_back(*tensor_size_it++);
}
out.dtypes_.emplace_back(scalarTypeToTypeMeta(md.dtype_).name());
}
break;
case Tag::TensorListBegin:
while (*(++tag_it) != Tag::TERMINATOR) {
// TODO: Skip TensorLists for now.
}
out.dtypes_.emplace_back("TensorList");
break;
case Tag::Scalar:
out.dtypes_.emplace_back("Scalar");
break;
case Tag::UndefinedTensor:
case Tag::Other:
out.dtypes_.emplace_back();
break;
case Tag::TERMINATOR:
// This marks the end of this op.
out.shapes_.pop_back();
terminate = true;
break;
default:
break;
}
++tag_it;
}
return out;
};
}
void InputOutputEncoder::clear() {
tags_.clear();
tensor_metadata_.clear();
tensor_sizes_.clear();
}
namespace {
// See `RecordQueue::getSubqueue()` for an overview of this cache.
struct SubQueueThreadCache {
uint32_t key_;
ThreadLocalSubqueue* ref_;
};
// The astute observer will note that this leaves a dangling reference; nothing
// in the teardown of `RecordQueue` or `ThreadLocalSubqueue` clears this value.
// (And the raw pointer in `SubQueueThreadCache` will not extend the lifetime
// of `*ref_`.) This is safe, however, because `getSubqueue` will check
// `sub_queue_cache_.key_` before attempting to access `ref_`, and if `key_`
// does not match the RecordQueue's *unique* `id_` it will evict
// `sub_queue_cache_` and fall back to a different mechanism.
std::atomic<uint32_t> queue_id_{0};
thread_local SubQueueThreadCache sub_queue_cache_{0, nullptr};
} // namespace
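// `Result::extra_fields_` is a variant over per-event-type payloads. The
// DEFINE_VISITOR macro stamps out an accessor that visits that variant and
// returns the expression supplied for TorchOp, Backend, and Allocation
// events respectively.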
#define OUT_T(method_name) decltype(std::declval<Result>().method_name())
#define DEFINE_VISITOR( \
method_name, torch_op_field, backend_field, allocation_field) \
OUT_T(method_name) Result::method_name() const { \
using out_t = OUT_T(method_name); \
return c10::visit( \
c10::overloaded( \
[&](const ExtraFields<EventType::TorchOp>& e) -> out_t { \
(void)e; \
return torch_op_field; \
}, \
[&](const ExtraFields<EventType::Backend>& e) -> out_t { \
(void)e; \
return backend_field; \
}, \
[&](const ExtraFields<EventType::Allocation>& e) -> out_t { \
(void)e; \
return allocation_field; \
}), \
extra_fields_); \
}
using torch::profiler::impl::kineto::KinetoActivityType;
namespace {
KinetoActivityType scopeToType(at::RecordScope scope) {
return scope == at::RecordScope::USER_SCOPE
? KinetoActivityType::USER_ANNOTATION
: KinetoActivityType::CPU_OP;
}
} // namespace
DEFINE_VISITOR(name, e.name_, e.name_, "[memory]");
DEFINE_VISITOR(
kinetoType,
scopeToType(e.scope_),
scopeToType(e.scope_),
KinetoActivityType::CPU_INSTANT_EVENT);
DEFINE_VISITOR(correlationID, e.correlation_id_, 0, 0);
DEFINE_VISITOR(endTimeNS, e.end_time_ns_, e.end_time_us_ * 1000, start_time_ns_);
DEFINE_VISITOR(endTID, e.end_tid_, start_tid_, start_tid_);
DEFINE_VISITOR(
deviceType,
c10::DeviceType::CPU,
c10::DeviceType::CPU,
e.device_type_);
#undef DEFINE_VISITOR
#undef OUT_T
ThreadLocalSubqueue::ThreadLocalSubqueue(
const uint64_t tid,
const ProfilerConfig& config)
: tid_{tid}, config_{config}, kineto_info_{kineto::kineto_ids()} {
torch::profiler::impl::kineto::recordThreadInfo();
}
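// Begins recording a single op for this thread's subqueue: stores the op's
// basic fields and, depending on `config_`, its input shapes, JIT
// stack/module hierarchy, extra args for FLOP computation, and a CUDA start
// event for GPU fallback. The returned observer context is completed when
// the op exits.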
std::unique_ptr<KinetoObserverContext> ThreadLocalSubqueue::begin_op(
const at::RecordFunction& fn,
uint64_t correlation_id) {
auto event = op_events_.emplace_back(
correlation_id,
fn.seqNr(),
fn.forwardThreadId(),
fn.scope(),
fn.isAsync(),
fn.debugHandle(),
fn.name());
if (config_.report_input_shapes) {
inputs_outputs_.push(fn.inputs());
}
#if !defined BUILD_LITE_INTERPRETER && !defined C10_MOBILE
  // A backward node's source range corresponds to its forward node, so skip
  // JIT stack collection for BACKWARD_FUNCTION scope.
  // TODO: consider using C++ stack trace
if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack());
jit_stack_.emplace_back(callstackStr(cs));
}
if (config_.with_modules &&
fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
jit_modules_.emplace_back(jit::currentModuleHierarchy());
}
#endif
if (config_.with_flops) {
extra_args_.emplace_back(torch::profiler::impl::saveExtraArgs(fn));
}
auto out = std::make_unique<KinetoObserverContext>(event);
if (config_.state == ProfilerState::KINETO_GPU_FALLBACK) {
try {
out->fallback_ = gpu_fallback_.emplace_back();
torch::profiler::impl::cudaStubs()->record(
nullptr, &out->fallback_->cuda_event_start_, nullptr);
} catch (const std::exception& e) {
LOG(WARNING) << "Failed to record CUDA event. " << e.what();
}
}
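  // Capture the start timestamp last, presumably so the bookkeeping above is
  // not attributed to the op's own duration.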
event->start_time_ = torch::profiler::impl::getApproximateTime();
return out;
}
RecordQueue::RecordQueue(const ProfilerConfig& config)
: id_(++queue_id_), config_{config} {}
ThreadLocalSubqueue* RecordQueue::getSubqueue() {
// In the most common case, a thread will want to write to the same sub-queue
// that it wrote to last call. The only time that isn't true is if:
// A) The profiler context has ended and we are in a new one.
// B) Two profilers are active in different TLS contexts, and this thread
// is a worker helping with intra-op parallelism.
// Since we expect this to be the OVERWHELMINGLY common case (>99%), we add a
// special thread_local cache so that we can skip the overall `flat_hash_map`
// (and corresponding lock).
if (id_ == sub_queue_cache_.key_) {
return sub_queue_cache_.ref_;
}
const auto tid = at::RecordFunction::currentThreadId();
std::lock_guard<std::mutex> guard(sub_queue_mutex_);
auto it = sub_queues_.find(tid);
if (it == sub_queues_.end()) {
    it = sub_queues_
             .emplace(tid, std::make_unique<ThreadLocalSubqueue>(tid, config_))
             .first;
}
sub_queue_cache_ = SubQueueThreadCache{id_, it->second.get()};
return it->second.get();
}
namespace {
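// Moves the value out of `it` and advances the iterator if it is not
// exhausted; otherwise returns a default-constructed value. This lets
// `getRecords` zip together side tables (JIT stack, module hierarchy, extra
// args, GPU fallback) that are only populated for certain configs.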
template <typename T>
auto steal_or_default(T& it) {
if (it.exhausted()) {
return typename T::value_type();
} else {
auto result = std::move(*it);
++it;
return result;
}
}
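// When an "autograd::engine::evaluate_function: ..." wrapper event is
// immediately followed (on the same start thread) by the backward op it
// evaluates, copy the backward op's sequence number and forward thread id
// onto the wrapper so the pair can be correlated with the forward op.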
struct EvaluateFunctionVisitor {
void operator()(
ExtraFields<EventType::TorchOp>& first,
ExtraFields<EventType::TorchOp>& second) {
if (first.scope_ == at::RecordScope::FUNCTION &&
second.scope_ == at::RecordScope::BACKWARD_FUNCTION &&
first.name_.rfind("autograd::engine::evaluate_function: ", 0) == 0) {
first.sequence_number_ = second.sequence_number_;
first.forward_tid_ = second.forward_tid_;
}
}
template <typename T0, typename T1>
void operator()(T0&, T1&) {}
};
void set_autograd_evaluate(std::vector<std::shared_ptr<Result>>& results) {
auto end = results.size() > 2 ? results.end() - 1 : results.begin();
for (auto it = results.begin(); it < end; ++it) {
if ((*it)->start_tid_ == (*(it + 1))->start_tid_) {
c10::visit(
EvaluateFunctionVisitor(),
(*it)->extra_fields_,
(*(it + 1))->extra_fields_);
}
}
}
} // namespace
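// Drains every subqueue into a single flat vector of Results: backend
// events, Torch op events (joined with their shapes/dtypes, JIT state, extra
// args, and GPU fallback entries), and allocations. Approximate timestamps
// are converted via `time_converter`, autograd evaluate_function events are
// linked to their backward ops, and the output is stably sorted by start
// time.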
std::vector<std::shared_ptr<Result>> RecordQueue::getRecords(
std::function<time_t(approx_time_t)> time_converter) {
auto converter = [&](approx_time_t t) {
return t == std::numeric_limits<approx_time_t>::min()
? std::numeric_limits<int64_t>::min()
: time_converter(t);
};
std::vector<std::shared_ptr<Result>> out;
for (auto& subqueue_it : sub_queues_) {
auto& queue = *subqueue_it.second;
for (auto& i : queue.backend_events_) {
auto start_time = i.start_time_us_;
out.emplace_back(Result::create(
/*start_time_ns_=*/start_time * 1000,
/*start_tid_=*/queue.tid(),
/*kineto_info_=*/queue.kineto_info(),
/*extra_fields_=*/std::move(i)));
}
queue.backend_events_.clear();
auto input_getter = queue.inputs_outputs_.getNextShapesAndDtypes();
auto jit_stack_it = queue.jit_stack_.begin();
auto jit_module_it = queue.jit_modules_.begin();
auto extra_args_it = queue.extra_args_.begin();
auto gpu_fallback_it = queue.gpu_fallback_.begin();
for (auto& i : queue.op_events_) {
auto start_time = converter(i.start_time_);
out.emplace_back(Result::create(
start_time,
/*start_tid_=*/queue.tid(),
/*kineto_info_=*/queue.kineto_info(),
/*extra_fields_=*/
ExtraFields<EventType::TorchOp>(
std::move(i.basic_fields_),
converter(i.end_time_),
input_getter(),
steal_or_default(jit_stack_it),
steal_or_default(jit_module_it),
steal_or_default(extra_args_it),
steal_or_default(gpu_fallback_it))));
}
queue.op_events_.clear();
queue.inputs_outputs_.clear();
queue.jit_stack_.clear();
queue.jit_modules_.clear();
queue.extra_args_.clear();
queue.gpu_fallback_.clear();
for (auto& i : queue.allocations_) {
auto start_time = converter(i.start_time_);
out.emplace_back(Result::create(
start_time,
/*start_tid_=*/queue.tid(),
/*kineto_info_=*/queue.kineto_info(),
/*extra_fields_=*/std::move(i)));
}
queue.allocations_.clear();
}
set_autograd_evaluate(out);
std::stable_sort(out.begin(), out.end(), [](const auto& a, const auto& b) {
return a->start_time_ns_ < b->start_time_ns_;
});
return out;
}
} // namespace impl
} // namespace profiler
} // namespace torch