| #include <torch/csrc/autograd/profiler.h> |
| #include <torch/csrc/autograd/function.h> |
| #include <torch/csrc/jit/frontend/code_template.h> |
| |
| #include <torch/csrc/jit/frontend/tracer.h> |
| #include <torch/csrc/jit/runtime/operator.h> |
| |
| #include <ATen/core/op_registration/op_registration.h> |
| #include <torch/library.h> |
| |
#include <cstring>
#include <fstream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
| |
| #include <ATen/record_function.h> |
| #include <c10/core/Allocator.h> |
| #include <c10/util/ThreadLocalDebugInfo.h> |
| |
| #include <iostream> |
| |
| namespace torch { namespace autograd { namespace profiler { |
| |
// Converts a TorchScript callstack into (filename, line, function name) entries,
// skipping frames that carry no source information.
std::vector<FileLineFunc> prepareCallstack(const std::vector<jit::StackEntry>& cs) {
| std::vector<FileLineFunc> entries; |
| entries.reserve(cs.size()); |
| for (const auto& entry : cs) { |
| auto& range = entry.range; |
| if (range.source()) { |
| auto& src = range.source(); |
| if (src && src->filename()) { |
| auto line = src->starting_line_no() + |
| src->lineno_for_offset(range.start()); |
| entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); |
| } |
| } |
| } |
| return entries; |
| } |
| |
// Formats callstack entries as "filename(line): funcname" strings.
std::vector<std::string> callstackStr(const std::vector<FileLineFunc>& cs) {
| std::vector<std::string> cs_str; |
| cs_str.reserve(cs.size()); |
| for (const auto& entry : cs) { |
| std::stringstream loc; |
| loc << entry.filename << "(" << entry.line << "): " << entry.funcname; |
| cs_str.push_back(loc.str()); |
| } |
| return cs_str; |
| } |
| |
| // We decompose the profiler logic into the following components: |
| // |
| // ThreadLocalDebugInfo: |
| // |
// ThreadLocalDebugInfo is a thread local mapping from slots to
// debug information structs.
| // ThreadLocalDebugInfo is automatically propagated across thread |
| // boundaries, including the cases of: |
| // - launching async jobs with at::launch |
| // - executing JIT continuations |
| // - moving from the forward threads into autograd (backward) threads |
| // |
// Entries in ThreadLocalDebugInfo are managed by DebugInfoGuard,
// which can be used to add or overwrite an entry in the thread local
| // mapping. A corresponding entry is removed when the guard is destroyed, |
| // potentially revealing the previously set value for the same slot. |
| // |
// For async tasks, slots set in the main thread before launching an async
// task are shared with and visible in the async task.
| // |
// On the other hand, any addition to or overwrite of the mapping by the
// async task is not visible to the main thread, and any modification
// (including removal of entries) made in the main thread is not visible
// to the async task if it happens after the task was launched.
| // |
| // We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store profiler config, |
| // as well as a list of events that happen during profiling. |
// An instance of ThreadLocalDebugInfo is created each time we enter the
// profiler (i.e. enter the profiling context manager / call enableProfiler)
// and uniquely identifies a profiling run.
| // |
// We automatically propagate ThreadLocalDebugInfo into async tasks,
// as well as across JIT continuations and autograd threads, so all
// the operations that happen between profiling start and end
// (not necessarily within the same thread) are recorded, unless the
// profiling slot is overwritten, as in the case of nested profiling
// ranges (in which case events for the subrange are handled by the
// nested profiler).
//
// When we exit a profiling range (either by exiting the profiling context
// manager or by calling disableProfiler), we remove the previously set
// profiling entry from the given thread local mapping and consolidate
// the recorded events into the profiling result.
| // |
| // |
| // ThreadLocalState: |
| // |
// ThreadLocalState takes a 'snapshot' of thread local variables
// using provided getters. It is used together with ThreadLocalStateGuard
// to transfer the snapshot across a thread boundary and set the thread
// local values as they were in the parent task.
//
// The profiler uses ThreadLocalState to propagate its thread local state.
// ThreadLocalState also automatically propagates the profiler callbacks.
| // |
| // |
// at::RecordFunction and observers:
| // |
// The profiler uses the observer mechanism to add a pair of thread local
// callbacks that are executed around a number of predetermined ranges,
// including:
| // - c10/ATen ops |
| // - TorchScript functions/methods |
| // - user defined named ranges (see `record_function` python context manager) |
| // |
// The profiler sets up a pair of callbacks that record profiling events and
// save them into the thread local profiler struct (ThreadLocalDebugInfo,
// PROFILER_STATE slot); see the registration sketch below.
| // |
| // |
| // Thus, the overall logic is: |
| // |
| // enableProfiler: |
| // - checks that profiler is not enabled (otherwise throws) |
| // - pushes new ThreadLocalDebugInfo (slot PROFILER_STATE) as the profiler |
| // config for the current thread |
| // - pushes profiling callbacks for the current thread |
| // |
| // disableProfiler: |
| // - pops PROFILER_STATE slot from the current ThreadLocalDebugInfo and |
| // consolidates events |
| // - removes profiling callbacks |
| // |
| // ThreadLocalState: |
| // - propagates ThreadLocalDebugInfo across threads |
| // - propagates profiler callbacks across threads |
| // |
| // Profiler callbacks: |
// - get the current profiling state (PROFILER_STATE slot in ThreadLocalDebugInfo)
| // - save profiling events into the profiling state |
| // |
| |
| namespace { |
| const CUDAStubs default_stubs; |
| constexpr const CUDAStubs* default_stubs_addr = &default_stubs; |
// Constant initialization guarantees that default_stubs_addr is initialized
// before any static initialization code that may invoke registerCUDAMethods.
| inline const CUDAStubs*& cuda_stubs() { |
| static const CUDAStubs* stubs_ = default_stubs_addr; |
| return stubs_; |
| } |
| } |
| |
| // Profiler state |
| const ProfilerConfig& ProfilerThreadLocalState::config() const { |
| return config_; |
| } |
| |
| thread_event_lists ProfilerThreadLocalState::consolidate() { |
| std::lock_guard<std::mutex> g(state_mutex_); |
| thread_event_lists result; |
| for (auto& kv : event_lists_map_) { |
| auto& list = kv.second; |
| result.emplace_back(list->consolidate()); |
| } |
| // Consolidate remote events if applicable as well. |
| if (remoteProfiledEvents_) { |
| result.insert( |
| result.end(), |
| std::make_move_iterator(remoteProfiledEvents_->begin()), |
| std::make_move_iterator(remoteProfiledEvents_->end())); |
| } |
| return result; |
| } |
| |
| void ProfilerThreadLocalState::mark(std::string name, bool include_cuda) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxMarkA(name.c_str()); |
| } else { |
| LegacyEvent evt( |
| EventKind::Mark, |
| at::StringView(std::move(name)), |
| at::RecordFunction::currentThreadId(), |
| include_cuda && config_.state == ProfilerState::CUDA); |
| evt.setNodeId(at::RecordFunction::getDefaultNodeId()); |
| getEventList().record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::setOrAddRemoteProfiledEvents( |
| std::vector<LegacyEvent>&& remoteProfiledEvents) { |
| // Lock to serialize access from multiple callback threads. |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| if (remoteProfiledEvents_) { |
(*remoteProfiledEvents_).emplace_back(std::move(remoteProfiledEvents));
| } else { |
| remoteProfiledEvents_ = {std::move(remoteProfiledEvents)}; |
| } |
| } |
| |
| void ProfilerThreadLocalState::pushRange( |
| const at::RecordFunction& fn, |
| const bool record_cuda, |
| const char* msg, |
| std::vector<std::vector<int64_t>>&& shapes) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxRangePushA(getNvtxStr( |
| fn.name(), msg, fn.seqNr(), shapes).c_str()); |
| } else { |
| LegacyEvent evt( |
| EventKind::PushRange, |
| fn.name(), |
| at::RecordFunction::currentThreadId(), |
| record_cuda, |
| fn.handle(), |
| std::move(shapes), |
| at::RecordFunction::getDefaultNodeId()); |
| evt.setSequenceNr(fn.seqNr()); |
| evt.setFwdThreadId(fn.forwardThreadId()); |
| evt.setScope((uint8_t)fn.scope()); |
| #ifndef C10_MOBILE |
// A backward node's source range corresponds to its forward node.
| // TODO: consider using C++ stack trace |
| if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { |
| auto cs = prepareCallstack(jit::currentCallstack()); |
| if (cs.empty()) { |
| cs = prepareCallstack(jit::tracer::pythonCallstack()); |
| } |
| evt.setStack(callstackStr(cs)); |
| } |
| #endif |
| getEventList().record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool record_cuda) { |
| if (config_.state == ProfilerState::Disabled) { |
| return; |
| } |
| if (config_.state == ProfilerState::NVTX) { |
| cuda_stubs()->nvtxRangePop(); |
| } else { |
// In some cases RecordFunction (and popRange) may be
// called on a different thread than pushRange.
// As a convention, we put the async pop on the original
// thread and save the current thread id in the pop event.
| LegacyEvent evt( |
| EventKind::PopRange, |
| at::StringView(""), |
| at::RecordFunction::currentThreadId(), |
| record_cuda, |
| fn.handle()); |
| evt.setNodeId(at::RecordFunction::getDefaultNodeId()); |
| getEventList(fn.threadId()).record(std::move(evt)); |
| } |
| } |
| |
| void ProfilerThreadLocalState::reportMemoryUsage( |
| void* /* unused */, |
| int64_t alloc_size, |
| c10::Device device) { |
| if (config_.profile_memory && config_.state != ProfilerState::Disabled) { |
| uint64_t thread_id = at::RecordFunction::currentThreadId(); |
| LegacyEvent evt( |
| EventKind::MemoryAlloc, |
| at::StringView(""), |
| thread_id, |
| config_.state == ProfilerState::CUDA); |
| evt.updateMemoryStats(alloc_size, device); |
| getEventList(thread_id).record(std::move(evt)); |
| } |
| } |
| |
| bool ProfilerThreadLocalState::memoryProfilingEnabled() const { |
| return config_.profile_memory; |
| } |
| |
// Builds the NVTX range name, e.g. "aten::add, seq = 42, sizes = [[2, 3], []]".
std::string ProfilerThreadLocalState::getNvtxStr(
| const at::StringView& name, |
| const char* msg, |
| int64_t sequence_nr, |
| const std::vector<std::vector<int64_t>>& shapes) const { |
| if (sequence_nr >= 0 || shapes.size() > 0) { |
| std::stringstream s; |
| #ifdef __HIP_PLATFORM_HCC__ |
| s << name.str(); |
| #endif |
| if (sequence_nr >= 0) { |
| #ifdef __HIP_PLATFORM_HCC__ |
| s << msg << sequence_nr; |
| #else |
| s << name.str() << msg << sequence_nr; |
| #endif |
| } |
| if (shapes.size() > 0) { |
| s << ", sizes = ["; |
| for (size_t idx = 0; idx < shapes.size(); ++idx) { |
| if (shapes[idx].size() > 0) { |
| s << "["; |
| for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { |
| s << shapes[idx][dim]; |
| if (dim < shapes[idx].size() - 1) { |
| s << ", "; |
| } |
| } |
| s << "]"; |
| } else { |
| s << "[]"; |
| } |
| if (idx < shapes.size() - 1) { |
| s << ", "; |
| } |
| } |
| s << "]"; |
| } |
| return s.str(); |
| } else { |
| return name.str(); |
| } |
| } |
| |
// Returns the event list for the given thread id, creating it if needed;
// a negative thread_id means the current thread.
RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) {
| if (thread_id < 0) { |
| thread_id = at::RecordFunction::currentThreadId(); |
| } |
| RangeEventList* list_ptr = nullptr; |
| std::lock_guard<std::mutex> guard(state_mutex_); |
| auto it = event_lists_map_.find(thread_id); |
| if (it != event_lists_map_.end()) { |
| list_ptr = it->second.get(); |
| } else { |
| auto event_list = std::make_shared<RangeEventList>(); |
| event_lists_map_[thread_id] = event_list; |
| list_ptr = event_list.get(); |
| } |
| return *list_ptr; |
| } |
| |
// Collects per-input tensor sizes; non-tensor and undefined tensor inputs
// produce empty size entries.
std::vector<std::vector<int64_t>> inputSizes(const at::RecordFunction& fn) {
| std::vector<std::vector<int64_t>> sizes; |
| sizes.reserve(fn.inputs().size()); |
| for (const c10::IValue& input : fn.inputs()) { |
| if (!input.isTensor()) { |
| sizes.emplace_back(); |
| continue; |
| } |
| const at::Tensor& tensor = input.toTensor(); |
| if (tensor.defined()) { |
sizes.push_back(tensor.sizes().vec());
| } else { |
| sizes.emplace_back(); |
| } |
| } |
| return sizes; |
| } |
| |
| namespace { |
| |
| enum EventIValueIdx { |
| KIND = 0, |
| NAME, |
| THREAD_ID, |
| HANDLE, |
| NODE_ID, |
| CPU_MEM_USAGE, |
| CPU_NS, |
| CUDA_RECORDED, |
| CUDA_MEM_USAGE, |
| CUDA_DEVICE, |
| CUDA_US, |
| SHAPES, |
| NUM_EVENT_IVALUE_IDX // must be last in list |
| }; |
| |
| enum ProfilerIValueIdx { |
| STATE = 0, |
| REPORT_INPUT_SHAPES, |
| PROFILE_MEMORY, |
| NUM_PROFILER_CFG_IVALUE_IDX // must be last in list |
| }; |
| |
| const std::unordered_set<std::string> disable_cuda_profiling = { |
| "aten::view", |
| "aten::t", |
| "aten::transpose", |
| "aten::stride", |
| "aten::empty", |
| "aten::empty_like", |
| "aten::empty_strided", |
| "aten::as_strided", |
| "aten::expand", |
| "aten::resize_", |
| "aten::squeeze", |
| "aten::unsqueeze", |
| "aten::slice", |
| "aten::_unsafe_view", |
| "aten::size" |
| }; |
| |
| ProfilerThreadLocalState* getProfilerTLSState() { |
| return static_cast<ProfilerThreadLocalState*>( |
| c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE)); |
| } |
| |
| void pushProfilingCallbacksLegacy() { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); |
| auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( |
| [](const at::RecordFunction& fn) { |
| auto state_ptr = getProfilerTLSState(); |
| if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { |
| return nullptr; |
| } |
| bool record_cuda = |
| state_ptr->config().state == ProfilerState::CUDA; |
| if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { |
| record_cuda = false; |
| } |
| |
| auto* msg = (fn.seqNr() >= 0) ? ", seq = " : ""; |
| if (state_ptr->config().report_input_shapes) { |
| auto sizes = inputSizes(fn); |
| state_ptr->pushRange(fn, record_cuda, msg, std::move(sizes)); |
| } else { |
| state_ptr->pushRange(fn, record_cuda, msg); |
| } |
| |
| return nullptr; |
| }, |
| [](const at::RecordFunction& fn, at::ObserverContext*) { |
| auto state_ptr = getProfilerTLSState(); |
| if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { |
| return; |
| } |
| bool record_cuda = |
| state_ptr->config().state == ProfilerState::CUDA; |
| if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { |
| record_cuda = false; |
| } |
| state_ptr->popRange(fn, record_cuda); |
| }) |
| .needsInputs(state_ptr->config().report_input_shapes) |
| .needsIds(true)); |
| state_ptr->setCallbackHandle(handle); |
| } |
| |
| const int kCUDAWarmupStart = 5; |
| |
| } // namespace |
| |
| void registerCUDAMethods(CUDAStubs* stubs) { |
| cuda_stubs() = stubs; |
| } |
| |
| at::IValue ProfilerConfig::toIValue() const { |
| c10::impl::GenericList eventIValueList(at::AnyType::get()); |
| eventIValueList.reserve(NUM_PROFILER_CFG_IVALUE_IDX); |
| eventIValueList.emplace_back(static_cast<int64_t>(state)); |
| eventIValueList.emplace_back(report_input_shapes); |
| eventIValueList.emplace_back(profile_memory); |
| return eventIValueList; |
| } |
| |
| ProfilerConfig ProfilerConfig::fromIValue( |
| const at::IValue& profilerConfigIValue) { |
| TORCH_INTERNAL_ASSERT( |
| profilerConfigIValue.isList(), |
| "Expected IValue to contain type c10::impl::GenericList"); |
| auto ivalues = profilerConfigIValue.toList(); |
| TORCH_INTERNAL_ASSERT( |
| ivalues.size() == NUM_PROFILER_CFG_IVALUE_IDX, |
| c10::str( |
| "Expected exactly ", |
| NUM_PROFILER_CFG_IVALUE_IDX, |
| " ivalues to resconstruct ProfilerConfig.")); |
| return ProfilerConfig( |
| static_cast<ProfilerState>(ivalues.get(ProfilerIValueIdx::STATE).toInt()), |
| ivalues.get(ProfilerIValueIdx::REPORT_INPUT_SHAPES).toBool(), |
| ivalues.get(ProfilerIValueIdx::PROFILE_MEMORY).toBool()); |
| } |
| |
| ProfilerConfig getProfilerConfig() { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK( |
| state_ptr, |
| "Tried to access profiler config, but profiler is not enabled!"); |
| return state_ptr->config(); |
| } |
| |
| bool profilerEnabled() { |
| auto state_ptr = getProfilerTLSState(); |
| return state_ptr && state_ptr->config().state != ProfilerState::Disabled; |
| } |
| |
| void enableProfilerLegacy(const ProfilerConfig& new_config) { |
| TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs()->enabled(), |
| "Can't use NVTX profiler - PyTorch was compiled without CUDA"); |
| |
TORCH_CHECK(
new_config.state != ProfilerState::KINETO,
"ProfilerState::KINETO is not supported by the legacy profiler");
| |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); |
| auto state = std::make_shared<ProfilerThreadLocalState>(new_config); |
| c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); |
| |
| pushProfilingCallbacksLegacy(); |
| |
| if (new_config.state == ProfilerState::CUDA) { |
// event recording appears to have some startup overhead, so we generate
// some dummy events first before recording synchronization events
| for (int idx = 0; idx < kCUDAWarmupStart; ++idx) { |
| cuda_stubs()->onEachDevice([state](int /* unused */) { |
| state->mark("__cuda_startup"); |
| cuda_stubs()->synchronize(); |
| }); |
| } |
| |
// CUDA events must be on the same device, so we need a start event recorded
// for each GPU. We then use this event to synchronize time on the GPU
// with the CPU clock.
cuda_stubs()->onEachDevice([state](int /* unused */) {
| state->mark("__cuda_start_event"); |
| }); |
| } |
| state->mark("__start_profile", false); |
| } |
| |
| thread_event_lists disableProfilerLegacy(c10::optional<ProfilerDisableOptions> profilerDisableOptions) { |
| auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; |
| auto consolidate = profilerDisableOptions ? profilerDisableOptions->consolidate : true; |
// all DebugInfoBase objects are scope-based and are supposed to be managed via DebugInfoGuard
| std::shared_ptr<c10::DebugInfoBase> state; |
| if (cleanupTLSState) { |
| state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); |
| } else { |
| state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); |
| } |
| |
| auto state_ptr = static_cast<ProfilerThreadLocalState*>(state.get()); |
| TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, |
| "Can't disable profiler when it's not running"); |
| |
| if (cleanupTLSState) { |
| at::removeCallback(state_ptr->callbackHandle()); |
| } |
| |
| if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { |
| return thread_event_lists(); |
| } |
| |
| state_ptr->mark("__stop_profile"); |
| // Note that this will erase the underlying events. |
| return state_ptr->consolidate(); |
| } |
| |
| void addEventList(std::vector<LegacyEvent>&& profiledEvents) { |
| auto state_ptr = getProfilerTLSState(); |
| TORCH_CHECK(state_ptr, "Profiler must be enabled."); |
| state_ptr->setOrAddRemoteProfiledEvents(std::move(profiledEvents)); |
| } |
| |
| void LegacyEvent::record(bool record_cuda) { |
| if (record_cuda) { |
| cuda_stubs()->record(&device_, &cuda_event, &cpu_ns_); |
| return; |
| } |
| cpu_ns_ = getTime(); |
| } |
| |
| /* static */ LegacyEvent LegacyEvent::fromIValue(const at::IValue& eventIValue) { |
| TORCH_INTERNAL_ASSERT( |
| eventIValue.isList(), |
| "Expected IValue to contain type c10::impl::GenericList"); |
| auto ivalues = eventIValue.toList(); |
| TORCH_INTERNAL_ASSERT( |
| ivalues.size() >= NUM_EVENT_IVALUE_IDX, |
| "Expected at least ", |
| NUM_EVENT_IVALUE_IDX, |
| " elements to reconstruct LegacyEvent."); |
| |
| // Reconstruct input shapes from ivalues. |
| auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); |
| TORCH_INTERNAL_ASSERT( |
| shapeListIValue.isList(), |
| "Expected profiler shapes IValue to contain type c10::impl::GenericList." |
| ); |
| |
| auto shapeList = shapeListIValue.toList(); |
| std::vector<std::vector<int64_t>> shapes; |
| shapes.reserve(shapeList.size()); |
for (size_t i = 0; i < shapeList.size(); ++i) {
std::vector<int64_t> s;
auto shapeIValue = shapeList.get(i);
TORCH_INTERNAL_ASSERT(
shapeIValue.isList(),
"Expected each profiler shape element to contain shapes of type c10::impl::GenericList.");
| auto curShapesList = shapeIValue.toList(); |
| s.reserve(curShapesList.size()); |
| for (size_t j = 0; j < curShapesList.size(); ++j) { |
| s.emplace_back(curShapesList.get(j).toInt()); |
| } |
| shapes.emplace_back(s); |
| } |
| |
| LegacyEvent evt( |
| static_cast<EventKind>( |
| ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind |
| at::StringView(ivalues.get(EventIValueIdx::NAME).toStringRef()), // name |
| ivalues.get(EventIValueIdx::THREAD_ID).toInt(), // thread_id |
| static_cast<at::RecordFunctionHandle>( |
| ivalues.get(EventIValueIdx::HANDLE).toDouble()), // handle |
| std::move(shapes), // input shapes |
| ivalues.get(EventIValueIdx::NODE_ID).toInt(), // node id |
| true, // is remote |
| ivalues.get(EventIValueIdx::CPU_MEM_USAGE).toInt(), // cpu_mem_usage |
| ivalues.get(EventIValueIdx::CPU_NS).toInt(), // cpu_ns |
| ivalues.get(EventIValueIdx::CUDA_RECORDED).toBool(), // was cuda recorded |
| ivalues.get(EventIValueIdx::CUDA_MEM_USAGE).toInt(), // cuda memory usage |
| ivalues.get(EventIValueIdx::CUDA_DEVICE).toInt(), // device |
| ivalues.get(EventIValueIdx::CUDA_US).toInt() // cuda_us |
| ); |
| return evt; |
| } |
| |
| at::IValue LegacyEvent::toIValue() const { |
| c10::impl::GenericList eventIValueList(at::AnyType::get()); |
| eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); |
| eventIValueList.emplace_back(static_cast<int64_t>(kind_)); |
| eventIValueList.emplace_back(std::string(name_.str())); |
| eventIValueList.emplace_back(static_cast<int64_t>(thread_id_)); |
| eventIValueList.emplace_back(static_cast<double>(handle_)); |
| eventIValueList.emplace_back(node_id_); |
| eventIValueList.emplace_back(cpu_memory_usage_); |
| eventIValueList.emplace_back(cpu_ns_); |
| // CUDA event information |
| bool cuda_profiling_enabled = hasCuda(); |
| eventIValueList.emplace_back(cuda_profiling_enabled); |
| eventIValueList.emplace_back(static_cast<int64_t>(cuda_memory_usage_)); |
| eventIValueList.emplace_back(device_); |
| eventIValueList.emplace_back(cuda_us_); |
| // Shapes |
| c10::impl::GenericList shapesList = |
| c10::impl::GenericList(at::ListType::create(at::IntType::get())); |
| shapesList.reserve(shapes_.size()); |
| for (const auto& shape : shapes_) { |
| c10::impl::GenericList s = c10::impl::GenericList(at::IntType::get()); |
| s.reserve(shape.size()); |
| for (const auto& k : shape) { |
| s.emplace_back(k); |
| } |
| shapesList.emplace_back(s); |
| } |
| eventIValueList.emplace_back(shapesList); |
| return at::IValue(eventIValueList); |
| } |
| |
| double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { |
| TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); |
| TORCH_CHECK( |
| e.device() == device(), |
| c10::str( |
| "Events are not on the same device: ", e.device(), " vs ", device())); |
| if (isRemote() && e.isRemote()) { |
| // validate that cuda_us_ has been set properly. |
| TORCH_INTERNAL_ASSERT(cuda_us_ >= 0 && e.cuda_us_ >= 0); |
| return static_cast<double>(e.cuda_us_ - cuda_us_); |
| } |
| return cuda_stubs()->elapsed(&cuda_event, &e.cuda_event); |
| } |
| |
| CUDAStubs::~CUDAStubs() = default; |
| |
// Chrome trace "complete event" (ph = "X") template used for each recorded range.
static const jit::CodeTemplate event_template(R"(
| { |
| "name": "${name}", |
| "ph": "X", |
| "ts": ${ts}, |
| "dur": ${dur}, |
| "tid": ${tid}, |
| "pid": "CPU Functions", |
| "args": {} |
| })"); |
| |
// Writes matched push/pop event pairs as a JSON array of Chrome trace events.
void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events) {
| TORCH_CHECK(out, "Could not open file"); |
| LegacyEvent* profiler_start = nullptr; |
| for (LegacyEvent* e : events) { |
| if (0 == strcmp(e->name(), "__start_profile")) { |
| profiler_start = e; |
| break; |
| } |
| } |
| TORCH_CHECK(profiler_start, "Could not find __start_profile mark"); |
| |
| struct PairHash { |
size_t operator()(std::pair<at::RecordFunctionHandle, int64_t> p) const
| noexcept { |
| return std::hash<at::RecordFunctionHandle>()(p.first) ^ std::hash<int64_t>()(p.second); |
| } |
| }; |
| std::unordered_map<std::pair<at::RecordFunctionHandle, int64_t>, LegacyEvent*, PairHash> events_map; |
| out << "[\n"; |
| bool first = true; |
| for (LegacyEvent* evt : events) { |
| if (evt->kindStr() == "push") { |
| events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; |
| } else if (evt->kindStr() == "pop") { |
| if (!first) { |
| out << ",\n"; |
| } |
| first = false; |
| auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); |
| TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); |
| LegacyEvent* evt_start = it->second; |
| events_map.erase(it); |
| |
| jit::TemplateEnv env; |
| env.s("name", evt_start->name()); |
| env.d("ts", profiler_start->cpuElapsedUs(*evt_start)); |
| env.d("dur", evt_start->cpuElapsedUs(*evt)); |
| env.d("tid", evt_start->threadId()); |
| out << event_template.format(env); |
| } |
| } |
| out << "]\n"; |
| } |
| |
| RecordProfile::RecordProfile(std::ostream& out) |
| : out_(out) { |
| init(); |
| } |
| |
| RecordProfile::RecordProfile(const std::string& filename) |
| : file_(new std::ofstream(filename)), out_(*file_) { |
| init(); |
| } |
| |
| void RecordProfile::init() { |
| enableProfilerLegacy(ProfilerConfig(ProfilerState::CPU)); |
| } |
| |
| RecordProfile::~RecordProfile() { |
| try { |
| thread_event_lists event_lists = disableProfilerLegacy(); |
| std::vector<LegacyEvent*> events; |
| for (auto& l : event_lists) { |
| for (auto& e : l) { |
| events.push_back(&e); |
| } |
| } |
| processEvents(events); |
| } catch (const std::exception& e) { |
| LOG(ERROR) << e.what() << std::endl; |
| } catch (...) { |
| LOG(ERROR) << "Unknown error" << std::endl; |
| } |
| } |
| |
| void RecordProfile::processEvents(const std::vector<LegacyEvent*>& events) { |
| writeProfilerEventsToStream(out_, events); |
| } |
| |
| }}} |