#pragma once
#ifdef USE_CUDA
#include <nvToolsExt.h>
#endif
#include <thread>
#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <list>
#include <sstream>
#include <forward_list>
#include <tuple>
#include "ATen/ATen.h"
#include "torch/csrc/WindowsTorchApiMacro.h"
#include "torch/csrc/cuda/cuda_check.h"
#ifdef USE_CUDA
#include "ATen/cuda/CUDAContext.h"
#include <cuda_runtime.h>
#endif
namespace torch { namespace autograd {

struct Function;

namespace profiler {

// Round `a` up to the nearest multiple of `b`, e.g. ceilToMultiple(10, 8) == 16.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return ((a + b - 1) / b) * b;
}

// Current timestamp in nanoseconds, taken from high_resolution_clock when it
// is steady and from steady_clock otherwise.
inline uint64_t getTime() {
  using namespace std::chrono;
  using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
}

enum class EventKind {
  Mark,
  PushRange,
  PopRange
};
struct Event {
  Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda)
  : kind_(kind)
  , name_(std::move(name))
  , thread_id_(thread_id) {
#ifdef USE_CUDA
    if (record_cuda) {
      TORCH_CUDA_CHECK(cudaGetDevice(&device_));
      TORCH_CUDA_CHECK(cudaEventCreate(&event));
      auto stream = at::cuda::getCurrentCUDAStream();
      cpu_ns_ = getTime();
      TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
    } else {
      cpu_ns_ = getTime();
    }
#else
    cpu_ns_ = getTime();
#endif
  }

  std::string kind() const {
    switch (kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }

  const std::string& name() const {
    return name_;
  }

  uint32_t thread_id() const {
    return thread_id_;
  }

  // CPU wall-clock time from this event to `e`, in microseconds.
  double cpu_elapsed_us(const Event& e) {
    return (e.cpu_ns_ - cpu_ns_) / 1000.0;
  }

  // GPU time from this event to `e`, in microseconds. Both events must have
  // been recorded with CUDA enabled and on the same device; the call
  // synchronizes on both CUDA events before measuring.
  double cuda_elapsed_us(const Event& e) {
#ifdef USE_CUDA
    if (!e.has_cuda() || !has_cuda()) {
      throw std::logic_error("Events were not recorded for CUDA");
    }
    if (e.device() != device()) {
      throw std::logic_error("Events are not on the same device");
    }
    TORCH_CUDA_CHECK(cudaEventSynchronize(event));
    TORCH_CUDA_CHECK(cudaEventSynchronize(e.event));
    float ms;
    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event, e.event));
    return ms * 1000.0;
#else
    throw std::logic_error("CUDA not enabled");
#endif
  }

  bool has_cuda() const {
#ifdef USE_CUDA
    return event != nullptr;
#else
    return false;
#endif
  }

  int device() const {
    return device_;
  }

private:
  EventKind kind_;
  std::string name_;
  uint32_t thread_id_;
  int64_t cpu_ns_; // signed to allow for negative intervals
#ifdef USE_CUDA
  cudaEvent_t event = nullptr;
#endif
  int device_ = -1;
};
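// A minimal usage sketch (illustrative only, not part of this header): two
// Events recorded around a region are compared with cpu_elapsed_us /
// cuda_elapsed_us. The names `start` and `end` are made up for the example.
//
//   Event start(EventKind::PushRange, "matmul", /*thread_id=*/0, /*record_cuda=*/false);
//   // ... work being timed ...
//   Event end(EventKind::PopRange, "matmul", /*thread_id=*/0, /*record_cuda=*/false);
//   double us = start.cpu_elapsed_us(end);
//   // start.cuda_elapsed_us(end) would additionally require both events to be
//   // recorded with record_cuda == true and on the same device.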
// A forward list of fixed-capacity vectors of Events. Appending never
// reallocates an existing block, so a std::vector resize cannot take a large
// amount of time inside a profiling event.
struct RangeEventList {
  constexpr static size_t MB = 1024 * 1024;
  constexpr static size_t event_block_size = 16 * MB;
  constexpr static size_t num_block_elements =
    event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
  static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
                "num_block_elements is calculated incorrectly");
  using block_type = std::vector<Event>;

  void allocBlock() {
    blocks.emplace_front();
    blocks.front().reserve(num_block_elements);
  }

  template<typename... Args>
  void record(Args&&... args) {
    if (blocks.empty() || blocks.front().size() == num_block_elements) {
      allocBlock();
    }
    blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  // Merge all blocks into a single chronologically ordered vector and reset
  // the list. Blocks are stored newest-first, so each block is prepended.
  std::vector<Event> consolidate() {
    std::vector<Event> result;
    for (auto& block : blocks) {
      result.insert(result.begin(),
                    std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    blocks.clear();
    return result;
  }

  std::forward_list<block_type> blocks;
};
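// Rough sketch of the intended access pattern (assumed, based only on the
// declarations above): record() forwards its arguments to the Event
// constructor while profiling, and consolidate() drains everything once at
// the end.
//
//   RangeEventList list;
//   list.record(EventKind::PushRange, std::string("forward"), /*thread_id=*/0, /*record_cuda=*/false);
//   list.record(EventKind::PopRange, std::string("forward"), /*thread_id=*/0, /*record_cuda=*/false);
//   std::vector<Event> events = list.consolidate(); // oldest event first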
enum class ProfilerState {
  Disabled,
  CPU,  // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};

TORCH_API RangeEventList& getEventList();
TORCH_API void mark(std::string name, bool include_cuda = true);
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();

// RAII helper: pushes a profiler range when constructed and pops it when
// destroyed, so every exit path of the enclosing scope is timed.
struct TORCH_API RecordFunction {
  explicit RecordFunction(Function* fn);
  explicit RecordFunction(std::string name);
  explicit RecordFunction(const char* name);
  ~RecordFunction();

  // Needed only because we don't have Function defined yet.
  void pushFunctionRange(Function* fn);
};
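// Hedged usage sketch: RecordFunction is meant to live on the stack so that
// the matching pop happens automatically. `my_op` is a made-up name.
//
//   void my_op() {
//     RecordFunction guard("my_op"); // range "my_op" starts here
//     // ... do the actual work ...
//   }                                // range ends in ~RecordFunction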
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. Make sure that no
// autograd functions are being executed while these functions are called.
TORCH_API void enableProfiler(ProfilerState new_state);
TORCH_API thread_event_lists disableProfiler();
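// End-to-end sketch of a profiling session, assuming the caller respects the
// NOTE above about thread safety; the post-processing loop is illustrative:
//
//   enableProfiler(ProfilerState::CPU);            // or CUDA / NVTX
//   // ... run the autograd work to be measured ...
//   thread_event_lists lists = disableProfiler();  // one vector<Event> per recorded thread
//   for (auto& events : lists) {
//     // pair up push/pop Events and report cpu_elapsed_us / cuda_elapsed_us
//   }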
} // namespace profiler
}} // namespace torch::autograd