blob: b6e46ac73820f974e877163bb8fb73660aca1678 [file] [log] [blame]
#pragma once
#include <iostream>
#include <mutex>
#include <memory>
#include <vector>
#include <cstdint>
#include <string>
#include <sstream>
#include <forward_list>
#include <tuple>
#include <ATen/ATen.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#ifndef _WIN32
#include <ctime>
#endif
#include <torch/csrc/autograd/record_function.h>
// Opaque handle type: a pointer to the forward-declared struct CUevent_st.
// Only the pointer is used in this header, so the struct's definition is
// never required here.
using CUDAEventStub = struct CUevent_st*;
namespace torch { namespace autograd {
struct Node;
namespace profiler {
struct TORCH_API CUDAStubs {
virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
fail();
}
virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
fail();
return 0.f;
}
virtual void nvtxMarkA(const char* name) {
fail();
}
virtual void nvtxRangePushA(const char* name) {
fail();
}
virtual void nvtxRangePop() {
fail();
}
virtual bool enabled() {
return false;
}
virtual void onEachDevice(std::function<void(int)> op) {
fail();
}
virtual void synchronize() {
fail();
}
virtual ~CUDAStubs();
private:
void fail() {
AT_ERROR("CUDA used in profiler but not enabled.");
}
};
// Declared here, defined outside this header. Presumably installs `stubs` as
// the CUDAStubs implementation used by the profiler -- confirm against the
// definition, including who owns the pointer and how long it must stay valid.
TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
// Rounds `a` up to the nearest multiple of `b` (returns `a` itself when it is
// already a multiple). `b` must be non-zero.
//
// Written with the remainder instead of the usual `(a + b - 1) / b * b` so
// that the intermediate sum cannot wrap around when `a` is close to the
// maximum size_t value; the result is only unrepresentable when the true
// answer itself does not fit in size_t. Kept as a single return statement so
// the function remains usable in C++11 constant expressions.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return (a % b == 0) ? a : a + (b - a % b);
}
#if (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
#include <sys/time.h>
// clock_gettime is not implemented on older versions of OS X (< 10.12).
// If implemented, CLOCK_REALTIME will have already been defined.
// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on
// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not
#endif
// Returns the current time as a count of nanoseconds. The clock source
// depends on the platform:
//   - Windows: std::chrono (high_resolution_clock if it is steady, otherwise
//     steady_clock);
//   - old macOS / iOS: gettimeofday(), which has microsecond resolution;
//   - everything else: clock_gettime(CLOCK_MONOTONIC).
// Values are meant to be compared with each other, not interpreted as dates.
inline int64_t getTime() {
#ifdef _WIN32
  using clock = std::conditional<
      std::chrono::high_resolution_clock::is_steady,
      std::chrono::high_resolution_clock,
      std::chrono::steady_clock>::type;
  const auto since_epoch = clock::now().time_since_epoch();
  return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
#elif (defined(__MACH__) && !defined(CLOCK_REALTIME)) || defined(C10_IOS)
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const int64_t seconds_as_ns = static_cast<int64_t>(tv.tv_sec) * 1000000000;
  const int64_t micros_as_ns = static_cast<int64_t>(tv.tv_usec) * 1000;
  return seconds_as_ns + micros_as_ns;
#else
  // clock_gettime is *much* faster than std::chrono implementation on Linux
  struct timespec ts{};
  clock_gettime(CLOCK_MONOTONIC, &ts);
  const int64_t seconds_as_ns = static_cast<int64_t>(ts.tv_sec) * 1000000000;
  return seconds_as_ns + static_cast<int64_t>(ts.tv_nsec);
#endif
}
// Old GCC versions generate warnings incorrectly
// see https://stackoverflow.com/questions/2463113/g-c0x-enum-class-compiler-warnings
#ifndef _MSC_VER
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif
// Profiling mode selector; carried inside ProfilerConfig::state.
enum class TORCH_API ProfilerState {
Disabled,
CPU, // CPU-only profiling
CUDA, // CPU + CUDA events
NVTX, // only emit NVTX markers
};
struct TORCH_API ProfilerConfig {
ProfilerConfig(ProfilerState state, bool report_input_shapes)
: state(state), report_input_shapes(report_input_shapes) {}
~ProfilerConfig();
ProfilerState state;
bool report_input_shapes;
};
// Kind tag stored in every Event (16-bit underlying type). Event::kind()
// renders these as "mark", "push" and "pop" respectively.
enum class TORCH_API EventKind : uint16_t {
Mark,
PushRange,
PopRange
};
#ifndef _MSC_VER
# pragma GCC diagnostic pop
#endif
// One profiler record: a kind (mark / push / pop), a name, the id of the
// thread it belongs to, optional input shapes, plus the timing information
// filled in by record().
struct TORCH_API Event final {
  // Stores the metadata and then calls record(record_cuda).
  //
  // `shapes` is taken by rvalue reference and its contents are moved into the
  // event, so the caller's vector is left in a moved-from state.
  Event(
      EventKind kind,
      StringView name,
      uint16_t thread_id,
      bool record_cuda,
      std::vector<std::vector<int64_t>>&& shapes = {})
      : name_(std::move(name)),
        kind_(kind),
        thread_id_(thread_id),
        // A named rvalue reference is an lvalue; without std::move the nested
        // vectors would be deep-copied on every event construction.
        shapes_(std::move(shapes)) {
    record(record_cuda);
  }

  // Declared here; the definition lives outside this header.
  void record(bool record_cuda);

  // Textual form of the event kind: "mark", "push" or "pop".
  // Throws std::runtime_error if kind_ holds a value that is not one of the
  // EventKind enumerators.
  std::string kind() const {
    switch(kind_) {
      case EventKind::Mark: return "mark";
      case EventKind::PushRange: return "push";
      case EventKind::PopRange: return "pop";
    }
    throw std::runtime_error("unknown EventKind");
  }

  // Name of the event as a C string, as provided by the stored StringView.
  const char* name() const {
    return name_.str();
  }

  uint16_t thread_id() const {
    return thread_id_;
  }

  // Returns a copy of the recorded shapes.
  std::vector<std::vector<int64_t>> shapes() const {
    return shapes_;
  }

  // CPU time from this event to `e`, in microseconds (the stored nanosecond
  // difference divided by 1000). Negative when `e` was recorded earlier.
  double cpu_elapsed_us(const Event & e) const {
    return (e.cpu_ns_ - cpu_ns_)/(1000.0);
  }

  // Declared here; the definition lives outside this header.
  double cuda_elapsed_us(const Event & e);

  // True when a CUDA event handle is attached to this record.
  bool has_cuda() const {
    return event != nullptr;
  }

  // Device index associated with the event; -1 unless record() assigned one.
  int device() const {
    return device_;
  }

 private:
  // signed to allow for negative intervals, initialized for safety.
  int64_t cpu_ns_ = 0;
  StringView name_;
  EventKind kind_;
  uint16_t thread_id_;
  std::vector<std::vector<int64_t>> shapes_;
  int device_ = -1;
  struct CUevent_st* event = nullptr;
};
// a linked-list of fixed sized vectors, to avoid
// a std::vector resize from taking a large amount of time inside
// a profiling event
struct RangeEventList {
// This mutex is used to serialize access when different threads are writing
// to the same instance of RangeEventList.
std::mutex mutex_;
constexpr static size_t MB = 1024 * 1024;
constexpr static size_t event_block_size = 16 * MB;
constexpr static size_t num_block_elements =
event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
"num_block_elements is calculated incorrectly");
using block_type = std::vector<Event>;
template<typename... Args>
void record(Args&&... args) {
std::lock_guard<std::mutex> guard(mutex_);
if (blocks.empty() || blocks.front().size() == num_block_elements) {
allocBlock();
}
blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> consolidate() {
std::unique_lock<std::mutex> lock(mutex_);
std::forward_list<block_type> localBlocks;
localBlocks.swap(blocks);
lock.unlock();
std::vector<Event> result;
for (auto & block : localBlocks) {
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
return result;
}
std::forward_list<block_type> blocks;
private:
// allocBlock() assumes that mutex_ is held when called, in order to prevent
// multiple threads' block writes stomping over each other.
void allocBlock() {
blocks.emplace_front();
auto & new_block = blocks.front();
new_block.reserve(num_block_elements);
// Materialize all pages in the new block to release jitter when recording events.
const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
ptr < end_ptr; ptr += 4 * 1024) {
(*ptr);
}
}
};
// Declared here, defined outside this header. Returns a reference to a
// RangeEventList; which list is returned (e.g. per-thread or shared) is not
// visible from this header.
TORCH_API RangeEventList& getEventList();
// Declared here, defined outside this header. `include_cuda` defaults to true.
TORCH_API void mark(std::string name, bool include_cuda = true);
// Events grouped as a vector of vectors; judging by the name, presumably one
// inner vector per thread -- confirm against the definition of
// disableProfiler().
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that
// no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerConfig);
TORCH_API thread_event_lists disableProfiler();
TORCH_API bool profilerEnabled();
// Usage:
// {
// RecordProfile guard("filename.trace");
// // code you want to profile
// }
// Then open filename.trace in chrome://tracing
// Scope-based profiling helper (see the usage example in the comment above).
// All member functions are only declared here; their definitions live outside
// this header.
struct TORCH_API RecordProfile {
// Writes to a caller-provided stream. NOTE(review): out_ is a reference
// member, so `out` presumably has to outlive this object -- confirm against
// the definition.
RecordProfile(std::ostream& out);
// Writes to the file named `filename`. NOTE(review): file_ presumably owns
// the stream opened for it -- confirm against the definition.
RecordProfile(const std::string& filename);
~RecordProfile();
private:
void init();
// Declared before out_; member order determines initialization order.
std::unique_ptr<std::ofstream> file_;
std::ostream& out_;
void processEvents(const std::vector<Event*>& events);
};
} // namespace profiler
}} // namespace torch::autograd