#pragma once
#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
#include <ATen/Context.h>
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <c10/util/variant.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>
namespace torch {
namespace profiler {
namespace impl {
enum class EventType : uint8_t {
TorchOp = 0,
Backend,
Allocation,
PyCall,
PyCCall
};
template <EventType>
struct ExtraFields;
struct TorchOpBasicFields {
int64_t sequence_number_;
uint64_t forward_tid_;
at::RecordScope scope_;
bool is_async_;
int64_t debug_handle_;
std::string name_;
// Set in the exit callback.
uint64_t end_tid_{0};
};
struct Inputs {
std::vector<std::vector<int64_t>> shapes_;
std::vector<std::string> dtypes_;
};
using jit_stack_t = std::vector<std::string>;
using jit_modules_t = std::vector<std::string>;
using extra_args_t = std::unordered_map<std::string, c10::IValue>;
struct FallbackPair {
ProfilerEventStub cuda_event_start_ = nullptr;
ProfilerEventStub cuda_event_end_ = nullptr;
};
template <>
struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
ExtraFields(
TorchOpBasicFields&& f,
uint64_t correlation_id,
time_t end_time_ns,
Inputs&& inputs,
jit_stack_t&& jit_stack,
jit_modules_t&& jit_modules,
extra_args_t&& extra_args,
FallbackPair&& gpu_fallback)
: TorchOpBasicFields(std::move(f)),
correlation_id_{correlation_id},
end_time_ns_{end_time_ns},
inputs_{std::move(inputs)},
jit_stack_{std::move(jit_stack)},
jit_modules_{std::move(jit_modules)},
extra_args_{std::move(extra_args)},
gpu_fallback_{std::move(gpu_fallback)} {}
uint64_t correlation_id_;
time_t end_time_ns_;
Inputs inputs_;
jit_stack_t jit_stack_;
jit_modules_t jit_modules_;
extra_args_t extra_args_;
FallbackPair gpu_fallback_;
};
template <>
struct ExtraFields<EventType::Backend> {
int64_t start_time_us_;
int64_t end_time_us_;
int64_t debug_handle_;
at::RecordScope scope_;
std::string name_;
std::string backend_;
jit_stack_t jit_stack_;
jit_modules_t jit_modules_;
};
template <>
struct ExtraFields<EventType::Allocation> {
torch::profiler::impl::approx_time_t start_time_;
void* ptr_;
int64_t alloc_size_;
int64_t total_allocated_;
int64_t total_reserved_;
c10::DeviceType device_type_;
c10::DeviceIndex device_index_;
};
// Allocation events are recorded on the profiler's hot path, so for
// performance we require that this struct remain POD (trivially copyable
// and constructible).
static_assert(
    std::is_pod<ExtraFields<EventType::Allocation>>::value,
    "ExtraFields<EventType::Allocation> is not POD.");
struct PyFrameState {
int line_no_;
at::StringView filename_;
at::StringView funcname_;
};
template <typename T, typename Tag>
using strong_t = strong::
type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;
using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyCFunction = strong_t<PyObject*, struct PyCFunction_>;
struct NNModuleInfo {
PyModuleSelf self_;
PyModuleCls cls_;
at::StringView cls_name_;
// Indicates that `self_` is the kth instance of `cls_` observed.
size_t id_{std::numeric_limits<size_t>::max()};
};
struct PyExtraFieldsBase {
PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller)
: end_time_ns_{end_time_ns}, python_tid_{python_tid}, caller_{caller} {}
time_t end_time_ns_;
size_t python_tid_;
PyFrameState caller_;
// kth python event observed. (Used by TensorBoard)
size_t id_{std::numeric_limits<size_t>::max()};
};
template <>
struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
using args_t = std::pair<PyFrameState, c10::optional<NNModuleInfo>>;
ExtraFields(
time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
: PyExtraFieldsBase(end_time_ns, python_tid, caller),
callsite_{args.first},
module_{args.second} {}
PyFrameState callsite_;
c10::optional<NNModuleInfo> module_;
};
template <>
struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
using args_t = at::StringView;
ExtraFields(
time_t end_time_ns,
size_t python_tid,
PyFrameState caller,
args_t args)
: PyExtraFieldsBase(end_time_ns, python_tid, caller),
function_name_{args} {}
at::StringView function_name_;
};
struct TORCH_API Result : public std::enable_shared_from_this<Result> {
  template <typename... Args>
  [[nodiscard]] static std::shared_ptr<Result> create(Args&&... args) {
    return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...));
  }
std::string name() const;
torch::profiler::impl::kineto::KinetoActivityType kinetoType() const;
uint64_t correlationID() const;
int64_t endTimeNS() const;
uint64_t endTID() const;
c10::DeviceType deviceType() const;
int64_t start_time_ns_;
uint64_t start_tid_;
kineto::DeviceAndResource kineto_info_;
c10::variant<
ExtraFields<EventType::TorchOp>,
ExtraFields<EventType::Backend>,
ExtraFields<EventType::Allocation>,
ExtraFields<EventType::PyCall>,
ExtraFields<EventType::PyCCall>>
extra_fields_;
std::weak_ptr<Result> parent_;
std::vector<std::shared_ptr<Result>> children_;
bool finished_{false};
private:
template <EventType E>
Result(
int64_t start_time_ns,
uint64_t start_tid,
kineto::DeviceAndResource kineto_info,
ExtraFields<E>&& extra_fields)
: start_time_ns_{start_time_ns},
start_tid_{start_tid},
kineto_info_{kineto_info},
extra_fields_{std::move(extra_fields)} {}
};
struct KinetoObserverContext : public at::ObserverContext {
struct Event {
TorchOpBasicFields basic_fields_;
approx_time_t start_time_;
// Set in the exit callback.
approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};
};
explicit KinetoObserverContext(Event* event) : event_{event} {}
Event* event_;
FallbackPair* fallback_{nullptr};
};
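// Lifecycle note: `ThreadLocalSubqueue::begin_op` stamps `start_time_` when
// the op is entered and returns this context; the RecordFunction exit
// callback later fills in `end_time_` (and `basic_fields_.end_tid_`).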
constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;
// InputOutputEncoder
// Stores each op event's shapes and dtypes in contiguous AppendOnlyLists so
// that we no longer allocate vectors for shapes and dtypes on every op.
// Those vectors can instead be created during post-processing.
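// For example (an illustrative sketch of the layout, not a normative spec):
// pushing an op whose inputs are a float tensor `t` of shape [2, 3] and a
// scalar appends roughly
//   tags_:            Tensor, Scalar, TERMINATOR
//   tensor_metadata_: {t.data_ptr(), ScalarType::Float, /*dim_=*/2}
//   tensor_sizes_:    2, 3
// and post-processing reconstitutes the per-op shape and dtype vectors.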
class InputOutputEncoder final {
public:
void push(c10::ArrayRef<const c10::IValue> values);
// Used during post-processing to create vectors for shapes and dtypes.
auto getNextShapesAndDtypes();
void clear();
private:
enum class Tag {
Tensor = 0,
UndefinedTensor,
TensorListBegin, // TODO: generalize to other lists.
Scalar,
Other,
TERMINATOR
};
struct TensorMetadata {
void* ptr_;
c10::ScalarType dtype_;
uint32_t dim_;
};
void push(const at::Tensor& t);
AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
AppendOnlyList<TensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
tensor_metadata_;
AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_;
};
class RecordQueue;
namespace python_tracer {
/*
Libtorch does not depend on Python (e.g. it cannot #include <Python.h>);
however, when the profiler is invoked from libtorch_python it must be able to
ingest the data collected by the Python tracer (`PyEval_SetProfile`).
To resolve this dependency we define a virtual base class and a function to
register a getter. The Python tracer then implements these interfaces and
exposes itself by calling `registerTracer` from
`torch/csrc/autograd/init.cpp`. This pattern of registration for faux Python
dependencies in libtorch is common in the PyTorch codebase.
*/
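/*
For example (an illustrative sketch; the name `PythonTracer` and its
singleton shape are assumptions about the implementing side, which lives in
libtorch_python):

  PythonTracerBase& getTracer() {
    static PythonTracer tracer;
    return tracer;
  }
  // Called once during module initialization:
  torch::profiler::impl::python_tracer::registerTracer(&getTracer);
*/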
using TraceKey = strong::type<
uint64_t,
struct TraceKey_,
strong::regular,
strong::hashable,
strong::ostreamable>;
struct CompressedEvent {
TraceKey key_;
uint64_t system_tid_;
kineto::DeviceAndResource kineto_info_;
time_t enter_t_;
};
struct TORCH_API PythonTracerBase {
static PythonTracerBase& get();
virtual ~PythonTracerBase() = default;
virtual void start(RecordQueue* queue) = 0;
virtual void stop() = 0;
virtual std::vector<std::shared_ptr<Result>> getEvents(
std::function<time_t(approx_time_t)> time_converter,
std::vector<CompressedEvent>& enters) = 0;
virtual void clear() = 0;
};
using GetFn = PythonTracerBase& (*)();
TORCH_API void registerTracer(GetFn get_tracer);
} // namespace python_tracer
class TORCH_API ThreadLocalSubqueue {
public:
ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config);
std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn);
template <class... Args>
void emplace_backend_event(Args&&... args) {
backend_events_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_allocation_event(Args&&... args) {
allocations_.emplace_back(std::forward<Args>(args)...);
}
template <class... Args>
void emplace_py_call(Args&&... args) {
py_calls_.emplace_back(std::forward<Args>(args)...);
}
uint64_t tid() const {
return tid_;
}
const kineto::DeviceAndResource& kineto_info() const {
return kineto_info_;
}
private:
uint64_t tid_;
ProfilerConfig config_;
kineto::DeviceAndResource kineto_info_;
friend class RecordQueue;
// See `containers.h` for block size benchmarks.
static constexpr size_t BlockSize = 512;
template <typename T, size_t ChunkSize>
class EventBlock : public std::array<T, ChunkSize> {
public:
EventBlock();
uint64_t correlation_id(const T* ptr) const {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
ptr >= this->data() && ptr < this->data() + ChunkSize);
return id_start_ + (ptr - this->data());
}
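    // e.g. if `id_start_` is 512 and `ptr` points to the third element of
    // this block (index 2), `correlation_id(ptr)` returns 514.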
private:
uint64_t id_start_;
};
class OpList : public AppendOnlyList<
KinetoObserverContext::Event,
BlockSize,
EventBlock> {
public:
template <class... Args>
std::pair<KinetoObserverContext::Event*, uint64_t> emplace_back(
Args&&... args);
static uint64_t correlationID(const OpList::Iterator& e);
};
OpList op_events_;
// report_input_shapes
InputOutputEncoder inputs_outputs_;
// with_stack
AppendOnlyList<jit_stack_t, BlockSize> jit_stack_;
AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize>
py_calls_;
// with_modules
AppendOnlyList<jit_modules_t, BlockSize> jit_modules_;
// with_flops
AppendOnlyList<extra_args_t, BlockSize> extra_args_;
// ProfilerState::KINETO_GPU_FALLBACK
AppendOnlyList<FallbackPair, BlockSize> gpu_fallback_;
// reportBackendEventToActiveKinetoProfiler
AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_;
// reportMemoryUsage
AppendOnlyList<ExtraFields<EventType::Allocation>, BlockSize> allocations_;
};
class TORCH_API RecordQueue {
public:
RecordQueue(const ProfilerConfig& config, std::set<ActivityType> activities);
bool tracePython() const;
ThreadLocalSubqueue* getSubqueue();
void stop();
// NB: This is a destructive operation.
std::vector<std::shared_ptr<Result>> getRecords(
std::function<time_t(approx_time_t)> time_converter);
private:
uint32_t id_;
ProfilerConfig config_;
std::set<ActivityType> activities_;
ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
sub_queues_;
std::mutex sub_queue_mutex_;
};
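// Typical lifecycle (an illustrative sketch, not a normative contract;
// `clock_converter` stands in for the caller's approx-time conversion):
//   RecordQueue queue(config, activities);
//   auto* subqueue = queue.getSubqueue();  // this thread's subqueue
//   // ... profiler callbacks fill the subqueue via begin_op / emplace_* ...
//   queue.stop();
//   auto results = queue.getRecords(clock_converter);  // drains the queue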
} // namespace impl
} // namespace profiler
} // namespace torch