| #pragma once |
| |
| #include <cstdint> |
| #include <memory> |
| #include <mutex> |
| #include <type_traits> |
| #include <utility> |
| #include <variant> |
| |
| #include <ATen/Context.h> |
| #include <c10/core/Device.h> |
| #include <c10/core/TensorImpl.h> |
| #include <c10/macros/Macros.h> |
| #include <c10/util/flat_hash_map.h> |
| #include <c10/util/strong_type.h> |
| #include <torch/csrc/profiler/containers.h> |
| #include <torch/csrc/profiler/data_flow.h> |
| #include <torch/csrc/profiler/events.h> |
| #include <torch/csrc/profiler/kineto_shim.h> |
| #include <torch/csrc/profiler/orchestration/python_tracer.h> |
| #include <torch/csrc/profiler/perf.h> |
| #include <torch/csrc/profiler/stubs/base.h> |
| #include <torch/csrc/profiler/util.h> |
| #include <torch/csrc/utils/python_stub.h> |
| |
| namespace torch { |
| namespace profiler { |
| namespace impl { |
| |
| enum class EventType : uint8_t { |
| TorchOp = 0, |
| Backend, |
| Vulkan, |
| Allocation, |
| OutOfMemory, |
| PyCall, |
| PyCCall, |
| Kineto |
| }; |
| |
| // ============================================================================ |
| // == Value (Tensor, Scalar) summary ========================================== |
| // ============================================================================ |
| struct TORCH_API RawTensorMetadataBase { |
| RawTensorMetadataBase() = default; |
| explicit RawTensorMetadataBase(const at::Tensor& t); |
| |
| StorageImplData data_; |
| c10::ScalarType dtype_{c10::ScalarType::Undefined}; |
| c10::Layout layout_{c10::Layout::Strided}; |
| uint32_t dim_{0}; |
| }; |
| |
| // Collected during profiling. |
| struct TORCH_API RawTensorMetadata : RawTensorMetadataBase { |
| RawTensorMetadata() = default; |
| RawTensorMetadata(const RawTensorMetadata&) = default; |
| explicit RawTensorMetadata(const at::Tensor& t); |
| |
| // Wrap `weak_self_` in `c10::optional` and split the device into its |
| // components to keep the struct default constructible (which the std::array |
| // initializer requires). |
| c10::optional<WeakTensor> weak_self_; |
| c10::DeviceType device_type_{c10::DeviceType::CPU}; |
| c10::DeviceIndex device_index_{-1}; |
| }; |
| |
| // Used during post processing. |
| struct TORCH_API TensorMetadata : public RawTensorMetadataBase { |
| TensorMetadata( |
| const RawTensorMetadata& r, |
| std::vector<int64_t> sizes, |
| std::vector<int64_t> strides); |
| |
| TensorImplAddress impl() const { |
| return weak_self_.get(); |
| } |
| |
| WeakTensor weak_self_; |
| c10::Device device_; |
| std::vector<int64_t> sizes_; |
| std::vector<int64_t> strides_; |
| |
| // Set during `calculateUniqueTensorIDs`. |
| c10::optional<TensorID> id_; |
| c10::optional<AllocationID> allocation_id_; |
| }; |
| |
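| // A recorded op input is one of: a single Tensor's metadata, a list of |
| // Tensor metadata (for TensorList arguments), a concrete IValue (scalars and |
| // supported scalar lists), or nullopt for inputs that are not captured. |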
| using op_input_t = std::variant< |
| TensorMetadata, |
| std::vector<TensorMetadata>, |
| c10::IValue, |
| c10::nullopt_t>; |
| |
| // ============================================================================ |
| // == ExtraFields ============================================================= |
| // ============================================================================ |
| template <EventType> |
| struct ExtraFields; |
| |
| struct Result; |
| |
| struct TorchOpBasicFields { |
| int64_t sequence_number_{0}; |
| uint64_t forward_tid_{0}; |
| at::RecordScope scope_{}; |
| bool is_async_{false}; |
| int64_t debug_handle_{0}; |
| std::string name_; |
| |
| // Set in the exit callback. |
| uint64_t end_tid_{0}; |
| }; |
| |
| using jit_stack_t = std::vector<std::string>; |
| using jit_modules_t = std::vector<std::string>; |
| using extra_args_t = std::unordered_map<std::string, c10::IValue>; |
| |
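| // Device-side start/end event stubs recorded around an op for the |
| // KINETO_GPU_FALLBACK / KINETO_PRIVATEUSE1_FALLBACK modes (see |
| // `device_fallback_` below); they are used to approximate device timing. |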
| struct FallbackPair { |
| ProfilerVoidEventStub device_event_start_ = nullptr; |
| ProfilerVoidEventStub device_event_end_ = nullptr; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields { |
| ExtraFields( |
| TorchOpBasicFields&& f, |
| uint64_t correlation_id, |
| time_t end_time_ns, |
| std::vector<op_input_t>&& inputs, |
| std::vector<op_input_t>&& concrete_inputs, |
| jit_stack_t&& jit_stack, |
| jit_modules_t&& jit_modules, |
| extra_args_t&& extra_args, |
| FallbackPair&& device_fallback, |
| bool allow_tf32_cublas, |
| std::unique_ptr<perf_counters_t>&& perf_event_counters) |
| : TorchOpBasicFields(std::move(f)), |
| correlation_id_{correlation_id}, |
| end_time_ns_{end_time_ns}, |
| inputs_{std::move(inputs)}, |
| concrete_inputs_{std::move(concrete_inputs)}, |
| jit_stack_{std::move(jit_stack)}, |
| jit_modules_{std::move(jit_modules)}, |
| extra_args_{std::move(extra_args)}, |
| device_fallback_{std::move(device_fallback)}, |
| allow_tf32_cublas_{allow_tf32_cublas}, |
| perf_event_counters_{std::move(perf_event_counters)} {} |
| uint64_t correlation_id_; |
| time_t end_time_ns_; |
| std::vector<op_input_t> inputs_; |
| std::vector<op_input_t> concrete_inputs_; |
| jit_stack_t jit_stack_; |
| jit_modules_t jit_modules_; |
| extra_args_t extra_args_; |
| FallbackPair device_fallback_; |
| bool allow_tf32_cublas_; |
| std::unique_ptr<perf_counters_t> perf_event_counters_; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::Backend> { |
| int64_t start_time_us_; |
| int64_t end_time_us_; |
| int64_t debug_handle_; |
| at::RecordScope scope_; |
| std::string name_; |
| std::string backend_; |
| jit_stack_t jit_stack_; |
| jit_modules_t jit_modules_; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::Vulkan> { |
| using raw_event_t = std::pair<approx_time_t, vulkan_id_t>; |
| std::string name_; |
| int64_t duration_ns_{0}; |
| // While building the event tree, we want to report a Vulkan event's duration |
| // as 0 so that its end time doesn't exceed that of its parent CPU op. |
| bool in_tree_building_{false}; |
| }; |
| |
| struct RawAllocation { |
| torch::profiler::impl::approx_time_t start_time_; |
| void* ptr_; |
| int64_t alloc_size_; |
| size_t total_allocated_; |
| size_t total_reserved_; |
| c10::DeviceType device_type_; |
| c10::DeviceIndex device_index_; |
| }; |
| |
| // For performance. |
| static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation."); |
| |
| template <> |
| struct ExtraFields<EventType::Allocation> : RawAllocation { |
| ExtraFields(const RawAllocation& allocation) : RawAllocation(allocation) {} |
| |
| c10::Device device() const { |
| return {device_type_, device_index_}; |
| } |
| |
| c10::optional<TensorID> id_; |
| c10::optional<AllocationID> allocation_id_; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::OutOfMemory> { |
| torch::profiler::impl::approx_time_t start_time_; |
| int64_t alloc_size_; |
| size_t total_allocated_; |
| size_t total_reserved_; |
| c10::DeviceType device_type_; |
| c10::DeviceIndex device_index_; |
| }; |
| |
| // For performance. |
| static_assert( |
| c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>, |
| "Non-POD member of ExtraFields<EventType::OutOfMemory>."); |
| |
| struct PyFrameState { |
| int line_no_; |
| at::StringView filename_; |
| at::StringView funcname_; |
| }; |
| |
| template <typename T, typename Tag> |
| using strong_t = strong:: |
| type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>; |
| |
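| // Strong typedefs keep the raw PyObject* / PyMethodDef* handles below from |
| // being mixed up while remaining cheap to copy and hash. |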
| using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>; |
| using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>; |
| using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>; |
| using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>; |
| using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>; |
| |
| struct NNModuleInfo { |
| struct ParameterInfo { |
| std::string name_; |
| TensorMetadata metadata_; |
| c10::optional<TensorMetadata> grad_metadata_; |
| }; |
| |
| PyModuleSelf self_; |
| PyModuleCls cls_; |
| at::StringView cls_name_; |
| |
| std::vector<ParameterInfo> parameters_; |
| // Indicates that `self_` is the kth instance of `cls_` observed. |
| size_t id_{std::numeric_limits<size_t>::max()}; |
| }; |
| |
| struct OptimizerInfo { |
| struct ParameterInfo { |
| TensorMetadata metadata_; |
| c10::optional<TensorMetadata> grad_metadata_; |
| std::vector<std::pair<std::string, TensorMetadata>> state_; |
| }; |
| |
| PyOptimizerSelf self_; |
| PyOptimizerCls cls_; |
| at::StringView cls_name_; |
| |
| std::vector<ParameterInfo> parameters_; |
| }; |
| |
| struct PyExtraFieldsBase { |
| PyExtraFieldsBase(time_t end_time_ns, size_t python_tid, PyFrameState caller) |
| : end_time_ns_{end_time_ns}, |
| python_tid_{python_tid}, |
| caller_{std::move(caller)} {} |
| |
| time_t end_time_ns_; |
| size_t python_tid_; |
| PyFrameState caller_; |
| |
| // kth python event observed. (Used by TensorBoard) |
| size_t id_{std::numeric_limits<size_t>::max()}; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase { |
| struct args_t { |
| PyFrameState frame_state_; |
| c10::optional<NNModuleInfo> module_info_; |
| c10::optional<OptimizerInfo> optimizer_info_; |
| }; |
| |
| ExtraFields( |
| time_t end_time_ns, |
| size_t python_tid, |
| PyFrameState caller, |
| args_t args) |
| : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)), |
| callsite_{std::move(args.frame_state_)}, |
| module_{std::move(args.module_info_)}, |
| optimizer_{std::move(args.optimizer_info_)} {} |
| |
| PyFrameState callsite_; |
| c10::optional<NNModuleInfo> module_; |
| c10::optional<OptimizerInfo> optimizer_; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase { |
| using args_t = at::StringView; |
| |
| ExtraFields( |
| time_t end_time_ns, |
| size_t python_tid, |
| PyFrameState caller, |
| args_t args) |
| : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)), |
| function_name_{std::move(args)} {} |
| |
| at::StringView function_name_; |
| }; |
| |
| template <> |
| struct ExtraFields<EventType::Kineto> { |
| // Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used |
| // during post processing to properly embed Kineto events into the broader |
| // profiler tree structure. End users are not generally expected to use these |
| // fields directly, but they are available for debugging. |
| struct Flow { |
| uint32_t id{0}; |
| uint32_t type{0}; |
| uint32_t start{0}; |
| }; |
| |
| std::string name_; |
| int64_t duration_us_{0}; |
| uint64_t correlation_id_{0}; |
| libkineto::ActivityType activity_type_; |
| Flow flow; |
| std::weak_ptr<Result> linked_activity_{}; |
| }; |
| |
| struct TORCH_API Result : public std::enable_shared_from_this<Result> { |
| template <typename... Args> |
| [[nodiscard]] static std::shared_ptr<Result> create(Args... args) { |
| return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...)); |
| } |
| |
| template <typename T> |
| decltype(auto) visit(T&& visitor) { |
| return std::visit(std::forward<T>(visitor), extra_fields_); |
| } |
| |
| template <typename T> |
| decltype(auto) visit(T&& visitor) const { |
| return std::visit(std::forward<T>(visitor), extra_fields_); |
| } |
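| |
| // Illustrative usage sketch (an assumption for exposition, not a prescribed |
| // API): a generic visitor can branch on the active alternative, e.g. |
| // |
| //   result->visit([](const auto& e) { |
| //     using T = std::decay_t<decltype(e)>; |
| //     if constexpr (std::is_same_v<T, ExtraFields<EventType::TorchOp>>) { |
| //       // Handle a Torch op event here. |
| //     } |
| //   }); |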
| |
| template <typename T, typename Fn> |
| void visit_if_base(Fn&& fn) const { |
| visit([&](const auto& extra_fields) { |
| using extra_fields_t = typename std::remove_cv< |
| typename std::remove_reference<decltype(extra_fields)>::type>::type; |
| |
| if constexpr (std::is_base_of_v<T, extra_fields_t>) { |
| fn(extra_fields); |
| } |
| }); |
| } |
| |
| EventType tag() const { |
| return visit([](const auto& i) { return deduceTag(i); }); |
| } |
| |
| std::string name() const; |
| libkineto::ActivityType kinetoType() const; |
| uint64_t correlationID() const; |
| int64_t endTimeNS() const; |
| uint64_t endTID() const; |
| c10::DeviceType deviceType() const; |
| |
| int64_t start_time_ns_; |
| uint64_t start_tid_; |
| kineto::DeviceAndResource kineto_info_; |
| std::variant< |
| ExtraFields<EventType::TorchOp>, |
| ExtraFields<EventType::Backend>, |
| ExtraFields<EventType::Vulkan>, |
| ExtraFields<EventType::Allocation>, |
| ExtraFields<EventType::OutOfMemory>, |
| ExtraFields<EventType::PyCall>, |
| ExtraFields<EventType::PyCCall>, |
| ExtraFields<EventType::Kineto>> |
| extra_fields_; |
| |
| std::weak_ptr<Result> parent_; |
| std::vector<std::shared_ptr<Result>> children_; |
| bool finished_{false}; |
| |
| const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr}; |
| |
| private: |
| template <EventType E> |
| Result( |
| int64_t start_time_ns, |
| uint64_t start_tid, |
| kineto::DeviceAndResource kineto_info, |
| ExtraFields<E>&& extra_fields) |
| : start_time_ns_{start_time_ns}, |
| start_tid_{start_tid}, |
| kineto_info_{kineto_info}, |
| extra_fields_{std::move(extra_fields)} {} |
| |
| template <EventType E> |
| static EventType deduceTag(const ExtraFields<E>&) { |
| return E; |
| } |
| }; |
| |
| struct KinetoObserverContext : public at::ObserverContext { |
| struct Event { |
| TorchOpBasicFields basic_fields_; |
| approx_time_t start_time_; |
| |
| // Set in the exit callback. |
| approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()}; |
| |
| bool allow_tf32_cublas_; |
| std::unique_ptr<perf_counters_t> counters_; |
| }; |
| |
| explicit KinetoObserverContext(Event* event) : event_{event} {} |
| |
| Event* event_; |
| FallbackPair* fallback_{nullptr}; |
| }; |
| |
| constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024; |
| |
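| // Scalar lists longer than this limit are treated as unsupported and are |
| // recorded without concrete values (assumed; see `isSupportedScalarList`). |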
| constexpr int SCALAR_LIST_LENGTH_LIMIT = 30; |
| |
| // InputOutputEncoder |
| // Stores each op event's shapes, dtypes, and concrete values in a |
| // contiguous AppendOnlyList so that we no longer create vectors for shapes |
| // and dtypes on every op; those vectors can be created during |
| // post-processing instead. |
| // The data is split into two categories: input shapes and concrete inputs. |
| class InputOutputEncoder final { |
| public: |
| void push(c10::ArrayRef<const c10::IValue> values); |
| |
| // Used during post-processing to unpack the encoded data. |
| // Each method returns a "supplier" lambda which takes no arguments; |
| // invoking the lambda once will return a list of args that represent |
| // the inputs for one op. |
| // The data is split into two streams: "input shapes" and "concrete inputs". |
| // Note: "auto" only works because these are only used in collection.cpp, |
| // where they are implemented. |
| auto getInputShapeGenerator(); |
| auto getConcreteInputGenerator(); |
| |
| bool isSupportedScalarList(const c10::IValue& list_candidate); |
| |
| void clear(); |
| |
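| // A call to `push` appends one tag-stream entry per input value and closes |
| // the op's record with TERMINATOR. As a rough illustration (assumed |
| // encoding; collection.cpp holds the authoritative logic), an op called |
| // with (Tensor, TensorList of two Tensors, int) would be recorded as: |
| //   Tensor, TensorListBegin, Tensor, Tensor, TERMINATOR, Scalar, TERMINATOR |
| // with tensor metadata and sizes/strides appended to the side lists below. |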
| enum class Tag { |
| Tensor = 0, |
| UndefinedTensor, |
| TensorListBegin, // TODO: generalize to other lists. |
| ScalarList, |
| Scalar, |
| Other, |
| TERMINATOR |
| }; |
| |
| enum class IOType { Shapes, ConcreteInputs, None }; |
| |
| private: |
| void push(const at::Tensor& t); |
| |
| // Implementation detail for getInputShapeGenerator and |
| // getConcreteInputGenerator |
| auto getIValueGenerator(const IOType& io_type); |
| |
| AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_; |
| AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE> |
| tensor_metadata_; |
| AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_; |
| AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_; |
| }; |
| |
| using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler; |
| |
| class TORCH_API ThreadLocalSubqueue { |
| public: |
| ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config); |
| |
| std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn); |
| |
| template <class... Args> |
| void emplace_backend_event(Args&&... args) { |
| backend_events_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| template <class... Args> |
| void emplace_vulkan_event(Args&&... args) { |
| vulkan_events_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| template <class... Args> |
| void emplace_allocation_event(Args&&... args) { |
| allocations_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| template <class... Args> |
| void emplace_ooms_event(Args&&... args) { |
| ooms_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| template <class... Args> |
| void emplace_py_call(Args&&... args) { |
| py_calls_.emplace_back(std::forward<Args>(args)...); |
| } |
| |
| uint64_t tid() const { |
| return tid_; |
| } |
| |
| const kineto::DeviceAndResource& kineto_info() const { |
| return kineto_info_; |
| } |
| |
| inline void disable_perf_profiler(perf_counters_t& counters) const { |
| perf_profiler_->Disable(counters); |
| } |
| |
| private: |
| uint64_t tid_; |
| ProfilerConfig config_; |
| kineto::DeviceAndResource kineto_info_; |
| std::unique_ptr<perf_profiler_t> perf_profiler_; |
| |
| friend class RecordQueue; |
| // See `containers.h` for block size benchmarks. |
| static constexpr size_t BlockSize = 512; |
| |
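| // The lists below are populated per recorded op (gated by the config flags |
| // noted in the comments) and are re-assembled into `Result`s by |
| // `materialize`. |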
| struct TorchOpStorage { |
| // NB: This is a destructive operation. |
| void materialize( |
| std::vector<std::shared_ptr<Result>>& out, |
| const std::function<time_t(approx_time_t)>& time_converter, |
| const uint64_t tid, |
| const kineto::DeviceAndResource& kineto_info); |
| |
| template <typename T, size_t ChunkSize> |
| class EventBlock : public std::array<T, ChunkSize> { |
| public: |
| EventBlock(); |
| uint64_t correlation_id(const T* ptr) const; |
| |
| private: |
| uint64_t id_start_; |
| }; |
| |
| using event_t = KinetoObserverContext::Event; |
| class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> { |
| public: |
| template <class... Args> |
| std::pair<event_t*, uint64_t> emplace_back(Args&&... args); |
| static uint64_t correlationID(const OpList::Iterator& e); |
| } op_events_; |
| |
| // report_input_shapes |
| InputOutputEncoder inputs_outputs_; |
| |
| // with_stack (JIT) |
| AppendOnlyList<jit_stack_t, BlockSize> jit_stack_; |
| |
| // with_modules |
| AppendOnlyList<jit_modules_t, BlockSize> jit_modules_; |
| |
| // with_flops |
| AppendOnlyList<extra_args_t, BlockSize> extra_args_; |
| |
| // ProfilerState::KINETO_GPU_FALLBACK or |
| // ProfilerState::KINETO_PRIVATEUSE1_FALLBACK |
| AppendOnlyList<FallbackPair, BlockSize> device_fallback_; |
| } torch_ops_; |
| |
| // reportBackendEventToActiveKinetoProfiler |
| AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_; |
| |
| // _reportVulkanEventToProfiler |
| AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize> |
| vulkan_events_; |
| |
| // reportMemoryUsage |
| AppendOnlyList<RawAllocation, BlockSize> allocations_; |
| |
| // reportOOMs |
| AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_; |
| |
| // with_stack (Python) |
| AppendOnlyList<std::pair<python_tracer::TraceKey, approx_time_t>, BlockSize> |
| py_calls_; |
| }; |
| |
| class TORCH_API RecordQueue { |
| public: |
| RecordQueue(const ProfilerConfig& config, std::set<ActivityType> activities); |
| |
| bool tracePython() const; |
| ThreadLocalSubqueue* getSubqueue(); |
| void stop(); |
| |
| // NB: This is a destructive operation. |
| std::pair< |
| std::vector<std::shared_ptr<Result>>, |
| std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>> |
| getRecords( |
| std::function<time_t(approx_time_t)> time_converter, |
| uint64_t start_time_us, |
| uint64_t end_time_us); |
| |
| private: |
| uint32_t id_; |
| ProfilerConfig config_; |
| std::set<ActivityType> activities_; |
| ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>> |
| sub_queues_; |
| std::mutex sub_queue_mutex_; |
| std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_; |
| }; |
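| |
| // Typical collection flow (an illustrative sketch with placeholder |
| // arguments; the real orchestration lives in the Kineto profiler glue and |
| // may differ in detail): |
| // |
| //   RecordQueue queue(config, activities); |
| //   auto* subqueue = queue.getSubqueue();      // per-thread, cached |
| //   auto ctx = subqueue->begin_op(fn);         // RecordFunction enter |
| //   // ... op runs; the exit callback fills ctx->event_->end_time_ ... |
| //   queue.stop(); |
| //   auto [events, trace] = queue.getRecords(converter, start_us, end_us); |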
| |
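| // Feature toggles for optional collection behavior: each can be gated |
| // dynamically through a callback (`set_*_enabled_fn`) or pinned to a fixed |
| // value (`set_*_enabled_val`); the `get_*` functions report the effective |
| // setting. |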
| TORCH_API bool get_record_concrete_inputs_enabled(); |
| TORCH_API void set_record_concrete_inputs_enabled_fn(std::function<bool()>); |
| TORCH_API void set_record_concrete_inputs_enabled_val(bool); |
| |
| TORCH_API bool get_fwd_bwd_enabled(); |
| TORCH_API void set_fwd_bwd_enabled_fn(std::function<bool()>); |
| TORCH_API void set_fwd_bwd_enabled_val(bool); |
| |
| TORCH_API bool get_cuda_sync_enabled(); |
| TORCH_API void set_cuda_sync_enabled_fn(std::function<bool()>); |
| TORCH_API void set_cuda_sync_enabled_val(bool); |
| |
| } // namespace impl |
| } // namespace profiler |
| } // namespace torch |