| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #if GOOGLE_CUDA |
| |
| #include <stdlib.h> |
| |
| #include <memory> |
| |
| #include "absl/container/fixed_array.h" |
| #include "absl/strings/str_cat.h" |
| #include "tensorflow/core/common_runtime/step_stats_collector.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/lib/strings/str_util.h" |
| #include "tensorflow/core/platform/abi.h" |
| #include "tensorflow/core/platform/annotation.h" |
| #include "tensorflow/core/platform/macros.h" |
| #include "tensorflow/core/platform/stringprintf.h" |
| #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h" |
| #include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h" |
| #include "tensorflow/core/profiler/internal/parse_annotation.h" |
| #include "tensorflow/core/profiler/internal/profiler_interface.h" |
| #include "tensorflow/core/util/env_var.h" |
| |
| namespace tensorflow { |
| namespace profiler { |
| |
// Adapter from CuptiTraceCollector to StepStatsCollector: this class converts
// and filters CuptiTracerEvents into tensorflow::NodeExecStats.
// We cannot forward events on the fly because StepStatsCollector has a single
// mutex for all devices; therefore we cache events and forward them only on
// Flush().
| class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { |
| public: |
| StepStatsCuptiTracerAdaptor(const CuptiTracerCollectorOptions& option, |
| const std::string prefix, int num_gpus, |
| uint64 start_walltime_ns, uint64 start_gpu_ns, |
| StepStatsCollector* trace_collector) |
| : CuptiTraceCollector(option), |
| trace_collector_(trace_collector), |
| num_callback_events_(0), |
| num_activity_events_(0), |
| start_walltime_ns_(start_walltime_ns), |
| start_gpu_ns_(start_gpu_ns), |
| num_gpus_(num_gpus), |
| per_device_adaptor_(num_gpus) { |
| for (int i = 0; i < num_gpus; ++i) { // for each device id. |
| per_device_adaptor_[i].stream_device = |
| strings::StrCat(prefix, "/device:GPU:", i, "/stream:"); |
| per_device_adaptor_[i].memcpy_device = |
| strings::StrCat(prefix, "/device:GPU:", i, "/memcpy"); |
| per_device_adaptor_[i].sync_device = |
| strings::StrCat(prefix, "/device:GPU:", i, "/sync"); |
| } |
| } |
| |
| void AddEvent(CuptiTracerEvent&& event) override { |
| if (event.device_id >= num_gpus_) return; |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| if (num_callback_events_ > options_.max_callback_api_events) { |
| OnEventsDropped("trace collector", 1); |
| return; |
| } |
| num_callback_events_++; |
| } else { |
| if (num_activity_events_ > options_.max_activity_api_events) { |
| OnEventsDropped("trace collector", 1); |
| return; |
| } |
| num_activity_events_++; |
| } |
| per_device_adaptor_[event.device_id].AddEvent(std::move(event)); |
| } |
| void OnEventsDropped(const std::string& reason, uint32 num_events) override {} |
| void Flush() override { |
| LOG(INFO) << " GpuTracer has collected " << num_callback_events_ |
| << " callback api events and " << num_activity_events_ |
| << " activity events."; |
| for (int i = 0; i < num_gpus_; ++i) { |
| per_device_adaptor_[i].Flush(trace_collector_, start_walltime_ns_, |
| start_gpu_ns_); |
| } |
| } |
| |
| private: |
| StepStatsCollector* trace_collector_; |
| std::atomic<int> num_callback_events_; |
| std::atomic<int> num_activity_events_; |
| uint64 start_walltime_ns_; |
| uint64 start_gpu_ns_; |
| int num_gpus_; |
| |
| struct CorrelationInfo { |
| CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {} |
| uint32 thread_id; |
| uint64 enqueue_time_ns; |
| }; |
| struct PerDeviceAdaptor { |
| void AddEvent(CuptiTracerEvent&& event) { |
| absl::MutexLock lock(&mutex); |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| // Cupti api callcack events were used to populate launch times etc. |
| if (event.name == "cuStreamSynchronize") { |
| events.emplace_back(std::move(event)); |
| } |
| if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { |
| correlation_info.insert( |
| {event.correlation_id, |
| CorrelationInfo(event.thread_id, event.start_time_ns)}); |
| } |
| } else { |
| // Cupti activity events measure device times etc. |
| events.emplace_back(std::move(event)); |
| } |
| } |
| void Flush(StepStatsCollector* collector, uint64 start_walltime_ns, |
| uint64 start_gpu_ns) { |
| absl::MutexLock lock(&mutex); |
| for (auto& event : events) { |
| NodeExecStats* ns = new NodeExecStats; |
| ns->set_all_start_micros( |
| (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000); |
| ns->set_op_start_rel_micros(0); |
| auto elapsed_ns = event.end_time_ns - event.start_time_ns; |
| ns->set_op_end_rel_micros(elapsed_ns / 1000); |
| ns->set_all_end_rel_micros(elapsed_ns / 1000); |
| |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| DCHECK_EQ(event.name, "cuStreamSynchronize"); |
| ns->set_node_name(event.name); |
| ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id)); |
| ns->set_thread_id(event.thread_id); |
| collector->Save(sync_device, ns); |
| } else { // CuptiTracerEventSource::Activity |
| // Get launch information if available. |
| if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { |
| auto it = correlation_info.find(event.correlation_id); |
| if (it != correlation_info.end()) { |
| ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000); |
| ns->set_thread_id(it->second.thread_id); |
| } |
| } |
| |
| auto annotation_stack = ParseAnnotationStack(event.annotation); |
| std::string activity_name = |
| !annotation_stack.empty() |
| ? std::string(annotation_stack.back().name) |
| : port::MaybeAbiDemangle(event.name.c_str()); |
| switch (event.type) { |
| case CuptiTracerEventType::Kernel: { |
| const std::string details = strings::Printf( |
| "regs:%llu shm:%llu grid:%llu,%llu,%llu block:%llu,%llu,%llu", |
| event.kernel_info.registers_per_thread, |
| event.kernel_info.static_shared_memory_usage, |
| event.kernel_info.grid_x, event.kernel_info.grid_y, |
| event.kernel_info.grid_z, event.kernel_info.block_x, |
| event.kernel_info.block_y, event.kernel_info.block_z); |
| ns->set_timeline_label(absl::StrCat(activity_name, " ", details)); |
| auto nscopy = new NodeExecStats(*ns); |
| collector->Save(absl::StrCat(stream_device, "all"), ns); |
| collector->Save(absl::StrCat(stream_device, event.stream_id), |
| nscopy); |
| break; |
| } |
| case CuptiTracerEventType::MemcpyH2D: |
| case CuptiTracerEventType::MemcpyD2H: |
| case CuptiTracerEventType::MemcpyD2D: |
| case CuptiTracerEventType::MemcpyP2P: { |
| std::string details = absl::StrCat( |
| activity_name, " bytes:", event.memcpy_info.num_bytes); |
| if (event.memcpy_info.async) { |
| absl::StrAppend(&details, " aync"); |
| } |
| if (event.memcpy_info.destination != event.device_id) { |
| absl::StrAppend(&details, |
| " to device:", event.memcpy_info.destination); |
| } |
| ns->set_timeline_label(std::move(details)); |
| auto nscopy = new NodeExecStats(*ns); |
| collector->Save(memcpy_device, ns); |
| collector->Save( |
| absl::StrCat(stream_device, event.stream_id, "<", |
| GetTraceEventTypeName(event.type), ">"), |
| nscopy); |
| break; |
| } |
| default: |
| ns->set_timeline_label(activity_name); |
| collector->Save(stream_device, ns); |
| } |
| ns->set_node_name(std::move(activity_name)); |
| } |
| } |
| } |
| |
| absl::Mutex mutex; |
| std::string stream_device GUARDED_BY(mutex); |
| std::string memcpy_device GUARDED_BY(mutex); |
| std::string sync_device GUARDED_BY(mutex); |
| std::vector<CuptiTracerEvent> events GUARDED_BY(mutex); |
| absl::flat_hash_map<uint32, CorrelationInfo> correlation_info |
| GUARDED_BY(mutex); |
| }; |
| absl::FixedArray<PerDeviceAdaptor> per_device_adaptor_; |
| |
| TF_DISALLOW_COPY_AND_ASSIGN(StepStatsCuptiTracerAdaptor); |
| }; |
| |
// Profiler backend that drives CUPTI to collect GPU kernel/memcpy traces.
| class GpuTracer : public profiler::ProfilerInterface { |
| public: |
| GpuTracer(CuptiTracer* cupti_tracer, CuptiInterface* cupti_interface) |
| : cupti_tracer_(cupti_tracer), |
| cupti_interface_(cupti_interface), |
| trace_collector_(&step_stats_) { |
| VLOG(1) << "GpuTracer created."; |
| } |
| ~GpuTracer() override {} |
| |
| // GpuTracer interface: |
| Status Start() override; |
| Status Stop() override; |
| Status CollectData(RunMetadata* run_metadata) override; |
| profiler::DeviceType GetDeviceType() override { |
| return profiler::DeviceType::kGpu; |
| } |
| |
| private: |
| Status DoStart(); |
| Status DoStop(); |
| |
| enum State { |
| kNotStarted, |
| kStartedOk, |
| kStartedError, |
| kStoppedOk, |
| kStoppedError |
| }; |
| State profiling_state_ = State::kNotStarted; |
| |
| CuptiTracer* cupti_tracer_; |
| CuptiTracerOptions options_; |
| CuptiInterface* cupti_interface_; |
| StepStats step_stats_; |
| StepStatsCollector trace_collector_; |
| std::unique_ptr<StepStatsCuptiTracerAdaptor> step_stats_cupti_adaptor_; |
| }; |
| |
| Status GpuTracer::DoStart() { |
| if (!cupti_tracer_->IsAvailable()) { |
| return errors::Unavailable("Another profile session running."); |
| } |
| |
| options_.cbids_selected = { |
| // KERNEL |
| CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, |
| // MEMCPY |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2, |
| CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2, |
| // GENERIC |
| CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize, |
| }; |
| |
| bool trace_concurrent_kernels = false; |
| ReadBoolFromEnvVar("TF_GPU_CUPTI_FORCE_CONCURRENT_KERNEL", false, |
| &trace_concurrent_kernels) |
| .IgnoreError(); |
| options_.activities_selected.push_back( |
| trace_concurrent_kernels ? CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL |
| : CUPTI_ACTIVITY_KIND_KERNEL); |
| options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY); |
| options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY2); |
| options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_OVERHEAD); |
| |
| #if CUDA_VERSION < 10000 |
| if (!trace_concurrent_kernels) options_.cupti_finalize = true; |
| #endif |
| |
| CuptiTracerCollectorOptions collector_options; |
| uint64 start_gputime_ns = CuptiTracer::GetTimestamp(); |
| uint64 start_walltime_ns = tensorflow::EnvTime::Default()->NowNanos(); |
| int num_gpus = cupti_tracer_->NumGpus(); |
| step_stats_cupti_adaptor_ = absl::make_unique<StepStatsCuptiTracerAdaptor>( |
| collector_options, "", num_gpus, start_walltime_ns, start_gputime_ns, |
| &trace_collector_); |
| |
| tensorflow::tracing::ScopedAnnotation::Enable(true); |
| cupti_tracer_->Enable(options_, cupti_interface_, |
| step_stats_cupti_adaptor_.get()); |
| return Status::OK(); |
| } |
| |
| Status GpuTracer::Start() { |
| Status status = DoStart(); |
| if (status.ok()) { |
| profiling_state_ = State::kStartedOk; |
| return Status::OK(); |
| } else { |
| profiling_state_ = State::kStartedError; |
| return status; |
| } |
| } |
| |
| Status GpuTracer::DoStop() { |
| cupti_tracer_->Disable(); |
| tensorflow::tracing::ScopedAnnotation::Enable(false); |
| return Status::OK(); |
| } |
| |
| Status GpuTracer::Stop() { |
| if (profiling_state_ == State::kStartedOk) { |
| Status status = DoStop(); |
| profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError; |
| } |
| return Status::OK(); |
| } |
| |
// Appends collected GPU step stats to `run_metadata`. Only meaningful after a
// successful Start/Stop pair; all other states return OK without data except
// calling before Stop, which is a FailedPrecondition.
Status GpuTracer::CollectData(RunMetadata* run_metadata) {
  switch (profiling_state_) {
    case State::kNotStarted:
      VLOG(1) << "No trace data collected, session wasn't started";
      return Status::OK();
    case State::kStartedOk:
      return errors::FailedPrecondition("Cannot collect trace before stopping");
    case State::kStartedError:
      LOG(ERROR) << "Cannot collect, xprof failed to start";
      return Status::OK();
    case State::kStoppedError:
      VLOG(1) << "No trace data collected";
      return Status::OK();
    case State::kStoppedOk: {
      // Input run_metadata is shared by profiler interfaces, we need append.
      trace_collector_.Finalize();
      // Swap each device's stats into run_metadata rather than copying.
      for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
        run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
      }
      return Status::OK();
    }
  }
  // Unreachable if every enum value is handled above; kept as a safety net.
  return errors::Internal("Invalid profiling state: ", profiling_state_);
}
| |
| } // namespace profiler |
| |
| // Not in anonymous namespace for testing purposes. |
| std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer( |
| const profiler::ProfilerOptions& options) { |
| if (options.device_type != profiler::DeviceType::kGpu && |
| options.device_type != profiler::DeviceType::kUnspecified) |
| return nullptr; |
| profiler::CuptiTracer* cupti_tracer = |
| profiler::CuptiTracer::GetCuptiTracerSingleton(); |
| if (!cupti_tracer->IsAvailable()) { |
| return nullptr; |
| } |
| profiler::CuptiInterface* cupti_interface = profiler::GetCuptiInterface(); |
| return absl::make_unique<profiler::GpuTracer>(cupti_tracer, cupti_interface); |
| } |
| |
// Static-initialization trick: the immediately-invoked lambda registers the
// GPU tracer factory at load time; the returned 0 only exists to give the
// variable something to hold.
auto register_gpu_tracer_factory = [] {
  RegisterProfilerFactory(&CreateGpuTracer);
  return 0;
}();
| |
| } // namespace tensorflow |
| |
| #endif // GOOGLE_CUDA |