| /* Copyright 2020 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h" |
| |
| #include "absl/container/flat_hash_set.h" |
| #include "absl/strings/str_cat.h" |
| #include "absl/strings/str_join.h" |
| #include "third_party/gpus/cuda/include/cuda.h" |
| #include "tensorflow/core/platform/abi.h" |
| #include "tensorflow/core/platform/host_info.h" |
| #include "tensorflow/core/platform/mutex.h" |
| #include "tensorflow/core/profiler/utils/parse_annotation.h" |
| #include "tensorflow/core/profiler/utils/xplane_builder.h" |
| #include "tensorflow/core/profiler/utils/xplane_schema.h" |
| #include "tensorflow/core/profiler/utils/xplane_utils.h" |
| |
| namespace tensorflow { |
| namespace profiler { |
| |
| namespace { |
| |
| bool IsHostEvent(const CuptiTracerEvent& event, int64* line_id) { |
| // DriverCallback(i.e. kernel launching) events are host events. |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| *line_id = event.thread_id; |
| return true; |
| } |
| // Non-overhead activity events are device events. |
| if (event.type != CuptiTracerEventType::Overhead) { |
| *line_id = event.stream_id; |
| return false; |
| } |
| // Overhead events can be associated with a thread or a stream, etc. |
| // If a valid thread id is specified, we consider it as a host event. |
| // |
| if (event.stream_id != CuptiTracerEvent::kInvalidStreamId) { |
| *line_id = event.stream_id; |
| return false; |
| } else if (event.thread_id != CuptiTracerEvent::kInvalidThreadId && |
| event.thread_id != 0) { |
| *line_id = event.thread_id; |
| return true; |
| } else { |
| *line_id = kThreadIdOverhead; |
| return false; |
| } |
| } |
| |
| void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane, |
| uint64 start_gpu_ns, uint64 end_gpu_ns, XLineBuilder* line) { |
| if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns || |
| event.start_time_ns > event.end_time_ns) { |
| VLOG(2) << "events have abnormal timestamps:" << event.name |
| << " start time(ns): " << event.start_time_ns |
| << " end time(ns): " << event.end_time_ns; |
| return; |
| } |
| std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str()); |
| if (kernel_name.empty()) { |
| kernel_name = GetTraceEventTypeName(event.type); |
| } |
| XEventMetadata* event_metadata = |
| plane->GetOrCreateEventMetadata(std::move(kernel_name)); |
| XEventBuilder xevent = line->AddEvent(*event_metadata); |
| xevent.SetTimestampNs(event.start_time_ns); |
| xevent.SetEndTimestampNs(event.end_time_ns); |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| xevent.AddStatValue( |
| *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)), |
| event.device_id); |
| } |
| if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { |
| xevent.AddStatValue(*plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kCorrelationId)), |
| event.correlation_id); |
| } |
| if (!event.annotation.empty()) { |
| xevent.AddStatValue(*plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kKernelAnnotation)), |
| *plane->GetOrCreateStatMetadata(event.annotation)); |
| } |
| if (event.context_id != CuptiTracerEvent::kInvalidContextId) { |
| xevent.AddStatValue( |
| *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)), |
| absl::StrCat("$$", static_cast<uint64>(event.context_id))); |
| } |
| if (event.type == CuptiTracerEventType::Kernel) { |
| std::string kernel_details = absl::StrCat( |
| "regs:", event.kernel_info.registers_per_thread, |
| " shm:", event.kernel_info.static_shared_memory_usage, |
| " grid:", event.kernel_info.grid_x, ",", event.kernel_info.grid_y, ",", |
| event.kernel_info.grid_z, " block:", event.kernel_info.block_x, ",", |
| event.kernel_info.block_y, ",", event.kernel_info.block_z); |
| xevent.AddStatValue(*plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kKernelDetails)), |
| *plane->GetOrCreateStatMetadata(kernel_details)); |
| } else if (event.type == CuptiTracerEventType::MemcpyH2D || |
| event.type == CuptiTracerEventType::MemcpyD2H || |
| event.type == CuptiTracerEventType::MemcpyD2D || |
| event.type == CuptiTracerEventType::MemcpyP2P || |
| event.type == CuptiTracerEventType::MemcpyOther) { |
| const auto& memcpy_info = event.memcpy_info; |
| std::string memcpy_details = absl::StrCat("size:", memcpy_info.num_bytes, |
| " dest:", memcpy_info.destination, |
| " async:", memcpy_info.async); |
| xevent.AddStatValue( |
| *plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kMemcpyDetails)), |
| *plane->GetOrCreateStatMetadata(std::move(memcpy_details))); |
| } else if (event.type == CuptiTracerEventType::MemoryAlloc) { |
| std::string memalloc_details = |
| absl::StrCat("num_bytes:", event.memalloc_info.num_bytes); |
| xevent.AddStatValue( |
| *plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kMemallocDetails)), |
| *plane->GetOrCreateStatMetadata(std::move(memalloc_details))); |
| } |
| |
| std::vector<Annotation> annotation_stack = |
| ParseAnnotationStack(event.annotation); |
| // If multiple metadata have the same key name, show the values from the top |
| // of the stack (innermost annotation). Concatenate the values from "hlo_op". |
| absl::flat_hash_set<absl::string_view> key_set; |
| std::vector<absl::string_view> hlo_op_names; |
| for (auto annotation = annotation_stack.rbegin(); |
| annotation != annotation_stack.rend(); ++annotation) { |
| for (const Annotation::Metadata& metadata : annotation->metadata) { |
| if (metadata.key == "tf_op") { |
| continue; // ignored, obtained from HLO proto via DebugInfoMap |
| } else if (key_set.insert(metadata.key).second) { |
| xevent.ParseAndAddStatValue( |
| *plane->GetOrCreateStatMetadata(metadata.key), metadata.value); |
| } |
| } |
| } |
| if (!annotation_stack.empty()) { |
| xevent.AddStatValue( |
| *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)), |
| *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name)); |
| } |
| } |
| |
| absl::optional<int> GetDeviceAttribute(CUdevice device, |
| CUdevice_attribute attrib) { |
| int ret_val; |
| CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device); |
| if (err != CUDA_SUCCESS) return absl::nullopt; |
| return ret_val; |
| } |
| |
| std::string GetDeviceXLineName( |
| int64 stream_id, absl::flat_hash_set<CuptiTracerEventType>& event_types) { |
| std::string line_name = absl::StrCat("Stream #", stream_id); |
| event_types.erase(CuptiTracerEventType::Unsupported); |
| if (event_types.empty()) return line_name; |
| if (event_types.count(CuptiTracerEventType::Overhead)) |
| return "CUPTI overhead"; |
| std::vector<const char*> type_names; |
| for (const auto event_type : event_types) { |
| type_names.emplace_back(GetTraceEventTypeName(event_type)); |
| } |
| return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")"); |
| } |
| |
| } // namespace |
| |
| void AnnotationMap::Add(uint32 device_id, uint32 correlation_id, |
| const std::string& annotation) { |
| if (annotation.empty()) return; |
| VLOG(3) << "Add annotation: device_id: " << device_id |
| << " correlation_id: " << correlation_id |
| << " annotation: " << annotation; |
| if (device_id >= per_device_map_.size()) return; |
| auto& per_device_map = per_device_map_[device_id]; |
| absl::MutexLock lock(&per_device_map.mutex); |
| if (per_device_map.annotations.size() < max_size_) { |
| absl::string_view annotation_str = |
| *per_device_map.annotations.insert(annotation).first; |
| per_device_map.correlation_map.emplace(correlation_id, annotation_str); |
| } |
| } |
| |
| absl::string_view AnnotationMap::LookUp(uint32 device_id, |
| uint32 correlation_id) { |
| if (device_id >= per_device_map_.size()) return absl::string_view(); |
| auto& per_device_map = per_device_map_[device_id]; |
| absl::MutexLock lock(&per_device_map.mutex); |
| auto it = per_device_map.correlation_map.find(correlation_id); |
| return it != per_device_map.correlation_map.end() ? it->second |
| : absl::string_view(); |
| } |
| |
| // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and |
| // eventually convert and filter them to StepStats or XSpace. |
| class CuptiTraceCollectorImpl : public CuptiTraceCollector { |
| public: |
| CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option, |
| uint64 start_walltime_ns, uint64 start_gpu_ns) |
| : CuptiTraceCollector(option), |
| num_callback_events_(0), |
| num_activity_events_(0), |
| start_walltime_ns_(start_walltime_ns), |
| start_gpu_ns_(start_gpu_ns), |
| num_gpus_(option.num_gpus), |
| per_device_collector_(option.num_gpus) {} |
| |
| void AddEvent(CuptiTracerEvent&& event) override { |
| if (event.device_id >= num_gpus_) return; |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| if (num_callback_events_ > options_.max_callback_api_events) { |
| OnEventsDropped("total driver(callback) events reaches max", 1); |
| return; |
| } |
| num_callback_events_++; |
| } else { |
| if (num_activity_events_ > options_.max_activity_api_events) { |
| OnEventsDropped("total device(activity) events reaches max", 1); |
| return; |
| } |
| num_activity_events_++; |
| } |
| per_device_collector_[event.device_id].AddEvent(std::move(event)); |
| } |
| void OnEventsDropped(const std::string& reason, uint32 num_events) override { |
| absl::MutexLock lock(&mutex_); |
| dropped_events_[reason] += num_events; |
| } |
| void Flush() override {} |
| void Export(StepStats* step_stats) override { |
| LOG(INFO) << " GpuTracer has collected " << num_callback_events_ |
| << " callback api events and " << num_activity_events_ |
| << " activity events. " << ReportDroppedEvents(); |
| for (int i = 0; i < num_gpus_; ++i) { |
| per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_, |
| step_stats); |
| } |
| } |
| // Returns true if some GPU events are captured. |
| bool Export(XSpace* space, uint64 end_gpu_ns) override { |
| LOG(INFO) << " GpuTracer has collected " << num_callback_events_ |
| << " callback api events and " << num_activity_events_ |
| << " activity events. " << ReportDroppedEvents(); |
| size_t num_events = 0; |
| XPlaneBuilder host_plane( |
| FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName)); |
| for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) { |
| std::string name = GpuPlaneName(device_ordinal); |
| XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name)); |
| device_plane.SetId(device_ordinal); |
| num_events += per_device_collector_[device_ordinal].Flush( |
| start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane); |
| per_device_collector_[device_ordinal].GetDeviceCapabilities( |
| device_ordinal, &device_plane); |
| NormalizeTimeStamps(&device_plane, start_walltime_ns_); |
| } |
| NormalizeTimeStamps(&host_plane, start_walltime_ns_); |
| return num_events > 0; |
| } |
| std::string ReportDroppedEvents() { |
| absl::MutexLock lock(&mutex_); |
| string result; |
| for (const auto& dropped : dropped_events_) { |
| absl::StrAppend(&result, " ", dropped.second, " events dropped because ", |
| dropped.first, ";"); |
| } |
| if (!result.empty()) result.back() = '.'; |
| return result; |
| } |
| std::string ReportNumEventsIfDropped() override { |
| std::string events_dropped = ReportDroppedEvents(); |
| if (events_dropped.empty()) return ""; |
| return absl::StrCat("Detected GPU events dropped on ", port::Hostname(), |
| ": Profiler has collected ", |
| num_callback_events_.load(), " driver events and ", |
| num_activity_events_.load(), " device events.", |
| events_dropped); |
| } |
| |
| private: |
| std::atomic<int> num_callback_events_; |
| std::atomic<int> num_activity_events_; |
| absl::Mutex mutex_; |
| absl::flat_hash_map<std::string, uint64> dropped_events_ |
| ABSL_GUARDED_BY(mutex_); |
| uint64 start_walltime_ns_; |
| uint64 start_gpu_ns_; |
| int num_gpus_; |
| |
| // Set the all XLines of specified XPlane to starting walltime. |
| // Events time in both host and device planes are CUTPI timestamps. |
| // We set initial CUPTI timestamp as start time for all lines to reflect |
| // this fact. Eventually we change line start time to corresponding |
| // start_walltime_ns to normalize with CPU wall time. |
| static void NormalizeTimeStamps(XPlaneBuilder* plane, |
| uint64 start_walltime_ns) { |
| plane->ForEachLine( |
| [&](XLineBuilder line) { line.SetTimestampNs(start_walltime_ns); }); |
| } |
| |
| struct CorrelationInfo { |
| CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {} |
| uint32 thread_id; |
| uint64 enqueue_time_ns; |
| }; |
| struct PerDeviceCollector { |
| void AddEvent(CuptiTracerEvent&& event) { |
| mutex_lock l(m); |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| // Cupti api callback events were used to populate launch times etc. |
| if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { |
| correlation_info.insert( |
| {event.correlation_id, |
| CorrelationInfo(event.thread_id, event.start_time_ns)}); |
| } |
| events.emplace_back(std::move(event)); |
| } else { |
| // Cupti activity events measure device times etc. |
| events.emplace_back(std::move(event)); |
| } |
| } |
| |
| void Flush(int32 device_ordinal, uint64 start_walltime_ns, |
| uint64 start_gpu_ns, StepStats* step_stats) { |
| mutex_lock l(m); |
| absl::flat_hash_map<std::pair<int64 /*stream_id*/, CuptiTracerEventType>, |
| DeviceStepStats*> |
| stream_dev_stats_map; |
| DeviceStepStats* unknown_stream_dev_stats = nullptr; |
| DeviceStepStats* all_streams_dev_stats = nullptr; |
| DeviceStepStats* memcpy_dev_stats = nullptr; |
| DeviceStepStats* sync_dev_stats = nullptr; |
| for (const CuptiTracerEvent& event : events) { |
| NodeExecStats* ns = new NodeExecStats; |
| ns->set_all_start_micros( |
| (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000); |
| ns->set_op_start_rel_micros(0); |
| auto elapsed_ns = event.end_time_ns - event.start_time_ns; |
| ns->set_op_end_rel_micros(elapsed_ns / 1000); |
| ns->set_all_end_rel_micros(elapsed_ns / 1000); |
| |
| if (event.source == CuptiTracerEventSource::DriverCallback) { |
| // Legacy code ignore all other launch events except |
| // cuStreamSynchronize. |
| if (event.name == "cuStreamSynchronize") { |
| ns->set_node_name(event.name); |
| ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id)); |
| ns->set_thread_id(event.thread_id); |
| if (sync_dev_stats == nullptr) { |
| sync_dev_stats = step_stats->add_dev_stats(); |
| sync_dev_stats->set_device( |
| absl::StrCat("/device:GPU:", device_ordinal, "/sync")); |
| } |
| sync_dev_stats->add_node_stats()->Swap(ns); |
| } |
| } else { // CuptiTracerEventSource::Activity |
| // Get launch information if available. |
| if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { |
| auto it = correlation_info.find(event.correlation_id); |
| if (it != correlation_info.end()) { |
| ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000); |
| ns->set_thread_id(it->second.thread_id); |
| } |
| } |
| |
| auto annotation_stack = ParseAnnotationStack(event.annotation); |
| std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str()); |
| std::string activity_name = |
| !annotation_stack.empty() |
| ? std::string(annotation_stack.back().name) |
| : kernel_name; |
| ns->set_node_name(activity_name); |
| switch (event.type) { |
| case CuptiTracerEventType::Kernel: { |
| ns->set_timeline_label(absl::StrCat( |
| kernel_name, " regs:", event.kernel_info.registers_per_thread, |
| " shm:", event.kernel_info.static_shared_memory_usage, |
| " grid: ", event.kernel_info.grid_x, ",", |
| event.kernel_info.grid_y, ",", event.kernel_info.grid_z, |
| " block:", event.kernel_info.block_x, ",", |
| event.kernel_info.block_y, ",", event.kernel_info.block_z, |
| "@@", event.annotation)); |
| DeviceStepStats*& stream_dev_stats = |
| stream_dev_stats_map[std::make_pair(event.stream_id, |
| event.type)]; |
| if (stream_dev_stats == nullptr) { |
| stream_dev_stats = step_stats->add_dev_stats(); |
| stream_dev_stats->set_device( |
| absl::StrCat("/device:GPU:", device_ordinal, |
| "/stream:", event.stream_id)); |
| } |
| *stream_dev_stats->add_node_stats() = *ns; |
| if (all_streams_dev_stats == nullptr) { |
| all_streams_dev_stats = step_stats->add_dev_stats(); |
| all_streams_dev_stats->set_device(absl::StrCat( |
| "/device:GPU:", device_ordinal, "/stream:all")); |
| } |
| all_streams_dev_stats->add_node_stats()->Swap(ns); |
| break; |
| } |
| case CuptiTracerEventType::MemcpyH2D: |
| case CuptiTracerEventType::MemcpyD2H: |
| case CuptiTracerEventType::MemcpyD2D: |
| case CuptiTracerEventType::MemcpyP2P: { |
| std::string details = absl::StrCat( |
| activity_name, " bytes:", event.memcpy_info.num_bytes); |
| if (event.memcpy_info.async) { |
| absl::StrAppend(&details, " aync"); |
| } |
| if (event.memcpy_info.destination != event.device_id) { |
| absl::StrAppend(&details, |
| " to device:", event.memcpy_info.destination); |
| } |
| ns->set_timeline_label(std::move(details)); |
| DeviceStepStats*& stream_dev_stats = |
| stream_dev_stats_map[std::make_pair(event.stream_id, |
| event.type)]; |
| if (stream_dev_stats == nullptr) { |
| stream_dev_stats = step_stats->add_dev_stats(); |
| stream_dev_stats->set_device(absl::StrCat( |
| "/device:GPU:", device_ordinal, "/stream:", event.stream_id, |
| "<", GetTraceEventTypeName(event.type), ">")); |
| } |
| *stream_dev_stats->add_node_stats() = *ns; |
| if (memcpy_dev_stats == nullptr) { |
| memcpy_dev_stats = step_stats->add_dev_stats(); |
| memcpy_dev_stats->set_device( |
| absl::StrCat("/device:GPU:", device_ordinal, "/memcpy")); |
| } |
| memcpy_dev_stats->add_node_stats()->Swap(ns); |
| break; |
| } |
| default: |
| ns->set_timeline_label(activity_name); |
| if (unknown_stream_dev_stats == nullptr) { |
| unknown_stream_dev_stats = step_stats->add_dev_stats(); |
| unknown_stream_dev_stats->set_device( |
| absl::StrCat("/device:GPU:", device_ordinal, "/stream:")); |
| } |
| unknown_stream_dev_stats->add_node_stats()->Swap(ns); |
| break; |
| } |
| } |
| } |
| events.clear(); |
| } |
| |
| size_t Flush(uint64 start_gpu_ns, uint64 end_gpu_ns, |
| XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) { |
| mutex_lock l(m); |
| // Tracking event types per line. |
| absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>> |
| events_types_per_line; |
| for (auto& event : events) { |
| int64 line_id = CuptiTracerEvent::kInvalidThreadId; |
| bool is_host_event = IsHostEvent(event, &line_id); |
| if (line_id == CuptiTracerEvent::kInvalidThreadId || |
| line_id == CuptiTracerEvent::kInvalidStreamId) |
| continue; |
| auto* plane = is_host_event ? host_plane : device_plane; |
| XLineBuilder line = plane->GetOrCreateLine(line_id); |
| line.SetTimestampNs(start_gpu_ns); |
| CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line); |
| events_types_per_line[line_id].emplace(event.type); |
| } |
| device_plane->ForEachLine([&](XLineBuilder line) { |
| line.SetName( |
| GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()])); |
| }); |
| host_plane->ForEachLine([&](XLineBuilder line) { |
| line.SetName(absl::StrCat("Host Threads/", line.Id())); |
| }); |
| size_t num_events = events.size(); |
| events.clear(); |
| return num_events; |
| } |
| |
| void GetDeviceCapabilities(int32 device_ordinal, |
| XPlaneBuilder* device_plane) { |
| CUdevice device; |
| if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return; |
| |
| auto clock_rate_in_khz = |
| GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE); |
| if (clock_rate_in_khz) { |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapClockRateKHz)), |
| *clock_rate_in_khz); |
| } |
| |
| auto core_count = |
| GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT); |
| if (core_count) { |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapCoreCount)), |
| *core_count); |
| } |
| |
| auto mem_clock_khz = |
| GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE); |
| auto mem_bus_width_bits = GetDeviceAttribute( |
| device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH); |
| if (mem_clock_khz && mem_bus_width_bits) { |
| // Times 2 because HBM is DDR memory; it gets two data bits per each |
| // data lane. |
| auto memory_bandwidth = |
| uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8; |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapMemoryBandwidth)), |
| memory_bandwidth); |
| } |
| |
| size_t total_memory = 0; |
| if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) { |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapMemorySize)), |
| static_cast<uint64>(total_memory)); |
| } |
| |
| auto compute_capability_major = GetDeviceAttribute( |
| device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); |
| if (compute_capability_major) { |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapComputeCapMajor)), |
| *compute_capability_major); |
| } |
| auto compute_capability_minor = GetDeviceAttribute( |
| device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); |
| if (compute_capability_minor) { |
| device_plane->AddStatValue( |
| *device_plane->GetOrCreateStatMetadata( |
| GetStatTypeStr(StatType::kDevCapComputeCapMinor)), |
| *compute_capability_minor); |
| } |
| } |
| |
| mutex m; |
| std::vector<CuptiTracerEvent> events TF_GUARDED_BY(m); |
| absl::flat_hash_map<uint32, CorrelationInfo> correlation_info |
| TF_GUARDED_BY(m); |
| }; |
| absl::FixedArray<PerDeviceCollector> per_device_collector_; |
| |
| TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl); |
| }; |
| |
| std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector( |
| const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns, |
| const uint64 start_gputime_ns) { |
| return absl::make_unique<CuptiTraceCollectorImpl>(options, start_walltime_ns, |
| start_gputime_ns); |
| } |
| |
| } // namespace profiler |
| } // namespace tensorflow |