blob: 7fdd6ffd8cbeb56a90b100bb6bb540daac274305 [file] [log] [blame]
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
namespace tensorflow {
namespace profiler {
namespace {
DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) {
DeviceCapabilities cap;
XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_plane);
if (auto clock_rate_khz = plane.GetStats(kDevCapClockRateKHz)) {
cap.set_clock_rate_in_ghz(clock_rate_khz->int64_value() / 1000000.0);
}
if (auto core_count = plane.GetStats(kDevCapCoreCount)) {
cap.set_num_cores(core_count->int64_value());
}
// Set memory bandwidth in bytes/s.
if (auto memory_bw = plane.GetStats(kDevCapMemoryBandwidth)) {
cap.set_memory_bandwidth(memory_bw->int64_value());
}
if (auto memory_size_in_bytes = plane.GetStats(kDevCapMemorySize)) {
cap.set_memory_size_in_bytes(memory_size_in_bytes->uint64_value());
}
if (auto cap_major = plane.GetStats(kDevCapComputeCapMajor)) {
cap.mutable_compute_capability()->set_major(cap_major->int64_value());
}
if (auto cap_minor = plane.GetStats(kDevCapComputeCapMinor)) {
cap.mutable_compute_capability()->set_minor(cap_minor->int64_value());
}
return cap;
}
PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
PerfEnv result;
DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane);
result.set_peak_tera_flops_per_second(GetFlopMaxThroughputPerSM(cap) / 1000 *
cap.num_cores());
result.set_peak_hbm_bw_giga_bytes_per_second(cap.memory_bandwidth() / 1e9);
result.set_ridge_point(result.peak_tera_flops_per_second() * 1000 /
result.peak_hbm_bw_giga_bytes_per_second());
return result;
}
void SetRunEnvironment(int32 accelerator_count, RunEnvironment* env) {
// Currently, we only support profiling one host and one program.
env->set_host_count(1);
env->set_task_count(1);
env->set_device_type(accelerator_count > 0 ? "GPU" : "CPU");
env->set_device_core_count(accelerator_count);
}
void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
OpMetricsDb* op_metrics_db, StepEvents* step_events,
TfFunctionDb* tf_function_db) {
absl::flat_hash_map<int64, TfOp> tf_ops =
CollectTfOpsFromHostThreadsXPlane(*host_plane);
OpMetricsDbCombiner combiner(op_metrics_db);
XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
plane.ForEachLine([&](const XLineVisitor& line) {
ConsumeTfMetricsDbData(
ConvertHostThreadsXLineToTfMetricsDbData(line, tf_ops), &combiner);
CombineStepEvents(ConvertHostThreadsXLineToStepEvents(
line, use_device_step_events, *step_events),
step_events);
CombineTfFunctionDb(ConvertHostThreadsXLineToTfFunctionDb(line),
tf_function_db);
});
}
} // namespace
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
const XPlane* host_plane = FindPlaneWithName(space, kHostThreads);
std::vector<const XPlane*> device_planes =
FindPlanesWithPrefix(space, kGpuPlanePrefix);
OpStats op_stats;
StepEvents step_events;
// Convert device planes.
OpMetricsDbCombiner op_metrics_db_combiner(
op_stats.mutable_device_op_metrics_db());
SetRunEnvironment(device_planes.size(), op_stats.mutable_run_environment());
std::vector<KernelReport> reports;
for (const XPlane* device_trace : device_planes) {
if (!op_stats.has_perf_env()) {
*op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
}
const PerfEnv& perf_env = op_stats.perf_env();
OpMetricsDb device_op_metrics_db = ConvertDeviceTraceXPlaneToOpMetricsDb(
*device_trace, perf_env.peak_tera_flops_per_second(),
perf_env.peak_hbm_bw_giga_bytes_per_second());
op_metrics_db_combiner.Combine(device_op_metrics_db);
CombineStepEvents(ConvertDeviceTraceXPlaneToStepEvents(*device_trace),
&step_events);
KernelStatsDb kernel_stats_db = ConvertDeviceTraceXPlaneToKernelStatsDb(
*device_trace, /*on_kernel_fn=*/{});
reports.insert(reports.begin(), kernel_stats_db.reports().begin(),
kernel_stats_db.reports().end());
}
GroupKernelReports(&reports, op_stats.mutable_kernel_stats_db());
SortKernelsByTotalDurationDesc(op_stats.mutable_kernel_stats_db());
// Convert a host plane.
bool has_device = !device_planes.empty();
if (host_plane) {
ProcessHostPlane(host_plane, has_device,
op_stats.mutable_host_op_metrics_db(), &step_events,
op_stats.mutable_tf_function_db());
}
StepEvents nonoverlapped_step_events = ToNonOverlappedStepEvents(step_events);
*op_stats.mutable_step_db() =
ConvertStepEventsToStepDb(has_device, nonoverlapped_step_events);
*op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
ComputePrecisionStats(nonoverlapped_step_events);
return op_stats;
}
} // namespace profiler
} // namespace tensorflow