// blob: aa7cd563a33f1f883c644c9ac107a2ddabb552ea [file] [log] [blame]
syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
import "tensorflow/core/profiler/protobuf/tf_function.proto";
// Describes the performance environment, e.g., the peak performance
// capabilities of the device.
message PerfEnv {
  // Peak compute performance of a TPU core or a GPU, in TFLOP/s.
  double peak_tera_flops_per_second = 1;

  // Peak memory bandwidth of a TPU core or a GPU, in GiB/s.
  // NOTE(review): the field name spells "giga_bytes" while the documented
  // unit is GiB/s; confirm which unit producers actually write.
  double peak_hbm_bw_giga_bytes_per_second = 2;

  // Ridge point of the roofline model, in FLOP/Byte, i.e., the minimum
  // operational intensity required to achieve maximum performance.
  double ridge_point = 3;
}
// Result message holding the job information that is host-independent.
message HostIndependentJobInfoResult {
  // Change-list number of this build.
  int64 change_list = 1;

  // Time of this build, in nanoseconds since the Unix epoch.
  int64 build_time = 2;

  // Target of this build.
  string build_target = 3;

  // Duration of the profiling, in milliseconds.
  uint32 profile_duration_ms = 4;
}
// Result message holding the job information that is host-dependent.
message HostDependentJobInfoResult {
  // ID of the host on which the job was run.
  string host_id = 1;

  // Command line that was used to run the job.
  string command_line = 2;

  // Start time of this run, in nanoseconds since the Unix epoch.
  int64 start_time = 3;

  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;

  // Wall time at which profiling started, in nanoseconds.
  uint64 profile_time_ns = 5;
}
// Topology of the system: describes the number of chips in a pod and the
// connectivity style.
message SystemTopology {
  // Sizes of the X, Y, and Z dimensions of this topology. A value of 0 means
  // that the dimension does not exist.
  int64 x_dimension = 1;
  int64 y_dimension = 2;
  int64 z_dimension = 3;

  // Number of bad chips expected in this system.
  int64 num_expected_reduced_chips = 4;
}
// Describes the environment in which a profiling session ran.
message RunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;

  // Number of tasks used.
  int32 task_count = 2;

  // Distinct hostnames seen, keyed by hostname.
  // NOTE(review): the meaning of the bool value is not documented here;
  // presumably the map is used as a set. Confirm against the producers.
  map<string, bool> hostnames = 3;

  // Type of device used.
  string device_type = 4;

  // Number of device cores used:
  //   - for TPU, the number of TPU cores;
  //   - for GPU, the number of GPUs (not the number of SMs).
  int32 device_core_count = 5;

  // Batch size per device core.
  int32 per_core_batch_size = 6;

  // Information about this job that is host-independent.
  HostIndependentJobInfoResult host_independent_job_info = 7;

  // Information about this job that is host-dependent.
  repeated HostDependentJobInfoResult host_dependent_job_info = 8;

  // Number of replicas; corresponds to input parallelism.
  // Without model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;

  // Number of cores used for a single replica, e.g., for model parallelism.
  // Without model parallelism, num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;

  // Interconnection topology of the chips.
  SystemTopology topology = 11;

  // Trace level of the host.
  uint32 host_trace_level = 12;
}
// Operator statistics gathered for a profiling session.
message OpStats {
  // Op metrics collected from the host over the entire profiling session,
  // incomplete steps included.
  OpMetricsDb host_op_metrics_db = 1;

  // Op metrics collected from the device over the entire profiling session,
  // incomplete steps included.
  OpMetricsDb device_op_metrics_db = 2;

  // Performance environment in which the op metrics were collected.
  PerfEnv perf_env = 3;

  // Database of step sequences.
  StepDatabaseResult step_db = 4;

  // Run environment of this profiling session.
  RunEnvironment run_environment = 5;

  // Kernel stats results from all GPUs.
  KernelStatsDb kernel_stats_db = 6;

  // Statistics for all tf-functions.
  // Declared ahead of field 7; fields are identified on the wire by number,
  // not by declaration order.
  TfFunctionDb tf_function_db = 8;

  // Errors seen.
  repeated string errors = 7;
}