/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/metrics.h"
#include <cstdint>
#include <string>
#include "absl/strings/str_cat.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
#include "tensorflow/core/lib/monitoring/sampler.h"
#include "tensorflow/core/protobuf/data_service.pb.h"
namespace tensorflow {
namespace metrics {
namespace {
auto* graph_runs = monitoring::Counter<0>::New(
"/tensorflow/core/graph_runs",
"The number of graph executions used to collect "
"/tensorflow/core/graph_run_time_usecs");
auto* graph_run_time_usecs = monitoring::Counter<0>::New(
"/tensorflow/core/graph_run_time_usecs",
"The total time spent on executing graphs in microseconds.");
auto* graph_run_time_usecs_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/core/graph_run_time_usecs_histogram",
"The wall-clock time spent on executing graphs in microseconds."},
// Power of 2 with bucket count 20 (> 17 minutes)
{monitoring::Buckets::Exponential(1000, 2, 20)});
auto* graph_pending_queue_length_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/core/graph_pending_queue_length_histogram",
"The number of pending (ready but not running) tasks in graph executor."},
// Power of 1.5 with bucket count 30 (> 191k)
{monitoring::Buckets::Exponential(1, 1.5, 30)});
auto* graph_run_input_tensor_bytes = monitoring::Sampler<0>::New(
{"/tensorflow/core/graph_run_input_tensor_bytes",
"The size of input tensors in bytes."},
// Power of 4 with bucket count 14 (256MB)
{monitoring::Buckets::Exponential(1, 4, 14)});
auto* graph_run_output_tensor_bytes = monitoring::Sampler<0>::New(
{"/tensorflow/core/graph_run_output_tensor_bytes",
"The size of output tensors in bytes."},
// Power of 4 with bucket count 14 (256MB)
{monitoring::Buckets::Exponential(1, 4, 14)});
auto* graph_unused_outputs = monitoring::Counter<1>::New(
"/tensorflow/core/graph_unused_outputs",
"The number of unused outputs for ops of a given type.", "name");
auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
"/tensorflow/data/autotune", "tf.data autotuning", "name");
auto* tf_data_bytes_consumed_counter = monitoring::Counter<1>::New(
"/tensorflow/data/bytes_consumed",
"The number of bytes consumed by a tf.data Dataset.", "name");
auto* tf_data_bytes_produced_counter = monitoring::Counter<1>::New(
"/tensorflow/data/bytes_produced",
"The number of bytes produced by a tf.data Dataset.", "name");
auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
"/tensorflow/data/bytes_read",
"The number of bytes read by tf.data Dataset sources.", "name");
auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New(
"/tensorflow/data/bytes_fetched",
"The number of bytes fetched from tf.data Dataset iterator.");
auto* tf_data_elements_counter = monitoring::Counter<1>::New(
"/tensorflow/data/elements", "tf.data elements", "name");
auto* tf_data_experiment_counter = monitoring::Counter<1>::New(
"/tensorflow/data/experiment",
"The number of times tf.data experiment is applied to input pipelines.",
"name");
auto* tf_data_fingerprint_counter = monitoring::Counter<1>::New(
"/tensorflow/data/fingerprint", "tf.data fingerprint", "name");
auto* tf_data_get_next_duration_usecs_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/data/getnext_duration",
"Microseconds spent fetching an element from tf.data iterator."},
// Powers of 2 from 2 to 1024 microseconds, plus a final 1-second bucket.
{monitoring::Buckets::Explicit(
{2., 4., 8., 16., 32., 64., 128., 256., 512., 1024., 1e6})});
auto* tf_data_used_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/data/used_vs_budget_ratio",
"Ratio of tf.data used ram over ram budget when running optimization."},
// Uniform linear buckets with count 10 from 0 to 2
{monitoring::Buckets::Explicit(
{0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});
auto* tf_data_buffered_vs_budget_ratio_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/data/buffered_vs_budget_ratio",
"Ratio of tf.data max buffer bytes over ram budget when running "
"optimization."},
// Uniform linear buckets with count 10 from 0 to 2
{monitoring::Buckets::Explicit(
{0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0})});
auto* tf_data_iterator_busy_counter =
monitoring::Counter<0>::New("/tensorflow/data/iterator_busy",
"The time (in microseconds) during which a "
"tf.data iterator was busy processing at "
"least one `GetNext()` request.");
auto* tf_data_iterator_lifetime_counter = monitoring::Counter<0>::New(
"/tensorflow/data/iterator_lifetime",
"The time (in microseconds) between a tf.data iterator receiving the first "
"`GetNext()` request and responding to the last `GetNext()` request.");
auto* tf_data_iterator_gap_usec_histogram = monitoring::Sampler<0>::New(
{"/tensorflow/data/iterator_gap",
"The time (in microseconds) between a tf.data iterator responding to a "
"`GetNext()` request and receiving the next `GetNext()` request."},
// Buckets of 0.1ms, 0.2ms, 0.4ms, ..., ~0.2s.
{monitoring::Buckets::Exponential(100, 2, 12)});
auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
"/tensorflow/data/optimization", "tf.data optimization", "name");
auto* tf_data_service_workers_created_counter =
monitoring::Counter<0>::New("/tensorflow/data/service/workers_created",
"Number of tf.data service workers created");
auto* tf_data_service_jobs_created_counter = monitoring::Counter<2>::New(
"/tensorflow/data/service/jobs_created", "Number of tf.data service jobs.",
"processing_mode", "coordinated_read");
auto* tf_data_service_client_iterators_counter = monitoring::Counter<4>::New(
"/tensorflow/data/service/client_iterators",
"Number of tf.data service client iterators created.", "worker_uid",
"deployment_mode", "processing_mode", "is_coordinated_read");
auto* tf_data_service_multi_trainer_cache_queries_counter =
monitoring::Counter<1>::New(
"/tensorflow/data/service/multi_trainer_cache_queries",
"tf.data service multi-client cache queries counter. The result can be "
"hit or miss.",
"cache_hit");
auto* tf_data_service_multi_trainer_cache_size_bytes =
monitoring::Gauge<int64_t, 0>::New(
"/tensorflow/data/service/multi_trainer_cache_size_bytes",
"tf.data service multi-client cache memory usage in bytes.");
auto* tf_data_filename_counter = monitoring::Counter<2>::New(
"/tensorflow/data/filename", "The file name read by a tf.data Dataset.",
"name", "filename");
auto* tf_data_model_gauge =
monitoring::Gauge<std::function<std::string()>, 1>::New(
"/tensorflow/data/model", "tf.data autotuning model proto.", "id");
auto* tf_data_auto_shard = monitoring::Gauge<int64, 2>::New(
"/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id",
"name");
auto* tf_data_auto_shard_rewrite_batch_size_eligible =
monitoring::Counter<1>::New(
"/tensorflow/data/autoshard_rewrite_batch_size/eligible",
"Whether tf.data pipelines that are eligible for autoshard "
"to rewrite the batch size.",
"eligible");
auto* tf_data_auto_shard_rewrite_batch_size_reason =
monitoring::Counter<1>::New(
"/tensorflow/data/autoshard_rewrite_batch_size/reason",
"The reasons that tf.data pipelines are ineligible for autoshard "
"to rewrite the batch size.",
"reason");
auto* tf_data_autotune_stopping_criteria_counter =
monitoring::Counter<1>::New("/tensorflow/data/autotune_stopping_criteria",
"The number of times each tf.data autotune "
"algorithm stopping criterion is met.",
"name");
auto* parse_dense_feature_counter = monitoring::Counter<0>::New(
"/tensorflow/data/dense_feature",
"The number of dense features parsed by ops for parsing tf.Example.");
auto* parse_sparse_feature_counter = monitoring::Counter<0>::New(
"/tensorflow/data/sparse_feature",
"The number of sparse features parsed by ops for parsing tf.Example.");
auto* parse_ragged_feature_counter = monitoring::Counter<0>::New(
"/tensorflow/data/ragged_feature",
"The number of ragged features parsed by ops for parsing tf.Example.");
auto* build_graph_calls = monitoring::Counter<0>::New(
"/tensorflow/core/graph_build_calls",
"The number of times TensorFlow has created a new client graph. "
"A client graph is a sub-graph of the full graph, induced by a set of "
"options, including the requested feeds and fetches. It includes time "
"spent optimizing the graph with Grappler, and time spent pruning the "
"sub-graph.");
auto* build_graph_time_usecs = monitoring::Counter<0>::New(
"/tensorflow/core/graph_build_time_usecs",
"The amount of time TensorFlow has spent creating new client graphs in "
"microseconds. "
"A client graph is a sub-graph of the full graph, induced by a set of "
"options, including the requested feeds and fetches. It includes time "
"spent optimizing the graph with Grappler, and time spent pruning the "
"sub-graph.");
auto* xla_compilations = monitoring::Counter<0>::New(
"/tensorflow/core/xla_compilations",
"The number of XLA compilations used to collect "
"/tensorflow/core/xla_compilation_time_usecs");
auto* xla_compilation_time_usecs = monitoring::Counter<0>::New(
"/tensorflow/core/xla_compilation_time_usecs",
"The total time spent on compiling XLA graphs in microseconds.");
auto* xla_tpu_spmd_cores_per_replica = monitoring::Counter<1>::New(
"/tensorflow/tpu/xla_spmd_cores_per_replica",
"The number of cores used by XLA SPMD-replicated models.", "cores");
auto* bfc_allocator_delay =
monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
"The total time spent running each graph "
"optimization pass in microseconds.");
auto* tpu_variable_distribution_time_usecs = monitoring::Counter<0>::New(
"/tensorflow/tpu/variable_distribution_time",
"Time spent sending variables from primary task to other worker tasks "
"at the start of a call to TPUExecute. Timer starts at RunGraph "
"invocation and ends when TPUExecute args are ready on the current task.");
auto* test_counters =
monitoring::Counter<2>::New("/tensorflow/core/test_counters",
"Counters used for testing.", "name", "label");
} // namespace
auto* tpu_op_error_counter = monitoring::Counter<2>::New(
"/tensorflow/tpu/op_error_count",
"Count the tpu related errors by op and error_type.", "op", "error_type");
auto* eager_client_error_counter = monitoring::Counter<2>::New(
"/tensorflow/core/eager_client_error_count",
"Count the errors in eager client as a central place.", "error_source",
"error_type");
monitoring::Counter<2>* GetGraphOptimizationCounter() {
static auto* graph_optimization_counter =
monitoring::Counter<2>::New("/tensorflow/core/graph_optimization_usecs",
"The total time spent running each graph "
"optimization pass in microseconds.",
"kind", "name");
return graph_optimization_counter;
}
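// Illustrative usage of the counter above (the pass kind/name and the timing
// scaffolding are hypothetical; actual call sites live in the graph
// optimization code):
//
//   uint64 start_us = Env::Default()->NowMicros();
//   // ... run one optimization pass ...
//   GetGraphOptimizationCounter()
//       ->GetCell("grappler", "constant_folding")
//       ->IncrementBy(Env::Default()->NowMicros() - start_us);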
void RecordTFDataAutotune(const string& name) {
tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
}
monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name) {
return tf_data_bytes_consumed_counter->GetCell(name);
}
monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name) {
return tf_data_bytes_produced_counter->GetCell(name);
}
monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name) {
return tf_data_bytes_read_counter->GetCell(name);
}
monitoring::CounterCell* GetTFDataElementsCounter(const string& name) {
return tf_data_elements_counter->GetCell(name);
}
monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
const string& id) {
return tf_data_model_gauge->GetCell(id);
}
void RecordTFDataBytesFetched(int64_t num_bytes) {
tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes);
}
void RecordTFDataExperiment(const string& name) {
tf_data_experiment_counter->GetCell(name)->IncrementBy(1);
}
void RecordTFDataFingerprint(const string& name) {
tf_data_fingerprint_counter->GetCell(name)->IncrementBy(1);
}
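// Note: several functions below cache the cell in a function-local static.
// For a label-less metric, GetCell() always returns the same cell, so caching
// it avoids the repeated lookup on hot paths; metric objects are never
// destroyed, which makes the cached pointer safe to keep.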
void RecordTFDataGetNextDuration(uint64 duration_us) {
static auto* tf_data_get_next_duration_cell =
tf_data_get_next_duration_usecs_histogram->GetCell();
tf_data_get_next_duration_cell->Add(duration_us);
}
void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio) {
static auto* tf_data_used_vs_budget_ratio_histogram_cell =
tf_data_used_vs_budget_ratio_histogram->GetCell();
tf_data_used_vs_budget_ratio_histogram_cell->Add(ratio);
}
void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio) {
static auto* tf_data_buffered_vs_budget_ratio_histogram_cell =
tf_data_buffered_vs_budget_ratio_histogram->GetCell();
tf_data_buffered_vs_budget_ratio_histogram_cell->Add(ratio);
}
void RecordTFDataIteratorBusy(uint64 duration_us) {
static auto* tf_data_iterator_busy_cell =
tf_data_iterator_busy_counter->GetCell();
tf_data_iterator_busy_cell->IncrementBy(duration_us);
}
void RecordTFDataIteratorLifetime(uint64 duration_us) {
static auto* tf_data_iterator_lifetime_cell =
tf_data_iterator_lifetime_counter->GetCell();
tf_data_iterator_lifetime_cell->IncrementBy(duration_us);
}
void RecordTFDataIteratorGap(uint64 duration_us) {
static auto* tf_data_iterator_gap_usec_histogram_cell =
tf_data_iterator_gap_usec_histogram->GetCell();
tf_data_iterator_gap_usec_histogram_cell->Add(duration_us);
}
void RecordTFDataOptimization(const string& name, int64_t num_changes) {
tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
}
void RecordTFDataServiceWorkerCreated() {
tf_data_service_workers_created_counter->GetCell()->IncrementBy(1);
}
void RecordTFDataServiceJobsCreated(
const tensorflow::data::ProcessingModeDef& processing_mode,
bool is_coordinated_read) {
const std::string sharding_policy_str =
data::ProcessingModeDef::ShardingPolicy_Name(
processing_mode.sharding_policy());
const std::string coordinated_read_str =
is_coordinated_read ? "true" : "false";
tf_data_service_jobs_created_counter
->GetCell(sharding_policy_str, coordinated_read_str)
->IncrementBy(1);
}
void RecordTFDataServiceClientIterators(
int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
const tensorflow::data::ProcessingModeDef& processing_mode,
bool is_coordinated_read) {
const std::string deployment_mode_str =
tensorflow::data::DeploymentMode_Name(deployment_mode);
const std::string sharding_policy_str =
data::ProcessingModeDef::ShardingPolicy_Name(
processing_mode.sharding_policy());
const std::string coordinated_read_str =
is_coordinated_read ? "true" : "false";
tf_data_service_client_iterators_counter
->GetCell(absl::StrCat(worker_uid), deployment_mode_str,
sharding_policy_str, coordinated_read_str)
->IncrementBy(1);
}
void RecordTFDataServiceMultiTrainerCacheQuery(bool cache_hit) {
std::string cache_hit_str = cache_hit ? "true" : "false";
tf_data_service_multi_trainer_cache_queries_counter->GetCell(cache_hit_str)
->IncrementBy(1);
}
void RecordTFDataServiceMultiTrainerCacheSizeBytes(size_t bytes) {
tf_data_service_multi_trainer_cache_size_bytes->GetCell()->Set(
static_cast<int64_t>(bytes));
}
void RecordTFDataFilename(const string& name, const string& filename) {
tf_data_filename_counter->GetCell(name, filename)->IncrementBy(1);
}
void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
int64 num_workers, int64 num_replicas) {
tf_data_auto_shard->GetCell(id, "policy")->Set(static_cast<int64_t>(policy));
tf_data_auto_shard->GetCell(id, "num_workers")->Set(num_workers);
tf_data_auto_shard->GetCell(id, "num_replicas")->Set(num_replicas);
}
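// For example (illustrative values), RecordTFDataAutoShard("ds1", policy, 4, 8)
// populates three cells of the same gauge: ("ds1", "policy") with the numeric
// policy value, ("ds1", "num_workers") with 4, and ("ds1", "num_replicas")
// with 8.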
void RecordTFDataAutoShardRewriteBatchSize(
bool eligible, const std::vector<string>& ineligible_reason) {
tf_data_auto_shard_rewrite_batch_size_eligible
->GetCell(eligible ? "true" : "false")
->IncrementBy(1);
for (const string& reason : ineligible_reason) {
tf_data_auto_shard_rewrite_batch_size_reason->GetCell(reason)->IncrementBy(
1);
}
}
void RecordTFDataAutotuneStoppingCriteria(const string& name) {
tf_data_autotune_stopping_criteria_counter->GetCell(name)->IncrementBy(1);
}
void RecordParseDenseFeature(int64 num_features) {
static auto* parse_dense_feature_counter_cell =
parse_dense_feature_counter->GetCell();
parse_dense_feature_counter_cell->IncrementBy(num_features);
}
void RecordParseSparseFeature(int64_t num_features) {
static auto* parse_sparse_feature_counter_cell =
parse_sparse_feature_counter->GetCell();
parse_sparse_feature_counter_cell->IncrementBy(num_features);
}
void RecordParseRaggedFeature(int64_t num_features) {
static auto* parse_ragged_feature_counter_cell =
parse_ragged_feature_counter->GetCell();
parse_ragged_feature_counter_cell->IncrementBy(num_features);
}
void RecordGraphInputTensors(const size_t size) {
static auto* graph_run_input_tensor_bytes_cell =
graph_run_input_tensor_bytes->GetCell();
graph_run_input_tensor_bytes_cell->Add(size);
}
void RecordGraphOutputTensors(const size_t size) {
static auto* graph_run_output_tensor_bytes_cell =
graph_run_output_tensor_bytes->GetCell();
graph_run_output_tensor_bytes_cell->Add(size);
}
void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica) {
xla_tpu_spmd_cores_per_replica->GetCell(absl::StrCat(cores_per_replica))
->IncrementBy(1);
}
void UpdateGraphExecTime(const uint64 running_time_usecs) {
if (running_time_usecs > 0) {
static auto* graph_runs_cell = graph_runs->GetCell();
static auto* graph_run_time_usecs_cell = graph_run_time_usecs->GetCell();
static auto* graph_run_time_usecs_histogram_cell =
graph_run_time_usecs_histogram->GetCell();
graph_runs_cell->IncrementBy(1);
graph_run_time_usecs_cell->IncrementBy(running_time_usecs);
graph_run_time_usecs_histogram_cell->Add(running_time_usecs);
}
}
void UpdateGraphPendingQueueLength(uint64 len) {
static auto* graph_pending_queue_length_cell =
graph_pending_queue_length_histogram->GetCell();
graph_pending_queue_length_cell->Add(len);
}
void UpdateGraphBuildTime(const uint64 running_time_usecs) {
if (running_time_usecs > 0) {
static auto* build_graph_calls_cell = build_graph_calls->GetCell();
static auto* build_graph_time_usecs_cell =
build_graph_time_usecs->GetCell();
build_graph_calls_cell->IncrementBy(1);
build_graph_time_usecs_cell->IncrementBy(running_time_usecs);
}
}
void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs) {
if (distribution_time_usecs > 0) {
tpu_variable_distribution_time_usecs->GetCell()->IncrementBy(
distribution_time_usecs);
}
}
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
if (compilation_time_usecs > 0) {
static auto* xla_compilations_cell = xla_compilations->GetCell();
static auto* xla_compilation_time_usecs_cell =
xla_compilation_time_usecs->GetCell();
xla_compilations_cell->IncrementBy(1);
xla_compilation_time_usecs_cell->IncrementBy(compilation_time_usecs);
}
}
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
if (delay_usecs > 0) {
bfc_allocator_delay_cell->IncrementBy(delay_usecs);
}
}
void RecordUnusedOutput(const string& op_name) {
graph_unused_outputs->GetCell(op_name)->IncrementBy(1);
}
void IncrementTestCounter(const string& name, const string& label) {
test_counters->GetCell(name, label)->IncrementBy(1);
}
const monitoring::CounterCell* TestCounter(const string& name,
const string& label) {
return test_counters->GetCell(name, label);
}
TestDelta::TestDelta(const string& name, const string& label)
: cell_(TestCounter(name, label)) {
Reset();
}
void TestDelta::Reset() { last_value_ = cell_->value(); }
int64 TestDelta::Get() { return cell_->value() - last_value_; }
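// Illustrative test usage of TestDelta (metric name and label are
// hypothetical):
//
//   TestDelta delta("my_counter", "my_label");
//   IncrementTestCounter("my_counter", "my_label");
//   EXPECT_EQ(delta.Get(), 1);  // Only the change since construction counts.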
void UpdateTfMlirGraphOptimizationPassStateCounter(
const std::string& pass_state, const std::string& processing_state) {
static auto* metric = monitoring::Counter<2>::New(
"/tensorflow/core/tf_mlir_update_graph_optimization_pass_state_counter",
"Tracks changes in a graph's UpdateTfMlirGraphOptimizationPassState",
"PassState", "ProcessingState");
metric->GetCell(pass_state, processing_state)->IncrementBy(1);
}
void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
const std::string& bridge_version,
bool fallback_enabled,
const std::string& result) {
static auto* metric = monitoring::Counter<4>::New(
"/tensorflow/core/tf_mlir_bridge_first_phase_count",
"Tracks processing state in first phase of mlir bridge", "device",
"version", "fallback", "result");
std::string fallback_status =
fallback_enabled ? "fallback_enabled" : "fallback_disabled";
metric->GetCell(device_type, bridge_version, fallback_status, result)
->IncrementBy(1);
}
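// Illustrative call (argument values are hypothetical):
//
//   UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", /*fallback_enabled=*/true,
//                                       "success");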
void UpdateTpuErrorCounter(const string& op, const string& error_type) {
tpu_op_error_counter->GetCell(op, error_type)->IncrementBy(1);
}
void UpdateEagerClientErrorCounter(const string& source,
const string& error_type) {
eager_client_error_counter->GetCell(source, error_type)->IncrementBy(1);
}
} // namespace metrics
} // namespace tensorflow