metrics: add per-version daily stats reporting
Adds a few kernel crash stats which are reported daily but
are accumulated from beginning to end of a Chrome OS version.
BUG=chromium:339588
TEST=ran and checked histograms on device
BRANCH=none
Change-Id: I630c673156c28dc90ffe0c9c2df58caaada082dc
Reviewed-on: https://chromium-review.googlesource.com/190404
Reviewed-by: Luigi Semenzato <semenzato@chromium.org>
Commit-Queue: Luigi Semenzato <semenzato@chromium.org>
Tested-by: Luigi Semenzato <semenzato@chromium.org>
diff --git a/metrics/metrics_daemon.cc b/metrics/metrics_daemon.cc
index 883c132..b9d8cf0 100644
--- a/metrics/metrics_daemon.cc
+++ b/metrics/metrics_daemon.cc
@@ -106,6 +106,9 @@
const char MetricsDaemon::kMetricSwapOutShortName[] =
"Platform.SwapOutShort";
+const char MetricsDaemon::kMetricsProcStatFileName[] = "/proc/stat";
+const int MetricsDaemon::kMetricsProcStatFirstLineItemsCount = 11;
+
// Thermal CPU throttling.
const char MetricsDaemon::kMetricScaledCpuFrequencyName[] =
@@ -147,7 +150,9 @@
write_sectors_(0),
vmstats_(),
stats_state_(kStatsShort),
- stats_initial_time_(0) {}
+ stats_initial_time_(0),
+ ticks_per_second_(0),
+ latest_cpu_use_ticks_(0) {}
MetricsDaemon::~MetricsDaemon() {
}
@@ -181,8 +186,8 @@
int32 version = GetOsVersionHash();
if (version_cycle_->Get() != version) {
version_cycle_->Set(version);
- SendKernelCrashesCumulativeCountSample();
kernel_crashes_version_count_->Set(0);
+ version_cumulative_cpu_use_->Set(0);
}
Loop();
@@ -215,8 +220,14 @@
DCHECK(metrics_lib != NULL);
metrics_lib_ = metrics_lib;
+ // Get ticks per second (HZ) on this system.
+ // sysconf(_SC_CLK_TCK) is not expected to fail, so the result is unchecked.
+ ticks_per_second_ = sysconf(_SC_CLK_TCK);
+
daily_use_.reset(
new PersistentInteger("Logging.DailyUseTime"));
+ version_cumulative_cpu_use_.reset(
+ new PersistentInteger("Logging.CumulativeCpuTime"));
kernel_crash_interval_.reset(
new PersistentInteger("Logging.KernelCrashInterval"));
@@ -398,7 +409,7 @@
return kUnknownSessionState;
}
-void MetricsDaemon::ReportStats(Time now) {
+void MetricsDaemon::ReportStats(int64 active_use_seconds, Time now) {
TimeDelta since_epoch = now - Time::UnixEpoch();
int day = since_epoch.InDays();
int week = day / 7;
@@ -414,7 +425,7 @@
SendCrashFrequencySample(user_crashes_daily_count_);
SendCrashFrequencySample(kernel_crashes_daily_count_);
SendCrashFrequencySample(unclean_shutdowns_daily_count_);
- SendKernelCrashesCumulativeCountSample();
+ SendKernelCrashesCumulativeCountStats(active_use_seconds);
if (weekly_cycle_->Get() == week) {
// We did this week already.
@@ -429,6 +440,54 @@
SendCrashFrequencySample(unclean_shutdowns_weekly_count_);
}
+// Returns the CPU time (user + nice + system) consumed system-wide since the
+// last call that succeeded; returns a zero TimeDelta on read/parse failure.
+// Parts of this arguably belong in chromium/src/base/sys_info_chromeos.c.
+TimeDelta MetricsDaemon::GetIncrementalCpuUse() {
+ // The aggregate "cpu" line is expected to be the first line of the file.
+ FilePath proc_stat_path = FilePath(kMetricsProcStatFileName);
+ std::string proc_stat_string;
+ if (!base::ReadFileToString(proc_stat_path, &proc_stat_string)) {
+ LOG(WARNING) << "cannot open " << kMetricsProcStatFileName;
+ return TimeDelta();
+ }
+
+ std::vector<std::string> proc_stat_lines;
+ base::SplitString(proc_stat_string, '\n', &proc_stat_lines);
+ if (proc_stat_lines.empty()) {
+ LOG(WARNING) << "cannot parse " << kMetricsProcStatFileName
+ << ": " << proc_stat_string;
+ return TimeDelta();
+ }
+ std::vector<std::string> proc_stat_totals;
+ base::SplitStringAlongWhitespace(proc_stat_lines[0], &proc_stat_totals);
+ // Expected layout: "cpu" plus 10 more fields; only the first three are used.
+ uint64 user_ticks, user_nice_ticks, system_ticks;
+ if (proc_stat_totals.size() != kMetricsProcStatFirstLineItemsCount ||
+ proc_stat_totals[0] != "cpu" ||
+ !base::StringToUint64(proc_stat_totals[1], &user_ticks) ||
+ !base::StringToUint64(proc_stat_totals[2], &user_nice_ticks) ||
+ !base::StringToUint64(proc_stat_totals[3], &system_ticks)) {
+ LOG(WARNING) << "cannot parse first line: " << proc_stat_lines[0];
+ return TimeDelta(base::TimeDelta::FromSeconds(0));
+ }
+
+ uint64 total_cpu_use_ticks = user_ticks + user_nice_ticks + system_ticks;
+
+ // Sanity check: on a decrease, warn and return zero without updating state.
+ if (total_cpu_use_ticks < latest_cpu_use_ticks_) {
+ LOG(WARNING) << "CPU time decreasing from " << latest_cpu_use_ticks_
+ << " to " << total_cpu_use_ticks;
+ return TimeDelta();
+ }
+ // NOTE(review): the division below assumes ticks_per_second_ != 0.
+ uint64 diff = total_cpu_use_ticks - latest_cpu_use_ticks_;
+ latest_cpu_use_ticks_ = total_cpu_use_ticks;
+ // Use microseconds to avoid significant truncations.
+ return base::TimeDelta::FromMicroseconds(
+ diff * 1000 * 1000 / ticks_per_second_);
+}
+
void MetricsDaemon::SetUserActiveState(bool active, Time now) {
DLOG(INFO) << "user: " << (active ? "active" : "inactive");
@@ -448,8 +507,11 @@
user_crash_interval_->Add(seconds);
kernel_crash_interval_->Add(seconds);
+ // Updates the CPU time accumulator.
+ version_cumulative_cpu_use_->Add(GetIncrementalCpuUse().InMilliseconds());
+
// Report daily and weekly stats as needed.
- ReportStats(now);
+ ReportStats(daily_use_->Get(), now);
// Schedules a use monitor on inactive->active transitions and
// unschedules it on active->inactive transitions.
@@ -1084,14 +1146,44 @@
metrics_lib_->SendToUMA(name, sample, min, max, nbuckets);
}
-void MetricsDaemon::SendKernelCrashesCumulativeCountSample() {
+void MetricsDaemon::SendKernelCrashesCumulativeCountStats(
+ int64 active_use_seconds) {
// Report the number of crashes for this OS version, but don't clear the
// counter. It is cleared elsewhere on version change.
+ int64 crashes_count = kernel_crashes_version_count_->Get();
SendSample(kernel_crashes_version_count_->Name(),
- kernel_crashes_version_count_->Get(),
- 1, // value of first bucket
- 500, // value of last bucket
- 100); // number of buckets
+ crashes_count,
+ 1, // value of first bucket
+ 500, // value of last bucket
+ 100); // number of buckets
+
+ // Cumulative CPU time for this OS version, stored in ms, sent in seconds.
+ int64 cpu_use_ms = version_cumulative_cpu_use_->Get();
+ SendSample(version_cumulative_cpu_use_->Name(),
+ cpu_use_ms / 1000, // stat is in seconds
+ 1, // device may be used very little...
+ 8 * 1000 * 1000, // ... or a lot (a little over 90 days)
+ 100);
+ // Both ratios below scale the per-version crash count to a 365-day year.
+ // On the first run after an autoupdate, cpu_use_ms and active_use_seconds
+ // can be zero. Avoid division by zero.
+ if (cpu_use_ms > 0) {
+ // Send the crash frequency since update in number of crashes per CPU year.
+ SendSample("Logging.KernelCrashesPerCpuYear",
+ crashes_count * kSecondsPerDay * 365 * 1000 / cpu_use_ms,
+ 1,
+ 1000 * 1000, // about one crash every 30s of CPU time
+ 100);
+ }
+ // NOTE(review): confirm active_use_seconds spans the whole OS version.
+ if (active_use_seconds > 0) {
+ // Same as above, but per year of active time (caller passes daily_use_).
+ SendSample("Logging.KernelCrashesPerActiveYear",
+ crashes_count * kSecondsPerDay * 365 / active_use_seconds,
+ 1,
+ 1000 * 1000, // about one crash every 30s of active time
+ 100);
+ }
}
void MetricsDaemon::SendCrashIntervalSample(
diff --git a/metrics/metrics_daemon.h b/metrics/metrics_daemon.h
index e4bf853..2805cb7 100644
--- a/metrics/metrics_daemon.h
+++ b/metrics/metrics_daemon.h
@@ -145,6 +145,8 @@
static const int kMetricPageFaultsBuckets;
static const char kMetricsDiskStatsPath[];
static const char kMetricsVmStatsPath[];
+ static const char kMetricsProcStatFileName[];
+ static const int kMetricsProcStatFirstLineItemsCount;
// Array of power states.
static const char* kPowerStates_[kNumberPowerStates];
@@ -243,9 +245,13 @@
void SendLinearSample(const std::string& name, int sample,
int max, int nbuckets);
- // Sends a histogram sample with the total number of kernel crashes since the
- // last version update.
- void SendKernelCrashesCumulativeCountSample();
+ // Sends various cumulative kernel crash-related stats, for instance the
+ // total number of kernel crashes since the last version update.
+ void SendKernelCrashesCumulativeCountStats(int64 active_time_seconds);
+
+ // Returns the total (system-wide) CPU usage between the time of the most
+ // recent call to this function and now.
+ base::TimeDelta GetIncrementalCpuUse();
// Sends a sample representing a time interval between two crashes of the
// same type.
@@ -322,7 +328,7 @@
bool ReadFreqToInt(const std::string& sysfs_file_name, int* value);
// Report UMA stats when cycles (daily or weekly) have changed.
- void ReportStats(base::Time now);
+ void ReportStats(int64 active_time_seconds, base::Time now);
// Reads the current OS version from /etc/lsb-release and hashes it
// to a unsigned 32-bit int.
@@ -375,13 +381,24 @@
StatsState stats_state_;
double stats_initial_time_;
- // Persistent counters for crash statistics.
+ // The system "HZ", or frequency of ticks. Some system data uses ticks as a
+ // unit, and this is used to convert to standard time units.
+ uint32 ticks_per_second_;
+ // Used internally by GetIncrementalCpuUse() to return the CPU utilization
+ // between calls.
+ uint64 latest_cpu_use_ticks_;
+
+ // Persistent values and accumulators for crash statistics.
scoped_ptr<PersistentInteger> daily_cycle_;
scoped_ptr<PersistentInteger> weekly_cycle_;
scoped_ptr<PersistentInteger> version_cycle_;
scoped_ptr<PersistentInteger> daily_use_;
+ // The CPU time accumulator. This contains the CPU time, in milliseconds,
+ // used by the system since the most recent OS version update.
+ scoped_ptr<PersistentInteger> version_cumulative_cpu_use_;
+
scoped_ptr<PersistentInteger> user_crash_interval_;
scoped_ptr<PersistentInteger> kernel_crash_interval_;
scoped_ptr<PersistentInteger> unclean_shutdown_interval_;