metrics: add per-version daily stats reporting

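Adds a few kernel crash stats which are reported daily but are
accumulated over the lifetime of a Chrome OS version (the accumulators
are reset when the OS version changes): the cumulative CPU time, and
the number of kernel crashes per year of CPU time and per year of
active use.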

BUG=chromium:339588
TEST=ran and checked histograms on device
BRANCH=none

Change-Id: I630c673156c28dc90ffe0c9c2df58caaada082dc
Reviewed-on: https://chromium-review.googlesource.com/190404
Reviewed-by: Luigi Semenzato <semenzato@chromium.org>
Commit-Queue: Luigi Semenzato <semenzato@chromium.org>
Tested-by: Luigi Semenzato <semenzato@chromium.org>
diff --git a/metrics/metrics_daemon.cc b/metrics/metrics_daemon.cc
index 883c132..b9d8cf0 100644
--- a/metrics/metrics_daemon.cc
+++ b/metrics/metrics_daemon.cc
@@ -106,6 +106,9 @@
 const char MetricsDaemon::kMetricSwapOutShortName[] =
     "Platform.SwapOutShort";
 
+const char MetricsDaemon::kMetricsProcStatFileName[] = "/proc/stat";
+const int MetricsDaemon::kMetricsProcStatFirstLineItemsCount = 11;
+
 // Thermal CPU throttling.
 
 const char MetricsDaemon::kMetricScaledCpuFrequencyName[] =
@@ -147,7 +150,9 @@
       write_sectors_(0),
       vmstats_(),
       stats_state_(kStatsShort),
-      stats_initial_time_(0) {}
+      stats_initial_time_(0),
+      ticks_per_second_(0),
+      latest_cpu_use_ticks_(0) {}
 
 MetricsDaemon::~MetricsDaemon() {
 }
@@ -181,8 +186,8 @@
   int32 version = GetOsVersionHash();
   if (version_cycle_->Get() != version) {
     version_cycle_->Set(version);
-    SendKernelCrashesCumulativeCountSample();
     kernel_crashes_version_count_->Set(0);
+    version_cumulative_cpu_use_->Set(0);
   }
 
   Loop();
@@ -215,8 +220,14 @@
   DCHECK(metrics_lib != NULL);
   metrics_lib_ = metrics_lib;
 
+  // Get ticks per second (HZ) on this system.
+  // sysconf() reports failure as -1, but _SC_CLK_TCK is not expected to fail.
+  ticks_per_second_ = sysconf(_SC_CLK_TCK);
+
   daily_use_.reset(
       new PersistentInteger("Logging.DailyUseTime"));
+  version_cumulative_cpu_use_.reset(
+      new PersistentInteger("Logging.CumulativeCpuTime"));
 
   kernel_crash_interval_.reset(
       new PersistentInteger("Logging.KernelCrashInterval"));
@@ -398,7 +409,7 @@
   return kUnknownSessionState;
 }
 
-void MetricsDaemon::ReportStats(Time now) {
+void MetricsDaemon::ReportStats(int64 active_use_seconds, Time now) {
   TimeDelta since_epoch = now - Time::UnixEpoch();
   int day = since_epoch.InDays();
   int week = day / 7;
@@ -414,7 +425,7 @@
   SendCrashFrequencySample(user_crashes_daily_count_);
   SendCrashFrequencySample(kernel_crashes_daily_count_);
   SendCrashFrequencySample(unclean_shutdowns_daily_count_);
-  SendKernelCrashesCumulativeCountSample();
+  SendKernelCrashesCumulativeCountStats(active_use_seconds);
 
   if (weekly_cycle_->Get() == week) {
     // We did this week already.
@@ -429,6 +440,54 @@
   SendCrashFrequencySample(unclean_shutdowns_weekly_count_);
 }
 
+// Returns the CPU time (user + nice + system) used since the previous call.
+// One might argue that parts of this should go into
+// chromium/src/base/sys_info_chromeos.c instead, but put it here for now.
+TimeDelta MetricsDaemon::GetIncrementalCpuUse() {
+  if (ticks_per_second_ == 0) {  // Unset; the conversion below divides by it.
+    LOG(WARNING) << "ticks per second is not set";
+    return TimeDelta();
+  }
+  std::string proc_stat_string;
+  if (!base::ReadFileToString(FilePath(kMetricsProcStatFileName),
+                              &proc_stat_string)) {
+    LOG(WARNING) << "cannot open " << kMetricsProcStatFileName;
+    return TimeDelta();
+  }
+  std::vector<std::string> proc_stat_lines;
+  base::SplitString(proc_stat_string, '\n', &proc_stat_lines);
+  if (proc_stat_lines.empty()) {
+    LOG(WARNING) << "cannot parse " << kMetricsProcStatFileName
+                 << ": " << proc_stat_string;
+    return TimeDelta();
+  }
+  std::vector<std::string> proc_stat_totals;
+  base::SplitStringAlongWhitespace(proc_stat_lines[0], &proc_stat_totals);
+
+  uint64 user_ticks, user_nice_ticks, system_ticks;
+  if (static_cast<int>(proc_stat_totals.size()) !=
+          kMetricsProcStatFirstLineItemsCount ||
+      proc_stat_totals[0] != "cpu" ||
+      !base::StringToUint64(proc_stat_totals[1], &user_ticks) ||
+      !base::StringToUint64(proc_stat_totals[2], &user_nice_ticks) ||
+      !base::StringToUint64(proc_stat_totals[3], &system_ticks)) {
+    LOG(WARNING) << "cannot parse first line: " << proc_stat_lines[0];
+    return TimeDelta();
+  }
+
+  uint64 total_cpu_use_ticks = user_ticks + user_nice_ticks + system_ticks;
+  if (total_cpu_use_ticks < latest_cpu_use_ticks_) {  // Sanity check.
+    LOG(WARNING) << "CPU time decreasing from " << latest_cpu_use_ticks_
+                 << " to " << total_cpu_use_ticks;
+    return TimeDelta();
+  }
+  uint64 diff = total_cpu_use_ticks - latest_cpu_use_ticks_;
+  latest_cpu_use_ticks_ = total_cpu_use_ticks;
+  // Use microseconds to avoid significant truncations.
+  return base::TimeDelta::FromMicroseconds(
+      diff * 1000 * 1000 / ticks_per_second_);
+}
+
 void MetricsDaemon::SetUserActiveState(bool active, Time now) {
   DLOG(INFO) << "user: " << (active ? "active" : "inactive");
 
@@ -448,8 +507,11 @@
   user_crash_interval_->Add(seconds);
   kernel_crash_interval_->Add(seconds);
 
+  // Updates the CPU time accumulator.
+  version_cumulative_cpu_use_->Add(GetIncrementalCpuUse().InMilliseconds());
+
   // Report daily and weekly stats as needed.
-  ReportStats(now);
+  ReportStats(daily_use_->Get(), now);
 
   // Schedules a use monitor on inactive->active transitions and
   // unschedules it on active->inactive transitions.
@@ -1084,14 +1146,44 @@
   metrics_lib_->SendToUMA(name, sample, min, max, nbuckets);
 }
 
-void MetricsDaemon::SendKernelCrashesCumulativeCountSample() {
+void MetricsDaemon::SendKernelCrashesCumulativeCountStats(
+    int64 active_use_seconds) {
   // Report the number of crashes for this OS version, but don't clear the
   // counter.  It is cleared elsewhere on version change.
+  int64 crashes_count = kernel_crashes_version_count_->Get();
   SendSample(kernel_crashes_version_count_->Name(),
-             kernel_crashes_version_count_->Get(),
-             1,                        // value of first bucket
-             500,                      // value of last bucket
-             100);                     // number of buckets
+             crashes_count,
+             1,                         // value of first bucket
+             500,                       // value of last bucket
+             100);                      // number of buckets
+
+  // Report the CPU time accumulated for this OS version (stored in ms).
+  int64 cpu_use_ms = version_cumulative_cpu_use_->Get();
+  SendSample(version_cumulative_cpu_use_->Name(),
+             cpu_use_ms / 1000,         // stat is in seconds
+             1,                         // device may be used very little...
+             8 * 1000 * 1000,           // ... or a lot (a little over 90 days)
+             100);                      // number of buckets
+
+  // On the first run after an autoupdate, cpu_use_ms and active_use_seconds
+  // can be zero.  Avoid division by zero.
+  if (cpu_use_ms > 0) {
+    // Crash frequency since update, in crashes per CPU year (cpu time in ms).
+    SendSample("Logging.KernelCrashesPerCpuYear",
+               crashes_count * kSecondsPerDay * 365 * 1000 / cpu_use_ms,
+               1,               // value of first bucket
+               1000 * 1000,     // about one crash every 30s of CPU time
+               100);            // number of buckets
+  }
+  // NOTE(review): caller passes daily use, but crashes_count is per-version.
+  if (active_use_seconds > 0) {
+    // Same as above, but per year of active time.
+    SendSample("Logging.KernelCrashesPerActiveYear",
+               crashes_count * kSecondsPerDay * 365 / active_use_seconds,
+               1,               // value of first bucket
+               1000 * 1000,     // about one crash every 30s of active time
+               100);            // number of buckets
+  }
 }
 
 void MetricsDaemon::SendCrashIntervalSample(
diff --git a/metrics/metrics_daemon.h b/metrics/metrics_daemon.h
index e4bf853..2805cb7 100644
--- a/metrics/metrics_daemon.h
+++ b/metrics/metrics_daemon.h
@@ -145,6 +145,8 @@
   static const int kMetricPageFaultsBuckets;
   static const char kMetricsDiskStatsPath[];
   static const char kMetricsVmStatsPath[];
+  static const char kMetricsProcStatFileName[];
+  static const int kMetricsProcStatFirstLineItemsCount;
 
   // Array of power states.
   static const char* kPowerStates_[kNumberPowerStates];
@@ -243,9 +245,13 @@
   void SendLinearSample(const std::string& name, int sample,
                         int max, int nbuckets);
 
-  // Sends a histogram sample with the total number of kernel crashes since the
-  // last version update.
-  void SendKernelCrashesCumulativeCountSample();
+  // Sends various cumulative kernel crash-related stats, for instance the
+  // total number of kernel crashes since the last version update.
+  void SendKernelCrashesCumulativeCountStats(int64 active_use_seconds);
+
+  // Returns the total (system-wide) CPU usage between the time of the most
+  // recent call to this function and now.
+  base::TimeDelta GetIncrementalCpuUse();
 
   // Sends a sample representing a time interval between two crashes of the
   // same type.
@@ -322,7 +328,7 @@
   bool ReadFreqToInt(const std::string& sysfs_file_name, int* value);
 
   // Report UMA stats when cycles (daily or weekly) have changed.
-  void ReportStats(base::Time now);
+  void ReportStats(int64 active_use_seconds, base::Time now);
 
   // Reads the current OS version from /etc/lsb-release and hashes it
   // to a unsigned 32-bit int.
@@ -375,13 +381,24 @@
   StatsState stats_state_;
   double stats_initial_time_;
 
-  // Persistent counters for crash statistics.
+  // The system "HZ", or frequency of ticks.  Some system data uses ticks as a
+  // unit, and this is used to convert to standard time units.
+  uint32 ticks_per_second_;
+  // Used internally by GetIncrementalCpuUse() to return the CPU utilization
+  // between calls.
+  uint64 latest_cpu_use_ticks_;
+
+  // Persistent values and accumulators for crash statistics.
   scoped_ptr<PersistentInteger> daily_cycle_;
   scoped_ptr<PersistentInteger> weekly_cycle_;
   scoped_ptr<PersistentInteger> version_cycle_;
 
   scoped_ptr<PersistentInteger> daily_use_;
 
+  // The CPU time accumulator.  This contains the CPU time, in milliseconds,
+  // used by the system since the most recent OS version update.
+  scoped_ptr<PersistentInteger> version_cumulative_cpu_use_;
+
   scoped_ptr<PersistentInteger> user_crash_interval_;
   scoped_ptr<PersistentInteger> kernel_crash_interval_;
   scoped_ptr<PersistentInteger> unclean_shutdown_interval_;