Merge "simpleperf: update doc for stat --per-core option."
diff --git a/simpleperf/cmd_stat.cpp b/simpleperf/cmd_stat.cpp
index adc4f0a..0f8f615 100644
--- a/simpleperf/cmd_stat.cpp
+++ b/simpleperf/cmd_stat.cpp
@@ -257,7 +257,8 @@
       w = std::max(w, size);
     };
 
-    for (size_t i = 0; i < titles.size(); i++) {
+    // The last title is too long. Don't include it for width adjustment.
+    for (size_t i = 0; i + 1 < titles.size(); i++) {
       adjust_width(width[i], titles[i].size());
     }
 
@@ -956,8 +957,8 @@
     if (sum.time_running < sum.time_enabled && sum.time_running != 0) {
       scale = static_cast<double>(sum.time_enabled) / sum.time_running;
     }
-    if (system_wide_collection_ && report_per_thread_ && sum.time_running == 0) {
-      // No need to report threads not running in system wide per thread report.
+    if ((report_per_thread_ || report_per_core_) && sum.time_running == 0) {
+      // No need to report threads or cpus not running.
       return;
     }
     ThreadInfo* thread = nullptr;
@@ -1032,26 +1033,28 @@
   else
     fprintf(fp, "\nTotal test time: %lf seconds.\n", duration_in_sec);
 
+  const char* COUNTER_MULTIPLEX_INFO =
+      "probably caused by hardware counter multiplexing (less counters than events).\n"
+      "Try --use-devfreq-counters if on a rooted device.";
+
   if (cpus_ == std::vector<int>(1, -1) ||
       event_selection_set_.GetMonitoredThreads() == std::set<pid_t>({-1})) {
     // We either monitor a thread on all cpus, or monitor all threads on a cpu. In both cases,
     // if percentages < 100%, probably it is caused by hardware counter multiplexing.
     if (!counters_always_available) {
-      LOG(WARNING) << "Percentages < 100% means some events only run a subset of enabled time.\n"
-                   << "Probably because there are less hardware counters available than events.\n"
-                   << "Try --use-devfreq-counters if on a rooted device.";
+      LOG(WARNING) << "Percentages < 100% means some events only run a subset of enabled time,\n"
+                   << COUNTER_MULTIPLEX_INFO;
     }
+  } else if (report_per_thread_) {
+    // We monitor each thread on each cpu.
+    LOG(INFO) << "A percentage represents runtime_on_a_cpu / runtime_on_all_cpus for each thread.\n"
+              << "If percentage sum of a thread < 99%, or report for a running thread is missing,\n"
+              << COUNTER_MULTIPLEX_INFO;
   } else {
-    // We monitor a thread on a cpu. A percentage represents
-    // runtime_of_a_thread_on_a_cpu / runtime_of_a_thread_on_all_cpus. If percentage sum of a
-    // thread < 100%, or total event count for a running thread is 0, probably it is caused by
-    // hardware counter multiplexing. It is hard to detect the second case, so always print below
-    // info.
-    LOG(INFO) << "A percentage represents runtime_of_a_thread_on_a_cpu / "
-                 "runtime_of_a_thread_on_all_cpus.\n"
-              << "If percentage sum of a thread < 100%, or total event count for a running\n"
-              << "thread is 0, probably because there are less hardware counters available than\n"
-              << "events. Try --use-devfreq-counters if on a rooted device.";
+    // We monitor some threads on each cpu.
+    LOG(INFO) << "A percentage represents runtime_on_a_cpu / runtime_on_all_cpus for monitored\n"
+              << "threads. If percentage sum < 99%, or report for an event is missing,\n"
+              << COUNTER_MULTIPLEX_INFO;
   }
   return true;
 }
diff --git a/simpleperf/doc/executable_commands_reference.md b/simpleperf/doc/executable_commands_reference.md
index 22ed52f..5d51a15 100644
--- a/simpleperf/doc/executable_commands_reference.md
+++ b/simpleperf/doc/executable_commands_reference.md
@@ -294,10 +294,26 @@
 By default, stat cmd outputs an event count sum for all monitored cpu cores. But when `--per-core`
 option is used, stat cmd outputs an event count for each core. It can be used to see how events
 are distributed on different cores.
+When running stat in non-system-wide mode with the `--per-core` option, simpleperf creates a perf
+event for each monitored thread on each core. When a thread is in running state, the perf events
+on all cores are enabled, but only the perf event on the core running the thread is in running
+state. So the percentage comment shows runtime_on_a_core / runtime_on_all_cores. Note that the
+percentage is still affected by hardware counter multiplexing. Check simpleperf log output for how to detect it.
 
 ```sh
 # Print event counts for each cpu running threads in process 11904.
+# A percentage shows runtime_on_a_cpu / runtime_on_all_cpus.
 $ simpleperf stat --per-core -p 11904 --duration 1
+Performance counter statistics:
+
+# cpu       count  event_name   # percentage = event_run_time / enabled_time
+  7    56,552,838  cpu-cycles   #   (60%)
+  3    25,958,605  cpu-cycles   #   (20%)
+  0    22,822,698  cpu-cycles   #   (15%)
+  1     6,661,495  cpu-cycles   #   (5%)
+  4     1,519,093  cpu-cycles   #   (0%)
+
+Total test time: 1.001082 seconds.
 
 # Print event counts for each cpu system wide.
 $ su 0 simpleperf stat --per-core -a --duration 1