crash_collector: Collect kernel task states for chrome crashes

When handling chrome crash:
call debugd to write kernel task states into syslog,
parse only the relevant task states out of syslog,
pack them into the log file to be uploaded.

BUG=chromium:244916
TEST=Kill chrome, see the task states in crash log, on Link, Snow and Alex.

Change-Id: Ib8aad79622886581ae5ce50c03097f01db9153a7
Reviewed-on: https://gerrit.chromium.org/gerrit/63205
Reviewed-by: Albert Chaulk <achaulk@chromium.org>
Reviewed-by: Balazs Engedy <engedy@chromium.org>
Tested-by: Yuly Novikov <ynovikov@chromium.org>
Commit-Queue: Yuly Novikov <ynovikov@chromium.org>
diff --git a/crash_reporter/chrome_collector.cc b/crash_reporter/chrome_collector.cc
index 4b0cd24..55a7a53 100644
--- a/crash_reporter/chrome_collector.cc
+++ b/crash_reporter/chrome_collector.cc
@@ -4,15 +4,20 @@
 
 #include "crash-reporter/chrome_collector.h"
 
+#include <fstream>
 #include <glib.h>
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
 #include <pcrecpp.h>
 #include <string>
 #include <vector>
 
 #include <base/file_util.h>
 #include <base/logging.h>
+#include <base/rand_util.h>
 #include <base/string_number_conversions.h>
 #include <base/string_util.h>
+#include <base/stringprintf.h>
 #include "chromeos/process.h"
 #include "chromeos/syslog_logging.h"
 #include "chromeos/dbus/dbus.h"
@@ -23,6 +28,33 @@
 // From //net/crash/collector/collector.h
 const int kDefaultMaxUploadBytes = 1024 * 1024;
 
+const char kHeader[] = "kernel_task_states";
+const char kEnter[] = "enter";
+const char kLeave[] = "leave";
+const char kSyslogPath[] = "/var/log/messages";
+const char kFirstTwoColumnsRE[] = "\\S+\\s+\\S+\\s+";
+const char kIdent[] = "crash_reporter";
+const char kSpacesRE[] = "\\s+";
+const char kKernelIdentRE[] = "kernel:";
+const char kKernelTimestampRE[] = "\\[\\s*\\d+\\.\\d+\\]";
+const char kTaskNameRE[] = "(.{15})";
+// kernel's TASK_STATE_TO_CHAR_STR + '?'
+const char kTaskStateRE[] = "([RSDTtZXxKW?])";
+const char kHexNumRE[] = "([[:xdigit:]]+)";
+const char kNumRE[] = "([[:digit:]]+)";
+const char kRunning32[] = "running ";
+const char kRunning64[] = " running task   ";
+// This regex matches symbol+offset as printed by printk(%pB) or (%pS).
+// See https://www.kernel.org/doc/Documentation/printk-formats.txt
+// As noted in http://sourceware.org/binutils/docs-2.23.1/as/Symbol-Names.html
+// symbols may contain periods and dollars, like "scm_recv.isra.18+0x61/0xdf".
+// Strangely, occasionally there is an additional string following the symbol,
+// like "qcusbnet_get+0x3e4/0x1e46 [gobi]".
+// Those lines will be missing in the output, as parsing them is troublesome.
+// Sometimes the address is printed and not the symbol. Those are ignored too.
+const char kSymbolAndOffsetRE[] =
+    "([[:alpha:]._$][[:alnum:]._$]*)\\+0x([[:xdigit:]]+)/0x([[:xdigit:]]+)";
+
 namespace {
 
 // Extract a string delimited by the given character, from the given offset
@@ -83,6 +115,216 @@
   return true;
 }
 
+bool LogKernelTaskStates(const chromeos::dbus::Proxy &proxy,
+                         const std::string &task_states_id) {
+  // Wrap task states with unique id - header
+  LOG(INFO) << kHeader << " " << task_states_id.c_str() << " " << kEnter;
+
+  // Trigger asynchronous logging of task states
+  chromeos::glib::ScopedError error;
+  if (!dbus_g_proxy_call(proxy.gproxy(), debugd::kLogKernelTaskStates,
+                         &chromeos::Resetter(&error).lvalue(),
+                         G_TYPE_INVALID, G_TYPE_INVALID)) {
+    LOG(ERROR) << "Error performing D-Bus proxy call "
+               << "'" << debugd::kLogKernelTaskStates << "'"
+               << ": " << (error ? error->message : "");
+    return false;
+  }
+
+  // Give kernel a second to finish the asynchronous logging
+  sleep(1);
+  // Wrap task states with unique id - footer
+  LOG(INFO) << kHeader << " " << task_states_id.c_str() << " " << kLeave;
+
+  return true;
+}
+
+bool GetKernelTaskStates(const chromeos::dbus::Proxy &proxy,
+                         std::vector<std::string> *output) {
+  std::string task_states_id = StringPrintf("%016"PRIX64, base::RandUint64());
+
+  if (!LogKernelTaskStates(proxy, task_states_id))
+    return false;
+
+  std::ifstream syslog_stream(kSyslogPath, std::ios::in);
+  if (!syslog_stream.is_open()) {
+    LOG(ERROR) << "Couldn't open syslog for reading";
+    return false;
+  }
+
+  std::string ident_re = StringPrintf("%s\\[%d\\]:\\s+", kIdent, getpid());
+  // 2013-07-05T11:52:55.530503-07:00 localhost crash_reporter[9349]:
+  // kernel_task_states 33F114E753768B70 enter
+  pcrecpp::RE enter_re(kFirstTwoColumnsRE + ident_re + kHeader + " " +
+                       task_states_id + " " + kEnter);
+  // 2013-07-05T11:52:56.556177-07:00 localhost crash_reporter[9349]:
+  // kernel_task_states 33F114E753768B70 leave
+  pcrecpp::RE leave_re(kFirstTwoColumnsRE + ident_re + kHeader + " " +
+                       task_states_id + " " + kLeave);
+  // 2013-06-21T21:18:14.855071+00:00 localhost kernel: [362142.027766]
+  // NSS SSL ThreadW S 804f8e28     0  5995   1016 0x00000000
+  pcrecpp::RE task_header_re(std::string(kFirstTwoColumnsRE) +
+                             kKernelIdentRE + kSpacesRE +
+                             kKernelTimestampRE + kSpacesRE +
+                             kTaskNameRE + kSpacesRE +
+                             kTaskStateRE + kSpacesRE +
+                             kHexNumRE + kSpacesRE +
+                             kNumRE + kSpacesRE +
+                             kNumRE + kSpacesRE +
+                             kNumRE + kSpacesRE +
+                             "0x" + kHexNumRE);
+  // 2013-06-21T21:18:14.815190+00:00 localhost kernel: [362142.016915]
+  // rsyslogd        R running      0   340      1 0x00000000
+  pcrecpp::RE task_header_run32_re(std::string(kFirstTwoColumnsRE) +
+                                   kKernelIdentRE + kSpacesRE +
+                                   kKernelTimestampRE + kSpacesRE +
+                                   kTaskNameRE + kSpacesRE +
+                                   kTaskStateRE + kSpacesRE +
+                                   kRunning32 + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   "0x" + kHexNumRE);
+  // 2013-06-21T14:11:18.718810-07:00 localhost kernel: [  656.217932]
+  // kworker/2:1     R  running task        0    54      2 0x00000000
+  pcrecpp::RE task_header_run64_re(std::string(kFirstTwoColumnsRE) +
+                                   kKernelIdentRE + kSpacesRE +
+                                   kKernelTimestampRE + kSpacesRE +
+                                   kTaskNameRE + kSpacesRE +
+                                   kTaskStateRE + kSpacesRE +
+                                   kRunning64 + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   kNumRE + kSpacesRE +
+                                   "0x" + kHexNumRE);
+  std::string address_re = std::string("\\[<") + kHexNumRE + ">\\]";
+  std::string braced_sym_re = std::string("\\(") + kSymbolAndOffsetRE + "\\)";
+  // 2013-07-05T11:53:20.141355-07:00 localhost kernel: [  169.476749]
+  // [<8002fb74>] (do_wait+0x1a8/0x248) from [<80030c48>] (sys_wait4+0xbc/0xf8)
+  pcrecpp::RE arm_stack_re(std::string(kFirstTwoColumnsRE) +
+                           kKernelIdentRE + kSpacesRE +
+                           kKernelTimestampRE + kSpacesRE +
+                           address_re + kSpacesRE +
+                           braced_sym_re + kSpacesRE +
+                           "from" + kSpacesRE +
+                           address_re + kSpacesRE +
+                           braced_sym_re);
+  // 2013-06-21T14:12:34.457315-07:00 localhost kernel: [  731.965907]
+  // [<ffffffff814aa602>] ? system_call_fastpath+0x16/0x1b
+  pcrecpp::RE x86_stack_re(std::string(kFirstTwoColumnsRE) +
+                           kKernelIdentRE + kSpacesRE +
+                           kKernelTimestampRE + kSpacesRE +
+                           address_re + kSpacesRE +
+                           "(\\?\\s+)?" +
+                           kSymbolAndOffsetRE);
+
+  bool enter_matched = false;
+  bool leave_matched = false;
+  uint64 prev_address = 0;
+  std::string line;
+  while (std::getline(syslog_stream, line)) {
+    if (leave_re.FullMatch(line)) {
+      if (!enter_matched) {
+        LOG(ERROR) << "Task states header not found before footer";
+        return false;
+      }
+      leave_matched = true;
+      break;  // Stop processing input after footer
+    }
+
+    if (enter_re.FullMatch(line)) {
+      if (enter_matched) {
+        LOG(ERROR) << "Task states header encountered twice";
+        return false;
+      }
+      enter_matched = true;
+      continue;
+    }
+
+    if(!enter_matched)
+      continue;  // Skip lines before header
+
+    std::string task_name, task_state;
+    uint64 address1, address2;
+    unsigned long free_stack;
+    pid_t pid, ppid;
+    unsigned long flags;
+    std::string sym1, sym2;
+    uint64 offset1a, offset1b, offset2a, offset2b;
+    std::string question;
+
+    if (task_header_re.FullMatch(line, &task_name, &task_state,
+                                 pcrecpp::Hex(&address1), &free_stack, &pid,
+                                 &ppid, pcrecpp::Hex(&flags))) {
+      prev_address = 0;
+      output->push_back(StringPrintf("%s %s %016"PRIx64" %5lu %5d %6d 0x%08lx",
+                                     task_name.c_str(), task_state.c_str(),
+                                     address1, free_stack, pid, ppid, flags));
+    } else if (task_header_run32_re.FullMatch(line, &task_name, &task_state,
+                                              &free_stack, &pid, &ppid,
+                                              pcrecpp::Hex(&flags)) ||
+        task_header_run64_re.FullMatch(line, &task_name, &task_state,
+                                       &free_stack, &pid, &ppid,
+                                       pcrecpp::Hex(&flags))) {
+      prev_address = 0;
+      output->push_back(StringPrintf("%s %s %s %5lu %5d %6d 0x%08lx",
+                                     task_name.c_str(), task_state.c_str(),
+                                     kRunning64, free_stack, pid, ppid, flags));
+    } else if (arm_stack_re.FullMatch(line, pcrecpp::Hex(&address1), &sym1,
+                                      pcrecpp::Hex(&offset1a),
+                                      pcrecpp::Hex(&offset1b),
+                                      pcrecpp::Hex(&address2), &sym2,
+                                      pcrecpp::Hex(&offset2a),
+                                      pcrecpp::Hex(&offset2b))) {
+      if (prev_address != address1) {
+        output->push_back(StringPrintf("%s+0x%"PRIx64"/0x%"PRIx64,
+                                       sym1.c_str(), offset1a, offset1b));
+      }
+      output->push_back(StringPrintf("%s+0x%"PRIx64"/0x%"PRIx64,
+                                     sym2.c_str(), offset2a, offset2b));
+      prev_address = address2;
+    } else if (x86_stack_re.FullMatch(line, pcrecpp::Hex(&address1), &question,
+                                      &sym1, pcrecpp::Hex(&offset1a),
+                                      pcrecpp::Hex(&offset1b))) {
+      output->push_back(StringPrintf("%s%s+0x%"PRIx64"/0x%"PRIx64,
+                                     question.c_str(), sym1.c_str(),
+                                     offset1a, offset1b));
+    }
+  }
+  syslog_stream.close();
+
+  if(!enter_matched || !leave_matched) {
+    LOG(ERROR) << "Task states header or footer not found";
+    return false;
+  }
+
+  return true;
+}
+
+bool WriteKernelTaskStates(const chromeos::dbus::Proxy &proxy,
+                           const FilePath &task_states_path) {
+  std::ofstream task_states_stream(task_states_path.value().c_str(),
+                                   std::ios::out);
+  if (!task_states_stream.is_open()) {
+    LOG(ERROR) << "Could not write file " << task_states_path.value();
+    return false;
+  }
+
+  std::vector<std::string> task_sates;
+  if (!GetKernelTaskStates(proxy, &task_sates)) {
+    file_util::Delete(task_states_path, false);
+    return false;
+  }
+
+  for (std::vector<std::string>::iterator it = task_sates.begin();
+      it != task_sates.end(); ++it) {
+    task_states_stream << *it << std::endl;
+  }
+  task_states_stream.close();
+
+  return true;
+}
+
 bool GetAdditionalLogs(const FilePath &log_path) {
   chromeos::dbus::BusConnection dbus = chromeos::dbus::GetSystemBusConnection();
   if (!dbus.HasConnection()) {
@@ -102,7 +344,17 @@
 
   FilePath error_state_path =
       log_path.DirName().Append("i915_error_state.log.xz");
-  if (!GetDriErrorState(proxy, error_state_path))
+  bool has_dri_error_state = false;
+  if (GetDriErrorState(proxy, error_state_path))
+    has_dri_error_state = true;
+
+  FilePath task_states_path = log_path.DirName().Append("task_states.log");
+  bool has_task_states = false;
+  if (WriteKernelTaskStates(proxy, task_states_path)) {
+    has_task_states = true;
+  }
+
+  if (!has_dri_error_state && !has_task_states)
     return false;
 
   chromeos::ProcessImpl tar_process;
@@ -110,10 +362,14 @@
   tar_process.AddArg("cfJ");
   tar_process.AddArg(log_path.value());
   tar_process.AddStringOption("-C", log_path.DirName().value());
-  tar_process.AddArg(error_state_path.BaseName().value());
+  if (has_dri_error_state)
+    tar_process.AddArg(error_state_path.BaseName().value());
+  if (has_task_states)
+    tar_process.AddArg(task_states_path.BaseName().value());
   int res = tar_process.Run();
 
   file_util::Delete(error_state_path, false);
+  file_util::Delete(task_states_path, false);
 
   if (res || !file_util::PathExists(log_path)) {
     LOG(ERROR) << "Could not tar file " << log_path.value();