lmkd: Add current and max thrashing levels in LMK_MEMORY_STATS reports

Thrashing threshold tuning requires collecting thrashing level data from
the field and correlating these levels with other indications of the
device being non-responsive.
Include current and max thrashing levels in the lmkd kill reports. Max
thrashing level captures the highest level seen since the last kill report.

Bug: 194433891
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Change-Id: I8a34dc41e7f03668bfad4ac2cbcb5d2570a10752
Merged-In: I8a34dc41e7f03668bfad4ac2cbcb5d2570a10752
diff --git a/lmkd.cpp b/lmkd.cpp
index f3c301e..2f0df91 100644
--- a/lmkd.cpp
+++ b/lmkd.cpp
@@ -2110,10 +2110,16 @@
     maxevents++;
 }
 
+struct kill_info {
+    enum kill_reasons kill_reason;
+    const char *kill_desc;
+    int thrashing;
+    int max_thrashing;
+};
+
 /* Kill one process specified by procp.  Returns the size (in pages) of the process killed */
-static int kill_one_process(struct proc* procp, int min_oom_score, enum kill_reasons kill_reason,
-                            const char *kill_desc, union meminfo *mi, struct wakeup_info *wi,
-                            struct timespec *tm) {
+static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
+                            union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) {
     int pid = procp->pid;
     int pidfd = procp->pidfd;
     uid_t uid = procp->uid;
@@ -2180,19 +2186,25 @@
 
     inc_killcnt(procp->oomadj);
 
-    killinfo_log(procp, min_oom_score, rss_kb, swap_kb, kill_reason, mi, wi, tm);
-
-    if (kill_desc) {
+    if (ki) {
+        kill_st.kill_reason = ki->kill_reason;
+        kill_st.thrashing = ki->thrashing;
+        kill_st.max_thrashing = ki->max_thrashing;
+        killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm);
         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
-              "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb, kill_desc);
+              "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
+              ki->kill_desc);
     } else {
+        kill_st.kill_reason = NONE;
+        kill_st.thrashing = 0;
+        kill_st.max_thrashing = 0;
+        killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm);
         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
               "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
     }
 
     kill_st.uid = static_cast<int32_t>(uid);
     kill_st.taskname = taskname;
-    kill_st.kill_reason = kill_reason;
     kill_st.oom_score = procp->oomadj;
     kill_st.min_oom_score = min_oom_score;
     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
@@ -2216,8 +2228,7 @@
  * Find one process to kill at or above the given oom_score_adj level.
  * Returns size of the killed process.
  */
-static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reason,
-                                 const char *kill_desc, union meminfo *mi,
+static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
                                  struct wakeup_info *wi, struct timespec *tm) {
     int i;
     int killed_size = 0;
@@ -2242,8 +2253,7 @@
             if (!procp)
                 break;
 
-            killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc,
-                                           mi, wi, tm);
+            killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm);
             if (killed_size >= 0) {
                 if (!lmk_state_change_start) {
                     lmk_state_change_start = true;
@@ -2399,6 +2409,7 @@
     static struct timespec thrashing_reset_tm;
     static int64_t prev_thrash_growth = 0;
     static bool check_filecache = false;
+    static int max_thrashing = 0;
 
     union meminfo mi;
     union vmstat vs;
@@ -2524,6 +2535,9 @@
     }
     /* Add previous cycle's decayed thrashing amount */
     thrashing += prev_thrash_growth;
+    if (max_thrashing < thrashing) {
+        max_thrashing = thrashing;
+    }
 
     /*
      * Refresh watermarks once per min in case user updated one of the margins.
@@ -2636,10 +2650,16 @@
 
     /* Kill a process if necessary */
     if (kill_reason != NONE) {
-        int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi,
-                                                &wi, &curr_tm);
+        struct kill_info ki = {
+            .kill_reason = kill_reason,
+            .kill_desc = kill_desc,
+            .thrashing = (int)thrashing,
+            .max_thrashing = max_thrashing,
+        };
+        int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm);
         if (pages_freed > 0) {
             killing = true;
+            max_thrashing = 0;
             if (cut_thrashing_limit) {
                 /*
                  * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
@@ -2856,7 +2876,7 @@
 do_kill:
     if (low_ram_device) {
         /* For Go devices kill only one task */
-        if (find_and_kill_process(level_oomadj[level], NONE, NULL, &mi, &wi, &curr_tm) == 0) {
+        if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm) == 0) {
             if (debug_process_killing) {
                 ALOGI("Nothing to kill");
             }
@@ -2879,7 +2899,7 @@
             min_score_adj = level_oomadj[level];
         }
 
-        pages_freed = find_and_kill_process(min_score_adj, NONE, NULL, &mi, &wi, &curr_tm);
+        pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm);
 
         if (pages_freed == 0) {
             /* Rate limit kill reports when nothing was reclaimed */
diff --git a/statslog.cpp b/statslog.cpp
index ba39f54..6568f73 100644
--- a/statslog.cpp
+++ b/statslog.cpp
@@ -323,6 +323,8 @@
     index = pack_int32(packet, index, (int)kill_stat->free_mem_kb);
     index = pack_int32(packet, index, (int)kill_stat->free_swap_kb);
     index = pack_int32(packet, index, (int)kill_stat->kill_reason);
+    index = pack_int32(packet, index, kill_stat->thrashing);
+    index = pack_int32(packet, index, kill_stat->max_thrashing);
 
     index = pack_string(packet, index, kill_stat->taskname);
     return index;
diff --git a/statslog.h b/statslog.h
index 44af35f..89e4d2e 100644
--- a/statslog.h
+++ b/statslog.h
@@ -35,13 +35,13 @@
  * Max LMKD reply packet length in bytes
  * Notes about size calculation:
  * 4 bytes for packet type
- * 80 bytes for the LmkKillOccurred fields: memory_stat + kill_stat
+ * 88 bytes for the LmkKillOccurred fields: memory_stat + kill_stat
  * 2 bytes for process name string size
  * MAX_TASKNAME_LEN bytes for the process name string
  *
  * Must be in sync with LmkdConnection.java
  */
-#define LMKD_REPLY_MAX_SIZE 214
+#define LMKD_REPLY_MAX_SIZE 222
 
 /* LMK_MEMORY_STATS packet payload */
 struct memory_stat {
@@ -76,6 +76,8 @@
     int32_t min_oom_score;
     int64_t free_mem_kb;
     int64_t free_swap_kb;
+    int32_t thrashing;
+    int32_t max_thrashing;
 };
 
 /* LMKD reply packet to hold data for the LmkKillOccurred statsd atom */