ANDROID: Add hold functionality to schedtune CPU boost

When tasks come and go from a runqueue quickly, boost can be applied
and removed in rapid succession, which sometimes means we cannot raise
the CPU frequency again when we need to (due to the rate limit on
frequency updates). This has proved to be a particular issue for RT
tasks, and alternative methods have been used in the past to work
around it.

This is an attempt to solve the issue for all task classes and cpufreq
governors by introducing a generic mechanism in schedtune to retain
the max boost level from task enqueue for a minimum period - defined
here as 50ms. This timeout was determined experimentally and is not
configurable.
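
In distilled form, the hold is a comparison against a fixed 50ms
window; this mirrors the schedtune_boost_timeout() helper added
below:

  /* We hold schedtune boost in effect for at least this long */
  #define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL

  static inline bool schedtune_boost_timeout(u64 now, u64 ts)
  {
          /*
           * e.g. a task enqueued at ts and dequeued 2ms later still
           * holds its group's boost until ts + 50ms.
           */
          return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
  }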

A sched_feat guards the application of this to tasks - in the default
configuration, boost hold is only applied to tasks which have RT
policy. Set SCHEDTUNE_BOOST_HOLD_ALL to true to apply it to all
tasks regardless of class.
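
The gate itself is a two-line predicate; see
schedtune_update_timestamp() in the hunks below:

  static inline bool schedtune_update_timestamp(struct task_struct *p)
  {
          /* Hold boost for every class, or only for RT tasks */
          if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
                  return true;

          return task_has_rt_policy(p);
  }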

It works like so:

Every task enqueue (in an allowed class) stores a CPU-local timestamp.
If the task is not a member of an allowed class (all or RT, depending
upon feature selection), the timestamp is not updated.
The boost group stays active, regardless of whether tasks are present,
until 50ms beyond the last timestamp stored. We also store the
timestamp of the active boost group so that we do not unnecessarily
revisit the boost groups when checking the CPU boost level.
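
A boost group is therefore considered active if it either has RUNNABLE
tasks or a hold which has not yet expired; this is the
schedtune_boost_group_active() helper below:

  static inline bool
  schedtune_boost_group_active(int idx, struct boost_groups *bg, u64 now)
  {
          /* Groups with RUNNABLE tasks are always active */
          if (bg->group[idx].tasks)
                  return true;

          /* Otherwise the group stays active until its hold expires */
          return !schedtune_boost_timeout(now, bg->group[idx].ts);
  }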

If the timestamp is more than 50ms in the past when we check the boost,
we re-evaluate the boost groups for that CPU, taking the timestamps
associated with each group into account.
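
In other words, the fast path of schedtune_cpu_boost() only walks the
boost groups once the cached hold timestamp has lapsed (simplified
from the hunk below):

  int schedtune_cpu_boost(int cpu)
  {
          struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
          u64 now = sched_clock_cpu(cpu);

          /* Walk the boost groups only if a hold may have expired */
          if (schedtune_boost_timeout(now, bg->boost_ts))
                  schedtune_cpu_update(cpu, now);

          return bg->boost_max;
  }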

Idea based on rt-boost-retention patches from Joel.

Change-Id: I52cc2d2e82d1c5aa03550378c8836764f41630c1
Suggested-by: Joel Fernandes <joelaf@google.com>
Reviewed-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
[forward ported from android-4.9-eas-dev proposal]
(cherry picked from commit a485e8b7bf8e95759e600396feeb7bfb400b6e46)
[ - Trivial cherry-pick conflicts in include/trace/events/sched.h ]
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e3928f72..ee2dcc0 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -902,9 +902,9 @@
 TRACE_EVENT(sched_tune_tasks_update,
 
 	TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
-		int boost, int max_boost),
+		int boost, int max_boost, u64 group_ts),
 
-	TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+	TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost, group_ts),
 
 	TP_STRUCT__entry(
 		__array( char,  comm,   TASK_COMM_LEN   )
@@ -914,6 +914,7 @@
 		__field( int,           idx             )
 		__field( int,           boost           )
 		__field( int,           max_boost       )
+		__field( u64,		group_ts	)
 	),
 
 	TP_fast_assign(
@@ -924,13 +925,15 @@
 		__entry->idx            = idx;
 		__entry->boost          = boost;
 		__entry->max_boost      = max_boost;
+		__entry->group_ts	= group_ts;
 	),
 
 	TP_printk("pid=%d comm=%s "
-		"cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
+		"cpu=%d tasks=%d idx=%d boost=%d max_boost=%d timeout=%llu",
 		__entry->pid, __entry->comm,
 		__entry->cpu, __entry->tasks, __entry->idx,
-		__entry->boost, __entry->max_boost)
+		__entry->boost, __entry->max_boost,
+		__entry->group_ts)
 );
 
 /*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5590f47..50bdfd7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -109,3 +109,14 @@
  * Request max frequency from schedutil whenever a RT task is running.
  */
 SCHED_FEAT(SUGOV_RT_MAX_FREQ, false)
+
+/*
+ * Apply schedtune boost hold to tasks of all sched classes.
+ * If enabled, schedtune will hold the boost applied to a CPU
+ * for 50ms regardless of task activation - if the task is
+ * still running 50ms later, the boost hold expires and schedtune
+ * boost will be removed as soon as the task stops.
+ * If disabled, this behaviour will only apply to tasks of the
+ * RT class.
+ */
+SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 065b9c1..6bf88d2 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -13,6 +13,9 @@
 bool schedtune_initialized = false;
 extern struct reciprocal_value schedtune_spc_rdiv;
 
+/* We hold schedtune boost in effect for at least this long */
+#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
+
 /*
  * EAS scheduler tunables for task groups.
  */
@@ -94,11 +97,14 @@
 	/* Maximum boost value for all RUNNABLE tasks on a CPU */
 	bool idle;
 	int boost_max;
+	u64 boost_ts;
 	struct {
 		/* The boost for tasks on that boost group */
 		int boost;
 		/* Count of RUNNABLE tasks on that boost group */
 		unsigned tasks;
+		/* Timestamp of boost activation */
+		u64 ts;
 	} group[BOOSTGROUPS_COUNT];
 	/* CPU's boost group locking */
 	raw_spinlock_t lock;
@@ -107,30 +113,53 @@
 /* Boost groups affecting each CPU in the system */
 DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
 
+static inline bool schedtune_boost_timeout(u64 now, u64 ts)
+{
+	return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
+}
+
+static inline bool
+schedtune_boost_group_active(int idx, struct boost_groups *bg, u64 now)
+{
+	if (bg->group[idx].tasks)
+		return true;
+
+	return !schedtune_boost_timeout(now, bg->group[idx].ts);
+}
+
 static void
-schedtune_cpu_update(int cpu)
+schedtune_cpu_update(int cpu, u64 now)
 {
 	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
 	int boost_max;
+	u64 boost_ts;
 	int idx;
 
 	/* The root boost group is always active */
 	boost_max = bg->group[0].boost;
+	boost_ts = now;
 	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
 		/*
 		 * A boost group affects a CPU only if it has
-		 * RUNNABLE tasks on that CPU
+		 * RUNNABLE tasks on that CPU or it has hold
+		 * in effect from a previous task.
 		 */
-		if (bg->group[idx].tasks == 0)
+		if (!schedtune_boost_group_active(idx, bg, now))
 			continue;
 
-		boost_max = max(boost_max, bg->group[idx].boost);
+		/* This boost group is active */
+		if (boost_max > bg->group[idx].boost)
+			continue;
+
+		boost_max = bg->group[idx].boost;
+		boost_ts = bg->group[idx].ts;
 	}
 	/* Ensures boost_max is non-negative when all cgroup boost values
 	 * are neagtive. Avoids under-accounting of cpu capacity which may cause
 	 * task stacking and frequency spikes.*/
 	boost_max = max(boost_max, 0);
 	bg->boost_max = boost_max;
+	bg->boost_ts = boost_ts;
 }
 
 static int
@@ -140,6 +169,7 @@
 	int cur_boost_max;
 	int old_boost;
 	int cpu;
+	u64 now;
 
 	/* Update per CPU boost groups */
 	for_each_possible_cpu(cpu) {
@@ -157,15 +187,19 @@
 		bg->group[idx].boost = boost;
 
 		/* Check if this update increase current max */
-		if (boost > cur_boost_max && bg->group[idx].tasks) {
+		now = sched_clock_cpu(cpu);
+		if (boost > cur_boost_max &&
+			schedtune_boost_group_active(idx, bg, now)) {
 			bg->boost_max = boost;
+			bg->boost_ts = bg->group[idx].ts;
+
 			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
 			continue;
 		}
 
 		/* Check if this update has decreased current max */
 		if (cur_boost_max == old_boost && old_boost > boost) {
-			schedtune_cpu_update(cpu);
+			schedtune_cpu_update(cpu, now);
 			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
 			continue;
 		}
@@ -179,6 +213,15 @@
 #define ENQUEUE_TASK  1
 #define DEQUEUE_TASK -1
 
+static inline bool
+schedtune_update_timestamp(struct task_struct *p)
+{
+	if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
+		return true;
+
+	return task_has_rt_policy(p);
+}
+
 static inline void
 schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
 {
@@ -188,12 +231,21 @@
 	/* Update boosted tasks count while avoiding to make it negative */
 	bg->group[idx].tasks = max(0, tasks);
 
-	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
-			bg->group[idx].boost, bg->boost_max);
+	/* Update timeout on enqueue */
+	if (task_count > 0) {
+		u64 now = sched_clock_cpu(cpu);
 
-	/* Boost group activation or deactivation on that RQ */
-	if (tasks == 1 || tasks == 0)
-		schedtune_cpu_update(cpu);
+		if (schedtune_update_timestamp(p))
+			bg->group[idx].ts = now;
+
+		/* Boost group activation or deactivation on that RQ */
+		if (bg->group[idx].tasks == 1)
+			schedtune_cpu_update(cpu, now);
+	}
+
+	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+			bg->group[idx].boost, bg->boost_max,
+			bg->group[idx].ts);
 }
 
 /*
@@ -237,6 +289,7 @@
 	int src_bg; /* Source boost group index */
 	int dst_bg; /* Destination boost group index */
 	int tasks;
+	u64 now;
 
 	if (unlikely(!schedtune_initialized))
 		return 0;
@@ -287,13 +340,15 @@
 		bg->group[src_bg].tasks = max(0, tasks);
 		bg->group[dst_bg].tasks += 1;
 
+		/* Update boost hold start for this group */
+		now = sched_clock_cpu(cpu);
+		bg->group[dst_bg].ts = now;
+
+		/* Force boost group re-evaluation at next boost check */
+		bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
+
 		raw_spin_unlock(&bg->lock);
 		task_rq_unlock(rq, task, &rq_flags);
-
-		/* Update CPU boost group */
-		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
-			schedtune_cpu_update(task_cpu(task));
-
 	}
 
 	return 0;
@@ -340,8 +395,15 @@
 int schedtune_cpu_boost(int cpu)
 {
 	struct boost_groups *bg;
+	u64 now;
 
 	bg = &per_cpu(cpu_boost_groups, cpu);
+	now = sched_clock_cpu(cpu);
+
+	/* Check to see if we have a hold in effect */
+	if (schedtune_boost_timeout(now, bg->boost_ts))
+		schedtune_cpu_update(cpu, now);
+
 	return bg->boost_max;
 }
 
@@ -450,6 +512,7 @@
 		bg = &per_cpu(cpu_boost_groups, cpu);
 		bg->group[st->idx].boost = 0;
 		bg->group[st->idx].tasks = 0;
+		bg->group[st->idx].ts = 0;
 	}
 
 	return 0;