/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
#include <trace/events/sched.h>
#include "sched.h"
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
*
* Options are:
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*/
enum sched_tunable_scaling sysctl_sched_tunable_scaling
= SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first.
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* Controls whether, when SD_SHARE_PKG_RESOURCES is set, all tasks go to
* idle CPUs when woken. Even when this is off, the per-task flag
* PF_WAKE_ON_IDLE can still cause a task to be woken on an idle CPU.
*/
unsigned int __read_mostly sysctl_sched_wake_to_idle;
/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/*
* The exponential sliding window over which load is averaged for shares
* distribution.
* (default: 10msec)
*/
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
#ifdef CONFIG_CFS_BANDWIDTH
/*
* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
* each time a cfs_rq requests quota.
*
* Note: in the case that the slice exceeds the runtime remaining (either due
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
* default: 5 msec, units: microseconds
*/
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
* so pick a second-best guess by going with the log2 of the
* number of CPUs.
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
static int get_update_sysctl_factor(void)
{
unsigned int cpus = min_t(int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
case SCHED_TUNABLESCALING_NONE:
factor = 1;
break;
case SCHED_TUNABLESCALING_LINEAR:
factor = cpus;
break;
case SCHED_TUNABLESCALING_LOG:
default:
factor = 1 + ilog2(cpus);
break;
}
return factor;
}
static void update_sysctl(void)
{
unsigned int factor = get_update_sysctl_factor();
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}
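/*
 * For illustration (hypothetical machine, default LOG scaling): with 4
 * online CPUs, factor = 1 + ilog2(4) = 3, so the effective tunables become:
 *
 *   sched_min_granularity    = 3 * 0.75ms = 2.25ms
 *   sched_latency            = 3 * 6ms    = 18ms
 *   sched_wakeup_granularity = 3 * 1ms    = 3ms
 *
 * With 8 or more CPUs the cpu count is capped by min(num_online_cpus(), 8),
 * giving a maximum factor of 4.
 */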
void sched_init_granularity(void)
{
update_sysctl();
}
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
# define WMULT_CONST (1UL << 32)
#endif
#define WMULT_SHIFT 32
/*
* Shift right and round:
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
/*
* delta *= weight / lw
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
/*
* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
* entities since MIN_SHARES = 2. Treat weight as 1 if less than
* 2^SCHED_LOAD_RESOLUTION.
*/
if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
tmp = (u64)delta_exec * scale_load_down(weight);
else
tmp = (u64)delta_exec;
if (!lw->inv_weight) {
unsigned long w = scale_load_down(lw->weight);
if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
else if (unlikely(!w))
lw->inv_weight = WMULT_CONST;
else
lw->inv_weight = WMULT_CONST / w;
}
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
if (unlikely(tmp > WMULT_CONST))
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
WMULT_SHIFT/2);
else
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
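/*
 * Worked example (hypothetical values, ignoring SCHED_LOAD_RESOLUTION
 * scaling): delta_exec = 1000000ns, weight = 1024 (nice-0) and
 * lw->weight = 3072 (e.g. three nice-0 entities). Then
 * inv_weight = 2^32 / 3072 and the result is roughly
 * (1000000 * 1024 * inv_weight) >> 32 ~= 333333ns, i.e. delta scaled
 * by 1024/3072.
 */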
#ifdef CONFIG_SMP
static int active_load_balance_cpu_stop(void *data);
#endif
const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return cfs_rq->rq;
}
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!entity_is_task(se));
#endif
return container_of(se, struct task_struct, se);
}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return p->se.cfs_rq;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
return se->cfs_rq;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return grp->my_q;
}
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update);
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
* enqueued. The fact that we always enqueue bottom-up
* reduces this to two cases.
*/
if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
} else {
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
}
cfs_rq->on_list = 1;
/* We should have no load, but we need to update last_decay. */
update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
return 1;
return 0;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
int depth = 0;
for_each_sched_entity(se)
depth++;
return depth;
}
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
int se_depth, pse_depth;
/*
* A preemption test can only be made between sibling entities that are in
* the same cfs_rq, i.e. that have a common parent. Walk up the hierarchy
* of both tasks until we find ancestors that are siblings of a common
* parent.
*/
/* First walk up until both entities are at same depth */
se_depth = depth_se(*se);
pse_depth = depth_se(*pse);
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
while (!is_same_group(*se, *pse)) {
*se = parent_entity(*se);
*pse = parent_entity(*pse);
}
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
#define entity_is_task(se) 1
#define for_each_sched_entity(se) \
for (; se; se = NULL)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
return &task_rq(p)->cfs;
}
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
struct task_struct *p = task_of(se);
struct rq *rq = task_rq(p);
return &rq->cfs;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
return 1;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
max_vruntime = vruntime;
return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline int entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
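/*
 * The signed subtractions above keep these comparisons correct even when
 * vruntime wraps around 2^64. For example (hypothetical values), with
 * a->vruntime = (u64)-100 and b->vruntime = 50 just after a wrap,
 * (s64)(a->vruntime - b->vruntime) = -150 < 0, so a is still ordered
 * before b although it is larger as an unsigned value.
 */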
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = cfs_rq->min_vruntime;
if (cfs_rq->curr)
vruntime = cfs_rq->curr->vruntime;
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
if (!cfs_rq->curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
/* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
struct rb_node *parent = NULL;
struct sched_entity *entry;
int leftmost = 1;
/*
* Find the right place in the rbtree:
*/
while (*link) {
parent = *link;
entry = rb_entry(parent, struct sched_entity, run_node);
/*
* We don't care about collisions. Nodes with
* the same key stay together.
*/
if (entity_before(se, entry)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = 0;
}
}
/*
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
if (leftmost)
cfs_rq->rb_leftmost = &se->run_node;
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->rb_leftmost == &se->run_node) {
struct rb_node *next_node;
next_node = rb_next(&se->run_node);
cfs_rq->rb_leftmost = next_node;
}
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
if (!left)
return NULL;
return rb_entry(left, struct sched_entity, run_node);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return rb_entry(next, struct sched_entity, run_node);
}
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
if (!last)
return NULL;
return rb_entry(last, struct sched_entity, run_node);
}
/**************************************************************
* Scheduling class statistics methods:
*/
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL
return 0;
}
#endif
/*
* delta /= w
*/
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
*/
static u64 __sched_period(unsigned long nr_running)
{
u64 period = sysctl_sched_latency;
unsigned long nr_latency = sched_nr_latency;
if (unlikely(nr_running > nr_latency)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
}
return period;
}
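/*
 * For illustration with the unscaled defaults (latency 6ms, min
 * granularity 0.75ms, sched_nr_latency 8): 5 runnable tasks keep the
 * period at 6ms, while 16 runnable tasks stretch it to 16 * 0.75ms =
 * 12ms so that no slice drops below the minimum granularity.
 */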
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
}
slice = calc_delta_mine(slice, se->load.weight, load);
}
return slice;
}
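/*
 * For illustration (single level, no group hierarchy, unscaled 6ms
 * period): with two runnable entities of weight 1024 and 2048, the
 * slices come out to roughly 6ms * 1024/3072 = 2ms and
 * 6ms * 2048/3072 = 4ms respectively.
 */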
/*
* We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
schedstat_set(curr->statistics.exec_max,
max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
}
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
return;
/*
* Get the amount of time the current task was running
* since the last time we changed load (this cannot
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(now - curr->exec_start);
if (!delta_exec)
return;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
* Task is being enqueued - update stats:
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
}
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
rq_of(cfs_rq)->clock - se->statistics.wait_start));
schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
schedstat_set(se->statistics.wait_start, 0);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
}
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
* Scheduling class queueing methods:
*/
#ifdef CONFIG_NUMA_BALANCING
/*
* numa task sample period in ms
*/
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
static void task_numa_placement(struct task_struct *p)
{
int seq;
if (!p->mm) /* for example, ksmd faulting in a user's mm */
return;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
/* FIXME: Scheduling placement policy hints go here */
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
void task_numa_fault(int node, int pages, bool migrated)
{
struct task_struct *p = current;
if (!sched_feat_numa(NUMA))
return;
/* FIXME: Allocate task-specific structure for placement policy here */
/*
* If pages are properly placed (did not migrate) then scan slower.
* This is reset periodically in case of phase changes
*/
if (!migrated)
p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
p->numa_scan_period + jiffies_to_msecs(10));
task_numa_placement(p);
}
static void reset_ptenuma_scan(struct task_struct *p)
{
ACCESS_ONCE(p->mm->numa_scan_seq)++;
p->mm->numa_scan_offset = 0;
}
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
* Who cares about NUMA placement when they're dying.
*
* NOTE: make sure not to dereference p->mm before this check,
* exit_task_work() happens _after_ exit_mm() so we could be called
* without p->mm even though we still had it when we enqueued this
* work.
*/
if (p->flags & PF_EXITING)
return;
/*
* We do not care about task placement until a task runs on a node
* other than the first one used by the address space. This is
* largely because migrations are driven by what CPU the task
* is running on. If it's never scheduled on another node, it'll
* not migrate so why bother trapping the fault.
*/
if (mm->first_nid == NUMA_PTE_SCAN_INIT)
mm->first_nid = numa_node_id();
if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
/* Are we running on a new node yet? */
if (numa_node_id() == mm->first_nid &&
!sched_feat_numa(NUMA_FORCE))
return;
mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
}
/*
* Reset the scan period if enough time has gone by. Objective is that
* scanning will be reduced if pages are properly placed. As tasks
* can enter different phases this needs to be re-examined. Lacking
* proper tracking of reference behaviour, this blunt hammer is used.
*/
migrate = mm->numa_next_reset;
if (time_after(now, migrate)) {
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
xchg(&mm->numa_next_reset, next_scan);
}
/*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;
if (p->numa_scan_period == 0)
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
* Do not set pte_numa if the current running node is rate-limited.
* This loses statistics on the fault but if we are unwilling to
* migrate to this node, it is less likely we can do useful work
*/
if (migrate_ratelimited(numa_node_id()))
return;
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
if (!pages)
return;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma))
continue;
/* Skip small VMAs. They are not likely to be of relevance */
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
/*
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
pages -= change_prot_numa(vma, start, end);
start = end;
if (pages <= 0)
goto out;
} while (end != vma->vm_end);
}
out:
/*
* It is possible to reach the end of the VMA list but the last few VMAs are
* not guaranteed to be vma_migratable. If they are not, we would find the
* !migratable VMA on the next scan but not reset the scanner to the start
* so check it now.
*/
if (vma)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
}
/*
* Drive the periodic memory faults..
*/
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
/*
* We don't care about NUMA placement if we don't have memory.
*/
if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
return;
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
* task needs to have done some actual work before we bother with
* NUMA placement.
*/
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
curr->node_stamp = now;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
task_work_add(curr, work, true);
}
}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se))
list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se))
list_del_init(&se->group_node);
cfs_rq->nr_running--;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
long tg_weight;
/*
* Use this CPU's actual weight instead of the last load_contribution
* to gain a more accurate current total weight. See
* update_cfs_rq_load_contribution().
*/
tg_weight = atomic64_read(&tg->load_avg);
tg_weight -= cfs_rq->tg_load_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
}
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
shares = (tg->shares * load);
if (tg_weight)
shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
shares = tg->shares;
return shares;
}
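/*
 * For illustration (hypothetical numbers): with tg->shares = 1024, this
 * cpu's cfs_rq->load.weight = 1024 and tg_weight = 4096 across all cpus,
 * shares = 1024 * 1024 / 4096 = 256, i.e. this cpu's group entity carries
 * a quarter of the group's shares. The result is then clamped to
 * [MIN_SHARES, tg->shares].
 */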
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
}
update_load_set(&se->load, weight);
if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
struct sched_entity *se;
long shares;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se || throttled_hierarchy(cfs_rq))
return;
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
#endif
shares = calc_cfs_shares(cfs_rq, tg);
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
* We choose a half-life close to 1 scheduling period.
* Note: The tables below are dependent on this value.
*/
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
0x85aac367, 0x82cd8698,
};
/*
* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
* over-estimates when re-combining.
*/
static const u32 runnable_avg_yN_sum[] = {
0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
static __always_inline u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
if (!n)
return val;
else if (unlikely(n > LOAD_AVG_PERIOD * 63))
return 0;
/* after bounds checking we can collapse to 32-bit */
local_n = n;
/*
* As y^PERIOD = 1/2, we can combine
* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
* with a look-up table which covers y^m (m < PERIOD)
*
* to achieve constant time decay_load.
*/
if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
val >>= local_n / LOAD_AVG_PERIOD;
local_n %= LOAD_AVG_PERIOD;
}
val *= runnable_avg_yN_inv[local_n];
/* We don't use SRR here since we always want to round down. */
return val >> 32;
}
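/*
 * Worked example: decay_load(1024, 32) ~= 512, since y^32 = 1/2. For
 * n = 40 the value is first shifted right by 40/32 = 1 whole period group
 * (halved) and then multiplied by runnable_avg_yN_inv[8]/2^32 ~= 0.841,
 * giving decay_load(1024, 40) ~= 430.
 */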
/*
* For updates fully spanning n periods, the contribution to runnable
* average will be: \Sum 1024*y^n
*
* We can compute this reasonably efficiently by combining:
* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
*/
static u32 __compute_runnable_contrib(u64 n)
{
u32 contrib = 0;
if (likely(n <= LOAD_AVG_PERIOD))
return runnable_avg_yN_sum[n];
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
/* Compute \Sum y^n by combining precomputed values for y^i and \Sum y^j */
do {
contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
n -= LOAD_AVG_PERIOD;
} while (n > LOAD_AVG_PERIOD);
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
}
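/*
 * Worked example for n = 40: the loop runs once, leaving contrib =
 * runnable_avg_yN_sum[32] = 23371 and n = 8; that partial sum is then
 * decayed by y^8 (~= 19657) and runnable_avg_yN_sum[8] = 7437 is added,
 * for a total of roughly 27094 out of a possible LOAD_AVG_MAX of 47742.
 */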
static void add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta);
static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods);
#ifdef CONFIG_SCHED_HMP
/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sched_init_task_load_pelt;
unsigned int __read_mostly sched_init_task_load_windows;
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
static inline unsigned int task_load(struct task_struct *p)
{
if (sched_use_pelt)
return p->se.avg.runnable_avg_sum_scaled;
return p->ravg.demand;
}
unsigned int max_task_load(void)
{
if (sched_use_pelt)
return LOAD_AVG_MAX;
return sched_ravg_window;
}
/* Use this knob to turn on or off HMP-aware task placement logic */
unsigned int __read_mostly sched_enable_hmp = 0;
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
* rq->cumulative_runnable_avg > sched_spill_load
*/
unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
/*
* Control whether or not individual CPU power consumption is used to
* guide task placement.
*/
unsigned int __read_mostly sched_enable_power_aware = 0;
/*
* This specifies the maximum percent power difference between 2
* CPUs for them to be considered identical in terms of their
* power characteristics (i.e. they are in the same power band).
*/
unsigned int __read_mostly sysctl_sched_powerband_limit_pct = 20;
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
* load higher than this level, tasks become eligible for inter-cluster
* migration.
*/
unsigned int __read_mostly sched_spill_load;
unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
/*
* Tasks whose bandwidth consumption on a cpu is less than
* sched_small_task are considered as small tasks.
*/
unsigned int __read_mostly sched_small_task;
unsigned int __read_mostly sysctl_sched_small_task_pct = 10;
/*
* Tasks with demand >= sched_heavy_task will have their
* window-based demand added to the previous window's CPU
* time when they wake up, if they have slept for at least
* one full window. This feature is disabled when the tunable
* is set to 0 (the default).
*/
#ifdef CONFIG_SCHED_FREQ_INPUT
unsigned int __read_mostly sysctl_sched_heavy_task_pct;
unsigned int __read_mostly sched_heavy_task;
#endif
/*
* Tasks whose bandwidth consumption on a cpu is more than
* sched_upmigrate are considered "big" tasks. Big tasks will be
* considered for "up" migration, i.e migrating to a cpu with better
* capacity.
*/
unsigned int __read_mostly sched_upmigrate;
unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
/*
* Big tasks, once migrated, will need to drop their bandwidth
* consumption to less than sched_downmigrate before they are "down"
* migrated.
*/
unsigned int __read_mostly sched_downmigrate;
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
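/*
 * Together the two thresholds form a hysteresis. With the default 80/60
 * split, a task whose scaled demand reaches, say, 85% is classified big
 * and becomes a candidate for up-migration; once on the higher-capacity
 * cpu it is only considered for down-migration after its demand drops
 * below 60%, which avoids ping-ponging tasks that hover around the
 * upmigrate threshold.
 */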
/*
* Tasks whose nice value is > sysctl_sched_upmigrate_min_nice are never
* considered as "big" tasks.
*/
int __read_mostly sysctl_sched_upmigrate_min_nice = 15;
/*
* Tunable to govern scheduler wakeup placement CPU selection
* preference. If set, the scheduler chooses to wake up a task
* on an idle CPU.
*/
unsigned int __read_mostly sysctl_sched_prefer_idle;
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs
* with higher capacity than those where a task would have normally
* ended up with their load characteristics. Any entity enabling
* boost is responsible for disabling it as well.
*/
unsigned int sysctl_sched_boost;
static inline int available_cpu_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return rq->capacity;
}
void set_hmp_defaults(void)
{
sched_spill_load =
pct_to_real(sysctl_sched_spill_load_pct);
sched_small_task =
pct_to_real(sysctl_sched_small_task_pct);
sched_upmigrate =
pct_to_real(sysctl_sched_upmigrate_pct);
sched_downmigrate =
pct_to_real(sysctl_sched_downmigrate_pct);
#ifdef CONFIG_SCHED_FREQ_INPUT
sched_heavy_task =
pct_to_real(sysctl_sched_heavy_task_pct);
#endif
sched_init_task_load_pelt =
div64_u64((u64)sysctl_sched_init_task_load_pct *
(u64)LOAD_AVG_MAX, 100);
sched_init_task_load_windows =
div64_u64((u64)sysctl_sched_init_task_load_pct *
(u64)sched_ravg_window, 100);
}
int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct)
{
struct rq *rq = cpu_rq(cpu);
if (mostly_idle_pct < 0 || mostly_idle_pct > 100)
return -EINVAL;
rq->mostly_idle_load = pct_to_real(mostly_idle_pct);
return 0;
}
int sched_set_cpu_mostly_idle_freq(int cpu, unsigned int mostly_idle_freq)
{
struct rq *rq = cpu_rq(cpu);
if (mostly_idle_freq > rq->max_possible_freq)
return -EINVAL;
rq->mostly_idle_freq = mostly_idle_freq;
return 0;
}
unsigned int sched_get_cpu_mostly_idle_freq(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return rq->mostly_idle_freq;
}
int sched_get_cpu_mostly_idle_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
int mostly_idle_pct;
mostly_idle_pct = real_to_pct(rq->mostly_idle_load);
return mostly_idle_pct;
}
int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run)
{
struct rq *rq = cpu_rq(cpu);
rq->mostly_idle_nr_run = nr_run;
return 0;
}
int sched_get_cpu_mostly_idle_nr_run(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return rq->mostly_idle_nr_run;
}
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
* in reference to "best cpu".
*/
u64 scale_load_to_cpu(u64 task_load, int cpu)
{
struct rq *rq = cpu_rq(cpu);
task_load *= (u64)rq->load_scale_factor;
task_load /= 1024;
return task_load;
}
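/*
 * For illustration (hypothetical factor): on a cpu whose load_scale_factor
 * is 2048, i.e. one that delivers half the work per unit time of the best
 * cpu at its best frequency, a task demand of 100 units measured against
 * the best cpu scales to 200 units, reflecting the larger share of that
 * cpu the task would consume.
 */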
/* Is a task "big" on its current cpu */
static inline int is_big_task(struct task_struct *p)
{
u64 load = task_load(p);
int nice = TASK_NICE(p);
/* Todo: Provide cgroup-based control as well? */
if (nice > sysctl_sched_upmigrate_min_nice)
return 0;
load = scale_load_to_cpu(load, task_cpu(p));
return load > sched_upmigrate;
}
/* Is a task "small" on the minimum capacity CPU */
static inline int is_small_task(struct task_struct *p)
{
u64 load = task_load(p);
load *= (u64)max_load_scale_factor;
load /= 1024;
return load < sched_small_task;
}
static inline u64 cpu_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
return scale_load_to_cpu(rq->cumulative_runnable_avg, cpu);
}
static inline u64 cpu_load_sync(int cpu, int sync)
{
struct rq *rq = cpu_rq(cpu);
u64 load;
load = rq->cumulative_runnable_avg;
/*
* If load is being checked in a sync wakeup environment,
* we may want to discount the load of the currently running
* task.
*/
if (sync && cpu == smp_processor_id())
load -= rq->curr->ravg.demand;
return scale_load_to_cpu(load, cpu);
}
static int
spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu,
int sync)
{
u64 total_load = cpu_load_sync(cpu, sync) +
scale_load_to_cpu(task_load(p), cpu);
if (total_load > sched_spill_load ||
(rq->nr_running + 1) > sysctl_sched_spill_nr_run)
return 1;
return 0;
}
int mostly_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
int mostly_idle;
mostly_idle = (cpu_load(cpu) <= rq->mostly_idle_load
&& rq->nr_running <= rq->mostly_idle_nr_run);
return mostly_idle;
}
static int mostly_idle_cpu_sync(int cpu, int sync)
{
struct rq *rq = cpu_rq(cpu);
u64 load = cpu_load(cpu);
int nr_running;
nr_running = rq->nr_running;
/*
* Sync wakeups mean that the waker task will go to sleep
* soon so we should discount its load from this test.
*/
if (sync && cpu == smp_processor_id()) {
nr_running--;
load -= rq->curr->ravg.demand;
}
return (load <= rq->mostly_idle_load
&& nr_running <= rq->mostly_idle_nr_run);
}
static int boost_refcount;
static DEFINE_SPINLOCK(boost_lock);
static DEFINE_MUTEX(boost_mutex);
static void boost_kick_cpus(void)
{
int i;
for_each_online_cpu(i) {
if (cpu_rq(i)->capacity != max_capacity)
boost_kick(i);
}
}
static inline int sched_boost(void)
{
return boost_refcount > 0;
}
int sched_set_boost(int enable)
{
unsigned long flags;
int ret = 0;
int old_refcount;
if (!sched_enable_hmp)
return -EINVAL;
spin_lock_irqsave(&boost_lock, flags);
old_refcount = boost_refcount;
if (enable == 1) {
boost_refcount++;
} else if (!enable) {
if (boost_refcount >= 1)
boost_refcount--;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}
if (!old_refcount && boost_refcount)
boost_kick_cpus();
trace_sched_set_boost(boost_refcount);
spin_unlock_irqrestore(&boost_lock, flags);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&boost_mutex);
if (!write)
sysctl_sched_boost = sched_boost();
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
ret = (sysctl_sched_boost <= 1) ?
sched_set_boost(sysctl_sched_boost) : -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
/*
* A task will fit on a cpu if its bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
* "up" migrated will be considered to fit on a "little" cpu if its
* bandwidth consumption on the "little" cpu will be less than
* sched_downmigrate. This helps avoid frequent migrations for tasks
* with load close to the upmigrate threshold.
*/
static int task_will_fit(struct task_struct *p, int cpu)
{
u64 load;
int prev_cpu = task_cpu(p);
struct rq *prev_rq = cpu_rq(prev_cpu);
struct rq *rq = cpu_rq(cpu);
int upmigrate = sched_upmigrate;
int nice = TASK_NICE(p);
if (rq->capacity == max_capacity)
return 1;
if (sched_boost()) {
if (rq->capacity > prev_rq->capacity)
return 1;
} else {
/* Todo: Provide cgroup-based control as well? */
if (nice > sysctl_sched_upmigrate_min_nice)
return 1;
load = scale_load_to_cpu(task_load(p), cpu);
if (prev_rq->capacity > rq->capacity)
upmigrate = sched_downmigrate;
if (load < upmigrate)
return 1;
}
return 0;
}
static int eligible_cpu(struct task_struct *p, int cpu, int sync)
{
struct rq *rq = cpu_rq(cpu);
if (mostly_idle_cpu_sync(cpu, sync))
return 1;
if (rq->capacity != max_capacity)
return !spill_threshold_crossed(p, rq, cpu, sync);
return 0;
}
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
}
int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost)
{
int delta, cost_limit;
if (!base_cost || cpu_cost == base_cost)
return 0;
delta = cpu_cost - base_cost;
cost_limit = div64_u64((u64)sysctl_sched_powerband_limit_pct *
(u64)base_cost, 100);
return abs(delta) > cost_limit;
}
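/*
 * For illustration with the default sysctl_sched_powerband_limit_pct of
 * 20 and base_cost = 100 (hypothetical): cost_limit = 20, so a cpu_cost
 * of 115 stays within the same power band (delta 15), while a cpu_cost
 * of 130 exceeds the limit (delta 30) and is treated as a different band.
 */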
unsigned int power_cost_at_freq(int cpu, unsigned int freq)
{
int i = 0;
struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
struct cpu_pstate_pwr *costs;
if (!per_cpu_info || !per_cpu_info[cpu].ptable ||
!sched_enable_power_aware)
/* When power aware scheduling is not in use, or CPU
* power data is not available, just use the CPU
* capacity as a rough stand-in for real CPU power
* numbers, assuming bigger CPUs are more power
* hungry. */
return cpu_rq(cpu)->max_possible_capacity;
if (!freq)
freq = min_max_freq;
costs = per_cpu_info[cpu].ptable;
while (costs[i].freq != 0) {
if (costs[i].freq >= freq ||
costs[i+1].freq == 0)
return costs[i].power;
i++;
}
BUG();
}
/* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
* the CPU. */
static unsigned int power_cost(struct task_struct *p, int cpu)
{
u64 demand;
unsigned int task_freq;
unsigned int cur_freq = cpu_rq(cpu)->cur_freq;
if (!sched_enable_power_aware)
return cpu_rq(cpu)->max_possible_capacity;
/* calculate % of max freq needed */
demand = scale_load_to_cpu(task_load(p), cpu) * 100;
demand = div64_u64(demand, max_task_load());
task_freq = demand * cpu_rq(cpu)->max_possible_freq;
task_freq /= 100; /* khz needed */
task_freq = max(cur_freq, task_freq);
return power_cost_at_freq(cpu, task_freq);
}
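/*
 * For illustration (hypothetical numbers): if the scaled task load is
 * 25% of max_task_load() and max_possible_freq is 2000000 KHz, then
 * task_freq = 500000 KHz; with cur_freq = 800000 KHz the max() picks
 * 800000 KHz and power_cost_at_freq() returns the power of the first
 * table entry at or above that frequency.
 */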
static int best_small_task_cpu(struct task_struct *p, int sync)
{
int best_nonlpm_sibling_cpu = -1,
best_nonlpm_sibling_load = INT_MAX;
int best_nonlpm_nonsibling_cpu = -1,
best_nonlpm_nonsibling_load = INT_MAX;
int best_lpm_sibling_cpu = -1,
best_lpm_sibling_cstate = INT_MAX;
int best_lpm_nonsibling_cpu = -1,
best_lpm_nonsibling_cstate = INT_MAX;
int cluster_cost, i, cstate;
u64 load;
struct cpumask search_cpus;
int cpu = smp_processor_id();
cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
if (cpumask_empty(&search_cpus))
return task_cpu(p);
/*
* If a CPU is doing a sync wakeup and it only has one currently
* running task, just run the waking small task on that CPU regardless
* of what type of CPU it is.
*/
if (sync && cpu_rq(cpu)->nr_running == 1)
return cpu;
cluster_cost = power_cost(p, cpu);
/*
* 1. Least-loaded CPU in the same cluster which is not in a low
* power mode and where adding the task will not cause the
* spill threshold to be crossed.
* 2. Least-loaded CPU in the rest of the topology which is not
* in a low power mode and where adding the task will not cause
* the spill threshold to be crossed.
* 3. The idle CPU in the same cluster which is in the shallowest
* C-state.
* 4. The idle CPU in the rest of the topology which is in the
* shallowest C-state.
* 5. The task's previous CPU.
*/
for_each_cpu(i, &search_cpus) {
struct rq *rq = cpu_rq(i);
cstate = rq->cstate;
load = cpu_load_sync(i, sync);
trace_sched_cpu_load(rq, idle_cpu(i),
mostly_idle_cpu_sync(i, sync),
power_cost(p, i));
if (power_cost(p, i) == cluster_cost) {
/* This CPU is within the same cluster as the waker. */
if (cstate) {
if (cstate < best_lpm_sibling_cstate) {
best_lpm_sibling_cpu = i;
best_lpm_sibling_cstate = cstate;
}
continue;
}
if (load < best_nonlpm_sibling_load &&
!spill_threshold_crossed(p, rq, i, sync)) {
best_nonlpm_sibling_cpu = i;
best_nonlpm_sibling_load = load;
}
continue;
}
/* This CPU is not within the same cluster as the waker. */
if (cstate) {
if (cstate < best_lpm_nonsibling_cstate) {
best_lpm_nonsibling_cpu = i;
best_lpm_nonsibling_cstate = cstate;
}
continue;
}
if (load < best_nonlpm_nonsibling_load &&
!spill_threshold_crossed(p, rq, i, sync)) {
best_nonlpm_nonsibling_cpu = i;
best_nonlpm_nonsibling_load = load;
}
}
if (best_nonlpm_sibling_cpu != -1)
return best_nonlpm_sibling_cpu;
if (best_nonlpm_nonsibling_cpu != -1)
return best_nonlpm_nonsibling_cpu;
if (best_lpm_sibling_cpu != -1)
return best_lpm_sibling_cpu;
if (best_lpm_nonsibling_cpu != -1)
return best_lpm_nonsibling_cpu;
return task_cpu(p);
}
#define MOVE_TO_BIG_CPU 1
#define MOVE_TO_LITTLE_CPU 2
#define MOVE_TO_POWER_EFFICIENT_CPU 3
static int skip_cpu(struct task_struct *p, int cpu, int reason)
{
struct rq *rq = cpu_rq(cpu);
struct rq *task_rq = task_rq(p);
int skip = 0;
if (!reason)
return 0;
if (is_reserved(cpu))
return 1;
switch (reason) {
case MOVE_TO_BIG_CPU:
skip = (rq->capacity <= task_rq->capacity);
break;
case MOVE_TO_LITTLE_CPU:
skip = (rq->capacity >= task_rq->capacity);
break;
default:
skip = (cpu == task_cpu(p));
break;
}
return skip;
}
/*
* Select a single cpu in cluster as target for packing, iff cluster frequency
* is less than a threshold level
*/
static int select_packing_target(struct task_struct *p, int best_cpu)
{
struct rq *rq = cpu_rq(best_cpu);
struct cpumask search_cpus;
int i;
int min_cost = INT_MAX;
int target = best_cpu;
if (rq->cur_freq >= rq->mostly_idle_freq)
return best_cpu;
/* Don't pack if current freq is low because of throttling */
if (rq->max_freq <= rq->mostly_idle_freq)
return best_cpu;
cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
cpumask_and(&search_cpus, &search_cpus, &rq->freq_domain_cpumask);
/* Pick the first lowest power cpu as target */
for_each_cpu(i, &search_cpus) {
int cost = power_cost(p, i);
if (cost < min_cost) {
target = i;
min_cost = cost;
}
}
return target;
}
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target, int reason,
int sync)
{
int i, best_cpu = -1, fallback_idle_cpu = -1, min_cstate_cpu = -1;
int prev_cpu = task_cpu(p);
int cpu_cost, min_cost = INT_MAX;
int min_idle_cost = INT_MAX, min_busy_cost = INT_MAX;
u64 load, min_load = ULLONG_MAX, min_fallback_load = ULLONG_MAX;
int small_task = is_small_task(p);
int boost = sched_boost();
int cstate, min_cstate = INT_MAX;
int prefer_idle = reason ? 1 : sysctl_sched_prefer_idle;
/*
* PF_WAKE_UP_IDLE is a hint to the scheduler that the task being woken
* up (p) needs to be placed on an idle cpu.
*/
if ((current->flags & PF_WAKE_UP_IDLE) ||
(p->flags & PF_WAKE_UP_IDLE)) {
prefer_idle = 1;
small_task = 0;
}
trace_sched_task_load(p, small_task, boost, reason, sync);
if (small_task && !boost) {
best_cpu = best_small_task_cpu(p, sync);
goto done;
}
/* Todo : Optimize this loop */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
if (skip_cpu(p, i, reason))
continue;
trace_sched_cpu_load(cpu_rq(i), idle_cpu(i),
mostly_idle_cpu_sync(i, sync), power_cost(p, i));
/*
* The least-loaded mostly-idle CPU where the task
* won't fit is our fallback if we can't find a CPU
* where the task will fit.
*/
if (!task_will_fit(p, i)) {
if (mostly_idle_cpu_sync(i, sync)) {
load = cpu_load_sync(i, sync);
if (load < min_fallback_load) {
min_fallback_load = load;
fallback_idle_cpu = i;
}
}
continue;
}
if (!eligible_cpu(p, i, sync))
continue;
/*
* The task will fit on this CPU, and the CPU is either
* mostly_idle or not max capacity and can fit it under
* spill.
*/
load = cpu_load_sync(i, sync);
cpu_cost = power_cost(p, i);
cstate = cpu_rq(i)->cstate;
/*
* If the task fits in a CPU in a lower power band, that
* overrides load and C-state.
*/
if (power_delta_exceeded(cpu_cost, min_cost)) {
if (cpu_cost > min_cost)
continue;
min_cost = cpu_cost;
min_load = ULLONG_MAX;
min_cstate = INT_MAX;
min_cstate_cpu = -1;
best_cpu = -1;
}
/*
* Partition CPUs based on whether they are completely idle
* or not. For completely idle CPUs we choose the one in
* the lowest C-state and then break ties with power cost
*/
if (idle_cpu(i)) {
if (cstate > min_cstate)
continue;
if (cstate < min_cstate) {
min_idle_cost = cpu_cost;
min_cstate = cstate;
min_cstate_cpu = i;
continue;
}
if (cpu_cost < min_idle_cost) {
min_idle_cost = cpu_cost;
min_cstate_cpu = i;
}
continue;
}
/*
* For CPUs that are not completely idle, pick one with the
* lowest load and break ties with power cost
*/
if (load > min_load)
continue;
if (load < min_load) {
min_load = load;
min_busy_cost = cpu_cost;
best_cpu = i;
continue;
}
/*
* The load is equal to the previous selected CPU.
* This is rare but when it does happen opt for the
* more power efficient CPU option.
*/
if (cpu_cost < min_busy_cost) {
min_busy_cost = cpu_cost;
best_cpu = i;
}
}
if (min_cstate_cpu >= 0 &&
(prefer_idle ||
!(best_cpu >= 0 && mostly_idle_cpu_sync(best_cpu, sync))))
best_cpu = min_cstate_cpu;
done:
if (best_cpu < 0) {
if (unlikely(fallback_idle_cpu < 0))
/*
* For lack of a better choice just use
* prev_cpu. We may at least benefit from
* having a hot cache.
*/
best_cpu = prev_cpu;
else
best_cpu = fallback_idle_cpu;
}
if (cpu_rq(best_cpu)->mostly_idle_freq)
best_cpu = select_packing_target(p, best_cpu);
return best_cpu;
}
void inc_nr_big_small_task(struct rq *rq, struct task_struct *p)
{
if (!sched_enable_hmp || sched_disable_window_stats)
return;
if (is_big_task(p))
rq->nr_big_tasks++;
else if (is_small_task(p))
rq->nr_small_tasks++;
}
void dec_nr_big_small_task(struct rq *rq, struct task_struct *p)
{
if (!sched_enable_hmp || sched_disable_window_stats)
return;
if (is_big_task(p))
rq->nr_big_tasks--;
else if (is_small_task(p))
rq->nr_small_tasks--;
BUG_ON(rq->nr_big_tasks < 0 || rq->nr_small_tasks < 0);
}
/*
* Walk runqueue of cpu and re-initialize 'nr_big_tasks' and 'nr_small_tasks'
* counters.
*/
void fixup_nr_big_small_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *p;
rq->nr_big_tasks = 0;
rq->nr_small_tasks = 0;
list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
inc_nr_big_small_task(rq, p);
}
/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
void pre_big_small_task_count_change(const struct cpumask *cpus)
{
int i;
local_irq_disable();
for_each_cpu(i, cpus)
raw_spin_lock(&cpu_rq(i)->lock);
}
/*
* Reinitialize 'nr_big_tasks' and 'nr_small_tasks' counters on all affected
* cpus
*/
void post_big_small_task_count_change(const struct cpumask *cpus)
{
int i;
/* Assumes local_irq_disable() keeps online cpumap stable */
for_each_cpu(i, cpus)
fixup_nr_big_small_task(i);
for_each_cpu(i, cpus)
raw_spin_unlock(&cpu_rq(i)->lock);
local_irq_enable();
}
DEFINE_MUTEX(policy_mutex);
static inline int invalid_value(unsigned int *data)
{
int val = *data;
if (data == &sysctl_sched_ravg_hist_size)
return (val < 2 || val > RAVG_HIST_SIZE_MAX);
if (data == &sysctl_sched_window_stats_policy)
return (val >= WINDOW_STATS_INVALID_POLICY);
return 0;
}
/*
* Handle "atomic" update of sysctl_sched_window_stats_policy,
* sysctl_sched_ravg_hist_size, sysctl_sched_account_wait_time and
* sched_freq_legacy_mode variables.
*/
int sched_window_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
unsigned int old_val;
if (!sched_enable_hmp)
return -EINVAL;
mutex_lock(&policy_mutex);
old_val = *data;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (write && (old_val == *data)))
goto done;
if (invalid_value(data)) {
*data = old_val;
ret = -EINVAL;
goto done;
}
reset_all_window_stats(0, 0);
done:
mutex_unlock(&policy_mutex);
return ret;
}
/*
* Convert percentage values into absolute form. This avoids a div()
* operation in the fast path that would otherwise be needed to convert
* task load to a percentage scale.
*/
int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
unsigned int old_val = *data;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write || !sched_enable_hmp)
return ret;
if ((sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) ||
*data > 100) {
*data = old_val;
return -EINVAL;
}
/*
* Big/Small task tunable change will need to re-classify tasks on
* runqueue as big and small and set their counters appropriately.
* The sysctl interface affects the secondary variables (*_pct), which are
* then "atomically" carried over to the primary variables. The atomic change
* includes taking the runqueue lock of all online cpus and re-initializing
* their big/small counter values based on changed criteria.
*/
if ((*data != old_val) &&
(data == &sysctl_sched_upmigrate_pct ||
data == &sysctl_sched_small_task_pct)) {
get_online_cpus();
pre_big_small_task_count_change(cpu_online_mask);
}
set_hmp_defaults();
if ((*data != old_val) &&
(data == &sysctl_sched_upmigrate_pct ||
data == &sysctl_sched_small_task_pct)) {
post_big_small_task_count_change(cpu_online_mask);
put_online_cpus();
}
return 0;
}
/*
* Reset balance_interval at all sched_domain levels of the given cpu, so
* that it honors the kick.
*/
static inline void reset_balance_interval(int cpu)
{
struct sched_domain *sd;
if (cpu >= nr_cpu_ids)
return;
rcu_read_lock();
for_each_domain(cpu, sd)
sd->balance_interval = 0;
rcu_read_unlock();
}
static inline int find_new_hmp_ilb(int call_cpu, int type)
{
int i;
int best_cpu = nr_cpu_ids;
struct sched_domain *sd;
int min_cost = INT_MAX, cost;
struct rq *src_rq = cpu_rq(call_cpu);
struct rq *dst_rq;
rcu_read_lock();
/* Pick an idle cpu "closest" to call_cpu */
for_each_domain(call_cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
dst_rq = cpu_rq(i);
if (!idle_cpu(i) || (type == NOHZ_KICK_RESTRICT
&& dst_rq->capacity > src_rq->capacity))
continue;
cost = power_cost_at_freq(i, min_max_freq);
if (cost < min_cost) {
best_cpu = i;
min_cost = cost;
}
}
if (best_cpu < nr_cpu_ids)
break;
}
rcu_read_unlock();
reset_balance_interval(best_cpu);
return best_cpu;
}
/*
* For the current task's CPU, we don't check whether there are
* multiple tasks. Just see if running the task on another CPU is
* lower power than running only this task on the current CPU. This is
* not the most accurate model, but we should be load balanced most of
* the time anyway. */
static int lower_power_cpu_available(struct task_struct *p, int cpu)
{
int i;
int lowest_power_cpu = task_cpu(p);
int lowest_power = power_cost(p, task_cpu(p));
/* Is a lower-powered idle CPU available which will fit this task? */
for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_online_mask) {
if (idle_cpu(i) && task_will_fit(p, i)) {
int idle_power_cost = power_cost(p, i);
if (idle_power_cost < lowest_power) {
lowest_power_cpu = i;
lowest_power = idle_power_cost;
}
}
}
return (lowest_power_cpu != task_cpu(p));
}
/*
* Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal
* cpu as per its demand or priority)
*
* Returns reason why task needs to be migrated
*/
static inline int migration_needed(struct rq *rq, struct task_struct *p)
{
int nice = TASK_NICE(p);
if (!sched_enable_hmp || p->state != TASK_RUNNING)
return 0;
if (sched_boost()) {
if (rq->capacity != max_capacity)
return MOVE_TO_BIG_CPU;
return 0;
}
if (is_small_task(p))
return 0;
/* Todo: cgroup-based control? */
if (nice > sysctl_sched_upmigrate_min_nice &&
rq->capacity > min_capacity)
return MOVE_TO_LITTLE_CPU;
if (!task_will_fit(p, cpu_of(rq)))
return MOVE_TO_BIG_CPU;
if (sched_enable_power_aware &&
lower_power_cpu_available(p, cpu_of(rq)))
return MOVE_TO_POWER_EFFICIENT_CPU;
return 0;
}
static DEFINE_RAW_SPINLOCK(migration_lock);
static inline int
kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
{
unsigned long flags;
int rc = 0;
/* Invoke active balance to force migrate currently running task */
raw_spin_lock_irqsave(&rq->lock, flags);
if (!rq->active_balance) {
rq->active_balance = 1;
rq->push_cpu = new_cpu;
get_task_struct(p);
rq->push_task = p;
rc = 1;
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
return rc;
}
/*
* Check if currently running task should be migrated to a better cpu.
*
* Todo: Effect this via changes to nohz_balancer_kick() and load balance?
*/
void check_for_migration(struct rq *rq, struct task_struct *p)
{
int cpu = cpu_of(rq), new_cpu;
int active_balance = 0, reason;
reason = migration_needed(rq, p);
if (!reason)
return;
raw_spin_lock(&migration_lock);
new_cpu = select_best_cpu(p, cpu, reason, 0);
if (new_cpu != cpu) {
active_balance = kick_active_balance(rq, p, new_cpu);
if (active_balance)
mark_reserved(new_cpu);
}
raw_spin_unlock(&migration_lock);
if (active_balance)
stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
&rq->active_balance_work);
}
static inline int capacity(struct rq *rq)
{
return rq->capacity;
}
static inline int nr_big_tasks(struct rq *rq)
{
return rq->nr_big_tasks;
}
#else /* CONFIG_SCHED_HMP */
#define sched_enable_power_aware 0
static inline int select_best_cpu(struct task_struct *p, int target, int reason, int sync)
{
return 0;
}
static inline int find_new_hmp_ilb(int call_cpu, int type)
{
return 0;
}
static inline int power_cost(struct task_struct *p, int cpu)
{
return SCHED_POWER_SCALE;
}
static inline int
spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu, int sync)
{
return 0;
}
static inline int mostly_idle_cpu(int cpu)
{
return 0;
}
static inline int sched_boost(void)
{
return 0;
}
static inline int is_small_task(struct task_struct *p)
{
return 0;
}
static inline int is_big_task(struct task_struct *p)
{
return 0;
}
static inline int nr_big_tasks(struct rq *rq)
{
return 0;
}
static inline int capacity(struct rq *rq)
{
return SCHED_LOAD_SCALE;
}
#endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_SCHED_HMP
void init_new_task_load(struct task_struct *p)
{
int i;
memset(&p->ravg, 0, sizeof(struct ravg));
p->se.avg.decay_count = 0;
for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
p->ravg.sum_history[i] = sched_init_task_load_windows;
p->se.avg.runnable_avg_period =
sysctl_sched_init_task_load_pct ? LOAD_AVG_MAX : 0;
p->se.avg.runnable_avg_sum = sched_init_task_load_pelt;
p->se.avg.runnable_avg_sum_scaled = sched_init_task_load_pelt;
p->ravg.demand = sched_init_task_load_windows;
}
#else /* CONFIG_SCHED_HMP */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
void init_new_task_load(struct task_struct *p)
{
p->se.avg.decay_count = 0;
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
}
#else /* CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED */
void init_new_task_load(struct task_struct *p)
{
}
#endif /* CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SCHED_HMP */
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
*
*   [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
*         p0            p1           p2
*        (now)      (~1ms ago)   (~2ms ago)
*
* Let u_i denote the fraction of p_i that the entity was runnable.
*
* We then designate the fractions u_i as our coefficients, yielding the
* following representation of historical load:
*   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
*
* We choose y based on the width of a reasonable scheduling period, fixing:
*   y^32 = 0.5
*
* This means that the contribution to load ~32ms ago (u_32) will be weighted
* approximately half as much as the contribution to load within the last ms
* (u_0).
*
* When a period "rolls over" and we have a new u_0`, multiplying the previous
* sum again by y is sufficient to update:
*   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
*            = u_0 + u_1*y + u_2*y^2 + ...   [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int __update_entity_runnable_avg(int cpu, u64 now,
struct sched_avg *sa,
int runnable)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
delta = now - sa->last_runnable_update;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
sa->last_runnable_update = now;
return 0;
}
/*
* Use 1024ns as the unit of measurement since it's a reasonable
* approximation of 1us and fast to compute.
*/
delta >>= 10;
if (!delta)
return 0;
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->runnable_avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
if (runnable) {
sa->runnable_avg_sum += delta_w;
add_to_scaled_stat(cpu, sa, delta_w);
}
sa->runnable_avg_period += delta_w;
delta -= delta_w;
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
decay_scaled_stat(sa, periods + 1);
sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable) {
sa->runnable_avg_sum += runnable_contrib;
add_to_scaled_stat(cpu, sa, runnable_contrib);
}
sa->runnable_avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable) {
sa->runnable_avg_sum += delta;
add_to_scaled_stat(cpu, sa, delta);
}
sa->runnable_avg_period += delta;
return decayed;
}
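/*
 * Illustrative sketch (not used by the scheduler): a plain fixed-point model
 * of the decay described above. PELT_Y_Q32 is a hypothetical constant
 * approximating y = 2^(-1/32) in Q32 format (so 32 applications halve a
 * value), and the two helpers are hypothetical names that merely restate
 * what decay_load() and the period roll-over in
 * __update_entity_runnable_avg() compute.
 */
#define PELT_Y_Q32	0xfa83b2daULL	/* ~0.9786 * 2^32, so y^32 ~= 0.5 */

static inline u64 pelt_decay_sketch(u64 load, u64 periods)
{
	/*
	 * Multiply by y once for every elapsed ~1ms period.  Assumes 'load'
	 * fits in 32 bits (true for PELT sums) so the multiply cannot
	 * overflow a u64.
	 */
	while (periods--)
		load = (load * PELT_Y_Q32) >> 32;
	return load;
}

static inline u64 pelt_roll_over_sketch(u64 load_avg, u32 new_u0)
{
	/* load_avg' = u_0` + y * (u_0 + u_1*y + u_2*y^2 + ...) */
	return new_u0 + pelt_decay_sketch(load_avg, 1);
}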
/* Synchronize an entity's decay with its parenting cfs_rq. */
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
if (!decays) {
se->avg.decay_count = 0;
return 0;
}
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
se->avg.decay_count = 0;
return decays;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update)
{
struct task_group *tg = cfs_rq->tg;
s64 tg_contrib;
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic64_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
}
}
/*
* Aggregate cfs_rq runnable averages into an equivalent task_group
* representation for computing load contributions.
*/
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
long contrib;
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
atomic_add(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
}
}
static inline void __update_group_entity_contrib(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg = cfs_rq->tg;
int runnable_avg;
u64 contrib;
contrib = cfs_rq->tg_load_contrib * tg->shares;
se->avg.load_avg_contrib = div64_u64(contrib,
atomic64_read(&tg->load_avg) + 1);
/*
* For group entities we need to compute a correction term in the case
* that they are consuming <1 cpu so that we would contribute the same
* load as a task of equal weight.
*
* Explicitly co-ordinating this measurement would be expensive, but
* fortunately the sum of each cpu's contribution forms a usable
* lower bound on the true value.
*
* Consider the aggregate of two contributions. Either they are disjoint
* (and the sum represents the true value) or they overlap and we are
* understating by the aggregate of their overlap.
*
* Extending this to N cpus, for a given overlap, the maximum amount we
* understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
* cpus that overlap for this interval and w_i is the interval width.
*
* On a small machine, the first term is well-bounded, which bounds the
* total error since w_i is a subset of the period. Whereas on a
* larger machine, while this first term can be larger, if w_i is of
* consequential size it is guaranteed to see n_i*w_i quickly converge to
* our upper bound of 1-cpu.
*/
runnable_avg = atomic_read(&tg->runnable_avg);
if (runnable_avg < NICE_0_LOAD) {
se->avg.load_avg_contrib *= runnable_avg;
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
}
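/*
 * Illustrative sketch (not called anywhere): the group-entity contribution
 * computed by __update_group_entity_contrib() above, restated with plain
 * arguments.  The helper name and its parameters are hypothetical; it only
 * spells out the arithmetic for a single cfs_rq.
 */
static inline u64 group_entity_contrib_sketch(u64 tg_load_contrib,
					      u64 tg_load_avg,
					      unsigned long shares,
					      unsigned int runnable_avg)
{
	/* This cfs_rq's slice of the group's shares, weighted by its load. */
	u64 contrib = div64_u64(tg_load_contrib * shares, tg_load_avg + 1);

	/* Scale down when the whole group consumes less than one cpu. */
	if (runnable_avg < NICE_0_LOAD)
		contrib = (contrib * runnable_avg) >> NICE_0_SHIFT;

	return contrib;
}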
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
u32 contrib;
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se)
{
long old_contrib = se->avg.load_avg_contrib;
if (entity_is_task(se)) {
__update_task_entity_contrib(se);
} else {
__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
__update_group_entity_contrib(se);
}
return se->avg.load_avg_contrib - old_contrib;
}
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
if (likely(load_contrib < cfs_rq->blocked_load_avg))
cfs_rq->blocked_load_avg -= load_contrib;
else
cfs_rq->blocked_load_avg = 0;
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/* Update a sched_entity's runnable average */
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
long contrib_delta;
u64 now;
int cpu = cpu_of(rq_of(cfs_rq));
int decayed;
/*
* For a group entity we need to use its own cfs_rq_clock_task() in
* case it is the parent of a throttled hierarchy.
*/
if (entity_is_task(se)) {
now = cfs_rq_clock_task(cfs_rq);
if (se->on_rq) {
dec_cumulative_runnable_avg(rq_of(cfs_rq), task_of(se));
dec_nr_big_small_task(rq_of(cfs_rq), task_of(se));
}
} else
now = cfs_rq_clock_task(group_cfs_rq(se));
decayed = __update_entity_runnable_avg(cpu, now, &se->avg, se->on_rq);
if (entity_is_task(se) && se->on_rq) {
inc_cumulative_runnable_avg(rq_of(cfs_rq), task_of(se));
inc_nr_big_small_task(rq_of(cfs_rq), task_of(se));
}
if (!decayed)
return;
contrib_delta = __update_entity_load_avg_contrib(se);
if (!update_cfs_rq)
return;
if (se->on_rq)
cfs_rq->runnable_load_avg += contrib_delta;
else
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
}
/*
* Decay the load contributed by all blocked children and account this so that
* their contribution may be appropriately discounted when they wake up.
*/
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
u64 decays;
decays = now - cfs_rq->last_decay;
if (!decays && !force_update)
return;
if (atomic64_read(&cfs_rq->removed_load)) {
u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
subtract_blocked_load_contrib(cfs_rq, removed_load);
}
if (decays) {
cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
decays);
atomic64_add(decays, &cfs_rq->decay_counter);
cfs_rq->last_decay = now;
}
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
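/*
 * Illustrative sketch (hypothetical helper): the period arithmetic used by
 * update_cfs_rq_blocked_load() above.  Timestamps are shifted right by 20,
 * so one unit is 2^20 ns (~1.05 ms), matching the ~1ms PELT period; the
 * difference is the number of whole decay periods to apply to
 * blocked_load_avg via decay_load().
 */
static inline u64 blocked_decay_periods_sketch(u64 now_ns, u64 then_ns)
{
	return (now_ns >> 20) - (then_ns >> 20);
}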
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
__update_entity_runnable_avg(cpu_of(rq), rq->clock_task,
&rq->avg, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int wakeup)
{
/*
* We track migrations using entity decay_count <= 0; on a wake-up
* migration we use a negative decay count to track the remote decays
* accumulated while sleeping.
*/
if (unlikely(se->avg.decay_count <= 0)) {
se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
if (se->avg.decay_count) {
/*
* In a wake-up migration we have to approximate the
* time sleeping. This is because we can't synchronize
* clock_task between the two cpus, and it is not
* guaranteed to be read-safe. Instead, we can
* approximate this using our carried decays, which are
* explicitly atomically readable.
*/
se->avg.last_runnable_update -= (-se->avg.decay_count)
<< 20;
update_entity_load_avg(se, 0);
/* Indicate that we're now synchronized and on-rq */
se->avg.decay_count = 0;
}
wakeup = 0;
} else {
__synchronize_entity_decay(se);
}
/* migrated tasks did not contribute to our blocked load */
if (wakeup) {
subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
update_entity_load_avg(se, 0);
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
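/*
 * Illustrative sketch (hypothetical helper): the sleep-time approximation
 * used above for wake-up migrations.  Each carried (negative) decay stands
 * for one ~1ms (2^20 ns) period decayed on the old cpu, so rewinding
 * last_runnable_update by this many nanoseconds recreates roughly the right
 * amount of missed decay on the new cpu.
 */
static inline u64 migration_sleep_ns_sketch(s64 decay_count)
{
	/* decay_count is <= 0 on a wake-up migration. */
	return (u64)(-decay_count) << 20;
}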
/*
* Remove se's load from this cfs_rq child load-average; if the entity is
* transitioning to a blocked state we track its projected decay using
* blocked_load_avg.
*/
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int sleep)
{
update_entity_load_avg(se, 1);
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
/*
* Update the rq's load with the elapsed running time before entering
* idle. If the last scheduled task is not a CFS task, idle_enter will
* be the only way to update the runnable statistic.
*/
void idle_enter_fair(struct rq *this_rq)
{
update_rq_runnable_avg(this_rq, 1);
}
/*
* Update the rq's load with the elapsed idle time before a task is
* scheduled. If the newly scheduled task is not a CFS task, idle_exit will
* be the only way to update the runnable statistic.
*/
void idle_exit_fair(struct rq *this_rq)
{
update_rq_runnable_avg(this_rq, 0);
}
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update) {}
#endif
#ifdef CONFIG_SCHED_HMP
/* Return task demand in percentage scale */
unsigned int pct_task_load(struct task_struct *p)
{
unsigned int load;
load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
return load;
}
/*
* Add a scaled version of 'delta' to runnable_avg_sum_scaled.
* 'delta' is scaled relative to the "best" cpu, i.e. the most efficient
* cpu running at the maximum possible frequency.
*/
static inline void
add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
{
struct rq *rq = cpu_rq(cpu);
int cur_freq = rq->cur_freq, max_freq = rq->max_freq;
int cpu_max_possible_freq = rq->max_possible_freq;
u64 scaled_delta;
int sf;
if (!sched_enable_hmp)
return;
if (unlikely(cur_freq > max_possible_freq ||
(cur_freq == max_freq &&
max_freq < cpu_max_possible_freq)))
cur_freq = max_possible_freq;
scaled_delta = div64_u64(delta * cur_freq, max_possible_freq);
sf = (rq->efficiency * 1024) / max_possible_efficiency;
scaled_delta *= sf;
scaled_delta >>= 10;
sa->runnable_avg_sum_scaled += scaled_delta;
}
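/*
 * Illustrative sketch (not used by the scheduler): the scaling performed by
 * add_to_scaled_stat() in isolation, with all inputs passed in as plain
 * numbers (the parameter names are hypothetical).  For example, a cpu at
 * 800MHz out of a system-wide maximum of 1500MHz, with efficiency 1024 out
 * of a maximum 2048, scales a delta of 1000 to
 * 1000 * 800/1500 * 1024/2048 ~= 266.
 */
static inline u64 scale_exec_delta_sketch(u64 delta, unsigned int freq,
					  unsigned int max_freq,
					  unsigned int eff,
					  unsigned int max_eff)
{
	/* Scale by how fast this cpu currently runs vs. the fastest cpu. */
	u64 scaled = div64_u64(delta * freq, max_freq);

	/* Then by this cpu's instruction efficiency, in units of 1/1024. */
	return (scaled * ((eff * 1024) / max_eff)) >> 10;
}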
static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
{
if (!sched_enable_hmp)
return;
sa->runnable_avg_sum_scaled =
decay_load(sa->runnable_avg_sum_scaled,
periods);
}
#else /* CONFIG_SCHED_HMP */
static inline void
add_to_scaled_stat(int cpu, struct sched_avg *sa, u64 delta)
{
}
static inline void decay_scaled_stat(struct sched_avg *sa, u64 periods)
{
}
#endif /* CONFIG_SCHED_HMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
struct task_struct *tsk = NULL;
if (entity_is_task(se))
tsk = task_of(se);
if (se->statistics.sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.sleep_max))
se->statistics.sleep_max = delta;
se->statistics.sleep_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (se->statistics.block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > se->statistics.block_max))
se->statistics.block_max = delta;
se->statistics.block_start = 0;
se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
trace_sched_stat_blocked(tsk, delta);
trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
#endif
}
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
s64 d = se->vruntime - cfs_rq->min_vruntime;
if (d < 0)
d = -d;
if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks;
* however, the extra weight of the new task will slow them down a
* little. Place the new task so that it fits in the slot that
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
vruntime -= thresh;
}
/* ensure we never gain time by being placed backwards. */
se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update the normalized vruntime before updating min_vruntime
* by calling update_curr().
*/
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
list_add_leaf_cfs_rq(cfs_rq);
check_enqueue_throttle(cfs_rq);
}
}
static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->last == se)
cfs_rq->last = NULL;
else
break;
}
}
static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->next == se)
cfs_rq->next = NULL;
else
break;
}
}
static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->skip == se)
cfs_rq->skip = NULL;
else
break;
}
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
__clear_buddies_last(se);
if (cfs_rq->next == se)
__clear_buddies_next(se);
if (cfs_rq->skip == se)
__clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
/*
* The current task ran long enough; ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it gets to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length if the CPU's load is at
* least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 2) pick the "next" process, since someone really wants that to run
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available