| /* |
| * CPU ConCurrency (CC) is measures the CPU load by averaging |
| * the number of running tasks. Using CC, the scheduler can |
| * evaluate the load of CPUs to improve load balance for power |
| * efficiency without sacrificing performance. |
| * |
| * Copyright (C) 2013 Intel, Inc., |
| * |
| * Author: Du, Yuyang <yuyang.du@intel.com> |
| * |
| * CPU Workload Consolidation consolidate workload to the smallest |
| * number of CPUs that are capable of handling it. We measure |
| * capability of CPU by CC, then compare it with a threshold, |
| * and finally run the workload on non-shielded CPUs if they are |
| * predicted capable after the consolidation. |
| * |
| * Copyright (C) 2013 Intel, Inc., |
| * |
| * Author: Rudramuni, Vishwesh M <vishwesh.m.rudramuni@intel.com> |
| * Du, Yuyang <yuyang.du@intel.com> |
| * |
| */ |
| |
| #ifdef CONFIG_CPU_CONCURRENCY |
| |
| #include "sched.h" |
| |
| /* |
| * the sum period of time is 2^26 ns (~64) by default |
| */ |
| unsigned long sysctl_concurrency_sum_period = 26UL; |
| |
| /* |
| * the number of sum periods, after which the original |
| * will be reduced/decayed to half |
| */ |
| unsigned long sysctl_concurrency_decay_rate = 1UL; |
| |
| /* |
| * the contrib period of time is 2^10 (~1us) by default, |
| * us has better precision than ms, and |
| * 1024 makes use of faster shift than div |
| */ |
| static unsigned long cc_contrib_period = 10UL; |
| |
| #ifdef CONFIG_WORKLOAD_CONSOLIDATION |
| /* |
| * whether we use concurrency to select cpu to run |
| * the woken up task |
| */ |
| static unsigned long wc_wakeup = 1UL; |
| |
| /* |
| * concurrency lower than percentage of this number |
| * is capable of running wakee |
| */ |
| static unsigned long wc_wakeup_threshold = 80UL; |
| |
| /* |
| * aggressively push the task even it is hot |
| */ |
| static unsigned long wc_push_hot_task = 1UL; |
| #endif |
| |
| /* |
| * the concurrency is scaled up for decaying, |
| * thus, concurrency 1 is effectively 2^cc_resolution (1024), |
| * which can be halved by 10 half-life periods |
| */ |
| static unsigned long cc_resolution = 10UL; |
| |
| /* |
| * after this number of half-life periods, even |
| * (1>>32)-1 (which is sufficiently large) is less than 1 |
| */ |
| static unsigned long cc_decay_max_pds = 32UL; |
| |
| static inline unsigned long cc_scale_up(unsigned long c) |
| { |
| return c << cc_resolution; |
| } |
| |
| static inline unsigned long cc_scale_down(unsigned long c) |
| { |
| return c >> cc_resolution; |
| } |
| |
| /* from nanoseconds to sum periods */ |
| static inline u64 cc_sum_pds(u64 n) |
| { |
| return n >> sysctl_concurrency_sum_period; |
| } |
| |
| /* from sum period to timestamp in ns */ |
| static inline u64 cc_timestamp(u64 p) |
| { |
| return p << sysctl_concurrency_sum_period; |
| } |
| |
| /* |
| * from nanoseconds to contrib periods, because |
| * ns so risky that can overflow cc->contrib |
| */ |
| static inline u64 cc_contrib_pds(u64 n) |
| { |
| return n >> cc_contrib_period; |
| } |
| |
| /* |
| * cc_decay_factor only works for 32bit integer, |
| * cc_decay_factor_x, x indicates the number of periods |
| * as half-life (sysctl_concurrency_decay_rate) |
| */ |
| static const unsigned long cc_decay_factor_1[] = { |
| 0xFFFFFFFF, |
| }; |
| |
| static const unsigned long cc_decay_factor_2[] = { |
| 0xFFFFFFFF, 0xB504F333, |
| }; |
| |
| static const unsigned long cc_decay_factor_4[] = { |
| 0xFFFFFFFF, 0xD744FCCA, 0xB504F333, 0x9837F051, |
| }; |
| |
| static const unsigned long cc_decay_factor_8[] = { |
| 0xFFFFFFFF, 0xEAC0C6E7, 0xD744FCCA, 0xC5672A11, |
| 0xB504F333, 0xA5FED6A9, 0x9837F051, 0x8B95C1E3, |
| }; |
| |
/* matches the default sysctl_concurrency_decay_rate (1) */
| static const unsigned long *cc_decay_factor = |
| cc_decay_factor_1; |
| |
| /* |
| * cc_decayed_sum depends on cc_resolution (fixed 10), |
| * cc_decayed_sum_x, x indicates the number of periods |
| * as half-life (sysctl_concurrency_decay_rate) |
| */ |
| static const unsigned long cc_decayed_sum_1[] = { |
| 0, 512, 768, 896, 960, 992, |
| 1008, 1016, 1020, 1022, 1023, |
| }; |
| |
| static const unsigned long cc_decayed_sum_2[] = { |
| 0, 724, 1235, 1597, 1853, 2034, 2162, 2252, |
| 2316, 2361, 2393, 2416, 2432, 2443, 2451, |
| 2457, 2461, 2464, 2466, 2467, 2468, 2469, |
| }; |
| |
| static const unsigned long cc_decayed_sum_4[] = { |
| 0, 861, 1585, 2193, 2705, 3135, 3497, 3801, 4057, |
| 4272, 4453, 4605, 4733, 4840, 4930, 5006, 5070, |
| 5124, 5169, 5207, 5239, 5266, 5289, 5308, 5324, |
| 5337, 5348, 5358, 5366, 5373, 5379, 5384, 5388, |
| 5391, 5394, 5396, 5398, 5400, 5401, 5402, 5403, |
| 5404, 5405, 5406, |
| }; |
| |
| static const unsigned long cc_decayed_sum_8[] = { |
| 0, 939, 1800, 2589, 3313, 3977, 4585, 5143, |
| 5655, 6124, 6554, 6949, 7311, 7643, 7947, 8226, |
| 8482, 8717, 8932, 9129, 9310, 9476, 9628, 9767, |
| 9895, 10012, 10120, 10219, 10309, 10392, 10468, 10538, |
| 10602, 10661, 10715, 10764, 10809, 10850, 10888, 10923, |
| 10955, 10984, 11011, 11036, 11059, 11080, 11099, 11116, |
| 11132, 11147, 11160, 11172, 11183, 11193, 11203, 11212, |
| 11220, 11227, 11234, 11240, 11246, 11251, 11256, 11260, |
| 11264, 11268, 11271, 11274, 11277, 11280, 11282, 11284, |
| 11286, 11288, 11290, 11291, 11292, 11293, 11294, 11295, |
| 11296, 11297, 11298, 11299, 11300, 11301, 11302, |
| }; |
| |
/* matches the default sysctl_concurrency_decay_rate (1) */
| static const unsigned long *cc_decayed_sum = cc_decayed_sum_1; |
| |
| /* |
| * the last index of cc_decayed_sum array |
| */ |
| static unsigned long cc_decayed_sum_len = |
| sizeof(cc_decayed_sum_1) / sizeof(cc_decayed_sum_1[0]) - 1; |
| |
| /* |
| * sysctl handler to update decay rate |
| */ |
| int concurrency_decay_rate_handler(struct ctl_table *table, int write, |
| void __user *buffer, size_t *lenp, loff_t *ppos) |
| { |
| int ret = proc_dointvec(table, write, buffer, lenp, ppos); |
| |
| if (ret || !write) |
| return ret; |
| |
| switch (sysctl_concurrency_decay_rate) { |
| case 1: |
| cc_decay_factor = cc_decay_factor_1; |
| cc_decayed_sum = cc_decayed_sum_1; |
| cc_decayed_sum_len = sizeof(cc_decayed_sum_1) / |
| sizeof(cc_decayed_sum_1[0]) - 1; |
| break; |
| case 2: |
| cc_decay_factor = cc_decay_factor_2; |
| cc_decayed_sum = cc_decayed_sum_2; |
| cc_decayed_sum_len = sizeof(cc_decayed_sum_2) / |
| sizeof(cc_decayed_sum_2[0]) - 1; |
| break; |
| case 4: |
| cc_decay_factor = cc_decay_factor_4; |
| cc_decayed_sum = cc_decayed_sum_4; |
| cc_decayed_sum_len = sizeof(cc_decayed_sum_4) / |
| sizeof(cc_decayed_sum_4[0]) - 1; |
| break; |
| case 8: |
| cc_decay_factor = cc_decay_factor_8; |
| cc_decayed_sum = cc_decayed_sum_8; |
| cc_decayed_sum_len = sizeof(cc_decayed_sum_8) / |
| sizeof(cc_decayed_sum_8[0]) - 1; |
| break; |
| default: |
| return -EINVAL; |
| } |
| |
| cc_decay_max_pds *= sysctl_concurrency_decay_rate; |
| |
| return 0; |
| } |
| |
| /* |
| * decay concurrency at some decay rate |
| */ |
| static inline u64 decay_cc(u64 cc, u64 periods) |
| { |
| u32 periods_l; |
| |
| if (periods <= 0) |
| return cc; |
| |
| if (unlikely(periods >= cc_decay_max_pds)) |
| return 0; |
| |
| /* now period is not too large */ |
| periods_l = (u32)periods; |
| if (periods_l >= sysctl_concurrency_decay_rate) { |
| cc >>= periods_l / sysctl_concurrency_decay_rate; |
| periods_l %= sysctl_concurrency_decay_rate; |
| } |
| |
| if (!periods_l) |
| return cc; |
| |
| cc *= cc_decay_factor[periods_l]; |
| |
| return cc >> 32; |
| } |
| |
| /* |
| * add missed periods by predefined constants |
| */ |
| static inline u64 cc_missed_pds(u64 periods) |
| { |
| if (periods <= 0) |
| return 0; |
| |
| if (periods > cc_decayed_sum_len) |
| periods = cc_decayed_sum_len; |
| |
| return cc_decayed_sum[periods]; |
| } |
| |
| /* |
| * scale up nr_running, because we decay |
| */ |
| static inline unsigned long cc_weight(unsigned long nr_running) |
| { |
| /* |
| * scaling factor, this should be tunable |
| */ |
| return cc_scale_up(nr_running); |
| } |
| |
| static inline void |
| __update_concurrency(struct rq *rq, u64 now, struct cpu_concurrency_t *cc) |
| { |
| u64 sum_pds, sum_pds_s, sum_pds_e; |
| u64 contrib_pds, ts_contrib, contrib_pds_one; |
| u64 sum_now; |
| unsigned long weight; |
| int updated = 0; |
| |
| /* |
| * guarantee contrib_timestamp always >= sum_timestamp, |
| * and sum_timestamp is at period boundary |
| */ |
| if (now <= cc->sum_timestamp) { |
| cc->sum_timestamp = cc_timestamp(cc_sum_pds(now)); |
| cc->contrib_timestamp = now; |
| return; |
| } |
| |
| weight = cc_weight(cc->nr_running); |
| |
| /* start and end of sum periods */ |
| sum_pds_s = cc_sum_pds(cc->sum_timestamp); |
| sum_pds_e = cc_sum_pds(now); |
| sum_pds = sum_pds_e - sum_pds_s; |
| /* number of contrib periods in one sum period */ |
| contrib_pds_one = cc_contrib_pds(cc_timestamp(1)); |
| |
| /* |
| * if we have passed at least one period, |
| * we need to do four things: |
| */ |
| if (sum_pds) { |
| /* 1) complete the last period */ |
| ts_contrib = cc_timestamp(sum_pds_s + 1); |
| contrib_pds = cc_contrib_pds(ts_contrib); |
| contrib_pds -= cc_contrib_pds(cc->contrib_timestamp); |
| |
| if (likely(contrib_pds)) |
| cc->contrib += weight * contrib_pds; |
| |
| cc->contrib = div64_u64(cc->contrib, contrib_pds_one); |
| |
| cc->sum += cc->contrib; |
| cc->contrib = 0; |
| |
| /* 2) update/decay them */ |
| cc->sum = decay_cc(cc->sum, sum_pds); |
| sum_now = decay_cc(cc->sum, sum_pds - 1); |
| |
| /* 3) compensate missed periods if any */ |
| sum_pds -= 1; |
| cc->sum += cc->nr_running * cc_missed_pds(sum_pds); |
| sum_now += cc->nr_running * cc_missed_pds(sum_pds - 1); |
| updated = 1; |
| |
| /* 4) update contrib timestamp to period boundary */ |
| ts_contrib = cc_timestamp(sum_pds_e); |
| |
| cc->sum_timestamp = ts_contrib; |
| cc->contrib_timestamp = ts_contrib; |
| } |
| |
| /* current period */ |
| contrib_pds = cc_contrib_pds(now); |
| contrib_pds -= cc_contrib_pds(cc->contrib_timestamp); |
| |
| if (likely(contrib_pds)) |
| cc->contrib += weight * contrib_pds; |
| |
| /* new nr_running for next update */ |
| cc->nr_running = rq->nr_running; |
| |
| /* |
| * we need to account for the current sum period, |
| * if now has passed 1/2 of sum period, we contribute, |
| * otherwise, we use the last complete sum period |
| */ |
| contrib_pds = cc_contrib_pds(now - cc->sum_timestamp); |
| |
| if (contrib_pds > contrib_pds_one / 2) { |
| sum_now = div64_u64(cc->contrib, contrib_pds); |
| sum_now += cc->sum; |
| updated = 1; |
| } |
| |
| if (updated == 1) |
| cc->sum_now = sum_now; |
| cc->contrib_timestamp = now; |
| } |
| |
| void init_cpu_concurrency(struct rq *rq) |
| { |
| rq->concurrency.sum = 0; |
| rq->concurrency.sum_now = 0; |
| rq->concurrency.contrib = 0; |
| rq->concurrency.nr_running = 0; |
| rq->concurrency.sum_timestamp = ULLONG_MAX; |
| rq->concurrency.contrib_timestamp = ULLONG_MAX; |
| #ifdef CONFIG_WORKLOAD_CONSOLIDATION |
| rq->concurrency.unload = 0; |
| #endif |
| } |
| |
| /* |
| * we update cpu concurrency at: |
| * 1) enqueue task, which increases concurrency |
| * 2) dequeue task, which decreases concurrency |
 * 3) periodic scheduler tick, in case no en/dequeue happens for long
| * 4) enter and exit idle (necessary?) |
| */ |
| void update_cpu_concurrency(struct rq *rq) |
| { |
| /* |
| * protected under rq->lock |
| */ |
| struct cpu_concurrency_t *cc = &rq->concurrency; |
| u64 now = rq->clock; |
| |
| __update_concurrency(rq, now, cc); |
| } |
| |
| #endif |
| |
| #ifdef CONFIG_WORKLOAD_CONSOLIDATION |
| /* |
| * whether cpu is capable of having more concurrency |
| */ |
| static int cpu_cc_capable(int cpu) |
| { |
| u64 sum = cpu_rq(cpu)->concurrency.sum_now; |
| u64 threshold = cc_weight(1); |
| |
| sum *= 100; |
| sum *= cpu_rq(cpu)->cpu_power; |
| |
| threshold *= wc_wakeup_threshold; |
| threshold <<= SCHED_POWER_SHIFT; |
| |
| if (sum <= threshold) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* |
| * we do not select idle, if the cc of the |
| * wakee and waker (in this order) is capable |
| * of handling the wakee task |
| */ |
| int workload_consolidation_wakeup(int prev, int target) |
| { |
| if (!wc_wakeup) { |
| if (idle_cpu(target)) |
| return target; |
| |
| return nr_cpu_ids; |
| } |
| |
| if (idle_cpu(prev) || cpu_cc_capable(prev)) |
| return prev; |
| |
| if (prev != target && (idle_cpu(target) || cpu_cc_capable(target))) |
| return target; |
| |
| return nr_cpu_ids; |
| } |
| |
| static inline u64 sched_group_cc(struct sched_group *sg) |
| { |
| u64 sg_cc = 0; |
| int i; |
| |
| for_each_cpu(i, sched_group_cpus(sg)) |
| sg_cc += cpu_rq(i)->concurrency.sum_now * |
| cpu_rq(i)->cpu_power; |
| |
| return sg_cc; |
| } |
| |
| static inline u64 sched_domain_cc(struct sched_domain *sd) |
| { |
| struct sched_group *sg = sd->groups; |
| u64 sd_cc = 0; |
| |
| do { |
| sd_cc += sched_group_cc(sg); |
| sg = sg->next; |
| } while (sg != sd->groups); |
| |
| return sd_cc; |
| } |
| |
| static inline struct sched_group * |
| find_lowest_cc_group(struct sched_group *sg, int span) |
| { |
| u64 grp_cc, min = ULLONG_MAX; |
| struct sched_group *lowest = NULL; |
| int i; |
| |
| for (i = 0; i < span; ++i) { |
| grp_cc = sched_group_cc(sg); |
| |
| if (grp_cc < min) { |
| min = grp_cc; |
| lowest = sg; |
| } |
| |
| sg = sg->next; |
| } |
| |
| return lowest; |
| } |
| |
| static inline u64 __calc_cc_thr(int cpus, unsigned int asym_cc) |
| { |
| u64 thr = cpus; |
| |
| thr *= cc_weight(1); |
| thr *= asym_cc; |
| thr <<= SCHED_POWER_SHIFT; |
| |
| return thr; |
| } |
| |
| /* |
| * can @src_cc of @src_nr cpus be consolidated |
| * to @dst_cc of @dst_nr cpus |
| */ |
| static inline int |
| __can_consolidate_cc(u64 src_cc, int src_nr, u64 dst_cc, int dst_nr) |
| { |
| dst_cc *= dst_nr; |
| src_nr -= dst_nr; |
| |
| if (unlikely(src_nr <= 0)) |
| return 0; |
| |
| src_nr = ilog2(src_nr); |
| src_nr += dst_nr; |
| src_cc *= src_nr; |
| |
| if (src_cc > dst_cc) |
| return 0; |
| |
| return 1; |
| } |
| |
| /* |
| * find the group for asymmetric concurrency |
| * problem to address: traverse sd from top to down |
| */ |
| struct sched_group * |
| workload_consolidation_find_group(struct sched_domain *sd, |
| struct task_struct *p, int this_cpu) |
| { |
| int half, sg_weight, ns_half = 0; |
| struct sched_group *sg; |
| u64 sd_cc; |
| |
| half = DIV_ROUND_CLOSEST(sd->total_groups, 2); |
| sg_weight = sd->groups->group_weight; |
| |
| sd_cc = sched_domain_cc(sd); |
| sd_cc *= 100; |
| |
| while (half) { |
| int allowed = 0, i; |
| int cpus = sg_weight * half; |
| u64 threshold = __calc_cc_thr(cpus, |
| sd->asym_concurrency); |
| |
| /* |
| * we did not consider the added cc by this |
| * wakeup (mostly from fork/exec) |
| */ |
| if (!__can_consolidate_cc(sd_cc, sd->span_weight, |
| threshold, cpus)) |
| break; |
| |
		sg = sd->first_group;
		for (i = 0; i < half; ++i) {
			/* skip a group that has no cpus allowed for p */
			if (!cpumask_intersects(sched_group_cpus(sg),
					tsk_cpus_allowed(p))) {
				sg = sg->next;
				continue;
			}

			allowed = 1;
			break;
		}
| |
| if (!allowed) |
| break; |
| |
| ns_half = half; |
| half /= 2; |
| } |
| |
| if (!ns_half) |
| return NULL; |
| |
| if (ns_half == 1) |
| return sd->first_group; |
| |
| return find_lowest_cc_group(sd->first_group, ns_half); |
| } |
| |
| /* |
| * top_flag_domain - return top sched_domain containing flag. |
| * @cpu: the cpu whose highest level of sched domain is to |
| * be returned. |
| * @flag: the flag to check for the highest sched_domain |
| * for the given cpu. |
| * |
| * returns the highest sched_domain of a cpu which contains the given flag. |
| * different from highest_flag_domain in that along the domain upward chain |
| * domain may or may not contain the flag. |
| */ |
| static inline struct sched_domain *top_flag_domain(int cpu, int flag) |
| { |
| struct sched_domain *sd, *hsd = NULL; |
| |
| for_each_domain(cpu, sd) { |
| if (!(sd->flags & flag)) |
| continue; |
| hsd = sd; |
| } |
| |
| return hsd; |
| } |
| |
| /* |
| * workload_consolidation_cpu_shielded - return whether @cpu is shielded or not |
| * |
| * traverse downward the sched_domain tree when the sched_domain contains |
| * flag SD_ASYM_CONCURRENCY, each sd may have more than two groups, but |
| * we assume 1) every sched_group has the same weight, 2) every CPU has |
| * the same computing power |
| */ |
| int workload_consolidation_cpu_shielded(int cpu) |
| { |
| struct sched_domain *sd; |
| |
| sd = top_flag_domain(cpu, SD_ASYM_CONCURRENCY); |
| |
| while (sd) { |
| int half, sg_weight, this_sg_nr; |
| u64 sd_cc; |
| |
| if (!(sd->flags & SD_ASYM_CONCURRENCY)) { |
| sd = sd->child; |
| continue; |
| } |
| |
| half = DIV_ROUND_CLOSEST(sd->total_groups, 2); |
| sg_weight = sd->groups->group_weight; |
| this_sg_nr = sd->group_number; |
| |
| sd_cc = sched_domain_cc(sd); |
| sd_cc *= 100; |
| |
| while (half) { |
| int cpus = sg_weight * half; |
| u64 threshold = __calc_cc_thr(cpus, |
| sd->asym_concurrency); |
| |
| if (!__can_consolidate_cc(sd_cc, sd->span_weight, |
| threshold, cpus)) |
| return 0; |
| |
| if (this_sg_nr >= half) |
| return 1; |
| |
| half /= 2; |
| } |
| |
| sd = sd->child; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * as of now, we have the following assumption |
| * 1) every sched_group has the same weight |
| * 2) every CPU has the same computing power |
| */ |
| static inline int __nonshielded_groups(struct sched_domain *sd) |
| { |
| int half, sg_weight, ret = 0; |
| u64 sd_cc; |
| |
| half = DIV_ROUND_CLOSEST(sd->total_groups, 2); |
| sg_weight = sd->groups->group_weight; |
| |
| sd_cc = sched_domain_cc(sd); |
| sd_cc *= 100; |
| |
| while (half) { |
| int cpus = sg_weight * half; |
| u64 threshold = __calc_cc_thr(cpus, |
| sd->asym_concurrency); |
| |
| if (!__can_consolidate_cc(sd_cc, sd->span_weight, |
| threshold, cpus)) |
| return ret; |
| |
| ret = half; |
| half /= 2; |
| } |
| |
| return ret; |
| } |
| |
| static DEFINE_PER_CPU(struct cpumask, nonshielded_cpumask); |
| |
| /* |
| * workload_consolidation_nonshielded_mask - return the nonshielded cpus in the @mask, |
| * which is unmasked by the shielded cpus |
| * |
| * traverse downward the sched_domain tree when the sched_domain contains |
| * flag SD_ASYM_CONCURRENCY, each sd may have more than two groups |
| */ |
| void workload_consolidation_nonshielded_mask(int cpu, struct cpumask *mask) |
| { |
| struct sched_domain *sd; |
| struct cpumask *pcpu_mask = &per_cpu(nonshielded_cpumask, cpu); |
| int i; |
| |
| sd = top_flag_domain(cpu, SD_ASYM_CONCURRENCY); |
| |
| if (!sd) |
| return; |
| |
| while (sd) { |
| struct sched_group *sg; |
| int this_sg_nr, ns_half; |
| |
| if (!(sd->flags & SD_ASYM_CONCURRENCY)) { |
| sd = sd->child; |
| continue; |
| } |
| |
| ns_half = __nonshielded_groups(sd); |
| |
| if (!ns_half) |
| break; |
| |
| cpumask_clear(pcpu_mask); |
| sg = sd->first_group; |
| |
| for (i = 0; i < ns_half; ++i) { |
| cpumask_or(pcpu_mask, pcpu_mask, |
| sched_group_cpus(sg)); |
| sg = sg->next; |
| } |
| |
| cpumask_and(mask, mask, pcpu_mask); |
| |
| this_sg_nr = sd->group_number; |
| if (this_sg_nr) |
| break; |
| |
| sd = sd->child; |
| } |
| } |
| |
| static int cpu_task_hot(struct task_struct *p, u64 now) |
| { |
| s64 delta; |
| |
| if (p->sched_class != &fair_sched_class) |
| return 0; |
| |
| if (unlikely(p->policy == SCHED_IDLE)) |
| return 0; |
| |
| if (sysctl_sched_migration_cost == -1) |
| return 1; |
| |
| if (sysctl_sched_migration_cost == 0) |
| return 0; |
| |
| if (wc_push_hot_task) |
| return 0; |
| |
| /* |
| * buddy candidates are cache hot: |
| */ |
| if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && |
| (&p->se == p->se.cfs_rq->next || |
| &p->se == p->se.cfs_rq->last)) { |
| return 1; |
| } |
| |
| delta = now - p->se.exec_start; |
| |
| if (delta < (s64)sysctl_sched_migration_cost) |
| return 1; |
| |
| return 0; |
| } |
| |
| static int |
| cpu_move_task(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) |
| { |
| /* |
| * we do not migrate tasks that are: |
| * 1) running (obviously), or |
| * 2) cannot be migrated to this CPU due to cpus_allowed, or |
| * 3) are cache-hot on their current CPU. |
| */ |
| if (!cpumask_test_cpu(dst_rq->cpu, tsk_cpus_allowed(p))) |
| return 0; |
| |
| if (task_running(src_rq, p)) |
| return 0; |
| |
| /* |
| * aggressive migration if task is cache cold |
| */ |
| if (!cpu_task_hot(p, src_rq->clock_task)) { |
| /* |
| * move a task |
| */ |
| deactivate_task(src_rq, p, 0); |
| set_task_cpu(p, dst_rq->cpu); |
| activate_task(dst_rq, p, 0); |
| check_preempt_curr(dst_rq, p, 0); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * __unload_cpu_work is run by src cpu stopper, which pushes running |
| * tasks off src cpu onto dst cpu |
| */ |
| static int __unload_cpu_work(void *data) |
| { |
| struct rq *src_rq = data; |
| int src_cpu = cpu_of(src_rq); |
| struct cpu_concurrency_t *cc = &src_rq->concurrency; |
| struct rq *dst_rq = cpu_rq(cc->dst_cpu); |
| |
| struct list_head *tasks = &src_rq->cfs_tasks; |
| struct task_struct *p, *n; |
| int pushed = 0; |
| int nr_migrate_break = 1; |
| |
| raw_spin_lock_irq(&src_rq->lock); |
| |
| /* make sure the requested cpu hasn't gone down in the meantime */ |
| if (unlikely(src_cpu != smp_processor_id() || !cc->unload)) |
| goto out_unlock; |
| |
| /* Is there any task to move? */ |
| if (src_rq->nr_running <= 1) |
| goto out_unlock; |
| |
| double_lock_balance(src_rq, dst_rq); |
| |
| list_for_each_entry_safe(p, n, tasks, se.group_node) { |
| |
| if (!cpu_move_task(p, src_rq, dst_rq)) |
| continue; |
| |
| pushed++; |
| |
| if (pushed >= nr_migrate_break) |
| break; |
| } |
| |
| double_unlock_balance(src_rq, dst_rq); |
| out_unlock: |
| cc->unload = 0; |
| raw_spin_unlock_irq(&src_rq->lock); |
| |
| return 0; |
| } |
| |
| /* |
| * unload src_cpu to dst_cpu |
| */ |
| static void unload_cpu(int src_cpu, int dst_cpu) |
| { |
| unsigned long flags; |
| struct rq *src_rq = cpu_rq(src_cpu); |
| struct cpu_concurrency_t *cc = &src_rq->concurrency; |
| int unload = 0; |
| |
| raw_spin_lock_irqsave(&src_rq->lock, flags); |
| |
| if (!cc->unload) { |
| cc->unload = 1; |
| cc->dst_cpu = dst_cpu; |
| unload = 1; |
| } |
| |
| raw_spin_unlock_irqrestore(&src_rq->lock, flags); |
| |
| if (unload) |
| stop_one_cpu_nowait(src_cpu, __unload_cpu_work, src_rq, |
| &cc->unload_work); |
| } |
| |
| static inline int find_lowest_cc_cpu(struct cpumask *mask) |
| { |
| u64 cpu_cc, min = ULLONG_MAX; |
| int i, lowest = nr_cpu_ids; |
| struct rq *rq; |
| |
| for_each_cpu(i, mask) { |
| rq = cpu_rq(i); |
| cpu_cc = rq->concurrency.sum_now * rq->cpu_power; |
| |
| if (cpu_cc < min) { |
| min = cpu_cc; |
| lowest = i; |
| } |
| } |
| |
| return lowest; |
| } |
| |
| /* |
| * find the lowest cc cpu in shielded and nonshielded cpus, |
| * aggressively unload the shielded to the nonshielded |
| */ |
| void workload_consolidation_unload(struct cpumask *nonshielded) |
| { |
| int src_cpu = nr_cpu_ids, dst_cpu, i; |
| u64 cpu_cc, min = ULLONG_MAX; |
| struct rq *rq; |
| |
| for_each_cpu_not(i, nonshielded) { |
| if (i >= nr_cpu_ids) |
| break; |
| |
| rq = cpu_rq(i); |
		if (!rq->nr_running)
| continue; |
| |
| cpu_cc = rq->concurrency.sum_now * rq->cpu_power; |
| if (cpu_cc < min) { |
| min = cpu_cc; |
| src_cpu = i; |
| } |
| } |
| |
| if (src_cpu >= nr_cpu_ids) |
| return; |
| |
| dst_cpu = find_lowest_cc_cpu(nonshielded); |
| if (dst_cpu >= nr_cpu_ids) |
| return; |
| |
| if (src_cpu != dst_cpu) |
| unload_cpu(src_cpu, dst_cpu); |
| } |
| |
| #endif /* CONFIG_WORKLOAD_CONSOLIDATION */ |