1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 
30 #include <trace/events/sched.h>
31 
32 #include "sched.h"
33 
34 /*
35  * Targeted preemption latency for CPU-bound tasks:
36  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
37  *
38  * NOTE: this latency value is not the same as the concept of
39  * 'timeslice length' - timeslices in CFS are of variable length
40  * and have no persistent notion like in traditional, time-slice
41  * based scheduling concepts.
42  *
43  * (to see the precise effective timeslice length of your workload,
44  *  run vmstat and monitor the context-switches (cs) field)
45  */
46 unsigned int sysctl_sched_latency = 6000000ULL;
47 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
48 
49 /*
50  * The initial- and re-scaling of tunables is configurable
51  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
52  *
53  * Options are:
54  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
55  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
56  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
57  */
58 enum sched_tunable_scaling sysctl_sched_tunable_scaling
59 	= SCHED_TUNABLESCALING_LOG;
60 
61 /*
62  * Minimal preemption granularity for CPU-bound tasks:
63  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
64  */
65 unsigned int sysctl_sched_min_granularity = 750000ULL;
66 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
67 
68 /*
69  * This is kept at sysctl_sched_latency / sysctl_sched_min_granularity
70  */
71 static unsigned int sched_nr_latency = 8;
72 
73 /*
74  * After fork, child runs first. If set to 0 (default) then
75  * parent will (try to) run first.
76  */
77 unsigned int sysctl_sched_child_runs_first __read_mostly;
78 
79 /*
80  * SCHED_OTHER wake-up granularity.
81  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82  *
83  * This option delays the preemption effects of decoupled workloads
84  * and reduces their over-scheduling. Synchronous workloads will still
85  * have immediate wakeup/sleep latencies.
86  */
87 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 
90 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 
92 /*
93  * The exponential sliding window over which load is averaged for shares
94  * distribution.
95  * (default: 10msec)
96  */
97 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98 
99 #ifdef CONFIG_CFS_BANDWIDTH
100 /*
101  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
102  * each time a cfs_rq requests quota.
103  *
104  * Note: in the case that the slice exceeds the runtime remaining (either due
105  * to consumption or the quota being specified to be smaller than the slice)
106  * we will always only issue the remaining available time.
107  *
108  * default: 5 msec, units: microseconds
109  */
110 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
111 #endif
112 
113 /*
114  * Increase the granularity value when there are more CPUs,
115  * because with more CPUs the 'effective latency' as visible
116  * to users decreases. But the relationship is not linear,
117  * so pick a second-best guess by going with the log2 of the
118  * number of CPUs.
119  *
120  * This idea comes from the SD scheduler of Con Kolivas:
121  */
122 static int get_update_sysctl_factor(void)
123 {
124 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 	unsigned int factor;
126 
127 	switch (sysctl_sched_tunable_scaling) {
128 	case SCHED_TUNABLESCALING_NONE:
129 		factor = 1;
130 		break;
131 	case SCHED_TUNABLESCALING_LINEAR:
132 		factor = cpus;
133 		break;
134 	case SCHED_TUNABLESCALING_LOG:
135 	default:
136 		factor = 1 + ilog2(cpus);
137 		break;
138 	}
139 
140 	return factor;
141 }
142 
143 static void update_sysctl(void)
144 {
145 	unsigned int factor = get_update_sysctl_factor();
146 
147 #define SET_SYSCTL(name) \
148 	(sysctl_##name = (factor) * normalized_sysctl_##name)
149 	SET_SYSCTL(sched_min_granularity);
150 	SET_SYSCTL(sched_latency);
151 	SET_SYSCTL(sched_wakeup_granularity);
152 #undef SET_SYSCTL
153 }
154 
155 void sched_init_granularity(void)
156 {
157 	update_sysctl();
158 }
159 
160 #if BITS_PER_LONG == 32
161 # define WMULT_CONST	(~0UL)
162 #else
163 # define WMULT_CONST	(1UL << 32)
164 #endif
165 
166 #define WMULT_SHIFT	32
167 
168 /*
169  * Shift right and round:
170  */
171 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
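
/*
 * Example (illustrative): SRR() is a divide-by-2^y that rounds to nearest
 * rather than truncating, e.g. SRR(12, 3) = (12 + 4) >> 3 = 2, matching
 * 12/8 = 1.5 rounded up, whereas a plain 12 >> 3 would yield 1.
 */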
172 
173 /*
174  * delta *= weight / lw
175  */
176 static unsigned long
177 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 		struct load_weight *lw)
179 {
180 	u64 tmp;
181 
182 	/*
183 	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 	 * 2^SCHED_LOAD_RESOLUTION.
186 	 */
187 	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 		tmp = (u64)delta_exec * scale_load_down(weight);
189 	else
190 		tmp = (u64)delta_exec;
191 
192 	if (!lw->inv_weight) {
193 		unsigned long w = scale_load_down(lw->weight);
194 
195 		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 			lw->inv_weight = 1;
197 		else if (unlikely(!w))
198 			lw->inv_weight = WMULT_CONST;
199 		else
200 			lw->inv_weight = WMULT_CONST / w;
201 	}
202 
203 	/*
204 	 * Check whether we'd overflow the 64-bit multiplication:
205 	 */
206 	if (unlikely(tmp > WMULT_CONST))
207 		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 			WMULT_SHIFT/2);
209 	else
210 		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211 
212 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213 }
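
/*
 * Worked example (illustrative, ignoring SCHED_LOAD_RESOLUTION scaling and
 * rounding): with delta_exec = 6,000,000ns, weight = 1024 (nice-0) and
 * lw->weight = 3072 (this task plus a weight-2048 sibling on the same rq),
 * inv_weight is roughly 2^32 / 3072 and the result comes out to about
 * 6,000,000 * 1024 / 3072 = 2,000,000ns.
 */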
214 
215 
216 const struct sched_class fair_sched_class;
217 
218 /**************************************************************
219  * CFS operations on generic schedulable entities:
220  */
221 
222 #ifdef CONFIG_FAIR_GROUP_SCHED
223 
224 /* cpu runqueue to which this cfs_rq is attached */
225 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
226 {
227 	return cfs_rq->rq;
228 }
229 
230 /* An entity is a task if it doesn't "own" a runqueue */
231 #define entity_is_task(se)	(!se->my_q)
232 
233 static inline struct task_struct *task_of(struct sched_entity *se)
234 {
235 #ifdef CONFIG_SCHED_DEBUG
236 	WARN_ON_ONCE(!entity_is_task(se));
237 #endif
238 	return container_of(se, struct task_struct, se);
239 }
240 
241 /* Walk up scheduling entities hierarchy */
242 #define for_each_sched_entity(se) \
243 		for (; se; se = se->parent)
244 
245 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
246 {
247 	return p->se.cfs_rq;
248 }
249 
250 /* runqueue on which this entity is (to be) queued */
251 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
252 {
253 	return se->cfs_rq;
254 }
255 
256 /* runqueue "owned" by this group */
257 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
258 {
259 	return grp->my_q;
260 }
261 
262 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263 {
264 	if (!cfs_rq->on_list) {
265 		/*
266 		 * Ensure we either appear before our parent (if already
267 		 * enqueued) or force our parent to appear after us when it is
268 		 * enqueued.  The fact that we always enqueue bottom-up
269 		 * reduces this to two cases.
270 		 */
271 		if (cfs_rq->tg->parent &&
272 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
273 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
274 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
275 		} else {
276 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
277 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
278 		}
279 
280 		cfs_rq->on_list = 1;
281 	}
282 }
283 
284 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
285 {
286 	if (cfs_rq->on_list) {
287 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
288 		cfs_rq->on_list = 0;
289 	}
290 }
291 
292 /* Iterate through all leaf cfs_rq's on a runqueue */
293 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
294 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
295 
296 /* Do the two (enqueued) entities belong to the same group? */
297 static inline int
298 is_same_group(struct sched_entity *se, struct sched_entity *pse)
299 {
300 	if (se->cfs_rq == pse->cfs_rq)
301 		return 1;
302 
303 	return 0;
304 }
305 
306 static inline struct sched_entity *parent_entity(struct sched_entity *se)
307 {
308 	return se->parent;
309 }
310 
311 /* return depth at which a sched entity is present in the hierarchy */
312 static inline int depth_se(struct sched_entity *se)
313 {
314 	int depth = 0;
315 
316 	for_each_sched_entity(se)
317 		depth++;
318 
319 	return depth;
320 }
321 
322 static void
323 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
324 {
325 	int se_depth, pse_depth;
326 
327 	/*
328 	 * A preemption test can only be made between sibling entities that are
329 	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
330 	 * hierarchy of both tasks until we find ancestors that are siblings
331 	 * on a common cfs_rq.
332 	 */
333 
334 	/* First walk up until both entities are at same depth */
335 	se_depth = depth_se(*se);
336 	pse_depth = depth_se(*pse);
337 
338 	while (se_depth > pse_depth) {
339 		se_depth--;
340 		*se = parent_entity(*se);
341 	}
342 
343 	while (pse_depth > se_depth) {
344 		pse_depth--;
345 		*pse = parent_entity(*pse);
346 	}
347 
348 	while (!is_same_group(*se, *pse)) {
349 		*se = parent_entity(*se);
350 		*pse = parent_entity(*pse);
351 	}
352 }
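
/*
 * Example (illustrative): if *se is a task nested two group levels deep
 * (depth 3) and *pse is a task directly on the root cfs_rq (depth 1), the
 * first loop replaces *se by its grandparent group entity; both are then
 * at depth 1 on the root cfs_rq, so is_same_group() succeeds and the
 * final loop exits immediately.
 */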
353 
354 #else	/* !CONFIG_FAIR_GROUP_SCHED */
355 
356 static inline struct task_struct *task_of(struct sched_entity *se)
357 {
358 	return container_of(se, struct task_struct, se);
359 }
360 
361 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
362 {
363 	return container_of(cfs_rq, struct rq, cfs);
364 }
365 
366 #define entity_is_task(se)	1
367 
368 #define for_each_sched_entity(se) \
369 		for (; se; se = NULL)
370 
371 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
372 {
373 	return &task_rq(p)->cfs;
374 }
375 
376 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
377 {
378 	struct task_struct *p = task_of(se);
379 	struct rq *rq = task_rq(p);
380 
381 	return &rq->cfs;
382 }
383 
384 /* runqueue "owned" by this group */
385 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
386 {
387 	return NULL;
388 }
389 
390 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
391 {
392 }
393 
394 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
395 {
396 }
397 
398 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
399 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
400 
401 static inline int
402 is_same_group(struct sched_entity *se, struct sched_entity *pse)
403 {
404 	return 1;
405 }
406 
407 static inline struct sched_entity *parent_entity(struct sched_entity *se)
408 {
409 	return NULL;
410 }
411 
412 static inline void
413 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
414 {
415 }
416 
417 #endif	/* CONFIG_FAIR_GROUP_SCHED */
418 
419 static __always_inline
420 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421 
422 /**************************************************************
423  * Scheduling class tree data structure manipulation methods:
424  */
425 
426 static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
427 {
428 	s64 delta = (s64)(vruntime - min_vruntime);
429 	if (delta > 0)
430 		min_vruntime = vruntime;
431 
432 	return min_vruntime;
433 }
434 
435 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
436 {
437 	s64 delta = (s64)(vruntime - min_vruntime);
438 	if (delta < 0)
439 		min_vruntime = vruntime;
440 
441 	return min_vruntime;
442 }
443 
444 static inline int entity_before(struct sched_entity *a,
445 				struct sched_entity *b)
446 {
447 	return (s64)(a->vruntime - b->vruntime) < 0;
448 }
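
/*
 * Example (illustrative): the (s64) cast makes these comparisons safe
 * across u64 wrap-around. If min_vruntime == (u64)-2 and a newly computed
 * vruntime has wrapped to 3, then vruntime - min_vruntime == 5 and
 * (s64)5 > 0, so max_vruntime() correctly treats 3 as the later value
 * even though it is numerically smaller as an unsigned quantity.
 */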
449 
450 static void update_min_vruntime(struct cfs_rq *cfs_rq)
451 {
452 	u64 vruntime = cfs_rq->min_vruntime;
453 
454 	if (cfs_rq->curr)
455 		vruntime = cfs_rq->curr->vruntime;
456 
457 	if (cfs_rq->rb_leftmost) {
458 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
459 						   struct sched_entity,
460 						   run_node);
461 
462 		if (!cfs_rq->curr)
463 			vruntime = se->vruntime;
464 		else
465 			vruntime = min_vruntime(vruntime, se->vruntime);
466 	}
467 
468 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
469 #ifndef CONFIG_64BIT
470 	smp_wmb();
471 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
472 #endif
473 }
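
/*
 * Sketch (illustrative, not part of the original source) of the 32-bit
 * reader side that pairs with the smp_wmb() above; the reader in
 * task_waking_fair() follows this pattern:
 *
 *	do {
 *		copy = cfs_rq->min_vruntime_copy;
 *		smp_rmb();
 *		min_vruntime = cfs_rq->min_vruntime;
 *	} while (min_vruntime != copy);
 *
 * Re-reading until both values agree yields a consistent 64-bit snapshot
 * without taking rq->lock.
 */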
474 
475 /*
476  * Enqueue an entity into the rb-tree:
477  */
478 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
479 {
480 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
481 	struct rb_node *parent = NULL;
482 	struct sched_entity *entry;
483 	int leftmost = 1;
484 
485 	/*
486 	 * Find the right place in the rbtree:
487 	 */
488 	while (*link) {
489 		parent = *link;
490 		entry = rb_entry(parent, struct sched_entity, run_node);
491 		/*
492 		 * We don't care about collisions. Nodes with
493 		 * the same key stay together.
494 		 */
495 		if (entity_before(se, entry)) {
496 			link = &parent->rb_left;
497 		} else {
498 			link = &parent->rb_right;
499 			leftmost = 0;
500 		}
501 	}
502 
503 	/*
504 	 * Maintain a cache of leftmost tree entries (it is frequently
505 	 * used):
506 	 */
507 	if (leftmost)
508 		cfs_rq->rb_leftmost = &se->run_node;
509 
510 	rb_link_node(&se->run_node, parent, link);
511 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
512 }
513 
514 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
515 {
516 	if (cfs_rq->rb_leftmost == &se->run_node) {
517 		struct rb_node *next_node;
518 
519 		next_node = rb_next(&se->run_node);
520 		cfs_rq->rb_leftmost = next_node;
521 	}
522 
523 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
524 }
525 
526 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
527 {
528 	struct rb_node *left = cfs_rq->rb_leftmost;
529 
530 	if (!left)
531 		return NULL;
532 
533 	return rb_entry(left, struct sched_entity, run_node);
534 }
535 
536 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
537 {
538 	struct rb_node *next = rb_next(&se->run_node);
539 
540 	if (!next)
541 		return NULL;
542 
543 	return rb_entry(next, struct sched_entity, run_node);
544 }
545 
546 #ifdef CONFIG_SCHED_DEBUG
547 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
548 {
549 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
550 
551 	if (!last)
552 		return NULL;
553 
554 	return rb_entry(last, struct sched_entity, run_node);
555 }
556 
557 /**************************************************************
558  * Scheduling class statistics methods:
559  */
560 
561 int sched_proc_update_handler(struct ctl_table *table, int write,
562 		void __user *buffer, size_t *lenp,
563 		loff_t *ppos)
564 {
565 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
566 	int factor = get_update_sysctl_factor();
567 
568 	if (ret || !write)
569 		return ret;
570 
571 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
572 					sysctl_sched_min_granularity);
573 
574 #define WRT_SYSCTL(name) \
575 	(normalized_sysctl_##name = sysctl_##name / (factor))
576 	WRT_SYSCTL(sched_min_granularity);
577 	WRT_SYSCTL(sched_latency);
578 	WRT_SYSCTL(sched_wakeup_granularity);
579 #undef WRT_SYSCTL
580 
581 	return 0;
582 }
583 #endif
584 
585 /*
586  * delta /= w
587  */
588 static inline unsigned long
589 calc_delta_fair(unsigned long delta, struct sched_entity *se)
590 {
591 	if (unlikely(se->load.weight != NICE_0_LOAD))
592 		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
593 
594 	return delta;
595 }
596 
597 /*
598  * The idea is to set a period in which each task runs once.
599  *
600  * When there are too many tasks (more than sched_nr_latency) we have to stretch
601  * this period because otherwise the slices get too small.
602  *
603  * p = (nr <= nl) ? l : l*nr/nl
604  */
605 static u64 __sched_period(unsigned long nr_running)
606 {
607 	u64 period = sysctl_sched_latency;
608 	unsigned long nr_latency = sched_nr_latency;
609 
610 	if (unlikely(nr_running > nr_latency)) {
611 		period = sysctl_sched_min_granularity;
612 		period *= nr_running;
613 	}
614 
615 	return period;
616 }
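
/*
 * Worked example (illustrative), assuming the unscaled defaults of 6ms
 * latency, 0.75ms minimum granularity and sched_nr_latency = 8:
 *
 *   nr_running = 5   ->  period = 6ms (everyone fits in one latency window)
 *   nr_running = 16  ->  period = 16 * 0.75ms = 12ms (stretched so that no
 *                        slice drops below the minimum granularity)
 */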
617 
618 /*
619  * We calculate the wall-time slice from the period by taking a part
620  * proportional to the weight.
621  *
622  * s = p*P[w/rw]
623  */
624 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
625 {
626 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
627 
628 	for_each_sched_entity(se) {
629 		struct load_weight *load;
630 		struct load_weight lw;
631 
632 		cfs_rq = cfs_rq_of(se);
633 		load = &cfs_rq->load;
634 
635 		if (unlikely(!se->on_rq)) {
636 			lw = cfs_rq->load;
637 
638 			update_load_add(&lw, se->load.weight);
639 			load = &lw;
640 		}
641 		slice = calc_delta_mine(slice, se->load.weight, load);
642 	}
643 	return slice;
644 }
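
/*
 * Worked example (illustrative, flat hierarchy, hypothetical weights):
 * two runnable entities of weight 1024 and 2048 share a 6ms period, so
 * their wall-time slices are 6ms * 1024/3072 = 2ms and
 * 6ms * 2048/3072 = 4ms respectively.
 */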
645 
646 /*
647  * We calculate the vruntime slice of a to-be-inserted task
648  *
649  * vs = s/w
650  */
651 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652 {
653 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
654 }
655 
656 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657 static void update_cfs_shares(struct cfs_rq *cfs_rq);
658 
659 /*
660  * Update the current task's runtime statistics. Skip current tasks that
661  * are not in our scheduling class.
662  */
663 static inline void
664 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
665 	      unsigned long delta_exec)
666 {
667 	unsigned long delta_exec_weighted;
668 
669 	schedstat_set(curr->statistics.exec_max,
670 		      max((u64)delta_exec, curr->statistics.exec_max));
671 
672 	curr->sum_exec_runtime += delta_exec;
673 	schedstat_add(cfs_rq, exec_clock, delta_exec);
674 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
675 
676 	curr->vruntime += delta_exec_weighted;
677 	update_min_vruntime(cfs_rq);
678 
679 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 	cfs_rq->load_unacc_exec_time += delta_exec;
681 #endif
682 }
683 
684 static void update_curr(struct cfs_rq *cfs_rq)
685 {
686 	struct sched_entity *curr = cfs_rq->curr;
687 	u64 now = rq_of(cfs_rq)->clock_task;
688 	unsigned long delta_exec;
689 
690 	if (unlikely(!curr))
691 		return;
692 
693 	/*
694 	 * Get the amount of time the current task was running
695 	 * since the last time we changed load (this cannot
696 	 * overflow on 32 bits):
697 	 */
698 	delta_exec = (unsigned long)(now - curr->exec_start);
699 	if (!delta_exec)
700 		return;
701 
702 	__update_curr(cfs_rq, curr, delta_exec);
703 	curr->exec_start = now;
704 
705 	if (entity_is_task(curr)) {
706 		struct task_struct *curtask = task_of(curr);
707 
708 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
709 		cpuacct_charge(curtask, delta_exec);
710 		account_group_exec_runtime(curtask, delta_exec);
711 	}
712 
713 	account_cfs_rq_runtime(cfs_rq, delta_exec);
714 }
715 
716 static inline void
717 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
718 {
719 	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
720 }
721 
722 /*
723  * Task is being enqueued - update stats:
724  */
725 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
726 {
727 	/*
728 	 * Are we enqueueing a waiting task? (for current tasks
729 	 * a dequeue/enqueue event is a NOP)
730 	 */
731 	if (se != cfs_rq->curr)
732 		update_stats_wait_start(cfs_rq, se);
733 }
734 
735 static void
736 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
737 {
738 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
739 			rq_of(cfs_rq)->clock - se->statistics.wait_start));
740 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
741 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
742 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
743 #ifdef CONFIG_SCHEDSTATS
744 	if (entity_is_task(se)) {
745 		trace_sched_stat_wait(task_of(se),
746 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
747 	}
748 #endif
749 	schedstat_set(se->statistics.wait_start, 0);
750 }
751 
752 static inline void
753 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
754 {
755 	/*
756 	 * Mark the end of the wait period if dequeueing a
757 	 * waiting task:
758 	 */
759 	if (se != cfs_rq->curr)
760 		update_stats_wait_end(cfs_rq, se);
761 }
762 
763 /*
764  * We are picking a new current task - update its stats:
765  */
766 static inline void
767 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
768 {
769 	/*
770 	 * We are starting a new run period:
771 	 */
772 	se->exec_start = rq_of(cfs_rq)->clock_task;
773 }
774 
775 /**************************************************
776  * Scheduling class queueing methods:
777  */
778 
779 static void
780 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781 {
782 	update_load_add(&cfs_rq->load, se->load.weight);
783 	if (!parent_entity(se))
784 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785 #ifdef CONFIG_SMP
786 	if (entity_is_task(se))
787 		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788 #endif
789 	cfs_rq->nr_running++;
790 }
791 
792 static void
793 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794 {
795 	update_load_sub(&cfs_rq->load, se->load.weight);
796 	if (!parent_entity(se))
797 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
798 	if (entity_is_task(se))
799 		list_del_init(&se->group_node);
800 	cfs_rq->nr_running--;
801 }
802 
803 #ifdef CONFIG_FAIR_GROUP_SCHED
804 /* we need this in update_cfs_load and load-balance functions below */
805 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806 # ifdef CONFIG_SMP
807 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 					    int global_update)
809 {
810 	struct task_group *tg = cfs_rq->tg;
811 	long load_avg;
812 
813 	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 	load_avg -= cfs_rq->load_contribution;
815 
816 	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 		atomic_add(load_avg, &tg->load_weight);
818 		cfs_rq->load_contribution += load_avg;
819 	}
820 }
821 
822 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823 {
824 	u64 period = sysctl_sched_shares_window;
825 	u64 now, delta;
826 	unsigned long load = cfs_rq->load.weight;
827 
828 	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 		return;
830 
831 	now = rq_of(cfs_rq)->clock_task;
832 	delta = now - cfs_rq->load_stamp;
833 
834 	/* truncate load history at 4 idle periods */
835 	if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 	    now - cfs_rq->load_last > 4 * period) {
837 		cfs_rq->load_period = 0;
838 		cfs_rq->load_avg = 0;
839 		delta = period - 1;
840 	}
841 
842 	cfs_rq->load_stamp = now;
843 	cfs_rq->load_unacc_exec_time = 0;
844 	cfs_rq->load_period += delta;
845 	if (load) {
846 		cfs_rq->load_last = now;
847 		cfs_rq->load_avg += delta * load;
848 	}
849 
850 	/* consider updating load contribution on each fold or truncate */
851 	if (global_update || cfs_rq->load_period > period
852 	    || !cfs_rq->load_period)
853 		update_cfs_rq_load_contribution(cfs_rq, global_update);
854 
855 	while (cfs_rq->load_period > period) {
856 		/*
857 		 * Inline assembly required to prevent the compiler
858 		 * optimising this loop into a divmod call.
859 		 * See __iter_div_u64_rem() for another example of this.
860 		 */
861 		asm("" : "+rm" (cfs_rq->load_period));
862 		cfs_rq->load_period /= 2;
863 		cfs_rq->load_avg /= 2;
864 	}
865 
866 	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 		list_del_leaf_cfs_rq(cfs_rq);
868 }
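
/*
 * Example (illustrative): with the default 10ms shares window, a cfs_rq
 * that carries a given load.weight for a full window accumulates a
 * load_avg on the order of load.weight times the window length; each time
 * load_period grows past the window both load_period and load_avg are
 * halved, so older windows decay geometrically instead of being
 * remembered forever.
 */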
869 
870 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871 {
872 	long tg_weight;
873 
874 	/*
875 	 * Use this CPU's actual weight instead of the last load_contribution
876 	 * to gain a more accurate current total weight. See
877 	 * update_cfs_rq_load_contribution().
878 	 */
879 	tg_weight = atomic_read(&tg->load_weight);
880 	tg_weight -= cfs_rq->load_contribution;
881 	tg_weight += cfs_rq->load.weight;
882 
883 	return tg_weight;
884 }
885 
886 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
887 {
888 	long tg_weight, load, shares;
889 
890 	tg_weight = calc_tg_weight(tg, cfs_rq);
891 	load = cfs_rq->load.weight;
892 
893 	shares = (tg->shares * load);
894 	if (tg_weight)
895 		shares /= tg_weight;
896 
897 	if (shares < MIN_SHARES)
898 		shares = MIN_SHARES;
899 	if (shares > tg->shares)
900 		shares = tg->shares;
901 
902 	return shares;
903 }
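
/*
 * Worked example (illustrative, hypothetical numbers): for a group with
 * tg->shares = 1024 whose runqueue on this cpu carries load.weight = 512
 * out of a group-wide tg_weight of 2048, the group entity here is given
 * shares = 1024 * 512 / 2048 = 256, clamped to [MIN_SHARES, tg->shares].
 */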
904 
905 static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906 {
907 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 		update_cfs_load(cfs_rq, 0);
909 		update_cfs_shares(cfs_rq);
910 	}
911 }
912 # else /* CONFIG_SMP */
913 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914 {
915 }
916 
917 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918 {
919 	return tg->shares;
920 }
921 
922 static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923 {
924 }
925 # endif /* CONFIG_SMP */
926 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 			    unsigned long weight)
928 {
929 	if (se->on_rq) {
930 		/* commit outstanding execution time */
931 		if (cfs_rq->curr == se)
932 			update_curr(cfs_rq);
933 		account_entity_dequeue(cfs_rq, se);
934 	}
935 
936 	update_load_set(&se->load, weight);
937 
938 	if (se->on_rq)
939 		account_entity_enqueue(cfs_rq, se);
940 }
941 
942 static void update_cfs_shares(struct cfs_rq *cfs_rq)
943 {
944 	struct task_group *tg;
945 	struct sched_entity *se;
946 	long shares;
947 
948 	tg = cfs_rq->tg;
949 	se = tg->se[cpu_of(rq_of(cfs_rq))];
950 	if (!se || throttled_hierarchy(cfs_rq))
951 		return;
952 #ifndef CONFIG_SMP
953 	if (likely(se->load.weight == tg->shares))
954 		return;
955 #endif
956 	shares = calc_cfs_shares(cfs_rq, tg);
957 
958 	reweight_entity(cfs_rq_of(se), se, shares);
959 }
960 #else /* CONFIG_FAIR_GROUP_SCHED */
961 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
962 {
963 }
964 
965 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
966 {
967 }
968 
969 static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
970 {
971 }
972 #endif /* CONFIG_FAIR_GROUP_SCHED */
973 
974 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975 {
976 #ifdef CONFIG_SCHEDSTATS
977 	struct task_struct *tsk = NULL;
978 
979 	if (entity_is_task(se))
980 		tsk = task_of(se);
981 
982 	if (se->statistics.sleep_start) {
983 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
984 
985 		if ((s64)delta < 0)
986 			delta = 0;
987 
988 		if (unlikely(delta > se->statistics.sleep_max))
989 			se->statistics.sleep_max = delta;
990 
991 		se->statistics.sleep_start = 0;
992 		se->statistics.sum_sleep_runtime += delta;
993 
994 		if (tsk) {
995 			account_scheduler_latency(tsk, delta >> 10, 1);
996 			trace_sched_stat_sleep(tsk, delta);
997 		}
998 	}
999 	if (se->statistics.block_start) {
1000 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1001 
1002 		if ((s64)delta < 0)
1003 			delta = 0;
1004 
1005 		if (unlikely(delta > se->statistics.block_max))
1006 			se->statistics.block_max = delta;
1007 
1008 		se->statistics.block_start = 0;
1009 		se->statistics.sum_sleep_runtime += delta;
1010 
1011 		if (tsk) {
1012 			if (tsk->in_iowait) {
1013 				se->statistics.iowait_sum += delta;
1014 				se->statistics.iowait_count++;
1015 				trace_sched_stat_iowait(tsk, delta);
1016 			}
1017 
1018 			trace_sched_stat_blocked(tsk, delta);
1019 
1020 			/*
1021 			 * Blocking time is in units of nanosecs, so shift by
1022 			 * 20 to get a milliseconds-range estimation of the
1023 			 * amount of time that the task spent sleeping:
1024 			 */
1025 			if (unlikely(prof_on == SLEEP_PROFILING)) {
1026 				profile_hits(SLEEP_PROFILING,
1027 						(void *)get_wchan(tsk),
1028 						delta >> 20);
1029 			}
1030 			account_scheduler_latency(tsk, delta >> 10, 0);
1031 		}
1032 	}
1033 #endif
1034 }
1035 
1036 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1037 {
1038 #ifdef CONFIG_SCHED_DEBUG
1039 	s64 d = se->vruntime - cfs_rq->min_vruntime;
1040 
1041 	if (d < 0)
1042 		d = -d;
1043 
1044 	if (d > 3*sysctl_sched_latency)
1045 		schedstat_inc(cfs_rq, nr_spread_over);
1046 #endif
1047 }
1048 
1049 static void
1050 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1051 {
1052 	u64 vruntime = cfs_rq->min_vruntime;
1053 
1054 	/*
1055 	 * The 'current' period is already promised to the current tasks,
1056 	 * however the extra weight of the new task will slow them down a
1057 	 * little. Place the new task so that it fits in the slot that
1058 	 * stays open at the end.
1059 	 */
1060 	if (initial && sched_feat(START_DEBIT))
1061 		vruntime += sched_vslice(cfs_rq, se);
1062 
1063 	/* sleeps up to a single latency don't count. */
1064 	if (!initial) {
1065 		unsigned long thresh = sysctl_sched_latency;
1066 
1067 		/*
1068 		 * Halve their sleep time's effect, to allow
1069 		 * for a gentler effect of sleepers:
1070 		 */
1071 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1072 			thresh >>= 1;
1073 
1074 		vruntime -= thresh;
1075 	}
1076 
1077 	/* ensure we never gain time by being placed backwards. */
1078 	vruntime = max_vruntime(se->vruntime, vruntime);
1079 
1080 	se->vruntime = vruntime;
1081 }
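
/*
 * Worked example (illustrative, unscaled defaults): on a wakeup with
 * cfs_rq->min_vruntime at 100ms and GENTLE_FAIR_SLEEPERS enabled the
 * target is 100ms - 6ms/2 = 97ms. A task that slept for a long time (old
 * vruntime 50ms) is placed at 97ms, i.e. it gets at most half a latency
 * window of credit; a task that only napped briefly (vruntime 99ms)
 * keeps its own 99ms and gains nothing.
 */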
1082 
1083 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1084 
1085 static void
1086 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1087 {
1088 	/*
1089 	 * Update the normalized vruntime before updating min_vruntime
1090 	 * through calling update_curr().
1091 	 */
1092 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1093 		se->vruntime += cfs_rq->min_vruntime;
1094 
1095 	/*
1096 	 * Update run-time statistics of the 'current'.
1097 	 */
1098 	update_curr(cfs_rq);
1099 	update_cfs_load(cfs_rq, 0);
1100 	account_entity_enqueue(cfs_rq, se);
1101 	update_cfs_shares(cfs_rq);
1102 
1103 	if (flags & ENQUEUE_WAKEUP) {
1104 		place_entity(cfs_rq, se, 0);
1105 		enqueue_sleeper(cfs_rq, se);
1106 	}
1107 
1108 	update_stats_enqueue(cfs_rq, se);
1109 	check_spread(cfs_rq, se);
1110 	if (se != cfs_rq->curr)
1111 		__enqueue_entity(cfs_rq, se);
1112 	se->on_rq = 1;
1113 
1114 	if (cfs_rq->nr_running == 1) {
1115 		list_add_leaf_cfs_rq(cfs_rq);
1116 		check_enqueue_throttle(cfs_rq);
1117 	}
1118 }
1119 
1120 static void __clear_buddies_last(struct sched_entity *se)
1121 {
1122 	for_each_sched_entity(se) {
1123 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1124 		if (cfs_rq->last == se)
1125 			cfs_rq->last = NULL;
1126 		else
1127 			break;
1128 	}
1129 }
1130 
1131 static void __clear_buddies_next(struct sched_entity *se)
1132 {
1133 	for_each_sched_entity(se) {
1134 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1135 		if (cfs_rq->next == se)
1136 			cfs_rq->next = NULL;
1137 		else
1138 			break;
1139 	}
1140 }
1141 
1142 static void __clear_buddies_skip(struct sched_entity *se)
1143 {
1144 	for_each_sched_entity(se) {
1145 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1146 		if (cfs_rq->skip == se)
1147 			cfs_rq->skip = NULL;
1148 		else
1149 			break;
1150 	}
1151 }
1152 
1153 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1154 {
1155 	if (cfs_rq->last == se)
1156 		__clear_buddies_last(se);
1157 
1158 	if (cfs_rq->next == se)
1159 		__clear_buddies_next(se);
1160 
1161 	if (cfs_rq->skip == se)
1162 		__clear_buddies_skip(se);
1163 }
1164 
1165 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1166 
1167 static void
1168 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1169 {
1170 	/*
1171 	 * Update run-time statistics of the 'current'.
1172 	 */
1173 	update_curr(cfs_rq);
1174 
1175 	update_stats_dequeue(cfs_rq, se);
1176 	if (flags & DEQUEUE_SLEEP) {
1177 #ifdef CONFIG_SCHEDSTATS
1178 		if (entity_is_task(se)) {
1179 			struct task_struct *tsk = task_of(se);
1180 
1181 			if (tsk->state & TASK_INTERRUPTIBLE)
1182 				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1183 			if (tsk->state & TASK_UNINTERRUPTIBLE)
1184 				se->statistics.block_start = rq_of(cfs_rq)->clock;
1185 		}
1186 #endif
1187 	}
1188 
1189 	clear_buddies(cfs_rq, se);
1190 
1191 	if (se != cfs_rq->curr)
1192 		__dequeue_entity(cfs_rq, se);
1193 	se->on_rq = 0;
1194 	update_cfs_load(cfs_rq, 0);
1195 	account_entity_dequeue(cfs_rq, se);
1196 
1197 	/*
1198 	 * Normalize the entity after updating the min_vruntime because the
1199 	 * update can refer to the ->curr item and we need to reflect this
1200 	 * movement in our normalized position.
1201 	 */
1202 	if (!(flags & DEQUEUE_SLEEP))
1203 		se->vruntime -= cfs_rq->min_vruntime;
1204 
1205 	/* return excess runtime on last dequeue */
1206 	return_cfs_rq_runtime(cfs_rq);
1207 
1208 	update_min_vruntime(cfs_rq);
1209 	update_cfs_shares(cfs_rq);
1210 }
1211 
1212 /*
1213  * Preempt the current task with a newly woken task if needed:
1214  */
1215 static void
1216 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1217 {
1218 	unsigned long ideal_runtime, delta_exec;
1219 	struct sched_entity *se;
1220 	s64 delta;
1221 
1222 	ideal_runtime = sched_slice(cfs_rq, curr);
1223 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1224 	if (delta_exec > ideal_runtime) {
1225 		resched_task(rq_of(cfs_rq)->curr);
1226 		/*
1227 		 * The current task ran long enough, ensure it doesn't get
1228 		 * re-elected due to buddy favours.
1229 		 */
1230 		clear_buddies(cfs_rq, curr);
1231 		return;
1232 	}
1233 
1234 	/*
1235 	 * Ensure that a task that missed wakeup preemption by a
1236 	 * narrow margin doesn't have to wait for a full slice.
1237 	 * This also mitigates buddy induced latencies under load.
1238 	 */
1239 	if (delta_exec < sysctl_sched_min_granularity)
1240 		return;
1241 
1242 	se = __pick_first_entity(cfs_rq);
1243 	delta = curr->vruntime - se->vruntime;
1244 
1245 	if (delta < 0)
1246 		return;
1247 
1248 	if (delta > ideal_runtime)
1249 		resched_task(rq_of(cfs_rq)->curr);
1250 }
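
/*
 * Worked example (illustrative, unscaled defaults): with three equal
 * nice-0 tasks the slice is 6ms / 3 = 2ms. If curr has run 2.5ms since it
 * was picked it is rescheduled; if it has run only 0.5ms it is left alone
 * even when its vruntime is ahead, because 0.5ms is below the 0.75ms
 * minimum granularity.
 */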
1251 
1252 static void
1253 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1254 {
1255 	/* 'current' is not kept within the tree. */
1256 	if (se->on_rq) {
1257 		/*
1258 		 * Any task has to be enqueued before it gets to execute on
1259 		 * a CPU. So account for the time it spent waiting on the
1260 		 * runqueue.
1261 		 */
1262 		update_stats_wait_end(cfs_rq, se);
1263 		__dequeue_entity(cfs_rq, se);
1264 	}
1265 
1266 	update_stats_curr_start(cfs_rq, se);
1267 	cfs_rq->curr = se;
1268 #ifdef CONFIG_SCHEDSTATS
1269 	/*
1270 	 * Track our maximum slice length, if the CPU's load is at
1271 	 * least twice that of our own weight (i.e. don't track it
1272 	 * when there are only lesser-weight tasks around):
1273 	 */
1274 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1275 		se->statistics.slice_max = max(se->statistics.slice_max,
1276 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
1277 	}
1278 #endif
1279 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1280 }
1281 
1282 static int
1283 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1284 
1285 /*
1286  * Pick the next process, keeping these things in mind, in this order:
1287  * 1) keep things fair between processes/task groups
1288  * 2) pick the "next" process, since someone really wants that to run
1289  * 3) pick the "last" process, for cache locality
1290  * 4) do not run the "skip" process, if something else is available
1291  */
1292 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1293 {
1294 	struct sched_entity *se = __pick_first_entity(cfs_rq);
1295 	struct sched_entity *left = se;
1296 
1297 	/*
1298 	 * Avoid running the skip buddy, if running something else can
1299 	 * be done without getting too unfair.
1300 	 */
1301 	if (cfs_rq->skip == se) {
1302 		struct sched_entity *second = __pick_next_entity(se);
1303 		if (second && wakeup_preempt_entity(second, left) < 1)
1304 			se = second;
1305 	}
1306 
1307 	/*
1308 	 * Prefer last buddy, try to return the CPU to a preempted task.
1309 	 */
1310 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1311 		se = cfs_rq->last;
1312 
1313 	/*
1314 	 * Someone really wants this to run. If it's not unfair, run it.
1315 	 */
1316 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1317 		se = cfs_rq->next;
1318 
1319 	clear_buddies(cfs_rq, se);
1320 
1321 	return se;
1322 }
1323 
1324 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1325 
1326 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1327 {
1328 	/*
1329 	 * If still on the runqueue then deactivate_task()
1330 	 * was not called and update_curr() has to be done:
1331 	 */
1332 	if (prev->on_rq)
1333 		update_curr(cfs_rq);
1334 
1335 	/* throttle cfs_rqs exceeding runtime */
1336 	check_cfs_rq_runtime(cfs_rq);
1337 
1338 	check_spread(cfs_rq, prev);
1339 	if (prev->on_rq) {
1340 		update_stats_wait_start(cfs_rq, prev);
1341 		/* Put 'current' back into the tree. */
1342 		__enqueue_entity(cfs_rq, prev);
1343 	}
1344 	cfs_rq->curr = NULL;
1345 }
1346 
1347 static void
1348 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1349 {
1350 	/*
1351 	 * Update run-time statistics of the 'current'.
1352 	 */
1353 	update_curr(cfs_rq);
1354 
1355 	/*
1356 	 * Update share accounting for long-running entities.
1357 	 */
1358 	update_entity_shares_tick(cfs_rq);
1359 
1360 #ifdef CONFIG_SCHED_HRTICK
1361 	/*
1362 	 * queued ticks are scheduled to match the slice, so don't bother
1363 	 * validating it and just reschedule.
1364 	 */
1365 	if (queued) {
1366 		resched_task(rq_of(cfs_rq)->curr);
1367 		return;
1368 	}
1369 	/*
1370 	 * don't let the period tick interfere with the hrtick preemption
1371 	 */
1372 	if (!sched_feat(DOUBLE_TICK) &&
1373 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1374 		return;
1375 #endif
1376 
1377 	if (cfs_rq->nr_running > 1)
1378 		check_preempt_tick(cfs_rq, curr);
1379 }
1380 
1381 
1382 /**************************************************
1383  * CFS bandwidth control machinery
1384  */
1385 
1386 #ifdef CONFIG_CFS_BANDWIDTH
1387 
1388 #ifdef HAVE_JUMP_LABEL
1389 static struct static_key __cfs_bandwidth_used;
1390 
1391 static inline bool cfs_bandwidth_used(void)
1392 {
1393 	return static_key_false(&__cfs_bandwidth_used);
1394 }
1395 
1396 void cfs_bandwidth_usage_inc(void)
1397 {
1398 	static_key_slow_inc(&__cfs_bandwidth_used);
1399 }
1400 
1401 void cfs_bandwidth_usage_dec(void)
1402 {
1403 	static_key_slow_dec(&__cfs_bandwidth_used);
1404 }
1405 #else /* HAVE_JUMP_LABEL */
1406 static bool cfs_bandwidth_used(void)
1407 {
1408 	return true;
1409 }
1410 
1411 void cfs_bandwidth_usage_inc(void) {}
1412 void cfs_bandwidth_usage_dec(void) {}
1413 #endif /* HAVE_JUMP_LABEL */
1414 
1415 /*
1416  * default period for cfs group bandwidth.
1417  * default: 0.1s, units: nanoseconds
1418  */
1419 static inline u64 default_cfs_period(void)
1420 {
1421 	return 100000000ULL;
1422 }
1423 
1424 static inline u64 sched_cfs_bandwidth_slice(void)
1425 {
1426 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1427 }
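
/*
 * Example (illustrative): with the default 5ms slice, a group configured
 * with cpu.cfs_period_us = 100000 and cpu.cfs_quota_us = 20000 (20ms of
 * cpu time per 100ms period) can hand out four full slices per period
 * across its per-cpu cfs_rqs. A smaller slice spreads quota more evenly
 * between cpus at the cost of more frequent cfs_b->lock acquisitions.
 */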
1428 
1429 /*
1430  * Replenish runtime according to assigned quota and update expiration time.
1431  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1432  * additional synchronization around rq->lock.
1433  *
1434  * requires cfs_b->lock
1435  */
1436 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1437 {
1438 	u64 now;
1439 
1440 	if (cfs_b->quota == RUNTIME_INF)
1441 		return;
1442 
1443 	now = sched_clock_cpu(smp_processor_id());
1444 	cfs_b->runtime = cfs_b->quota;
1445 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1446 }
1447 
1448 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1449 {
1450 	return &tg->cfs_bandwidth;
1451 }
1452 
1453 /* returns 0 on failure to allocate runtime */
1454 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1455 {
1456 	struct task_group *tg = cfs_rq->tg;
1457 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1458 	u64 amount = 0, min_amount, expires;
1459 
1460 	/* note: this is a positive sum as runtime_remaining <= 0 */
1461 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1462 
1463 	raw_spin_lock(&cfs_b->lock);
1464 	if (cfs_b->quota == RUNTIME_INF)
1465 		amount = min_amount;
1466 	else {
1467 		/*
1468 		 * If the bandwidth pool has become inactive, then at least one
1469 		 * period must have elapsed since the last consumption.
1470 		 * Refresh the global state and ensure bandwidth timer becomes
1471 		 * active.
1472 		 */
1473 		if (!cfs_b->timer_active) {
1474 			__refill_cfs_bandwidth_runtime(cfs_b);
1475 			__start_cfs_bandwidth(cfs_b);
1476 		}
1477 
1478 		if (cfs_b->runtime > 0) {
1479 			amount = min(cfs_b->runtime, min_amount);
1480 			cfs_b->runtime -= amount;
1481 			cfs_b->idle = 0;
1482 		}
1483 	}
1484 	expires = cfs_b->runtime_expires;
1485 	raw_spin_unlock(&cfs_b->lock);
1486 
1487 	cfs_rq->runtime_remaining += amount;
1488 	/*
1489 	 * we may have advanced our local expiration to account for allowed
1490 	 * spread between our sched_clock and the one on which runtime was
1491 	 * issued.
1492 	 */
1493 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1494 		cfs_rq->runtime_expires = expires;
1495 
1496 	return cfs_rq->runtime_remaining > 0;
1497 }
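
/*
 * Worked example (illustrative): if cfs_rq->runtime_remaining is -1ms and
 * the slice is 5ms, min_amount = 6ms. With 10ms left in the global pool
 * the full 6ms is granted and we return 1 (5ms of runtime remaining
 * locally); with only 0.5ms left we receive 0.5ms, stay at -0.5ms and
 * return 0, so the current task is rescheduled and the hierarchy can be
 * throttled.
 */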
1498 
1499 /*
1500  * Note: This depends on the synchronization provided by sched_clock and the
1501  * fact that rq->clock snapshots this value.
1502  */
1503 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1504 {
1505 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1506 	struct rq *rq = rq_of(cfs_rq);
1507 
1508 	/* if the deadline is ahead of our clock, nothing to do */
1509 	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1510 		return;
1511 
1512 	if (cfs_rq->runtime_remaining < 0)
1513 		return;
1514 
1515 	/*
1516 	 * If the local deadline has passed we have to consider the
1517 	 * possibility that our sched_clock is 'fast' and the global deadline
1518 	 * has not truly expired.
1519 	 *
1520 	 * Fortunately we can determine whether this is the case by checking
1521 	 * whether the global deadline has advanced.
1522 	 */
1523 
1524 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1525 		/* extend local deadline, drift is bounded above by 2 ticks */
1526 		cfs_rq->runtime_expires += TICK_NSEC;
1527 	} else {
1528 		/* global deadline is ahead, expiration has passed */
1529 		cfs_rq->runtime_remaining = 0;
1530 	}
1531 }
1532 
1533 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1534 				     unsigned long delta_exec)
1535 {
1536 	/* dock delta_exec before expiring quota (as it could span periods) */
1537 	cfs_rq->runtime_remaining -= delta_exec;
1538 	expire_cfs_rq_runtime(cfs_rq);
1539 
1540 	if (likely(cfs_rq->runtime_remaining > 0))
1541 		return;
1542 
1543 	/*
1544 	 * if we're unable to extend our runtime we resched so that the active
1545 	 * hierarchy can be throttled
1546 	 */
1547 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1548 		resched_task(rq_of(cfs_rq)->curr);
1549 }
1550 
1551 static __always_inline
1552 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1553 {
1554 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1555 		return;
1556 
1557 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
1558 }
1559 
1560 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1561 {
1562 	return cfs_bandwidth_used() && cfs_rq->throttled;
1563 }
1564 
1565 /* check whether cfs_rq, or any parent, is throttled */
1566 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1567 {
1568 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
1569 }
1570 
1571 /*
1572  * Ensure that neither of the group entities corresponding to src_cpu or
1573  * dest_cpu are members of a throttled hierarchy when performing group
1574  * load-balance operations.
1575  */
1576 static inline int throttled_lb_pair(struct task_group *tg,
1577 				    int src_cpu, int dest_cpu)
1578 {
1579 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1580 
1581 	src_cfs_rq = tg->cfs_rq[src_cpu];
1582 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
1583 
1584 	return throttled_hierarchy(src_cfs_rq) ||
1585 	       throttled_hierarchy(dest_cfs_rq);
1586 }
1587 
1588 /* updated child weight may affect parent so we have to do this bottom up */
1589 static int tg_unthrottle_up(struct task_group *tg, void *data)
1590 {
1591 	struct rq *rq = data;
1592 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1593 
1594 	cfs_rq->throttle_count--;
1595 #ifdef CONFIG_SMP
1596 	if (!cfs_rq->throttle_count) {
1597 		u64 delta = rq->clock_task - cfs_rq->load_stamp;
1598 
1599 		/* leaving throttled state, advance shares averaging windows */
1600 		cfs_rq->load_stamp += delta;
1601 		cfs_rq->load_last += delta;
1602 
1603 		/* update entity weight now that we are on_rq again */
1604 		update_cfs_shares(cfs_rq);
1605 	}
1606 #endif
1607 
1608 	return 0;
1609 }
1610 
1611 static int tg_throttle_down(struct task_group *tg, void *data)
1612 {
1613 	struct rq *rq = data;
1614 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1615 
1616 	/* group is entering throttled state, record last load */
1617 	if (!cfs_rq->throttle_count)
1618 		update_cfs_load(cfs_rq, 0);
1619 	cfs_rq->throttle_count++;
1620 
1621 	return 0;
1622 }
1623 
1624 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1625 {
1626 	struct rq *rq = rq_of(cfs_rq);
1627 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1628 	struct sched_entity *se;
1629 	long task_delta, dequeue = 1;
1630 
1631 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1632 
1633 	/* account load preceding throttle */
1634 	rcu_read_lock();
1635 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1636 	rcu_read_unlock();
1637 
1638 	task_delta = cfs_rq->h_nr_running;
1639 	for_each_sched_entity(se) {
1640 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1641 		/* throttled entity or throttle-on-deactivate */
1642 		if (!se->on_rq)
1643 			break;
1644 
1645 		if (dequeue)
1646 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1647 		qcfs_rq->h_nr_running -= task_delta;
1648 
1649 		if (qcfs_rq->load.weight)
1650 			dequeue = 0;
1651 	}
1652 
1653 	if (!se)
1654 		rq->nr_running -= task_delta;
1655 
1656 	cfs_rq->throttled = 1;
1657 	cfs_rq->throttled_timestamp = rq->clock;
1658 	raw_spin_lock(&cfs_b->lock);
1659 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1660 	if (!cfs_b->timer_active)
1661 		__start_cfs_bandwidth(cfs_b);
1662 	raw_spin_unlock(&cfs_b->lock);
1663 }
1664 
1665 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1666 {
1667 	struct rq *rq = rq_of(cfs_rq);
1668 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1669 	struct sched_entity *se;
1670 	int enqueue = 1;
1671 	long task_delta;
1672 
1673 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1674 
1675 	cfs_rq->throttled = 0;
1676 	raw_spin_lock(&cfs_b->lock);
1677 	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1678 	list_del_rcu(&cfs_rq->throttled_list);
1679 	raw_spin_unlock(&cfs_b->lock);
1680 	cfs_rq->throttled_timestamp = 0;
1681 
1682 	update_rq_clock(rq);
1683 	/* update hierarchical throttle state */
1684 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1685 
1686 	if (!cfs_rq->load.weight)
1687 		return;
1688 
1689 	task_delta = cfs_rq->h_nr_running;
1690 	for_each_sched_entity(se) {
1691 		if (se->on_rq)
1692 			enqueue = 0;
1693 
1694 		cfs_rq = cfs_rq_of(se);
1695 		if (enqueue)
1696 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1697 		cfs_rq->h_nr_running += task_delta;
1698 
1699 		if (cfs_rq_throttled(cfs_rq))
1700 			break;
1701 	}
1702 
1703 	if (!se)
1704 		rq->nr_running += task_delta;
1705 
1706 	/* determine whether we need to wake up potentially idle cpu */
1707 	if (rq->curr == rq->idle && rq->cfs.nr_running)
1708 		resched_task(rq->curr);
1709 }
1710 
1711 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1712 		u64 remaining, u64 expires)
1713 {
1714 	struct cfs_rq *cfs_rq;
1715 	u64 runtime = remaining;
1716 
1717 	rcu_read_lock();
1718 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1719 				throttled_list) {
1720 		struct rq *rq = rq_of(cfs_rq);
1721 
1722 		raw_spin_lock(&rq->lock);
1723 		if (!cfs_rq_throttled(cfs_rq))
1724 			goto next;
1725 
1726 		runtime = -cfs_rq->runtime_remaining + 1;
1727 		if (runtime > remaining)
1728 			runtime = remaining;
1729 		remaining -= runtime;
1730 
1731 		cfs_rq->runtime_remaining += runtime;
1732 		cfs_rq->runtime_expires = expires;
1733 
1734 		/* we check whether we're throttled above */
1735 		if (cfs_rq->runtime_remaining > 0)
1736 			unthrottle_cfs_rq(cfs_rq);
1737 
1738 next:
1739 		raw_spin_unlock(&rq->lock);
1740 
1741 		if (!remaining)
1742 			break;
1743 	}
1744 	rcu_read_unlock();
1745 
1746 	return remaining;
1747 }
1748 
1749 /*
1750  * Responsible for refilling a task_group's bandwidth and unthrottling its
1751  * cfs_rqs as appropriate. If there has been no activity within the last
1752  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1753  * used to track this state.
1754  */
1755 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1756 {
1757 	u64 runtime, runtime_expires;
1758 	int idle = 1, throttled;
1759 
1760 	raw_spin_lock(&cfs_b->lock);
1761 	/* no need to continue the timer with no bandwidth constraint */
1762 	if (cfs_b->quota == RUNTIME_INF)
1763 		goto out_unlock;
1764 
1765 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1766 	/* idle depends on !throttled (for the case of a large deficit) */
1767 	idle = cfs_b->idle && !throttled;
1768 	cfs_b->nr_periods += overrun;
1769 
1770 	/* if we're going inactive then everything else can be deferred */
1771 	if (idle)
1772 		goto out_unlock;
1773 
1774 	/*
1775 	 * if we have relooped after returning idle once, we need to update our
1776 	 * status as actually running, so that other cpus doing
1777 	 * __start_cfs_bandwidth will stop trying to cancel us.
1778 	 */
1779 	cfs_b->timer_active = 1;
1780 
1781 	__refill_cfs_bandwidth_runtime(cfs_b);
1782 
1783 	if (!throttled) {
1784 		/* mark as potentially idle for the upcoming period */
1785 		cfs_b->idle = 1;
1786 		goto out_unlock;
1787 	}
1788 
1789 	/* account preceding periods in which throttling occurred */
1790 	cfs_b->nr_throttled += overrun;
1791 
1792 	/*
1793 	 * There are throttled entities so we must first use the new bandwidth
1794 	 * to unthrottle them before making it generally available.  This
1795 	 * ensures that all existing debts will be paid before a new cfs_rq is
1796 	 * allowed to run.
1797 	 */
1798 	runtime = cfs_b->runtime;
1799 	runtime_expires = cfs_b->runtime_expires;
1800 	cfs_b->runtime = 0;
1801 
1802 	/*
1803 	 * This check is repeated as we are holding onto the new bandwidth
1804 	 * while we unthrottle.  This can potentially race with an unthrottled
1805 	 * group trying to acquire new bandwidth from the global pool.
1806 	 */
1807 	while (throttled && runtime > 0) {
1808 		raw_spin_unlock(&cfs_b->lock);
1809 		/* we can't nest cfs_b->lock while distributing bandwidth */
1810 		runtime = distribute_cfs_runtime(cfs_b, runtime,
1811 						 runtime_expires);
1812 		raw_spin_lock(&cfs_b->lock);
1813 
1814 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1815 	}
1816 
1817 	/* return (any) remaining runtime */
1818 	cfs_b->runtime = runtime;
1819 	/*
1820 	 * While we are ensured activity in the period following an
1821 	 * unthrottle, this also covers the case in which the new bandwidth is
1822 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
1823 	 * timer to remain active while there are any throttled entities.)
1824 	 */
1825 	cfs_b->idle = 0;
1826 out_unlock:
1827 	if (idle)
1828 		cfs_b->timer_active = 0;
1829 	raw_spin_unlock(&cfs_b->lock);
1830 
1831 	return idle;
1832 }
1833 
1834 /* a cfs_rq won't donate quota below this amount */
1835 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1836 /* minimum remaining period time to redistribute slack quota */
1837 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1838 /* how long we wait to gather additional slack before distributing */
1839 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
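/*
 * Rough illustration with the defaults above: a cfs_rq always keeps 1ms of
 * quota for itself, surplus is only redistributed while at least 2ms of the
 * current period remain, and returned slack is batched for roughly 5ms
 * before the slack timer hands it back out.
 */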
1840 
1841 /*
1842  * Are we near the end of the current quota period?
1843  *
1844  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
1845  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
1846  * migrate_hrtimers, base is never cleared, so we are fine.
1847  */
1848 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1849 {
1850 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
1851 	u64 remaining;
1852 
1853 	/* if the call-back is running a quota refresh is already occurring */
1854 	if (hrtimer_callback_running(refresh_timer))
1855 		return 1;
1856 
1857 	/* is a quota refresh about to occur? */
1858 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1859 	if (remaining < min_expire)
1860 		return 1;
1861 
1862 	return 0;
1863 }
1864 
1865 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1866 {
1867 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1868 
1869 	/* if there's a quota refresh soon don't bother with slack */
1870 	if (runtime_refresh_within(cfs_b, min_left))
1871 		return;
1872 
1873 	start_bandwidth_timer(&cfs_b->slack_timer,
1874 				ns_to_ktime(cfs_bandwidth_slack_period));
1875 }
1876 
1877 /* we know any runtime found here is valid as update_curr() precedes return */
1878 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1879 {
1880 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1881 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1882 
1883 	if (slack_runtime <= 0)
1884 		return;
1885 
1886 	raw_spin_lock(&cfs_b->lock);
1887 	if (cfs_b->quota != RUNTIME_INF &&
1888 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1889 		cfs_b->runtime += slack_runtime;
1890 
1891 		/* we are under rq->lock, defer unthrottling using a timer */
1892 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1893 		    !list_empty(&cfs_b->throttled_cfs_rq))
1894 			start_cfs_slack_bandwidth(cfs_b);
1895 	}
1896 	raw_spin_unlock(&cfs_b->lock);
1897 
1898 	/* even if it's not valid for return we don't want to try again */
1899 	cfs_rq->runtime_remaining -= slack_runtime;
1900 }
1901 
1902 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1903 {
1904 	if (!cfs_bandwidth_used())
1905 		return;
1906 
1907 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1908 		return;
1909 
1910 	__return_cfs_rq_runtime(cfs_rq);
1911 }
1912 
1913 /*
1914  * This is done with a timer (instead of inline with bandwidth return) since
1915  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1916  */
1917 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1918 {
1919 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1920 	u64 expires;
1921 
1922 	/* confirm we're still not at a refresh boundary */
1923 	raw_spin_lock(&cfs_b->lock);
1924 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
1925 		raw_spin_unlock(&cfs_b->lock);
1926 		return;
1927 	}
1928 
1929 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1930 		runtime = cfs_b->runtime;
1931 		cfs_b->runtime = 0;
1932 	}
1933 	expires = cfs_b->runtime_expires;
1934 	raw_spin_unlock(&cfs_b->lock);
1935 
1936 	if (!runtime)
1937 		return;
1938 
1939 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1940 
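	/*
	 * Put back whatever distribute_cfs_runtime() left over, but only if
	 * the period has not rolled over in the meantime (runtime_expires
	 * unchanged); otherwise the pool has already been refilled.
	 */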
1941 	raw_spin_lock(&cfs_b->lock);
1942 	if (expires == cfs_b->runtime_expires)
1943 		cfs_b->runtime = runtime;
1944 	raw_spin_unlock(&cfs_b->lock);
1945 }
1946 
1947 /*
1948  * When a group wakes up we want to make sure that its quota is not already
1949  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1950  * runtime as update_curr() throttling can not trigger until it's on-rq.
1951  */
1952 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1953 {
1954 	if (!cfs_bandwidth_used())
1955 		return;
1956 
1957 	/* an active group must be handled by the update_curr()->put() path */
1958 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1959 		return;
1960 
1961 	/* ensure the group is not already throttled */
1962 	if (cfs_rq_throttled(cfs_rq))
1963 		return;
1964 
1965 	/* update runtime allocation */
1966 	account_cfs_rq_runtime(cfs_rq, 0);
1967 	if (cfs_rq->runtime_remaining <= 0)
1968 		throttle_cfs_rq(cfs_rq);
1969 }
1970 
1971 /* conditionally throttle active cfs_rq's from put_prev_entity() */
1972 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1973 {
1974 	if (!cfs_bandwidth_used())
1975 		return;
1976 
1977 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1978 		return;
1979 
1980 	/*
1981 	 * it's possible for a throttled entity to be forced into a running
1982 	 * state (e.g. set_curr_task), in this case we're finished.
1983 	 */
1984 	if (cfs_rq_throttled(cfs_rq))
1985 		return;
1986 
1987 	throttle_cfs_rq(cfs_rq);
1988 }
1989 
1990 static inline u64 default_cfs_period(void);
1991 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1992 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1993 
1994 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1995 {
1996 	struct cfs_bandwidth *cfs_b =
1997 		container_of(timer, struct cfs_bandwidth, slack_timer);
1998 	do_sched_cfs_slack_timer(cfs_b);
1999 
2000 	return HRTIMER_NORESTART;
2001 }
2002 
2003 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2004 {
2005 	struct cfs_bandwidth *cfs_b =
2006 		container_of(timer, struct cfs_bandwidth, period_timer);
2007 	ktime_t now;
2008 	int overrun;
2009 	int idle = 0;
2010 
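	/*
	 * Forward the timer by whole periods; each elapsed period (the
	 * hrtimer_forward() overrun count) is accounted by
	 * do_sched_cfs_period_timer(), which also tells us whether the
	 * bandwidth pool has gone idle and the timer can be parked.
	 */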
2011 	for (;;) {
2012 		now = hrtimer_cb_get_time(timer);
2013 		overrun = hrtimer_forward(timer, now, cfs_b->period);
2014 
2015 		if (!overrun)
2016 			break;
2017 
2018 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
2019 	}
2020 
2021 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2022 }
2023 
2024 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2025 {
2026 	raw_spin_lock_init(&cfs_b->lock);
2027 	cfs_b->runtime = 0;
2028 	cfs_b->quota = RUNTIME_INF;
2029 	cfs_b->period = ns_to_ktime(default_cfs_period());
2030 
2031 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2032 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2033 	cfs_b->period_timer.function = sched_cfs_period_timer;
2034 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2035 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
2036 }
2037 
2038 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2039 {
2040 	cfs_rq->runtime_enabled = 0;
2041 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2042 }
2043 
2044 /* requires cfs_b->lock, may release to reprogram timer */
2045 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2046 {
2047 	/*
2048 	 * The timer may be active because we're trying to set a new bandwidth
2049 	 * period or because we're racing with the tear-down path
2050 	 * (timer_active==0 becomes visible before the hrtimer call-back
2051 	 * terminates).  In either case we ensure that it's re-programmed
2052 	 */
2053 	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
2054 	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
2055 		/* bounce the lock to allow do_sched_cfs_period_timer to run */
2056 		raw_spin_unlock(&cfs_b->lock);
2057 		cpu_relax();
2058 		raw_spin_lock(&cfs_b->lock);
2059 		/* if someone else restarted the timer then we're done */
2060 		if (cfs_b->timer_active)
2061 			return;
2062 	}
2063 
2064 	cfs_b->timer_active = 1;
2065 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2066 }
2067 
2068 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2069 {
2070 	hrtimer_cancel(&cfs_b->period_timer);
2071 	hrtimer_cancel(&cfs_b->slack_timer);
2072 }
2073 
2074 static void unthrottle_offline_cfs_rqs(struct rq *rq)
2075 {
2076 	struct cfs_rq *cfs_rq;
2077 
2078 	for_each_leaf_cfs_rq(rq, cfs_rq) {
2079 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2080 
2081 		if (!cfs_rq->runtime_enabled)
2082 			continue;
2083 
2084 		/*
2085 		 * clock_task is not advancing so we just need to make sure
2086 		 * there's some valid quota amount
2087 		 */
2088 		cfs_rq->runtime_remaining = cfs_b->quota;
2089 		if (cfs_rq_throttled(cfs_rq))
2090 			unthrottle_cfs_rq(cfs_rq);
2091 	}
2092 }
2093 
2094 #else /* CONFIG_CFS_BANDWIDTH */
2095 static __always_inline
2096 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2097 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2098 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2099 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2100 
2101 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2102 {
2103 	return 0;
2104 }
2105 
2106 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2107 {
2108 	return 0;
2109 }
2110 
2111 static inline int throttled_lb_pair(struct task_group *tg,
2112 				    int src_cpu, int dest_cpu)
2113 {
2114 	return 0;
2115 }
2116 
2117 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2118 
2119 #ifdef CONFIG_FAIR_GROUP_SCHED
2120 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2121 #endif
2122 
2123 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2124 {
2125 	return NULL;
2126 }
2127 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2128 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2129 
2130 #endif /* CONFIG_CFS_BANDWIDTH */
2131 
2132 /**************************************************
2133  * CFS operations on tasks:
2134  */
2135 
2136 #ifdef CONFIG_SCHED_HRTICK
2137 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2138 {
2139 	struct sched_entity *se = &p->se;
2140 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2141 
2142 	WARN_ON(task_rq(p) != rq);
2143 
2144 	if (cfs_rq->nr_running > 1) {
2145 		u64 slice = sched_slice(cfs_rq, se);
2146 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2147 		s64 delta = slice - ran;
2148 
2149 		if (delta < 0) {
2150 			if (rq->curr == p)
2151 				resched_task(p);
2152 			return;
2153 		}
2154 
2155 		/*
2156 		 * Don't schedule slices shorter than 10000ns, that just
2157 		 * doesn't make sense. Rely on vruntime for fairness.
2158 		 */
2159 		if (rq->curr != p)
2160 			delta = max_t(s64, 10000LL, delta);
2161 
2162 		hrtick_start(rq, delta);
2163 	}
2164 }
2165 
2166 /*
2167  * called from enqueue/dequeue and updates the hrtick when the
2168  * current task is from our class and nr_running is low enough
2169  * to matter.
2170  */
2171 static void hrtick_update(struct rq *rq)
2172 {
2173 	struct task_struct *curr = rq->curr;
2174 
2175 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2176 		return;
2177 
2178 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2179 		hrtick_start_fair(rq, curr);
2180 }
2181 #else /* !CONFIG_SCHED_HRTICK */
2182 static inline void
2183 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2184 {
2185 }
2186 
2187 static inline void hrtick_update(struct rq *rq)
2188 {
2189 }
2190 #endif
2191 
2192 /*
2193  * The enqueue_task method is called before nr_running is
2194  * increased. Here we update the fair scheduling stats and
2195  * then put the task into the rbtree:
2196  */
2197 static void
2198 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2199 {
2200 	struct cfs_rq *cfs_rq;
2201 	struct sched_entity *se = &p->se;
2202 
2203 	for_each_sched_entity(se) {
2204 		if (se->on_rq)
2205 			break;
2206 		cfs_rq = cfs_rq_of(se);
2207 		enqueue_entity(cfs_rq, se, flags);
2208 
2209 		/*
2210 		 * end evaluation on encountering a throttled cfs_rq
2211 		 *
2212 		 * note: in the case of encountering a throttled cfs_rq we will
2213 		 * post the final h_nr_running increment below.
2214 		*/
2215 		if (cfs_rq_throttled(cfs_rq))
2216 			break;
2217 		cfs_rq->h_nr_running++;
2218 
2219 		flags = ENQUEUE_WAKEUP;
2220 	}
2221 
2222 	for_each_sched_entity(se) {
2223 		cfs_rq = cfs_rq_of(se);
2224 		cfs_rq->h_nr_running++;
2225 
2226 		if (cfs_rq_throttled(cfs_rq))
2227 			break;
2228 
2229 		update_cfs_load(cfs_rq, 0);
2230 		update_cfs_shares(cfs_rq);
2231 	}
2232 
2233 	if (!se)
2234 		inc_nr_running(rq);
2235 	hrtick_update(rq);
2236 }
2237 
2238 static void set_next_buddy(struct sched_entity *se);
2239 
2240 /*
2241  * The dequeue_task method is called before nr_running is
2242  * decreased. We remove the task from the rbtree and
2243  * update the fair scheduling stats:
2244  */
2245 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2246 {
2247 	struct cfs_rq *cfs_rq;
2248 	struct sched_entity *se = &p->se;
2249 	int task_sleep = flags & DEQUEUE_SLEEP;
2250 
2251 	for_each_sched_entity(se) {
2252 		cfs_rq = cfs_rq_of(se);
2253 		dequeue_entity(cfs_rq, se, flags);
2254 
2255 		/*
2256 		 * end evaluation on encountering a throttled cfs_rq
2257 		 *
2258 		 * note: in the case of encountering a throttled cfs_rq we will
2259 		 * post the final h_nr_running decrement below.
2260 		*/
2261 		if (cfs_rq_throttled(cfs_rq))
2262 			break;
2263 		cfs_rq->h_nr_running--;
2264 
2265 		/* Don't dequeue parent if it has other entities besides us */
2266 		if (cfs_rq->load.weight) {
2267 			/*
2268 			 * Bias pick_next to pick a task from this cfs_rq, as
2269 			 * p is sleeping when it is within its sched_slice.
2270 			 */
2271 			if (task_sleep && parent_entity(se))
2272 				set_next_buddy(parent_entity(se));
2273 
2274 			/* avoid re-evaluating load for this entity */
2275 			se = parent_entity(se);
2276 			break;
2277 		}
2278 		flags |= DEQUEUE_SLEEP;
2279 	}
2280 
2281 	for_each_sched_entity(se) {
2282 		cfs_rq = cfs_rq_of(se);
2283 		cfs_rq->h_nr_running--;
2284 
2285 		if (cfs_rq_throttled(cfs_rq))
2286 			break;
2287 
2288 		update_cfs_load(cfs_rq, 0);
2289 		update_cfs_shares(cfs_rq);
2290 	}
2291 
2292 	if (!se)
2293 		dec_nr_running(rq);
2294 	hrtick_update(rq);
2295 }
2296 
2297 #ifdef CONFIG_SMP
2298 /* Used instead of source_load when we know the type == 0 */
2299 static unsigned long weighted_cpuload(const int cpu)
2300 {
2301 	return cpu_rq(cpu)->load.weight;
2302 }
2303 
2304 /*
2305  * Return a low guess at the load of a migration-source cpu weighted
2306  * according to the scheduling class and "nice" value.
2307  *
2308  * We want to under-estimate the load of migration sources, to
2309  * balance conservatively.
2310  */
2311 static unsigned long source_load(int cpu, int type)
2312 {
2313 	struct rq *rq = cpu_rq(cpu);
2314 	unsigned long total = weighted_cpuload(cpu);
2315 
2316 	if (type == 0 || !sched_feat(LB_BIAS))
2317 		return total;
2318 
2319 	return min(rq->cpu_load[type-1], total);
2320 }
2321 
2322 /*
2323  * Return a high guess at the load of a migration-target cpu weighted
2324  * according to the scheduling class and "nice" value.
2325  */
2326 static unsigned long target_load(int cpu, int type)
2327 {
2328 	struct rq *rq = cpu_rq(cpu);
2329 	unsigned long total = weighted_cpuload(cpu);
2330 
2331 	if (type == 0 || !sched_feat(LB_BIAS))
2332 		return total;
2333 
2334 	return max(rq->cpu_load[type-1], total);
2335 }
2336 
2337 static unsigned long power_of(int cpu)
2338 {
2339 	return cpu_rq(cpu)->cpu_power;
2340 }
2341 
2342 static unsigned long cpu_avg_load_per_task(int cpu)
2343 {
2344 	struct rq *rq = cpu_rq(cpu);
2345 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2346 
2347 	if (nr_running)
2348 		return rq->load.weight / nr_running;
2349 
2350 	return 0;
2351 }
2352 
2353 
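/*
 * A task being woken may be migrated to another CPU, so strip the old
 * cfs_rq's min_vruntime from its vruntime here so that it can be
 * re-normalized against the destination cfs_rq at enqueue time.  On 32-bit,
 * min_vruntime is 64 bits and cannot be read atomically, hence the
 * copy/smp_rmb() retry loop below.
 */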
2354 static void task_waking_fair(struct task_struct *p)
2355 {
2356 	struct sched_entity *se = &p->se;
2357 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2358 	u64 min_vruntime;
2359 
2360 #ifndef CONFIG_64BIT
2361 	u64 min_vruntime_copy;
2362 
2363 	do {
2364 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
2365 		smp_rmb();
2366 		min_vruntime = cfs_rq->min_vruntime;
2367 	} while (min_vruntime != min_vruntime_copy);
2368 #else
2369 	min_vruntime = cfs_rq->min_vruntime;
2370 #endif
2371 
2372 	se->vruntime -= min_vruntime;
2373 }
2374 
2375 #ifdef CONFIG_FAIR_GROUP_SCHED
2376 /*
2377  * effective_load() calculates the load change as seen from the root_task_group
2378  *
2379  * Adding load to a group doesn't make a group heavier, but can cause movement
2380  * of group shares between cpus. Assuming the shares were perfectly aligned one
2381  * can calculate the shift in shares.
2382  *
2383  * Calculate the effective load difference if @wl is added (subtracted) to @tg
2384  * on this @cpu and results in a total addition (subtraction) of @wg to the
2385  * total group weight.
2386  *
2387  * Given a runqueue weight distribution (rw_i) we can compute a shares
2388  * distribution (s_i) using:
2389  *
2390  *   s_i = rw_i / \Sum rw_j						(1)
2391  *
2392  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2393  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2394  * shares distribution (s_i):
2395  *
2396  *   rw_i = {   2,   4,   1,   0 }
2397  *   s_i  = { 2/7, 4/7, 1/7,   0 }
2398  *
2399  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2400  * task used to run on and the CPU the waker is running on), we need to
2401  * compute the effect of waking a task on either CPU and, in case of a sync
2402  * wakeup, compute the effect of the current task going to sleep.
2403  *
2404  * So for a change of @wl to the local @cpu with an overall group weight change
2405  * of @wl we can compute the new shares distribution (s'_i) using:
2406  *
2407  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
2408  *
2409  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2410  * differences in waking a task to CPU 0. The additional task changes the
2411  * weight and shares distributions like:
2412  *
2413  *   rw'_i = {   3,   4,   1,   0 }
2414  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
2415  *
2416  * We can then compute the difference in effective weight by using:
2417  *
2418  *   dw_i = S * (s'_i - s_i)						(3)
2419  *
2420  * Where 'S' is the group weight as seen by its parent.
2421  *
2422  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2423  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2424  * 4/7) times the weight of the group.
2425  */
2426 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2427 {
2428 	struct sched_entity *se = tg->se[cpu];
2429 
2430 	if (!tg->parent)	/* the trivial, non-cgroup case */
2431 		return wl;
2432 
2433 	for_each_sched_entity(se) {
2434 		long w, W;
2435 
2436 		tg = se->my_q->tg;
2437 
2438 		/*
2439 		 * W = @wg + \Sum rw_j
2440 		 */
2441 		W = wg + calc_tg_weight(tg, se->my_q);
2442 
2443 		/*
2444 		 * w = rw_i + @wl
2445 		 */
2446 		w = se->my_q->load.weight + wl;
2447 
2448 		/*
2449 		 * wl = S * s'_i; see (2)
2450 		 */
2451 		if (W > 0 && w < W)
2452 			wl = (w * tg->shares) / W;
2453 		else
2454 			wl = tg->shares;
2455 
2456 		/*
2457 		 * Per the above, wl is the new se->load.weight value; since
2458 		 * those are clipped to [MIN_SHARES, ...) do so now. See
2459 		 * calc_cfs_shares().
2460 		 */
2461 		if (wl < MIN_SHARES)
2462 			wl = MIN_SHARES;
2463 
2464 		/*
2465 		 * wl = dw_i = S * (s'_i - s_i); see (3)
2466 		 */
2467 		wl -= se->load.weight;
2468 
2469 		/*
2470 		 * Recursively apply this logic to all parent groups to compute
2471 		 * the final effective load change on the root group. Since
2472 		 * only the @tg group gets extra weight, all parent groups can
2473 		 * only redistribute existing shares. @wl is the shift in shares
2474 		 * resulting from this level per the above.
2475 		 */
2476 		wg = 0;
2477 	}
2478 
2479 	return wl;
2480 }
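/*
 * Sanity check against the example documented above: with those weights and
 * tg->shares = S, the first loop iteration for CPU 0 yields (roughly,
 * ignoring the load-average smoothing inside calc_tg_weight())
 * wl = 3*S/8 - 2*S/7 = 5*S/56, i.e. dw_0 from equation (3).
 */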
2481 #else
2482 
2483 static inline unsigned long effective_load(struct task_group *tg, int cpu,
2484 		unsigned long wl, unsigned long wg)
2485 {
2486 	return wl;
2487 }
2488 
2489 #endif
2490 
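/*
 * wake_affine() decides whether waking @p on the current (waking) CPU is
 * preferable to leaving it on its previous CPU: it compares the effective
 * load of both CPUs, discounting the waker for sync wakeups, and returns 1
 * if pulling the task here looks at least as cheap.
 */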
2491 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
2492 {
2493 	s64 this_load, load;
2494 	int idx, this_cpu, prev_cpu;
2495 	unsigned long tl_per_task;
2496 	struct task_group *tg;
2497 	unsigned long weight;
2498 	int balanced;
2499 
2500 	idx	  = sd->wake_idx;
2501 	this_cpu  = smp_processor_id();
2502 	prev_cpu  = task_cpu(p);
2503 	load	  = source_load(prev_cpu, idx);
2504 	this_load = target_load(this_cpu, idx);
2505 
2506 	/*
2507 	 * If sync wakeup then subtract the (maximum possible)
2508 	 * effect of the currently running task from the load
2509 	 * of the current CPU:
2510 	 */
2511 	if (sync) {
2512 		tg = task_group(current);
2513 		weight = current->se.load.weight;
2514 
2515 		this_load += effective_load(tg, this_cpu, -weight, -weight);
2516 		load += effective_load(tg, prev_cpu, 0, -weight);
2517 	}
2518 
2519 	tg = task_group(p);
2520 	weight = p->se.load.weight;
2521 
2522 	/*
2523 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
2524 	 * due to the sync cause above having dropped this_load to 0, we'll
2525 	 * always have an imbalance, but there's really nothing you can do
2526 	 * about that, so that's good too.
2527 	 *
2528 	 * Otherwise check if either cpus are near enough in load to allow this
2529 	 * task to be woken on this_cpu.
2530 	 */
2531 	if (this_load > 0) {
2532 		s64 this_eff_load, prev_eff_load;
2533 
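		/*
		 * Cross-multiply instead of dividing: "balanced" means
		 * this_load/power(this_cpu) is no worse than
		 * load/power(prev_cpu) once prev_cpu is given its (halved)
		 * sd->imbalance_pct advantage.
		 */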
2534 		this_eff_load = 100;
2535 		this_eff_load *= power_of(prev_cpu);
2536 		this_eff_load *= this_load +
2537 			effective_load(tg, this_cpu, weight, weight);
2538 
2539 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
2540 		prev_eff_load *= power_of(this_cpu);
2541 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
2542 
2543 		balanced = this_eff_load <= prev_eff_load;
2544 	} else
2545 		balanced = true;
2546 
2547 	/*
2548 	 * If the currently running task will sleep within
2549 	 * a reasonable amount of time then attract this newly
2550 	 * woken task:
2551 	 */
2552 	if (sync && balanced)
2553 		return 1;
2554 
2555 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
2556 	tl_per_task = cpu_avg_load_per_task(this_cpu);
2557 
2558 	if (balanced ||
2559 	    (this_load <= load &&
2560 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
2561 		/*
2562 		 * This domain has SD_WAKE_AFFINE and
2563 		 * p is cache cold in this domain, and
2564 		 * there is no bad imbalance.
2565 		 */
2566 		schedstat_inc(sd, ttwu_move_affine);
2567 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
2568 
2569 		return 1;
2570 	}
2571 	return 0;
2572 }
2573 
2574 /*
2575  * find_idlest_group finds and returns the least busy CPU group within the
2576  * domain.
2577  */
2578 static struct sched_group *
2579 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
2580 		  int this_cpu, int load_idx)
2581 {
2582 	struct sched_group *idlest = NULL, *group = sd->groups;
2583 	unsigned long min_load = ULONG_MAX, this_load = 0;
2584 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
2585 
2586 	do {
2587 		unsigned long load, avg_load;
2588 		int local_group;
2589 		int i;
2590 
2591 		/* Skip over this group if it has no CPUs allowed */
2592 		if (!cpumask_intersects(sched_group_cpus(group),
2593 					tsk_cpus_allowed(p)))
2594 			continue;
2595 
2596 		local_group = cpumask_test_cpu(this_cpu,
2597 					       sched_group_cpus(group));
2598 
2599 		/* Tally up the load of all CPUs in the group */
2600 		avg_load = 0;
2601 
2602 		for_each_cpu(i, sched_group_cpus(group)) {
2603 			/* Bias balancing toward cpus of our domain */
2604 			if (local_group)
2605 				load = source_load(i, load_idx);
2606 			else
2607 				load = target_load(i, load_idx);
2608 
2609 			avg_load += load;
2610 		}
2611 
2612 		/* Adjust by relative CPU power of the group */
2613 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
2614 
2615 		if (local_group) {
2616 			this_load = avg_load;
2617 		} else if (avg_load < min_load) {
2618 			min_load = avg_load;
2619 			idlest = group;
2620 		}
2621 	} while (group = group->next, group != sd->groups);
2622 
2623 	if (!idlest || 100*this_load < imbalance*min_load)
2624 		return NULL;
2625 	return idlest;
2626 }
2627 
2628 /*
2629  * find_idlest_cpu - find the idlest cpu among the cpus in group.
2630  */
2631 static int
2632 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2633 {
2634 	unsigned long load, min_load = ULONG_MAX;
2635 	int idlest = -1;
2636 	int i;
2637 
2638 	/* Traverse only the allowed CPUs */
2639 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
2640 		load = weighted_cpuload(i);
2641 
2642 		if (load < min_load || (load == min_load && i == this_cpu)) {
2643 			min_load = load;
2644 			idlest = i;
2645 		}
2646 	}
2647 
2648 	return idlest;
2649 }
2650 
2651 /*
2652  * Try and locate an idle CPU in the sched_domain.
2653  */
2654 static int select_idle_sibling(struct task_struct *p, int target)
2655 {
2656 	int cpu = smp_processor_id();
2657 	int prev_cpu = task_cpu(p);
2658 	struct sched_domain *sd;
2659 	struct sched_group *sg;
2660 	int i;
2661 
2662 	/*
2663 	 * If the task is going to be woken-up on this cpu and if it is
2664 	 * already idle, then it is the right target.
2665 	 */
2666 	if (target == cpu && idle_cpu(cpu))
2667 		return cpu;
2668 
2669 	/*
2670 	 * If the task is going to be woken-up on the cpu where it previously
2671  * ran and if it is currently idle, then it is the right target.
2672 	 */
2673 	if (target == prev_cpu && idle_cpu(prev_cpu))
2674 		return prev_cpu;
2675 
2676 	/*
2677  * Otherwise, iterate the domains and find an eligible idle cpu.
2678 	 */
2679 	sd = rcu_dereference(per_cpu(sd_llc, target));
2680 	for_each_lower_domain(sd) {
2681 		sg = sd->groups;
2682 		do {
2683 			if (!cpumask_intersects(sched_group_cpus(sg),
2684 						tsk_cpus_allowed(p)))
2685 				goto next;
2686 
2687 			for_each_cpu(i, sched_group_cpus(sg)) {
2688 				if (!idle_cpu(i))
2689 					goto next;
2690 			}
2691 
2692 			target = cpumask_first_and(sched_group_cpus(sg),
2693 					tsk_cpus_allowed(p));
2694 			goto done;
2695 next:
2696 			sg = sg->next;
2697 		} while (sg != sd->groups);
2698 	}
2699 done:
2700 	return target;
2701 }
2702 
2703 /*
2704  * select_task_rq_fair: balance the current task (running on cpu) in domains
2705  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
2706  * SD_BALANCE_FORK and SD_BALANCE_EXEC.
2707  *
2708  * Balance, ie. select the least loaded group.
2709  *
2710  * Returns the target CPU number, or the same CPU if no balancing is needed.
2711  *
2712  * preempt must be disabled.
2713  */
2714 static int
2715 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2716 {
2717 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
2718 	int cpu = smp_processor_id();
2719 	int prev_cpu = task_cpu(p);
2720 	int new_cpu = cpu;
2721 	int want_affine = 0;
2722 	int want_sd = 1;
2723 	int sync = wake_flags & WF_SYNC;
2724 
2725 	if (p->rt.nr_cpus_allowed == 1)
2726 		return prev_cpu;
2727 
2728 	if (sd_flag & SD_BALANCE_WAKE) {
2729 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2730 			want_affine = 1;
2731 		new_cpu = prev_cpu;
2732 	}
2733 
2734 	rcu_read_lock();
2735 	for_each_domain(cpu, tmp) {
2736 		if (!(tmp->flags & SD_LOAD_BALANCE))
2737 			continue;
2738 
2739 		/*
2740 		 * If power savings logic is enabled for a domain, see if we
2741 		 * are not overloaded, if so, don't balance wider.
2742 		 */
2743 		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
2744 			unsigned long power = 0;
2745 			unsigned long nr_running = 0;
2746 			unsigned long capacity;
2747 			int i;
2748 
2749 			for_each_cpu(i, sched_domain_span(tmp)) {
2750 				power += power_of(i);
2751 				nr_running += cpu_rq(i)->cfs.nr_running;
2752 			}
2753 
2754 			capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2755 
2756 			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2757 				nr_running /= 2;
2758 
2759 			if (nr_running < capacity)
2760 				want_sd = 0;
2761 		}
2762 
2763 		/*
2764 		 * If both cpu and prev_cpu are part of this domain,
2765 		 * cpu is a valid SD_WAKE_AFFINE target.
2766 		 */
2767 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2768 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2769 			affine_sd = tmp;
2770 			want_affine = 0;
2771 		}
2772 
2773 		if (!want_sd && !want_affine)
2774 			break;
2775 
2776 		if (!(tmp->flags & sd_flag))
2777 			continue;
2778 
2779 		if (want_sd)
2780 			sd = tmp;
2781 	}
2782 
2783 	if (affine_sd) {
2784 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
2785 			prev_cpu = cpu;
2786 
2787 		new_cpu = select_idle_sibling(p, prev_cpu);
2788 		goto unlock;
2789 	}
2790 
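	/*
	 * Slow path: walk down from the highest domain that has sd_flag set,
	 * picking the idlest group and then the idlest CPU within it, and
	 * repeating the search at a lower level around the chosen CPU.
	 */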
2791 	while (sd) {
2792 		int load_idx = sd->forkexec_idx;
2793 		struct sched_group *group;
2794 		int weight;
2795 
2796 		if (!(sd->flags & sd_flag)) {
2797 			sd = sd->child;
2798 			continue;
2799 		}
2800 
2801 		if (sd_flag & SD_BALANCE_WAKE)
2802 			load_idx = sd->wake_idx;
2803 
2804 		group = find_idlest_group(sd, p, cpu, load_idx);
2805 		if (!group) {
2806 			sd = sd->child;
2807 			continue;
2808 		}
2809 
2810 		new_cpu = find_idlest_cpu(group, p, cpu);
2811 		if (new_cpu == -1 || new_cpu == cpu) {
2812 			/* Now try balancing at a lower domain level of cpu */
2813 			sd = sd->child;
2814 			continue;
2815 		}
2816 
2817 		/* Now try balancing at a lower domain level of new_cpu */
2818 		cpu = new_cpu;
2819 		weight = sd->span_weight;
2820 		sd = NULL;
2821 		for_each_domain(cpu, tmp) {
2822 			if (weight <= tmp->span_weight)
2823 				break;
2824 			if (tmp->flags & sd_flag)
2825 				sd = tmp;
2826 		}
2827 		/* while loop will break here if sd == NULL */
2828 	}
2829 unlock:
2830 	rcu_read_unlock();
2831 
2832 	return new_cpu;
2833 }
2834 #endif /* CONFIG_SMP */
2835 
2836 static unsigned long
2837 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
2838 {
2839 	unsigned long gran = sysctl_sched_wakeup_granularity;
2840 
2841 	/*
2842 	 * Since it's curr that is running now, convert the gran from real-time
2843 	 * to virtual-time in its units.
2844 	 *
2845 	 * By using 'se' instead of 'curr' we penalize light tasks, so
2846 	 * they get preempted easier. That is, if 'se' < 'curr' then
2847 	 * the resulting gran will be larger, therefore penalizing the
2848 	 * lighter; if otoh 'se' > 'curr' then the resulting gran will
2849 	 * be smaller, again penalizing the lighter task.
2850 	 *
2851 	 * This is especially important for buddies when the leftmost
2852 	 * task is higher priority than the buddy.
2853 	 */
2854 	return calc_delta_fair(gran, se);
2855 }
2856 
2857 /*
2858  * Should 'se' preempt 'curr'.
2859  *
2860  *             |s1
2861  *        |s2
2862  *   |s3
2863  *         g
2864  *      |<--->|c
2865  *
2866  *  w(c, s1) = -1
2867  *  w(c, s2) =  0
2868  *  w(c, s3) =  1
2869  *
2870  */
2871 static int
2872 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
2873 {
2874 	s64 gran, vdiff = curr->vruntime - se->vruntime;
2875 
2876 	if (vdiff <= 0)
2877 		return -1;
2878 
2879 	gran = wakeup_gran(curr, se);
2880 	if (vdiff > gran)
2881 		return 1;
2882 
2883 	return 0;
2884 }
2885 
2886 static void set_last_buddy(struct sched_entity *se)
2887 {
2888 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2889 		return;
2890 
2891 	for_each_sched_entity(se)
2892 		cfs_rq_of(se)->last = se;
2893 }
2894 
2895 static void set_next_buddy(struct sched_entity *se)
2896 {
2897 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
2898 		return;
2899 
2900 	for_each_sched_entity(se)
2901 		cfs_rq_of(se)->next = se;
2902 }
2903 
2904 static void set_skip_buddy(struct sched_entity *se)
2905 {
2906 	for_each_sched_entity(se)
2907 		cfs_rq_of(se)->skip = se;
2908 }
2909 
2910 /*
2911  * Preempt the current task with a newly woken task if needed:
2912  */
2913 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2914 {
2915 	struct task_struct *curr = rq->curr;
2916 	struct sched_entity *se = &curr->se, *pse = &p->se;
2917 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2918 	int scale = cfs_rq->nr_running >= sched_nr_latency;
2919 	int next_buddy_marked = 0;
2920 
2921 	if (unlikely(se == pse))
2922 		return;
2923 
2924 	/*
2925 	 * This is possible from callers such as move_task(), in which we
2926 	 * unconditionally check_preempt_curr() after an enqueue (which may have
2927 	 * lead to a throttle).  This both saves work and prevents false
2928 	 * next-buddy nomination below.
2929 	 */
2930 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2931 		return;
2932 
2933 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
2934 		set_next_buddy(pse);
2935 		next_buddy_marked = 1;
2936 	}
2937 
2938 	/*
2939 	 * We can come here with TIF_NEED_RESCHED already set from new task
2940 	 * wake up path.
2941 	 *
2942 	 * Note: this also catches the edge-case of curr being in a throttled
2943 	 * group (e.g. via set_curr_task), since update_curr() (in the
2944 	 * enqueue of curr) will have resulted in resched being set.  This
2945 	 * prevents us from potentially nominating it as a false LAST_BUDDY
2946 	 * below.
2947 	 */
2948 	if (test_tsk_need_resched(curr))
2949 		return;
2950 
2951 	/* Idle tasks are by definition preempted by non-idle tasks. */
2952 	if (unlikely(curr->policy == SCHED_IDLE) &&
2953 	    likely(p->policy != SCHED_IDLE))
2954 		goto preempt;
2955 
2956 	/*
2957 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2958 	 * is driven by the tick):
2959 	 */
2960 	if (unlikely(p->policy != SCHED_NORMAL))
2961 		return;
2962 
2963 	find_matching_se(&se, &pse);
2964 	update_curr(cfs_rq_of(se));
2965 	BUG_ON(!pse);
2966 	if (wakeup_preempt_entity(se, pse) == 1) {
2967 		/*
2968 		 * Bias pick_next to pick the sched entity that is
2969 		 * triggering this preemption.
2970 		 */
2971 		if (!next_buddy_marked)
2972 			set_next_buddy(pse);
2973 		goto preempt;
2974 	}
2975 
2976 	return;
2977 
2978 preempt:
2979 	resched_task(curr);
2980 	/*
2981 	 * Only set the backward buddy when the current task is still
2982 	 * on the rq. This can happen when a wakeup gets interleaved
2983 	 * with schedule on the ->pre_schedule() or idle_balance()
2984 	 * point, either of which can drop the rq lock.
2985 	 *
2986 	 * Also, during early boot the idle thread is in the fair class,
2987 	 * for obvious reasons it's a bad idea to schedule back to it.
2988 	 */
2989 	if (unlikely(!se->on_rq || curr == rq->idle))
2990 		return;
2991 
2992 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
2993 		set_last_buddy(se);
2994 }
2995 
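/*
 * Descend the group hierarchy, picking the preferred entity at each level
 * (honouring the next/last/skip buddies) until we arrive at a task.
 */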
2996 static struct task_struct *pick_next_task_fair(struct rq *rq)
2997 {
2998 	struct task_struct *p;
2999 	struct cfs_rq *cfs_rq = &rq->cfs;
3000 	struct sched_entity *se;
3001 
3002 	if (!cfs_rq->nr_running)
3003 		return NULL;
3004 
3005 	do {
3006 		se = pick_next_entity(cfs_rq);
3007 		set_next_entity(cfs_rq, se);
3008 		cfs_rq = group_cfs_rq(se);
3009 	} while (cfs_rq);
3010 
3011 	p = task_of(se);
3012 	if (hrtick_enabled(rq))
3013 		hrtick_start_fair(rq, p);
3014 
3015 	return p;
3016 }
3017 
3018 /*
3019  * Account for a descheduled task:
3020  */
3021 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3022 {
3023 	struct sched_entity *se = &prev->se;
3024 	struct cfs_rq *cfs_rq;
3025 
3026 	for_each_sched_entity(se) {
3027 		cfs_rq = cfs_rq_of(se);
3028 		put_prev_entity(cfs_rq, se);
3029 	}
3030 }
3031 
3032 /*
3033  * sched_yield() is very simple
3034  *
3035  * The magic of dealing with the ->skip buddy is in pick_next_entity.
3036  */
3037 static void yield_task_fair(struct rq *rq)
3038 {
3039 	struct task_struct *curr = rq->curr;
3040 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3041 	struct sched_entity *se = &curr->se;
3042 
3043 	/*
3044 	 * Are we the only task in the tree?
3045 	 */
3046 	if (unlikely(rq->nr_running == 1))
3047 		return;
3048 
3049 	clear_buddies(cfs_rq, se);
3050 
3051 	if (curr->policy != SCHED_BATCH) {
3052 		update_rq_clock(rq);
3053 		/*
3054 		 * Update run-time statistics of the 'current'.
3055 		 */
3056 		update_curr(cfs_rq);
3057 		/*
3058 		 * Tell update_rq_clock() that we've just updated,
3059 		 * so we don't do microscopic update in schedule()
3060 		 * and double the fastpath cost.
3061 		 */
3062 		 rq->skip_clock_update = 1;
3063 	}
3064 
3065 	set_skip_buddy(se);
3066 }
3067 
3068 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3069 {
3070 	struct sched_entity *se = &p->se;
3071 
3072 	/* throttled hierarchies are not runnable */
3073 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3074 		return false;
3075 
3076 	/* Tell the scheduler that we'd really like pse to run next. */
3077 	set_next_buddy(se);
3078 
3079 	yield_task_fair(rq);
3080 
3081 	return true;
3082 }
3083 
3084 #ifdef CONFIG_SMP
3085 /**************************************************
3086  * Fair scheduling class load-balancing methods:
3087  */
3088 
3089 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3090 
3091 #define LBF_ALL_PINNED	0x01
3092 #define LBF_NEED_BREAK	0x02
3093 
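/*
 * lb_env describes a single load-balancing attempt: the domain being
 * balanced, the source and destination CPUs/runqueues, how much weighted
 * load we still want to move, and the loop counters that bound how many
 * tasks we scan before taking a break.
 */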
3094 struct lb_env {
3095 	struct sched_domain	*sd;
3096 
3097 	int			src_cpu;
3098 	struct rq		*src_rq;
3099 
3100 	int			dst_cpu;
3101 	struct rq		*dst_rq;
3102 
3103 	enum cpu_idle_type	idle;
3104 	long			load_move;
3105 	unsigned int		flags;
3106 
3107 	unsigned int		loop;
3108 	unsigned int		loop_break;
3109 	unsigned int		loop_max;
3110 };
3111 
3112 /*
3113  * move_task - move a task from one runqueue to another runqueue.
3114  * Both runqueues must be locked.
3115  */
3116 static void move_task(struct task_struct *p, struct lb_env *env)
3117 {
3118 	deactivate_task(env->src_rq, p, 0);
3119 	set_task_cpu(p, env->dst_cpu);
3120 	activate_task(env->dst_rq, p, 0);
3121 	check_preempt_curr(env->dst_rq, p, 0);
3122 }
3123 
3124 /*
3125  * Is this task likely cache-hot:
3126  */
3127 static int
3128 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3129 {
3130 	s64 delta;
3131 
3132 	if (p->sched_class != &fair_sched_class)
3133 		return 0;
3134 
3135 	if (unlikely(p->policy == SCHED_IDLE))
3136 		return 0;
3137 
3138 	/*
3139 	 * Buddy candidates are cache hot:
3140 	 */
3141 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3142 			(&p->se == cfs_rq_of(&p->se)->next ||
3143 			 &p->se == cfs_rq_of(&p->se)->last))
3144 		return 1;
3145 
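	/*
	 * A sysctl_sched_migration_cost of -1 treats every task as cache hot,
	 * 0 treats every task as cache cold; otherwise compare against how
	 * long ago the task last started executing.
	 */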
3146 	if (sysctl_sched_migration_cost == -1)
3147 		return 1;
3148 	if (sysctl_sched_migration_cost == 0)
3149 		return 0;
3150 
3151 	delta = now - p->se.exec_start;
3152 
3153 	return delta < (s64)sysctl_sched_migration_cost;
3154 }
3155 
3156 /*
3157  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3158  */
3159 static
3160 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3161 {
3162 	int tsk_cache_hot = 0;
3163 	/*
3164 	 * We do not migrate tasks that are:
3165 	 * 1) running (obviously), or
3166 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3167 	 * 3) are cache-hot on their current CPU.
3168 	 */
3169 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3170 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3171 		return 0;
3172 	}
3173 	env->flags &= ~LBF_ALL_PINNED;
3174 
3175 	if (task_running(env->src_rq, p)) {
3176 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3177 		return 0;
3178 	}
3179 
3180 	/*
3181 	 * Aggressive migration if:
3182 	 * 1) task is cache cold, or
3183 	 * 2) too many balance attempts have failed.
3184 	 */
3185 
3186 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3187 	if (!tsk_cache_hot ||
3188 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3189 #ifdef CONFIG_SCHEDSTATS
3190 		if (tsk_cache_hot) {
3191 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3192 			schedstat_inc(p, se.statistics.nr_forced_migrations);
3193 		}
3194 #endif
3195 		return 1;
3196 	}
3197 
3198 	if (tsk_cache_hot) {
3199 		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3200 		return 0;
3201 	}
3202 	return 1;
3203 }
3204 
3205 /*
3206  * move_one_task tries to move exactly one task from busiest to this_rq, as
3207  * part of active balancing operations within "domain".
3208  * Returns 1 if successful and 0 otherwise.
3209  *
3210  * Called with both runqueues locked.
3211  */
3212 static int move_one_task(struct lb_env *env)
3213 {
3214 	struct task_struct *p, *n;
3215 
3216 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3217 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3218 			continue;
3219 
3220 		if (!can_migrate_task(p, env))
3221 			continue;
3222 
3223 		move_task(p, env);
3224 		/*
3225 		 * Right now, this is only the second place move_task()
3226 		 * is called, so we can safely collect move_task()
3227 		 * stats here rather than inside move_task().
3228 		 */
3229 		schedstat_inc(env->sd, lb_gained[env->idle]);
3230 		return 1;
3231 	}
3232 	return 0;
3233 }
3234 
3235 static unsigned long task_h_load(struct task_struct *p);
3236 
3237 static const unsigned int sched_nr_migrate_break = 32;
3238 
3239 /*
3240  * move_tasks tries to move up to load_move weighted load from busiest to
3241  * this_rq, as part of a balancing operation within domain "sd".
3242  * Returns 1 if successful and 0 otherwise.
3243  *
3244  * Called with both runqueues locked.
3245  */
3246 static int move_tasks(struct lb_env *env)
3247 {
3248 	struct list_head *tasks = &env->src_rq->cfs_tasks;
3249 	struct task_struct *p;
3250 	unsigned long load;
3251 	int pulled = 0;
3252 
3253 	if (env->load_move <= 0)
3254 		return 0;
3255 
3256 	while (!list_empty(tasks)) {
3257 		p = list_first_entry(tasks, struct task_struct, se.group_node);
3258 
3259 		env->loop++;
3260 		/* We've more or less seen every task there is, call it quits */
3261 		if (env->loop > env->loop_max)
3262 			break;
3263 
3264 		/* take a breather every nr_migrate tasks */
3265 		if (env->loop > env->loop_break) {
3266 			env->loop_break += sched_nr_migrate_break;
3267 			env->flags |= LBF_NEED_BREAK;
3268 			break;
3269 		}
3270 
3271 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3272 			goto next;
3273 
3274 		load = task_h_load(p);
3275 
3276 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3277 			goto next;
3278 
3279 		if ((load / 2) > env->load_move)
3280 			goto next;
3281 
3282 		if (!can_migrate_task(p, env))
3283 			goto next;
3284 
3285 		move_task(p, env);
3286 		pulled++;
3287 		env->load_move -= load;
3288 
3289 #ifdef CONFIG_PREEMPT
3290 		/*
3291 		 * NEWIDLE balancing is a source of latency, so preemptible
3292 		 * kernels will stop after the first task is pulled to minimize
3293 		 * the critical section.
3294 		 */
3295 		if (env->idle == CPU_NEWLY_IDLE)
3296 			break;
3297 #endif
3298 
3299 		/*
3300 		 * We only want to steal up to the prescribed amount of
3301 		 * weighted load.
3302 		 */
3303 		if (env->load_move <= 0)
3304 			break;
3305 
3306 		continue;
3307 next:
3308 		list_move_tail(&p->se.group_node, tasks);
3309 	}
3310 
3311 	/*
3312 	 * Right now, this is one of only two places move_task() is called,
3313 	 * so we can safely collect move_task() stats here rather than
3314 	 * inside move_task().
3315 	 */
3316 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
3317 
3318 	return pulled;
3319 }
3320 
3321 #ifdef CONFIG_FAIR_GROUP_SCHED
3322 /*
3323  * update tg->load_weight by folding this cpu's load_avg
3324  */
3325 static int update_shares_cpu(struct task_group *tg, int cpu)
3326 {
3327 	struct cfs_rq *cfs_rq;
3328 	unsigned long flags;
3329 	struct rq *rq;
3330 
3331 	if (!tg->se[cpu])
3332 		return 0;
3333 
3334 	rq = cpu_rq(cpu);
3335 	cfs_rq = tg->cfs_rq[cpu];
3336 
3337 	raw_spin_lock_irqsave(&rq->lock, flags);
3338 
3339 	update_rq_clock(rq);
3340 	update_cfs_load(cfs_rq, 1);
3341 
3342 	/*
3343 	 * We need to update shares after updating tg->load_weight in
3344 	 * order to adjust the weight of groups with long running tasks.
3345 	 */
3346 	update_cfs_shares(cfs_rq);
3347 
3348 	raw_spin_unlock_irqrestore(&rq->lock, flags);
3349 
3350 	return 0;
3351 }
3352 
3353 static void update_shares(int cpu)
3354 {
3355 	struct cfs_rq *cfs_rq;
3356 	struct rq *rq = cpu_rq(cpu);
3357 
3358 	rcu_read_lock();
3359 	/*
3360 	 * Iterates the task_group tree in a bottom up fashion, see
3361 	 * list_add_leaf_cfs_rq() for details.
3362 	 */
3363 	for_each_leaf_cfs_rq(rq, cfs_rq) {
3364 		/* throttled entities do not contribute to load */
3365 		if (throttled_hierarchy(cfs_rq))
3366 			continue;
3367 
3368 		update_shares_cpu(cfs_rq->tg, cpu);
3369 	}
3370 	rcu_read_unlock();
3371 }
3372 
3373 /*
3374  * Compute the cpu's hierarchical load factor for each task group.
3375  * This needs to be done in a top-down fashion because the load of a child
3376  * group is a fraction of its parents load.
3377  */
3378 static int tg_load_down(struct task_group *tg, void *data)
3379 {
3380 	unsigned long load;
3381 	long cpu = (long)data;
3382 
3383 	if (!tg->parent) {
3384 		load = cpu_rq(cpu)->load.weight;
3385 	} else {
3386 		load = tg->parent->cfs_rq[cpu]->h_load;
3387 		load *= tg->se[cpu]->load.weight;
3388 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
3389 	}
3390 
3391 	tg->cfs_rq[cpu]->h_load = load;
3392 
3393 	return 0;
3394 }
3395 
3396 static void update_h_load(long cpu)
3397 {
3398 	rcu_read_lock();
3399 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3400 	rcu_read_unlock();
3401 }
3402 
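/*
 * task_h_load() estimates the load @p contributes at the root: its own
 * weight scaled by its cfs_rq's hierarchical load factor (h_load) relative
 * to that cfs_rq's total weight (+1 guards against a zero divisor).
 */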
3403 static unsigned long task_h_load(struct task_struct *p)
3404 {
3405 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
3406 	unsigned long load;
3407 
3408 	load = p->se.load.weight;
3409 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3410 
3411 	return load;
3412 }
3413 #else
3414 static inline void update_shares(int cpu)
3415 {
3416 }
3417 
3418 static inline void update_h_load(long cpu)
3419 {
3420 }
3421 
3422 static unsigned long task_h_load(struct task_struct *p)
3423 {
3424 	return p->se.load.weight;
3425 }
3426 #endif
3427 
3428 /********** Helpers for find_busiest_group ************************/
3429 /*
3430  * sd_lb_stats - Structure to store the statistics of a sched_domain
3431  * 		during load balancing.
3432  */
3433 struct sd_lb_stats {
3434 	struct sched_group *busiest; /* Busiest group in this sd */
3435 	struct sched_group *this;  /* Local group in this sd */
3436 	unsigned long total_load;  /* Total load of all groups in sd */
3437 	unsigned long total_pwr;   /*	Total power of all groups in sd */
3438 	unsigned long avg_load;	   /* Average load across all groups in sd */
3439 
3440 	/** Statistics of this group */
3441 	unsigned long this_load;
3442 	unsigned long this_load_per_task;
3443 	unsigned long this_nr_running;
3444 	unsigned long this_has_capacity;
3445 	unsigned int  this_idle_cpus;
3446 
3447 	/* Statistics of the busiest group */
3448 	unsigned int  busiest_idle_cpus;
3449 	unsigned long max_load;
3450 	unsigned long busiest_load_per_task;
3451 	unsigned long busiest_nr_running;
3452 	unsigned long busiest_group_capacity;
3453 	unsigned long busiest_has_capacity;
3454 	unsigned int  busiest_group_weight;
3455 
3456 	int group_imb; /* Is there imbalance in this sd */
3457 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3458 	int power_savings_balance; /* Is powersave balance needed for this sd */
3459 	struct sched_group *group_min; /* Least loaded group in sd */
3460 	struct sched_group *group_leader; /* Group which relieves group_min */
3461 	unsigned long min_load_per_task; /* load_per_task in group_min */
3462 	unsigned long leader_nr_running; /* Nr running of group_leader */
3463 	unsigned long min_nr_running; /* Nr running of group_min */
3464 #endif
3465 };
3466 
3467 /*
3468  * sg_lb_stats - stats of a sched_group required for load_balancing
3469  */
3470 struct sg_lb_stats {
3471 	unsigned long avg_load; /*Avg load across the CPUs of the group */
3472 	unsigned long group_load; /* Total load over the CPUs of the group */
3473 	unsigned long sum_nr_running; /* Nr tasks running in the group */
3474 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3475 	unsigned long group_capacity;
3476 	unsigned long idle_cpus;
3477 	unsigned long group_weight;
3478 	int group_imb; /* Is there an imbalance in the group ? */
3479 	int group_has_capacity; /* Is there extra capacity in the group? */
3480 };
3481 
3482 /**
3483  * get_sd_load_idx - Obtain the load index for a given sched domain.
3484  * @sd: The sched_domain whose load_idx is to be obtained.
3485  * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
3486  */
3487 static inline int get_sd_load_idx(struct sched_domain *sd,
3488 					enum cpu_idle_type idle)
3489 {
3490 	int load_idx;
3491 
3492 	switch (idle) {
3493 	case CPU_NOT_IDLE:
3494 		load_idx = sd->busy_idx;
3495 		break;
3496 
3497 	case CPU_NEWLY_IDLE:
3498 		load_idx = sd->newidle_idx;
3499 		break;
3500 	default:
3501 		load_idx = sd->idle_idx;
3502 		break;
3503 	}
3504 
3505 	return load_idx;
3506 }
3507 
3508 
3509 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3510 /**
3511  * init_sd_power_savings_stats - Initialize power savings statistics for
3512  * the given sched_domain, during load balancing.
3513  *
3514  * @sd: Sched domain whose power-savings statistics are to be initialized.
3515  * @sds: Variable containing the statistics for sd.
3516  * @idle: Idle status of the CPU at which we're performing load-balancing.
3517  */
3518 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3519 	struct sd_lb_stats *sds, enum cpu_idle_type idle)
3520 {
3521 	/*
3522 	 * Busy processors will not participate in power savings
3523 	 * balance.
3524 	 */
3525 	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3526 		sds->power_savings_balance = 0;
3527 	else {
3528 		sds->power_savings_balance = 1;
3529 		sds->min_nr_running = ULONG_MAX;
3530 		sds->leader_nr_running = 0;
3531 	}
3532 }
3533 
3534 /**
3535  * update_sd_power_savings_stats - Update the power saving stats for a
3536  * sched_domain while performing load balancing.
3537  *
3538  * @group: sched_group belonging to the sched_domain under consideration.
3539  * @sds: Variable containing the statistics of the sched_domain
3540  * @local_group: Does group contain the CPU for which we're performing
3541  * 		load balancing ?
3542  * @sgs: Variable containing the statistics of the group.
3543  */
3544 static inline void update_sd_power_savings_stats(struct sched_group *group,
3545 	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3546 {
3547 
3548 	if (!sds->power_savings_balance)
3549 		return;
3550 
3551 	/*
3552 	 * If the local group is idle or completely loaded, there is
3553 	 * no need to do power savings balance at this domain.
3554 	 */
3555 	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3556 				!sds->this_nr_running))
3557 		sds->power_savings_balance = 0;
3558 
3559 	/*
3560 	 * If a group is already running at full capacity or idle,
3561 	 * don't include that group in power savings calculations
3562 	 */
3563 	if (!sds->power_savings_balance ||
3564 		sgs->sum_nr_running >= sgs->group_capacity ||
3565 		!sgs->sum_nr_running)
3566 		return;
3567 
3568 	/*
3569 	 * Calculate the group which has the least non-idle load.
3570 	 * This is the group from where we need to pick up the load
3571 	 * for saving power
3572 	 */
3573 	if ((sgs->sum_nr_running < sds->min_nr_running) ||
3574 	    (sgs->sum_nr_running == sds->min_nr_running &&
3575 	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3576 		sds->group_min = group;
3577 		sds->min_nr_running = sgs->sum_nr_running;
3578 		sds->min_load_per_task = sgs->sum_weighted_load /
3579 						sgs->sum_nr_running;
3580 	}
3581 
3582 	/*
3583 	 * Calculate the group which is near its capacity but
3584 	 * still has some space to pick up some load from another
3585 	 * group and save more power.
3586 	 */
3587 	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3588 		return;
3589 
3590 	if (sgs->sum_nr_running > sds->leader_nr_running ||
3591 	    (sgs->sum_nr_running == sds->leader_nr_running &&
3592 	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3593 		sds->group_leader = group;
3594 		sds->leader_nr_running = sgs->sum_nr_running;
3595 	}
3596 }
3597 
3598 /**
3599  * check_power_save_busiest_group - see if there is potential for some power-savings balance
3600  * @sds: Variable containing the statistics of the sched_domain
3601  *	under consideration.
3602  * @this_cpu: Cpu at which we're currently performing load-balancing.
3603  * @imbalance: Variable to store the imbalance.
3604  *
3605  * Description:
3606  * Check if we have potential to perform some power-savings balance.
3607  * If yes, set the busiest group to be the least loaded group in the
3608  * sched_domain, so that its CPUs can be put to idle.
3609  *
3610  * Returns 1 if there is potential to perform power-savings balance.
3611  * Else returns 0.
3612  */
3613 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3614 					int this_cpu, unsigned long *imbalance)
3615 {
3616 	if (!sds->power_savings_balance)
3617 		return 0;
3618 
3619 	if (sds->this != sds->group_leader ||
3620 			sds->group_leader == sds->group_min)
3621 		return 0;
3622 
3623 	*imbalance = sds->min_load_per_task;
3624 	sds->busiest = sds->group_min;
3625 
3626 	return 1;
3627 
3628 }
3629 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3630 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3631 	struct sd_lb_stats *sds, enum cpu_idle_type idle)
3632 {
3633 	return;
3634 }
3635 
3636 static inline void update_sd_power_savings_stats(struct sched_group *group,
3637 	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3638 {
3639 	return;
3640 }
3641 
3642 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3643 					int this_cpu, unsigned long *imbalance)
3644 {
3645 	return 0;
3646 }
3647 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3648 
3649 
3650 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3651 {
3652 	return SCHED_POWER_SCALE;
3653 }
3654 
3655 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3656 {
3657 	return default_scale_freq_power(sd, cpu);
3658 }
3659 
3660 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3661 {
3662 	unsigned long weight = sd->span_weight;
3663 	unsigned long smt_gain = sd->smt_gain;
3664 
3665 	smt_gain /= weight;
3666 
3667 	return smt_gain;
3668 }
3669 
3670 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3671 {
3672 	return default_scale_smt_power(sd, cpu);
3673 }
3674 
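/*
 * Illustrative note (not from the original source): scale_rt_power()
 * below estimates the fraction of a cpu's capacity left for CFS after
 * RT/irq time, expressed against SCHED_POWER_SCALE. Roughly:
 *
 *	scale ~= (total - rt_avg) / (total >> SCHED_POWER_SHIFT)
 *
 * e.g. with total = 1000000ns and rt_avg = 250000ns this comes out at
 * about 768, i.e. ~75% of SCHED_POWER_SCALE (1024).
 */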
3675 unsigned long scale_rt_power(int cpu)
3676 {
3677 	struct rq *rq = cpu_rq(cpu);
3678 	u64 total, available;
3679 
3680 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
3681 
3682 	if (unlikely(total < rq->rt_avg)) {
3683 		/* Ensures that power won't end up being negative */
3684 		available = 0;
3685 	} else {
3686 		available = total - rq->rt_avg;
3687 	}
3688 
3689 	if (unlikely((s64)total < SCHED_POWER_SCALE))
3690 		total = SCHED_POWER_SCALE;
3691 
3692 	total >>= SCHED_POWER_SHIFT;
3693 
3694 	return div_u64(available, total);
3695 }
3696 
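/*
 * Illustrative note (not from the original source): update_cpu_power()
 * below compounds several factors, each expressed against
 * SCHED_POWER_SCALE (1024) and shifted back down after every multiply:
 *
 *	power ~= 1024 * (smt/1024) * (freq/1024) * (rt/1024)
 *
 * e.g. an SMT factor of 589, a freq factor of 1024 and an rt factor of
 * 922 would leave a cpu_power of roughly 530.
 */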
3697 static void update_cpu_power(struct sched_domain *sd, int cpu)
3698 {
3699 	unsigned long weight = sd->span_weight;
3700 	unsigned long power = SCHED_POWER_SCALE;
3701 	struct sched_group *sdg = sd->groups;
3702 
3703 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3704 		if (sched_feat(ARCH_POWER))
3705 			power *= arch_scale_smt_power(sd, cpu);
3706 		else
3707 			power *= default_scale_smt_power(sd, cpu);
3708 
3709 		power >>= SCHED_POWER_SHIFT;
3710 	}
3711 
3712 	sdg->sgp->power_orig = power;
3713 
3714 	if (sched_feat(ARCH_POWER))
3715 		power *= arch_scale_freq_power(sd, cpu);
3716 	else
3717 		power *= default_scale_freq_power(sd, cpu);
3718 
3719 	power >>= SCHED_POWER_SHIFT;
3720 
3721 	power *= scale_rt_power(cpu);
3722 	power >>= SCHED_POWER_SHIFT;
3723 
3724 	if (!power)
3725 		power = 1;
3726 
3727 	cpu_rq(cpu)->cpu_power = power;
3728 	sdg->sgp->power = power;
3729 }
3730 
3731 void update_group_power(struct sched_domain *sd, int cpu)
3732 {
3733 	struct sched_domain *child = sd->child;
3734 	struct sched_group *group, *sdg = sd->groups;
3735 	unsigned long power;
3736 	unsigned long interval;
3737 
3738 	interval = msecs_to_jiffies(sd->balance_interval);
3739 	interval = clamp(interval, 1UL, max_load_balance_interval);
3740 	sdg->sgp->next_update = jiffies + interval;
3741 
3742 	if (!child) {
3743 		update_cpu_power(sd, cpu);
3744 		return;
3745 	}
3746 
3747 	power = 0;
3748 
3749 	group = child->groups;
3750 	do {
3751 		power += group->sgp->power;
3752 		group = group->next;
3753 	} while (group != child->groups);
3754 
3755 	sdg->sgp->power = power;
3756 }
3757 
3758 /*
3759  * Try and fix up capacity for tiny siblings; this is needed when
3760  * things like SD_ASYM_PACKING need f_b_g to select another sibling
3761  * which on its own isn't powerful enough.
3762  *
3763  * See update_sd_pick_busiest() and check_asym_packing().
3764  */
3765 static inline int
3766 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3767 {
3768 	/*
3769 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
3770 	 */
3771 	if (!(sd->flags & SD_SHARE_CPUPOWER))
3772 		return 0;
3773 
3774 	/*
3775 	 * If ~90% of the cpu_power is still there, we're good.
3776 	 */
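	/*
	 * Illustrative note (not from the original source): 29/32 is
	 * ~90.6%, so with power_orig = 589 (a typical SMT sibling) the
	 * capacity is only treated as "small" once power drops below
	 * roughly 534.
	 */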
3777 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
3778 		return 1;
3779 
3780 	return 0;
3781 }
3782 
3783 /**
3784  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3785  * @sd: The sched_domain whose statistics are to be updated.
3786  * @group: sched_group whose statistics are to be updated.
3787  * @this_cpu: Cpu for which load balance is currently performed.
3788  * @idle: Idle status of this_cpu
3789  * @load_idx: Load index of sched_domain of this_cpu for load calc.
3790  * @local_group: Does group contain this_cpu.
3791  * @cpus: Set of cpus considered for load balancing.
3792  * @balance: Should we balance.
3793  * @sgs: variable to hold the statistics for this group.
3794  */
3795 static inline void update_sg_lb_stats(struct sched_domain *sd,
3796 			struct sched_group *group, int this_cpu,
3797 			enum cpu_idle_type idle, int load_idx,
3798 			int local_group, const struct cpumask *cpus,
3799 			int *balance, struct sg_lb_stats *sgs)
3800 {
3801 	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
3802 	int i;
3803 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
3804 	unsigned long avg_load_per_task = 0;
3805 
3806 	if (local_group)
3807 		balance_cpu = group_first_cpu(group);
3808 
3809 	/* Tally up the load of all CPUs in the group */
3810 	max_cpu_load = 0;
3811 	min_cpu_load = ~0UL;
3812 	max_nr_running = 0;
3813 
3814 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3815 		struct rq *rq = cpu_rq(i);
3816 
3817 		/* Bias balancing toward cpus of our domain */
3818 		if (local_group) {
3819 			if (idle_cpu(i) && !first_idle_cpu) {
3820 				first_idle_cpu = 1;
3821 				balance_cpu = i;
3822 			}
3823 
3824 			load = target_load(i, load_idx);
3825 		} else {
3826 			load = source_load(i, load_idx);
3827 			if (load > max_cpu_load) {
3828 				max_cpu_load = load;
3829 				max_nr_running = rq->nr_running;
3830 			}
3831 			if (min_cpu_load > load)
3832 				min_cpu_load = load;
3833 		}
3834 
3835 		sgs->group_load += load;
3836 		sgs->sum_nr_running += rq->nr_running;
3837 		sgs->sum_weighted_load += weighted_cpuload(i);
3838 		if (idle_cpu(i))
3839 			sgs->idle_cpus++;
3840 	}
3841 
3842 	/*
3843 	 * First idle cpu or the first cpu (busiest) in this sched group
3844 	 * is eligible for doing load balancing at this and above
3845 	 * domains. In the newly idle case, we will allow all the cpu's
3846 	 * to do the newly idle load balance.
3847 	 */
3848 	if (local_group) {
3849 		if (idle != CPU_NEWLY_IDLE) {
3850 			if (balance_cpu != this_cpu) {
3851 				*balance = 0;
3852 				return;
3853 			}
3854 			update_group_power(sd, this_cpu);
3855 		} else if (time_after_eq(jiffies, group->sgp->next_update))
3856 			update_group_power(sd, this_cpu);
3857 	}
3858 
3859 	/* Adjust by relative CPU power of the group */
3860 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
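	/*
	 * Illustrative note (not from the original source): avg_load is
	 * load per unit of cpu_power, e.g. group_load = 2048 over a group
	 * with sgp->power = 2048 (two nominal cpus) gives avg_load = 1024,
	 * i.e. one fully loaded nominal cpu's worth.
	 */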
3861 
3862 	/*
3863 	 * Consider the group unbalanced when the imbalance is larger
3864 	 * than the average weight of a task.
3865 	 *
3866 	 * APZ: with cgroup the avg task weight can vary wildly and
3867 	 *      might not be a suitable number - should we keep a
3868 	 *      normalized nr_running number somewhere that negates
3869 	 *      the hierarchy?
3870 	 */
3871 	if (sgs->sum_nr_running)
3872 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3873 
3874 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
3875 		sgs->group_imb = 1;
3876 
3877 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3878 						SCHED_POWER_SCALE);
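	/*
	 * Illustrative note (not from the original source): capacity is
	 * the group's power rounded to whole SCHED_POWER_SCALE units,
	 * e.g. power = 2048 -> 2, power = 589 -> 1, power = 400 -> 0;
	 * the zero case is then patched up by fix_small_capacity() below.
	 */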
3879 	if (!sgs->group_capacity)
3880 		sgs->group_capacity = fix_small_capacity(sd, group);
3881 	sgs->group_weight = group->group_weight;
3882 
3883 	if (sgs->group_capacity > sgs->sum_nr_running)
3884 		sgs->group_has_capacity = 1;
3885 }
3886 
3887 /**
3888  * update_sd_pick_busiest - return 1 on busiest group
3889  * @sd: sched_domain whose statistics are to be checked
3890  * @sds: sched_domain statistics
3891  * @sg: sched_group candidate to be checked for being the busiest
3892  * @sgs: sched_group statistics
3893  * @this_cpu: the current cpu
3894  *
3895  * Determine if @sg is a busier group than the previously selected
3896  * busiest group.
3897  */
3898 static bool update_sd_pick_busiest(struct sched_domain *sd,
3899 				   struct sd_lb_stats *sds,
3900 				   struct sched_group *sg,
3901 				   struct sg_lb_stats *sgs,
3902 				   int this_cpu)
3903 {
3904 	if (sgs->avg_load <= sds->max_load)
3905 		return false;
3906 
3907 	if (sgs->sum_nr_running > sgs->group_capacity)
3908 		return true;
3909 
3910 	if (sgs->group_imb)
3911 		return true;
3912 
3913 	/*
3914 	 * ASYM_PACKING needs to move all the work to the lowest
3915 	 * numbered CPUs in the group, therefore mark all groups
3916 	 * higher than ourself as busy.
3917 	 */
3918 	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3919 	    this_cpu < group_first_cpu(sg)) {
3920 		if (!sds->busiest)
3921 			return true;
3922 
3923 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
3924 			return true;
3925 	}
3926 
3927 	return false;
3928 }
3929 
3930 /**
3931  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3932  * @sd: sched_domain whose statistics are to be updated.
3933  * @this_cpu: Cpu for which load balance is currently performed.
3934  * @idle: Idle status of this_cpu
3935  * @cpus: Set of cpus considered for load balancing.
3936  * @balance: Should we balance.
3937  * @sds: variable to hold the statistics for this sched_domain.
3938  */
3939 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3940 			enum cpu_idle_type idle, const struct cpumask *cpus,
3941 			int *balance, struct sd_lb_stats *sds)
3942 {
3943 	struct sched_domain *child = sd->child;
3944 	struct sched_group *sg = sd->groups;
3945 	struct sg_lb_stats sgs;
3946 	int load_idx, prefer_sibling = 0;
3947 
3948 	if (child && child->flags & SD_PREFER_SIBLING)
3949 		prefer_sibling = 1;
3950 
3951 	init_sd_power_savings_stats(sd, sds, idle);
3952 	load_idx = get_sd_load_idx(sd, idle);
3953 
3954 	do {
3955 		int local_group;
3956 
3957 		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
3958 		memset(&sgs, 0, sizeof(sgs));
3959 		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
3960 				local_group, cpus, balance, &sgs);
3961 
3962 		if (local_group && !(*balance))
3963 			return;
3964 
3965 		sds->total_load += sgs.group_load;
3966 		sds->total_pwr += sg->sgp->power;
3967 
3968 		/*
3969 		 * In case the child domain prefers tasks go to siblings
3970 		 * first, lower the sg capacity to one so that we'll try
3971 		 * and move all the excess tasks away. We lower the capacity
3972 		 * of a group only if the local group has the capacity to fit
3973 		 * these excess tasks, i.e. nr_running < group_capacity. The
3974 		 * extra check prevents the case where you always pull from the
3975 		 * heaviest group when it is already under-utilized (possible
3976 		 * when a large-weight task outweighs the tasks on the system).
3977 		 */
3978 		if (prefer_sibling && !local_group && sds->this_has_capacity)
3979 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
3980 
3981 		if (local_group) {
3982 			sds->this_load = sgs.avg_load;
3983 			sds->this = sg;
3984 			sds->this_nr_running = sgs.sum_nr_running;
3985 			sds->this_load_per_task = sgs.sum_weighted_load;
3986 			sds->this_has_capacity = sgs.group_has_capacity;
3987 			sds->this_idle_cpus = sgs.idle_cpus;
3988 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
3989 			sds->max_load = sgs.avg_load;
3990 			sds->busiest = sg;
3991 			sds->busiest_nr_running = sgs.sum_nr_running;
3992 			sds->busiest_idle_cpus = sgs.idle_cpus;
3993 			sds->busiest_group_capacity = sgs.group_capacity;
3994 			sds->busiest_load_per_task = sgs.sum_weighted_load;
3995 			sds->busiest_has_capacity = sgs.group_has_capacity;
3996 			sds->busiest_group_weight = sgs.group_weight;
3997 			sds->group_imb = sgs.group_imb;
3998 		}
3999 
4000 		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
4001 		sg = sg->next;
4002 	} while (sg != sd->groups);
4003 }
4004 
4005 /**
4006  * check_asym_packing - Check to see if the group is packed into the
4007  *			sched domain.
4008  *
4009  * This is primarily intended to be used at the sibling level.  Some
4010  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
4011  * case of POWER7, it can move to lower SMT modes only when higher
4012  * threads are idle.  When in lower SMT modes, the threads will
4013  * perform better since they share less core resources.  Hence when we
4014  * have idle threads, we want them to be the higher ones.
4015  *
4016  * This packing function is run on idle threads.  It checks to see if
4017  * the busiest CPU in this domain (core in the P7 case) has a higher
4018  * CPU number than the packing function is being run on.  Here we are
4019  * assuming a lower CPU number is equivalent to a lower SMT thread
4020  * number.
4021  *
4022  * Returns 1 when packing is required and a task should be moved to
4023  * this CPU.  The amount of the imbalance is returned in *imbalance.
4024  *
4025  * @sd: The sched_domain whose packing is to be checked.
4026  * @sds: Statistics of the sched_domain which is to be packed
4027  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4028  * @imbalance: returns amount of imbalance due to packing.
4029  */
4030 static int check_asym_packing(struct sched_domain *sd,
4031 			      struct sd_lb_stats *sds,
4032 			      int this_cpu, unsigned long *imbalance)
4033 {
4034 	int busiest_cpu;
4035 
4036 	if (!(sd->flags & SD_ASYM_PACKING))
4037 		return 0;
4038 
4039 	if (!sds->busiest)
4040 		return 0;
4041 
4042 	busiest_cpu = group_first_cpu(sds->busiest);
4043 	if (this_cpu > busiest_cpu)
4044 		return 0;
4045 
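	/*
	 * Illustrative note (not from the original source): max_load is
	 * scaled by cpu_power, so multiplying by the busiest group's power
	 * and dividing by SCHED_POWER_SCALE converts it back into weighted
	 * task load, the unit *imbalance is expected in.
	 */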
4046 	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
4047 				       SCHED_POWER_SCALE);
4048 	return 1;
4049 }
4050 
4051 /**
4052  * fix_small_imbalance - Calculate the minor imbalance that exists
4053  *			amongst the groups of a sched_domain, during
4054  *			load balancing.
4055  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4056  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4057  * @imbalance: Variable to store the imbalance.
4058  */
4059 static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4060 				int this_cpu, unsigned long *imbalance)
4061 {
4062 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
4063 	unsigned int imbn = 2;
4064 	unsigned long scaled_busy_load_per_task;
4065 
4066 	if (sds->this_nr_running) {
4067 		sds->this_load_per_task /= sds->this_nr_running;
4068 		if (sds->busiest_load_per_task >
4069 				sds->this_load_per_task)
4070 			imbn = 1;
4071 	} else
4072 		sds->this_load_per_task =
4073 			cpu_avg_load_per_task(this_cpu);
4074 
4075 	scaled_busy_load_per_task = sds->busiest_load_per_task
4076 					 * SCHED_POWER_SCALE;
4077 	scaled_busy_load_per_task /= sds->busiest->sgp->power;
4078 
4079 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4080 			(scaled_busy_load_per_task * imbn)) {
4081 		*imbalance = sds->busiest_load_per_task;
4082 		return;
4083 	}
4084 
4085 	/*
4086 	 * OK, we don't have enough imbalance to justify moving tasks,
4087 	 * however we may be able to increase total CPU power used by
4088 	 * moving them.
4089 	 */
4090 
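	/*
	 * Illustrative sketch (not from the original source): pwr_now
	 * estimates throughput with the current placement and pwr_move
	 * the throughput after moving one busiest_load_per_task worth of
	 * load; both are capped by each group's actual load and normalized
	 * by SCHED_POWER_SCALE. Only if pwr_move > pwr_now is an imbalance
	 * reported.
	 */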
4091 	pwr_now += sds->busiest->sgp->power *
4092 			min(sds->busiest_load_per_task, sds->max_load);
4093 	pwr_now += sds->this->sgp->power *
4094 			min(sds->this_load_per_task, sds->this_load);
4095 	pwr_now /= SCHED_POWER_SCALE;
4096 
4097 	/* Amount of load we'd subtract */
4098 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4099 		sds->busiest->sgp->power;
4100 	if (sds->max_load > tmp)
4101 		pwr_move += sds->busiest->sgp->power *
4102 			min(sds->busiest_load_per_task, sds->max_load - tmp);
4103 
4104 	/* Amount of load we'd add */
4105 	if (sds->max_load * sds->busiest->sgp->power <
4106 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
4107 		tmp = (sds->max_load * sds->busiest->sgp->power) /
4108 			sds->this->sgp->power;
4109 	else
4110 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4111 			sds->this->sgp->power;
4112 	pwr_move += sds->this->sgp->power *
4113 			min(sds->this_load_per_task, sds->this_load + tmp);
4114 	pwr_move /= SCHED_POWER_SCALE;
4115 
4116 	/* Move if we gain throughput */
4117 	if (pwr_move > pwr_now)
4118 		*imbalance = sds->busiest_load_per_task;
4119 }
4120 
4121 /**
4122  * calculate_imbalance - Calculate the amount of imbalance present within the
4123  *			 groups of a given sched_domain during load balance.
4124  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4125  * @this_cpu: Cpu for which currently load balance is being performed.
4126  * @imbalance: The variable to store the imbalance.
4127  */
4128 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4129 		unsigned long *imbalance)
4130 {
4131 	unsigned long max_pull, load_above_capacity = ~0UL;
4132 
4133 	sds->busiest_load_per_task /= sds->busiest_nr_running;
4134 	if (sds->group_imb) {
4135 		sds->busiest_load_per_task =
4136 			min(sds->busiest_load_per_task, sds->avg_load);
4137 	}
4138 
4139 	/*
4140 	 * In the presence of smp nice balancing, certain scenarios can have
4141 	 * max load less than avg load (as we skip the groups at or below
4142 	 * their cpu_power while calculating max_load).
4143 	 */
4144 	if (sds->max_load < sds->avg_load) {
4145 		*imbalance = 0;
4146 		return fix_small_imbalance(sds, this_cpu, imbalance);
4147 	}
4148 
4149 	if (!sds->group_imb) {
4150 		/*
4151 		 * Don't want to pull so many tasks that a group would go idle.
4152 		 */
4153 		load_above_capacity = (sds->busiest_nr_running -
4154 						sds->busiest_group_capacity);
4155 
4156 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4157 
4158 		load_above_capacity /= sds->busiest->sgp->power;
4159 	}
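	/*
	 * Illustrative note (not from the original source): at this point
	 * load_above_capacity has been converted from a task count into
	 * the same scaled-load units as max_load/avg_load; with
	 * SCHED_LOAD_SCALE and SCHED_POWER_SCALE both at their default
	 * 1024, one task above capacity on a group with power = 2048
	 * contributes 1024 * 1024 / 2048 = 512.
	 */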
4160 
4161 	/*
4162 	 * We're trying to get all the cpus to the average_load, so we don't
4163 	 * want to push ourselves above the average load, nor do we wish to
4164 	 * reduce the max loaded cpu below the average load. At the same time,
4165 	 * we also don't want to reduce the group load below the group capacity
4166 	 * (so that we can implement power-savings policies etc). Thus we look
4167 	 * for the minimum possible imbalance.
4168 	 * Be careful of negative numbers as they'll appear as very large values
4169 	 * with unsigned longs.
4170 	 */
4171 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4172 
4173 	/* How much load to actually move to equalise the imbalance */
4174 	*imbalance = min(max_pull * sds->busiest->sgp->power,
4175 		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
4176 			/ SCHED_POWER_SCALE;
4177 
4178 	/*
4179 	 * if *imbalance is less than the average load per runnable task
4180 	 * there is no guarantee that any tasks will be moved so we'll have
4181 	 * a think about bumping its value to force at least one task to be
4182 	 * moved
4183 	 */
4184 	if (*imbalance < sds->busiest_load_per_task)
4185 		return fix_small_imbalance(sds, this_cpu, imbalance);
4186 
4187 }
4188 
4189 /******* find_busiest_group() helpers end here *********************/
4190 
4191 /**
4192  * find_busiest_group - Returns the busiest group within the sched_domain
4193  * if there is an imbalance. If there isn't an imbalance, and
4194  * the user has opted for power-savings, it returns a group whose
4195  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4196  * such a group exists.
4197  *
4198  * Also calculates the amount of weighted load which should be moved
4199  * to restore balance.
4200  *
4201  * @sd: The sched_domain whose busiest group is to be returned.
4202  * @this_cpu: The cpu for which load balancing is currently being performed.
4203  * @imbalance: Variable which stores amount of weighted load which should
4204  *		be moved to restore balance/put a group to idle.
4205  * @idle: The idle status of this_cpu.
4206  * @cpus: The set of CPUs under consideration for load-balancing.
4207  * @balance: Pointer to a variable indicating if this_cpu
4208  *	is the appropriate cpu to perform load balancing at this_level.
4209  *
4210  * Returns:	- the busiest group if imbalance exists.
4211  *		- If no imbalance and user has opted for power-savings balance,
4212  *		   return the least loaded group whose CPUs can be
4213  *		   put to idle by rebalancing its tasks onto our group.
4214  */
4215 static struct sched_group *
4216 find_busiest_group(struct sched_domain *sd, int this_cpu,
4217 		   unsigned long *imbalance, enum cpu_idle_type idle,
4218 		   const struct cpumask *cpus, int *balance)
4219 {
4220 	struct sd_lb_stats sds;
4221 
4222 	memset(&sds, 0, sizeof(sds));
4223 
4224 	/*
4225 	 * Compute the various statistics relevant for load balancing at
4226 	 * this level.
4227 	 */
4228 	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
4229 
4230 	/*
4231 	 * this_cpu is not the appropriate cpu to perform load balancing at
4232 	 * this level.
4233 	 */
4234 	if (!(*balance))
4235 		goto ret;
4236 
4237 	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
4238 	    check_asym_packing(sd, &sds, this_cpu, imbalance))
4239 		return sds.busiest;
4240 
4241 	/* There is no busy sibling group to pull tasks from */
4242 	if (!sds.busiest || sds.busiest_nr_running == 0)
4243 		goto out_balanced;
4244 
4245 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4246 
4247 	/*
4248 	 * If the busiest group is imbalanced, the below checks don't
4249 	 * work because they assume all things are equal, which typically
4250 	 * isn't true due to cpus_allowed constraints and the like.
4251 	 */
4252 	if (sds.group_imb)
4253 		goto force_balance;
4254 
4255 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4256 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4257 			!sds.busiest_has_capacity)
4258 		goto force_balance;
4259 
4260 	/*
4261 	 * If the local group is more busy than the selected busiest group
4262 	 * don't try and pull any tasks.
4263 	 */
4264 	if (sds.this_load >= sds.max_load)
4265 		goto out_balanced;
4266 
4267 	/*
4268 	 * Don't pull any tasks if this group is already above the domain
4269 	 * average load.
4270 	 */
4271 	if (sds.this_load >= sds.avg_load)
4272 		goto out_balanced;
4273 
4274 	if (idle == CPU_IDLE) {
4275 		/*
4276 		 * This cpu is idle. If the busiest group doesn't have
4277 		 * more tasks than the number of available cpus and there
4278 		 * is no imbalance between this and the busiest group
4279 		 * wrt idle cpus, it is balanced.
4280 		 */
4281 		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4282 		    sds.busiest_nr_running <= sds.busiest_group_weight)
4283 			goto out_balanced;
4284 	} else {
4285 		/*
4286 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4287 		 * imbalance_pct to be conservative.
4288 		 */
4289 		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4290 			goto out_balanced;
4291 	}
4292 
4293 force_balance:
4294 	/* Looks like there is an imbalance. Compute it */
4295 	calculate_imbalance(&sds, this_cpu, imbalance);
4296 	return sds.busiest;
4297 
4298 out_balanced:
4299 	/*
4300 	 * There is no obvious imbalance. But check if we can do some balancing
4301 	 * to save power.
4302 	 */
4303 	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4304 		return sds.busiest;
4305 ret:
4306 	*imbalance = 0;
4307 	return NULL;
4308 }
4309 
4310 /*
4311  * find_busiest_queue - find the busiest runqueue among the cpus in group.
4312  */
4313 static struct rq *
4314 find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4315 		   enum cpu_idle_type idle, unsigned long imbalance,
4316 		   const struct cpumask *cpus)
4317 {
4318 	struct rq *busiest = NULL, *rq;
4319 	unsigned long max_load = 0;
4320 	int i;
4321 
4322 	for_each_cpu(i, sched_group_cpus(group)) {
4323 		unsigned long power = power_of(i);
4324 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
4325 							   SCHED_POWER_SCALE);
4326 		unsigned long wl;
4327 
4328 		if (!capacity)
4329 			capacity = fix_small_capacity(sd, group);
4330 
4331 		if (!cpumask_test_cpu(i, cpus))
4332 			continue;
4333 
4334 		rq = cpu_rq(i);
4335 		wl = weighted_cpuload(i);
4336 
4337 		/*
4338 		 * When comparing with imbalance, use weighted_cpuload()
4339 		 * which is not scaled with the cpu power.
4340 		 */
4341 		if (capacity && rq->nr_running == 1 && wl > imbalance)
4342 			continue;
4343 
4344 		/*
4345 		 * For the load comparisons with the other cpu's, consider
4346 		 * the weighted_cpuload() scaled with the cpu power, so that
4347 		 * the load can be moved away from the cpu that is potentially
4348 		 * running at a lower capacity.
4349 		 */
4350 		wl = (wl * SCHED_POWER_SCALE) / power;
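		/*
		 * Illustrative note (not from the original source): a cpu
		 * carrying wl = 1024 with power = 512 (e.g. heavy RT
		 * pressure) ends up with a scaled wl of 2048, i.e. it looks
		 * twice as busy as a full-power cpu with the same raw load.
		 */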
4351 
4352 		if (wl > max_load) {
4353 			max_load = wl;
4354 			busiest = rq;
4355 		}
4356 	}
4357 
4358 	return busiest;
4359 }
4360 
4361 /*
4362  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4363  * so long as it is large enough.
4364  */
4365 #define MAX_PINNED_INTERVAL	512
4366 
4367 /* Working cpumask for load_balance and load_balance_newidle. */
4368 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4369 
4370 static int need_active_balance(struct sched_domain *sd, int idle,
4371 			       int busiest_cpu, int this_cpu)
4372 {
4373 	if (idle == CPU_NEWLY_IDLE) {
4374 
4375 		/*
4376 		 * ASYM_PACKING needs to force migrate tasks from busy but
4377 		 * higher numbered CPUs in order to pack all tasks in the
4378 		 * lowest numbered CPUs.
4379 		 */
4380 		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
4381 			return 1;
4382 
4383 		/*
4384 		 * The only task running in a non-idle cpu can be moved to this
4385 		 * cpu in an attempt to completely freeup the other CPU
4386 		 * package.
4387 		 *
4388 		 * The package power saving logic comes from
4389 		 * find_busiest_group(). If there are no imbalance, then
4390 		 * find_busiest_group(). If there is no imbalance, then
4391 		 * f_b_g() will select a group from which a running task may be
4392 		 * pulled to this cpu in order to make the other package idle.
4393 		 * If there is no opportunity to make a package idle and if
4394 		 * there is no imbalance, then f_b_g() will return NULL and no
4395 		 * action will be taken in load_balance_newidle().
4396 		 *
4397 		 * Under normal task pull operation due to imbalance, there
4398 		 * will be more than one task in the source run queue and
4399 		 * move_tasks() will succeed.  ld_moved will be true and this
4400 		 * active balance code will not be triggered.
4401 		 */
4402 		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4403 			return 0;
4404 	}
4405 
4406 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4407 }
4408 
4409 static int active_load_balance_cpu_stop(void *data);
4410 
4411 /*
4412  * Check this_cpu to ensure it is balanced within domain. Attempt to move
4413  * tasks if there is an imbalance.
4414  */
4415 static int load_balance(int this_cpu, struct rq *this_rq,
4416 			struct sched_domain *sd, enum cpu_idle_type idle,
4417 			int *balance)
4418 {
4419 	int ld_moved, active_balance = 0;
4420 	struct sched_group *group;
4421 	unsigned long imbalance;
4422 	struct rq *busiest;
4423 	unsigned long flags;
4424 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4425 
4426 	struct lb_env env = {
4427 		.sd		= sd,
4428 		.dst_cpu	= this_cpu,
4429 		.dst_rq		= this_rq,
4430 		.idle		= idle,
4431 		.loop_break	= sched_nr_migrate_break,
4432 	};
4433 
4434 	cpumask_copy(cpus, cpu_active_mask);
4435 
4436 	schedstat_inc(sd, lb_count[idle]);
4437 
4438 redo:
4439 	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
4440 				   cpus, balance);
4441 
4442 	if (*balance == 0)
4443 		goto out_balanced;
4444 
4445 	if (!group) {
4446 		schedstat_inc(sd, lb_nobusyg[idle]);
4447 		goto out_balanced;
4448 	}
4449 
4450 	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
4451 	if (!busiest) {
4452 		schedstat_inc(sd, lb_nobusyq[idle]);
4453 		goto out_balanced;
4454 	}
4455 
4456 	BUG_ON(busiest == this_rq);
4457 
4458 	schedstat_add(sd, lb_imbalance[idle], imbalance);
4459 
4460 	ld_moved = 0;
4461 	if (busiest->nr_running > 1) {
4462 		/*
4463 		 * Attempt to move tasks. If find_busiest_group has found
4464 		 * an imbalance but busiest->nr_running <= 1, the group is
4465 		 * still unbalanced. ld_moved simply stays zero, so it is
4466 		 * correctly treated as an imbalance.
4467 		 */
4468 		env.flags |= LBF_ALL_PINNED;
4469 		env.load_move	= imbalance;
4470 		env.src_cpu	= busiest->cpu;
4471 		env.src_rq	= busiest;
4472 		env.loop_max	= min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4473 
4474 more_balance:
4475 		local_irq_save(flags);
4476 		double_rq_lock(this_rq, busiest);
4477 		if (!env.loop)
4478 			update_h_load(env.src_cpu);
4479 		ld_moved += move_tasks(&env);
4480 		double_rq_unlock(this_rq, busiest);
4481 		local_irq_restore(flags);
4482 
4483 		if (env.flags & LBF_NEED_BREAK) {
4484 			env.flags &= ~LBF_NEED_BREAK;
4485 			goto more_balance;
4486 		}
4487 
4488 		/*
4489 		 * some other cpu did the load balance for us.
4490 		 */
4491 		if (ld_moved && this_cpu != smp_processor_id())
4492 			resched_cpu(this_cpu);
4493 
4494 		/* All tasks on this runqueue were pinned by CPU affinity */
4495 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
4496 			cpumask_clear_cpu(cpu_of(busiest), cpus);
4497 			if (!cpumask_empty(cpus))
4498 				goto redo;
4499 			goto out_balanced;
4500 		}
4501 	}
4502 
4503 	if (!ld_moved) {
4504 		schedstat_inc(sd, lb_failed[idle]);
4505 		/*
4506 		 * Increment the failure counter only on periodic balance.
4507 		 * We do not want newidle balance, which can be very
4508 		 * frequent, pollute the failure counter causing
4509 		 * excessive cache_hot migrations and active balances.
4510 		 */
4511 		if (idle != CPU_NEWLY_IDLE)
4512 			sd->nr_balance_failed++;
4513 
4514 		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
4515 			raw_spin_lock_irqsave(&busiest->lock, flags);
4516 
4517 			/* don't kick the active_load_balance_cpu_stop,
4518 			 * if the curr task on busiest cpu can't be
4519 			 * moved to this_cpu
4520 			 */
4521 			if (!cpumask_test_cpu(this_cpu,
4522 					tsk_cpus_allowed(busiest->curr))) {
4523 				raw_spin_unlock_irqrestore(&busiest->lock,
4524 							    flags);
4525 				env.flags |= LBF_ALL_PINNED;
4526 				goto out_one_pinned;
4527 			}
4528 
4529 			/*
4530 			 * ->active_balance synchronizes accesses to
4531 			 * ->active_balance_work.  Once set, it's cleared
4532 			 * only after active load balance is finished.
4533 			 */
4534 			if (!busiest->active_balance) {
4535 				busiest->active_balance = 1;
4536 				busiest->push_cpu = this_cpu;
4537 				active_balance = 1;
4538 			}
4539 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
4540 
4541 			if (active_balance)
4542 				stop_one_cpu_nowait(cpu_of(busiest),
4543 					active_load_balance_cpu_stop, busiest,
4544 					&busiest->active_balance_work);
4545 
4546 			/*
4547 			 * We've kicked active balancing, reset the failure
4548 			 * counter.
4549 			 */
4550 			sd->nr_balance_failed = sd->cache_nice_tries+1;
4551 		}
4552 	} else
4553 		sd->nr_balance_failed = 0;
4554 
4555 	if (likely(!active_balance)) {
4556 		/* We were unbalanced, so reset the balancing interval */
4557 		sd->balance_interval = sd->min_interval;
4558 	} else {
4559 		/*
4560 		 * If we've begun active balancing, start to back off. This
4561 		 * case may not be covered by the all_pinned logic if there
4562 		 * is only 1 task on the busy runqueue (because we don't call
4563 		 * move_tasks).
4564 		 */
4565 		if (sd->balance_interval < sd->max_interval)
4566 			sd->balance_interval *= 2;
4567 	}
4568 
4569 	goto out;
4570 
4571 out_balanced:
4572 	schedstat_inc(sd, lb_balanced[idle]);
4573 
4574 	sd->nr_balance_failed = 0;
4575 
4576 out_one_pinned:
4577 	/* tune up the balancing interval */
4578 	if (((env.flags & LBF_ALL_PINNED) &&
4579 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
4580 			(sd->balance_interval < sd->max_interval))
4581 		sd->balance_interval *= 2;
4582 
4583 	ld_moved = 0;
4584 out:
4585 	return ld_moved;
4586 }
4587 
4588 /*
4589  * idle_balance is called by schedule() if this_cpu is about to become
4590  * idle. Attempts to pull tasks from other CPUs.
4591  */
4592 void idle_balance(int this_cpu, struct rq *this_rq)
4593 {
4594 	struct sched_domain *sd;
4595 	int pulled_task = 0;
4596 	unsigned long next_balance = jiffies + HZ;
4597 
4598 	this_rq->idle_stamp = this_rq->clock;
4599 
4600 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
4601 		return;
4602 
4603 	/*
4604 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
4605 	 */
4606 	raw_spin_unlock(&this_rq->lock);
4607 
4608 	update_shares(this_cpu);
4609 	rcu_read_lock();
4610 	for_each_domain(this_cpu, sd) {
4611 		unsigned long interval;
4612 		int balance = 1;
4613 
4614 		if (!(sd->flags & SD_LOAD_BALANCE))
4615 			continue;
4616 
4617 		if (sd->flags & SD_BALANCE_NEWIDLE) {
4618 			/* If we've pulled tasks over stop searching: */
4619 			pulled_task = load_balance(this_cpu, this_rq,
4620 						   sd, CPU_NEWLY_IDLE, &balance);
4621 		}
4622 
4623 		interval = msecs_to_jiffies(sd->balance_interval);
4624 		if (time_after(next_balance, sd->last_balance + interval))
4625 			next_balance = sd->last_balance + interval;
4626 		if (pulled_task) {
4627 			this_rq->idle_stamp = 0;
4628 			break;
4629 		}
4630 	}
4631 	rcu_read_unlock();
4632 
4633 	raw_spin_lock(&this_rq->lock);
4634 
4635 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4636 		/*
4637 		 * We are going idle. next_balance may be set based on
4638 		 * a busy processor. So reset next_balance.
4639 		 */
4640 		this_rq->next_balance = next_balance;
4641 	}
4642 }
4643 
4644 /*
4645  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
4646  * running tasks off the busiest CPU onto idle CPUs. It requires at
4647  * least 1 task to be running on each physical CPU where possible, and
4648  * avoids physical / logical imbalances.
4649  */
4650 static int active_load_balance_cpu_stop(void *data)
4651 {
4652 	struct rq *busiest_rq = data;
4653 	int busiest_cpu = cpu_of(busiest_rq);
4654 	int target_cpu = busiest_rq->push_cpu;
4655 	struct rq *target_rq = cpu_rq(target_cpu);
4656 	struct sched_domain *sd;
4657 
4658 	raw_spin_lock_irq(&busiest_rq->lock);
4659 
4660 	/* make sure the requested cpu hasn't gone down in the meantime */
4661 	if (unlikely(busiest_cpu != smp_processor_id() ||
4662 		     !busiest_rq->active_balance))
4663 		goto out_unlock;
4664 
4665 	/* Is there any task to move? */
4666 	if (busiest_rq->nr_running <= 1)
4667 		goto out_unlock;
4668 
4669 	/*
4670 	 * This condition is "impossible", if it occurs
4671 	 * we need to fix it. Originally reported by
4672 	 * Bjorn Helgaas on a 128-cpu setup.
4673 	 */
4674 	BUG_ON(busiest_rq == target_rq);
4675 
4676 	/* move a task from busiest_rq to target_rq */
4677 	double_lock_balance(busiest_rq, target_rq);
4678 
4679 	/* Search for an sd spanning us and the target CPU. */
4680 	rcu_read_lock();
4681 	for_each_domain(target_cpu, sd) {
4682 		if ((sd->flags & SD_LOAD_BALANCE) &&
4683 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4684 				break;
4685 	}
4686 
4687 	if (likely(sd)) {
4688 		struct lb_env env = {
4689 			.sd		= sd,
4690 			.dst_cpu	= target_cpu,
4691 			.dst_rq		= target_rq,
4692 			.src_cpu	= busiest_rq->cpu,
4693 			.src_rq		= busiest_rq,
4694 			.idle		= CPU_IDLE,
4695 		};
4696 
4697 		schedstat_inc(sd, alb_count);
4698 
4699 		if (move_one_task(&env))
4700 			schedstat_inc(sd, alb_pushed);
4701 		else
4702 			schedstat_inc(sd, alb_failed);
4703 	}
4704 	rcu_read_unlock();
4705 	double_unlock_balance(busiest_rq, target_rq);
4706 out_unlock:
4707 	busiest_rq->active_balance = 0;
4708 	raw_spin_unlock_irq(&busiest_rq->lock);
4709 	return 0;
4710 }
4711 
4712 #ifdef CONFIG_NO_HZ
4713 /*
4714  * idle load balancing details
4715  * - When one of the busy CPUs notices that idle rebalancing may be
4716  *   needed, it kicks the idle load balancer, which then does idle
4717  *   load balancing for all the idle CPUs.
4718  */
4719 static struct {
4720 	cpumask_var_t idle_cpus_mask;
4721 	atomic_t nr_cpus;
4722 	unsigned long next_balance;     /* in jiffy units */
4723 } nohz ____cacheline_aligned;
4724 
4725 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4726 /**
4727  * lowest_flag_domain - Return lowest sched_domain containing flag.
4728  * @cpu:	The cpu whose lowest level of sched domain is to
4729  *		be returned.
4730  * @flag:	The flag to check for the lowest sched_domain
4731  *		for the given cpu.
4732  *
4733  * Returns the lowest sched_domain of a cpu which contains the given flag.
4734  */
4735 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4736 {
4737 	struct sched_domain *sd;
4738 
4739 	for_each_domain(cpu, sd)
4740 		if (sd->flags & flag)
4741 			break;
4742 
4743 	return sd;
4744 }
4745 
4746 /**
4747  * for_each_flag_domain - Iterates over sched_domains containing the flag.
4748  * @cpu:	The cpu whose domains we're iterating over.
4749  * @sd:		variable holding the value of the power_savings_sd
4750  *		for cpu.
4751  * @flag:	The flag to filter the sched_domains to be iterated.
4752  *
4753  * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4754  * set, starting from the lowest sched_domain to the highest.
4755  */
4756 #define for_each_flag_domain(cpu, sd, flag) \
4757 	for (sd = lowest_flag_domain(cpu, flag); \
4758 		(sd && (sd->flags & flag)); sd = sd->parent)
4759 
4760 /**
4761  * find_new_ilb - Finds the optimum idle load balancer for nomination.
4762  * @cpu:	The cpu which is nominating a new idle_load_balancer.
4763  *
4764  * Returns:	the id of the idle load balancer if it exists,
4765  *		else returns >= nr_cpu_ids.
4766  *
4767  * This algorithm picks the idle load balancer such that it belongs to a
4768  * semi-idle powersavings sched_domain. The idea is to try and avoid
4769  * completely idle packages/cores just for the purpose of idle load balancing
4770  * when there are other idle cpu's which are better suited for that job.
4771  */
4772 static int find_new_ilb(int cpu)
4773 {
4774 	int ilb = cpumask_first(nohz.idle_cpus_mask);
4775 	struct sched_group *ilbg;
4776 	struct sched_domain *sd;
4777 
4778 	/*
4779 	 * Have idle load balancer selection from semi-idle packages only
4780 	 * when power-aware load balancing is enabled
4781 	 */
4782 	if (!(sched_smt_power_savings || sched_mc_power_savings))
4783 		goto out_done;
4784 
4785 	/*
4786 	 * Optimize for the case when we have no idle CPUs or only one
4787 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4788 	 */
4789 	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4790 		goto out_done;
4791 
4792 	rcu_read_lock();
4793 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4794 		ilbg = sd->groups;
4795 
4796 		do {
4797 			if (ilbg->group_weight !=
4798 				atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4799 				ilb = cpumask_first_and(nohz.idle_cpus_mask,
4800 							sched_group_cpus(ilbg));
4801 				goto unlock;
4802 			}
4803 
4804 			ilbg = ilbg->next;
4805 
4806 		} while (ilbg != sd->groups);
4807 	}
4808 unlock:
4809 	rcu_read_unlock();
4810 
4811 out_done:
4812 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
4813 		return ilb;
4814 
4815 	return nr_cpu_ids;
4816 }
4817 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4818 static inline int find_new_ilb(int call_cpu)
4819 {
4820 	return nr_cpu_ids;
4821 }
4822 #endif
4823 
4824 /*
4825  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
4826  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
4827  * CPU (if there is one).
4828  */
4829 static void nohz_balancer_kick(int cpu)
4830 {
4831 	int ilb_cpu;
4832 
4833 	nohz.next_balance++;
4834 
4835 	ilb_cpu = find_new_ilb(cpu);
4836 
4837 	if (ilb_cpu >= nr_cpu_ids)
4838 		return;
4839 
4840 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4841 		return;
4842 	/*
4843 	 * Use smp_send_reschedule() instead of resched_cpu().
4844 	 * This way we generate a sched IPI on the target cpu which
4845 	 * is idle. And the softirq performing nohz idle load balance
4846 	 * will be run before returning from the IPI.
4847 	 */
4848 	smp_send_reschedule(ilb_cpu);
4849 	return;
4850 }
4851 
4852 static inline void clear_nohz_tick_stopped(int cpu)
4853 {
4854 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4855 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4856 		atomic_dec(&nohz.nr_cpus);
4857 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4858 	}
4859 }
4860 
4861 static inline void set_cpu_sd_state_busy(void)
4862 {
4863 	struct sched_domain *sd;
4864 	int cpu = smp_processor_id();
4865 
4866 	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4867 		return;
4868 	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4869 
4870 	rcu_read_lock();
4871 	for_each_domain(cpu, sd)
4872 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4873 	rcu_read_unlock();
4874 }
4875 
4876 void set_cpu_sd_state_idle(void)
4877 {
4878 	struct sched_domain *sd;
4879 	int cpu = smp_processor_id();
4880 
4881 	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4882 		return;
4883 	set_bit(NOHZ_IDLE, nohz_flags(cpu));
4884 
4885 	rcu_read_lock();
4886 	for_each_domain(cpu, sd)
4887 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4888 	rcu_read_unlock();
4889 }
4890 
4891 /*
4892  * This routine will record that this cpu is going idle with tick stopped.
4893  * This info will be used in performing idle load balancing in the future.
4894  */
4895 void select_nohz_load_balancer(int stop_tick)
4896 {
4897 	int cpu = smp_processor_id();
4898 
4899 	/*
4900 	 * If this cpu is going down, then nothing needs to be done.
4901 	 */
4902 	if (!cpu_active(cpu))
4903 		return;
4904 
4905 	if (stop_tick) {
4906 		if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4907 			return;
4908 
4909 		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4910 		atomic_inc(&nohz.nr_cpus);
4911 		set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4912 	}
4913 	return;
4914 }
4915 
4916 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4917 					unsigned long action, void *hcpu)
4918 {
4919 	switch (action & ~CPU_TASKS_FROZEN) {
4920 	case CPU_DYING:
4921 		clear_nohz_tick_stopped(smp_processor_id());
4922 		return NOTIFY_OK;
4923 	default:
4924 		return NOTIFY_DONE;
4925 	}
4926 }
4927 #endif
4928 
4929 static DEFINE_SPINLOCK(balancing);
4930 
4931 /*
4932  * Scale the max load_balance interval with the number of CPUs in the system.
4933  * This trades load-balance latency on larger machines for less cross talk.
4934  */
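/*
 * Illustrative note (not from the original source): with HZ = 1000 and
 * 8 online cpus this works out to a max interval of 800 jiffies, ~0.8s.
 */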
4935 void update_max_interval(void)
4936 {
4937 	max_load_balance_interval = HZ*num_online_cpus()/10;
4938 }
4939 
4940 /*
4941  * It checks each scheduling domain to see if it is due to be balanced,
4942  * and initiates a balancing operation if so.
4943  *
4944  * Balancing parameters are set up in arch_init_sched_domains.
4945  */
4946 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4947 {
4948 	int balance = 1;
4949 	struct rq *rq = cpu_rq(cpu);
4950 	unsigned long interval;
4951 	struct sched_domain *sd;
4952 	/* Earliest time when we have to do rebalance again */
4953 	unsigned long next_balance = jiffies + 60*HZ;
4954 	int update_next_balance = 0;
4955 	int need_serialize;
4956 
4957 	update_shares(cpu);
4958 
4959 	rcu_read_lock();
4960 	for_each_domain(cpu, sd) {
4961 		if (!(sd->flags & SD_LOAD_BALANCE))
4962 			continue;
4963 
4964 		interval = sd->balance_interval;
4965 		if (idle != CPU_IDLE)
4966 			interval *= sd->busy_factor;
4967 
4968 		/* scale ms to jiffies */
4969 		interval = msecs_to_jiffies(interval);
4970 		interval = clamp(interval, 1UL, max_load_balance_interval);
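		/*
		 * Illustrative note (not from the original source): e.g. a
		 * 64ms balance_interval with a busy_factor of 32 becomes
		 * 2048ms on a busy cpu, subject to the clamp against
		 * max_load_balance_interval above.
		 */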
4971 
4972 		need_serialize = sd->flags & SD_SERIALIZE;
4973 
4974 		if (need_serialize) {
4975 			if (!spin_trylock(&balancing))
4976 				goto out;
4977 		}
4978 
4979 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
4980 			if (load_balance(cpu, rq, sd, idle, &balance)) {
4981 				/*
4982 				 * We've pulled tasks over, so we're no
4983 				 * longer idle.
4984 				 */
4985 				idle = CPU_NOT_IDLE;
4986 			}
4987 			sd->last_balance = jiffies;
4988 		}
4989 		if (need_serialize)
4990 			spin_unlock(&balancing);
4991 out:
4992 		if (time_after(next_balance, sd->last_balance + interval)) {
4993 			next_balance = sd->last_balance + interval;
4994 			update_next_balance = 1;
4995 		}
4996 
4997 		/*
4998 		 * Stop the load balance at this level. There is another
4999 		 * CPU in our sched group which is doing load balancing more
5000 		 * actively.
5001 		 */
5002 		if (!balance)
5003 			break;
5004 	}
5005 	rcu_read_unlock();
5006 
5007 	/*
5008 	 * next_balance will be updated only when there is a need.
5009 	 * When the cpu is attached to a null domain, for example, it will not be
5010 	 * updated.
5011 	 */
5012 	if (likely(update_next_balance))
5013 		rq->next_balance = next_balance;
5014 }
5015 
5016 #ifdef CONFIG_NO_HZ
5017 /*
5018  * In CONFIG_NO_HZ case, the idle balance kickee will do the
5019  * rebalancing for all the cpus for whom scheduler ticks are stopped.
5020  */
5021 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5022 {
5023 	struct rq *this_rq = cpu_rq(this_cpu);
5024 	struct rq *rq;
5025 	int balance_cpu;
5026 
5027 	if (idle != CPU_IDLE ||
5028 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5029 		goto end;
5030 
5031 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5032 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5033 			continue;
5034 
5035 		/*
5036 		 * If this cpu gets work to do, stop the load balancing
5037 		 * work being done for other cpus. Next load
5038 		 * balancing owner will pick it up.
5039 		 */
5040 		if (need_resched())
5041 			break;
5042 
5043 		raw_spin_lock_irq(&this_rq->lock);
5044 		update_rq_clock(this_rq);
5045 		update_idle_cpu_load(this_rq);
5046 		raw_spin_unlock_irq(&this_rq->lock);
5047 
5048 		rebalance_domains(balance_cpu, CPU_IDLE);
5049 
5050 		rq = cpu_rq(balance_cpu);
5051 		if (time_after(this_rq->next_balance, rq->next_balance))
5052 			this_rq->next_balance = rq->next_balance;
5053 	}
5054 	nohz.next_balance = this_rq->next_balance;
5055 end:
5056 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5057 }
5058 
5059 /*
5060  * Current heuristic for kicking the idle load balancer in the presence
5061  * of an idle cpu in the system.
5062  *   - This rq has more than one task.
5063  *   - At any scheduler domain level, this cpu's scheduler group has multiple
5064  *     busy cpu's exceeding the group's power.
5065  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
5066  *     domain span are idle.
5067  */
5068 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5069 {
5070 	unsigned long now = jiffies;
5071 	struct sched_domain *sd;
5072 
5073 	if (unlikely(idle_cpu(cpu)))
5074 		return 0;
5075 
5076 	/*
5077 	 * We may have recently been in ticked or tickless idle mode. At the
5078 	 * first busy tick after returning from idle, update the busy stats.
5079 	 */
5080 	set_cpu_sd_state_busy();
5081 	clear_nohz_tick_stopped(cpu);
5082 
5083 	/*
5084 	 * None are in tickless mode and hence no need for NOHZ idle load
5085 	 * balancing.
5086 	 */
5087 	if (likely(!atomic_read(&nohz.nr_cpus)))
5088 		return 0;
5089 
5090 	if (time_before(now, nohz.next_balance))
5091 		return 0;
5092 
5093 	if (rq->nr_running >= 2)
5094 		goto need_kick;
5095 
5096 	rcu_read_lock();
5097 	for_each_domain(cpu, sd) {
5098 		struct sched_group *sg = sd->groups;
5099 		struct sched_group_power *sgp = sg->sgp;
5100 		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5101 
5102 		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5103 			goto need_kick_unlock;
5104 
5105 		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5106 		    && (cpumask_first_and(nohz.idle_cpus_mask,
5107 					  sched_domain_span(sd)) < cpu))
5108 			goto need_kick_unlock;
5109 
5110 		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5111 			break;
5112 	}
5113 	rcu_read_unlock();
5114 	return 0;
5115 
5116 need_kick_unlock:
5117 	rcu_read_unlock();
5118 need_kick:
5119 	return 1;
5120 }
5121 #else
5122 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5123 #endif
5124 
5125 /*
5126  * run_rebalance_domains is triggered when needed from the scheduler tick.
5127  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5128  */
5129 static void run_rebalance_domains(struct softirq_action *h)
5130 {
5131 	int this_cpu = smp_processor_id();
5132 	struct rq *this_rq = cpu_rq(this_cpu);
5133 	enum cpu_idle_type idle = this_rq->idle_balance ?
5134 						CPU_IDLE : CPU_NOT_IDLE;
5135 
5136 	rebalance_domains(this_cpu, idle);
5137 
5138 	/*
5139 	 * If this cpu has a pending nohz_balance_kick, then do the
5140 	 * balancing on behalf of the other idle cpus whose ticks are
5141 	 * stopped.
5142 	 */
5143 	nohz_idle_balance(this_cpu, idle);
5144 }
5145 
5146 static inline int on_null_domain(int cpu)
5147 {
5148 	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5149 }
5150 
5151 /*
5152  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5153  */
5154 void trigger_load_balance(struct rq *rq, int cpu)
5155 {
5156 	/* Don't need to rebalance while attached to NULL domain */
5157 	if (time_after_eq(jiffies, rq->next_balance) &&
5158 	    likely(!on_null_domain(cpu)))
5159 		raise_softirq(SCHED_SOFTIRQ);
5160 #ifdef CONFIG_NO_HZ
5161 	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5162 		nohz_balancer_kick(cpu);
5163 #endif
5164 }
5165 
5166 static void rq_online_fair(struct rq *rq)
5167 {
5168 	update_sysctl();
5169 }
5170 
5171 static void rq_offline_fair(struct rq *rq)
5172 {
5173 	update_sysctl();
5174 
5175 	/* Ensure any throttled groups are reachable by pick_next_task */
5176 	unthrottle_offline_cfs_rqs(rq);
5177 }
5178 
5179 #endif /* CONFIG_SMP */
5180 
5181 /*
5182  * scheduler tick hitting a task of our scheduling class:
5183  */
5184 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5185 {
5186 	struct cfs_rq *cfs_rq;
5187 	struct sched_entity *se = &curr->se;
5188 
5189 	for_each_sched_entity(se) {
5190 		cfs_rq = cfs_rq_of(se);
5191 		entity_tick(cfs_rq, se, queued);
5192 	}
5193 }

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/*
	 * Not only the cpu but also the task_group of the parent might have
	 * changed after parent->se.parent and parent->se.cfs_rq were copied
	 * to the child.  Call __set_task_cpu() so that the child's pointers
	 * refer to valid ones.
	 */
	rcu_read_lock();
	__set_task_cpu(p, this_cpu);
	rcu_read_unlock();

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
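
/*
 * The child leaves task_fork_fair() with a vruntime that is relative to
 * this cfs_rq's min_vruntime.  When wake_up_new_task() later enqueues it
 * (possibly on a different CPU), enqueue_entity() adds the destination
 * queue's min_vruntime back, roughly:
 *
 *	fork on CPU0:	se->vruntime -= min_vruntime(cpu0)
 *	wake on CPU3:	se->vruntime += min_vruntime(cpu3)
 *
 * so the child starts at a position that is fair relative to the queue
 * it actually lands on.
 */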

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (rq->curr == p) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * Ensure the task's vruntime is normalized, so that when it's
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
	 * have normalized the vruntime; if it's !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */
	if (!p->on_rq && p->state != TASK_RUNNING) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}
}
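
/*
 * Example: a sleeping SCHED_OTHER task is moved to SCHED_FIFO via
 * sched_setscheduler().  Its vruntime is normalized here, so if it is
 * switched back to the fair class while still asleep, the wakeup
 * enqueue can simply re-add the (by then possibly much larger)
 * min_vruntime instead of handing out an oversized sleeper bonus.
 */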

/*
 * We switched to the sched_fair class.
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	if (!p->se.on_rq)
		return;

	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (rq->curr == p)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}

/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}

void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
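
/*
 * min_vruntime starts just below the u64 wrap-around point rather than
 * at 0; one effect is that any vruntime comparison that is not done
 * through a wrap-safe signed 64-bit difference tends to misbehave soon
 * after boot instead of only after a very long uptime.
 */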

#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
	/*
	 * If the task was not on the rq at the time of this cgroup movement
	 * it must have been asleep; sleeping tasks keep their ->vruntime
	 * absolute on their old rq until wakeup (needed for the fair sleeper
	 * bonus in place_entity()).
	 *
	 * If it was on the rq, we've just 'preempted' it, which does convert
	 * ->vruntime to a relative base.
	 *
	 * Make sure both cases convert their relative position when migrating
	 * to another cgroup's rq. This does somewhat interfere with the
	 * fair sleeper stuff for the first placement, but who cares.
	 */
	/*
	 * When !on_rq, vruntime of the task has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - Moving a forked child that is waiting to be woken up by
	 *   wake_up_new_task().
	 * - Moving a task that has been woken up by try_to_wake_up() and is
	 *   still waiting to actually be woken up by sched_ttwu_pending().
	 *
	 * To prevent boost or penalty in the new cfs_rq caused by delta
	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
	 */
	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
		on_rq = 1;

	if (!on_rq)
		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
	set_task_rq(p, task_cpu(p));
	if (!on_rq)
		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
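
/*
 * Concretely: for a queued task the cgroup-attach path dequeues it
 * first (making ->vruntime relative), this function then retargets the
 * task's cfs_rq via set_task_rq(), and the subsequent enqueue makes
 * ->vruntime absolute again on the new queue.  For most sleeping tasks
 * the same subtract/re-add is done by hand above, since no
 * dequeue/enqueue pair runs until the task wakes up.
 */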

void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se;
	int i;

	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	/*
	 * Only empty task groups can be destroyed; so we can speculatively
	 * check on_list without danger of it being re-added.
	 */
	if (!tg->cfs_rq[cpu]->on_list)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
#ifdef CONFIG_SMP
	/* allow initial update_cfs_load() to truncate */
	cfs_rq->load_stamp = 1;
#endif
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent)
		se->cfs_rq = &rq->cfs;
	else
		se->cfs_rq = parent->my_q;

	se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
	se->parent = parent;
}
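
/*
 * The resulting per-CPU layout for one child group is:
 *
 *	rq->cfs (or the parent group's my_q)
 *	    `-- tg->se[cpu]                group entity, weight NICE_0_LOAD
 *	          `-- se->my_q == tg->cfs_rq[cpu]
 *	                  `-- the group's own tasks and child groups
 *
 * i.e. the group entity is queued on its parent's cfs_rq and in turn
 * owns the cfs_rq holding the group's members.
 */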

static DEFINE_MUTEX(shares_mutex);

int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
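
/*
 * This is normally reached from the cgroup cpu controller when user
 * space writes a group's shares file, e.g. (the exact path depends on
 * where the controller is mounted):
 *
 *	# mkdir /sys/fs/cgroup/cpu/build
 *	# echo 2048 > /sys/fs/cgroup/cpu/build/cpu.shares
 *
 * which gives "build" twice the default weight of 1024; the value is
 * clamped to [MIN_SHARES, MAX_SHARES] above before being applied.
 */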
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */


static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

	return rr_interval;
}
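
/*
 * This hook backs sched_rr_get_interval(2) for SCHED_OTHER tasks: user
 * space sees the task's current CFS slice rounded to jiffies rather
 * than a fixed round-robin quantum.  A minimal query from user space:
 *
 *	struct timespec ts;
 *	if (sched_rr_get_interval(pid, &ts) == 0)
 *		printf("%ld.%09ld s\n", ts.tv_sec, ts.tv_nsec);
 *
 * For a task alone on an otherwise idle runqueue the reported slice is 0.
 */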

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
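
/*
 * Like every scheduling class, fair_sched_class sits in a
 * priority-ordered singly linked list via .next: the core scheduler
 * asks the fair class to pick a task only after the higher-priority
 * classes (stop, rt) have declined, and falls through to
 * idle_sched_class when no CFS task is runnable.
 */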

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}