1 /*
2  * kernel/workqueue.c - generic async execution with shared worker pool
3  *
4  * Copyright (C) 2002		Ingo Molnar
5  *
6  *   Derived from the taskqueue/keventd code by:
7  *     David Woodhouse <dwmw2@infradead.org>
8  *     Andrew Morton
9  *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
10  *     Theodore Ts'o <tytso@mit.edu>
11  *
12  * Made to use alloc_percpu by Christoph Lameter.
13  *
14  * Copyright (C) 2010		SUSE Linux Products GmbH
15  * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
16  *
17  * This is the generic async execution mechanism.  Work items are
18  * executed in process context.  The worker pool is shared and
19  * automatically managed.  There is one worker pool for each CPU and
20  * one extra for works which are better served by workers which are
21  * not bound to any specific CPU.
22  *
23  * Please read Documentation/workqueue.txt for details.
24  */
25 
26 #include <linux/export.h>
27 #include <linux/kernel.h>
28 #include <linux/sched.h>
29 #include <linux/init.h>
30 #include <linux/signal.h>
31 #include <linux/completion.h>
32 #include <linux/workqueue.h>
33 #include <linux/slab.h>
34 #include <linux/cpu.h>
35 #include <linux/notifier.h>
36 #include <linux/kthread.h>
37 #include <linux/hardirq.h>
38 #include <linux/mempolicy.h>
39 #include <linux/freezer.h>
40 #include <linux/kallsyms.h>
41 #include <linux/debug_locks.h>
42 #include <linux/lockdep.h>
43 #include <linux/idr.h>
44 
45 #include "workqueue_sched.h"
46 
47 enum {
48 	/* global_cwq flags */
49 	GCWQ_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */
50 	GCWQ_MANAGING_WORKERS	= 1 << 1,	/* managing workers */
51 	GCWQ_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
52 	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */
53 	GCWQ_HIGHPRI_PENDING	= 1 << 4,	/* highpri works on queue */
54 
55 	/* worker flags */
56 	WORKER_STARTED		= 1 << 0,	/* started */
57 	WORKER_DIE		= 1 << 1,	/* die die die */
58 	WORKER_IDLE		= 1 << 2,	/* is idle */
59 	WORKER_PREP		= 1 << 3,	/* preparing to run works */
60 	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */
61 	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */
62 	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
63 	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
64 
65 	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
66 				  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
67 
68 	/* gcwq->trustee_state */
69 	TRUSTEE_START		= 0,		/* start */
70 	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
71 	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
72 	TRUSTEE_RELEASE		= 3,		/* release workers */
73 	TRUSTEE_DONE		= 4,		/* trustee is done */
74 
75 	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
76 	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
77 	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,
78 
79 	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
80 	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
81 
82 	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
83 						/* call for help after 10ms
84 						   (min two ticks) */
85 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
86 	CREATE_COOLDOWN		= HZ,		/* time to breathe after fail */
87 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
88 
89 	/*
90 	 * Rescue workers are used only in emergencies and shared by
91 	 * all cpus.  Give -20.
92 	 */
93 	RESCUER_NICE_LEVEL	= -20,
94 };
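
/*
 * A note on the timeout arithmetic above (illustrative only):
 * MAYDAY_INITIAL_TIMEOUT is HZ / 100 clamped to a minimum of two ticks,
 * so with HZ == 1000 it comes out to 10 ticks (10ms) while with
 * HZ == 100 the 1-tick result is bumped up to 2 ticks (20ms).
 * MAYDAY_INTERVAL (HZ / 10) and IDLE_WORKER_TIMEOUT (300 * HZ) are
 * likewise tick counts corresponding to 100ms and 5 minutes.
 */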
95 
96 /*
97  * Structure fields follow one of the following exclusion rules.
98  *
99  * I: Modifiable by initialization/destruction paths and read-only for
100  *    everyone else.
101  *
102  * P: Preemption protected.  Disabling preemption is enough and should
103  *    only be modified and accessed from the local cpu.
104  *
105  * L: gcwq->lock protected.  Access with gcwq->lock held.
106  *
107  * X: During normal operation, modification requires gcwq->lock and
108  *    should be done only from local cpu.  Either disabling preemption
109  *    on local cpu or grabbing gcwq->lock is enough for read access.
110  *    If GCWQ_DISASSOCIATED is set, it's identical to L.
111  *
112  * F: wq->flush_mutex protected.
113  *
114  * W: workqueue_lock protected.
115  */
116 
117 struct global_cwq;
118 
119 /*
120  * The poor guys doing the actual heavy lifting.  All on-duty workers
121  * are either serving the manager role, on idle list or on busy hash.
122  */
123 struct worker {
124 	/* on idle list while idle, on busy hash table while busy */
125 	union {
126 		struct list_head	entry;	/* L: while idle */
127 		struct hlist_node	hentry;	/* L: while busy */
128 	};
129 
130 	struct work_struct	*current_work;	/* L: work being processed */
131 	work_func_t		current_func;	/* L: current_work's fn */
132 	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
133 	struct list_head	scheduled;	/* L: scheduled works */
134 	struct task_struct	*task;		/* I: worker task */
135 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
136 	/* 64 bytes boundary on 64bit, 32 on 32bit */
137 	unsigned long		last_active;	/* L: last active timestamp */
138 	unsigned int		flags;		/* X: flags */
139 	int			id;		/* I: worker id */
140 	struct work_struct	rebind_work;	/* L: rebind worker to cpu */
141 };
142 
143 /*
144  * Global per-cpu workqueue.  There's one and only one for each cpu
145  * and all works are queued and processed here regardless of their
146  * target workqueues.
147  */
148 struct global_cwq {
149 	spinlock_t		lock;		/* the gcwq lock */
150 	struct list_head	worklist;	/* L: list of pending works */
151 	unsigned int		cpu;		/* I: the associated cpu */
152 	unsigned int		flags;		/* L: GCWQ_* flags */
153 
154 	int			nr_workers;	/* L: total number of workers */
155 	int			nr_idle;	/* L: currently idle ones */
156 
157 	/* workers are chained either in the idle_list or busy_hash */
158 	struct list_head	idle_list;	/* X: list of idle workers */
159 	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
160 						/* L: hash of busy workers */
161 
162 	struct timer_list	idle_timer;	/* L: worker idle timeout */
163 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
164 
165 	struct ida		worker_ida;	/* L: for worker IDs */
166 
167 	struct task_struct	*trustee;	/* L: for gcwq shutdown */
168 	unsigned int		trustee_state;	/* L: trustee state */
169 	wait_queue_head_t	trustee_wait;	/* trustee wait */
170 	struct worker		*first_idle;	/* L: first idle worker */
171 } ____cacheline_aligned_in_smp;
172 
173 /*
174  * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
175  * work_struct->data are used for flags and thus cwqs need to be
176  * aligned on a (1 << WORK_STRUCT_FLAG_BITS) byte boundary.
177  */
178 struct cpu_workqueue_struct {
179 	struct global_cwq	*gcwq;		/* I: the associated gcwq */
180 	struct workqueue_struct *wq;		/* I: the owning workqueue */
181 	int			work_color;	/* L: current color */
182 	int			flush_color;	/* L: flushing color */
183 	int			nr_in_flight[WORK_NR_COLORS];
184 						/* L: nr of in_flight works */
185 	int			nr_active;	/* L: nr of active works */
186 	int			max_active;	/* L: max active works */
187 	struct list_head	delayed_works;	/* L: delayed works */
188 };
189 
190 /*
191  * Structure used to wait for workqueue flush.
192  */
193 struct wq_flusher {
194 	struct list_head	list;		/* F: list of flushers */
195 	int			flush_color;	/* F: flush color waiting for */
196 	struct completion	done;		/* flush completion */
197 };
198 
199 /*
200  * All cpumasks are assumed to be always set on UP and thus can't be
201  * used to determine whether there's something to be done.
202  */
203 #ifdef CONFIG_SMP
204 typedef cpumask_var_t mayday_mask_t;
205 #define mayday_test_and_set_cpu(cpu, mask)	\
206 	cpumask_test_and_set_cpu((cpu), (mask))
207 #define mayday_clear_cpu(cpu, mask)		cpumask_clear_cpu((cpu), (mask))
208 #define for_each_mayday_cpu(cpu, mask)		for_each_cpu((cpu), (mask))
209 #define alloc_mayday_mask(maskp, gfp)		zalloc_cpumask_var((maskp), (gfp))
210 #define free_mayday_mask(mask)			free_cpumask_var((mask))
211 #else
212 typedef unsigned long mayday_mask_t;
213 #define mayday_test_and_set_cpu(cpu, mask)	test_and_set_bit(0, &(mask))
214 #define mayday_clear_cpu(cpu, mask)		clear_bit(0, &(mask))
215 #define for_each_mayday_cpu(cpu, mask)		if ((cpu) = 0, (mask))
216 #define alloc_mayday_mask(maskp, gfp)		true
217 #define free_mayday_mask(mask)			do { } while (0)
218 #endif
219 
220 /*
221  * The externally visible workqueue abstraction is an array of
222  * per-CPU workqueues:
223  */
224 struct workqueue_struct {
225 	unsigned int		flags;		/* W: WQ_* flags */
226 	union {
227 		struct cpu_workqueue_struct __percpu	*pcpu;
228 		struct cpu_workqueue_struct		*single;
229 		unsigned long				v;
230 	} cpu_wq;				/* I: cwq's */
231 	struct list_head	list;		/* W: list of all workqueues */
232 
233 	struct mutex		flush_mutex;	/* protects wq flushing */
234 	int			work_color;	/* F: current work color */
235 	int			flush_color;	/* F: current flush color */
236 	atomic_t		nr_cwqs_to_flush; /* flush in progress */
237 	struct wq_flusher	*first_flusher;	/* F: first flusher */
238 	struct list_head	flusher_queue;	/* F: flush waiters */
239 	struct list_head	flusher_overflow; /* F: flush overflow list */
240 
241 	mayday_mask_t		mayday_mask;	/* cpus requesting rescue */
242 	struct worker		*rescuer;	/* I: rescue worker */
243 
244 	int			nr_drainers;	/* W: drain in progress */
245 	int			saved_max_active; /* W: saved cwq max_active */
246 #ifdef CONFIG_LOCKDEP
247 	struct lockdep_map	lockdep_map;
248 #endif
249 	char			name[];		/* I: workqueue name */
250 };
251 
252 struct workqueue_struct *system_wq __read_mostly;
253 struct workqueue_struct *system_long_wq __read_mostly;
254 struct workqueue_struct *system_nrt_wq __read_mostly;
255 struct workqueue_struct *system_unbound_wq __read_mostly;
256 struct workqueue_struct *system_freezable_wq __read_mostly;
257 struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
258 EXPORT_SYMBOL_GPL(system_wq);
259 EXPORT_SYMBOL_GPL(system_long_wq);
260 EXPORT_SYMBOL_GPL(system_nrt_wq);
261 EXPORT_SYMBOL_GPL(system_unbound_wq);
262 EXPORT_SYMBOL_GPL(system_freezable_wq);
263 EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264 
265 #define CREATE_TRACE_POINTS
266 #include <trace/events/workqueue.h>
267 
268 #define for_each_busy_worker(worker, i, pos, gcwq)			\
269 	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
270 		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
271 
272 static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
273 				  unsigned int sw)
274 {
275 	if (cpu < nr_cpu_ids) {
276 		if (sw & 1) {
277 			cpu = cpumask_next(cpu, mask);
278 			if (cpu < nr_cpu_ids)
279 				return cpu;
280 		}
281 		if (sw & 2)
282 			return WORK_CPU_UNBOUND;
283 	}
284 	return WORK_CPU_NONE;
285 }
286 
287 static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
288 				struct workqueue_struct *wq)
289 {
290 	return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
291 }
292 
293 /*
294  * CPU iterators
295  *
296  * An extra gcwq is defined for an invalid cpu number
297  * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
298  * specific CPU.  The following iterators are similar to
299  * for_each_*_cpu() iterators but also considers the unbound gcwq.
300  *
301  * for_each_gcwq_cpu()		: possible CPUs + WORK_CPU_UNBOUND
302  * for_each_online_gcwq_cpu()	: online CPUs + WORK_CPU_UNBOUND
303  * for_each_cwq_cpu()		: possible CPUs for bound workqueues,
304  *				  WORK_CPU_UNBOUND for unbound workqueues
305  */
306 #define for_each_gcwq_cpu(cpu)						\
307 	for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);		\
308 	     (cpu) < WORK_CPU_NONE;					\
309 	     (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
310 
311 #define for_each_online_gcwq_cpu(cpu)					\
312 	for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);		\
313 	     (cpu) < WORK_CPU_NONE;					\
314 	     (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
315 
316 #define for_each_cwq_cpu(cpu, wq)					\
317 	for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));	\
318 	     (cpu) < WORK_CPU_NONE;					\
319 	     (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
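
/*
 * Illustrative usage sketch of the iterators above: a bound workqueue
 * visits every possible CPU while an unbound one visits only
 * WORK_CPU_UNBOUND, e.g.
 *
 *	for_each_cwq_cpu(cpu, wq)
 *		init_one_cwq(get_cwq(cpu, wq));
 *
 * where init_one_cwq() is a made-up helper, not something defined in
 * this file.
 */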
320 
321 #ifdef CONFIG_DEBUG_OBJECTS_WORK
322 
323 static struct debug_obj_descr work_debug_descr;
324 
325 static void *work_debug_hint(void *addr)
326 {
327 	return ((struct work_struct *) addr)->func;
328 }
329 
330 /*
331  * fixup_init is called when:
332  * - an active object is initialized
333  */
334 static int work_fixup_init(void *addr, enum debug_obj_state state)
335 {
336 	struct work_struct *work = addr;
337 
338 	switch (state) {
339 	case ODEBUG_STATE_ACTIVE:
340 		cancel_work_sync(work);
341 		debug_object_init(work, &work_debug_descr);
342 		return 1;
343 	default:
344 		return 0;
345 	}
346 }
347 
348 /*
349  * fixup_activate is called when:
350  * - an active object is activated
351  * - an unknown object is activated (might be a statically initialized object)
352  */
353 static int work_fixup_activate(void *addr, enum debug_obj_state state)
354 {
355 	struct work_struct *work = addr;
356 
357 	switch (state) {
358 
359 	case ODEBUG_STATE_NOTAVAILABLE:
360 		/*
361 		 * This is not really a fixup. The work struct was
362 		 * statically initialized. We just make sure that it
363 		 * is tracked in the object tracker.
364 		 */
365 		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
366 			debug_object_init(work, &work_debug_descr);
367 			debug_object_activate(work, &work_debug_descr);
368 			return 0;
369 		}
370 		WARN_ON_ONCE(1);
371 		return 0;
372 
373 	case ODEBUG_STATE_ACTIVE:
374 		WARN_ON(1);
375 
376 	default:
377 		return 0;
378 	}
379 }
380 
381 /*
382  * fixup_free is called when:
383  * - an active object is freed
384  */
385 static int work_fixup_free(void *addr, enum debug_obj_state state)
386 {
387 	struct work_struct *work = addr;
388 
389 	switch (state) {
390 	case ODEBUG_STATE_ACTIVE:
391 		cancel_work_sync(work);
392 		debug_object_free(work, &work_debug_descr);
393 		return 1;
394 	default:
395 		return 0;
396 	}
397 }
398 
399 static struct debug_obj_descr work_debug_descr = {
400 	.name		= "work_struct",
401 	.debug_hint	= work_debug_hint,
402 	.fixup_init	= work_fixup_init,
403 	.fixup_activate	= work_fixup_activate,
404 	.fixup_free	= work_fixup_free,
405 };
406 
407 static inline void debug_work_activate(struct work_struct *work)
408 {
409 	debug_object_activate(work, &work_debug_descr);
410 }
411 
412 static inline void debug_work_deactivate(struct work_struct *work)
413 {
414 	debug_object_deactivate(work, &work_debug_descr);
415 }
416 
417 void __init_work(struct work_struct *work, int onstack)
418 {
419 	if (onstack)
420 		debug_object_init_on_stack(work, &work_debug_descr);
421 	else
422 		debug_object_init(work, &work_debug_descr);
423 }
424 EXPORT_SYMBOL_GPL(__init_work);
425 
426 void destroy_work_on_stack(struct work_struct *work)
427 {
428 	debug_object_free(work, &work_debug_descr);
429 }
430 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
431 
432 #else
433 static inline void debug_work_activate(struct work_struct *work) { }
434 static inline void debug_work_deactivate(struct work_struct *work) { }
435 #endif
436 
437 /* Serializes the accesses to the list of workqueues. */
438 static DEFINE_SPINLOCK(workqueue_lock);
439 static LIST_HEAD(workqueues);
440 static bool workqueue_freezing;		/* W: have wqs started freezing? */
441 
442 /*
443  * The almighty global cpu workqueues.  nr_running is the only field
444  * which is expected to be used frequently by other cpus via
445  * try_to_wake_up().  Put it in a separate cacheline.
446  */
447 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
448 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
449 
450 /*
451  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
452  * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
453  * workers have WORKER_UNBOUND set.
454  */
455 static struct global_cwq unbound_global_cwq;
456 static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
457 
458 static int worker_thread(void *__worker);
459 
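/*
 * get_gcwq - return the gcwq for @cpu, or the unbound gcwq when @cpu is
 * WORK_CPU_UNBOUND.  The returned structures are statically allocated
 * and live for the lifetime of the system.
 */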
460 static struct global_cwq *get_gcwq(unsigned int cpu)
461 {
462 	if (cpu != WORK_CPU_UNBOUND)
463 		return &per_cpu(global_cwq, cpu);
464 	else
465 		return &unbound_global_cwq;
466 }
467 
468 static atomic_t *get_gcwq_nr_running(unsigned int cpu)
469 {
470 	if (cpu != WORK_CPU_UNBOUND)
471 		return &per_cpu(gcwq_nr_running, cpu);
472 	else
473 		return &unbound_gcwq_nr_running;
474 }
475 
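/*
 * get_cwq - return @wq's cwq for @cpu.  Bound workqueues have one cwq
 * per possible CPU in wq->cpu_wq.pcpu; unbound workqueues have a single
 * cwq reached via WORK_CPU_UNBOUND.  Returns NULL for an invalid
 * cpu/workqueue combination.
 */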
476 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
477 					    struct workqueue_struct *wq)
478 {
479 	if (!(wq->flags & WQ_UNBOUND)) {
480 		if (likely(cpu < nr_cpu_ids))
481 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
482 	} else if (likely(cpu == WORK_CPU_UNBOUND))
483 		return wq->cpu_wq.single;
484 	return NULL;
485 }
486 
487 static unsigned int work_color_to_flags(int color)
488 {
489 	return color << WORK_STRUCT_COLOR_SHIFT;
490 }
491 
492 static int get_work_color(struct work_struct *work)
493 {
494 	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
495 		((1 << WORK_STRUCT_COLOR_BITS) - 1);
496 }
497 
498 static int work_next_color(int color)
499 {
500 	return (color + 1) % WORK_NR_COLORS;
501 }
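
/*
 * The three helpers above implement work coloring: when a work is
 * queued, the cwq's current work_color is folded into work->data via
 * work_color_to_flags(), and get_work_color() recovers it when the work
 * retires so the right nr_in_flight[] slot can be decremented.
 * Illustrative example: a work queued while cwq->work_color == 2 keeps
 * color 2 until it finishes, even if a later flush advances
 * cwq->work_color to work_next_color(2) == 3.
 */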
502 
503 /*
504  * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
505  * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
506  * cleared and the work data contains the cpu number it was last on.
507  *
508  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
509  * cwq, cpu or clear work->data.  These functions should only be
510  * called while the work is owned - ie. while the PENDING bit is set.
511  *
512  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
513  * corresponding to a work.  gcwq is available once the work has been
514  * queued anywhere after initialization.  cwq is available only from
515  * queueing until execution starts.
516  */
517 static inline void set_work_data(struct work_struct *work, unsigned long data,
518 				 unsigned long flags)
519 {
520 	BUG_ON(!work_pending(work));
521 	atomic_long_set(&work->data, data | flags | work_static(work));
522 }
523 
524 static void set_work_cwq(struct work_struct *work,
525 			 struct cpu_workqueue_struct *cwq,
526 			 unsigned long extra_flags)
527 {
528 	set_work_data(work, (unsigned long)cwq,
529 		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
530 }
531 
532 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
533 {
534 	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
535 }
536 
537 static void clear_work_data(struct work_struct *work)
538 {
539 	set_work_data(work, WORK_STRUCT_NO_CPU, 0);
540 }
541 
542 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
543 {
544 	unsigned long data = atomic_long_read(&work->data);
545 
546 	if (data & WORK_STRUCT_CWQ)
547 		return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
548 	else
549 		return NULL;
550 }
551 
552 static struct global_cwq *get_work_gcwq(struct work_struct *work)
553 {
554 	unsigned long data = atomic_long_read(&work->data);
555 	unsigned int cpu;
556 
557 	if (data & WORK_STRUCT_CWQ)
558 		return ((struct cpu_workqueue_struct *)
559 			(data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
560 
561 	cpu = data >> WORK_STRUCT_FLAG_BITS;
562 	if (cpu == WORK_CPU_NONE)
563 		return NULL;
564 
565 	BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
566 	return get_gcwq(cpu);
567 }
568 
569 /*
570  * Policy functions.  These define the policies on how the global
571  * worker pool is managed.  Unless noted otherwise, these functions
572  * assume that they're being called with gcwq->lock held.
573  */
574 
575 static bool __need_more_worker(struct global_cwq *gcwq)
576 {
577 	return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
578 		gcwq->flags & GCWQ_HIGHPRI_PENDING;
579 }
580 
581 /*
582  * Need to wake up a worker?  Called from anything but currently
583  * running workers.
584  */
585 static bool need_more_worker(struct global_cwq *gcwq)
586 {
587 	return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
588 }
589 
590 /* Can I start working?  Called from busy but !running workers. */
591 static bool may_start_working(struct global_cwq *gcwq)
592 {
593 	return gcwq->nr_idle;
594 }
595 
596 /* Do I need to keep working?  Called from currently running workers. */
597 static bool keep_working(struct global_cwq *gcwq)
598 {
599 	atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
600 
601 	return !list_empty(&gcwq->worklist) &&
602 		(atomic_read(nr_running) <= 1 ||
603 		 gcwq->flags & GCWQ_HIGHPRI_PENDING);
604 }
605 
606 /* Do we need a new worker?  Called from manager. */
607 static bool need_to_create_worker(struct global_cwq *gcwq)
608 {
609 	return need_more_worker(gcwq) && !may_start_working(gcwq);
610 }
611 
612 /* Do I need to be the manager? */
613 static bool need_to_manage_workers(struct global_cwq *gcwq)
614 {
615 	return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
616 }
617 
618 /* Do we have too many workers and should some go away? */
619 static bool too_many_workers(struct global_cwq *gcwq)
620 {
621 	bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
622 	int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
623 	int nr_busy = gcwq->nr_workers - nr_idle;
624 
625 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
626 }
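
/*
 * Worked example for too_many_workers() (numbers illustrative): with
 * MAX_IDLE_WORKERS_RATIO == 4 and 16 busy workers, up to 5 idle workers
 * are tolerated; a 6th makes the function return true because
 * (6 - 2) * 4 >= 16, and the idle timeout machinery will start
 * retiring the excess.
 */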
627 
628 /*
629  * Wake up functions.
630  */
631 
632 /* Return the first worker.  Safe with preemption disabled */
633 static struct worker *first_worker(struct global_cwq *gcwq)
634 {
635 	if (unlikely(list_empty(&gcwq->idle_list)))
636 		return NULL;
637 
638 	return list_first_entry(&gcwq->idle_list, struct worker, entry);
639 }
640 
641 /**
642  * wake_up_worker - wake up an idle worker
643  * @gcwq: gcwq to wake worker for
644  *
645  * Wake up the first idle worker of @gcwq.
646  *
647  * CONTEXT:
648  * spin_lock_irq(gcwq->lock).
649  */
650 static void wake_up_worker(struct global_cwq *gcwq)
651 {
652 	struct worker *worker = first_worker(gcwq);
653 
654 	if (likely(worker))
655 		wake_up_process(worker->task);
656 }
657 
658 /**
659  * wq_worker_waking_up - a worker is waking up
660  * @task: task waking up
661  * @cpu: CPU @task is waking up to
662  *
663  * This function is called during try_to_wake_up() when a worker is
664  * being awoken.
665  *
666  * CONTEXT:
667  * spin_lock_irq(rq->lock)
668  */
669 void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 {
671 	struct worker *worker = kthread_data(task);
672 
673 	if (!(worker->flags & WORKER_NOT_RUNNING))
674 		atomic_inc(get_gcwq_nr_running(cpu));
675 }
676 
677 /**
678  * wq_worker_sleeping - a worker is going to sleep
679  * @task: task going to sleep
680  * @cpu: CPU in question, must be the current CPU number
681  *
682  * This function is called during schedule() when a busy worker is
683  * going to sleep.  A worker on the same cpu can be woken up by
684  * returning a pointer to its task.
685  *
686  * CONTEXT:
687  * spin_lock_irq(rq->lock)
688  *
689  * RETURNS:
690  * Worker task on @cpu to wake up, %NULL if none.
691  */
692 struct task_struct *wq_worker_sleeping(struct task_struct *task,
693 				       unsigned int cpu)
694 {
695 	struct worker *worker = kthread_data(task), *to_wakeup = NULL;
696 	struct global_cwq *gcwq = get_gcwq(cpu);
697 	atomic_t *nr_running = get_gcwq_nr_running(cpu);
698 
699 	if (worker->flags & WORKER_NOT_RUNNING)
700 		return NULL;
701 
702 	/* this can only happen on the local cpu */
703 	BUG_ON(cpu != raw_smp_processor_id());
704 
705 	/*
706 	 * The counterpart of the following dec_and_test, implied mb,
707 	 * worklist not empty test sequence is in insert_work().
708 	 * Please read comment there.
709 	 *
710 	 * NOT_RUNNING is clear.  This means that trustee is not in
711 	 * charge and we're running on the local cpu w/ rq lock held
712 	 * and preemption disabled, which in turn means that nobody else
713 	 * could be manipulating idle_list, so dereferencing idle_list
714 	 * without gcwq lock is safe.
715 	 */
716 	if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
717 		to_wakeup = first_worker(gcwq);
718 	return to_wakeup ? to_wakeup->task : NULL;
719 }
720 
721 /**
722  * worker_set_flags - set worker flags and adjust nr_running accordingly
723  * @worker: self
724  * @flags: flags to set
725  * @wakeup: wakeup an idle worker if necessary
726  *
727  * Set @flags in @worker->flags and adjust nr_running accordingly.  If
728  * nr_running becomes zero and @wakeup is %true, an idle worker is
729  * woken up.
730  *
731  * CONTEXT:
732  * spin_lock_irq(gcwq->lock)
733  */
734 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
735 				    bool wakeup)
736 {
737 	struct global_cwq *gcwq = worker->gcwq;
738 
739 	WARN_ON_ONCE(worker->task != current);
740 
741 	/*
742 	 * If transitioning into NOT_RUNNING, adjust nr_running and
743 	 * wake up an idle worker as necessary if requested by
744 	 * @wakeup.
745 	 */
746 	if ((flags & WORKER_NOT_RUNNING) &&
747 	    !(worker->flags & WORKER_NOT_RUNNING)) {
748 		atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
749 
750 		if (wakeup) {
751 			if (atomic_dec_and_test(nr_running) &&
752 			    !list_empty(&gcwq->worklist))
753 				wake_up_worker(gcwq);
754 		} else
755 			atomic_dec(nr_running);
756 	}
757 
758 	worker->flags |= flags;
759 }
760 
761 /**
762  * worker_clr_flags - clear worker flags and adjust nr_running accordingly
763  * @worker: self
764  * @flags: flags to clear
765  *
766  * Clear @flags in @worker->flags and adjust nr_running accordingly.
767  *
768  * CONTEXT:
769  * spin_lock_irq(gcwq->lock)
770  */
771 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
772 {
773 	struct global_cwq *gcwq = worker->gcwq;
774 	unsigned int oflags = worker->flags;
775 
776 	WARN_ON_ONCE(worker->task != current);
777 
778 	worker->flags &= ~flags;
779 
780 	/*
781 	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
782 	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
783 	 * of multiple flags, not a single flag.
784 	 */
785 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
786 		if (!(worker->flags & WORKER_NOT_RUNNING))
787 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
788 }
789 
790 /**
791  * busy_worker_head - return the busy hash head for a work
792  * @gcwq: gcwq of interest
793  * @work: work to be hashed
794  *
795  * Return hash head of @gcwq for @work.
796  *
797  * CONTEXT:
798  * spin_lock_irq(gcwq->lock).
799  *
800  * RETURNS:
801  * Pointer to the hash head.
802  */
803 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
804 					   struct work_struct *work)
805 {
806 	const int base_shift = ilog2(sizeof(struct work_struct));
807 	unsigned long v = (unsigned long)work;
808 
809 	/* simple shift and fold hash, do we need something better? */
810 	v >>= base_shift;
811 	v += v >> BUSY_WORKER_HASH_ORDER;
812 	v &= BUSY_WORKER_HASH_MASK;
813 
814 	return &gcwq->busy_hash[v];
815 }
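
/*
 * For reference, with BUSY_WORKER_HASH_ORDER == 6 the shift-and-fold
 * above yields a 6 bit index, i.e. one of the 64 busy_hash buckets;
 * the initial shift drops the low-order address bits which carry
 * little entropy given the size of work_struct.
 */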
816 
817 /**
818  * __find_worker_executing_work - find worker which is executing a work
819  * @gcwq: gcwq of interest
820  * @bwh: hash head as returned by busy_worker_head()
821  * @work: work to find worker for
822  *
823  * Find a worker which is executing @work on @gcwq.  @bwh should be
824  * the hash head obtained by calling busy_worker_head() with the same
825  * work.
826  *
827  * CONTEXT:
828  * spin_lock_irq(gcwq->lock).
829  *
830  * RETURNS:
831  * Pointer to worker which is executing @work if found, NULL
832  * otherwise.
833  */
834 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
835 						   struct hlist_head *bwh,
836 						   struct work_struct *work)
837 {
838 	struct worker *worker;
839 	struct hlist_node *tmp;
840 
841 	hlist_for_each_entry(worker, tmp, bwh, hentry)
842 		if (worker->current_work == work &&
843 		    worker->current_func == work->func)
844 			return worker;
845 	return NULL;
846 }
847 
848 /**
849  * find_worker_executing_work - find worker which is executing a work
850  * @gcwq: gcwq of interest
851  * @work: work to find worker for
852  *
853  * Find a worker which is executing @work on @gcwq by searching
854  * @gcwq->busy_hash which is keyed by the address of @work.  For a worker
855  * to match, its current execution should match the address of @work and
856  * its work function.  This is to avoid unwanted dependency between
857  * unrelated work executions through a work item being recycled while still
858  * being executed.
859  *
860  * This is a bit tricky.  A work item may be freed once its execution
861  * starts and nothing prevents the freed area from being recycled for
862  * another work item.  If the same work item address ends up being reused
863  * before the original execution finishes, workqueue will identify the
864  * recycled work item as currently executing and make it wait until the
865  * current execution finishes, introducing an unwanted dependency.
866  *
867  * This function checks the work item address, work function and workqueue
868  * to avoid false positives.  Note that this isn't complete as one may
869  * construct a work function which can introduce dependency onto itself
870  * through a recycled work item.  Well, if somebody wants to shoot oneself
871  * in the foot that badly, there's only so much we can do, and if such
872  * deadlock actually occurs, it should be easy to locate the culprit work
873  * function.
874  *
875  * CONTEXT:
876  * spin_lock_irq(gcwq->lock).
877  *
878  * RETURNS:
879  * Pointer to worker which is executing @work if found, NULL
880  * otherwise.
881  */
882 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
883 						 struct work_struct *work)
884 {
885 	return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
886 					    work);
887 }
888 
889 /**
890  * gcwq_determine_ins_pos - find insertion position
891  * @gcwq: gcwq of interest
892  * @cwq: cwq a work is being queued for
893  *
894  * A work for @cwq is about to be queued on @gcwq, determine insertion
895  * position for the work.  If @cwq is for HIGHPRI wq, the work is
896  * queued at the head of the queue but in FIFO order with respect to
897  * other HIGHPRI works; otherwise, at the end of the queue.  This
898  * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
899  * there are HIGHPRI works pending.
900  *
901  * CONTEXT:
902  * spin_lock_irq(gcwq->lock).
903  *
904  * RETURNS:
905  * Pointer to insertion position.
906  */
907 static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
908 					       struct cpu_workqueue_struct *cwq)
909 {
910 	struct work_struct *twork;
911 
912 	if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
913 		return &gcwq->worklist;
914 
915 	list_for_each_entry(twork, &gcwq->worklist, entry) {
916 		struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
917 
918 		if (!(tcwq->wq->flags & WQ_HIGHPRI))
919 			break;
920 	}
921 
922 	gcwq->flags |= GCWQ_HIGHPRI_PENDING;
923 	return &twork->entry;
924 }
925 
926 /**
927  * insert_work - insert a work into gcwq
928  * @cwq: cwq @work belongs to
929  * @work: work to insert
930  * @head: insertion point
931  * @extra_flags: extra WORK_STRUCT_* flags to set
932  *
933  * Insert @work which belongs to @cwq into @gcwq after @head.
934  * @extra_flags is or'd to work_struct flags.
935  *
936  * CONTEXT:
937  * spin_lock_irq(gcwq->lock).
938  */
939 static void insert_work(struct cpu_workqueue_struct *cwq,
940 			struct work_struct *work, struct list_head *head,
941 			unsigned int extra_flags)
942 {
943 	struct global_cwq *gcwq = cwq->gcwq;
944 
945 	/* we own @work, set data and link */
946 	set_work_cwq(work, cwq, extra_flags);
947 
948 	/*
949 	 * Ensure that we get the right work->data if we see the
950 	 * result of list_add() below, see try_to_grab_pending().
951 	 */
952 	smp_wmb();
953 
954 	list_add_tail(&work->entry, head);
955 
956 	/*
957 	 * Ensure either worker_sched_deactivated() sees the above
958 	 * list_add_tail() or we see zero nr_running to avoid workers
959 	 * lying around lazily while there are works to be processed.
960 	 */
961 	smp_mb();
962 
963 	if (__need_more_worker(gcwq))
964 		wake_up_worker(gcwq);
965 }
966 
967 /*
968  * Test whether @work is being queued from another work executing on the
969  * same workqueue.  This is rather expensive and should only be used from
970  * cold paths.
971  */
972 static bool is_chained_work(struct workqueue_struct *wq)
973 {
974 	unsigned long flags;
975 	unsigned int cpu;
976 
977 	for_each_gcwq_cpu(cpu) {
978 		struct global_cwq *gcwq = get_gcwq(cpu);
979 		struct worker *worker;
980 		struct hlist_node *pos;
981 		int i;
982 
983 		spin_lock_irqsave(&gcwq->lock, flags);
984 		for_each_busy_worker(worker, i, pos, gcwq) {
985 			if (worker->task != current)
986 				continue;
987 			spin_unlock_irqrestore(&gcwq->lock, flags);
988 			/*
989 			 * I'm @worker, no locking necessary.  See if @work
990 			 * is headed to the same workqueue.
991 			 */
992 			return worker->current_cwq->wq == wq;
993 		}
994 		spin_unlock_irqrestore(&gcwq->lock, flags);
995 	}
996 	return false;
997 }
998 
999 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1000 			 struct work_struct *work)
1001 {
1002 	struct global_cwq *gcwq;
1003 	struct cpu_workqueue_struct *cwq;
1004 	struct list_head *worklist;
1005 	unsigned int work_flags;
1006 	unsigned long flags;
1007 
1008 	debug_work_activate(work);
1009 
1010 	/* if dying, only works from the same workqueue are allowed */
1011 	if (unlikely(wq->flags & WQ_DRAINING) &&
1012 	    WARN_ON_ONCE(!is_chained_work(wq)))
1013 		return;
1014 
1015 	/* determine gcwq to use */
1016 	if (!(wq->flags & WQ_UNBOUND)) {
1017 		struct global_cwq *last_gcwq;
1018 
1019 		if (unlikely(cpu == WORK_CPU_UNBOUND))
1020 			cpu = raw_smp_processor_id();
1021 
1022 		/*
1023 		 * It's multi cpu.  If @wq is non-reentrant and @work
1024 		 * was previously on a different cpu, it might still
1025 		 * be running there, in which case the work needs to
1026 		 * be queued on that cpu to guarantee non-reentrance.
1027 		 */
1028 		gcwq = get_gcwq(cpu);
1029 		if (wq->flags & WQ_NON_REENTRANT &&
1030 		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1031 			struct worker *worker;
1032 
1033 			spin_lock_irqsave(&last_gcwq->lock, flags);
1034 
1035 			worker = find_worker_executing_work(last_gcwq, work);
1036 
1037 			if (worker && worker->current_cwq->wq == wq)
1038 				gcwq = last_gcwq;
1039 			else {
1040 				/* meh... not running there, queue here */
1041 				spin_unlock_irqrestore(&last_gcwq->lock, flags);
1042 				spin_lock_irqsave(&gcwq->lock, flags);
1043 			}
1044 		} else
1045 			spin_lock_irqsave(&gcwq->lock, flags);
1046 	} else {
1047 		gcwq = get_gcwq(WORK_CPU_UNBOUND);
1048 		spin_lock_irqsave(&gcwq->lock, flags);
1049 	}
1050 
1051 	/* gcwq determined, get cwq and queue */
1052 	cwq = get_cwq(gcwq->cpu, wq);
1053 	trace_workqueue_queue_work(cpu, cwq, work);
1054 
1055 	BUG_ON(!list_empty(&work->entry));
1056 
1057 	cwq->nr_in_flight[cwq->work_color]++;
1058 	work_flags = work_color_to_flags(cwq->work_color);
1059 
1060 	if (likely(cwq->nr_active < cwq->max_active)) {
1061 		trace_workqueue_activate_work(work);
1062 		cwq->nr_active++;
1063 		worklist = gcwq_determine_ins_pos(gcwq, cwq);
1064 	} else {
1065 		work_flags |= WORK_STRUCT_DELAYED;
1066 		worklist = &cwq->delayed_works;
1067 	}
1068 
1069 	insert_work(cwq, work, worklist, work_flags);
1070 
1071 	spin_unlock_irqrestore(&gcwq->lock, flags);
1072 }
1073 
1074 /**
1075  * queue_work - queue work on a workqueue
1076  * @wq: workqueue to use
1077  * @work: work to queue
1078  *
1079  * Returns 0 if @work was already on a queue, non-zero otherwise.
1080  *
1081  * We queue the work to the CPU on which it was submitted, but if the CPU dies
1082  * it can be processed by another CPU.
1083  */
1084 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1085 {
1086 	int ret;
1087 
1088 	ret = queue_work_on(get_cpu(), wq, work);
1089 	put_cpu();
1090 
1091 	return ret;
1092 }
1093 EXPORT_SYMBOL_GPL(queue_work);
1094 
1095 /**
1096  * queue_work_on - queue work on specific cpu
1097  * @cpu: CPU number to execute work on
1098  * @wq: workqueue to use
1099  * @work: work to queue
1100  *
1101  * Returns 0 if @work was already on a queue, non-zero otherwise.
1102  *
1103  * We queue the work to a specific CPU, the caller must ensure it
1104  * can't go away.
1105  */
1106 int
1107 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1108 {
1109 	int ret = 0;
1110 
1111 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1112 		__queue_work(cpu, wq, work);
1113 		ret = 1;
1114 	}
1115 	return ret;
1116 }
1117 EXPORT_SYMBOL_GPL(queue_work_on);
1118 
1119 static void delayed_work_timer_fn(unsigned long __data)
1120 {
1121 	struct delayed_work *dwork = (struct delayed_work *)__data;
1122 	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1123 
1124 	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1125 }
1126 
1127 /**
1128  * queue_delayed_work - queue work on a workqueue after delay
1129  * @wq: workqueue to use
1130  * @dwork: delayable work to queue
1131  * @delay: number of jiffies to wait before queueing
1132  *
1133  * Returns 0 if @work was already on a queue, non-zero otherwise.
1134  */
1135 int queue_delayed_work(struct workqueue_struct *wq,
1136 			struct delayed_work *dwork, unsigned long delay)
1137 {
1138 	if (delay == 0)
1139 		return queue_work(wq, &dwork->work);
1140 
1141 	return queue_delayed_work_on(-1, wq, dwork, delay);
1142 }
1143 EXPORT_SYMBOL_GPL(queue_delayed_work);
1144 
1145 /**
1146  * queue_delayed_work_on - queue work on specific CPU after delay
1147  * @cpu: CPU number to execute work on
1148  * @wq: workqueue to use
1149  * @dwork: work to queue
1150  * @delay: number of jiffies to wait before queueing
1151  *
1152  * Returns 0 if @work was already on a queue, non-zero otherwise.
1153  */
1154 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1155 			struct delayed_work *dwork, unsigned long delay)
1156 {
1157 	int ret = 0;
1158 	struct timer_list *timer = &dwork->timer;
1159 	struct work_struct *work = &dwork->work;
1160 
1161 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1162 		unsigned int lcpu;
1163 
1164 		WARN_ON_ONCE(timer_pending(timer));
1165 		WARN_ON_ONCE(!list_empty(&work->entry));
1166 
1167 		timer_stats_timer_set_start_info(&dwork->timer);
1168 
1169 		/*
1170 		 * This stores cwq for the moment, for the timer_fn.
1171 		 * Note that the work's gcwq is preserved to allow
1172 		 * reentrance detection for delayed works.
1173 		 */
1174 		if (!(wq->flags & WQ_UNBOUND)) {
1175 			struct global_cwq *gcwq = get_work_gcwq(work);
1176 
1177 			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1178 				lcpu = gcwq->cpu;
1179 			else
1180 				lcpu = raw_smp_processor_id();
1181 		} else
1182 			lcpu = WORK_CPU_UNBOUND;
1183 
1184 		set_work_cwq(work, get_cwq(lcpu, wq), 0);
1185 
1186 		timer->expires = jiffies + delay;
1187 		timer->data = (unsigned long)dwork;
1188 		timer->function = delayed_work_timer_fn;
1189 
1190 		if (unlikely(cpu >= 0))
1191 			add_timer_on(timer, cpu);
1192 		else
1193 			add_timer(timer);
1194 		ret = 1;
1195 	}
1196 	return ret;
1197 }
1198 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
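
/*
 * Typical caller-side usage (illustrative sketch, not taken from this
 * file; my_fn and my_dwork are made-up names):
 *
 *	static void my_fn(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(my_dwork, my_fn);
 *	...
 *	queue_delayed_work(system_wq, &my_dwork, msecs_to_jiffies(100));
 *
 * When the timer fires, delayed_work_timer_fn() uses the cwq stored in
 * work->data by queue_delayed_work_on() to find the target workqueue
 * and queues the work via __queue_work().
 */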
1199 
1200 /**
1201  * worker_enter_idle - enter idle state
1202  * @worker: worker which is entering idle state
1203  *
1204  * @worker is entering idle state.  Update stats and idle timer if
1205  * necessary.
1206  *
1207  * LOCKING:
1208  * spin_lock_irq(gcwq->lock).
1209  */
1210 static void worker_enter_idle(struct worker *worker)
1211 {
1212 	struct global_cwq *gcwq = worker->gcwq;
1213 
1214 	BUG_ON(worker->flags & WORKER_IDLE);
1215 	BUG_ON(!list_empty(&worker->entry) &&
1216 	       (worker->hentry.next || worker->hentry.pprev));
1217 
1218 	/* can't use worker_set_flags(), also called from start_worker() */
1219 	worker->flags |= WORKER_IDLE;
1220 	gcwq->nr_idle++;
1221 	worker->last_active = jiffies;
1222 
1223 	/* idle_list is LIFO */
1224 	list_add(&worker->entry, &gcwq->idle_list);
1225 
1226 	if (likely(!(worker->flags & WORKER_ROGUE))) {
1227 		if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1228 			mod_timer(&gcwq->idle_timer,
1229 				  jiffies + IDLE_WORKER_TIMEOUT);
1230 	} else
1231 		wake_up_all(&gcwq->trustee_wait);
1232 
1233 	/*
1234 	 * Sanity check nr_running.  Because trustee releases gcwq->lock
1235 	 * between setting %WORKER_ROGUE and zapping nr_running, the
1236 	 * warning may trigger spuriously.  Check iff trustee is idle.
1237 	 */
1238 	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1239 		     gcwq->nr_workers == gcwq->nr_idle &&
1240 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1241 }
1242 
1243 /**
1244  * worker_leave_idle - leave idle state
1245  * @worker: worker which is leaving idle state
1246  *
1247  * @worker is leaving idle state.  Update stats.
1248  *
1249  * LOCKING:
1250  * spin_lock_irq(gcwq->lock).
1251  */
1252 static void worker_leave_idle(struct worker *worker)
1253 {
1254 	struct global_cwq *gcwq = worker->gcwq;
1255 
1256 	BUG_ON(!(worker->flags & WORKER_IDLE));
1257 	worker_clr_flags(worker, WORKER_IDLE);
1258 	gcwq->nr_idle--;
1259 	list_del_init(&worker->entry);
1260 }
1261 
1262 /**
1263  * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1264  * @worker: self
1265  *
1266  * Works which are scheduled while the cpu is online must at least be
1267  * scheduled to a worker which is bound to the cpu so that if they are
1268  * flushed from cpu callbacks while cpu is going down, they are
1269  * guaranteed to execute on the cpu.
1270  *
1271  * This function is to be used by rogue workers and rescuers to bind
1272  * themselves to the target cpu and may race with cpu going down or
1273  * coming online.  kthread_bind() can't be used because it may put the
1274  * worker on an already dead cpu and set_cpus_allowed_ptr() can't be used
1275  * verbatim as it's best effort and blocking and gcwq may be
1276  * [dis]associated in the meantime.
1277  *
1278  * This function tries set_cpus_allowed() and locks gcwq and verifies
1279  * the binding against GCWQ_DISASSOCIATED which is set during
1280  * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1281  * idle state or fetches works without dropping lock, it can guarantee
1282  * the scheduling requirement described in the first paragraph.
1283  *
1284  * CONTEXT:
1285  * Might sleep.  Called without any lock but returns with gcwq->lock
1286  * held.
1287  *
1288  * RETURNS:
1289  * %true if the associated gcwq is online (@worker is successfully
1290  * bound), %false if offline.
1291  */
1292 static bool worker_maybe_bind_and_lock(struct worker *worker)
1293 __acquires(&gcwq->lock)
1294 {
1295 	struct global_cwq *gcwq = worker->gcwq;
1296 	struct task_struct *task = worker->task;
1297 
1298 	while (true) {
1299 		/*
1300 		 * The following call may fail, succeed or succeed
1301 		 * without actually migrating the task to the cpu if
1302 		 * it races with cpu hotunplug operation.  Verify
1303 		 * against GCWQ_DISASSOCIATED.
1304 		 */
1305 		if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1306 			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1307 
1308 		spin_lock_irq(&gcwq->lock);
1309 		if (gcwq->flags & GCWQ_DISASSOCIATED)
1310 			return false;
1311 		if (task_cpu(task) == gcwq->cpu &&
1312 		    cpumask_equal(&current->cpus_allowed,
1313 				  get_cpu_mask(gcwq->cpu)))
1314 			return true;
1315 		spin_unlock_irq(&gcwq->lock);
1316 
1317 		/*
1318 		 * We've raced with CPU hot[un]plug.  Give it a breather
1319 		 * and retry migration.  cond_resched() is required here;
1320 		 * otherwise, we might deadlock against cpu_stop trying to
1321 		 * bring down the CPU on non-preemptive kernel.
1322 		 */
1323 		cpu_relax();
1324 		cond_resched();
1325 	}
1326 }
1327 
1328 /*
1329  * Function for worker->rebind_work used to rebind rogue busy workers
1330  * to the associated cpu which is coming back online.  This is
1331  * scheduled by cpu up but can race with other cpu hotplug operations
1332  * and may be executed twice without intervening cpu down.
1333  */
1334 static void worker_rebind_fn(struct work_struct *work)
1335 {
1336 	struct worker *worker = container_of(work, struct worker, rebind_work);
1337 	struct global_cwq *gcwq = worker->gcwq;
1338 
1339 	if (worker_maybe_bind_and_lock(worker))
1340 		worker_clr_flags(worker, WORKER_REBIND);
1341 
1342 	spin_unlock_irq(&gcwq->lock);
1343 }
1344 
1345 static struct worker *alloc_worker(void)
1346 {
1347 	struct worker *worker;
1348 
1349 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1350 	if (worker) {
1351 		INIT_LIST_HEAD(&worker->entry);
1352 		INIT_LIST_HEAD(&worker->scheduled);
1353 		INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1354 		/* on creation a worker is in !idle && prep state */
1355 		worker->flags = WORKER_PREP;
1356 	}
1357 	return worker;
1358 }
1359 
1360 /**
1361  * create_worker - create a new workqueue worker
1362  * @gcwq: gcwq the new worker will belong to
1363  * @bind: whether to set affinity to @cpu or not
1364  *
1365  * Create a new worker which is bound to @gcwq.  The returned worker
1366  * can be started by calling start_worker() or destroyed using
1367  * destroy_worker().
1368  *
1369  * CONTEXT:
1370  * Might sleep.  Does GFP_KERNEL allocations.
1371  *
1372  * RETURNS:
1373  * Pointer to the newly created worker.
1374  */
1375 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1376 {
1377 	bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1378 	struct worker *worker = NULL;
1379 	int id = -1;
1380 
1381 	spin_lock_irq(&gcwq->lock);
1382 	while (ida_get_new(&gcwq->worker_ida, &id)) {
1383 		spin_unlock_irq(&gcwq->lock);
1384 		if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1385 			goto fail;
1386 		spin_lock_irq(&gcwq->lock);
1387 	}
1388 	spin_unlock_irq(&gcwq->lock);
1389 
1390 	worker = alloc_worker();
1391 	if (!worker)
1392 		goto fail;
1393 
1394 	worker->gcwq = gcwq;
1395 	worker->id = id;
1396 
1397 	if (!on_unbound_cpu)
1398 		worker->task = kthread_create_on_node(worker_thread,
1399 						      worker,
1400 						      cpu_to_node(gcwq->cpu),
1401 						      "kworker/%u:%d", gcwq->cpu, id);
1402 	else
1403 		worker->task = kthread_create(worker_thread, worker,
1404 					      "kworker/u:%d", id);
1405 	if (IS_ERR(worker->task))
1406 		goto fail;
1407 
1408 	/*
1409 	 * A rogue worker will become a regular one if CPU comes
1410 	 * online later on.  Make sure every worker has
1411 	 * PF_THREAD_BOUND set.
1412 	 */
1413 	if (bind && !on_unbound_cpu)
1414 		kthread_bind(worker->task, gcwq->cpu);
1415 	else {
1416 		worker->task->flags |= PF_THREAD_BOUND;
1417 		if (on_unbound_cpu)
1418 			worker->flags |= WORKER_UNBOUND;
1419 	}
1420 
1421 	return worker;
1422 fail:
1423 	if (id >= 0) {
1424 		spin_lock_irq(&gcwq->lock);
1425 		ida_remove(&gcwq->worker_ida, id);
1426 		spin_unlock_irq(&gcwq->lock);
1427 	}
1428 	kfree(worker);
1429 	return NULL;
1430 }
1431 
1432 /**
1433  * start_worker - start a newly created worker
1434  * @worker: worker to start
1435  *
1436  * Make the gcwq aware of @worker and start it.
1437  *
1438  * CONTEXT:
1439  * spin_lock_irq(gcwq->lock).
1440  */
1441 static void start_worker(struct worker *worker)
1442 {
1443 	worker->flags |= WORKER_STARTED;
1444 	worker->gcwq->nr_workers++;
1445 	worker_enter_idle(worker);
1446 	wake_up_process(worker->task);
1447 }
1448 
1449 /**
1450  * destroy_worker - destroy a workqueue worker
1451  * @worker: worker to be destroyed
1452  *
1453  * Destroy @worker and adjust @gcwq stats accordingly.
1454  *
1455  * CONTEXT:
1456  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1457  */
1458 static void destroy_worker(struct worker *worker)
1459 {
1460 	struct global_cwq *gcwq = worker->gcwq;
1461 	int id = worker->id;
1462 
1463 	/* sanity check frenzy */
1464 	BUG_ON(worker->current_work);
1465 	BUG_ON(!list_empty(&worker->scheduled));
1466 
1467 	if (worker->flags & WORKER_STARTED)
1468 		gcwq->nr_workers--;
1469 	if (worker->flags & WORKER_IDLE)
1470 		gcwq->nr_idle--;
1471 
1472 	/*
1473 	 * Once WORKER_DIE is set, the kworker may destroy itself at any
1474 	 * point.  Pin to ensure the task stays until we're done with it.
1475 	 */
1476 	get_task_struct(worker->task);
1477 
1478 	list_del_init(&worker->entry);
1479 	worker->flags |= WORKER_DIE;
1480 
1481 	spin_unlock_irq(&gcwq->lock);
1482 
1483 	kthread_stop(worker->task);
1484 	put_task_struct(worker->task);
1485 	kfree(worker);
1486 
1487 	spin_lock_irq(&gcwq->lock);
1488 	ida_remove(&gcwq->worker_ida, id);
1489 }
1490 
1491 static void idle_worker_timeout(unsigned long __gcwq)
1492 {
1493 	struct global_cwq *gcwq = (void *)__gcwq;
1494 
1495 	spin_lock_irq(&gcwq->lock);
1496 
1497 	if (too_many_workers(gcwq)) {
1498 		struct worker *worker;
1499 		unsigned long expires;
1500 
1501 		/* idle_list is kept in LIFO order, check the last one */
1502 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1503 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1504 
1505 		if (time_before(jiffies, expires))
1506 			mod_timer(&gcwq->idle_timer, expires);
1507 		else {
1508 			/* it's been idle for too long, wake up manager */
1509 			gcwq->flags |= GCWQ_MANAGE_WORKERS;
1510 			wake_up_worker(gcwq);
1511 		}
1512 	}
1513 
1514 	spin_unlock_irq(&gcwq->lock);
1515 }
1516 
1517 static bool send_mayday(struct work_struct *work)
1518 {
1519 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1520 	struct workqueue_struct *wq = cwq->wq;
1521 	unsigned int cpu;
1522 
1523 	if (!(wq->flags & WQ_RESCUER))
1524 		return false;
1525 
1526 	/* mayday mayday mayday */
1527 	cpu = cwq->gcwq->cpu;
1528 	/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1529 	if (cpu == WORK_CPU_UNBOUND)
1530 		cpu = 0;
1531 	if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1532 		wake_up_process(wq->rescuer->task);
1533 	return true;
1534 }
1535 
1536 static void gcwq_mayday_timeout(unsigned long __gcwq)
1537 {
1538 	struct global_cwq *gcwq = (void *)__gcwq;
1539 	struct work_struct *work;
1540 
1541 	spin_lock_irq(&gcwq->lock);
1542 
1543 	if (need_to_create_worker(gcwq)) {
1544 		/*
1545 		 * We've been trying to create a new worker but
1546 		 * haven't been successful.  We might be hitting an
1547 		 * allocation deadlock.  Send distress signals to
1548 		 * rescuers.
1549 		 */
1550 		list_for_each_entry(work, &gcwq->worklist, entry)
1551 			send_mayday(work);
1552 	}
1553 
1554 	spin_unlock_irq(&gcwq->lock);
1555 
1556 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1557 }
1558 
1559 /**
1560  * maybe_create_worker - create a new worker if necessary
1561  * @gcwq: gcwq to create a new worker for
1562  *
1563  * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
1564  * have at least one idle worker on return from this function.  If
1565  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1566  * sent to all rescuers with works scheduled on @gcwq to resolve
1567  * possible allocation deadlock.
1568  *
1569  * On return, need_to_create_worker() is guaranteed to be false and
1570  * may_start_working() true.
1571  *
1572  * LOCKING:
1573  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1574  * multiple times.  Does GFP_KERNEL allocations.  Called only from
1575  * manager.
1576  *
1577  * RETURNS:
1578  * false if no action was taken and gcwq->lock stayed locked, true
1579  * otherwise.
1580  */
1581 static bool maybe_create_worker(struct global_cwq *gcwq)
1582 __releases(&gcwq->lock)
1583 __acquires(&gcwq->lock)
1584 {
1585 	if (!need_to_create_worker(gcwq))
1586 		return false;
1587 restart:
1588 	spin_unlock_irq(&gcwq->lock);
1589 
1590 	/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1591 	mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1592 
1593 	while (true) {
1594 		struct worker *worker;
1595 
1596 		worker = create_worker(gcwq, true);
1597 		if (worker) {
1598 			del_timer_sync(&gcwq->mayday_timer);
1599 			spin_lock_irq(&gcwq->lock);
1600 			start_worker(worker);
1601 			BUG_ON(need_to_create_worker(gcwq));
1602 			return true;
1603 		}
1604 
1605 		if (!need_to_create_worker(gcwq))
1606 			break;
1607 
1608 		__set_current_state(TASK_INTERRUPTIBLE);
1609 		schedule_timeout(CREATE_COOLDOWN);
1610 
1611 		if (!need_to_create_worker(gcwq))
1612 			break;
1613 	}
1614 
1615 	del_timer_sync(&gcwq->mayday_timer);
1616 	spin_lock_irq(&gcwq->lock);
1617 	if (need_to_create_worker(gcwq))
1618 		goto restart;
1619 	return true;
1620 }
1621 
1622 /**
1623  * maybe_destroy_worker - destroy workers which have been idle for a while
1624  * @gcwq: gcwq to destroy workers for
1625  *
1626  * Destroy @gcwq workers which have been idle for longer than
1627  * IDLE_WORKER_TIMEOUT.
1628  *
1629  * LOCKING:
1630  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1631  * multiple times.  Called only from manager.
1632  *
1633  * RETURNS:
1634  * false if no action was taken and gcwq->lock stayed locked, true
1635  * otherwise.
1636  */
1637 static bool maybe_destroy_workers(struct global_cwq *gcwq)
1638 {
1639 	bool ret = false;
1640 
1641 	while (too_many_workers(gcwq)) {
1642 		struct worker *worker;
1643 		unsigned long expires;
1644 
1645 		worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1646 		expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1647 
1648 		if (time_before(jiffies, expires)) {
1649 			mod_timer(&gcwq->idle_timer, expires);
1650 			break;
1651 		}
1652 
1653 		destroy_worker(worker);
1654 		ret = true;
1655 	}
1656 
1657 	return ret;
1658 }
1659 
1660 /**
1661  * manage_workers - manage worker pool
1662  * @worker: self
1663  *
1664  * Assume the manager role and manage gcwq worker pool @worker belongs
1665  * to.  At any given time, there can be only zero or one manager per
1666  * gcwq.  The exclusion is handled automatically by this function.
1667  *
1668  * The caller can safely start processing works on false return.  On
1669  * true return, it's guaranteed that need_to_create_worker() is false
1670  * and may_start_working() is true.
1671  *
1672  * CONTEXT:
1673  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1674  * multiple times.  Does GFP_KERNEL allocations.
1675  *
1676  * RETURNS:
1677  * false if no action was taken and gcwq->lock stayed locked, true if
1678  * some action was taken.
1679  */
1680 static bool manage_workers(struct worker *worker)
1681 {
1682 	struct global_cwq *gcwq = worker->gcwq;
1683 	bool ret = false;
1684 
1685 	if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1686 		return ret;
1687 
1688 	gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1689 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
1690 
1691 	/*
1692 	 * Destroy and then create so that may_start_working() is true
1693 	 * on return.
1694 	 */
1695 	ret |= maybe_destroy_workers(gcwq);
1696 	ret |= maybe_create_worker(gcwq);
1697 
1698 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1699 
1700 	/*
1701 	 * The trustee might be waiting to take over the manager
1702 	 * position, tell it we're done.
1703 	 */
1704 	if (unlikely(gcwq->trustee))
1705 		wake_up_all(&gcwq->trustee_wait);
1706 
1707 	return ret;
1708 }
1709 
1710 /**
1711  * move_linked_works - move linked works to a list
1712  * @work: start of series of works to be scheduled
1713  * @head: target list to append @work to
1714  * @nextp: out parameter for nested worklist walking
1715  *
1716  * Schedule linked works starting from @work to @head.  Work series to
1717  * be scheduled starts at @work and includes any consecutive work with
1718  * WORK_STRUCT_LINKED set in its predecessor.
1719  *
1720  * If @nextp is not NULL, it's updated to point to the next work of
1721  * the last scheduled work.  This allows move_linked_works() to be
1722  * nested inside outer list_for_each_entry_safe().
1723  *
1724  * CONTEXT:
1725  * spin_lock_irq(gcwq->lock).
1726  */
1727 static void move_linked_works(struct work_struct *work, struct list_head *head,
1728 			      struct work_struct **nextp)
1729 {
1730 	struct work_struct *n;
1731 
1732 	/*
1733 	 * Linked worklist will always end before the end of the list,
1734 	 * use NULL for list head.
1735 	 */
1736 	list_for_each_entry_safe_from(work, n, NULL, entry) {
1737 		list_move_tail(&work->entry, head);
1738 		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1739 			break;
1740 	}
1741 
1742 	/*
1743 	 * If we're already inside safe list traversal and have moved
1744 	 * multiple works to the scheduled queue, the next position
1745 	 * needs to be updated.
1746 	 */
1747 	if (nextp)
1748 		*nextp = n;
1749 }
1750 
1751 static void cwq_activate_delayed_work(struct work_struct *work)
1752 {
1753 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1754 	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1755 
1756 	trace_workqueue_activate_work(work);
1757 	move_linked_works(work, pos, NULL);
1758 	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1759 	cwq->nr_active++;
1760 }
1761 
1762 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1763 {
1764 	struct work_struct *work = list_first_entry(&cwq->delayed_works,
1765 						    struct work_struct, entry);
1766 
1767 	cwq_activate_delayed_work(work);
1768 }
1769 
1770 /**
1771  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1772  * @cwq: cwq of interest
1773  * @color: color of work which left the queue
1774  * @delayed: for a delayed work
1775  *
1776  * A work either has completed or is removed from pending queue,
1777  * decrement nr_in_flight of its cwq and handle workqueue flushing.
1778  *
1779  * CONTEXT:
1780  * spin_lock_irq(gcwq->lock).
1781  */
1782 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1783 				 bool delayed)
1784 {
1785 	/* ignore uncolored works */
1786 	if (color == WORK_NO_COLOR)
1787 		return;
1788 
1789 	cwq->nr_in_flight[color]--;
1790 
1791 	if (!delayed) {
1792 		cwq->nr_active--;
1793 		if (!list_empty(&cwq->delayed_works)) {
1794 			/* one down, submit a delayed one */
1795 			if (cwq->nr_active < cwq->max_active)
1796 				cwq_activate_first_delayed(cwq);
1797 		}
1798 	}
1799 
1800 	/* is flush in progress and are we at the flushing tip? */
1801 	if (likely(cwq->flush_color != color))
1802 		return;
1803 
1804 	/* are there still in-flight works? */
1805 	if (cwq->nr_in_flight[color])
1806 		return;
1807 
1808 	/* this cwq is done, clear flush_color */
1809 	cwq->flush_color = -1;
1810 
1811 	/*
1812 	 * If this was the last cwq, wake up the first flusher.  It
1813 	 * will handle the rest.
1814 	 */
1815 	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1816 		complete(&cwq->wq->first_flusher->done);
1817 }
1818 
1819 /**
1820  * process_one_work - process single work
1821  * @worker: self
1822  * @work: work to process
1823  *
1824  * Process @work.  This function contains all the logic necessary to
1825  * process a single work including synchronization against and
1826  * interaction with other workers on the same cpu, queueing and
1827  * flushing.  As long as context requirement is met, any worker can
1828  * call this function to process a work.
1829  *
1830  * CONTEXT:
1831  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1832  */
1833 static void process_one_work(struct worker *worker, struct work_struct *work)
1834 __releases(&gcwq->lock)
1835 __acquires(&gcwq->lock)
1836 {
1837 	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1838 	struct global_cwq *gcwq = cwq->gcwq;
1839 	struct hlist_head *bwh = busy_worker_head(gcwq, work);
1840 	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1841 	int work_color;
1842 	struct worker *collision;
1843 #ifdef CONFIG_LOCKDEP
1844 	/*
1845 	 * It is permissible to free the struct work_struct from
1846 	 * inside the function that is called from it; we need to take
1847 	 * this into account for lockdep too.  To avoid bogus "held
1848 	 * lock freed" warnings as well as problems when looking into
1849 	 * work->lockdep_map, make a copy and use that here.
1850 	 */
1851 	struct lockdep_map lockdep_map = work->lockdep_map;
1852 #endif
1853 	/*
1854 	 * A single work shouldn't be executed concurrently by
1855 	 * multiple workers on a single cpu.  Check whether anyone is
1856 	 * already processing the work.  If so, defer the work to the
1857 	 * currently executing one.
1858 	 */
1859 	collision = __find_worker_executing_work(gcwq, bwh, work);
1860 	if (unlikely(collision)) {
1861 		move_linked_works(work, &collision->scheduled, NULL);
1862 		return;
1863 	}
1864 
1865 	/* claim and process */
1866 	debug_work_deactivate(work);
1867 	hlist_add_head(&worker->hentry, bwh);
1868 	worker->current_work = work;
1869 	worker->current_func = work->func;
1870 	worker->current_cwq = cwq;
1871 	work_color = get_work_color(work);
1872 
1873 	/* record the current cpu number in the work data and dequeue */
1874 	set_work_cpu(work, gcwq->cpu);
1875 	list_del_init(&work->entry);
1876 
1877 	/*
1878 	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1879 	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1880 	 */
1881 	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1882 		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1883 						struct work_struct, entry);
1884 
1885 		if (!list_empty(&gcwq->worklist) &&
1886 		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1887 			wake_up_worker(gcwq);
1888 		else
1889 			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1890 	}
1891 
1892 	/*
1893 	 * CPU intensive works don't participate in concurrency
1894 	 * management.  They're the scheduler's responsibility.
1895 	 */
1896 	if (unlikely(cpu_intensive))
1897 		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1898 
1899 	spin_unlock_irq(&gcwq->lock);
1900 
1901 	smp_wmb();	/* paired with test_and_set_bit(PENDING) */
1902 	work_clear_pending(work);
1903 
1904 	lock_map_acquire_read(&cwq->wq->lockdep_map);
1905 	lock_map_acquire(&lockdep_map);
1906 	trace_workqueue_execute_start(work);
1907 	worker->current_func(work);
1908 	/*
1909 	 * While we must be careful to not use "work" after this, the trace
1910 	 * point will only record its address.
1911 	 */
1912 	trace_workqueue_execute_end(work);
1913 	lock_map_release(&lockdep_map);
1914 	lock_map_release(&cwq->wq->lockdep_map);
1915 
1916 	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1917 		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
1918 		       "     last function: %pf\n",
1919 		       current->comm, preempt_count(), task_pid_nr(current),
1920 		       worker->current_func);
1921 		debug_show_held_locks(current);
1922 		dump_stack();
1923 	}
1924 
1925 	/*
1926 	 * The following prevents a kworker from hogging CPU on !PREEMPT
1927 	 * kernels, where a requeueing work item waiting for something to
1928 	 * happen could deadlock with stop_machine as such work item could
1929 	 * indefinitely requeue itself while all other CPUs are trapped in
1930 	 * stop_machine.
1931 	 */
1932 	cond_resched();
1933 
1934 	spin_lock_irq(&gcwq->lock);
1935 
1936 	/* clear cpu intensive status */
1937 	if (unlikely(cpu_intensive))
1938 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1939 
1940 	/* we're done with it, release */
1941 	hlist_del_init(&worker->hentry);
1942 	worker->current_work = NULL;
1943 	worker->current_func = NULL;
1944 	worker->current_cwq = NULL;
1945 	cwq_dec_nr_in_flight(cwq, work_color, false);
1946 }
1947 
1948 /**
1949  * process_scheduled_works - process scheduled works
1950  * @worker: self
1951  *
1952  * Process all scheduled works.  Please note that the scheduled list
1953  * may change while processing a work, so this function repeatedly
1954  * fetches a work from the top and executes it.
1955  *
1956  * CONTEXT:
1957  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1958  * multiple times.
1959  */
1960 static void process_scheduled_works(struct worker *worker)
1961 {
1962 	while (!list_empty(&worker->scheduled)) {
1963 		struct work_struct *work = list_first_entry(&worker->scheduled,
1964 						struct work_struct, entry);
1965 		process_one_work(worker, work);
1966 	}
1967 }
1968 
1969 /**
1970  * worker_thread - the worker thread function
1971  * @__worker: self
1972  *
1973  * The gcwq worker thread function.  There's a single dynamic pool of
1974  * these per cpu.  These workers process all works regardless of
1975  * their specific target workqueue.  The only exception is works which
1976  * belong to workqueues with a rescuer which will be explained in
1977  * rescuer_thread().
1978  */
1979 static int worker_thread(void *__worker)
1980 {
1981 	struct worker *worker = __worker;
1982 	struct global_cwq *gcwq = worker->gcwq;
1983 
1984 	/* tell the scheduler that this is a workqueue worker */
1985 	worker->task->flags |= PF_WQ_WORKER;
1986 woke_up:
1987 	spin_lock_irq(&gcwq->lock);
1988 
1989 	/* DIE can be set only while we're idle, checking here is enough */
1990 	if (worker->flags & WORKER_DIE) {
1991 		spin_unlock_irq(&gcwq->lock);
1992 		worker->task->flags &= ~PF_WQ_WORKER;
1993 		return 0;
1994 	}
1995 
1996 	worker_leave_idle(worker);
1997 recheck:
1998 	/* no more worker necessary? */
1999 	if (!need_more_worker(gcwq))
2000 		goto sleep;
2001 
2002 	/* do we need to manage? */
2003 	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
2004 		goto recheck;
2005 
2006 	/*
2007 	 * ->scheduled list can only be filled while a worker is
2008 	 * preparing to process a work or actually processing it.
2009 	 * Make sure nobody diddled with it while I was sleeping.
2010 	 */
2011 	BUG_ON(!list_empty(&worker->scheduled));
2012 
2013 	/*
2014 	 * When control reaches this point, we're guaranteed to have
2015 	 * at least one idle worker or that someone else has already
2016 	 * assumed the manager role.
2017 	 */
2018 	worker_clr_flags(worker, WORKER_PREP);
2019 
2020 	do {
2021 		struct work_struct *work =
2022 			list_first_entry(&gcwq->worklist,
2023 					 struct work_struct, entry);
2024 
2025 		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2026 			/* optimization path, not strictly necessary */
2027 			process_one_work(worker, work);
2028 			if (unlikely(!list_empty(&worker->scheduled)))
2029 				process_scheduled_works(worker);
2030 		} else {
2031 			move_linked_works(work, &worker->scheduled, NULL);
2032 			process_scheduled_works(worker);
2033 		}
2034 	} while (keep_working(gcwq));
2035 
2036 	worker_set_flags(worker, WORKER_PREP, false);
2037 sleep:
2038 	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
2039 		goto recheck;
2040 
2041 	/*
2042 	 * gcwq->lock is held and there's no work to process and no
2043 	 * need to manage, sleep.  Workers are woken up only while
2044 	 * holding gcwq->lock or from local cpu, so setting the
2045 	 * current state before releasing gcwq->lock is enough to
2046 	 * prevent losing any event.
2047 	 */
2048 	worker_enter_idle(worker);
2049 	__set_current_state(TASK_INTERRUPTIBLE);
2050 	spin_unlock_irq(&gcwq->lock);
2051 	schedule();
2052 	goto woke_up;
2053 }
2054 
2055 /**
2056  * rescuer_thread - the rescuer thread function
2057  * @__wq: the associated workqueue
2058  *
2059  * Workqueue rescuer thread function.  There's one rescuer for each
2060  * workqueue which has WQ_RESCUER set.
2061  *
2062  * Regular work processing on a gcwq may block trying to create a new
2063  * worker, which uses a GFP_KERNEL allocation and has a slight chance of
2064  * developing into a deadlock if some works currently on the same queue
2065  * need to be processed to satisfy the GFP_KERNEL allocation.  This is
2066  * the problem rescuer solves.
2067  *
2068  * When such a condition is possible, the gcwq summons rescuers of all
2069  * workqueues which have works queued on the gcwq and lets them process
2070  * those works so that forward progress can be guaranteed.
2071  *
2072  * This should happen rarely.
2073  */
2074 static int rescuer_thread(void *__wq)
2075 {
2076 	struct workqueue_struct *wq = __wq;
2077 	struct worker *rescuer = wq->rescuer;
2078 	struct list_head *scheduled = &rescuer->scheduled;
2079 	bool is_unbound = wq->flags & WQ_UNBOUND;
2080 	unsigned int cpu;
2081 
2082 	set_user_nice(current, RESCUER_NICE_LEVEL);
2083 repeat:
2084 	set_current_state(TASK_INTERRUPTIBLE);
2085 
2086 	if (kthread_should_stop()) {
2087 		__set_current_state(TASK_RUNNING);
2088 		return 0;
2089 	}
2090 
2091 	/*
2092 	 * See whether any cpu is asking for help.  Unbound
2093 	 * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND.
2094 	 */
2095 	for_each_mayday_cpu(cpu, wq->mayday_mask) {
2096 		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2097 		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2098 		struct global_cwq *gcwq = cwq->gcwq;
2099 		struct work_struct *work, *n;
2100 
2101 		__set_current_state(TASK_RUNNING);
2102 		mayday_clear_cpu(cpu, wq->mayday_mask);
2103 
2104 		/* migrate to the target cpu if possible */
2105 		rescuer->gcwq = gcwq;
2106 		worker_maybe_bind_and_lock(rescuer);
2107 
2108 		/*
2109 		 * Slurp in all works issued via this workqueue and
2110 		 * process'em.
2111 		 */
2112 		BUG_ON(!list_empty(&rescuer->scheduled));
2113 		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2114 			if (get_work_cwq(work) == cwq)
2115 				move_linked_works(work, scheduled, &n);
2116 
2117 		process_scheduled_works(rescuer);
2118 
2119 		/*
2120 		 * Leave this gcwq.  If keep_working() is %true, notify a
2121 		 * regular worker; otherwise, we end up with 0 concurrency
2122 		 * and stalling the execution.
2123 		 */
2124 		if (keep_working(gcwq))
2125 			wake_up_worker(gcwq);
2126 
2127 		spin_unlock_irq(&gcwq->lock);
2128 	}
2129 
2130 	schedule();
2131 	goto repeat;
2132 }
2133 
2134 struct wq_barrier {
2135 	struct work_struct	work;
2136 	struct completion	done;
2137 };
2138 
2139 static void wq_barrier_func(struct work_struct *work)
2140 {
2141 	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2142 	complete(&barr->done);
2143 }
2144 
2145 /**
2146  * insert_wq_barrier - insert a barrier work
2147  * @cwq: cwq to insert barrier into
2148  * @barr: wq_barrier to insert
2149  * @target: target work to attach @barr to
2150  * @worker: worker currently executing @target, NULL if @target is not executing
2151  *
2152  * @barr is linked to @target such that @barr is completed only after
2153  * @target finishes execution.  Please note that the ordering
2154  * guarantee is observed only with respect to @target and on the local
2155  * cpu.
2156  *
2157  * Currently, a queued barrier can't be canceled.  This is because
2158  * try_to_grab_pending() can't determine whether the work to be
2159  * grabbed is at the head of the queue and thus can't clear LINKED
2160  * flag of the previous work while there must be a valid next work
2161  * after a work with LINKED flag set.
2162  *
2163  * Note that when @worker is non-NULL, @target may be modified
2164  * underneath us, so we can't reliably determine cwq from @target.
2165  *
2166  * CONTEXT:
2167  * spin_lock_irq(gcwq->lock).
2168  */
2169 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2170 			      struct wq_barrier *barr,
2171 			      struct work_struct *target, struct worker *worker)
2172 {
2173 	struct list_head *head;
2174 	unsigned int linked = 0;
2175 
2176 	/*
2177 	 * debugobject calls are safe here even with gcwq->lock locked
2178 	 * as we know for sure that this will not trigger any of the
2179 	 * checks and call back into the fixup functions where we
2180 	 * might deadlock.
2181 	 */
2182 	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2183 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2184 	init_completion(&barr->done);
2185 
2186 	/*
2187 	 * If @target is currently being executed, schedule the
2188 	 * barrier to the worker; otherwise, put it after @target.
2189 	 */
2190 	if (worker)
2191 		head = worker->scheduled.next;
2192 	else {
2193 		unsigned long *bits = work_data_bits(target);
2194 
2195 		head = target->entry.next;
2196 		/* there can already be other linked works, inherit and set */
2197 		linked = *bits & WORK_STRUCT_LINKED;
2198 		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
2199 	}
2200 
2201 	debug_work_activate(&barr->work);
2202 	insert_work(cwq, &barr->work, head,
2203 		    work_color_to_flags(WORK_NO_COLOR) | linked);
2204 }
2205 
2206 /**
2207  * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2208  * @wq: workqueue being flushed
2209  * @flush_color: new flush color, < 0 for no-op
2210  * @work_color: new work color, < 0 for no-op
2211  *
2212  * Prepare cwqs for workqueue flushing.
2213  *
2214  * If @flush_color is non-negative, flush_color on all cwqs should be
2215  * -1.  If no cwq has in-flight commands at the specified color, all
2216  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
2217  * has in flight commands, its cwq->flush_color is set to
2218  * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2219  * wakeup logic is armed and %true is returned.
2220  *
2221  * The caller should have initialized @wq->first_flusher prior to
2222  * calling this function with non-negative @flush_color.  If
2223  * @flush_color is negative, no flush color update is done and %false
2224  * is returned.
2225  *
2226  * If @work_color is non-negative, all cwqs should have the same
2227  * work_color which is previous to @work_color and all will be
2228  * advanced to @work_color.
2229  *
2230  * CONTEXT:
2231  * mutex_lock(wq->flush_mutex).
2232  *
2233  * RETURNS:
2234  * %true if @flush_color >= 0 and there's something to flush.  %false
2235  * otherwise.
2236  */
2237 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2238 				      int flush_color, int work_color)
2239 {
2240 	bool wait = false;
2241 	unsigned int cpu;
2242 
2243 	if (flush_color >= 0) {
2244 		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2245 		atomic_set(&wq->nr_cwqs_to_flush, 1);
2246 	}
2247 
2248 	for_each_cwq_cpu(cpu, wq) {
2249 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2250 		struct global_cwq *gcwq = cwq->gcwq;
2251 
2252 		spin_lock_irq(&gcwq->lock);
2253 
2254 		if (flush_color >= 0) {
2255 			BUG_ON(cwq->flush_color != -1);
2256 
2257 			if (cwq->nr_in_flight[flush_color]) {
2258 				cwq->flush_color = flush_color;
2259 				atomic_inc(&wq->nr_cwqs_to_flush);
2260 				wait = true;
2261 			}
2262 		}
2263 
2264 		if (work_color >= 0) {
2265 			BUG_ON(work_color != work_next_color(cwq->work_color));
2266 			cwq->work_color = work_color;
2267 		}
2268 
2269 		spin_unlock_irq(&gcwq->lock);
2270 	}
2271 
2272 	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2273 		complete(&wq->first_flusher->done);
2274 
2275 	return wait;
2276 }
2277 
2278 /**
2279  * flush_workqueue - ensure that any scheduled work has run to completion.
2280  * @wq: workqueue to flush
2281  *
2282  * Forces execution of the workqueue and blocks until its completion.
2283  * This is typically used in driver shutdown handlers.
2284  *
2285  * We sleep until all works which were queued on entry have been handled,
2286  * but we are not livelocked by new incoming ones.
2287  */
2288 void flush_workqueue(struct workqueue_struct *wq)
2289 {
2290 	struct wq_flusher this_flusher = {
2291 		.list = LIST_HEAD_INIT(this_flusher.list),
2292 		.flush_color = -1,
2293 		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2294 	};
2295 	int next_color;
2296 
2297 	lock_map_acquire(&wq->lockdep_map);
2298 	lock_map_release(&wq->lockdep_map);
2299 
2300 	mutex_lock(&wq->flush_mutex);
2301 
2302 	/*
2303 	 * Start-to-wait phase
2304 	 */
2305 	next_color = work_next_color(wq->work_color);
2306 
2307 	if (next_color != wq->flush_color) {
2308 		/*
2309 		 * Color space is not full.  The current work_color
2310 		 * becomes our flush_color and work_color is advanced
2311 		 * by one.
2312 		 */
2313 		BUG_ON(!list_empty(&wq->flusher_overflow));
2314 		this_flusher.flush_color = wq->work_color;
2315 		wq->work_color = next_color;
2316 
2317 		if (!wq->first_flusher) {
2318 			/* no flush in progress, become the first flusher */
2319 			BUG_ON(wq->flush_color != this_flusher.flush_color);
2320 
2321 			wq->first_flusher = &this_flusher;
2322 
2323 			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2324 						       wq->work_color)) {
2325 				/* nothing to flush, done */
2326 				wq->flush_color = next_color;
2327 				wq->first_flusher = NULL;
2328 				goto out_unlock;
2329 			}
2330 		} else {
2331 			/* wait in queue */
2332 			BUG_ON(wq->flush_color == this_flusher.flush_color);
2333 			list_add_tail(&this_flusher.list, &wq->flusher_queue);
2334 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2335 		}
2336 	} else {
2337 		/*
2338 		 * Oops, color space is full, wait on overflow queue.
2339 		 * The next flush completion will assign us
2340 		 * flush_color and transfer to flusher_queue.
2341 		 */
2342 		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2343 	}
2344 
2345 	mutex_unlock(&wq->flush_mutex);
2346 
2347 	wait_for_completion(&this_flusher.done);
2348 
2349 	/*
2350 	 * Wake-up-and-cascade phase
2351 	 *
2352 	 * First flushers are responsible for cascading flushes and
2353 	 * handling overflow.  Non-first flushers can simply return.
2354 	 */
2355 	if (wq->first_flusher != &this_flusher)
2356 		return;
2357 
2358 	mutex_lock(&wq->flush_mutex);
2359 
2360 	/* we might have raced, check again with mutex held */
2361 	if (wq->first_flusher != &this_flusher)
2362 		goto out_unlock;
2363 
2364 	wq->first_flusher = NULL;
2365 
2366 	BUG_ON(!list_empty(&this_flusher.list));
2367 	BUG_ON(wq->flush_color != this_flusher.flush_color);
2368 
2369 	while (true) {
2370 		struct wq_flusher *next, *tmp;
2371 
2372 		/* complete all the flushers sharing the current flush color */
2373 		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2374 			if (next->flush_color != wq->flush_color)
2375 				break;
2376 			list_del_init(&next->list);
2377 			complete(&next->done);
2378 		}
2379 
2380 		BUG_ON(!list_empty(&wq->flusher_overflow) &&
2381 		       wq->flush_color != work_next_color(wq->work_color));
2382 
2383 		/* this flush_color is finished, advance by one */
2384 		wq->flush_color = work_next_color(wq->flush_color);
2385 
2386 		/* one color has been freed, handle overflow queue */
2387 		if (!list_empty(&wq->flusher_overflow)) {
2388 			/*
2389 			 * Assign the same color to all overflowed
2390 			 * flushers, advance work_color and append to
2391 			 * flusher_queue.  This is the start-to-wait
2392 			 * phase for these overflowed flushers.
2393 			 */
2394 			list_for_each_entry(tmp, &wq->flusher_overflow, list)
2395 				tmp->flush_color = wq->work_color;
2396 
2397 			wq->work_color = work_next_color(wq->work_color);
2398 
2399 			list_splice_tail_init(&wq->flusher_overflow,
2400 					      &wq->flusher_queue);
2401 			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2402 		}
2403 
2404 		if (list_empty(&wq->flusher_queue)) {
2405 			BUG_ON(wq->flush_color != wq->work_color);
2406 			break;
2407 		}
2408 
2409 		/*
2410 		 * Need to flush more colors.  Make the next flusher
2411 		 * the new first flusher and arm cwqs.
2412 		 */
2413 		BUG_ON(wq->flush_color == wq->work_color);
2414 		BUG_ON(wq->flush_color != next->flush_color);
2415 
2416 		list_del_init(&next->list);
2417 		wq->first_flusher = next;
2418 
2419 		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2420 			break;
2421 
2422 		/*
2423 		 * Meh... this color is already done, clear first
2424 		 * flusher and repeat cascading.
2425 		 */
2426 		wq->first_flusher = NULL;
2427 	}
2428 
2429 out_unlock:
2430 	mutex_unlock(&wq->flush_mutex);
2431 }
2432 EXPORT_SYMBOL_GPL(flush_workqueue);
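
/*
 * Example usage of flush_workqueue() -- an illustrative sketch only; the
 * example_* identifiers are hypothetical and not defined in this file.
 * A driver typically flushes its private workqueue on shutdown so that
 * every work item queued up to that point has finished before the state
 * those items use is released:
 *
 *	static struct workqueue_struct *example_wq;
 *
 *	static void example_shutdown(void)
 *	{
 *		flush_workqueue(example_wq);
 *		...	after this point, it is safe to free state that
 *			the queued works were referencing
 *	}
 */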
2433 
2434 /**
2435  * drain_workqueue - drain a workqueue
2436  * @wq: workqueue to drain
2437  *
2438  * Wait until the workqueue becomes empty.  While draining is in progress,
2439  * only chain queueing is allowed.  IOW, only currently pending or running
2440  * work items on @wq can queue further work items on it.  @wq is flushed
2441  * repeatedly until it becomes empty.  The number of flushes is determined
2442  * by the depth of chaining and should be relatively short.  Whine if it
2443  * takes too long.
2444  */
2445 void drain_workqueue(struct workqueue_struct *wq)
2446 {
2447 	unsigned int flush_cnt = 0;
2448 	unsigned int cpu;
2449 
2450 	/*
2451 	 * __queue_work() needs to test whether there are drainers; it is much
2452 	 * hotter than drain_workqueue() and already looks at @wq->flags.
2453 	 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2454 	 */
2455 	spin_lock(&workqueue_lock);
2456 	if (!wq->nr_drainers++)
2457 		wq->flags |= WQ_DRAINING;
2458 	spin_unlock(&workqueue_lock);
2459 reflush:
2460 	flush_workqueue(wq);
2461 
2462 	for_each_cwq_cpu(cpu, wq) {
2463 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2464 		bool drained;
2465 
2466 		spin_lock_irq(&cwq->gcwq->lock);
2467 		drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2468 		spin_unlock_irq(&cwq->gcwq->lock);
2469 
2470 		if (drained)
2471 			continue;
2472 
2473 		if (++flush_cnt == 10 ||
2474 		    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2475 			pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2476 				   wq->name, flush_cnt);
2477 		goto reflush;
2478 	}
2479 
2480 	spin_lock(&workqueue_lock);
2481 	if (!--wq->nr_drainers)
2482 		wq->flags &= ~WQ_DRAINING;
2483 	spin_unlock(&workqueue_lock);
2484 }
2485 EXPORT_SYMBOL_GPL(drain_workqueue);
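
/*
 * Example usage of drain_workqueue() -- an illustrative sketch with
 * hypothetical example_* identifiers.  Unlike a plain flush, draining
 * also waits out self-requeueing (chained) work items, so it is the
 * right tool when a workqueue must be completely empty:
 *
 *	static struct workqueue_struct *example_wq;
 *
 *	static void example_quiesce(void)
 *	{
 *		drain_workqueue(example_wq);
 *	}
 *
 * destroy_workqueue() below already drains internally; an explicit
 * drain is useful when the caller wants the queue empty but still
 * usable afterwards.
 */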
2486 
2487 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2488 			     bool wait_executing)
2489 {
2490 	struct worker *worker = NULL;
2491 	struct global_cwq *gcwq;
2492 	struct cpu_workqueue_struct *cwq;
2493 
2494 	might_sleep();
2495 	gcwq = get_work_gcwq(work);
2496 	if (!gcwq)
2497 		return false;
2498 
2499 	spin_lock_irq(&gcwq->lock);
2500 	if (!list_empty(&work->entry)) {
2501 		/*
2502 		 * See the comment near try_to_grab_pending()->smp_rmb().
2503 		 * If it was re-queued to a different gcwq under us, we
2504 		 * are not going to wait.
2505 		 */
2506 		smp_rmb();
2507 		cwq = get_work_cwq(work);
2508 		if (unlikely(!cwq || gcwq != cwq->gcwq))
2509 			goto already_gone;
2510 	} else if (wait_executing) {
2511 		worker = find_worker_executing_work(gcwq, work);
2512 		if (!worker)
2513 			goto already_gone;
2514 		cwq = worker->current_cwq;
2515 	} else
2516 		goto already_gone;
2517 
2518 	insert_wq_barrier(cwq, barr, work, worker);
2519 	spin_unlock_irq(&gcwq->lock);
2520 
2521 	/*
2522 	 * If @max_active is 1 or rescuer is in use, flushing another work
2523 	 * item on the same workqueue may lead to deadlock.  Make sure the
2524 	 * flusher is not running on the same workqueue by verifying write
2525 	 * access.
2526 	 */
2527 	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2528 		lock_map_acquire(&cwq->wq->lockdep_map);
2529 	else
2530 		lock_map_acquire_read(&cwq->wq->lockdep_map);
2531 	lock_map_release(&cwq->wq->lockdep_map);
2532 
2533 	return true;
2534 already_gone:
2535 	spin_unlock_irq(&gcwq->lock);
2536 	return false;
2537 }
2538 
2539 /**
2540  * flush_work - wait for a work to finish executing the last queueing instance
2541  * @work: the work to flush
2542  *
2543  * Wait until @work has finished execution.  This function considers
2544  * only the last queueing instance of @work.  If @work has been
2545  * enqueued across different CPUs on a non-reentrant workqueue or on
2546  * multiple workqueues, @work might still be executing on return on
2547  * some of the CPUs from earlier queueing.
2548  *
2549  * If @work was queued only on a non-reentrant, ordered or unbound
2550  * workqueue, @work is guaranteed to be idle on return if it hasn't
2551  * been requeued since flush started.
2552  *
2553  * RETURNS:
2554  * %true if flush_work() waited for the work to finish execution,
2555  * %false if it was already idle.
2556  */
2557 bool flush_work(struct work_struct *work)
2558 {
2559 	struct wq_barrier barr;
2560 
2561 	if (start_flush_work(work, &barr, true)) {
2562 		wait_for_completion(&barr.done);
2563 		destroy_work_on_stack(&barr.work);
2564 		return true;
2565 	} else
2566 		return false;
2567 }
2568 EXPORT_SYMBOL_GPL(flush_work);
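
/*
 * Example usage of flush_work() -- an illustrative sketch; example_work
 * and example_update() are hypothetical.  flush_work() waits only for
 * the last queueing instance of the work item, which is usually enough
 * to know that the results published by that instance are visible:
 *
 *	static void example_update(struct work_struct *work);
 *	static DECLARE_WORK(example_work, example_update);
 *
 *	schedule_work(&example_work);
 *	...
 *	flush_work(&example_work);
 *	...	the instance queued above has now finished running
 */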
2569 
2570 static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2571 {
2572 	struct wq_barrier barr;
2573 	struct worker *worker;
2574 
2575 	spin_lock_irq(&gcwq->lock);
2576 
2577 	worker = find_worker_executing_work(gcwq, work);
2578 	if (unlikely(worker))
2579 		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2580 
2581 	spin_unlock_irq(&gcwq->lock);
2582 
2583 	if (unlikely(worker)) {
2584 		wait_for_completion(&barr.done);
2585 		destroy_work_on_stack(&barr.work);
2586 		return true;
2587 	} else
2588 		return false;
2589 }
2590 
2591 static bool wait_on_work(struct work_struct *work)
2592 {
2593 	bool ret = false;
2594 	int cpu;
2595 
2596 	might_sleep();
2597 
2598 	lock_map_acquire(&work->lockdep_map);
2599 	lock_map_release(&work->lockdep_map);
2600 
2601 	for_each_gcwq_cpu(cpu)
2602 		ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2603 	return ret;
2604 }
2605 
2606 /**
2607  * flush_work_sync - wait until a work has finished execution
2608  * @work: the work to flush
2609  *
2610  * Wait until @work has finished execution.  On return, it's
2611  * guaranteed that all queueing instances of @work which happened
2612  * before this function is called are finished.  In other words, if
2613  * @work hasn't been requeued since this function was called, @work is
2614  * guaranteed to be idle on return.
2615  *
2616  * RETURNS:
2617  * %true if flush_work_sync() waited for the work to finish execution,
2618  * %false if it was already idle.
2619  */
2620 bool flush_work_sync(struct work_struct *work)
2621 {
2622 	struct wq_barrier barr;
2623 	bool pending, waited;
2624 
2625 	/* we'll wait for executions separately, queue barr only if pending */
2626 	pending = start_flush_work(work, &barr, false);
2627 
2628 	/* wait for executions to finish */
2629 	waited = wait_on_work(work);
2630 
2631 	/* wait for the pending one */
2632 	if (pending) {
2633 		wait_for_completion(&barr.done);
2634 		destroy_work_on_stack(&barr.work);
2635 	}
2636 
2637 	return pending || waited;
2638 }
2639 EXPORT_SYMBOL_GPL(flush_work_sync);
2640 
2641 /*
2642  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2643  * so this work can't be re-armed in any way.
2644  */
2645 static int try_to_grab_pending(struct work_struct *work)
2646 {
2647 	struct global_cwq *gcwq;
2648 	int ret = -1;
2649 
2650 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2651 		return 0;
2652 
2653 	/*
2654 	 * The queueing is in progress, or it is already queued. Try to
2655 	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2656 	 */
2657 	gcwq = get_work_gcwq(work);
2658 	if (!gcwq)
2659 		return ret;
2660 
2661 	spin_lock_irq(&gcwq->lock);
2662 	if (!list_empty(&work->entry)) {
2663 		/*
2664 		 * This work is queued, but perhaps we locked the wrong gcwq.
2665 		 * In that case we must see the new value after rmb(), see
2666 		 * insert_work()->wmb().
2667 		 */
2668 		smp_rmb();
2669 		if (gcwq == get_work_gcwq(work)) {
2670 			debug_work_deactivate(work);
2671 
2672 			/*
2673 			 * A delayed work item cannot be grabbed directly
2674 			 * because it might have linked NO_COLOR work items
2675 			 * which, if left on the delayed_list, will confuse
2676 			 * cwq->nr_active management later on and cause
2677 			 * stall.  Make sure the work item is activated
2678 			 * before grabbing.
2679 			 */
2680 			if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
2681 				cwq_activate_delayed_work(work);
2682 
2683 			list_del_init(&work->entry);
2684 			cwq_dec_nr_in_flight(get_work_cwq(work),
2685 				get_work_color(work),
2686 				*work_data_bits(work) & WORK_STRUCT_DELAYED);
2687 			ret = 1;
2688 		}
2689 	}
2690 	spin_unlock_irq(&gcwq->lock);
2691 
2692 	return ret;
2693 }
2694 
2695 static bool __cancel_work_timer(struct work_struct *work,
2696 				struct timer_list* timer)
2697 {
2698 	int ret;
2699 
2700 	do {
2701 		ret = (timer && likely(del_timer(timer)));
2702 		if (!ret)
2703 			ret = try_to_grab_pending(work);
2704 		wait_on_work(work);
2705 	} while (unlikely(ret < 0));
2706 
2707 	clear_work_data(work);
2708 	return ret;
2709 }
2710 
2711 /**
2712  * cancel_work_sync - cancel a work and wait for it to finish
2713  * @work: the work to cancel
2714  *
2715  * Cancel @work and wait for its execution to finish.  This function
2716  * can be used even if the work re-queues itself or migrates to
2717  * another workqueue.  On return from this function, @work is
2718  * guaranteed to be not pending or executing on any CPU.
2719  *
2720  * cancel_work_sync(&delayed_work->work) must not be used for
2721  * delayed_work's.  Use cancel_delayed_work_sync() instead.
2722  *
2723  * The caller must ensure that the workqueue on which @work was last
2724  * queued can't be destroyed before this function returns.
2725  *
2726  * RETURNS:
2727  * %true if @work was pending, %false otherwise.
2728  */
2729 bool cancel_work_sync(struct work_struct *work)
2730 {
2731 	return __cancel_work_timer(work, NULL);
2732 }
2733 EXPORT_SYMBOL_GPL(cancel_work_sync);
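
/*
 * Example usage of cancel_work_sync() -- an illustrative sketch with
 * hypothetical example_* names.  Because it both cancels a pending
 * instance and waits for a running one, it is the usual way to shut
 * down a work item whose handler may requeue itself:
 *
 *	static void example_poll(struct work_struct *work)
 *	{
 *		...
 *		schedule_work(work);
 *	}
 *	static DECLARE_WORK(example_work, example_poll);
 *
 *	On teardown:
 *		cancel_work_sync(&example_work);
 */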
2734 
2735 /**
2736  * flush_delayed_work - wait for a dwork to finish executing the last queueing
2737  * @dwork: the delayed work to flush
2738  *
2739  * Delayed timer is cancelled and the pending work is queued for
2740  * immediate execution.  Like flush_work(), this function only
2741  * considers the last queueing instance of @dwork.
2742  *
2743  * RETURNS:
2744  * %true if flush_work() waited for the work to finish execution,
2745  * %false if it was already idle.
2746  */
2747 bool flush_delayed_work(struct delayed_work *dwork)
2748 {
2749 	if (del_timer_sync(&dwork->timer))
2750 		__queue_work(raw_smp_processor_id(),
2751 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
2752 	return flush_work(&dwork->work);
2753 }
2754 EXPORT_SYMBOL(flush_delayed_work);
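
/*
 * Example usage of flush_delayed_work() -- an illustrative sketch with
 * hypothetical example_* names.  The pending timer is cancelled and the
 * work runs immediately instead of after the remaining delay:
 *
 *	static void example_writeback(struct work_struct *work);
 *	static DECLARE_DELAYED_WORK(example_dwork, example_writeback);
 *
 *	schedule_delayed_work(&example_dwork, HZ);
 *	...
 *	flush_delayed_work(&example_dwork);
 *	...	the writeback has run now rather than a second later
 */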
2755 
2756 /**
2757  * flush_delayed_work_sync - wait for a dwork to finish
2758  * @dwork: the delayed work to flush
2759  *
2760  * Delayed timer is cancelled and the pending work is queued for
2761  * execution immediately.  Other than timer handling, its behavior
2762  * is identical to flush_work_sync().
2763  *
2764  * RETURNS:
2765  * %true if flush_work_sync() waited for the work to finish execution,
2766  * %false if it was already idle.
2767  */
2768 bool flush_delayed_work_sync(struct delayed_work *dwork)
2769 {
2770 	if (del_timer_sync(&dwork->timer))
2771 		__queue_work(raw_smp_processor_id(),
2772 			     get_work_cwq(&dwork->work)->wq, &dwork->work);
2773 	return flush_work_sync(&dwork->work);
2774 }
2775 EXPORT_SYMBOL(flush_delayed_work_sync);
2776 
2777 /**
2778  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2779  * @dwork: the delayed work to cancel
2780  *
2781  * This is cancel_work_sync() for delayed works.
2782  *
2783  * RETURNS:
2784  * %true if @dwork was pending, %false otherwise.
2785  */
2786 bool cancel_delayed_work_sync(struct delayed_work *dwork)
2787 {
2788 	return __cancel_work_timer(&dwork->work, &dwork->timer);
2789 }
2790 EXPORT_SYMBOL(cancel_delayed_work_sync);
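
/*
 * Example usage of cancel_delayed_work_sync() -- an illustrative sketch
 * with hypothetical example_* names.  This is the delayed-work
 * counterpart of cancel_work_sync(): it deletes the timer, grabs or
 * waits out any executing instance and leaves the work idle:
 *
 *	static DECLARE_DELAYED_WORK(example_dwork, example_timeout_fn);
 *
 *	schedule_delayed_work(&example_dwork, msecs_to_jiffies(100));
 *	...
 *	cancel_delayed_work_sync(&example_dwork);
 */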
2791 
2792 /**
2793  * schedule_work - put work task in global workqueue
2794  * @work: job to be done
2795  *
2796  * Returns zero if @work was already on the kernel-global workqueue and
2797  * non-zero otherwise.
2798  *
2799  * This puts a job in the kernel-global workqueue if it was not already
2800  * queued and leaves it in the same position on the kernel-global
2801  * workqueue otherwise.
2802  */
2803 int schedule_work(struct work_struct *work)
2804 {
2805 	return queue_work(system_wq, work);
2806 }
2807 EXPORT_SYMBOL(schedule_work);
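
/*
 * Example usage of schedule_work() -- an illustrative sketch with
 * hypothetical example_* names.  The work item goes onto system_wq and
 * runs in process context some time later, which makes it a common way
 * to defer sleeping work out of a hard interrupt handler:
 *
 *	static void example_bottom_half(struct work_struct *work)
 *	{
 *		...	may sleep, unlike the hard irq handler
 *	}
 *	static DECLARE_WORK(example_work, example_bottom_half);
 *
 *	From the interrupt handler:
 *		schedule_work(&example_work);
 */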
2808 
2809 /*
2810  * schedule_work_on - put work task on a specific cpu
2811  * @cpu: cpu to put the work task on
2812  * @work: job to be done
2813  *
2814  * This puts a job on a specific cpu
2815  */
2816 int schedule_work_on(int cpu, struct work_struct *work)
2817 {
2818 	return queue_work_on(cpu, system_wq, work);
2819 }
2820 EXPORT_SYMBOL(schedule_work_on);
2821 
2822 /**
2823  * schedule_delayed_work - put work task in global workqueue after delay
2824  * @dwork: job to be done
2825  * @delay: number of jiffies to wait or 0 for immediate execution
2826  *
2827  * After waiting for a given time this puts a job in the kernel-global
2828  * workqueue.
2829  */
2830 int schedule_delayed_work(struct delayed_work *dwork,
2831 					unsigned long delay)
2832 {
2833 	return queue_delayed_work(system_wq, dwork, delay);
2834 }
2835 EXPORT_SYMBOL(schedule_delayed_work);
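
/*
 * Example usage of schedule_delayed_work() -- an illustrative sketch
 * with hypothetical example_* names.  The work is queued on system_wq
 * once the given number of jiffies has elapsed:
 *
 *	static DECLARE_DELAYED_WORK(example_retry, example_retry_fn);
 *
 *	schedule_delayed_work(&example_retry, msecs_to_jiffies(500));
 */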
2836 
2837 /**
2838  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2839  * @cpu: cpu to use
2840  * @dwork: job to be done
2841  * @delay: number of jiffies to wait
2842  *
2843  * After waiting for a given time this puts a job in the kernel-global
2844  * workqueue on the specified CPU.
2845  */
2846 int schedule_delayed_work_on(int cpu,
2847 			struct delayed_work *dwork, unsigned long delay)
2848 {
2849 	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2850 }
2851 EXPORT_SYMBOL(schedule_delayed_work_on);
2852 
2853 /**
2854  * schedule_on_each_cpu - execute a function synchronously on each online CPU
2855  * @func: the function to call
2856  *
2857  * schedule_on_each_cpu() executes @func on each online CPU using the
2858  * system workqueue and blocks until all CPUs have completed.
2859  * schedule_on_each_cpu() is very slow.
2860  *
2861  * RETURNS:
2862  * 0 on success, -errno on failure.
2863  */
2864 int schedule_on_each_cpu(work_func_t func)
2865 {
2866 	int cpu;
2867 	struct work_struct __percpu *works;
2868 
2869 	works = alloc_percpu(struct work_struct);
2870 	if (!works)
2871 		return -ENOMEM;
2872 
2873 	get_online_cpus();
2874 
2875 	for_each_online_cpu(cpu) {
2876 		struct work_struct *work = per_cpu_ptr(works, cpu);
2877 
2878 		INIT_WORK(work, func);
2879 		schedule_work_on(cpu, work);
2880 	}
2881 
2882 	for_each_online_cpu(cpu)
2883 		flush_work(per_cpu_ptr(works, cpu));
2884 
2885 	put_online_cpus();
2886 	free_percpu(works);
2887 	return 0;
2888 }
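
/*
 * Example usage of schedule_on_each_cpu() -- an illustrative sketch;
 * the example_* names are hypothetical.  The callback runs once on
 * every online CPU and the caller blocks until all instances finish:
 *
 *	static void example_flush_percpu_caches(struct work_struct *unused)
 *	{
 *		...	runs on the CPU it was queued for
 *	}
 *
 *	int ret = schedule_on_each_cpu(example_flush_percpu_caches);
 *	if (ret)
 *		...	-ENOMEM, nothing was scheduled
 */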
2889 
2890 /**
2891  * flush_scheduled_work - ensure that any scheduled work has run to completion.
2892  *
2893  * Forces execution of the kernel-global workqueue and blocks until its
2894  * completion.
2895  *
2896  * Think twice before calling this function!  It's very easy to get into
2897  * trouble if you don't take great care.  Either of the following situations
2898  * will lead to deadlock:
2899  *
2900  *	One of the work items currently on the workqueue needs to acquire
2901  *	a lock held by your code or its caller.
2902  *
2903  *	Your code is running in the context of a work routine.
2904  *
2905  * They will be detected by lockdep when they occur, but the first might not
2906  * occur very often.  It depends on what work items are on the workqueue and
2907  * what locks they need, which you have no control over.
2908  *
2909  * In most situations flushing the entire workqueue is overkill; you merely
2910  * need to know that a particular work item isn't queued and isn't running.
2911  * In such cases you should use cancel_delayed_work_sync() or
2912  * cancel_work_sync() instead.
2913  */
2914 void flush_scheduled_work(void)
2915 {
2916 	flush_workqueue(system_wq);
2917 }
2918 EXPORT_SYMBOL(flush_scheduled_work);
2919 
2920 /**
2921  * execute_in_process_context - reliably execute the routine with user context
2922  * @fn:		the function to execute
2923  * @ew:		guaranteed storage for the execute work structure (must
2924  *		be available when the work executes)
2925  *
2926  * Executes the function immediately if process context is available,
2927  * otherwise schedules the function for delayed execution.
2928  *
2929  * Returns:	0 - function was executed
2930  *		1 - function was scheduled for execution
2931  */
2932 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2933 {
2934 	if (!in_interrupt()) {
2935 		fn(&ew->work);
2936 		return 0;
2937 	}
2938 
2939 	INIT_WORK(&ew->work, fn);
2940 	schedule_work(&ew->work);
2941 
2942 	return 1;
2943 }
2944 EXPORT_SYMBOL_GPL(execute_in_process_context);
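
/*
 * Example usage of execute_in_process_context() -- an illustrative
 * sketch with hypothetical example_* names.  The caller does not need
 * to know whether it is running in interrupt context; the function
 * either calls @fn directly or defers it via the supplied storage:
 *
 *	struct example_dev {
 *		struct execute_work	release_ew;
 *		...
 *	};
 *
 *	execute_in_process_context(example_release_fn, &dev->release_ew);
 */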
2945 
2946 int keventd_up(void)
2947 {
2948 	return system_wq != NULL;
2949 }
2950 
2951 static int alloc_cwqs(struct workqueue_struct *wq)
2952 {
2953 	/*
2954 	 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
2955 	 * Make sure that the alignment isn't lower than that of
2956 	 * unsigned long long.
2957 	 */
2958 	const size_t size = sizeof(struct cpu_workqueue_struct);
2959 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2960 				   __alignof__(unsigned long long));
2961 
2962 	if (!(wq->flags & WQ_UNBOUND))
2963 		wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2964 	else {
2965 		void *ptr;
2966 
2967 		/*
2968 		 * Allocate enough room to align cwq and put an extra
2969 		 * pointer at the end pointing back to the originally
2970 		 * allocated pointer which will be used for free.
2971 		 */
2972 		ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2973 		if (ptr) {
2974 			wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2975 			*(void **)(wq->cpu_wq.single + 1) = ptr;
2976 		}
2977 	}
2978 
2979 	/* just in case, make sure it's actually aligned */
2980 	BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2981 	return wq->cpu_wq.v ? 0 : -ENOMEM;
2982 }
2983 
2984 static void free_cwqs(struct workqueue_struct *wq)
2985 {
2986 	if (!(wq->flags & WQ_UNBOUND))
2987 		free_percpu(wq->cpu_wq.pcpu);
2988 	else if (wq->cpu_wq.single) {
2989 		/* the pointer to free is stored right after the cwq */
2990 		kfree(*(void **)(wq->cpu_wq.single + 1));
2991 	}
2992 }
2993 
2994 static int wq_clamp_max_active(int max_active, unsigned int flags,
2995 			       const char *name)
2996 {
2997 	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
2998 
2999 	if (max_active < 1 || max_active > lim)
3000 		printk(KERN_WARNING "workqueue: max_active %d requested for %s "
3001 		       "is out of range, clamping between %d and %d\n",
3002 		       max_active, name, 1, lim);
3003 
3004 	return clamp_val(max_active, 1, lim);
3005 }
3006 
3007 struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3008 					       unsigned int flags,
3009 					       int max_active,
3010 					       struct lock_class_key *key,
3011 					       const char *lock_name, ...)
3012 {
3013 	va_list args, args1;
3014 	struct workqueue_struct *wq;
3015 	unsigned int cpu;
3016 	size_t namelen;
3017 
3018 	/* determine namelen, allocate wq and format name */
3019 	va_start(args, lock_name);
3020 	va_copy(args1, args);
3021 	namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3022 
3023 	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3024 	if (!wq)
3025 		goto err;
3026 
3027 	vsnprintf(wq->name, namelen, fmt, args1);
3028 	va_end(args);
3029 	va_end(args1);
3030 
3031 	/*
3032 	 * Workqueues which may be used during memory reclaim should
3033 	 * have a rescuer to guarantee forward progress.
3034 	 */
3035 	if (flags & WQ_MEM_RECLAIM)
3036 		flags |= WQ_RESCUER;
3037 
3038 	/*
3039 	 * Unbound workqueues aren't concurrency managed and should be
3040 	 * dispatched to workers immediately.
3041 	 */
3042 	if (flags & WQ_UNBOUND)
3043 		flags |= WQ_HIGHPRI;
3044 
3045 	max_active = max_active ?: WQ_DFL_ACTIVE;
3046 	max_active = wq_clamp_max_active(max_active, flags, wq->name);
3047 
3048 	/* init wq */
3049 	wq->flags = flags;
3050 	wq->saved_max_active = max_active;
3051 	mutex_init(&wq->flush_mutex);
3052 	atomic_set(&wq->nr_cwqs_to_flush, 0);
3053 	INIT_LIST_HEAD(&wq->flusher_queue);
3054 	INIT_LIST_HEAD(&wq->flusher_overflow);
3055 
3056 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3057 	INIT_LIST_HEAD(&wq->list);
3058 
3059 	if (alloc_cwqs(wq) < 0)
3060 		goto err;
3061 
3062 	for_each_cwq_cpu(cpu, wq) {
3063 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3064 		struct global_cwq *gcwq = get_gcwq(cpu);
3065 
3066 		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3067 		cwq->gcwq = gcwq;
3068 		cwq->wq = wq;
3069 		cwq->flush_color = -1;
3070 		cwq->max_active = max_active;
3071 		INIT_LIST_HEAD(&cwq->delayed_works);
3072 	}
3073 
3074 	if (flags & WQ_RESCUER) {
3075 		struct worker *rescuer;
3076 
3077 		if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
3078 			goto err;
3079 
3080 		wq->rescuer = rescuer = alloc_worker();
3081 		if (!rescuer)
3082 			goto err;
3083 
3084 		rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3085 					       wq->name);
3086 		if (IS_ERR(rescuer->task))
3087 			goto err;
3088 
3089 		rescuer->task->flags |= PF_THREAD_BOUND;
3090 		wake_up_process(rescuer->task);
3091 	}
3092 
3093 	/*
3094 	 * workqueue_lock protects global freeze state and workqueues
3095 	 * list.  Grab it, set max_active accordingly and add the new
3096 	 * workqueue to workqueues list.
3097 	 */
3098 	spin_lock(&workqueue_lock);
3099 
3100 	if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3101 		for_each_cwq_cpu(cpu, wq)
3102 			get_cwq(cpu, wq)->max_active = 0;
3103 
3104 	list_add(&wq->list, &workqueues);
3105 
3106 	spin_unlock(&workqueue_lock);
3107 
3108 	return wq;
3109 err:
3110 	if (wq) {
3111 		free_cwqs(wq);
3112 		free_mayday_mask(wq->mayday_mask);
3113 		kfree(wq->rescuer);
3114 		kfree(wq);
3115 	}
3116 	return NULL;
3117 }
3118 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
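
/*
 * Example usage of the allocation interface -- an illustrative sketch
 * with a hypothetical workqueue name.  Callers normally go through the
 * alloc_workqueue() wrapper rather than __alloc_workqueue_key()
 * directly.  WQ_MEM_RECLAIM requests a rescuer so the queue can make
 * forward progress under memory pressure, and max_active = 1 limits
 * the queue to one in-flight work item per CPU:
 *
 *	struct workqueue_struct *example_wq;
 *
 *	example_wq = alloc_workqueue("example_wq", WQ_MEM_RECLAIM, 1);
 *	if (!example_wq)
 *		return -ENOMEM;
 */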
3119 
3120 /**
3121  * destroy_workqueue - safely terminate a workqueue
3122  * @wq: target workqueue
3123  *
3124  * Safely destroy a workqueue. All work currently pending will be done first.
3125  */
3126 void destroy_workqueue(struct workqueue_struct *wq)
3127 {
3128 	unsigned int cpu;
3129 
3130 	/* drain it before proceeding with destruction */
3131 	drain_workqueue(wq);
3132 
3133 	/*
3134 	 * wq list is used to freeze wq, remove from list after
3135 	 * flushing is complete in case freeze races us.
3136 	 */
3137 	spin_lock(&workqueue_lock);
3138 	list_del(&wq->list);
3139 	spin_unlock(&workqueue_lock);
3140 
3141 	/* sanity check */
3142 	for_each_cwq_cpu(cpu, wq) {
3143 		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3144 		int i;
3145 
3146 		for (i = 0; i < WORK_NR_COLORS; i++)
3147 			BUG_ON(cwq->nr_in_flight[i]);
3148 		BUG_ON(cwq->nr_active);
3149 		BUG_ON(!list_empty(&cwq->delayed_works));
3150 	}
3151 
3152 	if (wq->flags & WQ_RESCUER) {
3153 		kthread_stop(wq->rescuer->task);
3154 		free_mayday_mask(wq->mayday_mask);
3155 		kfree(wq->rescuer);
3156 	}
3157 
3158 	free_cwqs(wq);
3159 	kfree(wq);
3160 }
3161 EXPORT_SYMBOL_GPL(destroy_workqueue);
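
/*
 * Example teardown -- an illustrative sketch with hypothetical
 * example_* names.  Cancel or flush all users first; queueing new work
 * on @wq after destroy_workqueue() returns is a bug:
 *
 *	cancel_delayed_work_sync(&example_dwork);
 *	destroy_workqueue(example_wq);
 */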
3162 
3163 /**
3164  * workqueue_set_max_active - adjust max_active of a workqueue
3165  * @wq: target workqueue
3166  * @max_active: new max_active value.
3167  *
3168  * Set max_active of @wq to @max_active.
3169  *
3170  * CONTEXT:
3171  * Don't call from IRQ context.
3172  */
3173 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3174 {
3175 	unsigned int cpu;
3176 
3177 	max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3178 
3179 	spin_lock(&workqueue_lock);
3180 
3181 	wq->saved_max_active = max_active;
3182 
3183 	for_each_cwq_cpu(cpu, wq) {
3184 		struct global_cwq *gcwq = get_gcwq(cpu);
3185 
3186 		spin_lock_irq(&gcwq->lock);
3187 
3188 		if (!(wq->flags & WQ_FREEZABLE) ||
3189 		    !(gcwq->flags & GCWQ_FREEZING))
3190 			get_cwq(gcwq->cpu, wq)->max_active = max_active;
3191 
3192 		spin_unlock_irq(&gcwq->lock);
3193 	}
3194 
3195 	spin_unlock(&workqueue_lock);
3196 }
3197 EXPORT_SYMBOL_GPL(workqueue_set_max_active);
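
/*
 * Example usage of workqueue_set_max_active() -- an illustrative sketch
 * with a hypothetical example_wq.  This can be used to widen or
 * throttle an existing queue at runtime, e.g. in response to a tunable:
 *
 *	workqueue_set_max_active(example_wq, 4);
 */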
3198 
3199 /**
3200  * workqueue_congested - test whether a workqueue is congested
3201  * @cpu: CPU in question
3202  * @wq: target workqueue
3203  *
3204  * Test whether @wq's cpu workqueue for @cpu is congested.  There is
3205  * no synchronization around this function and the test result is
3206  * unreliable and only useful as advisory hints or for debugging.
3207  *
3208  * RETURNS:
3209  * %true if congested, %false otherwise.
3210  */
3211 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3212 {
3213 	struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3214 
3215 	return !list_empty(&cwq->delayed_works);
3216 }
3217 EXPORT_SYMBOL_GPL(workqueue_congested);
3218 
3219 /**
3220  * work_cpu - return the last known associated cpu for @work
3221  * @work: the work of interest
3222  *
3223  * RETURNS:
3224  * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
3225  */
3226 unsigned int work_cpu(struct work_struct *work)
3227 {
3228 	struct global_cwq *gcwq = get_work_gcwq(work);
3229 
3230 	return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3231 }
3232 EXPORT_SYMBOL_GPL(work_cpu);
3233 
3234 /**
3235  * work_busy - test whether a work is currently pending or running
3236  * @work: the work to be tested
3237  *
3238  * Test whether @work is currently pending or running.  There is no
3239  * synchronization around this function and the test result is
3240  * unreliable and only useful as advisory hints or for debugging.
3241  * Especially for reentrant wqs, the pending state might hide the
3242  * running state.
3243  *
3244  * RETURNS:
3245  * OR'd bitmask of WORK_BUSY_* bits.
3246  */
3247 unsigned int work_busy(struct work_struct *work)
3248 {
3249 	struct global_cwq *gcwq = get_work_gcwq(work);
3250 	unsigned long flags;
3251 	unsigned int ret = 0;
3252 
3253 	if (!gcwq)
3254 		return false;
3255 
3256 	spin_lock_irqsave(&gcwq->lock, flags);
3257 
3258 	if (work_pending(work))
3259 		ret |= WORK_BUSY_PENDING;
3260 	if (find_worker_executing_work(gcwq, work))
3261 		ret |= WORK_BUSY_RUNNING;
3262 
3263 	spin_unlock_irqrestore(&gcwq->lock, flags);
3264 
3265 	return ret;
3266 }
3267 EXPORT_SYMBOL_GPL(work_busy);
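
/*
 * Example usage of work_busy() -- an illustrative sketch with a
 * hypothetical example_work.  The result is advisory only, as noted
 * above, so it suits debugfs/statistics style reporting:
 *
 *	unsigned int busy = work_busy(&example_work);
 *
 *	if (busy & WORK_BUSY_RUNNING)
 *		pr_debug("example_work is running\n");
 *	if (busy & WORK_BUSY_PENDING)
 *		pr_debug("example_work is pending\n");
 */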
3268 
3269 /*
3270  * CPU hotplug.
3271  *
3272  * There are two challenges in supporting CPU hotplug.  Firstly, there
3273  * are a lot of assumptions on strong associations among work, cwq and
3274  * gcwq which make migrating pending and scheduled works very
3275  * difficult to implement without impacting hot paths.  Secondly,
3276  * gcwqs serve a mix of short, long and very long running works, making
3277  * blocked draining impractical.
3278  *
3279  * This is solved by allowing a gcwq to be detached from CPU, running
3280  * it with unbound (rogue) workers and allowing it to be reattached
3281  * later if the cpu comes back online.  A separate thread is created
3282  * to govern a gcwq in such state and is called the trustee of the
3283  * gcwq.
3284  *
3285  * Trustee states and their descriptions.
3286  *
3287  * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
3288  *		new trustee is started with this state.
3289  *
3290  * IN_CHARGE	Once started, trustee will enter this state after
3291  *		assuming the manager role and making all existing
3292  *		workers rogue.  DOWN_PREPARE waits for trustee to
3293  *		enter this state.  After reaching IN_CHARGE, trustee
3294  *		tries to execute the pending worklist until it's empty
3295  *		and the state is set to BUTCHER, or the state is set
3296  *		to RELEASE.
3297  *
3298  * BUTCHER	Command state which is set by the cpu callback after
3299  *		the cpu has gone down.  Once this state is set, trustee
3300  *		knows that there will be no new works on the worklist
3301  *		and once the worklist is empty it can proceed to
3302  *		killing idle workers.
3303  *
3304  * RELEASE	Command state which is set by the cpu callback if the
3305  *		cpu down has been canceled or it has come online
3306  *		again.  After recognizing this state, trustee stops
3307  *		trying to drain or butcher and clears ROGUE, rebinds
3308  *		all remaining workers back to the cpu and releases
3309  *		manager role.
3310  *
3311  * DONE		Trustee will enter this state after BUTCHER or RELEASE
3312  *		is complete.
3313  *
3314  *          trustee                 CPU                draining
3315  *         took over                down               complete
3316  * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3317  *                        |                     |                  ^
3318  *                        | CPU is back online  v   return workers |
3319  *                         ----------------> RELEASE --------------
3320  */
3321 
3322 /**
3323  * trustee_wait_event_timeout - timed event wait for trustee
3324  * @cond: condition to wait for
3325  * @timeout: timeout in jiffies
3326  *
3327  * wait_event_timeout() for trustee to use.  Handles locking and
3328  * checks for RELEASE request.
3329  *
3330  * CONTEXT:
3331  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3332  * multiple times.  To be used by trustee.
3333  *
3334  * RETURNS:
3335  * A positive value indicating the time left if @cond is satisfied,
3336  * 0 if timed out, or -1 if canceled.
3337  */
3338 #define trustee_wait_event_timeout(cond, timeout) ({			\
3339 	long __ret = (timeout);						\
3340 	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
3341 	       __ret) {							\
3342 		spin_unlock_irq(&gcwq->lock);				\
3343 		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
3344 			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
3345 			__ret);						\
3346 		spin_lock_irq(&gcwq->lock);				\
3347 	}								\
3348 	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
3349 })
3350 
3351 /**
3352  * trustee_wait_event - event wait for trustee
3353  * @cond: condition to wait for
3354  *
3355  * wait_event() for trustee to use.  Automatically handles locking and
3356  * checks for RELEASE request.
3357  *
3358  * CONTEXT:
3359  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3360  * multiple times.  To be used by trustee.
3361  *
3362  * RETURNS:
3363  * 0 if @cond is satisfied, -1 if canceled.
3364  */
3365 #define trustee_wait_event(cond) ({					\
3366 	long __ret1;							\
3367 	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3368 	__ret1 < 0 ? -1 : 0;						\
3369 })
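
/*
 * Calling convention sketch for the two macros above, mirroring how
 * trustee_thread() below uses them: the caller must hold gcwq->lock and
 * have a "gcwq" variable in scope, and a negative return means a RELEASE
 * request arrived while waiting (the "release" label is hypothetical).
 *
 *	spin_lock_irq(&gcwq->lock);
 *	rc = trustee_wait_event(list_empty(&gcwq->worklist));
 *	if (rc < 0)
 *		goto release;
 *	spin_unlock_irq(&gcwq->lock);
 */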
3370 
3371 static int __cpuinit trustee_thread(void *__gcwq)
3372 {
3373 	struct global_cwq *gcwq = __gcwq;
3374 	struct worker *worker;
3375 	struct work_struct *work;
3376 	struct hlist_node *pos;
3377 	long rc;
3378 	int i;
3379 
3380 	BUG_ON(gcwq->cpu != smp_processor_id());
3381 
3382 	spin_lock_irq(&gcwq->lock);
3383 	/*
3384 	 * Claim the manager position and make all workers rogue.
3385 	 * Trustee must be bound to the target cpu and can't be
3386 	 * cancelled.
3387 	 */
3388 	BUG_ON(gcwq->cpu != smp_processor_id());
3389 	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3390 	BUG_ON(rc < 0);
3391 
3392 	gcwq->flags |= GCWQ_MANAGING_WORKERS;
3393 
3394 	list_for_each_entry(worker, &gcwq->idle_list, entry)
3395 		worker->flags |= WORKER_ROGUE;
3396 
3397 	for_each_busy_worker(worker, i, pos, gcwq)
3398 		worker->flags |= WORKER_ROGUE;
3399 
3400 	/*
3401 	 * Call schedule() so that we cross rq->lock and thus can
3402 	 * guarantee sched callbacks see the rogue flag.  This is
3403 	 * necessary as scheduler callbacks may be invoked from other
3404 	 * cpus.
3405 	 */
3406 	spin_unlock_irq(&gcwq->lock);
3407 	schedule();
3408 	spin_lock_irq(&gcwq->lock);
3409 
3410 	/*
3411 	 * Sched callbacks are disabled now.  Zap nr_running.  After
3412 	 * this, nr_running stays zero and need_more_worker() and
3413 	 * keep_working() are always true as long as the worklist is
3414 	 * not empty.
3415 	 */
3416 	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3417 
3418 	spin_unlock_irq(&gcwq->lock);
3419 	del_timer_sync(&gcwq->idle_timer);
3420 	spin_lock_irq(&gcwq->lock);
3421 
3422 	/*
3423 	 * We're now in charge.  Notify and proceed to drain.  We need
3424 	 * to keep the gcwq running during the whole CPU down
3425 	 * procedure as other cpu hotunplug callbacks may need to
3426 	 * flush currently running tasks.
3427 	 */
3428 	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3429 	wake_up_all(&gcwq->trustee_wait);
3430 
3431 	/*
3432 	 * The original cpu is in the process of dying and may go away
3433 	 * anytime now.  When that happens, we and all workers would
3434 	 * be migrated to other cpus.  Try draining any left work.  We
3435 	 * want to get it over with ASAP - spam rescuers, wake up as
3436 	 * many idlers as necessary and create new ones till the
3437 	 * worklist is empty.  Note that if the gcwq is frozen, there
3438 	 * may be frozen works in freezable cwqs.  Don't declare
3439 	 * completion while frozen.
3440 	 */
3441 	while (gcwq->nr_workers != gcwq->nr_idle ||
3442 	       gcwq->flags & GCWQ_FREEZING ||
3443 	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3444 		int nr_works = 0;
3445 
3446 		list_for_each_entry(work, &gcwq->worklist, entry) {
3447 			send_mayday(work);
3448 			nr_works++;
3449 		}
3450 
3451 		list_for_each_entry(worker, &gcwq->idle_list, entry) {
3452 			if (!nr_works--)
3453 				break;
3454 			wake_up_process(worker->task);
3455 		}
3456 
3457 		if (need_to_create_worker(gcwq)) {
3458 			spin_unlock_irq(&gcwq->lock);
3459 			worker = create_worker(gcwq, false);
3460 			spin_lock_irq(&gcwq->lock);
3461 			if (worker) {
3462 				worker->flags |= WORKER_ROGUE;
3463 				start_worker(worker);
3464 			}
3465 		}
3466 
3467 		/* give a breather */
3468 		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3469 			break;
3470 	}
3471 
3472 	/*
3473 	 * Either all works have been scheduled and cpu is down, or
3474 	 * cpu down has already been canceled.  Wait for and butcher
3475 	 * all workers till we're canceled.
3476 	 */
3477 	do {
3478 		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3479 		while (!list_empty(&gcwq->idle_list))
3480 			destroy_worker(list_first_entry(&gcwq->idle_list,
3481 							struct worker, entry));
3482 	} while (gcwq->nr_workers && rc >= 0);
3483 
3484 	/*
3485 	 * At this point, either draining has completed and no worker
3486 	 * is left, or cpu down has been canceled or the cpu is being
3487 	 * brought back up.  There shouldn't be any idle one left.
3488 	 * Tell the remaining busy ones to rebind once they finish their
3489 	 * currently scheduled works by scheduling the rebind_work.
3490 	 */
3491 	WARN_ON(!list_empty(&gcwq->idle_list));
3492 
3493 	for_each_busy_worker(worker, i, pos, gcwq) {
3494 		struct work_struct *rebind_work = &worker->rebind_work;
3495 		unsigned long worker_flags = worker->flags;
3496 
3497 		/*
3498 		 * Rebind_work may race with future cpu hotplug
3499 		 * operations.  Use a separate flag to mark that
3500 		 * rebinding is scheduled.  The morphing should
3501 		 * be atomic.
3502 		 */
3503 		worker_flags |= WORKER_REBIND;
3504 		worker_flags &= ~WORKER_ROGUE;
3505 		ACCESS_ONCE(worker->flags) = worker_flags;
3506 
3507 		/* queue rebind_work, wq doesn't matter, use the default one */
3508 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3509 				     work_data_bits(rebind_work)))
3510 			continue;
3511 
3512 		debug_work_activate(rebind_work);
3513 		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3514 			    worker->scheduled.next,
3515 			    work_color_to_flags(WORK_NO_COLOR));
3516 	}
3517 
3518 	/* relinquish manager role */
3519 	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3520 
3521 	/* notify completion */
3522 	gcwq->trustee = NULL;
3523 	gcwq->trustee_state = TRUSTEE_DONE;
3524 	wake_up_all(&gcwq->trustee_wait);
3525 	spin_unlock_irq(&gcwq->lock);
3526 	return 0;
3527 }
3528 
3529 /**
3530  * wait_trustee_state - wait for trustee to enter the specified state
3531  * @gcwq: gcwq the trustee of interest belongs to
3532  * @state: target state to wait for
3533  *
3534  * Wait for the trustee to reach @state.  DONE is also matched.
3535  *
3536  * CONTEXT:
3537  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3538  * multiple times.  To be used by cpu_callback.
3539  */
3540 static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3541 __releases(&gcwq->lock)
3542 __acquires(&gcwq->lock)
3543 {
3544 	if (!(gcwq->trustee_state == state ||
3545 	      gcwq->trustee_state == TRUSTEE_DONE)) {
3546 		spin_unlock_irq(&gcwq->lock);
3547 		__wait_event(gcwq->trustee_wait,
3548 			     gcwq->trustee_state == state ||
3549 			     gcwq->trustee_state == TRUSTEE_DONE);
3550 		spin_lock_irq(&gcwq->lock);
3551 	}
3552 }
3553 
3554 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3555 						unsigned long action,
3556 						void *hcpu)
3557 {
3558 	unsigned int cpu = (unsigned long)hcpu;
3559 	struct global_cwq *gcwq = get_gcwq(cpu);
3560 	struct task_struct *new_trustee = NULL;
3561 	struct worker *uninitialized_var(new_worker);
3562 	unsigned long flags;
3563 
3564 	action &= ~CPU_TASKS_FROZEN;
3565 
3566 	switch (action) {
3567 	case CPU_DOWN_PREPARE:
3568 		new_trustee = kthread_create(trustee_thread, gcwq,
3569 					     "workqueue_trustee/%d\n", cpu);
3570 		if (IS_ERR(new_trustee))
3571 			return notifier_from_errno(PTR_ERR(new_trustee));
3572 		kthread_bind(new_trustee, cpu);
3573 		/* fall through */
3574 	case CPU_UP_PREPARE:
3575 		BUG_ON(gcwq->first_idle);
3576 		new_worker = create_worker(gcwq, false);
3577 		if (!new_worker) {
3578 			if (new_trustee)
3579 				kthread_stop(new_trustee);
3580 			return NOTIFY_BAD;
3581 		}
3582 	}
3583 
3584 	/* some are called w/ irq disabled, don't disturb irq status */
3585 	spin_lock_irqsave(&gcwq->lock, flags);
3586 
3587 	switch (action) {
3588 	case CPU_DOWN_PREPARE:
3589 		/* initialize trustee and tell it to acquire the gcwq */
3590 		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3591 		gcwq->trustee = new_trustee;
3592 		gcwq->trustee_state = TRUSTEE_START;
3593 		wake_up_process(gcwq->trustee);
3594 		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3595 		/* fall through */
3596 	case CPU_UP_PREPARE:
3597 		BUG_ON(gcwq->first_idle);
3598 		gcwq->first_idle = new_worker;
3599 		break;
3600 
3601 	case CPU_DYING:
3602 		/*
3603 		 * Before this, the trustee and all workers except for
3604 		 * the ones which are still executing works from
3605 		 * before the last CPU down must be on the cpu.  After
3606 		 * this, they'll all be diasporas.
3607 		 */
3608 		gcwq->flags |= GCWQ_DISASSOCIATED;
3609 		break;
3610 
3611 	case CPU_POST_DEAD:
3612 		gcwq->trustee_state = TRUSTEE_BUTCHER;
3613 		/* fall through */
3614 	case CPU_UP_CANCELED:
3615 		destroy_worker(gcwq->first_idle);
3616 		gcwq->first_idle = NULL;
3617 		break;
3618 
3619 	case CPU_DOWN_FAILED:
3620 	case CPU_ONLINE:
3621 		gcwq->flags &= ~GCWQ_DISASSOCIATED;
3622 		if (gcwq->trustee_state != TRUSTEE_DONE) {
3623 			gcwq->trustee_state = TRUSTEE_RELEASE;
3624 			wake_up_process(gcwq->trustee);
3625 			wait_trustee_state(gcwq, TRUSTEE_DONE);
3626 		}
3627 
3628 		/*
3629 		 * Trustee is done and there might be no worker left.
3630 		 * Put the first_idle in and request a real manager to
3631 		 * take a look.
3632 		 */
3633 		spin_unlock_irq(&gcwq->lock);
3634 		kthread_bind(gcwq->first_idle->task, cpu);
3635 		spin_lock_irq(&gcwq->lock);
3636 		gcwq->flags |= GCWQ_MANAGE_WORKERS;
3637 		start_worker(gcwq->first_idle);
3638 		gcwq->first_idle = NULL;
3639 		break;
3640 	}
3641 
3642 	spin_unlock_irqrestore(&gcwq->lock, flags);
3643 
3644 	return notifier_from_errno(0);
3645 }
3646 
3647 /*
3648  * Workqueues should be brought up before normal priority CPU notifiers.
3649  * This will be registered as a high priority CPU notifier.
3650  */
3651 static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3652 					       unsigned long action,
3653 					       void *hcpu)
3654 {
3655 	switch (action & ~CPU_TASKS_FROZEN) {
3656 	case CPU_UP_PREPARE:
3657 	case CPU_UP_CANCELED:
3658 	case CPU_DOWN_FAILED:
3659 	case CPU_ONLINE:
3660 		return workqueue_cpu_callback(nfb, action, hcpu);
3661 	}
3662 	return NOTIFY_OK;
3663 }
3664 
3665 /*
3666  * Workqueues should be brought down after normal priority CPU notifiers.
3667  * This will be registered as a low priority CPU notifier.
3668  */
3669 static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3670 						 unsigned long action,
3671 						 void *hcpu)
3672 {
3673 	switch (action & ~CPU_TASKS_FROZEN) {
3674 	case CPU_DOWN_PREPARE:
3675 	case CPU_DYING:
3676 	case CPU_POST_DEAD:
3677 		return workqueue_cpu_callback(nfb, action, hcpu);
3678 	}
3679 	return NOTIFY_OK;
3680 }
3681 
3682 #ifdef CONFIG_SMP
3683 
3684 struct work_for_cpu {
3685 	struct work_struct work;
3686 	long (*fn)(void *);
3687 	void *arg;
3688 	long ret;
3689 };
3690 
3691 static void work_for_cpu_fn(struct work_struct *work)
3692 {
3693 	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
3694 
3695 	wfc->ret = wfc->fn(wfc->arg);
3696 }
3697 
3698 /**
3699  * work_on_cpu - run a function in user context on a particular cpu
3700  * @cpu: the cpu to run on
3701  * @fn: the function to run
3702  * @arg: the function arg
3703  *
3704  * This will return the value @fn returns.
3705  * It is up to the caller to ensure that the cpu doesn't go offline.
3706  * The caller must not hold any locks which would prevent @fn from completing.
3707  */
3708 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3709 {
3710 	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3711 
3712 	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
3713 	schedule_work_on(cpu, &wfc.work);
3714 	flush_work(&wfc.work);
3715 	return wfc.ret;
3716 }
3717 EXPORT_SYMBOL_GPL(work_on_cpu);
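
/*
 * Usage sketch: run a function on a specific CPU and collect its return
 * value.  The function and the CPU number are purely illustrative; the
 * caller pins CPU hotplug with get_online_cpus() so the target CPU can't
 * go away mid-call, per the rule documented above.
 *
 *	static long whoami(void *unused)
 *	{
 *		return (long)raw_smp_processor_id();
 *	}
 *
 *	static void example(void)
 *	{
 *		get_online_cpus();
 *		if (cpu_online(1))
 *			pr_info("ran on cpu %ld\n",
 *				work_on_cpu(1, whoami, NULL));
 *		put_online_cpus();
 *	}
 */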
3718 #endif /* CONFIG_SMP */
3719 
3720 #ifdef CONFIG_FREEZER
3721 
3722 /**
3723  * freeze_workqueues_begin - begin freezing workqueues
3724  *
3725  * Start freezing workqueues.  After this function returns, all freezable
3726  * workqueues will queue new works to their delayed_works list instead
3727  * of gcwq->worklist.
3728  *
3729  * CONTEXT:
3730  * Grabs and releases workqueue_lock and gcwq->lock's.
3731  */
3732 void freeze_workqueues_begin(void)
3733 {
3734 	unsigned int cpu;
3735 
3736 	spin_lock(&workqueue_lock);
3737 
3738 	BUG_ON(workqueue_freezing);
3739 	workqueue_freezing = true;
3740 
3741 	for_each_gcwq_cpu(cpu) {
3742 		struct global_cwq *gcwq = get_gcwq(cpu);
3743 		struct workqueue_struct *wq;
3744 
3745 		spin_lock_irq(&gcwq->lock);
3746 
3747 		BUG_ON(gcwq->flags & GCWQ_FREEZING);
3748 		gcwq->flags |= GCWQ_FREEZING;
3749 
3750 		list_for_each_entry(wq, &workqueues, list) {
3751 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3752 
3753 			if (cwq && wq->flags & WQ_FREEZABLE)
3754 				cwq->max_active = 0;
3755 		}
3756 
3757 		spin_unlock_irq(&gcwq->lock);
3758 	}
3759 
3760 	spin_unlock(&workqueue_lock);
3761 }
3762 
3763 /**
3764  * freeze_workqueues_busy - are freezable workqueues still busy?
3765  *
3766  * Check whether freezing is complete.  This function must be called
3767  * between freeze_workqueues_begin() and thaw_workqueues().
3768  *
3769  * CONTEXT:
3770  * Grabs and releases workqueue_lock.
3771  *
3772  * RETURNS:
3773  * %true if some freezable workqueues are still busy.  %false if freezing
3774  * is complete.
3775  */
3776 bool freeze_workqueues_busy(void)
3777 {
3778 	unsigned int cpu;
3779 	bool busy = false;
3780 
3781 	spin_lock(&workqueue_lock);
3782 
3783 	BUG_ON(!workqueue_freezing);
3784 
3785 	for_each_gcwq_cpu(cpu) {
3786 		struct workqueue_struct *wq;
3787 		/*
3788 		 * nr_active is monotonically decreasing.  It's safe
3789 		 * to peek without lock.
3790 		 */
3791 		list_for_each_entry(wq, &workqueues, list) {
3792 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3793 
3794 			if (!cwq || !(wq->flags & WQ_FREEZABLE))
3795 				continue;
3796 
3797 			BUG_ON(cwq->nr_active < 0);
3798 			if (cwq->nr_active) {
3799 				busy = true;
3800 				goto out_unlock;
3801 			}
3802 		}
3803 	}
3804 out_unlock:
3805 	spin_unlock(&workqueue_lock);
3806 	return busy;
3807 }
3808 
3809 /**
3810  * thaw_workqueues - thaw workqueues
3811  *
3812  * Thaw workqueues.  Normal queueing is restored and all collected
3813  * frozen works are transferred to their respective gcwq worklists.
3814  *
3815  * CONTEXT:
3816  * Grabs and releases workqueue_lock and gcwq->lock's.
3817  */
3818 void thaw_workqueues(void)
3819 {
3820 	unsigned int cpu;
3821 
3822 	spin_lock(&workqueue_lock);
3823 
3824 	if (!workqueue_freezing)
3825 		goto out_unlock;
3826 
3827 	for_each_gcwq_cpu(cpu) {
3828 		struct global_cwq *gcwq = get_gcwq(cpu);
3829 		struct workqueue_struct *wq;
3830 
3831 		spin_lock_irq(&gcwq->lock);
3832 
3833 		BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3834 		gcwq->flags &= ~GCWQ_FREEZING;
3835 
3836 		list_for_each_entry(wq, &workqueues, list) {
3837 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3838 
3839 			if (!cwq || !(wq->flags & WQ_FREEZABLE))
3840 				continue;
3841 
3842 			/* restore max_active and repopulate worklist */
3843 			cwq->max_active = wq->saved_max_active;
3844 
3845 			while (!list_empty(&cwq->delayed_works) &&
3846 			       cwq->nr_active < cwq->max_active)
3847 				cwq_activate_first_delayed(cwq);
3848 		}
3849 
3850 		wake_up_worker(gcwq);
3851 
3852 		spin_unlock_irq(&gcwq->lock);
3853 	}
3854 
3855 	workqueue_freezing = false;
3856 out_unlock:
3857 	spin_unlock(&workqueue_lock);
3858 }
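
/*
 * Sketch of the sequence the PM freezer is expected to follow with the
 * three functions above (simplified; the 10ms polling interval is only
 * illustrative):
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *	... workqueues are quiescent, suspend/hibernate proceeds ...
 *	thaw_workqueues();
 */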
3859 #endif /* CONFIG_FREEZER */
3860 
3861 static int __init init_workqueues(void)
3862 {
3863 	unsigned int cpu;
3864 	int i;
3865 
3866 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3867 	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3868 
3869 	/* initialize gcwqs */
3870 	for_each_gcwq_cpu(cpu) {
3871 		struct global_cwq *gcwq = get_gcwq(cpu);
3872 
3873 		spin_lock_init(&gcwq->lock);
3874 		INIT_LIST_HEAD(&gcwq->worklist);
3875 		gcwq->cpu = cpu;
3876 		gcwq->flags |= GCWQ_DISASSOCIATED;
3877 
3878 		INIT_LIST_HEAD(&gcwq->idle_list);
3879 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3880 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3881 
3882 		init_timer_deferrable(&gcwq->idle_timer);
3883 		gcwq->idle_timer.function = idle_worker_timeout;
3884 		gcwq->idle_timer.data = (unsigned long)gcwq;
3885 
3886 		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3887 			    (unsigned long)gcwq);
3888 
3889 		ida_init(&gcwq->worker_ida);
3890 
3891 		gcwq->trustee_state = TRUSTEE_DONE;
3892 		init_waitqueue_head(&gcwq->trustee_wait);
3893 	}
3894 
3895 	/* create the initial worker */
3896 	for_each_online_gcwq_cpu(cpu) {
3897 		struct global_cwq *gcwq = get_gcwq(cpu);
3898 		struct worker *worker;
3899 
3900 		if (cpu != WORK_CPU_UNBOUND)
3901 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
3902 		worker = create_worker(gcwq, true);
3903 		BUG_ON(!worker);
3904 		spin_lock_irq(&gcwq->lock);
3905 		start_worker(worker);
3906 		spin_unlock_irq(&gcwq->lock);
3907 	}
3908 
3909 	system_wq = alloc_workqueue("events", 0, 0);
3910 	system_long_wq = alloc_workqueue("events_long", 0, 0);
3911 	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3912 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3913 					    WQ_UNBOUND_MAX_ACTIVE);
3914 	system_freezable_wq = alloc_workqueue("events_freezable",
3915 					      WQ_FREEZABLE, 0);
3916 	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
3917 			WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
3918 	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3919 	       !system_unbound_wq || !system_freezable_wq ||
3920 		!system_nrt_freezable_wq);
3921 	return 0;
3922 }
3923 early_initcall(init_workqueues);
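
/*
 * Once init_workqueues() has run, the system workqueues created above are
 * available to any kernel code.  A minimal sketch with a hypothetical work
 * item ("my_work" is not part of this file):
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("running in process context\n");
 *	}
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	queue_work(system_wq, &my_work);		(regular per-cpu wq)
 *	queue_work(system_unbound_wq, &my_work);	(not bound to any cpu)
 *	queue_work(system_freezable_wq, &my_work);	(frozen across suspend)
 */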
3924