1 // SPDX-License-Identifier: GPL-2.0-or-later
2 #include <linux/bug.h>
3 #include <linux/compiler.h>
4 #include <linux/export.h>
5 #include <linux/percpu.h>
6 #include <linux/processor.h>
7 #include <linux/smp.h>
8 #include <linux/topology.h>
9 #include <linux/sched/clock.h>
10 #include <asm/qspinlock.h>
11 #include <asm/paravirt.h>
12 
/*
 * Number of queue nodes reserved per CPU. A CPU that already has this many
 * nodes in use does not queue and spins on trylock instead.
 */
#define MAX_NODES	4
14 
/* One waiter's entry in the MCS wait queue of a lock. */
struct qnode {
	struct qnode	*next;		/* successor in the queue; NULL until it links itself */
	struct qspinlock *lock;		/* lock this node queues on; matched by get_tail_qnode() */
	int		cpu;		/* CPU that filled in this node */
	int		yield_cpu;	/* propagated lock-owner CPU to yield to, -1 for none */
	u8		locked; /* set to 1 by the previous queue head to pass on the head position */
};
22 
/* Per-CPU pool of queue nodes, handed out in stack order. */
struct qnodes {
	int		count;		/* number of entries of nodes[] currently in use */
	struct qnode nodes[MAX_NODES];
};
27 
/* Tuning parameters */

/* Spin iterations try_to_steal_lock() allows before giving up; 0 disables stealing. */
static int steal_spins __read_mostly = (1 << 5);
/* Smaller spin limit used when the lock owner sits on a different NUMA node. */
static int remote_steal_spins __read_mostly = (1 << 2);
/*
 * Whether lock stealers may exist. Fixed to true when
 * _Q_SPIN_TRY_LOCK_STEAL is 1, otherwise updated by steal_spins_set().
 */
#if _Q_SPIN_TRY_LOCK_STEAL == 1
static const bool maybe_stealers = true;
#else
static bool maybe_stealers __read_mostly = true;
#endif
/* Iterations the queue head spins before it sets _Q_MUST_Q_VAL on the lock. */
static int head_spins __read_mostly = (1 << 8);

/* Yield to the lock owner when its vCPU is found preempted. */
static bool pv_yield_owner __read_mostly = true;
/* Let the queue head drop _Q_MUST_Q_VAL while it yields to the owner. */
static bool pv_yield_allow_steal __read_mostly = false;
/* When set, iterations spent on a preempted owner do not count towards spin limits. */
static bool pv_spin_on_preempted_owner __read_mostly = false;
/* Enable the "sleepy lock" heuristic (_Q_SLEEPY_VAL and the seen clock). */
static bool pv_sleepy_lock __read_mostly = true;
/* Re-set _Q_SLEEPY_VAL on the lock once this waiter has seen a preemption. */
static bool pv_sleepy_lock_sticky __read_mostly = false;
/* How long a CPU keeps treating locks as sleepy after seeing one; 0 disables this. */
static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
/* Multiplier applied to the spin limits while a lock is considered sleepy. */
static int pv_sleepy_lock_factor __read_mostly = 256;
/* Yield to the previous queued waiter when its vCPU is found preempted. */
static bool pv_yield_prev __read_mostly = true;
/* Pass the lock owner's CPU down the queue so waiters can yield to it. */
static bool pv_yield_propagate_owner __read_mostly = true;
/* Prod the next waiter's vCPU, if preempted, when handing over the queue head. */
static bool pv_prod_head __read_mostly = false;
48 
/* Per-CPU queue node pool used by the slow path. */
static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
/* Per-CPU sched_clock() timestamp of the last sleepy lock seen; 0 means none. */
static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
51 
/*
 * spec_barrier() emits "ori 31,31,0" with a compiler memory clobber when
 * _Q_SPIN_SPEC_BARRIER is 1 and expands to nothing otherwise.
 * NOTE(review): presumably this instruction form acts as a speculation
 * barrier -- confirm against the ISA documentation.
 */
#if _Q_SPIN_SPEC_BARRIER == 1
#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0)
#else
#define spec_barrier() do { } while (0)
#endif
57 
recently_sleepy(void)58 static __always_inline bool recently_sleepy(void)
59 {
60 	/* pv_sleepy_lock is true when this is called */
61 	if (pv_sleepy_lock_interval_ns) {
62 		u64 seen = this_cpu_read(sleepy_lock_seen_clock);
63 
64 		if (seen) {
65 			u64 delta = sched_clock() - seen;
66 			if (delta < pv_sleepy_lock_interval_ns)
67 				return true;
68 			this_cpu_write(sleepy_lock_seen_clock, 0);
69 		}
70 	}
71 
72 	return false;
73 }
74 
get_steal_spins(bool paravirt,bool sleepy)75 static __always_inline int get_steal_spins(bool paravirt, bool sleepy)
76 {
77 	if (paravirt && sleepy)
78 		return steal_spins * pv_sleepy_lock_factor;
79 	else
80 		return steal_spins;
81 }
82 
get_remote_steal_spins(bool paravirt,bool sleepy)83 static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy)
84 {
85 	if (paravirt && sleepy)
86 		return remote_steal_spins * pv_sleepy_lock_factor;
87 	else
88 		return remote_steal_spins;
89 }
90 
get_head_spins(bool paravirt,bool sleepy)91 static __always_inline int get_head_spins(bool paravirt, bool sleepy)
92 {
93 	if (paravirt && sleepy)
94 		return head_spins * pv_sleepy_lock_factor;
95 	else
96 		return head_spins;
97 }
98 
encode_tail_cpu(int cpu)99 static inline u32 encode_tail_cpu(int cpu)
100 {
101 	return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
102 }
103 
/* Inverse of encode_tail_cpu(): recover the CPU number from a lock value. */
static inline int decode_tail_cpu(u32 val)
{
	u32 biased = val >> _Q_TAIL_CPU_OFFSET;

	return biased - 1;
}
108 
/* Extract the owner CPU field from a lock value. */
static inline int get_owner_cpu(u32 val)
{
	u32 owner_bits = val & _Q_OWNER_CPU_MASK;

	return owner_bits >> _Q_OWNER_CPU_OFFSET;
}
113 
/*
 * Try to acquire the lock if it was not already locked. If the tail matches
 * mytail then clear it, otherwise leave it unchanged. Return previous value.
 *
 * This is used by the head of the queue to acquire the lock and clean up
 * its tail if it was the last one queued.
 *
 * If the returned value has _Q_LOCKED_VAL set, the lock was already held:
 * nothing is stored and the acquire barrier is skipped. On success the
 * stored word is built from the tail field and the new locked value only.
 */
static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail)
{
	u32 newval = queued_spin_encode_locked_val();
	u32 prev, tmp;

	asm volatile(
"1:	lwarx	%0,0,%2,%7	# trylock_clean_tail			\n"
	/* This test is necessary if there could be stealers */
"	andi.	%1,%0,%5						\n"
"	bne	3f							\n"
	/* Test whether the lock tail == mytail */
"	and	%1,%0,%6						\n"
"	cmpw	0,%1,%3							\n"
	/* Merge the new locked value */
"	or	%1,%1,%4						\n"
"	bne	2f							\n"
	/* If the lock tail matched, then clear it, otherwise leave it. */
"	andc	%1,%1,%6						\n"
"2:	stwcx.	%1,0,%2							\n"
"	bne-	1b							\n"
"\t"	PPC_ACQUIRE_BARRIER "						\n"
"3:									\n"
	: "=&r" (prev), "=&r" (tmp)
	: "r" (&lock->val), "r"(tail), "r" (newval),
	  "i" (_Q_LOCKED_VAL),
	  "r" (_Q_TAIL_CPU_MASK),
	  "i" (_Q_SPIN_EH_HINT)
	: "cr0", "memory");

	return prev;
}
152 
/*
 * Publish our tail, replacing previous tail. Return previous value.
 *
 * This provides a release barrier for publishing node, this pairs with the
 * acquire barrier in get_tail_qnode() when the next CPU finds this tail
 * value.
 *
 * Only the bits covered by _Q_TAIL_CPU_MASK are replaced; the rest of the
 * lock word is stored back as loaded.
 */
static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
{
	u32 prev, tmp;

	kcsan_release();

	asm volatile(
"\t"	PPC_RELEASE_BARRIER "						\n"
"1:	lwarx	%0,0,%2		# publish_tail_cpu			\n"
"	andc	%1,%0,%4						\n"
"	or	%1,%1,%3						\n"
"	stwcx.	%1,0,%2							\n"
"	bne-	1b							\n"
	: "=&r" (prev), "=&r"(tmp)
	: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
	: "cr0", "memory");

	return prev;
}
179 
/*
 * Atomically set _Q_MUST_Q_VAL in the lock word.
 *
 * The sequence contains no hardware barrier instruction; the "memory"
 * clobber only constrains the compiler. Note that the output register is
 * overwritten by the "or" before the store, so the value returned is the
 * word that was stored (bit set), not the value that was loaded.
 */
static __always_inline u32 set_mustq(struct qspinlock *lock)
{
	u32 prev;

	asm volatile(
"1:	lwarx	%0,0,%1		# set_mustq				\n"
"	or	%0,%0,%2						\n"
"	stwcx.	%0,0,%1							\n"
"	bne-	1b							\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return prev;
}
195 
/*
 * Atomically clear _Q_MUST_Q_VAL in the lock word.
 *
 * The sequence contains no hardware barrier instruction; the "memory"
 * clobber only constrains the compiler. Note that the output register is
 * overwritten by the "andc" before the store, so the value returned is the
 * word that was stored (bit cleared), not the value that was loaded.
 */
static __always_inline u32 clear_mustq(struct qspinlock *lock)
{
	u32 prev;

	asm volatile(
"1:	lwarx	%0,0,%1		# clear_mustq				\n"
"	andc	%0,%0,%2						\n"
"	stwcx.	%0,0,%1							\n"
"	bne-	1b							\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return prev;
}
211 
/*
 * Try to set _Q_SLEEPY_VAL on the lock, but only if the lock word still
 * equals @old. Returns true if the store happened.
 *
 * @old must describe a held lock that is not yet marked sleepy; anything
 * else trips a BUG_ON().
 */
static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old)
{
	u32 prev;
	u32 new = old | _Q_SLEEPY_VAL;

	BUG_ON(!(old & _Q_LOCKED_VAL));
	BUG_ON(old & _Q_SLEEPY_VAL);

	/* Compare the loaded word with @old and store @new only on a match. */
	asm volatile(
"1:	lwarx	%0,0,%1		# try_set_sleepy			\n"
"	cmpw	0,%0,%2							\n"
"	bne-	2f							\n"
"	stwcx.	%3,0,%1							\n"
"	bne-	1b							\n"
"2:									\n"
	: "=&r" (prev)
	: "r" (&lock->val), "r"(old), "r" (new)
	: "cr0", "memory");

	return likely(prev == old);
}
233 
seen_sleepy_owner(struct qspinlock * lock,u32 val)234 static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
235 {
236 	if (pv_sleepy_lock) {
237 		if (pv_sleepy_lock_interval_ns)
238 			this_cpu_write(sleepy_lock_seen_clock, sched_clock());
239 		if (!(val & _Q_SLEEPY_VAL))
240 			try_set_sleepy(lock, val);
241 	}
242 }
243 
seen_sleepy_lock(void)244 static __always_inline void seen_sleepy_lock(void)
245 {
246 	if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
247 		this_cpu_write(sleepy_lock_seen_clock, sched_clock());
248 }
249 
seen_sleepy_node(struct qspinlock * lock,u32 val)250 static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
251 {
252 	if (pv_sleepy_lock) {
253 		if (pv_sleepy_lock_interval_ns)
254 			this_cpu_write(sleepy_lock_seen_clock, sched_clock());
255 		if (val & _Q_LOCKED_VAL) {
256 			if (!(val & _Q_SLEEPY_VAL))
257 				try_set_sleepy(lock, val);
258 		}
259 	}
260 }
261 
/*
 * Find the queue node that the CPU encoded in the tail of @val is using
 * for @lock. Every slot of that CPU's pool is checked; not finding a match
 * is a BUG().
 */
static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
{
	struct qnodes *pool = per_cpu_ptr(&qnodes, decode_tail_cpu(val));
	struct qnode *candidate;

	/*
	 * After publishing the new tail and finding a previous tail in the
	 * previous val (which is the control dependency), this barrier
	 * orders the release barrier in publish_tail_cpu performed by the
	 * last CPU, with subsequently looking at its qnode structures
	 * after the barrier.
	 */
	smp_acquire__after_ctrl_dep();

	for (candidate = &pool->nodes[0];
	     candidate < &pool->nodes[MAX_NODES];
	     candidate++) {
		if (candidate->lock == lock)
			return candidate;
	}

	BUG();
}
285 
/*
 * Called inside spin_begin(). Returns whether or not the vCPU was preempted.
 *
 * One wait iteration against a held lock (@val must have _Q_LOCKED_VAL set).
 * Without paravirt, with pv_yield_owner off, or when the owner's yield count
 * is even, this only does spin_cpu_relax() and returns false. Otherwise the
 * owner is treated as preempted: the sleepy state is noted and, if the lock
 * word still equals @val, the CPU yields to the owner. With @mustq set, the
 * must-queue bit is cleared for the duration of the yield and set again
 * afterwards.
 */
static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq)
{
	int owner;
	u32 yield_count;
	bool preempted = false;

	BUG_ON(!(val & _Q_LOCKED_VAL));

	if (!paravirt)
		goto relax;

	if (!pv_yield_owner)
		goto relax;

	owner = get_owner_cpu(val);
	yield_count = yield_count_of(owner);

	if ((yield_count & 1) == 0)
		goto relax; /* owner vcpu is running */

	/* Leave the spin section while touching the lock word and yielding. */
	spin_end();

	seen_sleepy_owner(lock, val);
	preempted = true;

	/*
	 * Read the lock word after sampling the yield count. On the other side
	 * there may a wmb because the yield count update is done by the
	 * hypervisor preemption and the value update by the OS, however this
	 * ordering might reduce the chance of out of order accesses and
	 * improve the heuristic.
	 */
	smp_rmb();

	/* Only yield if the lock word has not changed since @val was read. */
	if (READ_ONCE(lock->val) == val) {
		if (mustq)
			clear_mustq(lock);
		yield_to_preempted(owner, yield_count);
		if (mustq)
			set_mustq(lock);
		spin_begin();

		/* Don't relax if we yielded. Maybe we should? */
		return preempted;
	}
	spin_begin();
relax:
	spin_cpu_relax();

	return preempted;
}
338 
339 /* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
yield_to_locked_owner(struct qspinlock * lock,u32 val,bool paravirt)340 static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
341 {
342 	return __yield_to_locked_owner(lock, val, paravirt, false);
343 }
344 
345 /* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
yield_head_to_locked_owner(struct qspinlock * lock,u32 val,bool paravirt)346 static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
347 {
348 	bool mustq = false;
349 
350 	if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
351 		mustq = true;
352 
353 	return __yield_to_locked_owner(lock, val, paravirt, mustq);
354 }
355 
propagate_yield_cpu(struct qnode * node,u32 val,int * set_yield_cpu,bool paravirt)356 static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
357 {
358 	struct qnode *next;
359 	int owner;
360 
361 	if (!paravirt)
362 		return;
363 	if (!pv_yield_propagate_owner)
364 		return;
365 
366 	owner = get_owner_cpu(val);
367 	if (*set_yield_cpu == owner)
368 		return;
369 
370 	next = READ_ONCE(node->next);
371 	if (!next)
372 		return;
373 
374 	if (vcpu_is_preempted(owner)) {
375 		next->yield_cpu = owner;
376 		*set_yield_cpu = owner;
377 	} else if (*set_yield_cpu != -1) {
378 		next->yield_cpu = owner;
379 		*set_yield_cpu = owner;
380 	}
381 }
382 
/*
 * Called inside spin_begin()
 *
 * One wait iteration for a queued (non-head) waiter. @val is the lock value
 * returned when this waiter published its tail; its tail field names the
 * previous waiter's CPU. Returns true if a preempted vCPU was observed.
 *
 * With owner propagation enabled, first try to yield to the lock owner that
 * was propagated into node->yield_cpu; otherwise, or if that does not apply,
 * try to yield to the previous waiter. If neither yields, spin_cpu_relax().
 */
static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
{
	int prev_cpu = decode_tail_cpu(val);
	u32 yield_count;
	int yield_cpu;
	bool preempted = false;

	if (!paravirt)
		goto relax;

	if (!pv_yield_propagate_owner)
		goto yield_prev;

	yield_cpu = READ_ONCE(node->yield_cpu);
	if (yield_cpu == -1) {
		/* Propagate back the -1 CPU */
		if (node->next && node->next->yield_cpu != -1)
			node->next->yield_cpu = yield_cpu;
		goto yield_prev;
	}

	yield_count = yield_count_of(yield_cpu);
	if ((yield_count & 1) == 0)
		goto yield_prev; /* owner vcpu is running */

	if (get_owner_cpu(READ_ONCE(lock->val)) != yield_cpu)
		goto yield_prev; /* re-sample lock owner */

	spin_end();

	preempted = true;
	seen_sleepy_node(lock, val);

	smp_rmb();

	/* Yield only if the propagated owner did not change in the meantime. */
	if (yield_cpu == node->yield_cpu) {
		/* Pass the owner on to our own successor before yielding. */
		if (node->next && node->next->yield_cpu != yield_cpu)
			node->next->yield_cpu = yield_cpu;
		yield_to_preempted(yield_cpu, yield_count);
		spin_begin();
		return preempted;
	}
	spin_begin();

yield_prev:
	if (!pv_yield_prev)
		goto relax;

	yield_count = yield_count_of(prev_cpu);
	if ((yield_count & 1) == 0)
		goto relax; /* previous waiter's vcpu is running */

	spin_end();

	preempted = true;
	seen_sleepy_node(lock, val);

	smp_rmb(); /* See __yield_to_locked_owner comment */

	/* Do not yield if the head position was handed to us in the meantime. */
	if (!READ_ONCE(node->locked)) {
		yield_to_preempted(prev_cpu, yield_count);
		spin_begin();
		return preempted;
	}
	spin_begin();

relax:
	spin_cpu_relax();

	return preempted;
}
455 
steal_break(u32 val,int iters,bool paravirt,bool sleepy)456 static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy)
457 {
458 	if (iters >= get_steal_spins(paravirt, sleepy))
459 		return true;
460 
461 	if (IS_ENABLED(CONFIG_NUMA) &&
462 	    (iters >= get_remote_steal_spins(paravirt, sleepy))) {
463 		int cpu = get_owner_cpu(val);
464 		if (numa_node_id() != cpu_to_node(cpu))
465 			return true;
466 	}
467 	return false;
468 }
469 
try_to_steal_lock(struct qspinlock * lock,bool paravirt)470 static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
471 {
472 	bool seen_preempted = false;
473 	bool sleepy = false;
474 	int iters = 0;
475 	u32 val;
476 
477 	if (!steal_spins) {
478 		/* XXX: should spin_on_preempted_owner do anything here? */
479 		return false;
480 	}
481 
482 	/* Attempt to steal the lock */
483 	spin_begin();
484 	do {
485 		bool preempted = false;
486 
487 		val = READ_ONCE(lock->val);
488 		if (val & _Q_MUST_Q_VAL)
489 			break;
490 		spec_barrier();
491 
492 		if (unlikely(!(val & _Q_LOCKED_VAL))) {
493 			spin_end();
494 			if (__queued_spin_trylock_steal(lock))
495 				return true;
496 			spin_begin();
497 		} else {
498 			preempted = yield_to_locked_owner(lock, val, paravirt);
499 		}
500 
501 		if (paravirt && pv_sleepy_lock) {
502 			if (!sleepy) {
503 				if (val & _Q_SLEEPY_VAL) {
504 					seen_sleepy_lock();
505 					sleepy = true;
506 				} else if (recently_sleepy()) {
507 					sleepy = true;
508 				}
509 			}
510 			if (pv_sleepy_lock_sticky && seen_preempted &&
511 			    !(val & _Q_SLEEPY_VAL)) {
512 				if (try_set_sleepy(lock, val))
513 					val |= _Q_SLEEPY_VAL;
514 			}
515 		}
516 
517 		if (preempted) {
518 			seen_preempted = true;
519 			sleepy = true;
520 			if (!pv_spin_on_preempted_owner)
521 				iters++;
522 			/*
523 			 * pv_spin_on_preempted_owner don't increase iters
524 			 * while the owner is preempted -- we won't interfere
525 			 * with it by definition. This could introduce some
526 			 * latency issue if we continually observe preempted
527 			 * owners, but hopefully that's a rare corner case of
528 			 * a badly oversubscribed system.
529 			 */
530 		} else {
531 			iters++;
532 		}
533 	} while (!steal_break(val, iters, paravirt, sleepy));
534 
535 	spin_end();
536 
537 	return false;
538 }
539 
queued_spin_lock_mcs_queue(struct qspinlock * lock,bool paravirt)540 static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
541 {
542 	struct qnodes *qnodesp;
543 	struct qnode *next, *node;
544 	u32 val, old, tail;
545 	bool seen_preempted = false;
546 	bool sleepy = false;
547 	bool mustq = false;
548 	int idx;
549 	int set_yield_cpu = -1;
550 	int iters = 0;
551 
552 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
553 
554 	qnodesp = this_cpu_ptr(&qnodes);
555 	if (unlikely(qnodesp->count >= MAX_NODES)) {
556 		spec_barrier();
557 		while (!queued_spin_trylock(lock))
558 			cpu_relax();
559 		return;
560 	}
561 
562 	idx = qnodesp->count++;
563 	/*
564 	 * Ensure that we increment the head node->count before initialising
565 	 * the actual node. If the compiler is kind enough to reorder these
566 	 * stores, then an IRQ could overwrite our assignments.
567 	 */
568 	barrier();
569 	node = &qnodesp->nodes[idx];
570 	node->next = NULL;
571 	node->lock = lock;
572 	node->cpu = smp_processor_id();
573 	node->yield_cpu = -1;
574 	node->locked = 0;
575 
576 	tail = encode_tail_cpu(node->cpu);
577 
578 	/*
579 	 * Assign all attributes of a node before it can be published.
580 	 * Issues an lwsync, serving as a release barrier, as well as a
581 	 * compiler barrier.
582 	 */
583 	old = publish_tail_cpu(lock, tail);
584 
585 	/*
586 	 * If there was a previous node; link it and wait until reaching the
587 	 * head of the waitqueue.
588 	 */
589 	if (old & _Q_TAIL_CPU_MASK) {
590 		struct qnode *prev = get_tail_qnode(lock, old);
591 
592 		/* Link @node into the waitqueue. */
593 		WRITE_ONCE(prev->next, node);
594 
595 		/* Wait for mcs node lock to be released */
596 		spin_begin();
597 		while (!READ_ONCE(node->locked)) {
598 			spec_barrier();
599 
600 			if (yield_to_prev(lock, node, old, paravirt))
601 				seen_preempted = true;
602 		}
603 		spec_barrier();
604 		spin_end();
605 
606 		/* Clear out stale propagated yield_cpu */
607 		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
608 			node->yield_cpu = -1;
609 
610 		smp_rmb(); /* acquire barrier for the mcs lock */
611 
612 		/*
613 		 * Generic qspinlocks have this prefetch here, but it seems
614 		 * like it could cause additional line transitions because
615 		 * the waiter will keep loading from it.
616 		 */
617 		if (_Q_SPIN_PREFETCH_NEXT) {
618 			next = READ_ONCE(node->next);
619 			if (next)
620 				prefetchw(next);
621 		}
622 	}
623 
624 	/* We're at the head of the waitqueue, wait for the lock. */
625 again:
626 	spin_begin();
627 	for (;;) {
628 		bool preempted;
629 
630 		val = READ_ONCE(lock->val);
631 		if (!(val & _Q_LOCKED_VAL))
632 			break;
633 		spec_barrier();
634 
635 		if (paravirt && pv_sleepy_lock && maybe_stealers) {
636 			if (!sleepy) {
637 				if (val & _Q_SLEEPY_VAL) {
638 					seen_sleepy_lock();
639 					sleepy = true;
640 				} else if (recently_sleepy()) {
641 					sleepy = true;
642 				}
643 			}
644 			if (pv_sleepy_lock_sticky && seen_preempted &&
645 			    !(val & _Q_SLEEPY_VAL)) {
646 				if (try_set_sleepy(lock, val))
647 					val |= _Q_SLEEPY_VAL;
648 			}
649 		}
650 
651 		propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
652 		preempted = yield_head_to_locked_owner(lock, val, paravirt);
653 		if (!maybe_stealers)
654 			continue;
655 
656 		if (preempted)
657 			seen_preempted = true;
658 
659 		if (paravirt && preempted) {
660 			sleepy = true;
661 
662 			if (!pv_spin_on_preempted_owner)
663 				iters++;
664 		} else {
665 			iters++;
666 		}
667 
668 		if (!mustq && iters >= get_head_spins(paravirt, sleepy)) {
669 			mustq = true;
670 			set_mustq(lock);
671 			val |= _Q_MUST_Q_VAL;
672 		}
673 	}
674 	spec_barrier();
675 	spin_end();
676 
677 	/* If we're the last queued, must clean up the tail. */
678 	old = trylock_clean_tail(lock, tail);
679 	if (unlikely(old & _Q_LOCKED_VAL)) {
680 		BUG_ON(!maybe_stealers);
681 		goto again; /* Can only be true if maybe_stealers. */
682 	}
683 
684 	if ((old & _Q_TAIL_CPU_MASK) == tail)
685 		goto release; /* We were the tail, no next. */
686 
687 	/* There is a next, must wait for node->next != NULL (MCS protocol) */
688 	next = READ_ONCE(node->next);
689 	if (!next) {
690 		spin_begin();
691 		while (!(next = READ_ONCE(node->next)))
692 			cpu_relax();
693 		spin_end();
694 	}
695 	spec_barrier();
696 
697 	/*
698 	 * Unlock the next mcs waiter node. Release barrier is not required
699 	 * here because the acquirer is only accessing the lock word, and
700 	 * the acquire barrier we took the lock with orders that update vs
701 	 * this store to locked. The corresponding barrier is the smp_rmb()
702 	 * acquire barrier for mcs lock, above.
703 	 */
704 	if (paravirt && pv_prod_head) {
705 		int next_cpu = next->cpu;
706 		WRITE_ONCE(next->locked, 1);
707 		if (_Q_SPIN_MISO)
708 			asm volatile("miso" ::: "memory");
709 		if (vcpu_is_preempted(next_cpu))
710 			prod_cpu(next_cpu);
711 	} else {
712 		WRITE_ONCE(next->locked, 1);
713 		if (_Q_SPIN_MISO)
714 			asm volatile("miso" ::: "memory");
715 	}
716 
717 release:
718 	qnodesp->count--; /* release the node */
719 }
720 
queued_spin_lock_slowpath(struct qspinlock * lock)721 void queued_spin_lock_slowpath(struct qspinlock *lock)
722 {
723 	/*
724 	 * This looks funny, but it induces the compiler to inline both
725 	 * sides of the branch rather than share code as when the condition
726 	 * is passed as the paravirt argument to the functions.
727 	 */
728 	if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
729 		if (try_to_steal_lock(lock, true)) {
730 			spec_barrier();
731 			return;
732 		}
733 		queued_spin_lock_mcs_queue(lock, true);
734 	} else {
735 		if (try_to_steal_lock(lock, false)) {
736 			spec_barrier();
737 			return;
738 		}
739 		queued_spin_lock_mcs_queue(lock, false);
740 	}
741 }
742 EXPORT_SYMBOL(queued_spin_lock_slowpath);
743 
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Paravirt spinlock init hook. Currently has nothing to do. */
void pv_spinlocks_init(void)
{
}
#endif
749 
750 #include <linux/debugfs.h>
/*
 * debugfs write handler for steal_spins. Always returns 0.
 *
 * When _Q_SPIN_TRY_LOCK_STEAL is 1 the value is simply stored. Otherwise
 * maybe_stealers is kept in step with it: switching stealing on raises
 * maybe_stealers before steal_spins, switching it off clears steal_spins
 * before maybe_stealers, with a synchronize_rcu() between the two stores
 * in both directions. A local mutex serialises concurrent writers.
 */
static int steal_spins_set(void *data, u64 val)
{
#if _Q_SPIN_TRY_LOCK_STEAL == 1
	/* MAYBE_STEAL remains true */
	steal_spins = val;
#else
	static DEFINE_MUTEX(lock);

	/*
	 * The lock slow path has a !maybe_stealers case that can assume
	 * the head of queue will not see concurrent waiters. That waiter
	 * is unsafe in the presence of stealers, so must keep them away
	 * from one another.
	 */

	mutex_lock(&lock);
	if (val && !steal_spins) {
		/* Off -> on: announce stealers first, enable stealing last. */
		maybe_stealers = true;
		/* wait for queue head waiter to go away */
		synchronize_rcu();
		steal_spins = val;
	} else if (!val && steal_spins) {
		/* On -> off: stop stealing first, drop maybe_stealers last. */
		steal_spins = val;
		/* wait for all possible stealers to go away */
		synchronize_rcu();
		maybe_stealers = false;
	} else {
		/* No on/off transition, just a new value. */
		steal_spins = val;
	}
	mutex_unlock(&lock);
#endif

	return 0;
}
785 
steal_spins_get(void * data,u64 * val)786 static int steal_spins_get(void *data, u64 *val)
787 {
788 	*val = steal_spins;
789 
790 	return 0;
791 }
792 
793 DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n");
794 
remote_steal_spins_set(void * data,u64 val)795 static int remote_steal_spins_set(void *data, u64 val)
796 {
797 	remote_steal_spins = val;
798 
799 	return 0;
800 }
801 
remote_steal_spins_get(void * data,u64 * val)802 static int remote_steal_spins_get(void *data, u64 *val)
803 {
804 	*val = remote_steal_spins;
805 
806 	return 0;
807 }
808 
809 DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n");
810 
head_spins_set(void * data,u64 val)811 static int head_spins_set(void *data, u64 val)
812 {
813 	head_spins = val;
814 
815 	return 0;
816 }
817 
head_spins_get(void * data,u64 * val)818 static int head_spins_get(void *data, u64 *val)
819 {
820 	*val = head_spins;
821 
822 	return 0;
823 }
824 
825 DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n");
826 
pv_yield_owner_set(void * data,u64 val)827 static int pv_yield_owner_set(void *data, u64 val)
828 {
829 	pv_yield_owner = !!val;
830 
831 	return 0;
832 }
833 
pv_yield_owner_get(void * data,u64 * val)834 static int pv_yield_owner_get(void *data, u64 *val)
835 {
836 	*val = pv_yield_owner;
837 
838 	return 0;
839 }
840 
841 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n");
842 
pv_yield_allow_steal_set(void * data,u64 val)843 static int pv_yield_allow_steal_set(void *data, u64 val)
844 {
845 	pv_yield_allow_steal = !!val;
846 
847 	return 0;
848 }
849 
pv_yield_allow_steal_get(void * data,u64 * val)850 static int pv_yield_allow_steal_get(void *data, u64 *val)
851 {
852 	*val = pv_yield_allow_steal;
853 
854 	return 0;
855 }
856 
857 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n");
858 
pv_spin_on_preempted_owner_set(void * data,u64 val)859 static int pv_spin_on_preempted_owner_set(void *data, u64 val)
860 {
861 	pv_spin_on_preempted_owner = !!val;
862 
863 	return 0;
864 }
865 
pv_spin_on_preempted_owner_get(void * data,u64 * val)866 static int pv_spin_on_preempted_owner_get(void *data, u64 *val)
867 {
868 	*val = pv_spin_on_preempted_owner;
869 
870 	return 0;
871 }
872 
873 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n");
874 
pv_sleepy_lock_set(void * data,u64 val)875 static int pv_sleepy_lock_set(void *data, u64 val)
876 {
877 	pv_sleepy_lock = !!val;
878 
879 	return 0;
880 }
881 
pv_sleepy_lock_get(void * data,u64 * val)882 static int pv_sleepy_lock_get(void *data, u64 *val)
883 {
884 	*val = pv_sleepy_lock;
885 
886 	return 0;
887 }
888 
889 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n");
890 
pv_sleepy_lock_sticky_set(void * data,u64 val)891 static int pv_sleepy_lock_sticky_set(void *data, u64 val)
892 {
893 	pv_sleepy_lock_sticky = !!val;
894 
895 	return 0;
896 }
897 
pv_sleepy_lock_sticky_get(void * data,u64 * val)898 static int pv_sleepy_lock_sticky_get(void *data, u64 *val)
899 {
900 	*val = pv_sleepy_lock_sticky;
901 
902 	return 0;
903 }
904 
905 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n");
906 
pv_sleepy_lock_interval_ns_set(void * data,u64 val)907 static int pv_sleepy_lock_interval_ns_set(void *data, u64 val)
908 {
909 	pv_sleepy_lock_interval_ns = val;
910 
911 	return 0;
912 }
913 
pv_sleepy_lock_interval_ns_get(void * data,u64 * val)914 static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val)
915 {
916 	*val = pv_sleepy_lock_interval_ns;
917 
918 	return 0;
919 }
920 
921 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n");
922 
pv_sleepy_lock_factor_set(void * data,u64 val)923 static int pv_sleepy_lock_factor_set(void *data, u64 val)
924 {
925 	pv_sleepy_lock_factor = val;
926 
927 	return 0;
928 }
929 
pv_sleepy_lock_factor_get(void * data,u64 * val)930 static int pv_sleepy_lock_factor_get(void *data, u64 *val)
931 {
932 	*val = pv_sleepy_lock_factor;
933 
934 	return 0;
935 }
936 
937 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n");
938 
pv_yield_prev_set(void * data,u64 val)939 static int pv_yield_prev_set(void *data, u64 val)
940 {
941 	pv_yield_prev = !!val;
942 
943 	return 0;
944 }
945 
pv_yield_prev_get(void * data,u64 * val)946 static int pv_yield_prev_get(void *data, u64 *val)
947 {
948 	*val = pv_yield_prev;
949 
950 	return 0;
951 }
952 
953 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
954 
pv_yield_propagate_owner_set(void * data,u64 val)955 static int pv_yield_propagate_owner_set(void *data, u64 val)
956 {
957 	pv_yield_propagate_owner = !!val;
958 
959 	return 0;
960 }
961 
pv_yield_propagate_owner_get(void * data,u64 * val)962 static int pv_yield_propagate_owner_get(void *data, u64 *val)
963 {
964 	*val = pv_yield_propagate_owner;
965 
966 	return 0;
967 }
968 
969 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
970 
pv_prod_head_set(void * data,u64 val)971 static int pv_prod_head_set(void *data, u64 val)
972 {
973 	pv_prod_head = !!val;
974 
975 	return 0;
976 }
977 
pv_prod_head_get(void * data,u64 * val)978 static int pv_prod_head_get(void *data, u64 *val)
979 {
980 	*val = pv_prod_head;
981 
982 	return 0;
983 }
984 
985 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n");
986 
spinlock_debugfs_init(void)987 static __init int spinlock_debugfs_init(void)
988 {
989 	debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
990 	debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins);
991 	debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins);
992 	if (is_shared_processor()) {
993 		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
994 		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
995 		debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner);
996 		debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock);
997 		debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky);
998 		debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
999 		debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
1000 		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
1001 		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
1002 		debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
1003 	}
1004 
1005 	return 0;
1006 }
1007 device_initcall(spinlock_debugfs_init);
1008