/*
 *	Intel SMP support routines.
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *	(c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 */

#include <linux/init.h>

#include <linux/mm.h>
#include <linux/irq.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>

#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/smpboot.h>

/*
 *	Some notes on x86 processor bugs affecting SMP operation:
 *
 *	Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
 *	The Linux implications for SMP are handled as follows:
 *
 *	Pentium III / [Xeon]
 *		None of the E1AP-E3AP errata are visible to the user.
 *
 *	E1AP.	see PII A1AP
 *	E2AP.	see PII A2AP
 *	E3AP.	see PII A3AP
 *
 *	Pentium II / [Xeon]
 *		None of the A1AP-A3AP errata are visible to the user.
 *
 *	A1AP.	see PPro 1AP
 *	A2AP.	see PPro 2AP
 *	A3AP.	see PPro 7AP
 *
 *	Pentium Pro
 *		None of the 1AP-9AP errata are visible to the normal user,
 *	except for the occasional delivery of a 'spurious interrupt' as trap #15.
 *	This is very rare and a non-problem.
 *
 *	1AP.	Linux maps the APIC as non-cacheable
 *	2AP.	worked around in hardware
 *	3AP.	fixed in C0 and above steppings' microcode update.
 *		Linux does not use excessive STARTUP_IPIs.
 *	4AP.	worked around in hardware
 *	5AP.	symmetric IO mode (normal Linux operation) not affected.
 *		'noapic' mode has vector 0xf filled out properly.
 *	6AP.	'noapic' mode might be affected - fixed in later steppings
 *	7AP.	We do not assume writes to the LVT deasserting IRQs
 *	8AP.	We do not enable low power mode (deep sleep) during MP bootup
 *	9AP.	We do not use mixed mode
 *
 *	Pentium
 *		There is a marginal case where REP MOVS on 100MHz SMP
 *	machines with B stepping processors can fail. XXX should provide
 *	an L1cache=Writethrough or L1cache=off option.
 *
 *		B stepping CPUs may hang. There are hardware workarounds
 *	for this. We warn about it in case your board doesn't have the
 *	workarounds. Basically that's so I can tell anyone with a B stepping
 *	CPU and SMP problems "tough".
 *
 *	Specific items [From Pentium Processor Specification Update]
 *
 *	1AP.	Linux doesn't use remote read
 *	2AP.	Linux doesn't trust APIC errors
 *	3AP.	We work around this
 *	4AP.	Linux never generates 3 interrupts of the same priority
 *		to cause a lost local interrupt.
 *	5AP.	Remote read is never used
 *	6AP.	not affected - worked around in hardware
 *	7AP.	not affected - worked around in hardware
 *	8AP.	worked around in hardware - we get explicit CS errors if not
 *	9AP.	only 'noapic' mode affected. Might generate spurious
 *		interrupts; we log only the first one and count the
 *		rest silently.
 *	10AP.	not affected - worked around in hardware
 *	11AP.	Linux reads the APIC between writes to avoid this, as per
 *		the documentation. Make sure you preserve this as it affects
 *		the C stepping chips too.
 *	12AP.	not affected - worked around in hardware
 *	13AP.	not affected - worked around in hardware
 *	14AP.	we always deassert INIT during bootup
 *	15AP.	not affected - worked around in hardware
 *	16AP.	not affected - worked around in hardware
 *	17AP.	not affected - worked around in hardware
 *	18AP.	not affected - worked around in hardware
 *	19AP.	not affected - worked around in BIOS
 *
 *	If this sounds worrying, believe me these bugs are either ___RARE___
 *	or are signal timing bugs worked around in hardware, and there is
 *	nothing of note from the C stepping upwards.
 */

/* The 'big kernel lock' */
spinlock_cacheline_t kernel_flag_cacheline = {SPIN_LOCK_UNLOCKED};

struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};

/*
 * the following functions deal with sending IPIs between CPUs.
 *
 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
 */

static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
	return APIC_DM_FIXED | shortcut | vector | INT_DEST_ADDR_MODE;
}

static inline int __prepare_ICR2 (unsigned int mask)
{
	return SET_APIC_DEST_FIELD(mask);
}

static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
{
	/*
	 * Subtle. In the case of the 'never do double writes' workaround
	 * we have to lock out interrupts to be safe.  As we don't care
	 * about the value read we use an atomic rmw access to avoid costly
	 * cli/sti.  Otherwise we use an even cheaper single atomic write
	 * to the APIC.
	 */
	unsigned int cfg;

	/*
	 * Wait for idle.
	 */
	apic_wait_icr_idle();

	/*
	 * No need to touch the target chip field
	 */
	cfg = __prepare_ICR(shortcut, vector);

	/*
	 * Send the IPI. The write to APIC_ICR fires this off.
	 */
	apic_write_around(APIC_ICR, cfg);
}

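/*
 * For reference, a rough sketch of what apic_write_around() boils down to
 * in the 2.4-era <asm/apic.h>: with the double-write workaround in effect
 * it is an atomic xchg, otherwise a plain MMIO store.  The helper name
 * below is illustrative and the exact macros/config symbols may differ
 * between trees; the snippet is not built here.
 */
#if 0
static __inline void apic_write_sketch(unsigned long reg, unsigned long v)
{
#ifdef CONFIG_X86_GOOD_APIC
	*((volatile unsigned long *)(APIC_BASE + reg)) = v;	/* single write */
#else
	xchg((volatile unsigned long *)(APIC_BASE + reg), v);	/* atomic rmw */
#endif
}
#endif
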
void fastcall send_IPI_self(int vector)
{
	__send_IPI_shortcut(APIC_DEST_SELF, vector);
}

static inline void send_IPI_mask_bitmask(int mask, int vector)
{
	unsigned long cfg;
	unsigned long flags;

	__save_flags(flags);
	__cli();

	/*
	 * Wait for idle.
	 */
	apic_wait_icr_idle();

	/*
	 * prepare target chip field
	 */
	cfg = __prepare_ICR2(mask);
	apic_write_around(APIC_ICR2, cfg);

	/*
	 * program the ICR
	 */
	cfg = __prepare_ICR(0, vector);

	/*
	 * Send the IPI. The write to APIC_ICR fires this off.
	 */
	apic_write_around(APIC_ICR, cfg);

	__restore_flags(flags);
}

static inline void send_IPI_mask_sequence(int mask, int vector)
{
	unsigned long cfg, flags;
	unsigned int query_cpu, query_mask;

	/*
	 * Hack. The clustered APIC addressing mode doesn't allow us to send
	 * to an arbitrary mask, so I do unicasts to each CPU instead. This
	 * should be modified to do 1 message per cluster ID - mbligh
	 */

	__save_flags(flags);
	__cli();

	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
		query_mask = 1 << query_cpu;
		if (query_mask & mask) {

			/*
			 * Wait for idle.
			 */
			apic_wait_icr_idle();

			/*
			 * prepare target chip field
			 */
			if (clustered_apic_mode == CLUSTERED_APIC_XAPIC)
				cfg = __prepare_ICR2(cpu_to_physical_apicid(query_cpu));
			else
				cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
			apic_write_around(APIC_ICR2, cfg);

			/*
			 * program the ICR
			 */
			cfg = __prepare_ICR(0, vector);

			/*
			 * Send the IPI. The write to APIC_ICR fires this off.
			 */
			apic_write_around(APIC_ICR, cfg);
		}
	}
	__restore_flags(flags);
}

static inline void send_IPI_mask(int mask, int vector)
{
	if (clustered_apic_mode)
		send_IPI_mask_sequence(mask, vector);
	else
		send_IPI_mask_bitmask(mask, vector);
}

static inline void send_IPI_allbutself(int vector)
{
	/*
	 * If there are no other CPUs in the system then we get an APIC
	 * send error if we try to broadcast, thus we have to avoid
	 * sending IPIs in this case.
	 */
	if (!(smp_num_cpus > 1))
		return;

	if (clustered_apic_mode) {
		// Pointless. Use send_IPI_mask to do this instead
		int cpu;

		if (smp_num_cpus > 1) {
			for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
				if (cpu != smp_processor_id())
					send_IPI_mask(1 << cpu, vector);
			}
		}
	} else {
		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
		return;
	}
}

static inline void send_IPI_all(int vector)
{
	if (clustered_apic_mode) {
		// Pointless. Use send_IPI_mask to do this instead
		int cpu;

		for (cpu = 0; cpu < smp_num_cpus; ++cpu) {
			send_IPI_mask(1 << cpu, vector);
		}
	} else {
		__send_IPI_shortcut(APIC_DEST_ALLINC, vector);
	}
}

/*
 *	Smarter SMP flushing macros.
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway.)
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 */

static volatile unsigned long flush_cpumask;
static struct mm_struct * flush_mm;
static unsigned long flush_va;
static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
#define FLUSH_ALL	0xffffffff

/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 *
 * We need to reload %cr3 since the page tables may be going
 * away from under us...
 */
static inline void leave_mm (unsigned long cpu)
{
	BUG_ON(cpu_tlbstate[cpu].state == TLBSTATE_OK);
	clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
	load_cr3(swapper_pg_dir);
}

/*
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
 * 	Stop ipi delivery for the old mm. This is not synchronized with
 * 	the other cpus, but smp_invalidate_interrupt ignores flush ipis
 * 	for the wrong mm, and in the worst case we perform a superfluous
 * 	tlb flush.
 * 1a2) set cpu_tlbstate to TLBSTATE_OK
 * 	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *	was in lazy tlb mode.
 * 1a3) update cpu_tlbstate[].active_mm
 * 	Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
 * 	Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *	cpu_tlbstate[].active_mm is correct, cpu0 already handles
 *	flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 * 	Atomically set the bit [other cpus will start sending flush ipis],
 * 	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, i.e. current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, i.e. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, so there are
 * no write/read ordering problems.
 *
 * A simplified sketch of switch_mm() following this ordering appears below.
 */

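/*
 * For illustration, a simplified paraphrase of the 2.4-era switch_mm()
 * from <asm/mmu_context.h>, showing the 1a)/1b) steps described above.
 * LDT handling and sanity checks are omitted, the helper name is
 * hypothetical, and details may differ from the exact tree; the snippet
 * is not built here.
 */
#if 0
static inline void switch_mm_sketch(struct mm_struct *prev,
	struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
{
	if (prev != next) {
		/* 1a1: stop flush IPIs for the old mm */
		clear_bit(cpu, &prev->cpu_vm_mask);
		/* 1a2 + 1a3: start accepting flushes for the new mm */
		cpu_tlbstate[cpu].state = TLBSTATE_OK;
		cpu_tlbstate[cpu].active_mm = next;
		/* 1a4: other cpus now send us flush IPIs for 'next' */
		set_bit(cpu, &next->cpu_vm_mask);
		/* 1a5: switch page tables */
		load_cr3(next->pgd);
	} else {
		/* 1b1 */
		cpu_tlbstate[cpu].state = TLBSTATE_OK;
		/* 1b2 + 1b3: if leave_mm() cleared our bit, reload %cr3 */
		if (!test_and_set_bit(cpu, &next->cpu_vm_mask))
			load_cr3(next->pgd);
	}
}
#endif
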
/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 */

asmlinkage void smp_invalidate_interrupt (void)
{
	unsigned long cpu = smp_processor_id();

	if (!test_bit(cpu, &flush_cpumask))
		return;
		/*
		 * This was a BUG() but until someone can quote me the
		 * line from the intel manual that guarantees an IPI to
		 * multiple CPUs is retried _only_ on the erroring CPUs
		 * it's staying as a return
		 *
		 * BUG();
		 */

	if (flush_mm == cpu_tlbstate[cpu].active_mm) {
		if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
			if (flush_va == FLUSH_ALL)
				local_flush_tlb();
			else
				__flush_tlb_one(flush_va);
		} else
			leave_mm(cpu);
	}
	ack_APIC_irq();
	clear_bit(cpu, &flush_cpumask);
}

static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
						unsigned long va)
{
	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - we do not send IPIs to not-yet booted CPUs.
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	if (!cpumask)
		BUG();
	if ((cpumask & cpu_online_map) != cpumask)
		BUG();
	if (cpumask & (1 << smp_processor_id()))
		BUG();
	if (!mm)
		BUG();

	/*
	 * I'm not happy about this global shared spinlock in the
	 * MM hot path, but we'll see how contended it is.
	 * Temporarily this turns IRQs off, so that lockups are
	 * detected by the NMI watchdog.
	 */
	spin_lock(&tlbstate_lock);

	flush_mm = mm;
	flush_va = va;
	atomic_set_mask(cpumask, &flush_cpumask);
	/*
	 * We have to send the IPI only to
	 * CPUs affected.
	 */
	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);

	while (flush_cpumask)
		/* nothing. lockup detection does not belong here */;

	flush_mm = NULL;
	flush_va = 0;
	spin_unlock(&tlbstate_lock);
}

void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;
	unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());

	local_flush_tlb();
	if (cpu_mask)
		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
}

void flush_tlb_mm (struct mm_struct * mm)
{
	unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());

	if (current->active_mm == mm) {
		if (current->mm)
			local_flush_tlb();
		else
			leave_mm(smp_processor_id());
	}
	if (cpu_mask)
		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
}

void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id());

	if (current->active_mm == mm) {
		if (current->mm)
			__flush_tlb_one(va);
		else
			leave_mm(smp_processor_id());
	}

	if (cpu_mask)
		flush_tlb_others(cpu_mask, mm, va);
}

static inline void do_flush_tlb_all_local(void)
{
	unsigned long cpu = smp_processor_id();

	__flush_tlb_all();
	if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY)
		leave_mm(cpu);
}

static void flush_tlb_all_ipi(void *info)
{
	do_flush_tlb_all_local();
}

void flush_tlb_all(void)
{
	smp_call_function(flush_tlb_all_ipi, NULL, 1, 1);

	do_flush_tlb_all_local();
}

/*
 * This function sends a 'reschedule' IPI to another CPU.
 * It goes straight through and wastes no time serializing
 * anything. Worst case is that we lose a reschedule ...
 */

void fastcall smp_send_reschedule(int cpu)
{
	send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
}

/*
 * Structure and data for smp_call_function(). This is designed to minimise
 * static memory requirements. It also looks cleaner.
 */
static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;

struct call_data_struct {
	void (*func) (void *info);
	void *info;
	atomic_t started;
	atomic_t finished;
	int wait;
};

static struct call_data_struct * call_data;

/*
 * This function sends a 'generic call function' IPI to all other CPUs
 * in the system.
 */

int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			int wait)
/*
 * [SUMMARY] Run a function on all other CPUs.
 * <func> The function to run. This must be fast and non-blocking.
 * <info> An arbitrary pointer to pass to the function.
 * <nonatomic> currently unused.
 * <wait> If true, wait (atomically) until function has completed on other CPUs.
 * [RETURNS] 0 on success, else a negative status code. Does not return until
 * remote CPUs are nearly ready to execute <<func>> or have already executed it.
 *
 * You must not call this function with disabled interrupts, from a
 * hardware interrupt handler or from a bottom half handler.
 *
 * (An illustrative usage sketch follows the function body below.)
 */
{
	struct call_data_struct data;
	int cpus = smp_num_cpus - 1;

	if (!cpus)
		return 0;

	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);

	spin_lock(&call_lock);
	call_data = &data;
	wmb();
	/* Send a message to all other CPUs and wait for them to respond */
	send_IPI_allbutself(CALL_FUNCTION_VECTOR);

	/* Wait for response */
	while (atomic_read(&data.started) != cpus)
		barrier();

	if (wait)
		while (atomic_read(&data.finished) != cpus)
			barrier();
	spin_unlock(&call_lock);

	return 0;
}

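/*
 * Example usage (illustrative only, not built here; all names below are
 * hypothetical).  The pattern mirrors flush_tlb_all() above: run the
 * callback on all other CPUs via smp_call_function(), then run it locally.
 */
#if 0
static atomic_t example_counter = ATOMIC_INIT(0);

static void example_bump_counter(void *info)
{
	/* Runs in IPI context on each other CPU: fast and non-blocking. */
	atomic_inc((atomic_t *)info);
}

static void example_bump_on_all_cpus(void)
{
	/* Must not be called with interrupts disabled or from IRQ/BH context. */
	smp_call_function(example_bump_counter, &example_counter, 1, 1);
	example_bump_counter(&example_counter);
}
#endif
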
static void stop_this_cpu (void * dummy)
{
	/*
	 * Remove this CPU:
	 */
	clear_bit(smp_processor_id(), &cpu_online_map);
	__cli();
	disable_local_APIC();
	if (cpu_data[smp_processor_id()].hlt_works_ok)
		for (;;) __asm__("hlt");
	for (;;);
}

/*
 * this function calls the 'stop' function on all other CPUs in the system.
 */

void smp_send_stop(void)
{
	smp_call_function(stop_this_cpu, NULL, 1, 0);
	smp_num_cpus = 1;

	__cli();
	disable_local_APIC();
	__sti();
}

/*
 * Reschedule call back. Nothing to do,
 * all the work is done automatically when
 * we return from the interrupt.
 */
asmlinkage void smp_reschedule_interrupt(void)
{
	ack_APIC_irq();
}

asmlinkage void smp_call_function_interrupt(void)
{
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	ack_APIC_irq();
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	(*func)(info);
	if (wait) {
		mb();
		atomic_inc(&call_data->finished);
	}
}